Fix for CSR Adaptive (382eb38f) · Commits · TNL / tnl-dev

src/TNL/Matrices/Legacy/CSR.h

+9 −9

Original line number	Diff line number	Diff line
		@@ -107,13 +107,10 @@ public:

		Containers::Vector< Block<Index>, Device, Index > blocks;

		/* Configuration of SpMV kernels ------------------------------------------- */
		/* Configuration of CSR SpMV kernels ----------------------------------------- */

		/* Block sizes */

		// Execute 1024 threads per block for float, (12 elements per thread) for 48KB cache
		// 512 threads per block for double (12 elements per thread)
		static constexpr Index THREADS_ADAPTIVE = sizeof(Real) == 4 ? 1024 : 512;
		static constexpr Index THREADS_ADAPTIVE = 1024;
		static constexpr Index THREADS_SCALAR = 1024;
		static constexpr Index THREADS_VECTOR = 1024;
		static constexpr Index THREADS_LIGHT = 1024;
		@@ -127,8 +124,11 @@ public:
		/* Number of elements in shared memory */
		static constexpr Index SHARED = SHARED_PER_BLOCK/sizeof(Real);

		/* Number of warps in block for CSR Adaptive */
		static constexpr Index WARPS = THREADS_ADAPTIVE / 32;

		/* Number of elements in shared memory per one warp */
		static constexpr Index SHARED_PER_WARP = SHARED / (THREADS_ADAPTIVE / 32);
		static constexpr Index SHARED_PER_WARP = SHARED / WARPS;
		/* -------------------------------------------------------------------------- */

src/TNL/Matrices/Legacy/CSR_impl.h

+17 −17

Original line number	Diff line number	Diff line
		@@ -130,7 +130,7 @@ Index findLimit(const Index start,
		matrix.getRowPointers().getElement(current);
		sum += elements;
		if (sum > matrix.SHARED_PER_WARP) {
		if (current - start > 1) { // extra row
		if (current - start > 0) { // extra row
		type = Type::STREAM;
		return current;
		} else { // one long row
		@@ -804,7 +804,7 @@ Index CSR< Real, Device, Index, KernelType >::getHybridModeSplit() const
		template< typename Real,
		typename Index,
		int warpSize,
		int SHARED,
		int WARPS,
		int SHARED_PER_WARP,
		int MAX_ELEM_PER_WARP >
		__global__
		@@ -816,7 +816,7 @@ void SpMVCSRAdaptive( const Real *inVector,
		const Block<Index> *blocks,
		Index blocksSize,
		Index gridID) {
		__shared__ Real shared[SHARED];
		__shared__ Real shared[WARPS][SHARED_PER_WARP];
		const Index index = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x;
		const Index blockIdx = index / warpSize;
		if (blockIdx >= blocksSize)
		@@ -826,25 +826,25 @@ void SpMVCSRAdaptive( const Real *inVector,
		const Index laneID = threadIdx.x & 31; // & is cheaper than %
		Block<Index> block = blocks[blockIdx];
		const Index minID = rowPointers[block.index[0]/* minRow */];
		Index i, to, offset, maxID;
		Index i, to, maxID;
		if (block.byte[sizeof(Index) == 4 ? 7 : 15] & 0b1000000) {
		/////////////////////////////////////* CSR STREAM *//////////////
		const Index maxRow = block.index[0]/* minRow */ +
		/* maxRow - minRow */(block.twobytes[sizeof(Index) == 4 ? 3 : 5] & 0x3FFF);
		const Index warpID = threadIdx.x / 32;
		maxID = minID + /* maxID - minID */block.twobytes[sizeof(Index) == 4 ? 2 : 4];
		/* offset between shared and global addresses */
		offset = minID - (threadIdx.x / warpSize * SHARED_PER_WARP);
		/* Copy and calculate elements from global to shared memory, coalesced */

		/* Stream data to shared memory */
		for (i = laneID + minID; i < maxID; i += warpSize)
		shared[i - offset] = values[i] * inVector[columnIndexes[i]];
		shared[warpID][i - minID] = values[i] * inVector[columnIndexes[i]];

		const Index maxRow = block.index[0]/* minRow */ +
		/* maxRow - minRow */(block.twobytes[sizeof(Index) == 4 ? 3 : 5] & 0x3FFF);
		/* Calculate result */
		for (i = block.index[0]/* minRow */ + laneID; i < maxRow; i += warpSize) {
		to = rowPointers[i + 1] - offset; // end of preprocessed data
		to = rowPointers[i + 1] - minID; // end of preprocessed data
		result = 0;
		/* Scalar reduction */
		for (Index sharedID = rowPointers[i] - offset; sharedID < to; ++sharedID)
		result += shared[sharedID];
		for (Index sharedID = rowPointers[i] - minID; sharedID < to; ++sharedID)
		result += shared[warpID][sharedID];

		outVector[i] = result; // Write result
		}
		@@ -864,10 +864,10 @@ void SpMVCSRAdaptive( const Real *inVector,
		if (laneID == 0) outVector[block.index[0]/* minRow */] = result; // Write result
		} else {
		/////////////////////////////////////* CSR VECTOR L */////////////
		maxID = rowPointers[block.index[0]/* minRow */ + 1];

		offset = block.index[1]/* warpInRow / MAX_ELEM_PER_WARP;
		/* Number of elements processed by previous warps */
		const Index offset = block.index[1]/* warpInRow / MAX_ELEM_PER_WARP;
		to = minID + (block.index[1]/* warpInRow / + 1) MAX_ELEM_PER_WARP;
		maxID = rowPointers[block.index[0]/* minRow */ + 1];
		if (to > maxID) to = maxID;
		for (i = minID + offset + laneID; i < to; i += warpSize)
		result += values[i] * inVector[columnIndexes[i]];
		@@ -1754,7 +1754,7 @@ void SpMVCSRAdaptivePrepare( const Real *inVector,
		}

		SpMVCSRAdaptive< Real, Index, warpSize,
		matrix.SHARED,
		matrix.WARPS,
		matrix.SHARED_PER_WARP,
		matrix.MAX_ELEMENTS_PER_WARP >
		<<<blocks, threads>>>(