Refactoring legacy adaptive CSR sparse matrix. (ecdac7b0) · Commits · TNL / tnl-dev

src/Benchmarks/SpMV/ReferenceFormats/Legacy/CSR_impl.h

+123 −93

Original line number	Diff line number	Diff line
		@@ -193,7 +193,6 @@ void CSR< Real, Device, Index, KernelType >::setBlocks()
		this->rowPointers.getElement(start)
		);
		}

		start = nextStart;
		}
		inBlock.emplace_back(nextStart);
		@@ -836,88 +835,6 @@ Index CSR< Real, Device, Index, KernelType >::getHybridModeSplit() const

		#ifdef HAVE_CUDA

		template< typename Real,
		typename Index,
		int warpSize,
		int WARPS,
		int SHARED_PER_WARP,
		int MAX_ELEM_PER_WARP >
		__global__
		void SpMVCSRAdaptive( const Real *inVector,
		Real *outVector,
		const Index* rowPointers,
		const Index* columnIndexes,
		const Real* values,
		const Block<Index> *blocks,
		Index blocksSize,
		Index gridID) {
		__shared__ Real shared[WARPS][SHARED_PER_WARP];
		const Index index = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x;
		const Index blockIdx = index / warpSize;
		if (blockIdx >= blocksSize)
		return;

		Real result = 0.0;
		const Index laneID = threadIdx.x & 31; // & is cheaper than %
		Block<Index> block = blocks[blockIdx];
		const Index minID = rowPointers[block.index[0]/* minRow */];
		Index i, to, maxID;
		if (block.byte[sizeof(Index) == 4 ? 7 : 15] & 0b1000000) {
		/////////////////////////////////////* CSR STREAM *//////////////
		const Index warpID = threadIdx.x / 32;
		maxID = minID + /* maxID - minID */block.twobytes[sizeof(Index) == 4 ? 2 : 4];

		/* Stream data to shared memory */
		for (i = laneID + minID; i < maxID; i += warpSize)
		shared[warpID][i - minID] = values[i] * inVector[columnIndexes[i]];

		const Index maxRow = block.index[0]/* minRow */ +
		/* maxRow - minRow */(block.twobytes[sizeof(Index) == 4 ? 3 : 5] & 0x3FFF);
		/* Calculate result */
		for (i = block.index[0]/* minRow */ + laneID; i < maxRow; i += warpSize) {
		to = rowPointers[i + 1] - minID; // end of preprocessed data
		result = 0;
		/* Scalar reduction */
		for (Index sharedID = rowPointers[i] - minID; sharedID < to; ++sharedID)
		result += shared[warpID][sharedID];

		outVector[i] = result; // Write result
		}
		} else if (block.byte[sizeof(Index) == 4 ? 7 : 15] & 0b10000000) {
		/////////////////////////////////////* CSR VECTOR *//////////////
		maxID = minID + /* maxID - minID */block.twobytes[sizeof(Index) == 4 ? 2 : 4];

		for (i = minID + laneID; i < maxID; i += warpSize)
		result += values[i] * inVector[columnIndexes[i]];

		/* Parallel reduction */
		result += __shfl_down_sync(0xFFFFFFFF, result, 16);
		result += __shfl_down_sync(0xFFFFFFFF, result, 8);
		result += __shfl_down_sync(0xFFFFFFFF, result, 4);
		result += __shfl_down_sync(0xFFFFFFFF, result, 2);
		result += __shfl_down_sync(0xFFFFFFFF, result, 1);
		if (laneID == 0) outVector[block.index[0]/* minRow */] = result; // Write result
		} else {
		/////////////////////////////////////* CSR VECTOR L */////////////
		/* Number of elements processed by previous warps */
		const Index offset = block.index[1]/* warpInRow / MAX_ELEM_PER_WARP;
		to = minID + (block.index[1]/* warpInRow / + 1) MAX_ELEM_PER_WARP;
		maxID = rowPointers[block.index[0]/* minRow */ + 1];
		if (to > maxID) to = maxID;
		for (i = minID + offset + laneID; i < to; i += warpSize)
		{
		result += values[i] * inVector[columnIndexes[i]];
		}

		/* Parallel reduction */
		result += __shfl_down_sync(0xFFFFFFFF, result, 16);
		result += __shfl_down_sync(0xFFFFFFFF, result, 8);
		result += __shfl_down_sync(0xFFFFFFFF, result, 4);
		result += __shfl_down_sync(0xFFFFFFFF, result, 2);
		result += __shfl_down_sync(0xFFFFFFFF, result, 1);
		if (laneID == 0) atomicAdd(&outVector[block.index[0]/* minRow */], result);
		}
		}

		template< typename Real,
		typename Index>
		@@ -1767,6 +1684,110 @@ void SpMVCSRMultiVectorPrepare( const Real *inVector,
		}
		}

		template< typename Real, typename Index >
		__device__ Real CSRFetch( const Index* columnIndexes, const Real* values, const Real* vector, const Index i )
		{
		return values[ i ] * vector[ columnIndexes[ i ] ];
		}

		template< typename Real,
		typename Index,
		int warpSize,
		int WARPS,
		int SHARED_PER_WARP,
		int MAX_ELEM_PER_WARP,
		typename Fetch >
		__global__
		void SpMVCSRAdaptive( const Real *inVector,
		Real *outVector,
		const Index* rowPointers,
		const Index* columnIndexes,
		const Real* values,
		const Block<Index> *blocks,
		Index blocksSize,
		Index gridID,
		const Fetch fetch )
		{
		__shared__ Real shared[WARPS][SHARED_PER_WARP];
		const Index index = ( ( gridID * MAX_X_DIM + blockIdx.x ) * blockDim.x ) + threadIdx.x;
		const Index blockIdx = index / warpSize;
		if( blockIdx >= blocksSize )
		return;

		Real result = 0.0;
		bool compute( true );
		const Index laneID = threadIdx.x & 31; // & is cheaper than %
		Block<Index> block = blocks[blockIdx];
		const Index minID = rowPointers[block.index[0]/* minRow */];
		Index i, to, maxID;

		if( block.byte[sizeof(Index) == 4 ? 7 : 15] & 0b1000000)
		{
		/////////////////////////////////////* CSR STREAM *//////////////
		const Index warpID = threadIdx.x / 32;
		maxID = minID + block.twobytes[sizeof(Index) == 4 ? 2 : 4];
		// ^-> maxID - minID

		// Stream data to shared memory
		for (i = laneID + minID; i < maxID; i += warpSize)
		//shared[warpID][i - minID] = fetch( i, compute ); //CSRFetch( columnIndexes, values, inVector, i );
		shared[warpID][i - minID] = values[i] * inVector[columnIndexes[i]];

		const Index maxRow = block.index[0] + // minRow
		(block.twobytes[sizeof(Index) == 4 ? 3 : 5] & 0x3FFF); // maxRow - minRow
		// Calculate result
		for (i = block.index[0]+ laneID; i < maxRow; i += warpSize) // block.index[0] -> minRow
		{
		to = rowPointers[i + 1] - minID; // end of preprocessed data
		result = 0;
		// Scalar reduction
		for (Index sharedID = rowPointers[i] - minID; sharedID < to; ++sharedID)
		result += shared[warpID][sharedID];

		outVector[i] = result; // Write result
		}
		} else if (block.byte[sizeof(Index) == 4 ? 7 : 15] & 0b10000000) {
		//////////////////////////////////// CSR VECTOR /////////////
		maxID = minID + // maxID - minID
		block.twobytes[sizeof(Index) == 4 ? 2 : 4];

		for (i = minID + laneID; i < maxID; i += warpSize)
		result += values[i] * inVector[columnIndexes[i]];

		// Parallel reduction
		result += __shfl_down_sync(0xFFFFFFFF, result, 16);
		result += __shfl_down_sync(0xFFFFFFFF, result, 8);
		result += __shfl_down_sync(0xFFFFFFFF, result, 4);
		result += __shfl_down_sync(0xFFFFFFFF, result, 2);
		result += __shfl_down_sync(0xFFFFFFFF, result, 1);
		// Write result
		if (laneID == 0) outVector[block.index[0]] = result; // block.index[0] -> minRow
		} else {
		//////////////////////////////////// CSR VECTOR L ////////////
		// Number of elements processed by previous warps
		const Index offset = block.index[1] * MAX_ELEM_PER_WARP;
		// ^ warpInRow
		to = minID + (block.index[1] + 1) * MAX_ELEM_PER_WARP;
		// ^ warpInRow
		maxID = rowPointers[block.index[0] + 1];
		// ^ minRow
		if (to > maxID) to = maxID;
		for (i = minID + offset + laneID; i < to; i += warpSize)
		{
		result += values[i] * inVector[columnIndexes[i]];
		}

		// Parallel reduction
		result += __shfl_down_sync(0xFFFFFFFF, result, 16);
		result += __shfl_down_sync(0xFFFFFFFF, result, 8);
		result += __shfl_down_sync(0xFFFFFFFF, result, 4);
		result += __shfl_down_sync(0xFFFFFFFF, result, 2);
		result += __shfl_down_sync(0xFFFFFFFF, result, 1);
		if (laneID == 0) atomicAdd(&outVector[block.index[0]], result);
		// ^ minRow
		}
		}

		template< typename Real,
		typename Index,
		typename Device,
		@@ -1778,18 +1799,27 @@ void SpMVCSRAdaptivePrepare( const Real *inVector,
		Index blocks;
		const Index threads = matrix.THREADS_ADAPTIVE;

		/* Fill blocks */
		const Index* columnIndexesData = matrix.getColumnIndexes().getData();
		const Real* valuesData = matrix.getValues().getData();
		auto fetch = [=] __cuda_callable__ ( Index globalIdx, bool& compute ) -> Real {
		return valuesData[ globalIdx ] * inVector[ columnIndexesData[ globalIdx ] ];
		};

		// Fill blocks
		size_t neededThreads = matrix.blocks.getSize() * warpSize; // one warp per block
		/* Execute kernels on device */
		for (Index grid = 0; neededThreads != 0; ++grid) {
		if (MAX_X_DIM * threads >= neededThreads) {
		// Execute kernels on device
		for( Index grid = 0; neededThreads != 0; ++grid )
		{
		if( MAX_X_DIM * threads >= neededThreads )
		{
		blocks = roundUpDivision(neededThreads, threads);
		neededThreads = 0;
		} else {
		}
		else
		{
		blocks = MAX_X_DIM;
		neededThreads -= MAX_X_DIM * threads;
		}

		SpMVCSRAdaptive< Real, Index, warpSize,
		matrix.WARPS,
		matrix.SHARED_PER_WARP,
		@@ -1802,8 +1832,8 @@ void SpMVCSRAdaptivePrepare( const Real *inVector,
		matrix.getValues().getData(),
		matrix.blocks.getData(),
		matrix.blocks.getSize() - 1, // last block shouldn't be used
		grid
		);
		grid,
		fetch );
		}
		}