Added CSR Adaptive and CSR Stream (f620b0ee) · Commits · TNL / tnl-dev

src/TNL/Matrices/Legacy/CSR.h

+2 −2

Original line number	Diff line number	Diff line
		@@ -230,7 +230,7 @@ public:
		__device__
		void vectorProductCuda( const InVector& inVector,
		OutVector& outVector,
		int gridIdx, unsigned *blocks, size_t size ) const;
		int gridIdx, int *blocks, size_t size ) const;

		template< typename InVector,
		typename OutVector,
		@@ -247,7 +247,7 @@ public:
		void spmvCSRAdaptive( const InVector& inVector,
		OutVector& outVector,
		int gridIdx,
		unsigned *blocks,
		int *blocks,
		size_t blocks_size) const;
		#endif

src/TNL/Matrices/Legacy/CSR_impl.h

+161 −116

Original line number	Diff line number	Diff line
		@@ -732,6 +732,35 @@ void CSR< Real, Device, Index >::spmvCudaLightSpmv( const InVector& inVector,
		}
		}

		/* template< typename Real,
		typename Device,
		typename Index,
		typename InVector,
		int warpSize >
		__global__
		void spmvCSRVectorHelper( const InVector& inVector,
		Real *out,
		size_t from,
		size_t to,
		size_t perWarp)
		{
		const size_t index = blockIdx.x * blockDim.x + threadIdx.x;
		const size_t warpID = index / warpSize;
		const size_t laneID = index % warpSize;
		const size_t minID = from + warpID * perWarp;
		size_t maxID = from + (warpID + 1) * perWarp;
		if (minID >= to) return;
		if (maxID >= to ) maxID = to;

		Real result = 0.0;
		for (IndexType i = minID + laneID; i < maxID; i += warpSize) {
		const IndexType column = this->columnIndexes[i];
		if (column >= this->getColumns())
		continue;
		result += this->values[i] * inVector[column];
		}
		atomicAdd(out, result);
		} */

		template< typename Real,
		typename Device,
		@@ -743,59 +772,58 @@ __device__
		void CSR< Real, Device, Index >::spmvCSRAdaptive( const InVector& inVector,
		OutVector& outVector,
		int gridIdx,
		unsigned *blocks,
		int *blocks,
		size_t blocks_size) const
		{
		constexpr IndexType SHARED = 1024; /* TODO: delete */
		/* Configuration ---------------------------------------------------*/
		constexpr size_t SHARED = 49152/sizeof(float);
		constexpr size_t SHARED_PER_WARP = SHARED / warpSize;
		constexpr size_t MAX_PER_WARP = 65536;
		constexpr size_t ELEMENTS_PER_WARP = 1024;
		constexpr size_t THREADS_PER_BLOCK = 1024;
		constexpr size_t WARPS_PER_BLOCK = THREADS_PER_BLOCK / warpSize;
		//--------------------------------------------------------------------
		const IndexType index = blockIdx.x * blockDim.x + threadIdx.x;
		__shared__ unsigned blockCnt;
		__shared__ float shared_res[SHARED];
		float result = 0.0;
		if (index == 0) blockCnt = 0; // Init shared variable
		__syncthreads();

		const IndexType laneID = index % warpSize;
		IndexType blockIdx;

		while (true) {
		/* Get row number */
		if (laneID == 0) blockIdx = atomicAdd(&blockCnt, 1);
		/* Propagate row number in warp */
		blockIdx = __shfl_sync((unsigned)(warpSize - 1), blockIdx, 0);
		IndexType blockIdx = index / warpSize;
		__shared__ float shared_res[SHARED];
		Real result = 0.0;
		if (blockIdx >= blocks_size - 1)
		return;
		/* Number of elements */
		const IndexType minRow = blocks[blockIdx];
		const IndexType maxRow = blocks[blockIdx + 1];
		const IndexType minID = this->rowPointers[minRow];
		const IndexType maxID = this->rowPointers[maxRow];
		const IndexType elements = maxID - minID;
		result = 0.0;
		/* rows per block more than 1 */
		if ((maxRow - minRow) > 1) {
		/////////////////////////////////////* CSR STREAM *//////////////
		/* Copy and calculate elements from global to shared memory, coalesced */
		const IndexType offset = threadIdx.x / warpSize * SHARED_PER_WARP;
		for (IndexType i = laneID; i < elements; i += warpSize) {
		const IndexType elementIdx = i + minID;
		const IndexType column = this->columnIndexes[elementIdx];
		if (column >= this->getColumns())
		break;
		shared_res[i] = this->values[elementIdx] * inVector[column];
		continue;
		shared_res[i + offset] = this->values[elementIdx] * inVector[column];
		}

		const IndexType row = minRow + laneID;
		if (row >= maxRow - 1)
		continue;
		if (row >= maxRow)
		return;
		/* Calculate result */
		const IndexType to = this->rowPointers[row + 1] - minID;
		for (IndexType i = this->rowPointers[row] - minID; i < to; ++i) {
		result += shared_res[i];
		result += shared_res[i + offset];
		}
		outVector[row] = result; // Write result
		} else {
		} else if (elements <= MAX_PER_WARP) {
		/////////////////////////////////////* CSR VECTOR *//////////////
		for (IndexType i = minID + laneID; i < maxID; i += warpSize) {
		const IndexType column = this->columnIndexes[i];
		IndexType column = this->columnIndexes[i];
		if (column >= this->getColumns())
		break;

		result += this->values[i] * inVector[column];
		}
		/* Reduction */
		@@ -805,7 +833,18 @@ void CSR< Real, Device, Index >::spmvCSRAdaptive( const InVector& inVector,
		result += __shfl_down_sync((unsigned)(warpSize - 1), result, 2);
		result += __shfl_down_sync((unsigned)(warpSize - 1), result, 1);
		if (laneID == 0) outVector[minRow] = result; // Write result
		}
		} else {
		/////////////////////////////////////* CSR VECTOR LONG *//////////////
		const size_t warps = (elements - ELEMENTS_PER_WARP) / ELEMENTS_PER_WARP + 1;
		const size_t blocks = warps <= WARPS_PER_BLOCK ? 1 : warps / WARPS_PER_BLOCK + 1;
		const size_t threads_per_block = blocks == 1 ? warps * warpSize : WARPS_PER_BLOCK * warpSize;
		// spmvCSRVectorHelper<InVector, warpSize> <<<blocks, threads_per_block>>>(
		// inVector,
		// &outVector[minRow],
		// (size_t)(minID + ELEMENTS_PER_WARP),
		// (size_t)maxID,
		// (size_t)ELEMENTS_PER_WARP
		// );
		}
		}

		@@ -862,7 +901,7 @@ __device__
		void CSR< Real, Device, Index >::vectorProductCuda( const InVector& inVector,
		OutVector& outVector,
		int gridIdx,
		unsigned *blocks, size_t size ) const
		int *blocks, size_t size ) const
		{
		spmvCSRAdaptive< InVector, OutVector, warpSize >( inVector, outVector, gridIdx, blocks, size );
		return;
		@@ -939,7 +978,7 @@ __global__ void CSRVectorProductCudaKernel( const CSR< Real, Devices::Cuda, Inde
		const InVector* inVector,
		OutVector* outVector,
		int gridIdx,
		unsigned *blocks, size_t size)
		int *blocks, size_t size)
		{
		typedef CSR< Real, Devices::Cuda, Index > Matrix;
		static_assert( std::is_same< typename Matrix::DeviceType, Devices::Cuda >::value, "" );
		@@ -965,7 +1004,7 @@ template< typename Real,
		void CSRVectorProductCuda( const CSR< Real, Devices::Cuda, Index >& matrix,
		const InVector& inVector,
		OutVector& outVector,
		unsigned *blocks,
		int *blocks,
		size_t size )
		{
		#ifdef HAVE_CUDA
		@@ -974,7 +1013,10 @@ void CSRVectorProductCuda( const CSR< Real, Devices::Cuda, Index >& matrix,
		Matrix* kernel_this = Cuda::passToDevice( matrix );
		InVector* kernel_inVector = Cuda::passToDevice( inVector );
		OutVector* kernel_outVector = Cuda::passToDevice( outVector );
		unsigned kernelBlocks = Cuda::passToDevice( blocks );
		int *kernelBlocks;
		cudaMalloc((void *)&kernelBlocks, sizeof(int) size);
		cudaMemcpy(kernelBlocks, blocks, size * sizeof(int), cudaMemcpyHostToDevice);

		TNL_CHECK_CUDA_DEVICE;
		dim3 cudaBlockSize( 256 ), cudaGridSize( Cuda::getMaxGridSize() );
		const IndexType cudaBlocks = roundUpDivision( matrix.getRows(), cudaBlockSize.x );
		@@ -984,48 +1026,51 @@ void CSRVectorProductCuda( const CSR< Real, Devices::Cuda, Index >& matrix,
		if( gridIdx == cudaGrids - 1 )
		cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize();
		const int sharedMemory = cudaBlockSize.x * sizeof( Real );
		if( matrix.getCudaWarpSize() == 32 )
		// const int threads = cudaBlockSize.x;
		if( matrix.getCudaWarpSize() == 32 ) {
		// printf("BL %d BLSIZE %d\n", (int)cudaBlocks, (int)threads);
		CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 32 >
		<<< cudaGridSize, cudaBlockSize, sharedMemory >>>
		( kernel_this,
		kernel_inVector,
		kernel_outVector,
		gridIdx, kernelBlocks, size );
		if( matrix.getCudaWarpSize() == 16 )
		CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 16 >
		<<< cudaGridSize, cudaBlockSize, sharedMemory >>>
		( kernel_this,
		kernel_inVector,
		kernel_outVector,
		gridIdx, kernelBlocks, size );
		if( matrix.getCudaWarpSize() == 8 )
		CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 8 >
		<<< cudaGridSize, cudaBlockSize, sharedMemory >>>
		( kernel_this,
		kernel_inVector,
		kernel_outVector,
		gridIdx, kernelBlocks, size );
		if( matrix.getCudaWarpSize() == 4 )
		CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 4 >
		<<< cudaGridSize, cudaBlockSize, sharedMemory >>>
		( kernel_this,
		kernel_inVector,
		kernel_outVector,
		gridIdx, kernelBlocks, size );
		if( matrix.getCudaWarpSize() == 2 )
		CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 2 >
		<<< cudaGridSize, cudaBlockSize, sharedMemory >>>
		( kernel_this,
		kernel_inVector,
		kernel_outVector,
		gridIdx, kernelBlocks, size );
		if( matrix.getCudaWarpSize() == 1 )
		CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 1 >
		<<< cudaGridSize, cudaBlockSize, sharedMemory >>>
		<<< 2, 1024 >>>
		( kernel_this,
		kernel_inVector,
		kernel_outVector,
		gridIdx, kernelBlocks, size );
		}
		// if( matrix.getCudaWarpSize() == 16 )
		// CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 16 >
		// <<< cudaGridSize, cudaBlockSize, sharedMemory >>>
		// ( kernel_this,
		// kernel_inVector,
		// kernel_outVector,
		// gridIdx, kernelBlocks, size );
		// if( matrix.getCudaWarpSize() == 8 )
		// CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 8 >
		// <<< cudaGridSize, cudaBlockSize, sharedMemory >>>
		// ( kernel_this,
		// kernel_inVector,
		// kernel_outVector,
		// gridIdx, kernelBlocks, size );
		// if( matrix.getCudaWarpSize() == 4 )
		// CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 4 >
		// <<< cudaGridSize, cudaBlockSize, sharedMemory >>>
		// ( kernel_this,
		// kernel_inVector,
		// kernel_outVector,
		// gridIdx, kernelBlocks, size );
		// if( matrix.getCudaWarpSize() == 2 )
		// CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 2 >
		// <<< cudaGridSize, cudaBlockSize, sharedMemory >>>
		// ( kernel_this,
		// kernel_inVector,
		// kernel_outVector,
		// gridIdx, kernelBlocks, size );
		// if( matrix.getCudaWarpSize() == 1 )
		// CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 1 >
		// <<< cudaGridSize, cudaBlockSize, sharedMemory >>>
		// ( kernel_this,
		// kernel_inVector,
		// kernel_outVector,
		// gridIdx, kernelBlocks, size );

		}
		TNL_CHECK_CUDA_DEVICE;
		@@ -1145,32 +1190,32 @@ class CSRDeviceDependentCode< Devices::Cuda >
		inVector.getData(),
		outVector.getData() );
		#else
		std::vector<unsigned> inBlock;
		constexpr int SHARED = 49152/sizeof(float);
		constexpr int SHARED_PER_WARP = SHARED / 32;
		std::vector<int> inBlock;
		inBlock.push_back(0);
		unsigned sum = 0;
		for (size_t i = 1; i < matrix.getRowPointers().getSize(); ++i) {
		printf("PTR %d\n", (int)matrix.getRowPointers().getElement(i));
		}
		for (size_t i = 1; i < matrix.getRowPointers().getSize() - 1; ++i) {
		size_t elements = matrix.getRowPointers().getElement(i + 1) -
		matrix.getRowPointers().getElement(i);
		if (elements > 1024) {
		if (sum == 0) {
		size_t sum = 0;
		size_t i;
		int prev_i = 0;
		for (i = 1; i < matrix.getRowPointers().getSize() - 1; ++i) {
		size_t elements = matrix.getRowPointers().getElement(i) -
		matrix.getRowPointers().getElement(i - 1);
		sum += elements;
		if (sum > SHARED_PER_WARP) {
		if (i - prev_i == 1) {
		inBlock.push_back(i);
		}
		else {
		} else {
		inBlock.push_back(i - 1);
		inBlock.push_back(i);
		--i;
		}
		sum = 0;
		}
		sum += elements;

		if (sum <= 1024)
		prev_i = i;
		continue;
		else {
		inBlock.push_back(i - 1);
		sum = elements;
		}
		if (i - prev_i == 32) {
		inBlock.push_back(i);
		prev_i = i;
		sum = 0;
		}
		}
		inBlock.push_back(matrix.getRowPointers().getSize() - 1);

src/UnitTests/Matrices/Legacy/SparseMatrixTest.hpp

+66 −74

Original line number	Diff line number	Diff line
		@@ -1259,9 +1259,11 @@ void test_VectorProductLarger()
		Matrix m;
		m.reset();
		m.setDimensions( m_rows, m_cols );
		typename Matrix::CompressedRowLengthsVector rowLengths;
		rowLengths.setSize( m_rows );
		rowLengths.setValue( 20 );
		typename Matrix::CompressedRowLengthsVector rowLengths(
		{11, 2, 4, 0, 6, 4, 1, 2, 20, 18, 6, 20, 10, 0, 20, 10, 2, 20, 10, 12}
		);
		// rowLengths.setSize( m_rows );
		// rowLengths.setValue( 20 );
		m.setCompressedRowLengths( rowLengths );

		/* 0 */
		@@ -1389,93 +1391,83 @@ void test_VectorProductGiant()
		using RealType = typename Matrix::RealType;
		using DeviceType = typename Matrix::DeviceType;
		using IndexType = typename Matrix::IndexType;
		const IndexType N = 200; // Should be a multiple of 10
		/*
		* Sets up the following NxN sparse matrix:
		*
		* / 1 0 1 0 ... 1 0 \
		* \| 0 0 0 0 ... 0 0 \|
		* \| 0 2 0 2 ... 0 2 \|
		* \| 0 0 0 0 ... 0 0 \|
		* \| 3 -3 3 -3 ... 3 -3 \|
		* \| 0 0 0 0 ... 0 0 \|
		* \| 5 5 5 5 ... 5 5 \|
		* \| 0 0 0 0 ... 0 0 \|
		* \| 0 0 0 0 ... 0 0 \|
		* \| 2 0 0 0 ... 0 0 \| <- the only 2 in the row
		*
		* ..................
		*/

		const IndexType m_rows = N;
		const IndexType m_cols = N;
		IndexType m_rows = 100;
		IndexType m_cols = 100;

		Matrix m;
		m.reset();
		m.setDimensions( m_rows, m_cols );
		typename Matrix::CompressedRowLengthsVector rowLengths;
		rowLengths.setSize( m_rows );
		rowLengths.setValue( m_cols );
		typename Matrix::CompressedRowLengthsVector rowLengths(
		{
		100, 100, 100, 100, 100, 100, 100, 100, 100, 100,
		100, 100, 100, 100, 100, 100, 100, 100, 100, 100,
		100, 100, 100, 100, 100, 100, 100, 100, 100, 100,
		100, 100, 100, 100, 100, 100, 100, 100, 100, 100,
		100, 100, 100, 100, 100, 100, 100, 100, 100, 100,
		100, 100, 100, 100, 100, 100, 100, 100, 100, 100,
		100, 100, 100, 100, 100, 100, 100, 100, 100, 100,
		100, 100, 100, 100, 100, 100, 100, 100, 100, 100,
		100, 100, 100, 100, 100, 100, 100, 100, 100, 100,
		100, 100, 100, 100, 100, 100, 100, 100, 100, 100
		}
		);

		m.setCompressedRowLengths( rowLengths );

		for ( IndexType row = 0; row < m_rows; ++row ) {
		if (row % 10 == 0) {
		for ( IndexType col = 0; col < m_cols; ++col ) {
		if (col % 2)
		m.setElement( row, col, 1 );
		}
		}
		else if (row % 10 == 2) {
		for ( IndexType col = 0; col < m_cols; ++col ) {
		if (col % 2 == 1)
		m.setElement( row, col, 2 );
		}
		}
		else if (row % 10 == 4) {
		for ( IndexType col = 0; col < m_cols; ++col ) {
		if (col % 2)
		m.setElement( row, col, 3 );
		else
		m.setElement( row, col, -3 );
		}
		}
		else if (row % 10 == 6) {
		for ( IndexType col = 0; col < m_cols; ++col ) {
		m.setElement( row, col, 5 );
		}
		}
		else if (row % 10 == 9) {
		m.setElement( row, 0, 2 );
		}
		}
		for (int i = 0; i < m_rows; ++i)
		for (int j = 0; j < m_cols; ++j)
		m.setElement( i, j, i + 1 );

		using VectorType = TNL::Containers::Vector< RealType, DeviceType, IndexType >;

		VectorType inVector;
		inVector.setSize( N );
		for( IndexType i = 0; i < inVector.getSize(); i++ )
		inVector.setElement( i, 2 );
		inVector.setSize( m_rows );
		for( IndexType i = 0; i < inVector.getSize(); ++i )
		inVector.setElement( i, 1 );

		VectorType outVector;
		outVector.setSize( N );
		for( IndexType j = 0; j < outVector.getSize(); j++ )
		outVector.setElement( j, 0 );

		outVector.setSize( m_rows );
		for( IndexType i = 0; i < outVector.getSize(); ++i )
		outVector.setElement( i, 0 );

		m.vectorProduct( inVector, outVector);

		for ( IndexType i = 0; i < outVector.getSize(); ++i ) {
		if ( i % 10 == 0 )
		EXPECT_EQ( outVector.getElement( i ), 200 /* (N) */ );
		else if ( i % 10 == 2 )
		EXPECT_EQ( outVector.getElement( i ), 400 /* (2 * N) */ );
		else if ( i % 10 == 6 )
		EXPECT_EQ( outVector.getElement( i ), 2000 /* (10 * N) */ );
		else if ( i % 10 == 9 )
		EXPECT_EQ( outVector.getElement( i ), 4 );
		else
		EXPECT_EQ( outVector.getElement( i ), 0 );
		for (int i = 0; i < m_rows; ++i)
		EXPECT_EQ( outVector.getElement( i ), (i + 1) * 100 );

		//-----------------------------------------------------

		m_rows = 2;
		m_cols = 1000;

		m.reset();
		m.setDimensions( m_rows, m_cols );
		typename Matrix::CompressedRowLengthsVector rowLengths2(
		{
		1000, 1000
		}
		);

		m.setCompressedRowLengths( rowLengths2 );

		for (int i = 0; i < m_rows; ++i)
		for (int j = 0; j < m_cols; ++j)
		m.setElement( i, j, i + 1 );

		VectorType inVector2;
		inVector2.setSize( m_cols );
		for( IndexType i = 0; i < inVector2.getSize(); i++ )
		inVector2.setElement( i, 1 );

		VectorType outVector2;
		outVector2.setSize( m_rows );
		for( IndexType i = 0; i < outVector2.getSize(); ++i )
		outVector2.setElement( i, 0 );
		m.vectorProduct( inVector2, outVector2);

		for (int i = 0; i < m_rows; ++i)
		EXPECT_EQ( outVector2.getElement( i ), (i + 1) * 1000 );
		}

		template< typename Matrix >