Commit 8254fd03 authored by Tomáš Oberhuber

Fixes after rebase - it works even with CUDA.

parent 496bacdd
+8 −4
@@ -232,8 +232,10 @@ benchmarkSpMV( Benchmark& benchmark,
    resultcuSPARSEDeviceVector2 = deviceVector2;
    
    // Difference between GPU (current format) and GPU-cuSPARSE results
-    Real cuSparseDifferenceAbsMax = resultDeviceVector2.differenceAbsMax( resultcuSPARSEDeviceVector2 );
-    Real cuSparseDifferenceLpNorm = resultDeviceVector2.differenceLpNorm( resultcuSPARSEDeviceVector2, 1 );
+    //Real cuSparseDifferenceAbsMax = resultDeviceVector2.differenceAbsMax( resultcuSPARSEDeviceVector2 );
+    Real cuSparseDifferenceAbsMax = max( abs( resultDeviceVector2 - resultcuSPARSEDeviceVector2 ) );
+    //Real cuSparseDifferenceLpNorm = resultDeviceVector2.differenceLpNorm( resultcuSPARSEDeviceVector2, 1 );
+    Real cuSparseDifferenceLpNorm = lpNorm( resultDeviceVector2 - resultcuSPARSEDeviceVector2, 1 );
    
    std::string GPUxGPUcuSparse_resultDifferenceAbsMax = "GPUxGPUcuSPARSE differenceAbsMax = " + std::to_string( cuSparseDifferenceAbsMax );
    std::string GPUxGPUcuSparse_resultDifferenceLpNorm = "GPUxGPUcuSPARSE differenceLpNorm = " + std::to_string( cuSparseDifferenceLpNorm );
@@ -243,8 +245,10 @@ benchmarkSpMV( Benchmark& benchmark,
    
    
    // Difference between CPU and GPU results for the current format
-    Real differenceAbsMax = resultHostVector2.differenceAbsMax( resultDeviceVector2 );
-    Real differenceLpNorm = resultHostVector2.differenceLpNorm( resultDeviceVector2, 1 );
+    //Real differenceAbsMax = resultHostVector2.differenceAbsMax( resultDeviceVector2 );
+    Real differenceAbsMax = max( abs( resultHostVector2 - resultDeviceVector2 ) );
+    //Real differenceLpNorm = resultHostVector2.differenceLpNorm( resultDeviceVector2, 1 );
+    Real differenceLpNorm = lpNorm( resultHostVector2 - resultDeviceVector2, 1 );
    
    std::string CPUxGPU_resultDifferenceAbsMax = "CPUxGPU differenceAbsMax = " + std::to_string( differenceAbsMax );
    std::string CPUxGPU_resultDifferenceLpNorm = "CPUxGPU differenceLpNorm = " + std::to_string( differenceLpNorm );
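
The two hunks above replace the removed differenceAbsMax / differenceLpNorm member calls with free-standing vector expressions, max( abs( a - b ) ) and lpNorm( a - b, 1 ), which TNL evaluates through its vector expression templates on the host or on the GPU. As a hedged, TNL-free sketch of what the two metrics compute, the helpers below reproduce them for plain std::vector; the function names are illustrative and are not part of the benchmark.

#include <algorithm>
#include <cassert>
#include <cmath>
#include <vector>

// Maximum absolute difference, max_i |a_i - b_i| -- the "differenceAbsMax" metric.
double differenceAbsMaxSketch( const std::vector< double >& a, const std::vector< double >& b )
{
   assert( a.size() == b.size() );
   double result = 0.0;
   for( std::size_t i = 0; i < a.size(); i++ )
      result = std::max( result, std::fabs( a[ i ] - b[ i ] ) );
   return result;
}

// l1 norm of the difference, sum_i |a_i - b_i| -- the "differenceLpNorm" metric with p = 1.
double differenceL1NormSketch( const std::vector< double >& a, const std::vector< double >& b )
{
   assert( a.size() == b.size() );
   double result = 0.0;
   for( std::size_t i = 0; i < a.size(); i++ )
      result += std::fabs( a[ i ] - b[ i ] );
   return result;
}

Both values should stay close to zero (up to rounding) when the tested format and the reference produce matching results.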
+16 −16
@@ -1064,7 +1064,7 @@ void AdEllpack< Real, Device, Index >::spmvCuda4( const InVector& inVector,
                                                           OutVector& outVector,
                                                           const int gridIdx ) const
{
-    IndexType globalIdx = ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+    IndexType globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
    IndexType warpIdx = globalIdx >> 5;
    IndexType inWarpIdx = globalIdx & ( this->warpSize - 1 );
    if( globalIdx >= this->reduceMap.getSize() )
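
The only change in the four spmvCuda* kernels of this and the following hunks is the namespace of the CUDA helpers (Devices::Cuda:: becomes Cuda::); the index arithmetic itself is unchanged. A minimal, self-contained CUDA sketch of that arithmetic follows, assuming a hypothetical kernel name and a maxGridSize parameter standing in for Cuda::getMaxGridSize().

__global__ void indexDecompositionSketch( int gridIdx, int maxGridSize, int size )
{
   // Global thread index across the consecutive grids launched for one call.
   const int globalIdx = ( gridIdx * maxGridSize + blockIdx.x ) * blockDim.x + threadIdx.x;
   if( globalIdx >= size )
      return;
   // Warp decomposition: 32 threads per warp, hence shift by 5 and mask with 31.
   const int warpIdx = globalIdx >> 5;     // index of the warp this thread belongs to
   const int inWarpIdx = globalIdx & 31;   // lane index within the warp
   (void) warpIdx;
   (void) inWarpIdx;
}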
@@ -1129,14 +1129,14 @@ void AdEllpack< Real, Device, Index >::spmvCuda8( const InVector& inVector,
                                                           OutVector& outVector,
                                                           const int gridIdx ) const
{
-    IndexType globalIdx = ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+    IndexType globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
    IndexType warpIdx = globalIdx >> 5;
    IndexType inWarpIdx = globalIdx & ( this->warpSize - 1 );
    if( globalIdx >= this->reduceMap.getSize() )
        return;

    const int blockSize = 128;
-    Real* temp = Devices::Cuda::getSharedMemory< Real >();
+    Real* temp = Cuda::getSharedMemory< Real >();
    __shared__ IndexType reduceMap[ blockSize ];
    reduceMap[ threadIdx.x ] = this->reduceMap[ globalIdx ];
    temp[ threadIdx.x ] = 0.0;
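
Besides the getMaxGridSize() rename, spmvCuda8 (and spmvCuda16 / spmvCuda32 below) now obtains its per-thread scratch buffer from Cuda::getSharedMemory< Real >() instead of Devices::Cuda::getSharedMemory< Real >(). A hedged sketch of the underlying idea, written against the plain CUDA runtime rather than TNL, is shown below; the kernel name is illustrative, and the dynamic buffer must be sized at launch via the third <<< >>> argument.

template< typename Real, typename Index >
__global__ void sharedMemorySketch( const Index* reduceMapGlobal, Index size )
{
   const Index globalIdx = blockIdx.x * blockDim.x + threadIdx.x;
   if( globalIdx >= size )
      return;

   // Dynamically sized shared memory; getSharedMemory< Real >() returns a typed
   // pointer into a buffer like this one.
   extern __shared__ char sharedBuffer[];
   Real* temp = reinterpret_cast< Real* >( sharedBuffer );

   // Statically sized shared array holding the reduction map for this block.
   constexpr int blockSize = 128;
   __shared__ Index reduceMap[ blockSize ];

   reduceMap[ threadIdx.x ] = reduceMapGlobal[ globalIdx ];
   temp[ threadIdx.x ] = 0.0;
   // ... the segmented per-warp reduction over temp, guided by reduceMap, follows ...
}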
@@ -1207,14 +1207,14 @@ void AdEllpack< Real, Device, Index >::spmvCuda16( const InVector& inVector,
                                                         OutVector& outVector,
                                                         const int gridIdx ) const
{
-    IndexType globalIdx = ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+    IndexType globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
    IndexType warpIdx = globalIdx >> 5;
    IndexType inWarpIdx = globalIdx & ( this->warpSize - 1 );
    if( globalIdx >= this->reduceMap.getSize() )
        return;

    const int blockSize = 128;
-    Real* temp = Devices::Cuda::getSharedMemory< Real >();
+    Real* temp = Cuda::getSharedMemory< Real >();
    __shared__ IndexType reduceMap[ blockSize ];
    reduceMap[ threadIdx.x ] = this->reduceMap[ globalIdx ];
    temp[ threadIdx.x ] = 0.0;
@@ -1293,7 +1293,7 @@ void AdEllpack< Real, Device, Index >::spmvCuda32( const InVector& inVector,
	return;

    const int blockSize = 96;
-    Real* temp = Devices::Cuda::getSharedMemory< Real >();
+    Real* temp = Cuda::getSharedMemory< Real >();
    __shared__ IndexType reduceMap[ blockSize ];
    reduceMap[ threadIdx.x ] = this->reduceMap[ globalIdx ];
    temp[ threadIdx.x ] = 0.0;
@@ -1441,9 +1441,9 @@ public:
#ifdef HAVE_CUDA
      typedef AdEllpack< Real, Devices::Cuda, Index > Matrix;
      typedef typename Matrix::IndexType IndexType;
-	   Matrix* kernel_this = Devices::Cuda::passToDevice( matrix );
-	   InVector* kernel_inVector = Devices::Cuda::passToDevice( inVector );
-	   OutVector* kernel_outVector = Devices::Cuda::passToDevice( outVector );
+	   Matrix* kernel_this = Cuda::passToDevice( matrix );
+	   InVector* kernel_inVector = Cuda::passToDevice( inVector );
+	   OutVector* kernel_outVector = Cuda::passToDevice( outVector );
      TNL_CHECK_CUDA_DEVICE;

      if( matrix.totalLoad < 2 )
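
Cuda::passToDevice( matrix ) copies the matrix object itself into device global memory so the kernel can dereference it as kernel_this; the matching Cuda::freeFromDevice calls further below release those copies. Conceptually this is a by-value copy through the CUDA runtime, roughly as in the following hedged sketch (plain CUDA runtime calls, not the TNL implementation).

#include <cuda_runtime.h>

// Copy a host object into device global memory and return the device pointer.
template< typename Object >
Object* passToDeviceSketch( const Object& object )
{
   Object* devicePointer = nullptr;
   cudaMalloc( (void**) &devicePointer, sizeof( Object ) );
   cudaMemcpy( devicePointer, &object, sizeof( Object ), cudaMemcpyHostToDevice );
   return devicePointer;
}

// Release the device-side copy created by passToDeviceSketch.
template< typename Object >
void freeFromDeviceSketch( Object* devicePointer )
{
   cudaFree( devicePointer );
}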
@@ -1510,16 +1510,16 @@ public:
                                                      gridIdx );
	    }
	    TNL_CHECK_CUDA_DEVICE;
-	    Devices::Cuda::freeFromDevice( kernel_this );
-	    Devices::Cuda::freeFromDevice( kernel_inVector );
-	    Devices::Cuda::freeFromDevice( kernel_outVector );
+	    Cuda::freeFromDevice( kernel_this );
+	    Cuda::freeFromDevice( kernel_inVector );
+	    Cuda::freeFromDevice( kernel_outVector );
	    TNL_CHECK_CUDA_DEVICE;
	}
	else if( matrix.totalLoad < 16 )
	{
	    dim3 blockSize( 128 ), cudaGridSize( Cuda::getMaxGridSize() );
	    IndexType cudaBlocks = roundUpDivision( matrix.reduceMap.getSize(), blockSize.x );
-	    IndexType cudaGrids = roundUpDivision( cudaBlocks, Devices::Cuda::getMaxGridSize() );
+	    IndexType cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() );
            for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ )
	    {
	        if( gridIdx == cudaGrids - 1 )
@@ -1556,9 +1556,9 @@ public:
                                                       gridIdx );
	    }
	    TNL_CHECK_CUDA_DEVICE;
-	    Devices::Cuda::freeFromDevice( kernel_this );
-	    Devices::Cuda::freeFromDevice( kernel_inVector );
-	    Devices::Cuda::freeFromDevice( kernel_outVector );
+	    Cuda::freeFromDevice( kernel_this );
+	    Cuda::freeFromDevice( kernel_inVector );
+	    Cuda::freeFromDevice( kernel_outVector );
	    TNL_CHECK_CUDA_DEVICE;
	}
#endif // HAVE_CUDA
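
All launch sites in this file follow the same multi-grid pattern: the number of blocks is computed with roundUpDivision, split into as many grids as getMaxGridSize() allows, and the last grid is shrunk to the remaining blocks. A self-contained host-side sketch of that loop is given below; someKernelSketch, maxGridSize and the helper roundUpDivisionSketch (ceiling division) are placeholders, not TNL API.

#include <cuda_runtime.h>

__global__ void someKernelSketch( int gridIdx ) { /* ... work for one grid ... */ }

inline int roundUpDivisionSketch( int a, int b ) { return ( a + b - 1 ) / b; }

void launchInGridsSketch( int size, int maxGridSize )
{
   dim3 blockSize( 128 ), cudaGridSize( maxGridSize );
   const int cudaBlocks = roundUpDivisionSketch( size, blockSize.x );
   const int cudaGrids = roundUpDivisionSketch( cudaBlocks, maxGridSize );
   for( int gridIdx = 0; gridIdx < cudaGrids; gridIdx++ )
   {
      // The last grid covers only the remaining blocks, mirroring the code above.
      if( gridIdx == cudaGrids - 1 )
         cudaGridSize.x = cudaBlocks % maxGridSize;
      someKernelSketch<<< cudaGridSize, blockSize >>>( gridIdx );
   }
}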
+2 −2
@@ -1406,7 +1406,7 @@ public:
		for( int gridIdx = 0; gridIdx < cudaGrids; gridIdx++ )
		{
		     if( gridIdx == cudaGrids - 1 )
-		         cudaGridSize.x = cudaBlocks % Devices::Cuda::getMaxGridSize();
+		         cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize();
		     performRowBubbleSortCuda< Real, Index >
		     	 	 	 	 	 	 <<< cudaGridSize, cudaBlockSize >>>
		                             ( kernel_this,
@@ -1436,7 +1436,7 @@ public:
		for( int gridIdx = 0; gridIdx < cudaGrids; gridIdx++ )
		{
		     if( gridIdx == cudaGrids - 1 )
-		         cudaGridSize.x = cudaBlocks % Devices::Cuda::getMaxGridSize();
+		         cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize();
		     computeColumnSizesCuda< Real, Index >
		     	 	 	 	 	   <<< cudaGridSize, cudaBlockSize >>>
		                           ( kernel_this,
+2 −2
@@ -1230,8 +1230,8 @@ ChunkedEllpack< Real, Device, Index >::operator=( const ChunkedEllpack< Real2, D
   
   // host -> cuda
   if( std::is_same< Device, Devices::Cuda >::value ) {
-       typename ValuesVector::HostType tmpValues;
-       typename ColumnIndexesVector::HostType tmpColumnIndexes;
+       typename ValuesVector::Self< typename ValuesVector::RealType, Devices::Host > tmpValues;
+       typename ColumnIndexesVector::Self< typename ColumnIndexesVector::RealType, Devices::Host > tmpColumnIndexes;
       tmpValues.setLike( matrix.values );
       tmpColumnIndexes.setLike( matrix.columnIndexes );
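
The ChunkedEllpack change replaces the removed HostType alias with the Self member template, which re-instantiates the vector type for Devices::Host while keeping the element type; the host temporaries then act as a staging area for the cross-device copy. The rebinding idiom itself looks roughly like the sketch below; the VectorSketch template and device tags are illustrative only, not TNL's containers.

#include <type_traits>

struct HostDeviceTag {};
struct CudaDeviceTag {};

template< typename Real, typename Device >
struct VectorSketch
{
   using RealType = Real;

   // Rebind: the same template instantiated with (possibly) different parameters.
   template< typename Real_, typename Device_ >
   using Self = VectorSketch< Real_, Device_ >;
};

// A CUDA-side vector type ...
using DeviceValues = VectorSketch< double, CudaDeviceTag >;
// ... rebound to the host device with the same element type, as in the diff above.
using HostValues = DeviceValues::Self< DeviceValues::RealType, HostDeviceTag >;

static_assert( std::is_same< HostValues, VectorSketch< double, HostDeviceTag > >::value,
               "Self keeps the element type and swaps the device" );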