Loading src/Benchmarks/SpMV/spmv.h +8 −4 Original line number Diff line number Diff line Loading @@ -232,8 +232,10 @@ benchmarkSpMV( Benchmark& benchmark, resultcuSPARSEDeviceVector2 = deviceVector2; // Difference between GPU (curent format) and GPU-cuSPARSE results Real cuSparseDifferenceAbsMax = resultDeviceVector2.differenceAbsMax( resultcuSPARSEDeviceVector2 ); Real cuSparseDifferenceLpNorm = resultDeviceVector2.differenceLpNorm( resultcuSPARSEDeviceVector2, 1 ); //Real cuSparseDifferenceAbsMax = resultDeviceVector2.differenceAbsMax( resultcuSPARSEDeviceVector2 ); Real cuSparseDifferenceAbsMax = max( abs( resultDeviceVector2 - resultcuSPARSEDeviceVector2 ) ); //Real cuSparseDifferenceLpNorm = resultDeviceVector2.differenceLpNorm( resultcuSPARSEDeviceVector2, 1 ); Real cuSparseDifferenceLpNorm = lpNorm( resultDeviceVector2 - resultcuSPARSEDeviceVector2, 1 ); std::string GPUxGPUcuSparse_resultDifferenceAbsMax = "GPUxGPUcuSPARSE differenceAbsMax = " + std::to_string( cuSparseDifferenceAbsMax ); std::string GPUxGPUcuSparse_resultDifferenceLpNorm = "GPUxGPUcuSPARSE differenceLpNorm = " + std::to_string( cuSparseDifferenceLpNorm ); Loading @@ -243,8 +245,10 @@ benchmarkSpMV( Benchmark& benchmark, // Difference between CPU and GPU results for the current format Real differenceAbsMax = resultHostVector2.differenceAbsMax( resultDeviceVector2 ); Real differenceLpNorm = resultHostVector2.differenceLpNorm( resultDeviceVector2, 1 ); //Real differenceAbsMax = resultHostVector2.differenceAbsMax( resultDeviceVector2 ); Real differenceAbsMax = max( abs( resultHostVector2 - resultDeviceVector2 ) ); //Real differenceLpNorm = resultHostVector2.differenceLpNorm( resultDeviceVector2, 1 ); Real differenceLpNorm = lpNorm( resultHostVector2 - resultDeviceVector2, 1 ); std::string CPUxGPU_resultDifferenceAbsMax = "CPUxGPU differenceAbsMax = " + std::to_string( differenceAbsMax ); std::string CPUxGPU_resultDifferenceLpNorm = "CPUxGPU differenceLpNorm = " + std::to_string( differenceLpNorm ); Loading src/TNL/Matrices/AdEllpack_impl.h +16 −16 Original line number Diff line number Diff line Loading @@ -1064,7 +1064,7 @@ void AdEllpack< Real, Device, Index >::spmvCuda4( const InVector& inVector, OutVector& outVector, const int gridIdx ) const { IndexType globalIdx = ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; IndexType globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; IndexType warpIdx = globalIdx >> 5; IndexType inWarpIdx = globalIdx & ( this->warpSize - 1 ); if( globalIdx >= this->reduceMap.getSize() ) Loading Loading @@ -1129,14 +1129,14 @@ void AdEllpack< Real, Device, Index >::spmvCuda8( const InVector& inVector, OutVector& outVector, const int gridIdx ) const { IndexType globalIdx = ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; IndexType globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; IndexType warpIdx = globalIdx >> 5; IndexType inWarpIdx = globalIdx & ( this->warpSize - 1 ); if( globalIdx >= this->reduceMap.getSize() ) return; const int blockSize = 128; Real* temp = Devices::Cuda::getSharedMemory< Real >(); Real* temp = Cuda::getSharedMemory< Real >(); __shared__ IndexType reduceMap[ blockSize ]; reduceMap[ threadIdx.x ] = this->reduceMap[ globalIdx ]; temp[ threadIdx.x ] = 0.0; Loading Loading @@ -1207,14 +1207,14 @@ void AdEllpack< Real, Device, Index >::spmvCuda16( const InVector& inVector, OutVector& outVector, const int gridIdx ) const { IndexType globalIdx = ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; IndexType globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; IndexType warpIdx = globalIdx >> 5; IndexType inWarpIdx = globalIdx & ( this->warpSize - 1 ); if( globalIdx >= this->reduceMap.getSize() ) return; const int blockSize = 128; Real* temp = Devices::Cuda::getSharedMemory< Real >(); Real* temp = Cuda::getSharedMemory< Real >(); __shared__ IndexType reduceMap[ blockSize ]; reduceMap[ threadIdx.x ] = this->reduceMap[ globalIdx ]; temp[ threadIdx.x ] = 0.0; Loading Loading @@ -1293,7 +1293,7 @@ void AdEllpack< Real, Device, Index >::spmvCuda32( const InVector& inVector, return; const int blockSize = 96; Real* temp = Devices::Cuda::getSharedMemory< Real >(); Real* temp = Cuda::getSharedMemory< Real >(); __shared__ IndexType reduceMap[ blockSize ]; reduceMap[ threadIdx.x ] = this->reduceMap[ globalIdx ]; temp[ threadIdx.x ] = 0.0; Loading Loading @@ -1441,9 +1441,9 @@ public: #ifdef HAVE_CUDA typedef AdEllpack< Real, Devices::Cuda, Index > Matrix; typedef typename Matrix::IndexType IndexType; Matrix* kernel_this = Devices::Cuda::passToDevice( matrix ); InVector* kernel_inVector = Devices::Cuda::passToDevice( inVector ); OutVector* kernel_outVector = Devices::Cuda::passToDevice( outVector ); Matrix* kernel_this = Cuda::passToDevice( matrix ); InVector* kernel_inVector = Cuda::passToDevice( inVector ); OutVector* kernel_outVector = Cuda::passToDevice( outVector ); TNL_CHECK_CUDA_DEVICE; if( matrix.totalLoad < 2 ) Loading Loading @@ -1510,16 +1510,16 @@ public: gridIdx ); } TNL_CHECK_CUDA_DEVICE; Devices::Cuda::freeFromDevice( kernel_this ); Devices::Cuda::freeFromDevice( kernel_inVector ); Devices::Cuda::freeFromDevice( kernel_outVector ); Cuda::freeFromDevice( kernel_this ); Cuda::freeFromDevice( kernel_inVector ); Cuda::freeFromDevice( kernel_outVector ); TNL_CHECK_CUDA_DEVICE; } else if( matrix.totalLoad < 16 ) { dim3 blockSize( 128 ), cudaGridSize( Cuda::getMaxGridSize() ); IndexType cudaBlocks = roundUpDivision( matrix.reduceMap.getSize(), blockSize.x ); IndexType cudaGrids = roundUpDivision( cudaBlocks, Devices::Cuda::getMaxGridSize() ); IndexType cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() ); for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) { if( gridIdx == cudaGrids - 1 ) Loading Loading @@ -1556,9 +1556,9 @@ public: gridIdx ); } TNL_CHECK_CUDA_DEVICE; Devices::Cuda::freeFromDevice( kernel_this ); Devices::Cuda::freeFromDevice( kernel_inVector ); Devices::Cuda::freeFromDevice( kernel_outVector ); Cuda::freeFromDevice( kernel_this ); Cuda::freeFromDevice( kernel_inVector ); Cuda::freeFromDevice( kernel_outVector ); TNL_CHECK_CUDA_DEVICE; } #endif // HAVE_CUDA Loading src/TNL/Matrices/BiEllpack_impl.h +2 −2 Original line number Diff line number Diff line Loading @@ -1406,7 +1406,7 @@ public: for( int gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) { if( gridIdx == cudaGrids - 1 ) cudaGridSize.x = cudaBlocks % Devices::Cuda::getMaxGridSize(); cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize(); performRowBubbleSortCuda< Real, Index > <<< cudaGridSize, cudaBlockSize >>> ( kernel_this, Loading Loading @@ -1436,7 +1436,7 @@ public: for( int gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) { if( gridIdx == cudaGrids - 1 ) cudaGridSize.x = cudaBlocks % Devices::Cuda::getMaxGridSize(); cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize(); computeColumnSizesCuda< Real, Index > <<< cudaGridSize, cudaBlockSize >>> ( kernel_this, Loading src/TNL/Matrices/ChunkedEllpack_impl.h +2 −2 Original line number Diff line number Diff line Loading @@ -1230,8 +1230,8 @@ ChunkedEllpack< Real, Device, Index >::operator=( const ChunkedEllpack< Real2, D // host -> cuda if( std::is_same< Device, Devices::Cuda >::value ) { typename ValuesVector::HostType tmpValues; typename ColumnIndexesVector::HostType tmpColumnIndexes; typename ValuesVector::Self< typename ValuesVector::RealType, Devices::Host > tmpValues; typename ColumnIndexesVector::Self< typename ColumnIndexesVector::RealType, Devices::Host > tmpColumnIndexes; tmpValues.setLike( matrix.values ); tmpColumnIndexes.setLike( matrix.columnIndexes ); Loading Loading
src/Benchmarks/SpMV/spmv.h +8 −4 Original line number Diff line number Diff line Loading @@ -232,8 +232,10 @@ benchmarkSpMV( Benchmark& benchmark, resultcuSPARSEDeviceVector2 = deviceVector2; // Difference between GPU (curent format) and GPU-cuSPARSE results Real cuSparseDifferenceAbsMax = resultDeviceVector2.differenceAbsMax( resultcuSPARSEDeviceVector2 ); Real cuSparseDifferenceLpNorm = resultDeviceVector2.differenceLpNorm( resultcuSPARSEDeviceVector2, 1 ); //Real cuSparseDifferenceAbsMax = resultDeviceVector2.differenceAbsMax( resultcuSPARSEDeviceVector2 ); Real cuSparseDifferenceAbsMax = max( abs( resultDeviceVector2 - resultcuSPARSEDeviceVector2 ) ); //Real cuSparseDifferenceLpNorm = resultDeviceVector2.differenceLpNorm( resultcuSPARSEDeviceVector2, 1 ); Real cuSparseDifferenceLpNorm = lpNorm( resultDeviceVector2 - resultcuSPARSEDeviceVector2, 1 ); std::string GPUxGPUcuSparse_resultDifferenceAbsMax = "GPUxGPUcuSPARSE differenceAbsMax = " + std::to_string( cuSparseDifferenceAbsMax ); std::string GPUxGPUcuSparse_resultDifferenceLpNorm = "GPUxGPUcuSPARSE differenceLpNorm = " + std::to_string( cuSparseDifferenceLpNorm ); Loading @@ -243,8 +245,10 @@ benchmarkSpMV( Benchmark& benchmark, // Difference between CPU and GPU results for the current format Real differenceAbsMax = resultHostVector2.differenceAbsMax( resultDeviceVector2 ); Real differenceLpNorm = resultHostVector2.differenceLpNorm( resultDeviceVector2, 1 ); //Real differenceAbsMax = resultHostVector2.differenceAbsMax( resultDeviceVector2 ); Real differenceAbsMax = max( abs( resultHostVector2 - resultDeviceVector2 ) ); //Real differenceLpNorm = resultHostVector2.differenceLpNorm( resultDeviceVector2, 1 ); Real differenceLpNorm = lpNorm( resultHostVector2 - resultDeviceVector2, 1 ); std::string CPUxGPU_resultDifferenceAbsMax = "CPUxGPU differenceAbsMax = " + std::to_string( differenceAbsMax ); std::string CPUxGPU_resultDifferenceLpNorm = "CPUxGPU differenceLpNorm = " + std::to_string( differenceLpNorm ); Loading
src/TNL/Matrices/AdEllpack_impl.h +16 −16 Original line number Diff line number Diff line Loading @@ -1064,7 +1064,7 @@ void AdEllpack< Real, Device, Index >::spmvCuda4( const InVector& inVector, OutVector& outVector, const int gridIdx ) const { IndexType globalIdx = ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; IndexType globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; IndexType warpIdx = globalIdx >> 5; IndexType inWarpIdx = globalIdx & ( this->warpSize - 1 ); if( globalIdx >= this->reduceMap.getSize() ) Loading Loading @@ -1129,14 +1129,14 @@ void AdEllpack< Real, Device, Index >::spmvCuda8( const InVector& inVector, OutVector& outVector, const int gridIdx ) const { IndexType globalIdx = ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; IndexType globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; IndexType warpIdx = globalIdx >> 5; IndexType inWarpIdx = globalIdx & ( this->warpSize - 1 ); if( globalIdx >= this->reduceMap.getSize() ) return; const int blockSize = 128; Real* temp = Devices::Cuda::getSharedMemory< Real >(); Real* temp = Cuda::getSharedMemory< Real >(); __shared__ IndexType reduceMap[ blockSize ]; reduceMap[ threadIdx.x ] = this->reduceMap[ globalIdx ]; temp[ threadIdx.x ] = 0.0; Loading Loading @@ -1207,14 +1207,14 @@ void AdEllpack< Real, Device, Index >::spmvCuda16( const InVector& inVector, OutVector& outVector, const int gridIdx ) const { IndexType globalIdx = ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; IndexType globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; IndexType warpIdx = globalIdx >> 5; IndexType inWarpIdx = globalIdx & ( this->warpSize - 1 ); if( globalIdx >= this->reduceMap.getSize() ) return; const int blockSize = 128; Real* temp = Devices::Cuda::getSharedMemory< Real >(); Real* temp = Cuda::getSharedMemory< Real >(); __shared__ IndexType reduceMap[ blockSize ]; reduceMap[ threadIdx.x ] = this->reduceMap[ globalIdx ]; temp[ threadIdx.x ] = 0.0; Loading Loading @@ -1293,7 +1293,7 @@ void AdEllpack< Real, Device, Index >::spmvCuda32( const InVector& inVector, return; const int blockSize = 96; Real* temp = Devices::Cuda::getSharedMemory< Real >(); Real* temp = Cuda::getSharedMemory< Real >(); __shared__ IndexType reduceMap[ blockSize ]; reduceMap[ threadIdx.x ] = this->reduceMap[ globalIdx ]; temp[ threadIdx.x ] = 0.0; Loading Loading @@ -1441,9 +1441,9 @@ public: #ifdef HAVE_CUDA typedef AdEllpack< Real, Devices::Cuda, Index > Matrix; typedef typename Matrix::IndexType IndexType; Matrix* kernel_this = Devices::Cuda::passToDevice( matrix ); InVector* kernel_inVector = Devices::Cuda::passToDevice( inVector ); OutVector* kernel_outVector = Devices::Cuda::passToDevice( outVector ); Matrix* kernel_this = Cuda::passToDevice( matrix ); InVector* kernel_inVector = Cuda::passToDevice( inVector ); OutVector* kernel_outVector = Cuda::passToDevice( outVector ); TNL_CHECK_CUDA_DEVICE; if( matrix.totalLoad < 2 ) Loading Loading @@ -1510,16 +1510,16 @@ public: gridIdx ); } TNL_CHECK_CUDA_DEVICE; Devices::Cuda::freeFromDevice( kernel_this ); Devices::Cuda::freeFromDevice( kernel_inVector ); Devices::Cuda::freeFromDevice( kernel_outVector ); Cuda::freeFromDevice( kernel_this ); Cuda::freeFromDevice( kernel_inVector ); Cuda::freeFromDevice( kernel_outVector ); TNL_CHECK_CUDA_DEVICE; } else if( matrix.totalLoad < 16 ) { dim3 blockSize( 128 ), cudaGridSize( Cuda::getMaxGridSize() ); IndexType cudaBlocks = roundUpDivision( matrix.reduceMap.getSize(), blockSize.x ); IndexType cudaGrids = roundUpDivision( cudaBlocks, Devices::Cuda::getMaxGridSize() ); IndexType cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() ); for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) { if( gridIdx == cudaGrids - 1 ) Loading Loading @@ -1556,9 +1556,9 @@ public: gridIdx ); } TNL_CHECK_CUDA_DEVICE; Devices::Cuda::freeFromDevice( kernel_this ); Devices::Cuda::freeFromDevice( kernel_inVector ); Devices::Cuda::freeFromDevice( kernel_outVector ); Cuda::freeFromDevice( kernel_this ); Cuda::freeFromDevice( kernel_inVector ); Cuda::freeFromDevice( kernel_outVector ); TNL_CHECK_CUDA_DEVICE; } #endif // HAVE_CUDA Loading
src/TNL/Matrices/BiEllpack_impl.h +2 −2 Original line number Diff line number Diff line Loading @@ -1406,7 +1406,7 @@ public: for( int gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) { if( gridIdx == cudaGrids - 1 ) cudaGridSize.x = cudaBlocks % Devices::Cuda::getMaxGridSize(); cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize(); performRowBubbleSortCuda< Real, Index > <<< cudaGridSize, cudaBlockSize >>> ( kernel_this, Loading Loading @@ -1436,7 +1436,7 @@ public: for( int gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) { if( gridIdx == cudaGrids - 1 ) cudaGridSize.x = cudaBlocks % Devices::Cuda::getMaxGridSize(); cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize(); computeColumnSizesCuda< Real, Index > <<< cudaGridSize, cudaBlockSize >>> ( kernel_this, Loading
src/TNL/Matrices/ChunkedEllpack_impl.h +2 −2 Original line number Diff line number Diff line Loading @@ -1230,8 +1230,8 @@ ChunkedEllpack< Real, Device, Index >::operator=( const ChunkedEllpack< Real2, D // host -> cuda if( std::is_same< Device, Devices::Cuda >::value ) { typename ValuesVector::HostType tmpValues; typename ColumnIndexesVector::HostType tmpColumnIndexes; typename ValuesVector::Self< typename ValuesVector::RealType, Devices::Host > tmpValues; typename ColumnIndexesVector::Self< typename ColumnIndexesVector::RealType, Devices::Host > tmpColumnIndexes; tmpValues.setLike( matrix.values ); tmpColumnIndexes.setLike( matrix.columnIndexes ); Loading