diff --git a/src/Benchmarks/SpMV/spmv.h b/src/Benchmarks/SpMV/spmv.h index 45f715a5b3be31bbb9052a73db392ceeeff4425d..d8b20898310a1353cad1bced491410be620abc08 100644 --- a/src/Benchmarks/SpMV/spmv.h +++ b/src/Benchmarks/SpMV/spmv.h @@ -232,8 +232,10 @@ benchmarkSpMV( Benchmark& benchmark, resultcuSPARSEDeviceVector2 = deviceVector2; // Difference between GPU (curent format) and GPU-cuSPARSE results - Real cuSparseDifferenceAbsMax = resultDeviceVector2.differenceAbsMax( resultcuSPARSEDeviceVector2 ); - Real cuSparseDifferenceLpNorm = resultDeviceVector2.differenceLpNorm( resultcuSPARSEDeviceVector2, 1 ); + //Real cuSparseDifferenceAbsMax = resultDeviceVector2.differenceAbsMax( resultcuSPARSEDeviceVector2 ); + Real cuSparseDifferenceAbsMax = max( abs( resultDeviceVector2 - resultcuSPARSEDeviceVector2 ) ); + //Real cuSparseDifferenceLpNorm = resultDeviceVector2.differenceLpNorm( resultcuSPARSEDeviceVector2, 1 ); + Real cuSparseDifferenceLpNorm = lpNorm( resultDeviceVector2 - resultcuSPARSEDeviceVector2, 1 ); std::string GPUxGPUcuSparse_resultDifferenceAbsMax = "GPUxGPUcuSPARSE differenceAbsMax = " + std::to_string( cuSparseDifferenceAbsMax ); std::string GPUxGPUcuSparse_resultDifferenceLpNorm = "GPUxGPUcuSPARSE differenceLpNorm = " + std::to_string( cuSparseDifferenceLpNorm ); @@ -243,8 +245,10 @@ benchmarkSpMV( Benchmark& benchmark, // Difference between CPU and GPU results for the current format - Real differenceAbsMax = resultHostVector2.differenceAbsMax( resultDeviceVector2 ); - Real differenceLpNorm = resultHostVector2.differenceLpNorm( resultDeviceVector2, 1 ); + //Real differenceAbsMax = resultHostVector2.differenceAbsMax( resultDeviceVector2 ); + Real differenceAbsMax = max( abs( resultHostVector2 - resultDeviceVector2 ) ); + //Real differenceLpNorm = resultHostVector2.differenceLpNorm( resultDeviceVector2, 1 ); + Real differenceLpNorm = lpNorm( resultHostVector2 - resultDeviceVector2, 1 ); std::string CPUxGPU_resultDifferenceAbsMax = "CPUxGPU differenceAbsMax = " + std::to_string( differenceAbsMax ); std::string CPUxGPU_resultDifferenceLpNorm = "CPUxGPU differenceLpNorm = " + std::to_string( differenceLpNorm ); diff --git a/src/TNL/Matrices/AdEllpack_impl.h b/src/TNL/Matrices/AdEllpack_impl.h index bea4a1b4fb4d7d093c70456686f66b77d00aec7d..b7b97ff93550ef8c7289b749156e1fd5973e2f7d 100644 --- a/src/TNL/Matrices/AdEllpack_impl.h +++ b/src/TNL/Matrices/AdEllpack_impl.h @@ -1064,7 +1064,7 @@ void AdEllpack< Real, Device, Index >::spmvCuda4( const InVector& inVector, OutVector& outVector, const int gridIdx ) const { - IndexType globalIdx = ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; + IndexType globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; IndexType warpIdx = globalIdx >> 5; IndexType inWarpIdx = globalIdx & ( this->warpSize - 1 ); if( globalIdx >= this->reduceMap.getSize() ) @@ -1129,14 +1129,14 @@ void AdEllpack< Real, Device, Index >::spmvCuda8( const InVector& inVector, OutVector& outVector, const int gridIdx ) const { - IndexType globalIdx = ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; + IndexType globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; IndexType warpIdx = globalIdx >> 5; IndexType inWarpIdx = globalIdx & ( this->warpSize - 1 ); if( globalIdx >= this->reduceMap.getSize() ) return; const int blockSize = 128; - Real* temp = Devices::Cuda::getSharedMemory< Real >(); + Real* temp = Cuda::getSharedMemory< Real >(); __shared__ IndexType reduceMap[ blockSize ]; reduceMap[ threadIdx.x ] = this->reduceMap[ globalIdx ]; temp[ threadIdx.x ] = 0.0; @@ -1207,14 +1207,14 @@ void AdEllpack< Real, Device, Index >::spmvCuda16( const InVector& inVector, OutVector& outVector, const int gridIdx ) const { - IndexType globalIdx = ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; + IndexType globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; IndexType warpIdx = globalIdx >> 5; IndexType inWarpIdx = globalIdx & ( this->warpSize - 1 ); if( globalIdx >= this->reduceMap.getSize() ) return; const int blockSize = 128; - Real* temp = Devices::Cuda::getSharedMemory< Real >(); + Real* temp = Cuda::getSharedMemory< Real >(); __shared__ IndexType reduceMap[ blockSize ]; reduceMap[ threadIdx.x ] = this->reduceMap[ globalIdx ]; temp[ threadIdx.x ] = 0.0; @@ -1293,7 +1293,7 @@ void AdEllpack< Real, Device, Index >::spmvCuda32( const InVector& inVector, return; const int blockSize = 96; - Real* temp = Devices::Cuda::getSharedMemory< Real >(); + Real* temp = Cuda::getSharedMemory< Real >(); __shared__ IndexType reduceMap[ blockSize ]; reduceMap[ threadIdx.x ] = this->reduceMap[ globalIdx ]; temp[ threadIdx.x ] = 0.0; @@ -1441,9 +1441,9 @@ public: #ifdef HAVE_CUDA typedef AdEllpack< Real, Devices::Cuda, Index > Matrix; typedef typename Matrix::IndexType IndexType; - Matrix* kernel_this = Devices::Cuda::passToDevice( matrix ); - InVector* kernel_inVector = Devices::Cuda::passToDevice( inVector ); - OutVector* kernel_outVector = Devices::Cuda::passToDevice( outVector ); + Matrix* kernel_this = Cuda::passToDevice( matrix ); + InVector* kernel_inVector = Cuda::passToDevice( inVector ); + OutVector* kernel_outVector = Cuda::passToDevice( outVector ); TNL_CHECK_CUDA_DEVICE; if( matrix.totalLoad < 2 ) @@ -1510,16 +1510,16 @@ public: gridIdx ); } TNL_CHECK_CUDA_DEVICE; - Devices::Cuda::freeFromDevice( kernel_this ); - Devices::Cuda::freeFromDevice( kernel_inVector ); - Devices::Cuda::freeFromDevice( kernel_outVector ); + Cuda::freeFromDevice( kernel_this ); + Cuda::freeFromDevice( kernel_inVector ); + Cuda::freeFromDevice( kernel_outVector ); TNL_CHECK_CUDA_DEVICE; } else if( matrix.totalLoad < 16 ) { dim3 blockSize( 128 ), cudaGridSize( Cuda::getMaxGridSize() ); IndexType cudaBlocks = roundUpDivision( matrix.reduceMap.getSize(), blockSize.x ); - IndexType cudaGrids = roundUpDivision( cudaBlocks, Devices::Cuda::getMaxGridSize() ); + IndexType cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() ); for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) { if( gridIdx == cudaGrids - 1 ) @@ -1556,9 +1556,9 @@ public: gridIdx ); } TNL_CHECK_CUDA_DEVICE; - Devices::Cuda::freeFromDevice( kernel_this ); - Devices::Cuda::freeFromDevice( kernel_inVector ); - Devices::Cuda::freeFromDevice( kernel_outVector ); + Cuda::freeFromDevice( kernel_this ); + Cuda::freeFromDevice( kernel_inVector ); + Cuda::freeFromDevice( kernel_outVector ); TNL_CHECK_CUDA_DEVICE; } #endif // HAVE_CUDA diff --git a/src/TNL/Matrices/BiEllpack_impl.h b/src/TNL/Matrices/BiEllpack_impl.h index 2789c92ebd2fd0f60935aa3cb20b09487d63dd4f..c659b758e9cffe531a101baf8fe3cd812436fe2c 100644 --- a/src/TNL/Matrices/BiEllpack_impl.h +++ b/src/TNL/Matrices/BiEllpack_impl.h @@ -1406,7 +1406,7 @@ public: for( int gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) { if( gridIdx == cudaGrids - 1 ) - cudaGridSize.x = cudaBlocks % Devices::Cuda::getMaxGridSize(); + cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize(); performRowBubbleSortCuda< Real, Index > <<< cudaGridSize, cudaBlockSize >>> ( kernel_this, @@ -1436,7 +1436,7 @@ public: for( int gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) { if( gridIdx == cudaGrids - 1 ) - cudaGridSize.x = cudaBlocks % Devices::Cuda::getMaxGridSize(); + cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize(); computeColumnSizesCuda< Real, Index > <<< cudaGridSize, cudaBlockSize >>> ( kernel_this, diff --git a/src/TNL/Matrices/ChunkedEllpack_impl.h b/src/TNL/Matrices/ChunkedEllpack_impl.h index a77b4a7667ae8bc990f91f35d472e6952840208d..23ba2ed5e012e5f3e6cfdef43441031b28d79f5c 100644 --- a/src/TNL/Matrices/ChunkedEllpack_impl.h +++ b/src/TNL/Matrices/ChunkedEllpack_impl.h @@ -1230,8 +1230,8 @@ ChunkedEllpack< Real, Device, Index >::operator=( const ChunkedEllpack< Real2, D // host -> cuda if( std::is_same< Device, Devices::Cuda >::value ) { - typename ValuesVector::HostType tmpValues; - typename ColumnIndexesVector::HostType tmpColumnIndexes; + typename ValuesVector::Self< typename ValuesVector::RealType, Devices::Host > tmpValues; + typename ColumnIndexesVector::Self< typename ColumnIndexesVector::RealType, Devices::Host > tmpColumnIndexes; tmpValues.setLike( matrix.values ); tmpColumnIndexes.setLike( matrix.columnIndexes );