From ce8f92fcbfa2f03ad6b241ffbcfe8e9eaa146a89 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Thu, 16 Dec 2021 11:06:19 +0100 Subject: [PATCH 1/3] Added missing TNL_CHECK_CUDA_DEVICE and fixed stream synchronization after segments CUDA kernels --- src/TNL/Algorithms/Segments/BiEllpackView.hpp | 4 ++-- src/TNL/Algorithms/Segments/ChunkedEllpackView.hpp | 2 ++ src/TNL/Algorithms/Segments/EllpackView.hpp | 6 ++++-- .../Algorithms/Segments/Kernels/CSRAdaptiveKernelView.hpp | 2 ++ src/TNL/Algorithms/Segments/Kernels/CSRHybridKernel.hpp | 2 ++ src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.hpp | 3 ++- src/TNL/Algorithms/Segments/Kernels/CSRVectorKernel.hpp | 5 ++++- 7 files changed, 18 insertions(+), 6 deletions(-) diff --git a/src/TNL/Algorithms/Segments/BiEllpackView.hpp b/src/TNL/Algorithms/Segments/BiEllpackView.hpp index 2014ae3dc..c45844fa2 100644 --- a/src/TNL/Algorithms/Segments/BiEllpackView.hpp +++ b/src/TNL/Algorithms/Segments/BiEllpackView.hpp @@ -428,9 +428,9 @@ reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction& detail::BiEllpackreduceSegmentsKernel< ViewType, IndexType, Fetch, Reduction, ResultKeeper, Real, BlockDim > <<< cudaGridSize, cudaBlockSize, sharedMemory >>> ( *this, gridIdx, first, last, fetch, reduction, keeper, zero ); - cudaThreadSynchronize(); - TNL_CHECK_CUDA_DEVICE; } + cudaStreamSynchronize(0); + TNL_CHECK_CUDA_DEVICE; #endif } } diff --git a/src/TNL/Algorithms/Segments/ChunkedEllpackView.hpp b/src/TNL/Algorithms/Segments/ChunkedEllpackView.hpp index 6133a8438..6f9ee48fe 100644 --- a/src/TNL/Algorithms/Segments/ChunkedEllpackView.hpp +++ b/src/TNL/Algorithms/Segments/ChunkedEllpackView.hpp @@ -460,6 +460,8 @@ reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction& <<< cudaGridSize, cudaBlockSize, sharedMemory >>> ( *this, gridIdx, first, last, fetch, reduction, keeper, zero ); } + cudaStreamSynchronize(0); + TNL_CHECK_CUDA_DEVICE; #endif } } diff --git a/src/TNL/Algorithms/Segments/EllpackView.hpp b/src/TNL/Algorithms/Segments/EllpackView.hpp index b5311d793..9985d9b9b 100644 --- a/src/TNL/Algorithms/Segments/EllpackView.hpp +++ b/src/TNL/Algorithms/Segments/EllpackView.hpp @@ -111,7 +111,8 @@ struct EllpackCudaReductionDispatcher dim3 blockSize( 256 ); dim3 gridSize( blocksCount ); EllpackCudaReductionKernelFull<<< gridSize, blockSize >>>( first, last, fetch, reduction, keeper, zero, segmentSize ); - cudaDeviceSynchronize(); + cudaStreamSynchronize(0); + TNL_CHECK_CUDA_DEVICE; #endif } }; @@ -133,7 +134,8 @@ struct EllpackCudaReductionDispatcher< Index, Fetch, Reduction, ResultKeeper, Re dim3 blockSize( 256 ); dim3 gridSize( blocksCount ); EllpackCudaReductionKernelCompact<<< gridSize, blockSize >>>( first, last, fetch, reduction, keeper, zero, segmentSize ); - cudaDeviceSynchronize(); + cudaStreamSynchronize(0); + TNL_CHECK_CUDA_DEVICE; #endif } }; diff --git a/src/TNL/Algorithms/Segments/Kernels/CSRAdaptiveKernelView.hpp b/src/TNL/Algorithms/Segments/Kernels/CSRAdaptiveKernelView.hpp index 9c495fd70..86307e754 100644 --- a/src/TNL/Algorithms/Segments/Kernels/CSRAdaptiveKernelView.hpp +++ b/src/TNL/Algorithms/Segments/Kernels/CSRAdaptiveKernelView.hpp @@ -272,6 +272,8 @@ struct CSRAdaptiveKernelreduceSegmentsDispatcher< Index, Device, Fetch, Reductio zero, args... ); } + cudaStreamSynchronize(0); + TNL_CHECK_CUDA_DEVICE; #endif } }; diff --git a/src/TNL/Algorithms/Segments/Kernels/CSRHybridKernel.hpp b/src/TNL/Algorithms/Segments/Kernels/CSRHybridKernel.hpp index 68198f995..6ecad2dda 100644 --- a/src/TNL/Algorithms/Segments/Kernels/CSRHybridKernel.hpp +++ b/src/TNL/Algorithms/Segments/Kernels/CSRHybridKernel.hpp @@ -297,6 +297,8 @@ reduceSegments( const OffsetsView& offsets, throw std::runtime_error( std::string( "Wrong value of threadsPerSegment: " ) + std::to_string( this->threadsPerSegment ) ); } } + cudaStreamSynchronize(0); + TNL_CHECK_CUDA_DEVICE; #endif } diff --git a/src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.hpp b/src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.hpp index 93d3e2800..37f237533 100644 --- a/src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.hpp +++ b/src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.hpp @@ -500,8 +500,9 @@ struct CSRLightKernelreduceSegmentsDispatcher< Index, Device, Fetch, Reduce, Kee grid, offsets, first, last, fetch, reduce, keep, zero ); }*/ } + cudaStreamSynchronize(0); + TNL_CHECK_CUDA_DEVICE; #endif - } }; diff --git a/src/TNL/Algorithms/Segments/Kernels/CSRVectorKernel.hpp b/src/TNL/Algorithms/Segments/Kernels/CSRVectorKernel.hpp index cf7d80af6..a72e3b951 100644 --- a/src/TNL/Algorithms/Segments/Kernels/CSRVectorKernel.hpp +++ b/src/TNL/Algorithms/Segments/Kernels/CSRVectorKernel.hpp @@ -152,9 +152,12 @@ reduceSegments( const OffsetsView& offsets, reduceSegmentsCSRKernelVector< OffsetsView, IndexType, Fetch, Reduction, ResultKeeper, Real, Args... > <<< gridSize, blockSize >>>( gridIdx.x, offsets, first, last, fetch, reduction, keeper, zero, args... ); - }; + } + cudaStreamSynchronize(0); + TNL_CHECK_CUDA_DEVICE; #endif } + } // namespace Segments } // namespace Algorithms } // namespace TNL -- GitLab From c4cc606a9e4abf23c446909b1ba5fab75a304e69 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Thu, 16 Dec 2021 12:51:58 +0100 Subject: [PATCH 2/3] Fixed reduction of segments to work even for empty matrices --- src/TNL/Algorithms/Segments/EllpackView.hpp | 4 ++++ src/TNL/Algorithms/Segments/Kernels/CSRHybridKernel.hpp | 2 ++ src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.hpp | 3 +++ src/TNL/Algorithms/Segments/Kernels/CSRVectorKernel.hpp | 3 +++ src/UnitTests/Matrices/SparseMatrixTest.hpp | 3 +++ 5 files changed, 15 insertions(+) diff --git a/src/TNL/Algorithms/Segments/EllpackView.hpp b/src/TNL/Algorithms/Segments/EllpackView.hpp index 9985d9b9b..ca418e691 100644 --- a/src/TNL/Algorithms/Segments/EllpackView.hpp +++ b/src/TNL/Algorithms/Segments/EllpackView.hpp @@ -105,6 +105,8 @@ struct EllpackCudaReductionDispatcher exec( Index first, Index last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Index segmentSize ) { #ifdef HAVE_CUDA + if( last <= first ) + return; const Index segmentsCount = last - first; const Index threadsCount = segmentsCount * 32; const Index blocksCount = Cuda::getNumberOfBlocks( threadsCount, 256 ); @@ -128,6 +130,8 @@ struct EllpackCudaReductionDispatcher< Index, Fetch, Reduction, ResultKeeper, Re exec( Index first, Index last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Index segmentSize ) { #ifdef HAVE_CUDA + if( last <= first ) + return; const Index segmentsCount = last - first; const Index threadsCount = segmentsCount * 32; const Index blocksCount = Cuda::getNumberOfBlocks( threadsCount, 256 ); diff --git a/src/TNL/Algorithms/Segments/Kernels/CSRHybridKernel.hpp b/src/TNL/Algorithms/Segments/Kernels/CSRHybridKernel.hpp index 6ecad2dda..1483c1881 100644 --- a/src/TNL/Algorithms/Segments/Kernels/CSRHybridKernel.hpp +++ b/src/TNL/Algorithms/Segments/Kernels/CSRHybridKernel.hpp @@ -245,6 +245,8 @@ reduceSegments( const OffsetsView& offsets, TNL_ASSERT_LE( this->threadsPerSegment, ThreadsInBlock, "" ); #ifdef HAVE_CUDA + if( last <= first ) + return; const size_t threadsCount = this->threadsPerSegment * ( last - first ); dim3 blocksCount, gridsCount, blockSize( ThreadsInBlock ); TNL::Cuda::setupThreads( blockSize, blocksCount, gridsCount, threadsCount ); diff --git a/src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.hpp b/src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.hpp index 37f237533..4c00a5410 100644 --- a/src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.hpp +++ b/src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.hpp @@ -424,6 +424,9 @@ struct CSRLightKernelreduceSegmentsDispatcher< Index, Device, Fetch, Reduce, Kee const Index threadsPerSegment ) { #ifdef HAVE_CUDA + if( last <= first ) + return; + const size_t threads = 128; Index blocks, groupSize; diff --git a/src/TNL/Algorithms/Segments/Kernels/CSRVectorKernel.hpp b/src/TNL/Algorithms/Segments/Kernels/CSRVectorKernel.hpp index a72e3b951..a070761f7 100644 --- a/src/TNL/Algorithms/Segments/Kernels/CSRVectorKernel.hpp +++ b/src/TNL/Algorithms/Segments/Kernels/CSRVectorKernel.hpp @@ -140,6 +140,9 @@ reduceSegments( const OffsetsView& offsets, Args... args ) { #ifdef HAVE_CUDA + if( last <= first ) + return; + const Index warpsCount = last - first; const size_t threadsCount = warpsCount * TNL::Cuda::getWarpSize(); dim3 blocksCount, gridsCount, blockSize( 256 ); diff --git a/src/UnitTests/Matrices/SparseMatrixTest.hpp b/src/UnitTests/Matrices/SparseMatrixTest.hpp index 1716b0ab8..444da8666 100644 --- a/src/UnitTests/Matrices/SparseMatrixTest.hpp +++ b/src/UnitTests/Matrices/SparseMatrixTest.hpp @@ -243,6 +243,9 @@ void test_SetDimensions() EXPECT_EQ( m.getRows(), 9 ); EXPECT_EQ( m.getColumns(), 8 ); + + // test empty matrix + m.setDimensions( 0, 0 ); } template< typename Matrix > -- GitLab From ade448eef0494e2aeffec6871125b45586a91d1b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Thu, 16 Dec 2021 13:23:02 +0100 Subject: [PATCH 3/3] Fixed setting threads per segment in CSR kernels --- .../Segments/Kernels/CSRHybridKernel.hpp | 3 +++ .../Segments/Kernels/CSRLightKernel.hpp | 27 +++++++++---------- 2 files changed, 16 insertions(+), 14 deletions(-) diff --git a/src/TNL/Algorithms/Segments/Kernels/CSRHybridKernel.hpp b/src/TNL/Algorithms/Segments/Kernels/CSRHybridKernel.hpp index 1483c1881..7feb50fef 100644 --- a/src/TNL/Algorithms/Segments/Kernels/CSRHybridKernel.hpp +++ b/src/TNL/Algorithms/Segments/Kernels/CSRHybridKernel.hpp @@ -175,7 +175,10 @@ void CSRHybridKernel< Index, Device, ThreadsInBlock >:: init( const Offsets& offsets ) { + TNL_ASSERT_GT( offsets.getSize(), 0, "offsets size must be strictly positive" ); const Index segmentsCount = offsets.getSize() - 1; + if( segmentsCount <= 0 ) + return; const Index elementsInSegment = std::ceil( ( double ) offsets.getElement( segmentsCount ) / ( double ) segmentsCount ); this->threadsPerSegment = TNL::min( std::pow( 2, std::ceil( std::log2( elementsInSegment ) ) ), ThreadsInBlock ); //TNL::Cuda::getWarpSize() ); TNL_ASSERT_GE( threadsPerSegment, 0, "" ); diff --git a/src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.hpp b/src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.hpp index 4c00a5410..0ceecc371 100644 --- a/src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.hpp +++ b/src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.hpp @@ -517,21 +517,24 @@ void CSRLightKernel< Index, Device >:: init( const Offsets& offsets ) { + TNL_ASSERT_GT( offsets.getSize(), 0, "offsets size must be strictly positive" ); const Index segmentsCount = offsets.getSize() - 1; + if( segmentsCount <= 0 ) + return; if( this->getThreadsMapping() == CSRLightAutomaticThreads ) { const Index elementsInSegment = roundUpDivision( offsets.getElement( segmentsCount ), segmentsCount ); // non zeroes per row if( elementsInSegment <= 2 ) - this->threadsPerSegment = 2; + setThreadsPerSegment( 2 ); else if( elementsInSegment <= 4 ) - this->threadsPerSegment = 4; + setThreadsPerSegment( 4 ); else if( elementsInSegment <= 8 ) - this->threadsPerSegment = 8; + setThreadsPerSegment( 8 ); else if( elementsInSegment <= 16 ) - this->threadsPerSegment = 16; + setThreadsPerSegment( 16 ); else //if (nnz <= 2 * matrix.MAX_ELEMENTS_PER_WARP) - this->threadsPerSegment = 32; // CSR Vector + setThreadsPerSegment( 32 ); // CSR Vector //else // threadsPerSegment = roundUpDivision(nnz, matrix.MAX_ELEMENTS_PER_WARP) * 32; // CSR MultiVector } @@ -540,22 +543,18 @@ init( const Offsets& offsets ) { const Index elementsInSegment = roundUpDivision( offsets.getElement( segmentsCount ), segmentsCount ); // non zeroes per row if( elementsInSegment <= 2 ) - this->threadsPerSegment = 2; + setThreadsPerSegment( 2 ); else if( elementsInSegment <= 4 ) - this->threadsPerSegment = 4; + setThreadsPerSegment( 4 ); else if( elementsInSegment <= 8 ) - this->threadsPerSegment = 8; + setThreadsPerSegment( 8 ); else if( elementsInSegment <= 16 ) - this->threadsPerSegment = 16; + setThreadsPerSegment( 16 ); else //if (nnz <= 2 * matrix.MAX_ELEMENTS_PER_WARP) - this->threadsPerSegment = 32; // CSR Vector + setThreadsPerSegment( 32 ); // CSR Vector //else // threadsPerSegment = roundUpDivision(nnz, matrix.MAX_ELEMENTS_PER_WARP) * 32; // CSR MultiVector } - - TNL_ASSERT_GE( this->threadsPerSegment, 0, "" ); - TNL_ASSERT_LE( this->threadsPerSegment, 33, "" ); - } template< typename Index, -- GitLab