diff --git a/src/TNL/Algorithms/Segments/BiEllpackView.hpp b/src/TNL/Algorithms/Segments/BiEllpackView.hpp index 2014ae3dc983dbf025a51a971deaa9dca90d1ef4..c45844fa23ef90b3957d4039e300a4793124adc1 100644 --- a/src/TNL/Algorithms/Segments/BiEllpackView.hpp +++ b/src/TNL/Algorithms/Segments/BiEllpackView.hpp @@ -428,9 +428,9 @@ reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction& detail::BiEllpackreduceSegmentsKernel< ViewType, IndexType, Fetch, Reduction, ResultKeeper, Real, BlockDim > <<< cudaGridSize, cudaBlockSize, sharedMemory >>> ( *this, gridIdx, first, last, fetch, reduction, keeper, zero ); - cudaThreadSynchronize(); - TNL_CHECK_CUDA_DEVICE; } + cudaStreamSynchronize(0); + TNL_CHECK_CUDA_DEVICE; #endif } } diff --git a/src/TNL/Algorithms/Segments/ChunkedEllpackView.hpp b/src/TNL/Algorithms/Segments/ChunkedEllpackView.hpp index 6133a843844b089bc60a16da7181cc8149c14c2a..6f9ee48fe0fcf140ae341b28bb1661efd8c5c543 100644 --- a/src/TNL/Algorithms/Segments/ChunkedEllpackView.hpp +++ b/src/TNL/Algorithms/Segments/ChunkedEllpackView.hpp @@ -460,6 +460,8 @@ reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction& <<< cudaGridSize, cudaBlockSize, sharedMemory >>> ( *this, gridIdx, first, last, fetch, reduction, keeper, zero ); } + cudaStreamSynchronize(0); + TNL_CHECK_CUDA_DEVICE; #endif } } diff --git a/src/TNL/Algorithms/Segments/EllpackView.hpp b/src/TNL/Algorithms/Segments/EllpackView.hpp index b5311d7939e1d49d826b6ba76bffb11af23aff0f..ca418e691f8066424ef0e47dc5227eeeb15962e9 100644 --- a/src/TNL/Algorithms/Segments/EllpackView.hpp +++ b/src/TNL/Algorithms/Segments/EllpackView.hpp @@ -105,13 +105,16 @@ struct EllpackCudaReductionDispatcher exec( Index first, Index last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Index segmentSize ) { #ifdef HAVE_CUDA + if( last <= first ) + return; const Index segmentsCount = last - first; const Index threadsCount = segmentsCount * 32; const Index blocksCount = Cuda::getNumberOfBlocks( threadsCount, 256 ); dim3 blockSize( 256 ); dim3 gridSize( blocksCount ); EllpackCudaReductionKernelFull<<< gridSize, blockSize >>>( first, last, fetch, reduction, keeper, zero, segmentSize ); - cudaDeviceSynchronize(); + cudaStreamSynchronize(0); + TNL_CHECK_CUDA_DEVICE; #endif } }; @@ -127,13 +130,16 @@ struct EllpackCudaReductionDispatcher< Index, Fetch, Reduction, ResultKeeper, Re exec( Index first, Index last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Index segmentSize ) { #ifdef HAVE_CUDA + if( last <= first ) + return; const Index segmentsCount = last - first; const Index threadsCount = segmentsCount * 32; const Index blocksCount = Cuda::getNumberOfBlocks( threadsCount, 256 ); dim3 blockSize( 256 ); dim3 gridSize( blocksCount ); EllpackCudaReductionKernelCompact<<< gridSize, blockSize >>>( first, last, fetch, reduction, keeper, zero, segmentSize ); - cudaDeviceSynchronize(); + cudaStreamSynchronize(0); + TNL_CHECK_CUDA_DEVICE; #endif } }; diff --git a/src/TNL/Algorithms/Segments/Kernels/CSRAdaptiveKernelView.hpp b/src/TNL/Algorithms/Segments/Kernels/CSRAdaptiveKernelView.hpp index 9c495fd70ad15b17db8219c69cc0638b0c42d7fe..86307e7540cddc5e0bf5acb55eac5e6ab08de722 100644 --- a/src/TNL/Algorithms/Segments/Kernels/CSRAdaptiveKernelView.hpp +++ b/src/TNL/Algorithms/Segments/Kernels/CSRAdaptiveKernelView.hpp @@ -272,6 +272,8 @@ struct CSRAdaptiveKernelreduceSegmentsDispatcher< Index, Device, Fetch, Reductio zero, args... ); } + cudaStreamSynchronize(0); + TNL_CHECK_CUDA_DEVICE; #endif } }; diff --git a/src/TNL/Algorithms/Segments/Kernels/CSRHybridKernel.hpp b/src/TNL/Algorithms/Segments/Kernels/CSRHybridKernel.hpp index 68198f995f4a8daecd67b8e05881432aacc4372d..7feb50feff212944ce6606451ae8db503206e810 100644 --- a/src/TNL/Algorithms/Segments/Kernels/CSRHybridKernel.hpp +++ b/src/TNL/Algorithms/Segments/Kernels/CSRHybridKernel.hpp @@ -175,7 +175,10 @@ void CSRHybridKernel< Index, Device, ThreadsInBlock >:: init( const Offsets& offsets ) { + TNL_ASSERT_GT( offsets.getSize(), 0, "offsets size must be strictly positive" ); const Index segmentsCount = offsets.getSize() - 1; + if( segmentsCount <= 0 ) + return; const Index elementsInSegment = std::ceil( ( double ) offsets.getElement( segmentsCount ) / ( double ) segmentsCount ); this->threadsPerSegment = TNL::min( std::pow( 2, std::ceil( std::log2( elementsInSegment ) ) ), ThreadsInBlock ); //TNL::Cuda::getWarpSize() ); TNL_ASSERT_GE( threadsPerSegment, 0, "" ); @@ -245,6 +248,8 @@ reduceSegments( const OffsetsView& offsets, TNL_ASSERT_LE( this->threadsPerSegment, ThreadsInBlock, "" ); #ifdef HAVE_CUDA + if( last <= first ) + return; const size_t threadsCount = this->threadsPerSegment * ( last - first ); dim3 blocksCount, gridsCount, blockSize( ThreadsInBlock ); TNL::Cuda::setupThreads( blockSize, blocksCount, gridsCount, threadsCount ); @@ -297,6 +302,8 @@ reduceSegments( const OffsetsView& offsets, throw std::runtime_error( std::string( "Wrong value of threadsPerSegment: " ) + std::to_string( this->threadsPerSegment ) ); } } + cudaStreamSynchronize(0); + TNL_CHECK_CUDA_DEVICE; #endif } diff --git a/src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.hpp b/src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.hpp index 93d3e2800e9d6b6b9277306458ea50c0c897a4e0..0ceecc371642a82e23daa248be6a152c880198e3 100644 --- a/src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.hpp +++ b/src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.hpp @@ -424,6 +424,9 @@ struct CSRLightKernelreduceSegmentsDispatcher< Index, Device, Fetch, Reduce, Kee const Index threadsPerSegment ) { #ifdef HAVE_CUDA + if( last <= first ) + return; + const size_t threads = 128; Index blocks, groupSize; @@ -500,8 +503,9 @@ struct CSRLightKernelreduceSegmentsDispatcher< Index, Device, Fetch, Reduce, Kee grid, offsets, first, last, fetch, reduce, keep, zero ); }*/ } + cudaStreamSynchronize(0); + TNL_CHECK_CUDA_DEVICE; #endif - } }; @@ -513,21 +517,24 @@ void CSRLightKernel< Index, Device >:: init( const Offsets& offsets ) { + TNL_ASSERT_GT( offsets.getSize(), 0, "offsets size must be strictly positive" ); const Index segmentsCount = offsets.getSize() - 1; + if( segmentsCount <= 0 ) + return; if( this->getThreadsMapping() == CSRLightAutomaticThreads ) { const Index elementsInSegment = roundUpDivision( offsets.getElement( segmentsCount ), segmentsCount ); // non zeroes per row if( elementsInSegment <= 2 ) - this->threadsPerSegment = 2; + setThreadsPerSegment( 2 ); else if( elementsInSegment <= 4 ) - this->threadsPerSegment = 4; + setThreadsPerSegment( 4 ); else if( elementsInSegment <= 8 ) - this->threadsPerSegment = 8; + setThreadsPerSegment( 8 ); else if( elementsInSegment <= 16 ) - this->threadsPerSegment = 16; + setThreadsPerSegment( 16 ); else //if (nnz <= 2 * matrix.MAX_ELEMENTS_PER_WARP) - this->threadsPerSegment = 32; // CSR Vector + setThreadsPerSegment( 32 ); // CSR Vector //else // threadsPerSegment = roundUpDivision(nnz, matrix.MAX_ELEMENTS_PER_WARP) * 32; // CSR MultiVector } @@ -536,22 +543,18 @@ init( const Offsets& offsets ) { const Index elementsInSegment = roundUpDivision( offsets.getElement( segmentsCount ), segmentsCount ); // non zeroes per row if( elementsInSegment <= 2 ) - this->threadsPerSegment = 2; + setThreadsPerSegment( 2 ); else if( elementsInSegment <= 4 ) - this->threadsPerSegment = 4; + setThreadsPerSegment( 4 ); else if( elementsInSegment <= 8 ) - this->threadsPerSegment = 8; + setThreadsPerSegment( 8 ); else if( elementsInSegment <= 16 ) - this->threadsPerSegment = 16; + setThreadsPerSegment( 16 ); else //if (nnz <= 2 * matrix.MAX_ELEMENTS_PER_WARP) - this->threadsPerSegment = 32; // CSR Vector + setThreadsPerSegment( 32 ); // CSR Vector //else // threadsPerSegment = roundUpDivision(nnz, matrix.MAX_ELEMENTS_PER_WARP) * 32; // CSR MultiVector } - - TNL_ASSERT_GE( this->threadsPerSegment, 0, "" ); - TNL_ASSERT_LE( this->threadsPerSegment, 33, "" ); - } template< typename Index, diff --git a/src/TNL/Algorithms/Segments/Kernels/CSRVectorKernel.hpp b/src/TNL/Algorithms/Segments/Kernels/CSRVectorKernel.hpp index cf7d80af65b8bbefcbba0490b5658ae905fc9e6d..a070761f752e0e53372b3ef350743b42d0b4b59a 100644 --- a/src/TNL/Algorithms/Segments/Kernels/CSRVectorKernel.hpp +++ b/src/TNL/Algorithms/Segments/Kernels/CSRVectorKernel.hpp @@ -140,6 +140,9 @@ reduceSegments( const OffsetsView& offsets, Args... args ) { #ifdef HAVE_CUDA + if( last <= first ) + return; + const Index warpsCount = last - first; const size_t threadsCount = warpsCount * TNL::Cuda::getWarpSize(); dim3 blocksCount, gridsCount, blockSize( 256 ); @@ -152,9 +155,12 @@ reduceSegments( const OffsetsView& offsets, reduceSegmentsCSRKernelVector< OffsetsView, IndexType, Fetch, Reduction, ResultKeeper, Real, Args... > <<< gridSize, blockSize >>>( gridIdx.x, offsets, first, last, fetch, reduction, keeper, zero, args... ); - }; + } + cudaStreamSynchronize(0); + TNL_CHECK_CUDA_DEVICE; #endif } + } // namespace Segments } // namespace Algorithms } // namespace TNL diff --git a/src/UnitTests/Matrices/SparseMatrixTest.hpp b/src/UnitTests/Matrices/SparseMatrixTest.hpp index 1716b0ab894d506fe8eed2f13eb29821cb2266cf..444da8666fbbb2d4a6615d59f8266f9d855fd157 100644 --- a/src/UnitTests/Matrices/SparseMatrixTest.hpp +++ b/src/UnitTests/Matrices/SparseMatrixTest.hpp @@ -243,6 +243,9 @@ void test_SetDimensions() EXPECT_EQ( m.getRows(), 9 ); EXPECT_EQ( m.getColumns(), 8 ); + + // test empty matrix + m.setDimensions( 0, 0 ); } template< typename Matrix >