From ce8f92fcbfa2f03ad6b241ffbcfe8e9eaa146a89 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Thu, 16 Dec 2021 11:06:19 +0100
Subject: [PATCH 1/3] Added missing TNL_CHECK_CUDA_DEVICE and fixed stream
 synchronization after segments CUDA kernels

---
 src/TNL/Algorithms/Segments/BiEllpackView.hpp               | 4 ++--
 src/TNL/Algorithms/Segments/ChunkedEllpackView.hpp          | 2 ++
 src/TNL/Algorithms/Segments/EllpackView.hpp                 | 6 ++++--
 .../Algorithms/Segments/Kernels/CSRAdaptiveKernelView.hpp   | 2 ++
 src/TNL/Algorithms/Segments/Kernels/CSRHybridKernel.hpp     | 2 ++
 src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.hpp      | 3 ++-
 src/TNL/Algorithms/Segments/Kernels/CSRVectorKernel.hpp     | 5 ++++-
 7 files changed, 18 insertions(+), 6 deletions(-)

diff --git a/src/TNL/Algorithms/Segments/BiEllpackView.hpp b/src/TNL/Algorithms/Segments/BiEllpackView.hpp
index 2014ae3dc..c45844fa2 100644
--- a/src/TNL/Algorithms/Segments/BiEllpackView.hpp
+++ b/src/TNL/Algorithms/Segments/BiEllpackView.hpp
@@ -428,9 +428,9 @@ reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction&
          detail::BiEllpackreduceSegmentsKernel< ViewType, IndexType, Fetch, Reduction, ResultKeeper, Real, BlockDim  >
             <<< cudaGridSize, cudaBlockSize, sharedMemory >>>
             ( *this, gridIdx, first, last, fetch, reduction, keeper, zero );
-         cudaThreadSynchronize();
-         TNL_CHECK_CUDA_DEVICE;
       }
+      cudaStreamSynchronize(0);
+      TNL_CHECK_CUDA_DEVICE;
 #endif
    }
 }
diff --git a/src/TNL/Algorithms/Segments/ChunkedEllpackView.hpp b/src/TNL/Algorithms/Segments/ChunkedEllpackView.hpp
index 6133a8438..6f9ee48fe 100644
--- a/src/TNL/Algorithms/Segments/ChunkedEllpackView.hpp
+++ b/src/TNL/Algorithms/Segments/ChunkedEllpackView.hpp
@@ -460,6 +460,8 @@ reduceSegments( IndexType first, IndexType last, Fetch& fetch, const Reduction&
             <<< cudaGridSize, cudaBlockSize, sharedMemory  >>>
             ( *this, gridIdx, first, last, fetch, reduction, keeper, zero );
       }
+      cudaStreamSynchronize(0);
+      TNL_CHECK_CUDA_DEVICE;
 #endif
    }
 }
diff --git a/src/TNL/Algorithms/Segments/EllpackView.hpp b/src/TNL/Algorithms/Segments/EllpackView.hpp
index b5311d793..9985d9b9b 100644
--- a/src/TNL/Algorithms/Segments/EllpackView.hpp
+++ b/src/TNL/Algorithms/Segments/EllpackView.hpp
@@ -111,7 +111,8 @@ struct EllpackCudaReductionDispatcher
       dim3 blockSize( 256 );
       dim3 gridSize( blocksCount );
       EllpackCudaReductionKernelFull<<< gridSize, blockSize >>>( first, last, fetch, reduction, keeper, zero, segmentSize );
-      cudaDeviceSynchronize();
+      cudaStreamSynchronize(0);
+      TNL_CHECK_CUDA_DEVICE;
    #endif
    }
 };
@@ -133,7 +134,8 @@ struct EllpackCudaReductionDispatcher< Index, Fetch, Reduction, ResultKeeper, Re
       dim3 blockSize( 256 );
       dim3 gridSize( blocksCount );
       EllpackCudaReductionKernelCompact<<< gridSize, blockSize >>>( first, last, fetch, reduction, keeper, zero, segmentSize );
-      cudaDeviceSynchronize();
+      cudaStreamSynchronize(0);
+      TNL_CHECK_CUDA_DEVICE;
    #endif
    }
 };
diff --git a/src/TNL/Algorithms/Segments/Kernels/CSRAdaptiveKernelView.hpp b/src/TNL/Algorithms/Segments/Kernels/CSRAdaptiveKernelView.hpp
index 9c495fd70..86307e754 100644
--- a/src/TNL/Algorithms/Segments/Kernels/CSRAdaptiveKernelView.hpp
+++ b/src/TNL/Algorithms/Segments/Kernels/CSRAdaptiveKernelView.hpp
@@ -272,6 +272,8 @@ struct CSRAdaptiveKernelreduceSegmentsDispatcher< Index, Device, Fetch, Reductio
                zero,
                args... );
       }
+      cudaStreamSynchronize(0);
+      TNL_CHECK_CUDA_DEVICE;
 #endif
    }
 };
diff --git a/src/TNL/Algorithms/Segments/Kernels/CSRHybridKernel.hpp b/src/TNL/Algorithms/Segments/Kernels/CSRHybridKernel.hpp
index 68198f995..6ecad2dda 100644
--- a/src/TNL/Algorithms/Segments/Kernels/CSRHybridKernel.hpp
+++ b/src/TNL/Algorithms/Segments/Kernels/CSRHybridKernel.hpp
@@ -297,6 +297,8 @@ reduceSegments( const OffsetsView& offsets,
                 throw std::runtime_error( std::string( "Wrong value of threadsPerSegment: " ) + std::to_string( this->threadsPerSegment ) );
         }
     }
+    cudaStreamSynchronize(0);
+    TNL_CHECK_CUDA_DEVICE;
 #endif
 }
 
diff --git a/src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.hpp b/src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.hpp
index 93d3e2800..37f237533 100644
--- a/src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.hpp
+++ b/src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.hpp
@@ -500,8 +500,9 @@ struct CSRLightKernelreduceSegmentsDispatcher< Index, Device, Fetch, Reduce, Kee
                      grid, offsets, first, last, fetch, reduce, keep, zero );
          }*/
       }
+      cudaStreamSynchronize(0);
+      TNL_CHECK_CUDA_DEVICE;
 #endif
-
    }
 };
 
diff --git a/src/TNL/Algorithms/Segments/Kernels/CSRVectorKernel.hpp b/src/TNL/Algorithms/Segments/Kernels/CSRVectorKernel.hpp
index cf7d80af6..a72e3b951 100644
--- a/src/TNL/Algorithms/Segments/Kernels/CSRVectorKernel.hpp
+++ b/src/TNL/Algorithms/Segments/Kernels/CSRVectorKernel.hpp
@@ -152,9 +152,12 @@ reduceSegments( const OffsetsView& offsets,
         reduceSegmentsCSRKernelVector< OffsetsView, IndexType, Fetch, Reduction, ResultKeeper, Real, Args... >
         <<< gridSize, blockSize >>>(
             gridIdx.x, offsets, first, last, fetch, reduction, keeper, zero, args... );
-    };
+    }
+    cudaStreamSynchronize(0);
+    TNL_CHECK_CUDA_DEVICE;
 #endif
 }
+
       } // namespace Segments
    }  // namespace Algorithms
 } // namespace TNL
-- 
GitLab


From c4cc606a9e4abf23c446909b1ba5fab75a304e69 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Thu, 16 Dec 2021 12:51:58 +0100
Subject: [PATCH 2/3] Fixed reduction of segments to work even for empty
 matrices

---
 src/TNL/Algorithms/Segments/EllpackView.hpp             | 4 ++++
 src/TNL/Algorithms/Segments/Kernels/CSRHybridKernel.hpp | 2 ++
 src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.hpp  | 3 +++
 src/TNL/Algorithms/Segments/Kernels/CSRVectorKernel.hpp | 3 +++
 src/UnitTests/Matrices/SparseMatrixTest.hpp             | 3 +++
 5 files changed, 15 insertions(+)

diff --git a/src/TNL/Algorithms/Segments/EllpackView.hpp b/src/TNL/Algorithms/Segments/EllpackView.hpp
index 9985d9b9b..ca418e691 100644
--- a/src/TNL/Algorithms/Segments/EllpackView.hpp
+++ b/src/TNL/Algorithms/Segments/EllpackView.hpp
@@ -105,6 +105,8 @@ struct EllpackCudaReductionDispatcher
    exec( Index first, Index last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Index segmentSize )
    {
    #ifdef HAVE_CUDA
+      if( last <= first )
+         return;
       const Index segmentsCount = last - first;
       const Index threadsCount = segmentsCount * 32;
       const Index blocksCount = Cuda::getNumberOfBlocks( threadsCount, 256 );
@@ -128,6 +130,8 @@ struct EllpackCudaReductionDispatcher< Index, Fetch, Reduction, ResultKeeper, Re
    exec( Index first, Index last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Index segmentSize )
    {
    #ifdef HAVE_CUDA
+      if( last <= first )
+         return;
       const Index segmentsCount = last - first;
       const Index threadsCount = segmentsCount * 32;
       const Index blocksCount = Cuda::getNumberOfBlocks( threadsCount, 256 );
diff --git a/src/TNL/Algorithms/Segments/Kernels/CSRHybridKernel.hpp b/src/TNL/Algorithms/Segments/Kernels/CSRHybridKernel.hpp
index 6ecad2dda..1483c1881 100644
--- a/src/TNL/Algorithms/Segments/Kernels/CSRHybridKernel.hpp
+++ b/src/TNL/Algorithms/Segments/Kernels/CSRHybridKernel.hpp
@@ -245,6 +245,8 @@ reduceSegments( const OffsetsView& offsets,
     TNL_ASSERT_LE( this->threadsPerSegment, ThreadsInBlock, "" );
 
 #ifdef HAVE_CUDA
+    if( last <= first )
+       return;
     const size_t threadsCount = this->threadsPerSegment * ( last - first );
     dim3 blocksCount, gridsCount, blockSize( ThreadsInBlock );
     TNL::Cuda::setupThreads( blockSize, blocksCount, gridsCount, threadsCount );
diff --git a/src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.hpp b/src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.hpp
index 37f237533..4c00a5410 100644
--- a/src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.hpp
+++ b/src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.hpp
@@ -424,6 +424,9 @@ struct CSRLightKernelreduceSegmentsDispatcher< Index, Device, Fetch, Reduce, Kee
                        const Index threadsPerSegment )
    {
 #ifdef HAVE_CUDA
+    if( last <= first )
+       return;
+
       const size_t threads = 128;
       Index blocks, groupSize;
 
diff --git a/src/TNL/Algorithms/Segments/Kernels/CSRVectorKernel.hpp b/src/TNL/Algorithms/Segments/Kernels/CSRVectorKernel.hpp
index a72e3b951..a070761f7 100644
--- a/src/TNL/Algorithms/Segments/Kernels/CSRVectorKernel.hpp
+++ b/src/TNL/Algorithms/Segments/Kernels/CSRVectorKernel.hpp
@@ -140,6 +140,9 @@ reduceSegments( const OffsetsView& offsets,
                          Args... args )
 {
 #ifdef HAVE_CUDA
+    if( last <= first )
+       return;
+
     const Index warpsCount = last - first;
     const size_t threadsCount = warpsCount * TNL::Cuda::getWarpSize();
     dim3 blocksCount, gridsCount, blockSize( 256 );
diff --git a/src/UnitTests/Matrices/SparseMatrixTest.hpp b/src/UnitTests/Matrices/SparseMatrixTest.hpp
index 1716b0ab8..444da8666 100644
--- a/src/UnitTests/Matrices/SparseMatrixTest.hpp
+++ b/src/UnitTests/Matrices/SparseMatrixTest.hpp
@@ -243,6 +243,9 @@ void test_SetDimensions()
 
    EXPECT_EQ( m.getRows(), 9 );
    EXPECT_EQ( m.getColumns(), 8 );
+
+   // test empty matrix
+   m.setDimensions( 0, 0 );
 }
 
 template< typename Matrix >
-- 
GitLab


From ade448eef0494e2aeffec6871125b45586a91d1b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Thu, 16 Dec 2021 13:23:02 +0100
Subject: [PATCH 3/3] Fixed setting threads per segment in CSR kernels

---
 .../Segments/Kernels/CSRHybridKernel.hpp      |  3 +++
 .../Segments/Kernels/CSRLightKernel.hpp       | 27 +++++++++----------
 2 files changed, 16 insertions(+), 14 deletions(-)

diff --git a/src/TNL/Algorithms/Segments/Kernels/CSRHybridKernel.hpp b/src/TNL/Algorithms/Segments/Kernels/CSRHybridKernel.hpp
index 1483c1881..7feb50fef 100644
--- a/src/TNL/Algorithms/Segments/Kernels/CSRHybridKernel.hpp
+++ b/src/TNL/Algorithms/Segments/Kernels/CSRHybridKernel.hpp
@@ -175,7 +175,10 @@ void
 CSRHybridKernel< Index, Device, ThreadsInBlock >::
 init( const Offsets& offsets )
 {
+    TNL_ASSERT_GT( offsets.getSize(), 0, "offsets size must be strictly positive" );
     const Index segmentsCount = offsets.getSize() - 1;
+    if( segmentsCount <= 0 )
+       return;
     const Index elementsInSegment = std::ceil( ( double ) offsets.getElement( segmentsCount ) / ( double ) segmentsCount );
     this->threadsPerSegment = TNL::min( std::pow( 2, std::ceil( std::log2( elementsInSegment ) ) ), ThreadsInBlock ); //TNL::Cuda::getWarpSize() );
     TNL_ASSERT_GE( threadsPerSegment, 0, "" );
diff --git a/src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.hpp b/src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.hpp
index 4c00a5410..0ceecc371 100644
--- a/src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.hpp
+++ b/src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.hpp
@@ -517,21 +517,24 @@ void
 CSRLightKernel< Index, Device >::
 init( const Offsets& offsets )
 {
+   TNL_ASSERT_GT( offsets.getSize(), 0, "offsets size must be strictly positive" );
    const Index segmentsCount = offsets.getSize() - 1;
+    if( segmentsCount <= 0 )
+       return;
 
    if( this->getThreadsMapping() == CSRLightAutomaticThreads )
    {
       const Index elementsInSegment = roundUpDivision( offsets.getElement( segmentsCount ), segmentsCount ); // non zeroes per row
       if( elementsInSegment <= 2 )
-         this->threadsPerSegment = 2;
+         setThreadsPerSegment( 2 );
       else if( elementsInSegment <= 4 )
-         this->threadsPerSegment = 4;
+         setThreadsPerSegment( 4 );
       else if( elementsInSegment <= 8 )
-         this->threadsPerSegment = 8;
+         setThreadsPerSegment( 8 );
       else if( elementsInSegment <= 16 )
-         this->threadsPerSegment = 16;
+         setThreadsPerSegment( 16 );
       else //if (nnz <= 2 * matrix.MAX_ELEMENTS_PER_WARP)
-         this->threadsPerSegment = 32; // CSR Vector
+         setThreadsPerSegment( 32 ); // CSR Vector
       //else
       //   threadsPerSegment = roundUpDivision(nnz, matrix.MAX_ELEMENTS_PER_WARP) * 32; // CSR MultiVector
    }
@@ -540,22 +543,18 @@ init( const Offsets& offsets )
    {
       const Index elementsInSegment = roundUpDivision( offsets.getElement( segmentsCount ), segmentsCount ); // non zeroes per row
       if( elementsInSegment <= 2 )
-         this->threadsPerSegment = 2;
+         setThreadsPerSegment( 2 );
       else if( elementsInSegment <= 4 )
-         this->threadsPerSegment = 4;
+         setThreadsPerSegment( 4 );
       else if( elementsInSegment <= 8 )
-         this->threadsPerSegment = 8;
+         setThreadsPerSegment( 8 );
       else if( elementsInSegment <= 16 )
-         this->threadsPerSegment = 16;
+         setThreadsPerSegment( 16 );
       else //if (nnz <= 2 * matrix.MAX_ELEMENTS_PER_WARP)
-         this->threadsPerSegment = 32; // CSR Vector
+         setThreadsPerSegment( 32 ); // CSR Vector
       //else
       //   threadsPerSegment = roundUpDivision(nnz, matrix.MAX_ELEMENTS_PER_WARP) * 32; // CSR MultiVector
    }
-
-   TNL_ASSERT_GE( this->threadsPerSegment, 0, "" );
-   TNL_ASSERT_LE( this->threadsPerSegment, 33, "" );
-
 }
 
 template< typename Index,
-- 
GitLab