From 2641210dac5db0a06084258e476f61a15fab432c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Tue, 31 Mar 2020 15:44:45 +0200
Subject: [PATCH] Fixing CUDA code in Legacy CSR.

---
 src/TNL/Matrices/Legacy/CSR.h      |  8 ++---
 src/TNL/Matrices/Legacy/CSR_impl.h | 51 ++++++++++++++++--------------
 2 files changed, 30 insertions(+), 29 deletions(-)

diff --git a/src/TNL/Matrices/Legacy/CSR.h b/src/TNL/Matrices/Legacy/CSR.h
index f86420339b..46e616d165 100644
--- a/src/TNL/Matrices/Legacy/CSR.h
+++ b/src/TNL/Matrices/Legacy/CSR.h
@@ -31,9 +31,9 @@ class CusparseCSR;
 template< typename Device >
 class CSRDeviceDependentCode;
 
-enum CSRKernel { CSRScalar, CSRVector, CSRLight, CSRAdaptive, CSRStream };
+enum CSRKernel { CSRScalar, CSRVector, CSRHybrid, CSRLight, CSRAdaptive, CSRStream };
 
-template< typename Real, typename Device = Devices::Host, typename Index = int, CSRKernel KernelType = CSRAdaptive >
+template< typename Real, typename Device = Devices::Host, typename Index = int, CSRKernel KernelType = CSRScalar >
 class CSR : public Sparse< Real, Device, Index >
 {
 private:
@@ -225,9 +225,7 @@ public:
    __device__
    void spmvCudaVectorized( const InVector& inVector,
                             OutVector& outVector,
-                            const IndexType warpStart,
-                            const IndexType warpEnd,
-                            const IndexType inWarpIdx ) const;
+                            const IndexType gridIdx ) const;
 
    template< typename InVector,
              typename OutVector,
diff --git a/src/TNL/Matrices/Legacy/CSR_impl.h b/src/TNL/Matrices/Legacy/CSR_impl.h
index c4c4be89d5..12a9eb5541 100644
--- a/src/TNL/Matrices/Legacy/CSR_impl.h
+++ b/src/TNL/Matrices/Legacy/CSR_impl.h
@@ -905,10 +905,13 @@ template< typename Real,
 __device__
 void CSR< Real, Device, Index, KernelType >::spmvCudaVectorized( const InVector& inVector,
                                                               OutVector& outVector,
-                                                              const IndexType warpStart,
-                                                              const IndexType warpEnd,
-                                                              const IndexType inWarpIdx ) const
+                                                              const IndexType gridIdx ) const
 {
+   IndexType globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   const IndexType warpStart = warpSize * ( globalIdx / warpSize );
+   const IndexType warpEnd = min( warpStart + warpSize, this->getRows() );
+   const IndexType inWarpIdx = globalIdx % warpSize;
+
    volatile Real* aux = Cuda::getSharedMemory< Real >();
    for( IndexType row = warpStart; row < warpEnd; row++ )
    {
@@ -957,10 +960,10 @@ void CSR< Real, Device, Index, KernelType >::vectorProductCuda( const InVector&
          // TODO:
          break;
       case CSRVector:
-         spmvCudaVectorized< InVector, OutVector, warpSize >( inVector, outVector, warpStart, warpEnd, inWarpIdx );
+         spmvCudaVectorized< InVector, OutVector, warpSize >( inVector, outVector, gridIdx );
          break;
       case CSRLight:
-         spmvCSRLight< InVector, OutVector, warpSize >( inVector, outVector, gridIdx );
+         spmvCudaLightSpmv< InVector, OutVector, warpSize >( inVector, outVector, gridIdx );
          break;
       case CSRAdaptive:
          spmvCSRAdaptive< InVector, OutVector, warpSize >( inVector, outVector, gridIdx, blocks, size );
@@ -970,9 +973,8 @@ void CSR< Real, Device, Index, KernelType >::vectorProductCuda( const InVector&
          break;
    }
 
-   return;
 
-   IndexType globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   /*IndexType globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
    const IndexType warpStart = warpSize * ( globalIdx / warpSize );
    const IndexType warpEnd = min( warpStart + warpSize, this->getRows() );
    const IndexType inWarpIdx = globalIdx % warpSize;
@@ -980,9 +982,9 @@ void CSR< Real, Device, Index, KernelType >::vectorProductCuda( const InVector&
    if( this->getCudaKernelType() == vector )
       
 
-   /****
-    * Hybrid mode
-    */
+   /////
+   // Hybrid mode
+   //
    const Index firstRow = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x;
    const IndexType lastRow = min( this->getRows(), firstRow + blockDim. x );
    const IndexType nonzerosPerRow = ( this->rowPointers[ lastRow ] - this->rowPointers[ firstRow ] ) /
@@ -990,19 +992,19 @@ void CSR< Real, Device, Index, KernelType >::vectorProductCuda( const InVector&
 
    if( nonzerosPerRow < this->getHybridModeSplit() )
    {
-      /****
-       * Use the scalar mode
-       */
+      /////
+      // Use the scalar mode
+      //
       if( globalIdx < this->getRows() )
           outVector[ globalIdx ] = this->rowVectorProduct( globalIdx, inVector );
    }
    else
    {
-      /****
-       * Use the vector mode
-       */
+      ////
+      // Use the vector mode
+      //
       spmvCudaVectorized< InVector, OutVector, warpSize >( inVector, outVector, warpStart, warpEnd, inWarpIdx );
-   }
+   }*/
 }
 #endif
 
@@ -1038,10 +1040,11 @@ class CSRDeviceDependentCode< Devices::Host >
 #ifdef HAVE_CUDA
 template< typename Real,
           typename Index,
+          CSRKernel KernelType,
           typename InVector,
           typename OutVector,
           int warpSize >
-__global__ void CSRVectorProductCudaKernel( const CSR< Real, Devices::Cuda, Index >* matrix,
+__global__ void CSRVectorProductCudaKernel( const CSR< Real, Devices::Cuda, Index, KernelType >* matrix,
                                                      const InVector* inVector,
                                                      OutVector* outVector,
                                                      int gridIdx,
@@ -1050,13 +1053,12 @@ __global__ void CSRVectorProductCudaKernel( const CSR< Real, Devices::Cuda, Inde
    typedef CSR< Real, Devices::Cuda, Index > Matrix;
    static_assert( std::is_same< typename Matrix::DeviceType, Devices::Cuda >::value, "" );
    const typename Matrix::IndexType rowIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
-   if( matrix->getCudaKernelType() == Matrix::scalar )
+   if( KernelType == CSRScalar )
    {
       if( rowIdx < matrix->getRows() )
          ( *outVector )[ rowIdx ] = matrix->rowVectorProduct( rowIdx, *inVector );
    }
-   if( matrix->getCudaKernelType() == Matrix::vector ||
-       matrix->getCudaKernelType() == Matrix::hybrid )
+   else
    {
       matrix->template vectorProductCuda< InVector, OutVector, warpSize >
                                         ( *inVector, *outVector, gridIdx, blocks, size );
@@ -1066,16 +1068,17 @@ __global__ void CSRVectorProductCudaKernel( const CSR< Real, Devices::Cuda, Inde
 
 template< typename Real,
           typename Index,
+          CSRKernel KernelType,
           typename InVector,
           typename OutVector >
-void CSRVectorProductCuda( const CSR< Real, Devices::Cuda, Index >& matrix,
+void CSRVectorProductCuda( const CSR< Real, Devices::Cuda, Index, KernelType >& matrix,
                                     const InVector& inVector,
                                     OutVector& outVector,
                                     int *blocks,
                                     size_t size )
 {
 #ifdef HAVE_CUDA
-   typedef CSR< Real, Devices::Cuda, Index > Matrix;
+   typedef CSR< Real, Devices::Cuda, Index, KernelType > Matrix;
    typedef typename Matrix::IndexType IndexType;
    Matrix* kernel_this = Cuda::passToDevice( matrix );
    InVector* kernel_inVector = Cuda::passToDevice( inVector );
@@ -1096,7 +1099,7 @@ void CSRVectorProductCuda( const CSR< Real, Devices::Cuda, Index >& matrix,
       // const int threads = cudaBlockSize.x;
       if( matrix.getCudaWarpSize() == 32 ) {
          // printf("BL %d BLSIZE %d\n", (int)cudaBlocks, (int)threads);
-         CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 32 >
+         CSRVectorProductCudaKernel< Real, Index, KernelType, InVector, OutVector, 32 >
                                             <<< 2, 1024 >>>
                                             ( kernel_this,
                                               kernel_inVector,
-- 
GitLab