From e72fc80b745bb3ef0a3c171bb9d531606b3cd918 Mon Sep 17 00:00:00 2001
From: Tomas Oberhuber <tomas.oberhuber@fjfi.cvut.cz>
Date: Sun, 27 Apr 2014 18:26:35 +0200
Subject: [PATCH] Implementing the SlicedEllpack matrix in CUDA.

---
 .../matrices/tnlEllpackMatrix_impl.h          |   6 +-
 .../matrices/tnlSlicedEllpackMatrix_impl.h    | 164 ++++++++++++++++--
 src/matrices/tnlSlicedEllpackMatrix.h         |  18 +-
 3 files changed, 168 insertions(+), 20 deletions(-)

diff --git a/src/implementation/matrices/tnlEllpackMatrix_impl.h b/src/implementation/matrices/tnlEllpackMatrix_impl.h
index a206aaf198..0a3028899c 100644
--- a/src/implementation/matrices/tnlEllpackMatrix_impl.h
+++ b/src/implementation/matrices/tnlEllpackMatrix_impl.h
@@ -668,7 +668,7 @@ class tnlEllpackMatrixDeviceDependentCode< tnlCuda >
       static Index getRowBegin( const tnlEllpackMatrix< Real, Device, Index >& matrix,
                                 const Index row )
       {
-         return row * matrix.rowLengths;
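+         /****
+          * Column-major (coalesced) layout: the k-th element of row 'row'
+          * is stored at position row + k * alignedRows.
+          */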
+         return row;
       }
 
       template< typename Real,
@@ -679,7 +679,7 @@ class tnlEllpackMatrixDeviceDependentCode< tnlCuda >
       static Index getRowEnd( const tnlEllpackMatrix< Real, Device, Index >& matrix,
                                 const Index row )
       {
-         return ( row + 1 ) * matrix.rowLengths;
+         return row + getElementStep( matrix ) * matrix.rowLengths;
       }
 
       template< typename Real,
@@ -689,7 +689,7 @@ class tnlEllpackMatrixDeviceDependentCode< tnlCuda >
 #endif
       static Index getElementStep( const tnlEllpackMatrix< Real, Device, Index >& matrix )
       {
-         return 1;
+         return matrix.alignedRows;
       }
 };
 
diff --git a/src/implementation/matrices/tnlSlicedEllpackMatrix_impl.h b/src/implementation/matrices/tnlSlicedEllpackMatrix_impl.h
index 74a839d979..22e1c79a9e 100644
--- a/src/implementation/matrices/tnlSlicedEllpackMatrix_impl.h
+++ b/src/implementation/matrices/tnlSlicedEllpackMatrix_impl.h
@@ -75,26 +75,11 @@ bool tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >::setRowLengths( co
    if( ! this->sliceRowLengths.setSize( slices ) ||
        ! this->slicePointers.setSize( slices + 1 ) )
       return false;
-   IndexType row( 0 ), slice( 0 ), sliceRowLength( 0 );
 
    /****
     * Compute maximal row length in each slice
     */
-   while( row < this->rows )
-   {
-      sliceRowLength = Max( rowLengths.getElement( row++ ), sliceRowLength );
-      if( row % SliceSize == 0 )
-      {
-         this->sliceRowLengths.setElement( slice, sliceRowLength );
-         this->slicePointers.setElement( slice++, sliceRowLength*SliceSize );
-         sliceRowLength = 0;
-      }
-   }
-   if( row % SliceSize != 0 )
-   {
-      this->sliceRowLengths.setElement( slice, sliceRowLength );
-      this->slicePointers.setElement( slice++, sliceRowLength*SliceSize );
-   }
+   DeviceDependentCode::computeMaximalRowLengthInSlices( *this, rowLengths );
 
    /****
     * Compute the slice pointers using the exclusive prefix sum
@@ -567,4 +552,151 @@ void tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >::print( ostream& s
    }
 }
 
+template<>
+class tnlSlicedEllpackMatrixDeviceDependentCode< tnlHost >
+{
+   public:
+
+      typedef tnlHost Device;
+
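+      /****
+       * Assumed host layout: within each slice the rows are stored one after
+       * another, each padded to the maximal row length of the slice, so row r
+       * of slice s starts at slicePointers[ s ] + ( r - s * SliceSize ) * sliceRowLengths[ s ]
+       * and its elements are contiguous (element step 1).
+       */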
+      template< typename Real,
+                typename Index,
+                int SliceSize >
+      static Index getRowBegin( const tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >& matrix,
+                                const Index row )
+      {
+         const Index slice = row / SliceSize;
+         return matrix.slicePointers.getElement( slice ) +
+                ( row - slice * SliceSize ) * matrix.sliceRowLengths.getElement( slice );
+      }
+
+      template< typename Real,
+                typename Index,
+                int SliceSize >
+      static Index getRowEnd( const tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >& matrix,
+                                const Index row )
+      {
+         const Index slice = row / SliceSize;
+         return getRowBegin( matrix, row ) + matrix.sliceRowLengths.getElement( slice );
+      }
+
+      template< typename Real,
+                typename Index,
+                int SliceSize >
+      static Index getElementStep( const tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >& matrix )
+      {
+         return 1;
+      }
+
+      template< typename Real,
+                typename Index,
+                int SliceSize >
+      static bool computeMaximalRowLengthInSlices( tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >& matrix,
+                                                   const typename tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >::RowLengthsVector& rowLengths )
+      {
+         Index row( 0 ), slice( 0 ), sliceRowLength( 0 );
+         while( row < matrix.getRows() )
+         {
+            sliceRowLength = Max( rowLengths.getElement( row++ ), sliceRowLength );
+            if( row % SliceSize == 0 )
+            {
+               matrix.sliceRowLengths.setElement( slice, sliceRowLength );
+               matrix.slicePointers.setElement( slice++, sliceRowLength * SliceSize );
+               sliceRowLength = 0;
+            }
+         }
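+         /****
+          * Process the trailing slice if the number of rows is not a multiple of SliceSize.
+          */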
+         if( row % SliceSize != 0 )
+         {
+            matrix.sliceRowLengths.setElement( slice, sliceRowLength );
+            matrix.slicePointers.setElement( slice++, sliceRowLength * SliceSize );
+         }
+         return true;
+      }
+};
+
+#ifdef HAVE_CUDA
+template< typename Real,
+          typename Device,
+          typename Index,
+          int SliceSize >
+__global__ void tnlSlicedEllpackMatrix_computeMaximalRowLengthInSlices_CudaKernel( tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >* matrix,
+                                                                                   const typename tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >::RowLengthsVector* rowLengths,
+                                                                                   int gridIdx )
+{
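+   /****
+    * The kernel body is left as a stub in this patch. The intended logic mirrors
+    * the host path above: each thread takes one slice (indexed via gridIdx, blockIdx
+    * and threadIdx), scans the row lengths of its SliceSize rows, and stores the
+    * maximum into sliceRowLengths and the slice capacity ( maximum * SliceSize )
+    * into slicePointers. Device-side access to the matrix members is assumed
+    * (e.g. through a dedicated method of the matrix, since the kernel itself is
+    * not a friend of the class).
+    */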
+
+}
+#endif
+
+template<>
+class tnlSlicedEllpackMatrixDeviceDependentCode< tnlCuda >
+{
+   public:
+
+      typedef tnlCuda Device;
+
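+      /****
+       * Assumed CUDA layout: within each slice the elements are stored
+       * column by column, so row r of slice s starts at
+       * slicePointers[ s ] + r - s * SliceSize and consecutive elements of a row
+       * are SliceSize apart, which gives coalesced accesses for adjacent rows.
+       */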
+      template< typename Real,
+                typename Index,
+                int SliceSize >
+#ifdef HAVE_CUDA
+      __device__ __host__
+#endif
+      static Index getRowBegin( const tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >& matrix,
+                                const Index row )
+      {
+         const Index slice = row / SliceSize;
+         return matrix.slicePointers[ slice ] + row - slice * SliceSize;
+      }
+
+      template< typename Real,
+                typename Index,
+                int SliceSize >
+#ifdef HAVE_CUDA
+      __device__ __host__
+#endif
+      static Index getRowEnd( const tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >& matrix,
+                                const Index row )
+      {
+         const Index slice = row / SliceSize;
+         return matrix.slicePointers[ slice + 1 ];
+      }
+
+      template< typename Real,
+                typename Index,
+                int SliceSize >
+#ifdef HAVE_CUDA
+      __device__ __host__
+#endif
+      static Index getElementStep( const tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >& matrix )
+      {
+         return SliceSize;
+      }
+
+      template< typename Real,
+                typename Index,
+                int SliceSize >
+      static bool computeMaximalRowLengthInSlices( tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >& matrix,
+                                                   const typename tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >::RowLengthsVector& rowLengths )
+      {
+#ifdef HAVE_CUDA
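+         /****
+          * Launch one CUDA thread per slice. Because the number of slices may exceed
+          * the maximal grid size, the kernel is launched once per grid and gridIdx
+          * tells each launch which portion of the slices it is responsible for.
+          */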
+         typedef tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize > Matrix;
+         typedef typename Matrix::RowLengthsVector RowLengthsVector;
+         Matrix* kernel_matrix = tnlCuda::passToDevice( matrix );
+         RowLengthsVector* kernel_rowLengths = tnlCuda::passToDevice( rowLengths );
+         const Index numberOfSlices = roundUpDivision( matrix.getRows(), SliceSize );
+         dim3 cudaBlockSize( 256 ), cudaGridSize( tnlCuda::getMaxGridSize() );
+         const Index cudaBlocks = roundUpDivision( numberOfSlices, cudaBlockSize.x );
+         const Index cudaGrids = roundUpDivision( cudaBlocks, tnlCuda::getMaxGridSize() );
+         for( Index gridIdx = 0; gridIdx < cudaGrids; gridIdx++ )
+         {
+            if( gridIdx == cudaGrids - 1 )
+               cudaGridSize.x = cudaBlocks - gridIdx * tnlCuda::getMaxGridSize();
+            tnlSlicedEllpackMatrix_computeMaximalRowLengthInSlices_CudaKernel<<< cudaGridSize, cudaBlockSize >>>
+                                                                             ( kernel_matrix,
+                                                                               kernel_rowLengths,
+                                                                               gridIdx );
+         }
+         tnlCuda::freeFromDevice( kernel_matrix );
+         tnlCuda::freeFromDevice( kernel_rowLengths );
+         checkCudaDevice;
+         return true;
+#else
+         return false;
+#endif
+      }
+};
+
 #endif /* TNLSLICEDELLPACKMATRIX_IMPL_H_ */
diff --git a/src/matrices/tnlSlicedEllpackMatrix.h b/src/matrices/tnlSlicedEllpackMatrix.h
index 29a20a0d24..847b17fd27 100644
--- a/src/matrices/tnlSlicedEllpackMatrix.h
+++ b/src/matrices/tnlSlicedEllpackMatrix.h
@@ -21,6 +21,9 @@
 #include <matrices/tnlSparseMatrix.h>
 #include <core/vectors/tnlVector.h>
 
+template< typename Device >
+class tnlSlicedEllpackMatrixDeviceDependentCode;
+
 template< typename Real = double,
           typename Device = tnlHost,
           typename Index = int,
@@ -32,7 +35,10 @@ class tnlSlicedEllpackMatrix : public tnlSparseMatrix< Real, Device, Index >
    typedef Real RealType;
    typedef Device DeviceType;
    typedef Index IndexType;
-   typedef typename tnlSparseMatrix< RealType, DeviceType, IndexType >:: RowLengthsVector RowLengthsVector;
+   typedef typename tnlSparseMatrix< RealType, DeviceType, IndexType >::RowLengthsVector RowLengthsVector;
+   typedef typename tnlSparseMatrix< RealType, DeviceType, IndexType >::ValuesVector ValuesVector;
+   typedef typename tnlSparseMatrix< RealType, DeviceType, IndexType >::ColumnIndexesVector ColumnIndexesVector;
+   typedef tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize > ThisType;
 
    tnlSlicedEllpackMatrix();
 
@@ -130,6 +136,13 @@ class tnlSlicedEllpackMatrix : public tnlSparseMatrix< Real, Device, Index >
                 IndexType* columns,
                 RealType* values ) const;
 
+   template< typename Vector >
+   #ifdef HAVE_CUDA
+      __device__ __host__
+   #endif
+   typename Vector::RealType rowVectorProduct( const IndexType row,
+                                               const Vector& vector ) const;
+
    template< typename Vector >
    void vectorProduct( const Vector& inVector,
                        Vector& outVector ) const;
@@ -163,6 +176,9 @@ class tnlSlicedEllpackMatrix : public tnlSparseMatrix< Real, Device, Index >
 
    tnlVector< Index, Device, Index > slicePointers, sliceRowLengths;
 
+   typedef tnlSlicedEllpackMatrixDeviceDependentCode< DeviceType > DeviceDependentCode;
+   friend class tnlSlicedEllpackMatrixDeviceDependentCode< DeviceType >;
+
 };
 
 #include <implementation/matrices/tnlSlicedEllpackMatrix_impl.h>
-- 
GitLab