Commit e72fc80b authored by Tomáš Oberhuber's avatar Tomáš Oberhuber
Browse files

Implementing the SlicedEllpack matrix in CUDA.

parent 3e00d6f0
Loading
Loading
Loading
Loading
+3 −3
Original line number Diff line number Diff line
@@ -668,7 +668,7 @@ class tnlEllpackMatrixDeviceDependentCode< tnlCuda >
      static Index getRowBegin( const tnlEllpackMatrix< Real, Device, Index >& matrix,
                                const Index row )
      {
         return row * matrix.rowLengths;
         return row;
      }

      template< typename Real,
@@ -679,7 +679,7 @@ class tnlEllpackMatrixDeviceDependentCode< tnlCuda >
      static Index getRowEnd( const tnlEllpackMatrix< Real, Device, Index >& matrix,
                                const Index row )
      {
         return ( row + 1 ) * matrix.rowLengths;
         return row + getElementStep( matrix ) * matrix.rowLengths;
      }

      template< typename Real,
@@ -689,7 +689,7 @@ class tnlEllpackMatrixDeviceDependentCode< tnlCuda >
#endif
      static Index getElementStep( const tnlEllpackMatrix< Real, Device, Index >& matrix )
      {
         return 1;
         return matrix.alignedRows;
      }
};

+148 −16
Original line number Diff line number Diff line
@@ -75,26 +75,11 @@ bool tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >::setRowLengths( co
   if( ! this->sliceRowLengths.setSize( slices ) ||
       ! this->slicePointers.setSize( slices + 1 ) )
      return false;
   IndexType row( 0 ), slice( 0 ), sliceRowLength( 0 );

   /****
    * Compute maximal row length in each slice
    */
   while( row < this->rows )
   {
      sliceRowLength = Max( rowLengths.getElement( row++ ), sliceRowLength );
      if( row % SliceSize == 0 )
      {
         this->sliceRowLengths.setElement( slice, sliceRowLength );
         this->slicePointers.setElement( slice++, sliceRowLength*SliceSize );
         sliceRowLength = 0;
      }
   }
   if( row % SliceSize != 0 )
   {
      this->sliceRowLengths.setElement( slice, sliceRowLength );
      this->slicePointers.setElement( slice++, sliceRowLength*SliceSize );
   }
   DeviceDependentCode::computeMaximalRowLengthInSlices( *this, rowLengths );

   /****
    * Compute the slice pointers using the exclusive prefix sum
@@ -567,4 +552,151 @@ void tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >::print( ostream& s
   }
}

template<>
class tnlSlicedEllpackMatrixDeviceDependentCode< tnlHost >
{
   public:

      typedef tnlHost Device;

      template< typename Real,
                typename Index,
                int SliceSize >
      static Index getRowBegin( const tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >& matrix,
                                const Index row )
      {
         return row * matrix.rowLengths;
      }

      template< typename Real,
                typename Index,
                int SliceSize >
      static Index getRowEnd( const tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >& matrix,
                                const Index row )
      {
         return ( row + 1 ) * matrix.rowLengths;
      }

      template< typename Real,
                typename Index,
                int SliceSize >
      static Index getElementStep( const tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >& matrix )
      {
         return 1;
      }

      template< typename Real,
                typename Index,
                int SliceSize >
      static bool computeMaximalRowLengthInSlices( tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >& matrix,
                                                   const typename tnlSlicedEllpackMatrix< Real, Device, Index >::RowLengthsVector& rowLengths )
      {
         Index row( 0 ), slice( 0 ), sliceRowLength( 0 );
         while( row < matrix.getRows() )
         {
            sliceRowLength = Max( rowLengths.getElement( row++ ), sliceRowLength );
            if( row % SliceSize == 0 )
            {
               matrix.sliceRowLengths.setElement( slice, sliceRowLength );
               matrix.slicePointers.setElement( slice++, sliceRowLength * SliceSize );
               sliceRowLength = 0;
            }
         }
         if( row % SliceSize != 0 )
         {
            matrix.sliceRowLengths.setElement( slice, sliceRowLength );
            matrix.slicePointers.setElement( slice++, sliceRowLength * SliceSize );
         }
      }
};

#ifdef HAVE_CUDA
template< typename Real,
          typename Device,
          typename Index,
          int SliceSize >
__global__ void tnlSlicedEllpackMatrix_compuetMaximalRowLengthInSlices_CudaKernel<<< cudaGridSize, cudaBlockSize >>>
                                                                                ( tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >* matrix,
                                                                                  const typename tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >::RowLentghsVector* rowLengths,
                                                                                  int gridIdx )
{

}
#endif

template<>
class tnlSlicedEllpackMatrixDeviceDependentCode< tnlCuda >
{
   public:

      typedef tnlCuda Device;

      template< typename Real,
                typename Index,
                int SliceSize >
#ifdef HAVE_CUDA
      __device__ __host__
#endif
      static Index getRowBegin( const tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >& matrix,
                                const Index row )
      {
         return row * matrix.rowLengths;
      }

      template< typename Real,
                typename Index,
                int SliceSize >
#ifdef HAVE_CUDA
      __device__ __host__
#endif
      static Index getRowEnd( const tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >& matrix,
                                const Index row )
      {
         return ( row + 1 ) * matrix.rowLengths;
      }

      template< typename Real,
                typename Index,
                int SliceSize >
#ifdef HAVE_CUDA
      __device__ __host__
#endif
      static Index getElementStep( const tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >& matrix )
      {
         return 1;
      }

      template< typename Real,
                typename Index,
                int SliceSize >
      static bool computeMaximalRowLengthInSlices( tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >& matrix,
                                                   const typename tnlSlicedEllpackMatrix< Real, Device, Index >::RowLengthsVector& rowLengths )
      {
#ifdef HAVE_CUDA
         typedef tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize > Matrix;
         typedef typename Matrix::RowLengthsVector RowLengthsVector;
         Matrix* kernel_matrix = tnlCuda::passToDevice( matrix );
         RowLengthsVector* kernel_rowLengths = tnlCuda::passToDevice( rowLengths );
         const Index numberOfSlices = roundUpDivision( matrix.getRows(), SliceSize );
         dim3 cudaBlockSize( 256 ), cudaGridSize( tnlCuda::getMaxGridSize() );
         const IndexType cudaBlocks = roundUpDivision( numberOfSlices, cudaBlockSize.x );
         const IndexType cudaGrids = roundUpDivision( cudaBlocks, tnlCuda::getMaxGridSize() );
         for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ )
         {
            if( gridIdx == cudaGrids - 1 )
               cudaGridSize.x = cudaBlocks % tnlCuda::getMaxGridSize();
            tnlSlicedEllpackMatrix_compuetMaximalRowLengthInSlices_CudaKernel<<< cudaGridSize, cudaBlockSize >>>
                                                                             ( kernel_matrix,
                                                                               kernel_rowLengths,
                                                                               gridIdx );
         }
         tnlCuda::freeFromDevice( kernel_matrix );
         tnlCuda::freeFromDevice( kernel_rowLengths );
         checkCudaDevice;
#endif
      }
};



#endif /* TNLSLICEDELLPACKMATRIX_IMPL_H_ */
+17 −1
Original line number Diff line number Diff line
@@ -21,6 +21,9 @@
#include <matrices/tnlSparseMatrix.h>
#include <core/vectors/tnlVector.h>

template< typename Device >
class tnlSlicedEllpackMatrixDeviceDependentCode;

template< typename Real = double,
          typename Device = tnlHost,
          typename Index = int,
@@ -33,6 +36,9 @@ class tnlSlicedEllpackMatrix : public tnlSparseMatrix< Real, Device, Index >
   typedef Device DeviceType;
   typedef Index IndexType;
   typedef typename tnlSparseMatrix< RealType, DeviceType, IndexType >::RowLengthsVector RowLengthsVector;
   typedef typename tnlSparseMatrix< RealType, DeviceType, IndexType >::ValuesVector ValuesVector;
   typedef typename tnlSparseMatrix< RealType, DeviceType, IndexType >::ColumnIndexesVector ColumnIndexesVector;
   typedef tnlSlicedEllpackMatrix< Real, Device, Index > ThisType;

   tnlSlicedEllpackMatrix();

@@ -130,6 +136,13 @@ class tnlSlicedEllpackMatrix : public tnlSparseMatrix< Real, Device, Index >
                IndexType* columns,
                RealType* values ) const;

   template< typename Vector >
   #ifdef HAVE_CUDA
      __device__ __host__
   #endif
   typename Vector::RealType rowVectorProduct( const IndexType row,
                                               const Vector& vector ) const;

   template< typename Vector >
   void vectorProduct( const Vector& inVector,
                       Vector& outVector ) const;
@@ -163,6 +176,9 @@ class tnlSlicedEllpackMatrix : public tnlSparseMatrix< Real, Device, Index >

   tnlVector< Index, Device, Index > slicePointers, sliceRowLengths;

   typedef tnlSlicedEllpackMatrixDeviceDependentCode< DeviceType > DeviceDependentCode;
   friend class tnlSlicedEllpackMatrixDeviceDependentCode< DeviceType >;

};

#include <implementation/matrices/tnlSlicedEllpackMatrix_impl.h>