From e72fc80b745bb3ef0a3c171bb9d531606b3cd918 Mon Sep 17 00:00:00 2001 From: Tomas Oberhuber <tomas.oberhuber@fjfi.cvut.cz> Date: Sun, 27 Apr 2014 18:26:35 +0200 Subject: [PATCH] Implementing the SlicedEllpack matrix in CUDA. --- .../matrices/tnlEllpackMatrix_impl.h | 6 +- .../matrices/tnlSlicedEllpackMatrix_impl.h | 164 ++++++++++++++++-- src/matrices/tnlSlicedEllpackMatrix.h | 18 +- 3 files changed, 168 insertions(+), 20 deletions(-) diff --git a/src/implementation/matrices/tnlEllpackMatrix_impl.h b/src/implementation/matrices/tnlEllpackMatrix_impl.h index a206aaf198..0a3028899c 100644 --- a/src/implementation/matrices/tnlEllpackMatrix_impl.h +++ b/src/implementation/matrices/tnlEllpackMatrix_impl.h @@ -668,7 +668,7 @@ class tnlEllpackMatrixDeviceDependentCode< tnlCuda > static Index getRowBegin( const tnlEllpackMatrix< Real, Device, Index >& matrix, const Index row ) { - return row * matrix.rowLengths; + return row; } template< typename Real, @@ -679,7 +679,7 @@ class tnlEllpackMatrixDeviceDependentCode< tnlCuda > static Index getRowEnd( const tnlEllpackMatrix< Real, Device, Index >& matrix, const Index row ) { - return ( row + 1 ) * matrix.rowLengths; + return row + getElementStep( matrix ) * matrix.rowLengths; } template< typename Real, @@ -689,7 +689,7 @@ class tnlEllpackMatrixDeviceDependentCode< tnlCuda > #endif static Index getElementStep( const tnlEllpackMatrix< Real, Device, Index >& matrix ) { - return 1; + return matrix.alignedRows; } }; diff --git a/src/implementation/matrices/tnlSlicedEllpackMatrix_impl.h b/src/implementation/matrices/tnlSlicedEllpackMatrix_impl.h index 74a839d979..22e1c79a9e 100644 --- a/src/implementation/matrices/tnlSlicedEllpackMatrix_impl.h +++ b/src/implementation/matrices/tnlSlicedEllpackMatrix_impl.h @@ -75,26 +75,11 @@ bool tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >::setRowLengths( co if( ! this->sliceRowLengths.setSize( slices ) || ! this->slicePointers.setSize( slices + 1 ) ) return false; - IndexType row( 0 ), slice( 0 ), sliceRowLength( 0 ); /**** * Compute maximal row length in each slice */ - while( row < this->rows ) - { - sliceRowLength = Max( rowLengths.getElement( row++ ), sliceRowLength ); - if( row % SliceSize == 0 ) - { - this->sliceRowLengths.setElement( slice, sliceRowLength ); - this->slicePointers.setElement( slice++, sliceRowLength*SliceSize ); - sliceRowLength = 0; - } - } - if( row % SliceSize != 0 ) - { - this->sliceRowLengths.setElement( slice, sliceRowLength ); - this->slicePointers.setElement( slice++, sliceRowLength*SliceSize ); - } + DeviceDependentCode::computeMaximalRowLengthInSlices( *this, rowLengths ); /**** * Compute the slice pointers using the exclusive prefix sum @@ -567,4 +552,151 @@ void tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >::print( ostream& s } } +template<> +class tnlSlicedEllpackMatrixDeviceDependentCode< tnlHost > +{ + public: + + typedef tnlHost Device; + + template< typename Real, + typename Index, + int SliceSize > + static Index getRowBegin( const tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >& matrix, + const Index row ) + { + return row * matrix.rowLengths; + } + + template< typename Real, + typename Index, + int SliceSize > + static Index getRowEnd( const tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >& matrix, + const Index row ) + { + return ( row + 1 ) * matrix.rowLengths; + } + + template< typename Real, + typename Index, + int SliceSize > + static Index getElementStep( const tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >& matrix ) + { + return 1; + } + + template< typename Real, + typename Index, + int SliceSize > + static bool computeMaximalRowLengthInSlices( tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >& matrix, + const typename tnlSlicedEllpackMatrix< Real, Device, Index >::RowLengthsVector& rowLengths ) + { + Index row( 0 ), slice( 0 ), sliceRowLength( 0 ); + while( row < matrix.getRows() ) + { + sliceRowLength = Max( rowLengths.getElement( row++ ), sliceRowLength ); + if( row % SliceSize == 0 ) + { + matrix.sliceRowLengths.setElement( slice, sliceRowLength ); + matrix.slicePointers.setElement( slice++, sliceRowLength * SliceSize ); + sliceRowLength = 0; + } + } + if( row % SliceSize != 0 ) + { + matrix.sliceRowLengths.setElement( slice, sliceRowLength ); + matrix.slicePointers.setElement( slice++, sliceRowLength * SliceSize ); + } + } +}; + +#ifdef HAVE_CUDA +template< typename Real, + typename Device, + typename Index, + int SliceSize > +__global__ void tnlSlicedEllpackMatrix_compuetMaximalRowLengthInSlices_CudaKernel<<< cudaGridSize, cudaBlockSize >>> + ( tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >* matrix, + const typename tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >::RowLentghsVector* rowLengths, + int gridIdx ) +{ + +} +#endif + +template<> +class tnlSlicedEllpackMatrixDeviceDependentCode< tnlCuda > +{ + public: + + typedef tnlCuda Device; + + template< typename Real, + typename Index, + int SliceSize > +#ifdef HAVE_CUDA + __device__ __host__ +#endif + static Index getRowBegin( const tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >& matrix, + const Index row ) + { + return row * matrix.rowLengths; + } + + template< typename Real, + typename Index, + int SliceSize > +#ifdef HAVE_CUDA + __device__ __host__ +#endif + static Index getRowEnd( const tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >& matrix, + const Index row ) + { + return ( row + 1 ) * matrix.rowLengths; + } + + template< typename Real, + typename Index, + int SliceSize > +#ifdef HAVE_CUDA + __device__ __host__ +#endif + static Index getElementStep( const tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >& matrix ) + { + return 1; + } + + template< typename Real, + typename Index, + int SliceSize > + static bool computeMaximalRowLengthInSlices( tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >& matrix, + const typename tnlSlicedEllpackMatrix< Real, Device, Index >::RowLengthsVector& rowLengths ) + { +#ifdef HAVE_CUDA + typedef tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize > Matrix; + typedef typename Matrix::RowLengthsVector RowLengthsVector; + Matrix* kernel_matrix = tnlCuda::passToDevice( matrix ); + RowLengthsVector* kernel_rowLengths = tnlCuda::passToDevice( rowLengths ); + const Index numberOfSlices = roundUpDivision( matrix.getRows(), SliceSize ); + dim3 cudaBlockSize( 256 ), cudaGridSize( tnlCuda::getMaxGridSize() ); + const IndexType cudaBlocks = roundUpDivision( numberOfSlices, cudaBlockSize.x ); + const IndexType cudaGrids = roundUpDivision( cudaBlocks, tnlCuda::getMaxGridSize() ); + for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) + { + if( gridIdx == cudaGrids - 1 ) + cudaGridSize.x = cudaBlocks % tnlCuda::getMaxGridSize(); + tnlSlicedEllpackMatrix_compuetMaximalRowLengthInSlices_CudaKernel<<< cudaGridSize, cudaBlockSize >>> + ( kernel_matrix, + kernel_rowLengths, + gridIdx ); + } + tnlCuda::freeFromDevice( kernel_matrix ); + tnlCuda::freeFromDevice( kernel_rowLengths ); + checkCudaDevice; +#endif + } +}; + + + #endif /* TNLSLICEDELLPACKMATRIX_IMPL_H_ */ diff --git a/src/matrices/tnlSlicedEllpackMatrix.h b/src/matrices/tnlSlicedEllpackMatrix.h index 29a20a0d24..847b17fd27 100644 --- a/src/matrices/tnlSlicedEllpackMatrix.h +++ b/src/matrices/tnlSlicedEllpackMatrix.h @@ -21,6 +21,9 @@ #include <matrices/tnlSparseMatrix.h> #include <core/vectors/tnlVector.h> +template< typename Device > +class tnlSlicedEllpackMatrixDeviceDependentCode; + template< typename Real = double, typename Device = tnlHost, typename Index = int, @@ -32,7 +35,10 @@ class tnlSlicedEllpackMatrix : public tnlSparseMatrix< Real, Device, Index > typedef Real RealType; typedef Device DeviceType; typedef Index IndexType; - typedef typename tnlSparseMatrix< RealType, DeviceType, IndexType >:: RowLengthsVector RowLengthsVector; + typedef typename tnlSparseMatrix< RealType, DeviceType, IndexType >::RowLengthsVector RowLengthsVector; + typedef typename tnlSparseMatrix< RealType, DeviceType, IndexType >::ValuesVector ValuesVector; + typedef typename tnlSparseMatrix< RealType, DeviceType, IndexType >::ColumnIndexesVector ColumnIndexesVector; + typedef tnlSlicedEllpackMatrix< Real, Device, Index > ThisType; tnlSlicedEllpackMatrix(); @@ -130,6 +136,13 @@ class tnlSlicedEllpackMatrix : public tnlSparseMatrix< Real, Device, Index > IndexType* columns, RealType* values ) const; + template< typename Vector > + #ifdef HAVE_CUDA + __device__ __host__ + #endif + typename Vector::RealType rowVectorProduct( const IndexType row, + const Vector& vector ) const; + template< typename Vector > void vectorProduct( const Vector& inVector, Vector& outVector ) const; @@ -163,6 +176,9 @@ class tnlSlicedEllpackMatrix : public tnlSparseMatrix< Real, Device, Index > tnlVector< Index, Device, Index > slicePointers, sliceRowLengths; + typedef tnlSlicedEllpackMatrixDeviceDependentCode< DeviceType > DeviceDependentCode; + friend class tnlSlicedEllpackMatrixDeviceDependentCode< DeviceType >; + }; #include <implementation/matrices/tnlSlicedEllpackMatrix_impl.h> -- GitLab