diff --git a/src/implementation/matrices/tnlEllpackMatrix_impl.h b/src/implementation/matrices/tnlEllpackMatrix_impl.h index a206aaf1983702c2d1ec1a3ede94b21bbc8835cc..0a3028899c471b56cd84573c91b52ddb538ed803 100644 --- a/src/implementation/matrices/tnlEllpackMatrix_impl.h +++ b/src/implementation/matrices/tnlEllpackMatrix_impl.h @@ -668,7 +668,7 @@ class tnlEllpackMatrixDeviceDependentCode< tnlCuda > static Index getRowBegin( const tnlEllpackMatrix< Real, Device, Index >& matrix, const Index row ) { - return row * matrix.rowLengths; + return row; } template< typename Real, @@ -679,7 +679,7 @@ class tnlEllpackMatrixDeviceDependentCode< tnlCuda > static Index getRowEnd( const tnlEllpackMatrix< Real, Device, Index >& matrix, const Index row ) { - return ( row + 1 ) * matrix.rowLengths; + return row + getElementStep( matrix ) * matrix.rowLengths; } template< typename Real, @@ -689,7 +689,7 @@ class tnlEllpackMatrixDeviceDependentCode< tnlCuda > #endif static Index getElementStep( const tnlEllpackMatrix< Real, Device, Index >& matrix ) { - return 1; + return matrix.alignedRows; } }; diff --git a/src/implementation/matrices/tnlSlicedEllpackMatrix_impl.h b/src/implementation/matrices/tnlSlicedEllpackMatrix_impl.h index 74a839d979a7f0c33ee19898b3146b7ac4cb165b..22e1c79a9e03e76f4c0a86f5269a346a2d5641b8 100644 --- a/src/implementation/matrices/tnlSlicedEllpackMatrix_impl.h +++ b/src/implementation/matrices/tnlSlicedEllpackMatrix_impl.h @@ -75,26 +75,11 @@ bool tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >::setRowLengths( co if( ! this->sliceRowLengths.setSize( slices ) || ! this->slicePointers.setSize( slices + 1 ) ) return false; - IndexType row( 0 ), slice( 0 ), sliceRowLength( 0 ); /**** * Compute maximal row length in each slice */ - while( row < this->rows ) - { - sliceRowLength = Max( rowLengths.getElement( row++ ), sliceRowLength ); - if( row % SliceSize == 0 ) - { - this->sliceRowLengths.setElement( slice, sliceRowLength ); - this->slicePointers.setElement( slice++, sliceRowLength*SliceSize ); - sliceRowLength = 0; - } - } - if( row % SliceSize != 0 ) - { - this->sliceRowLengths.setElement( slice, sliceRowLength ); - this->slicePointers.setElement( slice++, sliceRowLength*SliceSize ); - } + DeviceDependentCode::computeMaximalRowLengthInSlices( *this, rowLengths ); /**** * Compute the slice pointers using the exclusive prefix sum @@ -567,4 +552,151 @@ void tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >::print( ostream& s } } +template<> +class tnlSlicedEllpackMatrixDeviceDependentCode< tnlHost > +{ + public: + + typedef tnlHost Device; + + template< typename Real, + typename Index, + int SliceSize > + static Index getRowBegin( const tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >& matrix, + const Index row ) + { + return row * matrix.rowLengths; + } + + template< typename Real, + typename Index, + int SliceSize > + static Index getRowEnd( const tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >& matrix, + const Index row ) + { + return ( row + 1 ) * matrix.rowLengths; + } + + template< typename Real, + typename Index, + int SliceSize > + static Index getElementStep( const tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >& matrix ) + { + return 1; + } + + template< typename Real, + typename Index, + int SliceSize > + static bool computeMaximalRowLengthInSlices( tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >& matrix, + const typename tnlSlicedEllpackMatrix< Real, Device, Index >::RowLengthsVector& rowLengths ) + { + Index row( 0 ), slice( 0 ), sliceRowLength( 0 ); + while( row < matrix.getRows() ) + { + sliceRowLength = Max( rowLengths.getElement( row++ ), sliceRowLength ); + if( row % SliceSize == 0 ) + { + matrix.sliceRowLengths.setElement( slice, sliceRowLength ); + matrix.slicePointers.setElement( slice++, sliceRowLength * SliceSize ); + sliceRowLength = 0; + } + } + if( row % SliceSize != 0 ) + { + matrix.sliceRowLengths.setElement( slice, sliceRowLength ); + matrix.slicePointers.setElement( slice++, sliceRowLength * SliceSize ); + } + } +}; + +#ifdef HAVE_CUDA +template< typename Real, + typename Device, + typename Index, + int SliceSize > +__global__ void tnlSlicedEllpackMatrix_compuetMaximalRowLengthInSlices_CudaKernel<<< cudaGridSize, cudaBlockSize >>> + ( tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >* matrix, + const typename tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >::RowLentghsVector* rowLengths, + int gridIdx ) +{ + +} +#endif + +template<> +class tnlSlicedEllpackMatrixDeviceDependentCode< tnlCuda > +{ + public: + + typedef tnlCuda Device; + + template< typename Real, + typename Index, + int SliceSize > +#ifdef HAVE_CUDA + __device__ __host__ +#endif + static Index getRowBegin( const tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >& matrix, + const Index row ) + { + return row * matrix.rowLengths; + } + + template< typename Real, + typename Index, + int SliceSize > +#ifdef HAVE_CUDA + __device__ __host__ +#endif + static Index getRowEnd( const tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >& matrix, + const Index row ) + { + return ( row + 1 ) * matrix.rowLengths; + } + + template< typename Real, + typename Index, + int SliceSize > +#ifdef HAVE_CUDA + __device__ __host__ +#endif + static Index getElementStep( const tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >& matrix ) + { + return 1; + } + + template< typename Real, + typename Index, + int SliceSize > + static bool computeMaximalRowLengthInSlices( tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >& matrix, + const typename tnlSlicedEllpackMatrix< Real, Device, Index >::RowLengthsVector& rowLengths ) + { +#ifdef HAVE_CUDA + typedef tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize > Matrix; + typedef typename Matrix::RowLengthsVector RowLengthsVector; + Matrix* kernel_matrix = tnlCuda::passToDevice( matrix ); + RowLengthsVector* kernel_rowLengths = tnlCuda::passToDevice( rowLengths ); + const Index numberOfSlices = roundUpDivision( matrix.getRows(), SliceSize ); + dim3 cudaBlockSize( 256 ), cudaGridSize( tnlCuda::getMaxGridSize() ); + const IndexType cudaBlocks = roundUpDivision( numberOfSlices, cudaBlockSize.x ); + const IndexType cudaGrids = roundUpDivision( cudaBlocks, tnlCuda::getMaxGridSize() ); + for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) + { + if( gridIdx == cudaGrids - 1 ) + cudaGridSize.x = cudaBlocks % tnlCuda::getMaxGridSize(); + tnlSlicedEllpackMatrix_compuetMaximalRowLengthInSlices_CudaKernel<<< cudaGridSize, cudaBlockSize >>> + ( kernel_matrix, + kernel_rowLengths, + gridIdx ); + } + tnlCuda::freeFromDevice( kernel_matrix ); + tnlCuda::freeFromDevice( kernel_rowLengths ); + checkCudaDevice; +#endif + } +}; + + + #endif /* TNLSLICEDELLPACKMATRIX_IMPL_H_ */ diff --git a/src/matrices/tnlSlicedEllpackMatrix.h b/src/matrices/tnlSlicedEllpackMatrix.h index 29a20a0d243589fbca5e9b11797ca177c4cd3095..847b17fd275b01a4b3d1e85eebc40835348f3b17 100644 --- a/src/matrices/tnlSlicedEllpackMatrix.h +++ b/src/matrices/tnlSlicedEllpackMatrix.h @@ -21,6 +21,9 @@ #include <matrices/tnlSparseMatrix.h> #include <core/vectors/tnlVector.h> +template< typename Device > +class tnlSlicedEllpackMatrixDeviceDependentCode; + template< typename Real = double, typename Device = tnlHost, typename Index = int, @@ -32,7 +35,10 @@ class tnlSlicedEllpackMatrix : public tnlSparseMatrix< Real, Device, Index > typedef Real RealType; typedef Device DeviceType; typedef Index IndexType; - typedef typename tnlSparseMatrix< RealType, DeviceType, IndexType >:: RowLengthsVector RowLengthsVector; + typedef typename tnlSparseMatrix< RealType, DeviceType, IndexType >::RowLengthsVector RowLengthsVector; + typedef typename tnlSparseMatrix< RealType, DeviceType, IndexType >::ValuesVector ValuesVector; + typedef typename tnlSparseMatrix< RealType, DeviceType, IndexType >::ColumnIndexesVector ColumnIndexesVector; + typedef tnlSlicedEllpackMatrix< Real, Device, Index > ThisType; tnlSlicedEllpackMatrix(); @@ -130,6 +136,13 @@ class tnlSlicedEllpackMatrix : public tnlSparseMatrix< Real, Device, Index > IndexType* columns, RealType* values ) const; + template< typename Vector > + #ifdef HAVE_CUDA + __device__ __host__ + #endif + typename Vector::RealType rowVectorProduct( const IndexType row, + const Vector& vector ) const; + template< typename Vector > void vectorProduct( const Vector& inVector, Vector& outVector ) const; @@ -163,6 +176,9 @@ class tnlSlicedEllpackMatrix : public tnlSparseMatrix< Real, Device, Index > tnlVector< Index, Device, Index > slicePointers, sliceRowLengths; + typedef tnlSlicedEllpackMatrixDeviceDependentCode< DeviceType > DeviceDependentCode; + friend class tnlSlicedEllpackMatrixDeviceDependentCode< DeviceType >; + }; #include <implementation/matrices/tnlSlicedEllpackMatrix_impl.h>