Loading src/implementation/matrices/tnlEllpackMatrix_impl.h +3 −3 Original line number Diff line number Diff line Loading @@ -668,7 +668,7 @@ class tnlEllpackMatrixDeviceDependentCode< tnlCuda > static Index getRowBegin( const tnlEllpackMatrix< Real, Device, Index >& matrix, const Index row ) { return row * matrix.rowLengths; return row; } template< typename Real, Loading @@ -679,7 +679,7 @@ class tnlEllpackMatrixDeviceDependentCode< tnlCuda > static Index getRowEnd( const tnlEllpackMatrix< Real, Device, Index >& matrix, const Index row ) { return ( row + 1 ) * matrix.rowLengths; return row + getElementStep( matrix ) * matrix.rowLengths; } template< typename Real, Loading @@ -689,7 +689,7 @@ class tnlEllpackMatrixDeviceDependentCode< tnlCuda > #endif static Index getElementStep( const tnlEllpackMatrix< Real, Device, Index >& matrix ) { return 1; return matrix.alignedRows; } }; Loading src/implementation/matrices/tnlSlicedEllpackMatrix_impl.h +148 −16 Original line number Diff line number Diff line Loading @@ -75,26 +75,11 @@ bool tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >::setRowLengths( co if( ! this->sliceRowLengths.setSize( slices ) || ! this->slicePointers.setSize( slices + 1 ) ) return false; IndexType row( 0 ), slice( 0 ), sliceRowLength( 0 ); /**** * Compute maximal row length in each slice */ while( row < this->rows ) { sliceRowLength = Max( rowLengths.getElement( row++ ), sliceRowLength ); if( row % SliceSize == 0 ) { this->sliceRowLengths.setElement( slice, sliceRowLength ); this->slicePointers.setElement( slice++, sliceRowLength*SliceSize ); sliceRowLength = 0; } } if( row % SliceSize != 0 ) { this->sliceRowLengths.setElement( slice, sliceRowLength ); this->slicePointers.setElement( slice++, sliceRowLength*SliceSize ); } DeviceDependentCode::computeMaximalRowLengthInSlices( *this, rowLengths ); /**** * Compute the slice pointers using the exclusive prefix sum Loading Loading @@ -567,4 +552,151 @@ void tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >::print( ostream& s } } template<> class tnlSlicedEllpackMatrixDeviceDependentCode< tnlHost > { public: typedef tnlHost Device; template< typename Real, typename Index, int SliceSize > static Index getRowBegin( const tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >& matrix, const Index row ) { return row * matrix.rowLengths; } template< typename Real, typename Index, int SliceSize > static Index getRowEnd( const tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >& matrix, const Index row ) { return ( row + 1 ) * matrix.rowLengths; } template< typename Real, typename Index, int SliceSize > static Index getElementStep( const tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >& matrix ) { return 1; } template< typename Real, typename Index, int SliceSize > static bool computeMaximalRowLengthInSlices( tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >& matrix, const typename tnlSlicedEllpackMatrix< Real, Device, Index >::RowLengthsVector& rowLengths ) { Index row( 0 ), slice( 0 ), sliceRowLength( 0 ); while( row < matrix.getRows() ) { sliceRowLength = Max( rowLengths.getElement( row++ ), sliceRowLength ); if( row % SliceSize == 0 ) { matrix.sliceRowLengths.setElement( slice, sliceRowLength ); matrix.slicePointers.setElement( slice++, sliceRowLength * SliceSize ); sliceRowLength = 0; } } if( row % SliceSize != 0 ) { matrix.sliceRowLengths.setElement( slice, sliceRowLength ); matrix.slicePointers.setElement( slice++, sliceRowLength * SliceSize ); } } }; #ifdef HAVE_CUDA template< typename Real, typename Device, typename Index, int SliceSize > __global__ void tnlSlicedEllpackMatrix_compuetMaximalRowLengthInSlices_CudaKernel<<< cudaGridSize, cudaBlockSize >>> ( tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >* matrix, const typename tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >::RowLentghsVector* rowLengths, int gridIdx ) { } #endif template<> class tnlSlicedEllpackMatrixDeviceDependentCode< tnlCuda > { public: typedef tnlCuda Device; template< typename Real, typename Index, int SliceSize > #ifdef HAVE_CUDA __device__ __host__ #endif static Index getRowBegin( const tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >& matrix, const Index row ) { return row * matrix.rowLengths; } template< typename Real, typename Index, int SliceSize > #ifdef HAVE_CUDA __device__ __host__ #endif static Index getRowEnd( const tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >& matrix, const Index row ) { return ( row + 1 ) * matrix.rowLengths; } template< typename Real, typename Index, int SliceSize > #ifdef HAVE_CUDA __device__ __host__ #endif static Index getElementStep( const tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >& matrix ) { return 1; } template< typename Real, typename Index, int SliceSize > static bool computeMaximalRowLengthInSlices( tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >& matrix, const typename tnlSlicedEllpackMatrix< Real, Device, Index >::RowLengthsVector& rowLengths ) { #ifdef HAVE_CUDA typedef tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize > Matrix; typedef typename Matrix::RowLengthsVector RowLengthsVector; Matrix* kernel_matrix = tnlCuda::passToDevice( matrix ); RowLengthsVector* kernel_rowLengths = tnlCuda::passToDevice( rowLengths ); const Index numberOfSlices = roundUpDivision( matrix.getRows(), SliceSize ); dim3 cudaBlockSize( 256 ), cudaGridSize( tnlCuda::getMaxGridSize() ); const IndexType cudaBlocks = roundUpDivision( numberOfSlices, cudaBlockSize.x ); const IndexType cudaGrids = roundUpDivision( cudaBlocks, tnlCuda::getMaxGridSize() ); for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) { if( gridIdx == cudaGrids - 1 ) cudaGridSize.x = cudaBlocks % tnlCuda::getMaxGridSize(); tnlSlicedEllpackMatrix_compuetMaximalRowLengthInSlices_CudaKernel<<< cudaGridSize, cudaBlockSize >>> ( kernel_matrix, kernel_rowLengths, gridIdx ); } tnlCuda::freeFromDevice( kernel_matrix ); tnlCuda::freeFromDevice( kernel_rowLengths ); checkCudaDevice; #endif } }; #endif /* TNLSLICEDELLPACKMATRIX_IMPL_H_ */ src/matrices/tnlSlicedEllpackMatrix.h +17 −1 Original line number Diff line number Diff line Loading @@ -21,6 +21,9 @@ #include <matrices/tnlSparseMatrix.h> #include <core/vectors/tnlVector.h> template< typename Device > class tnlSlicedEllpackMatrixDeviceDependentCode; template< typename Real = double, typename Device = tnlHost, typename Index = int, Loading @@ -33,6 +36,9 @@ class tnlSlicedEllpackMatrix : public tnlSparseMatrix< Real, Device, Index > typedef Device DeviceType; typedef Index IndexType; typedef typename tnlSparseMatrix< RealType, DeviceType, IndexType >::RowLengthsVector RowLengthsVector; typedef typename tnlSparseMatrix< RealType, DeviceType, IndexType >::ValuesVector ValuesVector; typedef typename tnlSparseMatrix< RealType, DeviceType, IndexType >::ColumnIndexesVector ColumnIndexesVector; typedef tnlSlicedEllpackMatrix< Real, Device, Index > ThisType; tnlSlicedEllpackMatrix(); Loading Loading @@ -130,6 +136,13 @@ class tnlSlicedEllpackMatrix : public tnlSparseMatrix< Real, Device, Index > IndexType* columns, RealType* values ) const; template< typename Vector > #ifdef HAVE_CUDA __device__ __host__ #endif typename Vector::RealType rowVectorProduct( const IndexType row, const Vector& vector ) const; template< typename Vector > void vectorProduct( const Vector& inVector, Vector& outVector ) const; Loading Loading @@ -163,6 +176,9 @@ class tnlSlicedEllpackMatrix : public tnlSparseMatrix< Real, Device, Index > tnlVector< Index, Device, Index > slicePointers, sliceRowLengths; typedef tnlSlicedEllpackMatrixDeviceDependentCode< DeviceType > DeviceDependentCode; friend class tnlSlicedEllpackMatrixDeviceDependentCode< DeviceType >; }; #include <implementation/matrices/tnlSlicedEllpackMatrix_impl.h> Loading Loading
src/implementation/matrices/tnlEllpackMatrix_impl.h +3 −3 Original line number Diff line number Diff line Loading @@ -668,7 +668,7 @@ class tnlEllpackMatrixDeviceDependentCode< tnlCuda > static Index getRowBegin( const tnlEllpackMatrix< Real, Device, Index >& matrix, const Index row ) { return row * matrix.rowLengths; return row; } template< typename Real, Loading @@ -679,7 +679,7 @@ class tnlEllpackMatrixDeviceDependentCode< tnlCuda > static Index getRowEnd( const tnlEllpackMatrix< Real, Device, Index >& matrix, const Index row ) { return ( row + 1 ) * matrix.rowLengths; return row + getElementStep( matrix ) * matrix.rowLengths; } template< typename Real, Loading @@ -689,7 +689,7 @@ class tnlEllpackMatrixDeviceDependentCode< tnlCuda > #endif static Index getElementStep( const tnlEllpackMatrix< Real, Device, Index >& matrix ) { return 1; return matrix.alignedRows; } }; Loading
src/implementation/matrices/tnlSlicedEllpackMatrix_impl.h +148 −16 Original line number Diff line number Diff line Loading @@ -75,26 +75,11 @@ bool tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >::setRowLengths( co if( ! this->sliceRowLengths.setSize( slices ) || ! this->slicePointers.setSize( slices + 1 ) ) return false; IndexType row( 0 ), slice( 0 ), sliceRowLength( 0 ); /**** * Compute maximal row length in each slice */ while( row < this->rows ) { sliceRowLength = Max( rowLengths.getElement( row++ ), sliceRowLength ); if( row % SliceSize == 0 ) { this->sliceRowLengths.setElement( slice, sliceRowLength ); this->slicePointers.setElement( slice++, sliceRowLength*SliceSize ); sliceRowLength = 0; } } if( row % SliceSize != 0 ) { this->sliceRowLengths.setElement( slice, sliceRowLength ); this->slicePointers.setElement( slice++, sliceRowLength*SliceSize ); } DeviceDependentCode::computeMaximalRowLengthInSlices( *this, rowLengths ); /**** * Compute the slice pointers using the exclusive prefix sum Loading Loading @@ -567,4 +552,151 @@ void tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >::print( ostream& s } } template<> class tnlSlicedEllpackMatrixDeviceDependentCode< tnlHost > { public: typedef tnlHost Device; template< typename Real, typename Index, int SliceSize > static Index getRowBegin( const tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >& matrix, const Index row ) { return row * matrix.rowLengths; } template< typename Real, typename Index, int SliceSize > static Index getRowEnd( const tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >& matrix, const Index row ) { return ( row + 1 ) * matrix.rowLengths; } template< typename Real, typename Index, int SliceSize > static Index getElementStep( const tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >& matrix ) { return 1; } template< typename Real, typename Index, int SliceSize > static bool computeMaximalRowLengthInSlices( tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >& matrix, const typename tnlSlicedEllpackMatrix< Real, Device, Index >::RowLengthsVector& rowLengths ) { Index row( 0 ), slice( 0 ), sliceRowLength( 0 ); while( row < matrix.getRows() ) { sliceRowLength = Max( rowLengths.getElement( row++ ), sliceRowLength ); if( row % SliceSize == 0 ) { matrix.sliceRowLengths.setElement( slice, sliceRowLength ); matrix.slicePointers.setElement( slice++, sliceRowLength * SliceSize ); sliceRowLength = 0; } } if( row % SliceSize != 0 ) { matrix.sliceRowLengths.setElement( slice, sliceRowLength ); matrix.slicePointers.setElement( slice++, sliceRowLength * SliceSize ); } } }; #ifdef HAVE_CUDA template< typename Real, typename Device, typename Index, int SliceSize > __global__ void tnlSlicedEllpackMatrix_compuetMaximalRowLengthInSlices_CudaKernel<<< cudaGridSize, cudaBlockSize >>> ( tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >* matrix, const typename tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >::RowLentghsVector* rowLengths, int gridIdx ) { } #endif template<> class tnlSlicedEllpackMatrixDeviceDependentCode< tnlCuda > { public: typedef tnlCuda Device; template< typename Real, typename Index, int SliceSize > #ifdef HAVE_CUDA __device__ __host__ #endif static Index getRowBegin( const tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >& matrix, const Index row ) { return row * matrix.rowLengths; } template< typename Real, typename Index, int SliceSize > #ifdef HAVE_CUDA __device__ __host__ #endif static Index getRowEnd( const tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >& matrix, const Index row ) { return ( row + 1 ) * matrix.rowLengths; } template< typename Real, typename Index, int SliceSize > #ifdef HAVE_CUDA __device__ __host__ #endif static Index getElementStep( const tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >& matrix ) { return 1; } template< typename Real, typename Index, int SliceSize > static bool computeMaximalRowLengthInSlices( tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >& matrix, const typename tnlSlicedEllpackMatrix< Real, Device, Index >::RowLengthsVector& rowLengths ) { #ifdef HAVE_CUDA typedef tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize > Matrix; typedef typename Matrix::RowLengthsVector RowLengthsVector; Matrix* kernel_matrix = tnlCuda::passToDevice( matrix ); RowLengthsVector* kernel_rowLengths = tnlCuda::passToDevice( rowLengths ); const Index numberOfSlices = roundUpDivision( matrix.getRows(), SliceSize ); dim3 cudaBlockSize( 256 ), cudaGridSize( tnlCuda::getMaxGridSize() ); const IndexType cudaBlocks = roundUpDivision( numberOfSlices, cudaBlockSize.x ); const IndexType cudaGrids = roundUpDivision( cudaBlocks, tnlCuda::getMaxGridSize() ); for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) { if( gridIdx == cudaGrids - 1 ) cudaGridSize.x = cudaBlocks % tnlCuda::getMaxGridSize(); tnlSlicedEllpackMatrix_compuetMaximalRowLengthInSlices_CudaKernel<<< cudaGridSize, cudaBlockSize >>> ( kernel_matrix, kernel_rowLengths, gridIdx ); } tnlCuda::freeFromDevice( kernel_matrix ); tnlCuda::freeFromDevice( kernel_rowLengths ); checkCudaDevice; #endif } }; #endif /* TNLSLICEDELLPACKMATRIX_IMPL_H_ */
src/matrices/tnlSlicedEllpackMatrix.h +17 −1 Original line number Diff line number Diff line Loading @@ -21,6 +21,9 @@ #include <matrices/tnlSparseMatrix.h> #include <core/vectors/tnlVector.h> template< typename Device > class tnlSlicedEllpackMatrixDeviceDependentCode; template< typename Real = double, typename Device = tnlHost, typename Index = int, Loading @@ -33,6 +36,9 @@ class tnlSlicedEllpackMatrix : public tnlSparseMatrix< Real, Device, Index > typedef Device DeviceType; typedef Index IndexType; typedef typename tnlSparseMatrix< RealType, DeviceType, IndexType >::RowLengthsVector RowLengthsVector; typedef typename tnlSparseMatrix< RealType, DeviceType, IndexType >::ValuesVector ValuesVector; typedef typename tnlSparseMatrix< RealType, DeviceType, IndexType >::ColumnIndexesVector ColumnIndexesVector; typedef tnlSlicedEllpackMatrix< Real, Device, Index > ThisType; tnlSlicedEllpackMatrix(); Loading Loading @@ -130,6 +136,13 @@ class tnlSlicedEllpackMatrix : public tnlSparseMatrix< Real, Device, Index > IndexType* columns, RealType* values ) const; template< typename Vector > #ifdef HAVE_CUDA __device__ __host__ #endif typename Vector::RealType rowVectorProduct( const IndexType row, const Vector& vector ) const; template< typename Vector > void vectorProduct( const Vector& inVector, Vector& outVector ) const; Loading Loading @@ -163,6 +176,9 @@ class tnlSlicedEllpackMatrix : public tnlSparseMatrix< Real, Device, Index > tnlVector< Index, Device, Index > slicePointers, sliceRowLengths; typedef tnlSlicedEllpackMatrixDeviceDependentCode< DeviceType > DeviceDependentCode; friend class tnlSlicedEllpackMatrixDeviceDependentCode< DeviceType >; }; #include <implementation/matrices/tnlSlicedEllpackMatrix_impl.h> Loading