Loading .gitignore +2 −0 Original line number Diff line number Diff line Loading @@ -14,6 +14,8 @@ /Debug /Release .settings # /src/ /src/Makefile.in Loading CMakeLists.txt +1 −0 Original line number Diff line number Diff line Loading @@ -34,6 +34,7 @@ endif() if( WITH_CUDA STREQUAL "yes" ) AddCompilerFlag( "-DHAVE_NOT_CXX11 -U_GLIBCXX_ATOMIC_BUILTINS -U_GLIBCXX_USE_INT128" ) set( CUDA_LINKER_OPTIONS "-arch sm_20 -shared" ) # TODO: fix this in the rest of cmake files else() AddCompilerFlag( "-std=gnu++0x" ) endif() Loading install +1 −1 Original line number Diff line number Diff line Loading @@ -2,7 +2,7 @@ TARGET=TNL INSTALL_PREFIX=${HOME}/local WITH_CUDA=no WITH_CUDA=yes WITH_CUSPARSE=no CUDA_ARCHITECTURE=2.0 TEMPLATE_EXPLICIT_INSTANTIATION=yes Loading src/implementation/matrices/tnlTridiagonalMatrix_impl.h +216 −66 Original line number Diff line number Diff line Loading @@ -21,6 +21,85 @@ #include <core/tnlAssert.h> #include <matrices/tnlTridiagonalMatrix.h> template< typename Device > class tnlTridiagonalMatrixDeviceDependentCode; template<> class tnlTridiagonalMatrixDeviceDependentCode< tnlHost > { public: template< typename Index > static Index getElementIndex( const Index rows, const Index row, const Index column ) { return 3*row + column - row; } template< typename Vector, typename Index, typename ValuesType > static typename Vector::RealType rowVectorProduct( const Index rows, const ValuesType& values, const Index row, const Vector& vector ) { if( row == 0 ) return vector[ 0 ] * values[ 0 ] + vector[ 1 ] * values[ 1 ]; Index i = 3 * row - 1; if( row == rows - 1 ) return vector[ row - 1 ] * values[ i++ ] + vector[ row ] * values[ i ]; return vector[ row - 1 ] * values[ i++ ] + vector[ row ] * values[ i++ ] + vector[ row + 1 ] * values[ i ]; } }; template<> class tnlTridiagonalMatrixDeviceDependentCode< tnlCuda > { public: template< typename Index > #ifdef HAVE_CUDA __device__ __host__ #endif static Index getElementIndex( const Index rows, const Index row, const Index column ) { return ( column - row + 1 )*rows + row - 1; } template< typename Vector, typename Index, typename ValuesType > #ifdef HAVE_CUDA __device__ #endif static typename Vector::RealType rowVectorProduct( const Index rows, const ValuesType& values, const Index row, const Vector& vector ) { if( row == 0 ) return vector[ 0 ] * values[ 0 ] + vector[ 1 ] * values[ rows - 1 ]; Index i = row - 1; if( row == rows - 1 ) return vector[ row - 1 ] * values[ i ] + vector[ row ] * values[ i + rows ]; return vector[ row - 1 ] * values[ i ] + vector[ row ] * values[ i + rows ] + vector[ row + 1 ] * values[ i + 2*rows ]; } }; template< typename Real, typename Device, typename Index > Loading Loading @@ -58,6 +137,7 @@ bool tnlTridiagonalMatrix< Real, Device, Index >::setDimensions( const IndexType if( ! values.setSize( 3*Min( rows, columns ) ) ) return false; this->values.setValue( 0.0 ); return true; } template< typename Real, Loading Loading @@ -174,7 +254,7 @@ bool tnlTridiagonalMatrix< Real, Device, Index >::setElementFast( const IndexTyp const IndexType column, const RealType& value ) { this->values.setElement( this->getElementIndex( row, column ), value ); this->values[ this->getElementIndex( row, column ) ] = value; return true; } Loading @@ -189,7 +269,6 @@ bool tnlTridiagonalMatrix< Real, Device, Index >::setElement( const IndexType ro return true; } template< typename Real, typename Device, typename Index > Loading @@ -201,7 +280,8 @@ bool tnlTridiagonalMatrix< Real, Device, Index >::addElementFast( const IndexTyp const RealType& value, const RealType& thisElementMultiplicator ) { this->values.addElement( this->getElementIndex( row, column ), value, thisElementMultiplicator ); const Index i = this->getElementIndex( row, column ); this->values[ i ] = thisElementMultiplicator*this->values[ i ] + value; } template< typename Real, Loading @@ -212,10 +292,10 @@ bool tnlTridiagonalMatrix< Real, Device, Index >::addElement( const IndexType ro const RealType& value, const RealType& thisElementMultiplicator ) { //this->values.addElement( this->getElementIndex( row, column ), value, thisElementMultiplicator ); const Index i = this->getElementIndex( row, column ); this->values.setElement( i, thisElementMultiplicator * this->values.getElement( i ) + value ); } template< typename Real, typename Device, typename Index > Loading @@ -231,7 +311,7 @@ bool tnlTridiagonalMatrix< Real, Device, Index >::setRowFast( const IndexType ro cerr << " elements = " << elements << " this->columns = " << this->columns << " this->getName() = " << this->getName() ); return this->addRow( row, columns, values, elements, 0.0 ); return this->addRowFast( row, columns, values, elements, 0.0 ); } template< typename Real, Loading @@ -246,10 +326,9 @@ bool tnlTridiagonalMatrix< Real, Device, Index >::setRow( const IndexType row, cerr << " elements = " << elements << " this->columns = " << this->columns << " this->getName() = " << this->getName() ); //return this->addRow( row, columns, values, elements, 0.0 ); return this->addRow( row, columns, values, elements, 0.0 ); } template< typename Real, typename Device, typename Index > Loading @@ -273,7 +352,7 @@ bool tnlTridiagonalMatrix< Real, Device, Index >::addRowFast( const IndexType ro const IndexType& column = columns[ i ]; if( column < row - 1 || column > row + 1 ) return false; addElement( row, column, values[ i ], thisRowMultiplicator ); addElementFast( row, column, values[ i ], thisRowMultiplicator ); } return true; } Loading @@ -291,16 +370,16 @@ bool tnlTridiagonalMatrix< Real, Device, Index >::addRow( const IndexType row, cerr << " elements = " << elements << " this->columns = " << this->columns << " this->getName() = " << this->getName() ); /*if( elements > 3 ) if( elements > 3 ) return false; for( IndexType i = 0; i < elements; i++ ) { const IndexType& column = columns[ i ]; const IndexType column = columns[ i ]; if( column < row - 1 || column > row + 1 ) return false; addElement( row, column, values[ i ], thisRowMultiplicator ); } return true;*/ return true; } template< typename Real, Loading @@ -314,7 +393,7 @@ Real tnlTridiagonalMatrix< Real, Device, Index >::getElementFast( const IndexTyp { if( abs( column - row ) > 1 ) return 0.0; return this->values.getElement( this->getElementIndex( row, column ) ); return this->values[ this->getElementIndex( row, column ) ]; } template< typename Real, Loading @@ -323,10 +402,11 @@ template< typename Real, Real tnlTridiagonalMatrix< Real, Device, Index >::getElement( const IndexType row, const IndexType column ) const { return this->getElementFast( row, column ); if( abs( column - row ) > 1 ) return 0.0; return this->values.getElement( this->getElementIndex( row, column ) ); } template< typename Real, typename Device, typename Index > Loading Loading @@ -357,56 +437,50 @@ void tnlTridiagonalMatrix< Real, Device, Index >::getRow( const IndexType row, IndexType* columns, RealType* values ) const { /*IndexType elementPointer( 0 ); IndexType elementPointer( 0 ); for( IndexType i = -1; i <= 1; i++ ) { const IndexType column = row + 1; if( column >= 0 && column < this->getColumns() ) { columns[ elementPointer ] = column; values[ elementPointer ] = this->values[ this->getElementIndex( row, column ) ]; values[ elementPointer ] = this->values.getElement( this->getElementIndex( row, column ) ); elementPointer++; } }*/ } template< typename Real, typename Device, typename Index > Real& tnlTridiagonalMatrix< Real, Device, Index >::operator()( const IndexType row, const IndexType column ) { return this->values[ this->getElementIndex( row, column ) ]; } template< typename Real, typename Device, typename Index > const Real& tnlTridiagonalMatrix< Real, Device, Index >::operator()( const IndexType row, const IndexType column ) const template< typename Vector > #ifdef HAVE_CUDA __device__ __host__ #endif typename Vector::RealType tnlTridiagonalMatrix< Real, Device, Index >::rowVectorProduct( const IndexType row, const Vector& vector ) const { return this->values[ this->getElementIndex( row, column ) ]; return tnlTridiagonalMatrixDeviceDependentCode< Device >:: rowVectorProduct( this->rows, this->values, row, vector ); } #ifdef HAVE_CUDA template< typename Real, typename Device, typename Index > template< typename Vector > typename Vector::RealType tnlTridiagonalMatrix< Real, Device, Index >::rowVectorProduct( const IndexType row, const Vector& vector ) const typename Index, typename Vector > __global__ void tnlTridiagonalMatrixVectorProductCudaKernel( tnlTridiagonalMatrix< Real, tnlCuda, Index >* matrix, const Vector* inVector, Vector* outVector, const Index gridIdx ) { if( row == 0 ) return vector[ 0 ] * this->values[ 0 ] + vector[ 1 ] * this->values[ 1 ]; IndexType i = 3 * row - 1; if( row == this->rows - 1 ) return vector[ row - 1 ] * this->values[ i++ ] + vector[ row ] * this->values[ i ]; return vector[ row - 1 ] * this->values[ i++ ] + vector[ row ] * this->values[ i++ ] + vector[ row + 1 ] * this->values[ i ]; const Index rowIdx = ( gridIdx * tnlCuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; if( rowIdx < matrix->getRows() ) ( *outVector )[ rowIdx ] = matrix->rowVectorProduct( rowIdx, *inVector ); } #endif template< typename Real, typename Device, Loading @@ -426,8 +500,32 @@ void tnlTridiagonalMatrix< Real, Device, Index >::vectorProduct( const Vector& i << "Vector size: " << outVector.getSize() << endl << "Vector name: " << outVector.getName() << endl ); if( Device::getDevice() == tnlHostDevice ) for( IndexType row = 0; row < this->getRows(); row++ ) outVector[ row ] = this->rowVectorProduct( row, inVector ); if( Device::getDevice() == tnlCudaDevice ) { #ifdef HAVE_CUDA ThisType* kernel_this = tnlCuda::passToDevice( *this ); Vector* kernel_inVector = tnlCuda::passToDevice( inVector ); Vector* kernel_outVector = tnlCuda::passToDevice( outVector ); dim3 cudaBlockSize( 256 ), cudaGridSize( tnlCuda::getMaxGridSize() ); const IndexType cudaBlocks = roundUpDivision( this->getRows(), cudaBlockSize.x ); const IndexType cudaGrids = roundUpDivision( cudaBlocks, tnlCuda::getMaxGridSize() ); for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) { if( gridIdx == cudaGrids - 1 ) cudaGridSize.x = cudaBlocks % tnlCuda::getMaxGridSize(); tnlTridiagonalMatrixVectorProductCudaKernel<<< cudaGridSize, cudaBlockSize >>> ( kernel_this, kernel_inVector, kernel_outVector, gridIdx ); } tnlCuda::freeFromDevice( kernel_this ); tnlCuda::freeFromDevice( kernel_inVector ); tnlCuda::freeFromDevice( kernel_outVector ); checkCudaDevice; #endif } } template< typename Real, Loading @@ -443,31 +541,52 @@ void tnlTridiagonalMatrix< Real, Device, Index >::addMatrix( const tnlTridiagona << "This matrix rows: " << this->getRows() << endl << "This matrix name: " << this->getName() << endl ); const IndexType lastI = this->values.getSize(); if( thisMatrixMultiplicator == 1.0 ) { for( IndexType i = 0; i < lastI; i++ ) this->values[ i ] += matrixMultiplicator*matrix.values[ i ]; } this->values.alphaXPlusY( matrixMultiplicator, matrix.values ); else { for( IndexType i = 0; i < lastI; i++ ) this->values[ i ] = thisMatrixMultiplicator * this->values[ i ] + matrixMultiplicator*matrix.values[ i ]; this->values.alphaXPlusBetaY( matrixMultiplicator, matrix.values, thisMatrixMultiplicator ); } #ifdef HAVE_CUDA template< typename Real, typename Real2, typename Index, typename Index2 > __global__ void tnlTridiagonalMatrixTranspositionCudaKernel( const tnlTridiagonalMatrix< Real2, tnlCuda, Index2 >* inMatrix, tnlTridiagonalMatrix< Real, tnlCuda, Index >* outMatrix, const Real matrixMultiplicator, const Index gridIdx ) { const Index rowIdx = ( gridIdx * tnlCuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; if( rowIdx < inMatrix->getRows() ) { if( rowIdx > 0 ) outMatrix->setElementFast( rowIdx-1, rowIdx, matrixMultiplicator * inMatrix->getElementFast( rowIdx, rowIdx-1 ) ); outMatrix->setElementFast( rowIdx, rowIdx, matrixMultiplicator * inMatrix->getElementFast( rowIdx, rowIdx ) ); if( rowIdx < inMatrix->getRows()-1 ) outMatrix->setElementFast( rowIdx+1, rowIdx, matrixMultiplicator * inMatrix->getElementFast( rowIdx, rowIdx+1 ) ); } } #endif template< typename Real, typename Device, typename Index > template< typename Real2, typename Index2, int tileDim > template< typename Real2, typename Index2 > void tnlTridiagonalMatrix< Real, Device, Index >::getTransposition( const tnlTridiagonalMatrix< Real2, Device, Index2 >& matrix, const RealType& matrixMultiplicator ) { tnlAssert( this->getRows() == matrix.getRows(), cerr << "This matrix rows: " << this->getRows() << endl << "That matrix rows: " << matrix.getRows() << endl ); if( Device::getDevice() == tnlHostDevice ) { const IndexType& rows = matrix.getRows(); for( IndexType i = 1; i < rows; i++ ) { Loading @@ -477,6 +596,31 @@ void tnlTridiagonalMatrix< Real, Device, Index >::getTransposition( const tnlTri this->setElement( i - 1, i, aux ); } } if( Device::getDevice() == tnlCudaDevice ) { #ifdef HAVE_CUDA ThisType* kernel_this = tnlCuda::passToDevice( *this ); typedef tnlTridiagonalMatrix< Real2, Device, Index2 > InMatrixType; InMatrixType* kernel_inMatrix = tnlCuda::passToDevice( matrix ); dim3 cudaBlockSize( 256 ), cudaGridSize( tnlCuda::getMaxGridSize() ); const IndexType cudaBlocks = roundUpDivision( matrix.getRows(), cudaBlockSize.x ); const IndexType cudaGrids = roundUpDivision( cudaBlocks, tnlCuda::getMaxGridSize() ); for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) { if( gridIdx == cudaGrids - 1 ) cudaGridSize.x = cudaBlocks % tnlCuda::getMaxGridSize(); tnlTridiagonalMatrixTranspositionCudaKernel<<< cudaGridSize, cudaBlockSize >>> ( kernel_inMatrix, kernel_this, matrixMultiplicator, gridIdx ); } tnlCuda::freeFromDevice( kernel_this ); tnlCuda::freeFromDevice( kernel_inMatrix ); checkCudaDevice; #endif } } template< typename Real, typename Device, Loading Loading @@ -547,7 +691,7 @@ void tnlTridiagonalMatrix< Real, Device, Index >::print( ostream& str ) const str <<"Row: " << row << " -> "; for( IndexType column = row - 1; column < row + 2; column++ ) if( column >= 0 && column < this->columns ) str << " Col:" << column << "->" << this->operator()( row, column ) << "\t"; str << " Col:" << column << "->" << this->getElement( row, column ) << "\t"; str << endl; } } Loading @@ -555,15 +699,21 @@ void tnlTridiagonalMatrix< Real, Device, Index >::print( ostream& str ) const template< typename Real, typename Device, typename Index > #ifdef HAVE_CUDA __device__ __host__ #endif Index tnlTridiagonalMatrix< Real, Device, Index >::getElementIndex( const IndexType row, const IndexType column ) const { // TODO: remove the #ifndef when CUDA supports std:cerr #ifndef HAVE_CUDA tnlAssert( row >= 0 && column >= 0 && row < this->rows && column < this->rows, cerr << " this->rows = " << this->rows << " row = " << row << " column = " << column ); tnlAssert( abs( row - column ) < 2, cerr << "row = " << row << " column = " << column << endl ); return 3*row + column - row; #endif return tnlTridiagonalMatrixDeviceDependentCode< Device >::getElementIndex( this->rows, row, column ); } Loading src/matrices/tnlTridiagonalMatrix.h +28 −7 Original line number Diff line number Diff line Loading @@ -32,6 +32,7 @@ class tnlTridiagonalMatrix : public tnlMatrix< Real, Device, Index > typedef Device DeviceType; typedef Index IndexType; typedef typename tnlMatrix< Real, Device, Index >::RowLengthsVector RowLengthsVector; typedef tnlTridiagonalMatrix< Real, Device, Index > ThisType; tnlTridiagonalMatrix(); Loading Loading @@ -82,6 +83,11 @@ class tnlTridiagonalMatrix : public tnlMatrix< Real, Device, Index > const RealType& value, const RealType& thisElementMultiplicator = 1.0 ); bool addElement( const IndexType row, const IndexType column, const RealType& value, const RealType& thisElementMultiplicator = 1.0 ); #ifdef HAVE_CUDA __device__ __host__ #endif Loading @@ -90,6 +96,11 @@ class tnlTridiagonalMatrix : public tnlMatrix< Real, Device, Index > const RealType* values, const IndexType elements ); bool setRow( const IndexType row, const IndexType* columns, const RealType* values, const IndexType elements ); #ifdef HAVE_CUDA __device__ __host__ #endif Loading @@ -99,6 +110,12 @@ class tnlTridiagonalMatrix : public tnlMatrix< Real, Device, Index > const IndexType elements, const RealType& thisRowMultiplicator = 1.0 ); bool addRow( const IndexType row, const IndexType* columns, const RealType* values, const IndexType elements, const RealType& thisRowMultiplicator = 1.0 ); #ifdef HAVE_CUDA __device__ __host__ #endif Loading @@ -115,13 +132,14 @@ class tnlTridiagonalMatrix : public tnlMatrix< Real, Device, Index > IndexType* columns, RealType* values ) const; RealType& operator()( const IndexType row, const IndexType column ); const RealType& operator()( const IndexType row, const IndexType column ) const; void getRow( const IndexType row, IndexType* columns, RealType* values ) const; template< typename Vector > #ifdef HAVE_CUDA __device__ __host__ #endif typename Vector::RealType rowVectorProduct( const IndexType row, const Vector& vector ) const; Loading @@ -135,11 +153,11 @@ class tnlTridiagonalMatrix : public tnlMatrix< Real, Device, Index > const RealType& thisMatrixMultiplicator = 1.0 ); #ifdef HAVE_NOT_CXX11 template< typename Real2, typename Index2, int tileDim > template< typename Real2, typename Index2 > void getTransposition( const tnlTridiagonalMatrix< Real2, Device, Index2 >& matrix, const RealType& matrixMultiplicator = 1.0 ); #else template< typename Real2, typename Index2, int tileDim = 32 > template< typename Real2, typename Index2 > void getTransposition( const tnlTridiagonalMatrix< Real2, Device, Index2 >& matrix, const RealType& matrixMultiplicator = 1.0 ); #endif Loading @@ -162,6 +180,9 @@ class tnlTridiagonalMatrix : public tnlMatrix< Real, Device, Index > protected: #ifdef HAVE_CUDA __device__ __host__ #endif IndexType getElementIndex( const IndexType row, const IndexType column ) const; Loading Loading
.gitignore +2 −0 Original line number Diff line number Diff line Loading @@ -14,6 +14,8 @@ /Debug /Release .settings # /src/ /src/Makefile.in Loading
CMakeLists.txt +1 −0 Original line number Diff line number Diff line Loading @@ -34,6 +34,7 @@ endif() if( WITH_CUDA STREQUAL "yes" ) AddCompilerFlag( "-DHAVE_NOT_CXX11 -U_GLIBCXX_ATOMIC_BUILTINS -U_GLIBCXX_USE_INT128" ) set( CUDA_LINKER_OPTIONS "-arch sm_20 -shared" ) # TODO: fix this in the rest of cmake files else() AddCompilerFlag( "-std=gnu++0x" ) endif() Loading
install +1 −1 Original line number Diff line number Diff line Loading @@ -2,7 +2,7 @@ TARGET=TNL INSTALL_PREFIX=${HOME}/local WITH_CUDA=no WITH_CUDA=yes WITH_CUSPARSE=no CUDA_ARCHITECTURE=2.0 TEMPLATE_EXPLICIT_INSTANTIATION=yes Loading
src/implementation/matrices/tnlTridiagonalMatrix_impl.h +216 −66 Original line number Diff line number Diff line Loading @@ -21,6 +21,85 @@ #include <core/tnlAssert.h> #include <matrices/tnlTridiagonalMatrix.h> template< typename Device > class tnlTridiagonalMatrixDeviceDependentCode; template<> class tnlTridiagonalMatrixDeviceDependentCode< tnlHost > { public: template< typename Index > static Index getElementIndex( const Index rows, const Index row, const Index column ) { return 3*row + column - row; } template< typename Vector, typename Index, typename ValuesType > static typename Vector::RealType rowVectorProduct( const Index rows, const ValuesType& values, const Index row, const Vector& vector ) { if( row == 0 ) return vector[ 0 ] * values[ 0 ] + vector[ 1 ] * values[ 1 ]; Index i = 3 * row - 1; if( row == rows - 1 ) return vector[ row - 1 ] * values[ i++ ] + vector[ row ] * values[ i ]; return vector[ row - 1 ] * values[ i++ ] + vector[ row ] * values[ i++ ] + vector[ row + 1 ] * values[ i ]; } }; template<> class tnlTridiagonalMatrixDeviceDependentCode< tnlCuda > { public: template< typename Index > #ifdef HAVE_CUDA __device__ __host__ #endif static Index getElementIndex( const Index rows, const Index row, const Index column ) { return ( column - row + 1 )*rows + row - 1; } template< typename Vector, typename Index, typename ValuesType > #ifdef HAVE_CUDA __device__ #endif static typename Vector::RealType rowVectorProduct( const Index rows, const ValuesType& values, const Index row, const Vector& vector ) { if( row == 0 ) return vector[ 0 ] * values[ 0 ] + vector[ 1 ] * values[ rows - 1 ]; Index i = row - 1; if( row == rows - 1 ) return vector[ row - 1 ] * values[ i ] + vector[ row ] * values[ i + rows ]; return vector[ row - 1 ] * values[ i ] + vector[ row ] * values[ i + rows ] + vector[ row + 1 ] * values[ i + 2*rows ]; } }; template< typename Real, typename Device, typename Index > Loading Loading @@ -58,6 +137,7 @@ bool tnlTridiagonalMatrix< Real, Device, Index >::setDimensions( const IndexType if( ! values.setSize( 3*Min( rows, columns ) ) ) return false; this->values.setValue( 0.0 ); return true; } template< typename Real, Loading Loading @@ -174,7 +254,7 @@ bool tnlTridiagonalMatrix< Real, Device, Index >::setElementFast( const IndexTyp const IndexType column, const RealType& value ) { this->values.setElement( this->getElementIndex( row, column ), value ); this->values[ this->getElementIndex( row, column ) ] = value; return true; } Loading @@ -189,7 +269,6 @@ bool tnlTridiagonalMatrix< Real, Device, Index >::setElement( const IndexType ro return true; } template< typename Real, typename Device, typename Index > Loading @@ -201,7 +280,8 @@ bool tnlTridiagonalMatrix< Real, Device, Index >::addElementFast( const IndexTyp const RealType& value, const RealType& thisElementMultiplicator ) { this->values.addElement( this->getElementIndex( row, column ), value, thisElementMultiplicator ); const Index i = this->getElementIndex( row, column ); this->values[ i ] = thisElementMultiplicator*this->values[ i ] + value; } template< typename Real, Loading @@ -212,10 +292,10 @@ bool tnlTridiagonalMatrix< Real, Device, Index >::addElement( const IndexType ro const RealType& value, const RealType& thisElementMultiplicator ) { //this->values.addElement( this->getElementIndex( row, column ), value, thisElementMultiplicator ); const Index i = this->getElementIndex( row, column ); this->values.setElement( i, thisElementMultiplicator * this->values.getElement( i ) + value ); } template< typename Real, typename Device, typename Index > Loading @@ -231,7 +311,7 @@ bool tnlTridiagonalMatrix< Real, Device, Index >::setRowFast( const IndexType ro cerr << " elements = " << elements << " this->columns = " << this->columns << " this->getName() = " << this->getName() ); return this->addRow( row, columns, values, elements, 0.0 ); return this->addRowFast( row, columns, values, elements, 0.0 ); } template< typename Real, Loading @@ -246,10 +326,9 @@ bool tnlTridiagonalMatrix< Real, Device, Index >::setRow( const IndexType row, cerr << " elements = " << elements << " this->columns = " << this->columns << " this->getName() = " << this->getName() ); //return this->addRow( row, columns, values, elements, 0.0 ); return this->addRow( row, columns, values, elements, 0.0 ); } template< typename Real, typename Device, typename Index > Loading @@ -273,7 +352,7 @@ bool tnlTridiagonalMatrix< Real, Device, Index >::addRowFast( const IndexType ro const IndexType& column = columns[ i ]; if( column < row - 1 || column > row + 1 ) return false; addElement( row, column, values[ i ], thisRowMultiplicator ); addElementFast( row, column, values[ i ], thisRowMultiplicator ); } return true; } Loading @@ -291,16 +370,16 @@ bool tnlTridiagonalMatrix< Real, Device, Index >::addRow( const IndexType row, cerr << " elements = " << elements << " this->columns = " << this->columns << " this->getName() = " << this->getName() ); /*if( elements > 3 ) if( elements > 3 ) return false; for( IndexType i = 0; i < elements; i++ ) { const IndexType& column = columns[ i ]; const IndexType column = columns[ i ]; if( column < row - 1 || column > row + 1 ) return false; addElement( row, column, values[ i ], thisRowMultiplicator ); } return true;*/ return true; } template< typename Real, Loading @@ -314,7 +393,7 @@ Real tnlTridiagonalMatrix< Real, Device, Index >::getElementFast( const IndexTyp { if( abs( column - row ) > 1 ) return 0.0; return this->values.getElement( this->getElementIndex( row, column ) ); return this->values[ this->getElementIndex( row, column ) ]; } template< typename Real, Loading @@ -323,10 +402,11 @@ template< typename Real, Real tnlTridiagonalMatrix< Real, Device, Index >::getElement( const IndexType row, const IndexType column ) const { return this->getElementFast( row, column ); if( abs( column - row ) > 1 ) return 0.0; return this->values.getElement( this->getElementIndex( row, column ) ); } template< typename Real, typename Device, typename Index > Loading Loading @@ -357,56 +437,50 @@ void tnlTridiagonalMatrix< Real, Device, Index >::getRow( const IndexType row, IndexType* columns, RealType* values ) const { /*IndexType elementPointer( 0 ); IndexType elementPointer( 0 ); for( IndexType i = -1; i <= 1; i++ ) { const IndexType column = row + 1; if( column >= 0 && column < this->getColumns() ) { columns[ elementPointer ] = column; values[ elementPointer ] = this->values[ this->getElementIndex( row, column ) ]; values[ elementPointer ] = this->values.getElement( this->getElementIndex( row, column ) ); elementPointer++; } }*/ } template< typename Real, typename Device, typename Index > Real& tnlTridiagonalMatrix< Real, Device, Index >::operator()( const IndexType row, const IndexType column ) { return this->values[ this->getElementIndex( row, column ) ]; } template< typename Real, typename Device, typename Index > const Real& tnlTridiagonalMatrix< Real, Device, Index >::operator()( const IndexType row, const IndexType column ) const template< typename Vector > #ifdef HAVE_CUDA __device__ __host__ #endif typename Vector::RealType tnlTridiagonalMatrix< Real, Device, Index >::rowVectorProduct( const IndexType row, const Vector& vector ) const { return this->values[ this->getElementIndex( row, column ) ]; return tnlTridiagonalMatrixDeviceDependentCode< Device >:: rowVectorProduct( this->rows, this->values, row, vector ); } #ifdef HAVE_CUDA template< typename Real, typename Device, typename Index > template< typename Vector > typename Vector::RealType tnlTridiagonalMatrix< Real, Device, Index >::rowVectorProduct( const IndexType row, const Vector& vector ) const typename Index, typename Vector > __global__ void tnlTridiagonalMatrixVectorProductCudaKernel( tnlTridiagonalMatrix< Real, tnlCuda, Index >* matrix, const Vector* inVector, Vector* outVector, const Index gridIdx ) { if( row == 0 ) return vector[ 0 ] * this->values[ 0 ] + vector[ 1 ] * this->values[ 1 ]; IndexType i = 3 * row - 1; if( row == this->rows - 1 ) return vector[ row - 1 ] * this->values[ i++ ] + vector[ row ] * this->values[ i ]; return vector[ row - 1 ] * this->values[ i++ ] + vector[ row ] * this->values[ i++ ] + vector[ row + 1 ] * this->values[ i ]; const Index rowIdx = ( gridIdx * tnlCuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; if( rowIdx < matrix->getRows() ) ( *outVector )[ rowIdx ] = matrix->rowVectorProduct( rowIdx, *inVector ); } #endif template< typename Real, typename Device, Loading @@ -426,8 +500,32 @@ void tnlTridiagonalMatrix< Real, Device, Index >::vectorProduct( const Vector& i << "Vector size: " << outVector.getSize() << endl << "Vector name: " << outVector.getName() << endl ); if( Device::getDevice() == tnlHostDevice ) for( IndexType row = 0; row < this->getRows(); row++ ) outVector[ row ] = this->rowVectorProduct( row, inVector ); if( Device::getDevice() == tnlCudaDevice ) { #ifdef HAVE_CUDA ThisType* kernel_this = tnlCuda::passToDevice( *this ); Vector* kernel_inVector = tnlCuda::passToDevice( inVector ); Vector* kernel_outVector = tnlCuda::passToDevice( outVector ); dim3 cudaBlockSize( 256 ), cudaGridSize( tnlCuda::getMaxGridSize() ); const IndexType cudaBlocks = roundUpDivision( this->getRows(), cudaBlockSize.x ); const IndexType cudaGrids = roundUpDivision( cudaBlocks, tnlCuda::getMaxGridSize() ); for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) { if( gridIdx == cudaGrids - 1 ) cudaGridSize.x = cudaBlocks % tnlCuda::getMaxGridSize(); tnlTridiagonalMatrixVectorProductCudaKernel<<< cudaGridSize, cudaBlockSize >>> ( kernel_this, kernel_inVector, kernel_outVector, gridIdx ); } tnlCuda::freeFromDevice( kernel_this ); tnlCuda::freeFromDevice( kernel_inVector ); tnlCuda::freeFromDevice( kernel_outVector ); checkCudaDevice; #endif } } template< typename Real, Loading @@ -443,31 +541,52 @@ void tnlTridiagonalMatrix< Real, Device, Index >::addMatrix( const tnlTridiagona << "This matrix rows: " << this->getRows() << endl << "This matrix name: " << this->getName() << endl ); const IndexType lastI = this->values.getSize(); if( thisMatrixMultiplicator == 1.0 ) { for( IndexType i = 0; i < lastI; i++ ) this->values[ i ] += matrixMultiplicator*matrix.values[ i ]; } this->values.alphaXPlusY( matrixMultiplicator, matrix.values ); else { for( IndexType i = 0; i < lastI; i++ ) this->values[ i ] = thisMatrixMultiplicator * this->values[ i ] + matrixMultiplicator*matrix.values[ i ]; this->values.alphaXPlusBetaY( matrixMultiplicator, matrix.values, thisMatrixMultiplicator ); } #ifdef HAVE_CUDA template< typename Real, typename Real2, typename Index, typename Index2 > __global__ void tnlTridiagonalMatrixTranspositionCudaKernel( const tnlTridiagonalMatrix< Real2, tnlCuda, Index2 >* inMatrix, tnlTridiagonalMatrix< Real, tnlCuda, Index >* outMatrix, const Real matrixMultiplicator, const Index gridIdx ) { const Index rowIdx = ( gridIdx * tnlCuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; if( rowIdx < inMatrix->getRows() ) { if( rowIdx > 0 ) outMatrix->setElementFast( rowIdx-1, rowIdx, matrixMultiplicator * inMatrix->getElementFast( rowIdx, rowIdx-1 ) ); outMatrix->setElementFast( rowIdx, rowIdx, matrixMultiplicator * inMatrix->getElementFast( rowIdx, rowIdx ) ); if( rowIdx < inMatrix->getRows()-1 ) outMatrix->setElementFast( rowIdx+1, rowIdx, matrixMultiplicator * inMatrix->getElementFast( rowIdx, rowIdx+1 ) ); } } #endif template< typename Real, typename Device, typename Index > template< typename Real2, typename Index2, int tileDim > template< typename Real2, typename Index2 > void tnlTridiagonalMatrix< Real, Device, Index >::getTransposition( const tnlTridiagonalMatrix< Real2, Device, Index2 >& matrix, const RealType& matrixMultiplicator ) { tnlAssert( this->getRows() == matrix.getRows(), cerr << "This matrix rows: " << this->getRows() << endl << "That matrix rows: " << matrix.getRows() << endl ); if( Device::getDevice() == tnlHostDevice ) { const IndexType& rows = matrix.getRows(); for( IndexType i = 1; i < rows; i++ ) { Loading @@ -477,6 +596,31 @@ void tnlTridiagonalMatrix< Real, Device, Index >::getTransposition( const tnlTri this->setElement( i - 1, i, aux ); } } if( Device::getDevice() == tnlCudaDevice ) { #ifdef HAVE_CUDA ThisType* kernel_this = tnlCuda::passToDevice( *this ); typedef tnlTridiagonalMatrix< Real2, Device, Index2 > InMatrixType; InMatrixType* kernel_inMatrix = tnlCuda::passToDevice( matrix ); dim3 cudaBlockSize( 256 ), cudaGridSize( tnlCuda::getMaxGridSize() ); const IndexType cudaBlocks = roundUpDivision( matrix.getRows(), cudaBlockSize.x ); const IndexType cudaGrids = roundUpDivision( cudaBlocks, tnlCuda::getMaxGridSize() ); for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) { if( gridIdx == cudaGrids - 1 ) cudaGridSize.x = cudaBlocks % tnlCuda::getMaxGridSize(); tnlTridiagonalMatrixTranspositionCudaKernel<<< cudaGridSize, cudaBlockSize >>> ( kernel_inMatrix, kernel_this, matrixMultiplicator, gridIdx ); } tnlCuda::freeFromDevice( kernel_this ); tnlCuda::freeFromDevice( kernel_inMatrix ); checkCudaDevice; #endif } } template< typename Real, typename Device, Loading Loading @@ -547,7 +691,7 @@ void tnlTridiagonalMatrix< Real, Device, Index >::print( ostream& str ) const str <<"Row: " << row << " -> "; for( IndexType column = row - 1; column < row + 2; column++ ) if( column >= 0 && column < this->columns ) str << " Col:" << column << "->" << this->operator()( row, column ) << "\t"; str << " Col:" << column << "->" << this->getElement( row, column ) << "\t"; str << endl; } } Loading @@ -555,15 +699,21 @@ void tnlTridiagonalMatrix< Real, Device, Index >::print( ostream& str ) const template< typename Real, typename Device, typename Index > #ifdef HAVE_CUDA __device__ __host__ #endif Index tnlTridiagonalMatrix< Real, Device, Index >::getElementIndex( const IndexType row, const IndexType column ) const { // TODO: remove the #ifndef when CUDA supports std:cerr #ifndef HAVE_CUDA tnlAssert( row >= 0 && column >= 0 && row < this->rows && column < this->rows, cerr << " this->rows = " << this->rows << " row = " << row << " column = " << column ); tnlAssert( abs( row - column ) < 2, cerr << "row = " << row << " column = " << column << endl ); return 3*row + column - row; #endif return tnlTridiagonalMatrixDeviceDependentCode< Device >::getElementIndex( this->rows, row, column ); } Loading
src/matrices/tnlTridiagonalMatrix.h +28 −7 Original line number Diff line number Diff line Loading @@ -32,6 +32,7 @@ class tnlTridiagonalMatrix : public tnlMatrix< Real, Device, Index > typedef Device DeviceType; typedef Index IndexType; typedef typename tnlMatrix< Real, Device, Index >::RowLengthsVector RowLengthsVector; typedef tnlTridiagonalMatrix< Real, Device, Index > ThisType; tnlTridiagonalMatrix(); Loading Loading @@ -82,6 +83,11 @@ class tnlTridiagonalMatrix : public tnlMatrix< Real, Device, Index > const RealType& value, const RealType& thisElementMultiplicator = 1.0 ); bool addElement( const IndexType row, const IndexType column, const RealType& value, const RealType& thisElementMultiplicator = 1.0 ); #ifdef HAVE_CUDA __device__ __host__ #endif Loading @@ -90,6 +96,11 @@ class tnlTridiagonalMatrix : public tnlMatrix< Real, Device, Index > const RealType* values, const IndexType elements ); bool setRow( const IndexType row, const IndexType* columns, const RealType* values, const IndexType elements ); #ifdef HAVE_CUDA __device__ __host__ #endif Loading @@ -99,6 +110,12 @@ class tnlTridiagonalMatrix : public tnlMatrix< Real, Device, Index > const IndexType elements, const RealType& thisRowMultiplicator = 1.0 ); bool addRow( const IndexType row, const IndexType* columns, const RealType* values, const IndexType elements, const RealType& thisRowMultiplicator = 1.0 ); #ifdef HAVE_CUDA __device__ __host__ #endif Loading @@ -115,13 +132,14 @@ class tnlTridiagonalMatrix : public tnlMatrix< Real, Device, Index > IndexType* columns, RealType* values ) const; RealType& operator()( const IndexType row, const IndexType column ); const RealType& operator()( const IndexType row, const IndexType column ) const; void getRow( const IndexType row, IndexType* columns, RealType* values ) const; template< typename Vector > #ifdef HAVE_CUDA __device__ __host__ #endif typename Vector::RealType rowVectorProduct( const IndexType row, const Vector& vector ) const; Loading @@ -135,11 +153,11 @@ class tnlTridiagonalMatrix : public tnlMatrix< Real, Device, Index > const RealType& thisMatrixMultiplicator = 1.0 ); #ifdef HAVE_NOT_CXX11 template< typename Real2, typename Index2, int tileDim > template< typename Real2, typename Index2 > void getTransposition( const tnlTridiagonalMatrix< Real2, Device, Index2 >& matrix, const RealType& matrixMultiplicator = 1.0 ); #else template< typename Real2, typename Index2, int tileDim = 32 > template< typename Real2, typename Index2 > void getTransposition( const tnlTridiagonalMatrix< Real2, Device, Index2 >& matrix, const RealType& matrixMultiplicator = 1.0 ); #endif Loading @@ -162,6 +180,9 @@ class tnlTridiagonalMatrix : public tnlMatrix< Real, Device, Index > protected: #ifdef HAVE_CUDA __device__ __host__ #endif IndexType getElementIndex( const IndexType row, const IndexType column ) const; Loading