Loading src/core/tnlCuda.h +9 −11 Original line number Diff line number Diff line Loading @@ -33,15 +33,17 @@ class tnlCuda #ifdef HAVE_CUDA __host__ __device__ #endif static inline tnlDeviceEnum getDevice() { return tnlCudaDevice; }; static inline tnlDeviceEnum getDevice(); static int getMaxGridSize(); static void setMaxGridSize( int newMaxGridSize ); static int getMaxBlockSize(); #ifdef HAVE_CUDA __host__ __device__ #endif static inline int getMaxGridSize(); static void setMaxBlockSize( int newMaxBlockSize ); #ifdef HAVE_CUDA __host__ __device__ #endif static inline int getMaxBlockSize(); static int getGPUTransferBufferSize(); Loading @@ -64,10 +66,6 @@ class tnlCuda static bool checkDevice( const char* file_name, int line ); protected: static int maxGridSize, maxBlockSize; }; #define checkCudaDevice tnlCuda::checkDevice( __FILE__, __LINE__ ) Loading src/implementation/core/arrays/tnlSharedArray_impl.h +1 −0 Original line number Diff line number Diff line Loading @@ -376,6 +376,7 @@ ostream& operator << ( ostream& str, const tnlSharedArray< Element, Device, Inde str << ", " << v. getElement( i ); } str << " ]"; return str; } //}; // namespace implementation Loading src/implementation/core/tnlCuda.cpp +0 −24 Original line number Diff line number Diff line Loading @@ -18,35 +18,11 @@ #include <core/tnlCuda.h> #include <tnlConfig.h> int tnlCuda :: maxGridSize = maxCudaGridSize; int tnlCuda :: maxBlockSize = maxCudaBlockSize; tnlString tnlCuda :: getDeviceType() { return tnlString( "tnlCuda" ); } int tnlCuda :: getMaxGridSize() { return maxGridSize; } void tnlCuda :: setMaxGridSize( int newMaxGridSize ) { maxGridSize = newMaxGridSize; } int tnlCuda :: getMaxBlockSize() { return maxBlockSize; } void tnlCuda :: setMaxBlockSize( int newMaxBlockSize ) { maxBlockSize = newMaxBlockSize; } int tnlCuda::getGPUTransferBufferSize() { return 1 << 20; Loading src/implementation/core/tnlCuda_impl.h +27 −0 Original line number Diff line number Diff line Loading @@ -20,6 +20,33 @@ #ifdef HAVE_CUDA #ifdef HAVE_CUDA __host__ __device__ #endif inline tnlDeviceEnum tnlCuda::getDevice() { return tnlCudaDevice; }; #ifdef HAVE_CUDA __host__ __device__ #endif inline int tnlCuda::getMaxGridSize() { // TODO: make it preprocessor macro constant defined in tnlConfig return 65536; }; #ifdef HAVE_CUDA __host__ __device__ #endif inline int tnlCuda::getMaxBlockSize() { // TODO: make it preprocessor macro constant defined in tnlConfig return 1024; }; template< typename ObjectType > ObjectType* tnlCuda::passToDevice( const ObjectType& object ) { Loading src/implementation/matrices/tnlDenseMatrix_impl.h +138 −32 Original line number Diff line number Diff line Loading @@ -117,6 +117,15 @@ void tnlDenseMatrix< Real, Device, Index >::reset() this->values.reset(); } template< typename Real, typename Device, typename Index > void tnlDenseMatrix< Real, Device, Index >::setValue( const Real& value ) { this->values.setValue( value ); } template< typename Real, typename Device, typename Index > Loading Loading @@ -160,6 +169,7 @@ bool tnlDenseMatrix< Real, Device, Index >::addElementFast( const IndexType row, else this->values.operator[]( elementIndex ) = thisElementMultiplicator * this->values.operator[]( elementIndex ) + value; return true; } template< typename Real, Loading @@ -177,6 +187,7 @@ bool tnlDenseMatrix< Real, Device, Index >::addElement( const IndexType row, else this->values.setElement( elementIndex, thisElementMultiplicator * this->values.getElement( elementIndex ) + value ); return true; } Loading Loading @@ -316,15 +327,33 @@ template< typename Real, typename Device, typename Index > template< typename Vector > #ifdef HAVE_CUDA __device__ __host__ #endif typename Vector::RealType tnlDenseMatrix< Real, Device, Index >::rowVectorProduct( const IndexType row, const Vector& vector ) const { RealType sum( 0.0 ); for( IndexType column = 0; column < this->getColumns(); column++ ) sum += this->getElement( row, column ) * vector.getElement( column ); sum += this->getElementFast( row, column ) * vector[ column ]; return sum; } #ifdef HAVE_CUDA template< typename Real, typename Index, typename Vector > __global__ void tnlDenseMatrixVectorProductCudaKernel( tnlDenseMatrix< Real, tnlCuda, Index >* matrix, const Vector* inVector, Vector* outVector, const Index gridIdx ) { const Index rowIdx = ( gridIdx * tnlCuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; if( rowIdx < matrix->getRows() ) ( *outVector )[ rowIdx ] = matrix->rowVectorProduct( rowIdx, *inVector ); } #endif template< typename Real, typename Device, typename Index > Loading @@ -343,8 +372,31 @@ void tnlDenseMatrix< Real, Device, Index >::vectorProduct( const Vector& inVecto << "Vector size: " << outVector.getSize() << endl << "Vector name: " << outVector.getName() << endl ); if( Device::getDevice() == tnlHostDevice ) for( IndexType row = 0; row < this->getRows(); row++ ) outVector[ row ] = rowVectorProduct( row, inVector ); if( Device::getDevice() == tnlCudaDevice ) { #ifdef HAVE_CUDA ThisType* kernel_this = tnlCuda::passToDevice( *this ); Vector* kernel_inVector = tnlCuda::passToDevice( inVector ); Vector* kernel_outVector = tnlCuda::passToDevice( outVector ); dim3 cudaBlockSize( 256 ), cudaGridSize( tnlCuda::getMaxGridSize() ); const IndexType cudaBlocks = roundUpDivision( this->getRows(), cudaBlockSize.x ); const IndexType cudaGrids = roundUpDivision( cudaBlocks, tnlCuda::getMaxGridSize() ); for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) { if( gridIdx == cudaGrids - 1 ) cudaGridSize.x = cudaBlocks % tnlCuda::getMaxGridSize(); tnlDenseMatrixVectorProductCudaKernel<<< cudaGridSize, cudaBlockSize >>> ( kernel_this, kernel_inVector, kernel_outVector, gridIdx ); } tnlCuda::freeFromDevice( kernel_this ); tnlCuda::freeFromDevice( kernel_inVector ); tnlCuda::freeFromDevice( kernel_outVector ); checkCudaDevice; #endif } } template< typename Real, Loading @@ -365,20 +417,29 @@ void tnlDenseMatrix< Real, Device, Index >::addMatrix( const Matrix& matrix, << "That matrix name: " << matrix.getName() << endl ); if( thisMatrixMultiplicator == 1.0 ) { for( IndexType row = 0; row < this->getRows(); row++ ) for( IndexType column = 0; column < this->getColumns(); column++ ) this->operator()( row, column ) += matrixMultiplicator*matrix( row, column ); } this->values.alphaXPlusY( matrixMultiplicator, matrix.values ); else { for( IndexType row = 0; row < this->getRows(); row++ ) for( IndexType column = 0; column < this->getColumns(); column++ ) this->operator()( row, column ) = thisMatrixMultiplicator * this->operator()( row, column) + matrixMultiplicator * matrix( row, column ); this->values.alphaXPlusBetaY( matrixMultiplicator, matrix.values, thisMatrixMultiplicator ); } #ifdef HAVE_CUDA template< typename Real, typename Index, typename Matrix1, typename Matrix2, int tileDim, int tileRowBlockSize > __global__ void tnlDenseMatrixMatrixProductKernel( tnlDenseMatrix< Real, tnlCuda, Index >* reusltMatrix, const Matrix1* matrix1, const Matrix2* matrix2, const Real matrix1Multiplicator, const Real matrix2Multiplicator, const Index gridIdx_x, const Index gridIdx_y ) { } #endif template< typename Real, typename Device, Loading @@ -402,6 +463,7 @@ void tnlDenseMatrix< Real, Device, Index >::getMatrixProduct( const Matrix1& mat << "Matrix2 rows: " << matrix2.getRows() << endl << "Matrix2 name: " << matrix2.getName() << endl ); if( Device::getDevice() == tnlHostDevice ) for( IndexType i = 0; i < this->getRows(); i += tileDim ) for( IndexType j = 0; j < this->getColumns(); j += tileDim ) { Loading @@ -409,7 +471,7 @@ void tnlDenseMatrix< Real, Device, Index >::getMatrixProduct( const Matrix1& mat const IndexType tileColumns = Min( tileDim, this->getColumns() - j ); for( IndexType i1 = i; i1 < i + tileRows; i1++ ) for( IndexType j1 = j; j1 < j + tileColumns; j1++ ) this->operator()( i1, j1 ) = 0.0; this->setElementFast( i1, j1, 0.0 ); for( IndexType k = 0; k < matrix1.getColumns(); k += tileDim ) { Loading @@ -417,9 +479,53 @@ void tnlDenseMatrix< Real, Device, Index >::getMatrixProduct( const Matrix1& mat for( IndexType i1 = 0; i1 < tileRows; i1++ ) for( IndexType j1 = 0; j1 < tileColumns; j1++ ) for( IndexType k1 = k; k1 < lastK; k1++ ) this->operator()( i + i1, j + j1 ) += matrix1( i + i1, k1 ) * matrix2( k1, j + j1 ); this->addElementFast( i + i1, j + j1, matrix1.getElementFast( i + i1, k1 ) * matrix2.getElementFast( k1, j + j1 ) ); } } if( Device::getDevice() == tnlCudaDevice ) { #ifdef HAVE_CUDA dim3 cudaBlockSize( 0 ), cudaGridSize( 0 ); const IndexType rowTiles = roundUpDivision( this->getRows(), tileDim ); const IndexType columnTiles = roundUpDivision( this->getColumns(), tileDim ); cudaBlockSize.x = blockXSize; cudaBlockSize.y = blockYSize; const IndexType rowGrids = roundUpDivision( rowTiles, tnlCuda::getMaxGridSize() ); const IndexType columnGrids = roundUpDivision( columnTiles, tnlCuda::getMaxGridSize() ); for( IndexType gridIdx_x = 0; gridIdx_x < columnGrids; gridIdx_x++ ) for( IndexType gridIdx_y = 0; gridIdx_y < rowGrids; gridIdx_y++ ) { cudaGridSize.x = cudaGridSize.y = tnlCuda::getMaxGridSize(); if( gridIdx_x == columnGrids - 1 ) cudaGridSize.x = columnTiles % tnlCuda::getMaxGridSize(); if( gridIdx_y == rowGrids - 1 ) cudaGridSize.y = rowTiles % tnlCuda::getMaxGridSize(); ThisType* this_kernel = tnlCuda::passToDevice( *this ); Matrix1* matrix1_kernel = tnlCuda::passToDevice( matrix1 ); Matrix2* matrix2_kernel = tnlCuda::passToDevice( matrix2 ); tnlDenseMatrixMatrixProductKernel< Real, Index, Matrix1, Matrix2, tileDim, tileRowBlockSize > <<< cudaGridSize, cudaBlockSize, 3*tileDim*tileDim >>> ( this_kernel, matrix1_kernel, matrix2_kernel, matrix1Multiplicator, matrix2Multiplicator, gridIdx_x, gridIdx_y ); tnlCuda::freeFromDevice( this_kernel ); tnlCuda::freeFromDevice( matrix1_kernel ); tnlCuda::freeFromDevice( matrix2_kernel ); } #endif } } Loading Loading
src/core/tnlCuda.h +9 −11 Original line number Diff line number Diff line Loading @@ -33,15 +33,17 @@ class tnlCuda #ifdef HAVE_CUDA __host__ __device__ #endif static inline tnlDeviceEnum getDevice() { return tnlCudaDevice; }; static inline tnlDeviceEnum getDevice(); static int getMaxGridSize(); static void setMaxGridSize( int newMaxGridSize ); static int getMaxBlockSize(); #ifdef HAVE_CUDA __host__ __device__ #endif static inline int getMaxGridSize(); static void setMaxBlockSize( int newMaxBlockSize ); #ifdef HAVE_CUDA __host__ __device__ #endif static inline int getMaxBlockSize(); static int getGPUTransferBufferSize(); Loading @@ -64,10 +66,6 @@ class tnlCuda static bool checkDevice( const char* file_name, int line ); protected: static int maxGridSize, maxBlockSize; }; #define checkCudaDevice tnlCuda::checkDevice( __FILE__, __LINE__ ) Loading
src/implementation/core/arrays/tnlSharedArray_impl.h +1 −0 Original line number Diff line number Diff line Loading @@ -376,6 +376,7 @@ ostream& operator << ( ostream& str, const tnlSharedArray< Element, Device, Inde str << ", " << v. getElement( i ); } str << " ]"; return str; } //}; // namespace implementation Loading
src/implementation/core/tnlCuda.cpp +0 −24 Original line number Diff line number Diff line Loading @@ -18,35 +18,11 @@ #include <core/tnlCuda.h> #include <tnlConfig.h> int tnlCuda :: maxGridSize = maxCudaGridSize; int tnlCuda :: maxBlockSize = maxCudaBlockSize; tnlString tnlCuda :: getDeviceType() { return tnlString( "tnlCuda" ); } int tnlCuda :: getMaxGridSize() { return maxGridSize; } void tnlCuda :: setMaxGridSize( int newMaxGridSize ) { maxGridSize = newMaxGridSize; } int tnlCuda :: getMaxBlockSize() { return maxBlockSize; } void tnlCuda :: setMaxBlockSize( int newMaxBlockSize ) { maxBlockSize = newMaxBlockSize; } int tnlCuda::getGPUTransferBufferSize() { return 1 << 20; Loading
src/implementation/core/tnlCuda_impl.h +27 −0 Original line number Diff line number Diff line Loading @@ -20,6 +20,33 @@ #ifdef HAVE_CUDA #ifdef HAVE_CUDA __host__ __device__ #endif inline tnlDeviceEnum tnlCuda::getDevice() { return tnlCudaDevice; }; #ifdef HAVE_CUDA __host__ __device__ #endif inline int tnlCuda::getMaxGridSize() { // TODO: make it preprocessor macro constant defined in tnlConfig return 65536; }; #ifdef HAVE_CUDA __host__ __device__ #endif inline int tnlCuda::getMaxBlockSize() { // TODO: make it preprocessor macro constant defined in tnlConfig return 1024; }; template< typename ObjectType > ObjectType* tnlCuda::passToDevice( const ObjectType& object ) { Loading
src/implementation/matrices/tnlDenseMatrix_impl.h +138 −32 Original line number Diff line number Diff line Loading @@ -117,6 +117,15 @@ void tnlDenseMatrix< Real, Device, Index >::reset() this->values.reset(); } template< typename Real, typename Device, typename Index > void tnlDenseMatrix< Real, Device, Index >::setValue( const Real& value ) { this->values.setValue( value ); } template< typename Real, typename Device, typename Index > Loading Loading @@ -160,6 +169,7 @@ bool tnlDenseMatrix< Real, Device, Index >::addElementFast( const IndexType row, else this->values.operator[]( elementIndex ) = thisElementMultiplicator * this->values.operator[]( elementIndex ) + value; return true; } template< typename Real, Loading @@ -177,6 +187,7 @@ bool tnlDenseMatrix< Real, Device, Index >::addElement( const IndexType row, else this->values.setElement( elementIndex, thisElementMultiplicator * this->values.getElement( elementIndex ) + value ); return true; } Loading Loading @@ -316,15 +327,33 @@ template< typename Real, typename Device, typename Index > template< typename Vector > #ifdef HAVE_CUDA __device__ __host__ #endif typename Vector::RealType tnlDenseMatrix< Real, Device, Index >::rowVectorProduct( const IndexType row, const Vector& vector ) const { RealType sum( 0.0 ); for( IndexType column = 0; column < this->getColumns(); column++ ) sum += this->getElement( row, column ) * vector.getElement( column ); sum += this->getElementFast( row, column ) * vector[ column ]; return sum; } #ifdef HAVE_CUDA template< typename Real, typename Index, typename Vector > __global__ void tnlDenseMatrixVectorProductCudaKernel( tnlDenseMatrix< Real, tnlCuda, Index >* matrix, const Vector* inVector, Vector* outVector, const Index gridIdx ) { const Index rowIdx = ( gridIdx * tnlCuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; if( rowIdx < matrix->getRows() ) ( *outVector )[ rowIdx ] = matrix->rowVectorProduct( rowIdx, *inVector ); } #endif template< typename Real, typename Device, typename Index > Loading @@ -343,8 +372,31 @@ void tnlDenseMatrix< Real, Device, Index >::vectorProduct( const Vector& inVecto << "Vector size: " << outVector.getSize() << endl << "Vector name: " << outVector.getName() << endl ); if( Device::getDevice() == tnlHostDevice ) for( IndexType row = 0; row < this->getRows(); row++ ) outVector[ row ] = rowVectorProduct( row, inVector ); if( Device::getDevice() == tnlCudaDevice ) { #ifdef HAVE_CUDA ThisType* kernel_this = tnlCuda::passToDevice( *this ); Vector* kernel_inVector = tnlCuda::passToDevice( inVector ); Vector* kernel_outVector = tnlCuda::passToDevice( outVector ); dim3 cudaBlockSize( 256 ), cudaGridSize( tnlCuda::getMaxGridSize() ); const IndexType cudaBlocks = roundUpDivision( this->getRows(), cudaBlockSize.x ); const IndexType cudaGrids = roundUpDivision( cudaBlocks, tnlCuda::getMaxGridSize() ); for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) { if( gridIdx == cudaGrids - 1 ) cudaGridSize.x = cudaBlocks % tnlCuda::getMaxGridSize(); tnlDenseMatrixVectorProductCudaKernel<<< cudaGridSize, cudaBlockSize >>> ( kernel_this, kernel_inVector, kernel_outVector, gridIdx ); } tnlCuda::freeFromDevice( kernel_this ); tnlCuda::freeFromDevice( kernel_inVector ); tnlCuda::freeFromDevice( kernel_outVector ); checkCudaDevice; #endif } } template< typename Real, Loading @@ -365,20 +417,29 @@ void tnlDenseMatrix< Real, Device, Index >::addMatrix( const Matrix& matrix, << "That matrix name: " << matrix.getName() << endl ); if( thisMatrixMultiplicator == 1.0 ) { for( IndexType row = 0; row < this->getRows(); row++ ) for( IndexType column = 0; column < this->getColumns(); column++ ) this->operator()( row, column ) += matrixMultiplicator*matrix( row, column ); } this->values.alphaXPlusY( matrixMultiplicator, matrix.values ); else { for( IndexType row = 0; row < this->getRows(); row++ ) for( IndexType column = 0; column < this->getColumns(); column++ ) this->operator()( row, column ) = thisMatrixMultiplicator * this->operator()( row, column) + matrixMultiplicator * matrix( row, column ); this->values.alphaXPlusBetaY( matrixMultiplicator, matrix.values, thisMatrixMultiplicator ); } #ifdef HAVE_CUDA template< typename Real, typename Index, typename Matrix1, typename Matrix2, int tileDim, int tileRowBlockSize > __global__ void tnlDenseMatrixMatrixProductKernel( tnlDenseMatrix< Real, tnlCuda, Index >* reusltMatrix, const Matrix1* matrix1, const Matrix2* matrix2, const Real matrix1Multiplicator, const Real matrix2Multiplicator, const Index gridIdx_x, const Index gridIdx_y ) { } #endif template< typename Real, typename Device, Loading @@ -402,6 +463,7 @@ void tnlDenseMatrix< Real, Device, Index >::getMatrixProduct( const Matrix1& mat << "Matrix2 rows: " << matrix2.getRows() << endl << "Matrix2 name: " << matrix2.getName() << endl ); if( Device::getDevice() == tnlHostDevice ) for( IndexType i = 0; i < this->getRows(); i += tileDim ) for( IndexType j = 0; j < this->getColumns(); j += tileDim ) { Loading @@ -409,7 +471,7 @@ void tnlDenseMatrix< Real, Device, Index >::getMatrixProduct( const Matrix1& mat const IndexType tileColumns = Min( tileDim, this->getColumns() - j ); for( IndexType i1 = i; i1 < i + tileRows; i1++ ) for( IndexType j1 = j; j1 < j + tileColumns; j1++ ) this->operator()( i1, j1 ) = 0.0; this->setElementFast( i1, j1, 0.0 ); for( IndexType k = 0; k < matrix1.getColumns(); k += tileDim ) { Loading @@ -417,9 +479,53 @@ void tnlDenseMatrix< Real, Device, Index >::getMatrixProduct( const Matrix1& mat for( IndexType i1 = 0; i1 < tileRows; i1++ ) for( IndexType j1 = 0; j1 < tileColumns; j1++ ) for( IndexType k1 = k; k1 < lastK; k1++ ) this->operator()( i + i1, j + j1 ) += matrix1( i + i1, k1 ) * matrix2( k1, j + j1 ); this->addElementFast( i + i1, j + j1, matrix1.getElementFast( i + i1, k1 ) * matrix2.getElementFast( k1, j + j1 ) ); } } if( Device::getDevice() == tnlCudaDevice ) { #ifdef HAVE_CUDA dim3 cudaBlockSize( 0 ), cudaGridSize( 0 ); const IndexType rowTiles = roundUpDivision( this->getRows(), tileDim ); const IndexType columnTiles = roundUpDivision( this->getColumns(), tileDim ); cudaBlockSize.x = blockXSize; cudaBlockSize.y = blockYSize; const IndexType rowGrids = roundUpDivision( rowTiles, tnlCuda::getMaxGridSize() ); const IndexType columnGrids = roundUpDivision( columnTiles, tnlCuda::getMaxGridSize() ); for( IndexType gridIdx_x = 0; gridIdx_x < columnGrids; gridIdx_x++ ) for( IndexType gridIdx_y = 0; gridIdx_y < rowGrids; gridIdx_y++ ) { cudaGridSize.x = cudaGridSize.y = tnlCuda::getMaxGridSize(); if( gridIdx_x == columnGrids - 1 ) cudaGridSize.x = columnTiles % tnlCuda::getMaxGridSize(); if( gridIdx_y == rowGrids - 1 ) cudaGridSize.y = rowTiles % tnlCuda::getMaxGridSize(); ThisType* this_kernel = tnlCuda::passToDevice( *this ); Matrix1* matrix1_kernel = tnlCuda::passToDevice( matrix1 ); Matrix2* matrix2_kernel = tnlCuda::passToDevice( matrix2 ); tnlDenseMatrixMatrixProductKernel< Real, Index, Matrix1, Matrix2, tileDim, tileRowBlockSize > <<< cudaGridSize, cudaBlockSize, 3*tileDim*tileDim >>> ( this_kernel, matrix1_kernel, matrix2_kernel, matrix1Multiplicator, matrix2Multiplicator, gridIdx_x, gridIdx_y ); tnlCuda::freeFromDevice( this_kernel ); tnlCuda::freeFromDevice( matrix1_kernel ); tnlCuda::freeFromDevice( matrix2_kernel ); } #endif } } Loading