diff --git a/src/implementation/matrices/tnlCSRMatrix_impl.h b/src/implementation/matrices/tnlCSRMatrix_impl.h index 26e5d52ab1950da26e7b871aa532f53fd0aff366..c8d383bdbf9997af54637d1789141c7de471cfb6 100644 --- a/src/implementation/matrices/tnlCSRMatrix_impl.h +++ b/src/implementation/matrices/tnlCSRMatrix_impl.h @@ -27,7 +27,8 @@ template< typename Real, typename Device, typename Index > tnlCSRMatrix< Real, Device, Index >::tnlCSRMatrix() -: spmvCudaKernel( scalar ) +: spmvCudaKernel( vector ), + cudaWarpSize( 32 ) //tnlCuda::getWarpSize() ) { }; @@ -557,11 +558,78 @@ void tnlCSRMatrix< Real, Device, Index >::setCudaKernelType( const SPMVCudaKerne template< typename Real, typename Device, typename Index > +#ifdef HAVE_CUDA + __device__ __host__ +#endif typename tnlCSRMatrix< Real, Device, Index >::SPMVCudaKernel tnlCSRMatrix< Real, Device, Index >::getCudaKernelType() const { return this->spmvCudaKernel; } +template< typename Real, + typename Device, + typename Index > +void tnlCSRMatrix< Real, Device, Index >::setCudaWarpSize( const int warpSize ) +{ + this->cudaWarpSize = warpSize; +} + +template< typename Real, + typename Device, + typename Index > +int tnlCSRMatrix< Real, Device, Index >::getCudaWarpSize() const +{ + return this->cudaWarpSize; +} + + +#ifdef HAVE_CUDA +template< typename Real, + typename Device, + typename Index > + template< typename Vector, + int warpSize > +__device__ +void tnlCSRMatrix< Real, Device, Index >::vectorProductCuda( const Vector& inVector, + Vector& outVector, + int gridIdx ) const +{ + Real* aux = getSharedMemory< Real >(); + IndexType globalIdx = ( gridIdx * tnlCuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; + const IndexType warpStart = warpSize * ( globalIdx / warpSize ); + const IndexType warpEnd = Min( warpStart + warpSize, this->getRows() ); + const IndexType inWarpIdx = globalIdx % warpSize; + for( IndexType row = warpStart; row < warpEnd; row++ ) + { + aux[ threadIdx.x ] = 0.0; + + IndexType elementPtr = this->rowPointers[ row ] + inWarpIdx; + const IndexType rowEnd = this->rowPointers[ row + 1 ]; + IndexType column; + while( elementPtr < rowEnd && + ( column = this->columnIndexes[ elementPtr ] ) < this->getColumns() ) + { + aux[ threadIdx.x ] += inVector[ column ] * this->values[ elementPtr ]; + elementPtr += warpSize; + } + if( warpSize == 32 ) + if( inWarpIdx < 16 ) aux[ threadIdx.x ] += aux[ threadIdx.x + 16 ]; + if( warpSize >= 16 ) + if( inWarpIdx < 8 ) aux[ threadIdx.x ] += aux[ threadIdx.x + 8 ]; + if( warpSize >= 8 ) + if( inWarpIdx < 4 ) aux[ threadIdx.x ] += aux[ threadIdx.x + 4 ]; + if( warpSize >= 4 ) + if( inWarpIdx < 2 ) aux[ threadIdx.x ] += aux[ threadIdx.x + 2 ]; + if( warpSize >= 2 ) + if( inWarpIdx < 1 ) aux[ threadIdx.x ] += aux[ threadIdx.x + 1 ]; + __syncthreads(); // TODO: I am not sure why + + if( inWarpIdx == 0 ) + outVector[ row ] = aux[ threadIdx.x ]; + } +} +#endif + template<> class tnlCSRMatrixDeviceDependentCode< tnlHost > { @@ -571,10 +639,11 @@ class tnlCSRMatrixDeviceDependentCode< tnlHost > template< typename Real, typename Index, - typename Vector > + typename InVector, + typename OutVector > static void vectorProduct( const tnlCSRMatrix< Real, Device, Index >& matrix, - const Vector& inVector, - Vector& outVector ) + const InVector& inVector, + OutVector& outVector ) { for( Index row = 0; row < matrix.getRows(); row ++ ) outVector[ row ] = matrix.rowVectorProduct( row, inVector ); @@ -585,17 +654,25 @@ class tnlCSRMatrixDeviceDependentCode< tnlHost > #ifdef HAVE_CUDA template< typename Real, typename Index, - typename Vector > -__global__ void tnlCSRMatrixVectorProductCudaScalarKernel( const tnlCSRMatrix< Real, tnlCuda, Index >* matrix, - const Vector* inVector, - Vector* outVector, - int gridIdx ) + typename Vector, + int warpSize > +__global__ void tnlCSRMatrixVectorProductCudaKernel( const tnlCSRMatrix< Real, tnlCuda, Index >* matrix, + const Vector* inVector, + Vector* outVector, + int gridIdx ) { typedef tnlCSRMatrix< Real, tnlCuda, Index > Matrix; tnlStaticAssert( Matrix::DeviceType::DeviceType == tnlCudaDevice, ); - const typename Matrix::IndexType rowIdx = ( gridIdx * tnlCuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; - if( rowIdx < matrix->getRows() ) - ( *outVector )[ rowIdx ] = matrix->rowVectorProduct( rowIdx, *inVector ); + const typename Matrix::IndexType rowIdx = ( gridIdx * tnlCuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; + if( matrix->getCudaKernelType() == Matrix::scalar ) + { + if( rowIdx < matrix->getRows() ) + ( *outVector )[ rowIdx ] = matrix->rowVectorProduct( rowIdx, *inVector ); + } + if( matrix->getCudaKernelType() == Matrix::vector ) + { + matrix->template vectorProductCuda< Vector, warpSize >( *inVector, *outVector, gridIdx ); + } } #endif @@ -613,20 +690,56 @@ void tnlCSRMatrixVectorProductCuda( const tnlCSRMatrix< Real, tnlCuda, Index >& Vector* kernel_inVector = tnlCuda::passToDevice( inVector ); Vector* kernel_outVector = tnlCuda::passToDevice( outVector ); dim3 cudaBlockSize( 256 ), cudaGridSize( tnlCuda::getMaxGridSize() ); - if( matrix.getCudaKernelType() == Matrix::scalar ) + const IndexType cudaBlocks = roundUpDivision( matrix.getRows(), cudaBlockSize.x ); + const IndexType cudaGrids = roundUpDivision( cudaBlocks, tnlCuda::getMaxGridSize() ); + for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) { - const IndexType cudaBlocks = roundUpDivision( matrix.getRows(), cudaBlockSize.x ); - const IndexType cudaGrids = roundUpDivision( cudaBlocks, tnlCuda::getMaxGridSize() ); - for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) - { - if( gridIdx == cudaGrids - 1 ) - cudaGridSize.x = cudaBlocks % tnlCuda::getMaxGridSize(); - tnlCSRMatrixVectorProductCudaScalarKernel<<< cudaGridSize, cudaBlockSize >>> - ( kernel_this, - kernel_inVector, - kernel_outVector, - gridIdx ); - } + if( gridIdx == cudaGrids - 1 ) + cudaGridSize.x = cudaBlocks % tnlCuda::getMaxGridSize(); + const int sharedMemory = cudaBlockSize.x * sizeof( Real ); + if( matrix.getCudaWarpSize() == 32 ) + tnlCSRMatrixVectorProductCudaKernel< Real, Index, Vector, 32 > + <<< cudaGridSize, cudaBlockSize, sharedMemory >>> + ( kernel_this, + kernel_inVector, + kernel_outVector, + gridIdx ); + if( matrix.getCudaWarpSize() == 16 ) + tnlCSRMatrixVectorProductCudaKernel< Real, Index, Vector, 16 > + <<< cudaGridSize, cudaBlockSize, sharedMemory >>> + ( kernel_this, + kernel_inVector, + kernel_outVector, + gridIdx ); + if( matrix.getCudaWarpSize() == 8 ) + tnlCSRMatrixVectorProductCudaKernel< Real, Index, Vector, 8 > + <<< cudaGridSize, cudaBlockSize, sharedMemory >>> + ( kernel_this, + kernel_inVector, + kernel_outVector, + gridIdx ); + if( matrix.getCudaWarpSize() == 4 ) + tnlCSRMatrixVectorProductCudaKernel< Real, Index, Vector, 4 > + <<< cudaGridSize, cudaBlockSize, sharedMemory >>> + ( kernel_this, + kernel_inVector, + kernel_outVector, + gridIdx ); + if( matrix.getCudaWarpSize() == 2 ) + tnlCSRMatrixVectorProductCudaKernel< Real, Index, Vector, 2 > + <<< cudaGridSize, cudaBlockSize, sharedMemory >>> + ( kernel_this, + kernel_inVector, + kernel_outVector, + gridIdx ); + if( matrix.getCudaWarpSize() == 1 ) + tnlCSRMatrixVectorProductCudaKernel< Real, Index, Vector, 1 > + <<< cudaGridSize, cudaBlockSize, sharedMemory >>> + ( kernel_this, + kernel_inVector, + kernel_outVector, + gridIdx ); + } tnlCuda::freeFromDevice( kernel_this ); tnlCuda::freeFromDevice( kernel_inVector ); @@ -645,10 +758,11 @@ class tnlCSRMatrixDeviceDependentCode< tnlCuda > template< typename Real, typename Index, - typename Vector > + typename InVector, + typename OutVector > static void vectorProduct( const tnlCSRMatrix< Real, Device, Index >& matrix, - const Vector& inVector, - Vector& outVector ) + const InVector& inVector, + OutVector& outVector ) { tnlCSRMatrixVectorProductCuda( matrix, inVector, outVector ); } diff --git a/src/implementation/matrices/tnlChunkedEllpackMatrix_impl.h b/src/implementation/matrices/tnlChunkedEllpackMatrix_impl.h index c178338e6dd3f8179ca037e650f30e8a1a1109ae..38742ae43d526ee8bb02946577dfe60fa54eb514 100644 --- a/src/implementation/matrices/tnlChunkedEllpackMatrix_impl.h +++ b/src/implementation/matrices/tnlChunkedEllpackMatrix_impl.h @@ -1098,9 +1098,10 @@ __device__ void tnlChunkedEllpackMatrix< Real, Device, Index >::computeSliceVect template< typename Real, typename Device, typename Index > - template< typename Vector > -void tnlChunkedEllpackMatrix< Real, Device, Index >::vectorProduct( const Vector& inVector, - Vector& outVector ) const + template< typename InVector, + typename OutVector > +void tnlChunkedEllpackMatrix< Real, Device, Index >::vectorProduct( const InVector& inVector, + OutVector& outVector ) const { DeviceDependentCode::vectorProduct( *this, inVector, outVector ); } @@ -1295,10 +1296,11 @@ class tnlChunkedEllpackMatrixDeviceDependentCode< tnlHost > template< typename Real, typename Index, - typename Vector > + typename InVector, + typename OutVector > static void vectorProduct( const tnlChunkedEllpackMatrix< Real, Device, Index >& matrix, - const Vector& inVector, - Vector& outVector ) + const InVector& inVector, + OutVector& outVector ) { for( Index row = 0; row < matrix.getRows(); row ++ ) outVector[ row ] = matrix.rowVectorProduct( row, inVector ); @@ -1308,10 +1310,11 @@ class tnlChunkedEllpackMatrixDeviceDependentCode< tnlHost > #ifdef HAVE_CUDA template< typename Real, typename Index, - typename Vector > + typename InVector, + typename OutVector > __global__ void tnlChunkedEllpackMatrixVectorProductCudaKernel( const tnlChunkedEllpackMatrix< Real, tnlCuda, Index >* matrix, - const Vector* inVector, - Vector* outVector, + const InVector* inVector, + OutVector* outVector, int gridIdx ) { const Index sliceIdx = gridIdx * tnlCuda::getMaxGridSize() + blockIdx.x; @@ -1359,18 +1362,19 @@ class tnlChunkedEllpackMatrixDeviceDependentCode< tnlCuda > template< typename Real, typename Index, - typename Vector > + typename InVector, + typename OutVector > static void vectorProduct( const tnlChunkedEllpackMatrix< Real, Device, Index >& matrix, - const Vector& inVector, - Vector& outVector ) + const InVector& inVector, + OutVector& outVector ) { #ifdef HAVE_CUDA typedef tnlChunkedEllpackMatrix< Real, tnlCuda, Index > Matrix; typedef Index IndexType; typedef Real RealType; Matrix* kernel_this = tnlCuda::passToDevice( matrix ); - Vector* kernel_inVector = tnlCuda::passToDevice( inVector ); - Vector* kernel_outVector = tnlCuda::passToDevice( outVector ); + InVector* kernel_inVector = tnlCuda::passToDevice( inVector ); + OutVector* kernel_outVector = tnlCuda::passToDevice( outVector ); dim3 cudaBlockSize( matrix.getNumberOfChunksInSlice() ), cudaGridSize( tnlCuda::getMaxGridSize() ); const IndexType cudaBlocks = matrix.getNumberOfSlices(); @@ -1381,7 +1385,7 @@ class tnlChunkedEllpackMatrixDeviceDependentCode< tnlCuda > { if( gridIdx == cudaGrids - 1 ) cudaGridSize.x = cudaBlocks % tnlCuda::getMaxGridSize(); - tnlChunkedEllpackMatrixVectorProductCudaKernel< Real, Index, Vector > + tnlChunkedEllpackMatrixVectorProductCudaKernel< Real, Index, InVector, OutVector > <<< cudaGridSize, cudaBlockSize, sharedMemory >>> ( kernel_this, kernel_inVector, diff --git a/src/implementation/matrices/tnlDenseMatrix_impl.h b/src/implementation/matrices/tnlDenseMatrix_impl.h index 3871dbff8b9039ad909ba6b7726b186fb6c352c8..9a10ad6d25317f768dbd94d85986abf0d0062ba5 100644 --- a/src/implementation/matrices/tnlDenseMatrix_impl.h +++ b/src/implementation/matrices/tnlDenseMatrix_impl.h @@ -352,27 +352,13 @@ typename Vector::RealType tnlDenseMatrix< Real, Device, Index >::rowVectorProduc return sum; } -/*#ifdef HAVE_CUDA -template< typename Real, - typename Index, - typename Vector > -__global__ void tnlDenseMatrixVectorProductCudaKernel( tnlDenseMatrix< Real, tnlCuda, Index >* matrix, - const Vector* inVector, - Vector* outVector, - const Index gridIdx ) -{ - const Index rowIdx = ( gridIdx * tnlCuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; - if( rowIdx < matrix->getRows() ) - ( *outVector )[ rowIdx ] = matrix->rowVectorProduct( rowIdx, *inVector ); -} -#endif*/ - template< typename Real, typename Device, typename Index > - template< typename Vector > -void tnlDenseMatrix< Real, Device, Index >::vectorProduct( const Vector& inVector, - Vector& outVector ) const + template< typename InVector, + typename OutVector > +void tnlDenseMatrix< Real, Device, Index >::vectorProduct( const InVector& inVector, + OutVector& outVector ) const { tnlAssert( this->getColumns() == inVector.getSize(), cerr << "Matrix columns: " << this->getColumns() << endl @@ -385,11 +371,7 @@ void tnlDenseMatrix< Real, Device, Index >::vectorProduct( const Vector& inVecto << "Vector size: " << outVector.getSize() << endl << "Vector name: " << outVector.getName() << endl ); - if( Device::getDevice() == tnlHostDevice ) - for( IndexType row = 0; row < this->getRows(); row++ ) - outVector[ row ] = rowVectorProduct( row, inVector ); - if( Device::getDevice() == tnlCudaDevice ) - tnlMatrixVectorProductCuda( *this, inVector, outVector ); + DeviceDependentCode::vectorProduct( *this, inVector, outVector ); } template< typename Real, @@ -928,5 +910,43 @@ Index tnlDenseMatrix< Real, Device, Index >::getElementIndex( const IndexType ro return column * this->rows + row; } +template<> +class tnlDenseMatrixDeviceDependentCode< tnlHost > +{ + public: + + typedef tnlHost Device; + + template< typename Real, + typename Index, + typename InVector, + typename OutVector > + static void vectorProduct( const tnlDenseMatrix< Real, Device, Index >& matrix, + const InVector& inVector, + OutVector& outVector ) + { + for( Index row = 0; row < matrix.getRows(); row ++ ) + outVector[ row ] = matrix.rowVectorProduct( row, inVector ); + } +}; + +template<> +class tnlDenseMatrixDeviceDependentCode< tnlCuda > +{ + public: + + typedef tnlCuda Device; + + template< typename Real, + typename Index, + typename InVector, + typename OutVector > + static void vectorProduct( const tnlDenseMatrix< Real, Device, Index >& matrix, + const InVector& inVector, + OutVector& outVector ) + { + tnlMatrixVectorProductCuda( matrix, inVector, outVector ); + } +}; #endif /* TNLDENSEMATRIX_IMPL_H_ */ diff --git a/src/implementation/matrices/tnlEllpackMatrix_impl.h b/src/implementation/matrices/tnlEllpackMatrix_impl.h index d08bd09a150d4abfff73eb8916ebc2510409ac34..f9163ceeddb6e8f0eafad85672f03c5d7bd44743 100644 --- a/src/implementation/matrices/tnlEllpackMatrix_impl.h +++ b/src/implementation/matrices/tnlEllpackMatrix_impl.h @@ -489,9 +489,10 @@ typename Vector::RealType tnlEllpackMatrix< Real, Device, Index >::rowVectorProd template< typename Real, typename Device, typename Index > - template< typename Vector > -void tnlEllpackMatrix< Real, Device, Index >::vectorProduct( const Vector& inVector, - Vector& outVector ) const + template< typename InVector, + typename OutVector > +void tnlEllpackMatrix< Real, Device, Index >::vectorProduct( const InVector& inVector, + OutVector& outVector ) const { DeviceDependentCode::vectorProduct( *this, inVector, outVector ); } @@ -665,10 +666,11 @@ class tnlEllpackMatrixDeviceDependentCode< tnlHost > template< typename Real, typename Index, - typename Vector > + typename InVector, + typename OutVector > static void vectorProduct( const tnlEllpackMatrix< Real, Device, Index >& matrix, - const Vector& inVector, - Vector& outVector ) + const InVector& inVector, + OutVector& outVector ) { for( Index row = 0; row < matrix.getRows(); row ++ ) outVector[ row ] = matrix.rowVectorProduct( row, inVector ); @@ -716,10 +718,11 @@ class tnlEllpackMatrixDeviceDependentCode< tnlCuda > template< typename Real, typename Index, - typename Vector > + typename InVector, + typename OutVector > static void vectorProduct( const tnlEllpackMatrix< Real, Device, Index >& matrix, - const Vector& inVector, - Vector& outVector ) + const InVector& inVector, + OutVector& outVector ) { tnlMatrixVectorProductCuda( matrix, inVector, outVector ); } diff --git a/src/implementation/matrices/tnlMatrix_impl.h b/src/implementation/matrices/tnlMatrix_impl.h index 73fea5e04fc34858238f0e9c5e2d1ddb747b712e..ff8ee34e7eb251ed0694947692c0f4f2f5b4fe64 100644 --- a/src/implementation/matrices/tnlMatrix_impl.h +++ b/src/implementation/matrices/tnlMatrix_impl.h @@ -225,10 +225,11 @@ void tnlMatrix< Real, Device, Index >::print( ostream& str ) const #ifdef HAVE_CUDA template< typename Matrix, - typename Vector > + typename InVector, + typename OutVector > __global__ void tnlMatrixVectorProductCudaKernel( const Matrix* matrix, - const Vector* inVector, - Vector* outVector, + const InVector* inVector, + OutVector* outVector, int gridIdx ) { tnlStaticAssert( Matrix::DeviceType::DeviceType == tnlCudaDevice, ); @@ -239,16 +240,17 @@ __global__ void tnlMatrixVectorProductCudaKernel( const Matrix* matrix, #endif template< typename Matrix, - typename Vector > + typename InVector, + typename OutVector > void tnlMatrixVectorProductCuda( const Matrix& matrix, - const Vector& inVector, - Vector& outVector ) + const InVector& inVector, + OutVector& outVector ) { #ifdef HAVE_CUDA typedef typename Matrix::IndexType IndexType; Matrix* kernel_this = tnlCuda::passToDevice( matrix ); - Vector* kernel_inVector = tnlCuda::passToDevice( inVector ); - Vector* kernel_outVector = tnlCuda::passToDevice( outVector ); + InVector* kernel_inVector = tnlCuda::passToDevice( inVector ); + OutVector* kernel_outVector = tnlCuda::passToDevice( outVector ); dim3 cudaBlockSize( 256 ), cudaGridSize( tnlCuda::getMaxGridSize() ); const IndexType cudaBlocks = roundUpDivision( matrix.getRows(), cudaBlockSize.x ); const IndexType cudaGrids = roundUpDivision( cudaBlocks, tnlCuda::getMaxGridSize() ); diff --git a/src/implementation/matrices/tnlMultidiagonalMatrix_impl.h b/src/implementation/matrices/tnlMultidiagonalMatrix_impl.h index 86ce92d62779846da43017ac58fdfbe2ba9c4b68..46f3d2084d0bd65b00a877d9b640470436114a24 100644 --- a/src/implementation/matrices/tnlMultidiagonalMatrix_impl.h +++ b/src/implementation/matrices/tnlMultidiagonalMatrix_impl.h @@ -468,17 +468,23 @@ typename Vector::RealType tnlMultidiagonalMatrix< Real, Device, Index >::rowVect template< typename Real, typename Device, typename Index > - template< typename Vector > -void tnlMultidiagonalMatrix< Real, Device, Index >::vectorProduct( const Vector& inVector, - Vector& outVector ) const + template< typename InVector, + typename OutVector > +void tnlMultidiagonalMatrix< Real, Device, Index >::vectorProduct( const InVector& inVector, + OutVector& outVector ) const { - if( Device::getDevice() == tnlHostDevice ) - { - for( Index row = 0; row < this->getRows(); row ++ ) - outVector[ row ] = this->rowVectorProduct( row, inVector ); - } - if( Device::getDevice() == tnlCudaDevice ) - tnlMatrixVectorProductCuda( *this, inVector, outVector ); + tnlAssert( this->getColumns() == inVector.getSize(), + cerr << "Matrix columns: " << this->getColumns() << endl + << "Matrix name: " << this->getName() << endl + << "Vector size: " << inVector.getSize() << endl + << "Vector name: " << inVector.getName() << endl ); + tnlAssert( this->getRows() == outVector.getSize(), + cerr << "Matrix rows: " << this->getRows() << endl + << "Matrix name: " << this->getName() << endl + << "Vector size: " << outVector.getSize() << endl + << "Vector name: " << outVector.getName() << endl ); + + DeviceDependentCode::vectorProduct( *this, inVector, outVector ); } template< typename Real, @@ -680,6 +686,8 @@ class tnlMultidiagonalMatrixDeviceDependentCode< tnlHost > { public: + typedef tnlHost Device; + template< typename Index > static Index getElementIndex( const Index rows, const Index diagonals, @@ -688,6 +696,18 @@ class tnlMultidiagonalMatrixDeviceDependentCode< tnlHost > { return row*diagonals + diagonal; } + + template< typename Real, + typename Index, + typename InVector, + typename OutVector > + static void vectorProduct( const tnlMultidiagonalMatrix< Real, Device, Index >& matrix, + const InVector& inVector, + OutVector& outVector ) + { + for( Index row = 0; row < matrix.getRows(); row ++ ) + outVector[ row ] = matrix.rowVectorProduct( row, inVector ); + } }; template<> @@ -695,6 +715,8 @@ class tnlMultidiagonalMatrixDeviceDependentCode< tnlCuda > { public: + typedef tnlCuda Device; + template< typename Index > #ifdef HAVE_CUDA __device__ __host__ @@ -706,6 +728,17 @@ class tnlMultidiagonalMatrixDeviceDependentCode< tnlCuda > { return diagonal*rows + row; } + + template< typename Real, + typename Index, + typename InVector, + typename OutVector > + static void vectorProduct( const tnlMultidiagonalMatrix< Real, Device, Index >& matrix, + const InVector& inVector, + OutVector& outVector ) + { + tnlMatrixVectorProductCuda( matrix, inVector, outVector ); + } }; diff --git a/src/implementation/matrices/tnlSlicedEllpackMatrix_impl.h b/src/implementation/matrices/tnlSlicedEllpackMatrix_impl.h index 31a01c47ca623cec1499ba5a629ec966dc7a5ef3..db4a6edeb37cd4373c446a54246e33d854c18c9e 100644 --- a/src/implementation/matrices/tnlSlicedEllpackMatrix_impl.h +++ b/src/implementation/matrices/tnlSlicedEllpackMatrix_impl.h @@ -492,9 +492,10 @@ template< typename Real, typename Device, typename Index, int SliceSize > - template< typename Vector > -void tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >::vectorProduct( const Vector& inVector, - Vector& outVector ) const + template< typename InVector, + typename OutVector > +void tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >::vectorProduct( const InVector& inVector, + OutVector& outVector ) const { DeviceDependentCode::vectorProduct( *this, inVector, outVector ); } @@ -732,11 +733,12 @@ class tnlSlicedEllpackMatrixDeviceDependentCode< tnlHost > template< typename Real, typename Index, - typename Vector, + typename InVector, + typename OutVector, int SliceSize > static void vectorProduct( const tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >& matrix, - const Vector& inVector, - Vector& outVector ) + const InVector& inVector, + OutVector& outVector ) { for( Index row = 0; row < matrix.getRows(); row ++ ) outVector[ row ] = matrix.rowVectorProduct( row, inVector ); @@ -836,11 +838,12 @@ class tnlSlicedEllpackMatrixDeviceDependentCode< tnlCuda > template< typename Real, typename Index, - typename Vector, + typename InVector, + typename OutVector, int SliceSize > static void vectorProduct( const tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >& matrix, - const Vector& inVector, - Vector& outVector ) + const InVector& inVector, + OutVector& outVector ) { tnlMatrixVectorProductCuda( matrix, inVector, outVector ); } diff --git a/src/implementation/matrices/tnlTridiagonalMatrix_impl.h b/src/implementation/matrices/tnlTridiagonalMatrix_impl.h index f37ca4ba3ff4b81f552c42d287ef9dff1db1e7e0..e493a475695029e6c8439582a0f5056f3b3fd66d 100644 --- a/src/implementation/matrices/tnlTridiagonalMatrix_impl.h +++ b/src/implementation/matrices/tnlTridiagonalMatrix_impl.h @@ -394,9 +394,10 @@ typename Vector::RealType tnlTridiagonalMatrix< Real, Device, Index >::rowVector template< typename Real, typename Device, typename Index > - template< typename Vector > -void tnlTridiagonalMatrix< Real, Device, Index >::vectorProduct( const Vector& inVector, - Vector& outVector ) const + template< typename InVector, + typename OutVector > +void tnlTridiagonalMatrix< Real, Device, Index >::vectorProduct( const InVector& inVector, + OutVector& outVector ) const { tnlAssert( this->getColumns() == inVector.getSize(), cerr << "Matrix columns: " << this->getColumns() << endl @@ -409,12 +410,7 @@ void tnlTridiagonalMatrix< Real, Device, Index >::vectorProduct( const Vector& i << "Vector size: " << outVector.getSize() << endl << "Vector name: " << outVector.getName() << endl ); - if( Device::getDevice() == tnlHostDevice ) - for( IndexType row = 0; row < this->getRows(); row++ ) - outVector[ row ] = this->rowVectorProduct( row, inVector ); - - if( Device::getDevice() == tnlCudaDevice ) - tnlMatrixVectorProductCuda( *this, inVector, outVector ); + DeviceDependentCode::vectorProduct( *this, inVector, outVector ); } template< typename Real, @@ -610,6 +606,8 @@ class tnlTridiagonalMatrixDeviceDependentCode< tnlHost > { public: + typedef tnlHost Device; + template< typename Index > static Index getElementIndex( const Index rows, const Index row, @@ -637,12 +635,26 @@ class tnlTridiagonalMatrixDeviceDependentCode< tnlHost > vector[ row ] * values[ i++ ] + vector[ row + 1 ] * values[ i ]; } + + template< typename Real, + typename Index, + typename InVector, + typename OutVector > + static void vectorProduct( const tnlTridiagonalMatrix< Real, Device, Index >& matrix, + const InVector& inVector, + OutVector& outVector ) + { + for( Index row = 0; row < matrix.getRows(); row ++ ) + outVector[ row ] = matrix.rowVectorProduct( row, inVector ); + } }; template<> class tnlTridiagonalMatrixDeviceDependentCode< tnlCuda > { public: + + typedef tnlCuda Device; template< typename Index > #ifdef HAVE_CUDA @@ -677,6 +689,17 @@ class tnlTridiagonalMatrixDeviceDependentCode< tnlCuda > vector[ row ] * values[ i + rows ] + vector[ row + 1 ] * values[ i + 2*rows ]; } + + template< typename Real, + typename Index, + typename InVector, + typename OutVector > + static void vectorProduct( const tnlTridiagonalMatrix< Real, Device, Index >& matrix, + const InVector& inVector, + OutVector& outVector ) + { + tnlMatrixVectorProductCuda( matrix, inVector, outVector ); + } }; diff --git a/src/implementation/solvers/linear/krylov/tnlGMRESSolver_impl.h b/src/implementation/solvers/linear/krylov/tnlGMRESSolver_impl.h index 1b9a5947a25fea617a420c5bfeebfb5d479a8c5b..97142cca8fd503941fb23e0c877e25522fb138f2 100644 --- a/src/implementation/solvers/linear/krylov/tnlGMRESSolver_impl.h +++ b/src/implementation/solvers/linear/krylov/tnlGMRESSolver_impl.h @@ -31,7 +31,7 @@ tnlGMRESSolver< Matrix, Preconditioner > :: tnlGMRESSolver() _sn( "tnlGMRESSolver::_sn" ), _H( "tnlGMRESSolver:_H" ), size( 0 ), - restarting( 0 ), + restarting( 10 ), matrix( 0 ), preconditioner( 0 ) { @@ -164,7 +164,7 @@ bool tnlGMRESSolver< Matrix, Preconditioner > :: solve( const Vector& b, Vector& */ if( preconditioner ) { - matrix -> vectorProduct( vi, _M_tmp ); + matrix->vectorProduct( vi, _M_tmp ); this->preconditioner->solve( _M_tmp, _w ); } else diff --git a/src/matrices/tnlCSRMatrix.h b/src/matrices/tnlCSRMatrix.h index 4c0552325149dd4b6b5d7d0b2edce7fec9870aa2..7031b88cbc60991f3e53746167c11709e4f94b62 100644 --- a/src/matrices/tnlCSRMatrix.h +++ b/src/matrices/tnlCSRMatrix.h @@ -136,7 +136,8 @@ class tnlCSRMatrix : public tnlSparseMatrix< Real, Device, Index > typename Vector::RealType rowVectorProduct( const IndexType row, const Vector& vector ) const; - template< typename InVector, typename OutVector > + template< typename InVector, + typename OutVector > void vectorProduct( const InVector& inVector, OutVector& outVector ) const; @@ -167,14 +168,32 @@ class tnlCSRMatrix : public tnlSparseMatrix< Real, Device, Index > void setCudaKernelType( const SPMVCudaKernel kernel ); +#ifdef HAVE_CUDA + __device__ __host__ +#endif SPMVCudaKernel getCudaKernelType() const; + void setCudaWarpSize( const int warpSize ); + + int getCudaWarpSize() const; + +#ifdef HAVE_CUDA + template< typename Vector, + int warpSize > + __device__ + void vectorProductCuda( const Vector& inVector, + Vector& outVector, + int gridIdx ) const; +#endif + protected: tnlVector< Index, Device, Index > rowPointers; SPMVCudaKernel spmvCudaKernel; + int cudaWarpSize; + typedef tnlCSRMatrixDeviceDependentCode< DeviceType > DeviceDependentCode; friend class tnlCSRMatrixDeviceDependentCode< DeviceType >; diff --git a/src/matrices/tnlChunkedEllpackMatrix.h b/src/matrices/tnlChunkedEllpackMatrix.h index 007b1e58c30b78a9453770ae223aa83ce2c47f4c..87cace2d49db4ca9b1ed9a7eff1ff6de2b624d21 100644 --- a/src/matrices/tnlChunkedEllpackMatrix.h +++ b/src/matrices/tnlChunkedEllpackMatrix.h @@ -191,9 +191,10 @@ class tnlChunkedEllpackMatrix : public tnlSparseMatrix< Real, Device, Index > #endif - template< typename Vector > - void vectorProduct( const Vector& inVector, - Vector& outVector ) const; + template< typename InVector, + typename OutVector > + void vectorProduct( const InVector& inVector, + OutVector& outVector ) const; template< typename Real2, typename Index2 > void addMatrix( const tnlChunkedEllpackMatrix< Real2, Device, Index2 >& matrix, diff --git a/src/matrices/tnlDenseMatrix.h b/src/matrices/tnlDenseMatrix.h index 80cd8f6dfaf9155627d5440d9bd43828daef97de..d991c5aefa3644ea1b8d3324ec3f42833afe8120 100644 --- a/src/matrices/tnlDenseMatrix.h +++ b/src/matrices/tnlDenseMatrix.h @@ -22,6 +22,9 @@ #include <matrices/tnlMatrix.h> #include <core/arrays/tnlArray.h> +template< typename Device > +class tnlDenseMatrixDeviceDependentCode; + template< typename Real = double, typename Device = tnlHost, typename Index = int > @@ -145,9 +148,9 @@ class tnlDenseMatrix : public tnlMatrix< Real, Device, Index > typename Vector::RealType rowVectorProduct( const IndexType row, const Vector& vector ) const; - template< typename Vector > - void vectorProduct( const Vector& inVector, - Vector& outVector ) const; + template< typename InVector, typename OutVector > + void vectorProduct( const InVector& inVector, + OutVector& outVector ) const; template< typename Matrix > void addMatrix( const Matrix& matrix, @@ -202,6 +205,8 @@ class tnlDenseMatrix : public tnlMatrix< Real, Device, Index > IndexType getElementIndex( const IndexType row, const IndexType column ) const; + typedef tnlDenseMatrixDeviceDependentCode< DeviceType > DeviceDependentCode; + friend class tnlDenseMatrixDeviceDependentCode< DeviceType >; }; diff --git a/src/matrices/tnlEllpackMatrix.h b/src/matrices/tnlEllpackMatrix.h index 646c0219c67d0dd7b2caaa8d939de8ab43f88402..be90ae6f711e6f041067dfc45c4980a6dc420bec 100644 --- a/src/matrices/tnlEllpackMatrix.h +++ b/src/matrices/tnlEllpackMatrix.h @@ -148,9 +148,10 @@ template< typename Vector > typename Vector::RealType rowVectorProduct( const IndexType row, const Vector& vector ) const; - template< typename Vector > - void vectorProduct( const Vector& inVector, - Vector& outVector ) const; + template< typename InVector, + typename OutVector > + void vectorProduct( const InVector& inVector, + OutVector& outVector ) const; template< typename Real2, typename Index2 > void addMatrix( const tnlEllpackMatrix< Real2, Device, Index2 >& matrix, diff --git a/src/matrices/tnlMatrix.h b/src/matrices/tnlMatrix.h index e927ca66ab6b24912cf18d06f7d0da159761301d..a80d2843a4a9917b6cf30e9d2e03b0f35bb395a4 100644 --- a/src/matrices/tnlMatrix.h +++ b/src/matrices/tnlMatrix.h @@ -133,10 +133,11 @@ ostream& operator << ( ostream& str, const tnlMatrix< Real, Device, Index >& m ) } template< typename Matrix, - typename Vector > + typename InVector, + typename OutVector > void tnlMatrixVectorProductCuda( const Matrix& matrix, - const Vector& inVector, - Vector& outVector ); + const InVector& inVector, + OutVector& outVector ); #include <implementation/matrices/tnlMatrix_impl.h> diff --git a/src/matrices/tnlMultidiagonalMatrix.h b/src/matrices/tnlMultidiagonalMatrix.h index 831a4fbc2b320a683235b6f596172f87d3fd82ae..d0ecf615c4d9f6c15a506250e031e90804376501 100644 --- a/src/matrices/tnlMultidiagonalMatrix.h +++ b/src/matrices/tnlMultidiagonalMatrix.h @@ -21,6 +21,9 @@ #include <matrices/tnlMatrix.h> #include <core/vectors/tnlVector.h> +template< typename Device > +class tnlMultidiagonalMatrixDeviceDependentCode; + template< typename Real, typename Device = tnlHost, typename Index = int > class tnlMultidiagonalMatrix : public tnlMatrix< Real, Device, Index > { @@ -148,9 +151,10 @@ class tnlMultidiagonalMatrix : public tnlMatrix< Real, Device, Index > typename Vector::RealType rowVectorProduct( const IndexType row, const Vector& vector ) const; - template< typename Vector > - void vectorProduct( const Vector& inVector, - Vector& outVector ) const; + template< typename InVector, + typename OutVector > + void vectorProduct( const InVector& inVector, + OutVector& outVector ) const; template< typename Real2, typename Index2 > void addMatrix( const tnlMultidiagonalMatrix< Real2, Device, Index2 >& matrix, @@ -183,7 +187,6 @@ class tnlMultidiagonalMatrix : public tnlMatrix< Real, Device, Index > const IndexType column, IndexType& index ) const; - #ifdef HAVE_CUDA __device__ __host__ #endif @@ -195,6 +198,10 @@ class tnlMultidiagonalMatrix : public tnlMatrix< Real, Device, Index > tnlVector< Index, Device, Index > diagonalsShift; + typedef tnlMultidiagonalMatrixDeviceDependentCode< DeviceType > DeviceDependentCode; + friend class tnlMultidiagonalMatrixDeviceDependentCode< DeviceType >; + + }; #include <implementation/matrices/tnlMultidiagonalMatrix_impl.h> diff --git a/src/matrices/tnlSlicedEllpackMatrix.h b/src/matrices/tnlSlicedEllpackMatrix.h index d3dfc76b74f2434021823bc03a87166bbcedb31a..2530868d274a8c944a06095ff24fb293f0bb2f4f 100644 --- a/src/matrices/tnlSlicedEllpackMatrix.h +++ b/src/matrices/tnlSlicedEllpackMatrix.h @@ -158,9 +158,10 @@ class tnlSlicedEllpackMatrix : public tnlSparseMatrix< Real, Device, Index > typename Vector::RealType rowVectorProduct( const IndexType row, const Vector& vector ) const; - template< typename Vector > - void vectorProduct( const Vector& inVector, - Vector& outVector ) const; + template< typename InVector, + typename OutVector > + void vectorProduct( const InVector& inVector, + OutVector& outVector ) const; template< typename Real2, typename Index2 > void addMatrix( const tnlSlicedEllpackMatrix< Real2, Device, Index2 >& matrix, diff --git a/src/matrices/tnlTridiagonalMatrix.h b/src/matrices/tnlTridiagonalMatrix.h index cba9cd70b677739fba7d8665af19d14bfdb4a69f..7bcafd79e0ae21684b87a1b3789a0ca5a492d58d 100644 --- a/src/matrices/tnlTridiagonalMatrix.h +++ b/src/matrices/tnlTridiagonalMatrix.h @@ -21,6 +21,9 @@ #include <matrices/tnlMatrix.h> #include <core/vectors/tnlVector.h> +template< typename Device > +class tnlTridiagonalMatrixDeviceDependentCode; + template< typename Real = double, typename Device = tnlHost, typename Index = int > @@ -143,9 +146,10 @@ class tnlTridiagonalMatrix : public tnlMatrix< Real, Device, Index > typename Vector::RealType rowVectorProduct( const IndexType row, const Vector& vector ) const; - template< typename Vector > - void vectorProduct( const Vector& inVector, - Vector& outVector ) const; + template< typename InVector, + typename OutVector > + void vectorProduct( const InVector& inVector, + OutVector& outVector ) const; template< typename Real2, typename Index2 > void addMatrix( const tnlTridiagonalMatrix< Real2, Device, Index2 >& matrix, @@ -187,6 +191,9 @@ class tnlTridiagonalMatrix : public tnlMatrix< Real, Device, Index > const IndexType column ) const; tnlVector< RealType, DeviceType, IndexType > values; + + typedef tnlTridiagonalMatrixDeviceDependentCode< DeviceType > DeviceDependentCode; + friend class tnlTridiagonalMatrixDeviceDependentCode< DeviceType >; }; #include <implementation/matrices/tnlTridiagonalMatrix_impl.h> diff --git a/tests/benchmarks/CMakeLists.txt b/tests/benchmarks/CMakeLists.txt index fbbc8f18becd1dd325c2a777f14cc5434130bf4d..7eacc115c2072d74bbc77cc4a8305882c2292e0f 100755 --- a/tests/benchmarks/CMakeLists.txt +++ b/tests/benchmarks/CMakeLists.txt @@ -1,5 +1,31 @@ ADD_SUBDIRECTORY( share ) +IF( BUILD_CUDA ) + CUDA_ADD_EXECUTABLE( tnl-benchmark-spmv${debugExt} tnl-benchmark-spmv.cu + OPTIONS -arch sm_20 -shared ) + SET_TARGET_PROPERTIES( tnl-benchmark-spmv${debugExt} PROPERTIES CUDA_COMPILE_FLAGS "${CXX_OPTIMIZE_FLAGS}" ) + CUDA_ADD_EXECUTABLE( tnl-benchmark-linear-solvers${debugExt} tnl-benchmark-linear-solvers.cu + OPTIONS -arch sm_20 -shared ) + SET_TARGET_PROPERTIES( tnl-benchmark-spmv${debugExt} PROPERTIES CUDA_COMPILE_FLAGS "${CXX_OPTIMIZE_FLAGS}" ) + SET_TARGET_PROPERTIES( tnl-benchmark-linear-solvers${debugExt} PROPERTIES CUDA_COMPILE_FLAGS "${CXX_OPTIMIZE_FLAGS}" ) +ELSE() + ADD_EXECUTABLE( tnl-benchmark-spmv${debugExt} tnl-benchmark-spmv.cpp ) + ADD_EXECUTABLE( tnl-benchmark-linear-solvers${debugExt} tnl-benchmark-linear-solvers.cpp ) + SET_TARGET_PROPERTIES( tnl-benchmark-spmv${debugExt} PROPERTIES COMPILE_FLAGS "${CXX_OPTIMIZE_FLAGS}" ) + SET_TARGET_PROPERTIES( tnl-benchmark-linear-solvers${debugExt} PROPERTIES COMPILE_FLAGS "${CXX_OPTIMIZE_FLAGS}" ) +ENDIF() +TARGET_LINK_LIBRARIES( tnl-benchmark-spmv${debugExt} tnl${debugExt}-${tnlVersion} ) +TARGET_LINK_LIBRARIES( tnl-benchmark-linear-solvers${debugExt} tnl${debugExt}-${tnlVersion} ) + +INSTALL( TARGETS tnl-benchmark-spmv${debugExt} + tnl-benchmark-linear-solvers${debugExt} + RUNTIME DESTINATION bin ) + + + + + +#################################################### SET( tnlSpmvBenchmark_headers sparse-matrix-benchmark.h tnlSpmvBenchmark.h tnlSpmvBenchmarkAdaptiveRgCSRMatrix.h @@ -19,21 +45,7 @@ ELSE() ENDIF() TARGET_LINK_LIBRARIES( tnl-sparse-matrix-benchmark${debugExt} tnl${debugExt}-${tnlVersion} ${CUSPARSE_LIBRARY} ) - - -IF( BUILD_CUDA ) - CUDA_ADD_EXECUTABLE( tnl-benchmark-spmv${debugExt} tnl-benchmark-spmv.cu - OPTIONS -arch sm_20 -shared ) - SET_TARGET_PROPERTIES( tnl-benchmark-spmv${debugExt} PROPERTIES CUDA_COMPILE_FLAGS "${CXX_OPTIMIZE_FLAGS}" ) -ELSE() - ADD_EXECUTABLE( tnl-benchmark-spmv${debugExt} tnl-benchmark-spmv.cpp ) - SET_TARGET_PROPERTIES( tnl-benchmark-spmv${debugExt} PROPERTIES COMPILE_FLAGS "${CXX_OPTIMIZE_FLAGS}" ) -ENDIF() -TARGET_LINK_LIBRARIES( tnl-benchmark-spmv${debugExt} tnl${debugExt}-${tnlVersion} ) - -INSTALL( TARGETS tnl-sparse-matrix-benchmark${debugExt} - tnl-benchmark-spmv${debugExt} - RUNTIME DESTINATION bin ) + #IF( BUILD_CUDA ) diff --git a/tests/benchmarks/share/CMakeLists.txt b/tests/benchmarks/share/CMakeLists.txt index 3734e0f0a7a9f2c1b79cf57bc8d4e167ca356f05..e8bb6840d9ef9b75fa1c6faef90785611898c5e6 100755 --- a/tests/benchmarks/share/CMakeLists.txt +++ b/tests/benchmarks/share/CMakeLists.txt @@ -1,6 +1,7 @@ INSTALL( FILES tnl-sparse-matrix-benchmark.cfg.desc tnl-matrix-solvers-benchmark.cfg.desc tnl-benchmark-spmv.cfg.desc + tnl-benchmark-linear-solvers.cfg.desc DESTINATION share/tnl-${tnlVersion} ) INSTALL( FILES matrix-market florida-matrix-market @@ -9,6 +10,7 @@ INSTALL( FILES matrix-market draw-matrices run-matrix-solvers-benchmark run-tnl-benchmark-spmv + run-tnl-benchmark-linear-solvers cuda-profiler.conf process-cuda-profile.pl tnl-log-to-html.py diff --git a/tests/benchmarks/share/run-tnl-benchmark-linear-solvers b/tests/benchmarks/share/run-tnl-benchmark-linear-solvers new file mode 100644 index 0000000000000000000000000000000000000000..d0cc977f610951a2e52258dfb26ea6ed2840012d --- /dev/null +++ b/tests/benchmarks/share/run-tnl-benchmark-linear-solvers @@ -0,0 +1,68 @@ +#!/usr/bin/env bash + +DEBUG="no" +STOP_TIME="1" +export CUDA_PROFILE=0 + +PWD=`pwd` +IWD="$PWD" +BASE="ftp://math.nist.gov/pub/MatrixMarket2/Harwell-Boeing/" +BENCHMARK="tnl-benchmark-linear-solvers" +BENCHMARK_DBG="tnl-benchmark-linear-solvers-dbg" + +export CUDA_PROFILE_CONFIG="$IWD/cuda-profiler.conf" +PROCESS_CUDA_PROFILE="$IWD/process-cuda-profile.pl" +source matrix-market +source florida-matrix-market + +for link in $MM_MATRICES; +do + echo "======================================================================================================" + matrix=matrices`echo $link | sed 's/ftp:\/\/math.nist.gov\/pub//'` + unzipped_matrix=`echo $matrix | sed 's/.gz//'` + if test ! -e $matrix; + then + echo "Matrix $matrix is missing !!! Run the script 'get-matrices' first." + #echo "Matrix $matrix is missing !!! Run the script 'get-matrices' first." >> sparse-matrix-benchmark.log + else + gunzip -c ${matrix} > ${unzipped_matrix} + echo "Benchmarking with the matrix $unzipped_matrix ..." + export CUDA_PROFILE_LOG=$unzipped_matrix.float.log + if test x$DEBUG = xyes; + then + gdb --args ${BENCHMARK_DBG} --test mtx --input-file $unzipped_matrix --log-file sparse-matrix-benchmark.log --verbose 1 + else + $BENCHMARK --test mtx --input-file $unzipped_matrix --log-file sparse-matrix-benchmark.log --verbose 1 + fi + #perl $PROCESS_CUDA_PROFILE $unzipped_matrix.float.log sparse-matrix-profiling-float.log + fi +done + +for link in $FLORIDA_MM_MATRICES; +do + matrix=matrices`echo $link | sed 's/http:\/\/www.cise.ufl.edu\/research\/sparse//'` + if test ! -e $matrix; + then + echo "Matrix $matrix is missing !!! Run the script 'get-matrices' first." + #echo "Matrix $matrix is missing !!! Run the script 'get-matrices' first." >> sparse-matrix-benchmark.log + else + DIRNAME=`dirname $matrix` + FILENAME=`basename $matrix` + cd $DIRNAME + tar zxvf $FILENAME + cd $IWD + SUBDIRNAME=`echo $FILENAME | sed 's/.tar.gz//'` + rm -f $DIRNAME/$SUBDIRNAME/*_b.mtx # these are usualy in array format + for file in $DIRNAME/$SUBDIRNAME/*.mtx; + do + echo "======================================================================================================" + echo "Benchmarking with the matrix $file ..." + if test x$DEBUG = xyes; + then + gdb --args $BENCHMARK --test mtx --input-file $file --log-file sparse-matrix-benchmark.log --verbose 1 + else + $BENCHMARK --test mtx --input-file $file --log-file sparse-matrix-benchmark.log --verbose 1 + fi + done + fi +done diff --git a/tests/benchmarks/share/run-tnl-benchmark-spmv b/tests/benchmarks/share/run-tnl-benchmark-spmv index 6abb115919bf2c7a50c26d9fee5f2052eabb65ed..75ea08219bb454533bd882594d44772586ef50cb 100644 --- a/tests/benchmarks/share/run-tnl-benchmark-spmv +++ b/tests/benchmarks/share/run-tnl-benchmark-spmv @@ -7,8 +7,8 @@ export CUDA_PROFILE=0 PWD=`pwd` IWD="$PWD" BASE="ftp://math.nist.gov/pub/MatrixMarket2/Harwell-Boeing/" -SPARSE_MATRIX_BENCHMARK="tnl-benchmark-spmv" -SPARSE_MATRIX_BENCHMARK_DBG="tnl-benchmark-spmv-dbg" +BENCHMARK="tnl-benchmark-spmv" +BENCHMARK_DBG="tnl-benchmark-spmv-dbg" export CUDA_PROFILE_CONFIG="$IWD/cuda-profiler.conf" PROCESS_CUDA_PROFILE="$IWD/process-cuda-profile.pl" @@ -17,7 +17,7 @@ source florida-matrix-market for link in $MM_MATRICES; do - echo "###############################################################################################" + echo "======================================================================================================" matrix=matrices`echo $link | sed 's/ftp:\/\/math.nist.gov\/pub//'` unzipped_matrix=`echo $matrix | sed 's/.gz//'` if test ! -e $matrix; @@ -30,9 +30,9 @@ do export CUDA_PROFILE_LOG=$unzipped_matrix.float.log if test x$DEBUG = xyes; then - gdb --args ${SPARSE_MATRIX_BENCHMARK_DBG} --test mtx --input-file $unzipped_matrix --log-file sparse-matrix-benchmark.log --stop-time $STOP_TIME --verbose 1 + gdb --args ${BENCHMARK_DBG} --test mtx --input-file $unzipped_matrix --log-file sparse-matrix-benchmark.log --stop-time $STOP_TIME --verbose 1 else - $SPARSE_MATRIX_BENCHMARK --test mtx --input-file $unzipped_matrix --pdf-file $unzipped_matrix.pdf --log-file sparse-matrix-benchmark.log --stop-time $STOP_TIME --verbose 1 + $BENCHMARK --test mtx --input-file $unzipped_matrix --pdf-file $unzipped_matrix.pdf --log-file sparse-matrix-benchmark.log --stop-time $STOP_TIME --verbose 1 fi #perl $PROCESS_CUDA_PROFILE $unzipped_matrix.float.log sparse-matrix-profiling-float.log fi @@ -54,14 +54,14 @@ do SUBDIRNAME=`echo $FILENAME | sed 's/.tar.gz//'` rm -f $DIRNAME/$SUBDIRNAME/*_b.mtx # these are usualy in array format for file in $DIRNAME/$SUBDIRNAME/*.mtx; - do - echo "###############################################################################################" + do + echo "======================================================================================================" echo "Benchmarking with the matrix $file ..." if test x$DEBUG = xyes; then - gdb --args $SPARSE_MATRIX_BENCHMARK --test mtx --input-file $file --pdf-file $file.pdf --log-file sparse-matrix-benchmark.log --stop-time $STOP_TIME --verbose 1 + gdb --args $BENCHMARK --test mtx --input-file $file --pdf-file $file.pdf --log-file sparse-matrix-benchmark.log --stop-time $STOP_TIME --verbose 1 else - $SPARSE_MATRIX_BENCHMARK --test mtx --input-file $file --pdf-file $file.pdf --log-file sparse-matrix-benchmark.log --stop-time $STOP_TIME --verbose 1 + $BENCHMARK --test mtx --input-file $file --pdf-file $file.pdf --log-file sparse-matrix-benchmark.log --stop-time $STOP_TIME --verbose 1 fi done fi diff --git a/tests/benchmarks/share/tnl-benchmark-linear-solvers.cfg.desc b/tests/benchmarks/share/tnl-benchmark-linear-solvers.cfg.desc new file mode 100644 index 0000000000000000000000000000000000000000..9f104892d592ce0e90dd5baacf555ea3748b3a69 --- /dev/null +++ b/tests/benchmarks/share/tnl-benchmark-linear-solvers.cfg.desc @@ -0,0 +1,11 @@ +group benchmarkSetup +{ + string test(!) [Can be tridiagonal, multidiagonal, multidiagonal-with-long-rows, mtx, tnl.]; + string input-file(!) [Input binary file name.]; + string log-file("tnl-benchmark-linear-solvers.log") [Log file name.]; + string precision("double") [Precision of the arithmetics.]; + string matrix-format( "csr" ) [Matrix format can be dense, tridiagonal, multidiagonal, ellpack, sliced-ellpack, chunked-ellpack, csr.]; + string solver( "gmres" ) [Solver can be sor, cg, gmres, ... ]; + string device( "host" ) [Device cen host or cuda.]; + integer verbose(1) [Verbose mode.]; +},[Arguments to set-up the benchmark.]; \ No newline at end of file diff --git a/tests/benchmarks/tnl-benchmark-linear-solvers.cpp b/tests/benchmarks/tnl-benchmark-linear-solvers.cpp new file mode 100644 index 0000000000000000000000000000000000000000..6e050dccd0c42656d756b6dc1083f700779a6950 --- /dev/null +++ b/tests/benchmarks/tnl-benchmark-linear-solvers.cpp @@ -0,0 +1,18 @@ +/*************************************************************************** + tnl-benchmark-linear-solvers.cpp - description + ------------------- + begin : Jun 22, 2014 + copyright : (C) 2014 by Tomas Oberhuber + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/*************************************************************************** + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU General Public License as published by * + * the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. * + * * + ***************************************************************************/ + +#include "tnl-benchmark-linear-solvers.h" diff --git a/tests/benchmarks/tnl-benchmark-linear-solvers.cu b/tests/benchmarks/tnl-benchmark-linear-solvers.cu new file mode 100644 index 0000000000000000000000000000000000000000..317bbc9aa6c18ab9b6fa98e1b8ba7a60e7d32b47 --- /dev/null +++ b/tests/benchmarks/tnl-benchmark-linear-solvers.cu @@ -0,0 +1,18 @@ +/*************************************************************************** + tnl-benchmark-linear-solvers.cu - description + ------------------- + begin : Jun 22, 2014 + copyright : (C) 2014 by Tomas Oberhuber + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/*************************************************************************** + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU General Public License as published by * + * the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. * + * * + ***************************************************************************/ + +#include "tnl-benchmark-linear-solvers.h" \ No newline at end of file diff --git a/tests/benchmarks/tnl-benchmark-linear-solvers.h b/tests/benchmarks/tnl-benchmark-linear-solvers.h new file mode 100644 index 0000000000000000000000000000000000000000..42d53e1900bdaf042c0f5a54f0ce4863764e5982 --- /dev/null +++ b/tests/benchmarks/tnl-benchmark-linear-solvers.h @@ -0,0 +1,195 @@ +/*************************************************************************** + tnl-benchmark-linear-solvers.h - description + ------------------- + begin : Jun 22, 2014 + copyright : (C) 2014 by Tomas Oberhuber + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/*************************************************************************** + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU General Public License as published by * + * the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. * + * * + ***************************************************************************/ + +#ifndef TNL_BENCHMARK_LINEAR_SOLVERS_H_ +#define TNL_BENCHMARK_LINEAR_SOLVERS_H_ + +#include <fstream> +#include <iomanip> +#include <unistd.h> + +#include <config/tnlConfigDescription.h> +#include <config/tnlParameterContainer.h> +#include <core/tnlTimerRT.h> +#include <matrices/tnlDenseMatrix.h> +#include <matrices/tnlTridiagonalMatrix.h> +#include <matrices/tnlMultidiagonalMatrix.h> +#include <matrices/tnlCSRMatrix.h> +#include <matrices/tnlEllpackMatrix.h> +#include <matrices/tnlSlicedEllpackMatrix.h> +#include <matrices/tnlChunkedEllpackMatrix.h> +#include <matrices/tnlMatrixReader.h> +#include <solvers/linear/krylov/tnlGMRESSolver.h> +#include <solvers/linear/tnlLinearResidueGetter.h> +#include <solvers/tnlIterativeSolverMonitor.h> + +using namespace std; + +#include "tnlConfig.h" +const char configFile[] = TNL_CONFIG_DIRECTORY "tnl-benchmark-linear-solvers.cfg.desc"; + +template< typename Solver > +bool benchmarkSolver( const tnlParameterContainer& parameters, + const typename Solver::MatrixType& matrix) +{ + typedef typename Solver::MatrixType MatrixType; + typedef typename MatrixType::RealType RealType; + typedef typename MatrixType::DeviceType DeviceType; + typedef typename MatrixType::IndexType IndexType; + typedef tnlVector< RealType, DeviceType, IndexType > VectorType; + + VectorType x, y, b; + x.setSize( matrix.getColumns() ); + x.setValue( 1.0 / ( RealType ) matrix.getColumns() ); + y.setSize( matrix.getColumns() ); + b.setSize( matrix.getRows() ); + matrix.vectorProduct( x, b ); + + Solver solver; + tnlIterativeSolverMonitor< RealType, IndexType > monitor; + monitor.setVerbose( 1 ); + solver.setSolverMonitor( monitor ); + solver.setMatrix( matrix ); + solver.setMaxResidue( 1.0e-6 ); + solver.template solve< VectorType, tnlLinearResidueGetter< MatrixType, VectorType > >( b, y ); + cout << endl; + return true; +} + +template< typename Matrix > +bool readMatrix( const tnlParameterContainer& parameters, + Matrix& matrix ) +{ + const tnlString fileName = parameters.GetParameter< tnlString >( "input-file" ); + + Matrix* hostMatrix; + if( Matrix::DeviceType::DeviceType == tnlCudaDevice ) + { + + } + if( Matrix::DeviceType::DeviceType == tnlHostDevice ) + { + hostMatrix = &matrix; + try + { + if( ! tnlMatrixReader< Matrix >::readMtxFile( fileName, *hostMatrix ) ) + { + cerr << "I am not able to read the matrix file " << fileName << "." << endl; + /*logFile << endl; + logFile << inputFileName << endl; + logFile << "Benchmark failed: Unable to read the matrix." << endl;*/ + return false; + } + } + catch( std::bad_alloc ) + { + cerr << "Not enough memory to read the matrix." << endl; + /*logFile << endl; + logFile << inputFileName << endl; + logFile << "Benchmark failed: Not enough memory." << endl;*/ + return false; + } + } + return true; +} + +template< typename Matrix > +bool resolveLinearSolver( const tnlParameterContainer& parameters ) +{ + const tnlString& solver = parameters.GetParameter< tnlString >( "solver" ); + + Matrix matrix; + if( ! readMatrix( parameters, matrix ) ) + return false; + + if( solver == "gmres" ) + return benchmarkSolver< tnlGMRESSolver< Matrix > >( parameters, matrix ); + + cerr << "Unknown solver " << solver << "." << endl; + return false; +} + +template< typename Real, + typename Device > +bool resolveMatrixFormat( const tnlParameterContainer& parameters ) +{ + const tnlString& matrixFormat = parameters.GetParameter< tnlString >( "matrix-format" ); + + if( matrixFormat == "dense" ) + return resolveLinearSolver< tnlDenseMatrix< Real, Device, int > >( parameters ); + + if( matrixFormat == "tridiagonal" ) + return resolveLinearSolver< tnlTridiagonalMatrix< Real, Device, int > >( parameters ); + + if( matrixFormat == "multidiagonal" ) + return resolveLinearSolver< tnlMultidiagonalMatrix< Real, Device, int > >( parameters ); + + if( matrixFormat == "ellpack" ) + return resolveLinearSolver< tnlEllpackMatrix< Real, Device, int > >( parameters ); + + if( matrixFormat == "sliced-ellpack" ) + return resolveLinearSolver< tnlSlicedEllpackMatrix< Real, Device, int > >( parameters ); + + if( matrixFormat == "chunked-ellpack" ) + return resolveLinearSolver< tnlChunkedEllpackMatrix< Real, Device, int > >( parameters ); + + if( matrixFormat == "csr" ) + return resolveLinearSolver< tnlCSRMatrix< Real, Device, int > >( parameters ); + + cerr << "Unknown matrix format " << matrixFormat << "." << endl; + return false; +} + +template< typename Real > +bool resolveDevice( const tnlParameterContainer& parameters ) +{ + const tnlString& device = parameters.GetParameter< tnlString >( "device" ); + + if( device == "host" ) + return resolveMatrixFormat< Real, tnlHost >( parameters ); + + //if( device == "cuda" ) + // return resolveMatrixFormat< Real, tnlCuda >( parameters ); + + cerr << "Uknown device " << device << "." << endl; + return false; +} + +int main( int argc, char* argv[] ) +{ + tnlParameterContainer parameters; + tnlConfigDescription conf_desc; + + if( conf_desc. ParseConfigDescription( configFile ) != 0 ) + return 1; + if( ! ParseCommandLine( argc, argv, conf_desc, parameters ) ) + { + conf_desc. PrintUsage( argv[ 0 ] ); + return 1; + } + const tnlString& precision = parameters.GetParameter< tnlString >( "precision" ); + if( precision == "float" ) + if( ! resolveDevice< float >( parameters ) ) + return EXIT_FAILURE; + if( precision == "double" ) + if( ! resolveDevice< double >( parameters ) ) + return EXIT_FAILURE; + return EXIT_SUCCESS; +} + + +#endif /* TNL_BENCHMARK_LINEAR_SOLVERS_H_ */ diff --git a/tests/benchmarks/tnl-benchmark-spmv.h b/tests/benchmarks/tnl-benchmark-spmv.h index 0a7a994adb8b8d7b53c2f1eb07efc9642fdb37cf..eb431fe5596d7f78a817282cdadce5a90df3e86a 100644 --- a/tests/benchmarks/tnl-benchmark-spmv.h +++ b/tests/benchmarks/tnl-benchmark-spmv.h @@ -58,9 +58,35 @@ bool initLogFile( fstream& logFile, const tnlString& fileName ) logFile << "# Speedup" << speedupColoring << endl; #ifdef HAVE_CUDA logFile << "# CUDA" << endl; - logFile << "# Gflops" << endl; - logFile << "# Throughput" << endl; - logFile << "# Speedup" << speedupColoring << " SORT - csr-cuda-speedup.txt" << endl; + logFile << "# Scalar" << endl; + logFile << "# Gflops" << endl; + logFile << "# Throughput" << endl; + logFile << "# Speedup" << speedupColoring << " SORT - csr-scalar-cuda-speedup.txt" << endl; + logFile << "# Vector" << endl; + logFile << "# Warp Size 1" << endl; + logFile << "# Gflops" << endl; + logFile << "# Throughput" << endl; + logFile << "# Speedup" << speedupColoring << " SORT - csr-vector-cuda-speedup.txt" << endl; + logFile << "# Warp Size 2" << endl; + logFile << "# Gflops" << endl; + logFile << "# Throughput" << endl; + logFile << "# Speedup" << speedupColoring << " SORT - csr-vector-cuda-speedup.txt" << endl; + logFile << "# Warp Size 4" << endl; + logFile << "# Gflops" << endl; + logFile << "# Throughput" << endl; + logFile << "# Speedup" << speedupColoring << " SORT - csr-vector-cuda-speedup.txt" << endl; + logFile << "# Warp Size 8" << endl; + logFile << "# Gflops" << endl; + logFile << "# Throughput" << endl; + logFile << "# Speedup" << speedupColoring << " SORT - csr-vector-cuda-speedup.txt" << endl; + logFile << "# Warp Size 16" << endl; + logFile << "# Gflops" << endl; + logFile << "# Throughput" << endl; + logFile << "# Speedup" << speedupColoring << " SORT - csr-vector-cuda-speedup.txt" << endl; + logFile << "# Warp Size 32" << endl; + logFile << "# Gflops" << endl; + logFile << "# Throughput" << endl; + logFile << "# Speedup" << speedupColoring << " SORT - csr-vector-cuda-speedup.txt" << endl; #endif logFile << "#Ellpack Format" << endl; logFile << "# Padding (in %)" << paddingColoring << endl; @@ -235,12 +261,23 @@ bool setupBenchmark( const tnlParameterContainer& parameters ) { typedef tnlCSRMatrix< Real, tnlHost, int > CSRMatrixType; CSRMatrixType csrMatrix; - if( ! tnlMatrixReader< CSRMatrixType >::readMtxFile( inputFileName, csrMatrix ) ) + try { - cerr << "I am not able to read the matrix file " << inputFileName << "." << endl; + if( ! tnlMatrixReader< CSRMatrixType >::readMtxFile( inputFileName, csrMatrix ) ) + { + cerr << "I am not able to read the matrix file " << inputFileName << "." << endl; + logFile << endl; + logFile << inputFileName << endl; + logFile << "Benchmark failed: Unable to read the matrix." << endl; + return false; + } + } + catch( std::bad_alloc ) + { + cerr << "Not enough memory to read the matrix." << endl; logFile << endl; logFile << inputFileName << endl; - logFile << "Benchmark failed." << endl; + logFile << "Benchmark failed: Not enough memory." << endl; return false; } if( verbose ) @@ -289,16 +326,78 @@ bool setupBenchmark( const tnlParameterContainer& parameters ) if( ! cudaCSRMatrix.copyFrom( csrMatrix, rowLengthsCuda ) ) { cerr << "I am not able to transfer the matrix on GPU." << endl; - writeTestFailed( logFile, 3 ); + writeTestFailed( logFile, 21 ); } else { cout << " done. \r"; + cudaCSRMatrix.setCudaKernelType( CSRMatrixCudaType::scalar ); + benchmarkMatrix( cudaCSRMatrix, + cudaX, + cudaB, + nonzeroElements, + "CSR Cuda Scalar", + stopTime, + baseline, + verbose, + logFile ); + cudaCSRMatrix.setCudaKernelType( CSRMatrixCudaType::vector ); + cudaCSRMatrix.setCudaWarpSize( 1 ); + benchmarkMatrix( cudaCSRMatrix, + cudaX, + cudaB, + nonzeroElements, + "CSR Cuda Vector 1", + stopTime, + baseline, + verbose, + logFile ); + cudaCSRMatrix.setCudaWarpSize( 2 ); + benchmarkMatrix( cudaCSRMatrix, + cudaX, + cudaB, + nonzeroElements, + "CSR Cuda Vector 2", + stopTime, + baseline, + verbose, + logFile ); + cudaCSRMatrix.setCudaWarpSize( 4 ); + benchmarkMatrix( cudaCSRMatrix, + cudaX, + cudaB, + nonzeroElements, + "CSR Cuda Vector 4", + stopTime, + baseline, + verbose, + logFile ); + cudaCSRMatrix.setCudaWarpSize( 8 ); + benchmarkMatrix( cudaCSRMatrix, + cudaX, + cudaB, + nonzeroElements, + "CSR Cuda Vector 8", + stopTime, + baseline, + verbose, + logFile ); + cudaCSRMatrix.setCudaWarpSize( 16 ); + benchmarkMatrix( cudaCSRMatrix, + cudaX, + cudaB, + nonzeroElements, + "CSR Cuda Vector 16", + stopTime, + baseline, + verbose, + logFile ); + cudaCSRMatrix.setCudaWarpSize( 32 ); benchmarkMatrix( cudaCSRMatrix, cudaX, cudaB, nonzeroElements, - "CSR Cuda", + "CSR Cuda Vector 32", stopTime, baseline, verbose,