Loading src/matrices/tnlEllpackMatrix_impl.h +70 −1 Original line number Diff line number Diff line Loading @@ -115,6 +115,7 @@ bool tnlEllpackMatrix< Real, Device, Index >::setLike( const tnlEllpackMatrix< R if( ! tnlSparseMatrix< Real, Device, Index >::setLike( matrix ) ) return false; this->rowLengths = matrix.rowLengths; this->alignedRows = matrix.alignedRows; return true; } Loading @@ -125,6 +126,7 @@ void tnlEllpackMatrix< Real, Device, Index > :: reset() { tnlSparseMatrix< Real, Device, Index >::reset(); this->rowLengths = 0; this->alignedRows = 0; } template< typename Real, Loading Loading @@ -693,6 +695,39 @@ class tnlEllpackMatrixDeviceDependentCode< tnlHost > } }; #ifdef HAVE_CUDA template< typename Real, typename Index > __global__ void tnlEllpackMatrixVectorProductCudaKernel( const Index rows, const Index columns, const Index compressedRowsLengths, const Index alignedRows, const Index* columnIndexes, const Real* values, const Real* inVector, Real* outVector, const Index gridIdx ) { const Index rowIdx = ( gridIdx * tnlCuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; if( rowIdx >= rows ) return; Index i = rowIdx; Index el( 0 ); Real result( 0.0 ); Index columnIndex; while( el++ < compressedRowsLengths && ( columnIndex = columnIndexes[ i ] ) < columns ) { result += values[ i ] * inVector[ columnIndex ]; i += alignedRows; } outVector[ rowIdx ] = result; } #endif template<> class tnlEllpackMatrixDeviceDependentCode< tnlCuda > { Loading Loading @@ -734,7 +769,41 @@ class tnlEllpackMatrixDeviceDependentCode< tnlCuda > const InVector& inVector, OutVector& outVector ) { tnlMatrixVectorProductCuda( matrix, inVector, outVector ); //tnlMatrixVectorProductCuda( matrix, inVector, outVector ); #ifdef HAVE_CUDA typedef tnlEllpackMatrix< Real, Device, Index > Matrix; typedef typename Matrix::IndexType IndexType; //Matrix* kernel_this = tnlCuda::passToDevice( matrix ); //InVector* kernel_inVector = tnlCuda::passToDevice( inVector ); //OutVector* kernel_outVector = tnlCuda::passToDevice( outVector ); dim3 cudaBlockSize( 256 ), cudaGridSize( tnlCuda::getMaxGridSize() ); const IndexType cudaBlocks = roundUpDivision( matrix.getRows(), cudaBlockSize.x ); const IndexType cudaGrids = roundUpDivision( cudaBlocks, tnlCuda::getMaxGridSize() ); for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) { if( gridIdx == cudaGrids - 1 ) cudaGridSize.x = cudaBlocks % tnlCuda::getMaxGridSize(); tnlEllpackMatrixVectorProductCudaKernel < Real, Index > <<< cudaGridSize, cudaBlockSize >>> ( matrix.getRows(), matrix.getColumns(), matrix.rowLengths, matrix.alignedRows, matrix.columnIndexes.getData(), matrix.values.getData(), inVector.getData(), outVector.getData(), gridIdx ); checkCudaDevice; } //tnlCuda::freeFromDevice( kernel_this ); //tnlCuda::freeFromDevice( kernel_inVector ); //tnlCuda::freeFromDevice( kernel_outVector ); checkCudaDevice; cudaThreadSynchronize(); #endif } }; Loading src/matrices/tnlSlicedEllpackMatrix_impl.h +58 −14 Original line number Diff line number Diff line Loading @@ -777,6 +777,42 @@ __global__ void tnlSlicedEllpackMatrix_computeMaximalRowLengthInSlices_CudaKerne } #endif #ifdef HAVE_CUDA template< typename Real, typename Index, int SliceSize > __global__ void tnlSlicedEllpackMatrixVectorProductCudaKernel( const Index rows, const Index columns, const Index* slicePointers, const Index* sliceCompressedRowsLengths, const Index* columnIndexes, const Real* values, const Real* inVector, Real* outVector, const Index gridIdx ) { const Index rowIdx = ( gridIdx * tnlCuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; if( rowIdx >= rows ) return; const Index sliceIdx = rowIdx / SliceSize; const Index slicePointer = slicePointers[ sliceIdx ]; const Index rowLength = sliceCompressedRowsLengths[ sliceIdx ]; Index i = slicePointer + rowIdx - sliceIdx * SliceSize; const Index rowEnd = i + rowLength * SliceSize; Real result( 0.0 ); Index columnIndex; while( i < rowEnd && ( columnIndex = columnIndexes[ i ] ) < columns ) { result += values[ i ] * inVector[ columnIndex ]; i += SliceSize; } outVector[ rowIdx ] = result; } #endif template<> class tnlSlicedEllpackMatrixDeviceDependentCode< tnlCuda > { Loading Loading @@ -862,13 +898,13 @@ class tnlSlicedEllpackMatrixDeviceDependentCode< tnlCuda > const InVector& inVector, OutVector& outVector ) { tnlMatrixVectorProductCuda( matrix, inVector, outVector ); /*#ifdef HAVE_CUDA //tnlMatrixVectorProductCuda( matrix, inVector, outVector ); #ifdef HAVE_CUDA typedef tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize > Matrix; typedef typename Matrix::IndexType IndexType; Matrix* kernel_this = tnlCuda::passToDevice( matrix ); InVector* kernel_inVector = tnlCuda::passToDevice( inVector ); OutVector* kernel_outVector = tnlCuda::passToDevice( outVector ); //Matrix* kernel_this = tnlCuda::passToDevice( matrix ); //InVector* kernel_inVector = tnlCuda::passToDevice( inVector ); //OutVector* kernel_outVector = tnlCuda::passToDevice( outVector ); dim3 cudaBlockSize( 256 ), cudaGridSize( tnlCuda::getMaxGridSize() ); const IndexType cudaBlocks = roundUpDivision( matrix.getRows(), cudaBlockSize.x ); const IndexType cudaGrids = roundUpDivision( cudaBlocks, tnlCuda::getMaxGridSize() ); Loading @@ -876,18 +912,26 @@ class tnlSlicedEllpackMatrixDeviceDependentCode< tnlCuda > { if( gridIdx == cudaGrids - 1 ) cudaGridSize.x = cudaBlocks % tnlCuda::getMaxGridSize(); tnlMatrixVectorProductCudaKernel<<< cudaGridSize, cudaBlockSize >>> ( kernel_this, kernel_inVector, kernel_outVector, tnlSlicedEllpackMatrixVectorProductCudaKernel < Real, Index, SliceSize > <<< cudaGridSize, cudaBlockSize >>> ( matrix.getRows(), matrix.getColumns(), matrix.slicePointers.getData(), matrix.sliceCompressedRowsLengths.getData(), matrix.columnIndexes.getData(), matrix.values.getData(), inVector.getData(), outVector.getData(), gridIdx ); checkCudaDevice; } tnlCuda::freeFromDevice( kernel_this ); tnlCuda::freeFromDevice( kernel_inVector ); tnlCuda::freeFromDevice( kernel_outVector ); //tnlCuda::freeFromDevice( kernel_this ); //tnlCuda::freeFromDevice( kernel_inVector ); //tnlCuda::freeFromDevice( kernel_outVector ); checkCudaDevice; #endif*/ cudaThreadSynchronize(); #endif } }; Loading tests/benchmarks/tnl-cuda-benchmarks.h +55 −6 Original line number Diff line number Diff line Loading @@ -22,12 +22,49 @@ #include <core/vectors/tnlVector.h> #include <core/tnlTimerRT.h> #include <matrices/tnlSlicedEllpackMatrix.h> #include <matrices/tnlEllpackMatrix.h> #ifdef HAVE_CUBLAS //#include <cublas.h> #endif template< typename Matrix > __global__ void setCudaTestMatrixKernel( Matrix* matrix, const int elementsPerRow, const int gridIdx ) { const int rowIdx = ( gridIdx * tnlCuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; if( rowIdx >= matrix->getRows() ) return; int col = Max( 0, rowIdx - elementsPerRow / 2 ); for( int element = 0; element < elementsPerRow; element++ ) { if( col + element < matrix->getColumns() ) matrix->setElementFast( rowIdx, col + element, 1.0 ); } } template< typename Matrix > void setCudaTestMatrix( Matrix& matrix, const int elementsPerRow ) { typedef typename Matrix::IndexType IndexType; typedef typename Matrix::RealType RealType; Matrix* kernel_matrix = tnlCuda::passToDevice( matrix ); dim3 cudaBlockSize( 256 ), cudaGridSize( tnlCuda::getMaxGridSize() ); const IndexType cudaBlocks = roundUpDivision( matrix.getRows(), cudaBlockSize.x ); const IndexType cudaGrids = roundUpDivision( cudaBlocks, tnlCuda::getMaxGridSize() ); for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) { if( gridIdx == cudaGrids - 1 ) cudaGridSize.x = cudaBlocks % tnlCuda::getMaxGridSize(); setCudaTestMatrixKernel< Matrix > <<< cudaGridSize, cudaBlockSize >>> ( kernel_matrix, elementsPerRow, gridIdx ); checkCudaDevice; } tnlCuda::freeFromDevice( kernel_matrix ); } int main( int argc, char* argv[] ) { Loading Loading @@ -195,8 +232,9 @@ int main( int argc, char* argv[] ) * Sliced Ellpack test */ const int elementsPerRow( 5 ); tnlSlicedEllpackMatrix< double, tnlHost > hostMatrix; tnlSlicedEllpackMatrix< double, tnlCuda > deviceMatrix; typedef tnlEllpackMatrix< double, tnlCuda > DeviceMatrix; tnlEllpackMatrix< double, tnlHost > hostMatrix; DeviceMatrix deviceMatrix; tnlVector< int, tnlHost, int > hostRowLengths; tnlVector< int, tnlCuda, int > deviceRowLengths; hostRowLengths.setSize( size ); Loading @@ -206,21 +244,28 @@ int main( int argc, char* argv[] ) hostRowLengths.setValue( elementsPerRow ); deviceRowLengths.setValue( elementsPerRow ); hostMatrix.setCompressedRowsLengths( hostRowLengths ); deviceMatrix.setCompressedRowsLengths( deviceRowLengths ); if( ! deviceMatrix.setCompressedRowsLengths( deviceRowLengths ) ) { cerr << "Unable to allocate matrix elements." << endl; return false; } int elements( 0 ); for( int row = 0; row < size; row++ ) { cout << "Row " << row << "/" << size << " \r" << flush; int col = Max( 0, row - elementsPerRow / 2 ); for( int element = 0; element < elementsPerRow; element++ ) { if( col + element < size ) { hostMatrix.setElement( row, col + element, 1.0 ); deviceMatrix.setElement( row, col + element, 1.0 ); //deviceMatrix.setElement( row, col + element, 1.0 ); elements++; } } } cout << endl; setCudaTestMatrix< DeviceMatrix >( deviceMatrix, elementsPerRow ); datasetSize = loops * elements * sizeof( double ) / oneGB; hostVector.setValue( 1.0 ); deviceVector.setValue( 1.0 ); Loading @@ -242,7 +287,11 @@ int main( int argc, char* argv[] ) if( hostVector2 != deviceVector2 ) { cerr << "Error in SliceEllpack Spmv kernel." << endl; cerr << "Error in SliceEllpack Spmv kernel at positions" << endl; //for( int i = 0; i < size; i++ ) // if( hostVector2.getElement( i ) != deviceVector2.getElement( i ) ) // cerr << " " << i; } bandwidth = 2 * datasetSize / loops / timer.getTime(); cout << timer.getTime() << " => " << bandwidth << " GB/s" << " speedup " << hostTime / timer.getTime() << endl; Loading src/matrices/tnlEllpackMatrix.h +1 −1 File changed.Contains only whitespace changes. Show changes Loading
src/matrices/tnlEllpackMatrix_impl.h +70 −1 Original line number Diff line number Diff line Loading @@ -115,6 +115,7 @@ bool tnlEllpackMatrix< Real, Device, Index >::setLike( const tnlEllpackMatrix< R if( ! tnlSparseMatrix< Real, Device, Index >::setLike( matrix ) ) return false; this->rowLengths = matrix.rowLengths; this->alignedRows = matrix.alignedRows; return true; } Loading @@ -125,6 +126,7 @@ void tnlEllpackMatrix< Real, Device, Index > :: reset() { tnlSparseMatrix< Real, Device, Index >::reset(); this->rowLengths = 0; this->alignedRows = 0; } template< typename Real, Loading Loading @@ -693,6 +695,39 @@ class tnlEllpackMatrixDeviceDependentCode< tnlHost > } }; #ifdef HAVE_CUDA template< typename Real, typename Index > __global__ void tnlEllpackMatrixVectorProductCudaKernel( const Index rows, const Index columns, const Index compressedRowsLengths, const Index alignedRows, const Index* columnIndexes, const Real* values, const Real* inVector, Real* outVector, const Index gridIdx ) { const Index rowIdx = ( gridIdx * tnlCuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; if( rowIdx >= rows ) return; Index i = rowIdx; Index el( 0 ); Real result( 0.0 ); Index columnIndex; while( el++ < compressedRowsLengths && ( columnIndex = columnIndexes[ i ] ) < columns ) { result += values[ i ] * inVector[ columnIndex ]; i += alignedRows; } outVector[ rowIdx ] = result; } #endif template<> class tnlEllpackMatrixDeviceDependentCode< tnlCuda > { Loading Loading @@ -734,7 +769,41 @@ class tnlEllpackMatrixDeviceDependentCode< tnlCuda > const InVector& inVector, OutVector& outVector ) { tnlMatrixVectorProductCuda( matrix, inVector, outVector ); //tnlMatrixVectorProductCuda( matrix, inVector, outVector ); #ifdef HAVE_CUDA typedef tnlEllpackMatrix< Real, Device, Index > Matrix; typedef typename Matrix::IndexType IndexType; //Matrix* kernel_this = tnlCuda::passToDevice( matrix ); //InVector* kernel_inVector = tnlCuda::passToDevice( inVector ); //OutVector* kernel_outVector = tnlCuda::passToDevice( outVector ); dim3 cudaBlockSize( 256 ), cudaGridSize( tnlCuda::getMaxGridSize() ); const IndexType cudaBlocks = roundUpDivision( matrix.getRows(), cudaBlockSize.x ); const IndexType cudaGrids = roundUpDivision( cudaBlocks, tnlCuda::getMaxGridSize() ); for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) { if( gridIdx == cudaGrids - 1 ) cudaGridSize.x = cudaBlocks % tnlCuda::getMaxGridSize(); tnlEllpackMatrixVectorProductCudaKernel < Real, Index > <<< cudaGridSize, cudaBlockSize >>> ( matrix.getRows(), matrix.getColumns(), matrix.rowLengths, matrix.alignedRows, matrix.columnIndexes.getData(), matrix.values.getData(), inVector.getData(), outVector.getData(), gridIdx ); checkCudaDevice; } //tnlCuda::freeFromDevice( kernel_this ); //tnlCuda::freeFromDevice( kernel_inVector ); //tnlCuda::freeFromDevice( kernel_outVector ); checkCudaDevice; cudaThreadSynchronize(); #endif } }; Loading
src/matrices/tnlSlicedEllpackMatrix_impl.h +58 −14 Original line number Diff line number Diff line Loading @@ -777,6 +777,42 @@ __global__ void tnlSlicedEllpackMatrix_computeMaximalRowLengthInSlices_CudaKerne } #endif #ifdef HAVE_CUDA template< typename Real, typename Index, int SliceSize > __global__ void tnlSlicedEllpackMatrixVectorProductCudaKernel( const Index rows, const Index columns, const Index* slicePointers, const Index* sliceCompressedRowsLengths, const Index* columnIndexes, const Real* values, const Real* inVector, Real* outVector, const Index gridIdx ) { const Index rowIdx = ( gridIdx * tnlCuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; if( rowIdx >= rows ) return; const Index sliceIdx = rowIdx / SliceSize; const Index slicePointer = slicePointers[ sliceIdx ]; const Index rowLength = sliceCompressedRowsLengths[ sliceIdx ]; Index i = slicePointer + rowIdx - sliceIdx * SliceSize; const Index rowEnd = i + rowLength * SliceSize; Real result( 0.0 ); Index columnIndex; while( i < rowEnd && ( columnIndex = columnIndexes[ i ] ) < columns ) { result += values[ i ] * inVector[ columnIndex ]; i += SliceSize; } outVector[ rowIdx ] = result; } #endif template<> class tnlSlicedEllpackMatrixDeviceDependentCode< tnlCuda > { Loading Loading @@ -862,13 +898,13 @@ class tnlSlicedEllpackMatrixDeviceDependentCode< tnlCuda > const InVector& inVector, OutVector& outVector ) { tnlMatrixVectorProductCuda( matrix, inVector, outVector ); /*#ifdef HAVE_CUDA //tnlMatrixVectorProductCuda( matrix, inVector, outVector ); #ifdef HAVE_CUDA typedef tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize > Matrix; typedef typename Matrix::IndexType IndexType; Matrix* kernel_this = tnlCuda::passToDevice( matrix ); InVector* kernel_inVector = tnlCuda::passToDevice( inVector ); OutVector* kernel_outVector = tnlCuda::passToDevice( outVector ); //Matrix* kernel_this = tnlCuda::passToDevice( matrix ); //InVector* kernel_inVector = tnlCuda::passToDevice( inVector ); //OutVector* kernel_outVector = tnlCuda::passToDevice( outVector ); dim3 cudaBlockSize( 256 ), cudaGridSize( tnlCuda::getMaxGridSize() ); const IndexType cudaBlocks = roundUpDivision( matrix.getRows(), cudaBlockSize.x ); const IndexType cudaGrids = roundUpDivision( cudaBlocks, tnlCuda::getMaxGridSize() ); Loading @@ -876,18 +912,26 @@ class tnlSlicedEllpackMatrixDeviceDependentCode< tnlCuda > { if( gridIdx == cudaGrids - 1 ) cudaGridSize.x = cudaBlocks % tnlCuda::getMaxGridSize(); tnlMatrixVectorProductCudaKernel<<< cudaGridSize, cudaBlockSize >>> ( kernel_this, kernel_inVector, kernel_outVector, tnlSlicedEllpackMatrixVectorProductCudaKernel < Real, Index, SliceSize > <<< cudaGridSize, cudaBlockSize >>> ( matrix.getRows(), matrix.getColumns(), matrix.slicePointers.getData(), matrix.sliceCompressedRowsLengths.getData(), matrix.columnIndexes.getData(), matrix.values.getData(), inVector.getData(), outVector.getData(), gridIdx ); checkCudaDevice; } tnlCuda::freeFromDevice( kernel_this ); tnlCuda::freeFromDevice( kernel_inVector ); tnlCuda::freeFromDevice( kernel_outVector ); //tnlCuda::freeFromDevice( kernel_this ); //tnlCuda::freeFromDevice( kernel_inVector ); //tnlCuda::freeFromDevice( kernel_outVector ); checkCudaDevice; #endif*/ cudaThreadSynchronize(); #endif } }; Loading
tests/benchmarks/tnl-cuda-benchmarks.h +55 −6 Original line number Diff line number Diff line Loading @@ -22,12 +22,49 @@ #include <core/vectors/tnlVector.h> #include <core/tnlTimerRT.h> #include <matrices/tnlSlicedEllpackMatrix.h> #include <matrices/tnlEllpackMatrix.h> #ifdef HAVE_CUBLAS //#include <cublas.h> #endif template< typename Matrix > __global__ void setCudaTestMatrixKernel( Matrix* matrix, const int elementsPerRow, const int gridIdx ) { const int rowIdx = ( gridIdx * tnlCuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; if( rowIdx >= matrix->getRows() ) return; int col = Max( 0, rowIdx - elementsPerRow / 2 ); for( int element = 0; element < elementsPerRow; element++ ) { if( col + element < matrix->getColumns() ) matrix->setElementFast( rowIdx, col + element, 1.0 ); } } template< typename Matrix > void setCudaTestMatrix( Matrix& matrix, const int elementsPerRow ) { typedef typename Matrix::IndexType IndexType; typedef typename Matrix::RealType RealType; Matrix* kernel_matrix = tnlCuda::passToDevice( matrix ); dim3 cudaBlockSize( 256 ), cudaGridSize( tnlCuda::getMaxGridSize() ); const IndexType cudaBlocks = roundUpDivision( matrix.getRows(), cudaBlockSize.x ); const IndexType cudaGrids = roundUpDivision( cudaBlocks, tnlCuda::getMaxGridSize() ); for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) { if( gridIdx == cudaGrids - 1 ) cudaGridSize.x = cudaBlocks % tnlCuda::getMaxGridSize(); setCudaTestMatrixKernel< Matrix > <<< cudaGridSize, cudaBlockSize >>> ( kernel_matrix, elementsPerRow, gridIdx ); checkCudaDevice; } tnlCuda::freeFromDevice( kernel_matrix ); } int main( int argc, char* argv[] ) { Loading Loading @@ -195,8 +232,9 @@ int main( int argc, char* argv[] ) * Sliced Ellpack test */ const int elementsPerRow( 5 ); tnlSlicedEllpackMatrix< double, tnlHost > hostMatrix; tnlSlicedEllpackMatrix< double, tnlCuda > deviceMatrix; typedef tnlEllpackMatrix< double, tnlCuda > DeviceMatrix; tnlEllpackMatrix< double, tnlHost > hostMatrix; DeviceMatrix deviceMatrix; tnlVector< int, tnlHost, int > hostRowLengths; tnlVector< int, tnlCuda, int > deviceRowLengths; hostRowLengths.setSize( size ); Loading @@ -206,21 +244,28 @@ int main( int argc, char* argv[] ) hostRowLengths.setValue( elementsPerRow ); deviceRowLengths.setValue( elementsPerRow ); hostMatrix.setCompressedRowsLengths( hostRowLengths ); deviceMatrix.setCompressedRowsLengths( deviceRowLengths ); if( ! deviceMatrix.setCompressedRowsLengths( deviceRowLengths ) ) { cerr << "Unable to allocate matrix elements." << endl; return false; } int elements( 0 ); for( int row = 0; row < size; row++ ) { cout << "Row " << row << "/" << size << " \r" << flush; int col = Max( 0, row - elementsPerRow / 2 ); for( int element = 0; element < elementsPerRow; element++ ) { if( col + element < size ) { hostMatrix.setElement( row, col + element, 1.0 ); deviceMatrix.setElement( row, col + element, 1.0 ); //deviceMatrix.setElement( row, col + element, 1.0 ); elements++; } } } cout << endl; setCudaTestMatrix< DeviceMatrix >( deviceMatrix, elementsPerRow ); datasetSize = loops * elements * sizeof( double ) / oneGB; hostVector.setValue( 1.0 ); deviceVector.setValue( 1.0 ); Loading @@ -242,7 +287,11 @@ int main( int argc, char* argv[] ) if( hostVector2 != deviceVector2 ) { cerr << "Error in SliceEllpack Spmv kernel." << endl; cerr << "Error in SliceEllpack Spmv kernel at positions" << endl; //for( int i = 0; i < size; i++ ) // if( hostVector2.getElement( i ) != deviceVector2.getElement( i ) ) // cerr << " " << i; } bandwidth = 2 * datasetSize / loops / timer.getTime(); cout << timer.getTime() << " => " << bandwidth << " GB/s" << " speedup " << hostTime / timer.getTime() << endl; Loading