Loading src/core/vectors/tnlSharedVector_impl.h +1 −1 Original line number Diff line number Diff line Loading @@ -142,7 +142,7 @@ template< typename Real, template< typename Vector > bool tnlSharedVector< Real, Device, Index > :: operator != ( const Vector& vector ) const { return tnlSharedArray< Real, Device, Index > :: operator == ( vector ); return tnlSharedArray< Real, Device, Index > :: operator != ( vector ); } template< typename Real, Loading src/core/vectors/tnlVector_impl.h +1 −1 Original line number Diff line number Diff line Loading @@ -127,7 +127,7 @@ template< typename Real, template< typename Vector > bool tnlVector< Real, Device, Index > :: operator != ( const Vector& vector ) const { return tnlArray< Real, Device, Index > :: operator == ( vector ); return tnlArray< Real, Device, Index > :: operator != ( vector ); } template< typename Real, Loading src/matrices/tnlSlicedEllpackMatrix_impl.h +25 −0 Original line number Diff line number Diff line Loading @@ -863,6 +863,31 @@ class tnlSlicedEllpackMatrixDeviceDependentCode< tnlCuda > OutVector& outVector ) { tnlMatrixVectorProductCuda( matrix, inVector, outVector ); /*#ifdef HAVE_CUDA typedef tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize > Matrix; typedef typename Matrix::IndexType IndexType; Matrix* kernel_this = tnlCuda::passToDevice( matrix ); InVector* kernel_inVector = tnlCuda::passToDevice( inVector ); OutVector* kernel_outVector = tnlCuda::passToDevice( outVector ); dim3 cudaBlockSize( 256 ), cudaGridSize( tnlCuda::getMaxGridSize() ); const IndexType cudaBlocks = roundUpDivision( matrix.getRows(), cudaBlockSize.x ); const IndexType cudaGrids = roundUpDivision( cudaBlocks, tnlCuda::getMaxGridSize() ); for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) { if( gridIdx == cudaGrids - 1 ) cudaGridSize.x = cudaBlocks % tnlCuda::getMaxGridSize(); tnlMatrixVectorProductCudaKernel<<< cudaGridSize, cudaBlockSize >>> ( kernel_this, kernel_inVector, kernel_outVector, gridIdx ); checkCudaDevice; } tnlCuda::freeFromDevice( kernel_this ); tnlCuda::freeFromDevice( kernel_inVector ); tnlCuda::freeFromDevice( kernel_outVector ); checkCudaDevice; #endif*/ } }; Loading tests/benchmarks/tnl-cuda-benchmarks.h +8 −0 Original line number Diff line number Diff line Loading @@ -222,6 +222,8 @@ int main( int argc, char* argv[] ) } } datasetSize = loops * elements * sizeof( double ) / oneGB; hostVector.setValue( 1.0 ); deviceVector.setValue( 1.0 ); cout << "Benchmarking SpMV on CPU: "; timer.reset(); for( int i = 0; i < loops; i++ ) Loading @@ -236,6 +238,12 @@ int main( int argc, char* argv[] ) for( int i = 0; i < loops; i++ ) deviceMatrix.vectorProduct( deviceVector, deviceVector2 ); timer.stop(); //cout << hostVector2 << endl << deviceVector2 << endl; if( hostVector2 != deviceVector2 ) { cerr << "Error in SliceEllpack Spmv kernel." << endl; } bandwidth = 2 * datasetSize / loops / timer.getTime(); cout << timer.getTime() << " => " << bandwidth << " GB/s" << " speedup " << hostTime / timer.getTime() << endl; Loading Loading
src/core/vectors/tnlSharedVector_impl.h +1 −1 Original line number Diff line number Diff line Loading @@ -142,7 +142,7 @@ template< typename Real, template< typename Vector > bool tnlSharedVector< Real, Device, Index > :: operator != ( const Vector& vector ) const { return tnlSharedArray< Real, Device, Index > :: operator == ( vector ); return tnlSharedArray< Real, Device, Index > :: operator != ( vector ); } template< typename Real, Loading
src/core/vectors/tnlVector_impl.h +1 −1 Original line number Diff line number Diff line Loading @@ -127,7 +127,7 @@ template< typename Real, template< typename Vector > bool tnlVector< Real, Device, Index > :: operator != ( const Vector& vector ) const { return tnlArray< Real, Device, Index > :: operator == ( vector ); return tnlArray< Real, Device, Index > :: operator != ( vector ); } template< typename Real, Loading
src/matrices/tnlSlicedEllpackMatrix_impl.h +25 −0 Original line number Diff line number Diff line Loading @@ -863,6 +863,31 @@ class tnlSlicedEllpackMatrixDeviceDependentCode< tnlCuda > OutVector& outVector ) { tnlMatrixVectorProductCuda( matrix, inVector, outVector ); /*#ifdef HAVE_CUDA typedef tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize > Matrix; typedef typename Matrix::IndexType IndexType; Matrix* kernel_this = tnlCuda::passToDevice( matrix ); InVector* kernel_inVector = tnlCuda::passToDevice( inVector ); OutVector* kernel_outVector = tnlCuda::passToDevice( outVector ); dim3 cudaBlockSize( 256 ), cudaGridSize( tnlCuda::getMaxGridSize() ); const IndexType cudaBlocks = roundUpDivision( matrix.getRows(), cudaBlockSize.x ); const IndexType cudaGrids = roundUpDivision( cudaBlocks, tnlCuda::getMaxGridSize() ); for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) { if( gridIdx == cudaGrids - 1 ) cudaGridSize.x = cudaBlocks % tnlCuda::getMaxGridSize(); tnlMatrixVectorProductCudaKernel<<< cudaGridSize, cudaBlockSize >>> ( kernel_this, kernel_inVector, kernel_outVector, gridIdx ); checkCudaDevice; } tnlCuda::freeFromDevice( kernel_this ); tnlCuda::freeFromDevice( kernel_inVector ); tnlCuda::freeFromDevice( kernel_outVector ); checkCudaDevice; #endif*/ } }; Loading
tests/benchmarks/tnl-cuda-benchmarks.h +8 −0 Original line number Diff line number Diff line Loading @@ -222,6 +222,8 @@ int main( int argc, char* argv[] ) } } datasetSize = loops * elements * sizeof( double ) / oneGB; hostVector.setValue( 1.0 ); deviceVector.setValue( 1.0 ); cout << "Benchmarking SpMV on CPU: "; timer.reset(); for( int i = 0; i < loops; i++ ) Loading @@ -236,6 +238,12 @@ int main( int argc, char* argv[] ) for( int i = 0; i < loops; i++ ) deviceMatrix.vectorProduct( deviceVector, deviceVector2 ); timer.stop(); //cout << hostVector2 << endl << deviceVector2 << endl; if( hostVector2 != deviceVector2 ) { cerr << "Error in SliceEllpack Spmv kernel." << endl; } bandwidth = 2 * datasetSize / loops / timer.getTime(); cout << timer.getTime() << " => " << bandwidth << " GB/s" << " speedup " << hostTime / timer.getTime() << endl; Loading