Loading src/matrices/tnlEllpackMatrix_impl.h +14 −4 Original line number Diff line number Diff line Loading @@ -687,11 +687,21 @@ class tnlEllpackMatrixDeviceDependentCode< tnlHost > const InVector& inVector, OutVector& outVector ) { #ifdef HAVE_OPENMP #pragma omp parallel for #endif //#ifdef HAVE_OPENMP //#pragma omp parallel for //#endif // for( Index row = 0; row < matrix.getRows(); row ++ ) // outVector[ row ] = matrix.rowVectorProduct( row, inVector ); Index col; for( Index row = 0; row < matrix.getRows(); row ++ ) outVector[ row ] = matrix.rowVectorProduct( row, inVector ); { outVector[ row ] = 0.0; const Index rowEnd = ( row + 1 ) * matrix.rowLengths; for( Index i = row * matrix.rowLengths; i < rowEnd; i++ ) if( ( col = matrix.columnIndexes[ i ] ) < matrix.columns ) outVector[ row ] += matrix.values[ i ] * inVector[ col ]; } } }; Loading tests/benchmarks/tnl-cuda-benchmarks.h +5 −4 Original line number Diff line number Diff line Loading @@ -252,6 +252,7 @@ int main( int argc, char* argv[] ) int elements( 0 ); for( int row = 0; row < size; row++ ) { if( row % 100 == 0 ) cout << "Row " << row << "/" << size << " \r" << flush; int col = Max( 0, row - elementsPerRow / 2 ); for( int element = 0; element < elementsPerRow; element++ ) Loading @@ -266,7 +267,7 @@ int main( int argc, char* argv[] ) } cout << endl; setCudaTestMatrix< DeviceMatrix >( deviceMatrix, elementsPerRow ); datasetSize = loops * elements * sizeof( double ) / oneGB; datasetSize = loops * elements * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB; hostVector.setValue( 1.0 ); deviceVector.setValue( 1.0 ); cout << "Benchmarking SpMV on CPU: "; Loading @@ -275,7 +276,7 @@ int main( int argc, char* argv[] ) hostMatrix.vectorProduct( hostVector, hostVector2 ); timer.stop(); double hostTime = timer.getTime(); bandwidth = 2 * datasetSize / loops / timer.getTime(); bandwidth = datasetSize / timer.getTime(); cout << timer.getTime() << " => " << bandwidth << " GB/s" << endl; cout << "Benchmarking SpMV on GPU: "; Loading @@ -293,7 +294,7 @@ int main( int argc, char* argv[] ) // cerr << " " << i; } bandwidth = 2 * datasetSize / loops / timer.getTime(); bandwidth = datasetSize / timer.getTime(); cout << timer.getTime() << " => " << bandwidth << " GB/s" << " speedup " << hostTime / timer.getTime() << endl; return EXIT_SUCCESS; Loading Loading
src/matrices/tnlEllpackMatrix_impl.h +14 −4 Original line number Diff line number Diff line Loading @@ -687,11 +687,21 @@ class tnlEllpackMatrixDeviceDependentCode< tnlHost > const InVector& inVector, OutVector& outVector ) { #ifdef HAVE_OPENMP #pragma omp parallel for #endif //#ifdef HAVE_OPENMP //#pragma omp parallel for //#endif // for( Index row = 0; row < matrix.getRows(); row ++ ) // outVector[ row ] = matrix.rowVectorProduct( row, inVector ); Index col; for( Index row = 0; row < matrix.getRows(); row ++ ) outVector[ row ] = matrix.rowVectorProduct( row, inVector ); { outVector[ row ] = 0.0; const Index rowEnd = ( row + 1 ) * matrix.rowLengths; for( Index i = row * matrix.rowLengths; i < rowEnd; i++ ) if( ( col = matrix.columnIndexes[ i ] ) < matrix.columns ) outVector[ row ] += matrix.values[ i ] * inVector[ col ]; } } }; Loading
tests/benchmarks/tnl-cuda-benchmarks.h +5 −4 Original line number Diff line number Diff line Loading @@ -252,6 +252,7 @@ int main( int argc, char* argv[] ) int elements( 0 ); for( int row = 0; row < size; row++ ) { if( row % 100 == 0 ) cout << "Row " << row << "/" << size << " \r" << flush; int col = Max( 0, row - elementsPerRow / 2 ); for( int element = 0; element < elementsPerRow; element++ ) Loading @@ -266,7 +267,7 @@ int main( int argc, char* argv[] ) } cout << endl; setCudaTestMatrix< DeviceMatrix >( deviceMatrix, elementsPerRow ); datasetSize = loops * elements * sizeof( double ) / oneGB; datasetSize = loops * elements * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB; hostVector.setValue( 1.0 ); deviceVector.setValue( 1.0 ); cout << "Benchmarking SpMV on CPU: "; Loading @@ -275,7 +276,7 @@ int main( int argc, char* argv[] ) hostMatrix.vectorProduct( hostVector, hostVector2 ); timer.stop(); double hostTime = timer.getTime(); bandwidth = 2 * datasetSize / loops / timer.getTime(); bandwidth = datasetSize / timer.getTime(); cout << timer.getTime() << " => " << bandwidth << " GB/s" << endl; cout << "Benchmarking SpMV on GPU: "; Loading @@ -293,7 +294,7 @@ int main( int argc, char* argv[] ) // cerr << " " << i; } bandwidth = 2 * datasetSize / loops / timer.getTime(); bandwidth = datasetSize / timer.getTime(); cout << timer.getTime() << " => " << bandwidth << " GB/s" << " speedup " << hostTime / timer.getTime() << endl; return EXIT_SUCCESS; Loading