Loading src/core/arrays/tnlArrayOperationsCuda_impl.h +5 −2 Original line number Diff line number Diff line Loading @@ -256,6 +256,9 @@ bool tnlArrayOperations< tnlHost, tnlCuda >::compareMemory( const Element1* dest const Element2* source, const Index size ) { /*** * Here, destination is on host and source is on CUDA device. */ tnlAssert( destination, ); tnlAssert( source, ); tnlAssert( size >= 0, cerr << "size = " << size ); Loading @@ -280,7 +283,7 @@ bool tnlArrayOperations< tnlHost, tnlCuda >::compareMemory( const Element1* dest delete[] host_buffer; return false; } if( ! tnlArrayOperations< tnlHost >::compareMemory( host_buffer, destination, transfer ) ) if( ! tnlArrayOperations< tnlHost >::compareMemory( &destination[ compared ], host_buffer, transfer ) ) { delete[] host_buffer; return false; Loading src/matrices/tnlEllpackMatrix_impl.h +12 −9 Original line number Diff line number Diff line Loading @@ -687,12 +687,12 @@ class tnlEllpackMatrixDeviceDependentCode< tnlHost > const InVector& inVector, OutVector& outVector ) { //#ifdef HAVE_OPENMP //#pragma omp parallel for //#endif // for( Index row = 0; row < matrix.getRows(); row ++ ) // outVector[ row ] = matrix.rowVectorProduct( row, inVector ); Index col; #ifdef HAVE_OPENMP #pragma omp parallel for #endif for( Index row = 0; row < matrix.getRows(); row ++ ) outVector[ row ] = matrix.rowVectorProduct( row, inVector ); /*Index col; for( Index row = 0; row < matrix.getRows(); row ++ ) { outVector[ row ] = 0.0; Loading @@ -700,8 +700,7 @@ class tnlEllpackMatrixDeviceDependentCode< tnlHost > for( Index i = row * matrix.rowLengths; i < rowEnd; i++ ) if( ( col = matrix.columnIndexes[ i ] ) < matrix.columns ) outVector[ row ] += matrix.values[ i ] * inVector[ col ]; } }*/ } }; Loading @@ -714,6 +713,7 @@ __global__ void tnlEllpackMatrixVectorProductCudaKernel( const Index columns, const Index compressedRowsLengths, const Index alignedRows, const Index paddingIndex, const Index* columnIndexes, const Real* values, 
const Real* inVector, Loading @@ -727,7 +727,9 @@ __global__ void tnlEllpackMatrixVectorProductCudaKernel( Index el( 0 ); Real result( 0.0 ); Index columnIndex; while( el++ < compressedRowsLengths && ( columnIndex = columnIndexes[ i ] ) < columns ) while( el++ < compressedRowsLengths && ( columnIndex = columnIndexes[ i ] ) < columns && columnIndex != paddingIndex ) { result += values[ i ] * inVector[ columnIndex ]; i += alignedRows; Loading Loading @@ -800,6 +802,7 @@ class tnlEllpackMatrixDeviceDependentCode< tnlCuda > matrix.getColumns(), matrix.rowLengths, matrix.alignedRows, matrix.getPaddingIndex(), matrix.columnIndexes.getData(), matrix.values.getData(), inVector.getData(), Loading src/matrices/tnlSlicedEllpackMatrix_impl.h +5 −1 Original line number Diff line number Diff line Loading @@ -787,6 +787,7 @@ __global__ void tnlSlicedEllpackMatrixVectorProductCudaKernel( const Index columns, const Index* slicePointers, const Index* sliceCompressedRowsLengths, const Index paddingIndex, const Index* columnIndexes, const Real* values, const Real* inVector, Loading @@ -803,7 +804,9 @@ __global__ void tnlSlicedEllpackMatrixVectorProductCudaKernel( const Index rowEnd = i + rowLength * SliceSize; Real result( 0.0 ); Index columnIndex; while( i < rowEnd && ( columnIndex = columnIndexes[ i ] ) < columns ) while( i < rowEnd && ( columnIndex = columnIndexes[ i ] ) < columns && columnIndex < paddingIndex ) { result += values[ i ] * inVector[ columnIndex ]; i += SliceSize; Loading Loading @@ -919,6 +922,7 @@ class tnlSlicedEllpackMatrixDeviceDependentCode< tnlCuda > matrix.getColumns(), matrix.slicePointers.getData(), matrix.sliceCompressedRowsLengths.getData(), matrix.getPaddingIndex(), matrix.columnIndexes.getData(), matrix.values.getData(), inVector.getData(), Loading tests/benchmarks/tnl-cuda-benchmarks.h +12 −15 Original line number Diff line number Diff line Loading @@ -36,11 +36,12 @@ __global__ void setCudaTestMatrixKernel( Matrix* matrix, const int rowIdx = ( gridIdx 
* tnlCuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; if( rowIdx >= matrix->getRows() ) return; int col = Max( 0, rowIdx - elementsPerRow / 2 ); int col = rowIdx - elementsPerRow / 2; for( int element = 0; element < elementsPerRow; element++ ) { if( col + element < matrix->getColumns() ) matrix->setElementFast( rowIdx, col + element, 1.0 ); if( col + element >= 0 && col + element < matrix->getColumns() ) matrix->setElementFast( rowIdx, col + element, element + 1 ); } } Loading Loading @@ -152,7 +153,7 @@ int main( int argc, char* argv[] ) timer.reset(); timer.start(); for( int i = 0; i < loops; i++ ) resultDevice = deviceVector.scalarProduct( deviceVector ); resultDevice = deviceVector.scalarProduct( deviceVector2 ); timer.stop(); bandwidth = 2 * datasetSize / timer.getTime(); cout << "Time: " << timer.getTime() << " bandwidth: " << bandwidth << " GB/sec." << endl; Loading Loading @@ -194,6 +195,7 @@ int main( int argc, char* argv[] ) timer.start(); for( int i = 0; i < loops; i++ ) resultDevice = deviceVector.lpNorm( 2.0 ); timer.stop(); bandwidth = datasetSize / timer.getTime(); cout << "Time: " << timer.getTime() << " bandwidth: " << bandwidth << " GB/sec." 
<< endl; Loading Loading @@ -254,12 +256,12 @@ int main( int argc, char* argv[] ) { if( row % 100 == 0 ) cout << "Row " << row << "/" << size << " \r" << flush; int col = Max( 0, row - elementsPerRow / 2 ); int col = row - elementsPerRow / 2; for( int element = 0; element < elementsPerRow; element++ ) { if( col + element < size ) if( col + element >= 0 && col + element < size ) { hostMatrix.setElement( row, col + element, 1.0 ); hostMatrix.setElement( row, col + element, element + 1 ); //deviceMatrix.setElement( row, col + element, 1.0 ); elements++; } Loading @@ -280,20 +282,15 @@ int main( int argc, char* argv[] ) cout << timer.getTime() << " => " << bandwidth << " GB/s" << endl; cout << "Benchmarking SpMV on GPU: "; deviceVector2.setValue( 0.0 ); timer.reset(); for( int i = 0; i < loops; i++ ) deviceMatrix.vectorProduct( deviceVector, deviceVector2 ); timer.stop(); //cout << hostVector2 << endl << deviceVector2 << endl; if( hostVector2 != deviceVector2 ) { cerr << "Error in SliceEllpack Spmv kernel at positions" << endl; //for( int i = 0; i < size; i++ ) // if( hostVector2.getElement( i ) != deviceVector2.getElement( i ) ) // cerr << " " << i; cerr << "Error in Spmv kernel" << endl; } bandwidth = datasetSize / timer.getTime(); cout << timer.getTime() << " => " << bandwidth << " GB/s" << " speedup " << hostTime / timer.getTime() << endl; Loading src/mesh/grids/tnlTraverser_Grid2D_impl.h +1 −1 File changed.Contains only whitespace changes. Show changes Loading
src/core/arrays/tnlArrayOperationsCuda_impl.h +5 −2 Original line number Diff line number Diff line Loading @@ -256,6 +256,9 @@ bool tnlArrayOperations< tnlHost, tnlCuda >::compareMemory( const Element1* dest const Element2* source, const Index size ) { /*** * Here, destination is on host and source is on CUDA device. */ tnlAssert( destination, ); tnlAssert( source, ); tnlAssert( size >= 0, cerr << "size = " << size ); Loading @@ -280,7 +283,7 @@ bool tnlArrayOperations< tnlHost, tnlCuda >::compareMemory( const Element1* dest delete[] host_buffer; return false; } if( ! tnlArrayOperations< tnlHost >::compareMemory( host_buffer, destination, transfer ) ) if( ! tnlArrayOperations< tnlHost >::compareMemory( &destination[ compared ], host_buffer, transfer ) ) { delete[] host_buffer; return false; Loading
src/matrices/tnlEllpackMatrix_impl.h +12 −9 Original line number Diff line number Diff line Loading @@ -687,12 +687,12 @@ class tnlEllpackMatrixDeviceDependentCode< tnlHost > const InVector& inVector, OutVector& outVector ) { //#ifdef HAVE_OPENMP //#pragma omp parallel for //#endif // for( Index row = 0; row < matrix.getRows(); row ++ ) // outVector[ row ] = matrix.rowVectorProduct( row, inVector ); Index col; #ifdef HAVE_OPENMP #pragma omp parallel for #endif for( Index row = 0; row < matrix.getRows(); row ++ ) outVector[ row ] = matrix.rowVectorProduct( row, inVector ); /*Index col; for( Index row = 0; row < matrix.getRows(); row ++ ) { outVector[ row ] = 0.0; Loading @@ -700,8 +700,7 @@ class tnlEllpackMatrixDeviceDependentCode< tnlHost > for( Index i = row * matrix.rowLengths; i < rowEnd; i++ ) if( ( col = matrix.columnIndexes[ i ] ) < matrix.columns ) outVector[ row ] += matrix.values[ i ] * inVector[ col ]; } }*/ } }; Loading @@ -714,6 +713,7 @@ __global__ void tnlEllpackMatrixVectorProductCudaKernel( const Index columns, const Index compressedRowsLengths, const Index alignedRows, const Index paddingIndex, const Index* columnIndexes, const Real* values, const Real* inVector, Loading @@ -727,7 +727,9 @@ __global__ void tnlEllpackMatrixVectorProductCudaKernel( Index el( 0 ); Real result( 0.0 ); Index columnIndex; while( el++ < compressedRowsLengths && ( columnIndex = columnIndexes[ i ] ) < columns ) while( el++ < compressedRowsLengths && ( columnIndex = columnIndexes[ i ] ) < columns && columnIndex != paddingIndex ) { result += values[ i ] * inVector[ columnIndex ]; i += alignedRows; Loading Loading @@ -800,6 +802,7 @@ class tnlEllpackMatrixDeviceDependentCode< tnlCuda > matrix.getColumns(), matrix.rowLengths, matrix.alignedRows, matrix.getPaddingIndex(), matrix.columnIndexes.getData(), matrix.values.getData(), inVector.getData(), Loading
src/matrices/tnlSlicedEllpackMatrix_impl.h +5 −1 Original line number Diff line number Diff line Loading @@ -787,6 +787,7 @@ __global__ void tnlSlicedEllpackMatrixVectorProductCudaKernel( const Index columns, const Index* slicePointers, const Index* sliceCompressedRowsLengths, const Index paddingIndex, const Index* columnIndexes, const Real* values, const Real* inVector, Loading @@ -803,7 +804,9 @@ __global__ void tnlSlicedEllpackMatrixVectorProductCudaKernel( const Index rowEnd = i + rowLength * SliceSize; Real result( 0.0 ); Index columnIndex; while( i < rowEnd && ( columnIndex = columnIndexes[ i ] ) < columns ) while( i < rowEnd && ( columnIndex = columnIndexes[ i ] ) < columns && columnIndex < paddingIndex ) { result += values[ i ] * inVector[ columnIndex ]; i += SliceSize; Loading Loading @@ -919,6 +922,7 @@ class tnlSlicedEllpackMatrixDeviceDependentCode< tnlCuda > matrix.getColumns(), matrix.slicePointers.getData(), matrix.sliceCompressedRowsLengths.getData(), matrix.getPaddingIndex(), matrix.columnIndexes.getData(), matrix.values.getData(), inVector.getData(), Loading
tests/benchmarks/tnl-cuda-benchmarks.h +12 −15 Original line number Diff line number Diff line Loading @@ -36,11 +36,12 @@ __global__ void setCudaTestMatrixKernel( Matrix* matrix, const int rowIdx = ( gridIdx * tnlCuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; if( rowIdx >= matrix->getRows() ) return; int col = Max( 0, rowIdx - elementsPerRow / 2 ); int col = rowIdx - elementsPerRow / 2; for( int element = 0; element < elementsPerRow; element++ ) { if( col + element < matrix->getColumns() ) matrix->setElementFast( rowIdx, col + element, 1.0 ); if( col + element >= 0 && col + element < matrix->getColumns() ) matrix->setElementFast( rowIdx, col + element, element + 1 ); } } Loading Loading @@ -152,7 +153,7 @@ int main( int argc, char* argv[] ) timer.reset(); timer.start(); for( int i = 0; i < loops; i++ ) resultDevice = deviceVector.scalarProduct( deviceVector ); resultDevice = deviceVector.scalarProduct( deviceVector2 ); timer.stop(); bandwidth = 2 * datasetSize / timer.getTime(); cout << "Time: " << timer.getTime() << " bandwidth: " << bandwidth << " GB/sec." << endl; Loading Loading @@ -194,6 +195,7 @@ int main( int argc, char* argv[] ) timer.start(); for( int i = 0; i < loops; i++ ) resultDevice = deviceVector.lpNorm( 2.0 ); timer.stop(); bandwidth = datasetSize / timer.getTime(); cout << "Time: " << timer.getTime() << " bandwidth: " << bandwidth << " GB/sec." 
<< endl; Loading Loading @@ -254,12 +256,12 @@ int main( int argc, char* argv[] ) { if( row % 100 == 0 ) cout << "Row " << row << "/" << size << " \r" << flush; int col = Max( 0, row - elementsPerRow / 2 ); int col = row - elementsPerRow / 2; for( int element = 0; element < elementsPerRow; element++ ) { if( col + element < size ) if( col + element >= 0 && col + element < size ) { hostMatrix.setElement( row, col + element, 1.0 ); hostMatrix.setElement( row, col + element, element + 1 ); //deviceMatrix.setElement( row, col + element, 1.0 ); elements++; } Loading @@ -280,20 +282,15 @@ int main( int argc, char* argv[] ) cout << timer.getTime() << " => " << bandwidth << " GB/s" << endl; cout << "Benchmarking SpMV on GPU: "; deviceVector2.setValue( 0.0 ); timer.reset(); for( int i = 0; i < loops; i++ ) deviceMatrix.vectorProduct( deviceVector, deviceVector2 ); timer.stop(); //cout << hostVector2 << endl << deviceVector2 << endl; if( hostVector2 != deviceVector2 ) { cerr << "Error in SliceEllpack Spmv kernel at positions" << endl; //for( int i = 0; i < size; i++ ) // if( hostVector2.getElement( i ) != deviceVector2.getElement( i ) ) // cerr << " " << i; cerr << "Error in Spmv kernel" << endl; } bandwidth = datasetSize / timer.getTime(); cout << timer.getTime() << " => " << bandwidth << " GB/s" << " speedup " << hostTime / timer.getTime() << endl; Loading
src/mesh/grids/tnlTraverser_Grid2D_impl.h +1 −1 File changed. Contains only whitespace changes. Show changes