Loading src/TNL/Matrices/Ellpack.h +2 −1 Original line number Diff line number Diff line Loading @@ -153,7 +153,8 @@ public: template< typename InVector, typename OutVector > void vectorProduct( const InVector& inVector, OutVector& outVector ) const; OutVector& outVector, RealType multiplicator = 1.0 ) const; template< typename Real2, typename Index2 > void addMatrix( const Ellpack< Real2, Device, Index2 >& matrix, Loading src/TNL/Matrices/Ellpack_impl.h +11 −7 Original line number Diff line number Diff line Loading @@ -518,9 +518,10 @@ template< typename Real, template< typename InVector, typename OutVector > void Ellpack< Real, Device, Index >::vectorProduct( const InVector& inVector, OutVector& outVector ) const OutVector& outVector, RealType multiplicator ) const { DeviceDependentCode::vectorProduct( *this, inVector, outVector ); DeviceDependentCode::vectorProduct( *this, inVector, outVector, multiplicator ); } template< typename Real, Loading Loading @@ -815,13 +816,14 @@ class EllpackDeviceDependentCode< Devices::Host > typename OutVector > static void vectorProduct( const Ellpack< Real, Device, Index >& matrix, const InVector& inVector, OutVector& outVector ) OutVector& outVector, Real multiplicator ) { #ifdef HAVE_OPENMP #pragma omp parallel for if( Devices::Host::isOMPEnabled() ) #endif for( Index row = 0; row < matrix.getRows(); row ++ ) outVector[ row ] = matrix.rowVectorProduct( row, inVector ); outVector[ row ] = matrix.rowVectorProduct( row, inVector ) * multiplicator; /*Index col; for( Index row = 0; row < matrix.getRows(); row ++ ) { Loading @@ -848,6 +850,7 @@ __global__ void EllpackVectorProductCudaKernel( const Real* values, const Real* inVector, Real* outVector, Real multiplicator, const Index gridIdx ) { const Index rowIdx = ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; Loading @@ -864,7 +867,7 @@ __global__ void EllpackVectorProductCudaKernel( result += values[ i ] * inVector[ columnIndex ]; i += alignedRows; } outVector[ rowIdx ] = result; outVector[ rowIdx ] = result * multiplicator; } #endif Loading Loading @@ -909,7 +912,8 @@ class EllpackDeviceDependentCode< Devices::Cuda > typename OutVector > static void vectorProduct( const Ellpack< Real, Device, Index >& matrix, const InVector& inVector, OutVector& outVector ) OutVector& outVector, Real multiplicator ) { //MatrixVectorProductCuda( matrix, inVector, outVector ); #ifdef HAVE_CUDA Loading Loading @@ -937,6 +941,7 @@ class EllpackDeviceDependentCode< Devices::Cuda > matrix.values.getData(), inVector.getData(), outVector.getData(), multiplicator, gridIdx ); TNL_CHECK_CUDA_DEVICE; } Loading @@ -946,7 +951,6 @@ class EllpackDeviceDependentCode< Devices::Cuda > TNL_CHECK_CUDA_DEVICE; cudaDeviceSynchronize(); #endif } }; Loading Loading
src/TNL/Matrices/Ellpack.h +2 −1 Original line number Diff line number Diff line Loading @@ -153,7 +153,8 @@ public: template< typename InVector, typename OutVector > void vectorProduct( const InVector& inVector, OutVector& outVector ) const; OutVector& outVector, RealType multiplicator = 1.0 ) const; template< typename Real2, typename Index2 > void addMatrix( const Ellpack< Real2, Device, Index2 >& matrix, Loading
src/TNL/Matrices/Ellpack_impl.h +11 −7 Original line number Diff line number Diff line Loading @@ -518,9 +518,10 @@ template< typename Real, template< typename InVector, typename OutVector > void Ellpack< Real, Device, Index >::vectorProduct( const InVector& inVector, OutVector& outVector ) const OutVector& outVector, RealType multiplicator ) const { DeviceDependentCode::vectorProduct( *this, inVector, outVector ); DeviceDependentCode::vectorProduct( *this, inVector, outVector, multiplicator ); } template< typename Real, Loading Loading @@ -815,13 +816,14 @@ class EllpackDeviceDependentCode< Devices::Host > typename OutVector > static void vectorProduct( const Ellpack< Real, Device, Index >& matrix, const InVector& inVector, OutVector& outVector ) OutVector& outVector, Real multiplicator ) { #ifdef HAVE_OPENMP #pragma omp parallel for if( Devices::Host::isOMPEnabled() ) #endif for( Index row = 0; row < matrix.getRows(); row ++ ) outVector[ row ] = matrix.rowVectorProduct( row, inVector ); outVector[ row ] = matrix.rowVectorProduct( row, inVector ) * multiplicator; /*Index col; for( Index row = 0; row < matrix.getRows(); row ++ ) { Loading @@ -848,6 +850,7 @@ __global__ void EllpackVectorProductCudaKernel( const Real* values, const Real* inVector, Real* outVector, Real multiplicator, const Index gridIdx ) { const Index rowIdx = ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; Loading @@ -864,7 +867,7 @@ __global__ void EllpackVectorProductCudaKernel( result += values[ i ] * inVector[ columnIndex ]; i += alignedRows; } outVector[ rowIdx ] = result; outVector[ rowIdx ] = result * multiplicator; } #endif Loading Loading @@ -909,7 +912,8 @@ class EllpackDeviceDependentCode< Devices::Cuda > typename OutVector > static void vectorProduct( const Ellpack< Real, Device, Index >& matrix, const InVector& inVector, OutVector& outVector ) OutVector& outVector, Real multiplicator ) { //MatrixVectorProductCuda( matrix, inVector, outVector ); #ifdef HAVE_CUDA Loading Loading @@ -937,6 +941,7 @@ class EllpackDeviceDependentCode< Devices::Cuda > matrix.values.getData(), inVector.getData(), outVector.getData(), multiplicator, gridIdx ); TNL_CHECK_CUDA_DEVICE; } Loading @@ -946,7 +951,6 @@ class EllpackDeviceDependentCode< Devices::Cuda > TNL_CHECK_CUDA_DEVICE; cudaDeviceSynchronize(); #endif } }; Loading