Loading src/TNL/Matrices/Legacy/CSR.h +3 −5 Original line number Diff line number Diff line Loading @@ -31,9 +31,9 @@ class CusparseCSR; template< typename Device > class CSRDeviceDependentCode; enum CSRKernel { CSRScalar, CSRVector, CSRLight, CSRAdaptive, CSRStream }; enum CSRKernel { CSRScalar, CSRVector, CSRHybrid, CSRLight, CSRAdaptive, CSRStream }; template< typename Real, typename Device = Devices::Host, typename Index = int, CSRKernel KernelType = CSRAdaptive > template< typename Real, typename Device = Devices::Host, typename Index = int, CSRKernel KernelType = CSRScalar > class CSR : public Sparse< Real, Device, Index > { private: Loading Loading @@ -225,9 +225,7 @@ public: __device__ void spmvCudaVectorized( const InVector& inVector, OutVector& outVector, const IndexType warpStart, const IndexType warpEnd, const IndexType inWarpIdx ) const; const IndexType gridIdx ) const; template< typename InVector, typename OutVector, Loading src/TNL/Matrices/Legacy/CSR_impl.h +27 −24 Original line number Diff line number Diff line Loading @@ -905,10 +905,13 @@ template< typename Real, __device__ void CSR< Real, Device, Index, KernelType >::spmvCudaVectorized( const InVector& inVector, OutVector& outVector, const IndexType warpStart, const IndexType warpEnd, const IndexType inWarpIdx ) const const IndexType gridIdx ) const { IndexType globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; const IndexType warpStart = warpSize * ( globalIdx / warpSize ); const IndexType warpEnd = min( warpStart + warpSize, this->getRows() ); const IndexType inWarpIdx = globalIdx % warpSize; volatile Real* aux = Cuda::getSharedMemory< Real >(); for( IndexType row = warpStart; row < warpEnd; row++ ) { Loading Loading @@ -957,10 +960,10 @@ void CSR< Real, Device, Index, KernelType >::vectorProductCuda( const InVector& // TODO: break; case CSRVector: spmvCudaVectorized< InVector, OutVector, warpSize >( inVector, outVector, warpStart, warpEnd, inWarpIdx ); spmvCudaVectorized< InVector, OutVector, warpSize >( inVector, outVector, gridIdx ); break; case CSRLight: spmvCSRLight< InVector, OutVector, warpSize >( inVector, outVector, gridIdx ); spmvCudaLightSpmv< InVector, OutVector, warpSize >( inVector, outVector, gridIdx ); break; case CSRAdaptive: spmvCSRAdaptive< InVector, OutVector, warpSize >( inVector, outVector, gridIdx, blocks, size ); Loading @@ -970,9 +973,8 @@ void CSR< Real, Device, Index, KernelType >::vectorProductCuda( const InVector& break; } return; IndexType globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; /*IndexType globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; const IndexType warpStart = warpSize * ( globalIdx / warpSize ); const IndexType warpEnd = min( warpStart + warpSize, this->getRows() ); const IndexType inWarpIdx = globalIdx % warpSize; Loading @@ -980,9 +982,9 @@ void CSR< Real, Device, Index, KernelType >::vectorProductCuda( const InVector& if( this->getCudaKernelType() == vector ) /**** * Hybrid mode */ ///// // Hybrid mode // const Index firstRow = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x; const IndexType lastRow = min( this->getRows(), firstRow + blockDim. x ); const IndexType nonzerosPerRow = ( this->rowPointers[ lastRow ] - this->rowPointers[ firstRow ] ) / Loading @@ -990,19 +992,19 @@ void CSR< Real, Device, Index, KernelType >::vectorProductCuda( const InVector& if( nonzerosPerRow < this->getHybridModeSplit() ) { /**** * Use the scalar mode */ ///// // Use the scalar mode // if( globalIdx < this->getRows() ) outVector[ globalIdx ] = this->rowVectorProduct( globalIdx, inVector ); } else { /**** * Use the vector mode */ //// // Use the vector mode // spmvCudaVectorized< InVector, OutVector, warpSize >( inVector, outVector, warpStart, warpEnd, inWarpIdx ); } }*/ } #endif Loading Loading @@ -1038,10 +1040,11 @@ class CSRDeviceDependentCode< Devices::Host > #ifdef HAVE_CUDA template< typename Real, typename Index, CSRKernel KernelType, typename InVector, typename OutVector, int warpSize > __global__ void CSRVectorProductCudaKernel( const CSR< Real, Devices::Cuda, Index >* matrix, __global__ void CSRVectorProductCudaKernel( const CSR< Real, Devices::Cuda, Index, KernelType >* matrix, const InVector* inVector, OutVector* outVector, int gridIdx, Loading @@ -1050,13 +1053,12 @@ __global__ void CSRVectorProductCudaKernel( const CSR< Real, Devices::Cuda, Inde typedef CSR< Real, Devices::Cuda, Index > Matrix; static_assert( std::is_same< typename Matrix::DeviceType, Devices::Cuda >::value, "" ); const typename Matrix::IndexType rowIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; if( matrix->getCudaKernelType() == Matrix::scalar ) if( KernelType == CSRScalar ) { if( rowIdx < matrix->getRows() ) ( *outVector )[ rowIdx ] = matrix->rowVectorProduct( rowIdx, *inVector ); } if( matrix->getCudaKernelType() == Matrix::vector || matrix->getCudaKernelType() == Matrix::hybrid ) else { matrix->template vectorProductCuda< InVector, OutVector, warpSize > ( *inVector, *outVector, gridIdx, blocks, size ); Loading @@ -1066,16 +1068,17 @@ __global__ void CSRVectorProductCudaKernel( const CSR< Real, Devices::Cuda, Inde template< typename Real, typename Index, CSRKernel KernelType, typename InVector, typename OutVector > void CSRVectorProductCuda( const CSR< Real, Devices::Cuda, Index >& matrix, void CSRVectorProductCuda( const CSR< Real, Devices::Cuda, Index, KernelType >& matrix, const InVector& inVector, OutVector& outVector, int *blocks, size_t size ) { #ifdef HAVE_CUDA typedef CSR< Real, Devices::Cuda, Index > Matrix; typedef CSR< Real, Devices::Cuda, Index, KernelType > Matrix; typedef typename Matrix::IndexType IndexType; Matrix* kernel_this = Cuda::passToDevice( matrix ); InVector* kernel_inVector = Cuda::passToDevice( inVector ); Loading @@ -1096,7 +1099,7 @@ void CSRVectorProductCuda( const CSR< Real, Devices::Cuda, Index >& matrix, // const int threads = cudaBlockSize.x; if( matrix.getCudaWarpSize() == 32 ) { // printf("BL %d BLSIZE %d\n", (int)cudaBlocks, (int)threads); CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 32 > CSRVectorProductCudaKernel< Real, Index, KernelType, InVector, OutVector, 32 > <<< 2, 1024 >>> ( kernel_this, kernel_inVector, Loading Loading
src/TNL/Matrices/Legacy/CSR.h +3 −5 Original line number Diff line number Diff line Loading @@ -31,9 +31,9 @@ class CusparseCSR; template< typename Device > class CSRDeviceDependentCode; enum CSRKernel { CSRScalar, CSRVector, CSRLight, CSRAdaptive, CSRStream }; enum CSRKernel { CSRScalar, CSRVector, CSRHybrid, CSRLight, CSRAdaptive, CSRStream }; template< typename Real, typename Device = Devices::Host, typename Index = int, CSRKernel KernelType = CSRAdaptive > template< typename Real, typename Device = Devices::Host, typename Index = int, CSRKernel KernelType = CSRScalar > class CSR : public Sparse< Real, Device, Index > { private: Loading Loading @@ -225,9 +225,7 @@ public: __device__ void spmvCudaVectorized( const InVector& inVector, OutVector& outVector, const IndexType warpStart, const IndexType warpEnd, const IndexType inWarpIdx ) const; const IndexType gridIdx ) const; template< typename InVector, typename OutVector, Loading
src/TNL/Matrices/Legacy/CSR_impl.h +27 −24 Original line number Diff line number Diff line Loading @@ -905,10 +905,13 @@ template< typename Real, __device__ void CSR< Real, Device, Index, KernelType >::spmvCudaVectorized( const InVector& inVector, OutVector& outVector, const IndexType warpStart, const IndexType warpEnd, const IndexType inWarpIdx ) const const IndexType gridIdx ) const { IndexType globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; const IndexType warpStart = warpSize * ( globalIdx / warpSize ); const IndexType warpEnd = min( warpStart + warpSize, this->getRows() ); const IndexType inWarpIdx = globalIdx % warpSize; volatile Real* aux = Cuda::getSharedMemory< Real >(); for( IndexType row = warpStart; row < warpEnd; row++ ) { Loading Loading @@ -957,10 +960,10 @@ void CSR< Real, Device, Index, KernelType >::vectorProductCuda( const InVector& // TODO: break; case CSRVector: spmvCudaVectorized< InVector, OutVector, warpSize >( inVector, outVector, warpStart, warpEnd, inWarpIdx ); spmvCudaVectorized< InVector, OutVector, warpSize >( inVector, outVector, gridIdx ); break; case CSRLight: spmvCSRLight< InVector, OutVector, warpSize >( inVector, outVector, gridIdx ); spmvCudaLightSpmv< InVector, OutVector, warpSize >( inVector, outVector, gridIdx ); break; case CSRAdaptive: spmvCSRAdaptive< InVector, OutVector, warpSize >( inVector, outVector, gridIdx, blocks, size ); Loading @@ -970,9 +973,8 @@ void CSR< Real, Device, Index, KernelType >::vectorProductCuda( const InVector& break; } return; IndexType globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; /*IndexType globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; const IndexType warpStart = warpSize * ( globalIdx / warpSize ); const IndexType warpEnd = min( warpStart + warpSize, this->getRows() ); const IndexType inWarpIdx = globalIdx % warpSize; Loading @@ -980,9 +982,9 @@ void CSR< Real, Device, Index, KernelType >::vectorProductCuda( const InVector& if( this->getCudaKernelType() == vector ) /**** * Hybrid mode */ ///// // Hybrid mode // const Index firstRow = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x; const IndexType lastRow = min( this->getRows(), firstRow + blockDim. x ); const IndexType nonzerosPerRow = ( this->rowPointers[ lastRow ] - this->rowPointers[ firstRow ] ) / Loading @@ -990,19 +992,19 @@ void CSR< Real, Device, Index, KernelType >::vectorProductCuda( const InVector& if( nonzerosPerRow < this->getHybridModeSplit() ) { /**** * Use the scalar mode */ ///// // Use the scalar mode // if( globalIdx < this->getRows() ) outVector[ globalIdx ] = this->rowVectorProduct( globalIdx, inVector ); } else { /**** * Use the vector mode */ //// // Use the vector mode // spmvCudaVectorized< InVector, OutVector, warpSize >( inVector, outVector, warpStart, warpEnd, inWarpIdx ); } }*/ } #endif Loading Loading @@ -1038,10 +1040,11 @@ class CSRDeviceDependentCode< Devices::Host > #ifdef HAVE_CUDA template< typename Real, typename Index, CSRKernel KernelType, typename InVector, typename OutVector, int warpSize > __global__ void CSRVectorProductCudaKernel( const CSR< Real, Devices::Cuda, Index >* matrix, __global__ void CSRVectorProductCudaKernel( const CSR< Real, Devices::Cuda, Index, KernelType >* matrix, const InVector* inVector, OutVector* outVector, int gridIdx, Loading @@ -1050,13 +1053,12 @@ __global__ void CSRVectorProductCudaKernel( const CSR< Real, Devices::Cuda, Inde typedef CSR< Real, Devices::Cuda, Index > Matrix; static_assert( std::is_same< typename Matrix::DeviceType, Devices::Cuda >::value, "" ); const typename Matrix::IndexType rowIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; if( matrix->getCudaKernelType() == Matrix::scalar ) if( KernelType == CSRScalar ) { if( rowIdx < matrix->getRows() ) ( *outVector )[ rowIdx ] = matrix->rowVectorProduct( rowIdx, *inVector ); } if( matrix->getCudaKernelType() == Matrix::vector || matrix->getCudaKernelType() == Matrix::hybrid ) else { matrix->template vectorProductCuda< InVector, OutVector, warpSize > ( *inVector, *outVector, gridIdx, blocks, size ); Loading @@ -1066,16 +1068,17 @@ __global__ void CSRVectorProductCudaKernel( const CSR< Real, Devices::Cuda, Inde template< typename Real, typename Index, CSRKernel KernelType, typename InVector, typename OutVector > void CSRVectorProductCuda( const CSR< Real, Devices::Cuda, Index >& matrix, void CSRVectorProductCuda( const CSR< Real, Devices::Cuda, Index, KernelType >& matrix, const InVector& inVector, OutVector& outVector, int *blocks, size_t size ) { #ifdef HAVE_CUDA typedef CSR< Real, Devices::Cuda, Index > Matrix; typedef CSR< Real, Devices::Cuda, Index, KernelType > Matrix; typedef typename Matrix::IndexType IndexType; Matrix* kernel_this = Cuda::passToDevice( matrix ); InVector* kernel_inVector = Cuda::passToDevice( inVector ); Loading @@ -1096,7 +1099,7 @@ void CSRVectorProductCuda( const CSR< Real, Devices::Cuda, Index >& matrix, // const int threads = cudaBlockSize.x; if( matrix.getCudaWarpSize() == 32 ) { // printf("BL %d BLSIZE %d\n", (int)cudaBlocks, (int)threads); CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 32 > CSRVectorProductCudaKernel< Real, Index, KernelType, InVector, OutVector, 32 > <<< 2, 1024 >>> ( kernel_this, kernel_inVector, Loading