From 2641210dac5db0a06084258e476f61a15fab432c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com> Date: Tue, 31 Mar 2020 15:44:45 +0200 Subject: [PATCH] Fixing CUDA code in Legacy CSR. --- src/TNL/Matrices/Legacy/CSR.h | 8 ++--- src/TNL/Matrices/Legacy/CSR_impl.h | 51 ++++++++++++++++-------------- 2 files changed, 30 insertions(+), 29 deletions(-) diff --git a/src/TNL/Matrices/Legacy/CSR.h b/src/TNL/Matrices/Legacy/CSR.h index f86420339b..46e616d165 100644 --- a/src/TNL/Matrices/Legacy/CSR.h +++ b/src/TNL/Matrices/Legacy/CSR.h @@ -31,9 +31,9 @@ class CusparseCSR; template< typename Device > class CSRDeviceDependentCode; -enum CSRKernel { CSRScalar, CSRVector, CSRLight, CSRAdaptive, CSRStream }; +enum CSRKernel { CSRScalar, CSRVector, CSRHybrid, CSRLight, CSRAdaptive, CSRStream }; -template< typename Real, typename Device = Devices::Host, typename Index = int, CSRKernel KernelType = CSRAdaptive > +template< typename Real, typename Device = Devices::Host, typename Index = int, CSRKernel KernelType = CSRScalar > class CSR : public Sparse< Real, Device, Index > { private: @@ -225,9 +225,7 @@ public: __device__ void spmvCudaVectorized( const InVector& inVector, OutVector& outVector, - const IndexType warpStart, - const IndexType warpEnd, - const IndexType inWarpIdx ) const; + const IndexType gridIdx ) const; template< typename InVector, typename OutVector, diff --git a/src/TNL/Matrices/Legacy/CSR_impl.h b/src/TNL/Matrices/Legacy/CSR_impl.h index c4c4be89d5..12a9eb5541 100644 --- a/src/TNL/Matrices/Legacy/CSR_impl.h +++ b/src/TNL/Matrices/Legacy/CSR_impl.h @@ -905,10 +905,13 @@ template< typename Real, __device__ void CSR< Real, Device, Index, KernelType >::spmvCudaVectorized( const InVector& inVector, OutVector& outVector, - const IndexType warpStart, - const IndexType warpEnd, - const IndexType inWarpIdx ) const + const IndexType gridIdx ) const { + IndexType globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; + const IndexType warpStart = warpSize * ( globalIdx / warpSize ); + const IndexType warpEnd = min( warpStart + warpSize, this->getRows() ); + const IndexType inWarpIdx = globalIdx % warpSize; + volatile Real* aux = Cuda::getSharedMemory< Real >(); for( IndexType row = warpStart; row < warpEnd; row++ ) { @@ -957,10 +960,10 @@ void CSR< Real, Device, Index, KernelType >::vectorProductCuda( const InVector& // TODO: break; case CSRVector: - spmvCudaVectorized< InVector, OutVector, warpSize >( inVector, outVector, warpStart, warpEnd, inWarpIdx ); + spmvCudaVectorized< InVector, OutVector, warpSize >( inVector, outVector, gridIdx ); break; case CSRLight: - spmvCSRLight< InVector, OutVector, warpSize >( inVector, outVector, gridIdx ); + spmvCudaLightSpmv< InVector, OutVector, warpSize >( inVector, outVector, gridIdx ); break; case CSRAdaptive: spmvCSRAdaptive< InVector, OutVector, warpSize >( inVector, outVector, gridIdx, blocks, size ); @@ -970,9 +973,8 @@ void CSR< Real, Device, Index, KernelType >::vectorProductCuda( const InVector& break; } - return; - IndexType globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; + /*IndexType globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; const IndexType warpStart = warpSize * ( globalIdx / warpSize ); const IndexType warpEnd = min( warpStart + warpSize, this->getRows() ); const IndexType inWarpIdx = globalIdx % warpSize; @@ -980,9 +982,9 @@ void CSR< Real, Device, Index, KernelType >::vectorProductCuda( const InVector& if( this->getCudaKernelType() == vector ) - /**** - * Hybrid mode - */ + ///// + // Hybrid mode + // const Index firstRow = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x; const IndexType lastRow = min( this->getRows(), firstRow + blockDim. x ); const IndexType nonzerosPerRow = ( this->rowPointers[ lastRow ] - this->rowPointers[ firstRow ] ) / @@ -990,19 +992,19 @@ void CSR< Real, Device, Index, KernelType >::vectorProductCuda( const InVector& if( nonzerosPerRow < this->getHybridModeSplit() ) { - /**** - * Use the scalar mode - */ + ///// + // Use the scalar mode + // if( globalIdx < this->getRows() ) outVector[ globalIdx ] = this->rowVectorProduct( globalIdx, inVector ); } else { - /**** - * Use the vector mode - */ + //// + // Use the vector mode + // spmvCudaVectorized< InVector, OutVector, warpSize >( inVector, outVector, warpStart, warpEnd, inWarpIdx ); - } + }*/ } #endif @@ -1038,10 +1040,11 @@ class CSRDeviceDependentCode< Devices::Host > #ifdef HAVE_CUDA template< typename Real, typename Index, + CSRKernel KernelType, typename InVector, typename OutVector, int warpSize > -__global__ void CSRVectorProductCudaKernel( const CSR< Real, Devices::Cuda, Index >* matrix, +__global__ void CSRVectorProductCudaKernel( const CSR< Real, Devices::Cuda, Index, KernelType >* matrix, const InVector* inVector, OutVector* outVector, int gridIdx, @@ -1050,13 +1053,12 @@ __global__ void CSRVectorProductCudaKernel( const CSR< Real, Devices::Cuda, Inde typedef CSR< Real, Devices::Cuda, Index > Matrix; static_assert( std::is_same< typename Matrix::DeviceType, Devices::Cuda >::value, "" ); const typename Matrix::IndexType rowIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; - if( matrix->getCudaKernelType() == Matrix::scalar ) + if( KernelType == CSRScalar ) { if( rowIdx < matrix->getRows() ) ( *outVector )[ rowIdx ] = matrix->rowVectorProduct( rowIdx, *inVector ); } - if( matrix->getCudaKernelType() == Matrix::vector || - matrix->getCudaKernelType() == Matrix::hybrid ) + else { matrix->template vectorProductCuda< InVector, OutVector, warpSize > ( *inVector, *outVector, gridIdx, blocks, size ); @@ -1066,16 +1068,17 @@ __global__ void CSRVectorProductCudaKernel( const CSR< Real, Devices::Cuda, Inde template< typename Real, typename Index, + CSRKernel KernelType, typename InVector, typename OutVector > -void CSRVectorProductCuda( const CSR< Real, Devices::Cuda, Index >& matrix, +void CSRVectorProductCuda( const CSR< Real, Devices::Cuda, Index, KernelType >& matrix, const InVector& inVector, OutVector& outVector, int *blocks, size_t size ) { #ifdef HAVE_CUDA - typedef CSR< Real, Devices::Cuda, Index > Matrix; + typedef CSR< Real, Devices::Cuda, Index, KernelType > Matrix; typedef typename Matrix::IndexType IndexType; Matrix* kernel_this = Cuda::passToDevice( matrix ); InVector* kernel_inVector = Cuda::passToDevice( inVector ); @@ -1096,7 +1099,7 @@ void CSRVectorProductCuda( const CSR< Real, Devices::Cuda, Index >& matrix, // const int threads = cudaBlockSize.x; if( matrix.getCudaWarpSize() == 32 ) { // printf("BL %d BLSIZE %d\n", (int)cudaBlocks, (int)threads); - CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 32 > + CSRVectorProductCudaKernel< Real, Index, KernelType, InVector, OutVector, 32 > <<< 2, 1024 >>> ( kernel_this, kernel_inVector, -- GitLab