Commit 2641210d authored by Tomáš Oberhuber's avatar Tomáš Oberhuber
Browse files

Fixing CUDA code in Legacy CSR.

parent d6909164
Loading
Loading
Loading
Loading
+3 −5
Original line number Diff line number Diff line
@@ -31,9 +31,9 @@ class CusparseCSR;
template< typename Device >
class CSRDeviceDependentCode;

enum CSRKernel { CSRScalar, CSRVector, CSRLight, CSRAdaptive, CSRStream };
enum CSRKernel { CSRScalar, CSRVector, CSRHybrid, CSRLight, CSRAdaptive, CSRStream };

template< typename Real, typename Device = Devices::Host, typename Index = int, CSRKernel KernelType = CSRAdaptive >
template< typename Real, typename Device = Devices::Host, typename Index = int, CSRKernel KernelType = CSRScalar >
class CSR : public Sparse< Real, Device, Index >
{
private:
@@ -225,9 +225,7 @@ public:
   __device__
   void spmvCudaVectorized( const InVector& inVector,
                            OutVector& outVector,
                            const IndexType warpStart,
                            const IndexType warpEnd,
                            const IndexType inWarpIdx ) const;
                            const IndexType gridIdx ) const;

   template< typename InVector,
             typename OutVector,
+27 −24
Original line number Diff line number Diff line
@@ -905,10 +905,13 @@ template< typename Real,
__device__
void CSR< Real, Device, Index, KernelType >::spmvCudaVectorized( const InVector& inVector,
                                                              OutVector& outVector,
                                                              const IndexType warpStart,
                                                              const IndexType warpEnd,
                                                              const IndexType inWarpIdx ) const
                                                              const IndexType gridIdx ) const
{
   IndexType globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
   const IndexType warpStart = warpSize * ( globalIdx / warpSize );
   const IndexType warpEnd = min( warpStart + warpSize, this->getRows() );
   const IndexType inWarpIdx = globalIdx % warpSize;

   volatile Real* aux = Cuda::getSharedMemory< Real >();
   for( IndexType row = warpStart; row < warpEnd; row++ )
   {
@@ -957,10 +960,10 @@ void CSR< Real, Device, Index, KernelType >::vectorProductCuda( const InVector&
         // TODO:
         break;
      case CSRVector:
         spmvCudaVectorized< InVector, OutVector, warpSize >( inVector, outVector, warpStart, warpEnd, inWarpIdx );
         spmvCudaVectorized< InVector, OutVector, warpSize >( inVector, outVector, gridIdx );
         break;
      case CSRLight:
         spmvCSRLight< InVector, OutVector, warpSize >( inVector, outVector, gridIdx );
         spmvCudaLightSpmv< InVector, OutVector, warpSize >( inVector, outVector, gridIdx );
         break;
      case CSRAdaptive:
         spmvCSRAdaptive< InVector, OutVector, warpSize >( inVector, outVector, gridIdx, blocks, size );
@@ -970,9 +973,8 @@ void CSR< Real, Device, Index, KernelType >::vectorProductCuda( const InVector&
         break;
   }

   return;

   IndexType globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
   /*IndexType globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
   const IndexType warpStart = warpSize * ( globalIdx / warpSize );
   const IndexType warpEnd = min( warpStart + warpSize, this->getRows() );
   const IndexType inWarpIdx = globalIdx % warpSize;
@@ -980,9 +982,9 @@ void CSR< Real, Device, Index, KernelType >::vectorProductCuda( const InVector&
   if( this->getCudaKernelType() == vector )
      

   /****
    * Hybrid mode
    */
   /////
   // Hybrid mode
   //
   const Index firstRow = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x;
   const IndexType lastRow = min( this->getRows(), firstRow + blockDim. x );
   const IndexType nonzerosPerRow = ( this->rowPointers[ lastRow ] - this->rowPointers[ firstRow ] ) /
@@ -990,19 +992,19 @@ void CSR< Real, Device, Index, KernelType >::vectorProductCuda( const InVector&

   if( nonzerosPerRow < this->getHybridModeSplit() )
   {
      /****
       * Use the scalar mode
       */
      /////
      // Use the scalar mode
      //
      if( globalIdx < this->getRows() )
          outVector[ globalIdx ] = this->rowVectorProduct( globalIdx, inVector );
   }
   else
   {
      /****
       * Use the vector mode
       */
      ////
      // Use the vector mode
      //
      spmvCudaVectorized< InVector, OutVector, warpSize >( inVector, outVector, warpStart, warpEnd, inWarpIdx );
   }
   }*/
}
#endif

@@ -1038,10 +1040,11 @@ class CSRDeviceDependentCode< Devices::Host >
#ifdef HAVE_CUDA
template< typename Real,
          typename Index,
          CSRKernel KernelType,
          typename InVector,
          typename OutVector,
          int warpSize >
__global__ void CSRVectorProductCudaKernel( const CSR< Real, Devices::Cuda, Index >* matrix,
__global__ void CSRVectorProductCudaKernel( const CSR< Real, Devices::Cuda, Index, KernelType >* matrix,
                                                     const InVector* inVector,
                                                     OutVector* outVector,
                                                     int gridIdx,
@@ -1050,13 +1053,12 @@ __global__ void CSRVectorProductCudaKernel( const CSR< Real, Devices::Cuda, Inde
   typedef CSR< Real, Devices::Cuda, Index > Matrix;
   static_assert( std::is_same< typename Matrix::DeviceType, Devices::Cuda >::value, "" );
   const typename Matrix::IndexType rowIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
   if( matrix->getCudaKernelType() == Matrix::scalar )
   if( KernelType == CSRScalar )
   {
      if( rowIdx < matrix->getRows() )
         ( *outVector )[ rowIdx ] = matrix->rowVectorProduct( rowIdx, *inVector );
   }
   if( matrix->getCudaKernelType() == Matrix::vector ||
       matrix->getCudaKernelType() == Matrix::hybrid )
   else
   {
      matrix->template vectorProductCuda< InVector, OutVector, warpSize >
                                        ( *inVector, *outVector, gridIdx, blocks, size );
@@ -1066,16 +1068,17 @@ __global__ void CSRVectorProductCudaKernel( const CSR< Real, Devices::Cuda, Inde

template< typename Real,
          typename Index,
          CSRKernel KernelType,
          typename InVector,
          typename OutVector >
void CSRVectorProductCuda( const CSR< Real, Devices::Cuda, Index >& matrix,
void CSRVectorProductCuda( const CSR< Real, Devices::Cuda, Index, KernelType >& matrix,
                                    const InVector& inVector,
                                    OutVector& outVector,
                                    int *blocks,
                                    size_t size )
{
#ifdef HAVE_CUDA
   typedef CSR< Real, Devices::Cuda, Index > Matrix;
   typedef CSR< Real, Devices::Cuda, Index, KernelType > Matrix;
   typedef typename Matrix::IndexType IndexType;
   Matrix* kernel_this = Cuda::passToDevice( matrix );
   InVector* kernel_inVector = Cuda::passToDevice( inVector );
@@ -1096,7 +1099,7 @@ void CSRVectorProductCuda( const CSR< Real, Devices::Cuda, Index >& matrix,
      // const int threads = cudaBlockSize.x;
      if( matrix.getCudaWarpSize() == 32 ) {
         // printf("BL %d BLSIZE %d\n", (int)cudaBlocks, (int)threads);
         CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 32 >
         CSRVectorProductCudaKernel< Real, Index, KernelType, InVector, OutVector, 32 >
                                            <<< 2, 1024 >>>
                                            ( kernel_this,
                                              kernel_inVector,