Commit 331c083a authored by Tomáš Oberhuber's avatar Tomáš Oberhuber
Browse files

Implementing the Chunked Ellpack matrix in CUDA.

parent 6b27c609
Loading
Loading
Loading
Loading
+5 −1
Original line number Diff line number Diff line
@@ -52,6 +52,10 @@ class tnlCuda
#endif
   static inline int getWarpSize();

   template< typename Index >
#ifdef HAVE_CUDA
   __device__ static Index getGlobalThreadIdx( const Index gridIdx = 0 );
#endif

#ifdef HAVE_CUDA
   __host__ __device__
+2 −1
Original line number Diff line number Diff line
@@ -55,6 +55,7 @@ __device__ void reduceAligned( const Operation& operation,
   }
}


/***
 * For each thread in block with thread ID smaller then s this function reduces
 * data elements with indices tid and tid + s. This is a modified version of
@@ -143,6 +144,7 @@ __global__ void tnlCUDAReductionKernel( const Operation operation,
      sdata[ tid ] = operation. firstReductionOnDevice( tid, gid, sdata, deviceInput, deviceInput2 );
   __syncthreads();

   unsigned int n = lastTId < blockDim. x ? lastTId : blockDim. x;

   /***
    *  Perform the parallel reduction.
@@ -153,7 +155,6 @@ __global__ void tnlCUDAReductionKernel( const Operation operation,
    *  We also separate the case when the blockDim. x is power of 2 and the algorithm
    *  can be written in more efficient way without some conditions.
    */
   unsigned int n = lastTId < blockDim. x ? lastTId : blockDim. x;
   if( n == 128 || n ==  64 || n ==  32 || n ==  16 ||
       n ==   8 || n ==   4 || n ==   2 || n == 256 ||
       n == 512 )
+9 −0
Original line number Diff line number Diff line
@@ -55,6 +55,15 @@ inline int tnlCuda::getWarpSize()
   return 32;
}

#ifdef HAVE_CUDA
template< typename Index >
__device__ Index tnlCuda::getGlobalThreadIdx( const Index gridIdx )
{
   return ( gridIdx * tnlCuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
}
#endif


#ifdef HAVE_CUDA
__host__ __device__
#endif
+94 −4
Original line number Diff line number Diff line
@@ -18,11 +18,14 @@
#ifndef TNLCHUNKEDELLPACKMATRIX_IMPL_H_
#define TNLCHUNKEDELLPACKMATRIX_IMPL_H_


#include <matrices/tnlChunkedEllpackMatrix.h>
#include <core/vectors/tnlVector.h>
#include <core/mfuncs.h>

#ifdef HAVE_CUDA
#include <cuda.h>
#endif

template< typename Real,
          typename Device,
          typename Index >
@@ -179,7 +182,6 @@ bool tnlChunkedEllpackMatrix< Real, Device, Index >::setSlice( const RowLengthsV
              cerr << " maxChunkInSlice = " << maxChunkInSlice << endl );
#endif


   /****
    * Set-up the slice info.
    */
@@ -221,8 +223,41 @@ bool tnlChunkedEllpackMatrix< Real, Device, Index >::setRowLengths( const RowLen
      return false;

   this->rowPointers.setElement( 0, 0 );
   if( DeviceType::DeviceType == tnlHostDevice )
   {
      for( IndexType sliceIndex = 0; sliceIndex < numberOfSlices; sliceIndex++ )
         this->setSlice( rowLengths, sliceIndex, elementsToAllocation );
   }
   if( DeviceType::DeviceType == tnlCudaDevice )
   {
#ifdef HAVE_CUDA
      typedef tnlChunkedEllpackMatrix< Real, Device, Index > Matrix;
      Matrix* kernel_matrix = tnlCuda::passToDevice( *this );
      RowLengthsVector* kernel_rowLengths = tnlCuda::passToDevice( rowLengths );
      Index* kernel_elementsToAllocation = tnlCuda::passToDevice( elementsToAllocation );
      dim3 cudaBlockSize( 256 ),
           cudaGridSize( tnlCuda::getMaxGridSize() );
      const Index cudaBlocks = roundUpDivision( numberOfSlices, cudaBlockSize.x );
      const Index cudaGrids = roundUpDivision( cudaBlocks, tnlCuda::getMaxGridSize() );
      for( int gridIdx = 0; gridIdx < cudaGrids; gridIdx++ )
      {
         if( gridIdx == cudaGrids - 1 )
            cudaGridSize.x = cudaBlocks % tnlCuda::getMaxGridSize();
         tnlChunkedEllpackMatrix_setSlices_CudaKernel< Real, Index, 256 >
                                                     <<< cudaGridSize, cudaBlockSize, cudaBlockSize.x * sizeof( Index ) >>>
                                                     ( kernel_matrix,
                                                       kernel_rowLengths,
                                                       numberOfSlices,
                                                       kernel_elementsToAllocation,
                                                       gridIdx );
      }
      elementsToAllocation = tnlCuda::passFromDevice( *kernel_elementsToAllocation );
      tnlCuda::freeFromDevice( kernel_matrix );
      tnlCuda::freeFromDevice( kernel_rowLengths );
      tnlCuda::freeFromDevice( kernel_elementsToAllocation );
      checkCudaDevice;
#endif
   }

   this->rowPointers.computePrefixSum();

@@ -874,4 +909,59 @@ class tnlChunkedEllpackMatrixDeviceDependentCode< tnlCuda >

};

#ifdef HAVE_CUDA
template< typename Real,
          typename Index,
          int blockSize >
__global__ void tnlChunkedEllpackMatrix_setSlices_CudaKernel( tnlChunkedEllpackMatrix< Real, tnlCuda, Index >* matrix,
                                                              const typename tnlChunkedEllpackMatrix< Real, tnlCuda, Index >::RowLengthsVector* rowLengths,
                                                              const Index numberOfSlices,
                                                              Index* elementsToAllocation,
                                                              const Index gridIdx )
{
   Index* threadElementsToAllocation = getSharedMemory< Index >();
   const Index sliceIdx = tnlCuda::getGlobalThreadIdx< Index >( gridIdx );
   if( sliceIdx < numberOfSlices )
      matrix->setSlice( *rowLengths,
                        sliceIdx,
                        threadElementsToAllocation[ threadIdx.x ] );
   else
      threadElementsToAllocation[ threadIdx.x ] = 0;

   /****
    * Reduce elements to allocation from each thread to whole block
    */
   if( blockSize >= 512 )
     {
        if( threadIdx.x < 256 ) threadElementsToAllocation[ threadIdx.x ] += threadElementsToAllocation[ threadIdx.x + 256 ];
        __syncthreads();
     }
     if( blockSize >= 256 )
     {
        if( threadIdx.x < 128 ) threadElementsToAllocation[ threadIdx.x ] += threadElementsToAllocation[ threadIdx.x + 128 ];
        __syncthreads();
     }
     if( blockSize >= 128 )
     {
        if( threadIdx.x < 64 ) threadElementsToAllocation[ threadIdx.x ] += threadElementsToAllocation[ threadIdx.x + 64 ];
        __syncthreads();
     }

     /***
      * This runs in one warp so it is synchronised implicitly.
      */
     if ( threadIdx.x < 32)
     {
        if( blockSize >= 64 ) threadElementsToAllocation[ threadIdx.x ] += threadElementsToAllocation[ threadIdx.x + 32 ];
        if( blockSize >= 32 ) threadElementsToAllocation[ threadIdx.x ] += threadElementsToAllocation[ threadIdx.x + 16 ];
        if( blockSize >= 16 ) threadElementsToAllocation[ threadIdx.x ] += threadElementsToAllocation[ threadIdx.x + 8 ];
        if( blockSize >=  8 ) threadElementsToAllocation[ threadIdx.x ] += threadElementsToAllocation[ threadIdx.x + 4 ];
        if( blockSize >=  4 ) threadElementsToAllocation[ threadIdx.x ] += threadElementsToAllocation[ threadIdx.x + 2 ];
        if( blockSize >=  2 ) threadElementsToAllocation[ threadIdx.x ] += threadElementsToAllocation[ threadIdx.x + 1 ];
     }
     atomicAdd( elementsToAllocation, threadElementsToAllocation[ 0 ] );
}

#endif

#endif /* TNLCHUNKEDELLPACKMATRIX_IMPL_H_ */
+22 −0
Original line number Diff line number Diff line
@@ -25,6 +25,22 @@ template< typename Device >
class tnlChunkedEllpackMatrixDeviceDependentCode;

template< typename Real, typename Device = tnlHost, typename Index = int >
class tnlChunkedEllpackMatrix;

#ifdef HAVE_CUDA
template< typename Real,
          typename Index,
          int blockSize >
__global__ void tnlChunkedEllpackMatrix_setSlices_CudaKernel( tnlChunkedEllpackMatrix< Real, tnlCuda, Index >* matrix,
                                                              const typename tnlChunkedEllpackMatrix< Real, tnlCuda, Index >::RowLengthsVector* rowLengths,
                                                              const Index numberOfSlices,
                                                              Index* elementsToAllocation,
                                                              const Index gridIdx );
#endif



template< typename Real, typename Device, typename Index >
class tnlChunkedEllpackMatrix : public tnlSparseMatrix< Real, Device, Index >
{
   public:
@@ -209,7 +225,13 @@ class tnlChunkedEllpackMatrix : public tnlSparseMatrix< Real, Device, Index >

   typedef tnlChunkedEllpackMatrixDeviceDependentCode< DeviceType > DeviceDependentCode;
   friend class tnlChunkedEllpackMatrixDeviceDependentCode< DeviceType >;

#ifdef HAVE_CUDA
   friend void tnlChunkedEllpackMatrix_setSlices_CudaKernel< Real, Index, 256 >( tnlChunkedEllpackMatrix< Real, tnlCuda, Index >* matrix,
                                                                                 const RowLengthsVector* rowLengths,
                                                                                 const Index numberOfSlices,
                                                                                 Index* elementsToAllocation,
                                                                                 const Index gridIdx );
#endif


Loading