Commit d13fa1f1 authored by Jakub Klinkovský's avatar Jakub Klinkovský
Browse files

Changed CudaReductionKernel to use static instead of dynamic shared memory

Fixes #78
parent ac818bfa
Loading
Loading
Loading
Loading
+35 −38
Original line number Diff line number Diff line
@@ -15,7 +15,6 @@
#include <TNL/Assert.h>
#include <TNL/Math.h>
#include <TNL/Cuda/DeviceInfo.h>
#include <TNL/Cuda/SharedMemory.h>
#include <TNL/Algorithms/CudaReductionBuffer.h>
#include <TNL/Algorithms/MultiDeviceMemoryOperations.h>
#include <TNL/Exceptions/CudaSupportMissing.h>
@@ -54,7 +53,11 @@ CudaReductionKernel( const Result zero,
                     const Index end,
                     Result* output )
{
   Result* sdata = Cuda::getSharedMemory< Result >();
   TNL_ASSERT_EQ( blockDim.x, blockSize, "unexpected block size in CudaReductionKernel" );
   // when there is only one warp per blockSize.x, we need to allocate two warps
   // worth of shared memory so that we don't index shared memory out of bounds
   constexpr int shmemElements = (blockSize <= 32) ? 2 * blockSize : blockSize;
   __shared__ Result sdata[shmemElements];

   // Get the thread id (tid), global thread id (gid) and gridSize.
   const Index tid = threadIdx.x;
@@ -150,8 +153,12 @@ CudaReductionWithArgumentKernel( const Result zero,
                                 Index* idxOutput,
                                 const Index* idxInput = nullptr )
{
   Result* sdata = Cuda::getSharedMemory< Result >();
   Index* sidx = reinterpret_cast< Index* >( &sdata[ blockDim.x ] );
   TNL_ASSERT_EQ( blockDim.x, blockSize, "unexpected block size in CudaReductionWithArgumentKernel" );
   // when there is only one warp per blockSize.x, we need to allocate two warps
   // worth of shared memory so that we don't index shared memory out of bounds
   constexpr int shmemElements = (blockSize <= 32) ? 2 * blockSize : blockSize;
   __shared__ Result sdata[shmemElements];
   __shared__ Index sidx[shmemElements];

   // Get the thread id (tid), global thread id (gid) and gridSize.
   const Index tid = threadIdx.x;
@@ -409,12 +416,6 @@ struct CudaReductionKernelLauncher
         blockSize.x = Reduction_maxThreadsPerBlock;
         gridSize.x = TNL::min( Cuda::getNumberOfBlocks( size, blockSize.x ), desGridSize );

         // when there is only one warp per blockSize.x, we need to allocate two warps
         // worth of shared memory so that we don't index shared memory out of bounds
         const Index shmem = (blockSize.x <= 32)
                  ? 2 * blockSize.x * sizeof( Result )
                  : blockSize.x * sizeof( Result );

         // This is "general", but this method always sets blockSize.x to a specific value,
         // so runtime switch is not necessary - it only prolongs the compilation time.
/*
@@ -423,55 +424,55 @@ struct CudaReductionKernelLauncher
         {
            case 512:
               CudaReductionKernel< 512 >
               <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output);
               <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output);
               break;
            case 256:
               cudaFuncSetCacheConfig(CudaReductionKernel< 256, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);

               CudaReductionKernel< 256 >
               <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output);
               <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output);
               break;
            case 128:
               cudaFuncSetCacheConfig(CudaReductionKernel< 128, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);

               CudaReductionKernel< 128 >
               <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output);
               <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output);
               break;
            case  64:
               cudaFuncSetCacheConfig(CudaReductionKernel<  64, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);

               CudaReductionKernel<  64 >
               <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output);
               <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output);
               break;
            case  32:
               cudaFuncSetCacheConfig(CudaReductionKernel<  32, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);

               CudaReductionKernel<  32 >
               <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output);
               <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output);
               break;
            case  16:
               cudaFuncSetCacheConfig(CudaReductionKernel<  16, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);

               CudaReductionKernel<  16 >
               <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output);
               <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output);
               break;
            case   8:
               cudaFuncSetCacheConfig(CudaReductionKernel<   8, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);

               CudaReductionKernel<   8 >
               <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output);
               <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output);
               break;
            case   4:
               cudaFuncSetCacheConfig(CudaReductionKernel<   4, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);

               CudaReductionKernel<   4 >
               <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output);
               <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output);
               break;
            case   2:
               cudaFuncSetCacheConfig(CudaReductionKernel<   2, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);

               CudaReductionKernel<   2 >
               <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output);
               <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output);
               break;
            case   1:
               TNL_ASSERT( false, std::cerr << "blockSize should not be 1." << std::endl );
@@ -486,8 +487,9 @@ struct CudaReductionKernelLauncher
         if( blockSize.x == Reduction_maxThreadsPerBlock ) {
            cudaFuncSetCacheConfig(CudaReductionKernel< Reduction_maxThreadsPerBlock, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);

            // shared memory is allocated statically inside the kernel
            CudaReductionKernel< Reduction_maxThreadsPerBlock >
            <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, begin, end, output);
            <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, begin, end, output);
            cudaStreamSynchronize(0);
            TNL_CHECK_CUDA_DEVICE;
         }
@@ -519,12 +521,6 @@ struct CudaReductionKernelLauncher
         blockSize.x = Reduction_maxThreadsPerBlock;
         gridSize.x = TNL::min( Cuda::getNumberOfBlocks( size, blockSize.x ), desGridSize );

         // when there is only one warp per blockSize.x, we need to allocate two warps
         // worth of shared memory so that we don't index shared memory out of bounds
         const Index shmem = (blockSize.x <= 32)
                  ? 2 * blockSize.x * ( sizeof( Result ) + sizeof( Index ) )
                  : blockSize.x * ( sizeof( Result ) + sizeof( Index ) );

         // This is "general", but this method always sets blockSize.x to a specific value,
         // so runtime switch is not necessary - it only prolongs the compilation time.
/*
@@ -533,55 +529,55 @@ struct CudaReductionKernelLauncher
         {
            case 512:
               CudaReductionWithArgumentKernel< 512 >
               <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput );
               <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput );
               break;
            case 256:
               cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 256, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);

               CudaReductionWithArgumentKernel< 256 >
               <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput );
               <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput );
               break;
            case 128:
               cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 128, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);

               CudaReductionWithArgumentKernel< 128 >
               <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput );
               <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput );
               break;
            case  64:
               cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel<  64, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);

               CudaReductionWithArgumentKernel<  64 >
               <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput );
               <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput );
               break;
            case  32:
               cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel<  32, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);

               CudaReductionWithArgumentKernel<  32 >
               <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput );
               <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput );
               break;
            case  16:
               cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel<  16, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);

               CudaReductionWithArgumentKernel<  16 >
               <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput );
               <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput );
               break;
            case   8:
               cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel<   8, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);

               CudaReductionWithArgumentKernel<   8 >
               <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput );
               <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput );
               break;
            case   4:
               cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel<   4, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);

               CudaReductionWithArgumentKernel<   4 >
               <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput );
               <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput );
               break;
            case   2:
               cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel<   2, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);

               CudaReductionWithArgumentKernel<   2 >
               <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput );
               <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput );
               break;
            case   1:
               TNL_ASSERT( false, std::cerr << "blockSize should not be 1." << std::endl );
@@ -596,8 +592,9 @@ struct CudaReductionKernelLauncher
         if( blockSize.x == Reduction_maxThreadsPerBlock ) {
            cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< Reduction_maxThreadsPerBlock, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);

            // shared memory is allocated statically inside the kernel
            CudaReductionWithArgumentKernel< Reduction_maxThreadsPerBlock >
            <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, begin, end, output, idxOutput, idxInput );
            <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, begin, end, output, idxOutput, idxInput );
            cudaStreamSynchronize(0);
            TNL_CHECK_CUDA_DEVICE;
         }