Commit 57d66051 authored by Jakub Klinkovský
Browse files

CudaScanKernelLauncher: configuration of blockSize depending on the ValueType

parent 8accbc52
Loading
Loading
Loading
Loading
+7 −2
Original line number Diff line number Diff line
@@ -436,7 +436,10 @@ CudaScanKernelUniformShift( OutputView output,
 */
template< ScanType scanType,
          ScanPhaseType phaseType,
          int blockSize = 256,
          typename ValueType,
          // use blockSize=256 for 32-bit value types and scale it inversely with
          // sizeof(ValueType) to keep shared memory requirements constant
          int blockSize = 256 * 4 / sizeof(ValueType),
          // valuesPerThread should be odd to avoid shared memory bank conflicts
          int valuesPerThread = 7 >
struct CudaScanKernelLauncher
@@ -520,6 +523,7 @@ struct CudaScanKernelLauncher
                      Reduction&& reduction,
                      typename OutputArray::ValueType zero )
   {
      static_assert( std::is_same< ValueType, typename OutputArray::ValueType >::value, "invalid configuration of ValueType" );
      using Index = typename InputArray::IndexType;

      if( end - begin <= blockSize * valuesPerThread ) {
@@ -639,7 +643,7 @@ struct CudaScanKernelLauncher

         // blockResults now contains scan results for each block. The first phase
         // ends by computing an exclusive scan of this array.
         CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInSecondPhase >::perform(
         CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInSecondPhase, ValueType >::perform(
            blockResults,
            blockResults,
            0,
@@ -689,6 +693,7 @@ struct CudaScanKernelLauncher
                       typename OutputArray::ValueType zero,
                       typename OutputArray::ValueType shift )
   {
      static_assert( std::is_same< ValueType, typename OutputArray::ValueType >::value, "invalid configuration of ValueType" );
      using Index = typename InputArray::IndexType;

      // if the input was already scanned with just one block in the first phase,
+3 −3
Original line number Diff line number Diff line
@@ -278,7 +278,7 @@ perform( const InputArray& input,
   if( end <= begin )
      return;

   detail::CudaScanKernelLauncher< Type, PhaseType >::perform(
   detail::CudaScanKernelLauncher< Type, PhaseType, typename OutputArray::ValueType >::perform(
      input,
      output,
      begin,
@@ -312,7 +312,7 @@ performFirstPhase( const InputArray& input,
      return block_results;
   }

   return detail::CudaScanKernelLauncher< Type, PhaseType >::performFirstPhase(
   return detail::CudaScanKernelLauncher< Type, PhaseType, typename OutputArray::ValueType >::performFirstPhase(
      input,
      output,
      begin,
@@ -346,7 +346,7 @@ performSecondPhase( const InputArray& input,
   if( end <= begin )
      return;

   detail::CudaScanKernelLauncher< Type, PhaseType >::performSecondPhase(
   detail::CudaScanKernelLauncher< Type, PhaseType, typename OutputArray::ValueType >::performSecondPhase(
      input,
      output,
      blockShifts,
+10 −10
Original line number Diff line number Diff line
@@ -87,14 +87,14 @@ protected:
#ifdef HAVE_CUDA
      if( std::is_same< DeviceType, Devices::Cuda >::value )
      {
         CudaScanKernelLauncher< ScanType::Inclusive, ScanPhaseType::WriteInFirstPhase >::resetMaxGridSize();
         CudaScanKernelLauncher< ScanType::Inclusive, ScanPhaseType::WriteInFirstPhase >::maxGridSize() = 3;
         CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInFirstPhase >::resetMaxGridSize();
         CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInFirstPhase >::maxGridSize() = 3;
         CudaScanKernelLauncher< ScanType::Inclusive, ScanPhaseType::WriteInSecondPhase >::resetMaxGridSize();
         CudaScanKernelLauncher< ScanType::Inclusive, ScanPhaseType::WriteInSecondPhase >::maxGridSize() = 3;
         CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInSecondPhase >::resetMaxGridSize();
         CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInSecondPhase >::maxGridSize() = 3;
         CudaScanKernelLauncher< ScanType::Inclusive, ScanPhaseType::WriteInFirstPhase, ValueType >::resetMaxGridSize();
         CudaScanKernelLauncher< ScanType::Inclusive, ScanPhaseType::WriteInFirstPhase, ValueType >::maxGridSize() = 3;
         CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInFirstPhase, ValueType >::resetMaxGridSize();
         CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInFirstPhase, ValueType >::maxGridSize() = 3;
         CudaScanKernelLauncher< ScanType::Inclusive, ScanPhaseType::WriteInSecondPhase, ValueType >::resetMaxGridSize();
         CudaScanKernelLauncher< ScanType::Inclusive, ScanPhaseType::WriteInSecondPhase, ValueType >::maxGridSize() = 3;
         CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInSecondPhase, ValueType >::resetMaxGridSize();
         CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInSecondPhase, ValueType >::maxGridSize() = 3;
      }
#endif
   }
@@ -106,8 +106,8 @@ protected:
      // skip the check for too small arrays
      if( check_cuda_grids && array.getLocalRange().getSize() > 256 ) {
         // we don't care which kernel launcher was actually used
         const auto gridsCount = TNL::max( CudaScanKernelLauncher< ScanType, ScanPhaseType::WriteInFirstPhase >::gridsCount(),
                                           CudaScanKernelLauncher< ScanType, ScanPhaseType::WriteInSecondPhase >::gridsCount() );
         const auto gridsCount = TNL::max( CudaScanKernelLauncher< ScanType, ScanPhaseType::WriteInFirstPhase, ValueType >::gridsCount(),
                                           CudaScanKernelLauncher< ScanType, ScanPhaseType::WriteInSecondPhase, ValueType >::gridsCount() );
         EXPECT_GT( gridsCount, 1 );
      }
#endif
+10 −10
Original line number Diff line number Diff line
@@ -59,14 +59,14 @@ protected:
#ifdef HAVE_CUDA
      if( std::is_same< DeviceType, Devices::Cuda >::value )
      {
         CudaScanKernelLauncher< ScanType::Inclusive, ScanPhaseType::WriteInFirstPhase >::resetMaxGridSize();
         CudaScanKernelLauncher< ScanType::Inclusive, ScanPhaseType::WriteInFirstPhase >::maxGridSize() = 3;
         CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInFirstPhase >::resetMaxGridSize();
         CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInFirstPhase >::maxGridSize() = 3;
         CudaScanKernelLauncher< ScanType::Inclusive, ScanPhaseType::WriteInSecondPhase >::resetMaxGridSize();
         CudaScanKernelLauncher< ScanType::Inclusive, ScanPhaseType::WriteInSecondPhase >::maxGridSize() = 3;
         CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInSecondPhase >::resetMaxGridSize();
         CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInSecondPhase >::maxGridSize() = 3;
         CudaScanKernelLauncher< ScanType::Inclusive, ScanPhaseType::WriteInFirstPhase, ValueType >::resetMaxGridSize();
         CudaScanKernelLauncher< ScanType::Inclusive, ScanPhaseType::WriteInFirstPhase, ValueType >::maxGridSize() = 3;
         CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInFirstPhase, ValueType >::resetMaxGridSize();
         CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInFirstPhase, ValueType >::maxGridSize() = 3;
         CudaScanKernelLauncher< ScanType::Inclusive, ScanPhaseType::WriteInSecondPhase, ValueType >::resetMaxGridSize();
         CudaScanKernelLauncher< ScanType::Inclusive, ScanPhaseType::WriteInSecondPhase, ValueType >::maxGridSize() = 3;
         CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInSecondPhase, ValueType >::resetMaxGridSize();
         CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInSecondPhase, ValueType >::maxGridSize() = 3;
      }
#endif
   }
@@ -78,8 +78,8 @@ protected:
      // skip the check for too small arrays
      if( array.getSize() > 256 ) {
         // we don't care which kernel launcher was actually used
         const auto gridsCount = TNL::max( CudaScanKernelLauncher< ScanType, ScanPhaseType::WriteInFirstPhase >::gridsCount(),
                                           CudaScanKernelLauncher< ScanType, ScanPhaseType::WriteInSecondPhase >::gridsCount() );
         const auto gridsCount = TNL::max( CudaScanKernelLauncher< ScanType, ScanPhaseType::WriteInFirstPhase, ValueType >::gridsCount(),
                                           CudaScanKernelLauncher< ScanType, ScanPhaseType::WriteInSecondPhase, ValueType >::gridsCount() );
         EXPECT_GT( gridsCount, 1 );
      }
#endif