Commit 909fef43 authored by Jakub Klinkovský's avatar Jakub Klinkovský
Browse files

Merge branch 'parallel-reduction' into develop

* parallel-reduction:
  Fixed ArrayOperations::compareMemory for MIC
  Fixed parallel reduction for MIC
  General parallel reduction on host
  Refactoring parallel reduction operations
parents 65b6705b c36595bd
Loading
Loading
Loading
Loading
+3 −3
Original line number Diff line number Diff line
@@ -18,7 +18,7 @@
#include <TNL/Exceptions/CudaBadAlloc.h>
#include <TNL/Containers/Algorithms/ArrayOperations.h>
#include <TNL/Containers/Algorithms/Reduction.h>
#include <TNL/Containers/Algorithms/reduction-operations.h>
#include <TNL/Containers/Algorithms/ReductionOperations.h>

namespace TNL {
namespace Containers {   
@@ -202,8 +202,8 @@ compareMemory( const Element1* destination,
   TNL_ASSERT_TRUE( source, "Attempted to compare data through a nullptr." );
   //TODO: The parallel reduction on the CUDA device with different element types is needed.
   bool result = false;
   Algorithms::tnlParallelReductionEqualities< Element1, Index > reductionEqualities;
   reductionOnCudaDevice( reductionEqualities, size, destination, source, result );
   Algorithms::ParallelReductionEqualities< Element1, Element2 > reductionEqualities;
   Reduction< Devices::Cuda >::reduce( reductionEqualities, size, destination, source, result );
   return result;
}

+3 −3
Original line number Diff line number Diff line
@@ -20,7 +20,7 @@
#include <TNL/Exceptions/MICBadAlloc.h>
#include <TNL/Containers/Algorithms/ArrayOperations.h>
#include <TNL/Containers/Algorithms/Reduction.h>
#include <TNL/Containers/Algorithms/reduction-operations.h>
#include <TNL/Containers/Algorithms/ReductionOperations.h>

namespace TNL {
namespace Containers {
@@ -191,9 +191,9 @@ compareMemory( const Element1* destination,
   }
   else
   {
      Devices::MICHider<const Element1> src_ptr;
      Devices::MICHider<const Element2> src_ptr;
      src_ptr.pointer=source;
      Devices::MICHider<const Element2> dst_ptr;
      Devices::MICHider<const Element1> dst_ptr;
      dst_ptr.pointer=destination;
      bool ret=false;
      #pragma offload target(mic) in(src_ptr,dst_ptr,size) out(ret)
+1 −1
Original line number Diff line number Diff line
@@ -6,7 +6,6 @@ set( headers ArrayOperations.h
             ArrayOperationsMIC_impl.h
             cuda-prefix-sum.h
             cuda-prefix-sum_impl.h
             reduction-operations.h
             CublasWrapper.h
             CudaMultireductionKernel.h
             CudaReductionBuffer.h
@@ -15,6 +14,7 @@ set( headers ArrayOperations.h
             Multireduction_impl.h
             Reduction.h
             Reduction_impl.h
             ReductionOperations.h
             VectorOperations.h
             VectorOperationsHost_impl.h
             VectorOperationsCuda_impl.h
+49 −51
Original line number Diff line number Diff line
@@ -41,18 +41,18 @@ static constexpr int Multireduction_registersPerThread = 38; // empirically de
   static constexpr int Multireduction_minBlocksPerMultiprocessor = 4;
#endif

template< typename Operation, int blockSizeX >      
template< int blockSizeX, typename Operation, typename Index >
__global__ void
__launch_bounds__( Multireduction_maxThreadsPerBlock, Multireduction_minBlocksPerMultiprocessor )
CudaMultireductionKernel( Operation operation,
                          const typename Operation::IndexType n,
                          const typename Operation::IndexType size,
                          const typename Operation::RealType* input1,
                          const typename Operation::IndexType ldInput1,
                          const typename Operation::RealType* input2,
                          const int n,
                          const Index size,
                          const typename Operation::DataType1* input1,
                          const Index ldInput1,
                          const typename Operation::DataType2* input2,
                          typename Operation::ResultType* output )
{
   typedef typename Operation::IndexType IndexType;
   typedef Index IndexType;
   typedef typename Operation::ResultType ResultType;

   ResultType* sdata = Devices::Cuda::getSharedMemory< ResultType >();
@@ -84,21 +84,21 @@ CudaMultireductionKernel( Operation operation,
   sdata[ tid ] = operation.initialValue();
   while( gid + 4 * gridSizeX < size )
   {
      operation.cudaFirstReduction( sdata[ tid ], gid,                 input1, input2 );
      operation.cudaFirstReduction( sdata[ tid ], gid + gridSizeX,     input1, input2 );
      operation.cudaFirstReduction( sdata[ tid ], gid + 2 * gridSizeX, input1, input2 );
      operation.cudaFirstReduction( sdata[ tid ], gid + 3 * gridSizeX, input1, input2 );
      operation.firstReduction( sdata[ tid ], gid,                 input1, input2 );
      operation.firstReduction( sdata[ tid ], gid + gridSizeX,     input1, input2 );
      operation.firstReduction( sdata[ tid ], gid + 2 * gridSizeX, input1, input2 );
      operation.firstReduction( sdata[ tid ], gid + 3 * gridSizeX, input1, input2 );
      gid += 4 * gridSizeX;
   }
   while( gid + 2 * gridSizeX < size )
   {
      operation.cudaFirstReduction( sdata[ tid ], gid,                 input1, input2 );
      operation.cudaFirstReduction( sdata[ tid ], gid + gridSizeX,     input1, input2 );
      operation.firstReduction( sdata[ tid ], gid,                 input1, input2 );
      operation.firstReduction( sdata[ tid ], gid + gridSizeX,     input1, input2 );
      gid += 2 * gridSizeX;
   }
   while( gid < size )
   {
      operation.cudaFirstReduction( sdata[ tid ], gid,                 input1, input2 );
      operation.firstReduction( sdata[ tid ], gid,                 input1, input2 );
      gid += gridSizeX;
   }
   __syncthreads();
@@ -111,25 +111,25 @@ CudaMultireductionKernel( Operation operation,
    */
   if( blockSizeX >= 1024 ) {
      if( threadIdx.x < 512 ) {
         operation.commonReductionOnDevice( sdata[ tid ], sdata[ tid + 512 ] );
         operation.commonReduction( sdata[ tid ], sdata[ tid + 512 ] );
      }
      __syncthreads();
   }
   if( blockSizeX >= 512 ) {
      if( threadIdx.x < 256 ) {
         operation.commonReductionOnDevice( sdata[ tid ], sdata[ tid + 256 ] );
         operation.commonReduction( sdata[ tid ], sdata[ tid + 256 ] );
      }
      __syncthreads();
   }
   if( blockSizeX >= 256 ) {
      if( threadIdx.x < 128 ) {
         operation.commonReductionOnDevice( sdata[ tid ], sdata[ tid + 128 ] );
         operation.commonReduction( sdata[ tid ], sdata[ tid + 128 ] );
      }
      __syncthreads();
   }
   if( blockSizeX >= 128 ) {
      if( threadIdx.x <  64 ) {
         operation.commonReductionOnDevice( sdata[ tid ], sdata[ tid + 64 ] );
         operation.commonReduction( sdata[ tid ], sdata[ tid + 64 ] );
      }
      __syncthreads();
   }
@@ -144,22 +144,22 @@ CudaMultireductionKernel( Operation operation,
   if( threadIdx.x < 32 ) {
      volatile ResultType* vsdata = sdata;
      if( blockSizeX >= 64 ) {
         operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 32 ] );
         operation.commonReduction( vsdata[ tid ], vsdata[ tid + 32 ] );
      }
      if( blockSizeX >= 32 ) {
         operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 16 ] );
         operation.commonReduction( vsdata[ tid ], vsdata[ tid + 16 ] );
      }
      if( blockSizeX >= 16 ) {
         operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 8 ] );
         operation.commonReduction( vsdata[ tid ], vsdata[ tid + 8 ] );
      }
      if( blockSizeX >=  8 ) {
         operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 4 ] );
         operation.commonReduction( vsdata[ tid ], vsdata[ tid + 4 ] );
      }
      if( blockSizeX >=  4 ) {
         operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 2 ] );
         operation.commonReduction( vsdata[ tid ], vsdata[ tid + 2 ] );
      }
      if( blockSizeX >=  2 ) {
         operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 1 ] );
         operation.commonReduction( vsdata[ tid ], vsdata[ tid + 1 ] );
      }
   }

@@ -171,18 +171,16 @@ CudaMultireductionKernel( Operation operation,
   }
}

template< typename Operation >
typename Operation::IndexType
template< typename Operation, typename Index >
int
CudaMultireductionKernelLauncher( Operation& operation,
                                  int n,
                                  const typename Operation::IndexType size,
                                  const typename Operation::RealType* input1,
                                  const typename Operation::IndexType ldInput1,
                                  const typename Operation::RealType* input2,
                                  const int n,
                                  const Index size,
                                  const typename Operation::DataType1* input1,
                                  const Index ldInput1,
                                  const typename Operation::DataType2* input2,
                                  typename Operation::ResultType*& output )
{
   typedef typename Operation::IndexType IndexType;
   typedef typename Operation::RealType RealType;
   typedef typename Operation::ResultType ResultType;

   // The number of blocks should be a multiple of the number of multiprocessors
@@ -240,7 +238,7 @@ CudaMultireductionKernelLauncher( Operation& operation,

   // when there is only one warp per blockSize.x, we need to allocate two warps
   // worth of shared memory so that we don't index shared memory out of bounds
   const IndexType shmem = (blockSize.x <= 32)
   const Index shmem = (blockSize.x <= 32)
            ? 2 * blockSize.x * blockSize.y * sizeof( ResultType )
            : blockSize.x * blockSize.y * sizeof( ResultType );

@@ -252,55 +250,55 @@ CudaMultireductionKernelLauncher( Operation& operation,
   switch( blockSize.x )
   {
      case 512:
         CudaMultireductionKernel< Operation, 512 >
         CudaMultireductionKernel< 512 >
         <<< gridSize, blockSize, shmem >>>( operation, n, size, input1, ldInput1, input2, output);
         break;
      case 256:
         cudaFuncSetCacheConfig(CudaMultireductionKernel< Operation, 256 >, cudaFuncCachePreferShared);
         cudaFuncSetCacheConfig(CudaMultireductionKernel< 256, Operation, Index >, cudaFuncCachePreferShared);

         CudaMultireductionKernel< Operation, 256 >
         CudaMultireductionKernel< 256 >
         <<< gridSize, blockSize, shmem >>>( operation, n, size, input1, ldInput1, input2, output);
         break;
      case 128:
         cudaFuncSetCacheConfig(CudaMultireductionKernel< Operation, 128 >, cudaFuncCachePreferShared);
         cudaFuncSetCacheConfig(CudaMultireductionKernel< 128, Operation, Index >, cudaFuncCachePreferShared);

         CudaMultireductionKernel< Operation, 128 >
         CudaMultireductionKernel< 128 >
         <<< gridSize, blockSize, shmem >>>( operation, n, size, input1, ldInput1, input2, output);
         break;
      case  64:
         cudaFuncSetCacheConfig(CudaMultireductionKernel< Operation,  64 >, cudaFuncCachePreferShared);
         cudaFuncSetCacheConfig(CudaMultireductionKernel<  64, Operation, Index >, cudaFuncCachePreferShared);

         CudaMultireductionKernel< Operation,  64 >
         CudaMultireductionKernel<  64 >
         <<< gridSize, blockSize, shmem >>>( operation, n, size, input1, ldInput1, input2, output);
         break;
      case  32:
         cudaFuncSetCacheConfig(CudaMultireductionKernel< Operation,  32 >, cudaFuncCachePreferShared);
         cudaFuncSetCacheConfig(CudaMultireductionKernel<  32, Operation, Index >, cudaFuncCachePreferShared);

         CudaMultireductionKernel< Operation,  32 >
         CudaMultireductionKernel<  32 >
         <<< gridSize, blockSize, shmem >>>( operation, n, size, input1, ldInput1, input2, output);
         break;
      case  16:
         cudaFuncSetCacheConfig(CudaMultireductionKernel< Operation,  16 >, cudaFuncCachePreferShared);
         cudaFuncSetCacheConfig(CudaMultireductionKernel<  16, Operation, Index >, cudaFuncCachePreferShared);

         CudaMultireductionKernel< Operation,  16 >
         CudaMultireductionKernel<  16 >
         <<< gridSize, blockSize, shmem >>>( operation, n, size, input1, ldInput1, input2, output);
         break;
     case   8:
         cudaFuncSetCacheConfig(CudaMultireductionKernel< Operation,   8 >, cudaFuncCachePreferShared);
         cudaFuncSetCacheConfig(CudaMultireductionKernel<   8, Operation, Index >, cudaFuncCachePreferShared);

         CudaMultireductionKernel< Operation,   8 >
         CudaMultireductionKernel<   8 >
         <<< gridSize, blockSize, shmem >>>( operation, n, size, input1, ldInput1, input2, output);
         break;
      case   4:
         cudaFuncSetCacheConfig(CudaMultireductionKernel< Operation,   4 >, cudaFuncCachePreferShared);
         cudaFuncSetCacheConfig(CudaMultireductionKernel<   4, Operation, Index >, cudaFuncCachePreferShared);

         CudaMultireductionKernel< Operation,   4 >
         CudaMultireductionKernel<   4 >
        <<< gridSize, blockSize, shmem >>>( operation,  n,size, input1, ldInput1, input2, output);
        break;
      case   2:
         cudaFuncSetCacheConfig(CudaMultireductionKernel< Operation,   2 >, cudaFuncCachePreferShared);
         cudaFuncSetCacheConfig(CudaMultireductionKernel<   2, Operation, Index >, cudaFuncCachePreferShared);

         CudaMultireductionKernel< Operation,   2 >
         CudaMultireductionKernel<   2 >
         <<< gridSize, blockSize, shmem >>>( operation, n, size, input1, ldInput1, input2, output);
         break;
      case   1:
+45 −46
Original line number Diff line number Diff line
@@ -39,16 +39,16 @@ static constexpr int Reduction_registersPerThread = 32; // empirically determi
   static constexpr int Reduction_minBlocksPerMultiprocessor = 4;
#endif

template< typename Operation, int blockSize >
template< int blockSize, typename Operation, typename Index >
__global__ void
__launch_bounds__( Reduction_maxThreadsPerBlock, Reduction_minBlocksPerMultiprocessor )
CudaReductionKernel( Operation operation,
                     const typename Operation::IndexType size,
                     const typename Operation::RealType* input1,
                     const typename Operation::RealType* input2,
                     const Index size,
                     const typename Operation::DataType1* input1,
                     const typename Operation::DataType2* input2,
                     typename Operation::ResultType* output )
{
   typedef typename Operation::IndexType IndexType;
   typedef Index IndexType;
   typedef typename Operation::ResultType ResultType;

   ResultType* sdata = Devices::Cuda::getSharedMemory< ResultType >();
@@ -69,21 +69,21 @@ CudaReductionKernel( Operation operation,
    */
   while( gid + 4 * gridSize < size )
   {
      operation.cudaFirstReduction( sdata[ tid ], gid,                input1, input2 );
      operation.cudaFirstReduction( sdata[ tid ], gid + gridSize,     input1, input2 );
      operation.cudaFirstReduction( sdata[ tid ], gid + 2 * gridSize, input1, input2 );
      operation.cudaFirstReduction( sdata[ tid ], gid + 3 * gridSize, input1, input2 );
      operation.firstReduction( sdata[ tid ], gid,                input1, input2 );
      operation.firstReduction( sdata[ tid ], gid + gridSize,     input1, input2 );
      operation.firstReduction( sdata[ tid ], gid + 2 * gridSize, input1, input2 );
      operation.firstReduction( sdata[ tid ], gid + 3 * gridSize, input1, input2 );
      gid += 4 * gridSize;
   }
   while( gid + 2 * gridSize < size )
   {
      operation.cudaFirstReduction( sdata[ tid ], gid,                input1, input2 );
      operation.cudaFirstReduction( sdata[ tid ], gid + gridSize,     input1, input2 );
      operation.firstReduction( sdata[ tid ], gid,                input1, input2 );
      operation.firstReduction( sdata[ tid ], gid + gridSize,     input1, input2 );
      gid += 2 * gridSize;
   }
   while( gid < size )
   {
      operation.cudaFirstReduction( sdata[ tid ], gid,                input1, input2 );
      operation.firstReduction( sdata[ tid ], gid,                input1, input2 );
      gid += gridSize;
   }
   __syncthreads();
@@ -98,19 +98,19 @@ CudaReductionKernel( Operation operation,
   if( blockSize >= 1024 )
   {
      if( tid < 512 )
         operation.commonReductionOnDevice( sdata[ tid ], sdata[ tid + 512 ] );
         operation.commonReduction( sdata[ tid ], sdata[ tid + 512 ] );
      __syncthreads();
   }
   if( blockSize >= 512 )
   {
      if( tid < 256 )
         operation.commonReductionOnDevice( sdata[ tid ], sdata[ tid + 256 ] );
         operation.commonReduction( sdata[ tid ], sdata[ tid + 256 ] );
      __syncthreads();
   }
   if( blockSize >= 256 )
   {
      if( tid < 128 )
         operation.commonReductionOnDevice( sdata[ tid ], sdata[ tid + 128 ] );
         operation.commonReduction( sdata[ tid ], sdata[ tid + 128 ] );
      __syncthreads();
      //printf( "2: tid %d data %f \n", tid, sdata[ tid ] );
   }
@@ -118,7 +118,7 @@ CudaReductionKernel( Operation operation,
   if( blockSize >= 128 )
   {
      if( tid <  64 )
         operation.commonReductionOnDevice( sdata[ tid ], sdata[ tid + 64 ] );
         operation.commonReduction( sdata[ tid ], sdata[ tid + 64 ] );
      __syncthreads();
      //printf( "3: tid %d data %f \n", tid, sdata[ tid ] );
   }
@@ -132,34 +132,34 @@ CudaReductionKernel( Operation operation,
      volatile ResultType* vsdata = sdata;
      if( blockSize >= 64 )
      {
         operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 32 ] );
         operation.commonReduction( vsdata[ tid ], vsdata[ tid + 32 ] );
         //printf( "4: tid %d data %f \n", tid, sdata[ tid ] );
      }
      // TODO: If blocksize == 32, the following does not work
      // We do not check if tid < 16. Fix it!!!
      if( blockSize >= 32 )
      {
         operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 16 ] );
         operation.commonReduction( vsdata[ tid ], vsdata[ tid + 16 ] );
         //printf( "5: tid %d data %f \n", tid, sdata[ tid ] );
      }
      if( blockSize >= 16 )
      {
         operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 8 ] );
         operation.commonReduction( vsdata[ tid ], vsdata[ tid + 8 ] );
         //printf( "6: tid %d data %f \n", tid, sdata[ tid ] );
      }
      if( blockSize >=  8 )
      {
         operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 4 ] );
         operation.commonReduction( vsdata[ tid ], vsdata[ tid + 4 ] );
         //printf( "7: tid %d data %f \n", tid, sdata[ tid ] );
      }
      if( blockSize >=  4 )
      {
         operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 2 ] );
         operation.commonReduction( vsdata[ tid ], vsdata[ tid + 2 ] );
         //printf( "8: tid %d data %f \n", tid, sdata[ tid ] );
      }
      if( blockSize >=  2 )
      {
         operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 1 ] );
         operation.commonReduction( vsdata[ tid ], vsdata[ tid + 1 ] );
         //printf( "9: tid %d data %f \n", tid, sdata[ tid ] );
      }
   }
@@ -175,16 +175,15 @@ CudaReductionKernel( Operation operation,

}

template< typename Operation >
typename Operation::IndexType
template< typename Operation, typename Index >
int
CudaReductionKernelLauncher( Operation& operation,
                             const typename Operation::IndexType size,
                             const typename Operation::RealType* input1,
                             const typename Operation::RealType* input2,
                             const Index size,
                             const typename Operation::DataType1* input1,
                             const typename Operation::DataType2* input2,
                             typename Operation::ResultType*& output )
{
   typedef typename Operation::IndexType IndexType;
   typedef typename Operation::RealType RealType;
   typedef Index IndexType;
   typedef typename Operation::ResultType ResultType;

   // The number of blocks should be a multiple of the number of multiprocessors
@@ -222,55 +221,55 @@ CudaReductionKernelLauncher( Operation& operation,
   switch( blockSize.x )
   {
      case 512:
         CudaReductionKernel< Operation, 512 >
         CudaReductionKernel< 512 >
         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
         break;
      case 256:
         cudaFuncSetCacheConfig(CudaReductionKernel< Operation, 256 >, cudaFuncCachePreferShared);
         cudaFuncSetCacheConfig(CudaReductionKernel< 256, Operation, Index >, cudaFuncCachePreferShared);

         CudaReductionKernel< Operation, 256 >
         CudaReductionKernel< 256 >
         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
         break;
      case 128:
         cudaFuncSetCacheConfig(CudaReductionKernel< Operation, 128 >, cudaFuncCachePreferShared);
         cudaFuncSetCacheConfig(CudaReductionKernel< 128, Operation, Index >, cudaFuncCachePreferShared);

         CudaReductionKernel< Operation, 128 >
         CudaReductionKernel< 128 >
         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
         break;
      case  64:
         cudaFuncSetCacheConfig(CudaReductionKernel< Operation,  64 >, cudaFuncCachePreferShared);
         cudaFuncSetCacheConfig(CudaReductionKernel<  64, Operation, Index >, cudaFuncCachePreferShared);

         CudaReductionKernel< Operation,  64 >
         CudaReductionKernel<  64 >
         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
         break;
      case  32:
         cudaFuncSetCacheConfig(CudaReductionKernel< Operation,  32 >, cudaFuncCachePreferShared);
         cudaFuncSetCacheConfig(CudaReductionKernel<  32, Operation, Index >, cudaFuncCachePreferShared);

         CudaReductionKernel< Operation,  32 >
         CudaReductionKernel<  32 >
         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
         break;
      case  16:
         cudaFuncSetCacheConfig(CudaReductionKernel< Operation,  16 >, cudaFuncCachePreferShared);
         cudaFuncSetCacheConfig(CudaReductionKernel<  16, Operation, Index >, cudaFuncCachePreferShared);

         CudaReductionKernel< Operation,  16 >
         CudaReductionKernel<  16 >
         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
         break;
     case   8:
         cudaFuncSetCacheConfig(CudaReductionKernel< Operation,   8 >, cudaFuncCachePreferShared);
         cudaFuncSetCacheConfig(CudaReductionKernel<   8, Operation, Index >, cudaFuncCachePreferShared);

         CudaReductionKernel< Operation,   8 >
         CudaReductionKernel<   8 >
         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
         break;
      case   4:
         cudaFuncSetCacheConfig(CudaReductionKernel< Operation,   4 >, cudaFuncCachePreferShared);
         cudaFuncSetCacheConfig(CudaReductionKernel<   4, Operation, Index >, cudaFuncCachePreferShared);

         CudaReductionKernel< Operation,   4 >
         CudaReductionKernel<   4 >
         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
         break;
      case   2:
         cudaFuncSetCacheConfig(CudaReductionKernel< Operation,   2 >, cudaFuncCachePreferShared);
         cudaFuncSetCacheConfig(CudaReductionKernel<   2, Operation, Index >, cudaFuncCachePreferShared);

         CudaReductionKernel< Operation,   2 >
         CudaReductionKernel<   2 >
         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
         break;
      case   1:
Loading