Commit e28f675f authored by Jakub Klinkovský's avatar Jakub Klinkovský
Browse files

Renamed CudaReduction.h to CudaReductionKernel.h

parent b730b971
Loading
Loading
Loading
Loading
+2 −3
Original line number Diff line number Diff line
@@ -5,11 +5,10 @@ set( headers cuda-prefix-sum.h
             cuda-reduction.h             
             cuda-reduction_impl.h
             reduction-operations.h
             CudaReduction.h
             CudaReduction_impl.h
             CudaReductionBuffer.h
             CublasWrapper.h
             CudaMultireductionKernel.h
             CudaReductionBuffer.h
             CudaReductionKernel.h
             Multireduction.h
             Multireduction_impl.h
   )
+0 −45
Original line number Diff line number Diff line
/***************************************************************************
                          CudaReduction.h  -  description
                             -------------------
    begin                : Jun 17, 2015
    copyright            : (C) 2015 by Tomas Oberhuber
    email                : tomas.oberhuber@fjfi.cvut.cz
 ***************************************************************************/

/* See Copyright Notice in tnl/Copyright */

#pragma once

namespace TNL {
namespace Containers {
namespace Algorithms {
   
#ifdef HAVE_CUDA

// Helper class template for block-wise reduction on the CUDA device.
// The thread-block size is a compile-time template parameter so that the
// implementation can fully unroll its reduction tree; the definition lives
// in CudaReduction_impl.h, included at the bottom of this header.
template< typename Operation, int blockSize >
class CudaReduction
{
   public:

      // Convenience aliases re-exported from the reduction Operation type.
      typedef typename Operation::IndexType IndexType;
      typedef typename Operation::RealType RealType;
      typedef typename Operation::ResultType ResultType;

 
      // Reduces `size` elements of input1 (and input2 — NOTE(review):
      // whether input2 may be NULL depends on the concrete Operation and
      // cannot be told from this header; verify at the call site) and
      // writes the per-block partial results into `output`.
      __device__ static void reduce( Operation& operation,
                                     const IndexType size,
                                     const RealType* input1,
                                     const RealType* input2,
                                     ResultType* output );
};

#endif

} // namespace Algorithms
} // namespace Containers
} // namespace TNL

#ifdef HAVE_CUDA
#include <TNL/Containers/Algorithms/CudaReduction_impl.h>
#endif
+90 −10
Original line number Diff line number Diff line
/***************************************************************************
                          CudaReduction_impl.h  -  description
                          CudaReductionKernel.h  -  description
                             -------------------
    begin                : Jun 17, 2015
    copyright            : (C) 2015 by Tomas Oberhuber
    copyright            : (C) 2015 by Tomas Oberhuber et al.
    email                : tomas.oberhuber@fjfi.cvut.cz
 ***************************************************************************/

@@ -14,16 +14,20 @@ namespace TNL {
namespace Containers {
namespace Algorithms {

#ifdef HAVE_CUDA

template< typename Operation, int blockSize >
__device__
__global__
void
CudaReduction< Operation, blockSize >::
reduce( Operation& operation,
        const IndexType size,
        const RealType* input1,
        const RealType* input2,
        ResultType* output )
CudaReductionKernel( Operation& operation,
                     const typename Operation::IndexType size,
                     const typename Operation::RealType* input1,
                     const typename Operation::RealType* input2,
                     typename Operation::ResultType* output )
{
   typedef typename Operation::IndexType IndexType;
   typedef typename Operation::ResultType ResultType;

   extern __shared__ __align__ ( 8 ) char __sdata[];

   ResultType* sdata = reinterpret_cast< ResultType* >( __sdata );
@@ -148,7 +152,83 @@ reduce( Operation& operation,

}

// Launches CudaReductionKernel instantiated for the run-time block size and
// returns the number of launched blocks, i.e. the number of partial results
// the kernel writes. On buffer-allocation failure 0 is returned.
//
// operation - reduction operation supplying Index/Real/Result types
// size      - number of elements to reduce
// input1    - first input array (device memory)
// input2    - second input array (device memory); whether it may be NULL
//             depends on the concrete Operation
// output    - out-parameter: set to point into the shared reduction buffer
//             holding the per-block partial results
//
// NOTE(review): the kernel signature introduced by this commit appears to
// take `Operation&` — a __global__ parameter must be passable by value,
// since a host reference cannot be dereferenced on the device; confirm.
template< typename Operation >
typename Operation::IndexType
CudaReductionKernelLauncher( Operation& operation,
                             const typename Operation::IndexType size,
                             const typename Operation::RealType* input1,
                             const typename Operation::RealType* input2,
                             typename Operation::ResultType*& output )
{
   typedef typename Operation::IndexType IndexType;
   typedef typename Operation::RealType RealType;
   typedef typename Operation::ResultType ResultType;

   // TODO: optimize similarly to multireduction
   const int minGPUReductionDataSize = 256;
   const IndexType desGridSize( minGPUReductionDataSize );
   dim3 blockSize( 256 ), gridSize( 0 );
   gridSize.x = min( Devices::Cuda::getNumberOfBlocks( size, blockSize.x ), desGridSize );

   // create reference to the reduction buffer singleton and set default size
   CudaReductionBuffer & cudaReductionBuffer = CudaReductionBuffer::getInstance( 8 * minGPUReductionDataSize );

   // The function returns an IndexType, so report failure as 0 reduced
   // elements (the previous `return false;` converted to the same value
   // but read as if the return type were bool).
   if( ! cudaReductionBuffer.setSize( gridSize.x * sizeof( ResultType ) ) )
      return 0;
   output = cudaReductionBuffer.template getData< ResultType >();
   // Dynamic shared memory: one ResultType slot per thread of the block.
   IndexType shmem = blockSize.x * sizeof( ResultType );

   /***
    * Depending on the blockSize we generate appropriate template instance.
    */
   switch( blockSize.x )
   {
      case 512:
         CudaReductionKernel< Operation, 512 >
         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
         break;
      case 256:
         CudaReductionKernel< Operation, 256 >
         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
         break;
      case 128:
         CudaReductionKernel< Operation, 128 >
         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
         break;
      case  64:
         CudaReductionKernel< Operation,  64 >
         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
         break;
      case  32:
         CudaReductionKernel< Operation,  32 >
         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
         break;
      case  16:
         CudaReductionKernel< Operation,  16 >
         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
         break;
      case   8:
         CudaReductionKernel< Operation,   8 >
         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
         break;
      case   4:
         CudaReductionKernel< Operation,   4 >
         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
         break;
      case   2:
         CudaReductionKernel< Operation,   2 >
         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
         break;
      case   1:
         Assert( false, std::cerr << "blockSize should not be 1." << std::endl );
         break;   // previously fell through to default, asserting twice
      default:
         Assert( false, std::cerr << "Block size is " << blockSize.x << " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." );
   }
   // TODO: the launch is unchecked — a bad configuration fails silently;
   // restore checkCudaDevice / call cudaGetLastError() here.
   //checkCudaDevice;
   return gridSize.x;
}
#endif

} // namespace Algorithms
} // namespace Containers
} // namespace TNL
+6 −92
Original line number Diff line number Diff line
@@ -20,7 +20,7 @@
#include <TNL/Containers/ArrayOperations.h>
#include <TNL/Math.h>
#include <TNL/Containers/Algorithms/CudaReductionBuffer.h>
#include <TNL/Containers/Algorithms/CudaReduction.h>
#include <TNL/Containers/Algorithms/CudaReductionKernel.h>

#ifdef CUDA_REDUCTION_PROFILING
#include <iostream>
@@ -40,92 +40,6 @@ namespace Algorithms {
const int minGPUReductionDataSize = 256;//65536; //16384;//1024;//256;

#ifdef HAVE_CUDA

// Thin __global__ entry point: __global__ functions cannot be class members,
// so this kernel only forwards its arguments to the static device routine
// CudaReduction< Operation, blockSize >::reduce(), which performs the
// actual block-wise reduction.
template< typename Operation, int blockSize >
__global__ void
CudaReductionKernel( Operation operation,
                     const typename Operation::IndexType size,
                     const typename Operation::RealType* input1,
                     const typename Operation::RealType* input2,
                     typename Operation::ResultType* output )
{
   CudaReduction< Operation, blockSize >::reduce( operation, size, input1, input2, output );
}

// Legacy launcher (replaced by CudaReductionKernelLauncher in this commit):
// chooses the grid size, grabs the shared reduction buffer for per-block
// partial results and dispatches to the CudaReductionKernel instance that
// matches the run-time block size. Returns the number of launched blocks,
// i.e. the number of partial results written to `output`.
template< typename Operation >
typename Operation::IndexType
reduceOnCudaDevice( Operation& operation,
                    const typename Operation::IndexType size,
                    const typename Operation::RealType* input1,
                    const typename Operation::RealType* input2,
                    typename Operation::ResultType*& output)
{
   typedef typename Operation::IndexType IndexType;
   typedef typename Operation::RealType RealType;
   typedef typename Operation::ResultType ResultType;
 
   // Cap the grid so each block reduces at least ~minGPUReductionDataSize
   // elements (file-level constant).
   const IndexType desGridSize( minGPUReductionDataSize );
   dim3 blockSize( 256 ), gridSize( 0 );
   gridSize.x = min( Devices::Cuda::getNumberOfBlocks( size, blockSize.x ), desGridSize );
 
   // create reference to the reduction buffer singleton and set default size
   CudaReductionBuffer & cudaReductionBuffer = CudaReductionBuffer::getInstance( 8 * minGPUReductionDataSize );
 
   // NOTE(review): the return type is IndexType, so `false` converts to 0
   // reduced elements on buffer-allocation failure — misleading but benign.
   if( ! cudaReductionBuffer.setSize( gridSize.x * sizeof( ResultType ) ) )
      return false;
   output = cudaReductionBuffer.template getData< ResultType >();
   // Dynamic shared memory: one ResultType slot per thread of the block.
   IndexType shmem = blockSize.x * sizeof( ResultType );
 
   /***
    * Depending on the blockSize we generate appropriate template instance.
    */
   switch( blockSize.x )
   {
      case 512:
         CudaReductionKernel< Operation, 512 >
         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
         break;
      case 256:
         CudaReductionKernel< Operation, 256 >
         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
         break;
      case 128:
         CudaReductionKernel< Operation, 128 >
         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
         break;
      case  64:
         CudaReductionKernel< Operation,  64 >
         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
         break;
      case  32:
         CudaReductionKernel< Operation,  32 >
         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
         break;
      case  16:
         CudaReductionKernel< Operation,  16 >
         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
         break;
     case   8:
         CudaReductionKernel< Operation,   8 >
         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
         break;
      case   4:
         CudaReductionKernel< Operation,   4 >
        <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
        break;
      case   2:
         CudaReductionKernel< Operation,   2 >
         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
         break;
      case   1:
         Assert( false, std::cerr << "blockSize should not be 1." << std::endl );
      // NOTE(review): case 1 falls through to default (missing break) —
      // both arms only assert, so release-build behavior is unaffected.
      default:
         Assert( false, std::cerr << "Block size is " << blockSize. x << " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." );
   }
   // Launch is unchecked: checkCudaDevice was commented out.
   //checkCudaDevice;
   return gridSize. x;
}
#endif

template< typename Operation >
@@ -172,7 +86,7 @@ reductionOnCudaDevice( Operation& operation,
    * Reduce the data on the CUDA device.
    */
   ResultType* deviceAux1( 0 );
   IndexType reducedSize = reduceOnCudaDevice( operation,
   IndexType reducedSize = CudaReductionKernelLauncher( operation,
                                                        size,
                                                        deviceInput1,
                                                        deviceInput2,