/***************************************************************************
                          CudaReductionKernel.h  -  description
                             -------------------
    begin                : Jun 17, 2015
    copyright            : (C) 2015 by Tomas Oberhuber et al.
    email                : tomas.oberhuber@fjfi.cvut.cz
 ***************************************************************************/

/* See Copyright Notice in tnl/Copyright */

#pragma once

#ifdef HAVE_CUDA
#include <cuda.h>
#endif

#include <new>

#include <TNL/Assert.h>
#include <TNL/Math.h>
#include <TNL/Devices/CudaDeviceInfo.h>
#include <TNL/Containers/Algorithms/CudaReductionBuffer.h>

namespace TNL {
namespace Containers {
namespace Algorithms {

#ifdef HAVE_CUDA
/****
 * The performance of this kernel is very sensitive to register usage.
 * Compile with --ptxas-options=-v and configure these constants for the given
 * architecture so that there are no local memory spills.
 */
static constexpr int Reduction_maxThreadsPerBlock = 256;   // must be a power of 2
#if( __CUDA_ARCH__ >= 300 )
static constexpr int Reduction_minBlocksPerMultiprocessor = 8;
#else
// This branch is also taken in the host compilation pass, where __CUDA_ARCH__ is not defined.
static constexpr int Reduction_minBlocksPerMultiprocessor = 4;
#endif

template< typename Operation, int blockSize >
__global__ void
__launch_bounds__( Reduction_maxThreadsPerBlock, Reduction_minBlocksPerMultiprocessor )
CudaReductionKernel( Operation operation,
                     const typename Operation::IndexType size,
                     const typename Operation::RealType* input1,
                     const typename Operation::RealType* input2,
                     typename Operation::ResultType* output )
{
   typedef typename Operation::IndexType IndexType;
   typedef typename Operation::ResultType ResultType;

   ResultType* sdata = Devices::Cuda::getSharedMemory< ResultType >();

   /***
    * Get the thread id (tid) and the global thread id (gid).
    * gridSize is the number of elements processed by all blocks at the
    * same time.
    */
   const IndexType tid = threadIdx.x;
   IndexType gid = blockIdx.x * blockDim.x + threadIdx.x;
   const IndexType gridSize = blockDim.x * gridDim.x;

   sdata[ tid ] = operation.initialValue();

   /***
    * Read data into the shared memory. We start with the
    * sequential reduction.
    */
   while( gid + 4 * gridSize < size )
   {
      operation.cudaFirstReduction( sdata[ tid ], gid,                input1, input2 );
      operation.cudaFirstReduction( sdata[ tid ], gid + gridSize,     input1, input2 );
      operation.cudaFirstReduction( sdata[ tid ], gid + 2 * gridSize, input1, input2 );
      operation.cudaFirstReduction( sdata[ tid ], gid + 3 * gridSize, input1, input2 );
      gid += 4 * gridSize;
   }
   while( gid + 2 * gridSize < size )
   {
      operation.cudaFirstReduction( sdata[ tid ], gid,            input1, input2 );
      operation.cudaFirstReduction( sdata[ tid ], gid + gridSize, input1, input2 );
      gid += 2 * gridSize;
   }
   while( gid < size )
   {
      operation.cudaFirstReduction( sdata[ tid ], gid, input1, input2 );
      gid += gridSize;
   }
   __syncthreads();

   /***
    * Perform the parallel reduction.
    */
   if( blockSize >= 1024 )
   {
      if( tid < 512 )
         operation.commonReductionOnDevice( sdata[ tid ], sdata[ tid + 512 ] );
      __syncthreads();
   }
   if( blockSize >= 512 )
   {
      if( tid < 256 )
         operation.commonReductionOnDevice( sdata[ tid ], sdata[ tid + 256 ] );
      __syncthreads();
   }
   if( blockSize >= 256 )
   {
      if( tid < 128 )
         operation.commonReductionOnDevice( sdata[ tid ], sdata[ tid + 128 ] );
      __syncthreads();
   }
   if( blockSize >= 128 )
   {
      if( tid < 64 )
         operation.commonReductionOnDevice( sdata[ tid ], sdata[ tid + 64 ] );
      __syncthreads();
   }

   /***
    * The rest runs within one warp, so it is synchronized implicitly.
    */
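   // Note: the implicit warp-synchronous programming below (through a volatile
   // pointer, without __syncthreads) relies on lock-step warp execution. On
   // compute capability 7.0 (Volta) and newer, independent thread scheduling
   // no longer guarantees this, and explicit __syncwarp() calls between the
   // steps would be required.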
   if( tid < 32 )
   {
      volatile ResultType* vsdata = sdata;
      if( blockSize >= 64 )
         operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 32 ] );
      // TODO: if blockSize == 32, the following does not work correctly --
      // we do not check that tid < 16 before reading vsdata[ tid + 16 ].
      if( blockSize >= 32 )
         operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 16 ] );
      if( blockSize >= 16 )
         operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 8 ] );
      if( blockSize >= 8 )
         operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 4 ] );
      if( blockSize >= 4 )
         operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 2 ] );
      if( blockSize >= 2 )
         operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 1 ] );
   }

   /***
    * Store the result back in the global memory.
    */
   if( tid == 0 )
      output[ blockIdx.x ] = sdata[ 0 ];
}

template< typename Operation >
typename Operation::IndexType
CudaReductionKernelLauncher( Operation& operation,
                             const typename Operation::IndexType size,
                             const typename Operation::RealType* input1,
                             const typename Operation::RealType* input2,
                             typename Operation::ResultType*& output )
{
   typedef typename Operation::IndexType IndexType;
   typedef typename Operation::RealType RealType;
   typedef typename Operation::ResultType ResultType;

   // The number of blocks should be a multiple of the number of multiprocessors
   // to ensure optimum balancing of the load. This is very important, because
   // we run the kernel with a fixed number of blocks, so the amount of work per
   // block grows with the problem size, and even a small imbalance can cost us
   // dearly. Since this launcher is host code, the host-side value of
   // Reduction_minBlocksPerMultiprocessor (the #else branch above, i.e. 4) is
   // used here; e.g. on a Tesla K40c with 15 multiprocessors,
   // desGridSize = 4 * 15 = 60.
   //const IndexType desGridSize = 4 * Reduction_minBlocksPerMultiprocessor
   //                              * Devices::CudaDeviceInfo::getCudaMultiprocessors( Devices::CudaDeviceInfo::getActiveDevice() );
   const IndexType desGridSize = Reduction_minBlocksPerMultiprocessor
                                 * Devices::CudaDeviceInfo::getCudaMultiprocessors( Devices::CudaDeviceInfo::getActiveDevice() );
   dim3 blockSize( 256 ), gridSize( 0 );
   gridSize.x = min( Devices::Cuda::getNumberOfBlocks( size, blockSize.x ), desGridSize );

   // Create a reference to the reduction buffer singleton and set its size.
   const size_t buf_size = desGridSize * sizeof( ResultType );
   CudaReductionBuffer& cudaReductionBuffer = CudaReductionBuffer::getInstance();
   if( ! cudaReductionBuffer.setSize( buf_size ) )
      throw std::bad_alloc();
   output = cudaReductionBuffer.template getData< ResultType >();

   // When there is only one warp per block (blockSize.x <= 32), we need to
   // allocate two warps' worth of shared memory so that the warp-level
   // reduction does not index shared memory out of bounds.
   const IndexType shmem = ( blockSize.x <= 32 )
                           ? 2 * blockSize.x * sizeof( ResultType )
                           : blockSize.x * sizeof( ResultType );
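   // The dynamic shared memory size computed above is passed as the third
   // kernel launch parameter below; it is what the kernel accesses through
   // Devices::Cuda::getSharedMemory< ResultType >(): one ResultType per thread
   // (two per thread for single-warp blocks). For the default 256-thread block
   // and a double-precision ResultType this amounts to 256 * 8 = 2048 bytes
   // per block.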
   /***
    * Depending on the blockSize we generate the appropriate template instance.
    */
   switch( blockSize.x )
   {
      case 512:
         cudaFuncSetCacheConfig( CudaReductionKernel< Operation, 512 >, cudaFuncCachePreferShared );
         CudaReductionKernel< Operation, 512 >
         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output );
         break;
      case 256:
         cudaFuncSetCacheConfig( CudaReductionKernel< Operation, 256 >, cudaFuncCachePreferShared );
         CudaReductionKernel< Operation, 256 >
         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output );
         break;
      case 128:
         cudaFuncSetCacheConfig( CudaReductionKernel< Operation, 128 >, cudaFuncCachePreferShared );
         CudaReductionKernel< Operation, 128 >
         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output );
         break;
      case  64:
         cudaFuncSetCacheConfig( CudaReductionKernel< Operation, 64 >, cudaFuncCachePreferShared );
         CudaReductionKernel< Operation, 64 >
         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output );
         break;
      case  32:
         cudaFuncSetCacheConfig( CudaReductionKernel< Operation, 32 >, cudaFuncCachePreferShared );
         CudaReductionKernel< Operation, 32 >
         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output );
         break;
      case  16:
         cudaFuncSetCacheConfig( CudaReductionKernel< Operation, 16 >, cudaFuncCachePreferShared );
         CudaReductionKernel< Operation, 16 >
         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output );
         break;
      case   8:
         cudaFuncSetCacheConfig( CudaReductionKernel< Operation, 8 >, cudaFuncCachePreferShared );
         CudaReductionKernel< Operation, 8 >
         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output );
         break;
      case   4:
         cudaFuncSetCacheConfig( CudaReductionKernel< Operation, 4 >, cudaFuncCachePreferShared );
         CudaReductionKernel< Operation, 4 >
         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output );
         break;
      case   2:
         cudaFuncSetCacheConfig( CudaReductionKernel< Operation, 2 >, cudaFuncCachePreferShared );
         CudaReductionKernel< Operation, 2 >
         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output );
         break;
      case   1:
         TNL_ASSERT( false, std::cerr << "blockSize should not be 1." << std::endl );
         break;
      default:
         TNL_ASSERT( false, std::cerr << "Block size is " << blockSize.x
                                      << " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." );
   }
   checkCudaDevice;

   // Return the number of blocks, i.e. the number of partial results the kernel
   // has written into the output array on the CUDA device. The caller is
   // expected to finish the reduction of these gridSize.x values.
   return gridSize.x;
}
#endif

} // namespace Algorithms
} // namespace Containers
} // namespace TNL
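
/***
 * Illustrative sketch only (disabled, not part of TNL): a minimal Operation
 * type with the interface assumed by CudaReductionKernel above -- the nested
 * typedefs IndexType/RealType/ResultType and the members initialValue(),
 * cudaFirstReduction() and commonReductionOnDevice(). The name
 * ExampleSumReduction and the exact signatures are inferred from the calls in
 * this file and are assumptions; the actual TNL operation classes may declare
 * additional members used elsewhere.
 */
#if 0
template< typename Real, typename Index >
struct ExampleSumReduction
{
   typedef Real RealType;
   typedef Index IndexType;
   typedef Real ResultType;

   // Neutral element of the reduction (0 for a sum).
   __device__ ResultType initialValue() const { return 0; }

   // Combine the index-th input element into the running result.
   // A plain sum ignores the second input array.
   __device__ void cudaFirstReduction( ResultType& result,
                                       const IndexType index,
                                       const RealType* data1,
                                       const RealType* data2 ) const
   {
      result += data1[ index ];
   }

   // Combine two partial results; the volatile overload is needed by the
   // warp-synchronous part of the kernel.
   __device__ void commonReductionOnDevice( ResultType& result,
                                            const ResultType& data ) const
   {
      result += data;
   }

   __device__ void commonReductionOnDevice( volatile ResultType& result,
                                            volatile ResultType& data ) const
   {
      result += data;
   }
};
#endif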