/***************************************************************************
                          ParallelFor.h  -  description
                             -------------------
    begin                : Mar 10, 2017
    copyright            : (C) 2017 by Tomas Oberhuber et al.
    email                : tomas.oberhuber@fjfi.cvut.cz
 ***************************************************************************/

/* See Copyright Notice in tnl/Copyright */

#pragma once

#include <TNL/Devices/Host.h>
#include <TNL/Devices/Cuda.h>
#include <TNL/Devices/CudaDeviceInfo.h>

/*
 * The implementation of ParallelFor is not meant to provide maximum performance
 * at every cost, but maximum flexibility for operating with data stored on the
 * device.
 *
 * The grid-stride loop for CUDA has been inspired by Nvidia's blog post:
 * https://devblogs.nvidia.com/parallelforall/cuda-pro-tip-write-flexible-kernels-grid-stride-loops/
 *
 * Implemented by: Jakub Klinkovsky
 */

namespace TNL {

template< typename Device = Devices::Host >
struct ParallelFor
{
   template< typename Index,
             typename Function,
             typename... FunctionArgs >
   static void exec( Index start, Index end, Function f, FunctionArgs... args )
   {
// parallelize only if OpenMP is enabled at runtime and the range is large enough to pay off
#ifdef HAVE_OPENMP
#pragma omp parallel for if( TNL::Devices::Host::isOMPEnabled() && end - start > 512 )
#endif
      for( Index i = start; i < end; i++ )
         f( i, args... );
   }
};

#ifdef HAVE_CUDA
template< typename Index,
          typename Function,
          typename... FunctionArgs >
__global__ void
ParallelForKernel( Index start, Index end, Function f, FunctionArgs... args )
{
   // grid-stride loop: each thread handles one index per pass and then jumps
   // ahead by the total number of threads in the grid
   for( Index i = start + blockIdx.x * blockDim.x + threadIdx.x;
        i < end;
        i += blockDim.x * gridDim.x )
   {
      f( i, args... );
   }
}
#endif

template<>
struct ParallelFor< Devices::Cuda >
{
   template< typename Index,
             typename Function,
             typename... FunctionArgs >
   static void exec( Index start, Index end, Function f, FunctionArgs... args )
   {
#ifdef HAVE_CUDA
      if( end > start ) {
         dim3 blockSize( 256 );
         dim3 gridSize;
         // cap the grid at 32 blocks per multiprocessor; the grid-stride loop
         // in the kernel covers the remainder of the range
         const int desGridSize = 32 * Devices::CudaDeviceInfo::getCudaMultiprocessors(
                                          Devices::CudaDeviceInfo::getActiveDevice() );
         gridSize.x = min( desGridSize, Devices::Cuda::getNumberOfBlocks( end - start, blockSize.x ) );

         Devices::Cuda::synchronizeDevice();
         ParallelForKernel<<< gridSize, blockSize >>>( start, end, f, args... );
         cudaDeviceSynchronize();
         TNL_CHECK_CUDA_DEVICE;
      }
#endif
   }
};

} // namespace TNL
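
/*
 * Usage sketch (added for illustration, not part of the original header).
 * The names `size`, `data` and `init` are hypothetical; `__cuda_callable__`
 * is TNL's host/device qualifier macro, and using it on a lambda assumes
 * nvcc's extended lambdas are enabled. `data` is assumed to point to memory
 * allocated on the device that executes the loop.
 *
 *    const int size = 1000;
 *    auto init = [] __cuda_callable__ ( int i, double* data ) { data[ i ] = 0.0; };
 *
 *    // sequential (or OpenMP) loop on the host
 *    TNL::ParallelFor< TNL::Devices::Host >::exec( 0, size, init, data );
 *
 *    // grid-stride CUDA kernel on the GPU
 *    TNL::ParallelFor< TNL::Devices::Cuda >::exec( 0, size, init, data );
 */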