/***************************************************************************
                          ParallelFor.h  -  description
                             -------------------
    begin                : Mar 10, 2017
    copyright            : (C) 2017 by Tomas Oberhuber et al.
    email                : tomas.oberhuber@fjfi.cvut.cz
 ***************************************************************************/
/* See Copyright Notice in tnl/Copyright */

#pragma once

#include <TNL/Devices/Host.h>
#include <TNL/Devices/Cuda.h>
#include <TNL/Devices/CudaDeviceInfo.h>

/*
 * The implementation of ParallelFor is not meant to provide maximum performance
 * at every cost, but maximum flexibility for operating with data stored on the
 * device.
 *
 * The grid-stride loop for CUDA has been inspired by Nvidia's blog post:
 * https://devblogs.nvidia.com/parallelforall/cuda-pro-tip-write-flexible-kernels-grid-stride-loops/
 */

namespace TNL {

template< typename Device = Devices::Host >
struct ParallelFor
{
   template< typename Index,
             typename Function,
             typename... FunctionArgs >
   static void exec( Index start, Index end, Function f, FunctionArgs... args )
   {
      // Parallelize with OpenMP only if it is enabled and the loop is long
      // enough to amortize the threading overhead.
#ifdef HAVE_OPENMP
#pragma omp parallel for if( TNL::Devices::Host::isOMPEnabled() && end - start > 512 )
#endif
      for( Index i = start; i < end; i++ )
         f( i, args... );
   }
};

#ifdef HAVE_CUDA
template< typename Index,
          typename Function,
          typename... FunctionArgs >
__global__ void
ParallelForKernel( Index start, Index end, Function f, FunctionArgs... args )
{
   // Grid-stride loop: each thread handles every (blockDim.x * gridDim.x)-th index.
   for( Index i = start + blockIdx.x * blockDim.x + threadIdx.x;
        i < end;
        i += blockDim.x * gridDim.x )
   {
      f( i, args... );
   }
}
#endif

template<>
struct ParallelFor< Devices::Cuda >
{
   template< typename Index,
             typename Function,
             typename... FunctionArgs >
   static void exec( Index start, Index end, Function f, FunctionArgs... args )
   {
#ifdef HAVE_CUDA
      if( end > start ) {
         dim3 blockSize( 256 );
         dim3 gridSize;
         // Aim for roughly 32 blocks per multiprocessor, but never launch more
         // blocks than are needed to cover the index range.
         const int desGridSize = 32 * Devices::CudaDeviceInfo::getCudaMultiprocessors( Devices::CudaDeviceInfo::getActiveDevice() );
         gridSize.x = min( desGridSize, Devices::Cuda::getNumberOfBlocks( end - start, blockSize.x ) );
         Devices::Cuda::synchronizeDevice();
         ParallelForKernel<<< gridSize, blockSize >>>( start, end, f, args... );
      }
#endif
   }
};

} // namespace TNL
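
/*
 * Usage sketch (added for illustration; not part of the original header).
 * It assumes this header is installed as <TNL/ParallelFor.h> and that TNL's
 * __cuda_callable__ macro is available to mark the lambda as callable from
 * both host and device code; the helper setValue() is purely hypothetical.
 * The point is how exec() takes the index range, the functor, and extra
 * arguments that are forwarded to each invocation of the functor.
 *
 *    #include <TNL/ParallelFor.h>
 *
 *    void setValue( double* data, int size, double value )
 *    {
 *       auto kernel = [] __cuda_callable__ ( int i, double* data, double value )
 *       {
 *          data[ i ] = value;
 *       };
 *       // sequential/OpenMP execution on host memory
 *       TNL::ParallelFor< TNL::Devices::Host >::exec( 0, size, kernel, data, value );
 *       // CUDA execution; data would have to point to device memory here
 *       // TNL::ParallelFor< TNL::Devices::Cuda >::exec( 0, size, kernel, data, value );
 *    }
 */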