/***************************************************************************
                          ParallelFor.h  -  description
                             -------------------
    begin                : Mar 10, 2017
    copyright            : (C) 2017 by Tomas Oberhuber et al.
    email                : tomas.oberhuber@fjfi.cvut.cz
 ***************************************************************************/

/* See Copyright Notice in tnl/Copyright */
#pragma once

#include <TNL/Devices/Host.h>
#include <TNL/Devices/Cuda.h>
#include <TNL/Devices/CudaDeviceInfo.h>
#include <TNL/Math.h>
#include <TNL/Exceptions/CudaSupportMissing.h>
/*
 * The implementation of ParallelFor is not meant to provide maximum performance
 * at every cost, but maximum flexibility for operating with data stored on the
 * device.
 *
 * The grid-stride loop for CUDA has been inspired by Nvidia's blog post:
 * https://devblogs.nvidia.com/parallelforall/cuda-pro-tip-write-flexible-kernels-grid-stride-loops/
 *
 * Implemented by: Jakub Klinkovsky
 */
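
/*
 * Usage sketch (illustrative only; it assumes an integer `n` and a pointer
 * `a` to an array of `n` doubles that is accessible from the chosen device):
 *
 *    auto setElement = [=] __cuda_callable__ ( int i, double* data )
 *    {
 *       data[ i ] = 0.0;
 *    };
 *    ParallelFor< Devices::Host >::exec( 0, n, setElement, a );
 *    // with CUDA support (and nvcc's extended lambdas), the same functor
 *    // can be dispatched to the GPU:
 *    // ParallelFor< Devices::Cuda >::exec( 0, n, setElement, a );
 */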
namespace TNL {

template< typename Device = Devices::Host >
struct ParallelFor
{
   template< typename Index,
             typename Function,
             typename... FunctionArgs >
   static void exec( Index start, Index end, Function f, FunctionArgs... args )
   {
#ifdef HAVE_OPENMP
      // Benchmarks show that this is significantly faster compared
      // to '#pragma omp parallel for if( TNL::Devices::Host::isOMPEnabled() && end - start > 512 )'
      if( TNL::Devices::Host::isOMPEnabled() && end - start > 512 )
      {
         #pragma omp parallel for
         for( Index i = start; i < end; i++ )
            f( i, args... );
      }
      else
         for( Index i = start; i < end; i++ )
            f( i, args... );
#else
      for( Index i = start; i < end; i++ )
         f( i, args... );
#endif
   }
};

template< typename Device = Devices::Host >
struct ParallelFor2D
{
   template< typename Index,
             typename Function,
             typename... FunctionArgs >
   static void exec( Index startX, Index startY, Index endX, Index endY, Function f, FunctionArgs... args )
   {
#ifdef HAVE_OPENMP
      // Benchmarks show that this is significantly faster compared
      // to '#pragma omp parallel for if( TNL::Devices::Host::isOMPEnabled() )'
      if( TNL::Devices::Host::isOMPEnabled() )
      {
         #pragma omp parallel for
         for( Index i = startX; i < endX; i++ )
            for( Index j = startY; j < endY; j++ )
               f( i, j, args... );
      }
      else
         for( Index i = startX; i < endX; i++ )
            for( Index j = startY; j < endY; j++ )
               f( i, j, args... );
#else
      for( Index i = startX; i < endX; i++ )
         for( Index j = startY; j < endY; j++ )
            f( i, j, args... );
#endif
   }
};

template< typename Device = Devices::Host >
struct ParallelFor3D
{
   template< typename Index,
             typename Function,
             typename... FunctionArgs >
   static void exec( Index startX, Index startY, Index startZ, Index endX, Index endY, Index endZ, Function f, FunctionArgs... args )
   {
#ifdef HAVE_OPENMP
      // Benchmarks show that this is significantly faster compared
      // to '#pragma omp parallel for if( TNL::Devices::Host::isOMPEnabled() )'
      if( TNL::Devices::Host::isOMPEnabled() )
      {
         #pragma omp parallel for collapse(2)
         for( Index i = startX; i < endX; i++ )
            for( Index j = startY; j < endY; j++ )
               for( Index k = startZ; k < endZ; k++ )
                  f( i, j, k, args... );
      }
      else
         for( Index i = startX; i < endX; i++ )
            for( Index j = startY; j < endY; j++ )
               for( Index k = startZ; k < endZ; k++ )
                  f( i, j, k, args... );
#else
      for( Index i = startX; i < endX; i++ )
         for( Index j = startY; j < endY; j++ )
            for( Index k = startZ; k < endZ; k++ )
               f( i, j, k, args... );
#endif
   }
};

#ifdef HAVE_CUDA
template< bool gridStrideX = true,
          typename Index,
          typename Function,
          typename... FunctionArgs >
__global__ void
ParallelForKernel( Index start, Index end, Function f, FunctionArgs... args )
{
   Index i = start + blockIdx.x * blockDim.x + threadIdx.x;
   while( i < end ) {
      f( i, args... );
      // grid-stride loop: when the range does not fit into a single grid,
      // each thread processes further elements spaced by the total thread count
      if( gridStrideX ) i += blockDim.x * gridDim.x;
      else break;
   }
}

template< bool gridStrideX = true,
          bool gridStrideY = true,
          typename Index,
          typename Function,
          typename... FunctionArgs >
__global__ void
ParallelFor2DKernel( Index startX, Index startY, Index endX, Index endY, Function f, FunctionArgs... args )
{
   Index j = startY + blockIdx.y * blockDim.y + threadIdx.y;
   while( j < endY ) {
      // re-initialize i for every row so that nested grid-stride loops
      // cover the whole 2D range
      Index i = startX + blockIdx.x * blockDim.x + threadIdx.x;
      while( i < endX ) {
         f( i, j, args... );
         if( gridStrideX ) i += blockDim.x * gridDim.x;
         else break;
      }
      if( gridStrideY ) j += blockDim.y * gridDim.y;
      else break;
   }
}

template< bool gridStrideX = true,
          bool gridStrideY = true,
          bool gridStrideZ = true,
          typename Index,
          typename Function,
          typename... FunctionArgs >
__global__ void
ParallelFor3DKernel( Index startX, Index startY, Index startZ, Index endX, Index endY, Index endZ, Function f, FunctionArgs... args )
{
   Index k = startZ + blockIdx.z * blockDim.z + threadIdx.z;
   while( k < endZ ) {
      // re-initialize the inner indices for every iteration of the outer
      // loops so that nested grid-stride loops cover the whole 3D range
      Index j = startY + blockIdx.y * blockDim.y + threadIdx.y;
      while( j < endY ) {
         Index i = startX + blockIdx.x * blockDim.x + threadIdx.x;
         while( i < endX ) {
            f( i, j, k, args... );
            if( gridStrideX ) i += blockDim.x * gridDim.x;
            else break;
         }
         if( gridStrideY ) j += blockDim.y * gridDim.y;
         else break;
      }
      if( gridStrideZ ) k += blockDim.z * gridDim.z;
      else break;
   }
}
#endif

template<>
struct ParallelFor< Devices::Cuda >
{
   template< typename Index,
             typename Function,
             typename... FunctionArgs >
   static void exec( Index start, Index end, Function f, FunctionArgs... args )
   {
#ifdef HAVE_CUDA
      if( end > start ) {
         dim3 blockSize( 256 );
         dim3 gridSize;
         gridSize.x = TNL::min( Devices::Cuda::getMaxGridSize(), Devices::Cuda::getNumberOfBlocks( end - start, blockSize.x ) );
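
         // Numerical illustration of the launch-configuration heuristic
         // (hypothetical figures): for a range of 1 << 24 elements and
         // blockSize.x = 256, getNumberOfBlocks() yields 65536 blocks. If that
         // does not fit into a single grid, the grid-stride branch below caps
         // gridSize.x at 32 * #multiprocessors (e.g. 32 * 80 = 2560 blocks on
         // an 80-SM device), so each thread then processes roughly 26 elements.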
         if( Devices::Cuda::getNumberOfGrids( end - start ) == 1 )
            ParallelForKernel< false ><<< gridSize, blockSize >>>( start, end, f, args... );
         else {
            // decrease the grid size and align to the number of multiprocessors
            const int desGridSize = 32 * Devices::CudaDeviceInfo::getCudaMultiprocessors( Devices::CudaDeviceInfo::getActiveDevice() );
            gridSize.x = TNL::min( desGridSize, Devices::Cuda::getNumberOfBlocks( end - start, blockSize.x ) );
            ParallelForKernel< true ><<< gridSize, blockSize >>>( start, end, f, args... );
         }
         cudaDeviceSynchronize();
         TNL_CHECK_CUDA_DEVICE;
      }
#else
      throw Exceptions::CudaSupportMissing();
#endif
   }
};

template<>
struct ParallelFor2D< Devices::Cuda >
{
   template< typename Index,
             typename Function,
             typename... FunctionArgs >
   static void exec( Index startX, Index startY, Index endX, Index endY, Function f, FunctionArgs... args )
   {
#ifdef HAVE_CUDA
      if( endX > startX && endY > startY ) {
         const Index sizeX = endX - startX;
         const Index sizeY = endY - startY;

         // choose the block shape according to the aspect ratio of the 2D range
         dim3 blockSize;
         if( sizeX >= sizeY * sizeY ) {
            blockSize.x = TNL::min( 256, sizeX );
            blockSize.y = 1;
         }
         else if( sizeY >= sizeX * sizeX ) {
            blockSize.x = 1;
            blockSize.y = TNL::min( 256, sizeY );
         }
         else {
            blockSize.x = TNL::min( 32, sizeX );
            blockSize.y = TNL::min( 8, sizeY );
         }
         dim3 gridSize;
         gridSize.x = TNL::min( Devices::Cuda::getMaxGridSize(), Devices::Cuda::getNumberOfBlocks( sizeX, blockSize.x ) );
         gridSize.y = TNL::min( Devices::Cuda::getMaxGridSize(), Devices::Cuda::getNumberOfBlocks( sizeY, blockSize.y ) );
         dim3 gridCount;
         gridCount.x = Devices::Cuda::getNumberOfGrids( sizeX );
         gridCount.y = Devices::Cuda::getNumberOfGrids( sizeY );

         // enable the grid-stride loop in a kernel dimension only when the
         // range does not fit into a single CUDA grid in that dimension
         if( gridCount.x == 1 && gridCount.y == 1 )
            ParallelFor2DKernel< false, false ><<< gridSize, blockSize >>>
               ( startX, startY, endX, endY, f, args... );
         else if( gridCount.x == 1 && gridCount.y > 1 )
            ParallelFor2DKernel< false, true ><<< gridSize, blockSize >>>
               ( startX, startY, endX, endY, f, args... );
         else if( gridCount.x > 1 && gridCount.y == 1 )
            ParallelFor2DKernel< true, false ><<< gridSize, blockSize >>>
               ( startX, startY, endX, endY, f, args... );
         else
            ParallelFor2DKernel< true, true ><<< gridSize, blockSize >>>
               ( startX, startY, endX, endY, f, args... );
         cudaDeviceSynchronize();
         TNL_CHECK_CUDA_DEVICE;
      }
#else
      throw Exceptions::CudaSupportMissing();
#endif
   }
};

template<>
struct ParallelFor3D< Devices::Cuda >
{
   template< typename Index,
             typename Function,
             typename... FunctionArgs >
   static void exec( Index startX, Index startY, Index startZ, Index endX, Index endY, Index endZ, Function f, FunctionArgs... args )
   {
#ifdef HAVE_CUDA
      if( endX > startX && endY > startY && endZ > startZ ) {
         const Index sizeX = endX - startX;
         const Index sizeY = endY - startY;
         const Index sizeZ = endZ - startZ;

         // choose the block shape according to the aspect ratio of the 3D range
         dim3 blockSize;
         if( sizeX >= sizeY * sizeZ ) {
            blockSize.x = TNL::min( 256, sizeX );
            blockSize.y = 1;
            blockSize.z = 1;
         }
         else if( sizeY >= sizeX * sizeZ ) {
            blockSize.x = 1;
            blockSize.y = TNL::min( 256, sizeY );
            blockSize.z = 1;
         }
         else if( sizeZ >= sizeX * sizeY ) {
            blockSize.x = 1;
            blockSize.y = 1;
            blockSize.z = TNL::min( 256, sizeZ );
         }
         else {
            blockSize.x = TNL::min( 16, sizeX );
            blockSize.y = TNL::min( 4, sizeY );
            blockSize.z = TNL::min( 4, sizeZ );
         }
         dim3 gridSize;
         gridSize.x = TNL::min( Devices::Cuda::getMaxGridSize(), Devices::Cuda::getNumberOfBlocks( sizeX, blockSize.x ) );
         gridSize.y = TNL::min( Devices::Cuda::getMaxGridSize(), Devices::Cuda::getNumberOfBlocks( sizeY, blockSize.y ) );
         gridSize.z = TNL::min( Devices::Cuda::getMaxGridSize(), Devices::Cuda::getNumberOfBlocks( sizeZ, blockSize.z ) );
         dim3 gridCount;
         gridCount.x = Devices::Cuda::getNumberOfGrids( sizeX );
         gridCount.y = Devices::Cuda::getNumberOfGrids( sizeY );
         gridCount.z = Devices::Cuda::getNumberOfGrids( sizeZ );

         if( gridCount.x == 1 && gridCount.y == 1 && gridCount.z == 1 )
            ParallelFor3DKernel< false, false, false ><<< gridSize, blockSize >>>
               ( startX, startY, startZ, endX, endY, endZ, f, args... );
         else if( gridCount.x == 1 && gridCount.y == 1 && gridCount.z > 1 )
            ParallelFor3DKernel< false, false, true ><<< gridSize, blockSize >>>
               ( startX, startY, startZ, endX, endY, endZ, f, args... );
         else if( gridCount.x == 1 && gridCount.y > 1 && gridCount.z == 1 )
            ParallelFor3DKernel< false, true, false ><<< gridSize, blockSize >>>
               ( startX, startY, startZ, endX, endY, endZ, f, args... );
         else if( gridCount.x > 1 && gridCount.y == 1 && gridCount.z == 1 )
            ParallelFor3DKernel< true, false, false ><<< gridSize, blockSize >>>
               ( startX, startY, startZ, endX, endY, endZ, f, args... );
         else if( gridCount.x == 1 && gridCount.y > 1 && gridCount.z > 1 )
            ParallelFor3DKernel< false, true, true ><<< gridSize, blockSize >>>
               ( startX, startY, startZ, endX, endY, endZ, f, args... );
         else if( gridCount.x > 1 && gridCount.y > 1 && gridCount.z == 1 )
            ParallelFor3DKernel< true, true, false ><<< gridSize, blockSize >>>
               ( startX, startY, startZ, endX, endY, endZ, f, args... );
         else if( gridCount.x > 1 && gridCount.y == 1 && gridCount.z > 1 )
            ParallelFor3DKernel< true, false, true ><<< gridSize, blockSize >>>
               ( startX, startY, startZ, endX, endY, endZ, f, args... );
         else
            ParallelFor3DKernel< true, true, true ><<< gridSize, blockSize >>>
               ( startX, startY, startZ, endX, endY, endZ, f, args... );
         cudaDeviceSynchronize();
         TNL_CHECK_CUDA_DEVICE;
      }
#else
      throw Exceptions::CudaSupportMissing();
#endif
   }
};
} // namespace TNL