Newer
Older
/***************************************************************************
-------------------
begin : Jan 2, 2016
copyright : (C) 2016 by Tomas Oberhuber
email : tomas.oberhuber@fjfi.cvut.cz
***************************************************************************/
/* See Copyright Notice in tnl/Copyright */
//#define GRID_TRAVERSER_USE_STREAMS
Tomáš Oberhuber
committed
#include "GridTraverser.h"
#include <TNL/Exceptions/CudaSupportMissing.h>
template< typename Real,
typename Index >
template<
typename GridEntity,
typename EntitiesProcessor,
typename UserData,
bool processOnlyBoundaryEntities >
void
GridTraverser< Meshes::Grid< 1, Real, Devices::Host, Index > >::
const GridPointer& gridPointer,
const CoordinatesType begin,
const CoordinatesType end,
Tomáš Oberhuber
committed
UserData& userData,
const int& stream )
GridEntity entity( *gridPointer );
GridEntity entity( *gridPointer );
entity.getCoordinates() = begin;
entity.refresh();
Tomáš Oberhuber
committed
EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
entity.getCoordinates() = end;
entity.refresh();
Tomáš Oberhuber
committed
EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
#ifdef HAVE_OPENMP
if( Devices::Host::isOMPEnabled() && end.x() - begin.x() > 512 )
{
#pragma omp parallel firstprivate( begin, end )
{
Tomáš Oberhuber
committed
GridEntity entity( *gridPointer );
#pragma omp for
// TODO: g++ 5.5 crashes when coding this loop without auxiliary x as bellow
for( IndexType x = begin.x(); x <= end.x(); x++ )
{
entity.getCoordinates().x() = x;
entity.refresh();
EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
}
}
}
else
{
GridEntity entity( *gridPointer );
Tomáš Oberhuber
committed
for( entity.getCoordinates().x() = begin.x();
entity.getCoordinates().x() <= end.x();
entity.getCoordinates().x() ++ )
{
entity.refresh();
EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
}
}
#else
GridEntity entity( *gridPointer );
Tomáš Oberhuber
committed
for( entity.getCoordinates().x() = begin.x();
entity.getCoordinates().x() <= end.x();
entity.getCoordinates().x() ++ )
{
entity.refresh();
EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
}
#endif
}
}
/****
* 1D traverser, CUDA
*/
#ifdef HAVE_CUDA
template< typename Real,
typename Index,
typename GridEntity,
typename UserData,
GridTraverser1D(
const Meshes::Grid< 1, Real, Devices::Cuda, Index >* grid,
Tomáš Oberhuber
committed
UserData userData,
const typename GridEntity::CoordinatesType begin,
const typename GridEntity::CoordinatesType end,
const Index gridIdx )
{
typedef Real RealType;
typedef Index IndexType;
typedef Meshes::Grid< 1, Real, Devices::Cuda, Index > GridType;
typename GridType::CoordinatesType coordinates;
coordinates.x() = begin.x() + ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
if( coordinates <= end )
GridEntity entity( *grid, coordinates );
Tomáš Oberhuber
committed
EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
template< typename Real,
typename Index,
typename GridEntity,
typename UserData,
typename EntitiesProcessor >
GridBoundaryTraverser1D(
const Meshes::Grid< 1, Real, Devices::Cuda, Index >* grid,
Tomáš Oberhuber
committed
UserData userData,
const typename GridEntity::CoordinatesType begin,
const typename GridEntity::CoordinatesType end )
{
typedef Real RealType;
typedef Index IndexType;
typedef Meshes::Grid< 1, Real, Devices::Cuda, Index > GridType;
typename GridType::CoordinatesType coordinates;
coordinates.x() = begin.x();
GridEntity entity( *grid, coordinates );
Tomáš Oberhuber
committed
EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
coordinates.x() = end.x();
GridEntity entity( *grid, coordinates );
Tomáš Oberhuber
committed
EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
template< typename Real,
typename Index >
template<
typename GridEntity,
typename EntitiesProcessor,
typename UserData,
bool processOnlyBoundaryEntities >
void
GridTraverser< Meshes::Grid< 1, Real, Devices::Cuda, Index > >::
const GridPointer& gridPointer,
const CoordinatesType& begin,
const CoordinatesType& end,
Tomáš Oberhuber
committed
UserData& userData,
const int& stream )
auto& pool = CudaStreamPool::getInstance();
const cudaStream_t& s = pool.getStream( stream );
Devices::Cuda::synchronizeDevice();
if( processOnlyBoundaryEntities )
{
dim3 cudaBlockSize( 2 );
dim3 cudaBlocks( 1 );
GridBoundaryTraverser1D< Real, Index, GridEntity, UserData, EntitiesProcessor >
<<< cudaBlocks, cudaBlockSize, 0, s >>>
( &gridPointer.template getData< Devices::Cuda >(),
Tomáš Oberhuber
committed
userData,
}
else
{
dim3 cudaBlockSize( 256 );
dim3 cudaBlocks;
cudaBlocks.x = Devices::Cuda::getNumberOfBlocks( end.x() - begin.x() + 1, cudaBlockSize.x );
const IndexType cudaXGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks.x );
for( IndexType gridXIdx = 0; gridXIdx < cudaXGrids; gridXIdx ++ )
GridTraverser1D< Real, Index, GridEntity, UserData, EntitiesProcessor >
<<< cudaBlocks, cudaBlockSize, 0, s >>>
( &gridPointer.template getData< Devices::Cuda >(),
Tomáš Oberhuber
committed
userData,
// only launches into the stream 0 are synchronized
if( stream == 0 )
{
cudaStreamSynchronize( s );
TNL_CHECK_CUDA_DEVICE;
#else
throw Exceptions::CudaSupportMissing();
/****
* 1D traverser, MIC
*/
template< typename Real,
typename Index >
template<
typename GridEntity,
typename EntitiesProcessor,
typename UserData,
bool processOnlyBoundaryEntities >
void
GridTraverser< Meshes::Grid< 1, Real, Devices::MIC, Index > >::
processEntities(
const GridPointer& gridPointer,
const CoordinatesType& begin,
const CoordinatesType& end,
Tomáš Oberhuber
committed
UserData& userData,
const int& stream )
{
std::cout << "Not Implemented yet Grid Traverser <1, Real, Device::MIC>" << std::endl;
/*
auto& pool = CudaStreamPool::getInstance();
const cudaStream_t& s = pool.getStream( stream );
Devices::Cuda::synchronizeDevice();
if( processOnlyBoundaryEntities )
{
dim3 cudaBlockSize( 2 );
dim3 cudaBlocks( 1 );
GridBoundaryTraverser1D< Real, Index, GridEntity, UserData, EntitiesProcessor >
<<< cudaBlocks, cudaBlockSize, 0, s >>>
( &gridPointer.template getData< Devices::Cuda >(),
Tomáš Oberhuber
committed
userData,
begin,
end );
}
else
{
dim3 cudaBlockSize( 256 );
dim3 cudaBlocks;
cudaBlocks.x = Devices::Cuda::getNumberOfBlocks( end.x() - begin.x() + 1, cudaBlockSize.x );
const IndexType cudaXGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks.x );
for( IndexType gridXIdx = 0; gridXIdx < cudaXGrids; gridXIdx ++ )
GridTraverser1D< Real, Index, GridEntity, UserData, EntitiesProcessor >
<<< cudaBlocks, cudaBlockSize, 0, s >>>
( &gridPointer.template getData< Devices::Cuda >(),
Tomáš Oberhuber
committed
userData,
begin,
end,
gridXIdx );
}
// only launches into the stream 0 are synchronized
if( stream == 0 )
{
cudaStreamSynchronize( s );
/****
* 2D traverser, host
*/
template< typename Real,
typename Index >
template<
typename GridEntity,
typename EntitiesProcessor,
typename UserData,
bool processOnlyBoundaryEntities,
int XOrthogonalBoundary,
int YOrthogonalBoundary,
typename... GridEntityParameters >
GridTraverser< Meshes::Grid< 2, Real, Devices::Host, Index > >::
const GridPointer& gridPointer,
const CoordinatesType begin,
const CoordinatesType end,
Tomáš Oberhuber
committed
UserData& userData,
const int& stream,
const GridEntityParameters&... gridEntityParameters )
{
if( processOnlyBoundaryEntities )
{
GridEntity entity( *gridPointer, begin, gridEntityParameters... );
if( YOrthogonalBoundary )
for( entity.getCoordinates().x() = begin.x();
entity.getCoordinates().x() <= end.x();
entity.getCoordinates().x() ++ )
{
entity.getCoordinates().y() = begin.y();
entity.refresh();
Tomáš Oberhuber
committed
EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
entity.getCoordinates().y() = end.y();
entity.refresh();
Tomáš Oberhuber
committed
EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
}
if( XOrthogonalBoundary )
for( entity.getCoordinates().y() = begin.y();
entity.getCoordinates().y() <= end.y();
entity.getCoordinates().y() ++ )
{
entity.getCoordinates().x() = begin.x();
entity.refresh();
Tomáš Oberhuber
committed
EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
entity.getCoordinates().x() = end.x();
entity.refresh();
Tomáš Oberhuber
committed
EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
#ifdef HAVE_OPENMP
Tomáš Oberhuber
committed
if( Devices::Host::isOMPEnabled() )
Tomáš Oberhuber
committed
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
#pragma omp parallel firstprivate( begin, end )
{
GridEntity entity( *gridPointer );
#pragma omp for
// TODO: g++ 5.5 crashes when coding this loop without auxiliary x and y as bellow
for( IndexType y = begin.y(); y <= end.y(); y ++ )
for( IndexType x = begin.x(); x <= end.x(); x ++ )
{
entity.getCoordinates().x() = x;
entity.getCoordinates().y() = y;
entity.refresh();
EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
}
}
}
else
{
GridEntity entity( *gridPointer );
for( entity.getCoordinates().y() = begin.y();
entity.getCoordinates().y() <= end.y();
entity.getCoordinates().y() ++ )
for( entity.getCoordinates().x() = begin.x();
entity.getCoordinates().x() <= end.x();
entity.getCoordinates().x() ++ )
{
entity.refresh();
EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
}
Tomáš Oberhuber
committed
#else
GridEntity entity( *gridPointer );
for( entity.getCoordinates().y() = begin.y();
entity.getCoordinates().y() <= end.y();
entity.getCoordinates().y() ++ )
for( entity.getCoordinates().x() = begin.x();
entity.getCoordinates().x() <= end.x();
entity.getCoordinates().x() ++ )
{
entity.refresh();
EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
}
#endif
}
}
/****
* 2D traverser, CUDA
*/
Tomáš Oberhuber
committed
#ifdef HAVE_CUDA
template< typename Real,
typename Index,
typename GridEntity,
typename UserData,
typename EntitiesProcessor,
bool processOnlyBoundaryEntities,
typename... GridEntityParameters >
GridTraverser2D(
const Meshes::Grid< 2, Real, Devices::Cuda, Index >* grid,
Tomáš Oberhuber
committed
UserData userData,
const typename GridEntity::CoordinatesType begin,
const typename GridEntity::CoordinatesType end,
const GridEntityParameters... gridEntityParameters )
typedef Meshes::Grid< 2, Real, Devices::Cuda, Index > GridType;
typename GridType::CoordinatesType coordinates;
coordinates.x() = begin.x() + Devices::Cuda::getGlobalThreadIdx_x( gridIdx );
coordinates.y() = begin.y() + Devices::Cuda::getGlobalThreadIdx_y( gridIdx );
if( coordinates <= end )
{
GridEntity entity( *grid, coordinates, gridEntityParameters... );
entity.refresh();
if( ! processOnlyBoundaryEntities || entity.isBoundaryEntity() )
{
EntitiesProcessor::processEntity
( *grid,
Tomáš Oberhuber
committed
userData,
Tomáš Oberhuber
committed
// Boundary traverser using streams
template< typename Real,
typename Index,
typename GridEntity,
typename UserData,
typename EntitiesProcessor,
bool processOnlyBoundaryEntities,
typename... GridEntityParameters >
__global__ void
GridTraverser2DBoundaryAlongX(
const Meshes::Grid< 2, Real, Devices::Cuda, Index >* grid,
UserData userData,
const Index beginX,
const Index endX,
const Index fixedY,
const dim3 gridIdx,
const GridEntityParameters... gridEntityParameters )
{
typedef Meshes::Grid< 2, Real, Devices::Cuda, Index > GridType;
typename GridType::CoordinatesType coordinates;
coordinates.x() = beginX + Devices::Cuda::getGlobalThreadIdx_x( gridIdx );
Tomáš Oberhuber
committed
coordinates.y() = fixedY;
Tomáš Oberhuber
committed
if( coordinates.x() <= endX )
{
GridEntity entity( *grid, coordinates, gridEntityParameters... );
entity.refresh();
EntitiesProcessor::processEntity
( *grid,
userData,
entity );
Tomáš Oberhuber
committed
}
Tomáš Oberhuber
committed
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
}
// Boundary traverser using streams
template< typename Real,
typename Index,
typename GridEntity,
typename UserData,
typename EntitiesProcessor,
bool processOnlyBoundaryEntities,
typename... GridEntityParameters >
__global__ void
GridTraverser2DBoundaryAlongY(
const Meshes::Grid< 2, Real, Devices::Cuda, Index >* grid,
UserData userData,
const Index beginY,
const Index endY,
const Index fixedX,
const dim3 gridIdx,
const GridEntityParameters... gridEntityParameters )
{
typedef Meshes::Grid< 2, Real, Devices::Cuda, Index > GridType;
typename GridType::CoordinatesType coordinates;
coordinates.x() = fixedX;
coordinates.y() = beginY + Devices::Cuda::getGlobalThreadIdx_x( gridIdx );
if( coordinates.y() <= endY )
{
GridEntity entity( *grid, coordinates, gridEntityParameters... );
entity.refresh();
EntitiesProcessor::processEntity
( *grid,
userData,
entity );
}
}
template< typename Real,
typename Index,
typename GridEntity,
typename UserData,
typename EntitiesProcessor,
bool processOnlyBoundaryEntities,
typename... GridEntityParameters >
__global__ void
Tomáš Oberhuber
committed
GridTraverser2DBoundary(
const Meshes::Grid< 2, Real, Devices::Cuda, Index >* grid,
Tomáš Oberhuber
committed
UserData userData,
const Index beginX,
const Index endX,
const Index beginY,
const Index endY,
Tomáš Oberhuber
committed
const Index blocksPerFace,
const dim3 gridIdx,
const GridEntityParameters... gridEntityParameters )
{
Tomáš Oberhuber
committed
using GridType = Meshes::Grid< 2, Real, Devices::Cuda, Index >;
using CoordinatesType = typename GridType::CoordinatesType;
Tomáš Oberhuber
committed
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
const Index faceIdx = blockIdx.x / blocksPerFace;
const Index faceBlockIdx = blockIdx.x % blocksPerFace;
const Index threadId = faceBlockIdx * blockDim. x + threadIdx.x;
if( faceIdx < 2 )
{
const Index entitiesAlongX = endX - beginX + 1;
if( threadId < entitiesAlongX )
{
GridEntity entity( *grid,
CoordinatesType( beginX + threadId, faceIdx == 0 ? beginY : endY ),
gridEntityParameters... );
//printf( "faceIdx %d Thread %d -> %d %d \n ", faceIdx, threadId, entity.getCoordinates().x(), entity.getCoordinates().y() );
entity.refresh();
EntitiesProcessor::processEntity( *grid, userData, entity );
}
}
else
{
const Index entitiesAlongY = endY - beginY - 1;
if( threadId < entitiesAlongY )
{
GridEntity entity( *grid,
CoordinatesType( faceIdx == 2 ? beginX : endX, beginY + threadId + 1 ),
gridEntityParameters... );
//printf( "faceIdx %d Thread %d -> %d %d \n ", faceIdx, threadId, entity.getCoordinates().x(), entity.getCoordinates().y() );
entity.refresh();
EntitiesProcessor::processEntity( *grid, userData, entity );
}
}
/*const Index aux = max( entitiesAlongX, entitiesAlongY );
const Index& warpSize = Devices::Cuda::getWarpSize();
const Index threadsPerAxis = warpSize * ( aux / warpSize + ( aux % warpSize != 0 ) );
Tomáš Oberhuber
committed
Index threadId = Devices::Cuda::getGlobalThreadIdx_x( gridIdx );
Tomáš Oberhuber
committed
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
GridEntity entity( *grid,
CoordinatesType( 0, 0 ),
gridEntityParameters... );
CoordinatesType& coordinates = entity.getCoordinates();
const Index axisIndex = threadId / threadsPerAxis;
//printf( "axisIndex %d, threadId %d thradsPerAxis %d \n", axisIndex, threadId, threadsPerAxis );
threadId -= axisIndex * threadsPerAxis;
switch( axisIndex )
{
case 1:
coordinates = CoordinatesType( beginX + threadId, beginY );
if( threadId < entitiesAlongX )
{
//printf( "X1: Thread %d -> %d %d \n ", threadId, coordinates.x(), coordinates.y() );
entity.refresh();
EntitiesProcessor::processEntity( *grid, userData, entity );
}
break;
case 2:
coordinates = CoordinatesType( beginX + threadId, endY );
if( threadId < entitiesAlongX )
{
//printf( "X2: Thread %d -> %d %d \n ", threadId, coordinates.x(), coordinates.y() );
entity.refresh();
EntitiesProcessor::processEntity( *grid, userData, entity );
}
break;
case 3:
coordinates = CoordinatesType( beginX, beginY + threadId + 1 );
if( threadId < entitiesAlongY )
{
//printf( "Y1: Thread %d -> %d %d \n ", threadId, coordinates.x(), coordinates.y() );
entity.refresh();
EntitiesProcessor::processEntity( *grid, userData, entity );
}
break;
case 4:
coordinates = CoordinatesType( endX, beginY + threadId + 1 );
if( threadId < entitiesAlongY )
{
//printf( "Y2: Thread %d -> %d %d \n ", threadId, coordinates.x(), coordinates.y() );
entity.refresh();
EntitiesProcessor::processEntity( *grid, userData, entity );
}
break;
}*/
/*if( threadId < entitiesAlongX )
Tomáš Oberhuber
committed
GridEntity entity( *grid,
CoordinatesType( beginX + threadId, beginY ),
gridEntityParameters... );
//printf( "X1: Thread %d -> %d %d x %d %d \n ", threadId,
// entity.getCoordinates().x(), entity.getCoordinates().y(),
// grid->getDimensions().x(), grid->getDimensions().y() );
entity.refresh();
EntitiesProcessor::processEntity( *grid, userData, entity );
}
else if( ( threadId -= entitiesAlongX ) < entitiesAlongX && threadId >= 0 )
{
GridEntity entity( *grid,
CoordinatesType( beginX + threadId, endY ),
gridEntityParameters... );
Tomáš Oberhuber
committed
//printf( "X2: Thread %d -> %d %d \n ", threadId, entity.getCoordinates().x(), entity.getCoordinates().y() );
EntitiesProcessor::processEntity( *grid, userData, entity );
}
else if( ( ( threadId -= entitiesAlongX ) < entitiesAlongY - 1 ) && threadId >= 0 )
{
GridEntity entity( *grid,
CoordinatesType( beginX, beginY + threadId + 1 ),
gridEntityParameters... );
entity.refresh();
//printf( "Y1: Thread %d -> %d %d \n ", threadId, entity.getCoordinates().x(), entity.getCoordinates().y() );
EntitiesProcessor::processEntity( *grid, userData, entity );
}
else if( ( ( threadId -= entitiesAlongY - 1 ) < entitiesAlongY - 1 ) && threadId >= 0 )
{
GridEntity entity( *grid,
CoordinatesType( endX, beginY + threadId + 1 ),
gridEntityParameters... );
entity.refresh();
//printf( "Y2: Thread %d -> %d %d \n ", threadId, entity.getCoordinates().x(), entity.getCoordinates().y() );
EntitiesProcessor::processEntity( *grid, userData, entity );
Tomáš Oberhuber
committed
}*/
Tomáš Oberhuber
committed
#endif // HAVE_CUDA
template< typename Real,
typename Index >
template<
typename GridEntity,
typename EntitiesProcessor,
typename UserData,
bool processOnlyBoundaryEntities,
int XOrthogonalBoundary,
int YOrthogonalBoundary,
typename... GridEntityParameters >
GridTraverser< Meshes::Grid< 2, Real, Devices::Cuda, Index > >::
const GridPointer& gridPointer,
const CoordinatesType& begin,
const CoordinatesType& end,
Tomáš Oberhuber
committed
UserData& userData,
const int& stream,
const GridEntityParameters&... gridEntityParameters )
#ifdef HAVE_CUDA
if( processOnlyBoundaryEntities &&
( GridEntity::getEntityDimension() == 2 || GridEntity::getEntityDimension() == 0 ) )
Tomáš Oberhuber
committed
#ifdef GRID_TRAVERSER_USE_STREAMS
Tomáš Oberhuber
committed
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
dim3 cudaBlockSize( 256 );
dim3 cudaBlocksCountAlongX, cudaGridsCountAlongX,
cudaBlocksCountAlongY, cudaGridsCountAlongY;
Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCountAlongX, cudaGridsCountAlongX, end.x() - begin.x() + 1 );
Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCountAlongY, cudaGridsCountAlongY, end.y() - begin.y() - 1 );
auto& pool = CudaStreamPool::getInstance();
Devices::Cuda::synchronizeDevice();
const cudaStream_t& s1 = pool.getStream( stream );
const cudaStream_t& s2 = pool.getStream( stream + 1 );
dim3 gridIdx, cudaGridSize;
for( gridIdx.x = 0; gridIdx.x < cudaGridsCountAlongX.x; gridIdx.x++ )
{
Devices::Cuda::setupGrid( cudaBlocksCountAlongX, cudaGridsCountAlongX, gridIdx, cudaGridSize );
//Devices::Cuda::printThreadsSetup( cudaBlockSize, cudaBlocksCountAlongX, cudaGridSize, cudaGridsCountAlongX );
GridTraverser2DBoundaryAlongX< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
<<< cudaGridSize, cudaBlockSize, 0, s1 >>>
( &gridPointer.template getData< Devices::Cuda >(),
userData,
begin.x(),
end.x(),
begin.y(),
gridIdx,
gridEntityParameters... );
GridTraverser2DBoundaryAlongX< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
<<< cudaGridSize, cudaBlockSize, 0, s2 >>>
( &gridPointer.template getData< Devices::Cuda >(),
userData,
begin.x(),
end.x(),
end.y(),
gridIdx,
gridEntityParameters... );
}
const cudaStream_t& s3 = pool.getStream( stream + 2 );
const cudaStream_t& s4 = pool.getStream( stream + 3 );
for( gridIdx.x = 0; gridIdx.x < cudaGridsCountAlongY.x; gridIdx.x++ )
{
Devices::Cuda::setupGrid( cudaBlocksCountAlongY, cudaGridsCountAlongY, gridIdx, cudaGridSize );
GridTraverser2DBoundaryAlongY< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
<<< cudaGridSize, cudaBlockSize, 0, s3 >>>
( &gridPointer.template getData< Devices::Cuda >(),
userData,
begin.y() + 1,
end.y() - 1,
begin.x(),
gridIdx,
gridEntityParameters... );
GridTraverser2DBoundaryAlongY< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
<<< cudaGridSize, cudaBlockSize, 0, s4 >>>
( &gridPointer.template getData< Devices::Cuda >(),
userData,
begin.y() + 1,
end.y() - 1,
end.x(),
gridIdx,
gridEntityParameters... );
}
cudaStreamSynchronize( s1 );
cudaStreamSynchronize( s2 );
cudaStreamSynchronize( s3 );
cudaStreamSynchronize( s4 );
#else // not defined GRID_TRAVERSER_USE_STREAMS
Tomáš Oberhuber
committed
dim3 cudaBlockSize( 256 );
dim3 cudaBlocksCount, cudaGridsCount;
Tomáš Oberhuber
committed
const IndexType entitiesAlongX = end.x() - begin.x() + 1;
const IndexType entitiesAlongY = end.x() - begin.x() - 1;
const IndexType maxFaceSize = max( entitiesAlongX, entitiesAlongY );
const IndexType blocksPerFace = maxFaceSize / cudaBlockSize.x + ( maxFaceSize % cudaBlockSize.x != 0 );
IndexType cudaThreadsCount = 4 * cudaBlockSize.x * blocksPerFace;
Tomáš Oberhuber
committed
Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCount, cudaGridsCount, cudaThreadsCount );
Tomáš Oberhuber
committed
//std::cerr << "blocksPerFace = " << blocksPerFace << "Threads count = " << cudaThreadsCount
// << "cudaBlockCount = " << cudaBlocksCount.x << std::endl;
dim3 gridIdx, cudaGridSize;
Tomáš Oberhuber
committed
Devices::Cuda::synchronizeDevice();
for( gridIdx.x = 0; gridIdx.x < cudaGridsCount.x; gridIdx.x++ )
Tomáš Oberhuber
committed
Devices::Cuda::setupGrid( cudaBlocksCount, cudaGridsCount, gridIdx, cudaGridSize );
//Devices::Cuda::printThreadsSetup( cudaBlockSize, cudaBlocksCountAlongX, cudaGridSize, cudaGridsCountAlongX );
Tomáš Oberhuber
committed
GridTraverser2DBoundary< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
<<< cudaGridSize, cudaBlockSize >>>
( &gridPointer.template getData< Devices::Cuda >(),
Tomáš Oberhuber
committed
userData,
begin.x(),
end.x(),
begin.y(),
end.y(),
Tomáš Oberhuber
committed
blocksPerFace,
gridIdx,
gridEntityParameters... );
Tomáš Oberhuber
committed
}
Tomáš Oberhuber
committed
#endif //GRID_TRAVERSER_USE_STREAMS
//getchar();
TNL_CHECK_CUDA_DEVICE;
}
else
{
dim3 cudaBlockSize( 16, 16 );
dim3 cudaBlocksCount, cudaGridsCount;
Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCount, cudaGridsCount,
end.x() - begin.x() + 1,
end.y() - begin.y() + 1 );
auto& pool = CudaStreamPool::getInstance();
const cudaStream_t& s = pool.getStream( stream );
dim3 gridIdx, cudaGridSize;
for( gridIdx.y = 0; gridIdx.y < cudaGridsCount.y; gridIdx.y ++ )
for( gridIdx.x = 0; gridIdx.x < cudaGridsCount.x; gridIdx.x ++ )
{
Devices::Cuda::setupGrid( cudaBlocksCount, cudaGridsCount, gridIdx, cudaGridSize );
//Devices::Cuda::printThreadsSetup( cudaBlockSize, cudaBlocksCount, cudaGridSize, cudaGridsCount );
GridTraverser2D< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
<<< cudaGridSize, cudaBlockSize, 0, s >>>
( &gridPointer.template getData< Devices::Cuda >(),
Tomáš Oberhuber
committed
userData,
Tomáš Oberhuber
committed
// only launches into the stream 0 are synchronized
if( stream == 0 )
{
cudaStreamSynchronize( s );
TNL_CHECK_CUDA_DEVICE;
#else
throw Exceptions::CudaSupportMissing();
Tomáš Oberhuber
committed
/****
* 2D traverser, MIC
*/
template< typename Real,
typename Index >
template<
typename GridEntity,
typename EntitiesProcessor,
typename UserData,
bool processOnlyBoundaryEntities,
int XOrthogonalBoundary,
int YOrthogonalBoundary,
typename... GridEntityParameters >
void
GridTraverser< Meshes::Grid< 2, Real, Devices::MIC, Index > >::
processEntities(
const GridPointer& gridPointer,
const CoordinatesType& begin,
const CoordinatesType& end,
Tomáš Oberhuber
committed
UserData& userData,
const int& stream,
const GridEntityParameters&... gridEntityParameters )
{
#ifdef HAVE_MIC
Devices::MIC::synchronizeDevice();
//TOHLE JE PRUSER -- nemim poslat vypustku --
//GridEntity entity( gridPointer.template getData< Devices::MIC >(), begin, gridEntityParameters... );
Devices::MICHider<const GridType> hMicGrid;
hMicGrid.pointer=& gridPointer.template getData< Devices::MIC >();
Devices::MICHider<UserData> hMicUserData;
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
hMicUserData.pointer=& userDataPointer.template modifyData<Devices::MIC>();
TNLMICSTRUCT(begin, const CoordinatesType);
TNLMICSTRUCT(end, const CoordinatesType);
#pragma offload target(mic) in(sbegin,send,hMicUserData,hMicGrid)
{
#pragma omp parallel firstprivate( sbegin, send )
{
TNLMICSTRUCTUSE(begin, const CoordinatesType);
TNLMICSTRUCTUSE(end, const CoordinatesType);
GridEntity entity( *(hMicGrid.pointer), *(kernelbegin) );
if( processOnlyBoundaryEntities )
{
if( YOrthogonalBoundary )
#pragma omp for
for( auto k = kernelbegin->x();
k <= kernelend->x();
k ++ )
{
entity.getCoordinates().x() = k;
entity.getCoordinates().y() = kernelbegin->y();
entity.refresh();
EntitiesProcessor::processEntity( entity.getMesh(), *(hMicUserData.pointer), entity );
entity.getCoordinates().y() = kernelend->y();
entity.refresh();
EntitiesProcessor::processEntity( entity.getMesh(), *(hMicUserData.pointer), entity );
}
if( XOrthogonalBoundary )
#pragma omp for
for( auto k = kernelbegin->y();
k <= kernelend->y();
k ++ )
{
entity.getCoordinates().y() = k;
entity.getCoordinates().x() = kernelbegin->x();
entity.refresh();
EntitiesProcessor::processEntity( entity.getMesh(), *(hMicUserData.pointer), entity );
entity.getCoordinates().x() = kernelend->x();
entity.refresh();
EntitiesProcessor::processEntity( entity.getMesh(), *(hMicUserData.pointer), entity );
}
}
else
{
#pragma omp for
for( IndexType y = kernelbegin->y(); y <= kernelend->y(); y ++ )
for( IndexType x = kernelbegin->x(); x <= kernelend->x(); x ++ )
{
// std::cerr << x << " " <<y << std::endl;
entity.getCoordinates().x() = x;
entity.getCoordinates().y() = y;
entity.refresh();
EntitiesProcessor::processEntity( entity.getMesh(), *(hMicUserData.pointer), entity );
}
}
}
}
#endif
}
template< typename Real,
typename Index >
template<
typename GridEntity,
typename EntitiesProcessor,
typename UserData,
bool processOnlyBoundaryEntities,
int XOrthogonalBoundary,
int YOrthogonalBoundary,
int ZOrthogonalBoundary,
typename... GridEntityParameters >
GridTraverser< Meshes::Grid< 3, Real, Devices::Host, Index > >::
const GridPointer& gridPointer,
const CoordinatesType begin,
const CoordinatesType end,
Tomáš Oberhuber
committed
UserData& userData,
const int& stream,
const GridEntityParameters&... gridEntityParameters )
{
if( processOnlyBoundaryEntities )
{
GridEntity entity( *gridPointer, begin, gridEntityParameters... );
if( ZOrthogonalBoundary )
for( entity.getCoordinates().y() = begin.y();
entity.getCoordinates().y() <= end.y();
entity.getCoordinates().y() ++ )
for( entity.getCoordinates().x() = begin.x();
entity.getCoordinates().x() <= end.x();
entity.getCoordinates().x() ++ )
{
entity.getCoordinates().z() = begin.z();
entity.refresh();
Tomáš Oberhuber
committed
EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
entity.getCoordinates().z() = end.z();
entity.refresh();
Tomáš Oberhuber
committed
EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
}
if( YOrthogonalBoundary )
for( entity.getCoordinates().z() = begin.z();
entity.getCoordinates().z() <= end.z();
entity.getCoordinates().z() ++ )
for( entity.getCoordinates().x() = begin.x();
entity.getCoordinates().x() <= end.x();
entity.getCoordinates().x() ++ )
{
entity.getCoordinates().y() = begin.y();
entity.refresh();
Tomáš Oberhuber
committed
EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
entity.getCoordinates().y() = end.y();
entity.refresh();
Tomáš Oberhuber
committed
EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
}
if( XOrthogonalBoundary )
for( entity.getCoordinates().z() = begin.z();
entity.getCoordinates().z() <= end.z();
entity.getCoordinates().z() ++ )
for( entity.getCoordinates().y() = begin.y();
entity.getCoordinates().y() <= end.y();
entity.getCoordinates().y() ++ )
{
entity.getCoordinates().x() = begin.x();
entity.refresh();
Tomáš Oberhuber
committed
EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
entity.getCoordinates().x() = end.x();
entity.refresh();
Tomáš Oberhuber
committed
EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );
Tomáš Oberhuber
committed
#ifdef HAVE_OPENMP
if( Devices::Host::isOMPEnabled() )
{
#pragma omp parallel firstprivate( begin, end )
{
GridEntity entity( *gridPointer );
#pragma omp for
// TODO: g++ 5.5 crashes when coding this loop without auxiliary x and y as bellow
for( IndexType z = begin.z(); z <= end.z(); z ++ )
for( IndexType y = begin.y(); y <= end.y(); y ++ )
for( IndexType x = begin.x(); x <= end.x(); x ++ )
{
entity.getCoordinates().x() = x;
entity.getCoordinates().y() = y;
entity.getCoordinates().z() = z;
entity.refresh();
EntitiesProcessor::processEntity( entity.getMesh(), userData, entity );