Loading src/TNL/Devices/Cuda.cu +101 −0 Original line number Diff line number Diff line Loading @@ -15,6 +15,101 @@ namespace TNL { namespace Devices { void Cuda::setupThreads( const dim3& blockSize, dim3& blocksCount, dim3& gridsCount, long long int xThreads, long long int yThreads, long long int zThreads ) { if( blockSize.x ) blocksCount.x = xThreads / blockSize.x + ( xThreads % blockSize.x != 0 ); else blocksCount.x = 0; if( blockSize.y ) blocksCount.y = yThreads / blockSize.y + ( yThreads % blockSize.y != 0 ); else blocksCount.y = 0; if( blockSize.z ) blocksCount.z = xThreads / blockSize.z + ( zThreads % blockSize.z != 0 ); else blocksCount.z = 0; /**** * TODO: Fix the following: * I do not known how to get max grid size in kernels :( * int currentDevice( 0 ); cudaGetDevice( currentDevice ); cudaDeviceProp properties; cudaGetDeviceProperties( &properties, currentDevice ); gridsCount.x = blocksCount.x / properties.maxGridSize[ 0 ] + ( blocksCount.x % properties.maxGridSize[ 0 ] != 0 ); gridsCount.y = blocksCount.y / properties.maxGridSize[ 1 ] + ( blocksCount.y % properties.maxGridSize[ 1 ] != 0 ); gridsCount.z = blocksCount.z / properties.maxGridSize[ 2 ] + ( blocksCount.z % properties.maxGridSize[ 2 ] != 0 ); */ gridsCount.x = blocksCount.x / getMaxGridSize() + ( blocksCount.x % getMaxGridSize() != 0 ); gridsCount.y = blocksCount.y / getMaxGridSize() + ( blocksCount.y % getMaxGridSize() != 0 ); gridsCount.z = blocksCount.z / getMaxGridSize() + ( blocksCount.z % getMaxGridSize() != 0 ); } void Cuda::setupGrid( const dim3& blocksCount, const dim3& gridsCount, const dim3& gridIdx, dim3& gridSize ) { int currentDevice( 0 ); cudaGetDevice( ¤tDevice ); cudaDeviceProp properties; cudaGetDeviceProperties( &properties, currentDevice ); /**** * TODO: fix the following if( gridIdx.x < gridsCount.x ) gridSize.x = properties.maxGridSize[ 0 ]; else gridSize.x = blocksCount.x % properties.maxGridSize[ 0 ]; if( gridIdx.y < gridsCount.y ) gridSize.y = properties.maxGridSize[ 1 ]; else gridSize.y = blocksCount.y % properties.maxGridSize[ 1 ]; if( gridIdx.z < gridsCount.z ) gridSize.z = properties.maxGridSize[ 2 ]; else gridSize.z = blocksCount.z % properties.maxGridSize[ 2 ];*/ if( gridIdx.x < gridsCount.x ) gridSize.x = getMaxGridSize(); else gridSize.x = blocksCount.x % getMaxGridSize(); if( gridIdx.y < gridsCount.y ) gridSize.y = getMaxGridSize(); else gridSize.y = blocksCount.y % getMaxGridSize(); if( gridIdx.z < gridsCount.z ) gridSize.z = getMaxGridSize(); else gridSize.z = blocksCount.z % getMaxGridSize(); } void Cuda::printThreadsSetup( const dim3& blockSize, const dim3& blocksCount, const dim3& gridSize, const dim3& gridsCount, std::ostream& str ) { /*str << "Block size: " << blockSize << std::endl << " Blocks count: " << blocksCount << std::endl << " Grid size: " << gridSize << std::endl << " Grids count: " << gridsCount << std::endl;*/ } bool Cuda::checkDevice( const char* file_name, int line, cudaError error ) { if( error == cudaSuccess ) Loading Loading @@ -397,5 +492,11 @@ bool Cuda::checkDevice( const char* file_name, int line, cudaError error ) return false; } std::ostream& operator << ( std::ostream& str, const dim3& d ) { str << "( " << d.x << ", " << d.y << ", " << d.z << " )"; return str; } } // namespace Devices } // namespace TNL src/TNL/Devices/Cuda.h +52 −1 Original line number Diff line number Diff line Loading @@ -49,17 +49,67 @@ class Cuda static inline constexpr int getGPUTransferBufferSize(); #ifdef HAVE_CUDA /*** * This function is obsolete and should be replaced by the following functions. */ __device__ static inline int getGlobalThreadIdx( const int gridIdx = 0, const int gridSize = getMaxGridSize() ); __device__ static inline int getGlobalThreadIdx_x( const dim3& gridIdx ); __device__ static inline int getGlobalThreadIdx_y( const dim3& gridIdx ); __device__ static inline int getGlobalThreadIdx_z( const dim3& gridIdx ); #endif /**** * This functions helps to count number of CUDA blocks depending on the * number of the CUDA threads and the block size. * It is obsolete and it will be replaced by setupThreads. */ static int getNumberOfBlocks( const int threads, const int blockSize ); /**** * This functions helps to count number of CUDA grids depending on the * number of the CUDA blocks and maximum grid size. * It is obsolete and it will be replaced by setupThreads. */ static int getNumberOfGrids( const int blocks, const int gridSize = getMaxGridSize() ); #ifdef HAVE_CUDA /*! This method sets up gridSize and computes number of grids depending * on total number of CUDA threads. */ static void setupThreads( const dim3& blockSize, dim3& blocksCount, dim3& gridsCount, long long int xThreads, long long int yThreads = 0, long long int zThreads = 0 ); /*! This method sets up grid size when one iterates over more grids. * If gridIdx.? < gridsCount.? then the gridSize.? is set to maximum * allowed by CUDA. Otherwise gridSize.? is set to the size of the grid * in the last loop i.e. blocksCount.? % maxGridSize.?. */ static void setupGrid( const dim3& blocksCount, const dim3& gridsCount, const dim3& gridIdx, dim3& gridSize ); static void printThreadsSetup( const dim3& blockSize, const dim3& blocksCount, const dim3& gridSize, const dim3& gridsCount, std::ostream& str = std::cout ); #endif template< typename ObjectType > static ObjectType* passToDevice( const ObjectType& object ); Loading Loading @@ -135,6 +185,7 @@ class Cuda #ifdef HAVE_CUDA #define checkCudaDevice ::TNL::Devices::Cuda::checkDevice( __FILE__, __LINE__, cudaGetLastError() ) std::ostream& operator << ( std::ostream& str, const dim3& d ); #else #define checkCudaDevice ::TNL::Devices::Cuda::checkDevice() #endif Loading src/TNL/Devices/Cuda_impl.h +16 −0 Original line number Diff line number Diff line Loading @@ -53,6 +53,22 @@ __device__ inline int Cuda::getGlobalThreadIdx( const int gridIdx, const int gri { return ( gridIdx * gridSize + blockIdx.x ) * blockDim.x + threadIdx.x; } __device__ inline int Cuda::getGlobalThreadIdx_x( const dim3& gridIdx ) { return ( gridIdx.x * getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; } __device__ inline int Cuda::getGlobalThreadIdx_y( const dim3& gridIdx ) { return ( gridIdx.y * getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y; } __device__ inline int Cuda::getGlobalThreadIdx_z( const dim3& gridIdx ) { return ( gridIdx.z * getMaxGridSize() + blockIdx.z ) * blockDim.z + threadIdx.z; } #endif Loading src/TNL/Meshes/GridDetails/GridTraverser_impl.h +20 −12 Original line number Diff line number Diff line Loading @@ -357,13 +357,13 @@ GridTraverser2DBoundaryAlongX( const Index beginX, const Index endX, const Index fixedY, const Index gridIdx, const dim3 gridIdx, const GridEntityParameters... gridEntityParameters ) { typedef Meshes::Grid< 2, Real, Devices::Cuda, Index > GridType; typename GridType::CoordinatesType coordinates; coordinates.x() = beginX + ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; coordinates.x() = beginX + Devices::Cuda::getGlobalThreadIdx_x( gridIdx ); coordinates.y() = fixedY; if( coordinates.x() <= endX ) Loading Loading @@ -391,14 +391,14 @@ GridTraverser2DBoundaryAlongY( const Index beginY, const Index endY, const Index fixedX, const Index gridIdx, const dim3 gridIdx, const GridEntityParameters... gridEntityParameters ) { typedef Meshes::Grid< 2, Real, Devices::Cuda, Index > GridType; typename GridType::CoordinatesType coordinates; coordinates.x() = fixedX; coordinates.y() = beginY + ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; coordinates.y() = beginY + Devices::Cuda::getGlobalThreadIdx_x( gridIdx ); if( coordinates.y() <= endY ) { Loading Loading @@ -438,23 +438,30 @@ processEntities( ( GridEntity::getDimensions() == 2 || GridEntity::getDimensions() == 0 ) ) { dim3 cudaBlockSize( 256 ); const IndexType entitiesAlongX = end.x() - begin.x() + 1; dim3 cudaBlocksCountAlongX, cudaGridsCountAlongX, cudaBlocksCountAlongY, cudaGridsCountAlongY; Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCountAlongX, cudaGridsCountAlongX, end.x() - begin.x() + 1 ); Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCountAlongY, cudaGridsCountAlongY, end.y() - begin.y() - 1 ); /*const IndexType entitiesAlongX = end.x() - begin.x() + 1; const IndexType entitiesAlongY = end.y() - begin.y() - 1; dim3 cudaBlocksAlongX, cudaBlocksAlongY; cudaBlocksAlongX.x = Devices::Cuda::getNumberOfBlocks( entitiesAlongX, cudaBlockSize.x ); cudaBlocksAlongY.x = Devices::Cuda::getNumberOfBlocks( entitiesAlongY, cudaBlockSize.x ); const IndexType cudaGridsAlongX = Devices::Cuda::getNumberOfGrids( cudaBlocksAlongX.x ); const IndexType cudaGridsAlongY = Devices::Cuda::getNumberOfGrids( cudaBlocksAlongY.x ); const IndexType cudaGridsAlongY = Devices::Cuda::getNumberOfGrids( cudaBlocksAlongY.x );*/ auto& pool = CudaStreamPool::getInstance(); Devices::Cuda::synchronizeDevice(); const cudaStream_t& s1 = pool.getStream( stream ); const cudaStream_t& s2 = pool.getStream( stream + 1 ); for( IndexType gridIdx = 0; gridIdx < cudaGridsAlongX; gridIdx++ ) dim3 gridIdx, cudaGridSize; for( gridIdx.x = 0; gridIdx.x < cudaGridsCountAlongX.x; gridIdx.x++ ) { Devices::Cuda::setupGrid( cudaBlocksCountAlongX, cudaGridsCountAlongX, gridIdx, cudaGridSize ); GridTraverser2DBoundaryAlongX< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... > <<< cudaBlocksAlongX, cudaBlockSize, 0, s1 >>> <<< cudaGridSize, cudaBlockSize, 0, s1 >>> ( &gridPointer.template getData< Devices::Cuda >(), &userDataPointer.template modifyData< Devices::Cuda >(), begin.x(), Loading @@ -463,7 +470,7 @@ processEntities( gridIdx, gridEntityParameters... ); GridTraverser2DBoundaryAlongX< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... > <<< cudaBlocksAlongX, cudaBlockSize, 0, s2 >>> <<< cudaGridSize, cudaBlockSize, 0, s2 >>> ( &gridPointer.template getData< Devices::Cuda >(), &userDataPointer.template modifyData< Devices::Cuda >(), begin.x(), Loading @@ -474,10 +481,11 @@ processEntities( } const cudaStream_t& s3 = pool.getStream( stream + 2 ); const cudaStream_t& s4 = pool.getStream( stream + 3 ); for( IndexType gridIdx = 0; gridIdx < cudaGridsAlongX; gridIdx++ ) for( gridIdx.x = 0; gridIdx.x < cudaGridsCountAlongY.x; gridIdx.x++ ) { Devices::Cuda::setupGrid( cudaBlocksCountAlongY, cudaGridsCountAlongY, gridIdx, cudaGridSize ); GridTraverser2DBoundaryAlongY< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... > <<< cudaBlocksAlongY, cudaBlockSize, 0, s3 >>> <<< cudaGridSize, cudaBlockSize, 0, s3 >>> ( &gridPointer.template getData< Devices::Cuda >(), &userDataPointer.template modifyData< Devices::Cuda >(), begin.y() + 1, Loading @@ -486,7 +494,7 @@ processEntities( gridIdx, gridEntityParameters... ); GridTraverser2DBoundaryAlongY< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... > <<< cudaBlocksAlongY, cudaBlockSize, 0, s4 >>> <<< cudaGridSize, cudaBlockSize, 0, s4 >>> ( &gridPointer.template getData< Devices::Cuda >(), &userDataPointer.template modifyData< Devices::Cuda >(), begin.y() + 1, Loading Loading
src/TNL/Devices/Cuda.cu +101 −0 Original line number Diff line number Diff line Loading @@ -15,6 +15,101 @@ namespace TNL { namespace Devices { void Cuda::setupThreads( const dim3& blockSize, dim3& blocksCount, dim3& gridsCount, long long int xThreads, long long int yThreads, long long int zThreads ) { if( blockSize.x ) blocksCount.x = xThreads / blockSize.x + ( xThreads % blockSize.x != 0 ); else blocksCount.x = 0; if( blockSize.y ) blocksCount.y = yThreads / blockSize.y + ( yThreads % blockSize.y != 0 ); else blocksCount.y = 0; if( blockSize.z ) blocksCount.z = xThreads / blockSize.z + ( zThreads % blockSize.z != 0 ); else blocksCount.z = 0; /**** * TODO: Fix the following: * I do not known how to get max grid size in kernels :( * int currentDevice( 0 ); cudaGetDevice( currentDevice ); cudaDeviceProp properties; cudaGetDeviceProperties( &properties, currentDevice ); gridsCount.x = blocksCount.x / properties.maxGridSize[ 0 ] + ( blocksCount.x % properties.maxGridSize[ 0 ] != 0 ); gridsCount.y = blocksCount.y / properties.maxGridSize[ 1 ] + ( blocksCount.y % properties.maxGridSize[ 1 ] != 0 ); gridsCount.z = blocksCount.z / properties.maxGridSize[ 2 ] + ( blocksCount.z % properties.maxGridSize[ 2 ] != 0 ); */ gridsCount.x = blocksCount.x / getMaxGridSize() + ( blocksCount.x % getMaxGridSize() != 0 ); gridsCount.y = blocksCount.y / getMaxGridSize() + ( blocksCount.y % getMaxGridSize() != 0 ); gridsCount.z = blocksCount.z / getMaxGridSize() + ( blocksCount.z % getMaxGridSize() != 0 ); } void Cuda::setupGrid( const dim3& blocksCount, const dim3& gridsCount, const dim3& gridIdx, dim3& gridSize ) { int currentDevice( 0 ); cudaGetDevice( ¤tDevice ); cudaDeviceProp properties; cudaGetDeviceProperties( &properties, currentDevice ); /**** * TODO: fix the following if( gridIdx.x < gridsCount.x ) gridSize.x = properties.maxGridSize[ 0 ]; else gridSize.x = blocksCount.x % properties.maxGridSize[ 0 ]; if( gridIdx.y < gridsCount.y ) gridSize.y = properties.maxGridSize[ 1 ]; else gridSize.y = blocksCount.y % properties.maxGridSize[ 1 ]; if( gridIdx.z < gridsCount.z ) gridSize.z = properties.maxGridSize[ 2 ]; else gridSize.z = blocksCount.z % properties.maxGridSize[ 2 ];*/ if( gridIdx.x < gridsCount.x ) gridSize.x = getMaxGridSize(); else gridSize.x = blocksCount.x % getMaxGridSize(); if( gridIdx.y < gridsCount.y ) gridSize.y = getMaxGridSize(); else gridSize.y = blocksCount.y % getMaxGridSize(); if( gridIdx.z < gridsCount.z ) gridSize.z = getMaxGridSize(); else gridSize.z = blocksCount.z % getMaxGridSize(); } void Cuda::printThreadsSetup( const dim3& blockSize, const dim3& blocksCount, const dim3& gridSize, const dim3& gridsCount, std::ostream& str ) { /*str << "Block size: " << blockSize << std::endl << " Blocks count: " << blocksCount << std::endl << " Grid size: " << gridSize << std::endl << " Grids count: " << gridsCount << std::endl;*/ } bool Cuda::checkDevice( const char* file_name, int line, cudaError error ) { if( error == cudaSuccess ) Loading Loading @@ -397,5 +492,11 @@ bool Cuda::checkDevice( const char* file_name, int line, cudaError error ) return false; } std::ostream& operator << ( std::ostream& str, const dim3& d ) { str << "( " << d.x << ", " << d.y << ", " << d.z << " )"; return str; } } // namespace Devices } // namespace TNL
src/TNL/Devices/Cuda.h +52 −1 Original line number Diff line number Diff line Loading @@ -49,17 +49,67 @@ class Cuda static inline constexpr int getGPUTransferBufferSize(); #ifdef HAVE_CUDA /*** * This function is obsolete and should be replaced by the following functions. */ __device__ static inline int getGlobalThreadIdx( const int gridIdx = 0, const int gridSize = getMaxGridSize() ); __device__ static inline int getGlobalThreadIdx_x( const dim3& gridIdx ); __device__ static inline int getGlobalThreadIdx_y( const dim3& gridIdx ); __device__ static inline int getGlobalThreadIdx_z( const dim3& gridIdx ); #endif /**** * This functions helps to count number of CUDA blocks depending on the * number of the CUDA threads and the block size. * It is obsolete and it will be replaced by setupThreads. */ static int getNumberOfBlocks( const int threads, const int blockSize ); /**** * This functions helps to count number of CUDA grids depending on the * number of the CUDA blocks and maximum grid size. * It is obsolete and it will be replaced by setupThreads. */ static int getNumberOfGrids( const int blocks, const int gridSize = getMaxGridSize() ); #ifdef HAVE_CUDA /*! This method sets up gridSize and computes number of grids depending * on total number of CUDA threads. */ static void setupThreads( const dim3& blockSize, dim3& blocksCount, dim3& gridsCount, long long int xThreads, long long int yThreads = 0, long long int zThreads = 0 ); /*! This method sets up grid size when one iterates over more grids. * If gridIdx.? < gridsCount.? then the gridSize.? is set to maximum * allowed by CUDA. Otherwise gridSize.? is set to the size of the grid * in the last loop i.e. blocksCount.? % maxGridSize.?. */ static void setupGrid( const dim3& blocksCount, const dim3& gridsCount, const dim3& gridIdx, dim3& gridSize ); static void printThreadsSetup( const dim3& blockSize, const dim3& blocksCount, const dim3& gridSize, const dim3& gridsCount, std::ostream& str = std::cout ); #endif template< typename ObjectType > static ObjectType* passToDevice( const ObjectType& object ); Loading Loading @@ -135,6 +185,7 @@ class Cuda #ifdef HAVE_CUDA #define checkCudaDevice ::TNL::Devices::Cuda::checkDevice( __FILE__, __LINE__, cudaGetLastError() ) std::ostream& operator << ( std::ostream& str, const dim3& d ); #else #define checkCudaDevice ::TNL::Devices::Cuda::checkDevice() #endif Loading
src/TNL/Devices/Cuda_impl.h +16 −0 Original line number Diff line number Diff line Loading @@ -53,6 +53,22 @@ __device__ inline int Cuda::getGlobalThreadIdx( const int gridIdx, const int gri { return ( gridIdx * gridSize + blockIdx.x ) * blockDim.x + threadIdx.x; } __device__ inline int Cuda::getGlobalThreadIdx_x( const dim3& gridIdx ) { return ( gridIdx.x * getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; } __device__ inline int Cuda::getGlobalThreadIdx_y( const dim3& gridIdx ) { return ( gridIdx.y * getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y; } __device__ inline int Cuda::getGlobalThreadIdx_z( const dim3& gridIdx ) { return ( gridIdx.z * getMaxGridSize() + blockIdx.z ) * blockDim.z + threadIdx.z; } #endif Loading
src/TNL/Meshes/GridDetails/GridTraverser_impl.h +20 −12 Original line number Diff line number Diff line Loading @@ -357,13 +357,13 @@ GridTraverser2DBoundaryAlongX( const Index beginX, const Index endX, const Index fixedY, const Index gridIdx, const dim3 gridIdx, const GridEntityParameters... gridEntityParameters ) { typedef Meshes::Grid< 2, Real, Devices::Cuda, Index > GridType; typename GridType::CoordinatesType coordinates; coordinates.x() = beginX + ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; coordinates.x() = beginX + Devices::Cuda::getGlobalThreadIdx_x( gridIdx ); coordinates.y() = fixedY; if( coordinates.x() <= endX ) Loading Loading @@ -391,14 +391,14 @@ GridTraverser2DBoundaryAlongY( const Index beginY, const Index endY, const Index fixedX, const Index gridIdx, const dim3 gridIdx, const GridEntityParameters... gridEntityParameters ) { typedef Meshes::Grid< 2, Real, Devices::Cuda, Index > GridType; typename GridType::CoordinatesType coordinates; coordinates.x() = fixedX; coordinates.y() = beginY + ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; coordinates.y() = beginY + Devices::Cuda::getGlobalThreadIdx_x( gridIdx ); if( coordinates.y() <= endY ) { Loading Loading @@ -438,23 +438,30 @@ processEntities( ( GridEntity::getDimensions() == 2 || GridEntity::getDimensions() == 0 ) ) { dim3 cudaBlockSize( 256 ); const IndexType entitiesAlongX = end.x() - begin.x() + 1; dim3 cudaBlocksCountAlongX, cudaGridsCountAlongX, cudaBlocksCountAlongY, cudaGridsCountAlongY; Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCountAlongX, cudaGridsCountAlongX, end.x() - begin.x() + 1 ); Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCountAlongY, cudaGridsCountAlongY, end.y() - begin.y() - 1 ); /*const IndexType entitiesAlongX = end.x() - begin.x() + 1; const IndexType entitiesAlongY = end.y() - begin.y() - 1; dim3 cudaBlocksAlongX, cudaBlocksAlongY; cudaBlocksAlongX.x = Devices::Cuda::getNumberOfBlocks( entitiesAlongX, cudaBlockSize.x ); cudaBlocksAlongY.x = Devices::Cuda::getNumberOfBlocks( entitiesAlongY, cudaBlockSize.x ); const IndexType cudaGridsAlongX = Devices::Cuda::getNumberOfGrids( cudaBlocksAlongX.x ); const IndexType cudaGridsAlongY = Devices::Cuda::getNumberOfGrids( cudaBlocksAlongY.x ); const IndexType cudaGridsAlongY = Devices::Cuda::getNumberOfGrids( cudaBlocksAlongY.x );*/ auto& pool = CudaStreamPool::getInstance(); Devices::Cuda::synchronizeDevice(); const cudaStream_t& s1 = pool.getStream( stream ); const cudaStream_t& s2 = pool.getStream( stream + 1 ); for( IndexType gridIdx = 0; gridIdx < cudaGridsAlongX; gridIdx++ ) dim3 gridIdx, cudaGridSize; for( gridIdx.x = 0; gridIdx.x < cudaGridsCountAlongX.x; gridIdx.x++ ) { Devices::Cuda::setupGrid( cudaBlocksCountAlongX, cudaGridsCountAlongX, gridIdx, cudaGridSize ); GridTraverser2DBoundaryAlongX< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... > <<< cudaBlocksAlongX, cudaBlockSize, 0, s1 >>> <<< cudaGridSize, cudaBlockSize, 0, s1 >>> ( &gridPointer.template getData< Devices::Cuda >(), &userDataPointer.template modifyData< Devices::Cuda >(), begin.x(), Loading @@ -463,7 +470,7 @@ processEntities( gridIdx, gridEntityParameters... ); GridTraverser2DBoundaryAlongX< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... > <<< cudaBlocksAlongX, cudaBlockSize, 0, s2 >>> <<< cudaGridSize, cudaBlockSize, 0, s2 >>> ( &gridPointer.template getData< Devices::Cuda >(), &userDataPointer.template modifyData< Devices::Cuda >(), begin.x(), Loading @@ -474,10 +481,11 @@ processEntities( } const cudaStream_t& s3 = pool.getStream( stream + 2 ); const cudaStream_t& s4 = pool.getStream( stream + 3 ); for( IndexType gridIdx = 0; gridIdx < cudaGridsAlongX; gridIdx++ ) for( gridIdx.x = 0; gridIdx.x < cudaGridsCountAlongY.x; gridIdx.x++ ) { Devices::Cuda::setupGrid( cudaBlocksCountAlongY, cudaGridsCountAlongY, gridIdx, cudaGridSize ); GridTraverser2DBoundaryAlongY< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... > <<< cudaBlocksAlongY, cudaBlockSize, 0, s3 >>> <<< cudaGridSize, cudaBlockSize, 0, s3 >>> ( &gridPointer.template getData< Devices::Cuda >(), &userDataPointer.template modifyData< Devices::Cuda >(), begin.y() + 1, Loading @@ -486,7 +494,7 @@ processEntities( gridIdx, gridEntityParameters... ); GridTraverser2DBoundaryAlongY< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... > <<< cudaBlocksAlongY, cudaBlockSize, 0, s4 >>> <<< cudaGridSize, cudaBlockSize, 0, s4 >>> ( &gridPointer.template getData< Devices::Cuda >(), &userDataPointer.template modifyData< Devices::Cuda >(), begin.y() + 1, Loading