Loading src/TNL/Containers/Algorithms/CudaMultireductionKernel.h +29 −29 Original line number Diff line number Diff line Loading @@ -42,14 +42,12 @@ template< int blockSizeX, typename Result, typename DataFetcher, typename Reduction, typename VolatileReduction, typename Index > __global__ void __launch_bounds__( Multireduction_maxThreadsPerBlock, Multireduction_minBlocksPerMultiprocessor ) CudaMultireductionKernel( const Result zero, DataFetcher dataFetcher, const Reduction reduction, const VolatileReduction volatileReduction, const Index size, const int n, Result* output ) Loading Loading @@ -108,25 +106,29 @@ CudaMultireductionKernel( const Result zero, __syncthreads(); } // This runs in one warp so it is synchronized implicitly. // This runs in one warp so we use __syncwarp() instead of __syncthreads(). if( threadIdx.x < 32 ) { volatile Result* vsdata = sdata; if( blockSizeX >= 64 ) volatileReduction( vsdata[ tid ], vsdata[ tid + 32 ] ); reduction( sdata[ tid ], sdata[ tid + 32 ] ); __syncwarp(); // Note that here we do not have to check if tid < 16 etc, because we have // 2 * blockSize.x elements of shared memory per block, so we do not // access out of bounds. The results for the upper half will be undefined, // but unused anyway. if( blockSizeX >= 32 ) volatileReduction( vsdata[ tid ], vsdata[ tid + 16 ] ); reduction( sdata[ tid ], sdata[ tid + 16 ] ); __syncwarp(); if( blockSizeX >= 16 ) volatileReduction( vsdata[ tid ], vsdata[ tid + 8 ] ); reduction( sdata[ tid ], sdata[ tid + 8 ] ); __syncwarp(); if( blockSizeX >= 8 ) volatileReduction( vsdata[ tid ], vsdata[ tid + 4 ] ); reduction( sdata[ tid ], sdata[ tid + 4 ] ); __syncwarp(); if( blockSizeX >= 4 ) volatileReduction( vsdata[ tid ], vsdata[ tid + 2 ] ); reduction( sdata[ tid ], sdata[ tid + 2 ] ); __syncwarp(); if( blockSizeX >= 2 ) volatileReduction( vsdata[ tid ], vsdata[ tid + 1 ] ); reduction( sdata[ tid ], sdata[ tid + 1 ] ); } // Store the result back in the global memory. Loading @@ -139,13 +141,11 @@ CudaMultireductionKernel( const Result zero, template< typename Result, typename DataFetcher, typename Reduction, typename VolatileReduction, typename Index > int CudaMultireductionKernelLauncher( const Result zero, DataFetcher dataFetcher, const Reduction reduction, const VolatileReduction volatileReduction, const Index size, const int n, Result*& output ) Loading Loading @@ -215,55 +215,55 @@ CudaMultireductionKernelLauncher( const Result zero, { case 512: CudaMultireductionKernel< 512 > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, n, output ); <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output ); break; case 256: cudaFuncSetCacheConfig(CudaMultireductionKernel< 256, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); cudaFuncSetCacheConfig(CudaMultireductionKernel< 256, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaMultireductionKernel< 256 > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, n, output ); <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output ); break; case 128: cudaFuncSetCacheConfig(CudaMultireductionKernel< 128, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); cudaFuncSetCacheConfig(CudaMultireductionKernel< 128, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaMultireductionKernel< 128 > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, n, output ); <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output ); break; case 64: cudaFuncSetCacheConfig(CudaMultireductionKernel< 64, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); cudaFuncSetCacheConfig(CudaMultireductionKernel< 64, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaMultireductionKernel< 64 > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, n, output ); <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output ); break; case 32: cudaFuncSetCacheConfig(CudaMultireductionKernel< 32, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); cudaFuncSetCacheConfig(CudaMultireductionKernel< 32, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaMultireductionKernel< 32 > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, n, output ); <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output ); break; case 16: cudaFuncSetCacheConfig(CudaMultireductionKernel< 16, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); cudaFuncSetCacheConfig(CudaMultireductionKernel< 16, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaMultireductionKernel< 16 > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, n, output ); <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output ); break; case 8: cudaFuncSetCacheConfig(CudaMultireductionKernel< 8, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); cudaFuncSetCacheConfig(CudaMultireductionKernel< 8, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaMultireductionKernel< 8 > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, n, output ); <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output ); break; case 4: cudaFuncSetCacheConfig(CudaMultireductionKernel< 4, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); cudaFuncSetCacheConfig(CudaMultireductionKernel< 4, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaMultireductionKernel< 4 > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, n, output ); <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output ); break; case 2: cudaFuncSetCacheConfig(CudaMultireductionKernel< 2, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); cudaFuncSetCacheConfig(CudaMultireductionKernel< 2, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaMultireductionKernel< 2 > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, n, output ); <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output ); break; case 1: throw std::logic_error( "blockSize should not be 1." ); Loading src/TNL/Containers/Algorithms/CudaReductionKernel.h +72 −81 File changed.Preview size limit exceeded, changes collapsed. Show changes src/TNL/Containers/Algorithms/Multireduction.hpp +1 −1 Original line number Diff line number Diff line Loading @@ -196,7 +196,7 @@ reduce( const Result zero, // start the reduction on the GPU Result* deviceAux1 = nullptr; const int reducedSize = CudaMultireductionKernelLauncher( zero, dataFetcher, reduction, volatileReduction, size, n, deviceAux1 ); const int reducedSize = CudaMultireductionKernelLauncher( zero, dataFetcher, reduction, size, n, deviceAux1 ); #ifdef CUDA_REDUCTION_PROFILING timer.stop(); Loading src/TNL/Containers/Algorithms/Reduction.hpp +2 −4 Original line number Diff line number Diff line Loading @@ -288,7 +288,6 @@ reduce( const Index size, Result* deviceAux1( 0 ); Index reducedSize = reductionLauncher.start( reduction, volatileReduction, dataFetcher, zero, deviceAux1 ); Loading Loading @@ -324,7 +323,7 @@ reduce( const Index size, } else { // data can't be safely reduced on host, so continue with the reduction on the GPU auto result = reductionLauncher.finish( reduction, volatileReduction, zero ); auto result = reductionLauncher.finish( reduction, zero ); #ifdef CUDA_REDUCTION_PROFILING timer.stop(); Loading Loading @@ -368,7 +367,6 @@ reduceWithArgument( const Index size, Index* deviceIndexes( nullptr ); Index reducedSize = reductionLauncher.startWithArgument( reduction, volatileReduction, dataFetcher, zero, deviceAux1, Loading Loading @@ -409,7 +407,7 @@ reduceWithArgument( const Index size, } else { // data can't be safely reduced on host, so continue with the reduction on the GPU auto result = reductionLauncher.finishWithArgument( reduction, volatileReduction, zero ); auto result = reductionLauncher.finishWithArgument( reduction, zero ); #ifdef CUDA_REDUCTION_PROFILING timer.stop(); Loading Loading
src/TNL/Containers/Algorithms/CudaMultireductionKernel.h +29 −29 Original line number Diff line number Diff line Loading @@ -42,14 +42,12 @@ template< int blockSizeX, typename Result, typename DataFetcher, typename Reduction, typename VolatileReduction, typename Index > __global__ void __launch_bounds__( Multireduction_maxThreadsPerBlock, Multireduction_minBlocksPerMultiprocessor ) CudaMultireductionKernel( const Result zero, DataFetcher dataFetcher, const Reduction reduction, const VolatileReduction volatileReduction, const Index size, const int n, Result* output ) Loading Loading @@ -108,25 +106,29 @@ CudaMultireductionKernel( const Result zero, __syncthreads(); } // This runs in one warp so it is synchronized implicitly. // This runs in one warp so we use __syncwarp() instead of __syncthreads(). if( threadIdx.x < 32 ) { volatile Result* vsdata = sdata; if( blockSizeX >= 64 ) volatileReduction( vsdata[ tid ], vsdata[ tid + 32 ] ); reduction( sdata[ tid ], sdata[ tid + 32 ] ); __syncwarp(); // Note that here we do not have to check if tid < 16 etc, because we have // 2 * blockSize.x elements of shared memory per block, so we do not // access out of bounds. The results for the upper half will be undefined, // but unused anyway. if( blockSizeX >= 32 ) volatileReduction( vsdata[ tid ], vsdata[ tid + 16 ] ); reduction( sdata[ tid ], sdata[ tid + 16 ] ); __syncwarp(); if( blockSizeX >= 16 ) volatileReduction( vsdata[ tid ], vsdata[ tid + 8 ] ); reduction( sdata[ tid ], sdata[ tid + 8 ] ); __syncwarp(); if( blockSizeX >= 8 ) volatileReduction( vsdata[ tid ], vsdata[ tid + 4 ] ); reduction( sdata[ tid ], sdata[ tid + 4 ] ); __syncwarp(); if( blockSizeX >= 4 ) volatileReduction( vsdata[ tid ], vsdata[ tid + 2 ] ); reduction( sdata[ tid ], sdata[ tid + 2 ] ); __syncwarp(); if( blockSizeX >= 2 ) volatileReduction( vsdata[ tid ], vsdata[ tid + 1 ] ); reduction( sdata[ tid ], sdata[ tid + 1 ] ); } // Store the result back in the global memory. Loading @@ -139,13 +141,11 @@ CudaMultireductionKernel( const Result zero, template< typename Result, typename DataFetcher, typename Reduction, typename VolatileReduction, typename Index > int CudaMultireductionKernelLauncher( const Result zero, DataFetcher dataFetcher, const Reduction reduction, const VolatileReduction volatileReduction, const Index size, const int n, Result*& output ) Loading Loading @@ -215,55 +215,55 @@ CudaMultireductionKernelLauncher( const Result zero, { case 512: CudaMultireductionKernel< 512 > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, n, output ); <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output ); break; case 256: cudaFuncSetCacheConfig(CudaMultireductionKernel< 256, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); cudaFuncSetCacheConfig(CudaMultireductionKernel< 256, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaMultireductionKernel< 256 > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, n, output ); <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output ); break; case 128: cudaFuncSetCacheConfig(CudaMultireductionKernel< 128, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); cudaFuncSetCacheConfig(CudaMultireductionKernel< 128, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaMultireductionKernel< 128 > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, n, output ); <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output ); break; case 64: cudaFuncSetCacheConfig(CudaMultireductionKernel< 64, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); cudaFuncSetCacheConfig(CudaMultireductionKernel< 64, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaMultireductionKernel< 64 > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, n, output ); <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output ); break; case 32: cudaFuncSetCacheConfig(CudaMultireductionKernel< 32, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); cudaFuncSetCacheConfig(CudaMultireductionKernel< 32, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaMultireductionKernel< 32 > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, n, output ); <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output ); break; case 16: cudaFuncSetCacheConfig(CudaMultireductionKernel< 16, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); cudaFuncSetCacheConfig(CudaMultireductionKernel< 16, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaMultireductionKernel< 16 > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, n, output ); <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output ); break; case 8: cudaFuncSetCacheConfig(CudaMultireductionKernel< 8, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); cudaFuncSetCacheConfig(CudaMultireductionKernel< 8, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaMultireductionKernel< 8 > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, n, output ); <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output ); break; case 4: cudaFuncSetCacheConfig(CudaMultireductionKernel< 4, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); cudaFuncSetCacheConfig(CudaMultireductionKernel< 4, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaMultireductionKernel< 4 > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, n, output ); <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output ); break; case 2: cudaFuncSetCacheConfig(CudaMultireductionKernel< 2, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); cudaFuncSetCacheConfig(CudaMultireductionKernel< 2, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaMultireductionKernel< 2 > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, n, output ); <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output ); break; case 1: throw std::logic_error( "blockSize should not be 1." ); Loading
src/TNL/Containers/Algorithms/CudaReductionKernel.h +72 −81 File changed.Preview size limit exceeded, changes collapsed. Show changes
src/TNL/Containers/Algorithms/Multireduction.hpp +1 −1 Original line number Diff line number Diff line Loading @@ -196,7 +196,7 @@ reduce( const Result zero, // start the reduction on the GPU Result* deviceAux1 = nullptr; const int reducedSize = CudaMultireductionKernelLauncher( zero, dataFetcher, reduction, volatileReduction, size, n, deviceAux1 ); const int reducedSize = CudaMultireductionKernelLauncher( zero, dataFetcher, reduction, size, n, deviceAux1 ); #ifdef CUDA_REDUCTION_PROFILING timer.stop(); Loading
src/TNL/Containers/Algorithms/Reduction.hpp +2 −4 Original line number Diff line number Diff line Loading @@ -288,7 +288,6 @@ reduce( const Index size, Result* deviceAux1( 0 ); Index reducedSize = reductionLauncher.start( reduction, volatileReduction, dataFetcher, zero, deviceAux1 ); Loading Loading @@ -324,7 +323,7 @@ reduce( const Index size, } else { // data can't be safely reduced on host, so continue with the reduction on the GPU auto result = reductionLauncher.finish( reduction, volatileReduction, zero ); auto result = reductionLauncher.finish( reduction, zero ); #ifdef CUDA_REDUCTION_PROFILING timer.stop(); Loading Loading @@ -368,7 +367,6 @@ reduceWithArgument( const Index size, Index* deviceIndexes( nullptr ); Index reducedSize = reductionLauncher.startWithArgument( reduction, volatileReduction, dataFetcher, zero, deviceAux1, Loading Loading @@ -409,7 +407,7 @@ reduceWithArgument( const Index size, } else { // data can't be safely reduced on host, so continue with the reduction on the GPU auto result = reductionLauncher.finishWithArgument( reduction, volatileReduction, zero ); auto result = reductionLauncher.finishWithArgument( reduction, zero ); #ifdef CUDA_REDUCTION_PROFILING timer.stop(); Loading