Found a way to avoid using volatile in CUDA reduction: __syncwarp() (cbc2fff9) · Commits · TNL / tnl-dev

src/TNL/Containers/Algorithms/CudaMultireductionKernel.h

+29 −29

Original line number	Diff line number	Diff line
		@@ -42,14 +42,12 @@ template< int blockSizeX,
		typename Result,
		typename DataFetcher,
		typename Reduction,
		typename VolatileReduction,
		typename Index >
		__global__ void
		__launch_bounds__( Multireduction_maxThreadsPerBlock, Multireduction_minBlocksPerMultiprocessor )
		CudaMultireductionKernel( const Result zero,
		DataFetcher dataFetcher,
		const Reduction reduction,
		const VolatileReduction volatileReduction,
		const Index size,
		const int n,
		Result* output )
		@@ -108,25 +106,29 @@ CudaMultireductionKernel( const Result zero,
		__syncthreads();
		}

		// This runs in one warp so it is synchronized implicitly.
		// This runs in one warp so we use __syncwarp() instead of __syncthreads().
		if( threadIdx.x < 32 ) {
		volatile Result* vsdata = sdata;
		if( blockSizeX >= 64 )
		volatileReduction( vsdata[ tid ], vsdata[ tid + 32 ] );
		reduction( sdata[ tid ], sdata[ tid + 32 ] );
		__syncwarp();
		// Note that here we do not have to check if tid < 16 etc, because we have
		// 2 * blockSize.x elements of shared memory per block, so we do not
		// access out of bounds. The results for the upper half will be undefined,
		// but unused anyway.
		if( blockSizeX >= 32 )
		volatileReduction( vsdata[ tid ], vsdata[ tid + 16 ] );
		reduction( sdata[ tid ], sdata[ tid + 16 ] );
		__syncwarp();
		if( blockSizeX >= 16 )
		volatileReduction( vsdata[ tid ], vsdata[ tid + 8 ] );
		reduction( sdata[ tid ], sdata[ tid + 8 ] );
		__syncwarp();
		if( blockSizeX >= 8 )
		volatileReduction( vsdata[ tid ], vsdata[ tid + 4 ] );
		reduction( sdata[ tid ], sdata[ tid + 4 ] );
		__syncwarp();
		if( blockSizeX >= 4 )
		volatileReduction( vsdata[ tid ], vsdata[ tid + 2 ] );
		reduction( sdata[ tid ], sdata[ tid + 2 ] );
		__syncwarp();
		if( blockSizeX >= 2 )
		volatileReduction( vsdata[ tid ], vsdata[ tid + 1 ] );
		reduction( sdata[ tid ], sdata[ tid + 1 ] );
		}

		// Store the result back in the global memory.
		@@ -139,13 +141,11 @@ CudaMultireductionKernel( const Result zero,
		template< typename Result,
		typename DataFetcher,
		typename Reduction,
		typename VolatileReduction,
		typename Index >
		int
		CudaMultireductionKernelLauncher( const Result zero,
		DataFetcher dataFetcher,
		const Reduction reduction,
		const VolatileReduction volatileReduction,
		const Index size,
		const int n,
		Result*& output )
		@@ -215,55 +215,55 @@ CudaMultireductionKernelLauncher( const Result zero,
		{
		case 512:
		CudaMultireductionKernel< 512 >
		<<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, n, output );
		<<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output );
		break;
		case 256:
		cudaFuncSetCacheConfig(CudaMultireductionKernel< 256, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared);
		cudaFuncSetCacheConfig(CudaMultireductionKernel< 256, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);

		CudaMultireductionKernel< 256 >
		<<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, n, output );
		<<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output );
		break;
		case 128:
		cudaFuncSetCacheConfig(CudaMultireductionKernel< 128, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared);
		cudaFuncSetCacheConfig(CudaMultireductionKernel< 128, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);

		CudaMultireductionKernel< 128 >
		<<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, n, output );
		<<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output );
		break;
		case 64:
		cudaFuncSetCacheConfig(CudaMultireductionKernel< 64, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared);
		cudaFuncSetCacheConfig(CudaMultireductionKernel< 64, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);

		CudaMultireductionKernel< 64 >
		<<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, n, output );
		<<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output );
		break;
		case 32:
		cudaFuncSetCacheConfig(CudaMultireductionKernel< 32, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared);
		cudaFuncSetCacheConfig(CudaMultireductionKernel< 32, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);

		CudaMultireductionKernel< 32 >
		<<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, n, output );
		<<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output );
		break;
		case 16:
		cudaFuncSetCacheConfig(CudaMultireductionKernel< 16, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared);
		cudaFuncSetCacheConfig(CudaMultireductionKernel< 16, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);

		CudaMultireductionKernel< 16 >
		<<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, n, output );
		<<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output );
		break;
		case 8:
		cudaFuncSetCacheConfig(CudaMultireductionKernel< 8, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared);
		cudaFuncSetCacheConfig(CudaMultireductionKernel< 8, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);

		CudaMultireductionKernel< 8 >
		<<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, n, output );
		<<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output );
		break;
		case 4:
		cudaFuncSetCacheConfig(CudaMultireductionKernel< 4, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared);
		cudaFuncSetCacheConfig(CudaMultireductionKernel< 4, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);

		CudaMultireductionKernel< 4 >
		<<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, n, output );
		<<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output );
		break;
		case 2:
		cudaFuncSetCacheConfig(CudaMultireductionKernel< 2, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared);
		cudaFuncSetCacheConfig(CudaMultireductionKernel< 2, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);

		CudaMultireductionKernel< 2 >
		<<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, n, output );
		<<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output );
		break;
		case 1:
		throw std::logic_error( "blockSize should not be 1." );

src/TNL/Containers/Algorithms/CudaReductionKernel.h

+72 −81

File changed.

Preview size limit exceeded, changes collapsed.

src/TNL/Containers/Algorithms/Multireduction.hpp

+1 −1

Original line number	Diff line number	Diff line
		@@ -196,7 +196,7 @@ reduce( const Result zero,

		// start the reduction on the GPU
		Result* deviceAux1 = nullptr;
		const int reducedSize = CudaMultireductionKernelLauncher( zero, dataFetcher, reduction, volatileReduction, size, n, deviceAux1 );
		const int reducedSize = CudaMultireductionKernelLauncher( zero, dataFetcher, reduction, size, n, deviceAux1 );

		#ifdef CUDA_REDUCTION_PROFILING
		timer.stop();

src/TNL/Containers/Algorithms/Reduction.hpp

+2 −4

Original line number	Diff line number	Diff line
		@@ -288,7 +288,6 @@ reduce( const Index size,
		Result* deviceAux1( 0 );
		Index reducedSize = reductionLauncher.start(
		reduction,
		volatileReduction,
		dataFetcher,
		zero,
		deviceAux1 );
		@@ -324,7 +323,7 @@ reduce( const Index size,
		}
		else {
		// data can't be safely reduced on host, so continue with the reduction on the GPU
		auto result = reductionLauncher.finish( reduction, volatileReduction, zero );
		auto result = reductionLauncher.finish( reduction, zero );

		#ifdef CUDA_REDUCTION_PROFILING
		timer.stop();
		@@ -368,7 +367,6 @@ reduceWithArgument( const Index size,
		Index* deviceIndexes( nullptr );
		Index reducedSize = reductionLauncher.startWithArgument(
		reduction,
		volatileReduction,
		dataFetcher,
		zero,
		deviceAux1,
		@@ -409,7 +407,7 @@ reduceWithArgument( const Index size,
		}
		else {
		// data can't be safely reduced on host, so continue with the reduction on the GPU
		auto result = reductionLauncher.finishWithArgument( reduction, volatileReduction, zero );
		auto result = reductionLauncher.finishWithArgument( reduction, zero );

		#ifdef CUDA_REDUCTION_PROFILING
		timer.stop();