Moved multireduction kernel launcher to CudaMultireductionKernel.h (8172e35e) · Commits · TNL / tnl-dev

src/TNL/Containers/Algorithms/CudaMultireductionKernel.h

+142 −0

Original line number	Diff line number	Diff line
		@@ -148,6 +148,148 @@ CudaMultireductionKernel( Operation& operation,
		output[ blockIdx.x ] = sdata[ tid ];
		}
		}

		template< typename Operation >
		typename Operation::IndexType
		CudaMultireductionKernelLauncher( Operation& operation,
		int n,
		const typename Operation::IndexType size,
		const typename Operation::RealType* input1,
		const typename Operation::IndexType ldInput1,
		const typename Operation::RealType* input2,
		typename Operation::ResultType*& output )
		{
		typedef typename Operation::IndexType IndexType;
		typedef typename Operation::RealType RealType;
		typedef typename Operation::ResultType ResultType;

		// The number of blocks should be a multiple of the number of multiprocessors
		// to ensure optimum balancing of the load. This is very important, because
		// we run the kernel with a fixed number of blocks, so the amount of work per
		// block increases with enlarging the problem, so even small imbalance can
		// cost us dearly.
		// On Tesla K40c, desGridSizeX = 4 * 6 * 15 = 360.
		// const IndexType desGridSizeX = 4 * Multireduction_minBlocksPerMultiprocessor
		// * Devices::CudaDeviceInfo::getCudaMultiprocessors( Devices::CudaDeviceInfo::getActiveDevice() );
		// On Tesla K40c, desGridSizeX = 6 * 15 = 90.
		const IndexType desGridSizeX = Multireduction_minBlocksPerMultiprocessor
		* Devices::CudaDeviceInfo::getCudaMultiprocessors( Devices::CudaDeviceInfo::getActiveDevice() );
		dim3 blockSize, gridSize;

		// version A: max 16 rows of threads
		blockSize.y = min( n, 16 );

		// version B: up to 16 rows of threads, then "minimize" number of inactive rows
		// if( n <= 16 )
		// blockSize.y = n;
		// else {
		// int r = (n - 1) % 16 + 1;
		// if( r > 12 )
		// blockSize.y = 16;
		// else if( r > 8 )
		// blockSize.y = 4;
		// else if( r > 4 )
		// blockSize.y = 8;
		// else
		// blockSize.y = 4;
		// }

		// blockSize.x has to be a power of 2
		blockSize.x = Multireduction_maxThreadsPerBlock;
		while( blockSize.x * blockSize.y > Multireduction_maxThreadsPerBlock )
		blockSize.x /= 2;

		gridSize.x = min( Devices::Cuda::getNumberOfBlocks( size, blockSize.x ), desGridSizeX );
		gridSize.y = Devices::Cuda::getNumberOfBlocks( n, blockSize.y );

		if( gridSize.y > (unsigned) Devices::Cuda::getMaxGridSize() ) {
		std::cerr << "Maximum gridSize.y limit exceeded (limit is 65535, attempted " << gridSize.y << ")." << std::endl;
		throw 1;
		}

		// create reference to the reduction buffer singleton and set default size
		// (make an overestimate to avoid reallocation on every call if n is increased by 1 each time)
		const size_t buf_size = 8 * ( n / 8 + 1 ) * desGridSizeX * sizeof( ResultType );
		CudaReductionBuffer & cudaReductionBuffer = CudaReductionBuffer::getInstance();
		if( ! cudaReductionBuffer.setSize( buf_size ) )
		throw 1;
		output = cudaReductionBuffer.template getData< ResultType >();

		// when there is only one warp per blockSize.x, we need to allocate two warps
		// worth of shared memory so that we don't index shared memory out of bounds
		const IndexType shmem = (blockSize.x <= 32)
		? 2 * blockSize.x * blockSize.y * sizeof( ResultType )
		: blockSize.x * blockSize.y * sizeof( ResultType );

		//cout << "Multireduction of " << n << " datasets, block size (" << blockSize.x << "," << blockSize.y << "), grid size (" << gridSize.x << "," << gridSize.y << "), shmem " << shmem << endl;

		/***
		* Depending on the blockSize we generate appropriate template instance.
		*/
		switch( blockSize.x )
		{
		case 512:
		CudaMultireductionKernel< Operation, 512 >
		<<< gridSize, blockSize, shmem >>>( operation, n, size, input1, ldInput1, input2, output);
		break;
		case 256:
		cudaFuncSetCacheConfig(CudaMultireductionKernel< Operation, 256 >, cudaFuncCachePreferShared);

		CudaMultireductionKernel< Operation, 256 >
		<<< gridSize, blockSize, shmem >>>( operation, n, size, input1, ldInput1, input2, output);
		break;
		case 128:
		cudaFuncSetCacheConfig(CudaMultireductionKernel< Operation, 128 >, cudaFuncCachePreferShared);

		CudaMultireductionKernel< Operation, 128 >
		<<< gridSize, blockSize, shmem >>>( operation, n, size, input1, ldInput1, input2, output);
		break;
		case 64:
		cudaFuncSetCacheConfig(CudaMultireductionKernel< Operation, 64 >, cudaFuncCachePreferShared);

		CudaMultireductionKernel< Operation, 64 >
		<<< gridSize, blockSize, shmem >>>( operation, n, size, input1, ldInput1, input2, output);
		break;
		case 32:
		cudaFuncSetCacheConfig(CudaMultireductionKernel< Operation, 32 >, cudaFuncCachePreferShared);

		CudaMultireductionKernel< Operation, 32 >
		<<< gridSize, blockSize, shmem >>>( operation, n, size, input1, ldInput1, input2, output);
		break;
		case 16:
		cudaFuncSetCacheConfig(CudaMultireductionKernel< Operation, 16 >, cudaFuncCachePreferShared);

		CudaMultireductionKernel< Operation, 16 >
		<<< gridSize, blockSize, shmem >>>( operation, n, size, input1, ldInput1, input2, output);
		break;
		case 8:
		cudaFuncSetCacheConfig(CudaMultireductionKernel< Operation, 8 >, cudaFuncCachePreferShared);

		CudaMultireductionKernel< Operation, 8 >
		<<< gridSize, blockSize, shmem >>>( operation, n, size, input1, ldInput1, input2, output);
		break;
		case 4:
		cudaFuncSetCacheConfig(CudaMultireductionKernel< Operation, 4 >, cudaFuncCachePreferShared);

		CudaMultireductionKernel< Operation, 4 >
		<<< gridSize, blockSize, shmem >>>( operation, n,size, input1, ldInput1, input2, output);
		break;
		case 2:
		cudaFuncSetCacheConfig(CudaMultireductionKernel< Operation, 2 >, cudaFuncCachePreferShared);

		CudaMultireductionKernel< Operation, 2 >
		<<< gridSize, blockSize, shmem >>>( operation, n, size, input1, ldInput1, input2, output);
		break;
		case 1:
		Assert( false, std::cerr << "blockSize should not be 1." << std::endl );
		default:
		Assert( false, std::cerr << "Block size is " << blockSize.x << " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." << std::endl );
		}
		checkCudaDevice;

		// return the size of the output array on the CUDA device
		return gridSize.x;
		}
		#endif

		} // namespace Algorithms

src/TNL/Containers/Algorithms/Multireduction_impl.h

+7 −149

Original line number	Diff line number	Diff line
		@@ -29,148 +29,6 @@ namespace Algorithms {
		// TODO: benchmarks with different values
		static constexpr int Multireduction_minGpuDataSize = 256;//65536; //16384;//1024;//256;

		#ifdef HAVE_CUDA
		template< typename Operation >
		typename Operation::IndexType
		multireduceOnCudaDevice( Operation& operation,
		int n,
		const typename Operation::IndexType size,
		const typename Operation::RealType* input1,
		const typename Operation::IndexType ldInput1,
		const typename Operation::RealType* input2,
		typename Operation::ResultType*& output )
		{
		typedef typename Operation::IndexType IndexType;
		typedef typename Operation::RealType RealType;
		typedef typename Operation::ResultType ResultType;

		// The number of blocks should be a multiple of the number of multiprocessors
		// to ensure optimum balancing of the load. This is very important, because
		// we run the kernel with a fixed number of blocks, so the amount of work per
		// block increases with enlarging the problem, so even small imbalance can
		// cost us dearly.
		// On Tesla K40c, desGridSizeX = 4 * 6 * 15 = 360.
		// const IndexType desGridSizeX = 4 * Multireduction_minBlocksPerMultiprocessor
		// * Devices::CudaDeviceInfo::getCudaMultiprocessors( Devices::CudaDeviceInfo::getActiveDevice() );
		// On Tesla K40c, desGridSizeX = 6 * 15 = 90.
		const IndexType desGridSizeX = Multireduction_minBlocksPerMultiprocessor
		* Devices::CudaDeviceInfo::getCudaMultiprocessors( Devices::CudaDeviceInfo::getActiveDevice() );
		dim3 blockSize, gridSize;

		// version A: max 16 rows of threads
		blockSize.y = min( n, 16 );

		// version B: up to 16 rows of threads, then "minimize" number of inactive rows
		// if( n <= 16 )
		// blockSize.y = n;
		// else {
		// int r = (n - 1) % 16 + 1;
		// if( r > 12 )
		// blockSize.y = 16;
		// else if( r > 8 )
		// blockSize.y = 4;
		// else if( r > 4 )
		// blockSize.y = 8;
		// else
		// blockSize.y = 4;
		// }

		// blockSize.x has to be a power of 2
		blockSize.x = Multireduction_maxThreadsPerBlock;
		while( blockSize.x * blockSize.y > Multireduction_maxThreadsPerBlock )
		blockSize.x /= 2;

		gridSize.x = min( Devices::Cuda::getNumberOfBlocks( size, blockSize.x ), desGridSizeX );
		gridSize.y = Devices::Cuda::getNumberOfBlocks( n, blockSize.y );

		if( gridSize.y > (unsigned) Devices::Cuda::getMaxGridSize() ) {
		std::cerr << "Maximum gridSize.y limit exceeded (limit is 65535, attempted " << gridSize.y << ")." << std::endl;
		throw 1;
		}

		// create reference to the reduction buffer singleton and set default size
		// (make an overestimate to avoid reallocation on every call if n is increased by 1 each time)
		const size_t buf_size = 8 * ( n / 8 + 1 ) * desGridSizeX * sizeof( ResultType );
		CudaReductionBuffer & cudaReductionBuffer = CudaReductionBuffer::getInstance();
		if( ! cudaReductionBuffer.setSize( buf_size ) )
		throw 1;
		output = cudaReductionBuffer.template getData< ResultType >();

		// when there is only one warp per blockSize.x, we need to allocate two warps
		// worth of shared memory so that we don't index shared memory out of bounds
		const IndexType shmem = (blockSize.x <= 32)
		? 2 * blockSize.x * blockSize.y * sizeof( ResultType )
		: blockSize.x * blockSize.y * sizeof( ResultType );

		//cout << "Multireduction of " << n << " datasets, block size (" << blockSize.x << "," << blockSize.y << "), grid size (" << gridSize.x << "," << gridSize.y << "), shmem " << shmem << endl;

		/***
		* Depending on the blockSize we generate appropriate template instance.
		*/
		switch( blockSize.x )
		{
		case 512:
		CudaMultireductionKernel< Operation, 512 >
		<<< gridSize, blockSize, shmem >>>( operation, n, size, input1, ldInput1, input2, output);
		break;
		case 256:
		cudaFuncSetCacheConfig(CudaMultireductionKernel< Operation, 256 >, cudaFuncCachePreferShared);

		CudaMultireductionKernel< Operation, 256 >
		<<< gridSize, blockSize, shmem >>>( operation, n, size, input1, ldInput1, input2, output);
		break;
		case 128:
		cudaFuncSetCacheConfig(CudaMultireductionKernel< Operation, 128 >, cudaFuncCachePreferShared);

		CudaMultireductionKernel< Operation, 128 >
		<<< gridSize, blockSize, shmem >>>( operation, n, size, input1, ldInput1, input2, output);
		break;
		case 64:
		cudaFuncSetCacheConfig(CudaMultireductionKernel< Operation, 64 >, cudaFuncCachePreferShared);

		CudaMultireductionKernel< Operation, 64 >
		<<< gridSize, blockSize, shmem >>>( operation, n, size, input1, ldInput1, input2, output);
		break;
		case 32:
		cudaFuncSetCacheConfig(CudaMultireductionKernel< Operation, 32 >, cudaFuncCachePreferShared);

		CudaMultireductionKernel< Operation, 32 >
		<<< gridSize, blockSize, shmem >>>( operation, n, size, input1, ldInput1, input2, output);
		break;
		case 16:
		cudaFuncSetCacheConfig(CudaMultireductionKernel< Operation, 16 >, cudaFuncCachePreferShared);

		CudaMultireductionKernel< Operation, 16 >
		<<< gridSize, blockSize, shmem >>>( operation, n, size, input1, ldInput1, input2, output);
		break;
		case 8:
		cudaFuncSetCacheConfig(CudaMultireductionKernel< Operation, 8 >, cudaFuncCachePreferShared);

		CudaMultireductionKernel< Operation, 8 >
		<<< gridSize, blockSize, shmem >>>( operation, n, size, input1, ldInput1, input2, output);
		break;
		case 4:
		cudaFuncSetCacheConfig(CudaMultireductionKernel< Operation, 4 >, cudaFuncCachePreferShared);

		CudaMultireductionKernel< Operation, 4 >
		<<< gridSize, blockSize, shmem >>>( operation, n,size, input1, ldInput1, input2, output);
		break;
		case 2:
		cudaFuncSetCacheConfig(CudaMultireductionKernel< Operation, 2 >, cudaFuncCachePreferShared);

		CudaMultireductionKernel< Operation, 2 >
		<<< gridSize, blockSize, shmem >>>( operation, n, size, input1, ldInput1, input2, output);
		break;
		case 1:
		Assert( false, std::cerr << "blockSize should not be 1." << std::endl );
		default:
		Assert( false, std::cerr << "Block size is " << blockSize.x << " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." << std::endl );
		}
		checkCudaDevice;
		return gridSize.x;
		}
		#endif

		/*
		* Parameters:
		* operation: the operation used for reduction
		@@ -224,7 +82,7 @@ bool multireductionOnCudaDevice( Operation& operation,
		* Reduce the data on the CUDA device.
		*/
		ResultType* deviceAux1 = nullptr;
		const IndexType reducedSize = multireduceOnCudaDevice( operation,
		const IndexType reducedSize = CudaMultireductionKernelLauncher( operation,
		n,
		size,
		deviceInput1,