This patch refactors CudaReductionKernelLauncher from a one-shot function template into a stateful struct with separate start() and finish() phases, so that the multi-pass CUDA reduction can keep its state (launch configuration, reduced size, buffer halves) between calls.

src/TNL/Containers/Algorithms/CudaReductionKernel.h  +154 −96

@@ -178,22 +178,14 @@ CudaReductionKernel( const Result zero,
 }
 
-template< typename DataFetcher,
-          typename Reduction,
-          typename VolatileReduction,
-          typename Index,
+template< typename Index,
           typename Result >
-int
-CudaReductionKernelLauncher( const Index size,
-                             const Reduction& reduction,
-                             const VolatileReduction& volatileReduction,
-                             const DataFetcher& dataFetcher,
-                             const Result& zero,
-                             Result*& output )
+struct CudaReductionKernelLauncher
 {
+   using IndexType = Index;
+   using ResultType = Result;
+
    ////
    // The number of blocks should be a multiple of the number of multiprocessors
    // to ensure optimum balancing of the load. This is very important, because
    // we run the kernel with a fixed number of blocks, so the amount of work per
@@ -203,20 +195,78 @@ CudaReductionKernelLauncher( const Index size,
    // where blocksPerMultiprocessor is determined according to the number of
    // available registers on the multiprocessor.
    // On Tesla K40c, desGridSize = 8 * 15 = 120.
-   const int activeDevice = Devices::CudaDeviceInfo::getActiveDevice();
-   const int blocksdPerMultiprocessor = Devices::CudaDeviceInfo::getRegistersPerMultiprocessor( activeDevice )
-                                        / ( Reduction_maxThreadsPerBlock * Reduction_registersPerThread );
-   const int desGridSize = blocksdPerMultiprocessor * Devices::CudaDeviceInfo::getCudaMultiprocessors( activeDevice );
-   dim3 blockSize, gridSize;
-   blockSize.x = Reduction_maxThreadsPerBlock;
-   gridSize.x = min( Devices::Cuda::getNumberOfBlocks( size, blockSize.x ), desGridSize );
+   CudaReductionKernelLauncher( const Index size )
+   : activeDevice( Devices::CudaDeviceInfo::getActiveDevice() ),
+     blocksdPerMultiprocessor( Devices::CudaDeviceInfo::getRegistersPerMultiprocessor( activeDevice )
+                               / ( Reduction_maxThreadsPerBlock * Reduction_registersPerThread ) ),
+     desGridSize( blocksdPerMultiprocessor * Devices::CudaDeviceInfo::getCudaMultiprocessors( activeDevice ) ),
+     originalSize( size )
+   { }
+
+   template< typename DataFetcher,
+             typename Reduction,
+             typename VolatileReduction >
+   int start( const Reduction& reduction,
+              const VolatileReduction& volatileReduction,
+              const DataFetcher& dataFetcher,
+              const Result& zero,
+              ResultType*& output )
+   {
       ////
       // create reference to the reduction buffer singleton and set size
-   const size_t buf_size = desGridSize * sizeof( ResultType );
+      const size_t buf_size = 2 * desGridSize * sizeof( ResultType );
       CudaReductionBuffer& cudaReductionBuffer = CudaReductionBuffer::getInstance();
       cudaReductionBuffer.setSize( buf_size );
       output = cudaReductionBuffer.template getData< ResultType >();
+
+      this->reducedSize = this->launch( originalSize, reduction, volatileReduction, dataFetcher, zero, output );
+      return this->reducedSize;
+   }
+
+   template< typename Reduction,
+             typename VolatileReduction >
+   Result finish( const Reduction& reduction,
+                  const VolatileReduction& volatileReduction,
+                  const Result& zero )
+   {
+      ////
+      // Input is the first half of the buffer, output is the second half
+      CudaReductionBuffer& cudaReductionBuffer = CudaReductionBuffer::getInstance();
+      ResultType* input = cudaReductionBuffer.template getData< ResultType >();
+      ResultType* output = &input[ desGridSize ];
+
+      while( this->reducedSize > 1 )
+      {
+         // the lambda must be recreated in every iteration, because it
+         // captures the input pointer, which changes with each swap
+         auto copyFetch = [input] __cuda_callable__ ( IndexType i ) { return input[ i ]; };
+         this->reducedSize = this->launch( this->reducedSize, reduction, volatileReduction, copyFetch, zero, output );
+         std::swap( input, output );
+      }
+
+      ////
+      // Copy result on CPU; after the final swap (and also when reducedSize
+      // was 1 from the start) the single value lives in the input half
+      ResultType result;
+      ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory( &result, input, 1 );
+      return result;
+   }
+
+protected:
+   template< typename DataFetcher,
+             typename Reduction,
+             typename VolatileReduction >
+   int launch( const Index size,
+               const Reduction& reduction,
+               const VolatileReduction& volatileReduction,
+               const DataFetcher& dataFetcher,
+               const Result& zero,
+               Result* output )
+   {
+      dim3 blockSize, gridSize;
+      blockSize.x = Reduction_maxThreadsPerBlock;
+      gridSize.x = min( Devices::Cuda::getNumberOfBlocks( size, blockSize.x ), desGridSize );
 
       ////
       // when there is only one warp per blockSize.x, we need to allocate two warps
       // worth of shared memory so that we don't index shared memory out of bounds
       const IndexType shmem = (blockSize.x <= 32)
@@ -287,9 +337,17 @@ CudaReductionKernelLauncher( const Index size,
       }
       TNL_CHECK_CUDA_DEVICE;
 
       ////
       // return the size of the output array on the CUDA device
       return gridSize.x;
    }
+
+   const int activeDevice;
+   const int blocksdPerMultiprocessor;
+   const int desGridSize;
+
+   const IndexType originalSize;
+   IndexType reducedSize;
+};
 #endif
 
 } // namespace Algorithms
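In the new struct, the constructor caches the device-dependent launch configuration once (active device, blocks per multiprocessor, desired grid size), start() runs the first pass over the caller's dataFetcher and leaves one partial result per block in the first half of the reduction buffer (hence the doubled buf_size), and finish() ping-pongs the remaining passes between the two buffer halves before copying the single remaining value to the host. To make the grid-sizing formula concrete, here is the Tesla K40c arithmetic from the comment above as a worked example; the values of the two Reduction_* constants are assumptions (they are defined elsewhere in the header, not in this diff):

   // Worked example of the desGridSize computation for a Tesla K40c,
   // which has 15 multiprocessors and 65536 registers per multiprocessor.
   // The two Reduction_* values below are assumed, not shown in this diff.
   const int registersPerMultiprocessor = 65536;
   const int multiprocessors = 15;
   const int Reduction_maxThreadsPerBlock = 256;
   const int Reduction_registersPerThread = 32;

   const int blocksPerMultiprocessor = registersPerMultiprocessor
      / ( Reduction_maxThreadsPerBlock * Reduction_registersPerThread );   // = 8
   const int desGridSize = blocksPerMultiprocessor * multiprocessors;      // = 8 * 15 = 120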
src/TNL/Containers/Algorithms/Reduction.hpp  +9 −16

@@ -91,11 +91,13 @@ Reduction< Devices::Cuda >::
    timer.start();
 #endif
 
+   CudaReductionKernelLauncher< IndexType, ResultType > reductionLauncher( size );
+
    /****
    * Reduce the data on the CUDA device.
    */
    ResultType* deviceAux1( 0 );
-   IndexType reducedSize = CudaReductionKernelLauncher( size,
+   IndexType reducedSize = reductionLauncher.start(
                                                         reduction,
                                                         volatileReduction,
                                                         dataFetcher,
@@ -112,7 +114,6 @@ Reduction< Devices::Cuda >::
    /***
    * Transfer the reduced data from device to host.
    */
-   //ResultType* resultArray[ reducedSize ];
    std::unique_ptr< ResultType[] > resultArray{ new ResultType[ reducedSize ] };
    ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory( resultArray.get(),
                                                                 deviceAux1,
                                                                 reducedSize );
@@ -139,15 +140,7 @@ Reduction< Devices::Cuda >::
    /***
    * Data can't be safely reduced on host, so continue with the reduction on the CUDA device.
    */
-   auto copyFetch = [=] __cuda_callable__ ( IndexType i ) { return deviceAux1[ i ]; };
-   while( reducedSize > 1 ) {
-      reducedSize = CudaReductionKernelLauncher( reducedSize,
-                                                 reduction,
-                                                 volatileReduction,
-                                                 copyFetch,
-                                                 zero,
-                                                 deviceAux1 );
-   }
+   auto result = reductionLauncher.finish( reduction, volatileReduction, zero );
 
 #ifdef CUDA_REDUCTION_PROFILING
    timer.stop();
@@ -156,14 +149,14 @@ Reduction< Devices::Cuda >::
    timer.start();
 #endif
 
-   ResultType resultArray[ 1 ];
-   ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory( resultArray, deviceAux1, reducedSize );
-   const ResultType result = resultArray[ 0 ];
+   //ResultType resultArray[ 1 ];
+   //ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory( resultArray, deviceAux1, reducedSize );
+   //const ResultType result = resultArray[ 0 ];
 
-#ifdef CUDA_REDUCTION_PROFILING
+/*#ifdef CUDA_REDUCTION_PROFILING
    timer.stop();
    std::cout << " Transferring the result to CPU took " << timer.getRealTime() << " sec. " << std::endl;
-#endif
+#endif*/
 
    return result;
 }

src/TNL/Containers/Algorithms/CommonVectorOperations.hpp  +2 −2
File changed; contains only whitespace changes.
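On the caller's side, the whole reduction now reads as a start()/finish() pair. A minimal sketch of driving the new interface, assuming a plain sum reduction (deviceData, size, and the functor bodies are illustrative; only the launcher calls mirror the patch):

   // Minimal sketch of the two-phase interface, assuming a sum reduction.
   using IndexType = int;
   using ResultType = double;
   const ResultType zero = 0.0;

   // Illustrative functors; TNL's callers pass these in from outside.
   auto dataFetcher = [=] __cuda_callable__ ( IndexType i ) { return deviceData[ i ]; };
   auto reduction = [] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; };
   auto volatileReduction = [] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; };

   CudaReductionKernelLauncher< IndexType, ResultType > reductionLauncher( size );
   ResultType* deviceAux1( nullptr );

   // First pass: one partial result per CUDA block, written to the reduction buffer.
   IndexType reducedSize = reductionLauncher.start( reduction, volatileReduction, dataFetcher, zero, deviceAux1 );

   // Either copy the reducedSize partials to the host and fold them there
   // (see the host-side sketch below), or finish on the device:
   ResultType result = reductionLauncher.finish( reduction, volatileReduction, zero );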
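For completeness, the host-side finishing branch that Reduction.hpp keeps alongside finish() looks roughly as follows. Only the unique_ptr allocation and the copyMemory call appear in the diff; the sequential fold loop is an assumed completion of that branch:

   // Host-side finishing path (sketch). The fold loop is an assumption; the
   // allocation and the device-to-host copy are taken from the diff above.
   std::unique_ptr< ResultType[] > resultArray{ new ResultType[ reducedSize ] };
   ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory( resultArray.get(),
                                                                deviceAux1,
                                                                reducedSize );
   ResultType result = zero;
   for( IndexType i = 0; i < reducedSize; i++ )
      reduction( result, resultArray[ i ] );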