Small fixes in parallel reduction (592f6355) · Commits · TNL / tnl-dev

src/TNL/Containers/Algorithms/CudaReductionKernel.h

+7 −7

Original line number	Diff line number	Diff line
		@@ -57,9 +57,9 @@ CudaReductionKernel( Operation operation,
		* gridSize is the number of elements processed by all blocks at the
		* same time.
		*/
		IndexType tid = threadIdx. x;
		const IndexType tid = threadIdx.x;
		IndexType gid = blockIdx.x * blockDim. x + threadIdx.x;
		IndexType gridSize = blockDim. x * gridDim.x;
		const IndexType gridSize = blockDim.x * gridDim.x;

		sdata[ tid ] = operation.initialValue();
		/***

+7 −7

Original line number	Diff line number	Diff line
		@@ -103,7 +103,7 @@ reduce( Operation& operation,
		deviceAux1 );
		#ifdef CUDA_REDUCTION_PROFILING
		timer.stop();
		cout << " Multireduction of " << n << " datasets on GPU to size " << reducedSize << " took " << timer.getRealTime() << " sec. " << endl;
		std::cout << " Multireduction of " << n << " datasets on GPU to size " << reducedSize << " took " << timer.getRealTime() << " sec. " << std::endl;
		timer.reset();
		timer.start();
		#endif
		@@ -117,18 +117,18 @@ reduce( Operation& operation,

		#ifdef CUDA_REDUCTION_PROFILING
		timer.stop();
		cout << " Transferring data to CPU took " << timer.getRealTime() << " sec. " << endl;
		std::cout << " Transferring data to CPU took " << timer.getRealTime() << " sec. " << std::endl;
		timer.reset();
		timer.start();
		#endif

		// cout << "resultArray = [";
		// std::cout << "resultArray = [";
		// for( int i = 0; i < n * reducedSize; i++ ) {
		// cout << resultArray[ i ];
		// std::cout << resultArray[ i ];
		// if( i < n * reducedSize - 1 )
		// cout << ", ";
		// std::cout << ", ";
		// }
		// cout << "]" << endl;
		// std::cout << "]" << std::endl;

		/***
		* Reduce the data on the host system.
		@@ -138,7 +138,7 @@ reduce( Operation& operation,

		#ifdef CUDA_REDUCTION_PROFILING
		timer.stop();
		cout << " Multireduction of small data set on CPU took " << timer.getRealTime() << " sec. " << endl;
		std::cout << " Multireduction of small data set on CPU took " << timer.getRealTime() << " sec. " << std::endl;
		#endif

		return checkCudaDevice;

+0 −3

Original line number	Diff line number	Diff line
		@@ -36,9 +36,6 @@ namespace Algorithms {
		*/
		const int minGPUReductionDataSize = 256;//65536; //16384;//1024;//256;

		#ifdef HAVE_CUDA
		#endif

		template< typename Operation >
		bool
		reductionOnCudaDevice( Operation& operation,