Rewritten multireduction with lambda functions (b74a24d2) · Commits · TNL / tnl-dev

src/TNL/Containers/Algorithms/CudaMultireductionKernel.h

+108 −141

Original line number	Diff line number	Diff line
		@@ -12,14 +12,11 @@

		#pragma once

		#ifdef HAVE_CUDA
		#include <cuda.h>
		#endif

		#include <TNL/Assert.h>
		#include <TNL/Math.h>
		#include <TNL/Devices/CudaDeviceInfo.h>
		#include <TNL/Containers/Algorithms/CudaReductionBuffer.h>
		#include <TNL/Exceptions/CudaSupportMissing.h>

		namespace TNL {
		namespace Containers {
		@@ -32,157 +29,128 @@ namespace Algorithms {
		* architecture so that there are no local memory spills.
		*/
		static constexpr int Multireduction_maxThreadsPerBlock = 256; // must be a power of 2
		static constexpr int Multireduction_registersPerThread = 38; // empirically determined optimal value
		static constexpr int Multireduction_registersPerThread = 32; // empirically determined optimal value

		// __CUDA_ARCH__ is defined only in device code!
		#if (__CUDA_ARCH__ >= 300 )
		static constexpr int Multireduction_minBlocksPerMultiprocessor = 6;
		static constexpr int Multireduction_minBlocksPerMultiprocessor = 8;
		#else
		static constexpr int Multireduction_minBlocksPerMultiprocessor = 4;
		#endif

		template< int blockSizeX, typename Operation, typename Index >
		template< int blockSizeX,
		typename Result,
		typename DataFetcher,
		typename Reduction,
		typename VolatileReduction,
		typename Index >
		__global__ void
		__launch_bounds__( Multireduction_maxThreadsPerBlock, Multireduction_minBlocksPerMultiprocessor )
		CudaMultireductionKernel( Operation operation,
		const int n,
		CudaMultireductionKernel( const Result zero,
		DataFetcher dataFetcher,
		const Reduction reduction,
		const VolatileReduction volatileReduction,
		const Index size,
		const typename Operation::DataType1* input1,
		const Index ldInput1,
		const typename Operation::DataType2* input2,
		typename Operation::ResultType* output )
		const int n,
		Result* output )
		{
		typedef Index IndexType;
		typedef typename Operation::ResultType ResultType;
		Result* sdata = Devices::Cuda::getSharedMemory< Result >();

		ResultType* sdata = Devices::Cuda::getSharedMemory< ResultType >();
		// Get the thread id (tid), global thread id (gid) and gridSize.
		const Index tid = threadIdx.y * blockDim.x + threadIdx.x;
		Index gid = blockIdx.x * blockDim.x + threadIdx.x;
		const Index gridSizeX = blockDim.x * gridDim.x;

		/***
		* Get thread id (tid) and global element id (gid).
		* gridSizeX is the number of elements in the direction of x-axis
		* processed by all blocks at the same time.
		*/
		const IndexType tid = threadIdx.y * blockDim.x + threadIdx.x;
		IndexType gid = blockIdx.x * blockDim.x + threadIdx.x;
		const IndexType gridSizeX = blockDim.x * gridDim.x;
		// Get the dataset index.
		const int y = blockIdx.y * blockDim.y + threadIdx.y;
		if( y >= n ) return;

		/***
		* Shift input1 and output pointers.
		*/
		const IndexType y = blockIdx.y * blockDim.y + threadIdx.y;
		if( y < n ) {
		input1 += y * ldInput1;
		output += y * gridDim.x;
		}
		else
		return;
		sdata[ tid ] = zero;

		/***
		* Start with the sequential reduction and push the
		* result into the shared memory.
		*/
		sdata[ tid ] = operation.initialValue();
		while( gid + 4 * gridSizeX < size )
		{
		operation.firstReduction( sdata[ tid ], gid, input1, input2 );
		operation.firstReduction( sdata[ tid ], gid + gridSizeX, input1, input2 );
		operation.firstReduction( sdata[ tid ], gid + 2 * gridSizeX, input1, input2 );
		operation.firstReduction( sdata[ tid ], gid + 3 * gridSizeX, input1, input2 );
		// Start with the sequential reduction and push the result into the shared memory.
		while( gid + 4 * gridSizeX < size ) {
		reduction( sdata[ tid ], dataFetcher( gid, y ) );
		reduction( sdata[ tid ], dataFetcher( gid + gridSizeX, y ) );
		reduction( sdata[ tid ], dataFetcher( gid + 2 * gridSizeX, y ) );
		reduction( sdata[ tid ], dataFetcher( gid + 3 * gridSizeX, y ) );
		gid += 4 * gridSizeX;
		}
		while( gid + 2 * gridSizeX < size )
		{
		operation.firstReduction( sdata[ tid ], gid, input1, input2 );
		operation.firstReduction( sdata[ tid ], gid + gridSizeX, input1, input2 );
		while( gid + 2 * gridSizeX < size ) {
		reduction( sdata[ tid ], dataFetcher( gid, y ) );
		reduction( sdata[ tid ], dataFetcher( gid + gridSizeX, y ) );
		gid += 2 * gridSizeX;
		}
		while( gid < size )
		{
		operation.firstReduction( sdata[ tid ], gid, input1, input2 );
		while( gid < size ) {
		reduction( sdata[ tid ], dataFetcher( gid, y ) );
		gid += gridSizeX;
		}
		__syncthreads();


		//printf( "1: tid %d data %f \n", tid, sdata[ tid ] );

		/***
		* Perform the parallel reduction.
		*/
		// Perform the parallel reduction.
		if( blockSizeX >= 1024 ) {
		if( threadIdx.x < 512 ) {
		operation.commonReduction( sdata[ tid ], sdata[ tid + 512 ] );
		}
		if( threadIdx.x < 512 )
		reduction( sdata[ tid ], sdata[ tid + 512 ] );
		__syncthreads();
		}
		if( blockSizeX >= 512 ) {
		if( threadIdx.x < 256 ) {
		operation.commonReduction( sdata[ tid ], sdata[ tid + 256 ] );
		}
		if( threadIdx.x < 256 )
		reduction( sdata[ tid ], sdata[ tid + 256 ] );
		__syncthreads();
		}
		if( blockSizeX >= 256 ) {
		if( threadIdx.x < 128 ) {
		operation.commonReduction( sdata[ tid ], sdata[ tid + 128 ] );
		}
		if( threadIdx.x < 128 )
		reduction( sdata[ tid ], sdata[ tid + 128 ] );
		__syncthreads();
		}
		if( blockSizeX >= 128 ) {
		if( threadIdx.x < 64 ) {
		operation.commonReduction( sdata[ tid ], sdata[ tid + 64 ] );
		}
		if( threadIdx.x < 64 )
		reduction( sdata[ tid ], sdata[ tid + 64 ] );
		__syncthreads();
		}

		/***
		* This runs in one warp so it is synchronized implicitly.
		*
		* When the blockSizeX is less then or equal to the warp size, the shared memory
		* must be at least 2 * blockSizeX elements per block, otherwise unallocated memory
		* will be accessed !!!
		*/
		// This runs in one warp so it is synchronized implicitly.
		if( threadIdx.x < 32 ) {
		volatile ResultType* vsdata = sdata;
		if( blockSizeX >= 64 ) {
		operation.commonReduction( vsdata[ tid ], vsdata[ tid + 32 ] );
		}
		if( blockSizeX >= 32 ) {
		operation.commonReduction( vsdata[ tid ], vsdata[ tid + 16 ] );
		}
		if( blockSizeX >= 16 ) {
		operation.commonReduction( vsdata[ tid ], vsdata[ tid + 8 ] );
		}
		if( blockSizeX >= 8 ) {
		operation.commonReduction( vsdata[ tid ], vsdata[ tid + 4 ] );
		}
		if( blockSizeX >= 4 ) {
		operation.commonReduction( vsdata[ tid ], vsdata[ tid + 2 ] );
		}
		if( blockSizeX >= 2 ) {
		operation.commonReduction( vsdata[ tid ], vsdata[ tid + 1 ] );
		}
		volatile Result* vsdata = sdata;
		if( blockSizeX >= 64 )
		volatileReduction( vsdata[ tid ], vsdata[ tid + 32 ] );
		// Note that here we do not have to check if tid < 16 etc, because we have
		// 2 * blockSize.x elements of shared memory per block, so we do not
		// access out of bounds. The results for the upper half will be undefined,
		// but unused anyway.
		if( blockSizeX >= 32 )
		volatileReduction( vsdata[ tid ], vsdata[ tid + 16 ] );
		if( blockSizeX >= 16 )
		volatileReduction( vsdata[ tid ], vsdata[ tid + 8 ] );
		if( blockSizeX >= 8 )
		volatileReduction( vsdata[ tid ], vsdata[ tid + 4 ] );
		if( blockSizeX >= 4 )
		volatileReduction( vsdata[ tid ], vsdata[ tid + 2 ] );
		if( blockSizeX >= 2 )
		volatileReduction( vsdata[ tid ], vsdata[ tid + 1 ] );
		}

		/***
		* Store the result back in the global memory.
		*/
		// Store the result back in the global memory.
		if( threadIdx.x == 0 ) {
		output[ blockIdx.x ] = sdata[ tid ];
		output[ blockIdx.x + y * gridDim.x ] = sdata[ tid ];
		}
		}
		#endif

		template< typename Operation, typename Index >
		template< typename Result,
		typename DataFetcher,
		typename Reduction,
		typename VolatileReduction,
		typename Index >
		int
		CudaMultireductionKernelLauncher( Operation& operation,
		const int n,
		CudaMultireductionKernelLauncher( const Result zero,
		DataFetcher dataFetcher,
		const Reduction reduction,
		const VolatileReduction volatileReduction,
		const Index size,
		const typename Operation::DataType1* input1,
		const Index ldInput1,
		const typename Operation::DataType2* input2,
		typename Operation::ResultType*& output )
		const int n,
		Result*& output )
		{
		typedef typename Operation::ResultType ResultType;

		#ifdef HAVE_CUDA
		// The number of blocks should be a multiple of the number of multiprocessors
		// to ensure optimum balancing of the load. This is very important, because
		// we run the kernel with a fixed number of blocks, so the amount of work per
		@@ -231,87 +199,86 @@ CudaMultireductionKernelLauncher( Operation& operation,

		// create reference to the reduction buffer singleton and set size
		// (make an overestimate to avoid reallocation on every call if n is increased by 1 each time)
		const size_t buf_size = 8 * ( n / 8 + 1 ) * desGridSizeX * sizeof( ResultType );
		const std::size_t buf_size = 8 * ( n / 8 + 1 ) * desGridSizeX * sizeof( Result );
		CudaReductionBuffer& cudaReductionBuffer = CudaReductionBuffer::getInstance();
		cudaReductionBuffer.setSize( buf_size );
		output = cudaReductionBuffer.template getData< ResultType >();
		output = cudaReductionBuffer.template getData< Result >();

		// when there is only one warp per blockSize.x, we need to allocate two warps
		// worth of shared memory so that we don't index shared memory out of bounds
		const Index shmem = (blockSize.x <= 32)
		? 2 * blockSize.x * blockSize.y * sizeof( ResultType )
		: blockSize.x * blockSize.y * sizeof( ResultType );

		//cout << "Multireduction of " << n << " datasets, block size (" << blockSize.x << "," << blockSize.y << "), grid size (" << gridSize.x << "," << gridSize.y << "), shmem " << shmem <<std::endl;
		? 2 * blockSize.x * blockSize.y * sizeof( Result )
		: blockSize.x * blockSize.y * sizeof( Result );

		/***
		* Depending on the blockSize we generate appropriate template instance.
		*/
		// Depending on the blockSize we generate appropriate template instance.
		switch( blockSize.x )
		{
		case 512:
		CudaMultireductionKernel< 512 >
		<<< gridSize, blockSize, shmem >>>( operation, n, size, input1, ldInput1, input2, output);
		<<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, n, output );
		break;
		case 256:
		cudaFuncSetCacheConfig(CudaMultireductionKernel< 256, Operation, Index >, cudaFuncCachePreferShared);
		cudaFuncSetCacheConfig(CudaMultireductionKernel< 256, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared);

		CudaMultireductionKernel< 256 >
		<<< gridSize, blockSize, shmem >>>( operation, n, size, input1, ldInput1, input2, output);
		<<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, n, output );
		break;
		case 128:
		cudaFuncSetCacheConfig(CudaMultireductionKernel< 128, Operation, Index >, cudaFuncCachePreferShared);
		cudaFuncSetCacheConfig(CudaMultireductionKernel< 128, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared);

		CudaMultireductionKernel< 128 >
		<<< gridSize, blockSize, shmem >>>( operation, n, size, input1, ldInput1, input2, output);
		<<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, n, output );
		break;
		case 64:
		cudaFuncSetCacheConfig(CudaMultireductionKernel< 64, Operation, Index >, cudaFuncCachePreferShared);
		cudaFuncSetCacheConfig(CudaMultireductionKernel< 64, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared);

		CudaMultireductionKernel< 64 >
		<<< gridSize, blockSize, shmem >>>( operation, n, size, input1, ldInput1, input2, output);
		<<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, n, output );
		break;
		case 32:
		cudaFuncSetCacheConfig(CudaMultireductionKernel< 32, Operation, Index >, cudaFuncCachePreferShared);
		cudaFuncSetCacheConfig(CudaMultireductionKernel< 32, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared);

		CudaMultireductionKernel< 32 >
		<<< gridSize, blockSize, shmem >>>( operation, n, size, input1, ldInput1, input2, output);
		<<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, n, output );
		break;
		case 16:
		cudaFuncSetCacheConfig(CudaMultireductionKernel< 16, Operation, Index >, cudaFuncCachePreferShared);
		cudaFuncSetCacheConfig(CudaMultireductionKernel< 16, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared);

		CudaMultireductionKernel< 16 >
		<<< gridSize, blockSize, shmem >>>( operation, n, size, input1, ldInput1, input2, output);
		<<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, n, output );
		break;
		case 8:
		cudaFuncSetCacheConfig(CudaMultireductionKernel< 8, Operation, Index >, cudaFuncCachePreferShared);
		cudaFuncSetCacheConfig(CudaMultireductionKernel< 8, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared);

		CudaMultireductionKernel< 8 >
		<<< gridSize, blockSize, shmem >>>( operation, n, size, input1, ldInput1, input2, output);
		<<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, n, output );
		break;
		case 4:
		cudaFuncSetCacheConfig(CudaMultireductionKernel< 4, Operation, Index >, cudaFuncCachePreferShared);
		cudaFuncSetCacheConfig(CudaMultireductionKernel< 4, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared);

		CudaMultireductionKernel< 4 >
		<<< gridSize, blockSize, shmem >>>( operation, n,size, input1, ldInput1, input2, output);
		<<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, n, output );
		break;
		case 2:
		cudaFuncSetCacheConfig(CudaMultireductionKernel< 2, Operation, Index >, cudaFuncCachePreferShared);
		cudaFuncSetCacheConfig(CudaMultireductionKernel< 2, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared);

		CudaMultireductionKernel< 2 >
		<<< gridSize, blockSize, shmem >>>( operation, n, size, input1, ldInput1, input2, output);
		<<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, n, output );
		break;
		case 1:
		throw std::logic_error( "blockSize should not be 1." );
		default:
		throw std::logic_error( "Block size is " + std::to_string(blockSize.x) + " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." );
		}
		cudaStreamSynchronize(0);
		TNL_CHECK_CUDA_DEVICE;

		// return the size of the output array on the CUDA device
		return gridSize.x;
		}
		#else
		throw Exceptions::CudaSupportMissing();
		#endif
		}

		} // namespace Algorithms
		} // namespace Containers

src/TNL/Containers/Algorithms/Multireduction.h

+50 −20

Original line number	Diff line number	Diff line
		@@ -20,40 +20,70 @@ namespace Containers {
		namespace Algorithms {

		template< typename Device >
		class Multireduction;
		struct Multireduction;

		template<>
		class Multireduction< Devices::Cuda >
		struct Multireduction< Devices::Host >
		{
		public:
		template< typename Operation, typename Index >
		/**
		* Parameters:
		* zero: starting value for reduction
		* dataFetcher: callable object such that `dataFetcher( i, j )` yields
		* the i-th value to be reduced from the j-th dataset
		* (i = 0,...,size-1; j = 0,...,n-1)
		* reduction: callable object representing the reduction operation
		* volatileReduction: callable object representing the reduction operation
		* size: the size of each dataset
		* n: number of datasets to be reduced
		* result: output array of size = n
		*/
		template< typename Result,
		typename DataFetcher,
		typename Reduction,
		typename VolatileReduction,
		typename Index >
		static void
		reduce( Operation& operation,
		const int n,
		reduce( const Result zero,
		DataFetcher dataFetcher,
		const Reduction reduction,
		const VolatileReduction volatileReduction,
		const Index size,
		const typename Operation::DataType1* deviceInput1,
		const Index ldInput1,
		const typename Operation::DataType2* deviceInput2,
		typename Operation::ResultType* hostResult );
		const int n,
		Result* result );
		};

		template<>
		class Multireduction< Devices::Host >
		struct Multireduction< Devices::Cuda >
		{
		public:
		template< typename Operation, typename Index >
		/**
		* Parameters:
		* zero: starting value for reduction
		* dataFetcher: callable object such that `dataFetcher( i, j )` yields
		* the i-th value to be reduced from the j-th dataset
		* (i = 0,...,size-1; j = 0,...,n-1)
		* reduction: callable object representing the reduction operation
		* volatileReduction: callable object representing the reduction operation
		* size: the size of each dataset
		* n: number of datasets to be reduced
		* hostResult: output array of size = n
		*/
		template< typename Result,
		typename DataFetcher,
		typename Reduction,
		typename VolatileReduction,
		typename Index >
		static void
		reduce( Operation& operation,
		const int n,
		reduce( const Result zero,
		DataFetcher dataFetcher,
		const Reduction reduction,
		const VolatileReduction volatileReduction,
		const Index size,
		const typename Operation::DataType1* deviceInput1,
		const Index ldInput1,
		const typename Operation::DataType2* deviceInput2,
		typename Operation::ResultType* hostResult );
		const int n,
		Result* hostResult );
		};

		} // namespace Algorithms
		} // namespace Containers
		} // namespace TNL

		#include "Multireduction_impl.h"
		#include "Multireduction.hpp"

src/TNL/Containers/Algorithms/Multireduction_impl.h→src/TNL/Containers/Algorithms/Multireduction.hpp

+231 −0

File changed and moved.

Preview size limit exceeded, changes collapsed.

src/TNL/Solvers/Linear/GMRES.h

+7 −0

Original line number	Diff line number	Diff line
		@@ -84,9 +84,16 @@ protected:
		void hauseholder_cwy( VectorViewType v,
		const int i );

		// nvcc allows __cuda_callable__ lambdas only in public methods
		#ifdef __NVCC__
		public:
		#endif
		void hauseholder_cwy_transposed( VectorViewType z,
		const int i,
		ConstVectorViewType w );
		#ifdef __NVCC__
		protected:
		#endif

		template< typename Vector >
		void update( const int k,

src/TNL/Solvers/Linear/GMRES_impl.h

+40 −13

Original line number	Diff line number	Diff line
		@@ -15,7 +15,6 @@
		#include <type_traits>
		#include <cmath>

		#include <TNL/Exceptions/CudaSupportMissing.h>
		#include <TNL/Containers/Algorithms/Multireduction.h>
		#include <TNL/Matrices/MatrixOperations.h>

		@@ -427,14 +426,28 @@ hauseholder_generate( const int i,
		if( i > 0 ) {
		// aux = Y_{i-1}^T * y_i
		RealType aux[ i ];
		Containers::Algorithms::ParallelReductionScalarProduct< RealType, RealType > scalarProduct;
		// Containers::Algorithms::ParallelReductionScalarProduct< RealType, RealType > scalarProduct;
		// Containers::Algorithms::Multireduction< DeviceType >::reduce
		// ( scalarProduct,
		// i,
		// size,
		// Y.getData(),
		// ldSize,
		// Traits::getConstLocalView( y_i ).getData(),
		// aux );
		const RealType* _Y = Y.getData();
		const RealType* _y_i = Traits::getConstLocalView( y_i ).getData();
		const IndexType ldSize = this->ldSize;
		auto fetch = [_Y, _y_i, ldSize] __cuda_callable__ ( IndexType idx, int k ) { return _Y[ idx + k * ldSize ] * _y_i[ idx ]; };
		auto reduction = [] __cuda_callable__ ( RealType& a, const RealType& b ) { a += b; };
		auto volatileReduction = [] __cuda_callable__ ( volatile RealType& a, volatile RealType& b ) { a += b; };
		Containers::Algorithms::Multireduction< DeviceType >::reduce
		( scalarProduct,
		i,
		( (RealType) 0,
		fetch,
		reduction,
		volatileReduction,
		size,
		Y.getData(),
		ldSize,
		Traits::getLocalView( y_i ).getData(),
		i,
		aux );
		// no-op if the problem is not distributed
		CommunicatorType::Allreduce( aux, i, MPI_SUM, Traits::getCommunicationGroup( *this->matrix ) );
		@@ -525,14 +538,28 @@ hauseholder_cwy_transposed( VectorViewType z,
		{
		// aux = Y_i^T * w
		RealType aux[ i + 1 ];
		Containers::Algorithms::ParallelReductionScalarProduct< RealType, RealType > scalarProduct;
		// Containers::Algorithms::ParallelReductionScalarProduct< RealType, RealType > scalarProduct;
		// Containers::Algorithms::Multireduction< DeviceType >::reduce
		// ( scalarProduct,
		// i + 1,
		// size,
		// Y.getData(),
		// ldSize,
		// Traits::getConstLocalView( w ).getData(),
		// aux );
		const RealType* _Y = Y.getData();
		const RealType* _w = Traits::getConstLocalView( w ).getData();
		const IndexType ldSize = this->ldSize;
		auto fetch = [_Y, _w, ldSize] __cuda_callable__ ( IndexType idx, int k ) { return _Y[ idx + k * ldSize ] * _w[ idx ]; };
		auto reduction = [] __cuda_callable__ ( RealType& a, const RealType& b ) { a += b; };
		auto volatileReduction = [] __cuda_callable__ ( volatile RealType& a, volatile RealType& b ) { a += b; };
		Containers::Algorithms::Multireduction< DeviceType >::reduce
		( scalarProduct,
		i + 1,
		( (RealType) 0,
		fetch,
		reduction,
		volatileReduction,
		size,
		Y.getData(),
		ldSize,
		Traits::getConstLocalView( w ).getData(),
		i + 1,
		aux );
		// no-op if the problem is not distributed
		Traits::CommunicatorType::Allreduce( aux, i + 1, MPI_SUM, Traits::getCommunicationGroup( *this->matrix ) );