Parallelization of the CWYGMRES solver using OpenMP (1c128e27) · Commits · TNL / tnl-dev

src/TNL/Containers/Algorithms/Multireduction.h

+36 −14

Original line number	Diff line number	Diff line
		#pragma once

		#include <TNL/Devices/Host.h>
		#include <TNL/Devices/Cuda.h>

		namespace TNL {
		namespace Containers {
		namespace Algorithms {

		template< typename Device >
		class Multireduction
		{
		};

		template<>
		class Multireduction< Devices::Cuda >
		{
		public:
		template< typename Operation >
		bool multireductionOnCudaDevice( const Operation& operation,
		static bool
		reduce( Operation& operation,
		int n,
		const typename Operation::IndexType size,
		const typename Operation::RealType* deviceInput1,
		const typename Operation::IndexType ldInput1,
		const typename Operation::RealType* deviceInput2,
		typename Operation::ResultType* hostResult );
		};

		template<>
		class Multireduction< Devices::Host >
		{
		public:
		template< typename Operation >
		bool multireductionOnHostDevice( const Operation& operation,
		static bool
		reduce( Operation& operation,
		int n,
		const typename Operation::IndexType size,
		const typename Operation::RealType* deviceInput1,
		const typename Operation::IndexType ldInput1,
		const typename Operation::RealType* deviceInput2,
		typename Operation::ResultType* hostResult );
		};

		} // namespace Algorithms
		} // namespace Containers

src/TNL/Containers/Algorithms/Multireduction_impl.h

+95 −22

Original line number	Diff line number	Diff line
		#pragma once

		#include "Multireduction.h"

		//#define CUDA_REDUCTION_PROFILING

		#include <TNL/Assert.h>
		@@ -34,7 +36,9 @@ static constexpr int Multireduction_minGpuDataSize = 256;//65536; //16384;//1024
		* hostResult: output array of size = n
		*/
		template< typename Operation >
		bool multireductionOnCudaDevice( Operation& operation,
		bool
		Multireduction< Devices::Cuda >::
		reduce( Operation& operation,
		int n,
		const typename Operation::IndexType size,
		const typename Operation::RealType* deviceInput1,
		@@ -44,6 +48,7 @@ bool multireductionOnCudaDevice( Operation& operation,
		{
		#ifdef HAVE_CUDA
		Assert( n > 0, );
		Assert( size <= ldInput1, );

		typedef typename Operation::IndexType IndexType;
		typedef typename Operation::RealType RealType;
		@@ -62,10 +67,10 @@ bool multireductionOnCudaDevice( Operation& operation,
		RealType hostArray2[ Multireduction_minGpuDataSize ];
		if( ! ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory< RealType, RealType, IndexType >( hostArray2, deviceInput2, n * size ) )
		return false;
		return multireductionOnHostDevice( operation, n, size, hostArray1, ldInput1, hostArray2, hostResult );
		return Multireduction< Devices::Host >::reduce( operation, n, size, hostArray1, ldInput1, hostArray2, hostResult );
		}
		else {
		return multireductionOnHostDevice( operation, n, size, hostArray1, ldInput1, nullptr, hostResult );
		return Multireduction< Devices::Host >::reduce( operation, n, size, hostArray1, ldInput1, nullptr, hostResult );
		}
		}

		@@ -119,7 +124,7 @@ bool multireductionOnCudaDevice( Operation& operation,
		* Reduce the data on the host system.
		*/
		LaterReductionOperation laterReductionOperation;
		multireductionOnHostDevice( laterReductionOperation, n, reducedSize, resultArray, reducedSize, (RealType*) nullptr, hostResult );
		Multireduction< Devices::Host >::reduce( laterReductionOperation, n, reducedSize, resultArray, reducedSize, (RealType*) nullptr, hostResult );

		#ifdef CUDA_REDUCTION_PROFILING
		timer.stop();
		@@ -144,7 +149,9 @@ bool multireductionOnCudaDevice( Operation& operation,
		* hostResult: output array of size = n
		*/
		template< typename Operation >
		bool multireductionOnHostDevice( Operation& operation,
		bool
		Multireduction< Devices::Host >::
		reduce( Operation& operation,
		int n,
		const typename Operation::IndexType size,
		const typename Operation::RealType* input1,
		@@ -152,15 +159,81 @@ bool multireductionOnHostDevice( Operation& operation,
		const typename Operation::RealType* input2,
		typename Operation::ResultType* result )
		{
		Assert( n > 0, );
		Assert( size <= ldInput1, );

		typedef typename Operation::IndexType IndexType;
		typedef typename Operation::RealType RealType;
		typedef typename Operation::ResultType ResultType;

		const int block_size = 128;
		const int blocks = size / block_size;

		#ifdef HAVE_OPENMP
		if( TNL::Devices::Host::isOMPEnabled() && blocks >= 2 )
		#pragma omp parallel
		{
		// first thread initializes the result array
		#pragma omp single nowait
		{
		for( int k = 0; k < n; k++ )
		result[ k ] = operation.initialValue();
		}

		// initialize array for thread-local results
		ResultType r[ n ];
		for( int k = 0; k < n; k++ )
		r[ k ] = operation.initialValue();

		#pragma omp for nowait
		for( int b = 0; b < blocks; b++ ) {
		const int offset = b * block_size;
		for( int k = 0; k < n; k++ ) {
		const RealType* _input1 = input1 + k * ldInput1;
		for( IndexType i = 0; i < block_size; i++ )
		r[ k ] = operation.reduceOnHost( offset + i, r[ k ], _input1, input2 );
		}
		}

		// the first thread that reaches here processes the last, incomplete block
		#pragma omp single nowait
		{
		for( int k = 0; k < n; k++ ) {
		const RealType* _input1 = input1 + k * ldInput1;
		for( IndexType i = blocks * block_size; i < size; i++ )
		r[ k ] = operation.reduceOnHost( i, r[ k ], _input1, input2 );
		}
		}

		// inter-thread reduction of local results
		for( int k = 0; k < n; k++ )
		#pragma omp critical
		{
		operation.commonReductionOnDevice( result[ k ], r[ k ] );
		}
		}
		else {
		#endif
		for( int k = 0; k < n; k++ )
		result[ k ] = operation.initialValue();

		for( int b = 0; b < blocks; b++ ) {
		const int offset = b * block_size;
		for( int k = 0; k < n; k++ ) {
		const RealType* _input1 = input1 + k * ldInput1;
		for( IndexType i = 0; i < block_size; i++ )
		result[ k ] = operation.reduceOnHost( offset + i, result[ k ], _input1, input2 );
		}
		}

		for( int k = 0; k < n; k++ ) {
		const RealType* _input1 = input1 + k * ldInput1;
		for( IndexType i = 0; i < size; i++ )
		for( IndexType i = blocks * block_size; i < size; i++ )
		result[ k ] = operation.reduceOnHost( i, result[ k ], _input1, input2 );
		}
		#ifdef HAVE_OPENMP
		}
		#endif

		return true;
		}

src/TNL/Matrices/MatrixOperations.h

+6 −0

Original line number	Diff line number	Diff line
		@@ -40,6 +40,9 @@ public:
		Assert( m <= lda, );

		if( beta != 0.0 ) {
		#ifdef HAVE_OPENMP
		#pragma omp parallel for if( TNL::Devices::Host::isOMPEnabled() )
		#endif
		for( IndexType j = 0; j < m; j++ ) {
		RealType tmp = 0.0;
		for( int k = 0; k < n; k++ )
		@@ -49,6 +52,9 @@ public:
		}
		else {
		// the vector y might be uninitialized, and 0.0 * NaN = NaN
		#ifdef HAVE_OPENMP
		#pragma omp parallel for if( TNL::Devices::Host::isOMPEnabled() )
		#endif
		for( IndexType j = 0; j < m; j++ ) {
		RealType tmp = 0.0;
		for( int k = 0; k < n; k++ )

src/TNL/Solvers/Linear/CWYGMRES_impl.h

+24 −42

Original line number	Diff line number	Diff line
		@@ -388,17 +388,9 @@ hauseholder_generate( DeviceVector& Y,
		if( i > 0 ) {
		// aux = Y_{i-1}^T * y_i
		RealType aux[ i ];
		if( std::is_same< DeviceType, Devices::Host >::value ) {
		// this is optimized better than multireductionOnHostDevice
		DeviceVector y_k;
		for( int k = 0; k < i; k++ ) {
		y_k.bind( &Y.getData()[ k * ldSize ], size );
		aux[ k ] = y_k.scalarProduct( y_i );
		}
		}
		if( std::is_same< DeviceType, Devices::Cuda >::value ) {
		Containers::Algorithms::tnlParallelReductionScalarProduct< RealType, IndexType > scalarProduct;
		if( ! multireductionOnCudaDevice( scalarProduct,
		if( ! Containers::Algorithms::Multireduction< DeviceType >::reduce
		( scalarProduct,
		i,
		size,
		Y.getData(),
		@@ -406,10 +398,9 @@ hauseholder_generate( DeviceVector& Y,
		y_i.getData(),
		aux ) )
		{
		std::cerr << "multireductionOnCudaDevice failed" << std::endl;
		std::cerr << "multireduction failed" << std::endl;
		throw 1;
		}
		}

		// [T_i]_{0..i-1} = - T_{i-1} * t_i * aux
		for( int k = 0; k < i; k++ ) {
		@@ -503,17 +494,9 @@ hauseholder_cwy_transposed( DeviceVector& z,
		{
		// aux = Y_i^T * w
		RealType aux[ i + 1 ];
		if( std::is_same< DeviceType, Devices::Host >::value ) {
		// this is optimized better than multireductionOnHostDevice
		DeviceVector y_k;
		for( int k = 0; k <= i; k++ ) {
		y_k.bind( &Y.getData()[ k * ldSize ], size );
		aux[ k ] = y_k.scalarProduct( w );
		}
		}
		if( std::is_same< DeviceType, Devices::Cuda >::value ) {
		Containers::Algorithms::tnlParallelReductionScalarProduct< RealType, IndexType > scalarProduct;
		if( ! multireductionOnCudaDevice( scalarProduct,
		if( ! Containers::Algorithms::Multireduction< DeviceType >::reduce
		( scalarProduct,
		i + 1,
		size,
		Y.getData(),
		@@ -521,10 +504,9 @@ hauseholder_cwy_transposed( DeviceVector& z,
		w.getData(),
		aux ) )
		{
		std::cerr << "multireductionOnCudaDevice failed" << std::endl;
		std::cerr << "multireduction failed" << std::endl;
		throw 1;
		}
		}

		// aux = T_i^T * aux
		// Note that T_i^T is lower triangular, so we can overwrite the aux vector with the result in place