Removed volatile reduction completely (13b89a71) · Commits · TNL / tnl-dev

src/Benchmarks/BLAS/CommonVectorOperations.hpp

+17 −34

Original line number	Diff line number	Diff line
		@@ -31,8 +31,7 @@ getVectorMax( const Vector& v )
		const auto* data = v.getData();
		auto fetch = [=] __cuda_callable__ ( IndexType i ) -> ResultType { return data[ i ]; };
		auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::max( a, b ); };
		auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::max( a, b ); };
		return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::lowest() );
		return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, std::numeric_limits< ResultType >::lowest() );
		}

		template< typename Device >
		@@ -49,8 +48,7 @@ getVectorMin( const Vector& v )
		const auto* data = v.getData();
		auto fetch = [=] __cuda_callable__ ( IndexType i ) -> RealType { return data[ i ]; };
		auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::min( a, b ); };
		auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::min( a, b ); };
		return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::max() );
		return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, std::numeric_limits< ResultType >::max() );
		}

		template< typename Device >
		@@ -67,8 +65,7 @@ getVectorAbsMax( const Vector& v )
		const auto* data = v.getData();
		auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data[ i ] ); };
		auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::max( a, b ); };
		auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::max( a, b ); };
		return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::lowest() );
		return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, std::numeric_limits< ResultType >::lowest() );
		}

		template< typename Device >
		@@ -85,8 +82,7 @@ getVectorAbsMin( const Vector& v )
		const auto* data = v.getData();
		auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data[ i ] ); };
		auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::min( a, b ); };
		auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::min( a, b ); };
		return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::max() );
		return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, std::numeric_limits< ResultType >::max() );
		}

		template< typename Device >
		@@ -103,8 +99,7 @@ getVectorL1Norm( const Vector& v )
		const auto* data = v.getData();
		auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data[ i ] ); };
		auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; };
		auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; };
		return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 );
		return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, ( ResultType ) 0 );
		}

		template< typename Device >
		@@ -121,8 +116,7 @@ getVectorL2Norm( const Vector& v )
		const auto* data = v.getData();
		auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data[ i ] * data[ i ]; };
		auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; };
		auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; };
		return std::sqrt( Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 ) );
		return std::sqrt( Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, ( ResultType ) 0 ) );
		}

		template< typename Device >
		@@ -146,8 +140,7 @@ getVectorLpNorm( const Vector& v,
		const auto* data = v.getData();
		auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::pow( TNL::abs( data[ i ] ), p ); };
		auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; };
		auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; };
		return std::pow( Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 ), 1.0 / p );
		return std::pow( Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, ( ResultType ) 0 ), 1.0 / p );
		}

		template< typename Device >
		@@ -167,8 +160,7 @@ getVectorSum( const Vector& v )
		const auto* data = v.getData();
		auto fetch = [=] __cuda_callable__ ( IndexType i ) -> ResultType { return data[ i ]; };
		auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; };
		auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; };
		return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 );
		return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, ( ResultType ) 0 );
		}

		template< typename Device >
		@@ -188,8 +180,7 @@ getVectorDifferenceMax( const Vector1& v1,
		const auto* data2 = v2.getData();
		auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data1[ i ] - data2[ i ]; };
		auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::max( a, b ); };
		auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::max( a, b ); };
		return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::lowest() );
		return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, std::numeric_limits< ResultType >::lowest() );
		}

		template< typename Device >
		@@ -209,8 +200,7 @@ getVectorDifferenceMin( const Vector1& v1,
		const auto* data2 = v2.getData();
		auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data1[ i ] - data2[ i ]; };
		auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::min( a, b ); };
		auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::min( a, b ); };
		return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::max() );
		return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, std::numeric_limits< ResultType >::max() );
		}

		template< typename Device >
		@@ -230,8 +220,7 @@ getVectorDifferenceAbsMax( const Vector1& v1,
		const auto* data2 = v2.getData();
		auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data1[ i ] - data2[ i ] ); };
		auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::max( a, b ); };
		auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::max( a, b ); };
		return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::lowest() );
		return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, std::numeric_limits< ResultType >::lowest() );
		}

		template< typename Device >
		@@ -251,8 +240,7 @@ getVectorDifferenceAbsMin( const Vector1& v1,
		const auto* data2 = v2.getData();
		auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data1[ i ] - data2[ i ] ); };
		auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::min( a, b ); };
		auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::min( a, b ); };
		return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::max() );
		return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, std::numeric_limits< ResultType >::max() );
		}

		template< typename Device >
		@@ -272,8 +260,7 @@ getVectorDifferenceL1Norm( const Vector1& v1,
		const auto* data2 = v2.getData();
		auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data1[ i ] - data2[ i ] ); };
		auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; };
		auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; };
		return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 );
		return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, ( ResultType ) 0 );
		}

		template< typename Device >
		@@ -296,8 +283,7 @@ getVectorDifferenceL2Norm( const Vector1& v1,
		return diff * diff;
		};
		auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; };
		auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; };
		return std::sqrt( Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 ) );
		return std::sqrt( Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, ( ResultType ) 0 ) );
		}

		template< typename Device >
		@@ -324,8 +310,7 @@ getVectorDifferenceLpNorm( const Vector1& v1,
		const auto* data2 = v2.getData();
		auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::pow( TNL::abs( data1[ i ] - data2[ i ] ), p ); };
		auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; };
		auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; };
		return std::pow( Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 ), 1.0 / p );
		return std::pow( Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, ( ResultType ) 0 ), 1.0 / p );
		}

		template< typename Device >
		@@ -345,8 +330,7 @@ getVectorDifferenceSum( const Vector1& v1,
		const auto* data2 = v2.getData();
		auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data1[ i ] - data2[ i ]; };
		auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; };
		auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; };
		return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 );
		return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, ( ResultType ) 0 );
		}

		template< typename Device >
		@@ -366,8 +350,7 @@ getScalarProduct( const Vector1& v1,
		const auto* data2 = v2.getData();
		auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data1[ i ] * data2[ i ]; };
		auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; };
		auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; };
		return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 );
		return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, ( ResultType ) 0 );
		}

		} // namespace Benchmarks

src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp

+3 −6

Original line number	Diff line number	Diff line
		@@ -135,8 +135,7 @@ compare( const Element1* destination,

		auto fetch = [=] __cuda_callable__ ( Index i ) -> bool { return ( destination[ i ] == source[ i ] ); };
		auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; };
		auto volatileReduction = [=] __cuda_callable__ ( volatile bool& a, volatile bool& b ) { a &= b; };
		return Reduction< Devices::Cuda >::reduce( size, reduction, volatileReduction, fetch, true );
		return Reduction< Devices::Cuda >::reduce( size, reduction, fetch, true );
		}

		template< typename Element,
		@@ -153,8 +152,7 @@ containsValue( const Element* data,

		auto fetch = [=] __cuda_callable__ ( Index i ) -> bool { return ( data[ i ] == value ); };
		auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a |= b; };
		auto volatileReduction = [=] __cuda_callable__ ( volatile bool& a, volatile bool& b ) { a |= b; };
		return Reduction< Devices::Cuda >::reduce( size, reduction, volatileReduction, fetch, false );
		return Reduction< Devices::Cuda >::reduce( size, reduction, fetch, false );
		}

		template< typename Element,
		@@ -171,8 +169,7 @@ containsOnlyValue( const Element* data,

		auto fetch = [=] __cuda_callable__ ( Index i ) -> bool { return ( data[ i ] == value ); };
		auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; };
		auto volatileReduction = [=] __cuda_callable__ ( volatile bool& a, volatile bool& b ) { a &= b; };
		return Reduction< Devices::Cuda >::reduce( size, reduction, volatileReduction, fetch, true );
		return Reduction< Devices::Cuda >::reduce( size, reduction, fetch, true );
		}

src/TNL/Containers/Algorithms/Multireduction.h

+0 −6

Original line number	Diff line number	Diff line
		@@ -32,7 +32,6 @@ struct Multireduction< Devices::Host >
		* the i-th value to be reduced from the j-th dataset
		* (i = 0,...,size-1; j = 0,...,n-1)
		* reduction: callable object representing the reduction operation
		* volatileReduction: callable object representing the reduction operation
		* size: the size of each dataset
		* n: number of datasets to be reduced
		* result: output array of size = n
		@@ -40,13 +39,11 @@ struct Multireduction< Devices::Host >
		template< typename Result,
		typename DataFetcher,
		typename Reduction,
		typename VolatileReduction,
		typename Index >
		static void
		reduce( const Result zero,
		DataFetcher dataFetcher,
		const Reduction reduction,
		const VolatileReduction volatileReduction,
		const Index size,
		const int n,
		Result* result );
		@@ -62,7 +59,6 @@ struct Multireduction< Devices::Cuda >
		* the i-th value to be reduced from the j-th dataset
		* (i = 0,...,size-1; j = 0,...,n-1)
		* reduction: callable object representing the reduction operation
		* volatileReduction: callable object representing the reduction operation
		* size: the size of each dataset
		* n: number of datasets to be reduced
		* hostResult: output array of size = n
		@@ -70,13 +66,11 @@ struct Multireduction< Devices::Cuda >
		template< typename Result,
		typename DataFetcher,
		typename Reduction,
		typename VolatileReduction,
		typename Index >
		static void
		reduce( const Result zero,
		DataFetcher dataFetcher,
		const Reduction reduction,
		const VolatileReduction volatileReduction,
		const Index size,
		const int n,
		Result* hostResult );

src/TNL/Containers/Algorithms/Multireduction.hpp

+1 −5

Original line number	Diff line number	Diff line
		@@ -33,14 +33,12 @@ namespace Algorithms {
		template< typename Result,
		typename DataFetcher,
		typename Reduction,
		typename VolatileReduction,
		typename Index >
		void
		Multireduction< Devices::Host >::
		reduce( const Result zero,
		DataFetcher dataFetcher,
		const Reduction reduction,
		const VolatileReduction volatileReduction,
		const Index size,
		const int n,
		Result* result )
		@@ -173,14 +171,12 @@ reduce( const Result zero,
		template< typename Result,
		typename DataFetcher,
		typename Reduction,
		typename VolatileReduction,
		typename Index >
		void
		Multireduction< Devices::Cuda >::
		reduce( const Result zero,
		DataFetcher dataFetcher,
		const Reduction reduction,
		const VolatileReduction volatileReduction,
		const Index size,
		const int n,
		Result* hostResult )
		@@ -218,7 +214,7 @@ reduce( const Result zero,

		// finish the reduction on the host
		auto dataFetcherFinish = [&] ( int i, int k ) { return resultArray[ i + k * reducedSize ]; };
		Multireduction< Devices::Host >::reduce( zero, dataFetcherFinish, reduction, volatileReduction, reducedSize, n, hostResult );
		Multireduction< Devices::Host >::reduce( zero, dataFetcherFinish, reduction, reducedSize, n, hostResult );

		#ifdef CUDA_REDUCTION_PROFILING
		timer.stop();

src/TNL/Containers/Algorithms/Reduction.h

+0 −8

Original line number	Diff line number	Diff line
		@@ -30,24 +30,20 @@ struct Reduction< Devices::Host >
		template< typename Index,
		typename Result,
		typename ReductionOperation,
		typename VolatileReductionOperation,
		typename DataFetcher >
		static Result
		reduce( const Index size,
		ReductionOperation& reduction,
		VolatileReductionOperation& volatileReduction,
		DataFetcher& dataFetcher,
		const Result& zero );

		template< typename Index,
		typename Result,
		typename ReductionOperation,
		typename VolatileReductionOperation,
		typename DataFetcher >
		static std::pair< Index, Result >
		reduceWithArgument( const Index size,
		ReductionOperation& reduction,
		VolatileReductionOperation& volatileReduction,
		DataFetcher& dataFetcher,
		const Result& zero );
		};
		@@ -58,24 +54,20 @@ struct Reduction< Devices::Cuda >
		template< typename Index,
		typename Result,
		typename ReductionOperation,
		typename VolatileReductionOperation,
		typename DataFetcher >
		static Result
		reduce( const Index size,
		ReductionOperation& reduction,
		VolatileReductionOperation& volatileReduction,
		DataFetcher& dataFetcher,
		const Result& zero );

		template< typename Index,
		typename Result,
		typename ReductionOperation,
		typename VolatileReductionOperation,
		typename DataFetcher >
		static std::pair< Index, Result >
		reduceWithArgument( const Index size,
		ReductionOperation& reduction,
		VolatileReductionOperation& volatileReduction,
		DataFetcher& dataFetcher,
		const Result& zero );
		};