diff --git a/src/TNL/Containers/Algorithms/ArrayAssignment.h b/src/TNL/Containers/Algorithms/ArrayAssignment.h index 62ab43a84aa4046ee195439bd3102d1be579f4f7..7f5b8482ee0bb570fa288e8ba7209e12ee90938a 100644 --- a/src/TNL/Containers/Algorithms/ArrayAssignment.h +++ b/src/TNL/Containers/Algorithms/ArrayAssignment.h @@ -86,8 +86,6 @@ struct ArrayAssignment< Array, T, false > }; - - } // namespace Algorithms } // namespace Containers } // namespace TNL diff --git a/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp b/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp index 3efe7e4d472bb5d9766faee73946941a480374e4..0badc8d799c5d5e81a10393c1ceb987ab3fc8e1e 100644 --- a/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp +++ b/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp @@ -193,7 +193,6 @@ copySTLList( DestinationElement* destination, copiedElements += copySize; } } - template< typename Element1, typename Element2, typename Index > @@ -205,8 +204,11 @@ compareMemory( const Element1* destination, { TNL_ASSERT_TRUE( destination, "Attempted to compare data through a nullptr." ); TNL_ASSERT_TRUE( source, "Attempted to compare data through a nullptr." ); - Algorithms::ParallelReductionEqualities< Element1, Element2 > reductionEqualities; - return Reduction< Devices::Cuda >::reduce( reductionEqualities, size, destination, source ); + + auto fetch = [=] __cuda_callable__ ( Index i ) -> bool { return ( destination[ i ] == source[ i ] ); }; + auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; + auto volatileReduction = [=] __cuda_callable__ ( volatile bool& a, volatile bool& b ) { a &= b; }; + return Reduction< Devices::Cuda >::reduce( size, reduction, volatileReduction, fetch, true ); } template< typename Element, @@ -219,10 +221,12 @@ containsValue( const Element* data, { TNL_ASSERT_TRUE( data, "Attempted to check data through a nullptr." ); TNL_ASSERT_GE( size, 0, "" ); + if( size == 0 ) return false; - Algorithms::ParallelReductionContainsValue< Element > reductionContainsValue; - reductionContainsValue.setValue( value ); - return Reduction< Devices::Cuda >::reduce( reductionContainsValue, size, data, nullptr ); + auto fetch = [=] __cuda_callable__ ( Index i ) -> bool { return ( data[ i ] == value ); }; + auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a |= b; }; + auto volatileReduction = [=] __cuda_callable__ ( volatile bool& a, volatile bool& b ) { a |= b; }; + return Reduction< Devices::Cuda >::reduce( size, reduction, volatileReduction, fetch, false ); } template< typename Element, @@ -236,9 +240,12 @@ containsOnlyValue( const Element* data, TNL_ASSERT_TRUE( data, "Attempted to check data through a nullptr." 
); TNL_ASSERT_GE( size, 0, "" ); if( size == 0 ) return false; - Algorithms::ParallelReductionContainsOnlyValue< Element > reductionContainsOnlyValue; - reductionContainsOnlyValue.setValue( value ); - return Reduction< Devices::Cuda >::reduce( reductionContainsOnlyValue, size, data, nullptr ); + + if( size == 0 ) return false; + auto fetch = [=] __cuda_callable__ ( Index i ) -> bool { return ( data[ i ] == value ); }; + auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; + auto volatileReduction = [=] __cuda_callable__ ( volatile bool& a, volatile bool& b ) { a &= b; }; + return Reduction< Devices::Cuda >::reduce( size, reduction, volatileReduction, fetch, true ); } diff --git a/src/TNL/Containers/Algorithms/CommonVectorOperations.h b/src/TNL/Containers/Algorithms/CommonVectorOperations.h new file mode 100644 index 0000000000000000000000000000000000000000..bd362fbabba8c93d64857dbc394860f6b23928f9 --- /dev/null +++ b/src/TNL/Containers/Algorithms/CommonVectorOperations.h @@ -0,0 +1,79 @@ +/*************************************************************************** + CommonVectorOperations.h - description + ------------------- + begin : Apr 12, 2019 + copyright : (C) 2019 by Tomas Oberhuber + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +#pragma once + +namespace TNL { +namespace Containers { +namespace Algorithms { + +template< typename Device > +struct CommonVectorOperations +{ + using DeviceType = Device; + + template< typename Vector, typename ResultType = typename Vector::RealType > + static ResultType getVectorMax( const Vector& v ); + + template< typename Vector, typename ResultType = typename Vector::RealType > + static ResultType getVectorMin( const Vector& v ); + + template< typename Vector, typename ResultType = typename Vector::RealType > + static ResultType getVectorAbsMax( const Vector& v ); + + template< typename Vector, typename ResultType = typename Vector::RealType > + static ResultType getVectorAbsMin( const Vector& v ); + + template< typename Vector, typename ResultType = typename Vector::RealType > + static ResultType getVectorL1Norm( const Vector& v ); + + template< typename Vector, typename ResultType = typename Vector::RealType > + static ResultType getVectorL2Norm( const Vector& v ); + + template< typename Vector, typename ResultType = typename Vector::RealType, typename Scalar > + static ResultType getVectorLpNorm( const Vector& v, const Scalar p ); + + template< typename Vector, typename ResultType = typename Vector::RealType > + static ResultType getVectorSum( const Vector& v ); + + template< typename Vector1, typename Vector2, typename ResultType = typename Vector1::RealType > + static ResultType getVectorDifferenceMax( const Vector1& v1, const Vector2& v2 ); + + template< typename Vector1, typename Vector2, typename ResultType = typename Vector1::RealType > + static ResultType getVectorDifferenceMin( const Vector1& v1, const Vector2& v2 ); + + template< typename Vector1, typename Vector2, typename ResultType = typename Vector1::RealType > + static ResultType getVectorDifferenceAbsMax( const Vector1& v1, const Vector2& v2 ); + + template< typename Vector1, typename Vector2, typename ResultType = typename Vector1::RealType > + static ResultType getVectorDifferenceAbsMin( const Vector1& v1, const Vector2& v2 ); + + template< typename Vector1, typename Vector2, typename ResultType = typename Vector1::RealType > + static 
ResultType getVectorDifferenceL1Norm( const Vector1& v1, const Vector2& v2 ); + + template< typename Vector1, typename Vector2, typename ResultType = typename Vector1::RealType > + static ResultType getVectorDifferenceL2Norm( const Vector1& v1, const Vector2& v2 ); + + template< typename Vector1, typename Vector2, typename ResultType = typename Vector1::RealType, typename Scalar > + static ResultType getVectorDifferenceLpNorm( const Vector1& v1, const Vector2& v2, const Scalar p ); + + template< typename Vector1, typename Vector2, typename ResultType = typename Vector1::RealType > + static ResultType getVectorDifferenceSum( const Vector1& v1, const Vector2& v2 ); + + template< typename Vector1, typename Vector2, typename ResultType = typename Vector1::RealType > + static ResultType getScalarProduct( const Vector1& v1, const Vector2& v2 ); + +}; + +} // namespace Algorithms +} // namespace Containers +} // namespace TNL + +#include <TNL/Containers/Algorithms/CommonVectorOperations.hpp> diff --git a/src/TNL/Containers/Algorithms/CommonVectorOperations.hpp b/src/TNL/Containers/Algorithms/CommonVectorOperations.hpp new file mode 100644 index 0000000000000000000000000000000000000000..ef3317cb7a0636e3dc5ef32a1798addfdf90b6cb --- /dev/null +++ b/src/TNL/Containers/Algorithms/CommonVectorOperations.hpp @@ -0,0 +1,377 @@ +/*************************************************************************** + CommonVectorOperations.hpp - description + ------------------- + begin : Apr 12, 2019 + copyright : (C) 2019 by Tomas Oberhuber + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +#pragma once + +#include <limits> // for std::numeric_limits (neutral elements of min/max reductions) +#include <cmath> // for std::sqrt and std::pow (norms) +#include <TNL/Containers/Algorithms/CommonVectorOperations.h> +#include <TNL/Containers/Algorithms/Reduction.h> + +namespace TNL { +namespace Containers { +namespace Algorithms { + +template< typename Device > + template< typename Vector, typename ResultType > +ResultType +CommonVectorOperations< Device >:: +getVectorMax( const Vector& v ) +{ + TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." ); + + using RealType = typename Vector::RealType; + using IndexType = typename Vector::IndexType; + + const auto* data = v.getData(); + auto fetch = [=] __cuda_callable__ ( IndexType i ) -> ResultType { return data[ i ]; }; + auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::max( a, b ); }; + auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::max( a, b ); }; + return Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::lowest() ); +} + +template< typename Device > + template< typename Vector, typename ResultType > +ResultType +CommonVectorOperations< Device >:: +getVectorMin( const Vector& v ) +{ + TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive."
); + + using RealType = typename Vector::RealType; + using IndexType = typename Vector::IndexType; + + const auto* data = v.getData(); + auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data[ i ]; }; + auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::min( a, b ); }; + auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::min( a, b ); }; + return Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::max() ); +} + +template< typename Device > + template< typename Vector, typename ResultType > +ResultType +CommonVectorOperations< Device >:: +getVectorAbsMax( const Vector& v ) +{ + TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." ); + + using RealType = typename Vector::RealType; + using IndexType = typename Vector::IndexType; + + const auto* data = v.getData(); + auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data[ i ] ); }; + auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::max( a, b ); }; + auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::max( a, b ); }; + return Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::lowest() ); +} + +template< typename Device > + template< typename Vector, typename ResultType > +ResultType +CommonVectorOperations< Device >:: +getVectorAbsMin( const Vector& v ) +{ + TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." ); + + using RealType = typename Vector::RealType; + using IndexType = typename Vector::IndexType; + + const auto* data = v.getData(); + auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data[ i ] ); }; + auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::min( a, b ); }; + auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::min( a, b ); }; + return Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::max() ); +} + +template< typename Device > + template< typename Vector, typename ResultType > +ResultType +CommonVectorOperations< Device >:: +getVectorL1Norm( const Vector& v ) +{ + TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." ); + + using RealType = typename Vector::RealType; + using IndexType = typename Vector::IndexType; + + const auto* data = v.getData(); + auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data[ i ] ); }; + auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; + auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; }; + return Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 ); +} + +template< typename Device > + template< typename Vector, typename ResultType > +ResultType +CommonVectorOperations< Device >:: +getVectorL2Norm( const Vector& v ) +{ + TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." 
); + + using RealType = typename Vector::RealType; + using IndexType = typename Vector::IndexType; + + const auto* data = v.getData(); + auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data[ i ] * data[ i ]; }; + auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; + auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; }; + return std::sqrt( Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 ) ); +} + +template< typename Device > + template< typename Vector, typename ResultType, typename Scalar > +ResultType +CommonVectorOperations< Device >:: +getVectorLpNorm( const Vector& v, + const Scalar p ) +{ + TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." ); + TNL_ASSERT_GE( p, 1.0, "Parameter of the L^p norm must be at least 1.0." ); + + using RealType = typename Vector::RealType; + using IndexType = typename Vector::IndexType; + + if( p == 1.0 ) + return getVectorL1Norm< Vector, ResultType >( v ); + if( p == 2.0 ) + return getVectorL2Norm< Vector, ResultType >( v ); + + const auto* data = v.getData(); + auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::pow( TNL::abs( data[ i ] ), p ); }; + auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; + auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; }; + return std::pow( Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 ), 1.0 / p ); +} + +template< typename Device > + template< typename Vector, typename ResultType > +ResultType +CommonVectorOperations< Device >:: +getVectorSum( const Vector& v ) +{ + TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." ); + + if( std::is_same< ResultType, bool >::value ) + abort(); + + using RealType = typename Vector::RealType; + using IndexType = typename Vector::IndexType; + + const auto* data = v.getData(); + auto fetch = [=] __cuda_callable__ ( IndexType i ) -> ResultType { return data[ i ]; }; + auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; + auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; }; + return Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 ); +} + +template< typename Device > + template< typename Vector1, typename Vector2, typename ResultType > +ResultType +CommonVectorOperations< Device >:: +getVectorDifferenceMax( const Vector1& v1, + const Vector2& v2 ) +{ + TNL_ASSERT_GT( v1.getSize(), 0, "Vector size must be positive." ); + TNL_ASSERT_EQ( v1.getSize(), v2.getSize(), "The vector sizes must be the same." 
); + + using RealType = typename Vector1::RealType; + using IndexType = typename Vector1::IndexType; + + const auto* data1 = v1.getData(); + const auto* data2 = v2.getData(); + auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data1[ i ] - data2[ i ]; }; + auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::max( a, b ); }; + auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::max( a, b ); }; + return Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::lowest() ); +} + +template< typename Device > + template< typename Vector1, typename Vector2, typename ResultType > +ResultType +CommonVectorOperations< Device >:: +getVectorDifferenceMin( const Vector1& v1, + const Vector2& v2 ) +{ + TNL_ASSERT_GT( v1.getSize(), 0, "Vector size must be positive." ); + TNL_ASSERT_EQ( v1.getSize(), v2.getSize(), "The vector sizes must be the same." ); + + using RealType = typename Vector1::RealType; + using IndexType = typename Vector1::IndexType; + + const auto* data1 = v1.getData(); + const auto* data2 = v2.getData(); + auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data1[ i ] - data2[ i ]; }; + auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::min( a, b ); }; + auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::min( a, b ); }; + return Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::max() ); +} + +template< typename Device > + template< typename Vector1, typename Vector2, typename ResultType > +ResultType +CommonVectorOperations< Device >:: +getVectorDifferenceAbsMax( const Vector1& v1, + const Vector2& v2 ) +{ + TNL_ASSERT_GT( v1.getSize(), 0, "Vector size must be positive." ); + TNL_ASSERT_EQ( v1.getSize(), v2.getSize(), "The vector sizes must be the same." ); + + using RealType = typename Vector1::RealType; + using IndexType = typename Vector1::IndexType; + + const auto* data1 = v1.getData(); + const auto* data2 = v2.getData(); + auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data1[ i ] - data2[ i ] ); }; + auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::max( a, b ); }; + auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::max( a, b ); }; + return Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::lowest() ); +} + +template< typename Device > + template< typename Vector1, typename Vector2, typename ResultType > +ResultType +CommonVectorOperations< Device >:: +getVectorDifferenceAbsMin( const Vector1& v1, + const Vector2& v2 ) +{ + TNL_ASSERT_GT( v1.getSize(), 0, "Vector size must be positive." ); + TNL_ASSERT_EQ( v1.getSize(), v2.getSize(), "The vector sizes must be the same." 
); + + using RealType = typename Vector1::RealType; + using IndexType = typename Vector1::IndexType; + + const auto* data1 = v1.getData(); + const auto* data2 = v2.getData(); + auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data1[ i ] - data2[ i ] ); }; + auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::min( a, b ); }; + auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::min( a, b ); }; + return Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::max() ); +} + +template< typename Device > + template< typename Vector1, typename Vector2, typename ResultType > +ResultType +CommonVectorOperations< Device >:: +getVectorDifferenceL1Norm( const Vector1& v1, + const Vector2& v2 ) +{ + TNL_ASSERT_GT( v1.getSize(), 0, "Vector size must be positive." ); + TNL_ASSERT_EQ( v1.getSize(), v2.getSize(), "The vector sizes must be the same." ); + + using RealType = typename Vector1::RealType; + using IndexType = typename Vector1::IndexType; + + const auto* data1 = v1.getData(); + const auto* data2 = v2.getData(); + auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data1[ i ] - data2[ i ] ); }; + auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; + auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; }; + return Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 ); +} + +template< typename Device > + template< typename Vector1, typename Vector2, typename ResultType > +ResultType +CommonVectorOperations< Device >:: +getVectorDifferenceL2Norm( const Vector1& v1, + const Vector2& v2 ) +{ + TNL_ASSERT_GT( v1.getSize(), 0, "Vector size must be positive." ); + TNL_ASSERT_EQ( v1.getSize(), v2.getSize(), "The vector sizes must be the same." ); + + using RealType = typename Vector1::RealType; + using IndexType = typename Vector1::IndexType; + + const auto* data1 = v1.getData(); + const auto* data2 = v2.getData(); + auto fetch = [=] __cuda_callable__ ( IndexType i ) { + auto diff = data1[ i ] - data2[ i ]; + return diff * diff; + }; + auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; + auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; }; + return std::sqrt( Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 ) ); +} + +template< typename Device > + template< typename Vector1, typename Vector2, typename ResultType, typename Scalar > +ResultType +CommonVectorOperations< Device >:: +getVectorDifferenceLpNorm( const Vector1& v1, + const Vector2& v2, + const Scalar p ) +{ + TNL_ASSERT_GT( v1.getSize(), 0, "Vector size must be positive." ); + TNL_ASSERT_EQ( v1.getSize(), v2.getSize(), "The vector sizes must be the same." ); + TNL_ASSERT_GE( p, 1.0, "Parameter of the L^p norm must be at least 1.0." 
); + + if( p == 1.0 ) + return getVectorDifferenceL1Norm< Vector1, Vector2, ResultType >( v1, v2 ); + if( p == 2.0 ) + return getVectorDifferenceL2Norm< Vector1, Vector2, ResultType >( v1, v2 ); + + using RealType = typename Vector1::RealType; + using IndexType = typename Vector1::IndexType; + + const auto* data1 = v1.getData(); + const auto* data2 = v2.getData(); + auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::pow( TNL::abs( data1[ i ] - data2[ i ] ), p ); }; + auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; + auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; }; + return std::pow( Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 ), 1.0 / p ); +} + +template< typename Device > + template< typename Vector1, typename Vector2, typename ResultType > +ResultType +CommonVectorOperations< Device >:: +getVectorDifferenceSum( const Vector1& v1, + const Vector2& v2 ) +{ + TNL_ASSERT_GT( v1.getSize(), 0, "Vector size must be positive." ); + TNL_ASSERT_EQ( v1.getSize(), v2.getSize(), "The vector sizes must be the same." ); + + using RealType = typename Vector1::RealType; + using IndexType = typename Vector1::IndexType; + + const auto* data1 = v1.getData(); + const auto* data2 = v2.getData(); + auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data1[ i ] - data2[ i ]; }; + auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; + auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; }; + return Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 ); +} + +template< typename Device > + template< typename Vector1, typename Vector2, typename ResultType > +ResultType +CommonVectorOperations< Device >:: +getScalarProduct( const Vector1& v1, + const Vector2& v2 ) +{ + TNL_ASSERT_GT( v1.getSize(), 0, "Vector size must be positive." ); + TNL_ASSERT_EQ( v1.getSize(), v2.getSize(), "The vector sizes must be the same." 
); + + using RealType = typename Vector1::RealType; + using IndexType = typename Vector1::IndexType; + + const auto* data1 = v1.getData(); + const auto* data2 = v2.getData(); + auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data1[ i ] * data2[ i ]; }; + auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; + auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; }; + return Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 ); +} + +} // namespace Algorithms +} // namespace Containers +} // namespace TNL diff --git a/src/TNL/Containers/Algorithms/CudaReductionKernel.h b/src/TNL/Containers/Algorithms/CudaReductionKernel.h index a5823849d443bd70831ddff6252631fb333d4aff..8701b8b7df3ade43bbf9f6cd195142915226d110 100644 --- a/src/TNL/Containers/Algorithms/CudaReductionKernel.h +++ b/src/TNL/Containers/Algorithms/CudaReductionKernel.h @@ -39,17 +39,23 @@ static constexpr int Reduction_registersPerThread = 32; // empirically determi static constexpr int Reduction_minBlocksPerMultiprocessor = 4; #endif -template< int blockSize, typename Operation, typename Index > +template< int blockSize, + typename Result, + typename DataFetcher, + typename Reduction, + typename VolatileReduction, + typename Index > __global__ void __launch_bounds__( Reduction_maxThreadsPerBlock, Reduction_minBlocksPerMultiprocessor ) -CudaReductionKernel( Operation operation, +CudaReductionKernel( const Result zero, + const DataFetcher dataFetcher, + const Reduction reduction, + const VolatileReduction volatileReduction, const Index size, - const typename Operation::DataType1* input1, - const typename Operation::DataType2* input2, - typename Operation::ResultType* output ) + Result* output ) { - typedef Index IndexType; - typedef typename Operation::ResultType ResultType; + using IndexType = Index; + using ResultType = Result; ResultType* sdata = Devices::Cuda::getSharedMemory< ResultType >(); @@ -62,55 +68,53 @@ CudaReductionKernel( Operation operation, IndexType gid = blockIdx.x * blockDim. x + threadIdx.x; const IndexType gridSize = blockDim.x * gridDim.x; - sdata[ tid ] = operation.initialValue(); + sdata[ tid ] = zero; + /*** * Read data into the shared memory. We start with the * sequential reduction. */ while( gid + 4 * gridSize < size ) { - operation.firstReduction( sdata[ tid ], gid, input1, input2 ); - operation.firstReduction( sdata[ tid ], gid + gridSize, input1, input2 ); - operation.firstReduction( sdata[ tid ], gid + 2 * gridSize, input1, input2 ); - operation.firstReduction( sdata[ tid ], gid + 3 * gridSize, input1, input2 ); + reduction( sdata[ tid ], dataFetcher( gid ) ); + reduction( sdata[ tid ], dataFetcher( gid + gridSize ) ); + reduction( sdata[ tid ], dataFetcher( gid + 2 * gridSize ) ); + reduction( sdata[ tid ], dataFetcher( gid + 3 * gridSize ) ); gid += 4 * gridSize; } while( gid + 2 * gridSize < size ) { - operation.firstReduction( sdata[ tid ], gid, input1, input2 ); - operation.firstReduction( sdata[ tid ], gid + gridSize, input1, input2 ); + reduction( sdata[ tid ], dataFetcher( gid ) ); + reduction( sdata[ tid ], dataFetcher( gid + gridSize ) ); gid += 2 * gridSize; } while( gid < size ) { - operation.firstReduction( sdata[ tid ], gid, input1, input2 ); + reduction( sdata[ tid ], dataFetcher( gid ) ); gid += gridSize; } __syncthreads(); - //printf( "1: tid %d data %f \n", tid, sdata[ tid ] ); - - //return; /*** * Perform the parallel reduction. 
*/ if( blockSize >= 1024 ) { if( tid < 512 ) - operation.commonReduction( sdata[ tid ], sdata[ tid + 512 ] ); + reduction( sdata[ tid ], sdata[ tid + 512 ] ); __syncthreads(); } if( blockSize >= 512 ) { if( tid < 256 ) - operation.commonReduction( sdata[ tid ], sdata[ tid + 256 ] ); + reduction( sdata[ tid ], sdata[ tid + 256 ] ); __syncthreads(); } if( blockSize >= 256 ) { if( tid < 128 ) - operation.commonReduction( sdata[ tid ], sdata[ tid + 128 ] ); + reduction( sdata[ tid ], sdata[ tid + 128 ] ); __syncthreads(); //printf( "2: tid %d data %f \n", tid, sdata[ tid ] ); } @@ -118,48 +122,47 @@ CudaReductionKernel( Operation operation, if( blockSize >= 128 ) { if( tid < 64 ) - operation.commonReduction( sdata[ tid ], sdata[ tid + 64 ] ); + reduction( sdata[ tid ], sdata[ tid + 64 ] ); __syncthreads(); //printf( "3: tid %d data %f \n", tid, sdata[ tid ] ); } - /*** * This runs in one warp so it is synchronized implicitly. - */ + */ if( tid < 32 ) { volatile ResultType* vsdata = sdata; if( blockSize >= 64 ) { - operation.commonReduction( vsdata[ tid ], vsdata[ tid + 32 ] ); + volatileReduction( vsdata[ tid ], vsdata[ tid + 32 ] ); //printf( "4: tid %d data %f \n", tid, sdata[ tid ] ); } // TODO: If blocksize == 32, the following does not work // We do not check if tid < 16. Fix it!!! if( blockSize >= 32 ) { - operation.commonReduction( vsdata[ tid ], vsdata[ tid + 16 ] ); + volatileReduction( vsdata[ tid ], vsdata[ tid + 16 ] ); //printf( "5: tid %d data %f \n", tid, sdata[ tid ] ); } if( blockSize >= 16 ) { - operation.commonReduction( vsdata[ tid ], vsdata[ tid + 8 ] ); + volatileReduction( vsdata[ tid ], vsdata[ tid + 8 ] ); //printf( "6: tid %d data %f \n", tid, sdata[ tid ] ); } if( blockSize >= 8 ) { - operation.commonReduction( vsdata[ tid ], vsdata[ tid + 4 ] ); + volatileReduction( vsdata[ tid ], vsdata[ tid + 4 ] ); //printf( "7: tid %d data %f \n", tid, sdata[ tid ] ); } if( blockSize >= 4 ) { - operation.commonReduction( vsdata[ tid ], vsdata[ tid + 2 ] ); + volatileReduction( vsdata[ tid ], vsdata[ tid + 2 ] ); //printf( "8: tid %d data %f \n", tid, sdata[ tid ] ); } if( blockSize >= 2 ) { - operation.commonReduction( vsdata[ tid ], vsdata[ tid + 1 ] ); + volatileReduction( vsdata[ tid ], vsdata[ tid + 1 ] ); //printf( "9: tid %d data %f \n", tid, sdata[ tid ] ); } } @@ -175,17 +178,14 @@ CudaReductionKernel( Operation operation, } -template< typename Operation, typename Index > -int -CudaReductionKernelLauncher( Operation& operation, - const Index size, - const typename Operation::DataType1* input1, - const typename Operation::DataType2* input2, - typename Operation::ResultType*& output ) +template< typename Index, + typename Result > +struct CudaReductionKernelLauncher { - typedef Index IndexType; - typedef typename Operation::ResultType ResultType; + using IndexType = Index; + using ResultType = Result; + //// // The number of blocks should be a multiple of the number of multiprocessors // to ensure optimum balancing of the load. This is very important, because // we run the kernel with a fixed number of blocks, so the amount of work per @@ -195,93 +195,159 @@ CudaReductionKernelLauncher( Operation& operation, // where blocksPerMultiprocessor is determined according to the number of // available registers on the multiprocessor. // On Tesla K40c, desGridSize = 8 * 15 = 120. 
- const int activeDevice = Devices::CudaDeviceInfo::getActiveDevice(); - const int blocksdPerMultiprocessor = Devices::CudaDeviceInfo::getRegistersPerMultiprocessor( activeDevice ) - / ( Reduction_maxThreadsPerBlock * Reduction_registersPerThread ); - const int desGridSize = blocksdPerMultiprocessor * Devices::CudaDeviceInfo::getCudaMultiprocessors( activeDevice ); - dim3 blockSize, gridSize; - blockSize.x = Reduction_maxThreadsPerBlock; - gridSize.x = min( Devices::Cuda::getNumberOfBlocks( size, blockSize.x ), desGridSize ); - - // create reference to the reduction buffer singleton and set size - const size_t buf_size = desGridSize * sizeof( ResultType ); - CudaReductionBuffer& cudaReductionBuffer = CudaReductionBuffer::getInstance(); - cudaReductionBuffer.setSize( buf_size ); - output = cudaReductionBuffer.template getData< ResultType >(); - - // when there is only one warp per blockSize.x, we need to allocate two warps - // worth of shared memory so that we don't index shared memory out of bounds - const IndexType shmem = (blockSize.x <= 32) - ? 2 * blockSize.x * sizeof( ResultType ) - : blockSize.x * sizeof( ResultType ); + CudaReductionKernelLauncher( const Index size ) + : activeDevice( Devices::CudaDeviceInfo::getActiveDevice() ), + blocksdPerMultiprocessor( Devices::CudaDeviceInfo::getRegistersPerMultiprocessor( activeDevice ) + / ( Reduction_maxThreadsPerBlock * Reduction_registersPerThread ) ), + desGridSize( blocksdPerMultiprocessor * Devices::CudaDeviceInfo::getCudaMultiprocessors( activeDevice ) ), + originalSize( size ) + { + } - /*** - * Depending on the blockSize we generate appropriate template instance. - */ - switch( blockSize.x ) + template< typename DataFetcher, + typename Reduction, + typename VolatileReduction > + int start( const Reduction& reduction, + const VolatileReduction& volatileReduction, + const DataFetcher& dataFetcher, + const Result& zero, + ResultType*& output ) { - case 512: - CudaReductionKernel< 512 > - <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); - break; - case 256: - cudaFuncSetCacheConfig(CudaReductionKernel< 256, Operation, Index >, cudaFuncCachePreferShared); - - CudaReductionKernel< 256 > - <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); - break; - case 128: - cudaFuncSetCacheConfig(CudaReductionKernel< 128, Operation, Index >, cudaFuncCachePreferShared); - - CudaReductionKernel< 128 > - <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); - break; - case 64: - cudaFuncSetCacheConfig(CudaReductionKernel< 64, Operation, Index >, cudaFuncCachePreferShared); - - CudaReductionKernel< 64 > - <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); - break; - case 32: - cudaFuncSetCacheConfig(CudaReductionKernel< 32, Operation, Index >, cudaFuncCachePreferShared); - - CudaReductionKernel< 32 > - <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); - break; - case 16: - cudaFuncSetCacheConfig(CudaReductionKernel< 16, Operation, Index >, cudaFuncCachePreferShared); - - CudaReductionKernel< 16 > - <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); - break; - case 8: - cudaFuncSetCacheConfig(CudaReductionKernel< 8, Operation, Index >, cudaFuncCachePreferShared); - - CudaReductionKernel< 8 > - <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); - break; - case 4: - cudaFuncSetCacheConfig(CudaReductionKernel< 4, Operation, Index >, cudaFuncCachePreferShared); - - 
CudaReductionKernel< 4 > - <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); - break; - case 2: - cudaFuncSetCacheConfig(CudaReductionKernel< 2, Operation, Index >, cudaFuncCachePreferShared); - - CudaReductionKernel< 2 > - <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); - break; - case 1: - TNL_ASSERT( false, std::cerr << "blockSize should not be 1." << std::endl ); - default: - TNL_ASSERT( false, std::cerr << "Block size is " << blockSize. x << " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." ); + //// + // create reference to the reduction buffer singleton and set size + const size_t buf_size = 2 * desGridSize * sizeof( ResultType ); + CudaReductionBuffer& cudaReductionBuffer = CudaReductionBuffer::getInstance(); + cudaReductionBuffer.setSize( buf_size ); + output = cudaReductionBuffer.template getData< ResultType >(); + + this->reducedSize = this->launch( originalSize, reduction, volatileReduction, dataFetcher, zero, output ); + return this->reducedSize; } - TNL_CHECK_CUDA_DEVICE; - // return the size of the output array on the CUDA device - return gridSize.x; -} + template< typename Reduction, + typename VolatileReduction > + Result finish( const Reduction& reduction, + const VolatileReduction& volatileReduction, + const Result& zero ) + { + //// + // Input is the first half of the buffer, output is the second half + CudaReductionBuffer& cudaReductionBuffer = CudaReductionBuffer::getInstance(); + ResultType* input = cudaReductionBuffer.template getData< ResultType >(); + ResultType* output = &input[ desGridSize ]; + + while( this->reducedSize > 1 ) + { + // fetch from the buffer holding the results of the previous pass + auto copyFetch = [=] __cuda_callable__ ( IndexType i ) { return input[ i ]; }; + this->reducedSize = this->launch( this->reducedSize, reduction, volatileReduction, copyFetch, zero, output ); + std::swap( input, output ); + } + + //// + // Copy the result to the CPU - after the last swap it is pointed to by 'input' + ResultType result; + ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory( &result, input, 1 ); + return result; + } + + protected: + template< typename DataFetcher, + typename Reduction, + typename VolatileReduction > + int launch( const Index size, + const Reduction& reduction, + const VolatileReduction& volatileReduction, + const DataFetcher& dataFetcher, + const Result& zero, + Result* output ) + { + dim3 blockSize, gridSize; + blockSize.x = Reduction_maxThreadsPerBlock; + gridSize.x = min( Devices::Cuda::getNumberOfBlocks( size, blockSize.x ), desGridSize ); + + //// + // when there is only one warp per blockSize.x, we need to allocate two warps + // worth of shared memory so that we don't index shared memory out of bounds + const IndexType shmem = (blockSize.x <= 32) + ? 2 * blockSize.x * sizeof( ResultType ) + : blockSize.x * sizeof( ResultType ); + + /*** + * Depending on the blockSize we generate appropriate template instance.
+ */ + switch( blockSize.x ) + { + case 512: + CudaReductionKernel< 512 > + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); + break; + case 256: + cudaFuncSetCacheConfig(CudaReductionKernel< 256, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + + CudaReductionKernel< 256 > + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); + break; + case 128: + cudaFuncSetCacheConfig(CudaReductionKernel< 128, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + + CudaReductionKernel< 128 > + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); + break; + case 64: + cudaFuncSetCacheConfig(CudaReductionKernel< 64, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + + CudaReductionKernel< 64 > + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); + break; + case 32: + cudaFuncSetCacheConfig(CudaReductionKernel< 32, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + + CudaReductionKernel< 32 > + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); + break; + case 16: + cudaFuncSetCacheConfig(CudaReductionKernel< 16, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + + CudaReductionKernel< 16 > + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); + break; + case 8: + cudaFuncSetCacheConfig(CudaReductionKernel< 8, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + + CudaReductionKernel< 8 > + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); + break; + case 4: + cudaFuncSetCacheConfig(CudaReductionKernel< 4, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + + CudaReductionKernel< 4 > + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); + break; + case 2: + cudaFuncSetCacheConfig(CudaReductionKernel< 2, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + + CudaReductionKernel< 2 > + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); + break; + case 1: + TNL_ASSERT( false, std::cerr << "blockSize should not be 1." << std::endl ); + default: + TNL_ASSERT( false, std::cerr << "Block size is " << blockSize. x << " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." 
); + } + TNL_CHECK_CUDA_DEVICE; + + //// + // return the size of the output array on the CUDA device + return gridSize.x; + } + + const int activeDevice; + const int blocksdPerMultiprocessor; + const int desGridSize; + const IndexType originalSize; + IndexType reducedSize; +}; #endif } // namespace Algorithms diff --git a/src/TNL/Containers/Algorithms/Reduction.h b/src/TNL/Containers/Algorithms/Reduction.h index d4f45f30e435590629475d107254822485b33979..7ac0d4c028a5365430b67707d63ca185cca4a275 100644 --- a/src/TNL/Containers/Algorithms/Reduction.h +++ b/src/TNL/Containers/Algorithms/Reduction.h @@ -10,7 +10,7 @@ // Implemented by: Tomas Oberhuber, Jakub Klinkovsky -#pragma once +#pragma once #include <TNL/Devices/Host.h> #include <TNL/Devices/Cuda.h> @@ -18,7 +18,7 @@ namespace TNL { namespace Containers { -namespace Algorithms { +namespace Algorithms { template< typename Device > class Reduction @@ -29,40 +29,55 @@ template<> class Reduction< Devices::Cuda > { public: - template< typename Operation, typename Index > - static typename Operation::ResultType - reduce( Operation& operation, - const Index size, - const typename Operation::DataType1* deviceInput1, - const typename Operation::DataType2* deviceInput2 ); + template< typename Index, + typename Result, + typename ReductionOperation, + typename VolatileReductionOperation, + typename DataFetcher > + static Result + reduce( const Index size, + ReductionOperation& reduction, + VolatileReductionOperation& volatileReduction, + DataFetcher& dataFetcher, + const Result& zero ); }; template<> class Reduction< Devices::Host > { public: - template< typename Operation, typename Index > - static typename Operation::ResultType - reduce( Operation& operation, - const Index size, - const typename Operation::DataType1* deviceInput1, - const typename Operation::DataType2* deviceInput2 ); + template< typename Index, + typename Result, + typename ReductionOperation, + typename VolatileReductionOperation, + typename DataFetcher > + static Result + reduce( const Index size, + ReductionOperation& reduction, + VolatileReductionOperation& volatileReduction, + DataFetcher& dataFetcher, + const Result& zero ); }; template<> class Reduction< Devices::MIC > { public: - template< typename Operation, typename Index > - static typename Operation::ResultType - reduce( Operation& operation, - const Index size, - const typename Operation::DataType1* deviceInput1, - const typename Operation::DataType2* deviceInput2 ); + template< typename Index, + typename Result, + typename ReductionOperation, + typename VolatileReductionOperation, + typename DataFetcher > + static Result + reduce( const Index size, + ReductionOperation& reduction, + VolatileReductionOperation& volatileReduction, + DataFetcher& dataFetcher, + const Result& zero ); }; } // namespace Algorithms } // namespace Containers } // namespace TNL -#include "Reduction_impl.h" +#include "Reduction.hpp" diff --git a/src/TNL/Containers/Algorithms/Reduction_impl.h b/src/TNL/Containers/Algorithms/Reduction.hpp similarity index 57% rename from src/TNL/Containers/Algorithms/Reduction_impl.h rename to src/TNL/Containers/Algorithms/Reduction.hpp index b6a8562497734c7a68222300f5ebed64a7be6941..31c93504cc18f3616171b855a286345953ca19d0 100644 --- a/src/TNL/Containers/Algorithms/Reduction_impl.h +++ b/src/TNL/Containers/Algorithms/Reduction.hpp @@ -38,35 +38,39 @@ namespace Algorithms { */ static constexpr int Reduction_minGpuDataSize = 256;//65536; //16384;//1024;//256; -template< typename Operation, typename Index > 
-typename Operation::ResultType + +template< typename Index, + typename Result, + typename ReductionOperation, + typename VolatileReductionOperation, + typename DataFetcher > +Result Reduction< Devices::Cuda >:: -reduce( Operation& operation, - const Index size, - const typename Operation::DataType1* deviceInput1, - const typename Operation::DataType2* deviceInput2 ) + reduce( const Index size, + ReductionOperation& reduction, + VolatileReductionOperation& volatileReduction, + DataFetcher& dataFetcher, + const Result& zero ) { #ifdef HAVE_CUDA - typedef Index IndexType; - typedef typename Operation::DataType1 DataType1; - typedef typename Operation::DataType2 DataType2; - typedef typename Operation::ResultType ResultType; - typedef typename Operation::LaterReductionOperation LaterReductionOperation; + using IndexType = Index; + using ResultType = Result; /*** * Only fundamental and pointer types can be safely reduced on host. Complex * objects stored on the device might contain pointers into the device memory, * in which case reduction on host might fail. */ - constexpr bool can_reduce_all_on_host = std::is_fundamental< DataType1 >::value || std::is_fundamental< DataType2 >::value || std::is_pointer< DataType1 >::value || std::is_pointer< DataType2 >::value; + //constexpr bool can_reduce_all_on_host = std::is_fundamental< DataType1 >::value || std::is_fundamental< DataType2 >::value || std::is_pointer< DataType1 >::value || std::is_pointer< DataType2 >::value; constexpr bool can_reduce_later_on_host = std::is_fundamental< ResultType >::value || std::is_pointer< ResultType >::value; /*** * First check if the input array(s) is/are large enough for the reduction on GPU. * Otherwise copy it/them to host and reduce on CPU. */ - if( can_reduce_all_on_host && size <= Reduction_minGpuDataSize ) + // With lambda function we cannot reduce data on host - we do not know the data here. + /*if( can_reduce_all_on_host && size <= Reduction_minGpuDataSize ) { typename std::remove_const< DataType1 >::type hostArray1[ Reduction_minGpuDataSize ]; ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory( hostArray1, deviceInput1, size ); @@ -74,12 +78,12 @@ reduce( Operation& operation, using _DT2 = typename std::conditional< std::is_same< DataType2, void >::value, DataType1, DataType2 >::type; typename std::remove_const< _DT2 >::type hostArray2[ Reduction_minGpuDataSize ]; ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory( hostArray2, (_DT2*) deviceInput2, size ); - return Reduction< Devices::Host >::reduce( operation, size, hostArray1, hostArray2 ); + return Reduction< Devices::Host >::reduce( zero, dataFetcher, reduction, size, hostArray1, hostArray2 ); } else { return Reduction< Devices::Host >::reduce( operation, size, hostArray1, (DataType2*) nullptr ); } - } + }*/ #ifdef CUDA_REDUCTION_PROFILING Timer timer; @@ -87,15 +91,18 @@ reduce( Operation& operation, timer.start(); #endif + CudaReductionKernelLauncher< IndexType, ResultType > reductionLauncher( size ); + /**** * Reduce the data on the CUDA device. */ ResultType* deviceAux1( 0 ); - IndexType reducedSize = CudaReductionKernelLauncher( operation, - size, - deviceInput1, - deviceInput2, - deviceAux1 ); + IndexType reducedSize = reductionLauncher.start( + reduction, + volatileReduction, + dataFetcher, + zero, + deviceAux1 ); #ifdef CUDA_REDUCTION_PROFILING timer.stop(); std::cout << " Reduction on GPU to size " << reducedSize << " took " << timer.getRealTime() << " sec. 
" << std::endl; @@ -107,8 +114,8 @@ reduce( Operation& operation, /*** * Transfer the reduced data from device to host. */ - ResultType resultArray[ reducedSize ]; - ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory( resultArray, deviceAux1, reducedSize ); + std::unique_ptr< ResultType[] > resultArray{ new ResultType[ reducedSize ] }; + ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory( resultArray.get(), deviceAux1, reducedSize ); #ifdef CUDA_REDUCTION_PROFILING timer.stop(); @@ -120,28 +127,20 @@ reduce( Operation& operation, /*** * Reduce the data on the host system. */ - LaterReductionOperation laterReductionOperation; - const ResultType result = Reduction< Devices::Host >::reduce( laterReductionOperation, reducedSize, resultArray, (void*) nullptr ); + auto fetch = [&] ( IndexType i ) { return resultArray[ i ]; }; + const ResultType result = Reduction< Devices::Host >::reduce( reducedSize, reduction, volatileReduction, fetch, zero ); #ifdef CUDA_REDUCTION_PROFILING timer.stop(); std::cout << " Reduction of small data set on CPU took " << timer.getRealTime() << " sec. " << std::endl; #endif - return result; } else { /*** * Data can't be safely reduced on host, so continue with the reduction on the CUDA device. */ - LaterReductionOperation laterReductionOperation; - while( reducedSize > 1 ) { - reducedSize = CudaReductionKernelLauncher( laterReductionOperation, - reducedSize, - deviceAux1, - (ResultType*) 0, - deviceAux1 ); - } + auto result = reductionLauncher.finish( reduction, volatileReduction, zero ); #ifdef CUDA_REDUCTION_PROFILING timer.stop(); @@ -150,14 +149,14 @@ reduce( Operation& operation, timer.start(); #endif - ResultType resultArray[ 1 ]; - ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory( resultArray, deviceAux1, reducedSize ); - const ResultType result = resultArray[ 0 ]; + //ResultType resultArray[ 1 ]; + //ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory( resultArray, deviceAux1, reducedSize ); + //const ResultType result = resultArray[ 0 ]; - #ifdef CUDA_REDUCTION_PROFILING + /*#ifdef CUDA_REDUCTION_PROFILING timer.stop(); std::cout << " Transferring the result to CPU took " << timer.getRealTime() << " sec. 
" << std::endl; - #endif + #endif*/ return result; } @@ -166,18 +165,21 @@ reduce( Operation& operation, #endif }; -template< typename Operation, typename Index > -typename Operation::ResultType +template< typename Index, + typename Result, + typename ReductionOperation, + typename VolatileReductionOperation, + typename DataFetcher > +Result Reduction< Devices::Host >:: -reduce( Operation& operation, - const Index size, - const typename Operation::DataType1* input1, - const typename Operation::DataType2* input2 ) + reduce( const Index size, + ReductionOperation& reduction, + VolatileReductionOperation& volatileReduction, + DataFetcher& dataFetcher, + const Result& zero ) { - typedef Index IndexType; - typedef typename Operation::DataType1 DataType1; - typedef typename Operation::DataType2 DataType2; - typedef typename Operation::ResultType ResultType; + using IndexType = Index; + using ResultType = Result; constexpr int block_size = 128; const int blocks = size / block_size; @@ -185,23 +187,20 @@ reduce( Operation& operation, #ifdef HAVE_OPENMP if( TNL::Devices::Host::isOMPEnabled() && size >= 2 * block_size ) { // global result variable - ResultType result = operation.initialValue(); + ResultType result = zero; #pragma omp parallel { // initialize array for thread-local results - ResultType r[ 4 ] = { operation.initialValue(), - operation.initialValue(), - operation.initialValue(), - operation.initialValue() }; + ResultType r[ 4 ] = { zero, zero, zero, zero }; #pragma omp for nowait for( int b = 0; b < blocks; b++ ) { const IndexType offset = b * block_size; for( int i = 0; i < block_size; i += 4 ) { - operation.firstReduction( r[ 0 ], offset + i, input1, input2 ); - operation.firstReduction( r[ 1 ], offset + i + 1, input1, input2 ); - operation.firstReduction( r[ 2 ], offset + i + 2, input1, input2 ); - operation.firstReduction( r[ 3 ], offset + i + 3, input1, input2 ); + reduction( r[ 0 ], dataFetcher( offset + i ) ); + reduction( r[ 1 ], dataFetcher( offset + i + 1 ) ); + reduction( r[ 2 ], dataFetcher( offset + i + 2 ) ); + reduction( r[ 3 ], dataFetcher( offset + i + 3 ) ); } } @@ -209,18 +208,18 @@ reduce( Operation& operation, #pragma omp single nowait { for( IndexType i = blocks * block_size; i < size; i++ ) - operation.firstReduction( r[ 0 ], i, input1, input2 ); + reduction( r[ 0 ], dataFetcher( i ) ); } // local reduction of unrolled results - operation.commonReduction( r[ 0 ], r[ 2 ] ); - operation.commonReduction( r[ 1 ], r[ 3 ] ); - operation.commonReduction( r[ 0 ], r[ 1 ] ); + reduction( r[ 0 ], r[ 2 ] ); + reduction( r[ 1 ], r[ 3 ] ); + reduction( r[ 0 ], r[ 1 ] ); // inter-thread reduction of local results #pragma omp critical { - operation.commonReduction( result, r[ 0 ] ); + reduction( result, r[ 0 ] ); } } return result; @@ -229,37 +228,34 @@ reduce( Operation& operation, #endif if( blocks > 1 ) { // initialize array for unrolled results - ResultType r[ 4 ] = { operation.initialValue(), - operation.initialValue(), - operation.initialValue(), - operation.initialValue() }; + ResultType r[ 4 ] = { zero, zero, zero, zero }; // main reduction (explicitly unrolled loop) for( int b = 0; b < blocks; b++ ) { const IndexType offset = b * block_size; for( int i = 0; i < block_size; i += 4 ) { - operation.firstReduction( r[ 0 ], offset + i, input1, input2 ); - operation.firstReduction( r[ 1 ], offset + i + 1, input1, input2 ); - operation.firstReduction( r[ 2 ], offset + i + 2, input1, input2 ); - operation.firstReduction( r[ 3 ], offset + i + 3, input1, input2 ); + reduction( r[ 
0 ], dataFetcher( offset + i ) ); + reduction( r[ 1 ], dataFetcher( offset + i + 1 ) ); + reduction( r[ 2 ], dataFetcher( offset + i + 2 ) ); + reduction( r[ 3 ], dataFetcher( offset + i + 3 ) ); } } // reduction of the last, incomplete block (not unrolled) for( IndexType i = blocks * block_size; i < size; i++ ) - operation.firstReduction( r[ 0 ], i, input1, input2 ); + reduction( r[ 0 ], dataFetcher( i ) ); + //operation.dataFetcher( r[ 0 ], i, input1, input2 ); // reduction of unrolled results - operation.commonReduction( r[ 0 ], r[ 2 ] ); - operation.commonReduction( r[ 1 ], r[ 3 ] ); - operation.commonReduction( r[ 0 ], r[ 1 ] ); - + reduction( r[ 0 ], r[ 2 ] ); + reduction( r[ 1 ], r[ 3 ] ); + reduction( r[ 0 ], r[ 1 ] ); return r[ 0 ]; } else { - ResultType result = operation.initialValue(); + ResultType result = zero; for( IndexType i = 0; i < size; i++ ) - operation.firstReduction( result, i, input1, input2 ); + reduction( result, dataFetcher( i ) ); return result; } #ifdef HAVE_OPENMP diff --git a/src/TNL/Containers/Algorithms/VectorOperations.h b/src/TNL/Containers/Algorithms/VectorOperations.h index a3ef89fc3213c81db53a6f8a0f867379734b588e..1c89dbca036ba7fecba775622890b8cf833b3f51 100644 --- a/src/TNL/Containers/Algorithms/VectorOperations.h +++ b/src/TNL/Containers/Algorithms/VectorOperations.h @@ -11,6 +11,7 @@ #pragma once #include <TNL/Containers/Algorithms/Reduction.h> +#include <TNL/Containers/Algorithms/CommonVectorOperations.h> #include <TNL/Devices/Host.h> #include <TNL/Devices/Cuda.h> @@ -22,7 +23,7 @@ template< typename Device > class VectorOperations{}; template<> -class VectorOperations< Devices::Host > +class VectorOperations< Devices::Host > : public CommonVectorOperations< Devices::Host > { public: template< typename Vector > @@ -36,7 +37,7 @@ public: const typename Vector::RealType& value, const Scalar thisElementMultiplicator ); - template< typename Vector, typename ResultType = typename Vector::RealType > + /*template< typename Vector, typename ResultType = typename Vector::RealType > static ResultType getVectorMax( const Vector& v ); template< typename Vector, typename ResultType = typename Vector::RealType > @@ -83,12 +84,13 @@ public: template< typename Vector1, typename Vector2, typename ResultType = typename Vector1::RealType > static ResultType getVectorDifferenceSum( const Vector1& v1, const Vector2& v2 ); + */ template< typename Vector, typename Scalar > static void vectorScalarMultiplication( Vector& v, Scalar alpha ); - template< typename Vector1, typename Vector2, typename ResultType = typename Vector1::RealType > - static ResultType getScalarProduct( const Vector1& v1, const Vector2& v2 ); + /*template< typename Vector1, typename Vector2, typename ResultType = typename Vector1::RealType > + static ResultType getScalarProduct( const Vector1& v1, const Vector2& v2 );*/ template< typename Vector1, typename Vector2, typename Scalar1, typename Scalar2 > static void addVector( Vector1& y, @@ -116,7 +118,7 @@ public: }; template<> -class VectorOperations< Devices::Cuda > +class VectorOperations< Devices::Cuda > : public CommonVectorOperations< Devices::Cuda > { public: template< typename Vector > @@ -130,7 +132,7 @@ public: const typename Vector::RealType& value, const Scalar thisElementMultiplicator ); - template< typename Vector, typename ResultType = typename Vector::RealType > + /*template< typename Vector, typename ResultType = typename Vector::RealType > static ResultType getVectorMax( const Vector& v ); template< typename Vector, typename 
ResultType = typename Vector::RealType > @@ -177,12 +179,13 @@ public: template< typename Vector1, typename Vector2, typename ResultType = typename Vector1::RealType > static ResultType getVectorDifferenceSum( const Vector1& v1, const Vector2& v2 ); + */ template< typename Vector, typename Scalar > static void vectorScalarMultiplication( Vector& v, const Scalar alpha ); - template< typename Vector1, typename Vector2, typename ResultType = typename Vector1::RealType > - static ResultType getScalarProduct( const Vector1& v1, const Vector2& v2 ); + /*template< typename Vector1, typename Vector2, typename ResultType = typename Vector1::RealType > + static ResultType getScalarProduct( const Vector1& v1, const Vector2& v2 );*/ template< typename Vector1, typename Vector2, typename Scalar1, typename Scalar2 > static void addVector( Vector1& y, diff --git a/src/TNL/Containers/Algorithms/VectorOperationsCuda_impl.h b/src/TNL/Containers/Algorithms/VectorOperationsCuda_impl.h index 05f334b051dc5ef3f7471bd3a1cef32aebe607e0..d79b5f06909d08cb918336ad3225c0fa206443d7 100644 --- a/src/TNL/Containers/Algorithms/VectorOperationsCuda_impl.h +++ b/src/TNL/Containers/Algorithms/VectorOperationsCuda_impl.h @@ -39,7 +39,7 @@ addElement( Vector& v, v[ i ] = thisElementMultiplicator * v[ i ] + value; } -template< typename Vector, typename ResultType > +/*template< typename Vector, typename ResultType > ResultType VectorOperations< Devices::Cuda >:: getVectorMax( const Vector& v ) @@ -314,7 +314,7 @@ getVectorDifferenceSum( const Vector1& v1, v1.getSize(), v1.getData(), v2.getData() ); -} +}*/ #ifdef HAVE_CUDA template< typename Real, typename Index, typename Scalar > @@ -358,7 +358,7 @@ vectorScalarMultiplication( Vector& v, } -template< typename Vector1, typename Vector2, typename ResultType > +/*template< typename Vector1, typename Vector2, typename ResultType > ResultType VectorOperations< Devices::Cuda >:: getScalarProduct( const Vector1& v1, @@ -372,7 +372,7 @@ getScalarProduct( const Vector1& v1, v1.getSize(), v1.getData(), v2.getData() ); -} +}*/ #ifdef HAVE_CUDA template< typename Real1, typename Real2, typename Index, typename Scalar1, typename Scalar2 > diff --git a/src/TNL/Containers/Algorithms/VectorOperationsHost_impl.h b/src/TNL/Containers/Algorithms/VectorOperationsHost_impl.h index 67400da64305767c86a85f600759f7f6ddafa691..7b1bba80c4cd619275bda20d537c5646ec154a0b 100644 --- a/src/TNL/Containers/Algorithms/VectorOperationsHost_impl.h +++ b/src/TNL/Containers/Algorithms/VectorOperationsHost_impl.h @@ -41,283 +41,6 @@ addElement( Vector& v, v[ i ] = thisElementMultiplicator * v[ i ] + value; } -template< typename Vector, typename ResultType > -ResultType -VectorOperations< Devices::Host >:: -getVectorMax( const Vector& v ) -{ - typedef typename Vector::RealType RealType; - - TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." ); - - Algorithms::ParallelReductionMax< RealType, ResultType > operation; - return Reduction< Devices::Host >::reduce( operation, - v.getSize(), - v.getData(), - ( RealType* ) 0 ); -} - -template< typename Vector, typename ResultType > -ResultType -VectorOperations< Devices::Host >:: -getVectorMin( const Vector& v ) -{ - typedef typename Vector::RealType RealType; - - TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." 
); - - Algorithms::ParallelReductionMin< RealType, ResultType > operation; - return Reduction< Devices::Host >::reduce( operation, - v.getSize(), - v.getData(), - ( RealType* ) 0 ); -} - -template< typename Vector, typename ResultType > -ResultType -VectorOperations< Devices::Host >:: -getVectorAbsMax( const Vector& v ) -{ - typedef typename Vector::RealType RealType; - - TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." ); - - Algorithms::ParallelReductionAbsMax< RealType, ResultType > operation; - return Reduction< Devices::Host >::reduce( operation, - v.getSize(), - v.getData(), - ( RealType* ) 0 ); -} - - -template< typename Vector, typename ResultType > -ResultType -VectorOperations< Devices::Host >:: -getVectorAbsMin( const Vector& v ) -{ - typedef typename Vector::RealType RealType; - - TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." ); - - Algorithms::ParallelReductionAbsMin< RealType, ResultType > operation; - return Reduction< Devices::Host >::reduce( operation, - v.getSize(), - v.getData(), - ( RealType* ) 0 ); -} - -template< typename Vector, typename ResultType > -ResultType -VectorOperations< Devices::Host >:: -getVectorL1Norm( const Vector& v ) -{ - typedef typename Vector::RealType RealType; - - TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." ); - - Algorithms::ParallelReductionAbsSum< RealType, ResultType > operation; - return Reduction< Devices::Host >::reduce( operation, - v.getSize(), - v.getData(), - ( RealType* ) 0 ); -} - -template< typename Vector, typename ResultType > -ResultType -VectorOperations< Devices::Host >:: -getVectorL2Norm( const Vector& v ) -{ - typedef typename Vector::RealType Real; - - TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." ); - - Algorithms::ParallelReductionL2Norm< Real, ResultType > operation; - const ResultType result = Reduction< Devices::Host >::reduce( operation, - v.getSize(), - v.getData(), - ( Real* ) 0 ); - return std::sqrt( result ); -} - -template< typename Vector, typename ResultType, typename Scalar > -ResultType -VectorOperations< Devices::Host >:: -getVectorLpNorm( const Vector& v, - const Scalar p ) -{ - typedef typename Vector::RealType Real; - - TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." ); - TNL_ASSERT_GE( p, 1.0, "Parameter of the L^p norm must be at least 1.0." ); - - if( p == 1.0 ) - return getVectorL1Norm< Vector, ResultType >( v ); - if( p == 2.0 ) - return getVectorL2Norm< Vector, ResultType >( v ); - - Algorithms::ParallelReductionLpNorm< Real, ResultType, Scalar > operation; - operation.setPower( p ); - const ResultType result = Reduction< Devices::Host >::reduce( operation, - v.getSize(), - v.getData(), - ( Real* ) 0 ); - return std::pow( result, 1.0 / p ); -} - -template< typename Vector, typename ResultType > -ResultType -VectorOperations< Devices::Host >:: -getVectorSum( const Vector& v ) -{ - typedef typename Vector::RealType Real; - - TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." ); - - Algorithms::ParallelReductionSum< Real, ResultType > operation; - return Reduction< Devices::Host >::reduce( operation, - v.getSize(), - v.getData(), - ( Real* ) 0 ); -} - -template< typename Vector1, typename Vector2, typename ResultType > -ResultType -VectorOperations< Devices::Host >:: -getVectorDifferenceMax( const Vector1& v1, - const Vector2& v2 ) -{ - TNL_ASSERT_GT( v1.getSize(), 0, "Vector size must be positive." ); - TNL_ASSERT_EQ( v1.getSize(), v2.getSize(), "The vector sizes must be the same." 
); - - Algorithms::ParallelReductionDiffMax< typename Vector1::RealType, typename Vector2::RealType, ResultType > operation; - return Reduction< Devices::Host >::reduce( operation, - v1.getSize(), - v1.getData(), - v2.getData() ); -} - -template< typename Vector1, typename Vector2, typename ResultType > -ResultType -VectorOperations< Devices::Host >:: -getVectorDifferenceMin( const Vector1& v1, - const Vector2& v2 ) -{ - TNL_ASSERT_GT( v1.getSize(), 0, "Vector size must be positive." ); - TNL_ASSERT_EQ( v1.getSize(), v2.getSize(), "The vector sizes must be the same." ); - - Algorithms::ParallelReductionDiffMin< typename Vector1::RealType, typename Vector2::RealType, ResultType > operation; - return Reduction< Devices::Host >::reduce( operation, - v1.getSize(), - v1.getData(), - v2.getData() ); -} - -template< typename Vector1, typename Vector2, typename ResultType > -ResultType -VectorOperations< Devices::Host >:: -getVectorDifferenceAbsMax( const Vector1& v1, - const Vector2& v2 ) -{ - TNL_ASSERT_GT( v1.getSize(), 0, "Vector size must be positive." ); - TNL_ASSERT_EQ( v1.getSize(), v2.getSize(), "The vector sizes must be the same." ); - - Algorithms::ParallelReductionDiffAbsMax< typename Vector1::RealType, typename Vector2::RealType, ResultType > operation; - return Reduction< Devices::Host >::reduce( operation, - v1.getSize(), - v1.getData(), - v2.getData() ); -} - -template< typename Vector1, typename Vector2, typename ResultType > -ResultType -VectorOperations< Devices::Host >:: -getVectorDifferenceAbsMin( const Vector1& v1, - const Vector2& v2 ) -{ - TNL_ASSERT_GT( v1.getSize(), 0, "Vector size must be positive." ); - TNL_ASSERT_EQ( v1.getSize(), v2.getSize(), "The vector sizes must be the same." ); - - Algorithms::ParallelReductionDiffAbsMin< typename Vector1::RealType, typename Vector2::RealType, ResultType > operation; - return Reduction< Devices::Host >::reduce( operation, - v1.getSize(), - v1.getData(), - v2.getData() ); -} - -template< typename Vector1, typename Vector2, typename ResultType > -ResultType -VectorOperations< Devices::Host >:: -getVectorDifferenceL1Norm( const Vector1& v1, - const Vector2& v2 ) -{ - TNL_ASSERT_GT( v1.getSize(), 0, "Vector size must be positive." ); - TNL_ASSERT_EQ( v1.getSize(), v2.getSize(), "The vector sizes must be the same." ); - - Algorithms::ParallelReductionDiffAbsSum< typename Vector1::RealType, typename Vector2::RealType, ResultType > operation; - return Reduction< Devices::Host >::reduce( operation, - v1.getSize(), - v1.getData(), - v2.getData() ); -} - -template< typename Vector1, typename Vector2, typename ResultType > -ResultType -VectorOperations< Devices::Host >:: -getVectorDifferenceL2Norm( const Vector1& v1, - const Vector2& v2 ) -{ - TNL_ASSERT_GT( v1.getSize(), 0, "Vector size must be positive." ); - TNL_ASSERT_EQ( v1.getSize(), v2.getSize(), "The vector sizes must be the same." ); - - Algorithms::ParallelReductionDiffL2Norm< typename Vector1::RealType, typename Vector2::RealType, ResultType > operation; - const ResultType result = Reduction< Devices::Host >::reduce( operation, - v1.getSize(), - v1.getData(), - v2.getData() ); - return std::sqrt( result ); -} - - -template< typename Vector1, typename Vector2, typename ResultType, typename Scalar > -ResultType -VectorOperations< Devices::Host >:: -getVectorDifferenceLpNorm( const Vector1& v1, - const Vector2& v2, - const Scalar p ) -{ - TNL_ASSERT_GT( v1.getSize(), 0, "Vector size must be positive." 
); - TNL_ASSERT_EQ( v1.getSize(), v2.getSize(), "The vector sizes must be the same." ); - TNL_ASSERT_GE( p, 1.0, "Parameter of the L^p norm must be at least 1.0." ); - - if( p == 1.0 ) - return getVectorDifferenceL1Norm< Vector1, Vector2, ResultType >( v1, v2 ); - if( p == 2.0 ) - return getVectorDifferenceL2Norm< Vector1, Vector2, ResultType >( v1, v2 ); - - Algorithms::ParallelReductionDiffLpNorm< typename Vector1::RealType, typename Vector2::RealType, ResultType, Scalar > operation; - operation.setPower( p ); - const ResultType result = Reduction< Devices::Host >::reduce( operation, - v1.getSize(), - v1.getData(), - v2.getData() ); - return std::pow( result, 1.0 / p ); -} - -template< typename Vector1, typename Vector2, typename ResultType > -ResultType -VectorOperations< Devices::Host >:: -getVectorDifferenceSum( const Vector1& v1, - const Vector2& v2 ) -{ - TNL_ASSERT_GT( v1.getSize(), 0, "Vector size must be positive." ); - TNL_ASSERT_EQ( v1.getSize(), v2.getSize(), "The vector sizes must be the same." ); - - Algorithms::ParallelReductionDiffSum< typename Vector1::RealType, typename Vector2::RealType, ResultType > operation; - return Reduction< Devices::Host >::reduce( operation, - v1.getSize(), - v1.getData(), - v2.getData() ); -} template< typename Vector, typename Scalar > @@ -339,7 +62,7 @@ vectorScalarMultiplication( Vector& v, } -template< typename Vector1, typename Vector2, typename ResultType > +/*template< typename Vector1, typename Vector2, typename ResultType > ResultType VectorOperations< Devices::Host >:: getScalarProduct( const Vector1& v1, @@ -353,7 +76,7 @@ getScalarProduct( const Vector1& v1, v1.getSize(), v1.getData(), v2.getData() ); -} +}*/ template< typename Vector1, typename Vector2, typename Scalar1, typename Scalar2 > void diff --git a/src/TNL/Math.h b/src/TNL/Math.h index 68eb1f556f246c2a3e0909e0f58ebd2ee57b17dd..dcfa91a3450d2ef33b0475edb77f6801dfea96e0 100644 --- a/src/TNL/Math.h +++ b/src/TNL/Math.h @@ -8,7 +8,7 @@ /* See Copyright Notice in tnl/Copyright */ -#pragma once +#pragma once #include <cmath> #include <type_traits> @@ -18,6 +18,13 @@ namespace TNL { +template< typename T1, typename T2, typename ResultType = typename std::common_type< T1, T2 >::type > +__cuda_callable__ inline +ResultType sum( const T1& a, const T2& b ) +{ + return a + b; +} + /** * \brief This function returns minimum of two numbers. * @@ -105,7 +112,7 @@ template< typename T1, typename T2, typename ResultType = typename std::common_t __cuda_callable__ ResultType argMax( const T1& a, const T2& b ) { - return ( a > b ) ? a : b; + return ( a > b ) ? a : b; } /*** @@ -125,7 +132,7 @@ template< typename T1, typename T2, typename ResultType = typename std::common_t __cuda_callable__ ResultType argAbsMax( const T1& a, const T2& b ) { - return ( TNL::abs( a ) > TNL::abs( b ) ) ? a : b; + return ( TNL::abs( a ) > TNL::abs( b ) ) ? a : b; } /**
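
Note on the refactoring above: the getVector*/getScalarProduct declarations commented out in VectorOperations.h, and the old Host implementations deleted from VectorOperationsHost_impl.h, are expected to be provided by the CommonVectorOperations< Device > base class introduced earlier in this patch, built on the new lambda-based Reduction interface ( size, reduction, volatileReduction, fetch, identity ) instead of the removed ParallelReduction* functors. The corresponding implementation file is not part of this diff, so the following is only a rough sketch of how two of those methods could look under that interface; the member-definition syntax, the identity values, std::numeric_limits, and the use of the new TNL::sum helper from Math.h are assumptions for illustration, not code taken from the patch.

// Hypothetical sketch only -- the actual CommonVectorOperations implementation is not in this diff.
#include <limits>

#include <TNL/Containers/Algorithms/Reduction.h>
#include <TNL/Containers/Algorithms/CommonVectorOperations.h>
#include <TNL/Math.h>

namespace TNL {
namespace Containers {
namespace Algorithms {

template< typename Device >
   template< typename Vector, typename ResultType >
ResultType
CommonVectorOperations< Device >::
getVectorMax( const Vector& v )
{
   TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." );

   using RealType = typename Vector::RealType;
   using IndexType = typename Vector::IndexType;

   const RealType* data = v.getData();
   // fetch reads one element, reduction keeps the running maximum
   auto fetch = [=] __cuda_callable__ ( IndexType i ) -> ResultType { return data[ i ]; };
   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { if( b > a ) a = b; };
   auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { if( b > a ) a = b; };
   // identity value assumed to be the lowest representable ResultType
   return Reduction< Device >::reduce( v.getSize(), reduction, volatileReduction, fetch,
                                       std::numeric_limits< ResultType >::lowest() );
}

template< typename Device >
   template< typename Vector, typename ResultType >
ResultType
CommonVectorOperations< Device >::
getVectorSum( const Vector& v )
{
   TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." );

   using RealType = typename Vector::RealType;
   using IndexType = typename Vector::IndexType;

   const RealType* data = v.getData();
   auto fetch = [=] __cuda_callable__ ( IndexType i ) -> ResultType { return data[ i ]; };
   // TNL::sum (added to Math.h above) plays the same role for sums as TNL::max/argMax do for maxima
   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::sum( a, b ); };
   auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = a + b; };
   return Reduction< Device >::reduce( v.getSize(), reduction, volatileReduction, fetch, ResultType( 0 ) );
}

} // namespace Algorithms
} // namespace Containers
} // namespace TNL

Expressing each operation as a fetch/reduction lambda pair next to the raw data pointer removes the need for one ParallelReduction* class per operation and lets the same CommonVectorOperations code serve both Devices::Host and Devices::Cuda specializations of VectorOperations.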