Loading src/TNL/Containers/Algorithms/reduction-operations.h +17 −187 Original line number Diff line number Diff line Loading @@ -10,186 +10,16 @@ #pragma once #include <cmath> #include <TNL/Constants.h> #include <TNL/Math.h> #include <TNL/Devices/Cuda.h> #ifdef HAVE_CUDA #include <cuda.h> #include <TNL/Math.h> #endif namespace TNL { namespace Containers { namespace Algorithms { #ifdef HAVE_CUDA /*** * This function returns minimum of two numbers stored on the device. * TODO: Make it tnlMin, tnlMax etc. */ template< class T > __device__ T tnlCudaMin( const T& a, const T& b ) { return a < b ? a : b; } __device__ inline int tnlCudaMin( const int& a, const int& b ) { return min( a, b ); } __device__ inline float tnlCudaMin( const float& a, const float& b ) { return fminf( a, b ); } __device__ inline double tnlCudaMin( const double& a, const double& b ) { return fmin( a, b ); } template< class T > __device__ T tnlCudaMin( volatile const T& a, volatile const T& b ) { return a < b ? a : b; } __device__ inline int tnlCudaMin( volatile const int& a, volatile const int& b ) { return min( a, b ); } __device__ inline float tnlCudaMin( volatile const float& a, volatile const float& b ) { return fminf( a, b ); } __device__ inline double tnlCudaMin( volatile const double& a, volatile const double& b ) { return fmin( a, b ); } /*** * This function returns maximum of two numbers stored on the device. */ template< class T > __device__ T tnlCudaMax( const T& a, const T& b ) { return a > b ? a : b; } __device__ inline int tnlCudaMax( const int& a, const int& b ) { return max( a, b ); } __device__ inline float tnlCudaMax( const float& a, const float& b ) { return fmaxf( a, b ); } __device__ inline double tnlCudaMax( const double& a, const double& b ) { return fmax( a, b ); } template< class T > __device__ T tnlCudaMax( volatile const T& a, volatile const T& b ) { return a > b ? a : b; } __device__ inline int tnlCudaMax( volatile const int& a, volatile const int& b ) { return max( a, b ); } __device__ inline float tnlCudaMax( volatile const float& a, volatile const float& b ) { return fmaxf( a, b ); } __device__ inline double tnlCudaMax( volatile const double& a, volatile const double& b ) { return fmax( a, b ); } /*** * This function returns absolute value of given number on the device. */ __device__ inline int tnlCudaAbs( const int& a ) { return ::abs( a ); } __device__ inline long int tnlCudaAbs( const long int& a ) { return ::abs( a ); } __device__ inline float tnlCudaAbs( const float& a ) { return ::abs( a ); } __device__ inline double tnlCudaAbs( const double& a ) { return ::abs( a ); } __device__ inline long double tnlCudaAbs( const long double& a ) { return ::abs( ( double ) a ); } __device__ inline int tnlCudaAbs( volatile const int& a ) { return ::abs( a ); } __device__ inline long int tnlCudaAbs( volatile const long int& a ) { return ::abs( a ); } __device__ inline float tnlCudaAbs( volatile const float& a ) { return ::abs( a ); } __device__ inline double tnlCudaAbs( volatile const double& a ) { return ::abs( a ); } __device__ inline long double tnlCudaAbs( volatile const long double& a ) { return ::abs( ( double ) a ); } template< typename Type1, typename Type2 > __device__ Type1 tnlCudaPow( const Type1& x, const Type2& power ) { return ( Type1 ) ::pow( ( double ) x, ( double ) power ); } #endif template< typename Real, typename Index > class tnlParallelReductionSum { Loading Loading @@ -256,19 +86,19 @@ class tnlParallelReductionMin const RealType* data1, const RealType* data2 ) { result = tnlCudaMin( result, data1[ index ] ); result = TNL::min( result, data1[ index ] ); } __cuda_callable__ void commonReductionOnDevice( ResultType& result, const ResultType& data ) { result = tnlCudaMin( result, data ); result = TNL::min( result, data ); }; __cuda_callable__ void commonReductionOnDevice( volatile ResultType& result, volatile const ResultType& data ) { result = tnlCudaMin( result, data ); result = TNL::min( result, data ); }; }; Loading Loading @@ -297,19 +127,19 @@ class tnlParallelReductionMax const RealType* data1, const RealType* data2 ) { result = tnlCudaMax( result, data1[ index ] ); result = TNL::max( result, data1[ index ] ); } __cuda_callable__ void commonReductionOnDevice( ResultType& result, const ResultType& data ) { result = tnlCudaMax( result, data ); result = TNL::max( result, data ); }; __cuda_callable__ void commonReductionOnDevice( volatile ResultType& result, volatile const ResultType& data ) { result = tnlCudaMax( result, data ); result = TNL::max( result, data ); }; }; Loading Loading @@ -421,7 +251,7 @@ class tnlParallelReductionAbsSum : public tnlParallelReductionSum< Real, Index > const RealType* data1, const RealType* data2 ) { result += tnlCudaAbs( data1[ index ] ); result += TNL::abs( data1[ index ] ); } }; Loading Loading @@ -450,7 +280,7 @@ class tnlParallelReductionAbsMin : public tnlParallelReductionMin< Real, Index > const RealType* data1, const RealType* data2 ) { result = tnlCudaMin( result, tnlCudaAbs( data1[ index ] ) ); result = TNL::min( result, TNL::abs( data1[ index ] ) ); } }; Loading Loading @@ -479,7 +309,7 @@ class tnlParallelReductionAbsMax : public tnlParallelReductionMax< Real, Index > const RealType* data1, const RealType* data2 ) { result = tnlCudaMax( result, tnlCudaAbs( data1[ index ] ) ); result = TNL::max( result, TNL::abs( data1[ index ] ) ); } }; Loading Loading @@ -545,7 +375,7 @@ class tnlParallelReductionLpNorm : public tnlParallelReductionSum< Real, Index > const RealType* data1, const RealType* data2 ) { result += tnlCudaPow( tnlCudaAbs( data1[ index ] ), p ); result += TNL::pow( TNL::abs( data1[ index ] ), p ); } protected: Loading Loading @@ -694,7 +524,7 @@ class tnlParallelReductionDiffMin : public tnlParallelReductionMin< Real, Index const RealType* data1, const RealType* data2 ) { result = tnlCudaMin( result, data1[ index ] - data2[ index ] ); result = TNL::min( result, data1[ index ] - data2[ index ] ); } }; Loading Loading @@ -723,7 +553,7 @@ class tnlParallelReductionDiffMax : public tnlParallelReductionMax< Real, Index const RealType* data1, const RealType* data2 ) { result = tnlCudaMax( result, data1[ index ] - data2[ index ] ); result = TNL::max( result, data1[ index ] - data2[ index ] ); } }; Loading Loading @@ -752,7 +582,7 @@ class tnlParallelReductionDiffAbsSum : public tnlParallelReductionMax< Real, Ind const RealType* data1, const RealType* data2 ) { result += tnlCudaAbs( data1[ index ] - data2[ index ] ); result += TNL::abs( data1[ index ] - data2[ index ] ); } }; Loading Loading @@ -781,7 +611,7 @@ class tnlParallelReductionDiffAbsMin : public tnlParallelReductionMin< Real, Ind const RealType* data1, const RealType* data2 ) { result = tnlCudaMin( result, tnlCudaAbs( data1[ index ] - data2[ index ] ) ); result = TNL::min( result, TNL::abs( data1[ index ] - data2[ index ] ) ); } }; Loading Loading @@ -810,7 +640,7 @@ class tnlParallelReductionDiffAbsMax : public tnlParallelReductionMax< Real, Ind const RealType* data1, const RealType* data2 ) { result = tnlCudaMax( result, tnlCudaAbs( data1[ index ] - data2[ index ] ) ); result = TNL::max( result, TNL::abs( data1[ index ] - data2[ index ] ) ); } }; Loading Loading @@ -879,7 +709,7 @@ class tnlParallelReductionDiffLpNorm : public tnlParallelReductionSum< Real, Ind const RealType* data1, const RealType* data2 ) { result += tnlCudaPow( tnlCudaAbs( data1[ index ] - data2[ index ] ), p ); result += TNL::pow( TNL::abs( data1[ index ] - data2[ index ] ), p ); } protected: Loading src/TNL/Math.h +137 −25 Original line number Diff line number Diff line Loading @@ -11,18 +11,76 @@ #pragma once #include <cmath> #include <stdlib.h> #include <type_traits> #include <TNL/Devices/Cuda.h> #ifdef HAVE_CUDA #include <cuda.h> #endif namespace TNL { template< typename T1, typename T2 > using enable_if_same_base = std::enable_if< std::is_same< typename std::decay< T1 >::type, T2 >::value, T2 >; /*** * This function returns minimum of two numbers. * Specializations use the functions defined in the CUDA's math_functions.h * in CUDA device code and STL functions otherwise. */ template< typename Type1, typename Type2 > __cuda_callable__ __cuda_callable__ inline Type1 min( const Type1& a, const Type2& b ) { return a < b ? a : b; }; // specialization for int template< class T > __cuda_callable__ inline typename enable_if_same_base< T, int >::type min( const T& a, const T& b ) { #ifdef __CUDA_ARCH__ return ::min( a, b ); #else return std::min( a, b ); #endif } // specialization for float template< class T > __cuda_callable__ inline typename enable_if_same_base< T, float >::type min( const T& a, const T& b ) { #ifdef __CUDA_ARCH__ return ::fminf( a, b ); #else return std::fmin( a, b ); #endif } // specialization for double template< class T > __cuda_callable__ inline typename enable_if_same_base< T, double >::type min( const T& a, const T& b ) { #ifdef __CUDA_ARCH__ return ::fmin( a, b ); #else return std::fmin( a, b ); #endif } /*** * This function returns maximum of two numbers. * Specializations use the functions defined in the CUDA's math_functions.h * in CUDA device code and STL functions otherwise. */ template< typename Type1, typename Type2 > __cuda_callable__ Type1 max( const Type1& a, const Type2& b ) Loading @@ -30,49 +88,103 @@ Type1 max( const Type1& a, const Type2& b ) return a > b ? a : b; }; template< typename Type > __cuda_callable__ void swap( Type& a, Type& b ) // specialization for int template< class T > __cuda_callable__ inline typename enable_if_same_base< T, int >::type max( const T& a, const T& b ) { Type tmp( a ); a = b; b = tmp; }; #ifdef __CUDA_ARCH__ return ::max( a, b ); #else return std::max( a, b ); #endif } // specialization for float template< class T > __cuda_callable__ T sign( const T& a ) __cuda_callable__ inline typename enable_if_same_base< T, float >::type max( const T& a, const T& b ) { if( a < ( T ) 0 ) return -1; if( a == ( T ) 0 ) return 0; return 1; }; #ifdef __CUDA_ARCH__ return ::fmaxf( a, b ); #else return std::fmax( a, b ); #endif } // specialization for double template< class T > __cuda_callable__ T abs( const T& n ) __cuda_callable__ inline typename enable_if_same_base< T, double >::type max( const T& a, const T& b ) { #ifdef __CUDA_ARCH__ return ::fmax( a, b ); #else return std::fmax( a, b ); #endif } /*** * This function returns absolute value of given number. * Specializations use the functions defined in the CUDA's math_functions.h * in CUDA device code and STL functions otherwise. */ template< class T > __cuda_callable__ inline typename std::enable_if< ! std::is_arithmetic< T >::value, T >::type abs( const T& n ) { if( n < ( T ) 0 ) return -n; return n; }; } __cuda_callable__ inline int abs( const int& n ) // specialization for any arithmetic type (e.g. int, float, double) template< class T > __cuda_callable__ inline typename std::enable_if< std::is_arithmetic< T >::value, T >::type abs( const T& n ) { #ifdef __CUDA_ARCH__ return ::abs( n ); }; #else return std::abs( n ); #endif } template< class T > __cuda_callable__ inline T pow( const T& base, const T& exp ) { #ifdef __CUDA_ARCH__ return ::pow( base, exp ); #else return std::pow( base, exp ); #endif } template< typename Type > __cuda_callable__ inline float abs( const float& f ) void swap( Type& a, Type& b ) { return ::fabs( f ); Type tmp( a ); a = b; b = tmp; }; template< class T > __cuda_callable__ inline double abs( const double& d ) T sign( const T& a ) { return ::fabs( d ); if( a < ( T ) 0 ) return -1; if( a == ( T ) 0 ) return 0; return 1; }; template< typename Real > Loading Loading
src/TNL/Containers/Algorithms/reduction-operations.h +17 −187 Original line number Diff line number Diff line Loading @@ -10,186 +10,16 @@ #pragma once #include <cmath> #include <TNL/Constants.h> #include <TNL/Math.h> #include <TNL/Devices/Cuda.h> #ifdef HAVE_CUDA #include <cuda.h> #include <TNL/Math.h> #endif namespace TNL { namespace Containers { namespace Algorithms { #ifdef HAVE_CUDA /*** * This function returns minimum of two numbers stored on the device. * TODO: Make it tnlMin, tnlMax etc. */ template< class T > __device__ T tnlCudaMin( const T& a, const T& b ) { return a < b ? a : b; } __device__ inline int tnlCudaMin( const int& a, const int& b ) { return min( a, b ); } __device__ inline float tnlCudaMin( const float& a, const float& b ) { return fminf( a, b ); } __device__ inline double tnlCudaMin( const double& a, const double& b ) { return fmin( a, b ); } template< class T > __device__ T tnlCudaMin( volatile const T& a, volatile const T& b ) { return a < b ? a : b; } __device__ inline int tnlCudaMin( volatile const int& a, volatile const int& b ) { return min( a, b ); } __device__ inline float tnlCudaMin( volatile const float& a, volatile const float& b ) { return fminf( a, b ); } __device__ inline double tnlCudaMin( volatile const double& a, volatile const double& b ) { return fmin( a, b ); } /*** * This function returns maximum of two numbers stored on the device. */ template< class T > __device__ T tnlCudaMax( const T& a, const T& b ) { return a > b ? a : b; } __device__ inline int tnlCudaMax( const int& a, const int& b ) { return max( a, b ); } __device__ inline float tnlCudaMax( const float& a, const float& b ) { return fmaxf( a, b ); } __device__ inline double tnlCudaMax( const double& a, const double& b ) { return fmax( a, b ); } template< class T > __device__ T tnlCudaMax( volatile const T& a, volatile const T& b ) { return a > b ? a : b; } __device__ inline int tnlCudaMax( volatile const int& a, volatile const int& b ) { return max( a, b ); } __device__ inline float tnlCudaMax( volatile const float& a, volatile const float& b ) { return fmaxf( a, b ); } __device__ inline double tnlCudaMax( volatile const double& a, volatile const double& b ) { return fmax( a, b ); } /*** * This function returns absolute value of given number on the device. */ __device__ inline int tnlCudaAbs( const int& a ) { return ::abs( a ); } __device__ inline long int tnlCudaAbs( const long int& a ) { return ::abs( a ); } __device__ inline float tnlCudaAbs( const float& a ) { return ::abs( a ); } __device__ inline double tnlCudaAbs( const double& a ) { return ::abs( a ); } __device__ inline long double tnlCudaAbs( const long double& a ) { return ::abs( ( double ) a ); } __device__ inline int tnlCudaAbs( volatile const int& a ) { return ::abs( a ); } __device__ inline long int tnlCudaAbs( volatile const long int& a ) { return ::abs( a ); } __device__ inline float tnlCudaAbs( volatile const float& a ) { return ::abs( a ); } __device__ inline double tnlCudaAbs( volatile const double& a ) { return ::abs( a ); } __device__ inline long double tnlCudaAbs( volatile const long double& a ) { return ::abs( ( double ) a ); } template< typename Type1, typename Type2 > __device__ Type1 tnlCudaPow( const Type1& x, const Type2& power ) { return ( Type1 ) ::pow( ( double ) x, ( double ) power ); } #endif template< typename Real, typename Index > class tnlParallelReductionSum { Loading Loading @@ -256,19 +86,19 @@ class tnlParallelReductionMin const RealType* data1, const RealType* data2 ) { result = tnlCudaMin( result, data1[ index ] ); result = TNL::min( result, data1[ index ] ); } __cuda_callable__ void commonReductionOnDevice( ResultType& result, const ResultType& data ) { result = tnlCudaMin( result, data ); result = TNL::min( result, data ); }; __cuda_callable__ void commonReductionOnDevice( volatile ResultType& result, volatile const ResultType& data ) { result = tnlCudaMin( result, data ); result = TNL::min( result, data ); }; }; Loading Loading @@ -297,19 +127,19 @@ class tnlParallelReductionMax const RealType* data1, const RealType* data2 ) { result = tnlCudaMax( result, data1[ index ] ); result = TNL::max( result, data1[ index ] ); } __cuda_callable__ void commonReductionOnDevice( ResultType& result, const ResultType& data ) { result = tnlCudaMax( result, data ); result = TNL::max( result, data ); }; __cuda_callable__ void commonReductionOnDevice( volatile ResultType& result, volatile const ResultType& data ) { result = tnlCudaMax( result, data ); result = TNL::max( result, data ); }; }; Loading Loading @@ -421,7 +251,7 @@ class tnlParallelReductionAbsSum : public tnlParallelReductionSum< Real, Index > const RealType* data1, const RealType* data2 ) { result += tnlCudaAbs( data1[ index ] ); result += TNL::abs( data1[ index ] ); } }; Loading Loading @@ -450,7 +280,7 @@ class tnlParallelReductionAbsMin : public tnlParallelReductionMin< Real, Index > const RealType* data1, const RealType* data2 ) { result = tnlCudaMin( result, tnlCudaAbs( data1[ index ] ) ); result = TNL::min( result, TNL::abs( data1[ index ] ) ); } }; Loading Loading @@ -479,7 +309,7 @@ class tnlParallelReductionAbsMax : public tnlParallelReductionMax< Real, Index > const RealType* data1, const RealType* data2 ) { result = tnlCudaMax( result, tnlCudaAbs( data1[ index ] ) ); result = TNL::max( result, TNL::abs( data1[ index ] ) ); } }; Loading Loading @@ -545,7 +375,7 @@ class tnlParallelReductionLpNorm : public tnlParallelReductionSum< Real, Index > const RealType* data1, const RealType* data2 ) { result += tnlCudaPow( tnlCudaAbs( data1[ index ] ), p ); result += TNL::pow( TNL::abs( data1[ index ] ), p ); } protected: Loading Loading @@ -694,7 +524,7 @@ class tnlParallelReductionDiffMin : public tnlParallelReductionMin< Real, Index const RealType* data1, const RealType* data2 ) { result = tnlCudaMin( result, data1[ index ] - data2[ index ] ); result = TNL::min( result, data1[ index ] - data2[ index ] ); } }; Loading Loading @@ -723,7 +553,7 @@ class tnlParallelReductionDiffMax : public tnlParallelReductionMax< Real, Index const RealType* data1, const RealType* data2 ) { result = tnlCudaMax( result, data1[ index ] - data2[ index ] ); result = TNL::max( result, data1[ index ] - data2[ index ] ); } }; Loading Loading @@ -752,7 +582,7 @@ class tnlParallelReductionDiffAbsSum : public tnlParallelReductionMax< Real, Ind const RealType* data1, const RealType* data2 ) { result += tnlCudaAbs( data1[ index ] - data2[ index ] ); result += TNL::abs( data1[ index ] - data2[ index ] ); } }; Loading Loading @@ -781,7 +611,7 @@ class tnlParallelReductionDiffAbsMin : public tnlParallelReductionMin< Real, Ind const RealType* data1, const RealType* data2 ) { result = tnlCudaMin( result, tnlCudaAbs( data1[ index ] - data2[ index ] ) ); result = TNL::min( result, TNL::abs( data1[ index ] - data2[ index ] ) ); } }; Loading Loading @@ -810,7 +640,7 @@ class tnlParallelReductionDiffAbsMax : public tnlParallelReductionMax< Real, Ind const RealType* data1, const RealType* data2 ) { result = tnlCudaMax( result, tnlCudaAbs( data1[ index ] - data2[ index ] ) ); result = TNL::max( result, TNL::abs( data1[ index ] - data2[ index ] ) ); } }; Loading Loading @@ -879,7 +709,7 @@ class tnlParallelReductionDiffLpNorm : public tnlParallelReductionSum< Real, Ind const RealType* data1, const RealType* data2 ) { result += tnlCudaPow( tnlCudaAbs( data1[ index ] - data2[ index ] ), p ); result += TNL::pow( TNL::abs( data1[ index ] - data2[ index ] ), p ); } protected: Loading
src/TNL/Math.h +137 −25 Original line number Diff line number Diff line Loading @@ -11,18 +11,76 @@ #pragma once #include <cmath> #include <stdlib.h> #include <type_traits> #include <TNL/Devices/Cuda.h> #ifdef HAVE_CUDA #include <cuda.h> #endif namespace TNL { template< typename T1, typename T2 > using enable_if_same_base = std::enable_if< std::is_same< typename std::decay< T1 >::type, T2 >::value, T2 >; /*** * This function returns minimum of two numbers. * Specializations use the functions defined in the CUDA's math_functions.h * in CUDA device code and STL functions otherwise. */ template< typename Type1, typename Type2 > __cuda_callable__ __cuda_callable__ inline Type1 min( const Type1& a, const Type2& b ) { return a < b ? a : b; }; // specialization for int template< class T > __cuda_callable__ inline typename enable_if_same_base< T, int >::type min( const T& a, const T& b ) { #ifdef __CUDA_ARCH__ return ::min( a, b ); #else return std::min( a, b ); #endif } // specialization for float template< class T > __cuda_callable__ inline typename enable_if_same_base< T, float >::type min( const T& a, const T& b ) { #ifdef __CUDA_ARCH__ return ::fminf( a, b ); #else return std::fmin( a, b ); #endif } // specialization for double template< class T > __cuda_callable__ inline typename enable_if_same_base< T, double >::type min( const T& a, const T& b ) { #ifdef __CUDA_ARCH__ return ::fmin( a, b ); #else return std::fmin( a, b ); #endif } /*** * This function returns maximum of two numbers. * Specializations use the functions defined in the CUDA's math_functions.h * in CUDA device code and STL functions otherwise. */ template< typename Type1, typename Type2 > __cuda_callable__ Type1 max( const Type1& a, const Type2& b ) Loading @@ -30,49 +88,103 @@ Type1 max( const Type1& a, const Type2& b ) return a > b ? a : b; }; template< typename Type > __cuda_callable__ void swap( Type& a, Type& b ) // specialization for int template< class T > __cuda_callable__ inline typename enable_if_same_base< T, int >::type max( const T& a, const T& b ) { Type tmp( a ); a = b; b = tmp; }; #ifdef __CUDA_ARCH__ return ::max( a, b ); #else return std::max( a, b ); #endif } // specialization for float template< class T > __cuda_callable__ T sign( const T& a ) __cuda_callable__ inline typename enable_if_same_base< T, float >::type max( const T& a, const T& b ) { if( a < ( T ) 0 ) return -1; if( a == ( T ) 0 ) return 0; return 1; }; #ifdef __CUDA_ARCH__ return ::fmaxf( a, b ); #else return std::fmax( a, b ); #endif } // specialization for double template< class T > __cuda_callable__ T abs( const T& n ) __cuda_callable__ inline typename enable_if_same_base< T, double >::type max( const T& a, const T& b ) { #ifdef __CUDA_ARCH__ return ::fmax( a, b ); #else return std::fmax( a, b ); #endif } /*** * This function returns absolute value of given number. * Specializations use the functions defined in the CUDA's math_functions.h * in CUDA device code and STL functions otherwise. */ template< class T > __cuda_callable__ inline typename std::enable_if< ! std::is_arithmetic< T >::value, T >::type abs( const T& n ) { if( n < ( T ) 0 ) return -n; return n; }; } __cuda_callable__ inline int abs( const int& n ) // specialization for any arithmetic type (e.g. int, float, double) template< class T > __cuda_callable__ inline typename std::enable_if< std::is_arithmetic< T >::value, T >::type abs( const T& n ) { #ifdef __CUDA_ARCH__ return ::abs( n ); }; #else return std::abs( n ); #endif } template< class T > __cuda_callable__ inline T pow( const T& base, const T& exp ) { #ifdef __CUDA_ARCH__ return ::pow( base, exp ); #else return std::pow( base, exp ); #endif } template< typename Type > __cuda_callable__ inline float abs( const float& f ) void swap( Type& a, Type& b ) { return ::fabs( f ); Type tmp( a ); a = b; b = tmp; }; template< class T > __cuda_callable__ inline double abs( const double& d ) T sign( const T& a ) { return ::fabs( d ); if( a < ( T ) 0 ) return -1; if( a == ( T ) 0 ) return 0; return 1; }; template< typename Real > Loading