diff --git a/buildAll b/buildAll index d553862bd5fdf6878b16afb67f934c1424acfe75..3986a9ea1db990dc0960376c7c614a5f44b7ff45 100755 --- a/buildAll +++ b/buildAll @@ -2,7 +2,7 @@ TARGET=TNL INSTALL_PREFIX=${HOME}/local -WITH_CUDA=yes +WITH_CUDA=no WITH_CUSPARSE=no CUDA_ARCHITECTURE=2.0 VERBOSE=1 diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 674a3e3ae508b060fc4e83cb01286e3d27dad44b..a2055fe17986df13e20f6b479bfb37589a976210 100755 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -10,13 +10,28 @@ ADD_SUBDIRECTORY( solvers ) ADD_SUBDIRECTORY( legacy ) ADD_SUBDIRECTORY( implementation ) -ADD_LIBRARY( tnl${debugExt}-${tnlVersion} SHARED - ${tnl_config_SOURCES} - ${tnl_core_SOURCES} - ${tnl_implementation_SOURCES} - ${tnl_legacy_SOURCES} - ${tnl_debug_SOURCES} - ${tnl_matrix_SOURCES} ) +set( tnl_SOURCES ${tnl_config_SOURCES} + ${tnl_core_SOURCES} + ${tnl_implementation_SOURCES} + ${tnl_legacy_SOURCES} + ${tnl_debug_SOURCES} + ${tnl_matrix_SOURCES} ) + +set( tnl_CUDA__SOURCES ${tnl_config_CUDA__SOURCES} + ${tnl_core_CUDA__SOURCES} + ${tnl_implementation_CUDA__SOURCES} + ${tnl_legacy_CUDA__SOURCES} + ${tnl_debug_CUDA__SOURCES} + ${tnl_matrix_CUDA__SOURCES} ) + + +if( BUILD_CUDA ) + CUDA_ADD_LIBRARY( tnl${debugExt}-${tnlVersion} SHARED + ${tnl_CUDA__SOURCES} ) +else( BUILD_CUDA ) + ADD_LIBRARY( tnl${debugExt}-${tnlVersion} SHARED + ${tnl_SOURCES} ) +endif( BUILD_CUDA ) SET_TARGET_PROPERTIES( tnl${debugExt}-${tnlVersion} PROPERTIES SOVERSION 0 @@ -26,19 +41,25 @@ TARGET_LINK_LIBRARIES( tnl${debugExt}-${tnlVersion} INSTALL( TARGETS tnl${debugExt}-${tnlVersion} DESTINATION lib ) IF( BUILD_MPI ) - ADD_LIBRARY( tnl-mpi${debugExt}-${tnlVersion} SHARED - ${tnl_config_SOURCES} - ${tnl_core_SOURCES} - ${tnl_implementation_SOURCES} - ${tnl_debug_SOURCES} - ${tnl_matrix_SOURCES} ) - SET_TARGET_PROPERTIES( tnl-mpi${debugExt}-${tnlVersion} PROPERTIES + + if( BUILD_CUDA ) + CUDA_ADD_LIBRARY( tnl-mpi${debugExt}-${tnlVersion} SHARED + ${tnl_CUDA__SOURCES} ) + else( BUILD_CUDA ) 
+ ADD_LIBRARY( tnl-mpi${debugExt}-${tnlVersion} SHARED + ${tnl_SOURCES} ) + endif( BUILD_CUDA ) + + SET_TARGET_PROPERTIES( tnl-mpi${debugExt}-${tnlVersion} PROPERTIES SOVERSION 0 VERSION ${tnlVersion} ) - TARGET_LINK_LIBRARIES( tnl-mpi${debugExt}-${tnlVersion} + + TARGET_LINK_LIBRARIES( tnl-mpi${debugExt}-${tnlVersion} ${MPI_LIBRARIES} ${BZIP2_LIBRARIES} ) - INSTALL( TARGETS tnl-mpi${debugExt}-${tnlVersion} DESTINATION lib ) + + INSTALL( TARGETS tnl-mpi${debugExt}-${tnlVersion} DESTINATION lib ) + ENDIF() diff --git a/src/config/CMakeLists.txt b/src/config/CMakeLists.txt index a1fdbf958285f77ce50fd511ae1e5b2be48ef0ec..6878b4b35db21fb81c50c9f207e1f29f7adc7666 100755 --- a/src/config/CMakeLists.txt +++ b/src/config/CMakeLists.txt @@ -13,12 +13,20 @@ SET( headers tnlConfigDescription.h ) SET( CURRENT_DIR ${CMAKE_SOURCE_DIR}/src/config ) -SET( tnl_config_SOURCES +set( common_SOURCES ${CURRENT_DIR}/tnlConfigDescription.cpp ${CURRENT_DIR}/tnlConfigDescriptionScanner.cpp ${CURRENT_DIR}/tnlConfigDescriptionParser.cpp ${CURRENT_DIR}/tnlParameterContainer.cpp - ${CURRENT_DIR}/parse.cc - PARENT_SCOPE ) + ${CURRENT_DIR}/parse.cc ) +SET( tnl_config_SOURCES + ${common_SOURCES} + PARENT_SCOPE ) + +if( BUILD_CUDA ) +SET( tnl_config_CUDA__SOURCES + ${common_SOURCES} + PARENT_SCOPE ) +endif() INSTALL( FILES ${headers} DESTINATION include/tnl-${tnlVersion}/config ) diff --git a/src/core/cuda/CMakeLists.txt b/src/core/cuda/CMakeLists.txt index 3290b863cf2973ba56b3ec6e98c45bb96646b109..64f7fb8c273bf0d843547a7ad7bd99034beb9505 100755 --- a/src/core/cuda/CMakeLists.txt +++ b/src/core/cuda/CMakeLists.txt @@ -1,5 +1,5 @@ set( headers device-check.h - reduction.h + cuda-reduction.h reduction-operations.h ) INSTALL( FILES ${headers} DESTINATION include/tnl-${tnlVersion}/core/cuda ) \ No newline at end of file diff --git a/src/implementation/core/cuda/reduction-operations_impl.h b/src/core/cuda/cuda-reduction.h similarity index 56% rename from 
src/implementation/core/cuda/reduction-operations_impl.h rename to src/core/cuda/cuda-reduction.h index 113b837a5909e9f834ed08a8dbcd6d9c8027ab93..e097e3869c19c11211f83afb3f699cef09a40ce8 100644 --- a/src/implementation/core/cuda/reduction-operations_impl.h +++ b/src/core/cuda/cuda-reduction.h @@ -1,8 +1,8 @@ /*************************************************************************** - reduction-operations_impl.h - description + cuda-reduction.h - description ------------------- - begin : Mar 22, 2013 - copyright : (C) 2013 by Tomas Oberhuber + begin : Oct 28, 2010 + copyright : (C) 2010 by Tomas Oberhuber email : tomas.oberhuber@fjfi.cvut.cz ***************************************************************************/ @@ -15,9 +15,16 @@ * * ***************************************************************************/ -#ifndef REDUCTION_OPERATIONS_IMPL_H_ -#define REDUCTION_OPERATIONS_IMPL_H_ +#ifndef CUDA_REDUCTION_H_ +#define CUDA_REDUCTION_H_ +template< typename Operation > +bool reductionOnCudaDevice( const Operation& operation, + const typename Operation :: IndexType size, + const typename Operation :: RealType* deviceInput1, + const typename Operation :: RealType* deviceInput2, + typename Operation :: ResultType& result ); +#include <implementation/core/cuda/cuda-reduction_impl.h> -#endif /* REDUCTION_OPERATIONS_IMPL_H_ */ +#endif /* CUDA_REDUCTION_H_ */ diff --git a/src/core/cuda/reduction-operations.h b/src/core/cuda/reduction-operations.h index 9a6d8a0c4affd944bc1da8ee10d27bd9b76a215f..acb92a015fdec23b7997e5f84c4887d168444838 100644 --- a/src/core/cuda/reduction-operations.h +++ b/src/core/cuda/reduction-operations.h @@ -22,10 +22,6 @@ #include <cuda.h> #include <core/mfuncs.h> -enum tnlTupleOperation { tnlParallelReductionLpNorm, - tnlParallelReductionSdot }; - - /*** * This function returns minimum of two numbers stored on the device. */ @@ -35,20 +31,20 @@ template< class T > __device__ T tnlCudaMin( const T& a, return a < b ? 
a : b; } -__device__ int tnlCudaMin( const int& a, - const int& b ) +__device__ inline int tnlCudaMin( const int& a, + const int& b ) { return min( a, b ); } -__device__ float tnlCudaMin( const float& a, - const float& b ) +__device__ inline float tnlCudaMin( const float& a, + const float& b ) { return fminf( a, b ); } -__device__ double tnlCudaMin( const double& a, - const double& b ) +__device__ inline double tnlCudaMin( const double& a, + const double& b ) { return fmin( a, b ); } @@ -62,20 +58,20 @@ template< class T > __device__ T tnlCudaMax( const T& a, return a > b ? a : b; } -__device__ int tnlCudaMax( const int& a, - const int& b ) +__device__ inline int tnlCudaMax( const int& a, + const int& b ) { return max( a, b ); } -__device__ float tnlCudaMax( const float& a, - const float& b ) +__device__ inline float tnlCudaMax( const float& a, + const float& b ) { return fmaxf( a, b ); } -__device__ double tnlCudaMax( const double& a, - const double& b ) +__device__ inline double tnlCudaMax( const double& a, + const double& b ) { return fmax( a, b ); } @@ -83,20 +79,21 @@ __device__ double tnlCudaMax( const double& a, /*** * This function returns absolute value of given number on the device. 
*/ -__device__ int tnlCudaAbs( const int& a ) +__device__ inline int tnlCudaAbs( const int& a ) { return abs( a ); } -__device__ float tnlCudaAbs( const float& a ) +__device__ inline float tnlCudaAbs( const float& a ) { return fabs( a ); } -__device__ double tnlCudaAbs( const double& a ) +__device__ inline double tnlCudaAbs( const double& a ) { return fabs( a ); } +#endif template< typename Real, typename Index > class tnlParallelReductionSum @@ -106,6 +103,7 @@ class tnlParallelReductionSum typedef Real RealType; typedef Index IndexType; typedef Real ResultType; + typedef tnlParallelReductionSum< Real, Index > LaterReductionOperation; ResultType initialValueOnHost( const IndexType idx, const RealType* data1, @@ -121,7 +119,7 @@ class tnlParallelReductionSum { return current + data1[ idx ]; }; - +#ifdef HAVE_CUDA __device__ ResultType initialValueOnDevice( const IndexType idx1, const IndexType idx2, const RealType* data1, @@ -140,7 +138,7 @@ class tnlParallelReductionSum __device__ ResultType firstReductionOnDevice( const IndexType idx1, const IndexType idx2, const IndexType idx3, - const RealType* data1, + const ResultType* data1, const RealType* data2, const RealType* data3 ) const { @@ -149,7 +147,7 @@ class tnlParallelReductionSum __device__ ResultType firstReductionOnDevice( const IndexType idx1, const IndexType idx2, - const RealType* data1, + const ResultType* data1, const RealType* data2, const RealType* data3 ) const { @@ -162,6 +160,7 @@ class tnlParallelReductionSum { return data[ idx1 ] + data[ idx2 ]; }; +#endif }; template< typename Real, typename Index > @@ -171,62 +170,66 @@ class tnlParallelReductionMin typedef Real RealType; typedef Index IndexType; + typedef Real ResultType; + typedef tnlParallelReductionMin< Real, Index > LaterReductionOperation; - RealType initialValueOnHost( const IndexType idx, - const RealType* data1, - const RealType* data2 ) const + ResultType initialValueOnHost( const IndexType idx, + const RealType* data1, + const 
RealType* data2 ) const { return data1[ idx ]; }; - RealType reduceOnHost( const IndexType idx, - const RealType& current, - const RealType* data1, - const RealType* data2 ) const + ResultType reduceOnHost( const IndexType idx, + const ResultType& current, + const RealType* data1, + const RealType* data2 ) const { return Min( current, data1[ idx ] ); }; - __device__ RealType initialValueOnDevice( const IndexType idx1, - const IndexType idx2, - const RealType* data1, - const RealType* data2 ) const +#ifdef HAVE_CUDA + __device__ ResultType initialValueOnDevice( const IndexType idx1, + const IndexType idx2, + const RealType* data1, + const RealType* data2 ) const { return tnlCudaMin( data1[ idx1 ], data1[ idx2 ] ); } - __device__ RealType initialValueOnDevice( const IndexType idx1, - const RealType* data1, - const RealType* data2 ) const + __device__ ResultType initialValueOnDevice( const IndexType idx1, + const RealType* data1, + const RealType* data2 ) const { return data1[ idx1 ]; }; - __device__ RealType firstReductionOnDevice( const IndexType idx1, - const IndexType idx2, - const IndexType idx3, - const RealType* data1, - const RealType* data2, - const RealType* data3 ) const + __device__ ResultType firstReductionOnDevice( const IndexType idx1, + const IndexType idx2, + const IndexType idx3, + const ResultType* data1, + const RealType* data2, + const RealType* data3 ) const { return tnlCudaMin( data1[ idx1 ], tnlCudaMin( data2[ idx2 ], data2[ idx3 ] ) ); }; - __device__ RealType firstReductionOnDevice( const IndexType idx1, - const IndexType idx2, - const RealType* data1, - const RealType* data2, - const RealType* data3 ) const + __device__ ResultType firstReductionOnDevice( const IndexType idx1, + const IndexType idx2, + const ResultType* data1, + const RealType* data2, + const RealType* data3 ) const { return tnlCudaMin( data1[ idx1 ], data2[ idx2 ] ); }; - __device__ RealType commonReductionOnDevice( const IndexType idx1, - const IndexType idx2, - const 
RealType* data ) const + __device__ ResultType commonReductionOnDevice( const IndexType idx1, + const IndexType idx2, + const ResultType* data ) const { return tnlCudaMin( data[ idx1 ], data[ idx2 ] ); }; +#endif }; template< typename Real, typename Index > @@ -236,62 +239,66 @@ class tnlParallelReductionMax typedef Real RealType; typedef Index IndexType; + typedef Real ResultType; + typedef tnlParallelReductionMax< Real, Index > LaterReductionOperation; - RealType initialValueOnHost( const IndexType idx, - const RealType* data1, - const RealType* data2 ) const + ResultType initialValueOnHost( const IndexType idx, + const RealType* data1, + const RealType* data2 ) const { return data1[ idx ]; }; - RealType reduceOnHost( const IndexType idx, - const RealType& current, - const RealType* data1, - const RealType* data2 ) const + ResultType reduceOnHost( const IndexType idx, + const ResultType& current, + const RealType* data1, + const RealType* data2 ) const { return Max( current, data1[ idx ] ); }; - __device__ RealType initialValueOnDevice( const IndexType idx1, - const IndexType idx2, - const RealType* data1, - const RealType* data2 ) const +#ifdef HAVE_CUDA + __device__ ResultType initialValueOnDevice( const IndexType idx1, + const IndexType idx2, + const RealType* data1, + const RealType* data2 ) const { return tnlCudaMax( data1[ idx1 ], data1[ idx2 ] ); } - __device__ RealType initialValueOnDevice( const IndexType idx1, - const RealType* data1, - const RealType* data2 ) const + __device__ ResultType initialValueOnDevice( const IndexType idx1, + const RealType* data1, + const RealType* data2 ) const { return data1[ idx1 ]; }; - __device__ RealType firstReductionOnDevice( const IndexType idx1, - const IndexType idx2, - const IndexType idx3, - const RealType* data1, - const RealType* data2, - const RealType* data3 ) const + __device__ ResultType firstReductionOnDevice( const IndexType idx1, + const IndexType idx2, + const IndexType idx3, + const ResultType* data1, + 
const RealType* data2, + const RealType* data3 ) const { return tnlCudaMax( data1[ idx1 ], tnlCudaMax( data2[ idx2 ], data2[ idx3 ] ) ); }; - __device__ RealType firstReductionOnDevice( const IndexType idx1, - const IndexType idx2, - const RealType* data1, - const RealType* data2, - const RealType* data3 ) const + __device__ ResultType firstReductionOnDevice( const IndexType idx1, + const IndexType idx2, + const ResultType* data1, + const RealType* data2, + const RealType* data3 ) const { return tnlCudaMax( data1[ idx1 ], data2[ idx2 ] ); }; - __device__ RealType commonReductionOnDevice( const IndexType idx1, - const IndexType idx2, - const RealType* data ) const + __device__ ResultType commonReductionOnDevice( const IndexType idx1, + const IndexType idx2, + const ResultType* data ) const { return tnlCudaMax( data[ idx1 ], data[ idx2 ] ); }; +#endif }; template< typename Real, typename Index > @@ -301,62 +308,66 @@ class tnlParallelReductionAbsSum typedef Real RealType; typedef Index IndexType; + typedef Real ResultType; + typedef tnlParallelReductionSum< Real, Index > LaterReductionOperation; - RealType initialValueOnHost( const IndexType idx, - const RealType* data1, - const RealType* data2 ) const + ResultType initialValueOnHost( const IndexType idx, + const RealType* data1, + const RealType* data2 ) const { return tnlAbs( data1[ idx ] ); }; - RealType reduceOnHost( const IndexType idx, - const RealType& current, - const RealType* data1, - const RealType* data2 ) const + ResultType reduceOnHost( const IndexType idx, + const ResultType& current, + const RealType* data1, + const RealType* data2 ) const { return current + tnlAbs( data1[ idx ] ); }; - __device__ RealType initialValueOnDevice( const IndexType idx1, - const IndexType idx2, - const RealType* data1, - const RealType* data2 ) const +#ifdef HAVE_CUDA + __device__ ResultType initialValueOnDevice( const IndexType idx1, + const IndexType idx2, + const RealType* data1, + const RealType* data2 ) const { return 
tnlCudaAbs( data1[ idx1 ] ) + tnlCudaAbs( data1[ idx2 ] ); }; - __device__ RealType initialValueOnDevice( const IndexType idx1, - const RealType* data1, - const RealType* data2 ) const + __device__ ResultType initialValueOnDevice( const IndexType idx1, + const RealType* data1, + const RealType* data2 ) const { return tnlCudaAbs( data1[ idx1 ] ); }; - __device__ RealType firstReductionOnDevice( const IndexType idx1, - const IndexType idx2, - const IndexType idx3, - const RealType* data1, - const RealType* data2, - const RealType* data3 ) const + __device__ ResultType firstReductionOnDevice( const IndexType idx1, + const IndexType idx2, + const IndexType idx3, + const ResultType* data1, + const RealType* data2, + const RealType* data3 ) const { return data1[ idx1 ] + tnlCudaAbs( data2[ idx2 ] ) + tnlCudaAbs( data2[ idx3 ] ); }; - __device__ RealType firstReductionOnDevice( const IndexType idx1, - const IndexType idx2, - const RealType* data1, - const RealType* data2, - const RealType* data3 ) const + __device__ ResultType firstReductionOnDevice( const IndexType idx1, + const IndexType idx2, + const ResultType* data1, + const RealType* data2, + const RealType* data3 ) const { return data1[ idx1 ] + tnlCudaAbs( data2[ idx2 ] ); }; - __device__ RealType commonReductionOnDevice( const IndexType idx1, - const IndexType idx2, - const RealType* data ) const + __device__ ResultType commonReductionOnDevice( const IndexType idx1, + const IndexType idx2, + const ResultType* data ) const { return data[ idx1 ] + data[ idx2 ]; }; +#endif }; template< typename Real, typename Index > @@ -366,62 +377,66 @@ class tnlParallelReductionAbsMin typedef Real RealType; typedef Index IndexType; + typedef Real ResultType; + typedef tnlParallelReductionMin< Real, Index > LaterReductionOperation; - RealType initialValueOnHost( const IndexType idx, - const RealType* data1, - const RealType* data2 ) const + ResultType initialValueOnHost( const IndexType idx, + const RealType* data1, + const 
RealType* data2 ) const { return tnlAbs( data1[ idx ] ); }; - RealType reduceOnHost( const IndexType idx, - const RealType& current, - const RealType* data1, - const RealType* data2 ) const + ResultType reduceOnHost( const IndexType idx, + const ResultType& current, + const RealType* data1, + const RealType* data2 ) const { return Min( current, tnlAbs( data1[ idx ] ) ); }; - __device__ RealType initialValueOnDevice( const IndexType idx1, - const IndexType idx2, - const RealType* data1, - const RealType* data2 ) const +#ifdef HAVE_CUDA + __device__ ResultType initialValueOnDevice( const IndexType idx1, + const IndexType idx2, + const RealType* data1, + const RealType* data2 ) const { return tnlCudaMin( tnlCudaAbs( data1[ idx1 ] ), tnlCudaAbs( data1[ idx2 ] ) ); } - __device__ RealType initialValueOnDevice( const IndexType idx1, - const RealType* data1, - const RealType* data2 ) const + __device__ ResultType initialValueOnDevice( const IndexType idx1, + const RealType* data1, + const RealType* data2 ) const { return tnlCudaAbs( data1[ idx1 ] ); }; - __device__ RealType firstReductionOnDevice( const IndexType idx1, - const IndexType idx2, - const IndexType idx3, - const RealType* data1, - const RealType* data2, - const RealType* data3 ) const + __device__ ResultType firstReductionOnDevice( const IndexType idx1, + const IndexType idx2, + const IndexType idx3, + const ResultType* data1, + const RealType* data2, + const RealType* data3 ) const { return tnlCudaMin( data1[ idx1 ], tnlCudaMin( tnlCudaAbs( data2[ idx2 ] ), tnlCudaAbs( data2[ idx3 ] ) ) ); }; - __device__ RealType firstReductionOnDevice( const IndexType idx1, - const IndexType idx2, - const RealType* data1, - const RealType* data2, - const RealType* data3 ) const + __device__ ResultType firstReductionOnDevice( const IndexType idx1, + const IndexType idx2, + const ResultType* data1, + const RealType* data2, + const RealType* data3 ) const { return tnlCudaMin( data1[ idx1 ], tnlCudaAbs( data2[ idx2 ] ) ); }; - 
__device__ RealType commonReductionOnDevice( const IndexType idx1, - const IndexType idx2, - const RealType* data ) const + __device__ ResultType commonReductionOnDevice( const IndexType idx1, + const IndexType idx2, + const ResultType* data ) const { return tnlCudaMin( data[ idx1 ], tnlCudaAbs( data[ idx2 ] ) ); }; +#endif }; template< typename Real, typename Index > @@ -431,67 +446,498 @@ class tnlParallelReductionAbsMax typedef Real RealType; typedef Index IndexType; + typedef Real ResultType; + typedef tnlParallelReductionMax< Real, Index > LaterReductionOperation; - RealType initialValueOnHost( const IndexType idx, - const RealType* data1, - const RealType* data2 ) const + ResultType initialValueOnHost( const IndexType idx, + const RealType* data1, + const RealType* data2 ) const { return tnlAbs( data1[ idx ] ); }; - RealType reduceOnHost( const IndexType idx, - const RealType& current, - const RealType* data1, - const RealType* data2 ) const + ResultType reduceOnHost( const IndexType idx, + const ResultType& current, + const RealType* data1, + const RealType* data2 ) const { return Max( current, tnlAbs( data1[ idx ] ) ); }; - __device__ RealType initialValueOnDevice( const IndexType idx1, - const IndexType idx2, - const RealType* data1, - const RealType* data2 ) const +#ifdef HAVE_CUDA + __device__ ResultType initialValueOnDevice( const IndexType idx1, + const IndexType idx2, + const RealType* data1, + const RealType* data2 ) const { return tnlCudaMax( tnlCudaAbs( data1[ idx1 ] ), tnlCudaAbs( data1[ idx2 ] ) ); } - __device__ RealType initialValueOnDevice( const IndexType idx1, - const RealType* data1, - const RealType* data2 ) const + __device__ ResultType initialValueOnDevice( const IndexType idx1, + const RealType* data1, + const RealType* data2 ) const { return tnlCudaAbs( data1[ idx1 ] ); }; - __device__ RealType firstReductionOnDevice( const IndexType idx1, - const IndexType idx2, - const IndexType idx3, - const RealType* data1, - const RealType* data2, 
- const RealType* data3 ) const + __device__ ResultType firstReductionOnDevice( const IndexType idx1, + const IndexType idx2, + const IndexType idx3, + const ResultType* data1, + const RealType* data2, + const RealType* data3 ) const { return tnlCudaMax( data1[ idx1 ], tnlCudaMax( tnlCudaAbs( data2[ idx2 ] ), tnlCudaAbs( data2[ idx3 ] ) ) ); }; - __device__ RealType firstReductionOnDevice( const IndexType idx1, - const IndexType idx2, - const RealType* data1, - const RealType* data2, - const RealType* data3 ) const + __device__ ResultType firstReductionOnDevice( const IndexType idx1, + const IndexType idx2, + const ResultType* data1, + const RealType* data2, + const RealType* data3 ) const { return tnlCudaMax( data1[ idx1 ], tnlCudaAbs( data2[ idx2 ] ) ); }; - __device__ RealType commonReductionOnDevice( const IndexType idx1, - const IndexType idx2, - const RealType* data ) const + __device__ ResultType commonReductionOnDevice( const IndexType idx1, + const IndexType idx2, + const ResultType* data ) const { return tnlCudaMax( data[ idx1 ], tnlCudaAbs( data[ idx2 ] ) ); }; +#endif }; +template< typename Real, typename Index > +class tnlParallelReductionLogicalAnd +{ + public: + + typedef Real RealType; + typedef Index IndexType; + typedef Real ResultType; + typedef tnlParallelReductionLogicalAnd< Real, Index > LaterReductionOperation; + + ResultType initialValueOnHost( const IndexType idx, + const RealType* data1, + const RealType* data2 ) const + { + return data1[ idx ]; + }; + + ResultType reduceOnHost( const IndexType idx, + const ResultType& current, + const RealType* data1, + const RealType* data2 ) const + { + return current && data1[ idx ]; + }; + +#ifdef HAVE_CUDA + __device__ ResultType initialValueOnDevice( const IndexType idx1, + const IndexType idx2, + const RealType* data1, + const RealType* data2 ) const + { + return data1[ idx1 ] && data1[ idx2 ]; + } + + __device__ ResultType initialValueOnDevice( const IndexType idx1, + const RealType* data1, + 
const RealType* data2 ) const + { + return data1[ idx1 ]; + }; + + __device__ ResultType firstReductionOnDevice( const IndexType idx1, + const IndexType idx2, + const IndexType idx3, + const ResultType* data1, + const RealType* data2, + const RealType* data3 ) const + { + return data1[ idx1 ] && data2[ idx2 ] && data2[ idx3 ]; + }; -#include <implementation/core/cuda/reduction-operations_impl.h> + __device__ ResultType firstReductionOnDevice( const IndexType idx1, + const IndexType idx2, + const ResultType* data1, + const RealType* data2, + const RealType* data3 ) const + { + return data1[ idx1 ] && data2[ idx2 ]; + }; + __device__ ResultType commonReductionOnDevice( const IndexType idx1, + const IndexType idx2, + const ResultType* data ) const + { + return data[ idx1 ] && data[ idx2 ]; + }; #endif +}; + + +template< typename Real, typename Index > +class tnlParallelReductionLogicalOr +{ + public: + + typedef Real RealType; + typedef Index IndexType; + typedef Real ResultType; + typedef tnlParallelReductionLogicalOr< Real, Index > LaterReductionOperation; + + ResultType initialValueOnHost( const IndexType idx, + const RealType* data1, + const RealType* data2 ) const + { + return data1[ idx ]; + }; + + ResultType reduceOnHost( const IndexType idx, + const ResultType& current, + const RealType* data1, + const RealType* data2 ) const + { + return current || data1[ idx ]; + }; + +#ifdef HAVE_CUDA + __device__ ResultType initialValueOnDevice( const IndexType idx1, + const IndexType idx2, + const RealType* data1, + const RealType* data2 ) const + { + return data1[ idx1 ] || data1[ idx2 ]; + } + + __device__ ResultType initialValueOnDevice( const IndexType idx1, + const RealType* data1, + const RealType* data2 ) const + { + return data1[ idx1 ]; + }; + + __device__ ResultType firstReductionOnDevice( const IndexType idx1, + const IndexType idx2, + const IndexType idx3, + const ResultType* data1, + const RealType* data2, + const RealType* data3 ) const + { + return data1[ 
idx1 ] || data2[ idx2 ] || data2[ idx3 ]; + }; + + __device__ ResultType firstReductionOnDevice( const IndexType idx1, + const IndexType idx2, + const ResultType* data1, + const RealType* data2, + const RealType* data3 ) const + { + return data1[ idx1 ] || data2[ idx2 ]; + }; + + __device__ ResultType commonReductionOnDevice( const IndexType idx1, + const IndexType idx2, + const ResultType* data ) const + { + return data[ idx1 ] || data[ idx2 ]; + }; +#endif +}; + +template< typename Real, typename Index > +class tnlParallelReductionLpNorm +{ + public: + + typedef Real RealType; + typedef Index IndexType; + typedef Real ResultType; + typedef tnlParallelReductionSum< Real, Index > LaterReductionOperation; + + void setPower( const RealType& p ) + { + this -> p = p; + }; + + ResultType initialValueOnHost( const IndexType idx, + const RealType* data1, + const RealType* data2 ) const + { + return pow( tnlAbs( data1[ idx ] ), p ); + }; + + ResultType reduceOnHost( const IndexType idx, + const ResultType& current, + const RealType* data1, + const RealType* data2 ) const + { + return current + pow( tnlAbs( data1[ idx ] ), p ); + }; + +#ifdef HAVE_CUDA + __device__ ResultType initialValueOnDevice( const IndexType idx1, + const IndexType idx2, + const RealType* data1, + const RealType* data2 ) const + { + return pow( tnlCudaAbs( data1[ idx1 ] ), p ) + pow( tnlCudaAbs( data1[ idx2 ] ), p ); + } + + __device__ ResultType initialValueOnDevice( const IndexType idx1, + const RealType* data1, + const RealType* data2 ) const + { + return pow( tnlCudaAbs( data1[ idx1 ] ), p ); + }; + + __device__ ResultType firstReductionOnDevice( const IndexType idx1, + const IndexType idx2, + const IndexType idx3, + const ResultType* data1, + const RealType* data2, + const RealType* data3 ) const + { + return data1[ idx1 ] + + pow( tnlCudaAbs( data2[ idx2 ] ), p ) + + pow( tnlCudaAbs( data2[ idx3 ] ), p ); + }; + + __device__ ResultType firstReductionOnDevice( const IndexType idx1, + const 
IndexType idx2, + const ResultType* data1, + const RealType* data2, + const RealType* data3 ) const + { + return data1[ idx1 ] + pow( tnlCudaAbs( data2[ idx2 ] ), p ); + }; + + __device__ ResultType commonReductionOnDevice( const IndexType idx1, + const IndexType idx2, + const ResultType* data ) const + { + return data[ idx1 ] + data[ idx2 ]; + }; +#endif + + protected: + + RealType p; +}; + +template< typename Real, typename Index > +class tnlParallelReductionEqualities +{ + public: + + typedef Real RealType; + typedef Index IndexType; + typedef bool ResultType; + typedef tnlParallelReductionLogicalAnd< bool, Index > LaterReductionOperation; + + ResultType initialValueOnHost( const IndexType idx, + const RealType* data1, + const RealType* data2 ) const + { + return ( data1[ idx ] == data2[ idx ] ); + }; + + ResultType reduceOnHost( const IndexType idx, + const ResultType& current, + const RealType* data1, + const RealType* data2 ) const + { + return current && ( data1[ idx ] == data2[ idx ] ); + }; + +#ifdef HAVE_CUDA + __device__ ResultType initialValueOnDevice( const IndexType idx1, + const IndexType idx2, + const RealType* data1, + const RealType* data2 ) const + { + return ( data1[ idx1 ] == data2[ idx1 ] ) && ( data1[ idx2 ] == data2[ idx2] ); + } + + __device__ ResultType initialValueOnDevice( const IndexType idx1, + const RealType* data1, + const RealType* data2 ) const + { + return ( data1[ idx1 ]== data2[ idx1 ] ); + }; + + __device__ ResultType firstReductionOnDevice( const IndexType idx1, + const IndexType idx2, + const IndexType idx3, + const ResultType* data1, + const RealType* data2, + const RealType* data3 ) const + { + return data1[ idx1 ] && /* data1 holds partial results; data2 and data3 are the two inputs being compared */ + ( data2[ idx2 ] == data3[ idx2 ] ) && /* compare data2 against data3 at idx2, same as the idx3 term */ + ( data2[ idx3 ] == data3[ idx3] ); + }; + + __device__ ResultType firstReductionOnDevice( const IndexType idx1, + const IndexType idx2, + const ResultType* data1, + const RealType* data2, + const RealType* data3 ) const + { + return data1[ idx1 ] && ( data2[ idx2 ] == 
data3[ idx2 ] ); + }; + + __device__ ResultType commonReductionOnDevice( const IndexType idx1, + const IndexType idx2, + const ResultType* data ) const + { + return data[ idx1 ] && data[ idx2 ]; + }; +#endif +}; + +template< typename Real, typename Index > +class tnlParallelReductionInequalities +{ + public: + + typedef Real RealType; + typedef Index IndexType; + typedef bool ResultType; + typedef tnlParallelReductionLogicalAnd< bool, Index > LaterReductionOperation; + + ResultType initialValueOnHost( const IndexType idx, + const RealType* data1, + const RealType* data2 ) const + { + return ( data1[ idx ] != data2[ idx ] ); + }; + + ResultType reduceOnHost( const IndexType idx, + const ResultType& current, + const RealType* data1, + const RealType* data2 ) const + { + return current && ( data1[ idx ] != data2[ idx ] ); + }; + +#ifdef HAVE_CUDA + __device__ ResultType initialValueOnDevice( const IndexType idx1, + const IndexType idx2, + const RealType* data1, + const RealType* data2 ) const + { + return ( data1[ idx1 ] != data2[ idx1 ] ) && ( data1[ idx2 ] != data2[ idx2] ); + } + + __device__ ResultType initialValueOnDevice( const IndexType idx1, + const RealType* data1, + const RealType* data2 ) const + { + return ( data1[ idx1 ] != data2[ idx1 ] ); + }; + + __device__ ResultType firstReductionOnDevice( const IndexType idx1, + const IndexType idx2, + const IndexType idx3, + const ResultType* data1, + const RealType* data2, + const RealType* data3 ) const + { + return data1[ idx1 ] && /* data1 holds partial results; data2 and data3 are the two inputs being compared */ + ( data2[ idx2 ] != data3[ idx2 ] ) && /* compare data2 against data3 at idx2, same as the idx3 term */ + ( data2[ idx3 ] != data3[ idx3] ); + }; + + __device__ ResultType firstReductionOnDevice( const IndexType idx1, + const IndexType idx2, + const ResultType* data1, + const RealType* data2, + const RealType* data3 ) const + { + return data1[ idx1 ] && ( data2[ idx2 ] != data3[ idx2 ] ); + }; + + __device__ ResultType commonReductionOnDevice( const IndexType idx1, + const IndexType idx2, + const ResultType* data ) const + { + return data[ idx1 ] 
&& data[ idx2 ]; + }; +#endif +}; + +template< typename Real, typename Index > +class tnlParallelReductionSdot +{ + public: + + typedef Real RealType; + typedef Index IndexType; + typedef Real ResultType; + typedef tnlParallelReductionSum< Real, Index > LaterReductionOperation; + + ResultType initialValueOnHost( const IndexType idx, + const RealType* data1, + const RealType* data2 ) const + { + return data1[ idx ] * data2[ idx ]; + }; + + ResultType reduceOnHost( const IndexType idx, + const ResultType& current, + const RealType* data1, + const RealType* data2 ) const + { + return current + ( data1[ idx ] * data2[ idx ] ); + }; + +#ifdef HAVE_CUDA + __device__ ResultType initialValueOnDevice( const IndexType idx1, + const IndexType idx2, + const RealType* data1, + const RealType* data2 ) const + { + return ( data1[ idx1 ] * data2[ idx1 ] ) + ( data1[ idx2 ] * data2[ idx2] ); + } + + __device__ ResultType initialValueOnDevice( const IndexType idx1, + const RealType* data1, + const RealType* data2 ) const + { + return ( data1[ idx1 ] * data2[ idx1 ] ); + }; + + __device__ ResultType firstReductionOnDevice( const IndexType idx1, + const IndexType idx2, + const IndexType idx3, + const ResultType* data1, + const RealType* data2, + const RealType* data3 ) const + { + return data1[ idx1 ] + /* data1 holds partial sums; data2 and data3 are the two input vectors */ + ( data2[ idx2 ] * data3[ idx2 ] ) + /* product of data2 and data3 at idx2, same as the idx3 term */ + ( data2[ idx3 ] * data3[ idx3] ); + }; + + __device__ ResultType firstReductionOnDevice( const IndexType idx1, + const IndexType idx2, + const ResultType* data1, + const RealType* data2, + const RealType* data3 ) const + { + return data1[ idx1 ] + ( data2[ idx2 ] * data3[ idx2 ] ); + }; + + __device__ ResultType commonReductionOnDevice( const IndexType idx1, + const IndexType idx2, + const ResultType* data ) const + { + return data[ idx1 ] + data[ idx2 ]; + }; +#endif +}; #endif /* REDUCTION_OPERATIONS_H_ */ diff --git a/src/core/cuda/reduction.h b/src/core/cuda/reduction.h deleted file mode 100644 index 
13b850cf00f8bf597f01550dc2199dafd809b6c1..0000000000000000000000000000000000000000 --- a/src/core/cuda/reduction.h +++ /dev/null @@ -1,53 +0,0 @@ -/*************************************************************************** - cuda-long-vector-kernels.h - description - ------------------- - begin : Oct 28, 2010 - copyright : (C) 2010 by Tomas Oberhuber - email : tomas.oberhuber@fjfi.cvut.cz - ***************************************************************************/ - -/*************************************************************************** - * * - * This program is free software; you can redistribute it and/or modify * - * it under the terms of the GNU General Public License as published by * - * the Free Software Foundation; either version 2 of the License, or * - * (at your option) any later version. * - * * - ***************************************************************************/ - -#ifndef CUDALONGVECTORKERNELS_H_ -#define CUDALONGVECTORKERNELS_H_ - -#ifdef HAVE_CUDA -#include <cuda.h> -#endif -#include <iostream> - -/*** - * The template calling the final CUDA kernel for the single vector reduction. - * The template parameters are: - * @param T is the type of data we want to reduce - * @param operation is the operation reducing the data. - * It can be tnlParallelReductionSum, tnlParallelReductionMin or tnlParallelReductionMax. - * The function parameters: - * @param size tells number of elements in the data array. - * @param deviceInput1 is the pointer to an array storing the data we want - * to reduce. This array must stay on the device!. - * @param deviceInput2 is the pointer to an array storing the coupling data for example - * the second vector for the SDOT operation. This array must stay on the device!. - * @param result will contain the result of the reduction if everything was ok - * and the return code is true. - * @param parameter can be used for example for the passing the parameter p of Lp norm. 
- * @param deviceAux is auxiliary array used to store temporary data during the reduction. - * If one calls this function more then once one might provide this array to avoid repetetive - * allocation of this array on the device inside of this function. - * The size of this array should be size / 128 * sizeof( T ). - */ -template< typename Type, typename ParameterType, typename Index, tnlTupleOperation operation > -bool tnlCUDALongVectorReduction( const Index size, - const Type* deviceInput1, - const Type* deviceInput2, - Type& result, - const ParameterType& parameter, - Type* deviceAux = 0 ); -#endif /* CUDALONGVECTORKERNELS_H_ */ diff --git a/src/implementation/CMakeLists.txt b/src/implementation/CMakeLists.txt index c95a03737c8a40e33eb852bcf8e6a70e082c7fef..f22c95fa29d86b57e588a82339b72d71e5850194 100755 --- a/src/implementation/CMakeLists.txt +++ b/src/implementation/CMakeLists.txt @@ -5,6 +5,14 @@ ADD_SUBDIRECTORY( solvers ) SET( headers ) +IF( BUILD_CUDA ) + set( tnl_implementation_CUDA__SOURCES + ${tnl_implementation_core_CUDA__SOURCES} + ${tnl_implementation_mesh_CUDA__SOURCES} + ${tnl_implementation_solvers_CUDA__SOURCES} + PARENT_SCOPE ) +ENDIF() + set( tnl_implementation_SOURCES ${tnl_implementation_core_SOURCES} ${tnl_implementation_mesh_SOURCES} diff --git a/src/implementation/core/CMakeLists.txt b/src/implementation/core/CMakeLists.txt index d137f42adec988c5bbd11f1b9b3dfacac6c17d9b..2ad45839048ae18618d506adb3a25d07e6572d70 100755 --- a/src/implementation/core/CMakeLists.txt +++ b/src/implementation/core/CMakeLists.txt @@ -1,7 +1,6 @@ ADD_SUBDIRECTORY( cuda ) -SET( headers cuda-long-vector-kernels.h - vector-operations.h +SET( headers vector-operations.h memory-operations.h tnlArray_impl.h tnlHost_impl.h @@ -19,34 +18,39 @@ SET( headers cuda-long-vector-kernels.h tnlVector_impl.h ) SET( CURRENT_DIR ${CMAKE_SOURCE_DIR}/src/implementation/core ) +set( common_SOURCES + ${CURRENT_DIR}/tnlTimerRT.cpp + ${CURRENT_DIR}/tnlFile.cpp + 
${CURRENT_DIR}/tnlFlopsCounter.cpp + ${CURRENT_DIR}/tnlLogger.cpp + ${CURRENT_DIR}/tnlObject.cpp + ${CURRENT_DIR}/tnlStatistics.cpp + ${CURRENT_DIR}/tnlString.cpp + ${CURRENT_DIR}/tnlTimerCPU.cpp + ${CURRENT_DIR}/mfilename.cpp + ${CURRENT_DIR}/mpi-supp.cpp + ${CURRENT_DIR}/tnlSharedArray_impl.cpp + ${CURRENT_DIR}/tnlMultiArray_impl.cpp + ${CURRENT_DIR}/tnlMultiVector_impl.cpp + ${CURRENT_DIR}/tnlSharedVector_impl.cpp + ${CURRENT_DIR}/tnlVector_impl.cpp ) + IF( BUILD_CUDA ) set( tnl_implementation_core_CUDA__SOURCES ${tnl_implementation_core_cuda_CUDA__SOURCES} + ${common_SOURCES} + ${CURRENT_DIR}/memory-operations_impl.cu ${CURRENT_DIR}/tnlArray_impl.cu ${CURRENT_DIR}/tnlVector_impl.cu PARENT_SCOPE ) ENDIF() -set( tnl_implementation_core_SOURCES - ${tnl_implementation_core_CUDA__SOURCES} +set( tnl_implementation_core_SOURCES ${tnl_implementation_core_cuda_SOURCES} + ${common_SOURCES} + ${CURRENT_DIR}/memory-operations_impl.cpp ${CURRENT_DIR}/tnlArray_impl.cpp ${CURRENT_DIR}/tnlHost_impl.cpp - ${CURRENT_DIR}/tnlSharedArray_impl.cpp - ${CURRENT_DIR}/tnlMultiArray_impl.cpp - ${CURRENT_DIR}/tnlMultiVector_impl.cpp - ${CURRENT_DIR}/tnlSharedVector_impl.cpp - ${CURRENT_DIR}/tnlVector_impl.cpp - ${CURRENT_DIR}/tnlTimerRT.cpp - ${CURRENT_DIR}/tnlFile.cpp - ${CURRENT_DIR}/tnlFlopsCounter.cpp - ${CURRENT_DIR}/tnlLogger.cpp - ${CURRENT_DIR}/tnlObject.cpp - ${CURRENT_DIR}/tnlStatistics.cpp - ${CURRENT_DIR}/tnlString.cpp - ${CURRENT_DIR}/tnlTimerCPU.cpp - ${CURRENT_DIR}/mfilename.cpp - ${CURRENT_DIR}/mpi-supp.cpp - PARENT_SCOPE ) + PARENT_SCOPE ) diff --git a/src/implementation/core/cuda-long-vector-kernels.h b/src/implementation/core/cuda-long-vector-kernels.h deleted file mode 100644 index 41c2cc80394db884509af6e83397aac38f25bc47..0000000000000000000000000000000000000000 --- a/src/implementation/core/cuda-long-vector-kernels.h +++ /dev/null @@ -1,587 +0,0 @@ -/*************************************************************************** - cuda-long-vector-kernels.h - 
description - ------------------- - begin : Oct 28, 2010 - copyright : (C) 2010 by Tomas Oberhuber - email : tomas.oberhuber@fjfi.cvut.cz - ***************************************************************************/ - -/*************************************************************************** - * * - * This program is free software; you can redistribute it and/or modify * - * it under the terms of the GNU General Public License as published by * - * the Free Software Foundation; either version 2 of the License, or * - * (at your option) any later version. * - * * - ***************************************************************************/ - -#ifndef CUDALONGVECTORKERNELS_H_ -#define CUDALONGVECTORKERNELS_H_ - -#ifdef HAVE_CUDA -#include <cuda.h> -#endif -#include <iostream> -#include <core/tnlAssert.h> -#include <core/cuda/reduction-operations.h> -#include <implementation/core/memory-operations.h> - -using namespace std; - - -/**** - * This constant says that arrays smaller than its value - * are going to be reduced on CPU. - */ -const int maxGPUReductionDataSize = 256; - -#ifdef HAVE_CUDA - - -/*** - * For each thread in block with thread ID smaller then s this function reduces - * data elements with indecis tid and tid + s. Here we assume that for each - * tid the tid + s element also exists i.e. we have even number of elements. - */ -template< typename Operation > -__device__ void reduceAligned( const Operation& operation, - typename Operation :: IndexType tid, - typename Operation :: IndexType s, - typename Operation :: RealType* sdata ) -{ - if( tid < s ) - { - sdata[ tid ] = operation. 
commonReductionOnDevice( tid, tid + s, sdata ); - /*if( operation == tnlParallelReductionAbsMin ) - sdata[ tid ] = tnlCudaMin( tnlCudaAbs( sdata[ tid ] ), tnlCudaAbs( sdata[ tid + s ] ) ); - if( operation == tnlParallelReductionAbsMax ) - sdata[ tid ] = tnlCudaMax( tnlCudaAbs( sdata[ tid ] ), tnlCudaAbs( sdata[ tid + s ] ) ); - if( operation == tnlParallelReductionLpNorm || - operation == tnlParallelReductionSdot ) - sdata[ tid ] = sdata[ tid ] + sdata[ tid + s ];*/ - } -} - -/*** - * For each thread in block with thread ID smaller then s this function reduces - * data elements with indices tid and tid + s. This is a modified version of - * the previous algorithm. Thid one works even for odd number of elements but - * it is a bit slower. - */ -template< typename Operation > -__device__ void reduceNonAligned( const Operation& operation, - typename Operation :: IndexType tid, - typename Operation :: IndexType s, - typename Operation :: IndexType n, - typename Operation :: RealType* sdata ) -{ - if( tid < s ) - { - sdata[ tid ] = operation. commonReductionOnDevice( tid, tid + s, sdata ); - /*if( operation == tnlParallelReductionAbsMin ) - sdata[ tid ] = tnlCudaMin( tnlCudaAbs( sdata[ tid ] ), tnlCudaAbs( sdata[ tid + s ] ) ); - if( operation == tnlParallelReductionAbsMax ) - sdata[ tid ] = tnlCudaMax( tnlCudaAbs( sdata[ tid ] ), tnlCudaAbs( sdata[ tid + s ] ) ); - if( operation == tnlParallelReductionLpNorm || - operation == tnlParallelReductionSdot ) - sdata[ tid ] = sdata[ tid ] + sdata[ tid + s ];*/ - } - /* This is for the case when we have odd number of elements. - * The last one will be reduced using the thread with ID 0. - */ - if( s > 32 ) - __syncthreads(); - if( 2 * s < n && tid == n - 1 ) - { - sdata[ 0 ] = operation. 
commonReductionOnDevice( 0, tid, sdata ); - /*if( operation == tnlParallelReductionAbsMin ) - sdata[ 0 ] = tnlCudaMin( tnlCudaAbs( sdata[ 0] ), tnlCudaAbs( sdata[ tid + s ] ) ); - if( operation == tnlParallelReductionAbsMax ) - sdata[ 0 ] = tnlCudaMax( tnlCudaAbs( sdata[ 0 ] ), tnlCudaAbs( sdata[ tid + s ] ) ); - if( operation == tnlParallelReductionLpNorm || - operation == tnlParallelReductionSdot ) - sdata[ 0 ] = sdata[ 0 ] + sdata[ tid + s ];*/ - - } -} - -/*** - * The parallel reduction of one vector. - * - * WARNING: This kernel only reduce data in one block. Use rather tnlCUDASimpleReduction2 - * to call this kernel then doing it by yourself. - * This kernel is very inefficient. It is here only for educative and testing reasons. - * Please use tnlCUDAReduction instead. - * - * The kernel parameters: - * @param size is the number of all element to reduce - not just in one block. - * @param deviceInput input data which we want to reduce - * @param deviceOutput an array to which we write the result of reduction. - * Each block of the grid writes one element in this array - * (i.e. the size of this array equals the number of CUDA blocks). - */ -template < typename Operation, int blockSize > -__global__ void tnlCUDAReductionKernel( const Operation operation, - const typename Operation :: IndexType size, - const typename Operation :: RealType* deviceInput, - const typename Operation :: RealType* deviceInput2, - typename Operation :: RealType* deviceOutput ) -{ - extern __shared__ __align__ ( 8 ) char __sdata[]; - - typedef typename Operation :: IndexType IndexType; - typedef typename Operation :: RealType RealType; - RealType* sdata = reinterpret_cast< RealType* >( __sdata ); - - /*** - * Get thread id (tid) and global thread id (gid). - * lastTId is the last relevant thread id in this block. - * gridSize is the number of element processed by all blocks at the - * same time. - */ - IndexType tid = threadIdx. x; - IndexType gid = 2 * blockIdx. x * blockDim. 
x + threadIdx. x; - IndexType lastTId = size - 2 * blockIdx. x * blockDim. x; - IndexType gridSize = 2 * blockDim. x * gridDim.x; - - /*** - * Read data into the shared memory. We start with the - * sequential reduction. - */ - if( gid + blockDim. x < size ) - { - sdata[ tid ] = operation. initialValueOnDevice( gid, gid + blockDim. x, deviceInput, deviceInput2 ); - /*if( operation == tnlParallelReductionMin ) - sdata[ tid ] = tnlCudaMin( deviceInput[ gid ], deviceInput[ gid + blockDim. x ] ); - if( operation == tnlParallelReductionMax ) - sdata[ tid ] = tnlCudaMax( deviceInput[ gid ], deviceInput[ gid + blockDim. x ] ); - if( operation == tnlParallelReductionAbsMin ) - sdata[ tid ] = tnlCudaMin( tnlCudaAbs( deviceInput[ gid ] ), tnlCudaAbs( deviceInput[ gid + blockDim. x ] ) ); - if( operation == tnlParallelReductionAbsMax ) - sdata[ tid ] = tnlCudaMax( tnlCudaAbs( deviceInput[ gid ] ), tnlCudaAbs( deviceInput[ gid + blockDim. x ] ) ); - if( operation == tnlParallelReductionSum ) - sdata[ tid ] = deviceInput[ gid ] + deviceInput[ gid + blockDim. x ]; - if( operation == tnlParallelReductionLpNorm ) - sdata[ tid ] = powf( tnlCudaAbs( deviceInput[ gid ] ), parameter ) + - powf( tnlCudaAbs( deviceInput[ gid + blockDim. x ] ), parameter ); - if( operation == tnlParallelReductionSdot ) - sdata[ tid ] = deviceInput[ gid ] * deviceInput2[ gid ] + - deviceInput[ gid + blockDim. x ] * deviceInput2[ gid + blockDim. x ];*/ - } - else if( gid < size ) - { - sdata[ tid ] = operation. initialValueOnDevice( gid, deviceInput, deviceInput2 ); - /*if( operation == tnlParallelReductionLpNorm ) - sdata[ tid ] = powf( tnlCudaAbs( deviceInput[ gid ] ), parameter ); - else - if( operation == tnlParallelReductionSdot ) - sdata[ tid ] = deviceInput[ gid ] * deviceInput2[ gid ]; - else - sdata[ tid ] = deviceInput[ gid ];*/ - } - gid += gridSize; - while( gid + blockDim. x < size ) - { - sdata[ tid ] = operation. firstReductionOnDevice( tid, gid, gid + blockDim. 
x, sdata, deviceInput, deviceInput2 ); - /*if( operation == tnlParallelReductionMin ) - sdata[ tid ] = :: tnlCudaMin( sdata[ tid ], :: tnlCudaMin( deviceInput[ gid ], deviceInput[ gid + blockDim. x ] ) ); - if( operation == tnlParallelReductionMax ) - sdata[ tid ] = :: tnlCudaMax( sdata[ tid ], :: tnlCudaMax( deviceInput[ gid ], deviceInput[ gid + blockDim. x ] ) ); - if( operation == tnlParallelReductionAbsMin ) - sdata[ tid ] = :: tnlCudaMin( tnlCudaAbs( sdata[ tid ] ), :: tnlCudaMin( tnlCudaAbs( deviceInput[ gid ] ), tnlCudaAbs( deviceInput[ gid + blockDim. x ] ) ) ); - if( operation == tnlParallelReductionAbsMax ) - sdata[ tid ] = :: tnlCudaMax( tnlCudaAbs( sdata[ tid ] ), :: tnlCudaMax( tnlCudaAbs( deviceInput[ gid ] ), tnlCudaAbs( deviceInput[ gid + blockDim. x ] ) ) ); - if( operation == tnlParallelReductionSum ) - sdata[ tid ] += deviceInput[gid] + deviceInput[ gid + blockDim. x ]; - if( operation == tnlParallelReductionLpNorm ) - sdata[ tid ] += powf( tnlCudaAbs( deviceInput[gid] ), parameter ) + - powf( tnlCudaAbs( deviceInput[ gid + blockDim. x ] ), parameter ); - if( operation == tnlParallelReductionSdot ) - sdata[ tid ] += deviceInput[ gid ] * deviceInput2[ gid ] + - deviceInput[ gid + blockDim. x] * deviceInput2[ gid + blockDim. x ];*/ - gid += gridSize; - } - if( gid < size ) - { - sdata[ tid ] = operation. 
firstReductionOnDevice( tid, gid, sdata, deviceInput, deviceInput2 ); - /*if( operation == tnlParallelReductionMin ) - sdata[ tid ] = :: tnlCudaMin( sdata[ tid ], deviceInput[ gid ] ); - if( operation == tnlParallelReductionMax ) - sdata[ tid ] = :: tnlCudaMax( sdata[ tid ], deviceInput[ gid ] ); - if( operation == tnlParallelReductionAbsMin ) - sdata[ tid ] = :: tnlCudaMin( tnlCudaAbs( sdata[ tid ] ), tnlCudaAbs( deviceInput[ gid ] ) ); - if( operation == tnlParallelReductionAbsMax ) - sdata[ tid ] = :: tnlCudaMax( tnlCudaAbs( sdata[ tid ] ), tnlCudaAbs( deviceInput[ gid ] ) ); - if( operation == tnlParallelReductionSum ) - sdata[ tid ] += deviceInput[gid]; - if( operation == tnlParallelReductionLpNorm ) - sdata[ tid ] += powf( tnlCudaAbs( deviceInput[gid] ), parameter ); - if( operation == tnlParallelReductionSdot ) - sdata[ tid ] += deviceInput[ gid ] * deviceInput2[ gid ];*/ - } - __syncthreads(); - - - /*** - * Process the parallel reduction. - * We reduce the data with step s which is one half of the elements to reduce. - * Each thread with ID < s reduce elements tid and tid + s. The result is stored - * in shared memory in sdata 0 .. s. We set s = s / 2 ( i.e. s >>= 1) and repeat - * the algorithm again until s = 1. - * We also separate the case when the blockDim. x is power of 2 and the algorithm - * can be written in more efficient way without some conditions. - */ - unsigned int n = lastTId < blockDim. x ? lastTId : blockDim. x; - if( n == 128 || n == 64 || n == 32 || n == 16 || - n == 8 || n == 4 || n == 2 || n == 256 || - n == 512 ) - { - if( blockSize >= 512 ) - { - if( tid < 256 ) - reduceAligned( operation, tid, 256, sdata ); - __syncthreads(); - } - if( blockSize >= 256 ) - { - if( tid < 128 ) - reduceAligned( operation, tid, 128, sdata ); - __syncthreads(); - } - if( blockSize >= 128 ) - { - if( tid < 64 ) - reduceAligned( operation, tid, 64, sdata ); - __syncthreads(); - } - - /*** - * This runs in one warp so it is synchronised implicitly. 
- */ - if (tid < 32) - { - if( blockSize >= 64 ) - reduceAligned( operation, tid, 32, sdata ); - if( blockSize >= 32 ) - reduceAligned( operation, tid, 16, sdata ); - if( blockSize >= 16 ) - reduceAligned( operation, tid, 8, sdata ); - if( blockSize >= 8 ) - reduceAligned( operation, tid, 4, sdata ); - if( blockSize >= 4 ) - reduceAligned( operation, tid, 2, sdata ); - if( blockSize >= 2 ) - reduceAligned( operation, tid, 1, sdata ); - } - } - else - { - unsigned int s; - if( n >= 512 ) - { - s = n / 2; - reduceNonAligned( operation, tid, s, n, sdata ); - n = s; - __syncthreads(); - } - if( n >= 256 ) - { - s = n / 2; - reduceNonAligned( operation, tid, s, n, sdata ); - n = s; - __syncthreads(); - } - if( n >= 128 ) - { - s = n / 2; - reduceNonAligned( operation, tid, s, n, sdata ); - n = s; - __syncthreads(); - } - if( n >= 64 ) - { - s = n / 2; - reduceNonAligned( operation, tid, s, n, sdata ); - n = s; - __syncthreads(); - } - if( n >= 32 ) - { - s = n / 2; - reduceNonAligned( operation, tid, s, n, sdata ); - n = s; - __syncthreads(); - } - /*** - * This runs in one warp so it is synchronised implicitly. - */ - if( n >= 16 ) - { - s = n / 2; - reduceNonAligned( operation, tid, s, n, sdata ); - n = s; - } - if( n >= 8 ) - { - s = n / 2; - reduceNonAligned( operation, tid, s, n, sdata ); - n = s; - } - if( n >= 4 ) - { - s = n / 2; - reduceNonAligned( operation, tid, s, n, sdata ); - n = s; - } - if( n >= 2 ) - { - s = n / 2; - reduceNonAligned( operation, tid, s, n, sdata ); - n = s; - } - } - - /*** - * Store the result back in the global memory. - */ - if( tid == 0 ) - deviceOutput[ blockIdx. 
x ] = sdata[ 0 ]; -} - -template< typename Operation > -typename Operation :: IndexType reduceOnCudaDevice( const Operation& operation, - const typename Operation :: IndexType size, - const typename Operation :: RealType* input1, - const typename Operation :: RealType* input2, - typename Operation :: RealType*& output) -{ - typedef typename Operation :: IndexType IndexType; - typedef typename Operation :: RealType RealType; - - const int desBlockSize = 512; - const int desGridSize = 2048; - dim3 blockSize( 0 ), gridSize( 0 ); - - /*** - * Compute the CUDA block size aligned to the power of two. - */ - blockSize. x = :: Min( size, desBlockSize ); - IndexType alignedBlockSize = 1; - while( alignedBlockSize < blockSize. x ) alignedBlockSize <<= 1; - blockSize. x = alignedBlockSize; - - gridSize. x = Min( ( IndexType ) ( size / blockSize. x + 1 ) / 2, desGridSize ); - - if( ! output && - ! allocateMemoryCuda( output, :: Max( 1, size / desBlockSize ) ) ) - return false; - - IndexType shmem = blockSize. x * sizeof( RealType ); - /*** - * Depending on the blockSize we generate appropriate template instance. - */ - switch( blockSize. 
x ) - { - case 512: - tnlCUDAReductionKernel< Operation, 512 > - <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); - break; - case 256: - tnlCUDAReductionKernel< Operation, 256 > - <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); - break; - case 128: - tnlCUDAReductionKernel< Operation, 128 > - <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); - break; - case 64: - tnlCUDAReductionKernel< Operation, 64 > - <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); - break; - case 32: - tnlCUDAReductionKernel< Operation, 32 > - <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); - break; - case 16: - tnlCUDAReductionKernel< Operation, 16 > - <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); - break; - case 8: - tnlCUDAReductionKernel< Operation, 8 > - <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); - break; - case 4: - tnlCUDAReductionKernel< Operation, 4 > - <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); - break; - case 2: - tnlCUDAReductionKernel< Operation, 2 > - <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); - break; - case 1: - tnlAssert( false, cerr << "blockSize should not be 1." << endl ); - break; - default: - tnlAssert( false, cerr << "Block size is " << blockSize. x << " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." ); - break; - } - return gridSize. 
x; -} -#endif - -template< typename Operation > -bool tnlCUDALongVectorReduction( const Operation& operation, - const typename Operation :: IndexType size, - const typename Operation :: RealType* deviceInput1, - const typename Operation :: RealType* deviceInput2, - typename Operation :: RealType& result ) -{ -#ifdef HAVE_CUDA - - typedef typename Operation :: IndexType IndexType; - typedef typename Operation :: RealType RealType; - - /**** - * First check if the input array(s) is/are large enough for the reduction on GPU. - * Otherwise copy it/them to host and reduce on CPU. - */ - RealType hostArray1[ maxGPUReductionDataSize ]; - RealType hostArray2[ maxGPUReductionDataSize ]; - if( size <= maxGPUReductionDataSize ) - { - if( ! copyMemoryCudaToHost( hostArray1, deviceInput1, size ) ) - return false; - if( deviceInput2 && ! copyMemoryCudaToHost( hostArray2, deviceInput2, size ) ) - return false; - result = operation. initialValueOnHost( 0, hostArray1, hostArray2 ); - for( IndexType i = 1; i < size; i ++ ) - result = operation. reduceOnHost( i, result, hostArray1, hostArray2 ); - return true; - } - - /**** - * Reduce the data on the CUDA device. - */ - RealType* deviceAux1( 0 ), *deviceAux2( 0 ); - IndexType reducedSize = reduceOnCudaDevice( operation, - size, - deviceInput1, - deviceInput2, - deviceAux1 ); - - while( reducedSize > maxGPUReductionDataSize ) - { - reducedSize = reduceOnCudaDevice( operation, - reducedSize, - deviceAux1, - ( RealType* ) 0, - deviceAux2 ); - Swap( deviceAux1, deviceAux2 ); - } - - /*** - * Transfer the reduced data from device to host. - */ - RealType resultArray[ maxGPUReductionDataSize ]; - if( ! copyMemoryCudaToHost( resultArray, deviceAux1, reducedSize ) ) - return false; - - /*** - * Reduce the data on the host system. - */ - result = operation. initialValueOnHost( 0, resultArray, ( RealType* ) 0 ); - for( IndexType i = 1; i < reducedSize; i ++ ) - result = operation. 
reduceOnHost( i, result, resultArray, ( RealType*) 0 ); - - /**** - * Free the memory allocated on the device. - */ - if( deviceAux1 && ! freeMemoryCuda( deviceAux1 ) ) - return false; - if( deviceAux2 && ! freeMemoryCuda( deviceAux2 ) ) - return false; - - - return true; -#else - cerr << "I am sorry but CUDA support is missing on this system " << __FILE__ << " line " << __LINE__ << "." << endl; - return false; -#endif -}; - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -// TODO: result of comparison should not be returned!!! -template< typename Type, typename Index > -bool tnlCUDALongVectorComparison( const Index size, - const Type* deviceInput1, - const Type* deviceInput2, - bool* deviceBoolAux = 0, - Type* deviceAux = 0 ) -{ -#ifdef HAVE_CUDA - tnlAssert( size > 0, - cerr << "You try to compare two CUDA long vectors with non-positive size." << endl - << "The size is " << size ); - //tnlVector< bool, tnlCuda, Index > boolArray( "tnlCUDALongVectorComparison:bool_array" ); - bool* myDeviceBoolAux( 0 ); - if( ! deviceBoolAux ) - { - //if( ! boolArray. setSize( size ) ) - if( ! allocateMemoryCuda( myDeviceBoolAux, size ) ) - return false; - deviceBoolAux = myDeviceBoolAux; - } - dim3 blockSize( 0 ), gridSize( 0 ); - blockSize. x = 256; - gridSize. x = size / blockSize. x + 1; - - //compareTwoVectorsElementwise<<< gridSize, blockSize >>>( size, - // deviceInput1, - // deviceInput2, - // deviceBoolAux ); - if( ! checkCudaDevice ) - return false; - bool result; - if( ! tnlCUDALongVectorReduction< bool, bool, Index, tnlParallelReductionMin >( size, - deviceBoolAux, - ( bool* ) NULL, - result, - 0 ) ) - - - return false; - return result; -#else - cerr << "I am sorry but CUDA support is missing on this system " << __FILE__ << " line " << __LINE__ << "." 
<< endl; - return; -#endif -} - -#endif /* CUDALONGVECTORKERNELS_H_ */ diff --git a/src/implementation/core/cuda/CMakeLists.txt b/src/implementation/core/cuda/CMakeLists.txt index 6d119a124fbc6d614a5d5856fa1166fd1783e68c..54f83098f4c7bb940391702a4b68a08c87bbffc6 100755 --- a/src/implementation/core/cuda/CMakeLists.txt +++ b/src/implementation/core/cuda/CMakeLists.txt @@ -1,15 +1,19 @@ -SET( headers reduction_impl.h - reduction-operations_impl.h ) +SET( headers cuda-reduction_impl.h ) SET( CURRENT_DIR ${CMAKE_SOURCE_DIR}/src/implementation/core/cuda ) IF( BUILD_CUDA ) set( tnl_implementation_core_cuda_CUDA__SOURCES - ${CURRENT_DIR}/reduction-operations_impl.cu - PARENT_SCOPE ) + ${CURRENT_DIR}/cuda-reduction_impl.cu + PARENT_SCOPE ) +else() + set( tnl_implementation_core_cuda_SOURCES + ${CURRENT_DIR}/cuda-reduction_impl.cpp + PARENT_SCOPE ) endif() set( tnl_implementation_core_cuda_SOURCES + ${tnl_implementation_core_cuda_SOURCES} ${CURRENT_DIR}/device-check.cpp PARENT_SCOPE ) diff --git a/src/implementation/core/cuda/cuda-reduction_impl.cpp b/src/implementation/core/cuda/cuda-reduction_impl.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e0818522af3a9ecb132a27cb459bf2e895659004 --- /dev/null +++ b/src/implementation/core/cuda/cuda-reduction_impl.cpp @@ -0,0 +1,895 @@ +/*************************************************************************** + cuda-reduction_impl.cpp - description + ------------------- + begin : Mar 24, 2013 + copyright : (C) 2013 by Tomas Oberhuber + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/*************************************************************************** + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU General Public License as published by * + * the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. 
* + * * + ***************************************************************************/ + +#include <core/cuda/reduction-operations.h> +#include <core/cuda/cuda-reduction.h> + +#ifdef TEMPLATE_EXPLICIT_INSTANTIATION +/**** + * Sum + */ + +extern template bool reductionOnCudaDevice< tnlParallelReductionSum< char, int > > + ( const tnlParallelReductionSum< char, int >& operation, + const typename tnlParallelReductionSum< char, int > :: IndexType size, + const typename tnlParallelReductionSum< char, int > :: RealType* deviceInput1, + const typename tnlParallelReductionSum< char, int > :: RealType* deviceInput2, + typename tnlParallelReductionSum< char, int > :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionSum< int, int > > + ( const tnlParallelReductionSum< int, int >& operation, + const typename tnlParallelReductionSum< int, int > :: IndexType size, + const typename tnlParallelReductionSum< int, int > :: RealType* deviceInput1, + const typename tnlParallelReductionSum< int, int > :: RealType* deviceInput2, + typename tnlParallelReductionSum< int, int > :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionSum< float, int > > + ( const tnlParallelReductionSum< float, int >& operation, + const typename tnlParallelReductionSum< float, int > :: IndexType size, + const typename tnlParallelReductionSum< float, int > :: RealType* deviceInput1, + const typename tnlParallelReductionSum< float, int > :: RealType* deviceInput2, + typename tnlParallelReductionSum< float, int> :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionSum< double, int > > + ( const tnlParallelReductionSum< double, int>& operation, + const typename tnlParallelReductionSum< double, int > :: IndexType size, + const typename tnlParallelReductionSum< double, int > :: RealType* deviceInput1, + const typename tnlParallelReductionSum< double, int > :: RealType* deviceInput2, + typename 
tnlParallelReductionSum< double, int> :: ResultType& result ); + +/*extern template bool reductionOnCudaDevice< tnlParallelReductionSum< long double, int > > + ( const tnlParallelReductionSum< long double, int>& operation, + const typename tnlParallelReductionSum< long double, int > :: IndexType size, + const typename tnlParallelReductionSum< long double, int > :: RealType* deviceInput1, + const typename tnlParallelReductionSum< long double, int > :: RealType* deviceInput2, + typename tnlParallelReductionSum< long double, int> :: ResultType& result );*/ + +extern template bool reductionOnCudaDevice< tnlParallelReductionSum< char, long int > > + ( const tnlParallelReductionSum< char, long int >& operation, + const typename tnlParallelReductionSum< char, long int > :: IndexType size, + const typename tnlParallelReductionSum< char, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionSum< char, long int > :: RealType* deviceInput2, + typename tnlParallelReductionSum< char, long int > :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionSum< int, long int > > + ( const tnlParallelReductionSum< int, long int >& operation, + const typename tnlParallelReductionSum< int, long int > :: IndexType size, + const typename tnlParallelReductionSum< int, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionSum< int, long int > :: RealType* deviceInput2, + typename tnlParallelReductionSum< int, long int > :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionSum< float, long int > > + ( const tnlParallelReductionSum< float, long int >& operation, + const typename tnlParallelReductionSum< float, long int > :: IndexType size, + const typename tnlParallelReductionSum< float, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionSum< float, long int > :: RealType* deviceInput2, + typename tnlParallelReductionSum< float, long int> :: ResultType& 
result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionSum< double, long int > > + ( const tnlParallelReductionSum< double, long int>& operation, + const typename tnlParallelReductionSum< double, long int > :: IndexType size, + const typename tnlParallelReductionSum< double, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionSum< double, long int > :: RealType* deviceInput2, + typename tnlParallelReductionSum< double, long int> :: ResultType& result ); + +/*extern template bool reductionOnCudaDevice< tnlParallelReductionSum< long double, long int > > + ( const tnlParallelReductionSum< long double, long int>& operation, + const typename tnlParallelReductionSum< long double, long int > :: IndexType size, + const typename tnlParallelReductionSum< long double, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionSum< long double, long int > :: RealType* deviceInput2, + typename tnlParallelReductionSum< long double, long int> :: ResultType& result );*/ + +/**** + * Min + */ + +extern template bool reductionOnCudaDevice< tnlParallelReductionMin< char, int > > + ( const tnlParallelReductionMin< char, int >& operation, + const typename tnlParallelReductionMin< char, int > :: IndexType size, + const typename tnlParallelReductionMin< char, int > :: RealType* deviceInput1, + const typename tnlParallelReductionMin< char, int > :: RealType* deviceInput2, + typename tnlParallelReductionMin< char, int > :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionMin< int, int > > + ( const tnlParallelReductionMin< int, int >& operation, + const typename tnlParallelReductionMin< int, int > :: IndexType size, + const typename tnlParallelReductionMin< int, int > :: RealType* deviceInput1, + const typename tnlParallelReductionMin< int, int > :: RealType* deviceInput2, + typename tnlParallelReductionMin< int, int > :: ResultType& result ); + +extern template bool reductionOnCudaDevice< 
tnlParallelReductionMin< float, int > > + ( const tnlParallelReductionMin< float, int >& operation, + const typename tnlParallelReductionMin< float, int > :: IndexType size, + const typename tnlParallelReductionMin< float, int > :: RealType* deviceInput1, + const typename tnlParallelReductionMin< float, int > :: RealType* deviceInput2, + typename tnlParallelReductionMin< float, int> :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionMin< double, int > > + ( const tnlParallelReductionMin< double, int>& operation, + const typename tnlParallelReductionMin< double, int > :: IndexType size, + const typename tnlParallelReductionMin< double, int > :: RealType* deviceInput1, + const typename tnlParallelReductionMin< double, int > :: RealType* deviceInput2, + typename tnlParallelReductionMin< double, int> :: ResultType& result ); + +/*extern template bool reductionOnCudaDevice< tnlParallelReductionMin< long double, int > > + ( const tnlParallelReductionMin< long double, int>& operation, + const typename tnlParallelReductionMin< long double, int > :: IndexType size, + const typename tnlParallelReductionMin< long double, int > :: RealType* deviceInput1, + const typename tnlParallelReductionMin< long double, int > :: RealType* deviceInput2, + typename tnlParallelReductionMin< long double, int> :: ResultType& result );*/ + +extern template bool reductionOnCudaDevice< tnlParallelReductionMin< char, long int > > + ( const tnlParallelReductionMin< char, long int >& operation, + const typename tnlParallelReductionMin< char, long int > :: IndexType size, + const typename tnlParallelReductionMin< char, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionMin< char, long int > :: RealType* deviceInput2, + typename tnlParallelReductionMin< char, long int > :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionMin< int, long int > > + ( const tnlParallelReductionMin< int, long int >& 
operation, + const typename tnlParallelReductionMin< int, long int > :: IndexType size, + const typename tnlParallelReductionMin< int, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionMin< int, long int > :: RealType* deviceInput2, + typename tnlParallelReductionMin< int, long int > :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionMin< float, long int > > + ( const tnlParallelReductionMin< float, long int >& operation, + const typename tnlParallelReductionMin< float, long int > :: IndexType size, + const typename tnlParallelReductionMin< float, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionMin< float, long int > :: RealType* deviceInput2, + typename tnlParallelReductionMin< float, long int> :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionMin< double, long int > > + ( const tnlParallelReductionMin< double, long int>& operation, + const typename tnlParallelReductionMin< double, long int > :: IndexType size, + const typename tnlParallelReductionMin< double, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionMin< double, long int > :: RealType* deviceInput2, + typename tnlParallelReductionMin< double, long int> :: ResultType& result ); + +/*extern template bool reductionOnCudaDevice< tnlParallelReductionMin< long double, long int > > + ( const tnlParallelReductionMin< long double, long int>& operation, + const typename tnlParallelReductionMin< long double, long int > :: IndexType size, + const typename tnlParallelReductionMin< long double, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionMin< long double, long int > :: RealType* deviceInput2, + typename tnlParallelReductionMin< long double, long int> :: ResultType& result );*/ + +/**** + * Max + */ + +extern template bool reductionOnCudaDevice< tnlParallelReductionMax< char, int > > + ( const tnlParallelReductionMax< char, int >& 
operation, + const typename tnlParallelReductionMax< char, int > :: IndexType size, + const typename tnlParallelReductionMax< char, int > :: RealType* deviceInput1, + const typename tnlParallelReductionMax< char, int > :: RealType* deviceInput2, + typename tnlParallelReductionMax< char, int > :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionMax< int, int > > + ( const tnlParallelReductionMax< int, int >& operation, + const typename tnlParallelReductionMax< int, int > :: IndexType size, + const typename tnlParallelReductionMax< int, int > :: RealType* deviceInput1, + const typename tnlParallelReductionMax< int, int > :: RealType* deviceInput2, + typename tnlParallelReductionMax< int, int > :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionMax< float, int > > + ( const tnlParallelReductionMax< float, int >& operation, + const typename tnlParallelReductionMax< float, int > :: IndexType size, + const typename tnlParallelReductionMax< float, int > :: RealType* deviceInput1, + const typename tnlParallelReductionMax< float, int > :: RealType* deviceInput2, + typename tnlParallelReductionMax< float, int> :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionMax< double, int > > + ( const tnlParallelReductionMax< double, int>& operation, + const typename tnlParallelReductionMax< double, int > :: IndexType size, + const typename tnlParallelReductionMax< double, int > :: RealType* deviceInput1, + const typename tnlParallelReductionMax< double, int > :: RealType* deviceInput2, + typename tnlParallelReductionMax< double, int> :: ResultType& result ); + +/*extern template bool reductionOnCudaDevice< tnlParallelReductionMax< long double, int > > + ( const tnlParallelReductionMax< long double, int>& operation, + const typename tnlParallelReductionMax< long double, int > :: IndexType size, + const typename tnlParallelReductionMax< long double, int > :: 
RealType* deviceInput1, + const typename tnlParallelReductionMax< long double, int > :: RealType* deviceInput2, + typename tnlParallelReductionMax< long double, int> :: ResultType& result );*/ + +extern template bool reductionOnCudaDevice< tnlParallelReductionMax< char, long int > > + ( const tnlParallelReductionMax< char, long int >& operation, + const typename tnlParallelReductionMax< char, long int > :: IndexType size, + const typename tnlParallelReductionMax< char, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionMax< char, long int > :: RealType* deviceInput2, + typename tnlParallelReductionMax< char, long int > :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionMax< int, long int > > + ( const tnlParallelReductionMax< int, long int >& operation, + const typename tnlParallelReductionMax< int, long int > :: IndexType size, + const typename tnlParallelReductionMax< int, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionMax< int, long int > :: RealType* deviceInput2, + typename tnlParallelReductionMax< int, long int > :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionMax< float, long int > > + ( const tnlParallelReductionMax< float, long int >& operation, + const typename tnlParallelReductionMax< float, long int > :: IndexType size, + const typename tnlParallelReductionMax< float, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionMax< float, long int > :: RealType* deviceInput2, + typename tnlParallelReductionMax< float, long int> :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionMax< double, long int > > + ( const tnlParallelReductionMax< double, long int>& operation, + const typename tnlParallelReductionMax< double, long int > :: IndexType size, + const typename tnlParallelReductionMax< double, long int > :: RealType* deviceInput1, + const typename 
tnlParallelReductionMax< double, long int > :: RealType* deviceInput2, + typename tnlParallelReductionMax< double, long int> :: ResultType& result ); + +/*extern template bool reductionOnCudaDevice< tnlParallelReductionMax< long double, long int > > + ( const tnlParallelReductionMax< long double, long int>& operation, + const typename tnlParallelReductionMax< long double, long int > :: IndexType size, + const typename tnlParallelReductionMax< long double, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionMax< long double, long int > :: RealType* deviceInput2, + typename tnlParallelReductionMax< long double, long int> :: ResultType& result );*/ + +/**** + * Abs sum + */ + +extern template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< char, int > > + ( const tnlParallelReductionAbsSum< char, int >& operation, + const typename tnlParallelReductionAbsSum< char, int > :: IndexType size, + const typename tnlParallelReductionAbsSum< char, int > :: RealType* deviceInput1, + const typename tnlParallelReductionAbsSum< char, int > :: RealType* deviceInput2, + typename tnlParallelReductionAbsSum< char, int > :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< int, int > > + ( const tnlParallelReductionAbsSum< int, int >& operation, + const typename tnlParallelReductionAbsSum< int, int > :: IndexType size, + const typename tnlParallelReductionAbsSum< int, int > :: RealType* deviceInput1, + const typename tnlParallelReductionAbsSum< int, int > :: RealType* deviceInput2, + typename tnlParallelReductionAbsSum< int, int > :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< float, int > > + ( const tnlParallelReductionAbsSum< float, int >& operation, + const typename tnlParallelReductionAbsSum< float, int > :: IndexType size, + const typename tnlParallelReductionAbsSum< float, int > :: RealType* deviceInput1, + const typename tnlParallelReductionAbsSum< 
float, int > :: RealType* deviceInput2, + typename tnlParallelReductionAbsSum< float, int> :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< double, int > > + ( const tnlParallelReductionAbsSum< double, int>& operation, + const typename tnlParallelReductionAbsSum< double, int > :: IndexType size, + const typename tnlParallelReductionAbsSum< double, int > :: RealType* deviceInput1, + const typename tnlParallelReductionAbsSum< double, int > :: RealType* deviceInput2, + typename tnlParallelReductionAbsSum< double, int> :: ResultType& result ); + +/*extern template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< long double, int > > + ( const tnlParallelReductionAbsSum< long double, int>& operation, + const typename tnlParallelReductionAbsSum< long double, int > :: IndexType size, + const typename tnlParallelReductionAbsSum< long double, int > :: RealType* deviceInput1, + const typename tnlParallelReductionAbsSum< long double, int > :: RealType* deviceInput2, + typename tnlParallelReductionAbsSum< long double, int> :: ResultType& result );*/ + +extern template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< char, long int > > + ( const tnlParallelReductionAbsSum< char, long int >& operation, + const typename tnlParallelReductionAbsSum< char, long int > :: IndexType size, + const typename tnlParallelReductionAbsSum< char, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionAbsSum< char, long int > :: RealType* deviceInput2, + typename tnlParallelReductionAbsSum< char, long int > :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< int, long int > > + ( const tnlParallelReductionAbsSum< int, long int >& operation, + const typename tnlParallelReductionAbsSum< int, long int > :: IndexType size, + const typename tnlParallelReductionAbsSum< int, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionAbsSum< int, long int 
> :: RealType* deviceInput2, + typename tnlParallelReductionAbsSum< int, long int > :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< float, long int > > + ( const tnlParallelReductionAbsSum< float, long int >& operation, + const typename tnlParallelReductionAbsSum< float, long int > :: IndexType size, + const typename tnlParallelReductionAbsSum< float, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionAbsSum< float, long int > :: RealType* deviceInput2, + typename tnlParallelReductionAbsSum< float, long int> :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< double, long int > > + ( const tnlParallelReductionAbsSum< double, long int>& operation, + const typename tnlParallelReductionAbsSum< double, long int > :: IndexType size, + const typename tnlParallelReductionAbsSum< double, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionAbsSum< double, long int > :: RealType* deviceInput2, + typename tnlParallelReductionAbsSum< double, long int> :: ResultType& result ); + +/*extern template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< long double, long int > > + ( const tnlParallelReductionAbsSum< long double, long int>& operation, + const typename tnlParallelReductionAbsSum< long double, long int > :: IndexType size, + const typename tnlParallelReductionAbsSum< long double, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionAbsSum< long double, long int > :: RealType* deviceInput2, + typename tnlParallelReductionAbsSum< long double, long int> :: ResultType& result );*/ + +/**** + * Abs min + */ + +extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< char, int > > + ( const tnlParallelReductionAbsMin< char, int >& operation, + const typename tnlParallelReductionAbsMin< char, int > :: IndexType size, + const typename tnlParallelReductionAbsMin< char, int > :: RealType* 
deviceInput1, + const typename tnlParallelReductionAbsMin< char, int > :: RealType* deviceInput2, + typename tnlParallelReductionAbsMin< char, int > :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< int, int > > + ( const tnlParallelReductionAbsMin< int, int >& operation, + const typename tnlParallelReductionAbsMin< int, int > :: IndexType size, + const typename tnlParallelReductionAbsMin< int, int > :: RealType* deviceInput1, + const typename tnlParallelReductionAbsMin< int, int > :: RealType* deviceInput2, + typename tnlParallelReductionAbsMin< int, int > :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< float, int > > + ( const tnlParallelReductionAbsMin< float, int >& operation, + const typename tnlParallelReductionAbsMin< float, int > :: IndexType size, + const typename tnlParallelReductionAbsMin< float, int > :: RealType* deviceInput1, + const typename tnlParallelReductionAbsMin< float, int > :: RealType* deviceInput2, + typename tnlParallelReductionAbsMin< float, int> :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< double, int > > + ( const tnlParallelReductionAbsMin< double, int>& operation, + const typename tnlParallelReductionAbsMin< double, int > :: IndexType size, + const typename tnlParallelReductionAbsMin< double, int > :: RealType* deviceInput1, + const typename tnlParallelReductionAbsMin< double, int > :: RealType* deviceInput2, + typename tnlParallelReductionAbsMin< double, int> :: ResultType& result ); + +/*extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< long double, int > > + ( const tnlParallelReductionAbsMin< long double, int>& operation, + const typename tnlParallelReductionAbsMin< long double, int > :: IndexType size, + const typename tnlParallelReductionAbsMin< long double, int > :: RealType* deviceInput1, + const typename tnlParallelReductionAbsMin< long double, int > 
:: RealType* deviceInput2, + typename tnlParallelReductionAbsMin< long double, int> :: ResultType& result );*/ + +extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< char, long int > > + ( const tnlParallelReductionAbsMin< char, long int >& operation, + const typename tnlParallelReductionAbsMin< char, long int > :: IndexType size, + const typename tnlParallelReductionAbsMin< char, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionAbsMin< char, long int > :: RealType* deviceInput2, + typename tnlParallelReductionAbsMin< char, long int > :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< int, long int > > + ( const tnlParallelReductionAbsMin< int, long int >& operation, + const typename tnlParallelReductionAbsMin< int, long int > :: IndexType size, + const typename tnlParallelReductionAbsMin< int, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionAbsMin< int, long int > :: RealType* deviceInput2, + typename tnlParallelReductionAbsMin< int, long int > :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< float, long int > > + ( const tnlParallelReductionAbsMin< float, long int >& operation, + const typename tnlParallelReductionAbsMin< float, long int > :: IndexType size, + const typename tnlParallelReductionAbsMin< float, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionAbsMin< float, long int > :: RealType* deviceInput2, + typename tnlParallelReductionAbsMin< float, long int> :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< double, long int > > + ( const tnlParallelReductionAbsMin< double, long int>& operation, + const typename tnlParallelReductionAbsMin< double, long int > :: IndexType size, + const typename tnlParallelReductionAbsMin< double, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionAbsMin< 
double, long int > :: RealType* deviceInput2, + typename tnlParallelReductionAbsMin< double, long int> :: ResultType& result ); + +/*extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< long double, long int > > + ( const tnlParallelReductionAbsMin< long double, long int>& operation, + const typename tnlParallelReductionAbsMin< long double, long int > :: IndexType size, + const typename tnlParallelReductionAbsMin< long double, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionAbsMin< long double, long int > :: RealType* deviceInput2, + typename tnlParallelReductionAbsMin< long double, long int> :: ResultType& result );*/ +/**** + * Abs max + */ + +extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< char, int > > + ( const tnlParallelReductionAbsMax< char, int >& operation, + const typename tnlParallelReductionAbsMax< char, int > :: IndexType size, + const typename tnlParallelReductionAbsMax< char, int > :: RealType* deviceInput1, + const typename tnlParallelReductionAbsMax< char, int > :: RealType* deviceInput2, + typename tnlParallelReductionAbsMax< char, int > :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< int, int > > + ( const tnlParallelReductionAbsMax< int, int >& operation, + const typename tnlParallelReductionAbsMax< int, int > :: IndexType size, + const typename tnlParallelReductionAbsMax< int, int > :: RealType* deviceInput1, + const typename tnlParallelReductionAbsMax< int, int > :: RealType* deviceInput2, + typename tnlParallelReductionAbsMax< int, int > :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< float, int > > + ( const tnlParallelReductionAbsMax< float, int >& operation, + const typename tnlParallelReductionAbsMax< float, int > :: IndexType size, + const typename tnlParallelReductionAbsMax< float, int > :: RealType* deviceInput1, + const typename tnlParallelReductionAbsMax< float, 
int > :: RealType* deviceInput2, + typename tnlParallelReductionAbsMax< float, int> :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< double, int > > + ( const tnlParallelReductionAbsMax< double, int>& operation, + const typename tnlParallelReductionAbsMax< double, int > :: IndexType size, + const typename tnlParallelReductionAbsMax< double, int > :: RealType* deviceInput1, + const typename tnlParallelReductionAbsMax< double, int > :: RealType* deviceInput2, + typename tnlParallelReductionAbsMax< double, int> :: ResultType& result ); + +/*extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< long double, int > > + ( const tnlParallelReductionAbsMax< long double, int>& operation, + const typename tnlParallelReductionAbsMax< long double, int > :: IndexType size, + const typename tnlParallelReductionAbsMax< long double, int > :: RealType* deviceInput1, + const typename tnlParallelReductionAbsMax< long double, int > :: RealType* deviceInput2, + typename tnlParallelReductionAbsMax< long double, int> :: ResultType& result );*/ + +extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< char, long int > > + ( const tnlParallelReductionAbsMax< char, long int >& operation, + const typename tnlParallelReductionAbsMax< char, long int > :: IndexType size, + const typename tnlParallelReductionAbsMax< char, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionAbsMax< char, long int > :: RealType* deviceInput2, + typename tnlParallelReductionAbsMax< char, long int > :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< int, long int > > + ( const tnlParallelReductionAbsMax< int, long int >& operation, + const typename tnlParallelReductionAbsMax< int, long int > :: IndexType size, + const typename tnlParallelReductionAbsMax< int, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionAbsMax< int, long int > :: 
RealType* deviceInput2, + typename tnlParallelReductionAbsMax< int, long int > :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< float, long int > > + ( const tnlParallelReductionAbsMax< float, long int >& operation, + const typename tnlParallelReductionAbsMax< float, long int > :: IndexType size, + const typename tnlParallelReductionAbsMax< float, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionAbsMax< float, long int > :: RealType* deviceInput2, + typename tnlParallelReductionAbsMax< float, long int> :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< double, long int > > + ( const tnlParallelReductionAbsMax< double, long int>& operation, + const typename tnlParallelReductionAbsMax< double, long int > :: IndexType size, + const typename tnlParallelReductionAbsMax< double, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionAbsMax< double, long int > :: RealType* deviceInput2, + typename tnlParallelReductionAbsMax< double, long int> :: ResultType& result ); + +/*extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< long double, long int > > + ( const tnlParallelReductionAbsMax< long double, long int>& operation, + const typename tnlParallelReductionAbsMax< long double, long int > :: IndexType size, + const typename tnlParallelReductionAbsMax< long double, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionAbsMax< long double, long int > :: RealType* deviceInput2, + typename tnlParallelReductionAbsMax< long double, long int> :: ResultType& result );*/ + +/**** + * Logical AND + */ +extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< char, int > > + ( const tnlParallelReductionLogicalAnd< char, int >& operation, + const typename tnlParallelReductionLogicalAnd< char, int > :: IndexType size, + const typename tnlParallelReductionLogicalAnd< char, int > :: 
RealType* deviceInput1, + const typename tnlParallelReductionLogicalAnd< char, int > :: RealType* deviceInput2, + typename tnlParallelReductionLogicalAnd< char, int > :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< int, int > > + ( const tnlParallelReductionLogicalAnd< int, int >& operation, + const typename tnlParallelReductionLogicalAnd< int, int > :: IndexType size, + const typename tnlParallelReductionLogicalAnd< int, int > :: RealType* deviceInput1, + const typename tnlParallelReductionLogicalAnd< int, int > :: RealType* deviceInput2, + typename tnlParallelReductionLogicalAnd< int, int > :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< float, int > > + ( const tnlParallelReductionLogicalAnd< float, int >& operation, + const typename tnlParallelReductionLogicalAnd< float, int > :: IndexType size, + const typename tnlParallelReductionLogicalAnd< float, int > :: RealType* deviceInput1, + const typename tnlParallelReductionLogicalAnd< float, int > :: RealType* deviceInput2, + typename tnlParallelReductionLogicalAnd< float, int> :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< double, int > > + ( const tnlParallelReductionLogicalAnd< double, int>& operation, + const typename tnlParallelReductionLogicalAnd< double, int > :: IndexType size, + const typename tnlParallelReductionLogicalAnd< double, int > :: RealType* deviceInput1, + const typename tnlParallelReductionLogicalAnd< double, int > :: RealType* deviceInput2, + typename tnlParallelReductionLogicalAnd< double, int> :: ResultType& result ); + +/*extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< long double, int > > + ( const tnlParallelReductionLogicalAnd< long double, int>& operation, + const typename tnlParallelReductionLogicalAnd< long double, int > :: IndexType size, + const typename tnlParallelReductionLogicalAnd< 
long double, int > :: RealType* deviceInput1, + const typename tnlParallelReductionLogicalAnd< long double, int > :: RealType* deviceInput2, + typename tnlParallelReductionLogicalAnd< long double, int> :: ResultType& result );*/ + +extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< char, long int > > + ( const tnlParallelReductionLogicalAnd< char, long int >& operation, + const typename tnlParallelReductionLogicalAnd< char, long int > :: IndexType size, + const typename tnlParallelReductionLogicalAnd< char, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionLogicalAnd< char, long int > :: RealType* deviceInput2, + typename tnlParallelReductionLogicalAnd< char, long int > :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< int, long int > > + ( const tnlParallelReductionLogicalAnd< int, long int >& operation, + const typename tnlParallelReductionLogicalAnd< int, long int > :: IndexType size, + const typename tnlParallelReductionLogicalAnd< int, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionLogicalAnd< int, long int > :: RealType* deviceInput2, + typename tnlParallelReductionLogicalAnd< int, long int > :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< float, long int > > + ( const tnlParallelReductionLogicalAnd< float, long int >& operation, + const typename tnlParallelReductionLogicalAnd< float, long int > :: IndexType size, + const typename tnlParallelReductionLogicalAnd< float, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionLogicalAnd< float, long int > :: RealType* deviceInput2, + typename tnlParallelReductionLogicalAnd< float, long int> :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< double, long int > > + ( const tnlParallelReductionLogicalAnd< double, long int>& operation, + const typename 
tnlParallelReductionLogicalAnd< double, long int > :: IndexType size, + const typename tnlParallelReductionLogicalAnd< double, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionLogicalAnd< double, long int > :: RealType* deviceInput2, + typename tnlParallelReductionLogicalAnd< double, long int> :: ResultType& result ); + +/*extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< long double, long int > > + ( const tnlParallelReductionLogicalAnd< long double, long int>& operation, + const typename tnlParallelReductionLogicalAnd< long double, long int > :: IndexType size, + const typename tnlParallelReductionLogicalAnd< long double, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionLogicalAnd< long double, long int > :: RealType* deviceInput2, + typename tnlParallelReductionLogicalAnd< long double, long int> :: ResultType& result );*/ + +/**** + * Logical OR + */ +extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< char, int > > + ( const tnlParallelReductionLogicalOr< char, int >& operation, + const typename tnlParallelReductionLogicalOr< char, int > :: IndexType size, + const typename tnlParallelReductionLogicalOr< char, int > :: RealType* deviceInput1, + const typename tnlParallelReductionLogicalOr< char, int > :: RealType* deviceInput2, + typename tnlParallelReductionLogicalOr< char, int > :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< int, int > > + ( const tnlParallelReductionLogicalOr< int, int >& operation, + const typename tnlParallelReductionLogicalOr< int, int > :: IndexType size, + const typename tnlParallelReductionLogicalOr< int, int > :: RealType* deviceInput1, + const typename tnlParallelReductionLogicalOr< int, int > :: RealType* deviceInput2, + typename tnlParallelReductionLogicalOr< int, int > :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< float, 
int > > + ( const tnlParallelReductionLogicalOr< float, int >& operation, + const typename tnlParallelReductionLogicalOr< float, int > :: IndexType size, + const typename tnlParallelReductionLogicalOr< float, int > :: RealType* deviceInput1, + const typename tnlParallelReductionLogicalOr< float, int > :: RealType* deviceInput2, + typename tnlParallelReductionLogicalOr< float, int> :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< double, int > > + ( const tnlParallelReductionLogicalOr< double, int>& operation, + const typename tnlParallelReductionLogicalOr< double, int > :: IndexType size, + const typename tnlParallelReductionLogicalOr< double, int > :: RealType* deviceInput1, + const typename tnlParallelReductionLogicalOr< double, int > :: RealType* deviceInput2, + typename tnlParallelReductionLogicalOr< double, int> :: ResultType& result ); + +/*extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< long double, int > > + ( const tnlParallelReductionLogicalOr< long double, int>& operation, + const typename tnlParallelReductionLogicalOr< long double, int > :: IndexType size, + const typename tnlParallelReductionLogicalOr< long double, int > :: RealType* deviceInput1, + const typename tnlParallelReductionLogicalOr< long double, int > :: RealType* deviceInput2, + typename tnlParallelReductionLogicalOr< long double, int> :: ResultType& result );*/ + +extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< char, long int > > + ( const tnlParallelReductionLogicalOr< char, long int >& operation, + const typename tnlParallelReductionLogicalOr< char, long int > :: IndexType size, + const typename tnlParallelReductionLogicalOr< char, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionLogicalOr< char, long int > :: RealType* deviceInput2, + typename tnlParallelReductionLogicalOr< char, long int > :: ResultType& result ); + +extern template bool 
reductionOnCudaDevice< tnlParallelReductionLogicalOr< int, long int > > + ( const tnlParallelReductionLogicalOr< int, long int >& operation, + const typename tnlParallelReductionLogicalOr< int, long int > :: IndexType size, + const typename tnlParallelReductionLogicalOr< int, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionLogicalOr< int, long int > :: RealType* deviceInput2, + typename tnlParallelReductionLogicalOr< int, long int > :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< float, long int > > + ( const tnlParallelReductionLogicalOr< float, long int >& operation, + const typename tnlParallelReductionLogicalOr< float, long int > :: IndexType size, + const typename tnlParallelReductionLogicalOr< float, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionLogicalOr< float, long int > :: RealType* deviceInput2, + typename tnlParallelReductionLogicalOr< float, long int> :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< double, long int > > + ( const tnlParallelReductionLogicalOr< double, long int>& operation, + const typename tnlParallelReductionLogicalOr< double, long int > :: IndexType size, + const typename tnlParallelReductionLogicalOr< double, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionLogicalOr< double, long int > :: RealType* deviceInput2, + typename tnlParallelReductionLogicalOr< double, long int> :: ResultType& result ); + +/*extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< long double, long int > > + ( const tnlParallelReductionLogicalOr< long double, long int>& operation, + const typename tnlParallelReductionLogicalOr< long double, long int > :: IndexType size, + const typename tnlParallelReductionLogicalOr< long double, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionLogicalOr< long double, long int > :: RealType* 
deviceInput2, + typename tnlParallelReductionLogicalOr< long double, long int> :: ResultType& result );*/ + + +/**** + * Lp Norm + */ +extern template bool reductionOnCudaDevice< tnlParallelReductionLpNorm< float, int > > + ( const tnlParallelReductionLpNorm< float, int >& operation, + const typename tnlParallelReductionLpNorm< float, int > :: IndexType size, + const typename tnlParallelReductionLpNorm< float, int > :: RealType* deviceInput1, + const typename tnlParallelReductionLpNorm< float, int > :: RealType* deviceInput2, + typename tnlParallelReductionLpNorm< float, int> :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionLpNorm< double, int > > + ( const tnlParallelReductionLpNorm< double, int>& operation, + const typename tnlParallelReductionLpNorm< double, int > :: IndexType size, + const typename tnlParallelReductionLpNorm< double, int > :: RealType* deviceInput1, + const typename tnlParallelReductionLpNorm< double, int > :: RealType* deviceInput2, + typename tnlParallelReductionLpNorm< double, int> :: ResultType& result ); + +/*extern template bool reductionOnCudaDevice< tnlParallelReductionLpNorm< long double, int > > + ( const tnlParallelReductionLpNorm< long double, int>& operation, + const typename tnlParallelReductionLpNorm< long double, int > :: IndexType size, + const typename tnlParallelReductionLpNorm< long double, int > :: RealType* deviceInput1, + const typename tnlParallelReductionLpNorm< long double, int > :: RealType* deviceInput2, + typename tnlParallelReductionLpNorm< long double, int> :: ResultType& result );*/ + +extern template bool reductionOnCudaDevice< tnlParallelReductionLpNorm< char, long int > > + ( const tnlParallelReductionLpNorm< char, long int >& operation, + const typename tnlParallelReductionLpNorm< char, long int > :: IndexType size, + const typename tnlParallelReductionLpNorm< char, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionLpNorm< char, long int > :: 
RealType* deviceInput2, + typename tnlParallelReductionLpNorm< char, long int > :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionLpNorm< int, long int > > + ( const tnlParallelReductionLpNorm< int, long int >& operation, + const typename tnlParallelReductionLpNorm< int, long int > :: IndexType size, + const typename tnlParallelReductionLpNorm< int, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionLpNorm< int, long int > :: RealType* deviceInput2, + typename tnlParallelReductionLpNorm< int, long int > :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionLpNorm< float, long int > > + ( const tnlParallelReductionLpNorm< float, long int >& operation, + const typename tnlParallelReductionLpNorm< float, long int > :: IndexType size, + const typename tnlParallelReductionLpNorm< float, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionLpNorm< float, long int > :: RealType* deviceInput2, + typename tnlParallelReductionLpNorm< float, long int> :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionLpNorm< double, long int > > + ( const tnlParallelReductionLpNorm< double, long int>& operation, + const typename tnlParallelReductionLpNorm< double, long int > :: IndexType size, + const typename tnlParallelReductionLpNorm< double, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionLpNorm< double, long int > :: RealType* deviceInput2, + typename tnlParallelReductionLpNorm< double, long int> :: ResultType& result ); + +/*extern template bool reductionOnCudaDevice< tnlParallelReductionLpNorm< long double, long int > > + ( const tnlParallelReductionLpNorm< long double, long int>& operation, + const typename tnlParallelReductionLpNorm< long double, long int > :: IndexType size, + const typename tnlParallelReductionLpNorm< long double, long int > :: RealType* deviceInput1, + const typename 
tnlParallelReductionLpNorm< long double, long int > :: RealType* deviceInput2, + typename tnlParallelReductionLpNorm< long double, long int> :: ResultType& result );*/ + + +/**** + * Equalities + */ +extern template bool reductionOnCudaDevice< tnlParallelReductionEqualities< char, int > > + ( const tnlParallelReductionEqualities< char, int >& operation, + const typename tnlParallelReductionEqualities< char, int > :: IndexType size, + const typename tnlParallelReductionEqualities< char, int > :: RealType* deviceInput1, + const typename tnlParallelReductionEqualities< char, int > :: RealType* deviceInput2, + typename tnlParallelReductionEqualities< char, int > :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionEqualities< int, int > > + ( const tnlParallelReductionEqualities< int, int >& operation, + const typename tnlParallelReductionEqualities< int, int > :: IndexType size, + const typename tnlParallelReductionEqualities< int, int > :: RealType* deviceInput1, + const typename tnlParallelReductionEqualities< int, int > :: RealType* deviceInput2, + typename tnlParallelReductionEqualities< int, int > :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionEqualities< float, int > > + ( const tnlParallelReductionEqualities< float, int >& operation, + const typename tnlParallelReductionEqualities< float, int > :: IndexType size, + const typename tnlParallelReductionEqualities< float, int > :: RealType* deviceInput1, + const typename tnlParallelReductionEqualities< float, int > :: RealType* deviceInput2, + typename tnlParallelReductionEqualities< float, int> :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionEqualities< double, int > > + ( const tnlParallelReductionEqualities< double, int>& operation, + const typename tnlParallelReductionEqualities< double, int > :: IndexType size, + const typename tnlParallelReductionEqualities< double, int > :: 
RealType* deviceInput1, + const typename tnlParallelReductionEqualities< double, int > :: RealType* deviceInput2, + typename tnlParallelReductionEqualities< double, int> :: ResultType& result ); + +/*extern template bool reductionOnCudaDevice< tnlParallelReductionEqualities< long double, int > > + ( const tnlParallelReductionEqualities< long double, int>& operation, + const typename tnlParallelReductionEqualities< long double, int > :: IndexType size, + const typename tnlParallelReductionEqualities< long double, int > :: RealType* deviceInput1, + const typename tnlParallelReductionEqualities< long double, int > :: RealType* deviceInput2, + typename tnlParallelReductionEqualities< long double, int> :: ResultType& result );*/ + +extern template bool reductionOnCudaDevice< tnlParallelReductionEqualities< char, long int > > + ( const tnlParallelReductionEqualities< char, long int >& operation, + const typename tnlParallelReductionEqualities< char, long int > :: IndexType size, + const typename tnlParallelReductionEqualities< char, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionEqualities< char, long int > :: RealType* deviceInput2, + typename tnlParallelReductionEqualities< char, long int > :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionEqualities< int, long int > > + ( const tnlParallelReductionEqualities< int, long int >& operation, + const typename tnlParallelReductionEqualities< int, long int > :: IndexType size, + const typename tnlParallelReductionEqualities< int, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionEqualities< int, long int > :: RealType* deviceInput2, + typename tnlParallelReductionEqualities< int, long int > :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionEqualities< float, long int > > + ( const tnlParallelReductionEqualities< float, long int >& operation, + const typename tnlParallelReductionEqualities< 
float, long int > :: IndexType size, + const typename tnlParallelReductionEqualities< float, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionEqualities< float, long int > :: RealType* deviceInput2, + typename tnlParallelReductionEqualities< float, long int> :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionEqualities< double, long int > > + ( const tnlParallelReductionEqualities< double, long int>& operation, + const typename tnlParallelReductionEqualities< double, long int > :: IndexType size, + const typename tnlParallelReductionEqualities< double, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionEqualities< double, long int > :: RealType* deviceInput2, + typename tnlParallelReductionEqualities< double, long int> :: ResultType& result ); + +/*extern template bool reductionOnCudaDevice< tnlParallelReductionEqualities< long double, long int > > + ( const tnlParallelReductionEqualities< long double, long int>& operation, + const typename tnlParallelReductionEqualities< long double, long int > :: IndexType size, + const typename tnlParallelReductionEqualities< long double, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionEqualities< long double, long int > :: RealType* deviceInput2, + typename tnlParallelReductionEqualities< long double, long int> :: ResultType& result );*/ + + +/**** + * Inequalities + */ +extern template bool reductionOnCudaDevice< tnlParallelReductionInequalities< char, int > > + ( const tnlParallelReductionInequalities< char, int >& operation, + const typename tnlParallelReductionInequalities< char, int > :: IndexType size, + const typename tnlParallelReductionInequalities< char, int > :: RealType* deviceInput1, + const typename tnlParallelReductionInequalities< char, int > :: RealType* deviceInput2, + typename tnlParallelReductionInequalities< char, int > :: ResultType& result ); + +extern template bool reductionOnCudaDevice< 
tnlParallelReductionInequalities< int, int > > + ( const tnlParallelReductionInequalities< int, int >& operation, + const typename tnlParallelReductionInequalities< int, int > :: IndexType size, + const typename tnlParallelReductionInequalities< int, int > :: RealType* deviceInput1, + const typename tnlParallelReductionInequalities< int, int > :: RealType* deviceInput2, + typename tnlParallelReductionInequalities< int, int > :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionInequalities< float, int > > + ( const tnlParallelReductionInequalities< float, int >& operation, + const typename tnlParallelReductionInequalities< float, int > :: IndexType size, + const typename tnlParallelReductionInequalities< float, int > :: RealType* deviceInput1, + const typename tnlParallelReductionInequalities< float, int > :: RealType* deviceInput2, + typename tnlParallelReductionInequalities< float, int> :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionInequalities< double, int > > + ( const tnlParallelReductionInequalities< double, int>& operation, + const typename tnlParallelReductionInequalities< double, int > :: IndexType size, + const typename tnlParallelReductionInequalities< double, int > :: RealType* deviceInput1, + const typename tnlParallelReductionInequalities< double, int > :: RealType* deviceInput2, + typename tnlParallelReductionInequalities< double, int> :: ResultType& result ); + +/*extern template bool reductionOnCudaDevice< tnlParallelReductionInequalities< long double, int > > + ( const tnlParallelReductionInequalities< long double, int>& operation, + const typename tnlParallelReductionInequalities< long double, int > :: IndexType size, + const typename tnlParallelReductionInequalities< long double, int > :: RealType* deviceInput1, + const typename tnlParallelReductionInequalities< long double, int > :: RealType* deviceInput2, + typename tnlParallelReductionInequalities< long 
double, int> :: ResultType& result );*/ + +extern template bool reductionOnCudaDevice< tnlParallelReductionInequalities< char, long int > > + ( const tnlParallelReductionInequalities< char, long int >& operation, + const typename tnlParallelReductionInequalities< char, long int > :: IndexType size, + const typename tnlParallelReductionInequalities< char, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionInequalities< char, long int > :: RealType* deviceInput2, + typename tnlParallelReductionInequalities< char, long int > :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionInequalities< int, long int > > + ( const tnlParallelReductionInequalities< int, long int >& operation, + const typename tnlParallelReductionInequalities< int, long int > :: IndexType size, + const typename tnlParallelReductionInequalities< int, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionInequalities< int, long int > :: RealType* deviceInput2, + typename tnlParallelReductionInequalities< int, long int > :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionInequalities< float, long int > > + ( const tnlParallelReductionInequalities< float, long int >& operation, + const typename tnlParallelReductionInequalities< float, long int > :: IndexType size, + const typename tnlParallelReductionInequalities< float, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionInequalities< float, long int > :: RealType* deviceInput2, + typename tnlParallelReductionInequalities< float, long int> :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionInequalities< double, long int > > + ( const tnlParallelReductionInequalities< double, long int>& operation, + const typename tnlParallelReductionInequalities< double, long int > :: IndexType size, + const typename tnlParallelReductionInequalities< double, long int > :: RealType* 
deviceInput1, + const typename tnlParallelReductionInequalities< double, long int > :: RealType* deviceInput2, + typename tnlParallelReductionInequalities< double, long int> :: ResultType& result ); + +/*extern template bool reductionOnCudaDevice< tnlParallelReductionInequalities< long double, long int > > + ( const tnlParallelReductionInequalities< long double, long int>& operation, + const typename tnlParallelReductionInequalities< long double, long int > :: IndexType size, + const typename tnlParallelReductionInequalities< long double, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionInequalities< long double, long int > :: RealType* deviceInput2, + typename tnlParallelReductionInequalities< long double, long int> :: ResultType& result );*/ + + +/**** + * Sdot + */ +extern template bool reductionOnCudaDevice< tnlParallelReductionSdot< char, int > > + ( const tnlParallelReductionSdot< char, int >& operation, + const typename tnlParallelReductionSdot< char, int > :: IndexType size, + const typename tnlParallelReductionSdot< char, int > :: RealType* deviceInput1, + const typename tnlParallelReductionSdot< char, int > :: RealType* deviceInput2, + typename tnlParallelReductionSdot< char, int > :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionSdot< int, int > > + ( const tnlParallelReductionSdot< int, int >& operation, + const typename tnlParallelReductionSdot< int, int > :: IndexType size, + const typename tnlParallelReductionSdot< int, int > :: RealType* deviceInput1, + const typename tnlParallelReductionSdot< int, int > :: RealType* deviceInput2, + typename tnlParallelReductionSdot< int, int > :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionSdot< float, int > > + ( const tnlParallelReductionSdot< float, int >& operation, + const typename tnlParallelReductionSdot< float, int > :: IndexType size, + const typename tnlParallelReductionSdot< float, int > :: 
RealType* deviceInput1, + const typename tnlParallelReductionSdot< float, int > :: RealType* deviceInput2, + typename tnlParallelReductionSdot< float, int> :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionSdot< double, int > > + ( const tnlParallelReductionSdot< double, int>& operation, + const typename tnlParallelReductionSdot< double, int > :: IndexType size, + const typename tnlParallelReductionSdot< double, int > :: RealType* deviceInput1, + const typename tnlParallelReductionSdot< double, int > :: RealType* deviceInput2, + typename tnlParallelReductionSdot< double, int> :: ResultType& result ); + +/*extern template bool reductionOnCudaDevice< tnlParallelReductionSdot< long double, int > > + ( const tnlParallelReductionSdot< long double, int>& operation, + const typename tnlParallelReductionSdot< long double, int > :: IndexType size, + const typename tnlParallelReductionSdot< long double, int > :: RealType* deviceInput1, + const typename tnlParallelReductionSdot< long double, int > :: RealType* deviceInput2, + typename tnlParallelReductionSdot< long double, int> :: ResultType& result );*/ + +extern template bool reductionOnCudaDevice< tnlParallelReductionSdot< char, long int > > + ( const tnlParallelReductionSdot< char, long int >& operation, + const typename tnlParallelReductionSdot< char, long int > :: IndexType size, + const typename tnlParallelReductionSdot< char, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionSdot< char, long int > :: RealType* deviceInput2, + typename tnlParallelReductionSdot< char, long int > :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionSdot< int, long int > > + ( const tnlParallelReductionSdot< int, long int >& operation, + const typename tnlParallelReductionSdot< int, long int > :: IndexType size, + const typename tnlParallelReductionSdot< int, long int > :: RealType* deviceInput1, + const typename 
tnlParallelReductionSdot< int, long int > :: RealType* deviceInput2, + typename tnlParallelReductionSdot< int, long int > :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionSdot< float, long int > > + ( const tnlParallelReductionSdot< float, long int >& operation, + const typename tnlParallelReductionSdot< float, long int > :: IndexType size, + const typename tnlParallelReductionSdot< float, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionSdot< float, long int > :: RealType* deviceInput2, + typename tnlParallelReductionSdot< float, long int> :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionSdot< double, long int > > + ( const tnlParallelReductionSdot< double, long int>& operation, + const typename tnlParallelReductionSdot< double, long int > :: IndexType size, + const typename tnlParallelReductionSdot< double, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionSdot< double, long int > :: RealType* deviceInput2, + typename tnlParallelReductionSdot< double, long int> :: ResultType& result ); + +/*extern template bool reductionOnCudaDevice< tnlParallelReductionSdot< long double, long int > > + ( const tnlParallelReductionSdot< long double, long int>& operation, + const typename tnlParallelReductionSdot< long double, long int > :: IndexType size, + const typename tnlParallelReductionSdot< long double, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionSdot< long double, long int > :: RealType* deviceInput2, + typename tnlParallelReductionSdot< long double, long int> :: ResultType& result );*/ + +#endif /* TEMPLATE_EXPLICIT_INSTANTIATION */ + + diff --git a/src/implementation/core/cuda/cuda-reduction_impl.cu b/src/implementation/core/cuda/cuda-reduction_impl.cu new file mode 100644 index 0000000000000000000000000000000000000000..ac420c07fbe87e8f78b383881c90f0fa4a8aaccc --- /dev/null +++ 
b/src/implementation/core/cuda/cuda-reduction_impl.cu @@ -0,0 +1,882 @@ +/*************************************************************************** + cuda-reduction_impl.cu - description + ------------------- + begin : Mar 24, 2013 + copyright : (C) 2013 by Tomas Oberhuber + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/*************************************************************************** + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU General Public License as published by * + * the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. * + * * + ***************************************************************************/ + +#include <core/cuda/reduction-operations.h> +#include <core/cuda/cuda-reduction.h> + +#ifdef TEMPLATE_EXPLICIT_INSTANTIATION + +/**** + * Sum + */ + +template bool reductionOnCudaDevice< tnlParallelReductionSum< char, int > > + ( const tnlParallelReductionSum< char, int >& operation, + const typename tnlParallelReductionSum< char, int > :: IndexType size, + const typename tnlParallelReductionSum< char, int > :: RealType* deviceInput1, + const typename tnlParallelReductionSum< char, int > :: RealType* deviceInput2, + typename tnlParallelReductionSum< char, int > :: ResultType& result ); + +template bool reductionOnCudaDevice< tnlParallelReductionSum< int, int > > + ( const tnlParallelReductionSum< int, int >& operation, + const typename tnlParallelReductionSum< int, int > :: IndexType size, + const typename tnlParallelReductionSum< int, int > :: RealType* deviceInput1, + const typename tnlParallelReductionSum< int, int > :: RealType* deviceInput2, + typename tnlParallelReductionSum< int, int > :: ResultType& result ); + +template bool reductionOnCudaDevice< tnlParallelReductionSum< float, int > > + ( const tnlParallelReductionSum< float, int >& 
operation, + const typename tnlParallelReductionSum< float, int > :: IndexType size, + const typename tnlParallelReductionSum< float, int > :: RealType* deviceInput1, + const typename tnlParallelReductionSum< float, int > :: RealType* deviceInput2, + typename tnlParallelReductionSum< float, int> :: ResultType& result ); + +template bool reductionOnCudaDevice< tnlParallelReductionSum< double, int > > + ( const tnlParallelReductionSum< double, int>& operation, + const typename tnlParallelReductionSum< double, int > :: IndexType size, + const typename tnlParallelReductionSum< double, int > :: RealType* deviceInput1, + const typename tnlParallelReductionSum< double, int > :: RealType* deviceInput2, + typename tnlParallelReductionSum< double, int> :: ResultType& result ); + +/*template bool reductionOnCudaDevice< tnlParallelReductionSum< long double, int > > + ( const tnlParallelReductionSum< long double, int>& operation, + const typename tnlParallelReductionSum< long double, int > :: IndexType size, + const typename tnlParallelReductionSum< long double, int > :: RealType* deviceInput1, + const typename tnlParallelReductionSum< long double, int > :: RealType* deviceInput2, + typename tnlParallelReductionSum< long double, int> :: ResultType& result );*/ + +template bool reductionOnCudaDevice< tnlParallelReductionSum< char, long int > > + ( const tnlParallelReductionSum< char, long int >& operation, + const typename tnlParallelReductionSum< char, long int > :: IndexType size, + const typename tnlParallelReductionSum< char, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionSum< char, long int > :: RealType* deviceInput2, + typename tnlParallelReductionSum< char, long int > :: ResultType& result ); + +template bool reductionOnCudaDevice< tnlParallelReductionSum< int, long int > > + ( const tnlParallelReductionSum< int, long int >& operation, + const typename tnlParallelReductionSum< int, long int > :: IndexType size, + const typename 
tnlParallelReductionSum< int, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionSum< int, long int > :: RealType* deviceInput2, + typename tnlParallelReductionSum< int, long int > :: ResultType& result ); + +template bool reductionOnCudaDevice< tnlParallelReductionSum< float, long int > > + ( const tnlParallelReductionSum< float, long int >& operation, + const typename tnlParallelReductionSum< float, long int > :: IndexType size, + const typename tnlParallelReductionSum< float, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionSum< float, long int > :: RealType* deviceInput2, + typename tnlParallelReductionSum< float, long int> :: ResultType& result ); + +template bool reductionOnCudaDevice< tnlParallelReductionSum< double, long int > > + ( const tnlParallelReductionSum< double, long int>& operation, + const typename tnlParallelReductionSum< double, long int > :: IndexType size, + const typename tnlParallelReductionSum< double, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionSum< double, long int > :: RealType* deviceInput2, + typename tnlParallelReductionSum< double, long int> :: ResultType& result ); + +/*template bool reductionOnCudaDevice< tnlParallelReductionSum< long double, long int > > + ( const tnlParallelReductionSum< long double, long int>& operation, + const typename tnlParallelReductionSum< long double, long int > :: IndexType size, + const typename tnlParallelReductionSum< long double, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionSum< long double, long int > :: RealType* deviceInput2, + typename tnlParallelReductionSum< long double, long int> :: ResultType& result );*/ + +/**** + * Min + */ + +template bool reductionOnCudaDevice< tnlParallelReductionMin< char, int > > + ( const tnlParallelReductionMin< char, int >& operation, + const typename tnlParallelReductionMin< char, int > :: IndexType size, + const typename tnlParallelReductionMin< char, 
int > :: RealType* deviceInput1, + const typename tnlParallelReductionMin< char, int > :: RealType* deviceInput2, + typename tnlParallelReductionMin< char, int > :: ResultType& result ); + +template bool reductionOnCudaDevice< tnlParallelReductionMin< int, int > > + ( const tnlParallelReductionMin< int, int >& operation, + const typename tnlParallelReductionMin< int, int > :: IndexType size, + const typename tnlParallelReductionMin< int, int > :: RealType* deviceInput1, + const typename tnlParallelReductionMin< int, int > :: RealType* deviceInput2, + typename tnlParallelReductionMin< int, int > :: ResultType& result ); + +template bool reductionOnCudaDevice< tnlParallelReductionMin< float, int > > + ( const tnlParallelReductionMin< float, int >& operation, + const typename tnlParallelReductionMin< float, int > :: IndexType size, + const typename tnlParallelReductionMin< float, int > :: RealType* deviceInput1, + const typename tnlParallelReductionMin< float, int > :: RealType* deviceInput2, + typename tnlParallelReductionMin< float, int> :: ResultType& result ); + +template bool reductionOnCudaDevice< tnlParallelReductionMin< double, int > > + ( const tnlParallelReductionMin< double, int>& operation, + const typename tnlParallelReductionMin< double, int > :: IndexType size, + const typename tnlParallelReductionMin< double, int > :: RealType* deviceInput1, + const typename tnlParallelReductionMin< double, int > :: RealType* deviceInput2, + typename tnlParallelReductionMin< double, int> :: ResultType& result ); + +/*template bool reductionOnCudaDevice< tnlParallelReductionMin< long double, int > > + ( const tnlParallelReductionMin< long double, int>& operation, + const typename tnlParallelReductionMin< long double, int > :: IndexType size, + const typename tnlParallelReductionMin< long double, int > :: RealType* deviceInput1, + const typename tnlParallelReductionMin< long double, int > :: RealType* deviceInput2, + typename tnlParallelReductionMin< long double, int> :: 
ResultType& result );*/ + +template bool reductionOnCudaDevice< tnlParallelReductionMin< char, long int > > + ( const tnlParallelReductionMin< char, long int >& operation, + const typename tnlParallelReductionMin< char, long int > :: IndexType size, + const typename tnlParallelReductionMin< char, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionMin< char, long int > :: RealType* deviceInput2, + typename tnlParallelReductionMin< char, long int > :: ResultType& result ); + +template bool reductionOnCudaDevice< tnlParallelReductionMin< int, long int > > + ( const tnlParallelReductionMin< int, long int >& operation, + const typename tnlParallelReductionMin< int, long int > :: IndexType size, + const typename tnlParallelReductionMin< int, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionMin< int, long int > :: RealType* deviceInput2, + typename tnlParallelReductionMin< int, long int > :: ResultType& result ); + +template bool reductionOnCudaDevice< tnlParallelReductionMin< float, long int > > + ( const tnlParallelReductionMin< float, long int >& operation, + const typename tnlParallelReductionMin< float, long int > :: IndexType size, + const typename tnlParallelReductionMin< float, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionMin< float, long int > :: RealType* deviceInput2, + typename tnlParallelReductionMin< float, long int> :: ResultType& result ); + +template bool reductionOnCudaDevice< tnlParallelReductionMin< double, long int > > + ( const tnlParallelReductionMin< double, long int>& operation, + const typename tnlParallelReductionMin< double, long int > :: IndexType size, + const typename tnlParallelReductionMin< double, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionMin< double, long int > :: RealType* deviceInput2, + typename tnlParallelReductionMin< double, long int> :: ResultType& result ); + +/*template bool reductionOnCudaDevice< 
tnlParallelReductionMin< long double, long int > > + ( const tnlParallelReductionMin< long double, long int>& operation, + const typename tnlParallelReductionMin< long double, long int > :: IndexType size, + const typename tnlParallelReductionMin< long double, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionMin< long double, long int > :: RealType* deviceInput2, + typename tnlParallelReductionMin< long double, long int> :: ResultType& result );*/ + +/**** + * Max + */ + +template bool reductionOnCudaDevice< tnlParallelReductionMax< char, int > > + ( const tnlParallelReductionMax< char, int >& operation, + const typename tnlParallelReductionMax< char, int > :: IndexType size, + const typename tnlParallelReductionMax< char, int > :: RealType* deviceInput1, + const typename tnlParallelReductionMax< char, int > :: RealType* deviceInput2, + typename tnlParallelReductionMax< char, int > :: ResultType& result ); + +template bool reductionOnCudaDevice< tnlParallelReductionMax< int, int > > + ( const tnlParallelReductionMax< int, int >& operation, + const typename tnlParallelReductionMax< int, int > :: IndexType size, + const typename tnlParallelReductionMax< int, int > :: RealType* deviceInput1, + const typename tnlParallelReductionMax< int, int > :: RealType* deviceInput2, + typename tnlParallelReductionMax< int, int > :: ResultType& result ); + +template bool reductionOnCudaDevice< tnlParallelReductionMax< float, int > > + ( const tnlParallelReductionMax< float, int >& operation, + const typename tnlParallelReductionMax< float, int > :: IndexType size, + const typename tnlParallelReductionMax< float, int > :: RealType* deviceInput1, + const typename tnlParallelReductionMax< float, int > :: RealType* deviceInput2, + typename tnlParallelReductionMax< float, int> :: ResultType& result ); + +template bool reductionOnCudaDevice< tnlParallelReductionMax< double, int > > + ( const tnlParallelReductionMax< double, int>& operation, + const typename 
tnlParallelReductionMax< double, int > :: IndexType size, + const typename tnlParallelReductionMax< double, int > :: RealType* deviceInput1, + const typename tnlParallelReductionMax< double, int > :: RealType* deviceInput2, + typename tnlParallelReductionMax< double, int> :: ResultType& result ); + +/*template bool reductionOnCudaDevice< tnlParallelReductionMax< long double, int > > + ( const tnlParallelReductionMax< long double, int>& operation, + const typename tnlParallelReductionMax< long double, int > :: IndexType size, + const typename tnlParallelReductionMax< long double, int > :: RealType* deviceInput1, + const typename tnlParallelReductionMax< long double, int > :: RealType* deviceInput2, + typename tnlParallelReductionMax< long double, int> :: ResultType& result );*/ + +template bool reductionOnCudaDevice< tnlParallelReductionMax< char, long int > > + ( const tnlParallelReductionMax< char, long int >& operation, + const typename tnlParallelReductionMax< char, long int > :: IndexType size, + const typename tnlParallelReductionMax< char, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionMax< char, long int > :: RealType* deviceInput2, + typename tnlParallelReductionMax< char, long int > :: ResultType& result ); + +template bool reductionOnCudaDevice< tnlParallelReductionMax< int, long int > > + ( const tnlParallelReductionMax< int, long int >& operation, + const typename tnlParallelReductionMax< int, long int > :: IndexType size, + const typename tnlParallelReductionMax< int, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionMax< int, long int > :: RealType* deviceInput2, + typename tnlParallelReductionMax< int, long int > :: ResultType& result ); + +template bool reductionOnCudaDevice< tnlParallelReductionMax< float, long int > > + ( const tnlParallelReductionMax< float, long int >& operation, + const typename tnlParallelReductionMax< float, long int > :: IndexType size, + const typename 
tnlParallelReductionMax< float, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionMax< float, long int > :: RealType* deviceInput2, + typename tnlParallelReductionMax< float, long int> :: ResultType& result ); + +template bool reductionOnCudaDevice< tnlParallelReductionMax< double, long int > > + ( const tnlParallelReductionMax< double, long int>& operation, + const typename tnlParallelReductionMax< double, long int > :: IndexType size, + const typename tnlParallelReductionMax< double, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionMax< double, long int > :: RealType* deviceInput2, + typename tnlParallelReductionMax< double, long int> :: ResultType& result ); + +/*template bool reductionOnCudaDevice< tnlParallelReductionMax< long double, long int > > + ( const tnlParallelReductionMax< long double, long int>& operation, + const typename tnlParallelReductionMax< long double, long int > :: IndexType size, + const typename tnlParallelReductionMax< long double, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionMax< long double, long int > :: RealType* deviceInput2, + typename tnlParallelReductionMax< long double, long int> :: ResultType& result );*/ + +/**** + * Abs sum + */ + +template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< char, int > > + ( const tnlParallelReductionAbsSum< char, int >& operation, + const typename tnlParallelReductionAbsSum< char, int > :: IndexType size, + const typename tnlParallelReductionAbsSum< char, int > :: RealType* deviceInput1, + const typename tnlParallelReductionAbsSum< char, int > :: RealType* deviceInput2, + typename tnlParallelReductionAbsSum< char, int > :: ResultType& result ); + +template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< int, int > > + ( const tnlParallelReductionAbsSum< int, int >& operation, + const typename tnlParallelReductionAbsSum< int, int > :: IndexType size, + const typename tnlParallelReductionAbsSum< int, 
int > :: RealType* deviceInput1, + const typename tnlParallelReductionAbsSum< int, int > :: RealType* deviceInput2, + typename tnlParallelReductionAbsSum< int, int > :: ResultType& result ); + +template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< float, int > > + ( const tnlParallelReductionAbsSum< float, int >& operation, + const typename tnlParallelReductionAbsSum< float, int > :: IndexType size, + const typename tnlParallelReductionAbsSum< float, int > :: RealType* deviceInput1, + const typename tnlParallelReductionAbsSum< float, int > :: RealType* deviceInput2, + typename tnlParallelReductionAbsSum< float, int> :: ResultType& result ); + +template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< double, int > > + ( const tnlParallelReductionAbsSum< double, int>& operation, + const typename tnlParallelReductionAbsSum< double, int > :: IndexType size, + const typename tnlParallelReductionAbsSum< double, int > :: RealType* deviceInput1, + const typename tnlParallelReductionAbsSum< double, int > :: RealType* deviceInput2, + typename tnlParallelReductionAbsSum< double, int> :: ResultType& result ); + +/*template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< long double, int > > + ( const tnlParallelReductionAbsSum< long double, int>& operation, + const typename tnlParallelReductionAbsSum< long double, int > :: IndexType size, + const typename tnlParallelReductionAbsSum< long double, int > :: RealType* deviceInput1, + const typename tnlParallelReductionAbsSum< long double, int > :: RealType* deviceInput2, + typename tnlParallelReductionAbsSum< long double, int> :: ResultType& result );*/ + +template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< char, long int > > + ( const tnlParallelReductionAbsSum< char, long int >& operation, + const typename tnlParallelReductionAbsSum< char, long int > :: IndexType size, + const typename tnlParallelReductionAbsSum< char, long int > :: RealType* deviceInput1, + const typename 
tnlParallelReductionAbsSum< char, long int > :: RealType* deviceInput2, + typename tnlParallelReductionAbsSum< char, long int > :: ResultType& result ); + +template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< int, long int > > + ( const tnlParallelReductionAbsSum< int, long int >& operation, + const typename tnlParallelReductionAbsSum< int, long int > :: IndexType size, + const typename tnlParallelReductionAbsSum< int, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionAbsSum< int, long int > :: RealType* deviceInput2, + typename tnlParallelReductionAbsSum< int, long int > :: ResultType& result ); + +template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< float, long int > > + ( const tnlParallelReductionAbsSum< float, long int >& operation, + const typename tnlParallelReductionAbsSum< float, long int > :: IndexType size, + const typename tnlParallelReductionAbsSum< float, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionAbsSum< float, long int > :: RealType* deviceInput2, + typename tnlParallelReductionAbsSum< float, long int> :: ResultType& result ); + +template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< double, long int > > + ( const tnlParallelReductionAbsSum< double, long int>& operation, + const typename tnlParallelReductionAbsSum< double, long int > :: IndexType size, + const typename tnlParallelReductionAbsSum< double, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionAbsSum< double, long int > :: RealType* deviceInput2, + typename tnlParallelReductionAbsSum< double, long int> :: ResultType& result ); + +/*template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< long double, long int > > + ( const tnlParallelReductionAbsSum< long double, long int>& operation, + const typename tnlParallelReductionAbsSum< long double, long int > :: IndexType size, + const typename tnlParallelReductionAbsSum< long double, long int > :: RealType* deviceInput1, + 
const typename tnlParallelReductionAbsSum< long double, long int > :: RealType* deviceInput2, + typename tnlParallelReductionAbsSum< long double, long int> :: ResultType& result );*/ + +/**** + * Abs min + */ + +template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< char, int > > + ( const tnlParallelReductionAbsMin< char, int >& operation, + const typename tnlParallelReductionAbsMin< char, int > :: IndexType size, + const typename tnlParallelReductionAbsMin< char, int > :: RealType* deviceInput1, + const typename tnlParallelReductionAbsMin< char, int > :: RealType* deviceInput2, + typename tnlParallelReductionAbsMin< char, int > :: ResultType& result ); + +template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< int, int > > + ( const tnlParallelReductionAbsMin< int, int >& operation, + const typename tnlParallelReductionAbsMin< int, int > :: IndexType size, + const typename tnlParallelReductionAbsMin< int, int > :: RealType* deviceInput1, + const typename tnlParallelReductionAbsMin< int, int > :: RealType* deviceInput2, + typename tnlParallelReductionAbsMin< int, int > :: ResultType& result ); + +template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< float, int > > + ( const tnlParallelReductionAbsMin< float, int >& operation, + const typename tnlParallelReductionAbsMin< float, int > :: IndexType size, + const typename tnlParallelReductionAbsMin< float, int > :: RealType* deviceInput1, + const typename tnlParallelReductionAbsMin< float, int > :: RealType* deviceInput2, + typename tnlParallelReductionAbsMin< float, int> :: ResultType& result ); + +template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< double, int > > + ( const tnlParallelReductionAbsMin< double, int>& operation, + const typename tnlParallelReductionAbsMin< double, int > :: IndexType size, + const typename tnlParallelReductionAbsMin< double, int > :: RealType* deviceInput1, + const typename tnlParallelReductionAbsMin< double, int > :: RealType* deviceInput2, + 
typename tnlParallelReductionAbsMin< double, int> :: ResultType& result ); + +/*template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< long double, int > > + ( const tnlParallelReductionAbsMin< long double, int>& operation, + const typename tnlParallelReductionAbsMin< long double, int > :: IndexType size, + const typename tnlParallelReductionAbsMin< long double, int > :: RealType* deviceInput1, + const typename tnlParallelReductionAbsMin< long double, int > :: RealType* deviceInput2, + typename tnlParallelReductionAbsMin< long double, int> :: ResultType& result );*/ + +template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< char, long int > > + ( const tnlParallelReductionAbsMin< char, long int >& operation, + const typename tnlParallelReductionAbsMin< char, long int > :: IndexType size, + const typename tnlParallelReductionAbsMin< char, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionAbsMin< char, long int > :: RealType* deviceInput2, + typename tnlParallelReductionAbsMin< char, long int > :: ResultType& result ); + +template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< int, long int > > + ( const tnlParallelReductionAbsMin< int, long int >& operation, + const typename tnlParallelReductionAbsMin< int, long int > :: IndexType size, + const typename tnlParallelReductionAbsMin< int, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionAbsMin< int, long int > :: RealType* deviceInput2, + typename tnlParallelReductionAbsMin< int, long int > :: ResultType& result ); + +template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< float, long int > > + ( const tnlParallelReductionAbsMin< float, long int >& operation, + const typename tnlParallelReductionAbsMin< float, long int > :: IndexType size, + const typename tnlParallelReductionAbsMin< float, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionAbsMin< float, long int > :: RealType* deviceInput2, + typename 
tnlParallelReductionAbsMin< float, long int> :: ResultType& result ); + +template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< double, long int > > + ( const tnlParallelReductionAbsMin< double, long int>& operation, + const typename tnlParallelReductionAbsMin< double, long int > :: IndexType size, + const typename tnlParallelReductionAbsMin< double, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionAbsMin< double, long int > :: RealType* deviceInput2, + typename tnlParallelReductionAbsMin< double, long int> :: ResultType& result ); + +/*template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< long double, long int > > + ( const tnlParallelReductionAbsMin< long double, long int>& operation, + const typename tnlParallelReductionAbsMin< long double, long int > :: IndexType size, + const typename tnlParallelReductionAbsMin< long double, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionAbsMin< long double, long int > :: RealType* deviceInput2, + typename tnlParallelReductionAbsMin< long double, long int> :: ResultType& result );*/ +/**** + * Abs max + */ + +template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< char, int > > + ( const tnlParallelReductionAbsMax< char, int >& operation, + const typename tnlParallelReductionAbsMax< char, int > :: IndexType size, + const typename tnlParallelReductionAbsMax< char, int > :: RealType* deviceInput1, + const typename tnlParallelReductionAbsMax< char, int > :: RealType* deviceInput2, + typename tnlParallelReductionAbsMax< char, int > :: ResultType& result ); + +template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< int, int > > + ( const tnlParallelReductionAbsMax< int, int >& operation, + const typename tnlParallelReductionAbsMax< int, int > :: IndexType size, + const typename tnlParallelReductionAbsMax< int, int > :: RealType* deviceInput1, + const typename tnlParallelReductionAbsMax< int, int > :: RealType* deviceInput2, + typename 
tnlParallelReductionAbsMax< int, int > :: ResultType& result ); + +template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< float, int > > + ( const tnlParallelReductionAbsMax< float, int >& operation, + const typename tnlParallelReductionAbsMax< float, int > :: IndexType size, + const typename tnlParallelReductionAbsMax< float, int > :: RealType* deviceInput1, + const typename tnlParallelReductionAbsMax< float, int > :: RealType* deviceInput2, + typename tnlParallelReductionAbsMax< float, int> :: ResultType& result ); + +template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< double, int > > + ( const tnlParallelReductionAbsMax< double, int>& operation, + const typename tnlParallelReductionAbsMax< double, int > :: IndexType size, + const typename tnlParallelReductionAbsMax< double, int > :: RealType* deviceInput1, + const typename tnlParallelReductionAbsMax< double, int > :: RealType* deviceInput2, + typename tnlParallelReductionAbsMax< double, int> :: ResultType& result ); + +/*template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< long double, int > > + ( const tnlParallelReductionAbsMax< long double, int>& operation, + const typename tnlParallelReductionAbsMax< long double, int > :: IndexType size, + const typename tnlParallelReductionAbsMax< long double, int > :: RealType* deviceInput1, + const typename tnlParallelReductionAbsMax< long double, int > :: RealType* deviceInput2, + typename tnlParallelReductionAbsMax< long double, int> :: ResultType& result );*/ + +template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< char, long int > > + ( const tnlParallelReductionAbsMax< char, long int >& operation, + const typename tnlParallelReductionAbsMax< char, long int > :: IndexType size, + const typename tnlParallelReductionAbsMax< char, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionAbsMax< char, long int > :: RealType* deviceInput2, + typename tnlParallelReductionAbsMax< char, long int > :: ResultType& 
result ); + +template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< int, long int > > + ( const tnlParallelReductionAbsMax< int, long int >& operation, + const typename tnlParallelReductionAbsMax< int, long int > :: IndexType size, + const typename tnlParallelReductionAbsMax< int, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionAbsMax< int, long int > :: RealType* deviceInput2, + typename tnlParallelReductionAbsMax< int, long int > :: ResultType& result ); + +template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< float, long int > > + ( const tnlParallelReductionAbsMax< float, long int >& operation, + const typename tnlParallelReductionAbsMax< float, long int > :: IndexType size, + const typename tnlParallelReductionAbsMax< float, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionAbsMax< float, long int > :: RealType* deviceInput2, + typename tnlParallelReductionAbsMax< float, long int> :: ResultType& result ); + +template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< double, long int > > + ( const tnlParallelReductionAbsMax< double, long int>& operation, + const typename tnlParallelReductionAbsMax< double, long int > :: IndexType size, + const typename tnlParallelReductionAbsMax< double, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionAbsMax< double, long int > :: RealType* deviceInput2, + typename tnlParallelReductionAbsMax< double, long int> :: ResultType& result ); + +/*template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< long double, long int > > + ( const tnlParallelReductionAbsMax< long double, long int>& operation, + const typename tnlParallelReductionAbsMax< long double, long int > :: IndexType size, + const typename tnlParallelReductionAbsMax< long double, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionAbsMax< long double, long int > :: RealType* deviceInput2, + typename tnlParallelReductionAbsMax< long 
double, long int> :: ResultType& result );*/ + +/**** + * Logical AND + */ +template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< char, int > > + ( const tnlParallelReductionLogicalAnd< char, int >& operation, + const typename tnlParallelReductionLogicalAnd< char, int > :: IndexType size, + const typename tnlParallelReductionLogicalAnd< char, int > :: RealType* deviceInput1, + const typename tnlParallelReductionLogicalAnd< char, int > :: RealType* deviceInput2, + typename tnlParallelReductionLogicalAnd< char, int > :: ResultType& result ); + +template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< int, int > > + ( const tnlParallelReductionLogicalAnd< int, int >& operation, + const typename tnlParallelReductionLogicalAnd< int, int > :: IndexType size, + const typename tnlParallelReductionLogicalAnd< int, int > :: RealType* deviceInput1, + const typename tnlParallelReductionLogicalAnd< int, int > :: RealType* deviceInput2, + typename tnlParallelReductionLogicalAnd< int, int > :: ResultType& result ); + +template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< float, int > > + ( const tnlParallelReductionLogicalAnd< float, int >& operation, + const typename tnlParallelReductionLogicalAnd< float, int > :: IndexType size, + const typename tnlParallelReductionLogicalAnd< float, int > :: RealType* deviceInput1, + const typename tnlParallelReductionLogicalAnd< float, int > :: RealType* deviceInput2, + typename tnlParallelReductionLogicalAnd< float, int> :: ResultType& result ); + +template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< double, int > > + ( const tnlParallelReductionLogicalAnd< double, int>& operation, + const typename tnlParallelReductionLogicalAnd< double, int > :: IndexType size, + const typename tnlParallelReductionLogicalAnd< double, int > :: RealType* deviceInput1, + const typename tnlParallelReductionLogicalAnd< double, int > :: RealType* deviceInput2, + typename tnlParallelReductionLogicalAnd< 
double, int> :: ResultType& result ); + +/*template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< long double, int > > + ( const tnlParallelReductionLogicalAnd< long double, int>& operation, + const typename tnlParallelReductionLogicalAnd< long double, int > :: IndexType size, + const typename tnlParallelReductionLogicalAnd< long double, int > :: RealType* deviceInput1, + const typename tnlParallelReductionLogicalAnd< long double, int > :: RealType* deviceInput2, + typename tnlParallelReductionLogicalAnd< long double, int> :: ResultType& result );*/ + +template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< char, long int > > + ( const tnlParallelReductionLogicalAnd< char, long int >& operation, + const typename tnlParallelReductionLogicalAnd< char, long int > :: IndexType size, + const typename tnlParallelReductionLogicalAnd< char, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionLogicalAnd< char, long int > :: RealType* deviceInput2, + typename tnlParallelReductionLogicalAnd< char, long int > :: ResultType& result ); + +template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< int, long int > > + ( const tnlParallelReductionLogicalAnd< int, long int >& operation, + const typename tnlParallelReductionLogicalAnd< int, long int > :: IndexType size, + const typename tnlParallelReductionLogicalAnd< int, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionLogicalAnd< int, long int > :: RealType* deviceInput2, + typename tnlParallelReductionLogicalAnd< int, long int > :: ResultType& result ); + +template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< float, long int > > + ( const tnlParallelReductionLogicalAnd< float, long int >& operation, + const typename tnlParallelReductionLogicalAnd< float, long int > :: IndexType size, + const typename tnlParallelReductionLogicalAnd< float, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionLogicalAnd< float, 
long int > :: RealType* deviceInput2, + typename tnlParallelReductionLogicalAnd< float, long int> :: ResultType& result ); + +template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< double, long int > > + ( const tnlParallelReductionLogicalAnd< double, long int>& operation, + const typename tnlParallelReductionLogicalAnd< double, long int > :: IndexType size, + const typename tnlParallelReductionLogicalAnd< double, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionLogicalAnd< double, long int > :: RealType* deviceInput2, + typename tnlParallelReductionLogicalAnd< double, long int> :: ResultType& result ); + +/*template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< long double, long int > > + ( const tnlParallelReductionLogicalAnd< long double, long int>& operation, + const typename tnlParallelReductionLogicalAnd< long double, long int > :: IndexType size, + const typename tnlParallelReductionLogicalAnd< long double, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionLogicalAnd< long double, long int > :: RealType* deviceInput2, + typename tnlParallelReductionLogicalAnd< long double, long int> :: ResultType& result );*/ + +/**** + * Logical OR + */ +template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< char, int > > + ( const tnlParallelReductionLogicalOr< char, int >& operation, + const typename tnlParallelReductionLogicalOr< char, int > :: IndexType size, + const typename tnlParallelReductionLogicalOr< char, int > :: RealType* deviceInput1, + const typename tnlParallelReductionLogicalOr< char, int > :: RealType* deviceInput2, + typename tnlParallelReductionLogicalOr< char, int > :: ResultType& result ); + +template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< int, int > > + ( const tnlParallelReductionLogicalOr< int, int >& operation, + const typename tnlParallelReductionLogicalOr< int, int > :: IndexType size, + const typename tnlParallelReductionLogicalOr< 
int, int > :: RealType* deviceInput1, + const typename tnlParallelReductionLogicalOr< int, int > :: RealType* deviceInput2, + typename tnlParallelReductionLogicalOr< int, int > :: ResultType& result ); + +template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< float, int > > + ( const tnlParallelReductionLogicalOr< float, int >& operation, + const typename tnlParallelReductionLogicalOr< float, int > :: IndexType size, + const typename tnlParallelReductionLogicalOr< float, int > :: RealType* deviceInput1, + const typename tnlParallelReductionLogicalOr< float, int > :: RealType* deviceInput2, + typename tnlParallelReductionLogicalOr< float, int> :: ResultType& result ); + +template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< double, int > > + ( const tnlParallelReductionLogicalOr< double, int>& operation, + const typename tnlParallelReductionLogicalOr< double, int > :: IndexType size, + const typename tnlParallelReductionLogicalOr< double, int > :: RealType* deviceInput1, + const typename tnlParallelReductionLogicalOr< double, int > :: RealType* deviceInput2, + typename tnlParallelReductionLogicalOr< double, int> :: ResultType& result ); + +/*template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< long double, int > > + ( const tnlParallelReductionLogicalOr< long double, int>& operation, + const typename tnlParallelReductionLogicalOr< long double, int > :: IndexType size, + const typename tnlParallelReductionLogicalOr< long double, int > :: RealType* deviceInput1, + const typename tnlParallelReductionLogicalOr< long double, int > :: RealType* deviceInput2, + typename tnlParallelReductionLogicalOr< long double, int> :: ResultType& result );*/ + +template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< char, long int > > + ( const tnlParallelReductionLogicalOr< char, long int >& operation, + const typename tnlParallelReductionLogicalOr< char, long int > :: IndexType size, + const typename tnlParallelReductionLogicalOr< 
char, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionLogicalOr< char, long int > :: RealType* deviceInput2, + typename tnlParallelReductionLogicalOr< char, long int > :: ResultType& result ); + +template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< int, long int > > + ( const tnlParallelReductionLogicalOr< int, long int >& operation, + const typename tnlParallelReductionLogicalOr< int, long int > :: IndexType size, + const typename tnlParallelReductionLogicalOr< int, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionLogicalOr< int, long int > :: RealType* deviceInput2, + typename tnlParallelReductionLogicalOr< int, long int > :: ResultType& result ); + +template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< float, long int > > + ( const tnlParallelReductionLogicalOr< float, long int >& operation, + const typename tnlParallelReductionLogicalOr< float, long int > :: IndexType size, + const typename tnlParallelReductionLogicalOr< float, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionLogicalOr< float, long int > :: RealType* deviceInput2, + typename tnlParallelReductionLogicalOr< float, long int> :: ResultType& result ); + +template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< double, long int > > + ( const tnlParallelReductionLogicalOr< double, long int>& operation, + const typename tnlParallelReductionLogicalOr< double, long int > :: IndexType size, + const typename tnlParallelReductionLogicalOr< double, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionLogicalOr< double, long int > :: RealType* deviceInput2, + typename tnlParallelReductionLogicalOr< double, long int> :: ResultType& result ); + +/*template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< long double, long int > > + ( const tnlParallelReductionLogicalOr< long double, long int>& operation, + const typename tnlParallelReductionLogicalOr< long 
double, long int > :: IndexType size, + const typename tnlParallelReductionLogicalOr< long double, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionLogicalOr< long double, long int > :: RealType* deviceInput2, + typename tnlParallelReductionLogicalOr< long double, long int> :: ResultType& result );*/ + + +/**** + * Lp Norm + */ +template bool reductionOnCudaDevice< tnlParallelReductionLpNorm< float, int > > + ( const tnlParallelReductionLpNorm< float, int >& operation, + const typename tnlParallelReductionLpNorm< float, int > :: IndexType size, + const typename tnlParallelReductionLpNorm< float, int > :: RealType* deviceInput1, + const typename tnlParallelReductionLpNorm< float, int > :: RealType* deviceInput2, + typename tnlParallelReductionLpNorm< float, int> :: ResultType& result ); + +template bool reductionOnCudaDevice< tnlParallelReductionLpNorm< double, int > > + ( const tnlParallelReductionLpNorm< double, int>& operation, + const typename tnlParallelReductionLpNorm< double, int > :: IndexType size, + const typename tnlParallelReductionLpNorm< double, int > :: RealType* deviceInput1, + const typename tnlParallelReductionLpNorm< double, int > :: RealType* deviceInput2, + typename tnlParallelReductionLpNorm< double, int> :: ResultType& result ); + +/*template bool reductionOnCudaDevice< tnlParallelReductionLpNorm< long double, int > > + ( const tnlParallelReductionLpNorm< long double, int>& operation, + const typename tnlParallelReductionLpNorm< long double, int > :: IndexType size, + const typename tnlParallelReductionLpNorm< long double, int > :: RealType* deviceInput1, + const typename tnlParallelReductionLpNorm< long double, int > :: RealType* deviceInput2, + typename tnlParallelReductionLpNorm< long double, int> :: ResultType& result );*/ + + + +template bool reductionOnCudaDevice< tnlParallelReductionLpNorm< float, long int > > + ( const tnlParallelReductionLpNorm< float, long int >& operation, + const typename 
tnlParallelReductionLpNorm< float, long int > :: IndexType size, + const typename tnlParallelReductionLpNorm< float, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionLpNorm< float, long int > :: RealType* deviceInput2, + typename tnlParallelReductionLpNorm< float, long int> :: ResultType& result ); + +template bool reductionOnCudaDevice< tnlParallelReductionLpNorm< double, long int > > + ( const tnlParallelReductionLpNorm< double, long int>& operation, + const typename tnlParallelReductionLpNorm< double, long int > :: IndexType size, + const typename tnlParallelReductionLpNorm< double, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionLpNorm< double, long int > :: RealType* deviceInput2, + typename tnlParallelReductionLpNorm< double, long int> :: ResultType& result ); + +/*template bool reductionOnCudaDevice< tnlParallelReductionLpNorm< long double, long int > > + ( const tnlParallelReductionLpNorm< long double, long int>& operation, + const typename tnlParallelReductionLpNorm< long double, long int > :: IndexType size, + const typename tnlParallelReductionLpNorm< long double, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionLpNorm< long double, long int > :: RealType* deviceInput2, + typename tnlParallelReductionLpNorm< long double, long int> :: ResultType& result );*/ + + +/**** + * Equalities + */ +template bool reductionOnCudaDevice< tnlParallelReductionEqualities< char, int > > + ( const tnlParallelReductionEqualities< char, int >& operation, + const typename tnlParallelReductionEqualities< char, int > :: IndexType size, + const typename tnlParallelReductionEqualities< char, int > :: RealType* deviceInput1, + const typename tnlParallelReductionEqualities< char, int > :: RealType* deviceInput2, + typename tnlParallelReductionEqualities< char, int > :: ResultType& result ); + +template bool reductionOnCudaDevice< tnlParallelReductionEqualities< int, int > > + ( const 
tnlParallelReductionEqualities< int, int >& operation, + const typename tnlParallelReductionEqualities< int, int > :: IndexType size, + const typename tnlParallelReductionEqualities< int, int > :: RealType* deviceInput1, + const typename tnlParallelReductionEqualities< int, int > :: RealType* deviceInput2, + typename tnlParallelReductionEqualities< int, int > :: ResultType& result ); + +template bool reductionOnCudaDevice< tnlParallelReductionEqualities< float, int > > + ( const tnlParallelReductionEqualities< float, int >& operation, + const typename tnlParallelReductionEqualities< float, int > :: IndexType size, + const typename tnlParallelReductionEqualities< float, int > :: RealType* deviceInput1, + const typename tnlParallelReductionEqualities< float, int > :: RealType* deviceInput2, + typename tnlParallelReductionEqualities< float, int> :: ResultType& result ); + +template bool reductionOnCudaDevice< tnlParallelReductionEqualities< double, int > > + ( const tnlParallelReductionEqualities< double, int>& operation, + const typename tnlParallelReductionEqualities< double, int > :: IndexType size, + const typename tnlParallelReductionEqualities< double, int > :: RealType* deviceInput1, + const typename tnlParallelReductionEqualities< double, int > :: RealType* deviceInput2, + typename tnlParallelReductionEqualities< double, int> :: ResultType& result ); + +/*template bool reductionOnCudaDevice< tnlParallelReductionEqualities< long double, int > > + ( const tnlParallelReductionEqualities< long double, int>& operation, + const typename tnlParallelReductionEqualities< long double, int > :: IndexType size, + const typename tnlParallelReductionEqualities< long double, int > :: RealType* deviceInput1, + const typename tnlParallelReductionEqualities< long double, int > :: RealType* deviceInput2, + typename tnlParallelReductionEqualities< long double, int> :: ResultType& result );*/ + +template bool reductionOnCudaDevice< tnlParallelReductionEqualities< char, long int > 
> + ( const tnlParallelReductionEqualities< char, long int >& operation, + const typename tnlParallelReductionEqualities< char, long int > :: IndexType size, + const typename tnlParallelReductionEqualities< char, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionEqualities< char, long int > :: RealType* deviceInput2, + typename tnlParallelReductionEqualities< char, long int > :: ResultType& result ); + +template bool reductionOnCudaDevice< tnlParallelReductionEqualities< int, long int > > + ( const tnlParallelReductionEqualities< int, long int >& operation, + const typename tnlParallelReductionEqualities< int, long int > :: IndexType size, + const typename tnlParallelReductionEqualities< int, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionEqualities< int, long int > :: RealType* deviceInput2, + typename tnlParallelReductionEqualities< int, long int > :: ResultType& result ); + +template bool reductionOnCudaDevice< tnlParallelReductionEqualities< float, long int > > + ( const tnlParallelReductionEqualities< float, long int >& operation, + const typename tnlParallelReductionEqualities< float, long int > :: IndexType size, + const typename tnlParallelReductionEqualities< float, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionEqualities< float, long int > :: RealType* deviceInput2, + typename tnlParallelReductionEqualities< float, long int> :: ResultType& result ); + +template bool reductionOnCudaDevice< tnlParallelReductionEqualities< double, long int > > + ( const tnlParallelReductionEqualities< double, long int>& operation, + const typename tnlParallelReductionEqualities< double, long int > :: IndexType size, + const typename tnlParallelReductionEqualities< double, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionEqualities< double, long int > :: RealType* deviceInput2, + typename tnlParallelReductionEqualities< double, long int> :: ResultType& result ); + 
+/*template bool reductionOnCudaDevice< tnlParallelReductionEqualities< long double, long int > > + ( const tnlParallelReductionEqualities< long double, long int>& operation, + const typename tnlParallelReductionEqualities< long double, long int > :: IndexType size, + const typename tnlParallelReductionEqualities< long double, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionEqualities< long double, long int > :: RealType* deviceInput2, + typename tnlParallelReductionEqualities< long double, long int> :: ResultType& result );*/ + + +/**** + * Inequalities + */ +template bool reductionOnCudaDevice< tnlParallelReductionInequalities< char, int > > + ( const tnlParallelReductionInequalities< char, int >& operation, + const typename tnlParallelReductionInequalities< char, int > :: IndexType size, + const typename tnlParallelReductionInequalities< char, int > :: RealType* deviceInput1, + const typename tnlParallelReductionInequalities< char, int > :: RealType* deviceInput2, + typename tnlParallelReductionInequalities< char, int > :: ResultType& result ); + +template bool reductionOnCudaDevice< tnlParallelReductionInequalities< int, int > > + ( const tnlParallelReductionInequalities< int, int >& operation, + const typename tnlParallelReductionInequalities< int, int > :: IndexType size, + const typename tnlParallelReductionInequalities< int, int > :: RealType* deviceInput1, + const typename tnlParallelReductionInequalities< int, int > :: RealType* deviceInput2, + typename tnlParallelReductionInequalities< int, int > :: ResultType& result ); + +template bool reductionOnCudaDevice< tnlParallelReductionInequalities< float, int > > + ( const tnlParallelReductionInequalities< float, int >& operation, + const typename tnlParallelReductionInequalities< float, int > :: IndexType size, + const typename tnlParallelReductionInequalities< float, int > :: RealType* deviceInput1, + const typename tnlParallelReductionInequalities< float, int > :: RealType* 
deviceInput2, + typename tnlParallelReductionInequalities< float, int> :: ResultType& result ); + +template bool reductionOnCudaDevice< tnlParallelReductionInequalities< double, int > > + ( const tnlParallelReductionInequalities< double, int>& operation, + const typename tnlParallelReductionInequalities< double, int > :: IndexType size, + const typename tnlParallelReductionInequalities< double, int > :: RealType* deviceInput1, + const typename tnlParallelReductionInequalities< double, int > :: RealType* deviceInput2, + typename tnlParallelReductionInequalities< double, int> :: ResultType& result ); + +/*template bool reductionOnCudaDevice< tnlParallelReductionInequalities< long double, int > > + ( const tnlParallelReductionInequalities< long double, int>& operation, + const typename tnlParallelReductionInequalities< long double, int > :: IndexType size, + const typename tnlParallelReductionInequalities< long double, int > :: RealType* deviceInput1, + const typename tnlParallelReductionInequalities< long double, int > :: RealType* deviceInput2, + typename tnlParallelReductionInequalities< long double, int> :: ResultType& result );*/ + +template bool reductionOnCudaDevice< tnlParallelReductionInequalities< char, long int > > + ( const tnlParallelReductionInequalities< char, long int >& operation, + const typename tnlParallelReductionInequalities< char, long int > :: IndexType size, + const typename tnlParallelReductionInequalities< char, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionInequalities< char, long int > :: RealType* deviceInput2, + typename tnlParallelReductionInequalities< char, long int > :: ResultType& result ); + +template bool reductionOnCudaDevice< tnlParallelReductionInequalities< int, long int > > + ( const tnlParallelReductionInequalities< int, long int >& operation, + const typename tnlParallelReductionInequalities< int, long int > :: IndexType size, + const typename tnlParallelReductionInequalities< int, long int > :: 
RealType* deviceInput1, + const typename tnlParallelReductionInequalities< int, long int > :: RealType* deviceInput2, + typename tnlParallelReductionInequalities< int, long int > :: ResultType& result ); + +template bool reductionOnCudaDevice< tnlParallelReductionInequalities< float, long int > > + ( const tnlParallelReductionInequalities< float, long int >& operation, + const typename tnlParallelReductionInequalities< float, long int > :: IndexType size, + const typename tnlParallelReductionInequalities< float, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionInequalities< float, long int > :: RealType* deviceInput2, + typename tnlParallelReductionInequalities< float, long int> :: ResultType& result ); + +template bool reductionOnCudaDevice< tnlParallelReductionInequalities< double, long int > > + ( const tnlParallelReductionInequalities< double, long int>& operation, + const typename tnlParallelReductionInequalities< double, long int > :: IndexType size, + const typename tnlParallelReductionInequalities< double, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionInequalities< double, long int > :: RealType* deviceInput2, + typename tnlParallelReductionInequalities< double, long int> :: ResultType& result ); + +/*template bool reductionOnCudaDevice< tnlParallelReductionInequalities< long double, long int > > + ( const tnlParallelReductionInequalities< long double, long int>& operation, + const typename tnlParallelReductionInequalities< long double, long int > :: IndexType size, + const typename tnlParallelReductionInequalities< long double, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionInequalities< long double, long int > :: RealType* deviceInput2, + typename tnlParallelReductionInequalities< long double, long int> :: ResultType& result );*/ + + +/**** + * Sdot + */ +template bool reductionOnCudaDevice< tnlParallelReductionSdot< char, int > > + ( const tnlParallelReductionSdot< char, 
int >& operation, + const typename tnlParallelReductionSdot< char, int > :: IndexType size, + const typename tnlParallelReductionSdot< char, int > :: RealType* deviceInput1, + const typename tnlParallelReductionSdot< char, int > :: RealType* deviceInput2, + typename tnlParallelReductionSdot< char, int > :: ResultType& result ); + +template bool reductionOnCudaDevice< tnlParallelReductionSdot< int, int > > + ( const tnlParallelReductionSdot< int, int >& operation, + const typename tnlParallelReductionSdot< int, int > :: IndexType size, + const typename tnlParallelReductionSdot< int, int > :: RealType* deviceInput1, + const typename tnlParallelReductionSdot< int, int > :: RealType* deviceInput2, + typename tnlParallelReductionSdot< int, int > :: ResultType& result ); + +template bool reductionOnCudaDevice< tnlParallelReductionSdot< float, int > > + ( const tnlParallelReductionSdot< float, int >& operation, + const typename tnlParallelReductionSdot< float, int > :: IndexType size, + const typename tnlParallelReductionSdot< float, int > :: RealType* deviceInput1, + const typename tnlParallelReductionSdot< float, int > :: RealType* deviceInput2, + typename tnlParallelReductionSdot< float, int> :: ResultType& result ); + +template bool reductionOnCudaDevice< tnlParallelReductionSdot< double, int > > + ( const tnlParallelReductionSdot< double, int>& operation, + const typename tnlParallelReductionSdot< double, int > :: IndexType size, + const typename tnlParallelReductionSdot< double, int > :: RealType* deviceInput1, + const typename tnlParallelReductionSdot< double, int > :: RealType* deviceInput2, + typename tnlParallelReductionSdot< double, int> :: ResultType& result ); + +/*template bool reductionOnCudaDevice< tnlParallelReductionSdot< long double, int > > + ( const tnlParallelReductionSdot< long double, int>& operation, + const typename tnlParallelReductionSdot< long double, int > :: IndexType size, + const typename tnlParallelReductionSdot< long double, int > :: 
RealType* deviceInput1, + const typename tnlParallelReductionSdot< long double, int > :: RealType* deviceInput2, + typename tnlParallelReductionSdot< long double, int> :: ResultType& result );*/ + +template bool reductionOnCudaDevice< tnlParallelReductionSdot< char, long int > > + ( const tnlParallelReductionSdot< char, long int >& operation, + const typename tnlParallelReductionSdot< char, long int > :: IndexType size, + const typename tnlParallelReductionSdot< char, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionSdot< char, long int > :: RealType* deviceInput2, + typename tnlParallelReductionSdot< char, long int > :: ResultType& result ); + +template bool reductionOnCudaDevice< tnlParallelReductionSdot< int, long int > > + ( const tnlParallelReductionSdot< int, long int >& operation, + const typename tnlParallelReductionSdot< int, long int > :: IndexType size, + const typename tnlParallelReductionSdot< int, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionSdot< int, long int > :: RealType* deviceInput2, + typename tnlParallelReductionSdot< int, long int > :: ResultType& result ); + +template bool reductionOnCudaDevice< tnlParallelReductionSdot< float, long int > > + ( const tnlParallelReductionSdot< float, long int >& operation, + const typename tnlParallelReductionSdot< float, long int > :: IndexType size, + const typename tnlParallelReductionSdot< float, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionSdot< float, long int > :: RealType* deviceInput2, + typename tnlParallelReductionSdot< float, long int> :: ResultType& result ); + +template bool reductionOnCudaDevice< tnlParallelReductionSdot< double, long int > > + ( const tnlParallelReductionSdot< double, long int>& operation, + const typename tnlParallelReductionSdot< double, long int > :: IndexType size, + const typename tnlParallelReductionSdot< double, long int > :: RealType* deviceInput1, + const typename 
tnlParallelReductionSdot< double, long int > :: RealType* deviceInput2, + typename tnlParallelReductionSdot< double, long int> :: ResultType& result ); + +/*template bool reductionOnCudaDevice< tnlParallelReductionSdot< long double, long int > > + ( const tnlParallelReductionSdot< long double, long int>& operation, + const typename tnlParallelReductionSdot< long double, long int > :: IndexType size, + const typename tnlParallelReductionSdot< long double, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionSdot< long double, long int > :: RealType* deviceInput2, + typename tnlParallelReductionSdot< long double, long int> :: ResultType& result );*/ + +#endif diff --git a/src/implementation/core/cuda/cuda-reduction_impl.h b/src/implementation/core/cuda/cuda-reduction_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..6eaf2f674e5d0232fa068c2f9cbdda98ffc3e3db --- /dev/null +++ b/src/implementation/core/cuda/cuda-reduction_impl.h @@ -0,0 +1,1313 @@ +/*************************************************************************** + cuda-reduction_impl.h - description + ------------------- + begin : Mar 24, 2013 + copyright : (C) 2013 by Tomas Oberhuber + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/*************************************************************************** + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU General Public License as published by * + * the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. 
*
 *                                                                         *
 ***************************************************************************/

#ifndef CUDA_REDUCTION_IMPL_H_
#define CUDA_REDUCTION_IMPL_H_

#ifdef HAVE_CUDA
#include <cuda.h>
#endif
#include <iostream>
#include <core/tnlAssert.h>
#include <core/cuda/reduction-operations.h>
#include <implementation/core/memory-operations.h>

using namespace std;


/****
 * Arrays with at most this many elements are copied to the host and
 * reduced on the CPU instead of on the GPU.
 */
const int maxGPUReductionDataSize = 256;

#ifdef HAVE_CUDA


/***
 * One step of the reduction in shared memory for an even number of elements.
 *
 * Every thread with tid < s replaces sdata[ tid ] by the reduction of
 * sdata[ tid ] and sdata[ tid + s ]. The caller must guarantee that the
 * element tid + s exists for each tid < s, and it is responsible for the
 * synchronisation between consecutive steps.
 */
template< typename Operation >
__device__ void reduceAligned( const Operation& operation,
                               typename Operation :: IndexType tid,
                               typename Operation :: IndexType s,
                               typename Operation :: ResultType* sdata )
{
   if( tid < s )
      sdata[ tid ] = operation. commonReductionOnDevice( tid, tid + s, sdata );
}

/***
 * One step of the reduction in shared memory which works even for an odd
 * number of elements n. It is a bit slower than reduceAligned.
 *
 * Every thread with tid < s replaces sdata[ tid ] by the reduction of
 * sdata[ tid ] and sdata[ tid + s ]. If n is odd ( 2 * s < n ), the last
 * element sdata[ n - 1 ] is additionally reduced into sdata[ 0 ].
 *
 * The extra element is processed by the thread 0, i.e. by the same thread
 * which has just written sdata[ 0 ]. No other thread reads or writes
 * sdata[ 0 ] or sdata[ n - 1 ] in this step, so no barrier is needed inside
 * of this function. (Doing it with the thread n - 1 would be a data race,
 * because that thread may belong to a different warp than the thread 0.)
 * The caller is responsible for the synchronisation between the steps.
 */
template< typename Operation >
__device__ void reduceNonAligned( const Operation& operation,
                                  typename Operation :: IndexType tid,
                                  typename Operation :: IndexType s,
                                  typename Operation :: IndexType n,
                                  typename Operation :: ResultType* sdata )
{
   if( tid < s )
      sdata[ tid ] = operation. commonReductionOnDevice( tid, tid + s, sdata );

   if( 2 * s < n && tid == 0 )
      sdata[ 0 ] = operation. commonReductionOnDevice( 0, n - 1, sdata );
}

/***
 * The parallel reduction of one or two vectors.
 *
 * Each block first accumulates its part of the input sequentially
 * (with the stride 2 * blockDim. x * gridDim. x) into the shared memory and
 * then reduces the shared memory in parallel. Each block writes exactly one
 * partial result, so the results of all blocks must be reduced again by the
 * caller - see reductionOnCudaDevice.
 *
 * Launch requirements:
 *  - blockDim. x must equal the template parameter blockSize and it must be
 *    a power of two,
 *  - the dynamic shared memory size must be at least
 *    blockDim. x * sizeof( ResultType ),
 *  - 2 * blockIdx. x * blockDim. x < size must hold for each block.
 *
 * The kernel parameters:
 * @param operation is the reduction operation.
 * @param size is the number of all elements to reduce - not just in one block.
 * @param deviceInput the first input array; passed to the operation.
 * @param deviceInput2 the second input array; passed to the operation
 *                     (the host code passes 0 for later reduction passes).
 * @param deviceOutput an array to which we write the result of the reduction.
 *                     Each block of the grid writes one element in this array
 *                     (i.e. its size must be at least the number of CUDA blocks).
 */
template < typename Operation, int blockSize >
__global__ void tnlCUDAReductionKernel( const Operation operation,
                                        const typename Operation :: IndexType size,
                                        const typename Operation :: RealType* deviceInput,
                                        const typename Operation :: RealType* deviceInput2,
                                        typename Operation :: ResultType* deviceOutput )
{
   extern __shared__ __align__ ( 8 ) char __sdata[];

   typedef typename Operation :: IndexType IndexType;
   typedef typename Operation :: ResultType ResultType;

   ResultType* sdata = reinterpret_cast< ResultType* >( __sdata );

   /***
    * Get thread id (tid) and global thread id (gid).
    * lastTId is the number of input elements remaining from the first
    * element of this block.
    * gridSize is the number of elements processed by all blocks at the
    * same time.
    */
   IndexType tid = threadIdx. x;
   IndexType gid = 2 * blockIdx. x * blockDim. x + threadIdx. x;
   IndexType lastTId = size - 2 * blockIdx. x * blockDim. x;
   IndexType gridSize = 2 * blockDim. x * gridDim.x;

   /***
    * Read data into the shared memory. We start with the
    * sequential reduction.
    */
   if( gid + blockDim. x < size )
      sdata[ tid ] = operation. initialValueOnDevice( gid, gid + blockDim. x, deviceInput, deviceInput2 );
   else if( gid < size )
      sdata[ tid ] = operation. initialValueOnDevice( gid, deviceInput, deviceInput2 );

   gid += gridSize;
   while( gid + blockDim. x < size )
   {
      sdata[ tid ] = operation. firstReductionOnDevice( tid, gid, gid + blockDim. x, sdata, deviceInput, deviceInput2 );
      gid += gridSize;
   }
   if( gid < size )
      sdata[ tid ] = operation. firstReductionOnDevice( tid, gid, sdata, deviceInput, deviceInput2 );
   __syncthreads();

   /***
    * Perform the parallel reduction.
    * n is the number of valid elements in the shared memory. In each step
    * the threads with ID < s, where s = n / 2, reduce the elements tid and
    * tid + s and the result is stored in sdata 0 .. s. Then we set n = s and
    * repeat until only one element remains.
    *
    * Every step is followed by __syncthreads(). We do not rely on the implicit
    * synchronisation of threads in one warp - it is not guaranteed by the
    * CUDA programming model and it is not safe without volatile shared memory.
    * All conditions below depend only on n and blockSize which are the same
    * for all threads of the block, so each barrier is reached by all threads.
    */
   unsigned int n = lastTId < blockDim. x ? lastTId : blockDim. x;
   if( n == blockDim. x && ( n & ( n - 1 ) ) == 0 )
   {
      /***
       * The whole block is filled and its size is a power of two - we can
       * use the faster aligned version. blockSize is known at the compile
       * time so the loop can be unrolled completely.
       */
      #pragma unroll
      for( int s = blockSize / 2; s > 0; s >>= 1 )
      {
         reduceAligned( operation, tid, s, sdata );
         __syncthreads();
      }
   }
   else
   {
      /***
       * The block is filled only partially. Using the aligned version
       * here would read uninitialised elements of the shared memory.
       */
      while( n > 1 )
      {
         const unsigned int s = n / 2;
         reduceNonAligned( operation, tid, s, n, sdata );
         n = s;
         __syncthreads();
      }
   }

   /***
    * Store the result back in the global memory.
    */
   if( tid == 0 )
      deviceOutput[ blockIdx. x ] = sdata[ 0 ];
}

/***
 * Performs one pass of the reduction on the CUDA device.
 *
 * @param operation is the reduction operation.
 * @param size is the number of elements to reduce; it must be at least 2.
 * @param input1 the first input array (passed to the kernel).
 * @param input2 the second input array (passed to the kernel), may be 0.
 * @param output the array for the partial results. If it is 0, it is
 *               allocated here and the caller is responsible for freeing it.
 *               One element is written per CUDA block.
 * @return the number of partial results written to output (i.e. the number
 *         of CUDA blocks) or 0 if the pass failed (too small size, failed
 *         allocation or failed kernel launch).
 */
template< typename Operation >
typename Operation :: IndexType reduceOnCudaDevice( const Operation& operation,
                                                    const typename Operation :: IndexType size,
                                                    const typename Operation :: RealType* input1,
                                                    const typename Operation :: RealType* input2,
                                                    typename Operation :: ResultType*& output)
{
   typedef typename Operation :: IndexType IndexType;
   typedef typename Operation :: ResultType ResultType;

   const IndexType desBlockSize( 512 );
   const IndexType desGridSize( 2048 );

   /***
    * The kernel is not instantiated for the block size 1 and a non-positive
    * size would break the block size computation below.
    */
   if( size < 2 )
   {
      tnlAssert( false, cerr << "The size " << size << " is too small for the reduction on the CUDA device." << endl );
      return 0;
   }

   /***
    * Compute the CUDA block size aligned to the power of two.
    */
   const IndexType wantedBlockSize = :: Min( size, desBlockSize );
   IndexType alignedBlockSize = 1;
   while( alignedBlockSize < wantedBlockSize ) alignedBlockSize <<= 1;

   /***
    * Each block reads 2 * blockSize elements in one step. We need at least
    * one block even if size is smaller than the block size - zero blocks is
    * not a valid launch configuration.
    */
   const IndexType blocksCount =
      :: Max( ( IndexType ) 1,
              :: Min( ( IndexType ) ( ( size / alignedBlockSize + 1 ) / 2 ), desGridSize ) );

   dim3 blockSize( 0 ), gridSize( 0 );
   blockSize. x = alignedBlockSize;
   gridSize. x = blocksCount;

   if( ! output &&
       ! allocateMemoryCuda( output, :: Max( ( IndexType ) 1, size / desBlockSize ) ) )
      return 0;

   IndexType shmem = alignedBlockSize * sizeof( ResultType );

   /***
    * Depending on the blockSize we generate appropriate template instance.
    */
   switch( alignedBlockSize )
   {
      case 512:
         tnlCUDAReductionKernel< Operation, 512 >
         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
         break;
      case 256:
         tnlCUDAReductionKernel< Operation, 256 >
         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
         break;
      case 128:
         tnlCUDAReductionKernel< Operation, 128 >
         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
         break;
      case  64:
         tnlCUDAReductionKernel< Operation,  64 >
         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
         break;
      case  32:
         tnlCUDAReductionKernel< Operation,  32 >
         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
         break;
      case  16:
         tnlCUDAReductionKernel< Operation,  16 >
         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
         break;
      case   8:
         tnlCUDAReductionKernel< Operation,   8 >
         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
         break;
      case   4:
         tnlCUDAReductionKernel< Operation,   4 >
         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
         break;
      case   2:
         tnlCUDAReductionKernel< Operation,   2 >
         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
         break;
      default:
         /***
          * No kernel was launched, so there are no partial results.
          */
         tnlAssert( false, cerr << "Block size is " << blockSize. x << " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." );
         return 0;
   }

   /***
    * The kernel launch itself does not report errors (e.g. a wrong launch
    * configuration), they must be queried explicitly.
    */
   const cudaError_t launchStatus = cudaGetLastError();
   if( launchStatus != cudaSuccess )
   {
      cerr << "The CUDA reduction kernel failed with the error '"
           << cudaGetErrorString( launchStatus ) << "' in " << __FILE__
           << " line " << __LINE__ << "." << endl;
      return 0;
   }
   return blocksCount;
}
#endif

/***
 * Reduces one or two arrays stored on the CUDA device.
 *
 * Small arrays (at most maxGPUReductionDataSize elements) are copied to the
 * host and reduced there. Larger arrays are reduced on the device in one or
 * more passes until at most maxGPUReductionDataSize partial results remain;
 * these are finished on the host using Operation :: LaterReductionOperation.
 *
 * @param operation is the reduction operation.
 * @param size is the number of elements of the input arrays.
 *             NOTE(review): size <= 0 is not handled - the operation is then
 *             evaluated on uninitialised host data; callers presumably never
 *             pass an empty array - TODO confirm.
 * @param deviceInput1 the first input array on the device.
 * @param deviceInput2 the second input array on the device, may be 0.
 * @param result the result of the reduction; it is not set if the data
 *               could not be reduced or copied to the host.
 * @return true on success, false if any CUDA operation failed or if TNL was
 *         compiled without the CUDA support.
 */
template< typename Operation >
bool reductionOnCudaDevice( const Operation& operation,
                            const typename Operation :: IndexType size,
                            const typename Operation :: RealType* deviceInput1,
                            const typename Operation :: RealType* deviceInput2,
                            typename Operation :: ResultType& result )
{
#ifdef HAVE_CUDA

   typedef typename Operation :: IndexType IndexType;
   typedef typename Operation :: RealType RealType;
   typedef typename Operation :: ResultType ResultType;
   typedef typename Operation :: LaterReductionOperation LaterReductionOperation;

   /***
    * First check if the input array(s) is/are large enough for the reduction on GPU.
    * Otherwise copy it/them to host and reduce on CPU.
    */
   if( size <= maxGPUReductionDataSize )
   {
      RealType hostArray1[ maxGPUReductionDataSize ];
      RealType hostArray2[ maxGPUReductionDataSize ];
      if( ! copyMemoryCudaToHost( hostArray1, deviceInput1, size ) )
         return false;
      if( deviceInput2 && ! copyMemoryCudaToHost( hostArray2, deviceInput2, size ) )
         return false;
      result = operation. initialValueOnHost( 0, hostArray1, hostArray2 );
      for( IndexType i = 1; i < size; i ++ )
         result = operation. reduceOnHost( i, result, hostArray1, hostArray2 );
      return true;
   }

   /****
    * Reduce the data on the CUDA device. reducedSize == 0 signals
    * a failure of the pass.
    */
   ResultType* deviceAux1( 0 ), *deviceAux2( 0 );
   IndexType reducedSize = reduceOnCudaDevice( operation,
                                               size,
                                               deviceInput1,
                                               deviceInput2,
                                               deviceAux1 );

   LaterReductionOperation laterReductionOperation;
   while( reducedSize > maxGPUReductionDataSize )
   {
      reducedSize = reduceOnCudaDevice( laterReductionOperation,
                                        reducedSize,
                                        deviceAux1,
                                        ( ResultType* ) 0,
                                        deviceAux2 );
      Swap( deviceAux1, deviceAux2 );
   }
   bool success = ( reducedSize > 0 );

   /***
    * Transfer the reduced data from device to host.
    */
   ResultType resultArray[ maxGPUReductionDataSize ];
   if( success && ! copyMemoryCudaToHost( resultArray, deviceAux1, reducedSize ) )
      success = false;

   /***
    * Reduce the data on the host system.
    */
   if( success )
   {
      result = laterReductionOperation. initialValueOnHost( 0, resultArray, ( ResultType* ) 0 );
      for( IndexType i = 1; i < reducedSize; i ++ )
         result = laterReductionOperation. reduceOnHost( i, result, resultArray, ( ResultType*) 0 );
   }

   /****
    * Free the memory allocated on the device. Both buffers are always
    * released, even if something has failed before, to avoid leaks.
    */
   if( deviceAux1 && ! freeMemoryCuda( deviceAux1 ) )
      success = false;
   if( deviceAux2 && ! freeMemoryCuda( deviceAux2 ) )
      success = false;
   return success;
#else
   cerr << "I am sorry but CUDA support is missing on this system " << __FILE__ << " line " << __LINE__ << "." << endl;
   return false;
#endif
}

#ifdef TEMPLATE_EXPLICIT_INSTANTIATION

/****
 * Sum
 */

extern template bool reductionOnCudaDevice< tnlParallelReductionSum< char, int > >
                                   ( const tnlParallelReductionSum< char, int >& operation,
                                     const typename tnlParallelReductionSum< char, int > :: IndexType size,
                                     const typename tnlParallelReductionSum< char, int > :: RealType* deviceInput1,
                                     const typename tnlParallelReductionSum< char, int > :: RealType* deviceInput2,
                                     typename tnlParallelReductionSum< char, int > :: ResultType& result );

extern template bool reductionOnCudaDevice< tnlParallelReductionSum< int, int > >
                                   ( const tnlParallelReductionSum< int, int >& operation,
                                     const typename tnlParallelReductionSum< int, int > :: IndexType size,
                                     const typename tnlParallelReductionSum< int, int > :: RealType* deviceInput1,
                                     const typename tnlParallelReductionSum< int, int > :: RealType* deviceInput2,
                                     typename tnlParallelReductionSum< int, int > :: ResultType& result );

extern template bool reductionOnCudaDevice< tnlParallelReductionSum< float, int > >
                                   ( const tnlParallelReductionSum< float, int >& operation,
                                     const typename tnlParallelReductionSum< float, int > :: IndexType size,
                                     const typename tnlParallelReductionSum< float, int > :: RealType* deviceInput1,
                                     const typename
tnlParallelReductionSum< float, int > :: RealType* deviceInput2, + typename tnlParallelReductionSum< float, int> :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionSum< double, int > > + ( const tnlParallelReductionSum< double, int>& operation, + const typename tnlParallelReductionSum< double, int > :: IndexType size, + const typename tnlParallelReductionSum< double, int > :: RealType* deviceInput1, + const typename tnlParallelReductionSum< double, int > :: RealType* deviceInput2, + typename tnlParallelReductionSum< double, int> :: ResultType& result ); + +/*extern template bool reductionOnCudaDevice< tnlParallelReductionSum< long double, int > > + ( const tnlParallelReductionSum< long double, int>& operation, + const typename tnlParallelReductionSum< long double, int > :: IndexType size, + const typename tnlParallelReductionSum< long double, int > :: RealType* deviceInput1, + const typename tnlParallelReductionSum< long double, int > :: RealType* deviceInput2, + typename tnlParallelReductionSum< long double, int> :: ResultType& result );*/ + +extern template bool reductionOnCudaDevice< tnlParallelReductionSum< char, long int > > + ( const tnlParallelReductionSum< char, long int >& operation, + const typename tnlParallelReductionSum< char, long int > :: IndexType size, + const typename tnlParallelReductionSum< char, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionSum< char, long int > :: RealType* deviceInput2, + typename tnlParallelReductionSum< char, long int > :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionSum< int, long int > > + ( const tnlParallelReductionSum< int, long int >& operation, + const typename tnlParallelReductionSum< int, long int > :: IndexType size, + const typename tnlParallelReductionSum< int, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionSum< int, long int > :: RealType* deviceInput2, + typename 
tnlParallelReductionSum< int, long int > :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionSum< float, long int > > + ( const tnlParallelReductionSum< float, long int >& operation, + const typename tnlParallelReductionSum< float, long int > :: IndexType size, + const typename tnlParallelReductionSum< float, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionSum< float, long int > :: RealType* deviceInput2, + typename tnlParallelReductionSum< float, long int> :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionSum< double, long int > > + ( const tnlParallelReductionSum< double, long int>& operation, + const typename tnlParallelReductionSum< double, long int > :: IndexType size, + const typename tnlParallelReductionSum< double, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionSum< double, long int > :: RealType* deviceInput2, + typename tnlParallelReductionSum< double, long int> :: ResultType& result ); + +/*extern template bool reductionOnCudaDevice< tnlParallelReductionSum< long double, long int > > + ( const tnlParallelReductionSum< long double, long int>& operation, + const typename tnlParallelReductionSum< long double, long int > :: IndexType size, + const typename tnlParallelReductionSum< long double, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionSum< long double, long int > :: RealType* deviceInput2, + typename tnlParallelReductionSum< long double, long int> :: ResultType& result );*/ + +/**** + * Min + */ + +extern template bool reductionOnCudaDevice< tnlParallelReductionMin< char, int > > + ( const tnlParallelReductionMin< char, int >& operation, + const typename tnlParallelReductionMin< char, int > :: IndexType size, + const typename tnlParallelReductionMin< char, int > :: RealType* deviceInput1, + const typename tnlParallelReductionMin< char, int > :: RealType* deviceInput2, + typename 
tnlParallelReductionMin< char, int > :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionMin< int, int > > + ( const tnlParallelReductionMin< int, int >& operation, + const typename tnlParallelReductionMin< int, int > :: IndexType size, + const typename tnlParallelReductionMin< int, int > :: RealType* deviceInput1, + const typename tnlParallelReductionMin< int, int > :: RealType* deviceInput2, + typename tnlParallelReductionMin< int, int > :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionMin< float, int > > + ( const tnlParallelReductionMin< float, int >& operation, + const typename tnlParallelReductionMin< float, int > :: IndexType size, + const typename tnlParallelReductionMin< float, int > :: RealType* deviceInput1, + const typename tnlParallelReductionMin< float, int > :: RealType* deviceInput2, + typename tnlParallelReductionMin< float, int> :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionMin< double, int > > + ( const tnlParallelReductionMin< double, int>& operation, + const typename tnlParallelReductionMin< double, int > :: IndexType size, + const typename tnlParallelReductionMin< double, int > :: RealType* deviceInput1, + const typename tnlParallelReductionMin< double, int > :: RealType* deviceInput2, + typename tnlParallelReductionMin< double, int> :: ResultType& result ); + +/*extern template bool reductionOnCudaDevice< tnlParallelReductionMin< long double, int > > + ( const tnlParallelReductionMin< long double, int>& operation, + const typename tnlParallelReductionMin< long double, int > :: IndexType size, + const typename tnlParallelReductionMin< long double, int > :: RealType* deviceInput1, + const typename tnlParallelReductionMin< long double, int > :: RealType* deviceInput2, + typename tnlParallelReductionMin< long double, int> :: ResultType& result );*/ + +extern template bool reductionOnCudaDevice< tnlParallelReductionMin< 
char, long int > > + ( const tnlParallelReductionMin< char, long int >& operation, + const typename tnlParallelReductionMin< char, long int > :: IndexType size, + const typename tnlParallelReductionMin< char, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionMin< char, long int > :: RealType* deviceInput2, + typename tnlParallelReductionMin< char, long int > :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionMin< int, long int > > + ( const tnlParallelReductionMin< int, long int >& operation, + const typename tnlParallelReductionMin< int, long int > :: IndexType size, + const typename tnlParallelReductionMin< int, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionMin< int, long int > :: RealType* deviceInput2, + typename tnlParallelReductionMin< int, long int > :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionMin< float, long int > > + ( const tnlParallelReductionMin< float, long int >& operation, + const typename tnlParallelReductionMin< float, long int > :: IndexType size, + const typename tnlParallelReductionMin< float, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionMin< float, long int > :: RealType* deviceInput2, + typename tnlParallelReductionMin< float, long int> :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionMin< double, long int > > + ( const tnlParallelReductionMin< double, long int>& operation, + const typename tnlParallelReductionMin< double, long int > :: IndexType size, + const typename tnlParallelReductionMin< double, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionMin< double, long int > :: RealType* deviceInput2, + typename tnlParallelReductionMin< double, long int> :: ResultType& result ); + +/*extern template bool reductionOnCudaDevice< tnlParallelReductionMin< long double, long int > > + ( const 
tnlParallelReductionMin< long double, long int>& operation, + const typename tnlParallelReductionMin< long double, long int > :: IndexType size, + const typename tnlParallelReductionMin< long double, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionMin< long double, long int > :: RealType* deviceInput2, + typename tnlParallelReductionMin< long double, long int> :: ResultType& result );*/ + +/**** + * Max + */ + +extern template bool reductionOnCudaDevice< tnlParallelReductionMax< char, int > > + ( const tnlParallelReductionMax< char, int >& operation, + const typename tnlParallelReductionMax< char, int > :: IndexType size, + const typename tnlParallelReductionMax< char, int > :: RealType* deviceInput1, + const typename tnlParallelReductionMax< char, int > :: RealType* deviceInput2, + typename tnlParallelReductionMax< char, int > :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionMax< int, int > > + ( const tnlParallelReductionMax< int, int >& operation, + const typename tnlParallelReductionMax< int, int > :: IndexType size, + const typename tnlParallelReductionMax< int, int > :: RealType* deviceInput1, + const typename tnlParallelReductionMax< int, int > :: RealType* deviceInput2, + typename tnlParallelReductionMax< int, int > :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionMax< float, int > > + ( const tnlParallelReductionMax< float, int >& operation, + const typename tnlParallelReductionMax< float, int > :: IndexType size, + const typename tnlParallelReductionMax< float, int > :: RealType* deviceInput1, + const typename tnlParallelReductionMax< float, int > :: RealType* deviceInput2, + typename tnlParallelReductionMax< float, int> :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionMax< double, int > > + ( const tnlParallelReductionMax< double, int>& operation, + const typename tnlParallelReductionMax< double, int 
> :: IndexType size, + const typename tnlParallelReductionMax< double, int > :: RealType* deviceInput1, + const typename tnlParallelReductionMax< double, int > :: RealType* deviceInput2, + typename tnlParallelReductionMax< double, int> :: ResultType& result ); + +/*extern template bool reductionOnCudaDevice< tnlParallelReductionMax< long double, int > > + ( const tnlParallelReductionMax< long double, int>& operation, + const typename tnlParallelReductionMax< long double, int > :: IndexType size, + const typename tnlParallelReductionMax< long double, int > :: RealType* deviceInput1, + const typename tnlParallelReductionMax< long double, int > :: RealType* deviceInput2, + typename tnlParallelReductionMax< long double, int> :: ResultType& result );*/ + +extern template bool reductionOnCudaDevice< tnlParallelReductionMax< char, long int > > + ( const tnlParallelReductionMax< char, long int >& operation, + const typename tnlParallelReductionMax< char, long int > :: IndexType size, + const typename tnlParallelReductionMax< char, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionMax< char, long int > :: RealType* deviceInput2, + typename tnlParallelReductionMax< char, long int > :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionMax< int, long int > > + ( const tnlParallelReductionMax< int, long int >& operation, + const typename tnlParallelReductionMax< int, long int > :: IndexType size, + const typename tnlParallelReductionMax< int, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionMax< int, long int > :: RealType* deviceInput2, + typename tnlParallelReductionMax< int, long int > :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionMax< float, long int > > + ( const tnlParallelReductionMax< float, long int >& operation, + const typename tnlParallelReductionMax< float, long int > :: IndexType size, + const typename tnlParallelReductionMax< 
float, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionMax< float, long int > :: RealType* deviceInput2, + typename tnlParallelReductionMax< float, long int> :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionMax< double, long int > > + ( const tnlParallelReductionMax< double, long int>& operation, + const typename tnlParallelReductionMax< double, long int > :: IndexType size, + const typename tnlParallelReductionMax< double, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionMax< double, long int > :: RealType* deviceInput2, + typename tnlParallelReductionMax< double, long int> :: ResultType& result ); + +/*extern template bool reductionOnCudaDevice< tnlParallelReductionMax< long double, long int > > + ( const tnlParallelReductionMax< long double, long int>& operation, + const typename tnlParallelReductionMax< long double, long int > :: IndexType size, + const typename tnlParallelReductionMax< long double, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionMax< long double, long int > :: RealType* deviceInput2, + typename tnlParallelReductionMax< long double, long int> :: ResultType& result );*/ + +/**** + * Abs sum + */ + +extern template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< char, int > > + ( const tnlParallelReductionAbsSum< char, int >& operation, + const typename tnlParallelReductionAbsSum< char, int > :: IndexType size, + const typename tnlParallelReductionAbsSum< char, int > :: RealType* deviceInput1, + const typename tnlParallelReductionAbsSum< char, int > :: RealType* deviceInput2, + typename tnlParallelReductionAbsSum< char, int > :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< int, int > > + ( const tnlParallelReductionAbsSum< int, int >& operation, + const typename tnlParallelReductionAbsSum< int, int > :: IndexType size, + const typename tnlParallelReductionAbsSum< 
int, int > :: RealType* deviceInput1, + const typename tnlParallelReductionAbsSum< int, int > :: RealType* deviceInput2, + typename tnlParallelReductionAbsSum< int, int > :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< float, int > > + ( const tnlParallelReductionAbsSum< float, int >& operation, + const typename tnlParallelReductionAbsSum< float, int > :: IndexType size, + const typename tnlParallelReductionAbsSum< float, int > :: RealType* deviceInput1, + const typename tnlParallelReductionAbsSum< float, int > :: RealType* deviceInput2, + typename tnlParallelReductionAbsSum< float, int> :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< double, int > > + ( const tnlParallelReductionAbsSum< double, int>& operation, + const typename tnlParallelReductionAbsSum< double, int > :: IndexType size, + const typename tnlParallelReductionAbsSum< double, int > :: RealType* deviceInput1, + const typename tnlParallelReductionAbsSum< double, int > :: RealType* deviceInput2, + typename tnlParallelReductionAbsSum< double, int> :: ResultType& result ); + +/*extern template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< long double, int > > + ( const tnlParallelReductionAbsSum< long double, int>& operation, + const typename tnlParallelReductionAbsSum< long double, int > :: IndexType size, + const typename tnlParallelReductionAbsSum< long double, int > :: RealType* deviceInput1, + const typename tnlParallelReductionAbsSum< long double, int > :: RealType* deviceInput2, + typename tnlParallelReductionAbsSum< long double, int> :: ResultType& result );*/ + +extern template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< char, long int > > + ( const tnlParallelReductionAbsSum< char, long int >& operation, + const typename tnlParallelReductionAbsSum< char, long int > :: IndexType size, + const typename tnlParallelReductionAbsSum< char, long int > :: RealType* deviceInput1, 
+ const typename tnlParallelReductionAbsSum< char, long int > :: RealType* deviceInput2, + typename tnlParallelReductionAbsSum< char, long int > :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< int, long int > > + ( const tnlParallelReductionAbsSum< int, long int >& operation, + const typename tnlParallelReductionAbsSum< int, long int > :: IndexType size, + const typename tnlParallelReductionAbsSum< int, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionAbsSum< int, long int > :: RealType* deviceInput2, + typename tnlParallelReductionAbsSum< int, long int > :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< float, long int > > + ( const tnlParallelReductionAbsSum< float, long int >& operation, + const typename tnlParallelReductionAbsSum< float, long int > :: IndexType size, + const typename tnlParallelReductionAbsSum< float, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionAbsSum< float, long int > :: RealType* deviceInput2, + typename tnlParallelReductionAbsSum< float, long int> :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< double, long int > > + ( const tnlParallelReductionAbsSum< double, long int>& operation, + const typename tnlParallelReductionAbsSum< double, long int > :: IndexType size, + const typename tnlParallelReductionAbsSum< double, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionAbsSum< double, long int > :: RealType* deviceInput2, + typename tnlParallelReductionAbsSum< double, long int> :: ResultType& result ); + +/*extern template bool reductionOnCudaDevice< tnlParallelReductionAbsSum< long double, long int > > + ( const tnlParallelReductionAbsSum< long double, long int>& operation, + const typename tnlParallelReductionAbsSum< long double, long int > :: IndexType size, + const typename tnlParallelReductionAbsSum< long 
double, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionAbsSum< long double, long int > :: RealType* deviceInput2, + typename tnlParallelReductionAbsSum< long double, long int> :: ResultType& result );*/ + +/**** + * Abs min + */ + +extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< char, int > > + ( const tnlParallelReductionAbsMin< char, int >& operation, + const typename tnlParallelReductionAbsMin< char, int > :: IndexType size, + const typename tnlParallelReductionAbsMin< char, int > :: RealType* deviceInput1, + const typename tnlParallelReductionAbsMin< char, int > :: RealType* deviceInput2, + typename tnlParallelReductionAbsMin< char, int > :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< int, int > > + ( const tnlParallelReductionAbsMin< int, int >& operation, + const typename tnlParallelReductionAbsMin< int, int > :: IndexType size, + const typename tnlParallelReductionAbsMin< int, int > :: RealType* deviceInput1, + const typename tnlParallelReductionAbsMin< int, int > :: RealType* deviceInput2, + typename tnlParallelReductionAbsMin< int, int > :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< float, int > > + ( const tnlParallelReductionAbsMin< float, int >& operation, + const typename tnlParallelReductionAbsMin< float, int > :: IndexType size, + const typename tnlParallelReductionAbsMin< float, int > :: RealType* deviceInput1, + const typename tnlParallelReductionAbsMin< float, int > :: RealType* deviceInput2, + typename tnlParallelReductionAbsMin< float, int> :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< double, int > > + ( const tnlParallelReductionAbsMin< double, int>& operation, + const typename tnlParallelReductionAbsMin< double, int > :: IndexType size, + const typename tnlParallelReductionAbsMin< double, int > :: RealType* deviceInput1, + const 
typename tnlParallelReductionAbsMin< double, int > :: RealType* deviceInput2, + typename tnlParallelReductionAbsMin< double, int> :: ResultType& result ); + +/*extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< long double, int > > + ( const tnlParallelReductionAbsMin< long double, int>& operation, + const typename tnlParallelReductionAbsMin< long double, int > :: IndexType size, + const typename tnlParallelReductionAbsMin< long double, int > :: RealType* deviceInput1, + const typename tnlParallelReductionAbsMin< long double, int > :: RealType* deviceInput2, + typename tnlParallelReductionAbsMin< long double, int> :: ResultType& result );*/ + +extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< char, long int > > + ( const tnlParallelReductionAbsMin< char, long int >& operation, + const typename tnlParallelReductionAbsMin< char, long int > :: IndexType size, + const typename tnlParallelReductionAbsMin< char, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionAbsMin< char, long int > :: RealType* deviceInput2, + typename tnlParallelReductionAbsMin< char, long int > :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< int, long int > > + ( const tnlParallelReductionAbsMin< int, long int >& operation, + const typename tnlParallelReductionAbsMin< int, long int > :: IndexType size, + const typename tnlParallelReductionAbsMin< int, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionAbsMin< int, long int > :: RealType* deviceInput2, + typename tnlParallelReductionAbsMin< int, long int > :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< float, long int > > + ( const tnlParallelReductionAbsMin< float, long int >& operation, + const typename tnlParallelReductionAbsMin< float, long int > :: IndexType size, + const typename tnlParallelReductionAbsMin< float, long int > :: RealType* 
deviceInput1, + const typename tnlParallelReductionAbsMin< float, long int > :: RealType* deviceInput2, + typename tnlParallelReductionAbsMin< float, long int> :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< double, long int > > + ( const tnlParallelReductionAbsMin< double, long int>& operation, + const typename tnlParallelReductionAbsMin< double, long int > :: IndexType size, + const typename tnlParallelReductionAbsMin< double, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionAbsMin< double, long int > :: RealType* deviceInput2, + typename tnlParallelReductionAbsMin< double, long int> :: ResultType& result ); + +/*extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMin< long double, long int > > + ( const tnlParallelReductionAbsMin< long double, long int>& operation, + const typename tnlParallelReductionAbsMin< long double, long int > :: IndexType size, + const typename tnlParallelReductionAbsMin< long double, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionAbsMin< long double, long int > :: RealType* deviceInput2, + typename tnlParallelReductionAbsMin< long double, long int> :: ResultType& result );*/ +/**** + * Abs max + */ + +extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< char, int > > + ( const tnlParallelReductionAbsMax< char, int >& operation, + const typename tnlParallelReductionAbsMax< char, int > :: IndexType size, + const typename tnlParallelReductionAbsMax< char, int > :: RealType* deviceInput1, + const typename tnlParallelReductionAbsMax< char, int > :: RealType* deviceInput2, + typename tnlParallelReductionAbsMax< char, int > :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< int, int > > + ( const tnlParallelReductionAbsMax< int, int >& operation, + const typename tnlParallelReductionAbsMax< int, int > :: IndexType size, + const typename 
tnlParallelReductionAbsMax< int, int > :: RealType* deviceInput1, + const typename tnlParallelReductionAbsMax< int, int > :: RealType* deviceInput2, + typename tnlParallelReductionAbsMax< int, int > :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< float, int > > + ( const tnlParallelReductionAbsMax< float, int >& operation, + const typename tnlParallelReductionAbsMax< float, int > :: IndexType size, + const typename tnlParallelReductionAbsMax< float, int > :: RealType* deviceInput1, + const typename tnlParallelReductionAbsMax< float, int > :: RealType* deviceInput2, + typename tnlParallelReductionAbsMax< float, int> :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< double, int > > + ( const tnlParallelReductionAbsMax< double, int>& operation, + const typename tnlParallelReductionAbsMax< double, int > :: IndexType size, + const typename tnlParallelReductionAbsMax< double, int > :: RealType* deviceInput1, + const typename tnlParallelReductionAbsMax< double, int > :: RealType* deviceInput2, + typename tnlParallelReductionAbsMax< double, int> :: ResultType& result ); + +/*extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< long double, int > > + ( const tnlParallelReductionAbsMax< long double, int>& operation, + const typename tnlParallelReductionAbsMax< long double, int > :: IndexType size, + const typename tnlParallelReductionAbsMax< long double, int > :: RealType* deviceInput1, + const typename tnlParallelReductionAbsMax< long double, int > :: RealType* deviceInput2, + typename tnlParallelReductionAbsMax< long double, int> :: ResultType& result );*/ + +extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< char, long int > > + ( const tnlParallelReductionAbsMax< char, long int >& operation, + const typename tnlParallelReductionAbsMax< char, long int > :: IndexType size, + const typename tnlParallelReductionAbsMax< char, long int > 
:: RealType* deviceInput1, + const typename tnlParallelReductionAbsMax< char, long int > :: RealType* deviceInput2, + typename tnlParallelReductionAbsMax< char, long int > :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< int, long int > > + ( const tnlParallelReductionAbsMax< int, long int >& operation, + const typename tnlParallelReductionAbsMax< int, long int > :: IndexType size, + const typename tnlParallelReductionAbsMax< int, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionAbsMax< int, long int > :: RealType* deviceInput2, + typename tnlParallelReductionAbsMax< int, long int > :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< float, long int > > + ( const tnlParallelReductionAbsMax< float, long int >& operation, + const typename tnlParallelReductionAbsMax< float, long int > :: IndexType size, + const typename tnlParallelReductionAbsMax< float, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionAbsMax< float, long int > :: RealType* deviceInput2, + typename tnlParallelReductionAbsMax< float, long int> :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< double, long int > > + ( const tnlParallelReductionAbsMax< double, long int>& operation, + const typename tnlParallelReductionAbsMax< double, long int > :: IndexType size, + const typename tnlParallelReductionAbsMax< double, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionAbsMax< double, long int > :: RealType* deviceInput2, + typename tnlParallelReductionAbsMax< double, long int> :: ResultType& result ); + +/*extern template bool reductionOnCudaDevice< tnlParallelReductionAbsMax< long double, long int > > + ( const tnlParallelReductionAbsMax< long double, long int>& operation, + const typename tnlParallelReductionAbsMax< long double, long int > :: IndexType size, + const typename 
tnlParallelReductionAbsMax< long double, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionAbsMax< long double, long int > :: RealType* deviceInput2, + typename tnlParallelReductionAbsMax< long double, long int> :: ResultType& result );*/ + +/**** + * Logical AND + */ +extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< char, int > > + ( const tnlParallelReductionLogicalAnd< char, int >& operation, + const typename tnlParallelReductionLogicalAnd< char, int > :: IndexType size, + const typename tnlParallelReductionLogicalAnd< char, int > :: RealType* deviceInput1, + const typename tnlParallelReductionLogicalAnd< char, int > :: RealType* deviceInput2, + typename tnlParallelReductionLogicalAnd< char, int > :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< int, int > > + ( const tnlParallelReductionLogicalAnd< int, int >& operation, + const typename tnlParallelReductionLogicalAnd< int, int > :: IndexType size, + const typename tnlParallelReductionLogicalAnd< int, int > :: RealType* deviceInput1, + const typename tnlParallelReductionLogicalAnd< int, int > :: RealType* deviceInput2, + typename tnlParallelReductionLogicalAnd< int, int > :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< float, int > > + ( const tnlParallelReductionLogicalAnd< float, int >& operation, + const typename tnlParallelReductionLogicalAnd< float, int > :: IndexType size, + const typename tnlParallelReductionLogicalAnd< float, int > :: RealType* deviceInput1, + const typename tnlParallelReductionLogicalAnd< float, int > :: RealType* deviceInput2, + typename tnlParallelReductionLogicalAnd< float, int> :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< double, int > > + ( const tnlParallelReductionLogicalAnd< double, int>& operation, + const typename tnlParallelReductionLogicalAnd< double, int 
> :: IndexType size, + const typename tnlParallelReductionLogicalAnd< double, int > :: RealType* deviceInput1, + const typename tnlParallelReductionLogicalAnd< double, int > :: RealType* deviceInput2, + typename tnlParallelReductionLogicalAnd< double, int> :: ResultType& result ); + +/*extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< long double, int > > + ( const tnlParallelReductionLogicalAnd< long double, int>& operation, + const typename tnlParallelReductionLogicalAnd< long double, int > :: IndexType size, + const typename tnlParallelReductionLogicalAnd< long double, int > :: RealType* deviceInput1, + const typename tnlParallelReductionLogicalAnd< long double, int > :: RealType* deviceInput2, + typename tnlParallelReductionLogicalAnd< long double, int> :: ResultType& result );*/ + +extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< char, long int > > + ( const tnlParallelReductionLogicalAnd< char, long int >& operation, + const typename tnlParallelReductionLogicalAnd< char, long int > :: IndexType size, + const typename tnlParallelReductionLogicalAnd< char, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionLogicalAnd< char, long int > :: RealType* deviceInput2, + typename tnlParallelReductionLogicalAnd< char, long int > :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< int, long int > > + ( const tnlParallelReductionLogicalAnd< int, long int >& operation, + const typename tnlParallelReductionLogicalAnd< int, long int > :: IndexType size, + const typename tnlParallelReductionLogicalAnd< int, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionLogicalAnd< int, long int > :: RealType* deviceInput2, + typename tnlParallelReductionLogicalAnd< int, long int > :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< float, long int > > + ( const 
tnlParallelReductionLogicalAnd< float, long int >& operation, + const typename tnlParallelReductionLogicalAnd< float, long int > :: IndexType size, + const typename tnlParallelReductionLogicalAnd< float, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionLogicalAnd< float, long int > :: RealType* deviceInput2, + typename tnlParallelReductionLogicalAnd< float, long int> :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< double, long int > > + ( const tnlParallelReductionLogicalAnd< double, long int>& operation, + const typename tnlParallelReductionLogicalAnd< double, long int > :: IndexType size, + const typename tnlParallelReductionLogicalAnd< double, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionLogicalAnd< double, long int > :: RealType* deviceInput2, + typename tnlParallelReductionLogicalAnd< double, long int> :: ResultType& result ); + +/*extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalAnd< long double, long int > > + ( const tnlParallelReductionLogicalAnd< long double, long int>& operation, + const typename tnlParallelReductionLogicalAnd< long double, long int > :: IndexType size, + const typename tnlParallelReductionLogicalAnd< long double, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionLogicalAnd< long double, long int > :: RealType* deviceInput2, + typename tnlParallelReductionLogicalAnd< long double, long int> :: ResultType& result );*/ + +/**** + * Logical OR + */ +extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< char, int > > + ( const tnlParallelReductionLogicalOr< char, int >& operation, + const typename tnlParallelReductionLogicalOr< char, int > :: IndexType size, + const typename tnlParallelReductionLogicalOr< char, int > :: RealType* deviceInput1, + const typename tnlParallelReductionLogicalOr< char, int > :: RealType* deviceInput2, + typename 
tnlParallelReductionLogicalOr< char, int > :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< int, int > > + ( const tnlParallelReductionLogicalOr< int, int >& operation, + const typename tnlParallelReductionLogicalOr< int, int > :: IndexType size, + const typename tnlParallelReductionLogicalOr< int, int > :: RealType* deviceInput1, + const typename tnlParallelReductionLogicalOr< int, int > :: RealType* deviceInput2, + typename tnlParallelReductionLogicalOr< int, int > :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< float, int > > + ( const tnlParallelReductionLogicalOr< float, int >& operation, + const typename tnlParallelReductionLogicalOr< float, int > :: IndexType size, + const typename tnlParallelReductionLogicalOr< float, int > :: RealType* deviceInput1, + const typename tnlParallelReductionLogicalOr< float, int > :: RealType* deviceInput2, + typename tnlParallelReductionLogicalOr< float, int> :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< double, int > > + ( const tnlParallelReductionLogicalOr< double, int>& operation, + const typename tnlParallelReductionLogicalOr< double, int > :: IndexType size, + const typename tnlParallelReductionLogicalOr< double, int > :: RealType* deviceInput1, + const typename tnlParallelReductionLogicalOr< double, int > :: RealType* deviceInput2, + typename tnlParallelReductionLogicalOr< double, int> :: ResultType& result ); + +/*extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< long double, int > > + ( const tnlParallelReductionLogicalOr< long double, int>& operation, + const typename tnlParallelReductionLogicalOr< long double, int > :: IndexType size, + const typename tnlParallelReductionLogicalOr< long double, int > :: RealType* deviceInput1, + const typename tnlParallelReductionLogicalOr< long double, int > :: RealType* deviceInput2, + 
typename tnlParallelReductionLogicalOr< long double, int> :: ResultType& result );*/ + +extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< char, long int > > + ( const tnlParallelReductionLogicalOr< char, long int >& operation, + const typename tnlParallelReductionLogicalOr< char, long int > :: IndexType size, + const typename tnlParallelReductionLogicalOr< char, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionLogicalOr< char, long int > :: RealType* deviceInput2, + typename tnlParallelReductionLogicalOr< char, long int > :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< int, long int > > + ( const tnlParallelReductionLogicalOr< int, long int >& operation, + const typename tnlParallelReductionLogicalOr< int, long int > :: IndexType size, + const typename tnlParallelReductionLogicalOr< int, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionLogicalOr< int, long int > :: RealType* deviceInput2, + typename tnlParallelReductionLogicalOr< int, long int > :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< float, long int > > + ( const tnlParallelReductionLogicalOr< float, long int >& operation, + const typename tnlParallelReductionLogicalOr< float, long int > :: IndexType size, + const typename tnlParallelReductionLogicalOr< float, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionLogicalOr< float, long int > :: RealType* deviceInput2, + typename tnlParallelReductionLogicalOr< float, long int> :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< double, long int > > + ( const tnlParallelReductionLogicalOr< double, long int>& operation, + const typename tnlParallelReductionLogicalOr< double, long int > :: IndexType size, + const typename tnlParallelReductionLogicalOr< double, long int > :: RealType* deviceInput1, + const 
typename tnlParallelReductionLogicalOr< double, long int > :: RealType* deviceInput2, + typename tnlParallelReductionLogicalOr< double, long int> :: ResultType& result ); + +/*extern template bool reductionOnCudaDevice< tnlParallelReductionLogicalOr< long double, long int > > + ( const tnlParallelReductionLogicalOr< long double, long int>& operation, + const typename tnlParallelReductionLogicalOr< long double, long int > :: IndexType size, + const typename tnlParallelReductionLogicalOr< long double, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionLogicalOr< long double, long int > :: RealType* deviceInput2, + typename tnlParallelReductionLogicalOr< long double, long int> :: ResultType& result );*/ + + +/**** + * Lp Norm + */ +extern template bool reductionOnCudaDevice< tnlParallelReductionLpNorm< float, int > > + ( const tnlParallelReductionLpNorm< float, int >& operation, + const typename tnlParallelReductionLpNorm< float, int > :: IndexType size, + const typename tnlParallelReductionLpNorm< float, int > :: RealType* deviceInput1, + const typename tnlParallelReductionLpNorm< float, int > :: RealType* deviceInput2, + typename tnlParallelReductionLpNorm< float, int> :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionLpNorm< double, int > > + ( const tnlParallelReductionLpNorm< double, int>& operation, + const typename tnlParallelReductionLpNorm< double, int > :: IndexType size, + const typename tnlParallelReductionLpNorm< double, int > :: RealType* deviceInput1, + const typename tnlParallelReductionLpNorm< double, int > :: RealType* deviceInput2, + typename tnlParallelReductionLpNorm< double, int> :: ResultType& result ); + +/*extern template bool reductionOnCudaDevice< tnlParallelReductionLpNorm< long double, int > > + ( const tnlParallelReductionLpNorm< long double, int>& operation, + const typename tnlParallelReductionLpNorm< long double, int > :: IndexType size, + const typename 
tnlParallelReductionLpNorm< long double, int > :: RealType* deviceInput1, + const typename tnlParallelReductionLpNorm< long double, int > :: RealType* deviceInput2, + typename tnlParallelReductionLpNorm< long double, int> :: ResultType& result );*/ + +extern template bool reductionOnCudaDevice< tnlParallelReductionLpNorm< char, long int > > + ( const tnlParallelReductionLpNorm< char, long int >& operation, + const typename tnlParallelReductionLpNorm< char, long int > :: IndexType size, + const typename tnlParallelReductionLpNorm< char, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionLpNorm< char, long int > :: RealType* deviceInput2, + typename tnlParallelReductionLpNorm< char, long int > :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionLpNorm< int, long int > > + ( const tnlParallelReductionLpNorm< int, long int >& operation, + const typename tnlParallelReductionLpNorm< int, long int > :: IndexType size, + const typename tnlParallelReductionLpNorm< int, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionLpNorm< int, long int > :: RealType* deviceInput2, + typename tnlParallelReductionLpNorm< int, long int > :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionLpNorm< float, long int > > + ( const tnlParallelReductionLpNorm< float, long int >& operation, + const typename tnlParallelReductionLpNorm< float, long int > :: IndexType size, + const typename tnlParallelReductionLpNorm< float, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionLpNorm< float, long int > :: RealType* deviceInput2, + typename tnlParallelReductionLpNorm< float, long int> :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionLpNorm< double, long int > > + ( const tnlParallelReductionLpNorm< double, long int>& operation, + const typename tnlParallelReductionLpNorm< double, long int > :: IndexType size, + 
const typename tnlParallelReductionLpNorm< double, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionLpNorm< double, long int > :: RealType* deviceInput2, + typename tnlParallelReductionLpNorm< double, long int> :: ResultType& result ); + +/*extern template bool reductionOnCudaDevice< tnlParallelReductionLpNorm< long double, long int > > + ( const tnlParallelReductionLpNorm< long double, long int>& operation, + const typename tnlParallelReductionLpNorm< long double, long int > :: IndexType size, + const typename tnlParallelReductionLpNorm< long double, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionLpNorm< long double, long int > :: RealType* deviceInput2, + typename tnlParallelReductionLpNorm< long double, long int> :: ResultType& result );*/ + + +/**** + * Equalities + */ +extern template bool reductionOnCudaDevice< tnlParallelReductionEqualities< char, int > > + ( const tnlParallelReductionEqualities< char, int >& operation, + const typename tnlParallelReductionEqualities< char, int > :: IndexType size, + const typename tnlParallelReductionEqualities< char, int > :: RealType* deviceInput1, + const typename tnlParallelReductionEqualities< char, int > :: RealType* deviceInput2, + typename tnlParallelReductionEqualities< char, int > :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionEqualities< int, int > > + ( const tnlParallelReductionEqualities< int, int >& operation, + const typename tnlParallelReductionEqualities< int, int > :: IndexType size, + const typename tnlParallelReductionEqualities< int, int > :: RealType* deviceInput1, + const typename tnlParallelReductionEqualities< int, int > :: RealType* deviceInput2, + typename tnlParallelReductionEqualities< int, int > :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionEqualities< float, int > > + ( const tnlParallelReductionEqualities< float, int >& operation, + const typename 
tnlParallelReductionEqualities< float, int > :: IndexType size, + const typename tnlParallelReductionEqualities< float, int > :: RealType* deviceInput1, + const typename tnlParallelReductionEqualities< float, int > :: RealType* deviceInput2, + typename tnlParallelReductionEqualities< float, int> :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionEqualities< double, int > > + ( const tnlParallelReductionEqualities< double, int>& operation, + const typename tnlParallelReductionEqualities< double, int > :: IndexType size, + const typename tnlParallelReductionEqualities< double, int > :: RealType* deviceInput1, + const typename tnlParallelReductionEqualities< double, int > :: RealType* deviceInput2, + typename tnlParallelReductionEqualities< double, int> :: ResultType& result ); + +/*extern template bool reductionOnCudaDevice< tnlParallelReductionEqualities< long double, int > > + ( const tnlParallelReductionEqualities< long double, int>& operation, + const typename tnlParallelReductionEqualities< long double, int > :: IndexType size, + const typename tnlParallelReductionEqualities< long double, int > :: RealType* deviceInput1, + const typename tnlParallelReductionEqualities< long double, int > :: RealType* deviceInput2, + typename tnlParallelReductionEqualities< long double, int> :: ResultType& result );*/ + +extern template bool reductionOnCudaDevice< tnlParallelReductionEqualities< char, long int > > + ( const tnlParallelReductionEqualities< char, long int >& operation, + const typename tnlParallelReductionEqualities< char, long int > :: IndexType size, + const typename tnlParallelReductionEqualities< char, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionEqualities< char, long int > :: RealType* deviceInput2, + typename tnlParallelReductionEqualities< char, long int > :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionEqualities< int, long int > > + ( const 
tnlParallelReductionEqualities< int, long int >& operation, + const typename tnlParallelReductionEqualities< int, long int > :: IndexType size, + const typename tnlParallelReductionEqualities< int, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionEqualities< int, long int > :: RealType* deviceInput2, + typename tnlParallelReductionEqualities< int, long int > :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionEqualities< float, long int > > + ( const tnlParallelReductionEqualities< float, long int >& operation, + const typename tnlParallelReductionEqualities< float, long int > :: IndexType size, + const typename tnlParallelReductionEqualities< float, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionEqualities< float, long int > :: RealType* deviceInput2, + typename tnlParallelReductionEqualities< float, long int> :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionEqualities< double, long int > > + ( const tnlParallelReductionEqualities< double, long int>& operation, + const typename tnlParallelReductionEqualities< double, long int > :: IndexType size, + const typename tnlParallelReductionEqualities< double, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionEqualities< double, long int > :: RealType* deviceInput2, + typename tnlParallelReductionEqualities< double, long int> :: ResultType& result ); + +/*extern template bool reductionOnCudaDevice< tnlParallelReductionEqualities< long double, long int > > + ( const tnlParallelReductionEqualities< long double, long int>& operation, + const typename tnlParallelReductionEqualities< long double, long int > :: IndexType size, + const typename tnlParallelReductionEqualities< long double, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionEqualities< long double, long int > :: RealType* deviceInput2, + typename tnlParallelReductionEqualities< 
long double, long int> :: ResultType& result );*/ + + +/**** + * Inequalities + */ +extern template bool reductionOnCudaDevice< tnlParallelReductionInequalities< char, int > > + ( const tnlParallelReductionInequalities< char, int >& operation, + const typename tnlParallelReductionInequalities< char, int > :: IndexType size, + const typename tnlParallelReductionInequalities< char, int > :: RealType* deviceInput1, + const typename tnlParallelReductionInequalities< char, int > :: RealType* deviceInput2, + typename tnlParallelReductionInequalities< char, int > :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionInequalities< int, int > > + ( const tnlParallelReductionInequalities< int, int >& operation, + const typename tnlParallelReductionInequalities< int, int > :: IndexType size, + const typename tnlParallelReductionInequalities< int, int > :: RealType* deviceInput1, + const typename tnlParallelReductionInequalities< int, int > :: RealType* deviceInput2, + typename tnlParallelReductionInequalities< int, int > :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionInequalities< float, int > > + ( const tnlParallelReductionInequalities< float, int >& operation, + const typename tnlParallelReductionInequalities< float, int > :: IndexType size, + const typename tnlParallelReductionInequalities< float, int > :: RealType* deviceInput1, + const typename tnlParallelReductionInequalities< float, int > :: RealType* deviceInput2, + typename tnlParallelReductionInequalities< float, int> :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionInequalities< double, int > > + ( const tnlParallelReductionInequalities< double, int>& operation, + const typename tnlParallelReductionInequalities< double, int > :: IndexType size, + const typename tnlParallelReductionInequalities< double, int > :: RealType* deviceInput1, + const typename tnlParallelReductionInequalities< 
double, int > :: RealType* deviceInput2, + typename tnlParallelReductionInequalities< double, int> :: ResultType& result ); + +/*extern template bool reductionOnCudaDevice< tnlParallelReductionInequalities< long double, int > > + ( const tnlParallelReductionInequalities< long double, int>& operation, + const typename tnlParallelReductionInequalities< long double, int > :: IndexType size, + const typename tnlParallelReductionInequalities< long double, int > :: RealType* deviceInput1, + const typename tnlParallelReductionInequalities< long double, int > :: RealType* deviceInput2, + typename tnlParallelReductionInequalities< long double, int> :: ResultType& result );*/ + +extern template bool reductionOnCudaDevice< tnlParallelReductionInequalities< char, long int > > + ( const tnlParallelReductionInequalities< char, long int >& operation, + const typename tnlParallelReductionInequalities< char, long int > :: IndexType size, + const typename tnlParallelReductionInequalities< char, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionInequalities< char, long int > :: RealType* deviceInput2, + typename tnlParallelReductionInequalities< char, long int > :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionInequalities< int, long int > > + ( const tnlParallelReductionInequalities< int, long int >& operation, + const typename tnlParallelReductionInequalities< int, long int > :: IndexType size, + const typename tnlParallelReductionInequalities< int, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionInequalities< int, long int > :: RealType* deviceInput2, + typename tnlParallelReductionInequalities< int, long int > :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionInequalities< float, long int > > + ( const tnlParallelReductionInequalities< float, long int >& operation, + const typename tnlParallelReductionInequalities< float, long int > :: IndexType 
size, + const typename tnlParallelReductionInequalities< float, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionInequalities< float, long int > :: RealType* deviceInput2, + typename tnlParallelReductionInequalities< float, long int> :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionInequalities< double, long int > > + ( const tnlParallelReductionInequalities< double, long int>& operation, + const typename tnlParallelReductionInequalities< double, long int > :: IndexType size, + const typename tnlParallelReductionInequalities< double, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionInequalities< double, long int > :: RealType* deviceInput2, + typename tnlParallelReductionInequalities< double, long int> :: ResultType& result ); + +/*extern template bool reductionOnCudaDevice< tnlParallelReductionInequalities< long double, long int > > + ( const tnlParallelReductionInequalities< long double, long int>& operation, + const typename tnlParallelReductionInequalities< long double, long int > :: IndexType size, + const typename tnlParallelReductionInequalities< long double, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionInequalities< long double, long int > :: RealType* deviceInput2, + typename tnlParallelReductionInequalities< long double, long int> :: ResultType& result );*/ + + +/**** + * Sdot + */ +extern template bool reductionOnCudaDevice< tnlParallelReductionSdot< char, int > > + ( const tnlParallelReductionSdot< char, int >& operation, + const typename tnlParallelReductionSdot< char, int > :: IndexType size, + const typename tnlParallelReductionSdot< char, int > :: RealType* deviceInput1, + const typename tnlParallelReductionSdot< char, int > :: RealType* deviceInput2, + typename tnlParallelReductionSdot< char, int > :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionSdot< int, int > > + ( const 
tnlParallelReductionSdot< int, int >& operation, + const typename tnlParallelReductionSdot< int, int > :: IndexType size, + const typename tnlParallelReductionSdot< int, int > :: RealType* deviceInput1, + const typename tnlParallelReductionSdot< int, int > :: RealType* deviceInput2, + typename tnlParallelReductionSdot< int, int > :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionSdot< float, int > > + ( const tnlParallelReductionSdot< float, int >& operation, + const typename tnlParallelReductionSdot< float, int > :: IndexType size, + const typename tnlParallelReductionSdot< float, int > :: RealType* deviceInput1, + const typename tnlParallelReductionSdot< float, int > :: RealType* deviceInput2, + typename tnlParallelReductionSdot< float, int> :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionSdot< double, int > > + ( const tnlParallelReductionSdot< double, int>& operation, + const typename tnlParallelReductionSdot< double, int > :: IndexType size, + const typename tnlParallelReductionSdot< double, int > :: RealType* deviceInput1, + const typename tnlParallelReductionSdot< double, int > :: RealType* deviceInput2, + typename tnlParallelReductionSdot< double, int> :: ResultType& result ); + +/*extern template bool reductionOnCudaDevice< tnlParallelReductionSdot< long double, int > > + ( const tnlParallelReductionSdot< long double, int>& operation, + const typename tnlParallelReductionSdot< long double, int > :: IndexType size, + const typename tnlParallelReductionSdot< long double, int > :: RealType* deviceInput1, + const typename tnlParallelReductionSdot< long double, int > :: RealType* deviceInput2, + typename tnlParallelReductionSdot< long double, int> :: ResultType& result );*/ + +extern template bool reductionOnCudaDevice< tnlParallelReductionSdot< char, long int > > + ( const tnlParallelReductionSdot< char, long int >& operation, + const typename tnlParallelReductionSdot< char, 
long int > :: IndexType size, + const typename tnlParallelReductionSdot< char, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionSdot< char, long int > :: RealType* deviceInput2, + typename tnlParallelReductionSdot< char, long int > :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionSdot< int, long int > > + ( const tnlParallelReductionSdot< int, long int >& operation, + const typename tnlParallelReductionSdot< int, long int > :: IndexType size, + const typename tnlParallelReductionSdot< int, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionSdot< int, long int > :: RealType* deviceInput2, + typename tnlParallelReductionSdot< int, long int > :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionSdot< float, long int > > + ( const tnlParallelReductionSdot< float, long int >& operation, + const typename tnlParallelReductionSdot< float, long int > :: IndexType size, + const typename tnlParallelReductionSdot< float, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionSdot< float, long int > :: RealType* deviceInput2, + typename tnlParallelReductionSdot< float, long int> :: ResultType& result ); + +extern template bool reductionOnCudaDevice< tnlParallelReductionSdot< double, long int > > + ( const tnlParallelReductionSdot< double, long int>& operation, + const typename tnlParallelReductionSdot< double, long int > :: IndexType size, + const typename tnlParallelReductionSdot< double, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionSdot< double, long int > :: RealType* deviceInput2, + typename tnlParallelReductionSdot< double, long int> :: ResultType& result ); + +/*extern template bool reductionOnCudaDevice< tnlParallelReductionSdot< long double, long int > > + ( const tnlParallelReductionSdot< long double, long int>& operation, + const typename tnlParallelReductionSdot< long double, long int > 
:: IndexType size, + const typename tnlParallelReductionSdot< long double, long int > :: RealType* deviceInput1, + const typename tnlParallelReductionSdot< long double, long int > :: RealType* deviceInput2, + typename tnlParallelReductionSdot< long double, long int> :: ResultType& result );*/ + +#endif /* TEMPLATE_EXPLICIT_INSTANTIATION */ + +#endif /* CUDA_REDUCTION_IMPL_H_ */ diff --git a/src/implementation/core/cuda/reduction_impl.h b/src/implementation/core/cuda/reduction_impl.h deleted file mode 100644 index 388afec7010a0e44defbd82785d69b5337011b06..0000000000000000000000000000000000000000 --- a/src/implementation/core/cuda/reduction_impl.h +++ /dev/null @@ -1,805 +0,0 @@ -/*************************************************************************** - cuda-long-vector-kernels.h - description - ------------------- - begin : Oct 28, 2010 - copyright : (C) 2010 by Tomas Oberhuber - email : tomas.oberhuber@fjfi.cvut.cz - ***************************************************************************/ - -/*************************************************************************** - * * - * This program is free software; you can redistribute it and/or modify * - * it under the terms of the GNU General Public License as published by * - * the Free Software Foundation; either version 2 of the License, or * - * (at your option) any later version. 
* - * * - ***************************************************************************/ - -#ifndef CUDALONGVECTORKERNELS_H_ -#define CUDALONGVECTORKERNELS_H_ - -#ifdef HAVE_CUDA -#include <cuda.h> -#endif -#include <iostream> -#include <core/tnlVector.h> - -using namespace std; - -enum tnlTupleOperation { tnlParallelReductionMin = 1, - tnlParallelReductionMax, - tnlParallelReductionSum, - tnlParallelReductionAbsMin, - tnlParallelReductionAbsMax, - tnlParallelReductionAbsSum, - tnlParallelReductionLpNorm, - tnlParallelReductionSdot }; - -/**** - * This constant says that arrays smaller than its value - * are going to be reduced on CPU. - */ -const int maxGPUReductionDataSize = 256; - -/**** - * The following kernels and functions have been adopted from - * - * M. Harris, “Optimizing parallel reduction in cuda,†NVIDIA CUDA SDK, 2007. - * - * The code was extended even for data arrays with size different from - * a power of 2. - * - * For the educative and also testing/debugging reasons we have 6 version of this algorithm. - * The slower version can be found as a part o ftesting code. See directory tests. - * Version 1 is the slowest and version 6 is the fastest - teste on CUDA architecture 1.0 - 1.3. - * Another improvements are possible for the future devices. - * - */ - -#ifdef HAVE_CUDA -/*** - * This function returns minimum of two numbers stored on the device. - */ -template< class T > __device__ T tnlCudaMin( const T& a, - const T& b ) -{ - return a < b ? a : b; -} - -__device__ int tnlCudaMin( const int& a, - const int& b ) -{ - return min( a, b ); -} - -__device__ float tnlCudaMin( const float& a, - const float& b ) -{ - return fminf( a, b ); -} - -__device__ double tnlCudaMin( const double& a, - const double& b ) -{ - return fmin( a, b ); -} - -/*** - * This function returns maximum of two numbers stored on the device. - */ -template< class T > __device__ T tnlCudaMax( const T& a, - const T& b ) -{ - return a > b ? 
a : b; -} - -__device__ int tnlCudaMax( const int& a, - const int& b ) -{ - return max( a, b ); -} - -__device__ float tnlCudaMax( const float& a, - const float& b ) -{ - return fmaxf( a, b ); -} - -__device__ double tnlCudaMax( const double& a, - const double& b ) -{ - return fmax( a, b ); -} - -/*** - * This function returns absolute value of given number on the device. - */ -__device__ int tnlCudaAbs( const int& a ) -{ - return abs( a ); -} - -__device__ float tnlCudaAbs( const float& a ) -{ - return fabs( a ); -} - -__device__ double tnlCudaAbs( const double& a ) -{ - return fabs( a ); -} - -/*** - * For each thread in block with thread ID smaller then s this function reduces - * data elements with indecis tid and tid + s. Here we assume that for each - * tid the tid + s element also exists i.e. we have even number of elements. - */ -template< class T, tnlTupleOperation operation > -__device__ void reduceAligned( unsigned int tid, - unsigned int s, - T* sdata ) -{ - if( tid < s ) - { - if( operation == tnlParallelReductionMin ) - sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + s ] ); - if( operation == tnlParallelReductionMax ) - sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + s ] ); - if( operation == tnlParallelReductionSum ) - sdata[ tid ] += sdata[ tid + s ]; - if( operation == tnlParallelReductionAbsMin ) - sdata[ tid ] = tnlCudaMin( tnlCudaAbs( sdata[ tid ] ), tnlCudaAbs( sdata[ tid + s ] ) ); - if( operation == tnlParallelReductionAbsMax ) - sdata[ tid ] = tnlCudaMax( tnlCudaAbs( sdata[ tid ] ), tnlCudaAbs( sdata[ tid + s ] ) ); - if( operation == tnlParallelReductionLpNorm || - operation == tnlParallelReductionSdot ) - sdata[ tid ] = sdata[ tid ] + sdata[ tid + s ]; - } -} - -/*** - * For each thread in block with thread ID smaller then s this function reduces - * data elements with indices tid and tid + s. This is a modified version of - * the previous algorithm. Thid one works even for odd number of elements but - * it is a bit slower. 
- */ -template< class T, tnlTupleOperation operation > -__device__ void reduceNonAligned( unsigned int tid, - unsigned int s, - unsigned int n, - T* sdata ) -{ - if( tid < s ) - { - if( operation == tnlParallelReductionMin ) - sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + s ] ); - if( operation == tnlParallelReductionMax ) - sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + s ] ); - if( operation == tnlParallelReductionSum ) - sdata[ tid ] += sdata[ tid + s ]; - if( operation == tnlParallelReductionAbsMin ) - sdata[ tid ] = tnlCudaMin( tnlCudaAbs( sdata[ tid ] ), tnlCudaAbs( sdata[ tid + s ] ) ); - if( operation == tnlParallelReductionAbsMax ) - sdata[ tid ] = tnlCudaMax( tnlCudaAbs( sdata[ tid ] ), tnlCudaAbs( sdata[ tid + s ] ) ); - if( operation == tnlParallelReductionLpNorm || - operation == tnlParallelReductionSdot ) - sdata[ tid ] = sdata[ tid ] + sdata[ tid + s ]; - } - /* This is for the case when we have odd number of elements. - * The last one will be reduced using the thread with ID 0. - */ - if( s > 32 ) - __syncthreads(); - if( 2 * s < n && tid == n - 1 ) - { - if( operation == tnlParallelReductionMin ) - sdata[ 0 ] = tnlCudaMin( sdata[ 0 ], sdata[ tid ] ); - if( operation == tnlParallelReductionMax ) - sdata[ 0 ] = tnlCudaMax( sdata[ 0 ], sdata[ tid ] ); - if( operation == tnlParallelReductionSum ) - sdata[ 0 ] += sdata[ tid ]; - if( operation == tnlParallelReductionAbsMin ) - sdata[ 0 ] = tnlCudaMin( tnlCudaAbs( sdata[ 0] ), tnlCudaAbs( sdata[ tid + s ] ) ); - if( operation == tnlParallelReductionAbsMax ) - sdata[ 0 ] = tnlCudaMax( tnlCudaAbs( sdata[ 0 ] ), tnlCudaAbs( sdata[ tid + s ] ) ); - if( operation == tnlParallelReductionLpNorm || - operation == tnlParallelReductionSdot ) - sdata[ 0 ] = sdata[ 0 ] + sdata[ tid + s ]; - - } -} - -/*** - * The parallel reduction of one vector. - * - * WARNING: This kernel only reduce data in one block. Use rather tnlCUDASimpleReduction2 - * to call this kernel then doing it by yourself. 
- * This kernel is very inefficient. It is here only for educative and testing reasons. - * Please use tnlCUDAReduction instead. - * - * The kernel parameters: - * @param size is the number of all element to reduce - not just in one block. - * @param deviceInput input data which we want to reduce - * @param deviceOutput an array to which we write the result of reduction. - * Each block of the grid writes one element in this array - * (i.e. the size of this array equals the number of CUDA blocks). - */ -template < typename Type, typename ParameterType, typename Index, tnlTupleOperation operation, int blockSize > -__global__ void tnlCUDAReductionKernel( const Index size, - const Type* deviceInput, - const Type* deviceInput2, - Type* deviceOutput, - const ParameterType parameter, - Type* dbg_array1 = 0 ) -{ - extern __shared__ __align__ ( 8 ) char __sdata[]; - Type* sdata = reinterpret_cast< Type* >( __sdata ); - - /*** - * Get thread id (tid) and global thread id (gid). - * lastTId is the last relevant thread id in this block. - * gridSize is the number of element processed by all blocks at the - * same time. - */ - unsigned int tid = threadIdx. x; - unsigned int gid = 2 * blockIdx. x * blockDim. x + threadIdx. x; - unsigned int lastTId = size - 2 * blockIdx. x * blockDim. x; - unsigned int gridSize = 2 * blockDim. x * gridDim.x; - - /*** - * Read data into the shared memory. We start with the - * sequential reduction. - */ - if( gid + blockDim. x < size ) - { - if( operation == tnlParallelReductionMin ) - sdata[ tid ] = tnlCudaMin( deviceInput[ gid ], deviceInput[ gid + blockDim. x ] ); - if( operation == tnlParallelReductionMax ) - sdata[ tid ] = tnlCudaMax( deviceInput[ gid ], deviceInput[ gid + blockDim. x ] ); - if( operation == tnlParallelReductionAbsMin ) - sdata[ tid ] = tnlCudaMin( tnlCudaAbs( deviceInput[ gid ] ), tnlCudaAbs( deviceInput[ gid + blockDim. 
x ] ) ); - if( operation == tnlParallelReductionAbsMax ) - sdata[ tid ] = tnlCudaMax( tnlCudaAbs( deviceInput[ gid ] ), tnlCudaAbs( deviceInput[ gid + blockDim. x ] ) ); - if( operation == tnlParallelReductionSum ) - sdata[ tid ] = deviceInput[ gid ] + deviceInput[ gid + blockDim. x ]; - if( operation == tnlParallelReductionLpNorm ) - sdata[ tid ] = powf( tnlCudaAbs( deviceInput[ gid ] ), parameter ) + - powf( tnlCudaAbs( deviceInput[ gid + blockDim. x ] ), parameter ); - if( operation == tnlParallelReductionSdot ) - sdata[ tid ] = deviceInput[ gid ] * deviceInput2[ gid ] + - deviceInput[ gid + blockDim. x ] * deviceInput2[ gid + blockDim. x ]; - } - else if( gid < size ) - { - if( operation == tnlParallelReductionLpNorm ) - sdata[ tid ] = powf( tnlCudaAbs( deviceInput[ gid ] ), parameter ); - else - if( operation == tnlParallelReductionSdot ) - sdata[ tid ] = deviceInput[ gid ] * deviceInput2[ gid ]; - else - sdata[ tid ] = deviceInput[ gid ]; - } - gid += gridSize; - while( gid + blockDim. x < size ) - { - if( operation == tnlParallelReductionMin ) - sdata[ tid ] = :: tnlCudaMin( sdata[ tid ], :: tnlCudaMin( deviceInput[ gid ], deviceInput[ gid + blockDim. x ] ) ); - if( operation == tnlParallelReductionMax ) - sdata[ tid ] = :: tnlCudaMax( sdata[ tid ], :: tnlCudaMax( deviceInput[ gid ], deviceInput[ gid + blockDim. x ] ) ); - if( operation == tnlParallelReductionAbsMin ) - sdata[ tid ] = :: tnlCudaMin( tnlCudaAbs( sdata[ tid ] ), :: tnlCudaMin( tnlCudaAbs( deviceInput[ gid ] ), tnlCudaAbs( deviceInput[ gid + blockDim. x ] ) ) ); - if( operation == tnlParallelReductionAbsMax ) - sdata[ tid ] = :: tnlCudaMax( tnlCudaAbs( sdata[ tid ] ), :: tnlCudaMax( tnlCudaAbs( deviceInput[ gid ] ), tnlCudaAbs( deviceInput[ gid + blockDim. x ] ) ) ); - if( operation == tnlParallelReductionSum ) - sdata[ tid ] += deviceInput[gid] + deviceInput[ gid + blockDim. 
x ]; - if( operation == tnlParallelReductionLpNorm ) - sdata[ tid ] += powf( tnlCudaAbs( deviceInput[gid] ), parameter ) + - powf( tnlCudaAbs( deviceInput[ gid + blockDim. x ] ), parameter ); - if( operation == tnlParallelReductionSdot ) - sdata[ tid ] += deviceInput[ gid ] * deviceInput2[ gid ] + - deviceInput[ gid + blockDim. x] * deviceInput2[ gid + blockDim. x ]; - gid += gridSize; - } - if( gid < size ) - { - if( operation == tnlParallelReductionMin ) - sdata[ tid ] = :: tnlCudaMin( sdata[ tid ], deviceInput[ gid ] ); - if( operation == tnlParallelReductionMax ) - sdata[ tid ] = :: tnlCudaMax( sdata[ tid ], deviceInput[ gid ] ); - if( operation == tnlParallelReductionAbsMin ) - sdata[ tid ] = :: tnlCudaMin( tnlCudaAbs( sdata[ tid ] ), tnlCudaAbs( deviceInput[ gid ] ) ); - if( operation == tnlParallelReductionAbsMax ) - sdata[ tid ] = :: tnlCudaMax( tnlCudaAbs( sdata[ tid ] ), tnlCudaAbs( deviceInput[ gid ] ) ); - if( operation == tnlParallelReductionSum ) - sdata[ tid ] += deviceInput[gid]; - if( operation == tnlParallelReductionLpNorm ) - sdata[ tid ] += powf( tnlCudaAbs( deviceInput[gid] ), parameter ); - if( operation == tnlParallelReductionSdot ) - sdata[ tid ] += deviceInput[ gid ] * deviceInput2[ gid ]; - } - __syncthreads(); - - - /*** - * Process the parallel reduction. - * We reduce the data with step s which is one half of the elements to reduce. - * Each thread with ID < s reduce elements tid and tid + s. The result is stored - * in shared memory in sdata 0 .. s. We set s = s / 2 ( i.e. s >>= 1) and repeat - * the algorithm again until s = 1. - * We also separate the case when the blockDim. x is power of 2 and the algorithm - * can be written in more efficient way without some conditions. - */ - unsigned int n = lastTId < blockDim. x ? lastTId : blockDim. 
x; - if( n == 128 || n == 64 || n == 32 || n == 16 || - n == 8 || n == 4 || n == 2 || n == 256 || - n == 512 ) - { - if( blockSize >= 512 ) - { - if( tid < 256 ) - reduceAligned< Type, operation >( tid, 256, sdata ); - __syncthreads(); - } - if( blockSize >= 256 ) - { - if( tid < 128 ) - reduceAligned< Type, operation >( tid, 128, sdata ); - __syncthreads(); - } - if( blockSize >= 128 ) - { - if( tid < 64 ) - reduceAligned< Type, operation >( tid, 64, sdata ); - __syncthreads(); - } - - /*** - * This runs in one warp so it is synchronised implicitly. - */ - if (tid < 32) - { - if( blockSize >= 64 ) - reduceAligned< Type, operation >( tid, 32, sdata ); - if( blockSize >= 32 ) - reduceAligned< Type, operation >( tid, 16, sdata ); - if( blockSize >= 16 ) - reduceAligned< Type, operation >( tid, 8, sdata ); - if( blockSize >= 8 ) - reduceAligned< Type, operation >( tid, 4, sdata ); - if( blockSize >= 4 ) - reduceAligned< Type, operation >( tid, 2, sdata ); - if( blockSize >= 2 ) - reduceAligned< Type, operation >( tid, 1, sdata ); - } - } - else - { - unsigned int s; - if( n >= 512 ) - { - s = n / 2; - reduceNonAligned< Type, operation >( tid, s, n, sdata ); - n = s; - __syncthreads(); - } - if( n >= 256 ) - { - s = n / 2; - reduceNonAligned< Type, operation >( tid, s, n, sdata ); - n = s; - __syncthreads(); - } - if( n >= 128 ) - { - s = n / 2; - reduceNonAligned< Type, operation >( tid, s, n, sdata ); - n = s; - __syncthreads(); - } - if( n >= 64 ) - { - s = n / 2; - reduceNonAligned< Type, operation >( tid, s, n, sdata ); - n = s; - __syncthreads(); - } - if( n >= 32 ) - { - s = n / 2; - reduceNonAligned< Type, operation >( tid, s, n, sdata ); - n = s; - __syncthreads(); - } - /*** - * This runs in one warp so it is synchronised implicitly. 
- */ - if( n >= 16 ) - { - s = n / 2; - reduceNonAligned< Type, operation >( tid, s, n, sdata ); - n = s; - } - if( n >= 8 ) - { - s = n / 2; - reduceNonAligned< Type, operation >( tid, s, n, sdata ); - n = s; - } - if( n >= 4 ) - { - s = n / 2; - reduceNonAligned< Type, operation >( tid, s, n, sdata ); - n = s; - } - if( n >= 2 ) - { - s = n / 2; - reduceNonAligned< Type, operation >( tid, s, n, sdata ); - n = s; - } - - } - - /*** - * Store the result back in the global memory. - */ - if( tid == 0 ) - deviceOutput[ blockIdx. x ] = sdata[ 0 ]; -} - -#endif -/*** - * The template calling the final CUDA kernel for the single vector reduction. - * The template parameters are: - * @param T is the type of data we want to reduce - * @param operation is the operation reducing the data. - * It can be tnlParallelReductionSum, tnlParallelReductionMin or tnlParallelReductionMax. - * The function parameters: - * @param size tells number of elements in the data array. - * @param deviceInput1 is the pointer to an array storing the data we want - * to reduce. This array must stay on the device!. - * @param deviceInput2 is the pointer to an array storing the coupling data for example - * the second vector for the SDOT operation. This array must stay on the device!. - * @param result will contain the result of the reduction if everything was ok - * and the return code is true. - * @param parameter can be used for example for the passing the parameter p of Lp norm. - * @param deviceAux is auxiliary array used to store temporary data during the reduction. - * If one calls this function more then once one might provide this array to avoid repetetive - * allocation of this array on the device inside of this function. - * The size of this array should be size / 128 * sizeof( T ). 
- */ -template< typename Type, typename ParameterType, typename Index, tnlTupleOperation operation > -bool tnlCUDALongVectorReduction( const Index size, - const Type* deviceInput1, - const Type* deviceInput2, - Type& result, - const ParameterType& parameter, - Type* deviceAux = 0 ) -{ -#ifdef HAVE_CUDA - /*** - * Set parameters: - * @param desBlockSize is desired block size with which we get the best performance (on CUDA rach 1.0 to 1.3) - * @param desGridSize is desired grid size - */ - const int desBlockSize = 512; - const int desGridSize = 2048; - - Type* dbg_array1; // debuging array - - /*** - * Allocating auxiliary device memory to store temporary reduced arrays. - * For example in the first iteration we reduce the number of elements - * from size to size / 2. We store this new data in deviceAux array. - * If one calls the CUDA reduction more then once then one can provide - * auxiliary array by passing it via the parameter deviceAux. - */ - tnlVector< Type, tnlCuda > deviceAuxVct( "tnlCUDAOneVectorReduction:deviceAuxVct" ); - if( ! deviceAux ) - { - int sizeAlloc = :: Max( 1, size / desBlockSize ); - if( ! deviceAuxVct. setSize( sizeAlloc ) ) - return false; - deviceAux = deviceAuxVct. getData(); - } - - /*** - * Setup parameters of the kernel: - * @param sizeReduced is the size of reduced data after each step of parallel reduction - * @param reductionInput tells what data we shell reduce. We start with the input if this fuction - * and after the 1st reduction step we switch this pointer to deviceAux. - */ - int sizeReduced = size; - const Type* reductionInput1 = deviceInput1; - const Type* reductionInput2 = deviceInput2; - int reductionSteps( 0 ); - while( sizeReduced > maxGPUReductionDataSize ) - { - dim3 blockSize( 0 ), gridSize( 0 ); - blockSize. x = :: Min( sizeReduced, desBlockSize ); - gridSize. x = :: Min( ( int ) ( sizeReduced / blockSize. x + 1 ) / 2, desGridSize ); - - /*** - * We align the blockSize to the power of 2. 
- */ - Index alignedBlockSize = 1; - while( alignedBlockSize < blockSize. x ) alignedBlockSize <<= 1; - blockSize. x = alignedBlockSize; - Index shmem = blockSize. x * sizeof( Type ); - /*** - * Depending on the blockSize we generate appropriate template instance. - */ - if( reductionSteps > 0 && - ( operation == tnlParallelReductionSdot || - operation == tnlParallelReductionLpNorm ) ) - { - /*** - * For operations like SDOT or LpNorm we need to switch to tnlParallelReductionSum after the - * first reduction step. - */ - switch( blockSize. x ) - { - case 512: - tnlCUDAReductionKernel< Type, ParameterType, Index, tnlParallelReductionSum, 512 > - <<< gridSize, blockSize, shmem >>>( sizeReduced, reductionInput1, reductionInput2, deviceAux, parameter, dbg_array1 ); - break; - case 256: - tnlCUDAReductionKernel< Type, ParameterType, Index, tnlParallelReductionSum, 256 > - <<< gridSize, blockSize, shmem >>>( sizeReduced, reductionInput1, reductionInput2, deviceAux, parameter, dbg_array1 ); - break; - case 128: - tnlCUDAReductionKernel< Type, ParameterType, Index, tnlParallelReductionSum, 128 > - <<< gridSize, blockSize, shmem >>>( sizeReduced, reductionInput1, reductionInput2, deviceAux, parameter, dbg_array1 ); - break; - case 64: - tnlCUDAReductionKernel< Type, ParameterType, Index, tnlParallelReductionSum, 64 > - <<< gridSize, blockSize, shmem >>>( sizeReduced, reductionInput1, reductionInput2, deviceAux, parameter, dbg_array1 ); - break; - case 32: - tnlCUDAReductionKernel< Type, ParameterType, Index, tnlParallelReductionSum, 32 > - <<< gridSize, blockSize, shmem >>>( sizeReduced, reductionInput1, reductionInput2, deviceAux, parameter, dbg_array1 ); - break; - case 16: - tnlCUDAReductionKernel< Type, ParameterType, Index, tnlParallelReductionSum, 16 > - <<< gridSize, blockSize, shmem >>>( sizeReduced, reductionInput1, reductionInput2, deviceAux, parameter, dbg_array1 ); - break; - case 8: - tnlCUDAReductionKernel< Type, ParameterType, Index, tnlParallelReductionSum, 
8 > - <<< gridSize, blockSize, shmem >>>( sizeReduced, reductionInput1, reductionInput2, deviceAux, parameter, dbg_array1 ); - break; - case 4: - tnlCUDAReductionKernel< Type, ParameterType, Index, tnlParallelReductionSum, 4 > - <<< gridSize, blockSize, shmem >>>( sizeReduced, reductionInput1, reductionInput2, deviceAux, parameter, dbg_array1 ); - break; - case 2: - tnlCUDAReductionKernel< Type, ParameterType, Index, tnlParallelReductionSum, 2 > - <<< gridSize, blockSize, shmem >>>( sizeReduced, reductionInput1, reductionInput2, deviceAux, parameter, dbg_array1 ); - break; - case 1: - tnlAssert( false, cerr << "blockSize should not be 1." << endl ); - break; - default: - tnlAssert( false, cerr << "Block size is " << blockSize. x << " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." ); - break; - } - } - else - switch( blockSize. x ) - { - case 512: - tnlCUDAReductionKernel< Type, ParameterType, Index, operation, 512 > - <<< gridSize, blockSize, shmem >>>( sizeReduced, reductionInput1, reductionInput2, deviceAux, parameter, dbg_array1 ); - break; - case 256: - tnlCUDAReductionKernel< Type, ParameterType, Index, operation, 256 > - <<< gridSize, blockSize, shmem >>>( sizeReduced, reductionInput1, reductionInput2, deviceAux, parameter, dbg_array1 ); - break; - case 128: - tnlCUDAReductionKernel< Type, ParameterType, Index, operation, 128 > - <<< gridSize, blockSize, shmem >>>( sizeReduced, reductionInput1, reductionInput2, deviceAux, parameter, dbg_array1 ); - break; - case 64: - tnlCUDAReductionKernel< Type, ParameterType, Index, operation, 64 > - <<< gridSize, blockSize, shmem >>>( sizeReduced, reductionInput1, reductionInput2, deviceAux, parameter, dbg_array1 ); - break; - case 32: - tnlCUDAReductionKernel< Type, ParameterType, Index, operation, 32 > - <<< gridSize, blockSize, shmem >>>( sizeReduced, reductionInput1, reductionInput2, deviceAux, parameter, dbg_array1 ); - break; - case 16: - tnlCUDAReductionKernel< Type, ParameterType, Index, operation, 16 
> - <<< gridSize, blockSize, shmem >>>( sizeReduced, reductionInput1, reductionInput2, deviceAux, parameter, dbg_array1 ); - break; - case 8: - tnlCUDAReductionKernel< Type, ParameterType, Index, operation, 8 > - <<< gridSize, blockSize, shmem >>>( sizeReduced, reductionInput1, reductionInput2, deviceAux, parameter, dbg_array1 ); - break; - case 4: - tnlCUDAReductionKernel< Type, ParameterType, Index, operation, 4 > - <<< gridSize, blockSize, shmem >>>( sizeReduced, reductionInput1, reductionInput2, deviceAux, parameter, dbg_array1 ); - break; - case 2: - tnlCUDAReductionKernel< Type, ParameterType, Index, operation, 2 > - <<< gridSize, blockSize, shmem >>>( sizeReduced, reductionInput1, reductionInput2, deviceAux, parameter, dbg_array1 ); - break; - case 1: - tnlAssert( false, cerr << "blockSize should not be 1." << endl ); - break; - default: - tnlAssert( false, cerr << "Block size is " << blockSize. x << " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." ); - break; - } - sizeReduced = gridSize. x; - reductionInput1 = deviceAux; - reductionSteps ++; - } - - /*** - * We transfer reduced data from device to host. - * If sizeReduced equals size the previous loop was not processed and we read - * data directly from the input. 
- */ - Type result_array[ maxGPUReductionDataSize ]; - Type result_array2[ maxGPUReductionDataSize ]; - if( sizeReduced == size ) - { - if( cudaMemcpy( result_array, deviceInput1, sizeReduced * sizeof( Type ), cudaMemcpyDeviceToHost ) != cudaSuccess ) - { - CHECK_CUDA_ERROR; - return false; - } - switch( operation ) - { - case tnlParallelReductionLpNorm: - result = pow( tnlAbs( result_array[ 0 ] ), parameter ); - for( Index i = 1; i < sizeReduced; i ++ ) - result += pow( tnlAbs( result_array[ i ] ), parameter ); - result = pow( result, 1.0/ parameter ); - return true; - case tnlParallelReductionSdot: - if( cudaMemcpy( result_array2, deviceInput2, sizeReduced * sizeof( Type ), cudaMemcpyDeviceToHost ) != cudaSuccess ) - { - CHECK_CUDA_ERROR; - } - else - { - result = 0; - for( Index i = 0; i < sizeReduced; i ++ ) - result += result_array[ i ] * result_array2[ i ] ; - return true; - } - } - } - else - if( cudaMemcpy( result_array, deviceAux, sizeReduced * sizeof( Type ), cudaMemcpyDeviceToHost ) != cudaSuccess ) - { - CHECK_CUDA_ERROR; - return false; - } - switch( operation ) - { - case tnlParallelReductionMax: - result = result_array[ 0 ]; - for( Index i = 1; i < sizeReduced; i ++ ) - result = Max( result, result_array[ i ] ); - break; - case tnlParallelReductionMin: - result = result_array[ 0 ]; - for( Index i = 1; i < sizeReduced; i ++ ) - result = Min( result, result_array[ i ] ); - break; - case tnlParallelReductionSum: - case tnlParallelReductionLpNorm: - case tnlParallelReductionSdot: - result = result_array[ 0 ]; - for( Index i = 1; i < sizeReduced; i ++ ) - result += result_array[ i ]; - break; - case tnlParallelReductionAbsMax: - result = tnlAbs( result_array[ 0 ] ); - for( Index i = 1; i < sizeReduced; i ++ ) - result = Max( result, tnlAbs( result_array[ i ] ) ); - break; - case tnlParallelReductionAbsMin: - result = tnlAbs( result_array[ 0 ] ); - for( Index i = 1; i < sizeReduced; i ++ ) - result = Min( result, tnlAbs( result_array[ i ] ) ); - break; - } 
- return true; -#else - cerr << "I am sorry but CUDA support is missing on this system " << __FILE__ << " line " << __LINE__ << "." << endl; - return false; -#endif -}; - -#ifdef HAVE_CUDA -/*** - * This kernel just compares two vectors element by element. It writes - * the result of the comparison into array result. This array must be - * then reduced. - */ -template< typename Real, typename Index > -__global__ void compareTwoVectorsElementwise( const Index size, - const Real* vector1, - const Real* vector2, - bool* result ) -{ - Index gid = blockDim. x * blockIdx. x + threadIdx. x; - if( gid < size ) - { - if( vector1[ gid ] == vector2[ gid ] ) - result[ gid ] = true; - else - result[ gid ] = false; - } -} -#endif - -/*** - * The template for comparison of two long vectors on the CUDA device. - * The template parameters are: - * @param T is the type of data we want to reduce - * @param operation is the operation reducing the data. - * It can be tnlParallelReductionSum, tnlParallelReductionMin or tnlParallelReductionMax. - * The function parameters: - * @param size tells number of elements in the data array. - * @param deviceInput1 is the pointer to an array storing the data we want - * to reduce. This array must stay on the device!. - * @param deviceInput2 is the pointer to an array storing the coupling data for example - * the second vector for the SDOT operation. This array most stay on the device!. - * @param result will contain the result of the reduction if everything was ok - * and the return code is true. - * @param deviceAux is auxiliary array used to store temporary data during the reduction. - * If one calls this function more then once one might provide this array to avoid repetetive - * allocation of this array on the device inside of this function. - * The size of this array should be size / 128 * sizeof( T ). - * - * This function first calls kernel which compares each couples of elements from both vectors. - * Result is written into a bool array. 
The minimum value then says if both vectors equal. - * - */ -template< typename Type, typename Index > -bool tnlCUDALongVectorComparison( const Index size, - const Type* deviceInput1, - const Type* deviceInput2, - bool* deviceBoolAux = 0, - Type* deviceAux = 0 ) -{ -#ifdef HAVE_CUDA - tnlAssert( size > 0, - cerr << "You try to compare two CUDA long vectors with non-positive size." << endl - << "The size is " << size ); - tnlVector< bool, tnlCuda, Index > boolArray( "tnlCUDALongVectorComparison:bool_array" ); - if( ! deviceBoolAux ) - { - if( ! boolArray. setSize( size ) ) - return false; - deviceBoolAux = boolArray. getData(); - } - dim3 blockSize( 0 ), gridSize( 0 ); - blockSize. x = 256; - gridSize. x = size / blockSize. x + 1; - - compareTwoVectorsElementwise<<< gridSize, blockSize >>>( size, - deviceInput1, - deviceInput2, - deviceBoolAux ); - CHECK_CUDA_ERROR; - bool result; - if( ! tnlCUDALongVectorReduction< bool, bool, Index, tnlParallelReductionMin >( size, - deviceBoolAux, - ( bool* ) NULL, - result, - 0 ) ) - - - return false; - return result; -#else - cerr << "I am sorry but CUDA support is missing on this system " << __FILE__ << " line " << __LINE__ << "." << endl; - return; -#endif -} - -#endif /* CUDALONGVECTORKERNELS_H_ */ diff --git a/src/implementation/core/memory-operations.h b/src/implementation/core/memory-operations.h index 759ca067c4e059a456362f5a4fdf64d417b03a28..09a721996b0ae099bc98d6c0fb7f974baa5a60fc 100644 --- a/src/implementation/core/memory-operations.h +++ b/src/implementation/core/memory-operations.h @@ -42,6 +42,8 @@ bool allocateMemoryCuda( Element*& data, ( size_t ) size * sizeof( Element ) ) != cudaSuccess ) data = 0; return checkCudaDevice; +#else + return false; #endif } @@ -58,8 +60,10 @@ bool freeMemoryCuda( Element* data ) #ifdef HAVE_CUDA cudaFree( data ); return checkCudaDevice; -#endif +#else + cerr << "I am sorry but CUDA support is missing on this system " << __FILE__ << " line " << __LINE__ << "." 
<< endl; return true; +#endif } template< typename Element, typename Index > @@ -101,7 +105,7 @@ bool setMemoryCuda( Element* data, blockSize. x = 256; Index blocksNumber = ceil( ( double ) size / ( double ) blockSize. x ); Index elementsPerThread = ceil( ( double ) blocksNumber / ( double ) maxCudaGridSize ); - gridSize. x = Min( blocksNumber, maxCudaGridSize ); + gridSize. x = Min( blocksNumber, ( Index ) maxCudaGridSize ); //cout << "blocksNumber = " << blocksNumber << "Grid size = " << gridSize. x << " elementsPerThread = " << elementsPerThread << endl; setVectorValueCudaKernel<<< blockSize, gridSize >>>( data, size, value, elementsPerThread ); @@ -140,7 +144,7 @@ bool copyMemoryHostToCuda( Element* destination, } return true; #else - cerr << "CUDA support is missing in this system." << endl; + cerr << "CUDA support is missing on this system " << __FILE__ << " line " << __LINE__ << "." << endl; return false; #endif } @@ -163,7 +167,7 @@ bool copyMemoryCudaToHost( Element* destination, } return true; #else - cerr << "CUDA support is missing in this system." << endl; + cerr << "CUDA support is missing on this system " << __FILE__ << " line " << __LINE__ << "." << endl; return false; #endif } @@ -180,7 +184,7 @@ bool copyMemoryCudaToCuda( Element* destination, cudaMemcpyDeviceToDevice ) != cudaSuccess ) return checkCudaDevice; #else - cerr << "CUDA support is missing in this system." << endl; + cerr << "CUDA support is missing on this system " << __FILE__ << " line " << __LINE__ << "." 
<< endl; return false; #endif } diff --git a/src/implementation/core/cuda/reduction-operations_impl.cu b/src/implementation/core/memory-operations_impl.cpp similarity index 87% rename from src/implementation/core/cuda/reduction-operations_impl.cu rename to src/implementation/core/memory-operations_impl.cpp index b86aa68fcb6b246962d6c8a2c6a24f2fe861b0ec..c6f387e5f0b31de151f23d6c355e9c29d7bd3f3c 100644 --- a/src/implementation/core/cuda/reduction-operations_impl.cu +++ b/src/implementation/core/memory-operations_impl.cpp @@ -1,7 +1,7 @@ /*************************************************************************** - reduction-operations.cu - description + memory_operations_impl.cpp - description ------------------- - begin : Mar 22, 2013 + begin : Mar 24, 2013 copyright : (C) 2013 by Tomas Oberhuber email : tomas.oberhuber@fjfi.cvut.cz ***************************************************************************/ @@ -13,4 +13,8 @@ * the Free Software Foundation; either version 2 of the License, or * * (at your option) any later version. 
* * * - ***************************************************************************/ \ No newline at end of file + ***************************************************************************/ + + + + diff --git a/src/implementation/core/memory-operations_impl.cu b/src/implementation/core/memory-operations_impl.cu new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/src/implementation/core/vector-operations.h b/src/implementation/core/vector-operations.h index b584cf245b9aaffad20f8cf17fe2297e2576667f..d06c1cb5c35f41a5ca8f36d491a813abfc2ea6ad 100644 --- a/src/implementation/core/vector-operations.h +++ b/src/implementation/core/vector-operations.h @@ -18,7 +18,7 @@ #ifndef VECTOROPERATIONS_H_ #define VECTOROPERATIONS_H_ -#include <implementation/core/cuda-long-vector-kernels.h> +#include <core/cuda/cuda-reduction.h> template< typename Vector > typename Vector :: RealType getHostVectorMax( const Vector& v ) @@ -38,7 +38,7 @@ typename Vector :: RealType getCudaVectorMax( const Vector& v ) typedef typename Vector :: RealType Real; typedef typename Vector :: IndexType Index; Real result( 0 ); - /*tnlCUDALongVectorReduction< Real, + /*reductionOnCudaDevice< Real, Real, Index, tnlParallelReductionMax > @@ -85,7 +85,7 @@ typename Vector :: RealType getCudaVectorMin( const Vector& v ) typedef typename Vector :: RealType Real; typedef typename Vector :: IndexType Index; Real result( 0 ); - /*tnlCUDALongVectorReduction< Real, + /*reductionOnCudaDevice< Real, Real, Index, tnlParallelReductionMin > @@ -132,7 +132,7 @@ typename Vector :: RealType getCudaVectorAbsMax( const Vector& v ) typedef typename Vector :: RealType Real; typedef typename Vector :: IndexType Index; Real result( 0 ); - /*tnlCUDALongVectorReduction< Real, + /*reductionOnCudaDevice< Real, Real, Index, tnlParallelReductionAbsMax > @@ -179,7 +179,7 @@ typename Vector :: RealType getCudaVectorAbsMin( const Vector& v ) typedef typename Vector :: 
RealType Real; typedef typename Vector :: IndexType Index; Real result( 0 ); - /*tnlCUDALongVectorReduction< Real, + /*reductionOnCudaDevice< Real, Real, Index, tnlParallelReductionAbsMin > @@ -249,7 +249,7 @@ typename Vector :: RealType getCudaVectorLpNorm( const Vector& v, typedef typename Vector :: IndexType Index; Real result( 0 ); - /*tnlCUDALongVectorReduction< Real, + /*reductionOnCudaDevice< Real, Real, Index, tnlParallelReductionLpNorm > @@ -300,7 +300,7 @@ typename Vector :: RealType getCudaVectorSum( const Vector& v ) typedef typename Vector :: IndexType Index; Real result( 0 ); - /*tnlCUDALongVectorReduction< Real, + /*reductionOnCudaDevice< Real, Real, Index, tnlParallelReductionSum > @@ -713,7 +713,7 @@ typename Vector1 :: RealType getCudaVectorSdot( const Vector1& v1, typedef typename Vector1 :: IndexType Index; Real result( 0 ); - /*tnlCUDALongVectorReduction< Real, + /*reductionOnCudaDevice< Real, Real, Index, tnlParallelReductionSdot > diff --git a/src/solvers/ode/tnlMersonSolver.h b/src/solvers/ode/tnlMersonSolver.h index 34b96dba1de2fe9350d394072981f6693d40ca3c..38f6329815df192b51162bd099d0ff2a079bd4b7 100644 --- a/src/solvers/ode/tnlMersonSolver.h +++ b/src/solvers/ode/tnlMersonSolver.h @@ -19,7 +19,6 @@ #define tnlMersonSolverH #include <math.h> -#include <implementation/core/cuda-long-vector-kernels.h> #include <solvers/ode/tnlExplicitSolver.h> /**** diff --git a/tests/benchmarks/tnl-benchmarks.h b/tests/benchmarks/tnl-benchmarks.h index 465f28c0deccc68e3631e407f4678e724d2919c2..bb40f95948086bcd9cd5ff99cb36eb6a786c7024 100644 --- a/tests/benchmarks/tnl-benchmarks.h +++ b/tests/benchmarks/tnl-benchmarks.h @@ -219,19 +219,19 @@ void reductionBenchmark( const int size, device_aux. getData() ); break; default: - tnlCUDALongVectorReduction< T, T, int, tnlParallelReductionSum >( size, + reductionOnCudaDevice< T, T, int, tnlParallelReductionSum >( size, device_vector. getData(), NULL, sum, 0.0, device_aux. 
getData() ); - tnlCUDALongVectorReduction< T, T, int, tnlParallelReductionMin >( size, + reductionOnCudaDevice< T, T, int, tnlParallelReductionMin >( size, device_vector. getData(), NULL, min, 0.0, device_aux. getData() ); - tnlCUDALongVectorReduction< T, T, int, tnlParallelReductionMax >( size, + reductionOnCudaDevice< T, T, int, tnlParallelReductionMax >( size, device_vector. getData(), NULL, max, diff --git a/tests/unit-tests/core/cuda/tnlCudaReductionTester.h b/tests/unit-tests/core/cuda/tnlCudaReductionTester.h index ee8dbd83a5cd23df63808d27a423c8ebc7d10668..b6d21aa09369775a8887d8e1943a82cb9b4cf2f6 100644 --- a/tests/unit-tests/core/cuda/tnlCudaReductionTester.h +++ b/tests/unit-tests/core/cuda/tnlCudaReductionTester.h @@ -25,7 +25,7 @@ #include <cppunit/TestCase.h> #include <cppunit/Message.h> #include <core/cuda/device-check.h> -#include <implementation/core/cuda-long-vector-kernels.h> +#include <core/cuda/cuda-reduction.h> class tnlCudaReductionTester : public CppUnit :: TestCase { @@ -42,21 +42,40 @@ class tnlCudaReductionTester : public CppUnit :: TestCase suiteOfTests -> addTest( new CppUnit :: TestCaller< tnlCudaReductionTester >( "shortConstantSequenceTest", - &tnlCudaReductionTester :: shortConstantSequenceTest< float > ) + &tnlCudaReductionTester :: shortConstantSequenceTest< double > ) ); - suiteOfTests -> addTest( new CppUnit :: TestCaller< tnlCudaReductionTester >( "longConstantSequenceTest", - &tnlCudaReductionTester :: longConstantSequenceTest< float > ) + &tnlCudaReductionTester :: longConstantSequenceTest< double > ) ); - - suiteOfTests -> addTest( new CppUnit :: TestCaller< tnlCudaReductionTester >( "linearSequenceTest", - &tnlCudaReductionTester :: linearSequenceTest< float > ) + &tnlCudaReductionTester :: linearSequenceTest< double > ) + ); + suiteOfTests -> addTest( new CppUnit :: TestCaller< tnlCudaReductionTester >( + "shortLogicalOperationsTest", + &tnlCudaReductionTester :: shortLogicalOperationsTest< int > ) + ); + suiteOfTests -> 
addTest( new CppUnit :: TestCaller< tnlCudaReductionTester >( + "longLogicalOperationsTest", + &tnlCudaReductionTester :: longLogicalOperationsTest< int > ) + ); + suiteOfTests -> addTest( new CppUnit :: TestCaller< tnlCudaReductionTester >( + "shortComparisonTest", + &tnlCudaReductionTester :: shortComparisonTest< int > ) + ); + suiteOfTests -> addTest( new CppUnit :: TestCaller< tnlCudaReductionTester >( + "longComparisonTest", + &tnlCudaReductionTester :: longComparisonTest< int > ) + ); + suiteOfTests -> addTest( new CppUnit :: TestCaller< tnlCudaReductionTester >( + "shortSdotTest", + &tnlCudaReductionTester :: shortSdotTest< double > ) + ); + suiteOfTests -> addTest( new CppUnit :: TestCaller< tnlCudaReductionTester >( + "longSdotTest", + &tnlCudaReductionTester :: longSdotTest< double > ) ); - - return suiteOfTests; } @@ -76,45 +95,51 @@ class tnlCudaReductionTester : public CppUnit :: TestCase void shortConstantSequenceTest() { const int shortSequence( 128 ); - const int longSequence( 8192 ); RealType *hostData, *deviceData; allocateMemoryHost( hostData, shortSequence ); allocateMemoryCuda( deviceData, shortSequence ); CPPUNIT_ASSERT( checkCudaDevice ); - setConstantSequence( shortSequence, ( RealType ) -1, hostData, deviceData ); RealType result; + setConstantSequence( shortSequence, ( RealType ) -1, hostData, deviceData ); tnlParallelReductionSum< RealType, int > sumOperation; CPPUNIT_ASSERT( - ( tnlCUDALongVectorReduction( sumOperation, shortSequence, deviceData, ( RealType* ) 0, result ) ) ); + ( reductionOnCudaDevice( sumOperation, shortSequence, deviceData, ( RealType* ) 0, result ) ) ); CPPUNIT_ASSERT( result == -shortSequence ); tnlParallelReductionMin< RealType, int > minOperation; CPPUNIT_ASSERT( - ( tnlCUDALongVectorReduction( minOperation, shortSequence, deviceData, ( RealType* ) 0, result ) ) ); + ( reductionOnCudaDevice( minOperation, shortSequence, deviceData, ( RealType* ) 0, result ) ) ); CPPUNIT_ASSERT( result == -1 ); 
tnlParallelReductionMax< RealType, int > maxOperation; CPPUNIT_ASSERT( - ( tnlCUDALongVectorReduction( maxOperation, shortSequence, deviceData, ( RealType* ) 0, result ) ) ); + ( reductionOnCudaDevice( maxOperation, shortSequence, deviceData, ( RealType* ) 0, result ) ) ); CPPUNIT_ASSERT( result == -1 ); tnlParallelReductionAbsSum< RealType, int > absSumOperation; CPPUNIT_ASSERT( - ( tnlCUDALongVectorReduction( absSumOperation, shortSequence, deviceData, ( RealType* ) 0, result ) ) ); + ( reductionOnCudaDevice( absSumOperation, shortSequence, deviceData, ( RealType* ) 0, result ) ) ); CPPUNIT_ASSERT( result == shortSequence ); tnlParallelReductionAbsMin< RealType, int > absMinOperation; CPPUNIT_ASSERT( - ( tnlCUDALongVectorReduction( absMinOperation, shortSequence, deviceData, ( RealType* ) 0, result ) ) ); + ( reductionOnCudaDevice( absMinOperation, shortSequence, deviceData, ( RealType* ) 0, result ) ) ); CPPUNIT_ASSERT( result == 1 ); tnlParallelReductionAbsMax< RealType, int > absMaxOperation; CPPUNIT_ASSERT( - ( tnlCUDALongVectorReduction( absMaxOperation, shortSequence, deviceData, ( RealType* ) 0, result ) ) ); + ( reductionOnCudaDevice( absMaxOperation, shortSequence, deviceData, ( RealType* ) 0, result ) ) ); CPPUNIT_ASSERT( result == 1 ); + tnlParallelReductionLpNorm< RealType, int > lpNormOperation; + lpNormOperation. 
setPower( 2.0 ); + CPPUNIT_ASSERT( + ( reductionOnCudaDevice( lpNormOperation, shortSequence, deviceData, ( RealType* ) 0, result ) ) ); + CPPUNIT_ASSERT( result == shortSequence ); + + freeMemoryHost( hostData ); freeMemoryCuda( deviceData ); CPPUNIT_ASSERT( checkCudaDevice ); @@ -123,45 +148,86 @@ class tnlCudaReductionTester : public CppUnit :: TestCase template< typename RealType > void longConstantSequenceTest() { - const int longSequence( 8192 ); + const int longSequence( 172892 ); RealType *hostData, *deviceData; allocateMemoryHost( hostData, longSequence ); allocateMemoryCuda( deviceData, longSequence ); CPPUNIT_ASSERT( checkCudaDevice ); - setConstantSequence( longSequence, ( RealType ) -1, hostData, deviceData ); RealType result; + setConstantSequence( longSequence, ( RealType ) -1, hostData, deviceData ); tnlParallelReductionSum< RealType, int > sumOperation; CPPUNIT_ASSERT( - ( tnlCUDALongVectorReduction( sumOperation, longSequence, deviceData, ( RealType* ) 0, result ) ) ); + ( reductionOnCudaDevice( sumOperation, longSequence, deviceData, ( RealType* ) 0, result ) ) ); CPPUNIT_ASSERT( result == -longSequence ); tnlParallelReductionMin< RealType, int > minOperation; CPPUNIT_ASSERT( - ( tnlCUDALongVectorReduction( minOperation, longSequence, deviceData, ( RealType* ) 0, result ) ) ); + ( reductionOnCudaDevice( minOperation, longSequence, deviceData, ( RealType* ) 0, result ) ) ); CPPUNIT_ASSERT( result == -1 ); tnlParallelReductionMax< RealType, int > maxOperation; CPPUNIT_ASSERT( - ( tnlCUDALongVectorReduction( maxOperation, longSequence, deviceData, ( RealType* ) 0, result ) ) ); + ( reductionOnCudaDevice( maxOperation, longSequence, deviceData, ( RealType* ) 0, result ) ) ); CPPUNIT_ASSERT( result == -1 ); tnlParallelReductionAbsSum< RealType, int > absSumOperation; CPPUNIT_ASSERT( - ( tnlCUDALongVectorReduction( absSumOperation, longSequence, deviceData, ( RealType* ) 0, result ) ) ); + ( reductionOnCudaDevice( absSumOperation, longSequence, 
deviceData, ( RealType* ) 0, result ) ) ); CPPUNIT_ASSERT( result == longSequence ); tnlParallelReductionAbsMin< RealType, int > absMinOperation; CPPUNIT_ASSERT( - ( tnlCUDALongVectorReduction( absMinOperation, longSequence, deviceData, ( RealType* ) 0, result ) ) ); + ( reductionOnCudaDevice( absMinOperation, longSequence, deviceData, ( RealType* ) 0, result ) ) ); CPPUNIT_ASSERT( result == 1 ); tnlParallelReductionAbsMax< RealType, int > absMaxOperation; CPPUNIT_ASSERT( - ( tnlCUDALongVectorReduction( absMaxOperation, longSequence, deviceData, ( RealType* ) 0, result ) ) ); + ( reductionOnCudaDevice( absMaxOperation, longSequence, deviceData, ( RealType* ) 0, result ) ) ); CPPUNIT_ASSERT( result == 1 ); + tnlParallelReductionLpNorm< RealType, int > lpNormOperation; + lpNormOperation. setPower( 2.0 ); + CPPUNIT_ASSERT( + ( reductionOnCudaDevice( lpNormOperation, longSequence, deviceData, ( RealType* ) 0, result ) ) ); + CPPUNIT_ASSERT( result == longSequence ); + + setConstantSequence( longSequence, ( RealType ) 2, hostData, deviceData ); + CPPUNIT_ASSERT( + ( reductionOnCudaDevice( sumOperation, longSequence, deviceData, ( RealType* ) 0, result ) ) ); + CPPUNIT_ASSERT( result == 2 * longSequence ); + + CPPUNIT_ASSERT( + ( reductionOnCudaDevice( minOperation, longSequence, deviceData, ( RealType* ) 0, result ) ) ); + CPPUNIT_ASSERT( result == 2 ); + + CPPUNIT_ASSERT( + ( reductionOnCudaDevice( maxOperation, longSequence, deviceData, ( RealType* ) 0, result ) ) ); + CPPUNIT_ASSERT( result == 2 ); + + CPPUNIT_ASSERT( + ( reductionOnCudaDevice( absSumOperation, longSequence, deviceData, ( RealType* ) 0, result ) ) ); + CPPUNIT_ASSERT( result == 2 * longSequence ); + + CPPUNIT_ASSERT( + ( reductionOnCudaDevice( absMinOperation, longSequence, deviceData, ( RealType* ) 0, result ) ) ); + CPPUNIT_ASSERT( result == 2 ); + + CPPUNIT_ASSERT( + ( reductionOnCudaDevice( absMaxOperation, longSequence, deviceData, ( RealType* ) 0, result ) ) ); + CPPUNIT_ASSERT( result == 2 ); 
+ + lpNormOperation. setPower( 2.0 ); + CPPUNIT_ASSERT( + ( reductionOnCudaDevice( lpNormOperation, longSequence, deviceData, ( RealType* ) 0, result ) ) ); + CPPUNIT_ASSERT( result == 4 * longSequence ); + lpNormOperation. setPower( 3.0 ); + CPPUNIT_ASSERT( + ( reductionOnCudaDevice( lpNormOperation, longSequence, deviceData, ( RealType* ) 0, result ) ) ); + CPPUNIT_ASSERT( result == 8 * longSequence ); + + freeMemoryHost( hostData ); freeMemoryCuda( deviceData ); CPPUNIT_ASSERT( checkCudaDevice ); @@ -170,7 +236,7 @@ class tnlCudaReductionTester : public CppUnit :: TestCase template< typename RealType > void linearSequenceTest() { - const int size( 1024 ); + const int size( 10245 ); RealType *hostData, *deviceData; allocateMemoryHost( hostData, size ); allocateMemoryCuda( deviceData, size ); @@ -187,31 +253,31 @@ class tnlCudaReductionTester : public CppUnit :: TestCase tnlParallelReductionSum< RealType, int > sumOperation; RealType result; CPPUNIT_ASSERT( - ( tnlCUDALongVectorReduction( sumOperation, size, deviceData, ( RealType* ) 0, result ) ) ); + ( reductionOnCudaDevice( sumOperation, size, deviceData, ( RealType* ) 0, result ) ) ); CPPUNIT_ASSERT( result == sum ); tnlParallelReductionMin< RealType, int > minOperation; CPPUNIT_ASSERT( - ( tnlCUDALongVectorReduction( minOperation, size, deviceData, ( RealType* ) 0, result ) ) ); + ( reductionOnCudaDevice( minOperation, size, deviceData, ( RealType* ) 0, result ) ) ); CPPUNIT_ASSERT( result == -size ); tnlParallelReductionMax< RealType, int > maxOperation; CPPUNIT_ASSERT( - ( tnlCUDALongVectorReduction( maxOperation, size, deviceData, ( RealType* ) 0, result ) ) ); + ( reductionOnCudaDevice( maxOperation, size, deviceData, ( RealType* ) 0, result ) ) ); CPPUNIT_ASSERT( result == -1 ); tnlParallelReductionAbsSum< RealType, int > absSumOperation; CPPUNIT_ASSERT( - ( tnlCUDALongVectorReduction( absSumOperation, size, deviceData, ( RealType* ) 0, result ) ) ); + ( reductionOnCudaDevice( absSumOperation, size, 
deviceData, ( RealType* ) 0, result ) ) ); CPPUNIT_ASSERT( result == tnlAbs( sum ) ); tnlParallelReductionAbsMin< RealType, int > absMinOperation; CPPUNIT_ASSERT( - ( tnlCUDALongVectorReduction( absMinOperation, size, deviceData, ( RealType* ) 0, result ) ) ); + ( reductionOnCudaDevice( absMinOperation, size, deviceData, ( RealType* ) 0, result ) ) ); CPPUNIT_ASSERT( result == 1 ); tnlParallelReductionAbsMax< RealType, int > absMaxOperation; CPPUNIT_ASSERT( - ( tnlCUDALongVectorReduction( absMaxOperation, size, deviceData, ( RealType* ) 0, result ) ) ); + ( reductionOnCudaDevice( absMaxOperation, size, deviceData, ( RealType* ) 0, result ) ) ); CPPUNIT_ASSERT( result == size ); freeMemoryHost( hostData ); @@ -219,6 +285,280 @@ class tnlCudaReductionTester : public CppUnit :: TestCase CPPUNIT_ASSERT( checkCudaDevice ); }; + template< typename Type > + void shortLogicalOperationsTest() + { + int size( 125 ); + Type *hostData, *deviceData; + allocateMemoryHost( hostData, size ); + allocateMemoryCuda( deviceData, size ); + CPPUNIT_ASSERT( checkCudaDevice ); + + for( int i = 0; i < size; i ++ ) + hostData[ i ] = 1; + + copyMemoryHostToCuda( deviceData, hostData, size ); + CPPUNIT_ASSERT( checkCudaDevice ); + + tnlParallelReductionLogicalAnd< Type, int > andOperation; + tnlParallelReductionLogicalOr< Type, int > orOperation; + Type result; + CPPUNIT_ASSERT( + ( reductionOnCudaDevice( andOperation, size, deviceData, ( Type* ) 0, result ) ) ); + CPPUNIT_ASSERT( result == 1 ); + CPPUNIT_ASSERT( + ( reductionOnCudaDevice( orOperation, size, deviceData, ( Type* ) 0, result ) ) ); + CPPUNIT_ASSERT( result == 1 ); + + hostData[ 0 ] = 0; + copyMemoryHostToCuda( deviceData, hostData, size ); + CPPUNIT_ASSERT( checkCudaDevice ); + CPPUNIT_ASSERT( + ( reductionOnCudaDevice( andOperation, size, deviceData, ( Type* ) 0, result ) ) ); + CPPUNIT_ASSERT( result == 0 ); + CPPUNIT_ASSERT( + ( reductionOnCudaDevice( orOperation, size, deviceData, ( Type* ) 0, result ) ) ); + 
CPPUNIT_ASSERT( result == 1 ); + + for( int i = 0; i < size; i ++ ) + hostData[ i ] = 0; + + copyMemoryHostToCuda( deviceData, hostData, size ); + CPPUNIT_ASSERT( checkCudaDevice ); + CPPUNIT_ASSERT( + ( reductionOnCudaDevice( andOperation, size, deviceData, ( Type* ) 0, result ) ) ); + CPPUNIT_ASSERT( result == 0 ); + CPPUNIT_ASSERT( + ( reductionOnCudaDevice( orOperation, size, deviceData, ( Type* ) 0, result ) ) ); + CPPUNIT_ASSERT( result == 0 ); + } + + template< typename Type > + void longLogicalOperationsTest() + { + int size( 7628198 ); + Type *hostData, *deviceData; + allocateMemoryHost( hostData, size ); + allocateMemoryCuda( deviceData, size ); + CPPUNIT_ASSERT( checkCudaDevice ); + + for( int i = 0; i < size; i ++ ) + hostData[ i ] = 1; + + copyMemoryHostToCuda( deviceData, hostData, size ); + CPPUNIT_ASSERT( checkCudaDevice ); + + tnlParallelReductionLogicalAnd< Type, int > andOperation; + tnlParallelReductionLogicalOr< Type, int > orOperation; + Type result; + CPPUNIT_ASSERT( + ( reductionOnCudaDevice( andOperation, size, deviceData, ( Type* ) 0, result ) ) ); + CPPUNIT_ASSERT( result == 1 ); + CPPUNIT_ASSERT( + ( reductionOnCudaDevice( orOperation, size, deviceData, ( Type* ) 0, result ) ) ); + CPPUNIT_ASSERT( result == 1 ); + + hostData[ 0 ] = 0; + copyMemoryHostToCuda( deviceData, hostData, size ); + CPPUNIT_ASSERT( checkCudaDevice ); + CPPUNIT_ASSERT( + ( reductionOnCudaDevice( andOperation, size, deviceData, ( Type* ) 0, result ) ) ); + CPPUNIT_ASSERT( result == 0 ); + CPPUNIT_ASSERT( + ( reductionOnCudaDevice( orOperation, size, deviceData, ( Type* ) 0, result ) ) ); + CPPUNIT_ASSERT( result == 1 ); + + for( int i = 0; i < size; i ++ ) + hostData[ i ] = 0; + + copyMemoryHostToCuda( deviceData, hostData, size ); + CPPUNIT_ASSERT( checkCudaDevice ); + CPPUNIT_ASSERT( + ( reductionOnCudaDevice( andOperation, size, deviceData, ( Type* ) 0, result ) ) ); + CPPUNIT_ASSERT( result == 0 ); + CPPUNIT_ASSERT( + ( reductionOnCudaDevice( orOperation, size, 
deviceData, ( Type* ) 0, result ) ) ); + CPPUNIT_ASSERT( result == 0 ); + } + + template< typename Type > + void shortComparisonTest() + { + const int size( 125 ); + Type *hostData1, *hostData2, + *deviceData1, *deviceData2; + allocateMemoryHost( hostData1, size ); + allocateMemoryHost( hostData2, size ); + allocateMemoryCuda( deviceData1, size ); + allocateMemoryCuda( deviceData2, size ); + CPPUNIT_ASSERT( checkCudaDevice ); + + for( int i = 0; i < size; i ++ ) + hostData1[ i ] = hostData2[ i ] = 1; + copyMemoryHostToCuda( deviceData1, hostData1, size ); + copyMemoryHostToCuda( deviceData2, hostData2, size ); + CPPUNIT_ASSERT( checkCudaDevice ); + + bool result( false ); + tnlParallelReductionEqualities< Type, int > equalityOperation; + tnlParallelReductionInequalities< Type, int > inequalityOperation; + + CPPUNIT_ASSERT( + ( reductionOnCudaDevice( equalityOperation, size, deviceData1, deviceData2, result ) ) ); + CPPUNIT_ASSERT( result == true ); + + CPPUNIT_ASSERT( + ( reductionOnCudaDevice( inequalityOperation, size, deviceData1, deviceData2, result ) ) ); + CPPUNIT_ASSERT( result == false ); + + hostData1[ 0 ] = 0; + copyMemoryHostToCuda( deviceData1, hostData1, size ); + CPPUNIT_ASSERT( checkCudaDevice ); + + CPPUNIT_ASSERT( + ( reductionOnCudaDevice( equalityOperation, size, deviceData1, deviceData2, result ) ) ); + CPPUNIT_ASSERT( result == false ); + + CPPUNIT_ASSERT( + ( reductionOnCudaDevice( inequalityOperation, size, deviceData1, deviceData2, result ) ) ); + CPPUNIT_ASSERT( result == false ); + + for( int i = 0; i < size; i ++ ) + hostData1[ i ] = 0; + copyMemoryHostToCuda( deviceData1, hostData1, size ); + CPPUNIT_ASSERT( checkCudaDevice ); + + CPPUNIT_ASSERT( + ( reductionOnCudaDevice( equalityOperation, size, deviceData1, deviceData2, result ) ) ); + CPPUNIT_ASSERT( result == false ); + + CPPUNIT_ASSERT( + ( reductionOnCudaDevice( inequalityOperation, size, deviceData1, deviceData2, result ) ) ); + CPPUNIT_ASSERT( result == true ); + } + + template< 
typename Type > + void longComparisonTest() + { + const int size( 1258976 ); + Type *hostData1, *hostData2, + *deviceData1, *deviceData2; + allocateMemoryHost( hostData1, size ); + allocateMemoryHost( hostData2, size ); + allocateMemoryCuda( deviceData1, size ); + allocateMemoryCuda( deviceData2, size ); + CPPUNIT_ASSERT( checkCudaDevice ); + + for( int i = 0; i < size; i ++ ) + hostData1[ i ] = hostData2[ i ] = 1; + copyMemoryHostToCuda( deviceData1, hostData1, size ); + copyMemoryHostToCuda( deviceData2, hostData2, size ); + CPPUNIT_ASSERT( checkCudaDevice ); + + bool result( false ); + tnlParallelReductionEqualities< Type, int > equalityOperation; + tnlParallelReductionInequalities< Type, int > inequalityOperation; + + CPPUNIT_ASSERT( + ( reductionOnCudaDevice( equalityOperation, size, deviceData1, deviceData2, result ) ) ); + CPPUNIT_ASSERT( result == true ); + + CPPUNIT_ASSERT( + ( reductionOnCudaDevice( inequalityOperation, size, deviceData1, deviceData2, result ) ) ); + CPPUNIT_ASSERT( result == false ); + + hostData1[ 0 ] = 0; + copyMemoryHostToCuda( deviceData1, hostData1, size ); + CPPUNIT_ASSERT( checkCudaDevice ); + + CPPUNIT_ASSERT( + ( reductionOnCudaDevice( equalityOperation, size, deviceData1, deviceData2, result ) ) ); + CPPUNIT_ASSERT( result == false ); + + CPPUNIT_ASSERT( + ( reductionOnCudaDevice( inequalityOperation, size, deviceData1, deviceData2, result ) ) ); + CPPUNIT_ASSERT( result == false ); + + for( int i = 0; i < size; i ++ ) + hostData1[ i ] = 0; + copyMemoryHostToCuda( deviceData1, hostData1, size ); + CPPUNIT_ASSERT( checkCudaDevice ); + + CPPUNIT_ASSERT( + ( reductionOnCudaDevice( equalityOperation, size, deviceData1, deviceData2, result ) ) ); + CPPUNIT_ASSERT( result == false ); + + CPPUNIT_ASSERT( + ( reductionOnCudaDevice( inequalityOperation, size, deviceData1, deviceData2, result ) ) ); + CPPUNIT_ASSERT( result == true ); + }; + + template< typename Type > + void shortSdotTest() + { + const int size( 125 ); + Type 
*hostData1, *hostData2, + *deviceData1, *deviceData2; + allocateMemoryHost( hostData1, size ); + allocateMemoryHost( hostData2, size ); + allocateMemoryCuda( deviceData1, size ); + allocateMemoryCuda( deviceData2, size ); + CPPUNIT_ASSERT( checkCudaDevice ); + + hostData1[ 0 ] = 0; + hostData2[ 0 ] = 1; + Type sdot( 0.0 ); + for( int i = 1; i < size; i ++ ) + { + hostData1[ i ] = i; + hostData2[ i ] = -hostData2[ i - 1 ]; + sdot += hostData1[ i ] * hostData2[ i ]; + } + copyMemoryHostToCuda( deviceData1, hostData1, size ); + copyMemoryHostToCuda( deviceData2, hostData2, size ); + CPPUNIT_ASSERT( checkCudaDevice ); + + Type result( 0.0 ); + tnlParallelReductionSdot< Type, int > sdotOperation; + + CPPUNIT_ASSERT( + ( reductionOnCudaDevice( sdotOperation, size, deviceData1, deviceData2, result ) ) ); + CPPUNIT_ASSERT( result == sdot ); + }; + + + template< typename Type > + void longSdotTest() + { + const int size( 125789 ); + Type *hostData1, *hostData2, + *deviceData1, *deviceData2; + allocateMemoryHost( hostData1, size ); + allocateMemoryHost( hostData2, size ); + allocateMemoryCuda( deviceData1, size ); + allocateMemoryCuda( deviceData2, size ); + CPPUNIT_ASSERT( checkCudaDevice ); + + hostData1[ 0 ] = 0; + hostData2[ 0 ] = 1; + Type sdot( 0.0 ); + for( int i = 1; i < size; i ++ ) + { + hostData1[ i ] = i; + hostData2[ i ] = -hostData2[ i - 1 ]; + sdot += hostData1[ i ] * hostData2[ i ]; + } + copyMemoryHostToCuda( deviceData1, hostData1, size ); + copyMemoryHostToCuda( deviceData2, hostData2, size ); + CPPUNIT_ASSERT( checkCudaDevice ); + + Type result( 0.0 ); + tnlParallelReductionSdot< Type, int > sdotOperation; + + CPPUNIT_ASSERT( + ( reductionOnCudaDevice( sdotOperation, size, deviceData1, deviceData2, result ) ) ); + CPPUNIT_ASSERT( result == sdot ); + }; };