Loading buildAll +2 −2 Original line number Diff line number Diff line Loading @@ -25,10 +25,10 @@ cd Debug ${CMAKE} .. -DCMAKE_BUILD_TYPE=Debug -DCMAKE_INSTALL_PREFIX=${HOME}/local -DCUDA_ARCHITECTURE=${CUDA_ARCHITECTURE} -DWITH_CUDA=${WITH_CUDA} -DWITH_CUSPARSE=${WITH_CUSPARSE} -DPETSC_DIR=${PETSC_DIR} make -j${CPUS} #VERBOSE=1 make -j${CPUS} test #make -j${CPUS} install make -j${CPUS} install cd ../Release ${CMAKE} .. -DCMAKE_INSTALL_PREFIX=${HOME}/local -DCUDA_ARCHITECTURE=${CUDA_ARCHITECTURE} -DWITH_CUDA=${WITH_CUDA} -DWITH_CUSPARSE=${WITH_CUSPARSE} -DPETSC_DIR=${PETSC_DIR} make -j${CPUS} #VERBOSE=1 make -j${CPUS} test #make -j${CPUS} install make -j${CPUS} install src/core/CMakeLists.txt +1 −2 Original line number Diff line number Diff line Loading @@ -24,7 +24,6 @@ set (headers tnlArray.h tnlTimerCPU.h tnlTimerRT.h tnlTuple.h tnlCudaSupport.h mfilename.h mfuncs.h mpi-supp.h Loading src/core/cuda/CMakeLists.txt +4 −1 Original line number Diff line number Diff line set( headers device-check.h ) reduction.h reduction-operations.h ) INSTALL( FILES ${headers} DESTINATION include/tnl-${tnlVersion}/core/cuda ) No newline at end of file src/core/cuda/reduction-operations.h +292 −21 Original line number Diff line number Diff line Loading @@ -22,11 +22,7 @@ #include <cuda.h> #include <core/mfuncs.h> enum tnlTupleOperation { tnlParallelReductionMax, tnlParallelReductionAbsMin, tnlParallelReductionAbsMax, tnlParallelReductionAbsSum, tnlParallelReductionLpNorm, enum tnlTupleOperation { tnlParallelReductionLpNorm, tnlParallelReductionSdot }; Loading Loading @@ -109,30 +105,39 @@ class tnlParallelReductionSum typedef Real RealType; typedef Index IndexType; typedef Real ResultType; ResultType initialValueOnHost( const IndexType idx, const RealType* data1, const RealType* data2 ) const { return data1[ idx ]; }; RealType reduceOnHost( const RealType& data1, const RealType& data2 ) const ResultType reduceOnHost( const IndexType idx, const ResultType& current, const RealType* data1, const RealType* data2 ) const { return data1 + data2; } return current + data1[ idx ]; }; __device__ RealType initialValueOnDevice( const IndexType idx1, __device__ ResultType initialValueOnDevice( const IndexType idx1, const IndexType idx2, const RealType* data1, const RealType* data2 ) const { return data1[ idx1 ] + data1[ idx2 ]; } }; __device__ RealType initialValueOnDevice( const IndexType idx1, __device__ ResultType initialValueOnDevice( const IndexType idx1, const RealType* data1, const RealType* data2 ) const { return data1[ idx1 ]; }; __device__ RealType firstReductionOnDevice( const IndexType idx1, __device__ ResultType firstReductionOnDevice( const IndexType idx1, const IndexType idx2, const IndexType idx3, const RealType* data1, Loading @@ -142,7 +147,7 @@ class tnlParallelReductionSum return data1[ idx1 ] + data2[ idx2 ] + data2[ idx3 ]; }; __device__ RealType firstReductionOnDevice( const IndexType idx1, __device__ ResultType firstReductionOnDevice( const IndexType idx1, const IndexType idx2, const RealType* data1, const RealType* data2, Loading @@ -151,9 +156,9 @@ class tnlParallelReductionSum return data1[ idx1 ] + data2[ idx2 ]; }; __device__ RealType commonReductionOnDevice( const IndexType idx1, __device__ ResultType commonReductionOnDevice( const IndexType idx1, const IndexType idx2, const RealType* data ) const const ResultType* data ) const { return data[ idx1 ] + data[ idx2 ]; }; Loading @@ -167,12 +172,20 @@ class tnlParallelReductionMin typedef Real RealType; typedef Index IndexType; RealType initialValueOnHost( const IndexType idx, const RealType* data1, const RealType* data2 ) const { return data1[ idx ]; }; RealType reduceOnHost( const RealType& data1, const RealType& data2 ) const RealType reduceOnHost( const IndexType idx, const RealType& current, const RealType* data1, const RealType* data2 ) const { return Min( data1, data2 ); } return Min( current, data1[ idx ] ); }; __device__ RealType initialValueOnDevice( const IndexType idx1, const IndexType idx2, Loading Loading @@ -216,7 +229,265 @@ class tnlParallelReductionMin }; }; template< typename Real, typename Index > class tnlParallelReductionMax { public: typedef Real RealType; typedef Index IndexType; RealType initialValueOnHost( const IndexType idx, const RealType* data1, const RealType* data2 ) const { return data1[ idx ]; }; RealType reduceOnHost( const IndexType idx, const RealType& current, const RealType* data1, const RealType* data2 ) const { return Max( current, data1[ idx ] ); }; __device__ RealType initialValueOnDevice( const IndexType idx1, const IndexType idx2, const RealType* data1, const RealType* data2 ) const { return tnlCudaMax( data1[ idx1 ], data1[ idx2 ] ); } __device__ RealType initialValueOnDevice( const IndexType idx1, const RealType* data1, const RealType* data2 ) const { return data1[ idx1 ]; }; __device__ RealType firstReductionOnDevice( const IndexType idx1, const IndexType idx2, const IndexType idx3, const RealType* data1, const RealType* data2, const RealType* data3 ) const { return tnlCudaMax( data1[ idx1 ], tnlCudaMax( data2[ idx2 ], data2[ idx3 ] ) ); }; __device__ RealType firstReductionOnDevice( const IndexType idx1, const IndexType idx2, const RealType* data1, const RealType* data2, const RealType* data3 ) const { return tnlCudaMax( data1[ idx1 ], data2[ idx2 ] ); }; __device__ RealType commonReductionOnDevice( const IndexType idx1, const IndexType idx2, const RealType* data ) const { return tnlCudaMax( data[ idx1 ], data[ idx2 ] ); }; }; template< typename Real, typename Index > class tnlParallelReductionAbsSum { public: typedef Real RealType; typedef Index IndexType; RealType initialValueOnHost( const IndexType idx, const RealType* data1, const RealType* data2 ) const { return tnlAbs( data1[ idx ] ); }; RealType reduceOnHost( const IndexType idx, const RealType& current, const RealType* data1, const RealType* data2 ) const { return current + tnlAbs( data1[ idx ] ); }; __device__ RealType initialValueOnDevice( const IndexType idx1, const IndexType idx2, const RealType* data1, const RealType* data2 ) const { return tnlCudaAbs( data1[ idx1 ] ) + tnlCudaAbs( data1[ idx2 ] ); }; __device__ RealType initialValueOnDevice( const IndexType idx1, const RealType* data1, const RealType* data2 ) const { return tnlCudaAbs( data1[ idx1 ] ); }; __device__ RealType firstReductionOnDevice( const IndexType idx1, const IndexType idx2, const IndexType idx3, const RealType* data1, const RealType* data2, const RealType* data3 ) const { return data1[ idx1 ] + tnlCudaAbs( data2[ idx2 ] ) + tnlCudaAbs( data2[ idx3 ] ); }; __device__ RealType firstReductionOnDevice( const IndexType idx1, const IndexType idx2, const RealType* data1, const RealType* data2, const RealType* data3 ) const { return data1[ idx1 ] + tnlCudaAbs( data2[ idx2 ] ); }; __device__ RealType commonReductionOnDevice( const IndexType idx1, const IndexType idx2, const RealType* data ) const { return data[ idx1 ] + data[ idx2 ]; }; }; template< typename Real, typename Index > class tnlParallelReductionAbsMin { public: typedef Real RealType; typedef Index IndexType; RealType initialValueOnHost( const IndexType idx, const RealType* data1, const RealType* data2 ) const { return tnlAbs( data1[ idx ] ); }; RealType reduceOnHost( const IndexType idx, const RealType& current, const RealType* data1, const RealType* data2 ) const { return Min( current, tnlAbs( data1[ idx ] ) ); }; __device__ RealType initialValueOnDevice( const IndexType idx1, const IndexType idx2, const RealType* data1, const RealType* data2 ) const { return tnlCudaMin( tnlCudaAbs( data1[ idx1 ] ), tnlCudaAbs( data1[ idx2 ] ) ); } __device__ RealType initialValueOnDevice( const IndexType idx1, const RealType* data1, const RealType* data2 ) const { return tnlCudaAbs( data1[ idx1 ] ); }; __device__ RealType firstReductionOnDevice( const IndexType idx1, const IndexType idx2, const IndexType idx3, const RealType* data1, const RealType* data2, const RealType* data3 ) const { return tnlCudaMin( data1[ idx1 ], tnlCudaMin( tnlCudaAbs( data2[ idx2 ] ), tnlCudaAbs( data2[ idx3 ] ) ) ); }; __device__ RealType firstReductionOnDevice( const IndexType idx1, const IndexType idx2, const RealType* data1, const RealType* data2, const RealType* data3 ) const { return tnlCudaMin( data1[ idx1 ], tnlCudaAbs( data2[ idx2 ] ) ); }; __device__ RealType commonReductionOnDevice( const IndexType idx1, const IndexType idx2, const RealType* data ) const { return tnlCudaMin( data[ idx1 ], tnlCudaAbs( data[ idx2 ] ) ); }; }; template< typename Real, typename Index > class tnlParallelReductionAbsMax { public: typedef Real RealType; typedef Index IndexType; RealType initialValueOnHost( const IndexType idx, const RealType* data1, const RealType* data2 ) const { return tnlAbs( data1[ idx ] ); }; RealType reduceOnHost( const IndexType idx, const RealType& current, const RealType* data1, const RealType* data2 ) const { return Max( current, tnlAbs( data1[ idx ] ) ); }; __device__ RealType initialValueOnDevice( const IndexType idx1, const IndexType idx2, const RealType* data1, const RealType* data2 ) const { return tnlCudaMax( tnlCudaAbs( data1[ idx1 ] ), tnlCudaAbs( data1[ idx2 ] ) ); } __device__ RealType initialValueOnDevice( const IndexType idx1, const RealType* data1, const RealType* data2 ) const { return tnlCudaAbs( data1[ idx1 ] ); }; __device__ RealType firstReductionOnDevice( const IndexType idx1, const IndexType idx2, const IndexType idx3, const RealType* data1, const RealType* data2, const RealType* data3 ) const { return tnlCudaMax( data1[ idx1 ], tnlCudaMax( tnlCudaAbs( data2[ idx2 ] ), tnlCudaAbs( data2[ idx3 ] ) ) ); }; __device__ RealType firstReductionOnDevice( const IndexType idx1, const IndexType idx2, const RealType* data1, const RealType* data2, const RealType* data3 ) const { return tnlCudaMax( data1[ idx1 ], tnlCudaAbs( data2[ idx2 ] ) ); }; __device__ RealType commonReductionOnDevice( const IndexType idx1, const IndexType idx2, const RealType* data ) const { return tnlCudaMax( data[ idx1 ], tnlCudaAbs( data[ idx2 ] ) ); }; }; #include <implementation/core/cuda/reduction-operations_impl.h> Loading src/implementation/core/CMakeLists.txt +1 −1 Original line number Diff line number Diff line Loading @@ -2,7 +2,7 @@ ADD_SUBDIRECTORY( cuda ) SET( headers cuda-long-vector-kernels.h vector-operations.h memory-functions.h memory-operations.h tnlArray_impl.h tnlHost_impl.h tnlLogger_impl.h Loading Loading
buildAll +2 −2 Original line number Diff line number Diff line Loading @@ -25,10 +25,10 @@ cd Debug ${CMAKE} .. -DCMAKE_BUILD_TYPE=Debug -DCMAKE_INSTALL_PREFIX=${HOME}/local -DCUDA_ARCHITECTURE=${CUDA_ARCHITECTURE} -DWITH_CUDA=${WITH_CUDA} -DWITH_CUSPARSE=${WITH_CUSPARSE} -DPETSC_DIR=${PETSC_DIR} make -j${CPUS} #VERBOSE=1 make -j${CPUS} test #make -j${CPUS} install make -j${CPUS} install cd ../Release ${CMAKE} .. -DCMAKE_INSTALL_PREFIX=${HOME}/local -DCUDA_ARCHITECTURE=${CUDA_ARCHITECTURE} -DWITH_CUDA=${WITH_CUDA} -DWITH_CUSPARSE=${WITH_CUSPARSE} -DPETSC_DIR=${PETSC_DIR} make -j${CPUS} #VERBOSE=1 make -j${CPUS} test #make -j${CPUS} install make -j${CPUS} install
src/core/CMakeLists.txt +1 −2 Original line number Diff line number Diff line Loading @@ -24,7 +24,6 @@ set (headers tnlArray.h tnlTimerCPU.h tnlTimerRT.h tnlTuple.h tnlCudaSupport.h mfilename.h mfuncs.h mpi-supp.h Loading
src/core/cuda/CMakeLists.txt +4 −1 Original line number Diff line number Diff line set( headers device-check.h ) reduction.h reduction-operations.h ) INSTALL( FILES ${headers} DESTINATION include/tnl-${tnlVersion}/core/cuda ) No newline at end of file
src/core/cuda/reduction-operations.h +292 −21 Original line number Diff line number Diff line Loading @@ -22,11 +22,7 @@ #include <cuda.h> #include <core/mfuncs.h> enum tnlTupleOperation { tnlParallelReductionMax, tnlParallelReductionAbsMin, tnlParallelReductionAbsMax, tnlParallelReductionAbsSum, tnlParallelReductionLpNorm, enum tnlTupleOperation { tnlParallelReductionLpNorm, tnlParallelReductionSdot }; Loading Loading @@ -109,30 +105,39 @@ class tnlParallelReductionSum typedef Real RealType; typedef Index IndexType; typedef Real ResultType; ResultType initialValueOnHost( const IndexType idx, const RealType* data1, const RealType* data2 ) const { return data1[ idx ]; }; RealType reduceOnHost( const RealType& data1, const RealType& data2 ) const ResultType reduceOnHost( const IndexType idx, const ResultType& current, const RealType* data1, const RealType* data2 ) const { return data1 + data2; } return current + data1[ idx ]; }; __device__ RealType initialValueOnDevice( const IndexType idx1, __device__ ResultType initialValueOnDevice( const IndexType idx1, const IndexType idx2, const RealType* data1, const RealType* data2 ) const { return data1[ idx1 ] + data1[ idx2 ]; } }; __device__ RealType initialValueOnDevice( const IndexType idx1, __device__ ResultType initialValueOnDevice( const IndexType idx1, const RealType* data1, const RealType* data2 ) const { return data1[ idx1 ]; }; __device__ RealType firstReductionOnDevice( const IndexType idx1, __device__ ResultType firstReductionOnDevice( const IndexType idx1, const IndexType idx2, const IndexType idx3, const RealType* data1, Loading @@ -142,7 +147,7 @@ class tnlParallelReductionSum return data1[ idx1 ] + data2[ idx2 ] + data2[ idx3 ]; }; __device__ RealType firstReductionOnDevice( const IndexType idx1, __device__ ResultType firstReductionOnDevice( const IndexType idx1, const IndexType idx2, const RealType* data1, const RealType* data2, Loading @@ -151,9 +156,9 @@ class tnlParallelReductionSum return data1[ idx1 ] + data2[ idx2 ]; }; __device__ RealType commonReductionOnDevice( const IndexType idx1, __device__ ResultType commonReductionOnDevice( const IndexType idx1, const IndexType idx2, const RealType* data ) const const ResultType* data ) const { return data[ idx1 ] + data[ idx2 ]; }; Loading @@ -167,12 +172,20 @@ class tnlParallelReductionMin typedef Real RealType; typedef Index IndexType; RealType initialValueOnHost( const IndexType idx, const RealType* data1, const RealType* data2 ) const { return data1[ idx ]; }; RealType reduceOnHost( const RealType& data1, const RealType& data2 ) const RealType reduceOnHost( const IndexType idx, const RealType& current, const RealType* data1, const RealType* data2 ) const { return Min( data1, data2 ); } return Min( current, data1[ idx ] ); }; __device__ RealType initialValueOnDevice( const IndexType idx1, const IndexType idx2, Loading Loading @@ -216,7 +229,265 @@ class tnlParallelReductionMin }; }; template< typename Real, typename Index > class tnlParallelReductionMax { public: typedef Real RealType; typedef Index IndexType; RealType initialValueOnHost( const IndexType idx, const RealType* data1, const RealType* data2 ) const { return data1[ idx ]; }; RealType reduceOnHost( const IndexType idx, const RealType& current, const RealType* data1, const RealType* data2 ) const { return Max( current, data1[ idx ] ); }; __device__ RealType initialValueOnDevice( const IndexType idx1, const IndexType idx2, const RealType* data1, const RealType* data2 ) const { return tnlCudaMax( data1[ idx1 ], data1[ idx2 ] ); } __device__ RealType initialValueOnDevice( const IndexType idx1, const RealType* data1, const RealType* data2 ) const { return data1[ idx1 ]; }; __device__ RealType firstReductionOnDevice( const IndexType idx1, const IndexType idx2, const IndexType idx3, const RealType* data1, const RealType* data2, const RealType* data3 ) const { return tnlCudaMax( data1[ idx1 ], tnlCudaMax( data2[ idx2 ], data2[ idx3 ] ) ); }; __device__ RealType firstReductionOnDevice( const IndexType idx1, const IndexType idx2, const RealType* data1, const RealType* data2, const RealType* data3 ) const { return tnlCudaMax( data1[ idx1 ], data2[ idx2 ] ); }; __device__ RealType commonReductionOnDevice( const IndexType idx1, const IndexType idx2, const RealType* data ) const { return tnlCudaMax( data[ idx1 ], data[ idx2 ] ); }; }; template< typename Real, typename Index > class tnlParallelReductionAbsSum { public: typedef Real RealType; typedef Index IndexType; RealType initialValueOnHost( const IndexType idx, const RealType* data1, const RealType* data2 ) const { return tnlAbs( data1[ idx ] ); }; RealType reduceOnHost( const IndexType idx, const RealType& current, const RealType* data1, const RealType* data2 ) const { return current + tnlAbs( data1[ idx ] ); }; __device__ RealType initialValueOnDevice( const IndexType idx1, const IndexType idx2, const RealType* data1, const RealType* data2 ) const { return tnlCudaAbs( data1[ idx1 ] ) + tnlCudaAbs( data1[ idx2 ] ); }; __device__ RealType initialValueOnDevice( const IndexType idx1, const RealType* data1, const RealType* data2 ) const { return tnlCudaAbs( data1[ idx1 ] ); }; __device__ RealType firstReductionOnDevice( const IndexType idx1, const IndexType idx2, const IndexType idx3, const RealType* data1, const RealType* data2, const RealType* data3 ) const { return data1[ idx1 ] + tnlCudaAbs( data2[ idx2 ] ) + tnlCudaAbs( data2[ idx3 ] ); }; __device__ RealType firstReductionOnDevice( const IndexType idx1, const IndexType idx2, const RealType* data1, const RealType* data2, const RealType* data3 ) const { return data1[ idx1 ] + tnlCudaAbs( data2[ idx2 ] ); }; __device__ RealType commonReductionOnDevice( const IndexType idx1, const IndexType idx2, const RealType* data ) const { return data[ idx1 ] + data[ idx2 ]; }; }; template< typename Real, typename Index > class tnlParallelReductionAbsMin { public: typedef Real RealType; typedef Index IndexType; RealType initialValueOnHost( const IndexType idx, const RealType* data1, const RealType* data2 ) const { return tnlAbs( data1[ idx ] ); }; RealType reduceOnHost( const IndexType idx, const RealType& current, const RealType* data1, const RealType* data2 ) const { return Min( current, tnlAbs( data1[ idx ] ) ); }; __device__ RealType initialValueOnDevice( const IndexType idx1, const IndexType idx2, const RealType* data1, const RealType* data2 ) const { return tnlCudaMin( tnlCudaAbs( data1[ idx1 ] ), tnlCudaAbs( data1[ idx2 ] ) ); } __device__ RealType initialValueOnDevice( const IndexType idx1, const RealType* data1, const RealType* data2 ) const { return tnlCudaAbs( data1[ idx1 ] ); }; __device__ RealType firstReductionOnDevice( const IndexType idx1, const IndexType idx2, const IndexType idx3, const RealType* data1, const RealType* data2, const RealType* data3 ) const { return tnlCudaMin( data1[ idx1 ], tnlCudaMin( tnlCudaAbs( data2[ idx2 ] ), tnlCudaAbs( data2[ idx3 ] ) ) ); }; __device__ RealType firstReductionOnDevice( const IndexType idx1, const IndexType idx2, const RealType* data1, const RealType* data2, const RealType* data3 ) const { return tnlCudaMin( data1[ idx1 ], tnlCudaAbs( data2[ idx2 ] ) ); }; __device__ RealType commonReductionOnDevice( const IndexType idx1, const IndexType idx2, const RealType* data ) const { return tnlCudaMin( data[ idx1 ], tnlCudaAbs( data[ idx2 ] ) ); }; }; template< typename Real, typename Index > class tnlParallelReductionAbsMax { public: typedef Real RealType; typedef Index IndexType; RealType initialValueOnHost( const IndexType idx, const RealType* data1, const RealType* data2 ) const { return tnlAbs( data1[ idx ] ); }; RealType reduceOnHost( const IndexType idx, const RealType& current, const RealType* data1, const RealType* data2 ) const { return Max( current, tnlAbs( data1[ idx ] ) ); }; __device__ RealType initialValueOnDevice( const IndexType idx1, const IndexType idx2, const RealType* data1, const RealType* data2 ) const { return tnlCudaMax( tnlCudaAbs( data1[ idx1 ] ), tnlCudaAbs( data1[ idx2 ] ) ); } __device__ RealType initialValueOnDevice( const IndexType idx1, const RealType* data1, const RealType* data2 ) const { return tnlCudaAbs( data1[ idx1 ] ); }; __device__ RealType firstReductionOnDevice( const IndexType idx1, const IndexType idx2, const IndexType idx3, const RealType* data1, const RealType* data2, const RealType* data3 ) const { return tnlCudaMax( data1[ idx1 ], tnlCudaMax( tnlCudaAbs( data2[ idx2 ] ), tnlCudaAbs( data2[ idx3 ] ) ) ); }; __device__ RealType firstReductionOnDevice( const IndexType idx1, const IndexType idx2, const RealType* data1, const RealType* data2, const RealType* data3 ) const { return tnlCudaMax( data1[ idx1 ], tnlCudaAbs( data2[ idx2 ] ) ); }; __device__ RealType commonReductionOnDevice( const IndexType idx1, const IndexType idx2, const RealType* data ) const { return tnlCudaMax( data[ idx1 ], tnlCudaAbs( data[ idx2 ] ) ); }; }; #include <implementation/core/cuda/reduction-operations_impl.h> Loading
src/implementation/core/CMakeLists.txt +1 −1 Original line number Diff line number Diff line Loading @@ -2,7 +2,7 @@ ADD_SUBDIRECTORY( cuda ) SET( headers cuda-long-vector-kernels.h vector-operations.h memory-functions.h memory-operations.h tnlArray_impl.h tnlHost_impl.h tnlLogger_impl.h Loading