Refactoring the reduction on the CUDA device. (8b4397ce) · Commits · TNL / tnl-dev

buildAll

+2 −2

Original line number	Diff line number	Diff line
		@@ -25,10 +25,10 @@ cd Debug
		${CMAKE} .. -DCMAKE_BUILD_TYPE=Debug -DCMAKE_INSTALL_PREFIX=${HOME}/local -DCUDA_ARCHITECTURE=${CUDA_ARCHITECTURE} -DWITH_CUDA=${WITH_CUDA} -DWITH_CUSPARSE=${WITH_CUSPARSE} -DPETSC_DIR=${PETSC_DIR}
		make -j${CPUS} #VERBOSE=1
		make -j${CPUS} test
		#make -j${CPUS} install
		make -j${CPUS} install

		cd ../Release
		${CMAKE} .. -DCMAKE_INSTALL_PREFIX=${HOME}/local -DCUDA_ARCHITECTURE=${CUDA_ARCHITECTURE} -DWITH_CUDA=${WITH_CUDA} -DWITH_CUSPARSE=${WITH_CUSPARSE} -DPETSC_DIR=${PETSC_DIR}
		make -j${CPUS} #VERBOSE=1
		make -j${CPUS} test
		#make -j${CPUS} install
		make -j${CPUS} install

src/core/CMakeLists.txt

+1 −2

Original line number	Diff line number	Diff line
		@@ -24,7 +24,6 @@ set (headers tnlArray.h
		tnlTimerCPU.h
		tnlTimerRT.h
		tnlTuple.h
		tnlCudaSupport.h
		mfilename.h
		mfuncs.h
		mpi-supp.h

src/core/cuda/CMakeLists.txt

+4 −1

Original line number	Diff line number	Diff line
		set( headers device-check.h
		)
		reduction.h
		reduction-operations.h )

		INSTALL( FILES ${headers} DESTINATION include/tnl-${tnlVersion}/core/cuda )
		No newline at end of file

src/core/cuda/reduction-operations.h

+292 −21

Original line number	Diff line number	Diff line
		@@ -22,11 +22,7 @@
		#include <cuda.h>
		#include <core/mfuncs.h>

		enum tnlTupleOperation { tnlParallelReductionMax,
		tnlParallelReductionAbsMin,
		tnlParallelReductionAbsMax,
		tnlParallelReductionAbsSum,
		tnlParallelReductionLpNorm,
		enum tnlTupleOperation { tnlParallelReductionLpNorm,
		tnlParallelReductionSdot };


		@@ -109,30 +105,39 @@ class tnlParallelReductionSum

		typedef Real RealType;
		typedef Index IndexType;
		typedef Real ResultType;

		ResultType initialValueOnHost( const IndexType idx,
		const RealType* data1,
		const RealType* data2 ) const
		{
		return data1[ idx ];
		};

		RealType reduceOnHost( const RealType& data1,
		const RealType& data2 ) const
		ResultType reduceOnHost( const IndexType idx,
		const ResultType& current,
		const RealType* data1,
		const RealType* data2 ) const
		{
		return data1 + data2;
		}
		return current + data1[ idx ];
		};

		__device__ RealType initialValueOnDevice( const IndexType idx1,
		__device__ ResultType initialValueOnDevice( const IndexType idx1,
		const IndexType idx2,
		const RealType* data1,
		const RealType* data2 ) const
		{
		return data1[ idx1 ] + data1[ idx2 ];
		}
		};

		__device__ RealType initialValueOnDevice( const IndexType idx1,
		__device__ ResultType initialValueOnDevice( const IndexType idx1,
		const RealType* data1,
		const RealType* data2 ) const
		{
		return data1[ idx1 ];
		};

		__device__ RealType firstReductionOnDevice( const IndexType idx1,
		__device__ ResultType firstReductionOnDevice( const IndexType idx1,
		const IndexType idx2,
		const IndexType idx3,
		const RealType* data1,
		@@ -142,7 +147,7 @@ class tnlParallelReductionSum
		return data1[ idx1 ] + data2[ idx2 ] + data2[ idx3 ];
		};

		__device__ RealType firstReductionOnDevice( const IndexType idx1,
		__device__ ResultType firstReductionOnDevice( const IndexType idx1,
		const IndexType idx2,
		const RealType* data1,
		const RealType* data2,
		@@ -151,9 +156,9 @@ class tnlParallelReductionSum
		return data1[ idx1 ] + data2[ idx2 ];
		};

		__device__ RealType commonReductionOnDevice( const IndexType idx1,
		__device__ ResultType commonReductionOnDevice( const IndexType idx1,
		const IndexType idx2,
		const RealType* data ) const
		const ResultType* data ) const
		{
		return data[ idx1 ] + data[ idx2 ];
		};
		@@ -167,12 +172,20 @@ class tnlParallelReductionMin
		typedef Real RealType;
		typedef Index IndexType;

		RealType initialValueOnHost( const IndexType idx,
		const RealType* data1,
		const RealType* data2 ) const
		{
		return data1[ idx ];
		};

		RealType reduceOnHost( const RealType& data1,
		const RealType& data2 ) const
		RealType reduceOnHost( const IndexType idx,
		const RealType& current,
		const RealType* data1,
		const RealType* data2 ) const
		{
		return Min( data1, data2 );
		}
		return Min( current, data1[ idx ] );
		};

		__device__ RealType initialValueOnDevice( const IndexType idx1,
		const IndexType idx2,
		@@ -216,7 +229,265 @@ class tnlParallelReductionMin
		};
		};

		template< typename Real, typename Index >
		class tnlParallelReductionMax
		{
		public:

		typedef Real RealType;
		typedef Index IndexType;

		RealType initialValueOnHost( const IndexType idx,
		const RealType* data1,
		const RealType* data2 ) const
		{
		return data1[ idx ];
		};

		RealType reduceOnHost( const IndexType idx,
		const RealType& current,
		const RealType* data1,
		const RealType* data2 ) const
		{
		return Max( current, data1[ idx ] );
		};

		__device__ RealType initialValueOnDevice( const IndexType idx1,
		const IndexType idx2,
		const RealType* data1,
		const RealType* data2 ) const
		{
		return tnlCudaMax( data1[ idx1 ], data1[ idx2 ] );
		}

		__device__ RealType initialValueOnDevice( const IndexType idx1,
		const RealType* data1,
		const RealType* data2 ) const
		{
		return data1[ idx1 ];
		};

		__device__ RealType firstReductionOnDevice( const IndexType idx1,
		const IndexType idx2,
		const IndexType idx3,
		const RealType* data1,
		const RealType* data2,
		const RealType* data3 ) const
		{
		return tnlCudaMax( data1[ idx1 ], tnlCudaMax( data2[ idx2 ], data2[ idx3 ] ) );
		};

		__device__ RealType firstReductionOnDevice( const IndexType idx1,
		const IndexType idx2,
		const RealType* data1,
		const RealType* data2,
		const RealType* data3 ) const
		{
		return tnlCudaMax( data1[ idx1 ], data2[ idx2 ] );
		};

		__device__ RealType commonReductionOnDevice( const IndexType idx1,
		const IndexType idx2,
		const RealType* data ) const
		{
		return tnlCudaMax( data[ idx1 ], data[ idx2 ] );
		};
		};

		template< typename Real, typename Index >
		class tnlParallelReductionAbsSum
		{
		public:

		typedef Real RealType;
		typedef Index IndexType;

		RealType initialValueOnHost( const IndexType idx,
		const RealType* data1,
		const RealType* data2 ) const
		{
		return tnlAbs( data1[ idx ] );
		};

		RealType reduceOnHost( const IndexType idx,
		const RealType& current,
		const RealType* data1,
		const RealType* data2 ) const
		{
		return current + tnlAbs( data1[ idx ] );
		};

		__device__ RealType initialValueOnDevice( const IndexType idx1,
		const IndexType idx2,
		const RealType* data1,
		const RealType* data2 ) const
		{
		return tnlCudaAbs( data1[ idx1 ] ) + tnlCudaAbs( data1[ idx2 ] );
		};

		__device__ RealType initialValueOnDevice( const IndexType idx1,
		const RealType* data1,
		const RealType* data2 ) const
		{
		return tnlCudaAbs( data1[ idx1 ] );
		};

		__device__ RealType firstReductionOnDevice( const IndexType idx1,
		const IndexType idx2,
		const IndexType idx3,
		const RealType* data1,
		const RealType* data2,
		const RealType* data3 ) const
		{
		return data1[ idx1 ] + tnlCudaAbs( data2[ idx2 ] ) + tnlCudaAbs( data2[ idx3 ] );
		};

		__device__ RealType firstReductionOnDevice( const IndexType idx1,
		const IndexType idx2,
		const RealType* data1,
		const RealType* data2,
		const RealType* data3 ) const
		{
		return data1[ idx1 ] + tnlCudaAbs( data2[ idx2 ] );
		};

		__device__ RealType commonReductionOnDevice( const IndexType idx1,
		const IndexType idx2,
		const RealType* data ) const
		{
		return data[ idx1 ] + data[ idx2 ];
		};
		};

		template< typename Real, typename Index >
		class tnlParallelReductionAbsMin
		{
		public:

		typedef Real RealType;
		typedef Index IndexType;

		RealType initialValueOnHost( const IndexType idx,
		const RealType* data1,
		const RealType* data2 ) const
		{
		return tnlAbs( data1[ idx ] );
		};

		RealType reduceOnHost( const IndexType idx,
		const RealType& current,
		const RealType* data1,
		const RealType* data2 ) const
		{
		return Min( current, tnlAbs( data1[ idx ] ) );
		};

		__device__ RealType initialValueOnDevice( const IndexType idx1,
		const IndexType idx2,
		const RealType* data1,
		const RealType* data2 ) const
		{
		return tnlCudaMin( tnlCudaAbs( data1[ idx1 ] ), tnlCudaAbs( data1[ idx2 ] ) );
		}

		__device__ RealType initialValueOnDevice( const IndexType idx1,
		const RealType* data1,
		const RealType* data2 ) const
		{
		return tnlCudaAbs( data1[ idx1 ] );
		};

		__device__ RealType firstReductionOnDevice( const IndexType idx1,
		const IndexType idx2,
		const IndexType idx3,
		const RealType* data1,
		const RealType* data2,
		const RealType* data3 ) const
		{
		return tnlCudaMin( data1[ idx1 ], tnlCudaMin( tnlCudaAbs( data2[ idx2 ] ), tnlCudaAbs( data2[ idx3 ] ) ) );
		};

		__device__ RealType firstReductionOnDevice( const IndexType idx1,
		const IndexType idx2,
		const RealType* data1,
		const RealType* data2,
		const RealType* data3 ) const
		{
		return tnlCudaMin( data1[ idx1 ], tnlCudaAbs( data2[ idx2 ] ) );
		};

		__device__ RealType commonReductionOnDevice( const IndexType idx1,
		const IndexType idx2,
		const RealType* data ) const
		{
		return tnlCudaMin( data[ idx1 ], tnlCudaAbs( data[ idx2 ] ) );
		};
		};

		template< typename Real, typename Index >
		class tnlParallelReductionAbsMax
		{
		public:

		typedef Real RealType;
		typedef Index IndexType;

		RealType initialValueOnHost( const IndexType idx,
		const RealType* data1,
		const RealType* data2 ) const
		{
		return tnlAbs( data1[ idx ] );
		};

		RealType reduceOnHost( const IndexType idx,
		const RealType& current,
		const RealType* data1,
		const RealType* data2 ) const
		{
		return Max( current, tnlAbs( data1[ idx ] ) );
		};

		__device__ RealType initialValueOnDevice( const IndexType idx1,
		const IndexType idx2,
		const RealType* data1,
		const RealType* data2 ) const
		{
		return tnlCudaMax( tnlCudaAbs( data1[ idx1 ] ), tnlCudaAbs( data1[ idx2 ] ) );
		}

		__device__ RealType initialValueOnDevice( const IndexType idx1,
		const RealType* data1,
		const RealType* data2 ) const
		{
		return tnlCudaAbs( data1[ idx1 ] );
		};

		__device__ RealType firstReductionOnDevice( const IndexType idx1,
		const IndexType idx2,
		const IndexType idx3,
		const RealType* data1,
		const RealType* data2,
		const RealType* data3 ) const
		{
		return tnlCudaMax( data1[ idx1 ], tnlCudaMax( tnlCudaAbs( data2[ idx2 ] ), tnlCudaAbs( data2[ idx3 ] ) ) );
		};

		__device__ RealType firstReductionOnDevice( const IndexType idx1,
		const IndexType idx2,
		const RealType* data1,
		const RealType* data2,
		const RealType* data3 ) const
		{
		return tnlCudaMax( data1[ idx1 ], tnlCudaAbs( data2[ idx2 ] ) );
		};

		__device__ RealType commonReductionOnDevice( const IndexType idx1,
		const IndexType idx2,
		const RealType* data ) const
		{
		return tnlCudaMax( data[ idx1 ], tnlCudaAbs( data[ idx2 ] ) );
		};
		};


		#include <implementation/core/cuda/reduction-operations_impl.h>

src/implementation/core/CMakeLists.txt

+1 −1

Original line number	Diff line number	Diff line
		@@ -2,7 +2,7 @@ ADD_SUBDIRECTORY( cuda )

		SET( headers cuda-long-vector-kernels.h
		vector-operations.h
		memory-functions.h
		memory-operations.h
		tnlArray_impl.h
		tnlHost_impl.h
		tnlLogger_impl.h