Reimplementing CUDA parallel reduction with template operations. (13573312) · Commits · TNL / tnl-dev

src/core/cuda/reduction-operations.h

0 → 100644

+226 −0

Original line number	Diff line number	Diff line
		/***************************************************************************
		reduction-operations.h - description
		-------------------
		begin : Mar 22, 2013
		copyright : (C) 2013 by Tomas Oberhuber
		email : tomas.oberhuber@fjfi.cvut.cz
		***************************************************************************/

		/***************************************************************************
		* *
		* This program is free software; you can redistribute it and/or modify *
		* it under the terms of the GNU General Public License as published by *
		* the Free Software Foundation; either version 2 of the License, or *
		* (at your option) any later version. *
		* *
		***************************************************************************/

		#ifndef REDUCTION_OPERATIONS_H_
		#define REDUCTION_OPERATIONS_H_

		#ifdef HAVE_CUDA
		#include <cuda.h>
		#include <core/mfuncs.h>

		enum tnlTupleOperation { tnlParallelReductionMax,
		tnlParallelReductionAbsMin,
		tnlParallelReductionAbsMax,
		tnlParallelReductionAbsSum,
		tnlParallelReductionLpNorm,
		tnlParallelReductionSdot };


		/***
		* This function returns minimum of two numbers stored on the device.
		*/
		template< class T > __device__ T tnlCudaMin( const T& a,
		const T& b )
		{
		return a < b ? a : b;
		}

		__device__ int tnlCudaMin( const int& a,
		const int& b )
		{
		return min( a, b );
		}

		__device__ float tnlCudaMin( const float& a,
		const float& b )
		{
		return fminf( a, b );
		}

		__device__ double tnlCudaMin( const double& a,
		const double& b )
		{
		return fmin( a, b );
		}

		/***
		* This function returns maximum of two numbers stored on the device.
		*/
		template< class T > __device__ T tnlCudaMax( const T& a,
		const T& b )
		{
		return a > b ? a : b;
		}

		__device__ int tnlCudaMax( const int& a,
		const int& b )
		{
		return max( a, b );
		}

		__device__ float tnlCudaMax( const float& a,
		const float& b )
		{
		return fmaxf( a, b );
		}

		__device__ double tnlCudaMax( const double& a,
		const double& b )
		{
		return fmax( a, b );
		}

		/***
		* This function returns absolute value of given number on the device.
		*/
		__device__ int tnlCudaAbs( const int& a )
		{
		return abs( a );
		}

		__device__ float tnlCudaAbs( const float& a )
		{
		return fabs( a );
		}

		__device__ double tnlCudaAbs( const double& a )
		{
		return fabs( a );
		}

		template< typename Real, typename Index >
		class tnlParallelReductionSum
		{
		public:

		typedef Real RealType;
		typedef Index IndexType;


		RealType reduceOnHost( const RealType& data1,
		const RealType& data2 ) const
		{
		return data1 + data2;
		}

		__device__ RealType initialValueOnDevice( const IndexType idx1,
		const IndexType idx2,
		const RealType* data1,
		const RealType* data2 ) const
		{
		return data1[ idx1 ] + data1[ idx2 ];
		}

		__device__ RealType initialValueOnDevice( const IndexType idx1,
		const RealType* data1,
		const RealType* data2 ) const
		{
		return data1[ idx1 ];
		};

		__device__ RealType firstReductionOnDevice( const IndexType idx1,
		const IndexType idx2,
		const IndexType idx3,
		const RealType* data1,
		const RealType* data2,
		const RealType* data3 ) const
		{
		return data1[ idx1 ] + data2[ idx2 ] + data2[ idx3 ];
		};

		__device__ RealType firstReductionOnDevice( const IndexType idx1,
		const IndexType idx2,
		const RealType* data1,
		const RealType* data2,
		const RealType* data3 ) const
		{
		return data1[ idx1 ] + data2[ idx2 ];
		};

		__device__ RealType commonReductionOnDevice( const IndexType idx1,
		const IndexType idx2,
		const RealType* data ) const
		{
		return data[ idx1 ] + data[ idx2 ];
		};
		};

		template< typename Real, typename Index >
		class tnlParallelReductionMin
		{
		public:

		typedef Real RealType;
		typedef Index IndexType;


		RealType reduceOnHost( const RealType& data1,
		const RealType& data2 ) const
		{
		return Min( data1, data2 );
		}

		__device__ RealType initialValueOnDevice( const IndexType idx1,
		const IndexType idx2,
		const RealType* data1,
		const RealType* data2 ) const
		{
		return tnlCudaMin( data1[ idx1 ], data1[ idx2 ] );
		}

		__device__ RealType initialValueOnDevice( const IndexType idx1,
		const RealType* data1,
		const RealType* data2 ) const
		{
		return data1[ idx1 ];
		};

		__device__ RealType firstReductionOnDevice( const IndexType idx1,
		const IndexType idx2,
		const IndexType idx3,
		const RealType* data1,
		const RealType* data2,
		const RealType* data3 ) const
		{
		return tnlCudaMin( data1[ idx1 ], tnlCudaMin( data2[ idx2 ], data2[ idx3 ] ) );
		};

		__device__ RealType firstReductionOnDevice( const IndexType idx1,
		const IndexType idx2,
		const RealType* data1,
		const RealType* data2,
		const RealType* data3 ) const
		{
		return tnlCudaMin( data1[ idx1 ], data2[ idx2 ] );
		};

		__device__ RealType commonReductionOnDevice( const IndexType idx1,
		const IndexType idx2,
		const RealType* data ) const
		{
		return tnlCudaMin( data[ idx1 ], data[ idx2 ] );
		};
		};




		#include <implementation/core/cuda/reduction-operations_impl.h>

		#endif

		#endif /* REDUCTION_OPERATIONS_H_ */

src/implementation/core/CMakeLists.txt

+3 −1

Original line number	Diff line number	Diff line
		@@ -21,8 +21,10 @@ SET( headers cuda-long-vector-kernels.h
		SET( CURRENT_DIR ${CMAKE_SOURCE_DIR}/src/implementation/core )
		IF( BUILD_CUDA )
		set( tnl_implementation_core_CUDA__SOURCES
		${tnl_implementation_core_cuda_CUDA__SOURCES}
		${CURRENT_DIR}/tnlArray_impl.cu
		${CURRENT_DIR}/tnlVector_impl.cu )
		${CURRENT_DIR}/tnlVector_impl.cu
		PARENT_SCOPE )
		ENDIF()
		set( tnl_implementation_core_SOURCES
		${tnl_implementation_core_CUDA__SOURCES}

src/implementation/core/cuda-long-vector-kernels.h

+137 −203

File changed.

Preview size limit exceeded, changes collapsed.

src/implementation/core/cuda/CMakeLists.txt

+8 −1

Original line number	Diff line number	Diff line
		SET( headers reduction_impl.h )
		SET( headers reduction_impl.h
		reduction-operations_impl.h )

		SET( CURRENT_DIR ${CMAKE_SOURCE_DIR}/src/implementation/core/cuda )

		IF( BUILD_CUDA )
		set( tnl_implementation_core_cuda_CUDA__SOURCES
		${CURRENT_DIR}/reduction-operations_impl.cu
		PARENT_SCOPE )
		endif()

		set( tnl_implementation_core_cuda_SOURCES
		${CURRENT_DIR}/device-check.cpp
		PARENT_SCOPE )

src/implementation/core/cuda/reduction-operations_impl.cu

0 → 100644

+16 −0

Original line number	Diff line number	Diff line
		/***************************************************************************
		reduction-operations.cu - description
		-------------------
		begin : Mar 22, 2013
		copyright : (C) 2013 by Tomas Oberhuber
		email : tomas.oberhuber@fjfi.cvut.cz
		***************************************************************************/

		/***************************************************************************
		* *
		* This program is free software; you can redistribute it and/or modify *
		* it under the terms of the GNU General Public License as published by *
		* the Free Software Foundation; either version 2 of the License, or *
		* (at your option) any later version. *
		* *
		***************************************************************************/
		No newline at end of file