Fixing the CUDA parallel reduction. (6ae56ba0) · Commits · TNL / tnl-dev

CMakeLists.txt

+13 −4

Original line number	Diff line number	Diff line
		@@ -28,9 +28,15 @@ else()
		AddCompilerFlag( "-O3 -DNDEBUG" )
		endif()

		if( WITH_TEMPLATE_EXPLICIT_INSTANTIATION STREQUAL "yes" )
		AddCompilerFlag( "-DTEMPLATE_EXPLICIT_INSTANTIATION" )
		endif()

		if( WITH_CUDA STREQUAL "yes" )
		AddCompilerFlag( "-DHAVE_NOT_CXX11" )
		else()
		AddCompilerFlag( "-std=gnu++0x" )
		#AddCompilerFlag( "-DHAVE_NOT_CXX11" )
		endif()


		#####
		@@ -288,7 +294,10 @@ if(WIN32 AND NOT UNIX)
		set(CPACK_NSIS_MODIFY_PATH ON)
		else(WIN32 AND NOT UNIX)
		# set(CPACK_STRIP_FILES "bin/MyExecutable")
		set(CPACK_SOURCE_STRIP_FILES "")
		endif(WIN32 AND NOT UNIX)

		set(CPACK_SOURCE_STRIP_FILES "Debug")
		set(CPACK_SOURCE_STRIP_FILES "Release")

		#set(CPACK_PACKAGE_EXECUTABLES "MyExecutable" "My Executable")
		INCLUDE( CPack )

buildAll

+15 −4

Original line number	Diff line number	Diff line
		@@ -2,14 +2,14 @@

		TARGET=TNL
		INSTALL_PREFIX=${HOME}/local
		WITH_CUDA=no
		WITH_CUDA=yes
		WITH_CUSPARSE=no
		CUDA_ARCHITECTURE=2.0
		TEMPLATE_EXPLICIT_INSTANTIATION=yes
		VERBOSE=1

		CMAKE="cmake"
		CPUS=`grep -c processor /proc/cpuinfo`
		CPUS=1

		echo "Building $TARGET using $CPUS processors."

		@@ -23,13 +23,24 @@ then
		fi

		cd Debug
		${CMAKE} .. -DCMAKE_BUILD_TYPE=Debug -DCMAKE_INSTALL_PREFIX=${HOME}/local -DCUDA_ARCHITECTURE=${CUDA_ARCHITECTURE} -DWITH_CUDA=${WITH_CUDA} -DWITH_CUSPARSE=${WITH_CUSPARSE} -DPETSC_DIR=${PETSC_DIR}
		${CMAKE} .. -DCMAKE_BUILD_TYPE=Debug \
		-DCMAKE_INSTALL_PREFIX=${HOME}/local \
		-DCUDA_ARCHITECTURE=${CUDA_ARCHITECTURE} \
		-DWITH_CUDA=${WITH_CUDA} \
		-DWITH_CUSPARSE=${WITH_CUSPARSE} \
		-DPETSC_DIR=${PETSC_DIR} \
		-DWITH_TEMPLATE_EXPLICIT_INSTANTIATION=${TEMPLATE_EXPLICIT_INSTANTIATION}
		make -j${CPUS} #VERBOSE=1
		make -j${CPUS} test
		make -j${CPUS} install

		cd ../Release
		${CMAKE} .. -DCMAKE_INSTALL_PREFIX=${HOME}/local -DCUDA_ARCHITECTURE=${CUDA_ARCHITECTURE} -DWITH_CUDA=${WITH_CUDA} -DWITH_CUSPARSE=${WITH_CUSPARSE} -DPETSC_DIR=${PETSC_DIR}
		${CMAKE} .. -DCMAKE_INSTALL_PREFIX=${HOME}/local \
		-DCUDA_ARCHITECTURE=${CUDA_ARCHITECTURE} \
		-DWITH_CUDA=${WITH_CUDA} \
		-DWITH_CUSPARSE=${WITH_CUSPARSE} \
		-DPETSC_DIR=${PETSC_DIR} \
		-DWITH_TEMPLATE_EXPLICIT_INSTANTIATION=${TEMPLATE_EXPLICIT_INSTANTIATION}
		make -j${CPUS} #VERBOSE=1
		make -j${CPUS} test
		make -j${CPUS} install

src/core/cuda/reduction-operations.h

+17 −11

Original line number	Diff line number	Diff line
		@@ -93,6 +93,12 @@ __device__ inline double tnlCudaAbs( const double& a )
		{
		return fabs( a );
		}

		/****
		 * Device-side power helper: returns x raised to the given power.
		 *
		 * Both arguments are converted to double, passed to pow(), and the
		 * result is cast back to Type1, so the return type follows the type
		 * of the base x, not the type of the exponent.
		 *
		 * NOTE(review): for Type1 = float the computation still goes through
		 * the double-precision pow() - confirm this is acceptable on the
		 * targeted GPU architecture.
		 */
		template< typename Type1, typename Type2 >
		__device__ Type1 tnlCudaPow( const Type1& x, const Type2& power )
		{
		return ( Type1 ) pow( ( double ) x, ( double ) power );
		}
		#endif

		template< typename Real, typename Index >
		@@ -683,14 +689,14 @@ class tnlParallelReductionLpNorm
		const RealType* data1,
		const RealType* data2 ) const
		{
		return pow( tnlCudaAbs( data1[ idx1 ] ), p ) + pow( tnlCudaAbs( data1[ idx2 ] ), p );
		return tnlCudaPow( tnlCudaAbs( data1[ idx1 ] ), p ) + tnlCudaPow( tnlCudaAbs( data1[ idx2 ] ), p );
		}

		__device__ ResultType initialValueOnDevice( const IndexType idx1,
		const RealType* data1,
		const RealType* data2 ) const
		{
		return pow( tnlCudaAbs( data1[ idx1 ] ), p );
		return tnlCudaPow( tnlCudaAbs( data1[ idx1 ] ), p );
		};

		__device__ ResultType firstReductionOnDevice( const IndexType idx1,
		@@ -701,8 +707,8 @@ class tnlParallelReductionLpNorm
		const RealType* data3 ) const
		{
		return data1[ idx1 ] +
		pow( tnlCudaAbs( data2[ idx2 ] ), p ) +
		pow( tnlCudaAbs( data2[ idx3 ] ), p );
		tnlCudaPow( tnlCudaAbs( data2[ idx2 ] ), p ) +
		tnlCudaPow( tnlCudaAbs( data2[ idx3 ] ), p );
		};

		__device__ ResultType firstReductionOnDevice( const IndexType idx1,
		@@ -711,7 +717,7 @@ class tnlParallelReductionLpNorm
		const RealType* data2,
		const RealType* data3 ) const
		{
		return data1[ idx1 ] + pow( tnlCudaAbs( data2[ idx2 ] ), p );
		return data1[ idx1 ] + tnlCudaPow( tnlCudaAbs( data2[ idx2 ] ), p );
		};

		__device__ ResultType commonReductionOnDevice( const IndexType idx1,
		@@ -1409,15 +1415,15 @@ class tnlParallelReductionDiffLpNorm
		const RealType* data1,
		const RealType* data2 ) const
		{
		return pow( tnlCudaAbs( data1[ idx1 ] - data2[ idx1 ] ), p ) +
		pow( tnlCudaAbs( data1[ idx2 ] - data2[ idx2 ] ), p );
		return tnlCudaPow( tnlCudaAbs( data1[ idx1 ] - data2[ idx1 ] ), p ) +
		tnlCudaPow( tnlCudaAbs( data1[ idx2 ] - data2[ idx2 ] ), p );
		}

		__device__ ResultType initialValueOnDevice( const IndexType idx1,
		const RealType* data1,
		const RealType* data2 ) const
		{
		return pow( tnlCudaAbs( data1[ idx1 ] - data2[ idx1 ] ), p );
		return tnlCudaPow( tnlCudaAbs( data1[ idx1 ] - data2[ idx1 ] ), p );
		};

		__device__ ResultType firstReductionOnDevice( const IndexType idx1,
		@@ -1428,8 +1434,8 @@ class tnlParallelReductionDiffLpNorm
		const RealType* data3 ) const
		{
		return data1[ idx1 ] +
		pow( tnlCudaAbs( data2[ idx2 ] - data3[ idx2 ] ), p ) +
		pow( tnlCudaAbs( data2[ idx3 ] - data3[ idx3 ] ), p );
		tnlCudaPow( tnlCudaAbs( data2[ idx2 ] - data3[ idx2 ] ), p ) +
		tnlCudaPow( tnlCudaAbs( data2[ idx3 ] - data3[ idx3 ] ), p );
		};

		__device__ ResultType firstReductionOnDevice( const IndexType idx1,
		@@ -1438,7 +1444,7 @@ class tnlParallelReductionDiffLpNorm
		const RealType* data2,
		const RealType* data3 ) const
		{
		return data1[ idx1 ] + pow( tnlCudaAbs( data2[ idx2 ] - data3[ idx2 ] ), p );
		return data1[ idx1 ] + tnlCudaPow( tnlCudaAbs( data2[ idx2 ] - data3[ idx2 ] ), p );
		};

		__device__ ResultType commonReductionOnDevice( const IndexType idx1,

src/core/mfuncs.h

+4 −4

Original line number	Diff line number	Diff line
		@@ -21,19 +21,19 @@
		#include <math.h>
		#include <stdlib.h>

		template< class T > T Min( const T& a, const T& b )
		template< typename Type1, typename Type2 > Type1 Min( const Type1& a, const Type2& b )
		{
		return a < b ? a : b;
		};

		template< class T > T Max( const T& a, const T& b )
		template< typename Type1, typename Type2 > Type1 Max( const Type1& a, const Type2& b )
		{
		return a > b ? a : b;
		};

		template< class T > void Swap( T& a, T& b )
		template< typename Type > void Swap( Type& a, Type& b )
		{
		T tmp( a );
		Type tmp( a );
		a = b;
		b = tmp;
		};

src/implementation/solvers/ode/tnlMersonSolver_impl.h

+8 −0

Original line number	Diff line number	Diff line
		@@ -21,6 +21,14 @@
		#include <core/tnlHost.h>
		#include <core/tnlCuda.h>

		/****
		* In this code we do not use constants and references as we would like to,
		* because OpenMP would otherwise fail with
		*
		* error: ‘some-variable’ is predetermined ‘shared’ for ‘firstprivate’
		*
		*/

		#ifdef HAVE_CUDA

		template< typename Real, typename Index >