Implementing Merson solver in CUDA. (38143495) · Commits · TNL / tnl-dev

src/diff/Makefile.am

+9 −4

Original line number	Diff line number	Diff line
		@@ -18,32 +18,37 @@ headers = \
		tnlGridCUDA1D.h \
		tnlGridCUDA2D.h \
		tnlGridCUDA3D.h \
		tnlGridCUDA2DTester.h \
		tnlGridSystem1D.h \
		tnlEulerSolver.h \
		tnlMersonSolver.h \
		tnlMersonSolverCUDA.h \
		tnlMersonSolverCUDA.cu.h \
		tnlMPIMesh2D.h \
		tnlMPIMesh3D.h \
		tnlNonlinearRungeKuttaSolver.h \
		curve-ident.h \
		norms.h

		sources= tnlMersonSolverCUDA.cu

		libtnldiffincludedir = $(TNL_INCLUDE_DIR)/diff
		libtnldiffinclude_HEADERS = $(headers)

		noinst_LTLIBRARIES = libtnldiff-0.1.la
		libtnldiff_0_1_la_SOURCES = $(sources) $(headers)
		libtnldiff_0_1_la_CXXFLAGS = $(CXXFLAGS) $(OPTIMISECXXFLAGS)
		libtnldiff_0_1_la_CXXFLAGS = $(OPTIMISECXXFLAGS)

		if BUILD_DBG
		noinst_LTLIBRARIES += libtnldiff-dbg-0.1.la
		libtnldiff_dbg_0_1_la_CXXFLAGS = $(CXXFLAGS) $(DBGCXXFLAGS)
		libtnldiff_dbg_0_1_la_CXXFLAGS = $(DBGCXXFLAGS)
		libtnldiff_dbg_0_1_la_LDFLAGS = $(LDFLAGS) $(DBGLDFLAGS)
		libtnldiff_dbg_0_1_la_SOURCES = $(sources) $(headers)
		endif

		if BUILD_MPI
		noinst_LTLIBRARIES += libtnldiff-mpi-0.1.la
		libtnldiff_mpi_0_1_la_CXXFLAGS = $(CXXFLAGS) $(MPICXXFLAGS) $(OPTIMISECXXFLAGS)
		libtnldiff_mpi_0_1_la_CXXFLAGS = $(MPICXXFLAGS) $(OPTIMISECXXFLAGS)
		libtnldiff_mpi_0_1_la_LDFLAGS = $(LDFLAGS) $(MPILDFLAGS)
		libtnldiff_mpi_0_1_la_SOURCES = $(sources) $(headers)
		endif
		@@ -51,7 +56,7 @@ endif

		if BUILD_MPI_DBG
		noinst_LTLIBRARIES += libtnldiff-mpi-dbg-0.1.la
		libtnldiff_mpi_dbg_0_1_la_CXXFLAGS = $(CXXFLAGS) $(MPICXXFLAGS) $(DBGCXXFLAGS)
		libtnldiff_mpi_dbg_0_1_la_CXXFLAGS = $(MPICXXFLAGS) $(DBGCXXFLAGS)
		libtnldiff_mpi_dbg_0_1_la_LDFLAGS = $(LDFLAGS) $(MPILDFLAGS) $(DBGLDFLAGS)
		libtnldiff_mpi_dbg_0_1_la_SOURCES = $(sources) $(headers)
		endif

src/diff/tnlGridCUDA2DTester.h

0 → 100644

+123 −0

Original line number	Diff line number	Diff line
		/*
		* tnlGridCUDA2DTester.h
		*
		* Created on: Jan 13, 2010
		* Author: Tomas Oberhuber
		*/

		#ifndef TNLGRIDCUDA2DTESTER_H_

		#include <cppunit/TestSuite.h>
		#include <cppunit/TestResult.h>
		#include <cppunit/TestCaller.h>
		#include <cppunit/TestCase.h>
		#include <diff/tnlGridCUDA2D.h>
		#include <diff/tnlGrid2D.h>

		#ifdef HAVE_CUDA

		#endif

		template< class T > class tnlGridCUDA2DTester : public CppUnit :: TestCase
		{
		public:
		tnlGridCUDA2DTester(){};

		virtual
		~tnlGridCUDA2DTester(){};

		static CppUnit :: Test* suite()
		{
		CppUnit :: TestSuite* suiteOfTests = new CppUnit :: TestSuite( "tnlGridCUDA2DTester" );
		CppUnit :: TestResult result;
		suiteOfTests -> addTest( new CppUnit :: TestCaller< tnlGridCUDA2DTester< T > >(
		"testAllocation",
		& tnlGridCUDA2DTester< T > :: testAllocation )
		);
		suiteOfTests -> addTest( new CppUnit :: TestCaller< tnlGridCUDA2DTester< T > >(
		"testCopying",
		& tnlGridCUDA2DTester< T > :: testCopying )
		);
		return suiteOfTests;
		}

		void testCopying()
		{
		#ifdef HAVE_CUDA
		const int size = 100;
		tnlGrid2D< T > host_grid( size, size, 0.0, 1.0, 0.0, 1.0 );
		tnlGridCUDA2D< T > device_grid( size, size, 0.0, 1.0, 0.0, 1.0 );
		for( int i = 0; i < size; i ++ )
		for( int j = 0; j < size; j ++ )
		host_grid( i, j ) = ( T ) ( i + j );
		device_grid. copyFrom( host_grid );
		host_grid. Zeros();
		host_grid. copyFrom( device_grid );
		int errs( 0 );
		for( int i = 0; i < size; i ++ )
		for( int j = 0; j < size; j ++ )
		if( host_grid( i, j ) != i + j ) errs ++;
		CPPUNIT_ASSERT( ! errs );
		#endif
		}

		void testAllocation()
		{
		#ifdef HAVE_CUDA
		tnlGridCUDA2D< T > cuda_grid_1;
		CPPUNIT_ASSERT( !cuda_grid_1 );

		cuda_grid_1. SetNewDimensions( 100, 100 );
		cuda_grid_1. SetNewDomain( 0.0, 1.0, 0.0, 1.0 );
		CPPUNIT_ASSERT( cuda_grid_1 );
		CPPUNIT_ASSERT( cuda_grid_1. GetXSize() == 100 );
		CPPUNIT_ASSERT( cuda_grid_1. GetYSize() == 100 );
		CPPUNIT_ASSERT( cuda_grid_1. GetAx() == 0.0 );
		CPPUNIT_ASSERT( cuda_grid_1. GetBx() == 1.0 );
		CPPUNIT_ASSERT( cuda_grid_1. GetAy() == 0.0 );
		CPPUNIT_ASSERT( cuda_grid_1. GetBy() == 1.0 );

		tnlGridCUDA2D< T > cuda_grid_2;
		CPPUNIT_ASSERT( !cuda_grid_2 );
		cuda_grid_2. SetNewDimensions( cuda_grid_1 );
		cuda_grid_2. SetNewDomain( cuda_grid_1 );
		CPPUNIT_ASSERT( cuda_grid_2. GetXSize() == 100 );
		CPPUNIT_ASSERT( cuda_grid_2. GetYSize() == 100 );
		CPPUNIT_ASSERT( cuda_grid_2. GetAx() == 0.0 );
		CPPUNIT_ASSERT( cuda_grid_2. GetBx() == 1.0 );
		CPPUNIT_ASSERT( cuda_grid_2. GetAy() == 0.0 );
		CPPUNIT_ASSERT( cuda_grid_2. GetBy() == 1.0 );

		tnlGridCUDA2D< T > cuda_grid_3( 100, 100, 0.0, 1.0, 0.0, 1.0 );
		CPPUNIT_ASSERT( cuda_grid_3. GetXSize() == 100 );
		CPPUNIT_ASSERT( cuda_grid_3. GetYSize() == 100 );
		CPPUNIT_ASSERT( cuda_grid_3. GetAx() == 0.0 );
		CPPUNIT_ASSERT( cuda_grid_3. GetBx() == 1.0 );
		CPPUNIT_ASSERT( cuda_grid_3. GetAy() == 0.0 );
		CPPUNIT_ASSERT( cuda_grid_3. GetBy() == 1.0 );

		tnlGridCUDA2D< T >* cuda_grid_4 = new tnlGridCUDA2D< T >( 100, 100, 0.0, 1.0, 0.0, 1.0 );
		tnlGridCUDA2D< T >* cuda_grid_5 = new tnlGridCUDA2D< T >;
		CPPUNIT_ASSERT( *cuda_grid_4 );
		CPPUNIT_ASSERT( ! *cuda_grid_5 );

		cuda_grid_5 -> SetSharedData( cuda_grid_4 -> Data(),
		cuda_grid_4 -> GetXSize(),
		cuda_grid_4 -> GetYSize()
		);
		CPPUNIT_ASSERT( *cuda_grid_5 );
		/* Shared data are not handled automaticaly.
		* One must be sure that data were not freed sooner
		* then any Grids stopped using it.
		*/
		delete cuda_grid_5;
		delete cuda_grid_4;
		#endif
		}

		};

		#define TNLGRIDCUDA2DTESTER_H_


		#endif /* TNLGRIDCUDA2DTESTER_H_ */

src/diff/tnlMersonSolverCUDA.cu

0 → 100644

+139 −0

Original line number	Diff line number	Diff line
		/***************************************************************************
		tnlMersonSolverCUDA.cu - description
		-------------------
		begin : Nov 21, 2009
		copyright : (C) 2009 by Tomas Oberhuber
		email : tomas.oberhuber@fjfi.cvut.cz
		***************************************************************************/

		/***************************************************************************
		* *
		* This program is free software; you can redistribute it and/or modify *
		* it under the terms of the GNU General Public License as published by *
		* the Free Software Foundation; either version 2 of the License, or *
		* (at your option) any later version. *
		* *
		***************************************************************************/

		#include <diff/tnlMersonSolverCUDA.cu.h>


		void computeK2Arg( const int size,
		const int block_size,
		const int grid_size,
		const float tau,
		const float* u,
		const float* k1,
		float* k2_arg )
		{
		computeK2ArgKernel<<< grid_size, block_size >>>( size, tau, u, k1, k2_arg );
		}

		void computeK2Arg( const int size,
		const int block_size,
		const int grid_size,
		const double tau,
		const double* u,
		const double* k1,
		double* k2_arg )
		{
		computeK2ArgKernel<<< grid_size, block_size >>>( size, tau, u, k1, k2_arg );
		}

		void computeK3Arg( const int size,
		const int block_size,
		const int grid_size,
		const float tau,
		const float* u,
		const float* k1,
		const float* k2,
		float* k3_arg )
		{
		computeK3ArgKernel<<< grid_size, block_size >>>( size, tau, u, k1, k2, k3_arg );
		}

		void computeK3Arg( const int size,
		const int block_size,
		const int grid_size,
		const double tau,
		const double* u,
		const double* k1,
		const double* k2,
		double* k3_arg )
		{
		computeK3ArgKernel<<< grid_size, block_size >>>( size, tau, u, k1, k2, k3_arg );
		}

		void computeK4Arg( const int size,
		const int block_size,
		const int grid_size,
		const float tau,
		const float* u,
		const float* k1,
		const float* k3,
		float* k4_arg )
		{
		computeK4ArgKernel<<< grid_size, block_size >>>( size, tau, u, k1, k3, k4_arg );
		}

		void computeK4Arg( const int size,
		const int block_size,
		const int grid_size,
		const double tau,
		const double* u,
		const double* k1,
		const double* k3,
		double* k4_arg )
		{
		computeK4ArgKernel<<< grid_size, block_size >>>( size, tau, u, k1, k3, k4_arg );
		}

		void computeK5Arg( const int size,
		const int block_size,
		const int grid_size,
		const float tau,
		const float* u,
		const float* k1,
		const float* k3,
		const float* k4,
		float* k5_arg )
		{
		computeK5ArgKernel<<< grid_size, block_size >>>( size, tau, u, k1, k3, k4, k5_arg );
		}

		void computeK5Arg( const int size,
		const int block_size,
		const int grid_size,
		const double tau,
		const double* u,
		const double* k1,
		const double* k3,
		const double* k4,
		double* k5_arg )
		{
		computeK5ArgKernel<<< grid_size, block_size >>>( size, tau, u, k1, k3, k4, k5_arg );
		}

		void updateU( const int size,
		const int block_size,
		const int grid_size,
		const float tau,
		const float* k1,
		const float* k4,
		const float* k5,
		float* u )
		{
		updateU<<< grid_size, block_size >>>( size, tau, k1, k4, k5, u );
		}

		void updateU( const int size,
		const int block_size,
		const int grid_size,
		const double tau,
		const double* k1,
		const double* k4,
		const double* k5,
		float* u )
		{
		updateU<<< grid_size, block_size >>>( size, tau, k1, k4, k5, u );
		}
		No newline at end of file

src/diff/tnlMersonSolverCUDA.cu.h

0 → 100644

+47 −0

Original line number	Diff line number	Diff line
		/*
		* tnlMersonSolverCUDA.cu.h
		*
		* Created on: Jan 13, 2010
		* Author: Tomas Oberhuber
		*/

		#ifndef TNLMERSONSOLVERCUDA_CU_H_
		#define TNLMERSONSOLVERCUDA_CU_H_

		template< class T > __global__ void computeK2ArgKernel( const int size, const T tau, const T* u, const T* k1, T* k2_arg )
		{
		int i = blockIdx. x * blockDim. x + threadIdx. x;
		if( i < size )
		k2_arg[ i ] = u[ i ] + tau * ( 1.0 / 3.0 * k1[ i ] );
		}

		template< class T > __global__ void computeK3ArgKernel( const int size, const T tau, const T* u, const T* k1, const T* k2, T* k3_arg )
		{
		int i = blockIdx. x * blockDim. x + threadIdx. x;
		if( i < size )
		k3_arg[ i ] = u[ i ] + tau * 1.0 / 6.0 * ( k1[ i ] + k2[ i ] );
		}

		template< class T > __global__ void computeK4ArgKernel( const int size, const T tau, const T* u, const T* k1, const T* k3, T* k4_arg )
		{
		int i = blockIdx. x * blockDim. x + threadIdx. x;
		if( i < size )
		k4_arg[ i ] = u[ i ] + tau * ( 0.125 * k1[ i ] + 0.375 * k3[ i ] );
		}

		template< class T > __global__ void computeK5ArgKernel( const int size, const T tau, const T* u, const T* k1, const T* k3, const T* k4, T* k5_arg )
		{
		int i = blockIdx. x * blockDim. x + threadIdx. x;
		if( i < size )
		k5_arg[ i ] = u[ i ] + tau * ( 0.5 * k1[ i ] - 1.5 * k3[ i ] + 2.0 * k4[ i ] );
		}

		template< class T > __global__ void updateUKernel( const int size, const T tau, const T* k1, const T* k4, const T* k5, T* u )
		{
		int i = blockIdx. x * blockDim. x + threadIdx. x;
		if( i < size )
		u[ i ] += tau / 6.0 * ( k1[ i ] + 4.0 * k4[ i ] + k5[ i ] );
		}


		#endif /* TNLMERSONSOLVERCUDA_CU_H_ */

src/diff/tnlMersonSolverCUDA.h

0 → 100644

+200 −0

Original line number	Diff line number	Diff line
		/***************************************************************************
		tnlMersonSolverCUDA.h - description
		-------------------
		begin : 2007/06/16
		copyright : (C) 2007 by Tomas Oberhuber
		email : tomas.oberhuber@fjfi.cvut.cz
		***************************************************************************/

		/***************************************************************************
		* *
		* This program is free software; you can redistribute it and/or modify *
		* it under the terms of the GNU General Public License as published by *
		* the Free Software Foundation; either version 2 of the License, or *
		* (at your option) any later version. *
		* *
		***************************************************************************/

		#ifndef tnlMersonSolverH
		#define tnlMersonSolverH

		#include <math.h>
		#include <diff/tnlExplicitSolver.h>

		#ifdef HAVE_CUDA
		void computeK2Arg( const int size, const float* u, const float* k1, float* k2_arg );
		void computeK2Arg( const int size, const double* u, const double* k1, double* k2_arg );
		void computeK3Arg( const int size, const float* u, const float* k1, const float* k2, float* k3_arg );
		void computeK3Arg( const int size, const double* u, const double* k1, const double* k2, double* k3_arg );
		void computeK4Arg( const int size, const float* u, const float* k1, const float* k3, float* k4_arg );
		void computeK4Arg( const int size, const double* u, const double* k1, const double* k3, double* k4_arg );
		void computeK5Arg( const int size, const float* u, const float* k1, const float* k3, const float* k4, float* k5_arg );
		void computeK5Arg( const int size, const double* u, const double* k1, const double* k3, const double* k4, double* k5_arg );
		#endif

		template< class GRID, class SCHEME, typename T = double > class tnlMersonSolver : public tnlExplicitSolver< GRID, SCHEME, T >
		{
		public:

		tnlMersonSolver( const GRID& v )
		{
		k1 = new GRID( v );
		k2 = new GRID( v );
		k3 = new GRID( v );
		k4 = new GRID( v );
		k5 = new GRID( v );
		k_tmp = new GRID( v );
		if( ! k1 \|\| ! k2 \|\| ! k3 \|\| ! k4 \|\| ! k5 \|\| ! k_tmp )
		{
		cerr << "Unable to allocate supporting structures for the Merson solver." << endl;
		abort();
		};
		k1 -> Zeros();
		k2 -> Zeros();
		k3 -> Zeros();
		k4 -> Zeros();
		k5 -> Zeros();
		k_tmp -> Zeros();
		};

		tnlString GetType() const
		{
		T t;
		GRID grid;
		return tnlString( "tnlMersonSolver< " ) + grid. GetType() +
		tnlString( ", " ) + GetParameterType( t ) + tnlString( " >" );
		};

		void SetAdaptivity( const double& a )
		{
		adaptivity = a;
		};

		bool Solve( SCHEME& scheme,
		GRID& u,
		const double& stop_time,
		const double& max_res,
		const int max_iter )
		{
		T* _k1 = k1 -> Data();
		T* _k2 = k2 -> Data();
		T* _k3 = k3 -> Data();
		T* _k4 = k4 -> Data();
		T* _k5 = k5 -> Data();
		T* _k_tmp = k_tmp -> Data();
		T* _u = u. Data();

		tnlExplicitSolver< GRID, SCHEME, T > :: iteration = 0;
		double& _time = tnlExplicitSolver< GRID, SCHEME, T > :: time;
		double& _residue = tnlExplicitSolver< GRID, SCHEME, T > :: residue;
		int& _iteration = tnlExplicitSolver< GRID, SCHEME, T > :: iteration;
		const double size_inv = 1.0 / ( double ) u. GetSize();

		const int block_size = 512;
		const int grid_size = ( size - 1 ) / 512 + 1;

		T _tau = tnlExplicitSolver< GRID, SCHEME, T > :: tau;
		if( _time + _tau > stop_time ) _tau = stop_time - _time;
		if( _tau == 0.0 ) return true;

		if( tnlExplicitSolver< GRID, SCHEME, T > :: verbosity > 0 )
		tnlExplicitSolver< GRID, SCHEME, T > :: PrintOut();
		while( 1 )
		{

		int i;
		int size = k1 -> GetSize();
		assert( size == u. GetSize() );

		T tau_3 = _tau / 3.0;

		scheme. GetExplicitRHSCUDA( _time, u, *k1 );

		computeK2Arg( size, block_size, grid_size, tau, _u, _k1, _k_tmp );
		scheme. GetExplicitRHSCUDA( _time + tau_3, k_tmp, k2 );

		computeK3Arg( size, block_size, grid_size, tau, _u, _k1, _k2, _k_tmp );
		scheme. GetExplicitRHSCUDA( _time + tau_3, k_tmp, k3 );

		computeK4Arg( size, block_size, grid_size, tau, _u, _k1, _k3, _k_tmp );
		scheme. GetExplicitRHSCUDA( _time + 0.5 * _tau, k_tmp, k4 );

		computeK5Arg( size, block_size, grid_size, tau, _k1, _k3, _k4, _k_tmp );
		scheme. GetExplicitRHSCUDA( _time + _tau, k_tmp, k5 );

		double eps( 0.0 ), max_eps( 0.0 );
		if( adaptivity )
		{
		for( i = 0; i < size; i ++ )
		{
		eps = Max( eps, _tau / 3.0 *
		fabs( 0.2 * _k1[ i ] +
		-0.9 * _k3[ i ] +
		0.8 * _k4[ i ] +
		-0.1 * _k5[ i ] ) );
		}
		:: MPIAllreduce( eps, max_eps, 1, MPI_MAX, tnlExplicitSolver< GRID, SCHEME, T > :: solver_comm );
		//if( MPIGetRank() == 0 )
		// cout << "eps = " << eps << " " << endl;
		//
		}
		//cout << endl << "max_eps = " << max_eps << endl;
		if( ! adaptivity \|\| max_eps < adaptivity )
		{
		double last_residue = _residue;
		double loc_residue = 0.0;

		updateU( size, tau, _k1, _k4, _k5 );
		// TODO: implement loc_residue - if possible

		if( _tau + _time == stop_time ) _residue = last_residue; // fixing strange values of res. at the last iteration
		else
		{
		loc_residue /= _tau * size_inv;
		:: MPIAllreduce( loc_residue, _residue, 1, MPI_SUM, tnlExplicitSolver< GRID, SCHEME, T > :: solver_comm );
		}
		_time += _tau;
		_iteration ++;
		}
		if( adaptivity && max_eps != 0.0 )
		{
		_tau = 0.8 pow( adaptivity / max_eps, 0.2 );
		:: MPIBcast( _tau, 1, 0, tnlExplicitSolver< GRID, SCHEME, T > :: solver_comm );
		}

		if( _time + _tau > stop_time )
		_tau = stop_time - _time; //we don't want to keep such tau
		else tnlExplicitSolver< GRID, SCHEME, T > :: tau = _tau;

		if( tnlExplicitSolver< GRID, SCHEME, T > :: verbosity > 1 )
		tnlExplicitSolver< GRID, SCHEME, T > :: PrintOut();

		if( _time == stop_time \|\|
		( max_res && _residue < max_res ) )
		{
		if( tnlExplicitSolver< GRID, SCHEME, T > :: verbosity > 0 )
		tnlExplicitSolver< GRID, SCHEME, T > :: PrintOut();
		return true;
		}
		//if( max_iter && _iteration == max_iter ) return false;
		}
		};

		~tnlMersonSolver()
		{
		delete k1;
		delete k2;
		delete k3;
		delete k4;
		delete k5;
		delete k_tmp;
		};

		protected:

		GRID k1, k2, k3, k4, k5, k_tmp;

		double adaptivity;
		};

		#endif