Commit 38143495 authored by Tomáš Oberhuber's avatar Tomáš Oberhuber
Browse files

Implementing Merson solver in CUDA.

parent 890c22bb
Loading
Loading
Loading
Loading
+9 −4
Original line number Diff line number Diff line
@@ -18,32 +18,37 @@ headers = \
   tnlGridCUDA1D.h \
   tnlGridCUDA2D.h \
   tnlGridCUDA3D.h \
   tnlGridCUDA2DTester.h \
   tnlGridSystem1D.h \
   tnlEulerSolver.h \
   tnlMersonSolver.h \
   tnlMersonSolverCUDA.h \
   tnlMersonSolverCUDA.cu.h \
   tnlMPIMesh2D.h \
   tnlMPIMesh3D.h \
   tnlNonlinearRungeKuttaSolver.h \
   curve-ident.h \
   norms.h

sources= tnlMersonSolverCUDA.cu

libtnldiffincludedir = $(TNL_INCLUDE_DIR)/diff
libtnldiffinclude_HEADERS = $(headers)

noinst_LTLIBRARIES = libtnldiff-0.1.la
libtnldiff_0_1_la_SOURCES = $(sources) $(headers)
libtnldiff_0_1_la_CXXFLAGS = $(CXXFLAGS) $(OPTIMISECXXFLAGS)
libtnldiff_0_1_la_CXXFLAGS = $(OPTIMISECXXFLAGS)

if BUILD_DBG
   noinst_LTLIBRARIES += libtnldiff-dbg-0.1.la 
   libtnldiff_dbg_0_1_la_CXXFLAGS = $(CXXFLAGS) $(DBGCXXFLAGS)
   libtnldiff_dbg_0_1_la_CXXFLAGS = $(DBGCXXFLAGS)
   libtnldiff_dbg_0_1_la_LDFLAGS =  $(LDFLAGS) $(DBGLDFLAGS)
   libtnldiff_dbg_0_1_la_SOURCES =  $(sources) $(headers)
endif

if BUILD_MPI
   noinst_LTLIBRARIES += libtnldiff-mpi-0.1.la 
   libtnldiff_mpi_0_1_la_CXXFLAGS = $(CXXFLAGS) $(MPICXXFLAGS) $(OPTIMISECXXFLAGS)
   libtnldiff_mpi_0_1_la_CXXFLAGS = $(MPICXXFLAGS) $(OPTIMISECXXFLAGS)
   libtnldiff_mpi_0_1_la_LDFLAGS =  $(LDFLAGS) $(MPILDFLAGS)
   libtnldiff_mpi_0_1_la_SOURCES =  $(sources) $(headers)
endif
@@ -51,7 +56,7 @@ endif

if BUILD_MPI_DBG
   noinst_LTLIBRARIES += libtnldiff-mpi-dbg-0.1.la 
   libtnldiff_mpi_dbg_0_1_la_CXXFLAGS = $(CXXFLAGS) $(MPICXXFLAGS) $(DBGCXXFLAGS)
   libtnldiff_mpi_dbg_0_1_la_CXXFLAGS = $(MPICXXFLAGS) $(DBGCXXFLAGS)
   libtnldiff_mpi_dbg_0_1_la_LDFLAGS = $(LDFLAGS) $(MPILDFLAGS) $(DBGLDFLAGS)
   libtnldiff_mpi_dbg_0_1_la_SOURCES = $(sources) $(headers)
endif
+123 −0
Original line number Diff line number Diff line
/*
 * tnlGridCUDA2DTester.h
 *
 *  Created on: Jan 13, 2010
 *      Author: Tomas Oberhuber
 */

#ifndef TNLGRIDCUDA2DTESTER_H_

#include <cppunit/TestSuite.h>
#include <cppunit/TestResult.h>
#include <cppunit/TestCaller.h>
#include <cppunit/TestCase.h>
#include <diff/tnlGridCUDA2D.h>
#include <diff/tnlGrid2D.h>

#ifdef HAVE_CUDA

#endif

template< class T > class tnlGridCUDA2DTester : public CppUnit :: TestCase
{
   public:
   tnlGridCUDA2DTester(){};

   virtual
   ~tnlGridCUDA2DTester(){};

   static CppUnit :: Test* suite()
   {
      CppUnit :: TestSuite* suiteOfTests = new CppUnit :: TestSuite( "tnlGridCUDA2DTester" );
      CppUnit :: TestResult result;
      suiteOfTests -> addTest( new CppUnit :: TestCaller< tnlGridCUDA2DTester< T > >(
                               "testAllocation",
                               & tnlGridCUDA2DTester< T > :: testAllocation )
                             );
      suiteOfTests -> addTest( new CppUnit :: TestCaller< tnlGridCUDA2DTester< T > >(
                               "testCopying",
                               & tnlGridCUDA2DTester< T > :: testCopying )
                              );
            return suiteOfTests;
   }

   void testCopying()
   {
#ifdef HAVE_CUDA
	   const int size = 100;
	   tnlGrid2D< T > host_grid( size, size, 0.0, 1.0, 0.0, 1.0 );
	   tnlGridCUDA2D< T > device_grid( size, size, 0.0, 1.0, 0.0, 1.0 );
	   for( int i = 0; i < size; i ++ )
		   for( int j = 0; j < size; j ++ )
			   host_grid( i, j ) = ( T ) ( i + j );
	   device_grid. copyFrom( host_grid );
	   host_grid. Zeros();
	   host_grid. copyFrom( device_grid );
	   int errs( 0 );
	   for( int i = 0; i < size; i ++ )
		   for( int j = 0; j < size; j ++ )
			   if( host_grid( i, j ) != i + j ) errs ++;
	   CPPUNIT_ASSERT( ! errs );
#endif
   }

   void testAllocation()
   {
#ifdef HAVE_CUDA
      tnlGridCUDA2D< T > cuda_grid_1;
      CPPUNIT_ASSERT( !cuda_grid_1 );

      cuda_grid_1. SetNewDimensions( 100, 100 );
      cuda_grid_1. SetNewDomain( 0.0, 1.0, 0.0, 1.0 );
      CPPUNIT_ASSERT( cuda_grid_1 );
      CPPUNIT_ASSERT( cuda_grid_1. GetXSize() == 100 );
      CPPUNIT_ASSERT( cuda_grid_1. GetYSize() == 100 );
      CPPUNIT_ASSERT( cuda_grid_1. GetAx() == 0.0 );
      CPPUNIT_ASSERT( cuda_grid_1. GetBx() == 1.0 );
      CPPUNIT_ASSERT( cuda_grid_1. GetAy() == 0.0 );
      CPPUNIT_ASSERT( cuda_grid_1. GetBy() == 1.0 );

      tnlGridCUDA2D< T > cuda_grid_2;
      CPPUNIT_ASSERT( !cuda_grid_2 );
      cuda_grid_2. SetNewDimensions( cuda_grid_1 );
      cuda_grid_2. SetNewDomain( cuda_grid_1 );
      CPPUNIT_ASSERT( cuda_grid_2. GetXSize() == 100 );
      CPPUNIT_ASSERT( cuda_grid_2. GetYSize() == 100 );
      CPPUNIT_ASSERT( cuda_grid_2. GetAx() == 0.0 );
      CPPUNIT_ASSERT( cuda_grid_2. GetBx() == 1.0 );
      CPPUNIT_ASSERT( cuda_grid_2. GetAy() == 0.0 );
      CPPUNIT_ASSERT( cuda_grid_2. GetBy() == 1.0 );

      tnlGridCUDA2D< T > cuda_grid_3( 100, 100, 0.0, 1.0, 0.0, 1.0 );
      CPPUNIT_ASSERT( cuda_grid_3. GetXSize() == 100 );
      CPPUNIT_ASSERT( cuda_grid_3. GetYSize() == 100 );
      CPPUNIT_ASSERT( cuda_grid_3. GetAx() == 0.0 );
      CPPUNIT_ASSERT( cuda_grid_3. GetBx() == 1.0 );
      CPPUNIT_ASSERT( cuda_grid_3. GetAy() == 0.0 );
      CPPUNIT_ASSERT( cuda_grid_3. GetBy() == 1.0 );

      tnlGridCUDA2D< T >* cuda_grid_4 = new tnlGridCUDA2D< T >( 100, 100, 0.0, 1.0, 0.0, 1.0 );
      tnlGridCUDA2D< T >* cuda_grid_5 = new tnlGridCUDA2D< T >;
      CPPUNIT_ASSERT( *cuda_grid_4 );
      CPPUNIT_ASSERT( ! *cuda_grid_5 );

      cuda_grid_5 -> SetSharedData( cuda_grid_4 -> Data(),
                                    cuda_grid_4 -> GetXSize(),
                                    cuda_grid_4 -> GetYSize()
                                   );
      CPPUNIT_ASSERT( *cuda_grid_5 );
      /* Shared data are not handled automaticaly.
       * One must be sure that data were not freed sooner
       * then any Grids stopped using it.
       */
      delete cuda_grid_5;
      delete cuda_grid_4;
#endif
   }

};

#define TNLGRIDCUDA2DTESTER_H_


#endif /* TNLGRIDCUDA2DTESTER_H_ */
+139 −0
Original line number Diff line number Diff line
/***************************************************************************
                          tnlMersonSolverCUDA.cu  -  description
                             -------------------
    begin                : Nov 21, 2009
    copyright            : (C) 2009 by Tomas Oberhuber
    email                : tomas.oberhuber@fjfi.cvut.cz
 ***************************************************************************/

/***************************************************************************
 *                                                                         *
 *   This program is free software; you can redistribute it and/or modify  *
 *   it under the terms of the GNU General Public License as published by  *
 *   the Free Software Foundation; either version 2 of the License, or     *
 *   (at your option) any later version.                                   *
 *                                                                         *
 ***************************************************************************/
 
#include <diff/tnlMersonSolverCUDA.cu.h>


void computeK2Arg( const int size, 
                   const int block_size,
                   const int grid_size,
                   const float tau,
                   const float* u,
                   const float* k1,
                   float* k2_arg )
{
   computeK2ArgKernel<<< grid_size, block_size >>>( size, tau, u, k1, k2_arg );
}

void computeK2Arg( const int size,
                   const int block_size,
                   const int grid_size,
                   const double tau,
                   const double* u,
                   const double* k1,
                   double* k2_arg )
{
   computeK2ArgKernel<<< grid_size, block_size >>>( size, tau, u, k1, k2_arg );
}

void computeK3Arg( const int size,
                   const int block_size,
                   const int grid_size,
                   const float tau,
                   const float* u,
                   const float* k1,
                   const float* k2,
                   float* k3_arg )
{
   computeK3ArgKernel<<< grid_size, block_size >>>( size, tau, u, k1, k2, k3_arg );
}

void computeK3Arg( const int size,
                   const int block_size,
                   const int grid_size,
                   const double tau,
                   const double* u,
                   const double* k1,
                   const double* k2,
                   double* k3_arg )
{
   computeK3ArgKernel<<< grid_size, block_size >>>( size, tau, u, k1, k2, k3_arg );
}

void computeK4Arg( const int size,
                   const int block_size,
                   const int grid_size,
                   const float tau,
                   const float* u,
                   const float* k1,
                   const float* k3,
                   float* k4_arg )
{
   computeK4ArgKernel<<< grid_size, block_size >>>( size, tau, u, k1, k3, k4_arg );
}

void computeK4Arg( const int size,
                   const int block_size,
                   const int grid_size,
                   const double tau,
                   const double* u,
                   const double* k1,
                   const double* k3,
                   double* k4_arg )
{
   computeK4ArgKernel<<< grid_size, block_size >>>( size, tau, u, k1, k3, k4_arg );
}

void computeK5Arg( const int size,
                   const int block_size,
                   const int grid_size,
                   const float tau,
                   const float* u,
                   const float* k1,
                   const float* k3,
                   const float* k4,
                   float* k5_arg )
{
   computeK5ArgKernel<<< grid_size, block_size >>>( size, tau, u, k1, k3, k4, k5_arg );
}

void computeK5Arg( const int size,
                   const int block_size,
                   const int grid_size,
                   const double tau,
                   const double* u,
                   const double* k1,
                   const double* k3,
                   const double* k4,
                   double* k5_arg )
{
   computeK5ArgKernel<<< grid_size, block_size >>>( size, tau, u, k1, k3, k4, k5_arg );
}

void updateU( const int size,
              const int block_size,
              const int grid_size,
              const float tau,
              const float* k1,
              const float* k4,
              const float* k5,
              float* u )
{
   updateU<<< grid_size, block_size >>>( size, tau, k1, k4, k5, u );
}

void updateU( const int size,
              const int block_size,
              const int grid_size,
              const double tau,
              const double* k1,
              const double* k4,
              const double* k5,
              float* u )
{
   updateU<<< grid_size, block_size >>>( size, tau, k1, k4, k5, u );
}
 No newline at end of file
+47 −0
Original line number Diff line number Diff line
/*
 * tnlMersonSolverCUDA.cu.h
 *
 *  Created on: Jan 13, 2010
 *      Author: Tomas Oberhuber
 */

#ifndef TNLMERSONSOLVERCUDA_CU_H_
#define TNLMERSONSOLVERCUDA_CU_H_

template< class T > __global__ void computeK2ArgKernel( const int size, const T tau, const T* u, const T* k1, T* k2_arg )
{
	int i = blockIdx. x * blockDim. x + threadIdx. x;
	if( i < size )
		k2_arg[ i ] = u[ i ] + tau * ( 1.0 / 3.0 * k1[ i ] );
}

template< class T > __global__ void computeK3ArgKernel( const int size, const T tau, const T* u, const T* k1, const T* k2, T* k3_arg )
{
	int i = blockIdx. x * blockDim. x + threadIdx. x;
	if( i < size )
		k3_arg[ i ] = u[ i ] + tau * 1.0 / 6.0 * ( k1[ i ] + k2[ i ] );
}

template< class T > __global__ void computeK4ArgKernel( const int size, const T tau, const T* u, const T* k1, const T* k3, T* k4_arg )
{
	int i = blockIdx. x * blockDim. x + threadIdx. x;
	if( i < size )
		k4_arg[ i ] = u[ i ] + tau * ( 0.125 * k1[ i ] + 0.375 * k3[ i ] );
}

template< class T > __global__ void computeK5ArgKernel( const int size, const T tau, const T* u, const T* k1, const T* k3, const T* k4, T* k5_arg )
{
	int i = blockIdx. x * blockDim. x + threadIdx. x;
	if( i < size )
		k5_arg[ i ] = u[ i ] + tau * ( 0.5 * k1[ i ] - 1.5 * k3[ i ] + 2.0 * k4[ i ] );
}

template< class T > __global__ void updateUKernel( const int size, const T tau, const T* k1, const T* k4, const T* k5, T* u )
{
	int i = blockIdx. x * blockDim. x + threadIdx. x;
	if( i < size )
		u[ i ] += tau / 6.0 * ( k1[ i ] + 4.0 * k4[ i ] + k5[ i ] );
}


#endif /* TNLMERSONSOLVERCUDA_CU_H_ */
+200 −0
Original line number Diff line number Diff line
/***************************************************************************
                          tnlMersonSolverCUDA.h  -  description
                             -------------------
    begin                : 2007/06/16
    copyright            : (C) 2007 by Tomas Oberhuber
    email                : tomas.oberhuber@fjfi.cvut.cz
 ***************************************************************************/

/***************************************************************************
 *                                                                         *
 *   This program is free software; you can redistribute it and/or modify  *
 *   it under the terms of the GNU General Public License as published by  *
 *   the Free Software Foundation; either version 2 of the License, or     *
 *   (at your option) any later version.                                   *
 *                                                                         *
 ***************************************************************************/

#ifndef tnlMersonSolverH
#define tnlMersonSolverH

#include <math.h>
#include <diff/tnlExplicitSolver.h>

#ifdef HAVE_CUDA
void computeK2Arg( const int size, const float* u, const float* k1, float* k2_arg );
void computeK2Arg( const int size, const double* u, const double* k1, double* k2_arg );
void computeK3Arg( const int size, const float* u, const float* k1, const float* k2, float* k3_arg );
void computeK3Arg( const int size, const double* u, const double* k1, const double* k2, double* k3_arg );
void computeK4Arg( const int size, const float* u, const float* k1, const float* k3, float* k4_arg );
void computeK4Arg( const int size, const double* u, const double* k1, const double* k3, double* k4_arg );
void computeK5Arg( const int size, const float* u, const float* k1, const float* k3, const float* k4, float* k5_arg );
void computeK5Arg( const int size, const double* u, const double* k1, const double* k3, const double* k4, double* k5_arg );
#endif

template< class GRID, class SCHEME, typename T = double > class tnlMersonSolver : public tnlExplicitSolver< GRID, SCHEME, T >
{
   public:

   tnlMersonSolver( const GRID& v )
   {
      k1 = new GRID( v );
      k2 = new GRID( v );
      k3 = new GRID( v );
      k4 = new GRID( v );
      k5 = new GRID( v );
      k_tmp = new GRID( v );
      if( ! k1 || ! k2 || ! k3 || ! k4 || ! k5 || ! k_tmp )
      {
         cerr << "Unable to allocate supporting structures for the Merson solver." << endl;
         abort();
      };
      k1 -> Zeros();
      k2 -> Zeros();
      k3 -> Zeros();
      k4 -> Zeros();
      k5 -> Zeros();
      k_tmp -> Zeros();
   };

   tnlString GetType() const
   {
      T t;
      GRID grid;
      return tnlString( "tnlMersonSolver< " ) + grid. GetType() + 
             tnlString( ", " ) + GetParameterType( t ) + tnlString( " >" );
   };

   void SetAdaptivity( const double& a )
   {
      adaptivity = a;
   };
   
   bool Solve( SCHEME& scheme,
               GRID& u,
               const double& stop_time,
               const double& max_res,
               const int max_iter )
   {
      T* _k1 = k1 -> Data();
      T* _k2 = k2 -> Data();
      T* _k3 = k3 -> Data();
      T* _k4 = k4 -> Data();
      T* _k5 = k5 -> Data();
      T* _k_tmp = k_tmp -> Data();
      T* _u = u. Data();
           
      tnlExplicitSolver< GRID, SCHEME, T > :: iteration = 0;
      double& _time = tnlExplicitSolver< GRID, SCHEME, T > :: time;  
      double& _residue = tnlExplicitSolver< GRID, SCHEME, T > :: residue;  
      int& _iteration = tnlExplicitSolver< GRID, SCHEME, T > :: iteration;
      const double size_inv = 1.0 / ( double ) u. GetSize();
      
      const int block_size = 512;
      const int grid_size = ( size - 1 ) / 512 + 1;

      T _tau = tnlExplicitSolver< GRID, SCHEME, T > :: tau;
      if( _time + _tau > stop_time ) _tau = stop_time - _time;
      if( _tau == 0.0 ) return true;

      if( tnlExplicitSolver< GRID, SCHEME, T > :: verbosity > 0 )
         tnlExplicitSolver< GRID, SCHEME, T > :: PrintOut();
      while( 1 )
      {

         int i;
         int size = k1 -> GetSize();
         assert( size == u. GetSize() );
         
         T tau_3 = _tau / 3.0;

         scheme. GetExplicitRHSCUDA( _time, u, *k1 );

         computeK2Arg( size, block_size, grid_size, tau, _u, _k1, _k_tmp );
         scheme. GetExplicitRHSCUDA( _time + tau_3, *k_tmp, *k2 );
         
         computeK3Arg( size, block_size, grid_size, tau, _u, _k1, _k2, _k_tmp );
         scheme. GetExplicitRHSCUDA( _time + tau_3, *k_tmp, *k3 );
         
         computeK4Arg( size, block_size, grid_size, tau, _u, _k1, _k3, _k_tmp );
         scheme. GetExplicitRHSCUDA( _time + 0.5 * _tau, *k_tmp, *k4 );
         
         computeK5Arg( size, block_size, grid_size, tau, _k1, _k3, _k4, _k_tmp );
         scheme. GetExplicitRHSCUDA( _time + _tau, *k_tmp, *k5 );
   
         double eps( 0.0 ), max_eps( 0.0 );
         if( adaptivity )
         {
            for( i = 0; i < size; i ++  )
            {
               eps = Max( eps, _tau / 3.0 * 
                                 fabs( 0.2 * _k1[ i ] +
                                      -0.9 * _k3[ i ] + 
                                       0.8 * _k4[ i ] +
                                      -0.1 * _k5[ i ] ) );
            }
            :: MPIAllreduce( eps, max_eps, 1, MPI_MAX, tnlExplicitSolver< GRID, SCHEME, T > :: solver_comm );
            //if( MPIGetRank() == 0 )
            //   cout << "eps = " << eps << "       " << endl; 
            //  
         }
         //cout << endl << "max_eps = " << max_eps << endl;
         if( ! adaptivity || max_eps < adaptivity )
         {
            double last_residue = _residue;
            double loc_residue = 0.0;

            updateU( size, tau, _k1, _k4, _k5 );
            // TODO: implement loc_residue - if possible

            if( _tau + _time == stop_time ) _residue = last_residue;  // fixing strange values of res. at the last iteration
            else
            {
                loc_residue /= _tau * size_inv;
                :: MPIAllreduce( loc_residue, _residue, 1, MPI_SUM, tnlExplicitSolver< GRID, SCHEME, T > :: solver_comm );
            }
            _time += _tau;
            _iteration ++;
         }
         if( adaptivity && max_eps != 0.0 )
         {
            _tau *= 0.8 * pow( adaptivity / max_eps, 0.2 );
            :: MPIBcast( _tau, 1, 0, tnlExplicitSolver< GRID, SCHEME, T > :: solver_comm );
         }

         if( _time + _tau > stop_time )
            _tau = stop_time - _time; //we don't want to keep such tau
         else tnlExplicitSolver< GRID, SCHEME, T > :: tau = _tau;
         
         if( tnlExplicitSolver< GRID, SCHEME, T > :: verbosity > 1 )
            tnlExplicitSolver< GRID, SCHEME, T > :: PrintOut();
         
         if( _time == stop_time || 
             ( max_res && _residue < max_res ) )
          {
            if( tnlExplicitSolver< GRID, SCHEME, T > :: verbosity > 0 )
               tnlExplicitSolver< GRID, SCHEME, T > :: PrintOut();
             return true;
          }
         //if( max_iter && _iteration == max_iter ) return false;
      }
   };

   ~tnlMersonSolver()
   {
      delete k1;
      delete k2;
      delete k3;
      delete k4;
      delete k5;
      delete k_tmp;
   };

   protected:
   
   GRID *k1, *k2, *k3, *k4, *k5, *k_tmp;

   double adaptivity;
};

#endif
Loading