diff --git a/src/Makefile.am b/src/Makefile.am index 33f159e0e6e4ad9c09a50c814142cadcf17fbc2f..c6bfa59a9a59663a9437dbda49dc55d0d7c96e27 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -56,7 +56,8 @@ check_PROGRAMS = tnl-unit-tests \ tnl_unit_tests_SOURCES = $(tnl_unit_tests_sources) tnl_unit_tests_LDADD = libtnl-0.1.la \ - core/libcore-tests.la + core/libcore-tests.la \ + diff/libdiff-tests.la tnl_benchmarks_SOURCES = $(tnl_benchmarks_sources) $(tnl_benchmarks_headers) tnl_benchmarks_LDADD = libtnl-0.1.la @@ -65,7 +66,8 @@ if BUILD_DBG bin_PROGRAMS = tnl-unit-tests-dbg tnl_unit_tests_dbg_SOURCES = $(tnl_unit_tests_sources) tnl_unit_tests_dbg_LDADD = libtnl-dbg-0.1.la \ - core/libcore-tests-dbg.la + core/libcore-tests-dbg.la \ + diff/libdiff-tests-dbg.la endif diff --git a/src/core/tnlFieldCUDA2D.h b/src/core/tnlFieldCUDA2D.h index d83e95239b2b4b371d1073836f82d2420aabe3f2..5e3d059aca48bb5b16909d774b682e075b6a2f31 100644 --- a/src/core/tnlFieldCUDA2D.h +++ b/src/core/tnlFieldCUDA2D.h @@ -42,6 +42,11 @@ template< class T > class tnlFieldCUDA2D : public tnlLongVectorCUDA< T > x_size( f. x_size ), y_size( f. y_size ) { }; + tnlFieldCUDA2D( const tnlField2D< T >& f ) + : tnlLongVectorCUDA< T >( f ), + x_size( f. GetXSize() ), y_size( f. GetYSize() ) + { }; + tnlString GetType() const { T t; diff --git a/src/core/tnlLongVectorCUDA.h b/src/core/tnlLongVectorCUDA.h index 03ee20d4ec66ef3638926650d29a60122d06a438..4a96ddb7479217b7992eee0fbe180334aca6634a 100644 --- a/src/core/tnlLongVectorCUDA.h +++ b/src/core/tnlLongVectorCUDA.h @@ -76,6 +76,26 @@ template< class T > class tnlLongVectorCUDA : public tnlObject } #endif + //! Constructor with another long vector as template + tnlLongVectorCUDA( const tnlLongVector< T >& v ) +#ifdef HAVE_CUDA + : tnlObject( v ), size( v. GetSize() ), shared_data( false ) + { + + if( cudaMalloc( ( void** ) &data, ( size + 1 ) * sizeof( T ) ) != cudaSuccess ) + { + cerr << "Unable to allocate new long vector with size " << size << " on CUDA device." << endl; + data = NULL; + abort(); + } + //data ++; + }; +#else + { + cerr << "CUDA support is missing in this system." << endl; + } +#endif + tnlString GetType() const { diff --git a/src/diff/Makefile.am b/src/diff/Makefile.am index dc2310082698a25ebec5f3c223af1cf0b09d739d..44562e8f6d6348ff9215c47c210406fe71095446 100644 --- a/src/diff/Makefile.am +++ b/src/diff/Makefile.am @@ -59,8 +59,24 @@ endif if BUILD_MPI_DBG + noinst_LTLIBRARIES += libtnldiff-mpi-dbg-0.1.la libtnldiff_mpi_dbg_0_1_la_CXXFLAGS = $(MPICXXFLAGS) $(DBGCXXFLAGS) libtnldiff_mpi_dbg_0_1_la_LDFLAGS = $(LDFLAGS) $(MPILDFLAGS) $(DBGLDFLAGS) libtnldiff_mpi_dbg_0_1_la_SOURCES = $(sources) $(headers) endif + +libdiff_tests_sources = + +if BUILD_CUDA +libdiff_tests_sources += tnlMersonSolverCUDATester.h \ + tnlMersonSolverCUDATester.cu +endif + +check_LTLIBRARIES = libdiff-tests.la +libdiff_tests_la_SOURCES = $(libdiff_tests_sources) + +if BUILD_DBG +noinst_LTLIBRARIES += libdiff-tests-dbg.la +libdiff_tests_dbg_la_SOURCES = $(libdiff_tests_sources) +endif diff --git a/src/diff/norms.h b/src/diff/norms.h index 80e1c9ffa38a09c5c5f9b16100b3d70f1e8542cf..1fa578aecd5a60a3727dfcb2d28e59323209d680 100644 --- a/src/diff/norms.h +++ b/src/diff/norms.h @@ -107,7 +107,7 @@ template< class T > T GetDiffMaxNorm( const tnlGrid2D< T >& u1, for( i = 0; i < size; i ++ ) { T diff = _u1[ i ] - _u2[ i ]; - result = Max( result, fabs( diff ) ); + result = Max( result, ( T ) fabs( diff ) ); } return result; }; diff --git a/src/diff/tnlExplicitSolver.h b/src/diff/tnlExplicitSolver.h index 59c9033b15a0bd671861bdc62029cae4cb9d07fa..6aabb585b8c9d1c13bf64e7962de5c9593a0e5e7 100644 --- a/src/diff/tnlExplicitSolver.h +++ b/src/diff/tnlExplicitSolver.h @@ -28,9 +28,9 @@ template< class GRID, class SCHEME, typename T = double > class tnlExplicitSolve tnlExplicitSolver() : iteration( 0 ), - time( 0 ), - tau( 0 ), - residue( 0 ), + time( 0.0 ), + tau( 0.0 ), + residue( 0.0 ), solver_comm( MPI_COMM_WORLD ), verbosity( 0 ), cpu_timer( &default_mcore_cpu_timer ), @@ -39,12 +39,12 @@ template< class GRID, class SCHEME, typename T = double > class tnlExplicitSolve }; - void SetTime( const double& t ) + void SetTime( const T& t ) { time = t; }; - const double& GetTime() const + const T& GetTime() const { return time; }; @@ -66,7 +66,7 @@ template< class GRID, class SCHEME, typename T = double > class tnlExplicitSolve return tau; }; - const double& GetResidue() const + const T& GetResidue() const { return residue; }; @@ -116,19 +116,19 @@ template< class GRID, class SCHEME, typename T = double > class tnlExplicitSolve virtual bool Solve( SCHEME& scheme, GRID& u, - const double& stop_time, - const double& max_res, + const T& stop_time, + const T& max_res, const int max_iter ) = 0; protected: int iteration; - double time; + T time; T tau; - double residue; + T residue; MPI_Comm solver_comm; diff --git a/src/diff/tnlGridCUDA2D.h b/src/diff/tnlGridCUDA2D.h index ce55341abd85afc2b45a0daab76c89f5afc9fc10..0a5819c6eec276686ab16c50f8d265adfd254b88 100644 --- a/src/diff/tnlGridCUDA2D.h +++ b/src/diff/tnlGridCUDA2D.h @@ -19,6 +19,7 @@ #define tnlGridCUDA2DH #include <core/tnlFieldCUDA2D.h> +#include <diff/tnlGrid2D.h> template< class T = double > class tnlGridCUDA2D : public tnlFieldCUDA2D< T > @@ -57,6 +58,15 @@ template< class T = double > class tnlGridCUDA2D : { }; + tnlGridCUDA2D( const tnlGrid2D< T >& g ) + : tnlFieldCUDA2D< T >( g ), + Ax( g. GetAx() ), Bx( g. GetBx() ), + Ay( g. GetAy() ), By( g. GetBy() ), + Hx( g. GetHx() ), Hy( g. GetHy() ) + { + }; + + tnlString GetType() const { T t; diff --git a/src/diff/tnlMersonSolver.h b/src/diff/tnlMersonSolver.h index 4fb43c012be71c22cb386df8747a43b2af7ab9c6..1d99b5b452fc8cee690911b6aa93675488a81ba7 100644 --- a/src/diff/tnlMersonSolver.h +++ b/src/diff/tnlMersonSolver.h @@ -1,8 +1,3 @@ - - - - - /*************************************************************************** tnlMersonSolver.h - description ------------------- @@ -31,6 +26,7 @@ template< class GRID, class SCHEME, typename T = double > class tnlMersonSolver public: tnlMersonSolver( const GRID& v ) + : adaptivity( 1.0e-5 ) { k1 = new GRID( v ); k2 = new GRID( v ); @@ -49,6 +45,7 @@ template< class GRID, class SCHEME, typename T = double > class tnlMersonSolver k4 -> Zeros(); k5 -> Zeros(); k_tmp -> Zeros(); + tnlExplicitSolver< GRID, SCHEME, T > :: tau = 1.0; }; tnlString GetType() const @@ -59,16 +56,16 @@ template< class GRID, class SCHEME, typename T = double > class tnlMersonSolver tnlString( ", " ) + GetParameterType( t ) + tnlString( " >" ); }; - void SetAdaptivity( const double& a ) + void SetAdaptivity( const T& a ) { adaptivity = a; }; bool Solve( SCHEME& scheme, GRID& u, - const double& stop_time, - const double& max_res, - const int max_iter ) + const T& stop_time, + const T& max_res, + const int max_iter = -1 ) { T* _k1 = k1 -> Data(); T* _k2 = k2 -> Data(); @@ -79,17 +76,18 @@ template< class GRID, class SCHEME, typename T = double > class tnlMersonSolver T* _u = u. Data(); tnlExplicitSolver< GRID, SCHEME, T > :: iteration = 0; - double& _time = tnlExplicitSolver< GRID, SCHEME, T > :: time; - double& _residue = tnlExplicitSolver< GRID, SCHEME, T > :: residue; + T& _time = tnlExplicitSolver< GRID, SCHEME, T > :: time; + T& _residue = tnlExplicitSolver< GRID, SCHEME, T > :: residue; int& _iteration = tnlExplicitSolver< GRID, SCHEME, T > :: iteration; - const double size_inv = 1.0 / ( double ) u. GetSize(); - + const T size_inv = 1.0 / ( T ) u. GetSize(); + T _tau = tnlExplicitSolver< GRID, SCHEME, T > :: tau; if( _time + _tau > stop_time ) _tau = stop_time - _time; if( _tau == 0.0 ) return true; if( tnlExplicitSolver< GRID, SCHEME, T > :: verbosity > 0 ) tnlExplicitSolver< GRID, SCHEME, T > :: PrintOut(); + while( 1 ) { @@ -129,16 +127,16 @@ template< class GRID, class SCHEME, typename T = double > class tnlMersonSolver _k_tmp[ i ] = _u[ i ] + _tau * ( 0.5 * _k1[ i ] - 1.5 * _k3[ i ] + 2.0 * _k4[ i ] ); scheme. GetExplicitRHS( _time + _tau, *k_tmp, *k5 ); - double eps( 0.0 ), max_eps( 0.0 ); + T eps( 0.0 ), max_eps( 0.0 ); if( adaptivity ) { for( i = 0; i < size; i ++ ) { - eps = Max( eps, _tau / 3.0 * + eps = Max( eps, ( T ) ( _tau / 3.0 * fabs( 0.2 * _k1[ i ] + -0.9 * _k3[ i ] + 0.8 * _k4[ i ] + - -0.1 * _k5[ i ] ) ); + -0.1 * _k5[ i ] ) ) ); } :: MPIAllreduce( eps, max_eps, 1, MPI_MAX, tnlExplicitSolver< GRID, SCHEME, T > :: solver_comm ); //if( MPIGetRank() == 0 ) @@ -148,19 +146,19 @@ template< class GRID, class SCHEME, typename T = double > class tnlMersonSolver //cout << endl << "max_eps = " << max_eps << endl; if( ! adaptivity || max_eps < adaptivity ) { - double last_residue = _residue; - double loc_residue = 0.0; + T last_residue = _residue; + T loc_residue = 0.0; #ifdef HAVE_OPENMP #pragma omp parallel for reduction(+:loc_residue) firstprivate( size, _k_tmp, _u, _k1,_tau, tau_3 ) #endif for( i = 0; i < size; i ++ ) { - // this does not have to be in double precision + // this does not have to be in T precision //_k_tmp[ i ] = ( 0.5 * ( _k1[ i ] + _k5[ i ] ) + // 2.0 * _k4[ i ] ) * tau_3; const T add = _tau / 6.0 * ( _k1[ i ] + 4.0 * _k4[ i ] + _k5[ i ] ); _u[ i ] += add; - loc_residue += fabs( ( double ) add ); + loc_residue += fabs( ( T ) add ); } if( _tau + _time == stop_time ) _residue = last_residue; // fixing strange values of res. at the last iteration else @@ -170,6 +168,7 @@ template< class GRID, class SCHEME, typename T = double > class tnlMersonSolver } _time += _tau; _iteration ++; + cout << _iteration << " "; } if( adaptivity && max_eps != 0.0 ) { @@ -182,16 +181,16 @@ template< class GRID, class SCHEME, typename T = double > class tnlMersonSolver else tnlExplicitSolver< GRID, SCHEME, T > :: tau = _tau; if( tnlExplicitSolver< GRID, SCHEME, T > :: verbosity > 1 ) - tnlExplicitSolver< GRID, SCHEME, T > :: PrintOut(); + tnlExplicitSolver< GRID, SCHEME, T > :: PrintOut(); - if( _time == stop_time || + if( _time == stop_time || ( max_res && _residue < max_res ) ) { if( tnlExplicitSolver< GRID, SCHEME, T > :: verbosity > 0 ) tnlExplicitSolver< GRID, SCHEME, T > :: PrintOut(); return true; } - //if( max_iter && _iteration == max_iter ) return false; + if( _iteration == max_iter ) return false; } }; @@ -209,7 +208,7 @@ template< class GRID, class SCHEME, typename T = double > class tnlMersonSolver GRID *k1, *k2, *k3, *k4, *k5, *k_tmp; - double adaptivity; + T adaptivity; }; #endif diff --git a/src/diff/tnlMersonSolverCUDA.cu b/src/diff/tnlMersonSolverCUDA.cu index af069d26f9772e481306cff13f39b38a0ad910f2..bcd9c098f2fadca4fbf4b379ccb187cd31a04802 100644 --- a/src/diff/tnlMersonSolverCUDA.cu +++ b/src/diff/tnlMersonSolverCUDA.cu @@ -21,7 +21,7 @@ void computeK2Arg( const int size, const int block_size, const int grid_size, - const float tau, + const float& tau, const float* u, const float* k1, float* k2_arg ) @@ -32,7 +32,7 @@ void computeK2Arg( const int size, void computeK2Arg( const int size, const int block_size, const int grid_size, - const double tau, + const double& tau, const double* u, const double* k1, double* k2_arg ) @@ -43,7 +43,7 @@ void computeK2Arg( const int size, void computeK3Arg( const int size, const int block_size, const int grid_size, - const float tau, + const float& tau, const float* u, const float* k1, const float* k2, @@ -55,7 +55,7 @@ void computeK3Arg( const int size, void computeK3Arg( const int size, const int block_size, const int grid_size, - const double tau, + const double& tau, const double* u, const double* k1, const double* k2, @@ -67,7 +67,7 @@ void computeK3Arg( const int size, void computeK4Arg( const int size, const int block_size, const int grid_size, - const float tau, + const float& tau, const float* u, const float* k1, const float* k3, @@ -79,7 +79,7 @@ void computeK4Arg( const int size, void computeK4Arg( const int size, const int block_size, const int grid_size, - const double tau, + const double& tau, const double* u, const double* k1, const double* k3, @@ -91,7 +91,7 @@ void computeK4Arg( const int size, void computeK5Arg( const int size, const int block_size, const int grid_size, - const float tau, + const float& tau, const float* u, const float* k1, const float* k3, @@ -104,7 +104,7 @@ void computeK5Arg( const int size, void computeK5Arg( const int size, const int block_size, const int grid_size, - const double tau, + const double& tau, const double* u, const double* k1, const double* k3, @@ -114,10 +114,37 @@ void computeK5Arg( const int size, computeK5ArgKernel<<< grid_size, block_size >>>( size, tau, u, k1, k3, k4, k5_arg ); } +void computeErr( const int size, + const int block_size, + const int grid_size, + const float& tau, + const float* _k1, + const float* _k3, + const float* _k4, + const float* _k5, + float* _err ) +{ + computeErrKernel<<< grid_size, block_size >>>( size, tau, _k1, _k3, _k4, _k5, _err ); +} + +void computeErr( const int size, + const int block_size, + const int grid_size, + const double& tau, + const double* _k1, + const double* _k3, + const double* _k4, + const double* _k5, + double* _err ) +{ + computeErrKernel<<< grid_size, block_size >>>( size, tau, _k1, _k3, _k4, _k5, _err ); +} + + void updateU( const int size, const int block_size, const int grid_size, - const float tau, + const float& tau, const float* k1, const float* k4, const float* k5, @@ -129,7 +156,7 @@ void updateU( const int size, void updateU( const int size, const int block_size, const int grid_size, - const double tau, + const double& tau, const double* k1, const double* k4, const double* k5, diff --git a/src/diff/tnlMersonSolverCUDA.cu.h b/src/diff/tnlMersonSolverCUDA.cu.h index 1325dbe717a2f05f7e9baf39d383c48ce20a95a1..a8ace031a9fc85d510410eda783fc059e4a2905c 100644 --- a/src/diff/tnlMersonSolverCUDA.cu.h +++ b/src/diff/tnlMersonSolverCUDA.cu.h @@ -1,13 +1,70 @@ -/* - * tnlMersonSolverCUDA.cu.h - * - * Created on: Jan 13, 2010 - * Author: Tomas Oberhuber - */ +/*************************************************************************** + tnlMersonSolverCUDATester.h + ------------------- + begin : Jan 13, 2010 + copyright : (C) 2010 by Tomas Oberhuber + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/*************************************************************************** + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU General Public License as published by * + * the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. * + * * + ***************************************************************************/ #ifndef TNLMERSONSOLVERCUDA_CU_H_ #define TNLMERSONSOLVERCUDA_CU_H_ +#ifdef HAVE_CUDA + +template< class T > __global__ void computeK2ArgKernel( const int size, const T tau, const T* u, const T* k1, T* k2_arg ) +{ + int i = blockIdx. x * blockDim. x + threadIdx. x; + if( i < size ) + k2_arg[ i ] = u[ i ] + tau * ( 1.0 / 3.0 * k1[ i ] ); +} + +template< class T > __global__ void computeK3ArgKernel( const int size, const T tau, const T* u, const T* k1, const T* k2, T* k3_arg ) +{ + int i = blockIdx. x * blockDim. x + threadIdx. x; + if( i < size ) + k3_arg[ i ] = u[ i ] + tau * 1.0 / 6.0 * ( k1[ i ] + k2[ i ] ); +} + +template< class T > __global__ void computeK4ArgKernel( const int size, const T tau, const T* u, const T* k1, const T* k3, T* k4_arg ) +{ + int i = blockIdx. x * blockDim. x + threadIdx. x; + if( i < size ) + k4_arg[ i ] = u[ i ] + tau * ( 0.125 * k1[ i ] + 0.375 * k3[ i ] ); +} + +template< class T > __global__ void computeK5ArgKernel( const int size, const T tau, const T* u, const T* k1, const T* k3, const T* k4, T* k5_arg ) +{ + int i = blockIdx. x * blockDim. x + threadIdx. x; + if( i < size ) + k5_arg[ i ] = u[ i ] + tau * ( 0.5 * k1[ i ] - 1.5 * k3[ i ] + 2.0 * k4[ i ] ); +} + +template< class T > __global__ void computeErrKernel( const int size, const T tau, const T* k1, const T* k3, const T* k4, const T* k5, T* err ) +{ + int i = blockIdx. x * blockDim. x + threadIdx. x; + if( i < size ) + err[ i ] = 1.0 / 3.0 * tau * fabs( 0.2 * k1[ i ] + + -0.9 * k3[ i ] + + 0.8 * k4[ i ] + + -0.1 * k5[ i ] ); +} + +template< class T > __global__ void updateUKernel( const int size, const T tau, const T* k1, const T* k4, const T* k5, T* u ) +{ + int i = blockIdx. x * blockDim. x + threadIdx. x; + if( i < size ) + u[ i ] += tau / 6.0 * ( k1[ i ] + 4.0 * k4[ i ] + k5[ i ] ); +} +#endif #endif /* TNLMERSONSOLVERCUDA_CU_H_ */ diff --git a/src/diff/tnlMersonSolverCUDA.h b/src/diff/tnlMersonSolverCUDA.h index e981d054c787b948b53246de37c8d4f14897455e..475a4d44f6aeb25e2d86a4292b2a24c20d6c4953 100644 --- a/src/diff/tnlMersonSolverCUDA.h +++ b/src/diff/tnlMersonSolverCUDA.h @@ -15,30 +15,32 @@ * * ***************************************************************************/ -#ifndef tnlMersonSolverH -#define tnlMersonSolverH +#ifndef tnlMersonSolverCUDAH +#define tnlMersonSolverCUDAH #include <math.h> -#include <core/tnl-cuda-kernels.h> +#include <core/tnl-cuda-kernels.cu.h> #include <diff/tnlExplicitSolver.h> #ifdef HAVE_CUDA -// TODO: remove this -/*void computeK2Arg( const int size, +void computeK2Arg( const int size, const int block_size, const int grid_size, + const float& tau, const float* u, const float* k1, float* k2_arg ); void computeK2Arg( const int size, const int block_size, const int grid_size, + const double& tau, const double* u, const double* k1, double* k2_arg ); void computeK3Arg( const int size, const int block_size, const int grid_size, + const float& tau, const float* u, const float* k1, const float* k2, @@ -46,6 +48,7 @@ void computeK3Arg( const int size, void computeK3Arg( const int size, const int block_size, const int grid_size, + const double& tau, const double* u, const double* k1, const double* k2, @@ -53,6 +56,7 @@ void computeK3Arg( const int size, void computeK4Arg( const int size, const int block_size, const int grid_size, + const float& tau, const float* u, const float* k1, const float* k3, @@ -60,6 +64,7 @@ void computeK4Arg( const int size, void computeK4Arg( const int size, const int block_size, const int grid_size, + const double& tau, const double* u, const double* k1, const double* k3, @@ -67,6 +72,7 @@ void computeK4Arg( const int size, void computeK5Arg( const int size, const int block_size, const int grid_size, + const float& tau, const float* u, const float* k1, const float* k3, @@ -75,15 +81,34 @@ void computeK5Arg( const int size, void computeK5Arg( const int size, const int block_size, const int grid_size, + const double& tau, const double* u, const double* k1, const double* k3, const double* k4, double* k5_arg ); +void computeErr( const int size, + const int block_size, + const int grid_size, + const float& tau, + const float* _k1, + const float* _k3, + const float* _k4, + const float* _k5, + float* _err ); +void computeErr( const int size, + const int block_size, + const int grid_size, + const double& tau, + const double* _k1, + const double* _k3, + const double* _k4, + const double* _k5, + double* _err ); void updateU( const int size, const int block_size, const int grid_size, - const float tau, + const float& tau, const float* k1, const float* k4, const float* k5, @@ -91,56 +116,12 @@ void updateU( const int size, void updateU( const int size, const int block_size, const int grid_size, - const double tau, + const double& tau, const double* k1, const double* k4, const double* k5, - double* u );*/ - -template< class T > __global__ void computeK2ArgKernel( const int size, const T tau, const T* u, const T* k1, T* k2_arg ) -{ - int i = blockIdx. x * blockDim. x + threadIdx. x; - if( i < size ) - k2_arg[ i ] = u[ i ] + tau * ( 1.0 / 3.0 * k1[ i ] ); -} + double* u ); -template< class T > __global__ void computeK3ArgKernel( const int size, const T tau, const T* u, const T* k1, const T* k2, T* k3_arg ) -{ - int i = blockIdx. x * blockDim. x + threadIdx. x; - if( i < size ) - k3_arg[ i ] = u[ i ] + tau * 1.0 / 6.0 * ( k1[ i ] + k2[ i ] ); -} - -template< class T > __global__ void computeK4ArgKernel( const int size, const T tau, const T* u, const T* k1, const T* k3, T* k4_arg ) -{ - int i = blockIdx. x * blockDim. x + threadIdx. x; - if( i < size ) - k4_arg[ i ] = u[ i ] + tau * ( 0.125 * k1[ i ] + 0.375 * k3[ i ] ); -} - -template< class T > __global__ void computeK5ArgKernel( const int size, const T tau, const T* u, const T* k1, const T* k3, const T* k4, T* k5_arg ) -{ - int i = blockIdx. x * blockDim. x + threadIdx. x; - if( i < size ) - k5_arg[ i ] = u[ i ] + tau * ( 0.5 * k1[ i ] - 1.5 * k3[ i ] + 2.0 * k4[ i ] ); -} - -template< class T > __global__ void computeErrKernel( const int size, const T tau, const T* _k1, const T* _k3, const T* _k4, const T* _k5, T* err ) -{ - int i = blockIdx. x * blockDim. x + threadIdx. x; - if( i < size ) - err[ i ] = 1.0 / 3.0 * tau * fabs( 0.2 * k1[ i ] + - -0.9 * k3[ i ] + - 0.8 * k4[ i ] + - -0.1 * k5[ i ] ) ); -} - -template< class T > __global__ void updateUKernel( const int size, const T tau, const T* k1, const T* k4, const T* k5, T* u ) -{ - int i = blockIdx. x * blockDim. x + threadIdx. x; - if( i < size ) - u[ i ] += tau / 6.0 * ( k1[ i ] + 4.0 * k4[ i ] + k5[ i ] ); -} #endif @@ -148,34 +129,33 @@ template< class GRID, class SCHEME, typename T = double > class tnlMersonSolverC { public: - tnlMersonSolver( const GRID& v ) + tnlMersonSolverCUDA( const GRID& v ) + : adaptivity( 1.0e-5 ) { - k1. SetNewDimension( v ); +#ifdef HAVE_CUDA + k1. SetNewDimensions( v ); k1. SetNewDomain( v ); - k2. SetNewDimension( v ); + k2. SetNewDimensions( v ); k2. SetNewDomain( v ); - k3. SetNewDimension( v ); + k3. SetNewDimensions( v ); k3. SetNewDomain( v ); - k4. SetNewDimension( v ); + k4. SetNewDimensions( v ); k4. SetNewDomain( v ); - k5. SetNewDimension( v ); + k5. SetNewDimensions( v ); k5. SetNewDomain( v ); - k_tmp. SetNewDimension( v ); + k_tmp. SetNewDimensions( v ); k_tmp. SetNewDomain( v ); - err. SetNewDimension( v ); + err. SetNewDimensions( v ); err. SetNewDomain( v ); if( ! k1 || ! k2 || ! k3 || ! k4 || ! k5 || ! k_tmp || ! err ) { cerr << "Unable to allocate supporting structures for the Merson solver." << endl; abort(); }; - k1. Zeros(); - k2. Zeros(); - k3. Zeros(); - k4. Zeros(); - k5. Zeros(); - k_tmp. Zeros(); - err. Zeros(); + tnlExplicitSolver< GRID, SCHEME, T > :: tau = 1.0; +#else + cerr << "CUDA is not supported on this system." << endl; +#endif }; tnlString GetType() const @@ -186,17 +166,18 @@ template< class GRID, class SCHEME, typename T = double > class tnlMersonSolverC tnlString( ", " ) + GetParameterType( t ) + tnlString( " >" ); }; - void SetAdaptivity( const double& a ) + void SetAdaptivity( const T& a ) { adaptivity = a; }; bool Solve( SCHEME& scheme, GRID& u, - const double& stop_time, - const double& max_res, - const int max_iter ) + const T& stop_time, + const T& max_res, + const int max_iter = -1 ) { +#ifdef HAVE_CUDA T* _k1 = k1. Data(); T* _k2 = k2. Data(); T* _k3 = k3. Data(); @@ -207,11 +188,12 @@ template< class GRID, class SCHEME, typename T = double > class tnlMersonSolverC T* _err = err. Data(); tnlExplicitSolver< GRID, SCHEME, T > :: iteration = 0; - double& _time = tnlExplicitSolver< GRID, SCHEME, T > :: time; - double& _residue = tnlExplicitSolver< GRID, SCHEME, T > :: residue; + T& _time = tnlExplicitSolver< GRID, SCHEME, T > :: time; + T& _residue = tnlExplicitSolver< GRID, SCHEME, T > :: residue; int& _iteration = tnlExplicitSolver< GRID, SCHEME, T > :: iteration; - const double size_inv = 1.0 / ( double ) u. GetSize(); + const T size_inv = 1.0 / ( T ) u. GetSize(); + const int size = u. GetSize(); const int block_size = 512; const int grid_size = ( size - 1 ) / block_size + 1; @@ -221,42 +203,57 @@ template< class GRID, class SCHEME, typename T = double > class tnlMersonSolverC if( tnlExplicitSolver< GRID, SCHEME, T > :: verbosity > 0 ) tnlExplicitSolver< GRID, SCHEME, T > :: PrintOut(); + + // DEBUG + /*tnlGrid2D< T > hostAux; + hostAux. SetNewDimensions( u. GetXSize(), u. GetYSize() ); + hostAux. SetNewDomain( u. GetAx(), u. GetBx(), u. GetAy(), u. GetBy() );*/ + while( 1 ) { int i; - int size = k1 -> GetSize(); + int size = k1. GetSize(); assert( size == u. GetSize() ); T tau_3 = _tau / 3.0; - scheme. GetExplicitRHSCUDA( _time, u, *k1 ); + scheme. GetExplicitRHSCUDA( _time, u, k1 ); - computeK2ArgKernel<<< block_size, grid_size >>>( size, tau, _u, _k1, _k_tmp ); - scheme. GetExplicitRHSCUDA( _time + tau_3, *k_tmp, *k2 ); + computeK2Arg( size, block_size, grid_size, _tau, _u, _k1, _k_tmp ); + scheme. GetExplicitRHSCUDA( _time + tau_3, k_tmp, k2 ); - computeK3ArgKernel<<< block_size, grid_size >>>( size, tau, _u, _k1, _k2, _k_tmp ); - scheme. GetExplicitRHSCUDA( _time + tau_3, *k_tmp, *k3 ); + computeK3Arg( size, block_size, grid_size, _tau, _u, _k1, _k2, _k_tmp ); + scheme. GetExplicitRHSCUDA( _time + tau_3, k_tmp, k3 ); - computeK4ArgKernel<<< block_size, grid_size >>>( size, tau, _u, _k1, _k3, _k_tmp ); - scheme. GetExplicitRHSCUDA( _time + 0.5 * _tau, *k_tmp, *k4 ); + computeK4Arg( size, block_size, grid_size, _tau, _u, _k1, _k3, _k_tmp ); + scheme. GetExplicitRHSCUDA( _time + 0.5 * _tau, k_tmp, k4 ); - computeK5Arg<<< block_size, grid_size >>>( size, tau, _k1, _k3, _k4, _k_tmp ); - scheme. GetExplicitRHSCUDA( _time + _tau, *k_tmp, *k5 ); + computeK5Arg( size, block_size, grid_size, _tau, _u, _k1, _k3, _k4, _k_tmp ); + scheme. GetExplicitRHSCUDA( _time + _tau, k_tmp, k5 ); - double eps( 0.0 ), max_eps( 0.0 ); + T eps( 0.0 ), max_eps( 0.0 ); if( adaptivity ) { - computeErrKernel<<< block_size, grid_size >>>( size, tau, _k1, _k3, _k4, _k5, _err ); - tnlCUDAReduction( size, err, max_eps ); // TODO: allocate auxiliary array for the reduction + computeErr( size, block_size, grid_size, _tau, _k1, _k3, _k4, _k5, _err ); + tnlCUDAReductionMax( size, _err, max_eps ); // TODO: allocate auxiliary array for the reduction + + /*hostAux. copyFrom( err ); + T seq_max_eps = 0.0; + for( int i = 0; i < hostAux. GetSize(); i ++ ) + seq_max_eps = :: Max( seq_max_eps, hostAux[ i ] ); + if( max_eps != seq_max_eps ) + abort();*/ + } + //cout << endl << "max_eps = " << max_eps << endl; if( ! adaptivity || max_eps < adaptivity ) { - double last_residue = _residue; - double loc_residue = 0.0; + T last_residue = _residue; + T loc_residue = 0.0; - updateU( size, block_size, grid_size, tau, _k1, _k4, _k5 ); + updateU( size, block_size, grid_size, _tau, _k1, _k4, _k5, _u ); // TODO: implement loc_residue - if possible if( _tau + _time == stop_time ) _residue = last_residue; // fixing strange values of res. at the last iteration @@ -288,11 +285,15 @@ template< class GRID, class SCHEME, typename T = double > class tnlMersonSolverC tnlExplicitSolver< GRID, SCHEME, T > :: PrintOut(); return true; } - //if( max_iter && _iteration == max_iter ) return false; + if( _iteration == max_iter ) return false; } +#else + cerr << "CUDA is not supported on this system." << endl; + return false; +#endif }; - ~tnlMersonSolver() + ~tnlMersonSolverCUDA() { }; @@ -300,7 +301,7 @@ template< class GRID, class SCHEME, typename T = double > class tnlMersonSolverC GRID k1, k2, k3, k4, k5, k_tmp, err; - double adaptivity; + T adaptivity; }; #endif diff --git a/src/diff/tnlMersonSolverCUDATester.cu b/src/diff/tnlMersonSolverCUDATester.cu new file mode 100644 index 0000000000000000000000000000000000000000..a27c22ac494aed5dd443ad8f37d3a00796d8ab2c --- /dev/null +++ b/src/diff/tnlMersonSolverCUDATester.cu @@ -0,0 +1,50 @@ +/*************************************************************************** + tnlMersonSolverCUDATester.cu + ------------------- + begin : Feb 2, 2010 + copyright : (C) 2009 by Tomas Oberhuber + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/*************************************************************************** + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU General Public License as published by * + * the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. * + * * + ***************************************************************************/ + + #include <diff/tnlMersonSolverCUDATester.cu.h> + + void heatEquationRHS( const int gridDimX, + const int gridDimY, + const int blockDimX, + const int blockDimY, + const int xSize, + const int ySize, + const float& hX, + const float& hY, + const float* u, + float* fu ) +{ + dim3 gridDim( gridDimX, gridDimY ); + dim3 blockDim( blockDimX, blockDimY ); + heatEquationRHSKernel<<< gridDim, blockDim >>>( xSize, ySize, hX, hY, u, fu ); +} + +void heatEquationRHS( const int gridDimX, + const int gridDimY, + const int blockDimX, + const int blockDimY, + const int xSize, + const int ySize, + const double& hX, + const double& hY, + const double* u, + double* fu ) +{ + dim3 gridDim( gridDimX, gridDimY ); + dim3 blockDim( blockDimX, blockDimY ); + heatEquationRHSKernel<<< gridDim, blockDim >>>( xSize, ySize, hX, hY, u, fu ); +} \ No newline at end of file diff --git a/src/diff/tnlMersonSolverCUDATester.cu.h b/src/diff/tnlMersonSolverCUDATester.cu.h new file mode 100644 index 0000000000000000000000000000000000000000..dab468ce705da57a05069e09ecb0d1dbe74e262d --- /dev/null +++ b/src/diff/tnlMersonSolverCUDATester.cu.h @@ -0,0 +1,43 @@ +/*************************************************************************** + tnlMersonSolverCUDATester.cu.h + ------------------- + begin : Feb 2, 2010 + copyright : (C) 2009 by Tomas Oberhuber + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/*************************************************************************** + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU General Public License as published by * + * the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. * + * * + ***************************************************************************/ + + + +#ifndef TNLMERSONSOLVERCUDATESTER_CU_H_ +#define TNLMERSONSOLVERCUDATESTER_CU_H_ + +template< class T > __global__ void heatEquationRHSKernel( const int xSize, + const int ySize, + const T hX, + const T hY, + const T* u, + T* fu ) +{ + const int i = blockIdx. x * blockDim. x + threadIdx. x; + const int j = blockIdx. y * blockDim. y + threadIdx. y; + + if( i == 0 || j == 0 || i == xSize - 1 || j == ySize - 1 ) + fu[ xSize * j + i ] = 0.0; + else + if( i < xSize && j < ySize ) + fu[ xSize * j + i ] = ( u[ xSize * j + i + 1 ] - 2.0 * u[ xSize * j + i ] + u[ xSize * j + i - 1] ) / ( hX * hX ) + + ( u[ xSize * j + i + xSize ] - 2.0 * u[ xSize * j + i ] + u[ xSize * j + i - xSize ] ) / ( hY * hY ); + +} + + +#endif /* TNLMERSONSOLVERCUDATESTER_CU_H_ */ diff --git a/src/diff/tnlMersonSolverCUDATester.h b/src/diff/tnlMersonSolverCUDATester.h new file mode 100644 index 0000000000000000000000000000000000000000..9f6fca71c13c2ce4ad171536798cf3797e380a40 --- /dev/null +++ b/src/diff/tnlMersonSolverCUDATester.h @@ -0,0 +1,200 @@ +/*************************************************************************** + tnlMersonSolverCUDATester.h + ------------------- + begin : Feb 1, 2010 + copyright : (C) 2010 by Tomas Oberhuber + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/*************************************************************************** + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU General Public License as published by * + * the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. * + * * + ***************************************************************************/ + +#ifndef TNLMERSONSOLVERCUDATESTER_H_ +#define TNLMERSONSOLVERCUDATESTER_H_ + +#include <diff/tnlMersonSolver.h> +#include <diff/tnlMersonSolverCUDA.h> +#include <diff/drawGrid2D.h> +#include <diff/norms.h> +#include <core/mfilename.h> +#include <cppunit/TestSuite.h> +#include <cppunit/TestResult.h> +#include <cppunit/TestCaller.h> +#include <cppunit/TestCase.h> + + void heatEquationRHS( const int gridDimX, + const int gridDimY, + const int blockDimX, + const int blockDimY, + const int xSize, + const int ySize, + const float& hX, + const float& hY, + const float* u, + float* fu ); + +void heatEquationRHS( const int gridDimX, + const int gridDimY, + const int blockDimX, + const int blockDimY, + const int xSize, + const int ySize, + const double& hX, + const double& hY, + const double* u, + double* fu ); + + +template< class T > class tnlMersonSolverCUDATester : public CppUnit :: TestCase +{ + public: + + static CppUnit :: Test* suite() + { + CppUnit :: TestSuite* suiteOfTests = new CppUnit :: TestSuite( "tnlMersonSolverCUDATester" ); + CppUnit :: TestResult result; + + T param; + tnlString test_name = tnlString( "testUpdateU< " ) + GetParameterType( param ) + tnlString( " >" ); + suiteOfTests -> addTest( new CppUnit :: TestCaller< tnlMersonSolverCUDATester< T > >( + test_name. Data(), + & tnlMersonSolverCUDATester< T > :: testUpdateU ) + ); + + return suiteOfTests; + }; + + void GetExplicitRHS( const T& time, + tnlGrid2D< T >& u, + tnlGrid2D< T >& fu ) + { + const int xSize = u. GetXSize(); + const int ySize = u. GetYSize(); + for( int i = 0; i < xSize; i ++ ) + for( int j = 0; j < ySize; j ++ ) + { + if( i == 0 || j == 0 || i == xSize - 1 || j == ySize - 1 ) + fu( i, j ) = 0.0; + else + fu( i, j ) = u. Partial_xx( i, j ) + u. Partial_yy( i, j ); + } + } + + void GetExplicitRHSCUDA( const T& time, + tnlGridCUDA2D< T >& u, + tnlGridCUDA2D< T >& fu ) + { + const int xSize = u. GetXSize(); + const int ySize = u. GetYSize(); + const int desBlockXSize = 16; + const int desBlockYSize = 16; + const int gridXSize = xSize / desBlockXSize + ( xSize % desBlockXSize != 0 ); + const int gridYSize = ySize / desBlockYSize + ( ySize % desBlockYSize != 0 ); + heatEquationRHS( gridXSize, + gridYSize, + desBlockXSize, + desBlockYSize, + xSize, + ySize, + u. GetHx(), + u. GetHy(), + u. Data(), + fu. Data() ); + /*tnlGrid2D< T > hostU; + hostU. SetNewDimensions( u. GetXSize(), u. GetYSize() ); + hostU. SetNewDomain( u. GetAx(), u. GetBx(), u. GetAy(), u. GetBy() ); + tnlGrid2D< T > hostFu( hostU ); + hostU. copyFrom( u ); + hostFu. copyFrom( fu ); + Draw( hostU, "device-u", "gnuplot" ); + Draw( hostFu, "device-fu", "gnuplot" ); + + GetExplicitRHS( time, hostU, hostFu ); + u. copyFrom( hostU ); + fu. copyFrom( hostFu ); + Draw( hostU, "host-u", "gnuplot" ); + Draw( hostFu, "host-fu", "gnuplot" ); + getchar();*/ + } + + + void testUpdateU() + { + const int size = 1024; + + tnlGrid2D< T > hostU; + hostU. SetNewDimensions( size, size ); + hostU. SetNewDomain( 0.0, 1.0, 0.0, 1.0 ); + const T hx = hostU. GetHx(); + const T hy = hostU. GetHy(); + for( int i = 0; i < size; i ++ ) + for( int j = 0; j < size; j ++ ) + { + T x = i * hx - 0.5; + T y = j * hy - 0.5; + hostU( i, j ) = Sign( 0.25 - sqrt( x * x + y * y ) ); + } + Draw( hostU, "u-ini", "gnuplot" ); + + tnlGridCUDA2D< T > deviceU( hostU ); + deviceU. copyFrom( hostU ); + + tnlGrid2D< T > hostAuxU( hostU ); + + tnlMersonSolver< tnlGrid2D< T >, tnlMersonSolverCUDATester, T > mersonSolver( hostU ); + mersonSolver. SetVerbosity( 2 ); + mersonSolver. SetTime( 0.0 ); + mersonSolver. SetTau( 1.0 ); + + tnlMersonSolverCUDA< tnlGridCUDA2D< T >, tnlMersonSolverCUDATester, T > mersonSolverCUDA( deviceU ); + mersonSolverCUDA. SetVerbosity( 2 ); + mersonSolverCUDA. SetTime( 0.0 ); + mersonSolverCUDA. SetTau( 1.0 ); + + const T finalTime = 0.1; + const T tau = 0.01; + T time = tau; + int iteration( 0 ); + while( time < finalTime ) + { + iteration ++; + //cout << "Starting the Merson solver with stop time " << time << endl; + //mersonSolver. Solve( *this, hostU, time, 0.0 ); + //cout << "Starting the CUDA Merson solver with stop time " << time << endl; + mersonSolverCUDA. Solve( *this, deviceU, time, 0.0 ); + + hostAuxU. copyFrom( deviceU ); + /*T l1Norm = GetDiffL1Norm( hostU, hostAuxU ); + T l2Norm = GetDiffL2Norm( hostU, hostAuxU ); + T maxNorm = GetDiffMaxNorm( hostU, hostAuxU ); + cout << endl; + cout << "Errors: L1 " << l1Norm << " L2 " << l2Norm << " max." << maxNorm << endl;*/ + + cout << "Writing file ... "; + tnlString fileName; + FileNameBaseNumberEnding( + "u", + iteration, + 5, + ".gplt", + fileName ); + cout << fileName << endl; + Draw( hostAuxU, fileName. Data(), "gnuplot" ); + + + time += tau; + } + + + + + }; +}; + +#endif /* TNLMERSONSOLVERCUDATESTER_H_ */ diff --git a/src/tnl-unit-tests.cpp b/src/tnl-unit-tests.cpp index fcff25535f63b7bab841dce0cc19be6f786926b7..ddc07d5f97da412c63a315497bb0a397ced81491 100644 --- a/src/tnl-unit-tests.cpp +++ b/src/tnl-unit-tests.cpp @@ -22,6 +22,7 @@ #include <core/tnlFieldCUDA2DTester.h> #include <core/tnlCUDAKernelsTester.h> #include <diff/tnlGridCUDA2DTester.h> +#include <diff/tnlMersonSolverCUDATester.h> #include <iostream> @@ -32,7 +33,7 @@ int main( int argc, char* argv[] ) CppUnit :: TextTestRunner runner; #ifdef HAVE_CUDA - runner.addTest( tnlLongVectorCUDATester< int > :: suite() ); +/* runner.addTest( tnlLongVectorCUDATester< int > :: suite() ); runner.addTest( tnlLongVectorCUDATester< float > :: suite() ); if( CUDA_ARCH == 1.3 ) runner.addTest( tnlLongVectorCUDATester< double > :: suite() ); @@ -50,7 +51,13 @@ int main( int argc, char* argv[] ) runner.addTest( tnlCUDAKernelsTester< int > :: suite() ); runner.addTest( tnlCUDAKernelsTester< float > :: suite() ); if( CUDA_ARCH == 1.3 ) - runner.addTest( tnlCUDAKernelsTester< double > :: suite() ); + runner.addTest( tnlCUDAKernelsTester< double > :: suite() );*/ + + runner.addTest( tnlMersonSolverCUDATester< float > :: suite() ); + if( CUDA_ARCH == 1.3 ) + runner.addTest( tnlMersonSolverCUDATester< double > :: suite() ); + + #endif runner.run();