From 230234846e7ea0652a7bf152a411b1a795295756 Mon Sep 17 00:00:00 2001
From: Tomas Oberhuber <tomas.oberhuber@fjfi.cvut.cz>
Date: Wed, 3 Feb 2010 15:40:40 +0000
Subject: [PATCH] The CUDA version of the Merson solver is implemented.

---
 src/Makefile.am                         |   6 +-
 src/core/tnlFieldCUDA2D.h               |   5 +
 src/core/tnlLongVectorCUDA.h            |  20 +++
 src/diff/Makefile.am                    |  16 ++
 src/diff/norms.h                        |   2 +-
 src/diff/tnlExplicitSolver.h            |  20 +--
 src/diff/tnlGridCUDA2D.h                |  10 ++
 src/diff/tnlMersonSolver.h              |  47 +++---
 src/diff/tnlMersonSolverCUDA.cu         |  47 ++++--
 src/diff/tnlMersonSolverCUDA.cu.h       |  69 +++++++-
 src/diff/tnlMersonSolverCUDA.h          | 187 +++++++++++-----------
 src/diff/tnlMersonSolverCUDATester.cu   |  50 ++++++
 src/diff/tnlMersonSolverCUDATester.cu.h |  43 +++++
 src/diff/tnlMersonSolverCUDATester.h    | 200 ++++++++++++++++++++++++
 src/tnl-unit-tests.cpp                  |  11 +-
 15 files changed, 585 insertions(+), 148 deletions(-)
 create mode 100644 src/diff/tnlMersonSolverCUDATester.cu
 create mode 100644 src/diff/tnlMersonSolverCUDATester.cu.h
 create mode 100644 src/diff/tnlMersonSolverCUDATester.h

diff --git a/src/Makefile.am b/src/Makefile.am
index 33f159e0e6..c6bfa59a9a 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -56,7 +56,8 @@ check_PROGRAMS = tnl-unit-tests \
                   
 tnl_unit_tests_SOURCES = $(tnl_unit_tests_sources)
 tnl_unit_tests_LDADD = libtnl-0.1.la \
-                       core/libcore-tests.la
+                       core/libcore-tests.la \
+                       diff/libdiff-tests.la
 
 tnl_benchmarks_SOURCES = $(tnl_benchmarks_sources) $(tnl_benchmarks_headers)
 tnl_benchmarks_LDADD = libtnl-0.1.la
@@ -65,7 +66,8 @@ if BUILD_DBG
 bin_PROGRAMS = tnl-unit-tests-dbg
 tnl_unit_tests_dbg_SOURCES = $(tnl_unit_tests_sources)
 tnl_unit_tests_dbg_LDADD = libtnl-dbg-0.1.la \
-                           core/libcore-tests-dbg.la
+                           core/libcore-tests-dbg.la \
+                           diff/libdiff-tests-dbg.la
 endif
 
 
diff --git a/src/core/tnlFieldCUDA2D.h b/src/core/tnlFieldCUDA2D.h
index d83e95239b..5e3d059aca 100644
--- a/src/core/tnlFieldCUDA2D.h
+++ b/src/core/tnlFieldCUDA2D.h
@@ -42,6 +42,11 @@ template< class T > class tnlFieldCUDA2D : public tnlLongVectorCUDA< T >
      x_size( f. x_size ), y_size( f. y_size )
    { };
 
+   tnlFieldCUDA2D( const tnlField2D< T >& f )
+      : tnlLongVectorCUDA< T >( f ),
+        x_size( f. GetXSize() ), y_size( f. GetYSize() )
+      { };
+
    tnlString GetType() const
    {
       T t;
diff --git a/src/core/tnlLongVectorCUDA.h b/src/core/tnlLongVectorCUDA.h
index 03ee20d4ec..4a96ddb747 100644
--- a/src/core/tnlLongVectorCUDA.h
+++ b/src/core/tnlLongVectorCUDA.h
@@ -76,6 +76,26 @@ template< class T > class tnlLongVectorCUDA : public tnlObject
    }
 #endif
 
+   //! Constructor with another long vector as template
+   tnlLongVectorCUDA( const tnlLongVector< T >& v )
+#ifdef HAVE_CUDA
+    : tnlObject( v ), size( v. GetSize() ), shared_data( false )
+   {
+
+      if( cudaMalloc( ( void** ) &data, ( size + 1 ) * sizeof( T ) ) != cudaSuccess )
+      {
+         cerr << "Unable to allocate new long vector with size " << size << " on CUDA device." << endl;
+         data = NULL;
+         abort();
+      }
+      //data ++;
+   };
+#else
+   {
+      cerr << "CUDA support is missing in this system." << endl;
+   }
+#endif
+
 
    tnlString GetType() const
    {
diff --git a/src/diff/Makefile.am b/src/diff/Makefile.am
index dc23100826..44562e8f6d 100644
--- a/src/diff/Makefile.am
+++ b/src/diff/Makefile.am
@@ -59,8 +59,24 @@ endif
 
 
 if BUILD_MPI_DBG
+
    noinst_LTLIBRARIES += libtnldiff-mpi-dbg-0.1.la 
    libtnldiff_mpi_dbg_0_1_la_CXXFLAGS = $(MPICXXFLAGS) $(DBGCXXFLAGS)
    libtnldiff_mpi_dbg_0_1_la_LDFLAGS = $(LDFLAGS) $(MPILDFLAGS) $(DBGLDFLAGS)
    libtnldiff_mpi_dbg_0_1_la_SOURCES = $(sources) $(headers)
 endif
+
+libdiff_tests_sources = 
+
+if BUILD_CUDA
+libdiff_tests_sources += tnlMersonSolverCUDATester.h \
+                         tnlMersonSolverCUDATester.cu
+endif                        
+
+check_LTLIBRARIES = libdiff-tests.la
+libdiff_tests_la_SOURCES = $(libdiff_tests_sources)
+
+if BUILD_DBG
+noinst_LTLIBRARIES += libdiff-tests-dbg.la
+libdiff_tests_dbg_la_SOURCES = $(libdiff_tests_sources)
+endif 
diff --git a/src/diff/norms.h b/src/diff/norms.h
index 80e1c9ffa3..1fa578aecd 100644
--- a/src/diff/norms.h
+++ b/src/diff/norms.h
@@ -107,7 +107,7 @@ template< class T > T GetDiffMaxNorm( const tnlGrid2D< T >& u1,
    for( i = 0; i < size; i ++ )
    {
       T diff = _u1[ i ] - _u2[ i ];
-      result = Max( result, fabs( diff ) );
+      result = Max( result, ( T ) fabs( diff ) );
    }
    return result;
 };
diff --git a/src/diff/tnlExplicitSolver.h b/src/diff/tnlExplicitSolver.h
index 59c9033b15..6aabb585b8 100644
--- a/src/diff/tnlExplicitSolver.h
+++ b/src/diff/tnlExplicitSolver.h
@@ -28,9 +28,9 @@ template< class GRID, class SCHEME, typename T = double > class tnlExplicitSolve
    
    tnlExplicitSolver()
    :  iteration( 0 ), 
-      time( 0 ),
-      tau( 0 ),
-      residue( 0 ),
+      time( 0.0 ),
+      tau( 0.0 ),
+      residue( 0.0 ),
       solver_comm( MPI_COMM_WORLD ),
       verbosity( 0 ),
       cpu_timer( &default_mcore_cpu_timer ),
@@ -39,12 +39,12 @@ template< class GRID, class SCHEME, typename T = double > class tnlExplicitSolve
       };
 
 
-   void SetTime( const double& t )
+   void SetTime( const T& t )
    {
       time = t;
    };
 
-   const double& GetTime() const
+   const T& GetTime() const
    {
       return time;
    };
@@ -66,7 +66,7 @@ template< class GRID, class SCHEME, typename T = double > class tnlExplicitSolve
       return tau;
    };
 
-   const double& GetResidue() const
+   const T& GetResidue() const
    {
       return residue;
    };
@@ -116,19 +116,19 @@ template< class GRID, class SCHEME, typename T = double > class tnlExplicitSolve
    
    virtual bool Solve( SCHEME& scheme,
                        GRID& u,
-                       const double& stop_time,
-                       const double& max_res,
+                       const T& stop_time,
+                       const T& max_res,
                        const int max_iter ) = 0;
      
    protected:
     
    int iteration;
 
-   double time;
+   T time;
 
    T tau;
 
-   double residue;
+   T residue;
 
    MPI_Comm solver_comm;
 
diff --git a/src/diff/tnlGridCUDA2D.h b/src/diff/tnlGridCUDA2D.h
index ce55341abd..0a5819c6ee 100644
--- a/src/diff/tnlGridCUDA2D.h
+++ b/src/diff/tnlGridCUDA2D.h
@@ -19,6 +19,7 @@
 #define tnlGridCUDA2DH
 
 #include <core/tnlFieldCUDA2D.h>
+#include <diff/tnlGrid2D.h>
 
 template< class T = double > class tnlGridCUDA2D :
 	                         public tnlFieldCUDA2D< T >
@@ -57,6 +58,15 @@ template< class T = double > class tnlGridCUDA2D :
    {
    };
 
+   tnlGridCUDA2D( const tnlGrid2D< T >& g )
+   : tnlFieldCUDA2D< T >( g ),
+     Ax( g. GetAx() ), Bx( g. GetBx() ),
+     Ay( g. GetAy() ), By( g. GetBy() ),
+     Hx( g. GetHx() ), Hy( g. GetHy() )
+   {
+   };
+
+
    tnlString GetType() const
    {
       T t;
diff --git a/src/diff/tnlMersonSolver.h b/src/diff/tnlMersonSolver.h
index 4fb43c012b..1d99b5b452 100644
--- a/src/diff/tnlMersonSolver.h
+++ b/src/diff/tnlMersonSolver.h
@@ -1,8 +1,3 @@
-
-
-
-
-
 /***************************************************************************
                           tnlMersonSolver.h  -  description
                              -------------------
@@ -31,6 +26,7 @@ template< class GRID, class SCHEME, typename T = double > class tnlMersonSolver
    public:
 
    tnlMersonSolver( const GRID& v )
+   : adaptivity( 1.0e-5 )
    {
       k1 = new GRID( v );
       k2 = new GRID( v );
@@ -49,6 +45,7 @@ template< class GRID, class SCHEME, typename T = double > class tnlMersonSolver
       k4 -> Zeros();
       k5 -> Zeros();
       k_tmp -> Zeros();
+      tnlExplicitSolver< GRID, SCHEME, T > :: tau = 1.0;
    };
 
    tnlString GetType() const
@@ -59,16 +56,16 @@ template< class GRID, class SCHEME, typename T = double > class tnlMersonSolver
              tnlString( ", " ) + GetParameterType( t ) + tnlString( " >" );
    };
 
-   void SetAdaptivity( const double& a )
+   void SetAdaptivity( const T& a )
    {
       adaptivity = a;
    };
    
    bool Solve( SCHEME& scheme,
                GRID& u,
-               const double& stop_time,
-               const double& max_res,
-               const int max_iter )
+               const T& stop_time,
+               const T& max_res,
+               const int max_iter = -1 )
    {
       T* _k1 = k1 -> Data();
       T* _k2 = k2 -> Data();
@@ -79,17 +76,18 @@ template< class GRID, class SCHEME, typename T = double > class tnlMersonSolver
       T* _u = u. Data();
            
       tnlExplicitSolver< GRID, SCHEME, T > :: iteration = 0;
-      double& _time = tnlExplicitSolver< GRID, SCHEME, T > :: time;  
-      double& _residue = tnlExplicitSolver< GRID, SCHEME, T > :: residue;  
+      T& _time = tnlExplicitSolver< GRID, SCHEME, T > :: time;
+      T& _residue = tnlExplicitSolver< GRID, SCHEME, T > :: residue;
       int& _iteration = tnlExplicitSolver< GRID, SCHEME, T > :: iteration;
-      const double size_inv = 1.0 / ( double ) u. GetSize();
-      
+      const T size_inv = 1.0 / ( T ) u. GetSize();
+
       T _tau = tnlExplicitSolver< GRID, SCHEME, T > :: tau;
       if( _time + _tau > stop_time ) _tau = stop_time - _time;
       if( _tau == 0.0 ) return true;
 
       if( tnlExplicitSolver< GRID, SCHEME, T > :: verbosity > 0 )
          tnlExplicitSolver< GRID, SCHEME, T > :: PrintOut();
+
       while( 1 )
       {
 
@@ -129,16 +127,16 @@ template< class GRID, class SCHEME, typename T = double > class tnlMersonSolver
             _k_tmp[ i ] = _u[ i ] + _tau * ( 0.5 * _k1[ i ] - 1.5 * _k3[ i ] + 2.0 * _k4[ i ] ); 
          scheme. GetExplicitRHS( _time + _tau, *k_tmp, *k5 );      
    
-         double eps( 0.0 ), max_eps( 0.0 );
+         T eps( 0.0 ), max_eps( 0.0 );
          if( adaptivity )
          {
             for( i = 0; i < size; i ++  )
             {
-               eps = Max( eps, _tau / 3.0 * 
+               eps = Max( eps, ( T ) ( _tau / 3.0 *
                                  fabs( 0.2 * _k1[ i ] +
                                       -0.9 * _k3[ i ] + 
                                        0.8 * _k4[ i ] +
-                                      -0.1 * _k5[ i ] ) );
+                                      -0.1 * _k5[ i ] ) ) );
             }
             :: MPIAllreduce( eps, max_eps, 1, MPI_MAX, tnlExplicitSolver< GRID, SCHEME, T > :: solver_comm );
             //if( MPIGetRank() == 0 )
@@ -148,19 +146,19 @@ template< class GRID, class SCHEME, typename T = double > class tnlMersonSolver
          //cout << endl << "max_eps = " << max_eps << endl;
          if( ! adaptivity || max_eps < adaptivity )
          {
-            double last_residue = _residue;
-            double loc_residue = 0.0;
+            T last_residue = _residue;
+            T loc_residue = 0.0;
 #ifdef HAVE_OPENMP
 #pragma omp parallel for reduction(+:loc_residue) firstprivate( size, _k_tmp, _u, _k1,_tau, tau_3 )
 #endif
             for( i = 0; i < size; i ++ )
             {
-               // this does not have to be in double precision
+               // this does not have to be in T precision
                //_k_tmp[ i ] = ( 0.5 * ( _k1[ i ] + _k5[ i ] ) + 
                //                2.0 * _k4[ i ] ) * tau_3;
                const T add = _tau / 6.0 * ( _k1[ i ] + 4.0 * _k4[ i ] + _k5[ i ] );
                _u[ i ] += add; 
-               loc_residue += fabs( ( double ) add );
+               loc_residue += fabs( ( T ) add );
             }
             if( _tau + _time == stop_time ) _residue = last_residue;  // fixing strange values of res. at the last iteration
             else
@@ -170,6 +168,7 @@ template< class GRID, class SCHEME, typename T = double > class tnlMersonSolver
             }
             _time += _tau;
             _iteration ++;
+            cout << _iteration << " ";
          }
          if( adaptivity && max_eps != 0.0 )
          {
@@ -182,16 +181,16 @@ template< class GRID, class SCHEME, typename T = double > class tnlMersonSolver
          else tnlExplicitSolver< GRID, SCHEME, T > :: tau = _tau;
          
          if( tnlExplicitSolver< GRID, SCHEME, T > :: verbosity > 1 )
-            tnlExplicitSolver< GRID, SCHEME, T > :: PrintOut();
+             tnlExplicitSolver< GRID, SCHEME, T > :: PrintOut();
          
-         if( _time == stop_time || 
+         if( _time == stop_time ||
              ( max_res && _residue < max_res ) )
           {
             if( tnlExplicitSolver< GRID, SCHEME, T > :: verbosity > 0 )
                tnlExplicitSolver< GRID, SCHEME, T > :: PrintOut();
              return true;
           }
-         //if( max_iter && _iteration == max_iter ) return false;
+         if( _iteration == max_iter ) return false;
       }
    };
 
@@ -209,7 +208,7 @@ template< class GRID, class SCHEME, typename T = double > class tnlMersonSolver
    
    GRID *k1, *k2, *k3, *k4, *k5, *k_tmp;
 
-   double adaptivity;
+   T adaptivity;
 };
 
 #endif
diff --git a/src/diff/tnlMersonSolverCUDA.cu b/src/diff/tnlMersonSolverCUDA.cu
index af069d26f9..bcd9c098f2 100644
--- a/src/diff/tnlMersonSolverCUDA.cu
+++ b/src/diff/tnlMersonSolverCUDA.cu
@@ -21,7 +21,7 @@
 void computeK2Arg( const int size, 
                    const int block_size,
                    const int grid_size,
-                   const float tau,
+                   const float& tau,
                    const float* u,
                    const float* k1,
                    float* k2_arg )
@@ -32,7 +32,7 @@ void computeK2Arg( const int size,
 void computeK2Arg( const int size,
                    const int block_size,
                    const int grid_size,
-                   const double tau,
+                   const double& tau,
                    const double* u,
                    const double* k1,
                    double* k2_arg )
@@ -43,7 +43,7 @@ void computeK2Arg( const int size,
 void computeK3Arg( const int size,
                    const int block_size,
                    const int grid_size,
-                   const float tau,
+                   const float& tau,
                    const float* u,
                    const float* k1,
                    const float* k2,
@@ -55,7 +55,7 @@ void computeK3Arg( const int size,
 void computeK3Arg( const int size,
                    const int block_size,
                    const int grid_size,
-                   const double tau,
+                   const double& tau,
                    const double* u,
                    const double* k1,
                    const double* k2,
@@ -67,7 +67,7 @@ void computeK3Arg( const int size,
 void computeK4Arg( const int size,
                    const int block_size,
                    const int grid_size,
-                   const float tau,
+                   const float& tau,
                    const float* u,
                    const float* k1,
                    const float* k3,
@@ -79,7 +79,7 @@ void computeK4Arg( const int size,
 void computeK4Arg( const int size,
                    const int block_size,
                    const int grid_size,
-                   const double tau,
+                   const double& tau,
                    const double* u,
                    const double* k1,
                    const double* k3,
@@ -91,7 +91,7 @@ void computeK4Arg( const int size,
 void computeK5Arg( const int size,
                    const int block_size,
                    const int grid_size,
-                   const float tau,
+                   const float& tau,
                    const float* u,
                    const float* k1,
                    const float* k3,
@@ -104,7 +104,7 @@ void computeK5Arg( const int size,
 void computeK5Arg( const int size,
                    const int block_size,
                    const int grid_size,
-                   const double tau,
+                   const double& tau,
                    const double* u,
                    const double* k1,
                    const double* k3,
@@ -114,10 +114,37 @@ void computeK5Arg( const int size,
    computeK5ArgKernel<<< grid_size, block_size >>>( size, tau, u, k1, k3, k4, k5_arg );
 }
 
+void computeErr( const int size,
+                 const int block_size,
+                 const int grid_size,
+                 const float& tau,
+                 const float* _k1,
+                 const float* _k3,
+                 const float* _k4,
+                 const float* _k5,
+                 float* _err )
+{
+   computeErrKernel<<< grid_size, block_size >>>( size, tau, _k1, _k3, _k4, _k5, _err );
+}
+
+void computeErr( const int size,
+                 const int block_size,
+                 const int grid_size,
+                 const double& tau,
+                 const double* _k1,
+                 const double* _k3,
+                 const double* _k4,
+                 const double* _k5,
+                 double* _err )
+{
+   computeErrKernel<<< grid_size, block_size >>>( size, tau, _k1, _k3, _k4, _k5, _err );
+}
+
+
 void updateU( const int size,
               const int block_size,
               const int grid_size,
-              const float tau,
+              const float& tau,
               const float* k1,
               const float* k4,
               const float* k5,
@@ -129,7 +156,7 @@ void updateU( const int size,
 void updateU( const int size,
               const int block_size,
               const int grid_size,
-              const double tau,
+              const double& tau,
               const double* k1,
               const double* k4,
               const double* k5,
diff --git a/src/diff/tnlMersonSolverCUDA.cu.h b/src/diff/tnlMersonSolverCUDA.cu.h
index 1325dbe717..a8ace031a9 100644
--- a/src/diff/tnlMersonSolverCUDA.cu.h
+++ b/src/diff/tnlMersonSolverCUDA.cu.h
@@ -1,13 +1,70 @@
-/*
- * tnlMersonSolverCUDA.cu.h
- *
- *  Created on: Jan 13, 2010
- *      Author: Tomas Oberhuber
- */
+/***************************************************************************
+                          tnlMersonSolverCUDATester.h
+                             -------------------
+    begin                : Jan 13, 2010
+    copyright            : (C) 2010 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/***************************************************************************
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ ***************************************************************************/
 
 #ifndef TNLMERSONSOLVERCUDA_CU_H_
 #define TNLMERSONSOLVERCUDA_CU_H_
 
+#ifdef HAVE_CUDA
+
+template< class T > __global__ void computeK2ArgKernel( const int size, const T tau, const T* u, const T* k1, T* k2_arg )
+{
+   int i = blockIdx. x * blockDim. x + threadIdx. x;
+   if( i < size )
+      k2_arg[ i ] = u[ i ] + tau * ( 1.0 / 3.0 * k1[ i ] );
+}
+
+template< class T > __global__ void computeK3ArgKernel( const int size, const T tau, const T* u, const T* k1, const T* k2, T* k3_arg )
+{
+   int i = blockIdx. x * blockDim. x + threadIdx. x;
+   if( i < size )
+      k3_arg[ i ] = u[ i ] + tau * 1.0 / 6.0 * ( k1[ i ] + k2[ i ] );
+}
+
+template< class T > __global__ void computeK4ArgKernel( const int size, const T tau, const T* u, const T* k1, const T* k3, T* k4_arg )
+{
+   int i = blockIdx. x * blockDim. x + threadIdx. x;
+   if( i < size )
+      k4_arg[ i ] = u[ i ] + tau * ( 0.125 * k1[ i ] + 0.375 * k3[ i ] );
+}
+
+template< class T > __global__ void computeK5ArgKernel( const int size, const T tau, const T* u, const T* k1, const T* k3, const T* k4, T* k5_arg )
+{
+   int i = blockIdx. x * blockDim. x + threadIdx. x;
+   if( i < size )
+      k5_arg[ i ] = u[ i ] + tau * ( 0.5 * k1[ i ] - 1.5 * k3[ i ] + 2.0 * k4[ i ] );
+}
+
+template< class T > __global__ void computeErrKernel( const int size, const T tau, const T* k1, const T* k3, const T* k4, const T* k5, T* err )
+{
+   int i = blockIdx. x * blockDim. x + threadIdx. x;
+   if( i < size )
+      err[ i ] = 1.0 / 3.0 *  tau * fabs( 0.2 * k1[ i ] +
+                                         -0.9 * k3[ i ] +
+                                          0.8 * k4[ i ] +
+                                         -0.1 * k5[ i ] );
+}
+
+template< class T > __global__ void updateUKernel( const int size, const T tau, const T* k1, const T* k4, const T* k5, T* u )
+{
+        int i = blockIdx. x * blockDim. x + threadIdx. x;
+        if( i < size )
+                u[ i ] += tau / 6.0 * ( k1[ i ] + 4.0 * k4[ i ] + k5[ i ] );
+}
 
+#endif
 
 #endif /* TNLMERSONSOLVERCUDA_CU_H_ */
diff --git a/src/diff/tnlMersonSolverCUDA.h b/src/diff/tnlMersonSolverCUDA.h
index e981d054c7..475a4d44f6 100644
--- a/src/diff/tnlMersonSolverCUDA.h
+++ b/src/diff/tnlMersonSolverCUDA.h
@@ -15,30 +15,32 @@
  *                                                                         *
  ***************************************************************************/
 
-#ifndef tnlMersonSolverH
-#define tnlMersonSolverH
+#ifndef tnlMersonSolverCUDAH
+#define tnlMersonSolverCUDAH
 
 #include <math.h>
-#include <core/tnl-cuda-kernels.h>
+#include <core/tnl-cuda-kernels.cu.h>
 #include <diff/tnlExplicitSolver.h>
 
 #ifdef HAVE_CUDA
-// TODO: remove this
-/*void computeK2Arg( const int size,
+void computeK2Arg( const int size,
                    const int block_size,
                    const int grid_size,
+                   const float& tau,
                    const float* u,
                    const float* k1,
                    float* k2_arg );
 void computeK2Arg( const int size,
                    const int block_size,
                    const int grid_size,
+                   const double& tau,
                    const double* u,
                    const double* k1,
                    double* k2_arg );
 void computeK3Arg( const int size,
                    const int block_size,
                    const int grid_size,
+                   const float& tau,
                    const float* u,
                    const float* k1,
                    const float* k2,
@@ -46,6 +48,7 @@ void computeK3Arg( const int size,
 void computeK3Arg( const int size,
                    const int block_size,
                    const int grid_size,
+                   const double& tau,
                    const double* u,
                    const double* k1,
                    const double* k2,
@@ -53,6 +56,7 @@ void computeK3Arg( const int size,
 void computeK4Arg( const int size,
                    const int block_size,
                    const int grid_size,
+                   const float& tau,
                    const float* u,
                    const float* k1,
                    const float* k3,
@@ -60,6 +64,7 @@ void computeK4Arg( const int size,
 void computeK4Arg( const int size,
                    const int block_size,
                    const int grid_size,
+                   const double& tau,
                    const double* u,
                    const double* k1,
                    const double* k3,
@@ -67,6 +72,7 @@ void computeK4Arg( const int size,
 void computeK5Arg( const int size,
                    const int block_size,
                    const int grid_size,
+                   const float& tau,
                    const float* u,
                    const float* k1,
                    const float* k3,
@@ -75,15 +81,34 @@ void computeK5Arg( const int size,
 void computeK5Arg( const int size,
                    const int block_size,
                    const int grid_size,
+                   const double& tau,
                    const double* u,
                    const double* k1,
                    const double* k3,
                    const double* k4,
                    double* k5_arg );
+void computeErr( const int size,
+                 const int block_size,
+                 const int grid_size,
+                 const float& tau,
+                 const float* _k1,
+                 const float* _k3,
+                 const float* _k4,
+                 const float* _k5,
+                 float* _err );
+void computeErr( const int size,
+                 const int block_size,
+                 const int grid_size,
+                 const double& tau,
+                 const double* _k1,
+                 const double* _k3,
+                 const double* _k4,
+                 const double* _k5,
+                 double* _err );
 void updateU( const int size,
               const int block_size,
               const int grid_size,
-              const float tau,
+              const float& tau,
               const float* k1,
               const float* k4,
               const float* k5,
@@ -91,56 +116,12 @@ void updateU( const int size,
 void updateU( const int size,
               const int block_size,
               const int grid_size,
-              const double tau,
+              const double& tau,
               const double* k1,
               const double* k4,
               const double* k5,
-              double* u );*/
-
-template< class T > __global__ void computeK2ArgKernel( const int size, const T tau, const T* u, const T* k1, T* k2_arg )
-{
-   int i = blockIdx. x * blockDim. x + threadIdx. x;
-   if( i < size )
-      k2_arg[ i ] = u[ i ] + tau * ( 1.0 / 3.0 * k1[ i ] );
-}
+              double* u );
 
-template< class T > __global__ void computeK3ArgKernel( const int size, const T tau, const T* u, const T* k1, const T* k2, T* k3_arg )
-{
-   int i = blockIdx. x * blockDim. x + threadIdx. x;
-   if( i < size )
-      k3_arg[ i ] = u[ i ] + tau * 1.0 / 6.0 * ( k1[ i ] + k2[ i ] );
-}
-
-template< class T > __global__ void computeK4ArgKernel( const int size, const T tau, const T* u, const T* k1, const T* k3, T* k4_arg )
-{
-   int i = blockIdx. x * blockDim. x + threadIdx. x;
-   if( i < size )
-      k4_arg[ i ] = u[ i ] + tau * ( 0.125 * k1[ i ] + 0.375 * k3[ i ] );
-}
-
-template< class T > __global__ void computeK5ArgKernel( const int size, const T tau, const T* u, const T* k1, const T* k3, const T* k4, T* k5_arg )
-{
-   int i = blockIdx. x * blockDim. x + threadIdx. x;
-   if( i < size )
-      k5_arg[ i ] = u[ i ] + tau * ( 0.5 * k1[ i ] - 1.5 * k3[ i ] + 2.0 * k4[ i ] );
-}
-
-template< class T > __global__ void computeErrKernel( const int size, const T tau, const T* _k1, const T* _k3, const T* _k4, const T* _k5, T* err )
-{
-   int i = blockIdx. x * blockDim. x + threadIdx. x;
-   if( i < size )
-      err[ i ] = 1.0 / 3.0 *  tau * fabs( 0.2 * k1[ i ] +
-                                         -0.9 * k3[ i ] +
-                                          0.8 * k4[ i ] +
-                                         -0.1 * k5[ i ] ) );
-}
-
-template< class T > __global__ void updateUKernel( const int size, const T tau, const T* k1, const T* k4, const T* k5, T* u )
-{
-        int i = blockIdx. x * blockDim. x + threadIdx. x;
-        if( i < size )
-                u[ i ] += tau / 6.0 * ( k1[ i ] + 4.0 * k4[ i ] + k5[ i ] );
-}
 
 #endif
 
@@ -148,34 +129,33 @@ template< class GRID, class SCHEME, typename T = double > class tnlMersonSolverC
 {
    public:
 
-   tnlMersonSolver( const GRID& v )
+   tnlMersonSolverCUDA( const GRID& v )
+   : adaptivity( 1.0e-5 )
    {
-      k1. SetNewDimension( v );
+#ifdef HAVE_CUDA
+      k1. SetNewDimensions( v );
       k1. SetNewDomain( v );
-      k2. SetNewDimension( v );
+      k2. SetNewDimensions( v );
       k2. SetNewDomain( v );
-      k3. SetNewDimension( v );
+      k3. SetNewDimensions( v );
       k3. SetNewDomain( v );
-      k4. SetNewDimension( v );
+      k4. SetNewDimensions( v );
       k4. SetNewDomain( v );
-      k5. SetNewDimension( v );
+      k5. SetNewDimensions( v );
       k5. SetNewDomain( v );
-      k_tmp. SetNewDimension( v );
+      k_tmp. SetNewDimensions( v );
       k_tmp. SetNewDomain( v );
-      err. SetNewDimension( v );
+      err. SetNewDimensions( v );
       err. SetNewDomain( v );
       if( ! k1 || ! k2 || ! k3 || ! k4 || ! k5 || ! k_tmp || ! err )
       {
          cerr << "Unable to allocate supporting structures for the Merson solver." << endl;
          abort();
       };
-      k1. Zeros();
-      k2. Zeros();
-      k3. Zeros();
-      k4. Zeros();
-      k5. Zeros();
-      k_tmp. Zeros();
-      err. Zeros();
+      tnlExplicitSolver< GRID, SCHEME, T > :: tau = 1.0;
+#else
+	cerr << "CUDA is not supported on this system." << endl;
+#endif
    };
 
    tnlString GetType() const
@@ -186,17 +166,18 @@ template< class GRID, class SCHEME, typename T = double > class tnlMersonSolverC
              tnlString( ", " ) + GetParameterType( t ) + tnlString( " >" );
    };
 
-   void SetAdaptivity( const double& a )
+   void SetAdaptivity( const T& a )
    {
       adaptivity = a;
    };
    
    bool Solve( SCHEME& scheme,
                GRID& u,
-               const double& stop_time,
-               const double& max_res,
-               const int max_iter )
+               const T& stop_time,
+               const T& max_res,
+               const int max_iter = -1 )
    {
+#ifdef HAVE_CUDA
       T* _k1 = k1. Data();
       T* _k2 = k2. Data();
       T* _k3 = k3. Data();
@@ -207,11 +188,12 @@ template< class GRID, class SCHEME, typename T = double > class tnlMersonSolverC
       T* _err = err. Data();
            
       tnlExplicitSolver< GRID, SCHEME, T > :: iteration = 0;
-      double& _time = tnlExplicitSolver< GRID, SCHEME, T > :: time;  
-      double& _residue = tnlExplicitSolver< GRID, SCHEME, T > :: residue;  
+      T& _time = tnlExplicitSolver< GRID, SCHEME, T > :: time;
+      T& _residue = tnlExplicitSolver< GRID, SCHEME, T > :: residue;
       int& _iteration = tnlExplicitSolver< GRID, SCHEME, T > :: iteration;
-      const double size_inv = 1.0 / ( double ) u. GetSize();
+      const T size_inv = 1.0 / ( T ) u. GetSize();
       
+      const int size = u. GetSize();
       const int block_size = 512;
       const int grid_size = ( size - 1 ) / block_size + 1;
 
@@ -221,42 +203,57 @@ template< class GRID, class SCHEME, typename T = double > class tnlMersonSolverC
 
       if( tnlExplicitSolver< GRID, SCHEME, T > :: verbosity > 0 )
          tnlExplicitSolver< GRID, SCHEME, T > :: PrintOut();
+
+      // DEBUG
+      /*tnlGrid2D< T > hostAux;
+      hostAux. SetNewDimensions( u. GetXSize(), u. GetYSize() );
+      hostAux. SetNewDomain( u. GetAx(), u. GetBx(), u. GetAy(), u. GetBy() );*/
+
       while( 1 )
       {
 
          int i;
-         int size = k1 -> GetSize();
+         int size = k1. GetSize();
          assert( size == u. GetSize() );
          
          T tau_3 = _tau / 3.0;
 
-         scheme. GetExplicitRHSCUDA( _time, u, *k1 );
+         scheme. GetExplicitRHSCUDA( _time, u, k1 );
 
-         computeK2ArgKernel<<< block_size, grid_size >>>( size, tau, _u, _k1, _k_tmp );
-         scheme. GetExplicitRHSCUDA( _time + tau_3, *k_tmp, *k2 );
+         computeK2Arg( size, block_size, grid_size, _tau, _u, _k1, _k_tmp );
+         scheme. GetExplicitRHSCUDA( _time + tau_3, k_tmp, k2 );
          
-         computeK3ArgKernel<<< block_size, grid_size >>>( size, tau, _u, _k1, _k2, _k_tmp );
-         scheme. GetExplicitRHSCUDA( _time + tau_3, *k_tmp, *k3 );
+         computeK3Arg( size, block_size, grid_size, _tau, _u, _k1, _k2, _k_tmp );
+         scheme. GetExplicitRHSCUDA( _time + tau_3, k_tmp, k3 );
          
-         computeK4ArgKernel<<< block_size, grid_size >>>( size, tau, _u, _k1, _k3, _k_tmp );
-         scheme. GetExplicitRHSCUDA( _time + 0.5 * _tau, *k_tmp, *k4 );
+         computeK4Arg( size, block_size, grid_size, _tau, _u, _k1, _k3, _k_tmp );
+         scheme. GetExplicitRHSCUDA( _time + 0.5 * _tau, k_tmp, k4 );
          
-         computeK5Arg<<< block_size, grid_size >>>( size, tau, _k1, _k3, _k4, _k_tmp );
-         scheme. GetExplicitRHSCUDA( _time + _tau, *k_tmp, *k5 );
+         computeK5Arg( size, block_size, grid_size, _tau, _u, _k1, _k3, _k4, _k_tmp );
+         scheme. GetExplicitRHSCUDA( _time + _tau, k_tmp, k5 );
    
-         double eps( 0.0 ), max_eps( 0.0 );
+         T eps( 0.0 ), max_eps( 0.0 );
          if( adaptivity )
          {
-            computeErrKernel<<< block_size, grid_size >>>( size, tau, _k1, _k3, _k4, _k5, _err );
-            tnlCUDAReduction( size, err, max_eps ); // TODO: allocate auxiliary array for the reduction
+            computeErr( size, block_size, grid_size, _tau, _k1, _k3, _k4, _k5, _err );
+            tnlCUDAReductionMax( size, _err, max_eps ); // TODO: allocate auxiliary array for the reduction
+
+            /*hostAux. copyFrom( err );
+            T seq_max_eps = 0.0;
+            for( int i = 0; i < hostAux. GetSize(); i ++ )
+            	seq_max_eps = :: Max( seq_max_eps, hostAux[ i ] );
+            if( max_eps != seq_max_eps )
+            	abort();*/
+
          }
+
          //cout << endl << "max_eps = " << max_eps << endl;
          if( ! adaptivity || max_eps < adaptivity )
          {
-            double last_residue = _residue;
-            double loc_residue = 0.0;
+            T last_residue = _residue;
+            T loc_residue = 0.0;
 
-            updateU( size, block_size, grid_size, tau, _k1, _k4, _k5 );
+            updateU( size,  block_size, grid_size, _tau, _k1, _k4, _k5, _u );
             // TODO: implement loc_residue - if possible
 
             if( _tau + _time == stop_time ) _residue = last_residue;  // fixing strange values of res. at the last iteration
@@ -288,11 +285,15 @@ template< class GRID, class SCHEME, typename T = double > class tnlMersonSolverC
                tnlExplicitSolver< GRID, SCHEME, T > :: PrintOut();
              return true;
           }
-         //if( max_iter && _iteration == max_iter ) return false;
+         if( _iteration == max_iter ) return false;
       }
+#else
+      cerr << "CUDA is not supported on this system." << endl;
+      return false;
+#endif
    };
 
-   ~tnlMersonSolver()
+   ~tnlMersonSolverCUDA()
    {
    };
 
@@ -300,7 +301,7 @@ template< class GRID, class SCHEME, typename T = double > class tnlMersonSolverC
    
    GRID k1, k2, k3, k4, k5, k_tmp, err;
 
-   double adaptivity;
+   T adaptivity;
 };
 
 #endif
diff --git a/src/diff/tnlMersonSolverCUDATester.cu b/src/diff/tnlMersonSolverCUDATester.cu
new file mode 100644
index 0000000000..a27c22ac49
--- /dev/null
+++ b/src/diff/tnlMersonSolverCUDATester.cu
@@ -0,0 +1,50 @@
+/***************************************************************************
+                          tnlMersonSolverCUDATester.cu
+                             -------------------
+    begin                : Feb 2, 2010
+    copyright            : (C) 2009 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/***************************************************************************
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ ***************************************************************************/
+ 
+ #include <diff/tnlMersonSolverCUDATester.cu.h>
+ 
+ void heatEquationRHS( const int gridDimX,
+                       const int gridDimY,
+                       const int blockDimX,
+                       const int blockDimY,
+                       const int xSize,
+                       const int ySize,
+                       const float& hX,
+                       const float& hY,
+                       const float* u,
+                       float* fu )
+{
+   dim3 gridDim( gridDimX, gridDimY );
+   dim3 blockDim( blockDimX, blockDimY );
+   heatEquationRHSKernel<<< gridDim, blockDim >>>( xSize, ySize, hX, hY, u, fu );
+}
+
+void heatEquationRHS( const int gridDimX,
+                      const int gridDimY,
+                      const int blockDimX,
+                      const int blockDimY,
+                      const int xSize,
+                      const int ySize,
+                      const double& hX,
+                      const double& hY,
+                      const double* u,
+                      double* fu )
+{
+   dim3 gridDim( gridDimX, gridDimY );
+   dim3 blockDim( blockDimX, blockDimY );
+   heatEquationRHSKernel<<< gridDim, blockDim >>>( xSize, ySize, hX, hY, u, fu );
+}
\ No newline at end of file
diff --git a/src/diff/tnlMersonSolverCUDATester.cu.h b/src/diff/tnlMersonSolverCUDATester.cu.h
new file mode 100644
index 0000000000..dab468ce70
--- /dev/null
+++ b/src/diff/tnlMersonSolverCUDATester.cu.h
@@ -0,0 +1,43 @@
+/***************************************************************************
+                          tnlMersonSolverCUDATester.cu.h
+                             -------------------
+    begin                : Feb 2, 2010
+    copyright            : (C) 2009 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/***************************************************************************
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ ***************************************************************************/
+
+
+
+#ifndef TNLMERSONSOLVERCUDATESTER_CU_H_
+#define TNLMERSONSOLVERCUDATESTER_CU_H_
+
+template< class T > __global__ void heatEquationRHSKernel( const int xSize,
+		                                                   const int ySize,
+		                                                   const T hX,
+		                                                   const T hY,
+		                                                   const T* u,
+		                                                   T* fu )
+{
+	const int i = blockIdx. x * blockDim. x + threadIdx. x;
+	const int j = blockIdx. y * blockDim. y + threadIdx. y;
+
+	if( i == 0 || j == 0 || i == xSize - 1 || j == ySize - 1 )
+		fu[ xSize * j + i ] = 0.0;
+	else
+		if( i < xSize && j < ySize )
+		fu[ xSize * j + i ] = ( u[ xSize * j + i + 1 ] - 2.0 * u[ xSize * j + i ] + u[ xSize * j + i - 1] ) / ( hX * hX ) +
+	                          ( u[ xSize * j + i + xSize ] - 2.0 * u[ xSize * j + i ] + u[ xSize * j + i - xSize ] ) / ( hY * hY );
+
+}
+
+
+#endif /* TNLMERSONSOLVERCUDATESTER_CU_H_ */
diff --git a/src/diff/tnlMersonSolverCUDATester.h b/src/diff/tnlMersonSolverCUDATester.h
new file mode 100644
index 0000000000..9f6fca71c1
--- /dev/null
+++ b/src/diff/tnlMersonSolverCUDATester.h
@@ -0,0 +1,200 @@
+/***************************************************************************
+                          tnlMersonSolverCUDATester.h
+                             -------------------
+    begin                : Feb 1, 2010
+    copyright            : (C) 2010 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/***************************************************************************
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ ***************************************************************************/
+
+#ifndef TNLMERSONSOLVERCUDATESTER_H_
+#define TNLMERSONSOLVERCUDATESTER_H_
+
+#include <diff/tnlMersonSolver.h>
+#include <diff/tnlMersonSolverCUDA.h>
+#include <diff/drawGrid2D.h>
+#include <diff/norms.h>
+#include <core/mfilename.h>
+#include <cppunit/TestSuite.h>
+#include <cppunit/TestResult.h>
+#include <cppunit/TestCaller.h>
+#include <cppunit/TestCase.h>
+
+ void heatEquationRHS( const int gridDimX,
+                       const int gridDimY,
+                       const int blockDimX,
+                       const int blockDimY,
+                       const int xSize,
+                       const int ySize,
+                       const float& hX,
+                       const float& hY,
+                       const float* u,
+                       float* fu );
+
+void heatEquationRHS( const int gridDimX,
+                      const int gridDimY,
+                      const int blockDimX,
+                      const int blockDimY,
+                      const int xSize,
+                      const int ySize,
+                      const double& hX,
+                      const double& hY,
+                      const double* u,
+                      double* fu );
+
+
+template< class T > class tnlMersonSolverCUDATester : public CppUnit :: TestCase
+{
+	public:
+
+	static CppUnit :: Test* suite()
+	   {
+	      CppUnit :: TestSuite* suiteOfTests = new CppUnit :: TestSuite( "tnlMersonSolverCUDATester" );
+	      CppUnit :: TestResult result;
+
+	      T param;
+	      tnlString test_name = tnlString( "testUpdateU< " ) + GetParameterType( param ) + tnlString( " >" );
+	      suiteOfTests -> addTest( new CppUnit :: TestCaller< tnlMersonSolverCUDATester< T > >(
+	    		               test_name. Data(),
+	                               & tnlMersonSolverCUDATester< T > :: testUpdateU )
+	                             );
+
+	      return suiteOfTests;
+	   };
+
+	void GetExplicitRHS( const T& time,
+			             tnlGrid2D< T >& u,
+	                     tnlGrid2D< T >& fu )
+	{
+		const int xSize = u. GetXSize();
+		const int ySize = u. GetYSize();
+		for( int i = 0; i < xSize; i ++ )
+			for( int j = 0; j < ySize; j ++ )
+			{
+				if( i == 0 || j == 0 || i == xSize - 1 || j == ySize - 1 )
+					fu( i, j ) = 0.0;
+				else
+					fu( i, j ) = u. Partial_xx( i, j ) + u. Partial_yy( i, j );
+			}
+	}
+
+	void GetExplicitRHSCUDA( const T& time,
+			                 tnlGridCUDA2D< T >& u,
+	                         tnlGridCUDA2D< T >& fu )
+	{
+		const int xSize = u. GetXSize();
+		const int ySize = u. GetYSize();
+		const int desBlockXSize = 16;
+		const int desBlockYSize = 16;
+		const int gridXSize = xSize / desBlockXSize + ( xSize % desBlockXSize != 0 );
+		const int gridYSize = ySize / desBlockYSize + ( ySize % desBlockYSize != 0 );
+		heatEquationRHS( gridXSize,
+				         gridYSize,
+		                 desBlockXSize,
+		                 desBlockYSize,
+		                 xSize,
+		                 ySize,
+		                 u. GetHx(),
+		                 u. GetHy(),
+		                 u. Data(),
+		                 fu. Data() );
+		/*tnlGrid2D< T > hostU;
+		hostU. SetNewDimensions( u. GetXSize(), u. GetYSize() );
+		hostU. SetNewDomain( u. GetAx(), u. GetBx(), u. GetAy(), u. GetBy() );
+		tnlGrid2D< T > hostFu( hostU );
+		hostU. copyFrom( u );
+		hostFu. copyFrom( fu );
+		Draw( hostU, "device-u", "gnuplot" );
+		Draw( hostFu, "device-fu", "gnuplot" );
+
+		GetExplicitRHS( time, hostU, hostFu );
+		u. copyFrom( hostU );
+		fu. copyFrom( hostFu );
+		Draw( hostU, "host-u", "gnuplot" );
+		Draw( hostFu, "host-fu", "gnuplot" );
+		getchar();*/
+	}
+
+
+	void testUpdateU()
+	{
+		const int size = 1024;
+
+		tnlGrid2D< T > hostU;
+		hostU. SetNewDimensions( size, size );
+		hostU. SetNewDomain( 0.0, 1.0, 0.0, 1.0 );
+		const T hx = hostU. GetHx();
+		const T hy = hostU. GetHy();
+		for( int i = 0; i < size; i ++ )
+			for( int j = 0; j < size; j ++ )
+			{
+				T x = i * hx - 0.5;
+				T y = j * hy - 0.5;
+				hostU( i, j ) = Sign( 0.25 - sqrt( x * x + y * y ) );
+			}
+		Draw( hostU, "u-ini", "gnuplot" );
+
+		tnlGridCUDA2D< T > deviceU( hostU );
+		deviceU. copyFrom( hostU );
+
+		tnlGrid2D< T > hostAuxU( hostU );
+
+		tnlMersonSolver< tnlGrid2D< T >, tnlMersonSolverCUDATester, T > mersonSolver( hostU );
+		mersonSolver. SetVerbosity( 2 );
+		mersonSolver. SetTime( 0.0 );
+		mersonSolver. SetTau( 1.0 );
+
+		tnlMersonSolverCUDA< tnlGridCUDA2D< T >, tnlMersonSolverCUDATester, T > mersonSolverCUDA( deviceU );
+		mersonSolverCUDA. SetVerbosity( 2 );
+		mersonSolverCUDA. SetTime( 0.0 );
+		mersonSolverCUDA. SetTau( 1.0 );
+
+		const T finalTime = 0.1;
+		const T tau = 0.01;
+		T time = tau;
+		int iteration( 0 );
+		while( time < finalTime )
+		{
+			iteration ++;
+			//cout << "Starting the Merson solver with stop time " << time << endl;
+			//mersonSolver. Solve( *this, hostU, time, 0.0 );
+			//cout << "Starting the CUDA Merson solver with stop time " << time << endl;
+			mersonSolverCUDA. Solve( *this, deviceU, time, 0.0 );
+
+			hostAuxU. copyFrom( deviceU );
+			/*T l1Norm = GetDiffL1Norm( hostU, hostAuxU );
+			T l2Norm = GetDiffL2Norm( hostU, hostAuxU );
+			T maxNorm = GetDiffMaxNorm( hostU, hostAuxU );
+			cout << endl;
+			cout << "Errors: L1 " << l1Norm << " L2 " << l2Norm << " max." << maxNorm << endl;*/
+
+			cout << "Writing file ... ";
+			tnlString fileName;
+			FileNameBaseNumberEnding(
+					"u",
+					iteration,
+					5,
+					".gplt",
+					fileName );
+			cout << fileName << endl;
+			Draw( hostAuxU, fileName. Data(), "gnuplot" );
+
+
+			time += tau;
+		}
+
+
+
+
+	};
+};
+
+#endif /* TNLMERSONSOLVERCUDATESTER_H_ */
diff --git a/src/tnl-unit-tests.cpp b/src/tnl-unit-tests.cpp
index fcff25535f..ddc07d5f97 100644
--- a/src/tnl-unit-tests.cpp
+++ b/src/tnl-unit-tests.cpp
@@ -22,6 +22,7 @@
 #include <core/tnlFieldCUDA2DTester.h>
 #include <core/tnlCUDAKernelsTester.h>
 #include <diff/tnlGridCUDA2DTester.h>
+#include <diff/tnlMersonSolverCUDATester.h>
 
 #include <iostream>
 
@@ -32,7 +33,7 @@ int main( int argc, char* argv[] )
    CppUnit :: TextTestRunner runner;
    
 #ifdef HAVE_CUDA
-   runner.addTest( tnlLongVectorCUDATester< int > :: suite() );
+/*   runner.addTest( tnlLongVectorCUDATester< int > :: suite() );
    runner.addTest( tnlLongVectorCUDATester< float > :: suite() );
    if( CUDA_ARCH == 1.3 )
       runner.addTest( tnlLongVectorCUDATester< double > :: suite() );
@@ -50,7 +51,13 @@ int main( int argc, char* argv[] )
    runner.addTest( tnlCUDAKernelsTester< int > :: suite() );
    runner.addTest( tnlCUDAKernelsTester< float > :: suite() );
    if( CUDA_ARCH == 1.3 )
-      runner.addTest( tnlCUDAKernelsTester< double > :: suite() );
+      runner.addTest( tnlCUDAKernelsTester< double > :: suite() );*/
+
+   runner.addTest( tnlMersonSolverCUDATester< float > :: suite() );
+   if( CUDA_ARCH == 1.3 )
+	   runner.addTest( tnlMersonSolverCUDATester< double > :: suite() );
+
+
 #endif
    
    runner.run();
-- 
GitLab