From 3889eb072c4824b664edd470b68e35cf1a9a177c Mon Sep 17 00:00:00 2001 From: Tomas Oberhuber <tomas.oberhuber@fjfi.cvut.cz> Date: Wed, 17 Jul 2013 15:14:51 +0200 Subject: [PATCH] Refactoring the array operations. --- install | 2 +- src/core/arrays/tnlArrayOperations.h | 4 +- src/core/tnlCuda.h | 2 + src/implementation/core/CMakeLists.txt | 3 +- .../arrays/tnlArrayOperationsCuda_impl.cu | 60 ++--- .../core/arrays/tnlArrayOperationsCuda_impl.h | 72 ++--- .../core/arrays/tnlArrayOperationsHost_impl.h | 46 ++-- .../core/cuda/cuda-reduction_impl.h | 15 +- src/implementation/core/tnlCuda_impl.h | 5 + tests/unit-tests/CMakeLists.txt | 4 +- .../core/arrays/tnlArrayOperationsTester.h | 145 +++++----- tests/unit-tests/core/cuda/CMakeLists.txt | 7 +- .../core/cuda/memory-operations-test.cu | 26 -- .../core/cuda/tnlCudaMemoryOperationsTester.h | 248 ------------------ .../core/cuda/tnlCudaReductionTester.h | 48 ++-- .../core/vectors/tnlVectorOperationsTester.h | 21 +- 16 files changed, 232 insertions(+), 476 deletions(-) delete mode 100644 tests/unit-tests/core/cuda/memory-operations-test.cu delete mode 100644 tests/unit-tests/core/cuda/tnlCudaMemoryOperationsTester.h diff --git a/install b/install index 029a1902e7..2b9489d97e 100755 --- a/install +++ b/install @@ -2,7 +2,7 @@ TARGET=TNL INSTALL_PREFIX=${HOME}/local -WITH_CUDA=no +WITH_CUDA=yes WITH_CUSPARSE=no CUDA_ARCHITECTURE=2.0 TEMPLATE_EXPLICIT_INSTANTIATION=yes diff --git a/src/core/arrays/tnlArrayOperations.h b/src/core/arrays/tnlArrayOperations.h index 51e72a9516..485b095473 100644 --- a/src/core/arrays/tnlArrayOperations.h +++ b/src/core/arrays/tnlArrayOperations.h @@ -63,12 +63,12 @@ class tnlArrayOperations< tnlHost > const SourceElement* source, const Index size ); - /*template< typename Element, + template< typename Element, typename DestinationDevice, typename Index > static bool copyMemory( Element* destination, const Element* source, - const Index size );*/ + const Index size ); template< typename Element1, 
typename DestinationDevice, diff --git a/src/core/tnlCuda.h b/src/core/tnlCuda.h index 8d4c423ac4..2496fbcb45 100644 --- a/src/core/tnlCuda.h +++ b/src/core/tnlCuda.h @@ -38,6 +38,8 @@ class tnlCuda static void setMaxBlockSize( int newMaxBlockSize ); + static int getGPUTransferBufferSize(); + protected: static int maxGridSize, maxBlockSize; diff --git a/src/implementation/core/CMakeLists.txt b/src/implementation/core/CMakeLists.txt index 18aa614027..7a3c04eb04 100755 --- a/src/implementation/core/CMakeLists.txt +++ b/src/implementation/core/CMakeLists.txt @@ -2,8 +2,7 @@ ADD_SUBDIRECTORY( arrays ) ADD_SUBDIRECTORY( cuda ) ADD_SUBDIRECTORY( vectors ) -SET( headers memory-operations.h - tnlCuda_impl.h +SET( headers tnlCuda_impl.h tnlHost_impl.h tnlLogger_impl.h ) diff --git a/src/implementation/core/arrays/tnlArrayOperationsCuda_impl.cu b/src/implementation/core/arrays/tnlArrayOperationsCuda_impl.cu index 84245c9737..aa57f8434b 100644 --- a/src/implementation/core/arrays/tnlArrayOperationsCuda_impl.cu +++ b/src/implementation/core/arrays/tnlArrayOperationsCuda_impl.cu @@ -46,12 +46,12 @@ template void tnlArrayOperations< tnlCuda >::setMemoryElement< float >( fl template void tnlArrayOperations< tnlCuda >::setMemoryElement< double >( double* data, const double& value ); template void tnlArrayOperations< tnlCuda >::setMemoryElement< long double >( long double* data, const long double& value ); -template char tnlArrayOperations< tnlCuda >::getMemoryElement< char >( char* data ); -template int tnlArrayOperations< tnlCuda >::getMemoryElement< int >( int* data ); -template long int tnlArrayOperations< tnlCuda >::getMemoryElement< long int >( long int* data ); -template float tnlArrayOperations< tnlCuda >::getMemoryElement< float >( float* data ); -template double tnlArrayOperations< tnlCuda >::getMemoryElement< double >( double* data ); -template long double tnlArrayOperations< tnlCuda >::getMemoryElement< long double >( long double* data ); +template char 
tnlArrayOperations< tnlCuda >::getMemoryElement< char >( const char* data ); +template int tnlArrayOperations< tnlCuda >::getMemoryElement< int >( const int* data ); +template long int tnlArrayOperations< tnlCuda >::getMemoryElement< long int >( const long int* data ); +template float tnlArrayOperations< tnlCuda >::getMemoryElement< float >( const float* data ); +template double tnlArrayOperations< tnlCuda >::getMemoryElement< double >( const double* data ); +template long double tnlArrayOperations< tnlCuda >::getMemoryElement< long double >( const long double* data ); template char& tnlArrayOperations< tnlCuda >::getArrayElementReference< char, int >( char* data, const int i ); template int& tnlArrayOperations< tnlCuda >::getArrayElementReference< int, int >( int* data, const int i ); @@ -106,30 +106,30 @@ template bool tnlArrayOperations< tnlCuda >::copyMemory< float, tnlCuda, template bool tnlArrayOperations< tnlCuda >::copyMemory< double, tnlCuda, double, long int >( double* destination, const double* source, const long int size ); template bool tnlArrayOperations< tnlCuda >::copyMemory< long double, tnlCuda, long double, long int >( long double* destination, const long double* source, const long int size ); -template bool tnlArrayOperations< tnlCuda >::compareMemory< char, tnlHost, int >( const char* data1, const char* data2, const int size ); -template bool tnlArrayOperations< tnlCuda >::compareMemory< int, tnlHost, int >( const int* data1, const int* data2, const int size ); -template bool tnlArrayOperations< tnlCuda >::compareMemory< long int, tnlHost, int >( const long int* data1, const long int* data2, const int size ); -template bool tnlArrayOperations< tnlCuda >::compareMemory< float, tnlHost, int >( const float* data1, const float* data2, const int size ); -template bool tnlArrayOperations< tnlCuda >::compareMemory< double, tnlHost, int >( const double* data1, const double* data2, const int size ); -template bool tnlArrayOperations< tnlCuda 
>::compareMemory< long double, tnlHost, int >( const long double* data1, const long double* data2, const int size ); -template bool tnlArrayOperations< tnlCuda >::compareMemory< char, tnlHost, long int >( const char* data1, const char* data2, const long int size ); -template bool tnlArrayOperations< tnlCuda >::compareMemory< int, tnlHost, long int >( const int* data1, const int* data2, const long int size ); -template bool tnlArrayOperations< tnlCuda >::compareMemory< long int, tnlHost, long int >( const long int* data1, const long int* data2, const long int size ); -template bool tnlArrayOperations< tnlCuda >::compareMemory< float, tnlHost, long int >( const float* data1, const float* data2, const long int size ); -template bool tnlArrayOperations< tnlCuda >::compareMemory< double, tnlHost, long int >( const double* data1, const double* data2, const long int size ); -template bool tnlArrayOperations< tnlCuda >::compareMemory< long double, tnlHost, long int >( const long double* data1, const long double* data2, const long int size ); -template bool tnlArrayOperations< tnlCuda >::compareMemory< char, tnlCuda, int >( const char* data1, const char* data2, const int size ); -template bool tnlArrayOperations< tnlCuda >::compareMemory< int, tnlCuda, int >( const int* data1, const int* data2, const int size ); -template bool tnlArrayOperations< tnlCuda >::compareMemory< long int, tnlCuda, int >( const long int* data1, const long int* data2, const int size ); -template bool tnlArrayOperations< tnlCuda >::compareMemory< float, tnlCuda, int >( const float* data1, const float* data2, const int size ); -template bool tnlArrayOperations< tnlCuda >::compareMemory< double, tnlCuda, int >( const double* data1, const double* data2, const int size ); -template bool tnlArrayOperations< tnlCuda >::compareMemory< long double, tnlCuda, int >( const long double* data1, const long double* data2, const int size ); -template bool tnlArrayOperations< tnlCuda >::compareMemory< char, tnlCuda, 
long int >( const char* data1, const char* data2, const long int size ); -template bool tnlArrayOperations< tnlCuda >::compareMemory< int, tnlCuda, long int >( const int* data1, const int* data2, const long int size ); -template bool tnlArrayOperations< tnlCuda >::compareMemory< long int, tnlCuda, long int >( const long int* data1, const long int* data2, const long int size ); -template bool tnlArrayOperations< tnlCuda >::compareMemory< float, tnlCuda, long int >( const float* data1, const float* data2, const long int size ); -template bool tnlArrayOperations< tnlCuda >::compareMemory< double, tnlCuda, long int >( const double* data1, const double* data2, const long int size ); -template bool tnlArrayOperations< tnlCuda >::compareMemory< long double, tnlCuda, long int >( const long double* data1, const long double* data2, const long int size ); +template bool tnlArrayOperations< tnlCuda >::compareMemory< char, tnlHost, char, int >( const char* data1, const char* data2, const int size ); +template bool tnlArrayOperations< tnlCuda >::compareMemory< int, tnlHost, int, int >( const int* data1, const int* data2, const int size ); +template bool tnlArrayOperations< tnlCuda >::compareMemory< long int, tnlHost, long int, int >( const long int* data1, const long int* data2, const int size ); +template bool tnlArrayOperations< tnlCuda >::compareMemory< float, tnlHost, float, int >( const float* data1, const float* data2, const int size ); +template bool tnlArrayOperations< tnlCuda >::compareMemory< double, tnlHost, double, int >( const double* data1, const double* data2, const int size ); +template bool tnlArrayOperations< tnlCuda >::compareMemory< long double, tnlHost, long double, int >( const long double* data1, const long double* data2, const int size ); +template bool tnlArrayOperations< tnlCuda >::compareMemory< char, tnlHost, char, long int >( const char* data1, const char* data2, const long int size ); +template bool tnlArrayOperations< tnlCuda >::compareMemory< int, 
tnlHost, int, long int >( const int* data1, const int* data2, const long int size ); +template bool tnlArrayOperations< tnlCuda >::compareMemory< long int, tnlHost, long int, long int >( const long int* data1, const long int* data2, const long int size ); +template bool tnlArrayOperations< tnlCuda >::compareMemory< float, tnlHost, float, long int >( const float* data1, const float* data2, const long int size ); +template bool tnlArrayOperations< tnlCuda >::compareMemory< double, tnlHost, double, long int >( const double* data1, const double* data2, const long int size ); +template bool tnlArrayOperations< tnlCuda >::compareMemory< long double, tnlHost, long double, long int >( const long double* data1, const long double* data2, const long int size ); +template bool tnlArrayOperations< tnlCuda >::compareMemory< char, tnlCuda, char, int >( const char* data1, const char* data2, const int size ); +template bool tnlArrayOperations< tnlCuda >::compareMemory< int, tnlCuda, int, int >( const int* data1, const int* data2, const int size ); +template bool tnlArrayOperations< tnlCuda >::compareMemory< long int, tnlCuda, long int, int >( const long int* data1, const long int* data2, const int size ); +template bool tnlArrayOperations< tnlCuda >::compareMemory< float, tnlCuda, float, int >( const float* data1, const float* data2, const int size ); +template bool tnlArrayOperations< tnlCuda >::compareMemory< double, tnlCuda, double, int >( const double* data1, const double* data2, const int size ); +template bool tnlArrayOperations< tnlCuda >::compareMemory< long double, tnlCuda, long double, int >( const long double* data1, const long double* data2, const int size ); +template bool tnlArrayOperations< tnlCuda >::compareMemory< char, tnlCuda, char, long int >( const char* data1, const char* data2, const long int size ); +template bool tnlArrayOperations< tnlCuda >::compareMemory< int, tnlCuda, int, long int >( const int* data1, const int* data2, const long int size ); +template 
bool tnlArrayOperations< tnlCuda >::compareMemory< long int, tnlCuda, long int, long int >( const long int* data1, const long int* data2, const long int size ); +template bool tnlArrayOperations< tnlCuda >::compareMemory< float, tnlCuda, float, long int >( const float* data1, const float* data2, const long int size ); +template bool tnlArrayOperations< tnlCuda >::compareMemory< double, tnlCuda, double, long int >( const double* data1, const double* data2, const long int size ); +template bool tnlArrayOperations< tnlCuda >::compareMemory< long double, tnlCuda, long double, long int >( const long double* data1, const long double* data2, const long int size ); template bool tnlArrayOperations< tnlCuda >::setMemory< char, int >( char* destination, const char& value, const int size ); template bool tnlArrayOperations< tnlCuda >::setMemory< int, int >( int* destination, const int& value, const int size ); diff --git a/src/implementation/core/arrays/tnlArrayOperationsCuda_impl.h b/src/implementation/core/arrays/tnlArrayOperationsCuda_impl.h index f8ccb1c89b..aef5373a42 100644 --- a/src/implementation/core/arrays/tnlArrayOperationsCuda_impl.h +++ b/src/implementation/core/arrays/tnlArrayOperationsCuda_impl.h @@ -18,6 +18,7 @@ #ifndef TNLARRAYOPERATIONSCUDA_IMPL_H_ #define TNLARRAYOPERATIONSCUDA_IMPL_H_ +#include <core/mfuncs.h> template< typename Element, typename Index > bool tnlArrayOperations< tnlCuda > :: allocateMemory( Element*& data, @@ -50,14 +51,14 @@ template< typename Element > void tnlArrayOperations< tnlCuda > :: setMemoryElement( Element* data, const Element& value ) { - setMemoryCuda( data, value, 1, tnlCuda::getMaxGridSize() ); + tnlArrayOperations< tnlCuda >::setMemory( data, value, 1 ); } template< typename Element > Element tnlArrayOperations< tnlCuda > :: getMemoryElement( const Element* data ) { Element result; - copyMemoryCudaToHost( &result, data, 1 ); + tnlArrayOperations< tnlCuda >::copyMemory< Element, tnlHost, Element, int >( &result, data, 1 ); 
return result; } @@ -78,9 +79,9 @@ const Element& tnlArrayOperations< tnlCuda > :: getArrayElementReference(const E #ifdef HAVE_CUDA template< typename Element, typename Index > -__global__ void tnlArrayOperations< tnlCuda > :: setArrayValueCudaKernel( Element* data, - const Index size, - const Element value ) +__global__ void setArrayValueCudaKernel( Element* data, + const Index size, + const Element value ) { Index elementIdx = blockDim. x * blockIdx. x + threadIdx. x; const Index maxGridSize = blockDim. x * gridDim. x; @@ -180,29 +181,34 @@ bool tnlArrayOperations< tnlCuda > :: copyMemory( DestinationElement* destinatio { if( DestinationDevice :: getDevice() == tnlHostDevice ) { - SourceElement* buffer = new SourceElement[ tnlGPUvsCPUTransferBufferSize ]; - if( ! buffer ) - { - cerr << "Unable to allocate supporting buffer to transfer data between the CUDA device and the host." << endl; - return false; - } - Index i( 0 ); - while( i < size ) - { - if( ! copyMemoryCudaToHost( buffer, - &source[ i ], - Min( size - i, tnlGPUvsCPUTransferBufferSize ) ) ) + #ifdef HAVE_CUDA + SourceElement* buffer = new SourceElement[ tnlCuda::getGPUTransferBufferSize() ]; + if( ! buffer ) { - delete[] buffer; + cerr << "Unable to allocate supporting buffer to transfer data between the CUDA device and the host." 
<< endl; return false; } - Index j( 0 ); - while( j < tnlGPUvsCPUTransferBufferSize && i + j < size ) - destination[ i + j ] = buffer[ j++ ]; - i += j; - } - delete[] buffer; - return true; + Index i( 0 ); + while( i < size ) + { + if( cudaMemcpy( buffer, + &source[ i ], + Min( size - i, tnlCuda::getGPUTransferBufferSize() ), cudaMemcpyDeviceToHost ) != cudaSuccess ) + { + checkCudaDevice; + delete[] buffer; + return false; + } + Index j( 0 ); + while( j < tnlCuda::getGPUTransferBufferSize() && i + j < size ) + destination[ i + j ] = buffer[ j++ ]; + i += j; + } + delete[] buffer; + #else + cerr << "CUDA support is missing on this system " << __FILE__ << " line " << __LINE__ << "." << endl; + return false; + #endif } if( DestinationDevice::getDevice() == tnlCudaDevice ) { @@ -210,7 +216,7 @@ bool tnlArrayOperations< tnlCuda > :: copyMemory( DestinationElement* destinatio dim3 blockSize( 0 ), gridSize( 0 ); blockSize. x = 256; Index blocksNumber = ceil( ( double ) size / ( double ) blockSize. x ); - gridSize. x = Min( blocksNumber, maxGridSize ); + gridSize. x = Min( blocksNumber, tnlCuda::getMaxGridSize() ); copyMemoryCudaToCudaKernel<<< gridSize, blockSize >>>( destination, source, size ); return checkCudaDevice; #else @@ -218,7 +224,7 @@ bool tnlArrayOperations< tnlCuda > :: copyMemory( DestinationElement* destinatio return false; #endif } - + return true; } template< typename Element1, @@ -226,13 +232,13 @@ template< typename Element1, typename Element2, typename Index > bool tnlArrayOperations< tnlCuda > :: compareMemory( const Element1* destination, - const Element2* source, - const Index size ) + const Element2* source, + const Index size ) { if( DestinationDevice::getDevice() == tnlHostDevice ) { #ifdef HAVE_CUDA - Element2* host_buffer = new Element2[ tnlGPUvsCPUTransferBufferSize ]; + Element2* host_buffer = new Element2[ tnlCuda::getGPUTransferBufferSize() ]; if( ! 
host_buffer ) { cerr << "I am sorry but I cannot allocate supporting buffer on the host for comparing data between CUDA GPU and CPU." << endl; @@ -241,9 +247,9 @@ bool tnlArrayOperations< tnlCuda > :: compareMemory( const Element1* destination Index compared( 0 ); while( compared < size ) { - Index transfer = Min( size - compared, tnlGPUvsCPUTransferBufferSize ); + Index transfer = Min( size - compared, tnlCuda::getGPUTransferBufferSize() ); if( cudaMemcpy( ( void* ) host_buffer, - ( void* ) & ( deviceData[ compared ] ), + ( void* ) & ( source[ compared ] ), transfer * sizeof( Element2 ), cudaMemcpyDeviceToHost ) != cudaSuccess ) { @@ -254,7 +260,7 @@ bool tnlArrayOperations< tnlCuda > :: compareMemory( const Element1* destination } Index bufferIndex( 0 ); while( bufferIndex < transfer && - host_buffer[ bufferIndex ] == hostData[ compared ] ) + host_buffer[ bufferIndex ] == destination[ compared ] ) { bufferIndex ++; compared ++; diff --git a/src/implementation/core/arrays/tnlArrayOperationsHost_impl.h b/src/implementation/core/arrays/tnlArrayOperationsHost_impl.h index fd70f11443..a8a321d94e 100644 --- a/src/implementation/core/arrays/tnlArrayOperationsHost_impl.h +++ b/src/implementation/core/arrays/tnlArrayOperationsHost_impl.h @@ -18,6 +18,8 @@ #ifndef TNLARRAYOPERATIONSHOST_IMPL_H_ #define TNLARRAYOPERATIONSHOST_IMPL_H_ +#include <core/cuda/device-check.h> + template< typename Element, typename Index > bool tnlArrayOperations< tnlHost > :: allocateMemory( Element*& data, const Index size ) @@ -83,29 +85,33 @@ bool tnlArrayOperations< tnlHost > :: copyMemory( DestinationElement* destinatio destination[ i ] = ( DestinationElement) source[ i ]; if( DestinationDevice :: getDevice() == tnlCudaDevice ) { - DestinationElement* buffer = new DestinationElement[ tnlGPUvsCPUTransferBufferSize ]; - if( ! buffer ) - { - cerr << "Unable to allocate supporting buffer to transfer data between the CUDA device and the host." 
<< endl; - return false; - } - Index i( 0 ); - while( i < size ) - { - Index j( 0 ); - while( j < tnlGPUvsCPUTransferBufferSize && i + j < size ) - buffer[ j ] = source[ i + j++ ]; - if( ! copyMemoryHostToCuda( &destination[ i ], - buffer, - j ) ) + #ifdef HAVE_CUDA + DestinationElement* buffer = new DestinationElement[ tnlCuda::getGPUTransferBufferSize() ]; + if( ! buffer ) { - delete[] buffer; + cerr << "Unable to allocate supporting buffer to transfer data between the CUDA device and the host." << endl; return false; } - i += j; - } - delete[] buffer; - return true; + Index i( 0 ); + while( i < size ) + { + Index j( 0 ); + while( j < tnlCuda::getGPUTransferBufferSize() && i + j < size ) + buffer[ j ] = source[ i + j++ ]; + if( cudaMemcpy( &destination[ i ], buffer, j, cudaMemcpyHostToDevice ) != cudaSuccess ) + { + checkCudaDevice; + delete[] buffer; + return false; + } + i += j; + } + delete[] buffer; + return true; + #else + cerr << "The CUDA support is missing on this system." << endl; + return false; + #endif } return true; } diff --git a/src/implementation/core/cuda/cuda-reduction_impl.h b/src/implementation/core/cuda/cuda-reduction_impl.h index 6355790aab..d227c5bd30 100644 --- a/src/implementation/core/cuda/cuda-reduction_impl.h +++ b/src/implementation/core/cuda/cuda-reduction_impl.h @@ -24,7 +24,7 @@ #include <iostream> #include <core/tnlAssert.h> #include <core/cuda/reduction-operations.h> -#include <implementation/core/memory-operations.h> +#include <core/arrays/tnlArrayOperations.h> using namespace std; @@ -296,7 +296,7 @@ typename Operation :: IndexType reduceOnCudaDevice( const Operation& operation, gridSize. x = Min( ( IndexType ) ( size / blockSize. x + 1 ) / 2, desGridSize ); if( ! output && - ! allocateMemoryCuda( output, :: Max( ( IndexType ) 1, size / desBlockSize ) ) ) + ! tnlArrayOperations< tnlCuda >::allocateMemory( output, :: Max( ( IndexType ) 1, size / desBlockSize ) ) ) return false; IndexType shmem = blockSize. 
x * sizeof( ResultType ); @@ -372,9 +372,10 @@ bool reductionOnCudaDevice( const Operation& operation, RealType hostArray2[ maxGPUReductionDataSize ]; if( size <= maxGPUReductionDataSize ) { - if( ! copyMemoryCudaToHost( hostArray1, deviceInput1, size ) ) + if( ! tnlArrayOperations< tnlCuda >::copyMemory< RealType, tnlCuda, RealType, IndexType >( hostArray1, deviceInput1, size ) ) return false; - if( deviceInput2 && ! copyMemoryCudaToHost( hostArray2, deviceInput2, size ) ) + if( deviceInput2 && ! + tnlArrayOperations< tnlCuda >::copyMemory< RealType, tnlHost, RealType, IndexType >( hostArray2, deviceInput2, size ) ) return false; result = operation. initialValueOnHost( 0, hostArray1, hostArray2 ); for( IndexType i = 1; i < size; i ++ ) @@ -407,7 +408,7 @@ bool reductionOnCudaDevice( const Operation& operation, * Transfer the reduced data from device to host. */ ResultType resultArray[ maxGPUReductionDataSize ]; - if( ! copyMemoryCudaToHost( resultArray, deviceAux1, reducedSize ) ) + if( ! tnlArrayOperations< tnlCuda >::copyMemory< ResultType, tnlHost, ResultType, IndexType >( resultArray, deviceAux1, reducedSize ) ) return false; /*** @@ -422,9 +423,9 @@ bool reductionOnCudaDevice( const Operation& operation, /**** * Free the memory allocated on the device. */ - if( deviceAux1 && ! freeMemoryCuda( deviceAux1 ) ) + if( deviceAux1 && ! tnlArrayOperations< tnlCuda >::freeMemory( deviceAux1 ) ) return false; - if( deviceAux2 && ! freeMemoryCuda( deviceAux2 ) ) + if( deviceAux2 && ! 
tnlArrayOperations< tnlCuda >::freeMemory( deviceAux2 ) ) return false; return true; #else diff --git a/src/implementation/core/tnlCuda_impl.h b/src/implementation/core/tnlCuda_impl.h index b2c7eb584a..33b7dce9c9 100644 --- a/src/implementation/core/tnlCuda_impl.h +++ b/src/implementation/core/tnlCuda_impl.h @@ -48,5 +48,10 @@ inline void tnlCuda :: setMaxBlockSize( int newMaxBlockSize ) maxBlockSize = newMaxBlockSize; } +inline int tnlCuda::getGPUTransferBufferSize() +{ + return 1 << 20; +} + #endif /* TNLCUDA_IMPL_H_ */ diff --git a/tests/unit-tests/CMakeLists.txt b/tests/unit-tests/CMakeLists.txt index c99208f449..2e21edbe24 100755 --- a/tests/unit-tests/CMakeLists.txt +++ b/tests/unit-tests/CMakeLists.txt @@ -41,11 +41,11 @@ if( BUILD_CUDA ) SET_TESTS_PROPERTIES ( core/cuda/tnl-reduction-test${mpiExt}${debugExt} PROPERTIES DEPENDS core/cuda/tnl-device-check-test${mpiExt}${debugExt} ) ADD_TEST( core/arrays/tnlArrayOperationsTest-cuda${mpiExt}${debugExt} ${EXECUTABLE_OUTPUT_PATH}/tnlArrayOperationsTest-cuda${mpiExt}${debugExt} ) - SET_TESTS_PROPERTIES ( core/cuda/tnlArrayOperationsTest-cuda${mpiExt}${debugExt} PROPERTIES DEPENDS core/cuda/tnl-reduction-test${mpiExt}${debugExt} ) + SET_TESTS_PROPERTIES ( core/arrays/tnlArrayOperationsTest-cuda${mpiExt}${debugExt} PROPERTIES DEPENDS core/cuda/tnl-reduction-test${mpiExt}${debugExt} ) ADD_TEST( core/vectors/tnlCudaVectorOperationsTest${mpiExt}${debugExt} ${EXECUTABLE_OUTPUT_PATH}/tnlCudaVectorOperationsTest${mpiExt}${debugExt} ) SET_TESTS_PROPERTIES ( core/vectors/tnlCudaVectorOperationsTest${mpiExt}${debugExt} PROPERTIES DEPENDS core/cuda/tnlArrayOperationsTest-cuda${mpiExt}${debugExt} ) endif() ADD_TEST( tnl-unit-tests${mpiExt}${debugExt} ${EXECUTABLE_OUTPUT_PATH}/tnl-unit-tests${mpiExt}${debugExt} ) - \ No newline at end of file + diff --git a/tests/unit-tests/core/arrays/tnlArrayOperationsTester.h b/tests/unit-tests/core/arrays/tnlArrayOperationsTester.h index 737bbbbcfd..beb2d56194 100644 --- 
a/tests/unit-tests/core/arrays/tnlArrayOperationsTester.h +++ b/tests/unit-tests/core/arrays/tnlArrayOperationsTester.h @@ -28,6 +28,7 @@ #include <cppunit/Message.h> #include <core/arrays/tnlArrayOperations.h> +#include <core/cuda/device-check.h> template< typename Element, typename Device > class tnlArrayOperationsTester{}; @@ -81,10 +82,10 @@ class tnlArrayOperationsTester< Element, tnlHost > : public CppUnit :: TestCase void allocationTest() { Element* data; - tnlArrayOperations< tnlHost > :: allocateMemory( data, getTestSize() ); + tnlArrayOperations< tnlHost >::allocateMemory( data, getTestSize() ); CPPUNIT_ASSERT( data != 0 ); - freeMemoryCuda( data ); + tnlArrayOperations< tnlHost >::freeMemory( data ); }; void memorySetTest() @@ -220,10 +221,10 @@ class tnlArrayOperationsTester< Element, tnlCuda > : public CppUnit :: TestCase void allocationTest() { int* data; - allocateMemoryCuda( data, getTestSize() ); + tnlArrayOperations< tnlCuda >::allocateMemory( data, getTestSize() ); CPPUNIT_ASSERT( checkCudaDevice ); - freeMemoryCuda( data ); + tnlArrayOperations< tnlCuda >::freeMemory( data ); CPPUNIT_ASSERT( checkCudaDevice ); }; @@ -231,37 +232,37 @@ class tnlArrayOperationsTester< Element, tnlCuda > : public CppUnit :: TestCase { const int size = 1024; int *hostData, *deviceData; - allocateMemoryHost( hostData, size ); - allocateMemoryCuda( deviceData, size ); - setMemoryHost( hostData, 0, size ); - setMemoryCuda( deviceData, 13, size, maxCudaGridSize ); + tnlArrayOperations< tnlHost >::allocateMemory( hostData, size ); + tnlArrayOperations< tnlCuda >::allocateMemory( deviceData, size ); + tnlArrayOperations< tnlHost >::setMemory( hostData, 0, size ); + tnlArrayOperations< tnlCuda >::setMemory( deviceData, 13, size ); CPPUNIT_ASSERT( checkCudaDevice ); - copyMemoryCudaToHost( hostData, deviceData, size ); + tnlArrayOperations< tnlCuda >::copyMemory< int, tnlHost, int, int >( hostData, deviceData, size ); CPPUNIT_ASSERT( checkCudaDevice ); for( int i = 0; i < 
size; i ++ ) CPPUNIT_ASSERT( hostData[ i ] == 13 ); - freeMemoryHost( hostData ); - freeMemoryCuda( deviceData ); + tnlArrayOperations< tnlCuda >::freeMemory( hostData ); + tnlArrayOperations< tnlCuda >::freeMemory( deviceData ); }; void bigMemorySetTest() { const int size( getTestSize() ); int *hostData, *deviceData; - allocateMemoryHost( hostData, size ); - allocateMemoryCuda( deviceData, size ); - setMemoryHost( hostData, 0, size ); - setMemoryCuda( deviceData, 13, size, maxCudaGridSize ); + tnlArrayOperations< tnlHost >::allocateMemory( hostData, size ); + tnlArrayOperations< tnlCuda >::allocateMemory( deviceData, size ); + tnlArrayOperations< tnlHost >::setMemory( hostData, 0, size ); + tnlArrayOperations< tnlCuda >::setMemory( deviceData, 13, size ); CPPUNIT_ASSERT( checkCudaDevice ); - copyMemoryCudaToHost( hostData, deviceData, size ); + tnlArrayOperations< tnlCuda >::copyMemory< int, tnlHost, int, int >( hostData, deviceData, size ); CPPUNIT_ASSERT( checkCudaDevice ); for( int i = 0; i < size; i += 100 ) { if( hostData[ i ] != 13 ) CPPUNIT_ASSERT( hostData[ i ] == 13 ); } - freeMemoryHost( hostData ); - freeMemoryCuda( deviceData ); + tnlArrayOperations< tnlHost >::freeMemory( hostData ); + tnlArrayOperations< tnlCuda >::freeMemory( deviceData ); }; void copyMemoryTest() @@ -269,16 +270,16 @@ class tnlArrayOperationsTester< Element, tnlCuda > : public CppUnit :: TestCase const int size = getTestSize(); int *hostData1, *hostData2, *deviceData; - allocateMemoryHost( hostData1, size ); - allocateMemoryHost( hostData2, size ); - allocateMemoryCuda( deviceData, size ); - setMemoryHost( hostData1, 13, size ); - copyMemoryHostToCuda( deviceData, hostData1, size ); - copyMemoryCudaToHost( hostData2, deviceData, size ); - CPPUNIT_ASSERT( compareMemoryHost( hostData1, hostData2, size) ); - freeMemoryHost( hostData1 ); - freeMemoryHost( hostData2 ); - freeMemoryCuda( deviceData ); + tnlArrayOperations< tnlHost >::allocateMemory( hostData1, size ); + 
tnlArrayOperations< tnlHost >::allocateMemory( hostData2, size ); + tnlArrayOperations< tnlCuda >::allocateMemory( deviceData, size ); + tnlArrayOperations< tnlHost >::setMemory( hostData1, 13, size ); + tnlArrayOperations< tnlHost >::copyMemory< int, tnlCuda, int, int >( deviceData, hostData1, size ); + tnlArrayOperations< tnlCuda >::copyMemory< int, tnlHost, int, int >( hostData2, deviceData, size ); + CPPUNIT_ASSERT( ( tnlArrayOperations< tnlHost >::compareMemory< int, tnlHost, int, int >( hostData1, hostData2, size) ) ); + tnlArrayOperations< tnlHost >::freeMemory( hostData1 ); + tnlArrayOperations< tnlHost >::freeMemory( hostData2 ); + tnlArrayOperations< tnlCuda >::freeMemory( deviceData ); }; void copyMemoryWithConversionHostToCudaTest() @@ -286,17 +287,17 @@ class tnlArrayOperationsTester< Element, tnlCuda > : public CppUnit :: TestCase const int size = getTestSize(); int *hostData1; float *hostData2, *deviceData; - allocateMemoryHost( hostData1, size ); - allocateMemoryHost( hostData2, size ); - allocateMemoryCuda( deviceData, size ); - setMemoryHost( hostData1, 13, size ); - copyMemoryHostToCuda( deviceData, hostData1, size ); - copyMemoryCudaToHost( hostData2, deviceData, size ); + tnlArrayOperations< tnlHost >::allocateMemory( hostData1, size ); + tnlArrayOperations< tnlHost >::allocateMemory( hostData2, size ); + tnlArrayOperations< tnlCuda >::allocateMemory( deviceData, size ); + tnlArrayOperations< tnlHost >::setMemory( hostData1, 13, size ); + tnlArrayOperations< tnlHost >::copyMemory< float, tnlCuda, int, int >( deviceData, hostData1, size ); + tnlArrayOperations< tnlCuda >::copyMemory< float, tnlHost, float, int >( hostData2, deviceData, size ); for( int i = 0; i < size; i ++ ) CPPUNIT_ASSERT( hostData1[ i ] == hostData2[ i ] ); - freeMemoryHost( hostData1 ); - freeMemoryHost( hostData2 ); - freeMemoryCuda( deviceData ); + tnlArrayOperations< tnlHost >::freeMemory( hostData1 ); + tnlArrayOperations< tnlHost >::freeMemory( hostData2 ); + 
tnlArrayOperations< tnlCuda >::freeMemory( deviceData ); }; void copyMemoryWithConversionCudaToHostTest() @@ -304,17 +305,17 @@ class tnlArrayOperationsTester< Element, tnlCuda > : public CppUnit :: TestCase const int size = getTestSize(); int *hostData1, *deviceData; float *hostData2; - allocateMemoryHost( hostData1, size ); - allocateMemoryHost( hostData2, size ); - allocateMemoryCuda( deviceData, size ); - setMemoryHost( hostData1, 13, size ); - copyMemoryHostToCuda( deviceData, hostData1, size ); - copyMemoryCudaToHost( hostData2, deviceData, size ); + tnlArrayOperations< tnlHost >::allocateMemory( hostData1, size ); + tnlArrayOperations< tnlHost >::allocateMemory( hostData2, size ); + tnlArrayOperations< tnlCuda >::allocateMemory( deviceData, size ); + tnlArrayOperations< tnlHost >::setMemory( hostData1, 13, size ); + tnlArrayOperations< tnlHost >::copyMemory< int, tnlCuda, int, int >( deviceData, hostData1, size ); + tnlArrayOperations< tnlCuda >::copyMemory< float, tnlHost, int, int >( hostData2, deviceData, size ); for( int i = 0; i < size; i ++ ) CPPUNIT_ASSERT( hostData1[ i ] == hostData2[ i ] ); - freeMemoryHost( hostData1 ); - freeMemoryHost( hostData2 ); - freeMemoryCuda( deviceData ); + tnlArrayOperations< tnlHost >::freeMemory( hostData1 ); + tnlArrayOperations< tnlHost >::freeMemory( hostData2 ); + tnlArrayOperations< tnlCuda >::freeMemory( deviceData ); }; void copyMemoryWithConversionCudaToCudaTest() @@ -322,33 +323,33 @@ class tnlArrayOperationsTester< Element, tnlCuda > : public CppUnit :: TestCase const int size = getTestSize(); int *hostData1, *deviceData1; float *hostData2, *deviceData2; - allocateMemoryHost( hostData1, size ); - allocateMemoryHost( hostData2, size ); - allocateMemoryCuda( deviceData1, size ); - allocateMemoryCuda( deviceData2, size ); - setMemoryHost( hostData1, 13, size ); - copyMemoryHostToCuda( deviceData1, hostData1, size ); - copyMemoryCudaToCuda( deviceData2, deviceData1, size, maxCudaGridSize ); - 
copyMemoryCudaToHost( hostData2, deviceData2, size ); + tnlArrayOperations< tnlHost >::allocateMemory( hostData1, size ); + tnlArrayOperations< tnlHost >::allocateMemory( hostData2, size ); + tnlArrayOperations< tnlCuda >::allocateMemory( deviceData1, size ); + tnlArrayOperations< tnlCuda >::allocateMemory( deviceData2, size ); + tnlArrayOperations< tnlHost >::setMemory( hostData1, 13, size ); + tnlArrayOperations< tnlHost >::copyMemory< int, tnlCuda, int, int >( deviceData1, hostData1, size ); + tnlArrayOperations< tnlCuda >::copyMemory< float, tnlCuda, int, int >( deviceData2, deviceData1, size ); + tnlArrayOperations< tnlCuda >::copyMemory< float, tnlHost, float, int >( hostData2, deviceData2, size ); for( int i = 0; i < size; i ++ ) CPPUNIT_ASSERT( hostData1[ i ] == hostData2[ i ] ); - freeMemoryHost( hostData1 ); - freeMemoryHost( hostData2 ); - freeMemoryCuda( deviceData1 ); - freeMemoryCuda( deviceData2 ); + tnlArrayOperations< tnlHost >::freeMemory( hostData1 ); + tnlArrayOperations< tnlHost >::freeMemory( hostData2 ); + tnlArrayOperations< tnlCuda >::freeMemory( deviceData1 ); + tnlArrayOperations< tnlCuda >::freeMemory( deviceData2 ); }; void compareMemoryHostCudaTest() { const int size = getTestSize(); int *hostData, *deviceData; - allocateMemoryHost( hostData, size ); - allocateMemoryCuda( deviceData, size ); - setMemoryHost( hostData, 7, size ); - setMemoryCuda( deviceData, 8, size, maxCudaGridSize ); - CPPUNIT_ASSERT( ! compareMemoryHostCuda( hostData, deviceData, size ) ); - setMemoryCuda( deviceData, 7, size, maxCudaGridSize ); - CPPUNIT_ASSERT( compareMemoryHostCuda( hostData, deviceData, size ) ); + tnlArrayOperations< tnlHost >::allocateMemory( hostData, size ); + tnlArrayOperations< tnlCuda >::allocateMemory( deviceData, size ); + tnlArrayOperations< tnlHost >::setMemory( hostData, 7, size ); + tnlArrayOperations< tnlCuda >::setMemory( deviceData, 8, size ); + CPPUNIT_ASSERT( ( ! 
tnlArrayOperations< tnlCuda >::compareMemory< int, tnlHost, int, int >( hostData, deviceData, size ) ) ); + tnlArrayOperations< tnlCuda >::setMemory( deviceData, 7, size ); + CPPUNIT_ASSERT( ( tnlArrayOperations< tnlCuda >::compareMemory< int, tnlHost, int, int >( hostData, deviceData, size ) ) ); }; void compareMemoryWithConversionHostCudaTest() @@ -356,13 +357,13 @@ class tnlArrayOperationsTester< Element, tnlCuda > : public CppUnit :: TestCase const int size = getTestSize(); int *hostData; float *deviceData; - allocateMemoryHost( hostData, size ); - allocateMemoryCuda( deviceData, size ); - setMemoryHost( hostData, 7, size ); - setMemoryCuda( deviceData, ( float ) 8.0, size, maxCudaGridSize ); - CPPUNIT_ASSERT( ! compareMemoryHostCuda( hostData, deviceData, size ) ); - setMemoryCuda( deviceData, ( float ) 7.0, size, maxCudaGridSize ); - CPPUNIT_ASSERT( compareMemoryHostCuda( hostData, deviceData, size ) ); + tnlArrayOperations< tnlHost >::allocateMemory( hostData, size ); + tnlArrayOperations< tnlCuda >::allocateMemory( deviceData, size ); + tnlArrayOperations< tnlHost >::setMemory( hostData, 7, size ); + tnlArrayOperations< tnlCuda >::setMemory( deviceData, ( float ) 8.0, size ); + CPPUNIT_ASSERT( ( ! 
tnlArrayOperations< tnlCuda >::compareMemory< int, tnlHost, float, int >( hostData, deviceData, size ) ) ); + tnlArrayOperations< tnlCuda >::setMemory( deviceData, ( float ) 7.0, size ); + CPPUNIT_ASSERT( ( tnlArrayOperations< tnlCuda >::compareMemory< int, tnlHost, float, int >( hostData, deviceData, size ) ) ); }; }; diff --git a/tests/unit-tests/core/cuda/CMakeLists.txt b/tests/unit-tests/core/cuda/CMakeLists.txt index 3355318a36..36154e7580 100755 --- a/tests/unit-tests/core/cuda/CMakeLists.txt +++ b/tests/unit-tests/core/cuda/CMakeLists.txt @@ -1,5 +1,4 @@ set( headers tnlCudaDeviceCheckTester.h - tnlCudaMemoryOperationsTester.h tnlCudaReductionTester.h ) if( BUILD_CUDA ) @@ -7,10 +6,6 @@ if( BUILD_CUDA ) TARGET_LINK_LIBRARIES( tnl-device-check-test${mpiExt}${debugExt} ${CPPUNIT_LIBRARIES} tnl${mpiExt}${debugExt}-0.1 ) - CUDA_ADD_EXECUTABLE( tnl-memory-operations-test${mpiExt}${debugExt} ${headers} memory-operations-test.cu ) - TARGET_LINK_LIBRARIES( tnl-memory-operations-test${mpiExt}${debugExt} ${CPPUNIT_LIBRARIES} - tnl${mpiExt}${debugExt}-0.1 ) - CUDA_ADD_EXECUTABLE( tnl-reduction-test${mpiExt}${debugExt} ${headers} reduction-test.cu ) TARGET_LINK_LIBRARIES( tnl-reduction-test${mpiExt}${debugExt} ${CPPUNIT_LIBRARIES} tnl${mpiExt}${debugExt}-0.1 ) @@ -19,4 +14,4 @@ if( BUILD_CUDA ) endif() - \ No newline at end of file + diff --git a/tests/unit-tests/core/cuda/memory-operations-test.cu b/tests/unit-tests/core/cuda/memory-operations-test.cu deleted file mode 100644 index 8fe5f88f20..0000000000 --- a/tests/unit-tests/core/cuda/memory-operations-test.cu +++ /dev/null @@ -1,26 +0,0 @@ -/*************************************************************************** - memory-operations-test.cu - description - ------------------- - begin : Mar 20, 2013 - copyright : (C) 2013 by Tomas Oberhuber - email : tomas.oberhuber@fjfi.cvut.cz - ***************************************************************************/ - 
-/*************************************************************************** - * * - * This program is free software; you can redistribute it and/or modify * - * it under the terms of the GNU General Public License as published by * - * the Free Software Foundation; either version 2 of the License, or * - * (at your option) any later version. * - * * - ***************************************************************************/ - -#include "tnlCudaMemoryOperationsTester.h" -#include "../../tnlUnitTestStarter.h" - -int main( int argc, char* argv[] ) -{ - if( ! tnlUnitTestStarter :: run< tnlCudaMemoryOperationsTester >() ) - return EXIT_FAILURE; - return EXIT_SUCCESS; -} \ No newline at end of file diff --git a/tests/unit-tests/core/cuda/tnlCudaMemoryOperationsTester.h b/tests/unit-tests/core/cuda/tnlCudaMemoryOperationsTester.h deleted file mode 100644 index f9e98d72d9..0000000000 --- a/tests/unit-tests/core/cuda/tnlCudaMemoryOperationsTester.h +++ /dev/null @@ -1,248 +0,0 @@ -/*************************************************************************** - tnlCudaMemoryOperationsTester.h - description - ------------------- - begin : Mar 20, 2013 - copyright : (C) 2013 by Tomas Oberhuber - email : tomas.oberhuber@fjfi.cvut.cz - ***************************************************************************/ - -/*************************************************************************** - * * - * This program is free software; you can redistribute it and/or modify * - * it under the terms of the GNU General Public License as published by * - * the Free Software Foundation; either version 2 of the License, or * - * (at your option) any later version. 
* - * * - ***************************************************************************/ - -#ifndef TNLCUDAMEMORYOPERATIONSTESTER_H_ -#define TNLCUDAMEMORYOPERATIONSTESTER_H_ - -#include <tnlConfig.h> - -#ifdef HAVE_CPPUNIT -#include <cppunit/TestSuite.h> -#include <cppunit/TestResult.h> -#include <cppunit/TestCaller.h> -#include <cppunit/TestCase.h> -#include <cppunit/Message.h> -#include <core/cuda/device-check.h> -#include <implementation/core/memory-operations.h> - -class tnlCudaMemoryOperationsTester : public CppUnit :: TestCase -{ - public: - tnlCudaMemoryOperationsTester(){}; - - virtual - ~tnlCudaMemoryOperationsTester(){}; - - static CppUnit :: Test* suite() - { - CppUnit :: TestSuite* suiteOfTests = new CppUnit :: TestSuite( "tnlCudaMemoryOperationsTester" ); - CppUnit :: TestResult result; - - suiteOfTests -> addTest( new CppUnit :: TestCaller< tnlCudaMemoryOperationsTester >( - "allocationTest", - &tnlCudaMemoryOperationsTester :: allocationTest ) - ); - suiteOfTests -> addTest( new CppUnit :: TestCaller< tnlCudaMemoryOperationsTester >( - "smallMemorySetTest", - &tnlCudaMemoryOperationsTester :: smallMemorySetTest ) - ); - suiteOfTests -> addTest( new CppUnit :: TestCaller< tnlCudaMemoryOperationsTester >( - "bigMemorySetTest", - &tnlCudaMemoryOperationsTester :: bigMemorySetTest ) - ); - suiteOfTests -> addTest( new CppUnit :: TestCaller< tnlCudaMemoryOperationsTester >( - "copyMemoryTest", - &tnlCudaMemoryOperationsTester :: copyMemoryTest ) - ); - suiteOfTests -> addTest( new CppUnit :: TestCaller< tnlCudaMemoryOperationsTester >( - "copyMemoryWithConversionHostToCudaTest", - &tnlCudaMemoryOperationsTester :: copyMemoryWithConversionHostToCudaTest ) - ); - suiteOfTests -> addTest( new CppUnit :: TestCaller< tnlCudaMemoryOperationsTester >( - "copyMemoryWithConversionCudaToHostTest", - &tnlCudaMemoryOperationsTester :: copyMemoryWithConversionCudaToHostTest ) - ); - suiteOfTests -> addTest( new CppUnit :: TestCaller< tnlCudaMemoryOperationsTester >( - 
"copyMemoryWithConversionCudaToCudaTest", - &tnlCudaMemoryOperationsTester :: copyMemoryWithConversionCudaToCudaTest ) - ); - suiteOfTests -> addTest( new CppUnit :: TestCaller< tnlCudaMemoryOperationsTester >( - "compareMemoryHostCudaTest", - &tnlCudaMemoryOperationsTester :: compareMemoryHostCudaTest ) - ); - suiteOfTests -> addTest( new CppUnit :: TestCaller< tnlCudaMemoryOperationsTester >( - "compareMemoryWithConevrsionHostCudaTest", - &tnlCudaMemoryOperationsTester :: compareMemoryWithConversionHostCudaTest ) - ); - - return suiteOfTests; - }; - - int getTestSize() - { - const int cudaGridSize = 256; - return 1.5 * cudaGridSize * maxCudaBlockSize; - //return 1 << 22; - }; - - void allocationTest() - { - int* data; - allocateMemoryCuda( data, getTestSize() ); - CPPUNIT_ASSERT( checkCudaDevice ); - - freeMemoryCuda( data ); - CPPUNIT_ASSERT( checkCudaDevice ); - }; - - void smallMemorySetTest() - { - const int size = 1024; - int *hostData, *deviceData; - allocateMemoryHost( hostData, size ); - allocateMemoryCuda( deviceData, size ); - setMemoryHost( hostData, 0, size ); - setMemoryCuda( deviceData, 13, size, maxCudaGridSize ); - CPPUNIT_ASSERT( checkCudaDevice ); - copyMemoryCudaToHost( hostData, deviceData, size ); - CPPUNIT_ASSERT( checkCudaDevice ); - for( int i = 0; i < size; i ++ ) - CPPUNIT_ASSERT( hostData[ i ] == 13 ); - freeMemoryHost( hostData ); - freeMemoryCuda( deviceData ); - }; - - void bigMemorySetTest() - { - const int size( getTestSize() ); - int *hostData, *deviceData; - allocateMemoryHost( hostData, size ); - allocateMemoryCuda( deviceData, size ); - setMemoryHost( hostData, 0, size ); - setMemoryCuda( deviceData, 13, size, maxCudaGridSize ); - CPPUNIT_ASSERT( checkCudaDevice ); - copyMemoryCudaToHost( hostData, deviceData, size ); - CPPUNIT_ASSERT( checkCudaDevice ); - for( int i = 0; i < size; i += 100 ) - { - if( hostData[ i ] != 13 ) - CPPUNIT_ASSERT( hostData[ i ] == 13 ); - } - freeMemoryHost( hostData ); - freeMemoryCuda( deviceData 
); - }; - - void copyMemoryTest() - { - const int size = getTestSize(); - - int *hostData1, *hostData2, *deviceData; - allocateMemoryHost( hostData1, size ); - allocateMemoryHost( hostData2, size ); - allocateMemoryCuda( deviceData, size ); - setMemoryHost( hostData1, 13, size ); - copyMemoryHostToCuda( deviceData, hostData1, size ); - copyMemoryCudaToHost( hostData2, deviceData, size ); - CPPUNIT_ASSERT( compareMemoryHost( hostData1, hostData2, size) ); - freeMemoryHost( hostData1 ); - freeMemoryHost( hostData2 ); - freeMemoryCuda( deviceData ); - }; - - void copyMemoryWithConversionHostToCudaTest() - { - const int size = getTestSize(); - int *hostData1; - float *hostData2, *deviceData; - allocateMemoryHost( hostData1, size ); - allocateMemoryHost( hostData2, size ); - allocateMemoryCuda( deviceData, size ); - setMemoryHost( hostData1, 13, size ); - copyMemoryHostToCuda( deviceData, hostData1, size ); - copyMemoryCudaToHost( hostData2, deviceData, size ); - for( int i = 0; i < size; i ++ ) - CPPUNIT_ASSERT( hostData1[ i ] == hostData2[ i ] ); - freeMemoryHost( hostData1 ); - freeMemoryHost( hostData2 ); - freeMemoryCuda( deviceData ); - }; - - void copyMemoryWithConversionCudaToHostTest() - { - const int size = getTestSize(); - int *hostData1, *deviceData; - float *hostData2; - allocateMemoryHost( hostData1, size ); - allocateMemoryHost( hostData2, size ); - allocateMemoryCuda( deviceData, size ); - setMemoryHost( hostData1, 13, size ); - copyMemoryHostToCuda( deviceData, hostData1, size ); - copyMemoryCudaToHost( hostData2, deviceData, size ); - for( int i = 0; i < size; i ++ ) - CPPUNIT_ASSERT( hostData1[ i ] == hostData2[ i ] ); - freeMemoryHost( hostData1 ); - freeMemoryHost( hostData2 ); - freeMemoryCuda( deviceData ); - }; - - void copyMemoryWithConversionCudaToCudaTest() - { - const int size = getTestSize(); - int *hostData1, *deviceData1; - float *hostData2, *deviceData2; - allocateMemoryHost( hostData1, size ); - allocateMemoryHost( hostData2, size ); - 
allocateMemoryCuda( deviceData1, size ); - allocateMemoryCuda( deviceData2, size ); - setMemoryHost( hostData1, 13, size ); - copyMemoryHostToCuda( deviceData1, hostData1, size ); - copyMemoryCudaToCuda( deviceData2, deviceData1, size, maxCudaGridSize ); - copyMemoryCudaToHost( hostData2, deviceData2, size ); - for( int i = 0; i < size; i ++ ) - CPPUNIT_ASSERT( hostData1[ i ] == hostData2[ i ] ); - freeMemoryHost( hostData1 ); - freeMemoryHost( hostData2 ); - freeMemoryCuda( deviceData1 ); - freeMemoryCuda( deviceData2 ); - }; - - void compareMemoryHostCudaTest() - { - const int size = getTestSize(); - int *hostData, *deviceData; - allocateMemoryHost( hostData, size ); - allocateMemoryCuda( deviceData, size ); - setMemoryHost( hostData, 7, size ); - setMemoryCuda( deviceData, 8, size, maxCudaGridSize ); - CPPUNIT_ASSERT( ! compareMemoryHostCuda( hostData, deviceData, size ) ); - setMemoryCuda( deviceData, 7, size, maxCudaGridSize ); - CPPUNIT_ASSERT( compareMemoryHostCuda( hostData, deviceData, size ) ); - }; - - void compareMemoryWithConversionHostCudaTest() - { - const int size = getTestSize(); - int *hostData; - float *deviceData; - allocateMemoryHost( hostData, size ); - allocateMemoryCuda( deviceData, size ); - setMemoryHost( hostData, 7, size ); - setMemoryCuda( deviceData, ( float ) 8.0, size, maxCudaGridSize ); - CPPUNIT_ASSERT( ! 
compareMemoryHostCuda( hostData, deviceData, size ) ); - setMemoryCuda( deviceData, ( float ) 7.0, size, maxCudaGridSize ); - CPPUNIT_ASSERT( compareMemoryHostCuda( hostData, deviceData, size ) ); - }; - - -}; - -#else -class tnlCudaMemoryOperationsTester -{}; -#endif /* HAVE_CPPUNIT */ - -#endif /* TNLCUDAMEMORYOPERATIONSTESTER_H_ */ diff --git a/tests/unit-tests/core/cuda/tnlCudaReductionTester.h b/tests/unit-tests/core/cuda/tnlCudaReductionTester.h index 9324d9dedf..9ac39964ad 100644 --- a/tests/unit-tests/core/cuda/tnlCudaReductionTester.h +++ b/tests/unit-tests/core/cuda/tnlCudaReductionTester.h @@ -99,7 +99,7 @@ class tnlCudaReductionTester : public CppUnit :: TestCase { for( int i = 0; i < size; i ++ ) hostData[ i ] = value; - copyMemoryHostToCuda( deviceData, hostData, size ); + tnlArrayOperations< tnlHost >::copyMemory< RealType, tnlCuda, RealType, int >( deviceData, hostData, size ); CPPUNIT_ASSERT( checkCudaDevice ); } @@ -108,8 +108,8 @@ class tnlCudaReductionTester : public CppUnit :: TestCase { const int shortSequence( 128 ); RealType *hostData, *deviceData; - allocateMemoryHost( hostData, shortSequence ); - allocateMemoryCuda( deviceData, shortSequence ); + tnlArrayOperations< tnlHost >::allocateMemory( hostData, shortSequence ); + tnlArrayOperations< tnlCuda >::allocateMemory( deviceData, shortSequence ); CPPUNIT_ASSERT( checkCudaDevice ); RealType result; @@ -152,18 +152,18 @@ class tnlCudaReductionTester : public CppUnit :: TestCase CPPUNIT_ASSERT( result == shortSequence ); - freeMemoryHost( hostData ); - freeMemoryCuda( deviceData ); + tnlArrayOperations< tnlHost >::freeMemory( hostData ); + tnlArrayOperations< tnlCuda >::freeMemory( deviceData ); CPPUNIT_ASSERT( checkCudaDevice ); - }; + } template< typename RealType > void longConstantSequenceTest() { const int longSequence( 172892 ); RealType *hostData, *deviceData; - allocateMemoryHost( hostData, longSequence ); - allocateMemoryCuda( deviceData, longSequence ); + tnlArrayOperations< tnlHost 
>::allocateMemory( hostData, longSequence ); + tnlArrayOperations< tnlCuda >::allocateMemory( deviceData, longSequence ); CPPUNIT_ASSERT( checkCudaDevice ); RealType result; @@ -240,18 +240,18 @@ class tnlCudaReductionTester : public CppUnit :: TestCase CPPUNIT_ASSERT( result == 8 * longSequence ); - freeMemoryHost( hostData ); - freeMemoryCuda( deviceData ); + tnlArrayOperations< tnlHost >::freeMemory( hostData ); + tnlArrayOperations< tnlCuda >::freeMemory( deviceData ); CPPUNIT_ASSERT( checkCudaDevice ); - }; + } template< typename RealType > void linearSequenceTest() { const int size( 10245 ); RealType *hostData, *deviceData; - allocateMemoryHost( hostData, size ); - allocateMemoryCuda( deviceData, size ); + tnlArrayOperations< tnlHost >::allocateMemory( hostData, size ); + tnlArrayOperations< tnlCuda >::allocateMemory( deviceData, size ); CPPUNIT_ASSERT( checkCudaDevice ); RealType sum( 0.0 ); @@ -260,7 +260,7 @@ class tnlCudaReductionTester : public CppUnit :: TestCase hostData[ i ] = -i - 1; sum += hostData[ i ]; } - copyMemoryHostToCuda( deviceData, hostData, size ); + tnlArrayOperations< tnlHost >::copyMemory< RealType, tnlCuda, RealType, int >( deviceData, hostData, size ); CPPUNIT_ASSERT( checkCudaDevice ); tnlParallelReductionSum< RealType, int > sumOperation; RealType result; @@ -292,18 +292,18 @@ ( reductionOnCudaDevice( absMaxOperation, size, deviceData, ( RealType* ) 0, result ) ) ); CPPUNIT_ASSERT( result == size ); - freeMemoryHost( hostData ); - freeMemoryCuda( deviceData ); + tnlArrayOperations< tnlHost >::freeMemory( hostData ); + tnlArrayOperations< tnlCuda >::freeMemory( deviceData ); CPPUNIT_ASSERT( checkCudaDevice ); - }; + } template< typename Type > void shortLogicalOperationsTest() { int size( 125 ); Type *hostData, *deviceData; - allocateMemoryHost( hostData, size ); - allocateMemoryCuda( deviceData, size ); + tnlArrayOperations< tnlHost >::allocateMemory( hostData, size ); + 
tnlArrayOperations< tnlCuda >::allocateMemory( deviceData, size ); CPPUNIT_ASSERT( checkCudaDevice ); for( int i = 0; i < size; i ++ ) @@ -503,7 +503,7 @@ class tnlCudaReductionTester : public CppUnit :: TestCase CPPUNIT_ASSERT( ( reductionOnCudaDevice( inequalityOperation, size, deviceData1, deviceData2, result ) ) ); CPPUNIT_ASSERT( result == true ); - }; + } template< typename Type > void shortSdotTest() @@ -536,7 +536,7 @@ class tnlCudaReductionTester : public CppUnit :: TestCase CPPUNIT_ASSERT( ( reductionOnCudaDevice( sdotOperation, size, deviceData1, deviceData2, result ) ) ); CPPUNIT_ASSERT( result == sdot ); - }; + } template< typename Type > @@ -570,7 +570,7 @@ class tnlCudaReductionTester : public CppUnit :: TestCase CPPUNIT_ASSERT( ( reductionOnCudaDevice( sdotOperation, size, deviceData1, deviceData2, result ) ) ); CPPUNIT_ASSERT( result == sdot ); - }; + } template< typename Type > void shortDiffTest() @@ -686,7 +686,7 @@ class tnlCudaReductionTester : public CppUnit :: TestCase freeMemoryCuda( deviceZeros ); freeMemoryCuda( deviceOnes ); freeMemoryCuda( deviceLinear ); - }; + } template< typename Type > @@ -804,7 +804,7 @@ class tnlCudaReductionTester : public CppUnit :: TestCase freeMemoryCuda( deviceZeros ); freeMemoryCuda( deviceOnes ); freeMemoryCuda( deviceLinear ); - }; + } }; diff --git a/tests/unit-tests/core/vectors/tnlVectorOperationsTester.h b/tests/unit-tests/core/vectors/tnlVectorOperationsTester.h index 667257884e..4593fba5b7 100644 --- a/tests/unit-tests/core/vectors/tnlVectorOperationsTester.h +++ b/tests/unit-tests/core/vectors/tnlVectorOperationsTester.h @@ -100,7 +100,12 @@ class tnlVectorOperationsTester : public CppUnit :: TestCase for( int i = 0; i < a. getSize(); i ++ ) a. getData()[ i ] = i; - copyMemoryHostToCuda( deviceVector. getData(), + tnlArrayOperations< tnlHost >:: + copyMemory< typename Vector::RealType, + tnlCuda, + typename Vector::RealType, + typename Vector::IndexType > + ( deviceVector. getData(), a. 
getData(), a. getSize() ); CPPUNIT_ASSERT( checkCudaDevice ); @@ -115,7 +120,12 @@ class tnlVectorOperationsTester : public CppUnit :: TestCase for( int i = 0; i < a. getSize(); i ++ ) a. getData()[ i ] = 1; - copyMemoryHostToCuda( deviceVector. getData(), + tnlArrayOperations< tnlHost >:: + copyMemory< typename Vector::RealType, + tnlCuda, + typename Vector::RealType, + typename Vector::IndexType > + ( deviceVector. getData(), a. getData(), a. getSize() ); CPPUNIT_ASSERT( checkCudaDevice ); @@ -130,7 +140,12 @@ class tnlVectorOperationsTester : public CppUnit :: TestCase for( int i = 0; i < a. getSize(); i ++ ) a. getData()[ i ] = -i; - copyMemoryHostToCuda( deviceVector. getData(), + tnlArrayOperations< tnlHost >:: + copyMemory< typename Vector::RealType, + tnlCuda, + typename Vector::RealType, + typename Vector::IndexType > + ( deviceVector. getData(), a. getData(), a. getSize() ); CPPUNIT_ASSERT( checkCudaDevice ); -- GitLab