diff --git a/CMakeLists.txt b/CMakeLists.txt
index ca353940e1d40c175d76e04f138b9b1c0b376e34..f32e8fbb152a61b3f75e60ce8b35a82aa5f515fd 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -92,15 +92,19 @@ endif()
 if( ${CXX_COMPILER_NAME} STREQUAL "mpic++" )
    message( "MPI compiler detected."    )
    set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DHAVE_MPI" )
+   set( CUDA_HOST_COMPILER "mpic++" )
+
 endif()
 
 ####
-# Check for MPI
+# Check for MPI -- not working
 #
 #find_package( MPI )
 #if( MPI_CXX_FOUND )
-#    set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DHAVE_MPI" )
-#    message( "MPI headers found -- ${MPI_CXX_INCLUDE_PATH}")
+   # set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DHAVE_MPI" )
+   # message( "MPI headers found -- ${MPI_CXX_INCLUDE_PATH}")
+   # message( "MPI link flags  -- ${MPI_CXX_LINK_FLAGS}")
+   # message( "MPI libraries-- ${MPI_CXX_LIBRARIES}")
 #endif()
 
 #####
diff --git a/src/TNL/Communicators/MpiCommunicator.h b/src/TNL/Communicators/MpiCommunicator.h
index 5437d881b634a11907d90ad37d9302f174ca3b56..1b4e9245add11f4e0ef6adf9a8a2d9da437542ce 100644
--- a/src/TNL/Communicators/MpiCommunicator.h
+++ b/src/TNL/Communicators/MpiCommunicator.h
@@ -26,17 +26,17 @@ namespace Communicators {
     {
 
         private:
-        inline static MPI_Datatype MPIDataType( const signed char ) { return MPI_CHAR; };
-        inline static MPI_Datatype MPIDataType( const signed short int ) { return MPI_SHORT; };
-        inline static MPI_Datatype MPIDataType( const signed int ) { return MPI_INT; };
-        inline static MPI_Datatype MPIDataType( const signed long int ) { return MPI_LONG; };
-        inline static MPI_Datatype MPIDataType( const unsigned char ) { return MPI_UNSIGNED_CHAR; };
-        inline static MPI_Datatype MPIDataType( const unsigned short int ) { return MPI_UNSIGNED_SHORT; };
-        inline static MPI_Datatype MPIDataType( const unsigned int ) { return MPI_UNSIGNED; };
-        inline static MPI_Datatype MPIDataType( const unsigned long int ) { return MPI_UNSIGNED_LONG; };
-        inline static MPI_Datatype MPIDataType( const float ) { return MPI_FLOAT; };
-        inline static MPI_Datatype MPIDataType( const double ) { return MPI_DOUBLE; };
-        inline static MPI_Datatype MPIDataType( const long double ) { return MPI_LONG_DOUBLE; };
+        inline static MPI_Datatype MPIDataType( const signed char* ) { return MPI_CHAR; };
+        inline static MPI_Datatype MPIDataType( const signed short int* ) { return MPI_SHORT; };
+        inline static MPI_Datatype MPIDataType( const signed int* ) { return MPI_INT; };
+        inline static MPI_Datatype MPIDataType( const signed long int* ) { return MPI_LONG; };
+        inline static MPI_Datatype MPIDataType( const unsigned char *) { return MPI_UNSIGNED_CHAR; };
+        inline static MPI_Datatype MPIDataType( const unsigned short int* ) { return MPI_UNSIGNED_SHORT; };
+        inline static MPI_Datatype MPIDataType( const unsigned int* ) { return MPI_UNSIGNED; };
+        inline static MPI_Datatype MPIDataType( const unsigned long int* ) { return MPI_UNSIGNED_LONG; };
+        inline static MPI_Datatype MPIDataType( const float* ) { return MPI_FLOAT; };
+        inline static MPI_Datatype MPIDataType( const double* ) { return MPI_DOUBLE; };
+        inline static MPI_Datatype MPIDataType( const long double* ) { return MPI_LONG_DOUBLE; };
         
         public:
 
@@ -58,7 +58,7 @@ namespace Communicators {
         static std::streambuf *backup;
         static std::ofstream filestr;
 
-        static void Init(int argc, char **argv, bool redirect=false)
+        static void Init(int argc, char **argv,bool redirect=false)
         {
             MPI::Init(argc,argv);
             NullRequest=MPI::REQUEST_NULL;
@@ -79,7 +79,6 @@ namespace Communicators {
                     std::cout.rdbuf(psbuf);
                 }
             }
-
         };
 
         static void Finalize()
@@ -126,14 +125,14 @@ namespace Communicators {
         template <typename T>
         static Request ISend( const T *data, int count, int dest)
         {
-                return MPI::COMM_WORLD.Isend((void*) data, count, MPIDataType(*data) , dest, 0);
-        };    
+                return MPI::COMM_WORLD.Isend((void*) data, count, MPIDataType(data) , dest, 0);
+        }    
 
         template <typename T>
         static Request IRecv( const T *data, int count, int src)
         {
-                return MPI::COMM_WORLD.Irecv((void*) data, count, MPIDataType(*data) , src, 0);
-        };
+                return MPI::COMM_WORLD.Irecv((void*) data, count, MPIDataType(data) , src, 0);
+        }
 
         static void WaitAll(Request *reqs, int length)
         {
@@ -144,7 +143,7 @@ namespace Communicators {
         static void Bcast(  T& data, int count, int root)
         {
                 MPI::COMM_WORLD.Bcast((void*) &data, count,  MPIDataType(data), root);
-        };
+        }
 
       /*  template< typename T >
         static void Allreduce( T& data,
diff --git a/src/TNL/Containers/Array_impl.h b/src/TNL/Containers/Array_impl.h
index 568df2f5136e0b4a0a1258f7254518152f7d71c7..11c546c664fbe2468485f11755d2709ed0d6c7f2 100644
--- a/src/TNL/Containers/Array_impl.h
+++ b/src/TNL/Containers/Array_impl.h
@@ -547,6 +547,7 @@ boundLoad( File& file )
    return true;
 }
 
+
 template< typename Element,
           typename Device,
           typename Index >
diff --git a/src/TNL/Meshes/DistributedMeshes/BufferEntitiesHelper.h b/src/TNL/Meshes/DistributedMeshes/BufferEntitiesHelper.h
new file mode 100644
index 0000000000000000000000000000000000000000..7655bc67937d99645370e3380782f1957bf3a4a8
--- /dev/null
+++ b/src/TNL/Meshes/DistributedMeshes/BufferEntitiesHelper.h
@@ -0,0 +1,125 @@
+/***************************************************************************
+                          BufferEntittiesHelper.h  -  description
+                             -------------------
+    begin                : March 1, 2018
+    copyright            : (C) 2018 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <TNL/Devices/Host.h>
+#include <TNL/Devices/Cuda.h>
+#include <TNL/ParallelFor.h>
+
+namespace TNL {
+namespace Meshes { 
+namespace DistributedMeshes { 
+
+
+template < typename MeshFunctionType,
+           int dim,
+           typename RealType=typename MeshFunctionType::MeshType::RealType,
+           typename Device=typename MeshFunctionType::MeshType::DeviceType,
+           typename Index=typename MeshFunctionType::MeshType::GlobalIndexType >
+class BufferEntitiesHelper
+{
+};
+
+//======================================== 1D ====================================================
+
+//host
+template < typename MeshFunctionType, typename RealType, typename Device, typename Index >
+class BufferEntitiesHelper<MeshFunctionType,1,RealType,Device,Index>
+{
+    public:
+    static void BufferEntities(MeshFunctionType meshFunction, RealType * buffer, Index beginx, Index sizex, bool tobuffer)
+    {
+        auto mesh = meshFunction.getMesh();
+        RealType* meshFunctionData = meshFunction.getData().getData();
+        auto kernel = [tobuffer, mesh, buffer, meshFunctionData, beginx] __cuda_callable__ ( Index j )
+        {
+            typename MeshFunctionType::MeshType::Cell entity(mesh);
+            entity.getCoordinates().x()=beginx+j;
+            entity.refresh();
+            if(tobuffer)
+                buffer[j]=meshFunctionData[entity.getIndex()];
+            else
+                meshFunctionData[entity.getIndex()]=buffer[j];
+        };
+        ParallelFor< Device >::exec( 0, sizex, kernel );
+    };  
+};
+
+
+//======================================== 2D ====================================================
+template <typename MeshFunctionType, typename RealType, typename Device, typename Index  > 
+class BufferEntitiesHelper<MeshFunctionType,2,RealType,Device,Index>
+{
+    public:
+    static void BufferEntities(MeshFunctionType meshFunction, RealType * buffer, Index beginx, Index beginy, Index sizex, Index sizey,bool tobuffer)
+    {
+        auto mesh=meshFunction.getMesh();
+        RealType *meshFunctionData=meshFunction.getData().getData();
+        auto kernel = [tobuffer, mesh, buffer, meshFunctionData, beginx, sizex, beginy] __cuda_callable__ ( Index i, Index j )
+        {
+            typename MeshFunctionType::MeshType::Cell entity(mesh);
+            entity.getCoordinates().x()=beginx+j;
+            entity.getCoordinates().y()=beginy+i;				
+            entity.refresh();
+            if(tobuffer)
+                    buffer[i*sizex+j]=meshFunctionData[entity.getIndex()];
+            else
+                    meshFunctionData[entity.getIndex()]=buffer[i*sizex+j];
+        };
+        
+        ParallelFor2D< Device >::exec( 0, 0, sizey, sizex, kernel );       
+        
+    };
+};
+
+
+//======================================== 3D ====================================================
+template <typename MeshFunctionType, typename RealType, typename Device, typename Index >
+class BufferEntitiesHelper<MeshFunctionType,3,RealType,Device,Index>
+{
+    public:
+    static void BufferEntities(MeshFunctionType meshFunction, RealType * buffer, Index beginx, Index beginy, Index beginz, Index sizex, Index sizey, Index sizez, bool tobuffer)
+    {
+
+        auto mesh=meshFunction.getMesh();
+        RealType * meshFunctionData=meshFunction.getData().getData();
+        auto kernel = [tobuffer, mesh, buffer, meshFunctionData, beginx, sizex, beginy, sizey, beginz] __cuda_callable__ ( Index k, Index i, Index j )
+        {
+            typename MeshFunctionType::MeshType::Cell entity(mesh);
+            entity.getCoordinates().x()=beginx+j;
+            entity.getCoordinates().z()=beginz+k;
+            entity.getCoordinates().y()=beginy+i;
+            entity.refresh();
+            if(tobuffer)
+                    buffer[k*sizex*sizey+i*sizex+j]=meshFunctionData[entity.getIndex()];
+            else
+                    meshFunctionData[entity.getIndex()]=buffer[k*sizex*sizey+i*sizex+j];
+        };
+
+        ParallelFor3D< Device >::exec( 0, 0, 0, sizez, sizey, sizex, kernel ); 
+
+        /*for(int k=0;k<sizez;k++)
+        {
+            for(int i=0;i<sizey;i++)
+            {
+                for(int j=0;j<sizex;j++)
+                {
+                        kernel(k,i,j);
+                }
+            }
+        }*/
+    };
+};
+
+
+} // namespace DistributedMeshes
+} // namespace Meshes
+} // namespace TNL
diff --git a/src/TNL/Meshes/DistributedMeshes/CopyEntitiesHelper.h b/src/TNL/Meshes/DistributedMeshes/CopyEntitiesHelper.h
new file mode 100644
index 0000000000000000000000000000000000000000..fe2f82cffe1a54fb03f51e3aa6b85e8952397718
--- /dev/null
+++ b/src/TNL/Meshes/DistributedMeshes/CopyEntitiesHelper.h
@@ -0,0 +1,136 @@
+/***************************************************************************
+                          CopyEntitiesHelper.h  -  description
+                             -------------------
+    begin                : March 8, 2018
+    copyright            : (C) 2018 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <TNL/Devices/Host.h>
+#include <TNL/Devices/Cuda.h>
+#include <TNL/ParallelFor.h>
+
+namespace TNL {
+namespace Meshes { 
+namespace DistributedMeshes { 
+
+template<typename MeshFunctionType,
+         int dim=MeshFunctionType::getMeshDimension()>
+class CopyEntitiesHelper
+{
+    public:
+    typedef typename MeshFunctionType::MeshType::CoordinatesType CoordinatesType;
+    static void Copy(MeshFunctionType &from, MeshFunctionType &to, CoordinatesType &fromBegin, CoordinatesType &toBegin, CoordinatesType &size)
+    {
+    }
+
+};
+
+
+template<typename MeshFunctionType>
+class CopyEntitiesHelper<MeshFunctionType, 1>
+{
+    public:
+    typedef typename MeshFunctionType::MeshType::CoordinatesType CoordinatesType;
+    typedef typename MeshFunctionType::MeshType::Cell Cell;
+    typedef typename MeshFunctionType::MeshType::GlobalIndexType Index;
+
+    static void Copy(MeshFunctionType &from, MeshFunctionType &to, CoordinatesType &fromBegin, CoordinatesType &toBegin, CoordinatesType &size)
+    {        
+        auto toData=to.getData().getData();
+        auto fromData=from.getData().getData();
+        auto fromMesh=from.getMesh();
+        auto toMesh=to.getMesh();
+        auto kernel = [fromData,toData, fromMesh, toMesh, fromBegin, toBegin] __cuda_callable__ ( Index i )
+        {
+            Cell fromEntity(fromMesh);
+            Cell toEntity(toMesh);
+            toEntity.getCoordinates().x()=toBegin.x()+i;            
+            toEntity.refresh();
+            fromEntity.getCoordinates().x()=fromBegin.x()+i;            
+            fromEntity.refresh();
+            toData[toEntity.getIndex()]=fromData[fromEntity.getIndex()];
+        };
+        ParallelFor< typename MeshFunctionType::MeshType::DeviceType >::exec( (Index)0, (Index)size.x(), kernel );
+
+    }
+
+};
+
+
+template<typename MeshFunctionType>
+
+class CopyEntitiesHelper<MeshFunctionType,2>
+{
+    public:
+    typedef typename MeshFunctionType::MeshType::CoordinatesType CoordinatesType;
+    typedef typename MeshFunctionType::MeshType::Cell Cell;
+    typedef typename MeshFunctionType::MeshType::GlobalIndexType Index;
+
+    static void Copy(MeshFunctionType &from, MeshFunctionType &to, CoordinatesType &fromBegin, CoordinatesType &toBegin, CoordinatesType &size)
+    {
+        auto toData=to.getData().getData();
+        auto fromData=from.getData().getData();
+        auto fromMesh=from.getMesh();
+        auto toMesh=to.getMesh();
+        auto kernel = [fromData,toData, fromMesh, toMesh, fromBegin, toBegin] __cuda_callable__ ( Index j, Index i )
+        {
+            Cell fromEntity(fromMesh);
+            Cell toEntity(toMesh);
+            toEntity.getCoordinates().x()=toBegin.x()+i;
+            toEntity.getCoordinates().y()=toBegin.y()+j;            
+            toEntity.refresh();
+            fromEntity.getCoordinates().x()=fromBegin.x()+i;
+            fromEntity.getCoordinates().y()=fromBegin.y()+j;            
+            fromEntity.refresh();
+            toData[toEntity.getIndex()]=fromData[fromEntity.getIndex()];
+        };
+        ParallelFor2D< typename MeshFunctionType::MeshType::DeviceType >::exec( (Index)0,(Index)0,(Index)size.y(), (Index)size.x(), kernel );
+
+    }
+
+};
+
+
+template<typename MeshFunctionType>
+class CopyEntitiesHelper<MeshFunctionType,3>
+{
+    public:
+    typedef typename MeshFunctionType::MeshType::CoordinatesType CoordinatesType;
+    typedef typename MeshFunctionType::MeshType::Cell Cell;
+    typedef typename MeshFunctionType::MeshType::GlobalIndexType Index;
+
+    static void Copy(MeshFunctionType &from, MeshFunctionType &to, CoordinatesType &fromBegin, CoordinatesType &toBegin, CoordinatesType &size)
+    {
+        auto toData=to.getData().getData();
+        auto fromData=from.getData().getData();
+        auto fromMesh=from.getMesh();
+        auto toMesh=to.getMesh();
+        auto kernel = [fromData,toData, fromMesh, toMesh, fromBegin, toBegin] __cuda_callable__ ( Index k, Index j, Index i )
+        {
+            Cell fromEntity(fromMesh);
+            Cell toEntity(toMesh);
+            toEntity.getCoordinates().x()=toBegin.x()+i;
+            toEntity.getCoordinates().y()=toBegin.y()+j;
+            toEntity.getCoordinates().z()=toBegin.z()+k;                                
+            toEntity.refresh();
+            fromEntity.getCoordinates().x()=fromBegin.x()+i;
+            fromEntity.getCoordinates().y()=fromBegin.y()+j;
+            fromEntity.getCoordinates().z()=fromBegin.z()+k;            
+            fromEntity.refresh();
+            toData[toEntity.getIndex()]=fromData[fromEntity.getIndex()];
+        };
+        ParallelFor3D< typename MeshFunctionType::MeshType::DeviceType >::exec( (Index)0,(Index)0,(Index)0,(Index)size.z() ,(Index)size.y(), (Index)size.x(), kernel );
+    }
+};
+
+
+
+
+} // namespace DistributedMeshes
+} // namespace Meshes
+} // namespace TNL
diff --git a/src/TNL/Meshes/DistributedMeshes/DistributedGridIO.h b/src/TNL/Meshes/DistributedMeshes/DistributedGridIO.h
index c34e16ab8036a53f1d294465ff38da6884f13a51..0a54b312c2ed19761c442b60ef9c8460af810a63 100644
--- a/src/TNL/Meshes/DistributedMeshes/DistributedGridIO.h
+++ b/src/TNL/Meshes/DistributedMeshes/DistributedGridIO.h
@@ -12,27 +12,17 @@
 
 #include <TNL/File.h>
 #include <TNL/Meshes/DistributedMeshes/DistributedMesh.h>
+#include <TNL/Meshes/DistributedMeshes/CopyEntitiesHelper.h>
 #include <TNL/Functions/MeshFunction.h>
 
+
 #include <iostream>
 
 namespace TNL {
 namespace Meshes {   
 namespace DistributedMeshes {
 
-template<typename MeshFunctionType,
-         int dim=MeshFunctionType::getMeshDimension()>
-class CopyEntities
-{
-    public:
-    typedef typename MeshFunctionType::MeshType::CoordinatesType CoordinatesType;
-    static void Copy(MeshFunctionType &from, MeshFunctionType &to, CoordinatesType &fromBegin, CoordinatesType &toBegin, CoordinatesType &size)
-    {
-    }
-
-};
-
-enum DistrGridIOTypes { Dummy = 0 , LocalCopy = 1 };
+enum DistrGridIOTypes { Dummy = 0 , LocalCopy = 1, MPIIO=2 };
     
 template<typename MeshFunctionType,
          DistrGridIOTypes type = LocalCopy> 
@@ -103,7 +93,7 @@ class DistributedGridIO<MeshFunctionType,LocalCopy>
         CoordinatesType zeroCoord;
         zeroCoord.setValue(0);
 
-        CopyEntities<MeshFunctionType> ::Copy(meshFunction,newMeshFunction,localBegin,zeroCoord,localSize);
+        CopyEntitiesHelper<MeshFunctionType>::Copy(meshFunction,newMeshFunction,localBegin,zeroCoord,localSize);
         return newMeshFunction.save(file);
         
     };
@@ -137,94 +127,63 @@ class DistributedGridIO<MeshFunctionType,LocalCopy>
         zeroCoord.setValue(0);        
 
         bool result=newMeshFunction.boundLoad(file);
-        CopyEntities<MeshFunctionType> ::Copy(newMeshFunction,meshFunction,zeroCoord,localBegin,localSize);
+        CopyEntitiesHelper<MeshFunctionType>::Copy(newMeshFunction,meshFunction,zeroCoord,localBegin,localSize);
         
         return result;
     };
     
 };
 
-
-//==================================Copy Entities=========================================================
-template<typename MeshFunctionType>
-class CopyEntities<MeshFunctionType,1>
+/*
+ * Save distributed data into single file without overlaps using MPIIO and MPI datatypes, 
+ * EXPLOSIVE: works with only Grids
+ */
+/*template<typename MeshFunctionType> 
+class DistributedGridIO<MeshFunctionType,MPIIO>
 {
+
     public:
-    typedef typename MeshFunctionType::MeshType::CoordinatesType CoordinatesType;
-    typedef typename MeshFunctionType::MeshType::Cell Cell;
 
-    static void Copy(MeshFunctionType &from, MeshFunctionType &to, CoordinatesType &fromBegin, CoordinatesType &toBegin, CoordinatesType &size)
-    {        
-        Cell fromEntity(from.getMesh());
-        Cell toEntity(to.getMesh());
-        for(int i=0;i<size.x();i++)
+    typedef typename MeshFunctionType::MeshType MeshType;
+    typedef typename MeshFunctionType::MeshType::CoordinatesType CoordinatesType;
+    typedef typename MeshFunctionType::MeshType::PointType PointType;
+    typedef typename MeshFunctionType::VectorType VectorType;
+    //typedef DistributedGrid< MeshType,MeshFunctionType::getMeshDimension()> DistributedGridType;
+    
+    static bool save(File &file,File &meshOutputFile, MeshFunctionType &meshFunction)
+    {
+        auto *distrGrid=meshFunction.getMesh().GetDistMesh();
+        
+        if(distrGrid==NULL) //not distributed
         {
-                        toEntity.getCoordinates().x()=toBegin.x()+i;            
-                        toEntity.refresh();
-                        fromEntity.getCoordinates().x()=fromBegin.x()+i;            
-                        fromEntity.refresh();
-            to.getData()[toEntity.getIndex()]=from.getData()[fromEntity.getIndex()];
+            return meshFunction.save(file);
         }
-    }
-
-};
 
-template<typename MeshFunctionType>
+        int dim=distrGrid.getMeshDimension();
 
-class CopyEntities<MeshFunctionType,2>
-{
-    public:
-    typedef typename MeshFunctionType::MeshType::CoordinatesType CoordinatesType;
-    typedef typename MeshFunctionType::MeshType::Cell Cell;
+        MeshType mesh=meshFunction.getMesh();
+        
+        fgsize[2],flsize[2],fstarts[2];
 
-    static void Copy(MeshFunctionType &from, MeshFunctionType &to, CoordinatesType &fromBegin, CoordinatesType &toBegin, CoordinatesType &size)
+        return newMeshFunction.save(file);
+        
+    };
+            
+    static bool load(File &file,MeshFunctionType &meshFunction) 
     {
-        Cell fromEntity(from.getMesh());
-        Cell toEntity(to.getMesh());
-        for(int j=0;j<size.y();j++)
-            for(int i=0;i<size.x();i++)
-            {
-                toEntity.getCoordinates().x()=toBegin.x()+i;
-                toEntity.getCoordinates().y()=toBegin.y()+j;            
-                toEntity.refresh();
-                fromEntity.getCoordinates().x()=fromBegin.x()+i;
-                fromEntity.getCoordinates().y()=fromBegin.y()+j;            
-                fromEntity.refresh();
-                to.getData()[toEntity.getIndex()]=from.getData()[fromEntity.getIndex()];
-            }
-    }
-
-};
-
-template<typename MeshFunctionType>
-class CopyEntities<MeshFunctionType,3>
-{
-    public:
-    typedef typename MeshFunctionType::MeshType::CoordinatesType CoordinatesType;
-    typedef typename MeshFunctionType::MeshType::Cell Cell;
+        auto *distrGrid=meshFunction.getMesh().GetDistMesh();
+        if(distrGrid==NULL) //not distributed
+        {
+            return meshFunction.boundLoad(file);
+        }
 
-    static void Copy(MeshFunctionType &from, MeshFunctionType &to, CoordinatesType &fromBegin, CoordinatesType &toBegin, CoordinatesType &size)
-    {
-        Cell fromEntity(from.getMesh());
-        Cell toEntity(to.getMesh());
-        for(int k=0;k<size.z();k++)
-            for(int j=0;j<size.y();j++)
-                for(int i=0;i<size.x();i++)
-                {
-                    toEntity.getCoordinates().x()=toBegin.x()+i;
-                                    toEntity.getCoordinates().y()=toBegin.y()+j;
-                                    toEntity.getCoordinates().z()=toBegin.z()+k;                                
-                    toEntity.refresh();
-                    fromEntity.getCoordinates().x()=fromBegin.x()+i;
-                    fromEntity.getCoordinates().y()=fromBegin.y()+j;
-                    fromEntity.getCoordinates().z()=fromBegin.z()+k;            
-                    fromEntity.refresh();
-                    to.getData()[toEntity.getIndex()]=from.getData()[fromEntity.getIndex()];
-                }
-
-    }
+        MeshType mesh=meshFunction.getMesh();
+        
+        
+    };
+    
+};*/
 
-};
 
 }
 }
diff --git a/src/TNL/Meshes/DistributedMeshes/DistributedGridSynchronizer.h b/src/TNL/Meshes/DistributedMeshes/DistributedGridSynchronizer.h
index 7f0625155b7099d445cd0a8a202d46c8f7465ca2..fcf4c35ec772f46a61e5c4661aa424d5aa5c6082 100644
--- a/src/TNL/Meshes/DistributedMeshes/DistributedGridSynchronizer.h
+++ b/src/TNL/Meshes/DistributedMeshes/DistributedGridSynchronizer.h
@@ -11,7 +11,8 @@
 #pragma once
 
 #include <TNL/Meshes/Grid.h>
-//#include <TNL/Functions/MeshFunction.h>
+#include <TNL/Containers/Array.h>
+#include <TNL/Meshes/DistributedMeshes/BufferEntitiesHelper.h>
 
 
 namespace TNL {
@@ -46,12 +47,9 @@ public:
 
 
 private:  
-        Real * leftsendbuf;
-        Real * rightsendbuf;
-        Real * leftrcvbuf;
-        Real * rightrcvbuf;
-
-        int size;
+        Containers::Array<RealType, Device> sendbuffs[2];
+        Containers::Array<RealType, Device> rcvbuffs[2];
+        int overlapSize;
 
         DistributedGridType *distributedgrid;
 
@@ -72,28 +70,16 @@ private:
 
     void SetDistributedGrid(DistributedGridType *distrgrid)
     {
-        if(isSet)
-        {
-            DeleteBuffers();
-        }
         isSet=true;
 
         this->distributedgrid=distrgrid;
 
-        size = distributedgrid->getOverlap().x();
+        overlapSize = distributedgrid->getOverlap().x();
 
-        leftsendbuf=new Real[size];
-        rightsendbuf=new Real[size];
-        leftrcvbuf=new Real[size];
-        rightrcvbuf=new Real[size];      
-    };
-
-    ~DistributedMeshSynchronizer()
-    {
-        if(isSet)
-        {
-            DeleteBuffers();
-        };
+        sendbuffs[0].setSize(overlapSize);
+        sendbuffs[1].setSize(overlapSize);
+        rcvbuffs[0].setSize(overlapSize);
+        rcvbuffs[1].setSize(overlapSize);      
     };
 
     template<typename CommunicatorType>
@@ -104,38 +90,23 @@ private:
         if(!distributedgrid->IsDistributed())
                 return;
 
-        Cell leftentity(meshfunction.getMesh());
-        Cell rightentity(meshfunction.getMesh());
+        int leftN=distributedgrid->getLeft();
+        int rightN=distributedgrid->getRight();
 
-        int left=distributedgrid->getLeft();
-        int right=distributedgrid->getRight();
+        int totalSize = meshfunction.getMesh().getDimensions().x();
 
-        //fill send buffers
-        for(int i=0;i<size;i++)
-        {
-            if(left!=-1)
-            {
-                leftentity.getCoordinates().x() = size+i;
-                leftentity.refresh();
-                leftsendbuf[i]=meshfunction.getData()[leftentity.getIndex()];
-            }
-    
-            if(right!=-1)
-            {
-                rightentity.getCoordinates().x() = meshfunction.getMesh().getDimensions().x()-2*size+i;
-                rightentity.refresh();            
-                rightsendbuf[i]=meshfunction.getData()[rightentity.getIndex()];
-            }
-        }
+        CopyBuffers(meshfunction, sendbuffs, true,
+                overlapSize, totalSize-2*overlapSize, overlapSize,
+                leftN, rightN);
 
         //async send
         typename CommunicatorType::Request req[4];
 
         //send everithing, recieve everything 
-        if(left!=-1)
+        if(leftN!=-1)
         {
-            req[0]=CommunicatorType::ISend(leftsendbuf, size, left);
-            req[2]=CommunicatorType::IRecv(leftrcvbuf, size, left);
+            req[0]=CommunicatorType::ISend(sendbuffs[Left].getData(), overlapSize, leftN);
+            req[2]=CommunicatorType::IRecv(rcvbuffs[Left].getData(), overlapSize, leftN);
         }
         else
         {
@@ -143,10 +114,10 @@ private:
             req[2]=CommunicatorType::NullRequest;
         }        
 
-        if(right!=-1)
+        if(rightN!=-1)
         {
-            req[1]=CommunicatorType::ISend(rightsendbuf, size, right);
-            req[3]=CommunicatorType::IRecv(rightrcvbuf, size, right);
+            req[1]=CommunicatorType::ISend(sendbuffs[Right].getData(), overlapSize, rightN);
+            req[3]=CommunicatorType::IRecv(rcvbuffs[Right].getData(), overlapSize, rightN);
         }
         else
         {
@@ -157,40 +128,26 @@ private:
         //wait until send and recv is done
         CommunicatorType::WaitAll(req, 4);
 
-        //copy data form rcv buffers
-        if(left!=-1)
-        {
-            for(int i=0;i<size;i++)
-            {
-                leftentity.getCoordinates().x() = i;
-                leftentity.refresh();
-                meshfunction.getData()[leftentity.getIndex()]=leftrcvbuf[i];
-            }
-        }
-
-        if(right!=-1)
-        {
-            for(int i=0;i<size;i++)
-            {
-                rightentity.getCoordinates().x() = meshfunction.getMesh().getDimensions().x()-size+i;
-                rightentity.refresh();
-                meshfunction.getData()[rightentity.getIndex()]=rightrcvbuf[i];
-            }
-        }
-    };
+        CopyBuffers(meshfunction, rcvbuffs, false,
+                0, totalSize-overlapSize, overlapSize,
+                leftN, rightN);
+    }
 
     private:
-    void DeleteBuffers(void)
+    template <typename Real>
+    void CopyBuffers(MeshFunctionType meshfunction, TNL::Containers::Array<Real,Device> * buffers, bool toBuffer,
+            int left, int right,
+            int size,
+            int leftNeighbor, int rightNeighbor)
     {
-        delete [] leftrcvbuf;
-        delete [] rightrcvbuf;
-        delete [] leftsendbuf;
-        delete [] rightsendbuf; 
-    };
+        if(leftNeighbor!=-1)
+            BufferEntitiesHelper<MeshFunctionType,1,Real,Device>::BufferEntities(meshfunction,buffers[Left].getData(),left,size,toBuffer);
+        if(rightNeighbor!=-1)
+            BufferEntitiesHelper<MeshFunctionType,1,Real,Device>::BufferEntities(meshfunction,buffers[Right].getData(),right,size,toBuffer);  
+    }
 
 };
 
-
 //=========================2D=================================================
 template <typename RealType,
           int EntityDimension,
@@ -210,8 +167,8 @@ class DistributedMeshSynchronizer< Functions::MeshFunction< Grid< 2, GridReal, D
     private:
         DistributedGridType *distributedgrid;
 
-        Real * sendbuffs[8];
-        Real * rcvbuffs[8];
+        Containers::Array<RealType, Device, Index> sendbuffs[8];
+        Containers::Array<RealType, Device, Index> rcvbuffs[8];
         int sizes[8];
         
         int leftSrc;
@@ -244,10 +201,6 @@ class DistributedMeshSynchronizer< Functions::MeshFunction< Grid< 2, GridReal, D
 
     void SetDistributedGrid(DistributedGridType *distrgrid)
     {
-        if(isSet)
-        {
-            DeleteBuffers();
-        }
         isSet=true;
         
         this->distributedgrid=distrgrid;
@@ -273,8 +226,8 @@ class DistributedMeshSynchronizer< Functions::MeshFunction< Grid< 2, GridReal, D
 
         for(int i=0;i<8;i++)
         {
-            sendbuffs[i]=new Real[sizes[i]];
-            rcvbuffs[i]=new Real[sizes[i]];
+            sendbuffs[i].setSize(sizes[i]);
+            rcvbuffs[i].setSize(sizes[i]);
         }
 
         leftSrc=localbegin.x();
@@ -290,13 +243,7 @@ class DistributedMeshSynchronizer< Functions::MeshFunction< Grid< 2, GridReal, D
         upDst=0;
         downDst=localgridsize.y()-overlap.y();                       
     }
-
-    ~DistributedMeshSynchronizer()
-    {
-        if(isSet)
-            DeleteBuffers();
-    }
-        
+       
     template<typename CommunicatorType>
     void Synchronize( MeshFunctionType &meshfunction)
     {
@@ -321,8 +268,8 @@ class DistributedMeshSynchronizer< Functions::MeshFunction< Grid< 2, GridReal, D
         for(int i=0;i<8;i++)	
            if(neighbor[i]!=-1)
            {
-               req[i]=CommunicatorType::ISend(sendbuffs[i], sizes[i], neighbor[i]);
-               req[8+i]=CommunicatorType::IRecv(rcvbuffs[i], sizes[i], neighbor[i]);
+               req[i]=CommunicatorType::ISend(sendbuffs[i].getData(), sizes[i], neighbor[i]);
+               req[8+i]=CommunicatorType::IRecv(rcvbuffs[i].getData(), sizes[i], neighbor[i]);
            }
 		   else
       	   {
@@ -339,62 +286,33 @@ class DistributedMeshSynchronizer< Functions::MeshFunction< Grid< 2, GridReal, D
             xcenter, ycenter,
             overlap,localsize,
             neighbor);
-    };
+    }
     
     private:
     template <typename Real>
-    void CopyBuffers(MeshFunctionType meshfunction, Real ** buffers, bool toBuffer,
+    void CopyBuffers(MeshFunctionType meshfunction, Containers::Array<Real, Device, Index> * buffers, bool toBuffer,
             int left, int right, int up, int down,
             int xcenter, int ycenter,
             CoordinatesType shortDim, CoordinatesType longDim,
             int *neighbor)
     {
        	if(neighbor[Left]!=-1)        
-            BufferEntities(meshfunction,buffers[Left],left,ycenter,shortDim.x(),longDim.y(),toBuffer);
+            BufferEntitiesHelper<MeshFunctionType,2,Real,Device>::BufferEntities(meshfunction,buffers[Left].getData(),left,ycenter,shortDim.x(),longDim.y(),toBuffer);
         if(neighbor[Right]!=-1)
-            BufferEntities(meshfunction,buffers[Right],right,ycenter,shortDim.x(),longDim.y(),toBuffer);
+            BufferEntitiesHelper<MeshFunctionType,2,Real,Device>::BufferEntities(meshfunction,buffers[Right].getData(),right,ycenter,shortDim.x(),longDim.y(),toBuffer);
         if(neighbor[Up]!=-1)
-            BufferEntities(meshfunction,buffers[Up],xcenter,up,longDim.x(),shortDim.y(),toBuffer);
+            BufferEntitiesHelper<MeshFunctionType,2,Real,Device>::BufferEntities(meshfunction,buffers[Up].getData(),xcenter,up,longDim.x(),shortDim.y(),toBuffer);
         if(neighbor[Down]!=-1)
-            BufferEntities(meshfunction,buffers[Down],xcenter,down,longDim.x(),shortDim.y(),toBuffer);
+            BufferEntitiesHelper<MeshFunctionType,2,Real,Device>::BufferEntities(meshfunction,buffers[Down].getData(),xcenter,down,longDim.x(),shortDim.y(),toBuffer);
         if(neighbor[UpLeft]!=-1)
-            BufferEntities(meshfunction,buffers[UpLeft],left,up,shortDim.x(),shortDim.y(),toBuffer);
+            BufferEntitiesHelper<MeshFunctionType,2,Real,Device>::BufferEntities(meshfunction,buffers[UpLeft].getData(),left,up,shortDim.x(),shortDim.y(),toBuffer);
         if(neighbor[UpRight]!=-1)
-            BufferEntities(meshfunction,buffers[UpRight],right,up,shortDim.x(),shortDim.y(),toBuffer);
+            BufferEntitiesHelper<MeshFunctionType,2,Real,Device>::BufferEntities(meshfunction,buffers[UpRight].getData(),right,up,shortDim.x(),shortDim.y(),toBuffer);
         if(neighbor[DownLeft]!=-1)        
-            BufferEntities(meshfunction,buffers[DownLeft],left,down,shortDim.x(),shortDim.y(),toBuffer);
+            BufferEntitiesHelper<MeshFunctionType,2,Real,Device>::BufferEntities(meshfunction,buffers[DownLeft].getData(),left,down,shortDim.x(),shortDim.y(),toBuffer);
         if(neighbor[DownRight]!=-1)
-            BufferEntities(meshfunction,buffers[DownRight],right,down,shortDim.x(),shortDim.y(),toBuffer);
+            BufferEntitiesHelper<MeshFunctionType,2,Real,Device>::BufferEntities(meshfunction,buffers[DownRight].getData(),right,down,shortDim.x(),shortDim.y(),toBuffer);
     }
-    
-    template <typename Real>
-    void BufferEntities(MeshFunctionType meshfunction, Real * buffer, int beginx, int beginy, int sizex, int sizey,bool tobuffer)
-    {
-
-        typename MeshFunctionType::MeshType::Cell entity(meshfunction.getMesh());
-        for(int i=0;i<sizey;i++)
-        {
-            for(int j=0;j<sizex;j++)
-            {
-                    entity.getCoordinates().x()=beginx+j;
-                    entity.getCoordinates().y()=beginy+i;				
-                    entity.refresh();
-                    if(tobuffer)
-                            buffer[i*sizex+j]=meshfunction.getData()[entity.getIndex()];
-                    else
-                            meshfunction.getData()[entity.getIndex()]=buffer[i*sizex+j];
-            }
-        }
-    };
-    
-    void DeleteBuffers(void)
-    {
-        for(int i=0;i<8;i++)
-        {
-            delete [] sendbuffs[i];
-            delete [] rcvbuffs[i];
-        }
-    };
 };
 
 
@@ -415,8 +333,8 @@ class DistributedMeshSynchronizer< Functions::MeshFunction< Grid< 3, GridReal, D
         typedef typename DistributedGridType::CoordinatesType CoordinatesType;
       
     private:
-        Real ** sendbuffs=new Real*[26];
-        Real ** rcvbuffs=new Real*[26];
+        Containers::Array<RealType, Device, Index> sendbuffs[26];
+        Containers::Array<RealType, Device, Index> rcvbuffs[26];
         int sizes[26];
         DistributedGridType *distributedgrid;
         
@@ -456,10 +374,6 @@ class DistributedMeshSynchronizer< Functions::MeshFunction< Grid< 3, GridReal, D
 
     void SetDistributedGrid(DistributedGridType *distrgrid)
     {
-        if(isSet)
-        {
-            DeleteBuffers();
-        }
         isSet=true;
 
         this->distributedgrid=distrgrid;        
@@ -483,8 +397,8 @@ class DistributedMeshSynchronizer< Functions::MeshFunction< Grid< 3, GridReal, D
 
         for(int i=0;i<26;i++)
         {
-                sendbuffs[i]=new Real[sizes[i]];
-                rcvbuffs[i]=new Real[sizes[i]];
+                sendbuffs[i].setSize(sizes[i]);
+                rcvbuffs[i].setSize(sizes[i]);
         }
         
         westSrc=localbegin.x();
@@ -506,14 +420,6 @@ class DistributedMeshSynchronizer< Functions::MeshFunction< Grid< 3, GridReal, D
         topDst=localgridsize.z()-overlap.z();
         
     }
-    
-    ~DistributedMeshSynchronizer()
-    {
-        if(isSet)
-        {
-            DeleteBuffers();
-        }
-    }
         
     template<typename CommunicatorType>
     void Synchronize(MeshFunctionType &meshfunction)
@@ -540,8 +446,8 @@ class DistributedMeshSynchronizer< Functions::MeshFunction< Grid< 3, GridReal, D
         for(int i=0;i<26;i++)	
            if(neighbor[i]!=-1)
            {
-               req[i]=CommunicatorType::ISend(sendbuffs[i], sizes[i], neighbor[i]);
-               req[26+i]=CommunicatorType::IRecv(rcvbuffs[i], sizes[i], neighbor[i]);
+               req[i]=CommunicatorType::ISend(sendbuffs[i].getData(), sizes[i], neighbor[i]);
+               req[26+i]=CommunicatorType::IRecv(rcvbuffs[i].getData(), sizes[i], neighbor[i]);
            }
 		   else
       	   {
@@ -559,36 +465,11 @@ class DistributedMeshSynchronizer< Functions::MeshFunction< Grid< 3, GridReal, D
             overlap, localsize,
             neighbor); 
  
-    };
+    }
     
     private:    
     template <typename Real>
-    void BufferEntities(MeshFunctionType meshfunction, Real * buffer, int beginx, int beginy, int beginz, int sizex, int sizey, int sizez, bool tobuffer)
-    {
-
-        typename MeshFunctionType::MeshType::Cell entity(meshfunction.getMesh());
-        for(int k=0;k<sizez;k++)
-        {
-            for(int i=0;i<sizey;i++)
-            {
-                for(int j=0;j<sizex;j++)
-                {
-                        entity.getCoordinates().x()=beginx+j;
-                        entity.getCoordinates().y()=beginy+i;
-                        entity.getCoordinates().z()=beginz+k;
-                        entity.refresh();
-                        if(tobuffer)
-                                buffer[k*sizex*sizey+i*sizex+j]=meshfunction.getData()[entity.getIndex()];
-                        else
-                                meshfunction.getData()[entity.getIndex()]=buffer[k*sizex*sizey+i*sizex+j];
-                }
-            }
-        }
-        
-    };
-    
-    template <typename Real>
-    void CopyBuffers(MeshFunctionType meshfunction, Real ** buffers, bool toBuffer,
+    void CopyBuffers(MeshFunctionType meshfunction, Containers::Array<Real, Device, Index> * buffers, bool toBuffer,
             int west, int east, int nord, int south, int bottom, int top,
             int xcenter, int ycenter, int zcenter,
             CoordinatesType shortDim, CoordinatesType longDim,
@@ -596,73 +477,63 @@ class DistributedMeshSynchronizer< Functions::MeshFunction< Grid< 3, GridReal, D
     {
        //X-Y-Z
         if(neighbor[West]!=-1)
-            BufferEntities(meshfunction,buffers[West],west,ycenter,zcenter,shortDim.x(),longDim.y(),longDim.z(),toBuffer);
+            BufferEntitiesHelper<MeshFunctionType,3,Real,Device>::BufferEntities(meshfunction,buffers[West].getData(),west,ycenter,zcenter,shortDim.x(),longDim.y(),longDim.z(),toBuffer);
         if(neighbor[East]!=-1)
-            BufferEntities(meshfunction,buffers[East],east,ycenter,zcenter,shortDim.x(),longDim.y(),longDim.z(),toBuffer);
+            BufferEntitiesHelper<MeshFunctionType,3,Real,Device>::BufferEntities(meshfunction,buffers[East].getData(),east,ycenter,zcenter,shortDim.x(),longDim.y(),longDim.z(),toBuffer);
         if(neighbor[Nord]!=-1)
-            BufferEntities(meshfunction,buffers[Nord],xcenter,nord,zcenter,longDim.x(),shortDim.y(),longDim.z(),toBuffer);
+            BufferEntitiesHelper<MeshFunctionType,3,Real,Device>::BufferEntities(meshfunction,buffers[Nord].getData(),xcenter,nord,zcenter,longDim.x(),shortDim.y(),longDim.z(),toBuffer);
         if(neighbor[South]!=-1)
-            BufferEntities(meshfunction,buffers[South],xcenter,south,zcenter,longDim.x(),shortDim.y(),longDim.z(),toBuffer);
+            BufferEntitiesHelper<MeshFunctionType,3,Real,Device>::BufferEntities(meshfunction,buffers[South].getData(),xcenter,south,zcenter,longDim.x(),shortDim.y(),longDim.z(),toBuffer);
         if(neighbor[Bottom]!=-1)
-            BufferEntities(meshfunction,buffers[Bottom],xcenter,ycenter,bottom,longDim.x(),longDim.y(),shortDim.z(),toBuffer);
+            BufferEntitiesHelper<MeshFunctionType,3,Real,Device>::BufferEntities(meshfunction,buffers[Bottom].getData(),xcenter,ycenter,bottom,longDim.x(),longDim.y(),shortDim.z(),toBuffer);
         if(neighbor[Top]!=-1)
-            BufferEntities(meshfunction,buffers[Top],xcenter,ycenter,top,longDim.x(),longDim.y(),shortDim.z(),toBuffer);	
+            BufferEntitiesHelper<MeshFunctionType,3,Real,Device>::BufferEntities(meshfunction,buffers[Top].getData(),xcenter,ycenter,top,longDim.x(),longDim.y(),shortDim.z(),toBuffer);	
         //XY
         if(neighbor[NordWest]!=-1)
-            BufferEntities(meshfunction,buffers[NordWest],west,nord,zcenter,shortDim.x(),shortDim.y(),longDim.z(),toBuffer);
+            BufferEntitiesHelper<MeshFunctionType,3,Real,Device>::BufferEntities(meshfunction,buffers[NordWest].getData(),west,nord,zcenter,shortDim.x(),shortDim.y(),longDim.z(),toBuffer);
         if(neighbor[NordEast]!=-1)
-            BufferEntities(meshfunction,buffers[NordEast],east,nord,zcenter,shortDim.x(),shortDim.y(),longDim.z(),toBuffer);
+            BufferEntitiesHelper<MeshFunctionType,3,Real,Device>::BufferEntities(meshfunction,buffers[NordEast].getData(),east,nord,zcenter,shortDim.x(),shortDim.y(),longDim.z(),toBuffer);
         if(neighbor[SouthWest]!=-1)
-            BufferEntities(meshfunction,buffers[SouthWest],west,south,zcenter,shortDim.x(),shortDim.y(),longDim.z(),toBuffer);
+            BufferEntitiesHelper<MeshFunctionType,3,Real,Device>::BufferEntities(meshfunction,buffers[SouthWest].getData(),west,south,zcenter,shortDim.x(),shortDim.y(),longDim.z(),toBuffer);
         if(neighbor[SouthEast]!=-1)
-            BufferEntities(meshfunction,buffers[SouthEast],east,south,zcenter,shortDim.x(),shortDim.y(),longDim.z(),toBuffer);
+            BufferEntitiesHelper<MeshFunctionType,3,Real,Device>::BufferEntities(meshfunction,buffers[SouthEast].getData(),east,south,zcenter,shortDim.x(),shortDim.y(),longDim.z(),toBuffer);
         //XZ
         if(neighbor[BottomWest]!=-1)
-            BufferEntities(meshfunction,buffers[BottomWest],west,ycenter,bottom,shortDim.x(),longDim.y(),shortDim.z(),toBuffer);
+            BufferEntitiesHelper<MeshFunctionType,3,Real,Device>::BufferEntities(meshfunction,buffers[BottomWest].getData(),west,ycenter,bottom,shortDim.x(),longDim.y(),shortDim.z(),toBuffer);
         if(neighbor[BottomEast]!=-1)
-            BufferEntities(meshfunction,buffers[BottomEast],east,ycenter,bottom,shortDim.x(),longDim.y(),shortDim.z(),toBuffer);
+            BufferEntitiesHelper<MeshFunctionType,3,Real,Device>::BufferEntities(meshfunction,buffers[BottomEast].getData(),east,ycenter,bottom,shortDim.x(),longDim.y(),shortDim.z(),toBuffer);
         if(neighbor[TopWest]!=-1)
-            BufferEntities(meshfunction,buffers[TopWest],west,ycenter,top,shortDim.x(),longDim.y(),shortDim.z(),toBuffer);
+            BufferEntitiesHelper<MeshFunctionType,3,Real,Device>::BufferEntities(meshfunction,buffers[TopWest].getData(),west,ycenter,top,shortDim.x(),longDim.y(),shortDim.z(),toBuffer);
         if(neighbor[TopEast]!=-1)
-            BufferEntities(meshfunction,buffers[TopEast],east,ycenter,top,shortDim.x(),longDim.y(),shortDim.z(),toBuffer);   
+            BufferEntitiesHelper<MeshFunctionType,3,Real,Device>::BufferEntities(meshfunction,buffers[TopEast].getData(),east,ycenter,top,shortDim.x(),longDim.y(),shortDim.z(),toBuffer);   
         //YZ
         if(neighbor[BottomNord]!=-1)
-            BufferEntities(meshfunction,buffers[BottomNord],xcenter,nord,bottom,longDim.x(),shortDim.y(),shortDim.z(),toBuffer);
+            BufferEntitiesHelper<MeshFunctionType,3,Real,Device>::BufferEntities(meshfunction,buffers[BottomNord].getData(),xcenter,nord,bottom,longDim.x(),shortDim.y(),shortDim.z(),toBuffer);
         if(neighbor[BottomSouth]!=-1)
-            BufferEntities(meshfunction,buffers[BottomSouth],xcenter,south,bottom,longDim.x(),shortDim.y(),shortDim.z(),toBuffer);
+            BufferEntitiesHelper<MeshFunctionType,3,Real,Device>::BufferEntities(meshfunction,buffers[BottomSouth].getData(),xcenter,south,bottom,longDim.x(),shortDim.y(),shortDim.z(),toBuffer);
         if(neighbor[TopNord]!=-1)
-            BufferEntities(meshfunction,buffers[TopNord],xcenter,nord,top,longDim.x(),shortDim.y(),shortDim.z(),toBuffer);
+            BufferEntitiesHelper<MeshFunctionType,3,Real,Device>::BufferEntities(meshfunction,buffers[TopNord].getData(),xcenter,nord,top,longDim.x(),shortDim.y(),shortDim.z(),toBuffer);
         if(neighbor[TopSouth]!=-1)
-            BufferEntities(meshfunction,buffers[TopSouth],xcenter,south,top,longDim.x(),shortDim.y(),shortDim.z(),toBuffer);
+            BufferEntitiesHelper<MeshFunctionType,3,Real,Device>::BufferEntities(meshfunction,buffers[TopSouth].getData(),xcenter,south,top,longDim.x(),shortDim.y(),shortDim.z(),toBuffer);
         //XYZ
         if(neighbor[BottomNordWest]!=-1)
-            BufferEntities(meshfunction,buffers[BottomNordWest],west,nord,bottom,shortDim.x(),shortDim.y(),shortDim.z(),toBuffer);
+            BufferEntitiesHelper<MeshFunctionType,3,Real,Device>::BufferEntities(meshfunction,buffers[BottomNordWest].getData(),west,nord,bottom,shortDim.x(),shortDim.y(),shortDim.z(),toBuffer);
         if(neighbor[BottomNordEast]!=-1)
-            BufferEntities(meshfunction,buffers[BottomNordEast],east,nord,bottom,shortDim.x(),shortDim.y(),shortDim.z(),toBuffer);
+            BufferEntitiesHelper<MeshFunctionType,3,Real,Device>::BufferEntities(meshfunction,buffers[BottomNordEast].getData(),east,nord,bottom,shortDim.x(),shortDim.y(),shortDim.z(),toBuffer);
         if(neighbor[BottomSouthWest]!=-1)
-            BufferEntities(meshfunction,buffers[BottomSouthWest],west,south,bottom,shortDim.x(),shortDim.y(),shortDim.z(),toBuffer);
+            BufferEntitiesHelper<MeshFunctionType,3,Real,Device>::BufferEntities(meshfunction,buffers[BottomSouthWest].getData(),west,south,bottom,shortDim.x(),shortDim.y(),shortDim.z(),toBuffer);
         if(neighbor[BottomSouthEast]!=-1)
-            BufferEntities(meshfunction,buffers[BottomSouthEast],east,south,bottom,shortDim.x(),shortDim.y(),shortDim.z(),toBuffer);
+            BufferEntitiesHelper<MeshFunctionType,3,Real,Device>::BufferEntities(meshfunction,buffers[BottomSouthEast].getData(),east,south,bottom,shortDim.x(),shortDim.y(),shortDim.z(),toBuffer);
         if(neighbor[TopNordWest]!=-1)
-            BufferEntities(meshfunction,buffers[TopNordWest],west,nord,top,shortDim.x(),shortDim.y(),shortDim.z(),toBuffer);
+            BufferEntitiesHelper<MeshFunctionType,3,Real,Device>::BufferEntities(meshfunction,buffers[TopNordWest].getData(),west,nord,top,shortDim.x(),shortDim.y(),shortDim.z(),toBuffer);
         if(neighbor[TopNordEast]!=-1)
-            BufferEntities(meshfunction,buffers[TopNordEast],east,nord,top,shortDim.x(),shortDim.y(),shortDim.z(),toBuffer);
+            BufferEntitiesHelper<MeshFunctionType,3,Real,Device>::BufferEntities(meshfunction,buffers[TopNordEast].getData(),east,nord,top,shortDim.x(),shortDim.y(),shortDim.z(),toBuffer);
         if(neighbor[TopSouthWest]!=-1)
-            BufferEntities(meshfunction,buffers[TopSouthWest],west,south,top,shortDim.x(),shortDim.y(),shortDim.z(),toBuffer);
+            BufferEntitiesHelper<MeshFunctionType,3,Real,Device>::BufferEntities(meshfunction,buffers[TopSouthWest].getData(),west,south,top,shortDim.x(),shortDim.y(),shortDim.z(),toBuffer);
         if(neighbor[TopSouthEast]!=-1)
-            BufferEntities(meshfunction,buffers[TopSouthEast],east,south,top,shortDim.x(),shortDim.y(),shortDim.z(),toBuffer);   
+            BufferEntitiesHelper<MeshFunctionType,3,Real,Device>::BufferEntities(meshfunction,buffers[TopSouthEast].getData(),east,south,top,shortDim.x(),shortDim.y(),shortDim.z(),toBuffer);   
 
-    };
-
-    void DeleteBuffers(void)
-    {
-        //free buffers
-        for(int i=0;i<26;i++)
-        {
-            delete [] sendbuffs[i];
-            delete [] rcvbuffs[i];
-        }
-    };
+    }
 };
 
 
diff --git a/src/TNL/Meshes/DistributedMeshes/DistributedGrid_1D.h b/src/TNL/Meshes/DistributedMeshes/DistributedGrid_1D.h
index bad979a3df24b54fdc12b89cfc141e0d7b5736fe..630a6ecda11a5b680f678f8cd1a68c7e6cadc7c4 100644
--- a/src/TNL/Meshes/DistributedMeshes/DistributedGrid_1D.h
+++ b/src/TNL/Meshes/DistributedMeshes/DistributedGrid_1D.h
@@ -139,7 +139,7 @@ class DistributedMesh<Grid< 1, RealType, Device, Index >>
                    localgridsize.x()+=2*overlap.x();
                          
            }  
-       }; 
+       } 
        
        void SetupGrid( GridType& grid)
        {
diff --git a/src/TNL/Meshes/DistributedMeshes/DistributedGrid_2D.h b/src/TNL/Meshes/DistributedMeshes/DistributedGrid_2D.h
index 06dc654b3bfe64b09ff1f68d8f3943e39d4edf90..ac43b8fcd8eef9461995edbc0c2c810bdccfe94a 100644
--- a/src/TNL/Meshes/DistributedMeshes/DistributedGrid_2D.h
+++ b/src/TNL/Meshes/DistributedMeshes/DistributedGrid_2D.h
@@ -197,7 +197,7 @@ class DistributedMesh<Grid< 2, RealType, Device, Index >>
                      
            
                       
-       };
+       }
        
        void SetupGrid( GridType& grid)
        {
diff --git a/src/TNL/Meshes/DistributedMeshes/DistributedGrid_3D.h b/src/TNL/Meshes/DistributedMeshes/DistributedGrid_3D.h
index ca24114827c7bf6fdd73b8f4080fd420563981ff..36488d10e8ef337cba241688dd304b46be57b459 100644
--- a/src/TNL/Meshes/DistributedMeshes/DistributedGrid_3D.h
+++ b/src/TNL/Meshes/DistributedMeshes/DistributedGrid_3D.h
@@ -272,7 +272,7 @@ class DistributedMesh<Grid< 3, RealType, Device, Index >>
                    localgridsize.z()+=overlap.z();
                
            }                     
-       };
+       }
        
        void SetupGrid( GridType& grid)
        {
diff --git a/src/TNL/ParallelFor.h b/src/TNL/ParallelFor.h
index 7002ecafe0fd7cc97271bf2fd885f3bfb50ac331..ec66906a5e3c3de9930cc622e150dacccbe4d5f8 100644
--- a/src/TNL/ParallelFor.h
+++ b/src/TNL/ParallelFor.h
@@ -13,6 +13,7 @@
 #include <TNL/Devices/Host.h>
 #include <TNL/Devices/Cuda.h>
 #include <TNL/Devices/CudaDeviceInfo.h>
+#include <TNL/Math.h>
 
 /*
  * The implementation of ParallelFor is not meant to provide maximum performance
@@ -43,18 +44,102 @@ struct ParallelFor
    }
 };
 
+template< typename Device = Devices::Host >
+struct ParallelFor2D
+{
+   template< typename Index,
+             typename Function,
+             typename... FunctionArgs >
+   static void exec( Index startX, Index startY, Index endX, Index endY, Function f, FunctionArgs... args )
+   {
+#ifdef HAVE_OPENMP
+      #pragma omp parallel for if( TNL::Devices::Host::isOMPEnabled() )
+#endif
+      for( Index i = startX; i < endX; i++ )
+      for( Index j = startY; j < endY; j++ )
+         f( i, j, args... );
+   }
+};
+
+template< typename Device = Devices::Host >
+struct ParallelFor3D
+{
+   template< typename Index,
+             typename Function,
+             typename... FunctionArgs >
+   static void exec( Index startX, Index startY, Index startZ, Index endX, Index endY, Index endZ, Function f, FunctionArgs... args )
+   {
+#ifdef HAVE_OPENMP
+      #pragma omp parallel for collapse(2) if( TNL::Devices::Host::isOMPEnabled() )
+#endif
+      for( Index i = startX; i < endX; i++ )
+      for( Index j = startY; j < endY; j++ )
+      for( Index k = startZ; k < endZ; k++ )
+         f( i, j, k, args... );
+   }
+};
+
 #ifdef HAVE_CUDA
-template< typename Index,
+template< bool gridStrideX = true,
+          typename Index,
           typename Function,
           typename... FunctionArgs >
 __global__ void
 ParallelForKernel( Index start, Index end, Function f, FunctionArgs... args )
 {
-   for( Index i = start + blockIdx.x * blockDim.x + threadIdx.x;
-        i < end;
-        i += blockDim.x * gridDim.x )
-   {
+   Index i = start + blockIdx.x * blockDim.x + threadIdx.x;
+   while( i < end ) {
       f( i, args... );
+      if( gridStrideX ) i += blockDim.x * gridDim.x;
+      else break;
+   }
+}
+
+template< bool gridStrideX = true,
+          bool gridStrideY = true,
+          typename Index,
+          typename Function,
+          typename... FunctionArgs >
+__global__ void
+ParallelFor2DKernel( Index startX, Index startY, Index endX, Index endY, Function f, FunctionArgs... args )
+{
+   Index j = startY + blockIdx.y * blockDim.y + threadIdx.y;
+   Index i = startX + blockIdx.x * blockDim.x + threadIdx.x;
+   while( j < endY ) {
+      while( i < endX ) {
+         f( i, j, args... );
+         if( gridStrideX ) i += blockDim.x * gridDim.x;
+         else break;
+      }
+      if( gridStrideY ) j += blockDim.y * gridDim.y;
+      else break;
+   }
+}
+
+template< bool gridStrideX = true,
+          bool gridStrideY = true,
+          bool gridStrideZ = true,
+          typename Index,
+          typename Function,
+          typename... FunctionArgs >
+__global__ void
+ParallelFor3DKernel( Index startX, Index startY, Index startZ, Index endX, Index endY, Index endZ, Function f, FunctionArgs... args )
+{
+   Index k = startZ + blockIdx.z * blockDim.z + threadIdx.z;
+   Index j = startY + blockIdx.y * blockDim.y + threadIdx.y;
+   Index i = startX + blockIdx.x * blockDim.x + threadIdx.x;
+   while( k < endZ ) {
+      while( j < endY ) {
+         while( i < endX ) {
+            f( i, j, k, args... );
+            if( gridStrideX ) i += blockDim.x * gridDim.x;
+            else break;
+         }
+         if( gridStrideY ) j += blockDim.y * gridDim.y;
+         else break;
+      }
+      if( gridStrideZ ) k += blockDim.z * gridDim.z;
+      else break;
    }
 }
 #endif
@@ -71,14 +156,163 @@ struct ParallelFor< Devices::Cuda >
       if( end > start ) {
          dim3 blockSize( 256 );
          dim3 gridSize;
-         const int desGridSize = 32 * Devices::CudaDeviceInfo::getCudaMultiprocessors( Devices::CudaDeviceInfo::getActiveDevice() );
-         gridSize.x = min( desGridSize, Devices::Cuda::getNumberOfBlocks( end - start, blockSize.x ) );
+         gridSize.x = TNL::min( Devices::Cuda::getMaxGridSize(), Devices::Cuda::getNumberOfBlocks( end - start, blockSize.x ) );
 
          Devices::Cuda::synchronizeDevice();
-         ParallelForKernel<<< gridSize, blockSize >>>( start, end, f, args... );
+
+         if( Devices::Cuda::getNumberOfGrids( end - start ) == 1 )
+            ParallelForKernel< false ><<< gridSize, blockSize >>>( start, end, f, args... );
+         else {
+            // decrease the grid size and align to the number of multiprocessors
+            const int desGridSize = 32 * Devices::CudaDeviceInfo::getCudaMultiprocessors( Devices::CudaDeviceInfo::getActiveDevice() );
+            gridSize.x = TNL::min( desGridSize, Devices::Cuda::getNumberOfBlocks( end - start, blockSize.x ) );
+            ParallelForKernel< true ><<< gridSize, blockSize >>>( start, end, f, args... );
+         }
+
+         cudaDeviceSynchronize();
+         TNL_CHECK_CUDA_DEVICE;
+      }
+#else
+      throw Exceptions::CudaSupportMissing();
+#endif
+   }
+};
+
+template<>
+struct ParallelFor2D< Devices::Cuda >
+{
+   template< typename Index,
+             typename Function,
+             typename... FunctionArgs >
+   static void exec( Index startX, Index startY, Index endX, Index endY, Function f, FunctionArgs... args )
+   {
+#ifdef HAVE_CUDA
+      if( endX > startX && endY > startY ) {
+         const Index sizeX = endX - startX;
+         const Index sizeY = endY - startY;
+
+         dim3 blockSize;
+         if( sizeX >= sizeY * sizeY ) {
+            blockSize.x = TNL::min( 256, sizeX );
+            blockSize.y = 1;
+         }
+         else if( sizeY >= sizeX * sizeX ) {
+            blockSize.x = 1;
+            blockSize.y = TNL::min( 256, sizeY );
+         }
+         else {
+            blockSize.x = TNL::min( 32, sizeX );
+            blockSize.y = TNL::min( 8, sizeY );
+         }
+         dim3 gridSize;
+         gridSize.x = TNL::min( Devices::Cuda::getMaxGridSize(), Devices::Cuda::getNumberOfBlocks( sizeX, blockSize.x ) );
+         gridSize.y = TNL::min( Devices::Cuda::getMaxGridSize(), Devices::Cuda::getNumberOfBlocks( sizeY, blockSize.y ) );
+
+         dim3 gridCount;
+         gridCount.x = Devices::Cuda::getNumberOfGrids( sizeX );
+         gridCount.y = Devices::Cuda::getNumberOfGrids( sizeY );
+
+         Devices::Cuda::synchronizeDevice();
+
+         if( gridCount.x == 1 && gridCount.y == 1 )
+            ParallelFor2DKernel< false, false ><<< gridSize, blockSize >>>
+               ( startX, startY, endX, endY, f, args... );
+         else if( gridCount.x == 1 && gridCount.y > 1 )
+            ParallelFor2DKernel< false, true ><<< gridSize, blockSize >>>
+               ( startX, startY, endX, endY, f, args... );
+         else if( gridCount.x > 1 && gridCount.y == 1 )
+            ParallelFor2DKernel< true, false ><<< gridSize, blockSize >>>
+               ( startX, startY, endX, endY, f, args... );
+         else
+            ParallelFor2DKernel< true, true ><<< gridSize, blockSize >>>
+               ( startX, startY, endX, endY, f, args... );
+
+         cudaDeviceSynchronize();
+         TNL_CHECK_CUDA_DEVICE;
+      }
+#else
+      throw Exceptions::CudaSupportMissing();
+#endif
+   }
+};
+
+template<>
+struct ParallelFor3D< Devices::Cuda >
+{
+   template< typename Index,
+             typename Function,
+             typename... FunctionArgs >
+   static void exec( Index startX, Index startY, Index startZ, Index endX, Index endY, Index endZ, Function f, FunctionArgs... args )
+   {
+#ifdef HAVE_CUDA
+      if( endX > startX && endY > startY && endZ > startZ ) {
+         const Index sizeX = endX - startX;
+         const Index sizeY = endY - startY;
+         const Index sizeZ = endZ - startZ;
+
+         dim3 blockSize;
+         if( sizeX >= sizeY * sizeZ ) {
+            blockSize.x = TNL::min( 256, sizeX );
+            blockSize.y = 1;
+            blockSize.z = 1;
+         }
+         else if( sizeY >= sizeX * sizeZ ) {
+            blockSize.x = 1;
+            blockSize.y = TNL::min( 256, sizeY );
+            blockSize.z = 1;
+         }
+         else if( sizeZ >= sizeX * sizeY ) {
+            blockSize.x = 1;
+            blockSize.y = 1;
+            blockSize.z = TNL::min( 256, sizeZ );
+         }
+         else {
+            blockSize.x = TNL::min( 16, sizeX );
+            blockSize.y = TNL::min( 4, sizeY );
+            blockSize.z = TNL::min( 4, sizeZ );
+         }
+         dim3 gridSize;
+         gridSize.x = TNL::min( Devices::Cuda::getMaxGridSize(), Devices::Cuda::getNumberOfBlocks( sizeX, blockSize.x ) );
+         gridSize.y = TNL::min( Devices::Cuda::getMaxGridSize(), Devices::Cuda::getNumberOfBlocks( sizeY, blockSize.y ) );
+         gridSize.z = TNL::min( Devices::Cuda::getMaxGridSize(), Devices::Cuda::getNumberOfBlocks( sizeZ, blockSize.z ) );
+
+         dim3 gridCount;
+         gridCount.x = Devices::Cuda::getNumberOfGrids( sizeX );
+         gridCount.y = Devices::Cuda::getNumberOfGrids( sizeY );
+         gridCount.z = Devices::Cuda::getNumberOfGrids( sizeZ );
+
+         Devices::Cuda::synchronizeDevice();
+
+         if( gridCount.x == 1 && gridCount.y == 1 && gridCount.z == 1 )
+            ParallelFor3DKernel< false, false, false ><<< gridSize, blockSize >>>
+               ( startX, startY, startZ, endX, endY, endZ, f, args... );
+         else if( gridCount.x == 1 && gridCount.y == 1 && gridCount.z > 1 )
+            ParallelFor3DKernel< false, false, true ><<< gridSize, blockSize >>>
+               ( startX, startY, startZ, endX, endY, endZ, f, args... );
+         else if( gridCount.x == 1 && gridCount.y > 1 && gridCount.z == 1 )
+            ParallelFor3DKernel< false, true, false ><<< gridSize, blockSize >>>
+               ( startX, startY, startZ, endX, endY, endZ, f, args... );
+         else if( gridCount.x > 1 && gridCount.y == 1 && gridCount.z == 1 )
+            ParallelFor3DKernel< true, false, false ><<< gridSize, blockSize >>>
+               ( startX, startY, startZ, endX, endY, endZ, f, args... );
+         else if( gridCount.x == 1 && gridCount.y > 1 && gridCount.z > 1 )
+            ParallelFor3DKernel< false, true, true ><<< gridSize, blockSize >>>
+               ( startX, startY, startZ, endX, endY, endZ, f, args... );
+         else if( gridCount.x > 1 && gridCount.y > 1 && gridCount.z == 1 )
+            ParallelFor3DKernel< true, true, false ><<< gridSize, blockSize >>>
+               ( startX, startY, startZ, endX, endY, endZ, f, args... );
+         else if( gridCount.x > 1 && gridCount.y == 1 && gridCount.z > 1 )
+            ParallelFor3DKernel< true, false, true ><<< gridSize, blockSize >>>
+               ( startX, startY, startZ, endX, endY, endZ, f, args... );
+         else
+            ParallelFor3DKernel< true, true, true ><<< gridSize, blockSize >>>
+               ( startX, startY, startZ, endX, endY, endZ, f, args... );
+
          cudaDeviceSynchronize();
          TNL_CHECK_CUDA_DEVICE;
       }
+#else
+      throw Exceptions::CudaSupportMissing();
 #endif
    }
 };
diff --git a/src/TNL/param-types.h b/src/TNL/param-types.h
index 8a13e09565dd3ede7c2414c9504b0e12ab142a5d..b73383165d5d023e02c4cc943dc0489b57b6e892 100644
--- a/src/TNL/param-types.h
+++ b/src/TNL/param-types.h
@@ -22,8 +22,11 @@ template<> inline String getType< bool >() { return String( "bool" ); };
 template<> inline String getType< short int >() { return String( "short int" ); };
 template<> inline String getType< int >() { return String( "int" ); };
 template<> inline String getType< long int >() { return String( "long int" ); };
+template<> inline String getType< unsigned short >() { return String( "unsigned short" ); };
 template<> inline String getType< unsigned int >() { return String( "unsigned int" ); };
+template<> inline String getType< unsigned long >() { return String( "unsigned long" ); };
 template<> inline String getType< char >() { return String( "char" ); };
+template<> inline String getType< unsigned char >() { return String( "unsigned char" ); };
 template<> inline String getType< float >() { return String( "float" ); };
 template<> inline String getType< double >() { return String( "double" ); };
 template<> inline String getType< long double >() { return String( "long double" ); };
diff --git a/src/UnitTests/Mpi/CMakeLists.txt b/src/UnitTests/Mpi/CMakeLists.txt
index da05a508aefe7bb84e9cb7f58eb84318c8f25088..3fee83ec98b46833d44c069d821ea16d12bf72e1 100644
--- a/src/UnitTests/Mpi/CMakeLists.txt
+++ b/src/UnitTests/Mpi/CMakeLists.txt
@@ -43,5 +43,12 @@ ADD_TEST( CopyEntitesTest ${EXECUTABLE_OUTPUT_PATH}/CopyEntitesTest )
 SET (mpi_test_parameters_IO -np 4 "${EXECUTABLE_OUTPUT_PATH}/DistributedGridIOTest")
 ADD_TEST( NAME DistributedGridIOTest COMMAND "mpirun" ${mpi_test_parameters_IO})
 
+IF( BUILD_CUDA )
+    CUDA_ADD_EXECUTABLE( GPUDistributedGridIOTest GPUDistributedGridIOTest.cu OPTIONS ${CXX_TESTS_FLAGS})
+    TARGET_LINK_LIBRARIES( GPUDistributedGridIOTest 
+                                    ${GTEST_BOTH_LIBRARIES}
+                                    tnl )   
+ENDIF( BUILD_CUDA )  
+
 endif()
 
diff --git a/src/UnitTests/Mpi/CopyEntitiesTest.cpp b/src/UnitTests/Mpi/CopyEntitiesTest.cpp
index b86e8e52bc6f1627b989b4c7bfc6f94c0ed95978..ae0be78b99e8c46bb832dbad640c8bcc935a7a80 100644
--- a/src/UnitTests/Mpi/CopyEntitiesTest.cpp
+++ b/src/UnitTests/Mpi/CopyEntitiesTest.cpp
@@ -6,9 +6,10 @@
     email                : tomas.oberhuber@fjfi.cvut.cz
  ***************************************************************************/
 
-#include <TNL/Meshes/DistributedMeshes/DistributedGridIO.h>
+#include <TNL/Meshes/DistributedMeshes/CopyEntitiesHelper.h>
 #include <TNL/Functions/MeshFunction.h>
 
+
 #ifdef HAVE_GTEST 
 #include <gtest/gtest.h>
 
@@ -199,7 +200,7 @@ class TestCopyEntities
 			CoordinatesType size;
 			size.setValue(8);
 
-			CopyEntities< MeshFunctionType >::Copy(inputMeshFunction,outputMeshFunction, begin,zero, size);
+			CopyEntitiesHelper< MeshFunctionType >::Copy(inputMeshFunction,outputMeshFunction, begin,zero, size);
 
 			TestMovedMeshfunction<MeshFunctionType>::Test(outputMeshFunction);
 		};
@@ -210,7 +211,7 @@ TEST( CopyEntitiesTest, 1D )
 	TestCopyEntities<1>::Test();
 }
 
-TEST( CopyEntitiesTest, 2D )
+/*TEST( CopyEntitiesTest, 2D )
 {
 	TestCopyEntities<2>::Test();
 }
@@ -218,7 +219,7 @@ TEST( CopyEntitiesTest, 2D )
 TEST( CopyEntitiesTest, 3D )
 {
 	TestCopyEntities<3>::Test();
-}
+}*/
 
 
 #endif
diff --git a/src/UnitTests/Mpi/DistributedGridIOTest.cpp b/src/UnitTests/Mpi/DistributedGridIOTest.cpp
index ddbc36d40d24fc72c653bdb337e285e684d9bcf8..b0f51781517e000ee4983b6cefebb4e4e1479f00 100644
--- a/src/UnitTests/Mpi/DistributedGridIOTest.cpp
+++ b/src/UnitTests/Mpi/DistributedGridIOTest.cpp
@@ -1,407 +1,40 @@
-/***************************************************************************
-                          CopyEntitiesTest.cpp  -  description
-                             -------------------
-    begin                : Nov 1, 2017
-    copyright            : (C) 2017 by Tomas Oberhuber et al.
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
 
-#ifdef HAVE_GTEST  
+#ifdef HAVE_GTEST
+  
 #include <gtest/gtest.h>
 
-#ifdef HAVE_MPI    
-
-#include <TNL/Communicators/MpiCommunicator.h>
-#include <TNL/Meshes/DistributedMeshes/DistributedMesh.h>
-#include <TNL/Functions/MeshFunction.h>
-#include <TNL/Meshes/DistributedMeshes/DistributedGridIO.h>
-
-
-#include "Functions.h"
-
-using namespace TNL::Containers;
-using namespace TNL::Meshes;
-using namespace TNL::Functions;
-using namespace TNL::Devices;
-using namespace TNL::Communicators;
-using namespace TNL::Meshes::DistributedMeshes;
-
-
-//================Parameters===================================
-template <int dim>
-class ParameterProvider
-{
-    public:
-
-    typedef Grid<dim,double,Host,int> MeshType;
-    typedef typename MeshType::CoordinatesType CoordinatesType;
-    typedef typename MeshType::PointType PointType;
-
-    PointType getOrigin(int rank)
-    {
-    };
-
-    PointType getProportions(int rank)
-    {
-    };
-
-    int* getDistr(void)
-    {
-        return NULL;
-    };
-};
-
-template<>
-class ParameterProvider<1>
-{
-    public:
-
-    int distr[1];
-
-    typedef Grid<1,double,Host,int> MeshType;
-    typedef typename MeshType::CoordinatesType CoordinatesType;
-    typedef typename MeshType::PointType PointType;
-
-    PointType getOrigin(int rank)
-    {
-        if(rank==0)
-            return PointType(-0.5);
-        if(rank==1)
-            return PointType(4.5);
-        if(rank==2)
-            return PointType(9.5);
-        if(rank==3)
-            return PointType(14.5);
-
-        return PointType(0);
-    };
-
-    PointType getProportions(int rank)
-    {
-        if(rank==0)
-            return PointType(5);
-        if(rank==1)
-            return PointType(5);
-        if(rank==2)
-            return PointType(5);
-        if(rank==3)
-            return PointType(5);
-        return PointType(0);
-    };
-
-    int* getDistr()
-    {
-        distr[0]=4;
-        return distr;
-    };
-};
-
-template<>
-class ParameterProvider<2>
-{
-    public:
-
-    int distr[2];
-
-    typedef Grid<2,double,Host,int> MeshType;
-    typedef typename MeshType::CoordinatesType CoordinatesType;
-    typedef typename MeshType::PointType PointType;
-
-    PointType getOrigin(int rank)
-    {
-        if(rank==0)
-            return PointType(-0.5,-0.5);
-        if(rank==1)
-            return PointType(9.5,-0.5);
-        if(rank==2)
-            return PointType(-0.5,9.5);
-        if(rank==3)
-            return PointType(9.5,9.5);
-
-        return PointType(0,0);
-    };
-
-    PointType getProportions(int rank)
-    {
-        if(rank==0)
-            return PointType(10,10);
-        if(rank==1)
-            return PointType(10,10);
-        if(rank==2)
-            return PointType(10,10);
-        if(rank==3)
-            return PointType(10,10);
-        return PointType(0,0);
-    };
-
-    int* getDistr()
-    {
-        distr[0]=2;
-        distr[1]=2;
-        return distr;
-    };
-};
-
-template<>
-class ParameterProvider<3>
-{
-    public:
-
-    int distr[3];
-
-    typedef Grid<3,double,Host,int> MeshType;
-    typedef typename MeshType::CoordinatesType CoordinatesType;
-    typedef typename MeshType::PointType PointType;
-
-    PointType getOrigin(int rank)
-    {
-        if(rank==0)
-            return PointType(-0.5,-0.5,-0.5);
-        if(rank==1)
-            return PointType(9.5,-0.5,-0.5);
-        if(rank==2)
-            return PointType(-0.5,9.5,-0.5);
-        if(rank==3)
-            return PointType(9.5,9.5,-0.5);
-
-        return PointType(0,0,0);
-    };
-
-    PointType getProportions(int rank)
-    {
-        if(rank==0)
-            return PointType(10,10,20);
-        if(rank==1)
-            return PointType(10,10,20);
-        if(rank==2)
-            return PointType(10,10,20);
-        if(rank==3)
-            return PointType(10,10,20);
-        return PointType(0,0,0);
-    };
-
-    int* getDistr()
-    {
-        distr[0]=2;
-        distr[1]=2;
-        distr[2]=1;
-        return distr;
-    };
-};
-
-//------------------------------------------------------------------------------
-
-typedef MpiCommunicator CommunicatorType;
-
-template <int dim>
-class TestDistributedGridIO{
-    public:
-
-    typedef Grid<dim,double,Host,int> MeshType;
-    typedef MeshFunction<MeshType> MeshFunctionType;
-    typedef Vector<double,Host,int> DofType;
-    typedef typename MeshType::Cell Cell;
-    typedef typename MeshType::IndexType IndexType; 
-    typedef typename MeshType::PointType PointType; 
-    typedef DistributedMesh<MeshType> DistributedGridType;
-
-    typedef typename DistributedGridType::CoordinatesType CoordinatesType;
-    typedef LinearFunction<double,dim> LinearFunctionType;
-
-    static void TestSave()
-    {
-        SharedPointer< LinearFunctionType, Host > linearFunctionPtr;
-        MeshFunctionEvaluator< MeshFunctionType, LinearFunctionType > linearFunctionEvaluator;    
-        
-        ParameterProvider<dim> parametry;
-        
-        //save distributed meshfunction into files
-        PointType globalOrigin;
-        globalOrigin.setValue(-0.5);
-
-        PointType globalProportions;
-        globalProportions.setValue(20);
-
-
-        MeshType globalGrid;
-        globalGrid.setDimensions(globalProportions);
-        globalGrid.setDomain(globalOrigin,globalProportions);
-        
-        int *distr=parametry.getDistr();
-
-        CoordinatesType overlap;
-        overlap.setValue(1);
-        DistributedGridType distrgrid;
-        distrgrid.template setGlobalGrid<CommunicatorType>(globalGrid,overlap);
-
-        SharedPointer<MeshType> gridptr;
-        SharedPointer<MeshFunctionType> meshFunctionptr;
-        distrgrid.SetupGrid(*gridptr);
-       
-        DofType dof(gridptr->template getEntitiesCount< Cell >());
-        dof.setValue(0);
-        meshFunctionptr->bind(gridptr,dof);
-            
-        linearFunctionEvaluator.evaluateAllEntities(meshFunctionptr , linearFunctionPtr);
-        
-        File file;
-
-        File meshFile;
-        file.open( String( "/tmp/test-file.tnl-" )+convertToString(CommunicatorType::GetRank()), IOMode::write );
-        meshFile.open( String( "/tmp/test-file-mesh.tnl-" )+convertToString(CommunicatorType::GetRank()), IOMode::write );
-        DistributedGridIO<MeshFunctionType> ::save(file,meshFile, *meshFunctionptr );
-        meshFile.close();
-
-        file.close();
-
-        //create similar local mesh function and evaluate linear function on it
-        PointType localOrigin=parametry.getOrigin(CommunicatorType::GetRank());        
-        PointType localProportions=parametry.getProportions(CommunicatorType::GetRank());;
-            
-        SharedPointer<MeshType>  localGridptr;
-        localGridptr->setDimensions(localProportions);
-        localGridptr->setDomain(localOrigin,localProportions);
-
-        DofType localDof(localGridptr->template getEntitiesCount< Cell >());
-
-        SharedPointer<MeshFunctionType> localMeshFunctionptr;
-        localMeshFunctionptr->bind(localGridptr,localDof);
-        linearFunctionEvaluator.evaluateAllEntities(localMeshFunctionptr , linearFunctionPtr);
-
-        //load other meshfunction on same localgrid from created file
-        SharedPointer<MeshType>  loadGridptr;
-        loadGridptr->setDimensions(localProportions);
-        loadGridptr->setDomain(localOrigin,localProportions);
-
-        DofType loadDof(localGridptr->template getEntitiesCount< Cell >());
-        SharedPointer<MeshFunctionType> loadMeshFunctionptr;
-        loadMeshFunctionptr->bind(loadGridptr,loadDof);
-
-        loadDof.setValue(-1);
-        
-
-        file.open( String( "/tmp/test-file.tnl-" )+convertToString(CommunicatorType::GetRank()), IOMode::read );
-        loadMeshFunctionptr->boundLoad(file);
-        file.close();
-
-        for(int i=0;i<localDof.getSize();i++)
-        {
-            EXPECT_EQ( localDof[i], loadDof[i]) << "Compare Loaded and evaluated Dof Failed for: "<< i;
-        }
-        
-    }
-    
-    static void TestLoad()
-    {
-        SharedPointer< LinearFunctionType, Host > linearFunctionPtr;
-        MeshFunctionEvaluator< MeshFunctionType, LinearFunctionType > linearFunctionEvaluator;    
-        
-        ParameterProvider<dim> parametry;
-
-        //save files from local mesh        
-        PointType localOrigin=parametry.getOrigin(CommunicatorType::GetRank());        
-        PointType localProportions=parametry.getProportions(CommunicatorType::GetRank());;
-            
-        SharedPointer<MeshType> localGridptr;
-        localGridptr->setDimensions(localProportions);
-        localGridptr->setDomain(localOrigin,localProportions);
-
-        DofType localDof(localGridptr->template getEntitiesCount< Cell >());
-
-        SharedPointer<MeshFunctionType> localMeshFunctionptr;
-        localMeshFunctionptr->bind(localGridptr,localDof);
-        linearFunctionEvaluator.evaluateAllEntities(localMeshFunctionptr , linearFunctionPtr);
-
-        File file;
-        file.open( String( "/tmp/test-file.tnl-" )+convertToString(CommunicatorType::GetRank()), IOMode::write );        
-        localMeshFunctionptr->save(file);
-        file.close();
-
-        //Crete distributed grid            
-        PointType globalOrigin;
-        globalOrigin.setValue(-0.5);
-
-        PointType globalProportions;
-        globalProportions.setValue(20);
-
-        int a;
-
-        MeshType globalGrid;
-        globalGrid.setDimensions(globalProportions);
-        globalGrid.setDomain(globalOrigin,globalProportions);
-        
-        int *distr=parametry.getDistr();
-
-        CoordinatesType overlap;
-        overlap.setValue(1);
-        DistributedGridType distrgrid;
-        distrgrid.template setGlobalGrid<CommunicatorType>(globalGrid,overlap, distr);
-
-        //Crete "distributedgrid driven" grid filed by load
-        SharedPointer<MeshType> loadGridptr;
-        SharedPointer<MeshFunctionType> loadMeshFunctionptr;
-        distrgrid.SetupGrid(*loadGridptr);
-        
-        DofType loadDof(loadGridptr->template getEntitiesCount< Cell >());
-        loadDof.setValue(0);
-        loadMeshFunctionptr->bind(loadGridptr,loadDof);
-
-            
-        file.open( String( "/tmp/test-file.tnl-" )+convertToString(CommunicatorType::GetRank()), IOMode::read );    
-        DistributedGridIO<MeshFunctionType> ::load(file, *loadMeshFunctionptr );
-        file.close();
-
-        loadMeshFunctionptr->template Synchronize<CommunicatorType>(); //need synchronization for overlaps to be filled corectly in loadDof
-
-
-        //Crete "distributedgrid driven" grid filed by evaluated linear function
-        SharedPointer<MeshType> gridptr;
-        SharedPointer<MeshFunctionType> meshFunctionptr;
-        distrgrid.SetupGrid(*gridptr);
-        
-        DofType dof(gridptr->template getEntitiesCount< Cell >());
-        dof.setValue(-1);
-        meshFunctionptr->bind(gridptr,dof);
-        
-        linearFunctionEvaluator.evaluateAllEntities(meshFunctionptr , linearFunctionPtr);        
-        meshFunctionptr->template Synchronize<CommunicatorType>();
-
-        for(int i=0;i<localDof.getSize();i++)
-        {
-            EXPECT_EQ( dof[i], loadDof[i]) << "Compare Loaded and evaluated Dof Failed for: "<< i;
-        }        
-    }
-};
+#ifdef HAVE_MPI
 
+#include "DistributedGridIOTest.h"
 
 TEST( DistributedGridIO, Save_1D )
 {
-    TestDistributedGridIO<1>::TestSave();
+    TestDistributedGridIO<1,Host>::TestSave();
 }
 
 TEST( DistributedGridIO, Save_2D )
 {
-    TestDistributedGridIO<2>::TestSave();
+    TestDistributedGridIO<2,Host>::TestSave();
 }
 
 TEST( DistributedGridIO, Save_3D )
 {
-    TestDistributedGridIO<3>::TestSave();
+    TestDistributedGridIO<3,Host>::TestSave();
 }
 
 TEST( DistributedGridIO, Load_1D )
 {
-    TestDistributedGridIO<1>::TestLoad();
+    TestDistributedGridIO<1,Host>::TestLoad();
 }
 
 TEST( DistributedGridIO, Load_2D )
 {
-    TestDistributedGridIO<2>::TestLoad();
+    TestDistributedGridIO<2,Host>::TestLoad();
 }
 
 TEST( DistributedGridIO, Load_3D )
 {
-    TestDistributedGridIO<3>::TestLoad();
+    TestDistributedGridIO<3,Host>::TestLoad();
 }
 
 #else
diff --git a/src/UnitTests/Mpi/DistributedGridIOTest.h b/src/UnitTests/Mpi/DistributedGridIOTest.h
new file mode 100644
index 0000000000000000000000000000000000000000..25f1d1988c60893e39832405c66cdd52d2deda52
--- /dev/null
+++ b/src/UnitTests/Mpi/DistributedGridIOTest.h
@@ -0,0 +1,369 @@
+/***************************************************************************
+                          CopyEntitiesTest.cpp  -  description
+                             -------------------
+    begin                : Nov 1, 2017
+    copyright            : (C) 2017 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+#include <TNL/Communicators/MpiCommunicator.h>
+#include <TNL/Meshes/DistributedMeshes/DistributedMesh.h>
+#include <TNL/Functions/MeshFunction.h>
+#include <TNL/Meshes/DistributedMeshes/DistributedGridIO.h>
+
+
+#include "Functions.h"
+
+using namespace TNL::Containers;
+using namespace TNL::Meshes;
+using namespace TNL::Functions;
+using namespace TNL::Devices;
+using namespace TNL::Communicators;
+using namespace TNL::Meshes::DistributedMeshes;
+
+
+//================Parameters===================================
+template <int dim, typename Device>
+class ParameterProvider
+{
+    public:
+
+    typedef Grid<dim,double,Device,int> MeshType;
+    typedef typename MeshType::CoordinatesType CoordinatesType;
+    typedef typename MeshType::PointType PointType;
+
+    PointType getOrigin(int rank)
+    {
+    };
+
+    PointType getProportions(int rank)
+    {
+    };
+
+    int* getDistr(void)
+    {
+        return NULL;
+    };
+};
+
+template<typename Device>
+class ParameterProvider<1,Device>
+{
+    public:
+
+    int distr[1];
+
+    typedef Grid<1,double,Device,int> MeshType;
+    typedef typename MeshType::CoordinatesType CoordinatesType;
+    typedef typename MeshType::PointType PointType;
+
+    PointType getOrigin(int rank)
+    {
+        if(rank==0)
+            return PointType(-0.5);
+        if(rank==1)
+            return PointType(4.5);
+        if(rank==2)
+            return PointType(9.5);
+        if(rank==3)
+            return PointType(14.5);
+
+        return PointType(0);
+    };
+
+    PointType getProportions(int rank)
+    {
+        if(rank==0)
+            return PointType(5);
+        if(rank==1)
+            return PointType(5);
+        if(rank==2)
+            return PointType(5);
+        if(rank==3)
+            return PointType(5);
+        return PointType(0);
+    };
+
+    int* getDistr()
+    {
+        distr[0]=4;
+        return distr;
+    };
+};
+
+template<typename Device>
+class ParameterProvider<2,Device>
+{
+    public:
+
+    int distr[2];
+
+    typedef Grid<2,double,Device,int> MeshType;
+    typedef typename MeshType::CoordinatesType CoordinatesType;
+    typedef typename MeshType::PointType PointType;
+
+    PointType getOrigin(int rank)
+    {
+        if(rank==0)
+            return PointType(-0.5,-0.5);
+        if(rank==1)
+            return PointType(9.5,-0.5);
+        if(rank==2)
+            return PointType(-0.5,9.5);
+        if(rank==3)
+            return PointType(9.5,9.5);
+
+        return PointType(0,0);
+    };
+
+    PointType getProportions(int rank)
+    {
+        if(rank==0)
+            return PointType(10,10);
+        if(rank==1)
+            return PointType(10,10);
+        if(rank==2)
+            return PointType(10,10);
+        if(rank==3)
+            return PointType(10,10);
+        return PointType(0,0);
+    };
+
+    int* getDistr()
+    {
+        distr[0]=2;
+        distr[1]=2;
+        return distr;
+    };
+};
+
+template<typename Device>
+class ParameterProvider<3,Device>
+{
+    public:
+
+    int distr[3];
+
+    typedef Grid<3,double,Device,int> MeshType;
+    typedef typename MeshType::CoordinatesType CoordinatesType;
+    typedef typename MeshType::PointType PointType;
+
+    PointType getOrigin(int rank)
+    {
+        if(rank==0)
+            return PointType(-0.5,-0.5,-0.5);
+        if(rank==1)
+            return PointType(9.5,-0.5,-0.5);
+        if(rank==2)
+            return PointType(-0.5,9.5,-0.5);
+        if(rank==3)
+            return PointType(9.5,9.5,-0.5);
+
+        return PointType(0,0,0);
+    };
+
+    PointType getProportions(int rank)
+    {
+        if(rank==0)
+            return PointType(10,10,20);
+        if(rank==1)
+            return PointType(10,10,20);
+        if(rank==2)
+            return PointType(10,10,20);
+        if(rank==3)
+            return PointType(10,10,20);
+        return PointType(0,0,0);
+    };
+
+    int* getDistr()
+    {
+        distr[0]=2;
+        distr[1]=2;
+        distr[2]=1;
+        return distr;
+    };
+};
+
+//------------------------------------------------------------------------------
+
+typedef MpiCommunicator CommunicatorType;
+
+template <int dim, typename Device>
+class TestDistributedGridIO{
+    public:
+
+    typedef Grid<dim,double,Device,int> MeshType;
+    typedef MeshFunction<MeshType> MeshFunctionType;
+    typedef Vector<double,Device,int> DofType;
+    typedef typename MeshType::Cell Cell;
+    typedef typename MeshType::IndexType IndexType; 
+    typedef typename MeshType::PointType PointType; 
+    typedef DistributedMesh<MeshType> DistributedGridType;
+
+    typedef typename DistributedGridType::CoordinatesType CoordinatesType;
+    typedef LinearFunction<double,dim> LinearFunctionType;
+
+    static void TestSave()
+    {
+        SharedPointer< LinearFunctionType, Device > linearFunctionPtr;
+        MeshFunctionEvaluator< MeshFunctionType, LinearFunctionType > linearFunctionEvaluator;    
+        
+        ParameterProvider<dim,Device> parametry;
+        
+        //save distributed meshfunction into files
+        PointType globalOrigin;
+        globalOrigin.setValue(-0.5);
+
+        PointType globalProportions;
+        globalProportions.setValue(20);
+
+
+        MeshType globalGrid;
+        globalGrid.setDimensions(globalProportions);
+        globalGrid.setDomain(globalOrigin,globalProportions);
+        
+        int *distr=parametry.getDistr();
+
+        CoordinatesType overlap;
+        overlap.setValue(1);
+        DistributedGridType distrgrid;
+        distrgrid.template setGlobalGrid<CommunicatorType>(globalGrid,overlap);
+
+        SharedPointer<MeshType> gridptr;
+        SharedPointer<MeshFunctionType> meshFunctionptr;
+        distrgrid.SetupGrid(*gridptr);
+       
+        DofType dof(gridptr->template getEntitiesCount< Cell >());
+        dof.setValue(0);
+        meshFunctionptr->bind(gridptr,dof);
+            
+        linearFunctionEvaluator.evaluateAllEntities(meshFunctionptr , linearFunctionPtr);
+        
+        File file;
+
+        File meshFile;
+        file.open( String( "/tmp/test-file.tnl-" )+convertToString(CommunicatorType::GetRank()), IOMode::write );
+        meshFile.open( String( "/tmp/test-file-mesh.tnl-" )+convertToString(CommunicatorType::GetRank()), IOMode::write );
+        DistributedGridIO<MeshFunctionType> ::save(file,meshFile, *meshFunctionptr );
+        meshFile.close();
+
+        file.close();
+
+       //create similar local mesh function and evaluate linear function on it
+        PointType localOrigin=parametry.getOrigin(CommunicatorType::GetRank());        
+        PointType localProportions=parametry.getProportions(CommunicatorType::GetRank());;
+            
+        SharedPointer<MeshType>  localGridptr;
+        localGridptr->setDimensions(localProportions);
+        localGridptr->setDomain(localOrigin,localProportions);
+
+        DofType localDof(localGridptr->template getEntitiesCount< Cell >());
+
+        SharedPointer<MeshFunctionType> localMeshFunctionptr;
+        localMeshFunctionptr->bind(localGridptr,localDof);
+        linearFunctionEvaluator.evaluateAllEntities(localMeshFunctionptr , linearFunctionPtr);
+
+        //load other meshfunction on same localgrid from created file
+        SharedPointer<MeshType>  loadGridptr;
+        loadGridptr->setDimensions(localProportions);
+        loadGridptr->setDomain(localOrigin,localProportions);
+
+        DofType loadDof(localGridptr->template getEntitiesCount< Cell >());
+        SharedPointer<MeshFunctionType> loadMeshFunctionptr;
+        loadMeshFunctionptr->bind(loadGridptr,loadDof);
+
+        loadDof.setValue(-1);
+        
+
+        file.open( String( "/tmp/test-file.tnl-" )+convertToString(CommunicatorType::GetRank()), IOMode::read );
+        loadMeshFunctionptr->boundLoad(file);
+        file.close();
+
+        for(int i=0;i<localDof.getSize();i++)
+        {
+            EXPECT_EQ( localDof.getElement(i), loadDof.getElement(i)) << "Compare Loaded and evaluated Dof Failed for: "<< i;
+        }
+    }
+    
+    static void TestLoad()
+    {
+        SharedPointer< LinearFunctionType, Device > linearFunctionPtr;
+        MeshFunctionEvaluator< MeshFunctionType, LinearFunctionType > linearFunctionEvaluator;    
+        
+        ParameterProvider<dim,Device> parametry;
+
+        //save files from local mesh        
+        PointType localOrigin=parametry.getOrigin(CommunicatorType::GetRank());        
+        PointType localProportions=parametry.getProportions(CommunicatorType::GetRank());;
+            
+        SharedPointer<MeshType> localGridptr;
+        localGridptr->setDimensions(localProportions);
+        localGridptr->setDomain(localOrigin,localProportions);
+
+        DofType localDof(localGridptr->template getEntitiesCount< Cell >());
+
+        SharedPointer<MeshFunctionType> localMeshFunctionptr;
+        localMeshFunctionptr->bind(localGridptr,localDof);
+        linearFunctionEvaluator.evaluateAllEntities(localMeshFunctionptr , linearFunctionPtr);
+
+        File file;
+        file.open( String( "/tmp/test-file.tnl-" )+convertToString(CommunicatorType::GetRank()), IOMode::write );        
+        localMeshFunctionptr->save(file);
+        file.close();
+
+        //Crete distributed grid            
+        PointType globalOrigin;
+        globalOrigin.setValue(-0.5);
+
+        PointType globalProportions;
+        globalProportions.setValue(20);
+
+        MeshType globalGrid;
+        globalGrid.setDimensions(globalProportions);
+        globalGrid.setDomain(globalOrigin,globalProportions);
+        
+        int *distr=parametry.getDistr();
+
+        CoordinatesType overlap;
+        overlap.setValue(1);
+        DistributedGridType distrgrid;
+        distrgrid.template setGlobalGrid<CommunicatorType>(globalGrid,overlap, distr);
+
+        //Crete "distributedgrid driven" grid filed by load
+        SharedPointer<MeshType> loadGridptr;
+        SharedPointer<MeshFunctionType> loadMeshFunctionptr;
+        distrgrid.SetupGrid(*loadGridptr);
+        
+        DofType loadDof(loadGridptr->template getEntitiesCount< Cell >());
+        loadDof.setValue(0);
+        loadMeshFunctionptr->bind(loadGridptr,loadDof);
+
+            
+        file.open( String( "/tmp/test-file.tnl-" )+convertToString(CommunicatorType::GetRank()), IOMode::read );  
+  
+        DistributedGridIO<MeshFunctionType> ::load(file, *loadMeshFunctionptr );
+
+        file.close();
+
+        loadMeshFunctionptr->template Synchronize<CommunicatorType>(); //need synchronization for overlaps to be filled corectly in loadDof
+
+
+        //Crete "distributedgrid driven" grid filed by evaluated linear function
+        SharedPointer<MeshType> gridptr;
+        SharedPointer<MeshFunctionType> meshFunctionptr;
+        distrgrid.SetupGrid(*gridptr);
+        
+        DofType dof(gridptr->template getEntitiesCount< Cell >());
+        dof.setValue(-1);
+        meshFunctionptr->bind(gridptr,dof);
+        
+        linearFunctionEvaluator.evaluateAllEntities(meshFunctionptr , linearFunctionPtr);        
+        meshFunctionptr->template Synchronize<CommunicatorType>();
+
+        for(int i=0;i<dof.getSize();i++)
+        {
+            EXPECT_EQ( dof.getElement(i), loadDof.getElement(i)) << "Compare Loaded and evaluated Dof Failed for: "<< i;
+        }       
+    }
+};
+
diff --git a/src/UnitTests/Mpi/DistributedGridTest_2D.cpp b/src/UnitTests/Mpi/DistributedGridTest_2D.cpp
index 7919610dc585ea6dc7a42ddf65e9f4eec3f5cfc1..c3657cede1f0e175d91b9a96d45731dd1bfe2be1 100644
--- a/src/UnitTests/Mpi/DistributedGridTest_2D.cpp
+++ b/src/UnitTests/Mpi/DistributedGridTest_2D.cpp
@@ -401,7 +401,10 @@ class DistributedGirdTest_2D : public ::testing::Test {
     typename DistributedGridType::CoordinatesType overlap;
     overlap.setValue(1);
     distrgrid=new DistributedGridType();
-    distrgrid->template setGlobalGrid<CommunicatorType>(globalGrid,overlap);
+    int distr[2];
+    distr[0]=3;
+    distr[1]=3;
+    distrgrid->template setGlobalGrid<CommunicatorType>(globalGrid,overlap, distr);
     
     distrgrid->SetupGrid(*gridptr);
     dof=new DofType(gridptr->template getEntitiesCount< Cell >());
diff --git a/src/UnitTests/Mpi/Functions.h b/src/UnitTests/Mpi/Functions.h
index 449c2f490af7cf7da0fcf20709fb91f243b1f4ec..59a0335c15af53b880f716318e571688f593d91c 100644
--- a/src/UnitTests/Mpi/Functions.h
+++ b/src/UnitTests/Mpi/Functions.h
@@ -42,7 +42,7 @@ class LinearFunction<Real,1> : public Functions::Domain< 1, Functions::MeshDomai
       {};
 
       template< typename EntityType >
-      RealType operator()( const EntityType& meshEntity,
+      __cuda_callable__ RealType operator()( const EntityType& meshEntity,
                                   const RealType& time = 0.0 ) const
       {
          return meshEntity.getCenter().x();
@@ -62,7 +62,7 @@ class ConstFunction<Real,1> : public Functions::Domain< 1, Functions::MeshDomain
       {};
 
       template< typename EntityType >
-      RealType operator()( const EntityType& meshEntity,
+      __cuda_callable__ RealType operator()( const EntityType& meshEntity,
                                   const RealType& time = 0.0 ) const
       {
          return Number;
@@ -95,7 +95,7 @@ class LinearFunction<Real,2> : public Functions::Domain< 2, Functions::MeshDomai
       {};
 
       template< typename EntityType >
-      RealType operator()( const EntityType& meshEntity,
+      __cuda_callable__ RealType operator()( const EntityType& meshEntity,
                                   const RealType& time = 0.0 ) const
       {
          //return meshEntity.getCoordinates().y()*10+meshEntity.getCoordinates().x();
@@ -114,7 +114,7 @@ class ConstFunction<Real,2> : public Functions::Domain< 2, Functions::MeshDomain
       {};
           
       template< typename EntityType >
-      RealType operator()( const EntityType& meshEntity,
+      __cuda_callable__ RealType operator()( const EntityType& meshEntity,
                                   const RealType& time = 0.0 ) const
       {
          //return meshEntity.getCoordinates().y()*10+meshEntity.getCoordinates().x();
@@ -153,7 +153,7 @@ class LinearFunction<Real,3> : public Functions::Domain< 3, Functions::MeshDomai
       {};
 
       template< typename EntityType >
-      RealType operator()( const EntityType& meshEntity,
+      __cuda_callable__ RealType operator()( const EntityType& meshEntity,
                                   const RealType& time = 0.0 ) const
       {
          //return meshEntity.getCoordinates().y()*10+meshEntity.getCoordinates().x();
@@ -172,7 +172,7 @@ class ConstFunction<Real,3> : public Functions::Domain< 3, Functions::MeshDomain
       {};
           
       template< typename EntityType >
-      RealType operator()( const EntityType& meshEntity,
+      __cuda_callable__ RealType operator()( const EntityType& meshEntity,
                                   const RealType& time = 0.0 ) const
       {
          //return meshEntity.getCoordinates().y()*10+meshEntity.getCoordinates().x();
diff --git a/src/UnitTests/Mpi/GPUDistributedGridIOTest.cu b/src/UnitTests/Mpi/GPUDistributedGridIOTest.cu
new file mode 100644
index 0000000000000000000000000000000000000000..dfc57f86cf715d6b9f4d5eb02336388bf58fe897
--- /dev/null
+++ b/src/UnitTests/Mpi/GPUDistributedGridIOTest.cu
@@ -0,0 +1,110 @@
+
+#ifdef HAVE_GTEST
+  
+#include <gtest/gtest.h>
+
+#ifdef HAVE_MPI
+
+#include "DistributedGridIOTest.h"
+
+TEST( DistributedGridIO, Save_1D )
+{
+    TestDistributedGridIO<1,Cuda>::TestSave();
+}
+
+TEST( DistributedGridIO, Save_2D )
+{
+    TestDistributedGridIO<2,Cuda>::TestSave();
+}
+
+TEST( DistributedGridIO, Save_3D )
+{
+    TestDistributedGridIO<3,Cuda>::TestSave();
+}
+
+TEST( DistributedGridIO, Load_1D )
+{
+    TestDistributedGridIO<1,Cuda>::TestLoad();
+}
+
+TEST( DistributedGridIO, Load_2D )
+{
+    TestDistributedGridIO<2,Cuda>::TestLoad();
+}
+
+TEST( DistributedGridIO, Load_3D )
+{
+    TestDistributedGridIO<3,Cuda>::TestLoad();
+}
+
+#else
+TEST(NoMPI, NoTest)
+{
+    ASSERT_TRUE(true) << ":-(";
+}
+#endif
+
+#endif
+
+#if (defined(HAVE_GTEST) && defined(HAVE_MPI))
+#include <sstream>
+
+  class MinimalistBuffredPrinter : public ::testing::EmptyTestEventListener {
+      
+  private:
+      std::stringstream sout;
+      
+  public:
+      
+    // Called before a test starts.
+    virtual void OnTestStart(const ::testing::TestInfo& test_info) {
+      sout<< test_info.test_case_name() <<"." << test_info.name() << " Start." <<std::endl;
+    }
+
+    // Called after a failed assertion or a SUCCEED() invocation.
+    virtual void OnTestPartResult(
+        const ::testing::TestPartResult& test_part_result) {
+      sout << (test_part_result.failed() ? "====Failure=== " : "===Success=== ") 
+              << test_part_result.file_name() << " "
+              << test_part_result.line_number() <<std::endl
+              << test_part_result.summary() <<std::endl;
+    }
+
+    // Called after a test ends.
+    virtual void OnTestEnd(const ::testing::TestInfo& test_info) 
+    {
+        int rank=CommunicatorType::GetRank();
+        sout<< test_info.test_case_name() <<"." << test_info.name() << " End." <<std::endl;
+        std::cout << rank << ":" << std::endl << sout.str()<< std::endl;
+        sout.str( std::string() );
+        sout.clear();
+    }
+  };
+#endif
+
+#include "../GtestMissingError.h"
+int main( int argc, char* argv[] )
+{
+#ifdef HAVE_GTEST
+   ::testing::InitGoogleTest( &argc, argv );
+
+    #ifdef HAVE_MPI
+       ::testing::TestEventListeners& listeners =
+          ::testing::UnitTest::GetInstance()->listeners();
+
+       delete listeners.Release(listeners.default_result_printer());
+       listeners.Append(new MinimalistBuffredPrinter);
+
+       CommunicatorType::Init(argc,argv);
+    #endif
+       int result= RUN_ALL_TESTS();
+
+    #ifdef HAVE_MPI
+       CommunicatorType::Finalize();
+    #endif
+       return result;
+#else
+   
+   throw GtestMissingError();
+#endif
+}
diff --git a/tests/mpi/CMakeLists.txt b/tests/mpi/CMakeLists.txt
index 9935c6a8924918d48e9af4237221bb6973c4a8ba..e42d94f65f9210b63291755bcb15431dd960c5ea 100644
--- a/tests/mpi/CMakeLists.txt
+++ b/tests/mpi/CMakeLists.txt
@@ -16,6 +16,10 @@
    	TARGET_COMPILE_DEFINITIONS( tnlMeshFuncttionEvaluateTestY PUBLIC "-DDIMENSION=2" )
    	TARGET_COMPILE_DEFINITIONS( tnlMeshFuncttionEvaluateTestY PUBLIC "-DYDISTR" )
 
+    ADD_EXECUTABLE( mpiio-save-test ${headers} mpiio-save-test.cpp )   
+    TARGET_LINK_LIBRARIES( tnlMeshFuncttionEvaluateTestY ${CPPUNIT_LIBRARIES}
+                                                           tnl )
+
     #ADD_EXECUTABLE( tnlMeshFuncttionEvaluateTestZ ${headers} MeshFunctionEvauateTest.cpp )   
     #TARGET_LINK_LIBRARIES( tnlMeshFuncttionEvaluateTestZ ${CPPUNIT_LIBRARIES}
           #                                                  tnl${mpiExt}-0.1 )
@@ -28,3 +32,12 @@
 	#TARGET_COMPILE_DEFINITIONS( tnlMeshFuncttionEvaluateTestXY PUBLIC "-DDIMENSION=3" )
 	#TARGET_COMPILE_DEFINITIONS( tnlMeshFuncttionEvaluateTestXY PUBLIC "-DXDISTR -DYDISTR" )
 
+IF( BUILD_CUDA )
+    CUDA_ADD_EXECUTABLE( mpi-gpu-test ${headers} mpi-gpu.cu )
+    TARGET_LINK_LIBRARIES( mpi-gpu-test ${CPPUNIT_LIBRARIES}
+                                                   tnl )   
+
+    CUDA_ADD_EXECUTABLE( GPUmeshFunctionEvaluateTest ${headers} GPUmeshFuncitonEvaluateTest.cu )
+    TARGET_LINK_LIBRARIES( GPUmeshFunctionEvaluateTest ${CPPUNIT_LIBRARIES}
+                                           tnl )   
+ENDIF( BUILD_CUDA )    
diff --git a/tests/mpi/GPUmeshFuncitonEvaluateTest.cu b/tests/mpi/GPUmeshFuncitonEvaluateTest.cu
new file mode 100644
index 0000000000000000000000000000000000000000..c179304c878e940244c8320a92692e336f79ce4e
--- /dev/null
+++ b/tests/mpi/GPUmeshFuncitonEvaluateTest.cu
@@ -0,0 +1,182 @@
+#include <iostream>
+
+
+#if defined(HAVE_MPI) && defined(HAVE_CUDA)
+
+#include <TNL/Containers/Array.h>
+#include <TNL/Meshes/Grid.h>
+#include <TNL/Meshes/DistributedMeshes/DistributedMesh.h>
+#include <TNL/Communicators/MpiCommunicator.h>
+#include <TNL/Communicators/NoDistrCommunicator.h>
+#include <TNL/Functions/MeshFunction.h>
+
+#include <TNL/Timer.h>
+#include  <TNL/SharedPointer.h>
+
+using namespace std;
+
+#define DIMENSION 3
+//#define OUTPUT 
+#define XDISTR
+//#define YDISTR
+//#define ZDISTR
+
+#include "../../src/UnitTests/Mpi/Functions.h"
+
+using namespace TNL;
+using namespace TNL::Containers;
+using namespace TNL::Meshes;
+using namespace TNL::Meshes::DistributedMeshes;
+using namespace TNL::Communicators;
+using namespace TNL::Functions;
+using namespace TNL::Devices;
+
+ 
+int main ( int argc, char *argv[])
+{
+    
+  Timer all,setup,eval,sync;
+  
+  typedef Cuda Device;  
+  //typedef Host Device;  
+
+  typedef MpiCommunicator CommunicatorType;
+  //typedef NoDistrCommunicator CommType;
+  typedef Grid<DIMENSION, double,Device,int> MeshType;
+  typedef MeshFunction<MeshType> MeshFunctionType;
+  typedef Vector<double,Device,int> DofType;
+  typedef typename MeshType::Cell Cell;
+  typedef typename MeshType::IndexType IndexType; 
+  typedef typename MeshType::PointType PointType; 
+  
+  typedef DistributedMesh<MeshType> DistributedMeshType;
+  
+  typedef LinearFunction<double,DIMENSION> LinearFunctionType;
+  typedef ConstFunction<double,DIMENSION> ConstFunctionType;
+  
+  CommunicatorType::Init(argc,argv);
+
+  int size=10;
+  int cycles=1;
+  if(argc==3)
+  {
+      size=strtol(argv[1],NULL,10);
+      cycles=strtol(argv[2],NULL,10);
+      //cout << "size: "<< size <<"cycles: "<< cycles <<endl;
+  }
+  
+    all.start();
+    setup.start();
+  
+ PointType globalOrigin;
+ globalOrigin.setValue(-0.5);
+ 
+ PointType globalProportions;
+ globalProportions.setValue(size);
+ 
+ 
+ MeshType globalGrid;
+ globalGrid.setDimensions(globalProportions);
+ globalGrid.setDomain(globalOrigin,globalProportions);
+
+ 
+ int distr[DIMENSION];
+ for(int i=0;i<DIMENSION;i++) 
+    distr[i]=1;
+
+ #ifdef XDISTR
+     distr[0]=0;
+ #endif
+
+ #ifdef YDISTR
+    distr[1]=0;
+ #endif
+
+ #ifdef ZDISTR
+     distr[2]=0;
+ #endif
+
+ typename MeshType::CoordinatesType overlap;
+ overlap.setValue(1);
+ DistributedMeshType distrgrid;
+ distrgrid.template setGlobalGrid<CommunicatorType>(globalGrid,overlap, distr); 
+   
+ SharedPointer<MeshType> gridptr;
+ SharedPointer<MeshFunctionType> meshFunctionptr;
+ MeshFunctionEvaluator< MeshFunctionType, LinearFunctionType > linearFunctionEvaluator;
+ MeshFunctionEvaluator< MeshFunctionType, ConstFunctionType > constFunctionEvaluator;
+ 
+  distrgrid.SetupGrid(*gridptr);
+  
+  DofType dof(gridptr->template getEntitiesCount< Cell >());
+
+  dof.setValue(0);
+  
+  meshFunctionptr->bind(gridptr,dof);  
+  
+  SharedPointer< LinearFunctionType, Device > linearFunctionPtr;
+  SharedPointer< ConstFunctionType, Device > constFunctionPtr; 
+   
+  setup.stop();
+  
+  double sum=0.0;
+
+  constFunctionPtr->Number=CommunicatorType::GetRank();
+  
+  for(int i=0;i<cycles;i++)
+    {    
+        eval.start();
+        
+        constFunctionEvaluator.evaluateAllEntities( meshFunctionptr , constFunctionPtr );
+        //linearFunctionEvaluator.evaluateAllEntities(meshFunctionptr , linearFunctionPtr);
+        CommunicatorType::Barrier();
+        eval.stop();
+
+        sync.start();    
+        meshFunctionptr->template Synchronize<CommunicatorType>();
+        CommunicatorType::Barrier();
+        sync.stop();
+
+        ///sum+=dof[gridptr->getDimensions().x()/2]; //dummy acces to array    
+    }
+  all.stop();
+  
+#ifdef OUTPUT    
+  //print local dof
+  Printer<MeshType,DofType>::print_dof(CommunicatorType::GetRank(),*gridptr, dof);
+#endif
+  
+  if(CommunicatorType::GetRank()==0)
+  {
+    cout << sum <<endl<<endl;  
+    
+    cout<<"distr: ";
+    distrgrid.printdistr(cout);
+    cout << endl;
+  
+    cout<<"setup: "<<setup.getRealTime() <<endl;
+    cout<<"evalpercycle: "<<eval.getRealTime()/cycles<<endl;
+    cout<<"syncpercycle: "<<sync.getRealTime()/cycles<<endl;
+    cout <<"eval: "<<eval.getRealTime()<<endl;
+    cout <<"sync: "<<sync.getRealTime()<<endl;
+    cout<<"all: "<<all.getRealTime()<<endl<<endl;
+  }
+  
+
+  CommunicatorType::Finalize();
+
+  return 0;
+
+}
+
+#else
+
+using namespace std;
+
+int main(void)
+{
+    cout << "MPI or Cuda missing...." <<endl;
+}
+#endif
+
+
diff --git a/tests/mpi/mpi-gpu.cu b/tests/mpi/mpi-gpu.cu
new file mode 100644
index 0000000000000000000000000000000000000000..e136e6bead8350ef8dedb1cab364310f117234cb
--- /dev/null
+++ b/tests/mpi/mpi-gpu.cu
@@ -0,0 +1,79 @@
+#include <iostream>
+
+using namespace std;
+
+
+#if defined(HAVE_MPI) && defined(HAVE_CUDA)
+
+#include <cuda_runtime.h>
+#include <mpi.h>
+ 
+__global__ void SetKernel(float *deviceData, float value)
+{
+    // Just a dummy kernel
+    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    deviceData[idx] = value;
+}
+
+double sum(float * data, int count)
+{
+    double sum=0;
+    for(int i=0;i<count;i++)
+        sum+=data[i];
+
+    return sum;
+}
+
+
+int main(int argc, char **argv)
+{
+    MPI_Init(&argc, &argv);
+
+    int rank;
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+    int blockSize = 256; //poÄŤet threadĹŻ v v bloku
+    int gridSize = 1000; //počet bloků v gridu -> musí být menší než maxGridSize v cudeGetDeviceProperties
+
+    int dataCount=blockSize*gridSize;
+
+    float * deviceData=NULL;
+    cudaMalloc((void **)&deviceData, dataCount * sizeof(float));
+
+    if(rank==0)
+    {
+        cout << rank<<": "<<"Setup GPU alocated array to 1" << endl;
+        SetKernel<<< gridSize,blockSize >>>(deviceData,1.0f);
+        cout << rank<<": "<<" Sending GPU data " <<endl;
+        MPI_Send((void*)deviceData, dataCount, MPI_FLOAT, 1, 1, MPI_COMM_WORLD);
+    }
+    
+    if(rank==1) 
+    {
+        cout << rank<<": "<<" Reciving GPU data " <<endl;
+        MPI_Recv((void*) deviceData, dataCount, MPI_FLOAT, 0, 1, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+
+        float *data = new float[dataCount];
+        cout << rank<<": "<<" Copying data from GPU to CPU " <<endl;
+        cudaMemcpy( (void*) data, (void*)deviceData, dataCount*sizeof(float),  cudaMemcpyDeviceToHost);    
+        cout << rank<<": "<<" Computin Sum on CPU " <<endl;
+        cout << rank<<": "<< "sum:" << sum(data,dataCount) << endl;
+        delete [] data;
+    }
+
+    cudaFree(deviceData);
+
+    MPI_Finalize();
+return 0;
+}
+
+#else
+
+int main(void)
+{
+    cout << "CUDA or MPI missing...." <<endl;
+}
+
+#endif
+
+