diff --git a/src/Python/pytnl/tnl/MeshReaders.cpp b/src/Python/pytnl/tnl/MeshReaders.cpp
index c4abae015de76ee4ff440871ed57752f13a7ad79..c009300f3c447d593c9b125911811ce0239195fd 100644
--- a/src/Python/pytnl/tnl/MeshReaders.cpp
+++ b/src/Python/pytnl/tnl/MeshReaders.cpp
@@ -20,6 +20,8 @@ void export_MeshReaders( py::module & m )
         .def("loadMesh", &MeshReader::template loadMesh< MeshOfQuadrangles >)
         .def("loadMesh", &MeshReader::template loadMesh< MeshOfTetrahedrons >)
         .def("loadMesh", &MeshReader::template loadMesh< MeshOfHexahedrons >)
+        .def("readPointData", &MeshReader::readPointData)
+        .def("readCellData", &MeshReader::readCellData)
     ;
 
     py::class_< TNL::Meshes::Readers::VTKReader, MeshReader >( m, "VTKReader" )
@@ -29,8 +31,6 @@ void export_MeshReaders( py::module & m )
     // base class for VTUReader and PVTUReader
     py::class_< XMLVTK, PyXMLVTK, MeshReader >( m, "XMLVTK" )
         .def(py::init<std::string>())
-        .def("readPointData", &XMLVTK::readPointData)
-        .def("readCellData", &XMLVTK::readCellData)
    ;
 
     py::class_< TNL::Meshes::Readers::VTUReader, XMLVTK >( m, "VTUReader" )
diff --git a/src/TNL/Containers/NDArray.h b/src/TNL/Containers/NDArray.h
index 674e1c558cfc57e5f56cf5178512bff976ef4fdb..2df4d40e9fc728a3c3142447a2564531478ce7cd 100644
--- a/src/TNL/Containers/NDArray.h
+++ b/src/TNL/Containers/NDArray.h
@@ -167,9 +167,6 @@ public:
    operator()( IndexTypes&&... indices )
    {
       static_assert( sizeof...( indices ) == getDimension(), "got wrong number of indices" );
-      __ndarray_impl::assertIndicesInBounds( getSizes(), OverlapsType{}, std::forward< IndexTypes >( indices )... );
-      TNL_ASSERT_LT( getStorageIndex( std::forward< IndexTypes >( indices )... ), getStorageSize(),
-                     "storage index out of bounds - either input error or a bug in the indexer" );
       return array[ getStorageIndex( std::forward< IndexTypes >( indices )... ) ];
    }
 
@@ -179,9 +176,6 @@ public:
    operator()( IndexTypes&&... indices ) const
    {
       static_assert( sizeof...( indices ) == getDimension(), "got wrong number of indices" );
-      __ndarray_impl::assertIndicesInBounds( getSizes(), OverlapsType{}, std::forward< IndexTypes >( indices )... );
-      TNL_ASSERT_LT( getStorageIndex( std::forward< IndexTypes >( indices )... ), getStorageSize(),
-                     "storage index out of bounds - either input error or a bug in the indexer" );
       return array[ getStorageIndex( std::forward< IndexTypes >( indices )... ) ];
    }
 
@@ -294,9 +288,6 @@ public:
    getElement( IndexTypes&&... indices ) const
    {
       static_assert( sizeof...( indices ) == getDimension(), "got wrong number of indices" );
-      __ndarray_impl::assertIndicesInBounds( getSizes(), OverlapsType{}, std::forward< IndexTypes >( indices )... );
-      TNL_ASSERT_LT( getStorageIndex( std::forward< IndexTypes >( indices )... ), getStorageSize(),
-                     "storage index out of bounds - either input error or a bug in the indexer" );
       return array.getElement( getStorageIndex( std::forward< IndexTypes >( indices )... ) );
    }
 
diff --git a/src/TNL/Containers/NDArrayIndexer.h b/src/TNL/Containers/NDArrayIndexer.h
index e3f068e0cacb774db90122c25617e8531ae52787..b168209cfce0a4edfeab007348aa498f3e981677 100644
--- a/src/TNL/Containers/NDArrayIndexer.h
+++ b/src/TNL/Containers/NDArrayIndexer.h
@@ -91,10 +91,19 @@ public:
    getStorageIndex( IndexTypes&&... indices ) const
    {
       static_assert( sizeof...( indices ) == SizesHolder::getDimension(), "got wrong number of indices" );
-      return Base::template getStorageIndex< Permutation, Overlaps >
-             ( sizes,
-               static_cast< const StridesHolder& >( *this ),
-               std::forward< IndexTypes >( indices )... );
+      __ndarray_impl::assertIndicesInBounds( getSizes(), OverlapsType{}, std::forward< IndexTypes >( indices )... );
+      const IndexType result = Base::template getStorageIndex< Permutation, Overlaps >
+                               ( sizes,
+                                 static_cast< const StridesHolder& >( *this ),
+                                 std::forward< IndexTypes >( indices )... );
+      TNL_ASSERT_GE( result, (IndexType) 0,
+                     "storage index out of bounds - either input error or a bug in the indexer" );
+      // upper bound can be checked only for contiguous arrays/views
+      if( StridesHolder::isContiguous() ) {
+         TNL_ASSERT_LT( result, getStorageSize(),
+                        "storage index out of bounds - either input error or a bug in the indexer" );
+      }
+      return result;
    }
 
 protected:
diff --git a/src/TNL/Containers/NDArrayView.h b/src/TNL/Containers/NDArrayView.h
index 57d166b2ef9e88f97102bdd3a15f740c7890f7ab..60397f613f295275f24bfad04e7b10555feeb04c 100644
--- a/src/TNL/Containers/NDArrayView.h
+++ b/src/TNL/Containers/NDArrayView.h
@@ -223,7 +223,6 @@ public:
    operator()( IndexTypes&&... indices )
    {
       static_assert( sizeof...( indices ) == getDimension(), "got wrong number of indices" );
-      __ndarray_impl::assertIndicesInBounds( getSizes(), OverlapsType{}, std::forward< IndexTypes >( indices )... );
       return array[ getStorageIndex( std::forward< IndexTypes >( indices )... ) ];
    }
 
@@ -233,7 +232,6 @@ public:
    operator()( IndexTypes&&... indices ) const
    {
       static_assert( sizeof...( indices ) == getDimension(), "got wrong number of indices" );
-      __ndarray_impl::assertIndicesInBounds( getSizes(), OverlapsType{}, std::forward< IndexTypes >( indices )... );
       return array[ getStorageIndex( std::forward< IndexTypes >( indices )... ) ];
    }
 
diff --git a/src/TNL/MPI/Utils.h b/src/TNL/MPI/Utils.h
index ab2e0cbe8c19bc49b84437298e4c163c0f643829..bb58a1a12c55b91a4fd0673818fb81180a718593 100644
--- a/src/TNL/MPI/Utils.h
+++ b/src/TNL/MPI/Utils.h
@@ -72,5 +72,30 @@ inline int getRankOnNode( MPI_Comm group = AllGroup() )
 #endif
 }
 
+/**
+ * \brief Applies the given reduction operation to the values among all ranks
+ * in the given communicator.
+ *
+ * This is a collective operation which uses \ref Allreduce internally. It
+ * provides nicer semantics than the wrapper function: the input value is passed
+ * by value (heh) rather then by pointer, and the result is returned rather than
+ * written to the output pointer.
+ *
+ * \param value Value of the current rank to be reduced.
+ * \param op The reduction operation to be applied.
+ * \param group The communicator comprising ranks that participate in the
+ *              collective operation.
+ * \return The reduced value (it is ensured that all ranks receive the same
+ *         value).
+ */
+template< typename T >
+T reduce( T value, const MPI_Op& op, MPI_Comm group = AllGroup() )
+{
+   // call the in-place variant of Allreduce
+   Allreduce( &value, 1, op, group );
+   // return the reduced value
+   return value;
+}
+
 } // namespace MPI
 } // namespace TNL
diff --git a/src/TNL/Meshes/MeshDetails/layers/EntityTags/Layer.h b/src/TNL/Meshes/MeshDetails/layers/EntityTags/Layer.h
index dc0c767b806000f2497f2cae0e32b1d67aa0e5cf..72640715841dac5003e8c53e2e37d77f3416d9ac 100644
--- a/src/TNL/Meshes/MeshDetails/layers/EntityTags/Layer.h
+++ b/src/TNL/Meshes/MeshDetails/layers/EntityTags/Layer.h
@@ -81,14 +81,12 @@ public:
       ghostsOffset = entitiesCount;
    }
 
-   __cuda_callable__
    typename EntityTagsArrayType::ViewType
    getEntityTagsView( DimensionTag )
    {
       return tags.getView();
    }
 
-   __cuda_callable__
    typename EntityTagsArrayType::ConstViewType
    getEntityTagsView( DimensionTag ) const
    {
diff --git a/src/TNL/Meshes/MeshDetails/layers/EntityTags/LayerFamily.h b/src/TNL/Meshes/MeshDetails/layers/EntityTags/LayerFamily.h
index 2914850a7fed5253e3dfc772b79ed8354e4510ef..163a3a94bc4d7ae2c910ab39030033b737552142 100644
--- a/src/TNL/Meshes/MeshDetails/layers/EntityTags/LayerFamily.h
+++ b/src/TNL/Meshes/MeshDetails/layers/EntityTags/LayerFamily.h
@@ -170,7 +170,6 @@ public:
    using BaseType::operator=;
 
    template< int Dimension >
-   __cuda_callable__
    typename EntityTagsArrayType::ViewType
    getEntityTagsView()
    {
@@ -179,7 +178,6 @@ public:
    }
 
    template< int Dimension >
-   __cuda_callable__
    typename EntityTagsArrayType::ConstViewType
    getEntityTagsView() const
    {
diff --git a/src/TNL/Meshes/Readers/VTKReader.h b/src/TNL/Meshes/Readers/VTKReader.h
index 58a196a852d1b506593b3a35c1babd12d2da16bd..d6cbfb3e28784cb246c31e7d3ff492a765279af3 100644
--- a/src/TNL/Meshes/Readers/VTKReader.h
+++ b/src/TNL/Meshes/Readers/VTKReader.h
@@ -17,7 +17,7 @@
 #include <vector>
 
 #include <TNL/Meshes/Readers/MeshReader.h>
-#include <TNL/Exceptions/NotImplementedError.h>
+#include <TNL/Endianness.h>
 
 namespace TNL {
 namespace Meshes {
@@ -39,33 +39,34 @@ public:
 
       std::ifstream inputFile( fileName );
       if( ! inputFile )
-         throw MeshReaderError( "VTKReader", "failed to open the file '" + fileName + "'." );
+         throw MeshReaderError( "VTKReader", "failed to open the file '" + fileName + "'" );
 
-      if( ! parseHeader( inputFile ) )
-         throw MeshReaderError( "VTKReader", "failed to parse the VTK file header." );
-      const auto positionAfterHeading = inputFile.tellg();
+      parseHeader( inputFile );
 
       if( dataset != "UNSTRUCTURED_GRID" )
-         throw MeshReaderError( "VTKReader", "the dataset '" + dataset + "' is not supported." );
+         throw MeshReaderError( "VTKReader", "the dataset '" + dataset + "' is not supported" );
 
-      // TODO: implement binary parsing
-      if( dataType == "BINARY" )
-         throw Exceptions::NotImplementedError("VTKReader: parsing of BINARY data is not implemented yet.");
+      // parse the file, find the starting positions of all relevant sections
+      findSections( inputFile );
 
       std::string line, aux;
       std::istringstream iss;
 
       // parse points section
-      if( ! findSection( inputFile, "POINTS" ) )
-         throw MeshReaderError( "VTKReader", "unable to find the POINTS section, the file may be invalid or corrupted." );
+      if( ! sectionPositions.count( "POINTS" ) )
+         throw MeshReaderError( "VTKReader", "unable to find the POINTS section, the file may be invalid or corrupted" );
+      inputFile.seekg( sectionPositions["POINTS"] );
       getline( inputFile, line );
       iss.clear();
       iss.str( line );
       iss >> aux;
       iss >> NumberOfPoints;
       iss >> pointsType;
+      if( pointsType != "float" && pointsType != "double" )
+         throw MeshReaderError( "VTKReader", "unsupported data type for POINTS: " + pointsType );
 
       // global index type is not stored in legacy VTK files
+      // (binary VTK files don't support int64)
       connectivityType = offsetsType = "std::int32_t";
       // only std::uint8_t makes sense for entity types
       typesType = "std::uint8_t";
@@ -78,22 +79,18 @@ public:
       // read points
       worldDimension = 0;
       for( std::size_t pointIndex = 0; pointIndex < NumberOfPoints; pointIndex++ ) {
-         if( ! inputFile ) {
-            reset();
-            throw MeshReaderError( "VTKReader", "unable to read enough vertices, the file may be invalid or corrupted." );
-         }
-         getline( inputFile, line );
+         if( ! inputFile )
+            throw MeshReaderError( "VTKReader", "unable to read enough vertices, the file may be invalid or corrupted" );
 
          // read the coordinates and compute the world dimension
-         iss.clear();
-         iss.str( line );
          for( int i = 0; i < 3; i++ ) {
-            double aux;
-            iss >> aux;
-            if( ! iss ) {
-               reset();
-               throw MeshReaderError( "VTKReader", "unable to read " + std::to_string(i) + "th component of the vertex number " + std::to_string(pointIndex) + "." );
-            }
+            double aux = 0;
+            if( pointsType == "float" )
+               aux = readValue< float >( dataFormat, inputFile );
+            else
+               aux = readValue< double >( dataFormat, inputFile );
+            if( ! inputFile )
+               throw MeshReaderError( "VTKReader", "unable to read " + std::to_string(i) + "th component of the vertex number " + std::to_string(pointIndex) );
             if( aux != 0.0 )
                worldDimension = std::max( worldDimension, i + 1 );
             pointsArray.push_back( aux );
@@ -101,30 +98,22 @@ public:
       }
 
       // skip to the CELL_TYPES section
-      if( ! findSection( inputFile, "CELL_TYPES" ) ) {
-         reset();
-         throw MeshReaderError( "VTKReader", "unable to find the CELL_TYPES section, the file may be invalid or corrupted." );
-      }
+      if( ! sectionPositions.count( "CELL_TYPES" ) )
+         throw MeshReaderError( "VTKReader", "unable to find the CELL_TYPES section, the file may be invalid or corrupted" );
+      inputFile.seekg( sectionPositions["CELL_TYPES"] );
       getline( inputFile, line );
       iss.clear();
       iss.str( line );
       iss >> aux;
-      std::size_t NumberOfEntities;
+      std::size_t NumberOfEntities = 0;
       iss >> NumberOfEntities;
 
       // read entity types
       for( std::size_t entityIndex = 0; entityIndex < NumberOfEntities; entityIndex++ ) {
-         if( ! inputFile ) {
-            reset();
-            throw MeshReaderError( "VTKReader", "unable to read enough cell types, the file may be invalid or corrupted." );
-         }
-         getline( inputFile, line );
-
-         // get entity type
-         int typeId;
-         iss.clear();
-         iss.str( line );
-         iss >> typeId;
+         if( ! inputFile )
+            throw MeshReaderError( "VTKReader", "unable to read enough cell types, the file may be invalid or corrupted" );
+         // cell types are stored with great redundancy as int32 in the VTK file
+         const std::uint8_t typeId = readValue< std::int32_t >( dataFormat, inputFile );
          typesArray.push_back( typeId );
       }
 
@@ -146,10 +135,8 @@ public:
          }
       }
 
-      if( meshDimension == 0 ) {
-         reset();
+      if( meshDimension == 0 )
          throw MeshReaderError( "VTKReader", "Mesh dimension cannot be 0. Are there any entities at all?" );
-      }
 
       // filter out cell shapes
       std::vector< std::uint8_t > cellTypes;
@@ -165,7 +152,6 @@ public:
          const std::string msg = "invalid number of cells (" + std::to_string(NumberOfCells) + "). Counts of entities for each dimension (0,1,2,3) are: "
                                + std::to_string(entitiesCounts[0]) + ", " + std::to_string(entitiesCounts[1]) + ", "
                                + std::to_string(entitiesCounts[2]) + ", " + std::to_string(entitiesCounts[3]);
-         reset();
          throw MeshReaderError( "VTKReader", msg );
       }
 
@@ -174,41 +160,31 @@ public:
       for( auto c : cellTypes )
          if( (VTK::EntityShape) c != cellShape ) {
             const std::string msg = "Mixed unstructured meshes are not supported. There are cells with type "
-                                  + VTK::getShapeName(cellShape) + " and " + VTK::getShapeName((VTK::EntityShape) c) + ".";
-            reset();
+                                  + VTK::getShapeName(cellShape) + " and " + VTK::getShapeName((VTK::EntityShape) c);
             throw MeshReaderError( "VTKReader", msg );
          }
 
       // find to the CELLS section
-      if( ! findSection( inputFile, "CELLS", positionAfterHeading ) ) {
-         reset();
-         throw MeshReaderError( "VTKReader", "unable to find the CELLS section, the file may be invalid or corrupted." );
-      }
+      if( ! sectionPositions.count( "CELLS" ) )
+         throw MeshReaderError( "VTKReader", "unable to find the CELLS section, the file may be invalid or corrupted" );
+      inputFile.seekg( sectionPositions["CELLS"] );
       getline( inputFile, line );
 
       // read entities
       for( std::size_t entityIndex = 0; entityIndex < NumberOfEntities; entityIndex++ ) {
-         if( ! inputFile ) {
-            reset();
-            throw MeshReaderError( "VTKReader", "unable to read enough cells, the file may be invalid or corrupted."
+         if( ! inputFile )
+            throw MeshReaderError( "VTKReader", "unable to read enough cells, the file may be invalid or corrupted"
                                                 " (entityIndex = " + std::to_string(entityIndex) + ")" );
-         }
-         getline( inputFile, line );
 
          if( (VTK::EntityShape) typesArray[ entityIndex ] == cellShape ) {
-            iss.clear();
-            iss.str( line );
             // read number of subvertices
-            int subvertices = 0;
-            iss >> subvertices;
+            const std::int32_t subvertices = readValue< std::int32_t >( dataFormat, inputFile );
             for( int v = 0; v < subvertices; v++ ) {
-               std::size_t vid;
-               iss >> vid;
-               if( ! iss ) {
-                  reset();
-                  throw MeshReaderError( "VTKReader", "unable to read enough cells, the file may be invalid or corrupted."
+               // legacy VTK files do not support 64-bit integers, even in the BINARY format
+               const std::int32_t vid = readValue< std::int32_t >( dataFormat, inputFile );
+               if( ! inputFile )
+                  throw MeshReaderError( "VTKReader", "unable to read enough cells, the file may be invalid or corrupted"
                                                       " (entityIndex = " + std::to_string(entityIndex) + ", subvertex = " + std::to_string(v) + ")" );
-               }
                connectivityArray.push_back( vid );
             }
             offsetsArray.push_back( connectivityArray.size() );
@@ -228,82 +204,339 @@ public:
       meshType = "Meshes::Mesh";
    }
 
+   virtual VariantVector
+   readPointData( std::string arrayName ) override
+   {
+      return readPointOrCellData( "POINT_DATA", arrayName );
+   }
+
+   virtual VariantVector
+   readCellData( std::string arrayName ) override
+   {
+      return readPointOrCellData( "CELL_DATA", arrayName );
+   }
+
    virtual void reset() override
    {
       resetBase();
-      dataType = "";
+      dataFormat = VTK::FileFormat::ascii;
       dataset = "";
+      sectionPositions.clear();
    }
 
 protected:
    // output of parseHeader
-   std::string dataType;
+   VTK::FileFormat dataFormat = VTK::FileFormat::ascii;
    std::string dataset;
 
-   bool parseHeader( std::istream& str )
+   // output of findSections
+   std::map< std::string, std::ios::pos_type > sectionPositions;
+
+   // mesh properties - needed for reading POINT_DATA and CELL_DATA
+   std::int32_t points_count = 0;
+   std::int32_t cells_count = 0;
+
+   void parseHeader( std::istream& str )
    {
       std::string line;
       std::istringstream iss;
 
       // check header
       getline( str, line );
-      if( line != "# vtk DataFile Version 2.0" ) {
-         std::cerr << "VTKReader: unsupported VTK header: '" << line << "'." << std::endl;
-         return false;
-      }
+      if( line != "# vtk DataFile Version 2.0" )
+         throw MeshReaderError( "VTKReader", "failed to parse the VTK file header: unsupported VTK header '" + line + "'" );
 
       // skip title
       if( ! str )
-         return false;
+         throw MeshReaderError( "VTKReader", "failed to parse the VTK file header" );
       getline( str, line );
 
       // parse data type
       if( ! str )
-         return false;
-      getline( str, dataType );
-      if( dataType != "ASCII" && dataType != "BINARY" ) {
-         std::cerr << "VTKReader: unknown data type: '" << dataType << "'." << std::endl;
-         return false;
-      }
+         throw MeshReaderError( "VTKReader", "failed to parse the VTK file header" );
+      std::string format;
+      getline( str, format );
+      if( format == "ASCII" )
+         dataFormat = VTK::FileFormat::ascii;
+      else if( format == "BINARY" )
+         dataFormat = VTK::FileFormat::binary;
+      else
+         throw MeshReaderError( "VTKReader", "unknown data format: '" + format + "'" );
 
       // parse dataset
       if( ! str )
-         return false;
+         throw MeshReaderError( "VTKReader", "failed to parse the VTK file header" );
       getline( str, line );
       iss.clear();
       iss.str( line );
       std::string tmp;
       iss >> tmp;
-      if( tmp != "DATASET" ) {
-         std::cerr << "VTKReader: wrong dataset specification: '" << line << "'." << std::endl;
-         return false;
-      }
+      if( tmp != "DATASET" )
+         throw MeshReaderError( "VTKReader", "wrong dataset specification: '" + line + "'" );
       iss >> dataset;
+   }
+
+   void findSections( std::istream& str )
+   {
+      while( str ) {
+         // drop all whitespace (empty lines etc) before saving a position and reading a line
+         str >> std::ws;
+         if( str.eof() )
+            break;
+
+         // read a line which should contain the following section header
+         const std::ios::pos_type currentPosition = str.tellg();
+         std::string line;
+         getline( str, line );
+         if( ! str )
+            throw MeshReaderError( "VTKReader", "failed to parse sections of the VTK file" );
+
+         // parse the section name
+         std::istringstream iss( line );
+         std::string name;
+         iss >> name;
+
+         if( name == "FIELD" ) {
+            sectionPositions.insert( {"FIELD", currentPosition} );
+            // parse the rest of the line: FIELD FieldData <count>
+            std::string aux;
+            int count = 0;
+            iss >> aux >> count;
+            // skip the FieldData arrays
+            for( int i = 0; i < count; i++ ) {
+               getline( str, line );
+               iss.clear();
+               iss.str( line );
+               // <name> <components> <tuples> <datatype>
+               std::int32_t components = 0;
+               std::int32_t tuples = 0;
+               std::string datatype;
+               iss >> aux >> components >> tuples >> datatype;
+               if( ! iss )
+                  throw MeshReaderError( "VTKReader", "failed to extract FieldData information from line '" + line + "'" );
+               // skip the points coordinates
+               for( std::int32_t j = 0; j < components * tuples; j++ )
+                  skipValue( dataFormat, str, datatype );
+            }
+         }
+         else if( name == "POINTS" ) {
+            sectionPositions.insert( {"POINTS", currentPosition} );
+            // parse the rest of the line: POINTS <points_count> <datatype>
+            std::string datatype;
+            iss >> points_count >> datatype;
+            // skip the values
+            for( std::int32_t j = 0; j < 3 * points_count; j++ )
+               skipValue( dataFormat, str, datatype );
+         }
+         else if( name == "CELLS" ) {
+            sectionPositions.insert( {"CELLS", currentPosition} );
+            // parse the rest of the line: CELLS <cells_count> <values_count>
+            std::int32_t values_count = 0;
+            iss >> cells_count >> values_count;
+            // skip the values
+            for( std::int32_t j = 0; j < values_count; j++ )
+               skipValue( dataFormat, str, "int" );
+         }
+         else if( name == "CELL_TYPES" ) {
+            sectionPositions.insert( {"CELL_TYPES", currentPosition} );
+            // parse the rest of the line: CELL_TYPES <count>
+            std::int32_t count = 0;
+            iss >> count;
+            // skip the values
+            for( std::int32_t j = 0; j < count; j++ )
+               // cell types are stored with great redundancy as int32 in the VTK file
+               skipValue( dataFormat, str, "int" );
+         }
+         else if( name == "CELL_DATA" || name == "POINT_DATA" ) {
+            if( cells_count == 0 || points_count == 0 )
+               throw MeshReaderError( "VTKReader", "encountered a " + name + " section, but the mesh topology was not parsed yet "
+                                      "(cells count = " + std::to_string(cells_count) + ", points count = " + std::to_string(points_count) + ")" );
+
+            while( str ) {
+               // drop all whitespace (empty lines etc) before saving a position and reading a line
+               str >> std::ws;
+               if( str.eof() )
+                  break;
+
+               // read a line which should contain the following array metadata
+               const std::ios::pos_type currentPosition = str.tellg();
+               std::string line;
+               getline( str, line );
+               if( ! str )
+                  throw MeshReaderError( "VTKReader", "failed to parse sections of the VTK file" );
+
+               // parse the array type
+               std::istringstream iss( line );
+               std::string type;
+               iss >> type;
+
+               const std::int32_t elements = (name == "CELL_DATA") ? cells_count : points_count;
+
+               // scalars: 1 value per cell/point
+               // vectors: 3 values per cell/point
+               // fields: arbitrary number of values per cell/point
+               int values_per_element = 1;
+
+               // additional metadata
+               std::string array_name, datatype;
+
+               if( type == "SCALARS" ) {
+                  // parse the rest of the line: SCALARS <array_name> <datatype>
+                  iss >> array_name >> datatype;
+                  sectionPositions.insert( {name + "::" + array_name, currentPosition} );
+                  // skip the LOOKUP_TABLE line
+                  getline( str, line );
+               }
+               else if( type == "VECTORS" ) {
+                  values_per_element = 3;
+                  // parse the rest of the line: VECTORS <array_name> <datatype>
+                  iss >> array_name >> datatype;
+                  sectionPositions.insert( {name + "::" + array_name, currentPosition} );
+               }
+               else if( type == "TENSORS" ) {
+                  values_per_element = 9;
+                  // parse the rest of the line: TENSORS <array_name> <datatype>
+                  iss >> array_name >> datatype;
+                  sectionPositions.insert( {name + "::" + array_name, currentPosition} );
+               }
+               else if( type == "FIELD" ) {
+                  // parse the rest of the line: FIELD FieldData <count>
+                  std::string aux;
+                  int count = 0;
+                  iss >> aux >> count;
+                  // skip the FieldData arrays
+                  for( int i = 0; i < count; i++ ) {
+                     // drop all whitespace (empty lines etc) before saving a position and reading a line
+                     str >> std::ws;
+                     const std::ios::pos_type currentPosition = str.tellg();
+                     getline( str, line );
+                     iss.clear();
+                     iss.str( line );
+                     // <array_name> <components> <tuples> <datatype>
+                     std::int32_t components = 0;
+                     std::int32_t tuples = 0;
+                     std::string datatype;
+                     iss >> array_name >> components >> tuples >> datatype;
+                     if( ! iss )
+                        throw MeshReaderError( "VTKReader", "failed to extract FieldData information from line '" + line + "'" );
+                     sectionPositions.insert( {name + "::" + array_name, currentPosition} );
+                     // skip the points coordinates
+                     for( std::int32_t j = 0; j < components * tuples; j++ )
+                        skipValue( dataFormat, str, datatype );
+                  }
+                  continue;
+               }
+               else {
+                  std::cerr << "VTKReader: encountered an unsupported CELL_DATA array type: " << type
+                            << ". Ignoring the rest of the file." << std::endl;
+                  return;
+               }
 
-      return true;
+               // skip the values
+               for( std::int32_t j = 0; j < elements * values_per_element; j++ )
+                  skipValue( dataFormat, str, datatype );
+            }
+         }
+         else
+            throw MeshReaderError( "VTKReader", "parsing error: unexpected section start at byte " + std::to_string(currentPosition)
+                                    + " (section name is '" + name + "')" );
+      }
    }
 
-   bool findSection( std::istream& str, const std::string& section, std::ios::pos_type begin = -1 )
+   VariantVector
+   readPointOrCellData( std::string sectionName, std::string arrayName )
    {
-      std::string line, aux;
-      std::istringstream iss;
+      std::ifstream inputFile( fileName );
+      if( ! inputFile )
+         throw MeshReaderError( "VTKReader", "failed to open the file '" + fileName + "'" );
 
-      if( begin >= 0 )
-         str.seekg( begin );
+      std::int32_t elements = (sectionName == "CELL_DATA") ? cells_count : points_count;
+      int values_per_element = 1;
 
-      while( str ) {
-         std::ios::pos_type currentPosition = str.tellg();
-         getline( str, line );
-         iss.clear();
-         iss.str( line );
-         iss >> aux;
-         if( aux == section ) {
-            str.seekg( currentPosition );
-            return true;
+      sectionName += "::" + arrayName;
+      if( ! sectionPositions.count( sectionName ) ) {
+         throw MeshReaderError( "VTKReader", "array " + arrayName + " was not found in the CELL_DATA section" );
+      }
+      inputFile.seekg( sectionPositions[sectionName] );
+
+      // type: SCALARS, VECTORS, etc.
+      // datatype: int, float, double
+      std::string type, datatype;
+
+      // parse the metadata line
+      std::string line;
+      getline( inputFile, line );
+      std::istringstream iss( line );
+      iss >> type;
+
+      // if the line starts with the array name, it must be a FIELD
+      if( type == arrayName ) {
+         // parse <array_name> <components> <tuples> <datatype>
+         iss >> values_per_element >> elements >> datatype;
+      }
+      else {
+         // parse the rest of the line: <type> <array_name> <datatype>
+         std::string array_name;
+         iss >> array_name >> datatype;
+         if( type == "SCALARS" ) {
+            values_per_element = 1;
+            // skip the LOOKUP_TABLE line
+            getline( inputFile, line );
          }
+         else if( type == "VECTORS" )
+            values_per_element = 3;
+         else if( type == "TENSORS" )
+            values_per_element = 9;
+         else
+            throw MeshReaderError( "VTKReader", "requested array type " + type + " is not implemented in the reader" );
       }
 
-      return false;
+      if( datatype == "int" )
+         return readDataArray< std::int32_t >( inputFile, elements * values_per_element );
+      else if( datatype == "float" )
+         return readDataArray< float >( inputFile, elements * values_per_element );
+      else if( datatype == "double" )
+         return readDataArray< double >( inputFile, elements * values_per_element );
+      else
+         throw MeshReaderError( "VTKReader", "found data type which is not implemented in the reader: " + datatype );
+   }
+
+   template< typename T >
+   std::vector<T>
+   readDataArray( std::istream& str, std::int32_t values )
+   {
+      std::vector<T> vector( values );
+      for( std::int32_t i = 0; i < values; i++ )
+         vector[i] = readValue< T >( dataFormat, str );
+      return vector;
+   }
+
+   static void skipValue( VTK::FileFormat format, std::istream& str, std::string datatype )
+   {
+      if( datatype == "int" )
+         readValue< std::int32_t >( format, str );
+      else if( datatype == "float" )
+         readValue< float >( format, str );
+      else if( datatype == "double" )
+         readValue< double >( format, str );
+      else
+         throw MeshReaderError( "VTKReader", "found data type which is not implemented in the reader: " + datatype );
+   }
+
+   template< typename T >
+   static T readValue( VTK::FileFormat format, std::istream& str )
+   {
+      T value;
+      if( format == VTK::FileFormat::binary ) {
+         str.read( reinterpret_cast<char*>(&value), sizeof(T) );
+         // forceBigEndian = swapIfLittleEndian, i.e. here it forces a big-endian
+         // value to the correct system endianness
+         value = forceBigEndian( value );
+      }
+      else {
+         str >> value;
+      }
+      return value;
    }
 };
 
diff --git a/src/TNL/Meshes/Readers/XMLVTK.h b/src/TNL/Meshes/Readers/XMLVTK.h
index af864e6e9c603f5998c6c29b9a91a0b43c64167d..9e2817a17e058abf64dcdf6167a9a4c8b9ca65ac 100644
--- a/src/TNL/Meshes/Readers/XMLVTK.h
+++ b/src/TNL/Meshes/Readers/XMLVTK.h
@@ -14,6 +14,7 @@
 
 #include <map>
 #include <set>
+#include <experimental/filesystem>
 
 #include <TNL/Meshes/Readers/MeshReader.h>
 #include <TNL/base64.h>
@@ -161,16 +162,25 @@ protected:
       return found;
    }
 
-   template< typename HeaderType >
-   static std::size_t
-   readBlockSize( const char* block )
+   template< typename T >
+   VariantVector
+   readAsciiBlock( const char* block ) const
    {
-      std::pair<std::size_t, std::unique_ptr<char[]>> decoded_data = decode_block( block, get_encoded_length(sizeof(HeaderType)) );
-      if( decoded_data.first != sizeof(HeaderType) )
-         throw MeshReaderError( "XMLVTK", "base64-decoding failed - mismatched data size in the binary header (read "
-                                          + std::to_string(decoded_data.first) + " bytes, expected " + std::to_string(sizeof(HeaderType)) + " bytes)" );
-      const HeaderType* blockSize = reinterpret_cast<const HeaderType*>(decoded_data.second.get());
-      return *blockSize;
+      // creating a copy of the block is rather costly, but so is ASCII parsing
+      std::stringstream ss;
+      ss << block;
+
+      std::vector<T> vector;
+      while( ss ) {
+         // since std::uint8_t is an alias to unsigned char, we need to parse
+         // bytes into a larger type, otherwise operator>> would read it as char
+         std::common_type_t< T, std::uint16_t > value;
+         ss >> value;
+         if( ss )
+            vector.push_back( value );
+      }
+
+      return vector;
    }
 
    template< typename HeaderType, typename T >
@@ -182,12 +192,26 @@ protected:
          ++block;
 
       if( compressor == "" ) {
-         const std::size_t blockSize = readBlockSize< HeaderType >( block );
-         block += get_encoded_length(sizeof(HeaderType));
-         std::pair<std::size_t, std::unique_ptr<char[]>> decoded_data = decode_block( block, get_encoded_length(blockSize) );
-         std::vector<T> vector( decoded_data.first / sizeof(T) );
+         std::size_t data_size = 0;
+         const T* data_ptr = nullptr;
+         std::pair<std::size_t, std::unique_ptr<std::uint8_t[]>> decoded_data = base64::decode( block, std::strlen(block) );
+
+         // check if block size was decoded separately (so decoding stopped after block size due to padding)
+         if( decoded_data.first == sizeof(HeaderType) ) {
+            const std::size_t header_length = base64::get_encoded_length(sizeof(HeaderType));
+            const HeaderType block_size = *reinterpret_cast<const HeaderType*>(decoded_data.second.get());
+            decoded_data = base64::decode( block + header_length, base64::get_encoded_length(block_size) );
+            data_size = decoded_data.first / sizeof(T);
+            data_ptr = reinterpret_cast<const T*>(decoded_data.second.get());
+         }
+         else {
+            data_size = *reinterpret_cast<const HeaderType*>(decoded_data.second.get()) / sizeof(T);
+            data_ptr = reinterpret_cast<const T*>(decoded_data.second.get() + sizeof(HeaderType));
+         }
+
+         std::vector<T> vector( data_size );
          for( std::size_t i = 0; i < vector.size(); i++ )
-            vector[i] = reinterpret_cast<const T*>(decoded_data.second.get())[i];
+            vector[i] = data_ptr[i];
          return vector;
       }
       else if( compressor == "vtkZLibDataCompressor" ) {
@@ -231,8 +255,17 @@ protected:
       const std::string type = getAttributeString( elem, "type" );
       const std::string format = getAttributeString( elem, "format" );
       if( format == "ascii" ) {
-         // TODO
-         throw MeshReaderError( "XMLVTK", "reading ASCII arrays is not implemented yet" );
+         if( type == "Int8" )          return readAsciiBlock< std::int8_t   >( block );
+         else if( type == "UInt8" )    return readAsciiBlock< std::uint8_t  >( block );
+         else if( type == "Int16" )    return readAsciiBlock< std::int16_t  >( block );
+         else if( type == "UInt16" )   return readAsciiBlock< std::uint16_t >( block );
+         else if( type == "Int32" )    return readAsciiBlock< std::int32_t  >( block );
+         else if( type == "UInt32" )   return readAsciiBlock< std::uint32_t >( block );
+         else if( type == "Int64" )    return readAsciiBlock< std::int64_t  >( block );
+         else if( type == "UInt64" )   return readAsciiBlock< std::uint64_t >( block );
+         else if( type == "Float32" )  return readAsciiBlock< float  >( block );
+         else if( type == "Float64" )  return readAsciiBlock< double >( block );
+         else throw MeshReaderError( "XMLVTK", "unsupported DataArray type: " + type );
       }
       else if( format == "binary" ) {
          if( type == "Int8" )          return readBinaryBlock< std::int8_t   >( block );
@@ -286,6 +319,12 @@ public:
 #ifdef HAVE_TINYXML2
       using namespace tinyxml2;
 
+      namespace fs = std::experimental::filesystem;
+      if( ! fs::exists( fileName ) )
+         throw MeshReaderError( "XMLVTK", "file '" + fileName + "' does not exist" );
+      if( fs::is_directory( fileName ) )
+         throw MeshReaderError( "XMLVTK", "path '" + fileName + "' is a directory" );
+
       // load and verify XML
       tinyxml2::XMLError status = dom.LoadFile( fileName.c_str() );
       if( status != XML_SUCCESS )
diff --git a/src/TNL/Meshes/VTKTraits.h b/src/TNL/Meshes/VTKTraits.h
index 0883b607a54ab755b96907ee243dca614f14f08b..143654f71461ffa3b046a0d96f9c594e84f5e9ee 100644
--- a/src/TNL/Meshes/VTKTraits.h
+++ b/src/TNL/Meshes/VTKTraits.h
@@ -10,6 +10,7 @@
 
 #pragma once
 
+#include <stdexcept>
 #include <string>
 #include <cstdint>
 
@@ -115,9 +116,8 @@ inline int getEntityDimension( EntityShape shape )
       case EntityShape::Wedge:          return 3;
       case EntityShape::Pyramid:        return 3;
    }
-   // this just avoids a compiler warning in GCC and nvcc (clang actually knows if the
-   // switch above covers all cases, and print a warning only when it does not)
-   throw 1;
+   // this can actually happen when an invalid uint8_t value is converted to EntityShape
+   throw std::runtime_error( "VTK::getEntityDimension: invalid entity shape value " + std::to_string(int(shape)) );
 }
 
 // static mapping of TNL entity topologies to EntityShape
diff --git a/src/TNL/Meshes/Writers/VTKWriter.hpp b/src/TNL/Meshes/Writers/VTKWriter.hpp
index 801d3bc1926a944396fc19b9765e3d6bfb9ad841..06474f1e532339c0337ff862b81539d6919daf43 100644
--- a/src/TNL/Meshes/Writers/VTKWriter.hpp
+++ b/src/TNL/Meshes/Writers/VTKWriter.hpp
@@ -22,13 +22,13 @@ namespace Writers {
 
 namespace details {
 
-// TODO: 64-bit integers are most likely not supported in the BINARY format
+// legacy VTK files do not support 64-bit integers, even in the BINARY format
 inline void
-writeInt( VTK::FileFormat format, std::ostream& str, int value )
+writeInt( VTK::FileFormat format, std::ostream& str, std::int32_t value )
 {
    if( format == VTK::FileFormat::binary ) {
       value = forceBigEndian( value );
-      str.write( reinterpret_cast<const char*>(&value), sizeof(int) );
+      str.write( reinterpret_cast<const char*>(&value), sizeof(std::int32_t) );
    }
    else {
       str << value << " ";
diff --git a/src/TNL/Meshes/Writers/VTUWriter.hpp b/src/TNL/Meshes/Writers/VTUWriter.hpp
index c8093010d6db57db63675d8adc0fcab002a997b4..960cc4638683fe596dbffd7c042be9983087738d 100644
--- a/src/TNL/Meshes/Writers/VTUWriter.hpp
+++ b/src/TNL/Meshes/Writers/VTUWriter.hpp
@@ -504,7 +504,7 @@ VTUWriter< Mesh >::writeDataArray( const Array& array,
 #endif
          // fall through to binary if HAVE_ZLIB is not defined
       case VTK::FileFormat::binary:
-         write_encoded_block< HeaderType >( array.getData(), array.getSize(), str );
+         base64::write_encoded_block< HeaderType >( array.getData(), array.getSize(), str );
          str << "\n";
          break;
    }
diff --git a/src/TNL/base64.h b/src/TNL/base64.h
index cab004bb2038cb95712a61b40c0e537803e097c3..fd2272e9081f8391cf86f3817fdf5593c9d6f644 100644
--- a/src/TNL/base64.h
+++ b/src/TNL/base64.h
@@ -11,306 +11,206 @@
 #pragma once
 
 #include <cstddef>
+#include <cstdint>
 #include <memory>
 #include <utility>
+#include <stdexcept>
 #include <cmath>    // std::ceil
 
 namespace TNL {
-
-// The functions in the base64 namespace are taken from the libb64 project, see
-// http://sourceforge.net/projects/libb64
-//
-// libb64 has been placed in the public domain
+/**
+ * \brief Namespace for base64 encoding and decoding functions.
+ *
+ * The actual algorithms are based on these sources:
+ *
+ * - http://web.mit.edu/freebsd/head/contrib/wpa/src/utils/base64.c
+ * - https://stackoverflow.com/questions/180947/base64-decode-snippet-in-c/
+ * - https://stackoverflow.com/questions/342409/how-do-i-base64-encode-decode-in-c
+ */
 namespace base64 {
 
-// encoding
-
-typedef enum
-{
-    step_A,
-    step_B,
-    step_C
-} base64_encodestep;
-
-typedef struct
-{
-    base64_encodestep step;
-    char              result;
-} base64_encodestate;
-
-inline void
-base64_init_encodestate(base64_encodestate *state_in)
-{
-    state_in->step   = step_A;
-    state_in->result = 0;
-}
-
-inline char
-base64_encode_value(char value_in)
-{
-    static const char *encoding = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
-    if (value_in > 63)
-        return '=';
-    return encoding[(int)value_in];
-}
-
-inline std::ptrdiff_t
-base64_encode_block(const char *        plaintext_in,
-                    std::size_t         length_in,
-                    char *              code_out,
-                    base64_encodestate *state_in)
-{
-    const char *      plainchar    = plaintext_in;
-    const char *const plaintextend = plaintext_in + length_in;
-    char *            codechar     = code_out;
-    char              result;
-
-    result = state_in->result;
-
-    switch (state_in->step)
-    {
-        while (true)
-        {
-            case step_A:
-            {
-                if (plainchar == plaintextend)
-                {
-                    state_in->result = result;
-                    state_in->step   = step_A;
-                    return codechar - code_out;
-                }
-                const char fragment = *plainchar++;
-                result              = (fragment & 0x0fc) >> 2;
-                *codechar++         = base64_encode_value(result);
-                result              = (fragment & 0x003) << 4;
-                // intended fallthrough
-            }
-            case step_B:
-            {
-                if (plainchar == plaintextend)
-                {
-                    state_in->result = result;
-                    state_in->step   = step_B;
-                    return codechar - code_out;
-                }
-                const char fragment = *plainchar++;
-                result |= (fragment & 0x0f0) >> 4;
-                *codechar++ = base64_encode_value(result);
-                result      = (fragment & 0x00f) << 2;
-                // intended fallthrough
-            }
-            case step_C:
-            {
-                if (plainchar == plaintextend)
-                {
-                    state_in->result = result;
-                    state_in->step   = step_C;
-                    return codechar - code_out;
-                }
-                const char fragment = *plainchar++;
-                result |= (fragment & 0x0c0) >> 6;
-                *codechar++ = base64_encode_value(result);
-                result      = (fragment & 0x03f) >> 0;
-                *codechar++ = base64_encode_value(result);
-            }
-        }
-    }
-    /* control should not reach here */
-    return codechar - code_out;
-}
-
-inline std::ptrdiff_t
-base64_encode_blockend(char *code_out, base64_encodestate *state_in)
+/**
+ * \brief Get the length of base64-encoded block for given data byte length.
+ */
+inline std::size_t
+get_encoded_length( std::size_t byte_length )
 {
-    char *codechar = code_out;
-
-    switch (state_in->step)
-    {
-        case step_B:
-            *codechar++ = base64_encode_value(state_in->result);
-            *codechar++ = '=';
-            *codechar++ = '=';
-            break;
-        case step_C:
-            *codechar++ = base64_encode_value(state_in->result);
-            *codechar++ = '=';
-            break;
-        case step_A:
-            break;
-    }
-    *codechar++ = '\0';
-
-    return codechar - code_out;
+   std::size_t encoded = std::ceil(byte_length * (4.0 / 3.0));
+   // base64 uses padding to a multiple of 4
+   if( encoded % 4 == 0 )
+      return encoded;
+   return encoded + 4 - (encoded % 4);
 }
 
+/**
+ * \brief Static table for base64 encoding.
+ */
+static constexpr unsigned char encoding_table[65] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
 
-// decoding
-
-typedef enum
-{
-    step_a, step_b, step_c, step_d
-} base64_decodestep;
-
-typedef struct
-{
-    base64_decodestep step;
-    char plainchar;
-} base64_decodestate;
-
-inline void
-base64_init_decodestate(base64_decodestate* state_in)
-{
-    state_in->step = step_a;
-    state_in->plainchar = 0;
-}
-
-inline int
-base64_decode_value(char value_in)
-{
-    static const char decoding[] = {62,-1,-1,-1,63,52,53,54,55,56,57,58,59,60,61,-1,-1,-1,-2,-1,-1,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,-1,-1,-1,-1,-1,-1,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51};
-    static const char decoding_size = sizeof(decoding);
-    value_in -= 43;
-    if (value_in < 0 || value_in >= decoding_size) return -1;
-    return decoding[(int)value_in];
-}
+/**
+ * \brief Static table for base64 decoding.
+ *
+ * Can be built with the following code:
+ *
+ * \code
+ * std::uint8_t decoding_table[256];
+ * for( int i = 0; i < 256; i++ )
+ *    decoding_table[i] = 128;
+ * for( std::uint8_t i = 0; i < sizeof(encoding_table) - 1; i++ )
+ *    decoding_table[encoding_table[i]] = i;
+ * decoding_table[(int) '='] = 0;
+ * \endcode
+ */
+static constexpr std::uint8_t decoding_table[256] = {
+   128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+   128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+   128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,  62, 128, 128, 128,  63,
+    52,  53,  54,  55,  56,  57,  58,  59,  60,  61, 128, 128, 128,   0, 128, 128,
+   128,   0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,  14,
+    15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25, 128, 128, 128, 128, 128,
+   128,  26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,  40,
+    41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51, 128, 128, 128, 128, 128,
+   128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+   128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+   128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+   128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+   128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+   128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+   128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+   128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128
+};
 
-inline std::ptrdiff_t
-base64_decode_block(const char* code_in, const std::size_t length_in, char* plaintext_out, base64_decodestate* state_in)
+/**
+ * \brief Do a base64 encoding of the given data.
+ *
+ * \param data Pointer to the data to be encoded.
+ * \param data_size Length of the input data (in bytes).
+ * \return A \ref std::unique_ptr to the encoded data.
+ */
+inline std::unique_ptr<char[]>
+encode( const std::uint8_t* data, std::size_t data_size )
 {
-    const char* codechar = code_in;
-    char* plainchar = plaintext_out;
-    char fragment;
-
-    *plainchar = state_in->plainchar;
-
-    switch (state_in->step)
-    {
-        while (1)
-        {
-    case step_a:
-            do {
-                if (codechar == code_in+length_in)
-                {
-                    state_in->step = step_a;
-                    state_in->plainchar = *plainchar;
-                    return plainchar - plaintext_out;
-                }
-                fragment = (char)base64_decode_value(*codechar++);
-            } while (fragment < 0);
-            *plainchar    = (fragment & 0x03f) << 2;
-    case step_b:
-            do {
-                if (codechar == code_in+length_in)
-                {
-                    state_in->step = step_b;
-                    state_in->plainchar = *plainchar;
-                    return plainchar - plaintext_out;
-                }
-                fragment = (char)base64_decode_value(*codechar++);
-            } while (fragment < 0);
-            *plainchar++ |= (fragment & 0x030) >> 4;
-            *plainchar    = (fragment & 0x00f) << 4;
-    case step_c:
-            do {
-                if (codechar == code_in+length_in)
-                {
-                    state_in->step = step_c;
-                    state_in->plainchar = *plainchar;
-                    return plainchar - plaintext_out;
-                }
-                fragment = (char)base64_decode_value(*codechar++);
-            } while (fragment < 0);
-            *plainchar++ |= (fragment & 0x03c) >> 2;
-            *plainchar    = (fragment & 0x003) << 6;
-    case step_d:
-            do {
-                if (codechar == code_in+length_in)
-                {
-                    state_in->step = step_d;
-                    state_in->plainchar = *plainchar;
-                    return plainchar - plaintext_out;
-                }
-                fragment = (char)base64_decode_value(*codechar++);
-            } while (fragment < 0);
-            *plainchar++   |= (fragment & 0x03f);
-        }
-    }
-    /* control should not reach here */
-    return plainchar - plaintext_out;
+   const std::size_t output_length = get_encoded_length( data_size );
+   std::unique_ptr<char[]> encoded_data{new char[output_length + 1]};
+
+   const std::uint8_t* end = data + data_size;
+   const std::uint8_t* in = data;
+   char* out = encoded_data.get();
+   char* pos = out;
+
+   while( end - in >= 3 ) {
+      *pos++ = encoding_table[in[0] >> 2];
+      *pos++ = encoding_table[((in[0] & 0x03) << 4) | (in[1] >> 4)];
+      *pos++ = encoding_table[((in[1] & 0x0f) << 2) | (in[2] >> 6)];
+      *pos++ = encoding_table[in[2] & 0x3f];
+      in += 3;
+   }
+
+   if( end - in ) {
+      *pos++ = encoding_table[in[0] >> 2];
+      if( end - in == 1 ) {
+         *pos++ = encoding_table[(in[0] & 0x03) << 4];
+         *pos++ = '=';
+      }
+      else {
+         *pos++ = encoding_table[((in[0] & 0x03) << 4) | (in[1] >> 4)];
+         *pos++ = encoding_table[(in[1] & 0x0f) << 2];
+      }
+      *pos++ = '=';
+   }
+
+   *pos++ = '\0';
+   return encoded_data;
 }
 
-} // namespace base64
-
-
 /**
- * Do a base64 encoding of the given data.
+ * \brief Internal base64 decoding function.
  *
- * The function returns a unique_ptr to the encoded data.
+ * \param input Pointer to the encoded data (C string).
+ * \param input_length Length of the input string.
+ * \param output Pointer to a pre-allocated output buffer.
+ * \param output_length Length of the output buffer.
+ * \return Size of the decoded data (in bytes).
  */
-inline std::unique_ptr<char[]>
-encode_block(const char* data, const std::size_t data_size)
+inline std::ptrdiff_t
+decode_block( const char* input, std::size_t input_length, std::uint8_t* output, std::size_t output_length )
 {
-    base64::base64_encodestate state;
-    base64::base64_init_encodestate(&state);
-
-    std::unique_ptr<char[]> encoded_data{new char[2 * data_size + 1]};
+   const std::size_t min_buffer_size = std::ceil(input_length * (3.0 / 4.0));
+   if( output_length < min_buffer_size )
+      throw std::logic_error( "base64: insufficient output buffer size " + std::to_string(output_length)
+                              + " (needed at least " + std::to_string(min_buffer_size) + " bytes)" );
+
+   std::size_t count = 0;
+   int pad = 0;
+   std::uint8_t block[4];
+   std::uint8_t* pos = output;
+
+   for (std::size_t i = 0; i < input_length; i++) {
+      const std::uint8_t tmp = decoding_table[(int) input[i]];
+      if( tmp == 128 )
+         continue;
+
+      if( input[i] == '=' )
+         pad++;
+      block[count] = tmp;
+      count++;
+
+      if( count == 4 ) {
+         *pos++ = (block[0] << 2) | (block[1] >> 4);
+         *pos++ = (block[1] << 4) | (block[2] >> 2);
+         *pos++ = (block[2] << 6) | block[3];
+         count = 0;
+         if( pad > 2 )
+            // invalid padding
+            throw std::invalid_argument( "base64: decoding error: input has invalid padding" );
+         if( pad > 0 ) {
+            pos -= pad;
+            break;
+         }
+      }
+   }
 
-    const std::size_t encoded_length_data = base64::base64_encode_block(data, data_size, encoded_data.get(), &state);
-    base64::base64_encode_blockend(encoded_data.get() + encoded_length_data, &state);
+   // check left-over chars
+   if( count )
+      throw std::invalid_argument( "base64: decoding error: invalid input (length not padded to a multiple of 4)" );
 
-    return encoded_data;
+   return pos - output;
 }
 
-
 /**
- * Do a base64 decoding of the given data.
+ * \brief Do a base64 decoding of the given data.
  *
- * The function returns a pair of the decoded data length and a unique_ptr to
- * the decoded data.
+ * \param data Pointer to the encoded data (C string).
+ * \param data_size Length of the input string.
+ * \return A pair of the decoded data length and a \ref std::unique_ptr to the
+ *         decoded data.
  */
-inline std::pair<std::size_t, std::unique_ptr<char[]>>
-decode_block(const char* data, const std::size_t data_size)
+inline std::pair< std::size_t, std::unique_ptr<std::uint8_t[]> >
+decode( const char* data, const std::size_t data_size )
 {
-    base64::base64_decodestate state;
-    base64::base64_init_decodestate(&state);
-
-    std::unique_ptr<char[]> decoded_data{new char[data_size + 1]};
-
-    const std::size_t decoded_length_data = base64::base64_decode_block(data, data_size, decoded_data.get(), &state);
-    decoded_data[decoded_length_data] = '\0';
+   const std::size_t buffer_size = std::ceil(data_size * (3.0 / 4.0));
+   std::unique_ptr<std::uint8_t[]> decoded_data{new std::uint8_t[buffer_size + 1]};
 
-    return {decoded_length_data, std::move(decoded_data)};
+   const std::size_t decoded_length_data = decode_block( data, data_size, decoded_data.get(), buffer_size );
+   return {decoded_length_data, std::move(decoded_data)};
 }
 
-
 /**
- * Write a base64-encoded block of data into the given stream.
+ * \brief Write a base64-encoded block of data into the given stream.
  *
  * The encoded data is prepended with a short header, which is the base64-encoded
  * byte length of the data. The type of the byte length value is `HeaderType`.
  */
-template <typename HeaderType = std::uint64_t, typename T>
-void write_encoded_block(const T* data, const std::size_t data_length, std::ostream& output_stream)
+template< typename HeaderType = std::uint64_t, typename T >
+void
+write_encoded_block( const T* data, const std::size_t data_length, std::ostream& output_stream )
 {
    const HeaderType size = data_length * sizeof(T);
-   std::unique_ptr<char[]> encoded_size = encode_block(reinterpret_cast<const char*>(&size), sizeof(HeaderType));
+   std::unique_ptr<char[]> encoded_size = base64::encode( reinterpret_cast<const std::uint8_t*>(&size), sizeof(HeaderType) );
    output_stream << encoded_size.get();
-   std::unique_ptr<char[]> encoded_data = encode_block(reinterpret_cast<const char*>(data), size);
+   std::unique_ptr<char[]> encoded_data = base64::encode( reinterpret_cast<const std::uint8_t*>(data), size );
    output_stream << encoded_data.get();
 }
 
-
-/**
- * Get the length of base64-encoded block for given data byte length.
- */
-inline std::size_t
-get_encoded_length(const std::size_t byte_length)
-{
-    int encoded = std::ceil(byte_length * (4.0 / 3.0));
-    // base64 uses padding to a multiple of 4
-    if (encoded % 4 == 0)
-        return encoded;
-    return encoded + 4 - (encoded % 4);
-}
-
+} // namespace base64
 } // namespace TNL
diff --git a/src/TNL/zlib_compression.h b/src/TNL/zlib_compression.h
index a9c4234753206fc1f8cbfa85efe7720588078451..6d5e674b3d4d9f46e052b576679312cefdd8ebba 100644
--- a/src/TNL/zlib_compression.h
+++ b/src/TNL/zlib_compression.h
@@ -20,8 +20,8 @@
 namespace TNL {
 
 /**
- * Use zlib to compress the data, then encode it with base64 and write the
- * result to the given stream.
+ * \brief Use zlib to compress the data, then encode it with base64 and write
+ * the result to the given stream.
  *
  * Options for compression_level:
  *   - Z_NO_COMPRESSION
@@ -29,157 +29,161 @@ namespace TNL {
  *   - Z_BEST_COMPRESSION
  *   - Z_DEFAULT_COMPRESSION
  */
-template <typename HeaderType = std::uint64_t, typename T>
+template< typename HeaderType = std::uint64_t, typename T >
 void
-write_compressed_block(const T* data,
-                       const std::size_t data_size,
-                       std::ostream& output_stream,
-                       const int compression_level = Z_DEFAULT_COMPRESSION)
+write_compressed_block( const T* data,
+                        const std::size_t data_size,
+                        std::ostream& output_stream,
+                        const int compression_level = Z_DEFAULT_COMPRESSION )
 {
-    if (data_size == 0)
-        return;
+   if (data_size == 0)
+      return;
 
-    // allocate a buffer for compressing data and do so
-    uLongf compressed_data_length = compressBound(data_size * sizeof(T));
-    std::unique_ptr<char[]> compressed_data{new char[compressed_data_length]};
+   // allocate a buffer for compressing data and do so
+   uLongf compressed_data_length = compressBound(data_size * sizeof(T));
+   std::unique_ptr<std::uint8_t[]> compressed_data{new std::uint8_t[compressed_data_length]};
 
-    // compress the data
-    const int status = compress2(
+   // compress the data
+   const int status = compress2(
             reinterpret_cast<Bytef*>(compressed_data.get()),
             &compressed_data_length,
             reinterpret_cast<const Bytef*>(data),
             data_size * sizeof(T),
             compression_level
-        );
-    if (status != Z_OK)
-        throw std::runtime_error("zlib compression failed");
-
-    // create compression header
-    const HeaderType compression_header[4] = {
-        (HeaderType) 1,                       // number of blocks
-        (HeaderType) (data_size * sizeof(T)), // size of block
-        (HeaderType) (data_size * sizeof(T)), // size of last block
-        (HeaderType) compressed_data_length   // list of compressed sizes of blocks
-    };
-
-    // base64-encode the compression header
-    std::unique_ptr<char[]> encoded_header(
-            encode_block(reinterpret_cast<const char*>(&compression_header[0]),
-                         4 * sizeof(HeaderType))
-        );
-    output_stream << encoded_header.get();
-
-    // base64-encode the compressed data
-    std::unique_ptr<char[]> encoded_data(
-            encode_block(compressed_data.get(), compressed_data_length)
-        );
-    output_stream << encoded_data.get();
+         );
+   if (status != Z_OK)
+      throw std::runtime_error("zlib compression failed");
+
+   // create compression header
+   const HeaderType compression_header[4] = {
+      (HeaderType) 1,                       // number of blocks
+      (HeaderType) (data_size * sizeof(T)), // size of block
+      (HeaderType) (data_size * sizeof(T)), // size of last block
+      (HeaderType) compressed_data_length   // list of compressed sizes of blocks
+   };
+
+   // base64-encode the compression header
+   std::unique_ptr<char[]> encoded_header(
+            base64::encode(reinterpret_cast<const std::uint8_t*>(&compression_header[0]),
+                           4 * sizeof(HeaderType))
+         );
+   output_stream << encoded_header.get();
+
+   // base64-encode the compressed data
+   std::unique_ptr<char[]> encoded_data(
+            base64::encode(compressed_data.get(), compressed_data_length)
+         );
+   output_stream << encoded_data.get();
 }
 
 /**
- * Decompress data in given byte array and return an array of elements of
- * type T and length data_size.
+ * \brief Decompress data in given byte array and return an array of elements of
+ * type \e T and length \e data_size.
  */
-template <typename T>
+template< typename T >
 std::unique_ptr<T[]>
-decompress_data(const char* decoded_data, const std::size_t decoded_data_length, const std::size_t data_size)
+decompress_data( const std::uint8_t* decoded_data, const std::size_t decoded_data_length, const std::size_t data_size )
 {
-    // decompress the data
-    std::unique_ptr<T[]> data{new T[data_size]};
-    uLongf uncompressed_length_data = data_size * sizeof(T);
-    const int status = uncompress(
+   // decompress the data
+   std::unique_ptr<T[]> data{new T[data_size]};
+   uLongf uncompressed_length_data = data_size * sizeof(T);
+   const int status = uncompress(
             reinterpret_cast<Bytef*>(data.get()),
             &uncompressed_length_data,
             reinterpret_cast<const Bytef*>(decoded_data),
             decoded_data_length
-        );
-    if (status != Z_OK)
-        throw std::runtime_error("zlib decompression failed");
+         );
+   if (status != Z_OK)
+      throw std::runtime_error("zlib decompression failed");
 
-    if (uncompressed_length_data != data_size * sizeof(T))
-        throw std::length_error("uncompressed data length does not match the size stored in the compression header: "
-                                + std::to_string(uncompressed_length_data) + " vs "
-                                + std::to_string(data_size));
+   if (uncompressed_length_data != data_size * sizeof(T))
+      throw std::length_error("uncompressed data length does not match the size stored in the compression header: "
+                              + std::to_string(uncompressed_length_data) + " vs "
+                              + std::to_string(data_size));
 
-    return data;
+   return data;
 }
 
 /**
- * Decode and decompress data from a stream. The data must be in the format
- * as produced by the write_compressed_block function.
+ * \brief Decode and decompress data from a stream. The data must be in the
+ * format as produced by the \ref write_compressed_block function.
  *
- * The function returns a pair of the resulting data size and a unique_ptr to
- * the data.
+ * The function returns a pair of the resulting data size and a \ref std::unique_ptr
+ * to the data.
  */
-template <typename HeaderType = std::uint64_t, typename T>
+template< typename HeaderType = std::uint64_t, typename T >
 std::pair<HeaderType, std::unique_ptr<T[]>>
-decompress_block(const char* data)
+decompress_block( const char* data )
 {
-    // decode the header
-    const int encoded_header_length = get_encoded_length(4 * sizeof(HeaderType));
-    std::pair<std::size_t, std::unique_ptr<char[]>> decoded_header = decode_block(data, encoded_header_length);
-    const HeaderType* compression_header = reinterpret_cast<const HeaderType*>(decoded_header.second.get());
-
-    if (compression_header[0] != 1)
-        throw std::length_error("unexpected number of compressed blocks: "
-                                + std::to_string(compression_header[0]));
-    if (compression_header[1] != compression_header[2])
-        throw std::logic_error("inconsistent block sizes in the compression header: "
-                               + std::to_string(compression_header[1]) + " vs "
-                               + std::to_string(compression_header[2]));
-    const HeaderType data_size = compression_header[1] / sizeof(T);
-    const HeaderType compressed_data_length = get_encoded_length(compression_header[3]);
-
-    // decode the data
-    std::pair<std::size_t, std::unique_ptr<char[]>> decoded_data = decode_block(data + encoded_header_length, compressed_data_length);
-
-    // decompress the data and return
-    return {data_size, decompress_data<T>(decoded_data.second.get(), decoded_data.first, data_size)};
+   // decode the header
+   const int encoded_header_length = base64::get_encoded_length(4 * sizeof(HeaderType));
+   std::pair<std::size_t, std::unique_ptr<std::uint8_t[]>> decoded_header = base64::decode(data, encoded_header_length);
+   const HeaderType* compression_header = reinterpret_cast<const HeaderType*>(decoded_header.second.get());
+
+   if (compression_header[0] != 1)
+      throw std::length_error("unexpected number of compressed blocks: "
+                              + std::to_string(compression_header[0]));
+   // Note: for short arrays, paraview 5.9 creates files with:
+   //       num_blocks == 1, block_size == 32768, last_block_size == [the actual data size]
+   //if (compression_header[1] != compression_header[2])
+   //    throw std::logic_error("inconsistent block sizes in the compression header: "
+   //                           + std::to_string(compression_header[1]) + " vs "
+   //                           + std::to_string(compression_header[2]));
+   const HeaderType data_size = compression_header[2] / sizeof(T);
+   const HeaderType compressed_data_length = base64::get_encoded_length(compression_header[3]);
+
+   // decode the data
+   std::pair<std::size_t, std::unique_ptr<std::uint8_t[]>> decoded_data = base64::decode(data + encoded_header_length, compressed_data_length);
+
+   // decompress the data and return
+   return {data_size, decompress_data<T>(decoded_data.second.get(), decoded_data.first, data_size)};
 }
 
 /**
- * Decode and decompress data from a stream. The data must be in the format
- * as produced by the write_compressed_block function.
+ * \brief Decode and decompress data from a stream. The data must be in the
+ * format as produced by the \ref write_compressed_block function.
  *
- * The function returns a pair of the resulting data size and a unique_ptr to
- * the data.
+ * The function returns a pair of the resulting data size and a \ref std::unique_ptr
+ * to the data.
  */
-template <typename HeaderType = std::uint64_t, typename T>
-std::pair<HeaderType, std::unique_ptr<T[]>>
-decompress_block(std::istream& input_stream)
+template< typename HeaderType = std::uint64_t, typename T >
+std::pair< HeaderType, std::unique_ptr<T[]> >
+decompress_block( std::istream& input_stream )
 {
-    // read the header
-    const int encoded_header_length = get_encoded_length(4 * sizeof(HeaderType));
-    std::unique_ptr<char[]> encoded_header{new char[encoded_header_length]};
-    input_stream.read(encoded_header.get(), encoded_header_length);
-    if (!input_stream.good())
-        throw std::length_error("input is not long enough to contain a compression header");
-
-    // decode the header
-    std::pair<std::size_t, std::unique_ptr<char[]>> decoded_header = decode_block(encoded_header.get(), encoded_header_length);
-    const HeaderType* compression_header = reinterpret_cast<const HeaderType*>(decoded_header.second.get());
-
-    if (compression_header[0] != 1)
-        throw std::length_error("unexpected number of compressed blocks: "
-                                + std::to_string(compression_header[0]));
-    if (compression_header[1] != compression_header[2])
-        throw std::logic_error("inconsistent block sizes in the compression header: "
-                               + std::to_string(compression_header[1]) + " vs "
-                               + std::to_string(compression_header[2]));
-    const HeaderType data_size = compression_header[1] / sizeof(T);
-    const HeaderType compressed_data_length = get_encoded_length(compression_header[3]);
-
-    // read the compressed data
-    std::unique_ptr<char[]> encoded_data{new char[compressed_data_length]};
-    input_stream.read(encoded_data.get(), compressed_data_length);
-    if (!input_stream.good())
-        throw std::length_error("failed to read the compressed data");
-
-    // decode the data
-    std::pair<std::size_t, std::unique_ptr<char[]>> decoded_data = decode_block(encoded_data.get(), compressed_data_length);
-
-    // decompress the data and return
-    return {data_size, decompress_data<T>(decoded_data.second.get(), decoded_data.first, data_size)};
+   // read the header
+   const int encoded_header_length = base64::get_encoded_length(4 * sizeof(HeaderType));
+   std::unique_ptr<char[]> encoded_header{new char[encoded_header_length]};
+   input_stream.read(encoded_header.get(), encoded_header_length);
+   if (!input_stream.good())
+      throw std::length_error("input is not long enough to contain a compression header");
+
+   // decode the header
+   std::pair<std::size_t, std::unique_ptr<std::uint8_t[]>> decoded_header = base64::decode(encoded_header.get(), encoded_header_length);
+   const HeaderType* compression_header = reinterpret_cast<const HeaderType*>(decoded_header.second.get());
+
+   if (compression_header[0] != 1)
+      throw std::length_error("unexpected number of compressed blocks: "
+                              + std::to_string(compression_header[0]));
+   // Note: for short arrays, paraview 5.9 creates files with:
+   //       num_blocks == 1, block_size == 32768, last_block_size == [the actual data size]
+   //if (compression_header[1] != compression_header[2])
+   //    throw std::logic_error("inconsistent block sizes in the compression header: "
+   //                           + std::to_string(compression_header[1]) + " vs "
+   //                           + std::to_string(compression_header[2]));
+   const HeaderType data_size = compression_header[2] / sizeof(T);
+   const HeaderType compressed_data_length = base64::get_encoded_length(compression_header[3]);
+
+   // read the compressed data
+   std::unique_ptr<char[]> encoded_data{new char[compressed_data_length]};
+   input_stream.read(encoded_data.get(), compressed_data_length);
+   if (!input_stream.good())
+      throw std::length_error("failed to read the compressed data");
+
+   // decode the data
+   std::pair<std::size_t, std::unique_ptr<std::uint8_t[]>> decoded_data = base64::decode(encoded_data.get(), compressed_data_length);
+
+   // decompress the data and return
+   return {data_size, decompress_data<T>(decoded_data.second.get(), decoded_data.first, data_size)};
 }
 
 } // namespace TNL
diff --git a/src/Tools/tnl-decompose-mesh.cpp b/src/Tools/tnl-decompose-mesh.cpp
index ac16df11bf8f5441af6d96c0a2b55d890f3e3e02..3bd66fec89baf27bcf97db66e2a72cca5aa9a3ed 100644
--- a/src/Tools/tnl-decompose-mesh.cpp
+++ b/src/Tools/tnl-decompose-mesh.cpp
@@ -26,6 +26,7 @@
 #include <chrono>
 
 using namespace TNL;
+using MetisIndexArray = Containers::Array< idx_t, Devices::Sequential, idx_t >;
 
 struct DecomposeMeshConfigTag {};
 
@@ -55,10 +56,10 @@ struct MeshWorldDimensionTag< DecomposeMeshConfigTag, CellTopology, WorldDimensi
 { enum { enabled = ( WorldDimension == CellTopology::dimension ) }; };
 
 // Meshes are enabled only for types explicitly listed below.
-template<> struct MeshRealTag< DecomposeMeshConfigTag, float > { enum { enabled = false }; };
+template<> struct MeshRealTag< DecomposeMeshConfigTag, float > { enum { enabled = true }; };
 template<> struct MeshRealTag< DecomposeMeshConfigTag, double > { enum { enabled = true }; };
 template<> struct MeshGlobalIndexTag< DecomposeMeshConfigTag, int > { enum { enabled = true }; };
-template<> struct MeshGlobalIndexTag< DecomposeMeshConfigTag, long int > { enum { enabled = false }; };
+template<> struct MeshGlobalIndexTag< DecomposeMeshConfigTag, long int > { enum { enabled = true }; };
 template<> struct MeshLocalIndexTag< DecomposeMeshConfigTag, short int > { enum { enabled = true }; };
 
 // Config tag specifying the MeshConfig template to use.
@@ -276,487 +277,485 @@ void setMETISoptions( idx_t options[METIS_NOPTIONS], const Config::ParameterCont
 }
 
 template< typename Mesh >
-struct DecomposeMesh
+void
+decompose_and_save( const Mesh& mesh,
+                    const unsigned nparts,
+                    const MetisIndexArray& part,
+                    const std::shared_ptr< idx_t > dual_xadj,
+                    const std::shared_ptr< idx_t > dual_adjncy,
+                    const unsigned ncommon,
+                    const unsigned ghost_levels,
+                    const std::string pvtuFileName )
 {
    using Index = typename Mesh::GlobalIndexType;
    using IndexArray = Containers::Array< Index, Devices::Sequential, Index >;
-   using MetisIndexArray = Containers::Array< idx_t, Devices::Sequential, idx_t >;
 
-   static bool run( const Mesh& mesh, const Config::ParameterContainer& parameters )
-   {
-      // warn if input mesh has 64-bit indices, but METIS uses only 32-bit indices
-      if( IDXTYPEWIDTH == 32 && sizeof(Index) > 32 )
-         std::cerr << "Warning: the input mesh uses 64-bit indices, but METIS was compiled only with 32-bit indices. "
-                      "Decomposition may not work correctly if the index values overflow the 32-bit type." << std::endl;
-
-      // get the mesh connectivity information in a format suitable for METIS. Actually, the same
-      // format is used by the XML-based VTK formats - the only difference is that METIS requires
-      // `offsets` to start with 0.
-      std::vector< idx_t > connectivity, offsets;
-      offsets.push_back(0);
-      const Index cellsCount = mesh.template getEntitiesCount< typename Mesh::Cell >();
-      for( Index i = 0; i < cellsCount; i++ ) {
-         const auto& entity = mesh.template getEntity< typename Mesh::Cell >( i );
-         const Index subvertices = entity.template getSubentitiesCount< 0 >();
-         for( Index j = 0; j < subvertices; j++ )
-            connectivity.push_back( entity.template getSubentityIndex< 0 >( j ) );
-         offsets.push_back( connectivity.size() );
-      }
-
-      // number of elements (cells)
-      idx_t ne = mesh.template getEntitiesCount< typename Mesh::Cell >();
-      // number of nodes (vertices)
-      idx_t nn = mesh.template getEntitiesCount< 0 >();
-      // pointers to arrays storing the mesh in a CSR-like format
-      idx_t* eptr = offsets.data();
-      idx_t* eind = connectivity.data();
-      // Specifies the number of common nodes that two elements must have in order to put an edge
-      // between them in the dual graph.
-      idx_t ncommon = Mesh::getMeshDimension();
-      if( parameters.checkParameter( "min-common-vertices" ) )
-         ncommon = parameters.template getParameter< unsigned >( "min-common-vertices" );
-      // numbering scheme for the adjacency structure of a graph or element-node structure of a mesh
-      // (0 is C-style, 1 is Fortran-style)
-      idx_t numflag = 0;
-      // These arrays store the adjacency structure of the generated dual graph. The format is of
-      // the adjacency structure is described in Section 5.5 of the METIS manual. Memory for these
-      // arrays is allocated by METIS’ API using the standard malloc function. It is the
-      // responsibility of the application to free this memory by calling free. METIS provides the
-      // METIS_Free that is a wrapper to C’s free function.
-      idx_t* xadj = nullptr;
-      idx_t* adjncy = nullptr;
-
-      // We could use METIS_PartMeshDual directly instead of METIS_MeshToDual + METIS_PartGraph*,
-      // but we need to reuse the dual graph for the generation of ghost cells.
-      std::cout << "Running METIS_MeshToDual..." << std::endl;
-      int status = METIS_MeshToDual(&ne, &nn, eptr, eind, &ncommon, &numflag, &xadj, &adjncy);
-
-      // wrap xadj and adjncy with shared_ptr
-      std::shared_ptr<idx_t> shared_xadj {xadj, METIS_Free};
-      std::shared_ptr<idx_t> shared_adjncy {adjncy, METIS_Free};
+   const Index cellsCount = mesh.template getEntitiesCount< typename Mesh::Cell >();
+   const Index pointsCount = mesh.template getEntitiesCount< typename Mesh::Vertex >();
 
-      switch( status )
-      {
-         case METIS_OK: break;
-         case METIS_ERROR_INPUT:
-            throw std::runtime_error( "METIS_MeshToDual failed due to an input error." );
-         case METIS_ERROR_MEMORY:
-            throw std::runtime_error( "METIS_MeshToDual failed due to a memory allocation error." );
-         case METIS_ERROR:
-         default:
-            throw std::runtime_error( "METIS_MeshToDual failed with an unspecified error." );
-      }
+   // count cells in each subdomain
+   IndexArray cells_counts( nparts );
+   cells_counts.setValue( 0 );
+   for( Index i = 0; i < cellsCount; i++ )
+      ++cells_counts[ part[ i ] ];
 
-      // The number of vertices in the graph.
-      idx_t nvtxs = ne;
-      // The number of balancing constraints. It should be at least 1.
-      idx_t ncon = 1;
-      // An array of size `ne` specifying the weights of the elements. A NULL value can be passed
-      // to indicate that all elements have an equal weight.
-      idx_t* vwgt = nullptr;
-      // An array of size `ne` specifying the size of the elements that is used for computing the
-      // total communication volume as described in Section 5.7 of the METIS manual. A NULL value
-      // can be passed when the objective is cut or when all elements have an equal size.
-      idx_t* vsize = nullptr;
-      // The weights of the edges as described in Section 5.5 of the METIS manual.
-      idx_t* adjwgt = nullptr;  // METIS_PartMeshDual uses NULL too
-      // The number of parts to partition the mesh.
-      idx_t nparts = parameters.template getParameter< unsigned >( "subdomains" );
-      // This is an array of size nparts that specifies the desired weight for each partition. The
-      // target partition weight for the i-th partition is specified at tpwgts[i] (the numbering for
-      // the partitions starts from 0). The sum of the tpwgts[] entries must be 1.0. A NULL value
-      // can be passed to indicate that the graph should be equally divided among the partitions.
-      real_t* tpwgts = nullptr;
-      // This is an array of size ncon that specifies the allowed load imbalance tolerance for each
-      // constraint. For the i-th partition and j-th constraint the allowed weight is the
-      // ubvec[j]*tpwgts[i*ncon+j] fraction of the j-th’s constraint total weight. The load
-      // imbalances must be greater than 1.0. A NULL value can be passed indicating that the load
-      // imbalance tolerance for each constraint should be 1.001 (for ncon=1) or 1.01 (for ncon>1).
-      real_t* ubvec = nullptr;  // METIS_PartMeshDual uses NULL too
-      // Upon successful completion, this variable stores either the edgecut or the total
-      // communication volume of the dual graph’s partitioning.
-      idx_t objval = 0;
-      // Array of size `ne` that upon successful completion stores the partition array for the
-      // elements of the mesh.
-      MetisIndexArray part_array( nvtxs );
-      idx_t* part = part_array.getData();
-
-      // Array of METIS options as described in Section 5.4 of the METIS manual.
-      idx_t options[METIS_NOPTIONS];
-      // future-proof (or just in case we forgot to set some options explicitly)
-      METIS_SetDefaultOptions(options);
-      // set METIS options from parameters
-      setMETISoptions(options, parameters);
-
-      if( nparts == 1 ) {
-         // k-way partitioning from Metis fails for nparts == 1 (segfault),
-         // RB succeeds but produces nonsense
-         part_array.setValue( 0 );
-      }
-      else {
-         if( options[METIS_OPTION_PTYPE] == METIS_PTYPE_KWAY ) {
-            std::cout << "Running METIS_PartGraphKway..." << std::endl;
-            status = METIS_PartGraphKway(&nvtxs, &ncon, xadj, adjncy, vwgt, vsize, adjwgt, &nparts, tpwgts, ubvec, options, &objval, part);
-         }
-         else {
-            std::cout << "Running METIS_PartGraphRecursive..." << std::endl;
-            status = METIS_PartGraphRecursive(&nvtxs, &ncon, xadj, adjncy, vwgt, vsize, adjwgt, &nparts, tpwgts, ubvec, options, &objval, part);
-         }
+   // build offsets for the partitioned cell indices
+   IndexArray cells_offsets( nparts );
+   cells_offsets[0] = 0;
+   for( unsigned p = 1; p < nparts; p++ )
+      cells_offsets[p] = cells_offsets[p-1] + cells_counts[p-1];
 
-         switch( status )
-         {
-            case METIS_OK: break;
-            case METIS_ERROR_INPUT:
-               throw std::runtime_error( "METIS_PartGraph failed due to an input error." );
-            case METIS_ERROR_MEMORY:
-               throw std::runtime_error( "METIS_PartGraph failed due to a memory allocation error." );
-            case METIS_ERROR:
-            default:
-               throw std::runtime_error( "METIS_PartGraph failed with an unspecified error." );
-         }
+   // construct block-wise local-to-global mapping for cells
+   IndexArray cells_local_to_global( cellsCount );
+   {
+      IndexArray offsets; offsets = cells_offsets;
+      for( Index i = 0; i < cellsCount; i++ ) {
+         const Index p = part[ i ];
+         cells_local_to_global[ offsets[p]++ ] = i;
       }
-
-      // deallocate auxiliary vectors
-      connectivity.clear();
-      connectivity.shrink_to_fit();
-      offsets.clear();
-      offsets.shrink_to_fit();
-
-      return decompose_and_save( mesh, parameters, part_array, shared_xadj, shared_adjncy, ncommon );
    }
 
-   static bool
-   decompose_and_save( const Mesh& mesh,
-                       const Config::ParameterContainer& parameters,
-                       const MetisIndexArray& part,
-                       const std::shared_ptr< idx_t > dual_xadj,
-                       const std::shared_ptr< idx_t > dual_adjncy,
-                       const Index ncommon )
+   auto is_ghost_neighbor = [&] ( const typename Mesh::Cell& cell )
    {
-      const unsigned nparts = parameters.template getParameter< unsigned >( "subdomains" );
-      const unsigned ghost_levels = parameters.getParameter< unsigned >( "ghost-levels" );
-      const Index cellsCount = mesh.template getEntitiesCount< typename Mesh::Cell >();
-      const Index pointsCount = mesh.template getEntitiesCount< typename Mesh::Vertex >();
-
-      // count cells in each subdomain
-      IndexArray cells_counts( nparts );
-      cells_counts.setValue( 0 );
-      for( Index i = 0; i < cellsCount; i++ )
-         ++cells_counts[ part[ i ] ];
-
-      // build offsets for the partitioned cell indices
-      IndexArray cells_offsets( nparts );
-      cells_offsets[0] = 0;
-      for( unsigned p = 1; p < nparts; p++ )
-         cells_offsets[p] = cells_offsets[p-1] + cells_counts[p-1];
-
-      // construct block-wise local-to-global mapping for cells
-      IndexArray cells_local_to_global( cellsCount );
-      {
-         IndexArray offsets; offsets = cells_offsets;
-         for( Index i = 0; i < cellsCount; i++ ) {
-            const Index p = part[ i ];
-            cells_local_to_global[ offsets[p]++ ] = i;
-         }
+      const Index neighbors_start = dual_xadj.get()[ cell.getIndex() ];
+      const Index neighbors_end = dual_xadj.get()[ cell.getIndex() + 1 ];
+      for( Index i = neighbors_start; i < neighbors_end; i++ ) {
+         const Index neighbor_idx = dual_adjncy.get()[ i ];
+         if( part[ cell.getIndex() ] != part[ neighbor_idx ] )
+            return true;
       }
+      return false;
+   };
 
-      auto is_ghost_neighbor = [&] ( const typename Mesh::Cell& cell )
-      {
-         const Index neighbors_start = dual_xadj.get()[ cell.getIndex() ];
-         const Index neighbors_end = dual_xadj.get()[ cell.getIndex() + 1 ];
-         for( Index i = neighbors_start; i < neighbors_end; i++ ) {
-            const Index neighbor_idx = dual_adjncy.get()[ i ];
-            if( part[ cell.getIndex() ] != part[ neighbor_idx ] )
-               return true;
-         }
-         return false;
-      };
-
-      // construct global index permutation for cells
-      // convention: seed index = global index in the partitioned mesh,
-      // cell index = global index in the original mesh
-      IndexArray seed_to_cell_index( cellsCount );
-      for( unsigned p = 0; p < nparts; p++ ) {
-         Index assigned = 0;
-         std::set< Index > boundary, ghost_neighbors;
-         for( Index local_idx = 0; local_idx < cells_counts[ p ]; local_idx++ ) {
-            const Index global_idx = cells_local_to_global[ cells_offsets[ p ] + local_idx ];
-            const auto& cell = mesh.template getEntity< typename Mesh::Cell >( global_idx );
-            // check global domain boundary first
-            if( mesh.template isBoundaryEntity< Mesh::getMeshDimension() >( cell.getIndex() ) )
-               boundary.insert( global_idx );
-            // check subdomain boundary
-            else if( is_ghost_neighbor( cell ) )
-               ghost_neighbors.insert( global_idx );
-            // otherwise subdomain interior - assign index now
-            else
-               seed_to_cell_index[ cells_offsets[ p ] + assigned++ ] = global_idx;
-         }
-         for( auto global_idx : boundary )
-            seed_to_cell_index[ cells_offsets[ p ] + assigned++ ] = global_idx;
-         for( auto global_idx : ghost_neighbors )
+   // construct global index permutation for cells
+   // convention: seed index = global index in the partitioned mesh,
+   // cell index = global index in the original mesh
+   IndexArray seed_to_cell_index( cellsCount );
+   for( unsigned p = 0; p < nparts; p++ ) {
+      Index assigned = 0;
+      std::set< Index > boundary, ghost_neighbors;
+      for( Index local_idx = 0; local_idx < cells_counts[ p ]; local_idx++ ) {
+         const Index global_idx = cells_local_to_global[ cells_offsets[ p ] + local_idx ];
+         const auto& cell = mesh.template getEntity< typename Mesh::Cell >( global_idx );
+         // check global domain boundary first
+         if( mesh.template isBoundaryEntity< Mesh::getMeshDimension() >( cell.getIndex() ) )
+            boundary.insert( global_idx );
+         // check subdomain boundary
+         else if( is_ghost_neighbor( cell ) )
+            ghost_neighbors.insert( global_idx );
+         // otherwise subdomain interior - assign index now
+         else
             seed_to_cell_index[ cells_offsets[ p ] + assigned++ ] = global_idx;
-         TNL_ASSERT_EQ( assigned, cells_counts[ p ], "bug in the global index permutation generator" );
-      }
-      cells_local_to_global.reset();
-      // cell_to_seed_index is an inverse permutation of seed_to_cell_index
-      IndexArray cell_to_seed_index( cellsCount );
-      for( Index i = 0; i < cellsCount; i++ )
-         cell_to_seed_index[ seed_to_cell_index[ i ] ] = i;
-
-      // construct global index permutation for points
-      // convention: points at subdomain boundaries are assigned to the subdomain with the higher number
-      IndexArray point_old_to_new_global_index( pointsCount );
-      IndexArray points_counts( nparts );
-      points_counts.setValue( 0 );
-      {
-         // first assign points to subdomains - the subdomain with the highest number takes the point
-         // (go over local cells, set subvertex owner = cell owner, higher rank will overwrite)
-         IndexArray point_to_subdomain( pointsCount );
-         for( unsigned p = 0; p < nparts; p++ ) {
-            for( Index local_idx = 0; local_idx < cells_counts[ p ]; local_idx++ ) {
-               const Index global_idx = seed_to_cell_index[ cells_offsets[ p ] + local_idx ];
-               const auto& cell = mesh.template getEntity< typename Mesh::Cell >( global_idx );
-               const Index subvertices = cell.template getSubentitiesCount< 0 >();
-               for( Index j = 0; j < subvertices; j++ ) {
-                  const Index v = cell.template getSubentityIndex< 0 >( j );
-                  point_to_subdomain[ v ] = p;
-               }
-            }
-         }
-         // assign global indices to points
-         Index pointIdx = 0;
-         for( unsigned p = 0; p < nparts; p++ ) {
-            for( Index local_idx = 0; local_idx < cells_counts[ p ]; local_idx++ ) {
-               const Index global_idx = seed_to_cell_index[ cells_offsets[ p ] + local_idx ];
-               const auto& cell = mesh.template getEntity< typename Mesh::Cell >( global_idx );
-               const Index subvertices = cell.template getSubentitiesCount< 0 >();
-               for( Index j = 0; j < subvertices; j++ ) {
-                  const Index v = cell.template getSubentityIndex< 0 >( j );
-                  if( point_to_subdomain[ v ] == (Index) p ) {
-                     // assign index
-                     point_old_to_new_global_index[ v ] = pointIdx++;
-                     // mark as assigned
-                     point_to_subdomain[ v ] = nparts;
-                     // increase count
-                     ++points_counts[ p ];
-                  }
-               }
-            }
-         }
-      }
-
-      // build offsets for the partitioned point indices
-      IndexArray points_offsets( nparts );
-      points_offsets[0] = 0;
-      for( unsigned p = 1; p < nparts; p++ )
-         points_offsets[p] = points_offsets[p-1] + points_counts[p-1];
-
-      // write a .pvtu file
-      using PVTU = Meshes::Writers::PVTUWriter< Mesh >;
-      const std::string pvtuFileName = parameters.template getParameter< String >( "output-file" );
-      std::ofstream file( pvtuFileName );
-      PVTU pvtu( file );
-      pvtu.template writeEntities< Mesh::getMeshDimension() >( Mesh{}, ghost_levels, ncommon );
-      if( ghost_levels > 0 ) {
-         // the PointData and CellData from the individual files should be added here
-         pvtu.template writePPointData< std::uint8_t >( Meshes::VTK::ghostArrayName() );
-         pvtu.template writePPointData< Index >( "GlobalIndex" );
-         pvtu.template writePCellData< std::uint8_t >( Meshes::VTK::ghostArrayName() );
-         pvtu.template writePCellData< Index >( "GlobalIndex" );
       }
-
-      std::cout << "Writing subdomains..." << std::endl;
+      for( auto global_idx : boundary )
+         seed_to_cell_index[ cells_offsets[ p ] + assigned++ ] = global_idx;
+      for( auto global_idx : ghost_neighbors )
+         seed_to_cell_index[ cells_offsets[ p ] + assigned++ ] = global_idx;
+      TNL_ASSERT_EQ( assigned, cells_counts[ p ], "bug in the global index permutation generator" );
+   }
+   cells_local_to_global.reset();
+   // cell_to_seed_index is an inverse permutation of seed_to_cell_index
+   IndexArray cell_to_seed_index( cellsCount );
+   for( Index i = 0; i < cellsCount; i++ )
+      cell_to_seed_index[ seed_to_cell_index[ i ] ] = i;
+
+   // construct global index permutation for points
+   // convention: points at subdomain boundaries are assigned to the subdomain with the higher number
+   IndexArray point_old_to_new_global_index( pointsCount );
+   IndexArray points_counts( nparts );
+   points_counts.setValue( 0 );
+   {
+      // first assign points to subdomains - the subdomain with the highest number takes the point
+      // (go over local cells, set subvertex owner = cell owner, higher rank will overwrite)
+      IndexArray point_to_subdomain( pointsCount );
       for( unsigned p = 0; p < nparts; p++ ) {
-         const std::string outputFileName = pvtu.addPiece( pvtuFileName, p );
-         std::cout << outputFileName << std::endl;
-
-         // Due to ghost levels, we don't know the number of cells, let alone points, in each
-         // subdomain ahead of time. Hence, we use dynamic data structures instead of MeshBuilder.
-         using CellSeed = typename Mesh::MeshTraitsType::CellSeedType;
-         std::vector< CellSeed > seeds;
-         std::vector< Index > seeds_global_indices;
-         std::set< Index > cell_global_indices;
-
-         // We'll need global-to-local index mapping for vertices. Here we also record which
-         // points are actually needed.
-         std::map< Index, Index > vertex_global_to_local;
-         auto add_cell = [&] ( const auto& cell )
-         {
-            if( cell_global_indices.count( cell.getIndex() ) != 0 )
-               return false;
-            CellSeed seed;
-            for( Index v = 0; v < cell.template getSubentitiesCount< 0 >(); v++ ) {
-               const Index global_idx = cell.template getSubentityIndex< 0 >( v );
-               if( vertex_global_to_local.count(global_idx) == 0 )
-                  vertex_global_to_local.insert( {global_idx, vertex_global_to_local.size()} );
-               seed.setCornerId( v, vertex_global_to_local[ global_idx ] );
-            }
-            seeds.push_back( seed );
-            seeds_global_indices.push_back( cell_to_seed_index[ cell.getIndex() ] );
-            cell_global_indices.insert( cell.getIndex() );
-            return true;
-         };
-
-         // iterate over local cells, add only local points (to ensure that ghost points are ordered after all local points)
          for( Index local_idx = 0; local_idx < cells_counts[ p ]; local_idx++ ) {
             const Index global_idx = seed_to_cell_index[ cells_offsets[ p ] + local_idx ];
             const auto& cell = mesh.template getEntity< typename Mesh::Cell >( global_idx );
-            for( Index v = 0; v < cell.template getSubentitiesCount< 0 >(); v++ ) {
-               const Index global_vert_idx = cell.template getSubentityIndex< 0 >( v );
-               if( point_old_to_new_global_index[global_vert_idx] >= points_offsets[p] &&
-                   point_old_to_new_global_index[global_vert_idx] < points_offsets[p] + points_counts[p] )
-               {
-                  if( vertex_global_to_local.count(global_vert_idx) == 0 )
-                     vertex_global_to_local.insert( {global_vert_idx, vertex_global_to_local.size()} );
-               }
+            const Index subvertices = cell.template getSubentitiesCount< 0 >();
+            for( Index j = 0; j < subvertices; j++ ) {
+               const Index v = cell.template getSubentityIndex< 0 >( j );
+               point_to_subdomain[ v ] = p;
             }
          }
-         TNL_ASSERT_EQ( (Index) vertex_global_to_local.size(), points_counts[p],
-                        "some local points were not added in the first pass" );
-         // iterate over local cells, create seeds and record ghost neighbor indices
-         std::vector< Index > ghost_neighbors;
+      }
+      // assign global indices to points
+      Index pointIdx = 0;
+      for( unsigned p = 0; p < nparts; p++ ) {
          for( Index local_idx = 0; local_idx < cells_counts[ p ]; local_idx++ ) {
             const Index global_idx = seed_to_cell_index[ cells_offsets[ p ] + local_idx ];
             const auto& cell = mesh.template getEntity< typename Mesh::Cell >( global_idx );
-            add_cell( cell );
-            if( is_ghost_neighbor( cell ) )
-               ghost_neighbors.push_back( global_idx );
-         }
-
-         // collect seed indices of ghost cells
-         std::set< Index > ghost_seed_indices;
-         for( unsigned gl = 0; gl < ghost_levels; gl++ ) {
-            std::vector< Index > new_ghosts;
-            for( auto global_idx : ghost_neighbors ) {
-               const Index neighbors_start = dual_xadj.get()[ global_idx ];
-               const Index neighbors_end = dual_xadj.get()[ global_idx + 1 ];
-               for( Index i = neighbors_start; i < neighbors_end; i++ ) {
-                  const Index neighbor_idx = dual_adjncy.get()[ i ];
-                  // skip neighbors on the local subdomain
-                  if( part[ neighbor_idx ] == (int) p )
-                     continue;
-                  const Index neighbor_seed_idx = cell_to_seed_index[ neighbor_idx ];
-                  // skip neighbors whose seed was already added
-                  if( ghost_seed_indices.count( neighbor_seed_idx ) == 0 ) {
-                     new_ghosts.push_back( neighbor_idx );
-                     ghost_seed_indices.insert( neighbor_seed_idx );
-                  }
+            const Index subvertices = cell.template getSubentitiesCount< 0 >();
+            for( Index j = 0; j < subvertices; j++ ) {
+               const Index v = cell.template getSubentityIndex< 0 >( j );
+               if( point_to_subdomain[ v ] == (Index) p ) {
+                  // assign index
+                  point_old_to_new_global_index[ v ] = pointIdx++;
+                  // mark as assigned
+                  point_to_subdomain[ v ] = nparts;
+                  // increase count
+                  ++points_counts[ p ];
                }
             }
-            std::swap( ghost_neighbors, new_ghosts );
-         }
-         ghost_neighbors.clear();
-         ghost_neighbors.shrink_to_fit();
-
-         // add ghost cells (the set is sorted by the seed index)
-         for( auto idx : ghost_seed_indices ) {
-            // the ghost_seed_indices array may contain duplicates and even local
-            // cells, but add_cell takes care of uniqueness, so we don't have to
-            // care about that
-            const auto& cell = mesh.template getEntity< typename Mesh::Cell >( seed_to_cell_index[ idx ] );
-            add_cell( cell );
          }
-         ghost_seed_indices.clear();
-         cell_global_indices.clear();
-
-         // set points needed for the subdomain
-         using PointArrayType = typename Mesh::MeshTraitsType::PointArrayType;
-         PointArrayType points( vertex_global_to_local.size() );
-         // create "GlobalIndex" PointData array
-         IndexArray pointsGlobalIndices( vertex_global_to_local.size() );
-         for( auto it : vertex_global_to_local ) {
-            points[ it.second ] = mesh.getPoint( it.first );
-            pointsGlobalIndices[ it.second ] = point_old_to_new_global_index[ it.first ];
+      }
+   }
+
+   // build offsets for the partitioned point indices
+   IndexArray points_offsets( nparts );
+   points_offsets[0] = 0;
+   for( unsigned p = 1; p < nparts; p++ )
+      points_offsets[p] = points_offsets[p-1] + points_counts[p-1];
+
+   // write a .pvtu file
+   using PVTU = Meshes::Writers::PVTUWriter< Mesh >;
+   std::ofstream file( pvtuFileName );
+   PVTU pvtu( file );
+   pvtu.template writeEntities< Mesh::getMeshDimension() >( Mesh{}, ghost_levels, ncommon );
+   if( ghost_levels > 0 ) {
+      // the PointData and CellData from the individual files should be added here
+      pvtu.template writePPointData< std::uint8_t >( Meshes::VTK::ghostArrayName() );
+      pvtu.template writePPointData< Index >( "GlobalIndex" );
+      pvtu.template writePCellData< std::uint8_t >( Meshes::VTK::ghostArrayName() );
+      pvtu.template writePCellData< Index >( "GlobalIndex" );
+   }
+
+   std::cout << "Writing subdomains..." << std::endl;
+   for( unsigned p = 0; p < nparts; p++ ) {
+      const std::string outputFileName = pvtu.addPiece( pvtuFileName, p );
+      std::cout << outputFileName << std::endl;
+
+      // Due to ghost levels, we don't know the number of cells, let alone points, in each
+      // subdomain ahead of time. Hence, we use dynamic data structures instead of MeshBuilder.
+      using CellSeed = typename Mesh::MeshTraitsType::CellSeedType;
+      std::vector< CellSeed > seeds;
+      std::vector< Index > seeds_global_indices;
+      std::set< Index > cell_global_indices;
+
+      // We'll need global-to-local index mapping for vertices. Here we also record which
+      // points are actually needed.
+      std::map< Index, Index > vertex_global_to_local;
+      auto add_cell = [&] ( const auto& cell )
+      {
+         if( cell_global_indices.count( cell.getIndex() ) != 0 )
+            return false;
+         CellSeed seed;
+         for( Index v = 0; v < cell.template getSubentitiesCount< 0 >(); v++ ) {
+            const Index global_idx = cell.template getSubentityIndex< 0 >( v );
+            if( vertex_global_to_local.count(global_idx) == 0 )
+               vertex_global_to_local.insert( {global_idx, vertex_global_to_local.size()} );
+            seed.setCornerId( v, vertex_global_to_local[ global_idx ] );
          }
-         vertex_global_to_local.clear();
-
-         // copy cell seeds into a TNL array which is used by the mesh initializer
-         using CellSeedArrayType = typename Mesh::MeshTraitsType::CellSeedArrayType;
-         CellSeedArrayType cellSeeds = seeds;
-         seeds.clear();
-         seeds.shrink_to_fit();
-
-         // create "GlobalIndex" CellData array
-         IndexArray cellsGlobalIndices = seeds_global_indices;
-         seeds_global_indices.clear();
-         seeds_global_indices.shrink_to_fit();
-
-         // create "vtkGhostType" CellData and PointData arrays - see https://blog.kitware.com/ghost-and-blanking-visibility-changes/
-         Containers::Array< std::uint8_t, Devices::Sequential, Index > cellGhosts( cellSeeds.getSize() ), pointGhosts( points.getSize() );
-         for( Index i = 0; i < cells_counts[ p ]; i++ )
-            cellGhosts[ i ] = 0;
-         for( Index i = cells_counts[ p ]; i < cellSeeds.getSize(); i++ )
-            cellGhosts[ i ] = (std::uint8_t) Meshes::VTK::CellGhostTypes::DUPLICATECELL;
-         // point ghosts are more tricky because they were assigned to the subdomain with higher number
-         Index pointsGhostCount = 0;
-         for( Index i = 0; i < points.getSize(); i++ ) {
-            const Index global_idx = pointsGlobalIndices[ i ];
-            if( global_idx < points_offsets[ p ] || global_idx >= points_offsets[ p ] + points_counts[ p ] ) {
-               pointGhosts[ i ] = (std::uint8_t) Meshes::VTK::PointGhostTypes::DUPLICATEPOINT;
-               pointsGhostCount++;
+         seeds.push_back( seed );
+         seeds_global_indices.push_back( cell_to_seed_index[ cell.getIndex() ] );
+         cell_global_indices.insert( cell.getIndex() );
+         return true;
+      };
+
+      // iterate over local cells, add only local points (to ensure that ghost points are ordered after all local points)
+      for( Index local_idx = 0; local_idx < cells_counts[ p ]; local_idx++ ) {
+         const Index global_idx = seed_to_cell_index[ cells_offsets[ p ] + local_idx ];
+         const auto& cell = mesh.template getEntity< typename Mesh::Cell >( global_idx );
+         for( Index v = 0; v < cell.template getSubentitiesCount< 0 >(); v++ ) {
+            const Index global_vert_idx = cell.template getSubentityIndex< 0 >( v );
+            if( point_old_to_new_global_index[global_vert_idx] >= points_offsets[p] &&
+                point_old_to_new_global_index[global_vert_idx] < points_offsets[p] + points_counts[p] )
+            {
+               if( vertex_global_to_local.count(global_vert_idx) == 0 )
+                  vertex_global_to_local.insert( {global_vert_idx, vertex_global_to_local.size()} );
             }
-            else
-               pointGhosts[ i ] = 0;
          }
+      }
+      TNL_ASSERT_EQ( (Index) vertex_global_to_local.size(), points_counts[p],
+                     "some local points were not added in the first pass" );
+      // iterate over local cells, create seeds and record ghost neighbor indices
+      std::vector< Index > ghost_neighbors;
+      for( Index local_idx = 0; local_idx < cells_counts[ p ]; local_idx++ ) {
+         const Index global_idx = seed_to_cell_index[ cells_offsets[ p ] + local_idx ];
+         const auto& cell = mesh.template getEntity< typename Mesh::Cell >( global_idx );
+         add_cell( cell );
+         if( is_ghost_neighbor( cell ) )
+            ghost_neighbors.push_back( global_idx );
+      }
 
-         // reorder ghost points to make sure that global indices are sorted
-         {
-            // prepare vector with an identity permutation
-            std::vector< Index > permutation( points.getSize() );
-            std::iota( permutation.begin(), permutation.end(), (Index) 0 );
-
-            // sort the subarray corresponding to ghost entities by the global index
-            std::stable_sort( permutation.begin() + points.getSize() - pointsGhostCount,
-                              permutation.end(),
-                              [&pointsGlobalIndices](auto& left, auto& right) {
-               return pointsGlobalIndices[ left ] < pointsGlobalIndices[ right ];
-            });
-
-            // copy the permutation into TNL array
-            typename Mesh::GlobalIndexArray perm( permutation );
-            permutation.clear();
-            permutation.shrink_to_fit();
-
-            // apply the permutation
-            using PermutationApplier = TNL::Meshes::IndexPermutationApplier< Mesh, 0 >;
-            // - pointGhosts
-            PermutationApplier::permuteArray( pointGhosts, perm );
-            // - pointsGlobalIndices
-            PermutationApplier::permuteArray( pointsGlobalIndices, perm );
-            // - points
-            PermutationApplier::permuteArray( points, perm );
-            // - cellSeeds.setCornerID (inverse perm)
-            std::vector< Index > iperm( points.getSize() );
-            for( Index i = 0; i < perm.getSize(); i++ )
-               iperm[ perm[ i ] ] = i;
-            for( Index i = 0; i < cellSeeds.getSize(); i++ ) {
-               auto& cornerIds = cellSeeds[ i ].getCornerIds();
-               for( Index v = 0; v < cornerIds.getSize(); v++ )
-                  cornerIds[ v ] = iperm[ cornerIds[ v ] ];
+      // collect seed indices of ghost cells
+      std::set< Index > ghost_seed_indices;
+      for( unsigned gl = 0; gl < ghost_levels; gl++ ) {
+         std::vector< Index > new_ghosts;
+         for( auto global_idx : ghost_neighbors ) {
+            const Index neighbors_start = dual_xadj.get()[ global_idx ];
+            const Index neighbors_end = dual_xadj.get()[ global_idx + 1 ];
+            for( Index i = neighbors_start; i < neighbors_end; i++ ) {
+               const Index neighbor_idx = dual_adjncy.get()[ i ];
+               // skip neighbors on the local subdomain
+               if( part[ neighbor_idx ] == (int) p )
+                  continue;
+               const Index neighbor_seed_idx = cell_to_seed_index[ neighbor_idx ];
+               // skip neighbors whose seed was already added
+               if( ghost_seed_indices.count( neighbor_seed_idx ) == 0 ) {
+                  new_ghosts.push_back( neighbor_idx );
+                  ghost_seed_indices.insert( neighbor_seed_idx );
+               }
             }
          }
+         std::swap( ghost_neighbors, new_ghosts );
+      }
+      ghost_neighbors.clear();
+      ghost_neighbors.shrink_to_fit();
+
+      // add ghost cells (the set is sorted by the seed index)
+      for( auto idx : ghost_seed_indices ) {
+         // the ghost_seed_indices array may contain duplicates and even local
+         // cells, but add_cell takes care of uniqueness, so we don't have to
+         // care about that
+         const auto& cell = mesh.template getEntity< typename Mesh::Cell >( seed_to_cell_index[ idx ] );
+         add_cell( cell );
+      }
+      ghost_seed_indices.clear();
+      cell_global_indices.clear();
+
+      // set points needed for the subdomain
+      using PointArrayType = typename Mesh::MeshTraitsType::PointArrayType;
+      PointArrayType points( vertex_global_to_local.size() );
+      // create "GlobalIndex" PointData array
+      IndexArray pointsGlobalIndices( vertex_global_to_local.size() );
+      for( auto it : vertex_global_to_local ) {
+         points[ it.second ] = mesh.getPoint( it.first );
+         pointsGlobalIndices[ it.second ] = point_old_to_new_global_index[ it.first ];
+      }
+      vertex_global_to_local.clear();
+
+      // copy cell seeds into a TNL array which is used by the mesh initializer
+      using CellSeedArrayType = typename Mesh::MeshTraitsType::CellSeedArrayType;
+      CellSeedArrayType cellSeeds = seeds;
+      seeds.clear();
+      seeds.shrink_to_fit();
+
+      // create "GlobalIndex" CellData array
+      IndexArray cellsGlobalIndices = seeds_global_indices;
+      seeds_global_indices.clear();
+      seeds_global_indices.shrink_to_fit();
+
+      // create "vtkGhostType" CellData and PointData arrays - see https://blog.kitware.com/ghost-and-blanking-visibility-changes/
+      Containers::Array< std::uint8_t, Devices::Sequential, Index > cellGhosts( cellSeeds.getSize() ), pointGhosts( points.getSize() );
+      for( Index i = 0; i < cells_counts[ p ]; i++ )
+         cellGhosts[ i ] = 0;
+      for( Index i = cells_counts[ p ]; i < cellSeeds.getSize(); i++ )
+         cellGhosts[ i ] = (std::uint8_t) Meshes::VTK::CellGhostTypes::DUPLICATECELL;
+      // point ghosts are more tricky because they were assigned to the subdomain with higher number
+      Index pointsGhostCount = 0;
+      for( Index i = 0; i < points.getSize(); i++ ) {
+         const Index global_idx = pointsGlobalIndices[ i ];
+         if( global_idx < points_offsets[ p ] || global_idx >= points_offsets[ p ] + points_counts[ p ] ) {
+            pointGhosts[ i ] = (std::uint8_t) Meshes::VTK::PointGhostTypes::DUPLICATEPOINT;
+            pointsGhostCount++;
+         }
+         else
+            pointGhosts[ i ] = 0;
+      }
 
-         // init mesh for the subdomain
-         Mesh subdomain;
-         subdomain.init( points, cellSeeds );
-
-         // write the subdomain
-         using Writer = Meshes::Writers::VTUWriter< Mesh >;
-         std::ofstream file( outputFileName );
-         Writer writer( file );
-         writer.template writeEntities< Mesh::getMeshDimension() >( subdomain );
-         if( ghost_levels > 0 ) {
-            writer.writePointData( pointGhosts, Meshes::VTK::ghostArrayName() );
-            writer.writePointData( pointsGlobalIndices, "GlobalIndex" );
-            writer.writeCellData( cellGhosts, Meshes::VTK::ghostArrayName() );
-            writer.writeCellData( cellsGlobalIndices, "GlobalIndex" );
+      // reorder ghost points to make sure that global indices are sorted
+      {
+         // prepare vector with an identity permutation
+         std::vector< Index > permutation( points.getSize() );
+         std::iota( permutation.begin(), permutation.end(), (Index) 0 );
+
+         // sort the subarray corresponding to ghost entities by the global index
+         std::stable_sort( permutation.begin() + points.getSize() - pointsGhostCount,
+                           permutation.end(),
+                           [&pointsGlobalIndices](auto& left, auto& right) {
+            return pointsGlobalIndices[ left ] < pointsGlobalIndices[ right ];
+         });
+
+         // copy the permutation into TNL array
+         typename Mesh::GlobalIndexArray perm( permutation );
+         permutation.clear();
+         permutation.shrink_to_fit();
+
+         // apply the permutation
+         using PermutationApplier = TNL::Meshes::IndexPermutationApplier< Mesh, 0 >;
+         // - pointGhosts
+         PermutationApplier::permuteArray( pointGhosts, perm );
+         // - pointsGlobalIndices
+         PermutationApplier::permuteArray( pointsGlobalIndices, perm );
+         // - points
+         PermutationApplier::permuteArray( points, perm );
+         // - cellSeeds.setCornerID (inverse perm)
+         std::vector< Index > iperm( points.getSize() );
+         for( Index i = 0; i < perm.getSize(); i++ )
+            iperm[ perm[ i ] ] = i;
+         for( Index i = 0; i < cellSeeds.getSize(); i++ ) {
+            auto& cornerIds = cellSeeds[ i ].getCornerIds();
+            for( Index v = 0; v < cornerIds.getSize(); v++ )
+               cornerIds[ v ] = iperm[ cornerIds[ v ] ];
          }
       }
 
-      return true;
+      // init mesh for the subdomain
+      Mesh subdomain;
+      subdomain.init( points, cellSeeds );
+
+      // write the subdomain
+      using Writer = Meshes::Writers::VTUWriter< Mesh >;
+      std::ofstream file( outputFileName );
+      Writer writer( file );
+      writer.template writeEntities< Mesh::getMeshDimension() >( subdomain );
+      if( ghost_levels > 0 ) {
+         writer.writePointData( pointGhosts, Meshes::VTK::ghostArrayName() );
+         writer.writePointData( pointsGlobalIndices, "GlobalIndex" );
+         writer.writeCellData( cellGhosts, Meshes::VTK::ghostArrayName() );
+         writer.writeCellData( cellsGlobalIndices, "GlobalIndex" );
+      }
    }
-};
+}
+
+template< typename Mesh >
+void run( const Mesh& mesh, const Config::ParameterContainer& parameters )
+{
+   using Index = typename Mesh::GlobalIndexType;
+
+   // warn if input mesh has 64-bit indices, but METIS uses only 32-bit indices
+   if( IDXTYPEWIDTH == 32 && sizeof(Index) > 4 )
+      std::cerr << "Warning: the input mesh uses 64-bit indices, but METIS was compiled only with 32-bit indices. "
+                   "Decomposition may not work correctly if the index values overflow the 32-bit type." << std::endl;
+
+   // get the mesh connectivity information in a format suitable for METIS. Actually, the same
+   // format is used by the XML-based VTK formats - the only difference is that METIS requires
+   // `offsets` to start with 0.
+   std::vector< idx_t > connectivity, offsets;
+   offsets.push_back(0);
+   const Index cellsCount = mesh.template getEntitiesCount< typename Mesh::Cell >();
+   for( Index i = 0; i < cellsCount; i++ ) {
+      const auto& entity = mesh.template getEntity< typename Mesh::Cell >( i );
+      const Index subvertices = entity.template getSubentitiesCount< 0 >();
+      for( Index j = 0; j < subvertices; j++ )
+         connectivity.push_back( entity.template getSubentityIndex< 0 >( j ) );
+      offsets.push_back( connectivity.size() );
+   }
+
+   // number of elements (cells)
+   idx_t ne = mesh.template getEntitiesCount< typename Mesh::Cell >();
+   // number of nodes (vertices)
+   idx_t nn = mesh.template getEntitiesCount< 0 >();
+   // pointers to arrays storing the mesh in a CSR-like format
+   idx_t* eptr = offsets.data();
+   idx_t* eind = connectivity.data();
+   // Specifies the number of common nodes that two elements must have in order to put an edge
+   // between them in the dual graph.
+   idx_t ncommon = Mesh::getMeshDimension();
+   if( parameters.checkParameter( "min-common-vertices" ) )
+      ncommon = parameters.template getParameter< unsigned >( "min-common-vertices" );
+   // numbering scheme for the adjacency structure of a graph or element-node structure of a mesh
+   // (0 is C-style, 1 is Fortran-style)
+   idx_t numflag = 0;
+   // These arrays store the adjacency structure of the generated dual graph. The format is of
+   // the adjacency structure is described in Section 5.5 of the METIS manual. Memory for these
+   // arrays is allocated by METIS’ API using the standard malloc function. It is the
+   // responsibility of the application to free this memory by calling free. METIS provides the
+   // METIS_Free that is a wrapper to C’s free function.
+   idx_t* xadj = nullptr;
+   idx_t* adjncy = nullptr;
+
+   // We could use METIS_PartMeshDual directly instead of METIS_MeshToDual + METIS_PartGraph*,
+   // but we need to reuse the dual graph for the generation of ghost cells.
+   std::cout << "Running METIS_MeshToDual..." << std::endl;
+   int status = METIS_MeshToDual(&ne, &nn, eptr, eind, &ncommon, &numflag, &xadj, &adjncy);
+
+   // wrap xadj and adjncy with shared_ptr
+   std::shared_ptr<idx_t> shared_xadj {xadj, METIS_Free};
+   std::shared_ptr<idx_t> shared_adjncy {adjncy, METIS_Free};
+
+   switch( status )
+   {
+      case METIS_OK: break;
+      case METIS_ERROR_INPUT:
+         throw std::runtime_error( "METIS_MeshToDual failed due to an input error." );
+      case METIS_ERROR_MEMORY:
+         throw std::runtime_error( "METIS_MeshToDual failed due to a memory allocation error." );
+      case METIS_ERROR:
+      default:
+         throw std::runtime_error( "METIS_MeshToDual failed with an unspecified error." );
+   }
+
+   // The number of vertices in the graph.
+   idx_t nvtxs = ne;
+   // The number of balancing constraints. It should be at least 1.
+   idx_t ncon = 1;
+   // An array of size `ne` specifying the weights of the elements. A NULL value can be passed
+   // to indicate that all elements have an equal weight.
+   idx_t* vwgt = nullptr;
+   // An array of size `ne` specifying the size of the elements that is used for computing the
+   // total communication volume as described in Section 5.7 of the METIS manual. A NULL value
+   // can be passed when the objective is cut or when all elements have an equal size.
+   idx_t* vsize = nullptr;
+   // The weights of the edges as described in Section 5.5 of the METIS manual.
+   idx_t* adjwgt = nullptr;  // METIS_PartMeshDual uses NULL too
+   // The number of parts to partition the mesh.
+   idx_t nparts = parameters.template getParameter< unsigned >( "subdomains" );
+   // This is an array of size nparts that specifies the desired weight for each partition. The
+   // target partition weight for the i-th partition is specified at tpwgts[i] (the numbering for
+   // the partitions starts from 0). The sum of the tpwgts[] entries must be 1.0. A NULL value
+   // can be passed to indicate that the graph should be equally divided among the partitions.
+   real_t* tpwgts = nullptr;
+   // This is an array of size ncon that specifies the allowed load imbalance tolerance for each
+   // constraint. For the i-th partition and j-th constraint the allowed weight is the
+   // ubvec[j]*tpwgts[i*ncon+j] fraction of the j-th’s constraint total weight. The load
+   // imbalances must be greater than 1.0. A NULL value can be passed indicating that the load
+   // imbalance tolerance for each constraint should be 1.001 (for ncon=1) or 1.01 (for ncon>1).
+   real_t* ubvec = nullptr;  // METIS_PartMeshDual uses NULL too
+   // Upon successful completion, this variable stores either the edgecut or the total
+   // communication volume of the dual graph’s partitioning.
+   idx_t objval = 0;
+   // Array of size `ne` that upon successful completion stores the partition array for the
+   // elements of the mesh.
+   MetisIndexArray part_array( nvtxs );
+   idx_t* part = part_array.getData();
+
+   // Array of METIS options as described in Section 5.4 of the METIS manual.
+   idx_t options[METIS_NOPTIONS];
+   // future-proof (or just in case we forgot to set some options explicitly)
+   METIS_SetDefaultOptions(options);
+   // set METIS options from parameters
+   setMETISoptions(options, parameters);
+
+   if( nparts == 1 ) {
+      // k-way partitioning from Metis fails for nparts == 1 (segfault),
+      // RB succeeds but produces nonsense
+      part_array.setValue( 0 );
+   }
+   else {
+      if( options[METIS_OPTION_PTYPE] == METIS_PTYPE_KWAY ) {
+         std::cout << "Running METIS_PartGraphKway..." << std::endl;
+         status = METIS_PartGraphKway(&nvtxs, &ncon, xadj, adjncy, vwgt, vsize, adjwgt, &nparts, tpwgts, ubvec, options, &objval, part);
+      }
+      else {
+         std::cout << "Running METIS_PartGraphRecursive..." << std::endl;
+         status = METIS_PartGraphRecursive(&nvtxs, &ncon, xadj, adjncy, vwgt, vsize, adjwgt, &nparts, tpwgts, ubvec, options, &objval, part);
+      }
+
+      switch( status )
+      {
+         case METIS_OK: break;
+         case METIS_ERROR_INPUT:
+            throw std::runtime_error( "METIS_PartGraph failed due to an input error." );
+         case METIS_ERROR_MEMORY:
+            throw std::runtime_error( "METIS_PartGraph failed due to a memory allocation error." );
+         case METIS_ERROR:
+         default:
+            throw std::runtime_error( "METIS_PartGraph failed with an unspecified error." );
+      }
+   }
+
+   // deallocate auxiliary vectors
+   connectivity.clear();
+   connectivity.shrink_to_fit();
+   offsets.clear();
+   offsets.shrink_to_fit();
+
+   const unsigned ghost_levels = parameters.getParameter< unsigned >( "ghost-levels" );
+   const std::string pvtuFileName = parameters.template getParameter< String >( "output-file" );
+   decompose_and_save( mesh, nparts, part_array, shared_xadj, shared_adjncy, ncommon, ghost_levels, pvtuFileName );
+}
 
 int main( int argc, char* argv[] )
 {
@@ -779,7 +778,8 @@ int main( int argc, char* argv[] )
    auto wrapper = [&] ( const auto& reader, auto&& mesh )
    {
       using MeshType = std::decay_t< decltype(mesh) >;
-      return DecomposeMesh< MeshType >::run( std::forward<MeshType>(mesh), parameters );
+      run( std::forward<MeshType>(mesh), parameters );
+      return true;
    };
    return ! Meshes::resolveAndLoadMesh< DecomposeMeshConfigTag, Devices::Host >( wrapper, inputFileName, inputFileFormat );
 }
diff --git a/src/Tools/tnl-game-of-life.cpp b/src/Tools/tnl-game-of-life.cpp
index b2a1e8f84ba25517eafd0c17ab45a947fd7900ed..f864a6df1f92ad9250ad53a6e1d7f41509798b03 100644
--- a/src/Tools/tnl-game-of-life.cpp
+++ b/src/Tools/tnl-game-of-life.cpp
@@ -44,10 +44,10 @@ struct MeshWorldDimensionTag< MyConfigTag, CellTopology, WorldDimension >
 { enum { enabled = ( WorldDimension == CellTopology::dimension ) }; };
 
 // Meshes are enabled only for types explicitly listed below.
-template<> struct MeshRealTag< MyConfigTag, float > { enum { enabled = false }; };
+template<> struct MeshRealTag< MyConfigTag, float > { enum { enabled = true }; };
 template<> struct MeshRealTag< MyConfigTag, double > { enum { enabled = true }; };
 template<> struct MeshGlobalIndexTag< MyConfigTag, int > { enum { enabled = true }; };
-template<> struct MeshGlobalIndexTag< MyConfigTag, long int > { enum { enabled = false }; };
+template<> struct MeshGlobalIndexTag< MyConfigTag, long int > { enum { enabled = true }; };
 template<> struct MeshLocalIndexTag< MyConfigTag, short int > { enum { enabled = true }; };
 
 // Config tag specifying the MeshConfig template to use.
diff --git a/src/Tools/tnl-mesh-converter.cpp b/src/Tools/tnl-mesh-converter.cpp
index fbcfd926301a5532e48e84c1292f240586590b3a..28ba0de1171db847bcb8e826ae681d7cd53b9d4c 100644
--- a/src/Tools/tnl-mesh-converter.cpp
+++ b/src/Tools/tnl-mesh-converter.cpp
@@ -49,10 +49,10 @@ struct MeshWorldDimensionTag< MeshConverterConfigTag, CellTopology, WorldDimensi
 { enum { enabled = ( WorldDimension == CellTopology::dimension ) }; };
 
 // Meshes are enabled only for types explicitly listed below.
-template<> struct MeshRealTag< MeshConverterConfigTag, float > { enum { enabled = false }; };
+template<> struct MeshRealTag< MeshConverterConfigTag, float > { enum { enabled = true }; };
 template<> struct MeshRealTag< MeshConverterConfigTag, double > { enum { enabled = true }; };
 template<> struct MeshGlobalIndexTag< MeshConverterConfigTag, int > { enum { enabled = true }; };
-template<> struct MeshGlobalIndexTag< MeshConverterConfigTag, long int > { enum { enabled = false }; };
+template<> struct MeshGlobalIndexTag< MeshConverterConfigTag, long int > { enum { enabled = true }; };
 template<> struct MeshLocalIndexTag< MeshConverterConfigTag, short int > { enum { enabled = true }; };
 
 // Config tag specifying the MeshConfig template to use.
diff --git a/src/Tools/tnl-test-distributed-mesh.h b/src/Tools/tnl-test-distributed-mesh.h
index 9f2165a986f085dd9bd551cd023e3e6749130991..ea495be30f08b27fbe8d25d20f6a2b13d73a04ab 100644
--- a/src/Tools/tnl-test-distributed-mesh.h
+++ b/src/Tools/tnl-test-distributed-mesh.h
@@ -55,10 +55,10 @@ struct MeshWorldDimensionTag< MyConfigTag, CellTopology, WorldDimension >
 { enum { enabled = ( WorldDimension == CellTopology::dimension ) }; };
 
 // Meshes are enabled only for types explicitly listed below.
-template<> struct MeshRealTag< MyConfigTag, float > { enum { enabled = false }; };
+template<> struct MeshRealTag< MyConfigTag, float > { enum { enabled = true }; };
 template<> struct MeshRealTag< MyConfigTag, double > { enum { enabled = true }; };
 template<> struct MeshGlobalIndexTag< MyConfigTag, int > { enum { enabled = true }; };
-template<> struct MeshGlobalIndexTag< MyConfigTag, long int > { enum { enabled = false }; };
+template<> struct MeshGlobalIndexTag< MyConfigTag, long int > { enum { enabled = true }; };
 template<> struct MeshLocalIndexTag< MyConfigTag, short int > { enum { enabled = true }; };
 
 // Config tag specifying the MeshConfig template to use.
diff --git a/src/UnitTests/CMakeLists.txt b/src/UnitTests/CMakeLists.txt
index 80d2d61a0b91a7cc47990ed5ccda1b70a381fd82..04a3a4f00259e580d8f5cd0cabf9bcccb559e813 100644
--- a/src/UnitTests/CMakeLists.txt
+++ b/src/UnitTests/CMakeLists.txt
@@ -6,7 +6,7 @@ ADD_SUBDIRECTORY( Functions )
 ADD_SUBDIRECTORY( Meshes )
 ADD_SUBDIRECTORY( Pointers )
 
-set( CPP_TESTS  AssertTest FileNameTest MathTest ObjectTest StringTest TimerTest TypeInfoTest )
+set( CPP_TESTS  AssertTest base64Test FileNameTest MathTest ObjectTest StringTest TimerTest TypeInfoTest )
 set( CUDA_TESTS  AssertCudaTest )
 if( BUILD_CUDA )
    set( CUDA_TESTS  ${CUDA_TESTS} AllocatorsTest FileTest )
@@ -28,3 +28,16 @@ if( BUILD_CUDA )
       add_test( ${target} ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${target}${CMAKE_EXECUTABLE_SUFFIX} )
    endforeach()
 endif()
+
+# special test needing external library
+find_package( ZLIB )
+if( ZLIB_FOUND )
+   foreach( target IN ITEMS zlibCompressionTest )
+      add_executable( ${target} ${target}.cpp )
+      target_compile_definitions( ${target} PUBLIC "-DHAVE_ZLIB" )
+      target_compile_options( ${target} PRIVATE ${CXX_TESTS_FLAGS} )
+      target_include_directories(${target} PUBLIC ${ZLIB_INCLUDE_DIRS})
+      target_link_libraries( ${target} ${GTEST_BOTH_LIBRARIES} ${ZLIB_LIBRARIES} )
+      add_test( ${target} ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${target}${CMAKE_EXECUTABLE_SUFFIX} )
+   endforeach()
+endif()
diff --git a/src/UnitTests/base64Test.cpp b/src/UnitTests/base64Test.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..bf63410ead430848037689e0c35bb203891003e4
--- /dev/null
+++ b/src/UnitTests/base64Test.cpp
@@ -0,0 +1,121 @@
+/***************************************************************************
+                          base64.cpp  -  description
+                             -------------------
+    begin                : Apr 11, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#ifdef HAVE_GTEST
+#include <gtest/gtest.h>
+#endif
+
+#include <TNL/base64.h>
+
+using namespace TNL::base64;
+
+#ifdef HAVE_GTEST
+TEST( base64Test, get_encoded_length )
+{
+   EXPECT_EQ( get_encoded_length( 0 ), 0UL );
+   EXPECT_EQ( get_encoded_length( 1 ), 4UL );
+   EXPECT_EQ( get_encoded_length( 2 ), 4UL );
+   EXPECT_EQ( get_encoded_length( 3 ), 4UL );
+   EXPECT_EQ( get_encoded_length( 4 ), 8UL );
+   EXPECT_EQ( get_encoded_length( 5 ), 8UL );
+   EXPECT_EQ( get_encoded_length( 6 ), 8UL );
+   EXPECT_EQ( get_encoded_length( 7 ), 12UL );
+   EXPECT_EQ( get_encoded_length( 8 ), 12UL );
+   EXPECT_EQ( get_encoded_length( 9 ), 12UL );
+}
+
+void test_encode_block( std::string input, std::string expected_output )
+{
+   const auto result = encode( (const std::uint8_t*) input.c_str(), input.length() );
+   const std::string encoded( result.get() );
+   EXPECT_EQ( encoded.length(), get_encoded_length( input.length() ) );
+   EXPECT_EQ( encoded, expected_output );
+}
+
+void test_decode_block( std::string input, std::string expected_output )
+{
+   const auto result = decode( input.c_str(), input.length() );
+   EXPECT_EQ( result.first, expected_output.length() );
+   const std::string decoded( reinterpret_cast<const char*>(result.second.get()), result.first );
+   EXPECT_EQ( decoded, expected_output );
+}
+
+TEST( base64Test, encode )
+{
+   test_encode_block( "hello, world", "aGVsbG8sIHdvcmxk" );
+   test_encode_block( "hello, worl",  "aGVsbG8sIHdvcmw=" );
+   test_encode_block( "hello, wor",   "aGVsbG8sIHdvcg==" );
+   test_encode_block( "hello, wo",    "aGVsbG8sIHdv"     );
+   test_encode_block( "hello, w",     "aGVsbG8sIHc="     );
+   test_encode_block( "hello, ",      "aGVsbG8sIA=="     );
+   test_encode_block( "hello,",       "aGVsbG8s"         );
+   test_encode_block( "hello",        "aGVsbG8="         );
+   test_encode_block( "hell",         "aGVsbA=="         );
+   test_encode_block( "hel",          "aGVs"             );
+   test_encode_block( "he",           "aGU="             );
+   test_encode_block( "h",            "aA=="             );
+   test_encode_block( "",             ""                 );
+}
+
+TEST( base64Test, decode )
+{
+   test_decode_block( "aGVsbG8sIHdvcmxk", "hello, world" );
+   test_decode_block( "aGVsbG8sIHdvcmw=", "hello, worl"  );
+   test_decode_block( "aGVsbG8sIHdvcg==", "hello, wor"   );
+   test_decode_block( "aGVsbG8sIHdv",     "hello, wo"    );
+   test_decode_block( "aGVsbG8sIHc=",     "hello, w"     );
+   test_decode_block( "aGVsbG8sIA==",     "hello, "      );
+   test_decode_block( "aGVsbG8s",         "hello,"       );
+   test_decode_block( "aGVsbG8=",         "hello"        );
+   test_decode_block( "aGVsbA==",         "hell"         );
+   test_decode_block( "aGVs",             "hel"          );
+   test_decode_block( "aGU=",             "he"           );
+   test_decode_block( "aA==",             "h"            );
+   test_decode_block( "",                 ""             );
+}
+
+TEST( base64Test, decode_invalid_chars )
+{
+   test_decode_block( "_", "" );
+   test_decode_block( "a_A==", "h" );
+   test_decode_block( "aA_==", "h" );
+   test_decode_block( "aA=_=", "h" );
+   test_decode_block( "aA==_", "h" );
+}
+
+TEST( base64Test, decode_invalid_padding )
+{
+   EXPECT_THROW( test_decode_block( "aaa", "" ),  std::invalid_argument );
+   EXPECT_THROW( test_decode_block( "aa", "" ),   std::invalid_argument );
+   EXPECT_THROW( test_decode_block( "a", "" ),    std::invalid_argument );
+   EXPECT_THROW( test_decode_block( "aa=", "" ),  std::invalid_argument );
+   EXPECT_THROW( test_decode_block( "a===", "" ), std::invalid_argument );
+   EXPECT_THROW( test_decode_block( "a==", "" ),  std::invalid_argument );
+   EXPECT_THROW( test_decode_block( "a=", "" ),   std::invalid_argument );
+   EXPECT_THROW( test_decode_block( "=", "" ),    std::invalid_argument );
+   EXPECT_THROW( test_decode_block( "==", "" ),   std::invalid_argument );
+   EXPECT_THROW( test_decode_block( "===", "" ),  std::invalid_argument );
+   EXPECT_THROW( test_decode_block( "====", "" ), std::invalid_argument );
+}
+
+TEST( base64Test, decode_ignore_after_padding )
+{
+   test_decode_block( "aGVsbG8=x", "hello" );
+   test_decode_block( "aGVsbG8=xx", "hello" );
+   test_decode_block( "aGVsbG8=xxx", "hello" );
+   test_decode_block( "aGVsbG8=xxxx", "hello" );
+   test_decode_block( "aGVsbG8=xxxxx", "hello" );
+   test_decode_block( "aGVsbG8=xxxxxx", "hello" );
+   test_decode_block( "aGVsbG8=xxxxxxx", "hello" );
+   test_decode_block( "aGVsbG8=xxxxxxxx", "hello" );
+}
+#endif
+
+#include "main.h"
diff --git a/src/UnitTests/zlibCompressionTest.cpp b/src/UnitTests/zlibCompressionTest.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c7feafbe8c651c0ebc3b1880e978972901394f7b
--- /dev/null
+++ b/src/UnitTests/zlibCompressionTest.cpp
@@ -0,0 +1,67 @@
+/***************************************************************************
+                          base64.cpp  -  description
+                             -------------------
+    begin                : Apr 11, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#include <sstream>
+#ifdef HAVE_GTEST
+#include <gtest/gtest.h>
+#endif
+
+#include <sstream>
+
+#include <TNL/zlib_compression.h>
+
+#ifdef HAVE_GTEST
+template< typename HeaderType >
+void test_compress( std::string input, std::string expected_output )
+{
+   std::stringstream str;
+   TNL::write_compressed_block< HeaderType >( input.c_str(), input.length(), str );
+   const std::string output = str.str();
+   EXPECT_EQ( output, expected_output );
+}
+
+template< typename HeaderType >
+void test_decompress( std::string input, std::string expected_output )
+{
+   // decompress C string
+   {
+      const auto result = TNL::decompress_block< HeaderType, char >( input.c_str() );
+      EXPECT_EQ( result.first, expected_output.length() );
+      const std::string output( result.second.get(), result.first );
+      EXPECT_EQ( output, expected_output );
+   }
+   // decompress stream
+   {
+      std::stringstream str( input );
+      const auto result = TNL::decompress_block< HeaderType, char >( str );
+      EXPECT_EQ( result.first, expected_output.length() );
+      const std::string output( result.second.get(), result.first );
+      EXPECT_EQ( output, expected_output );
+   }
+}
+
+TEST( base64Test, compress_string )
+{
+   test_compress< std::int32_t > ( "hello, world", "AQAAAAwAAAAMAAAAFAAAAA==eJzLSM3JyddRKM8vykkBAB1UBIk=" );
+   test_compress< std::uint32_t >( "hello, world", "AQAAAAwAAAAMAAAAFAAAAA==eJzLSM3JyddRKM8vykkBAB1UBIk=" );
+   test_compress< std::int64_t > ( "hello, world", "AQAAAAAAAAAMAAAAAAAAAAwAAAAAAAAAFAAAAAAAAAA=eJzLSM3JyddRKM8vykkBAB1UBIk=" );
+   test_compress< std::uint64_t >( "hello, world", "AQAAAAAAAAAMAAAAAAAAAAwAAAAAAAAAFAAAAAAAAAA=eJzLSM3JyddRKM8vykkBAB1UBIk=" );
+}
+
+TEST( base64Test, decode )
+{
+   test_decompress< std::int32_t  >( "AQAAAAwAAAAMAAAAFAAAAA==eJzLSM3JyddRKM8vykkBAB1UBIk=", "hello, world" );
+   test_decompress< std::uint32_t >( "AQAAAAwAAAAMAAAAFAAAAA==eJzLSM3JyddRKM8vykkBAB1UBIk=", "hello, world" );
+   test_decompress< std::int64_t  >( "AQAAAAAAAAAMAAAAAAAAAAwAAAAAAAAAFAAAAAAAAAA=eJzLSM3JyddRKM8vykkBAB1UBIk=", "hello, world" );
+   test_decompress< std::uint64_t >( "AQAAAAAAAAAMAAAAAAAAAAwAAAAAAAAAFAAAAAAAAAA=eJzLSM3JyddRKM8vykkBAB1UBIk=", "hello, world" );
+}
+#endif
+
+#include "main.h"