diff --git a/src/Benchmarks/HeatEquation/HeatEquationBenchmarkProblem_impl.h b/src/Benchmarks/HeatEquation/HeatEquationBenchmarkProblem_impl.h
index e3f47292306bbb879bf2677223f2f4b63cc7513c..3f0c9194867d011724e2387c882420e04e798ee6 100644
--- a/src/Benchmarks/HeatEquation/HeatEquationBenchmarkProblem_impl.h
+++ b/src/Benchmarks/HeatEquation/HeatEquationBenchmarkProblem_impl.h
@@ -490,7 +490,7 @@ getExplicitUpdate( const RealType& time,
 
       //std::cerr << "Setting boundary conditions..." << std::endl;
 
-      Devices::Cuda::synchronizeDevice();
+      Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
       for( IndexType gridYIdx = 0; gridYIdx < cudaYGrids; gridYIdx ++ )
          for( IndexType gridXIdx = 0; gridXIdx < cudaXGrids; gridXIdx ++ )
            boundaryConditionsTemplatedCompact< MeshType, CellType, BoundaryCondition, MeshFunctionType >
@@ -594,7 +594,7 @@ getExplicitUpdate( const RealType& time,
            gridYSize / 16 + ( gridYSize % 16 != 0 ) );
      */
 
-      TNL::Devices::Cuda::synchronizeDevice();
+      Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
      int cudaErr;
      Meshes::Traverser< MeshType, Cell > meshTraverser;
      meshTraverser.template processInteriorEntities< UserData,
diff --git a/src/Benchmarks/HeatEquation/Tuning/GridTraverser_impl.h b/src/Benchmarks/HeatEquation/Tuning/GridTraverser_impl.h
index 2a77f8bb5287a75cee1b38356c96dd49dbff5250..c9fe0e43be20b175321dc1b50c563fb48e843b5d 100644
--- a/src/Benchmarks/HeatEquation/Tuning/GridTraverser_impl.h
+++ b/src/Benchmarks/HeatEquation/Tuning/GridTraverser_impl.h
@@ -246,7 +246,7 @@ processEntities(
       IndexType cudaThreadsCount = 2 * ( end.x() - begin.x() + end.y() - begin.y() + 1 );
       Cuda::setupThreads( cudaBlockSize, cudaBlocksCount, cudaGridsCount, cudaThreadsCount );
       dim3 gridIdx, cudaGridSize;
-      Devices::Cuda::synchronizeDevice();
+      Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
       for( gridIdx.x = 0; gridIdx.x < cudaGridsCount.x; gridIdx.x++ )
       {
          Cuda::setupGrid( cudaBlocksCount, cudaGridsCount, gridIdx, cudaGridSize );
@@ -273,7 +273,7 @@ processEntities(
       auto& pool = Cuda::StreamPool::getInstance();
       const cudaStream_t& s = pool.getStream( stream );
 
-      Devices::Cuda::synchronizeDevice();
+      Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
       dim3 gridIdx, cudaGridSize;
       for( gridIdx.y = 0; gridIdx.y < cudaGridsCount.y; gridIdx.y ++ )
          for( gridIdx.x = 0; gridIdx.x < cudaGridsCount.x; gridIdx.x ++ )
diff --git a/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h b/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h
index 6661c5f6a8e720df9a265ca5cb3c5b97717294d5..ffb2f121a2b68719e6e07c29ce404384aa715b23 100644
--- a/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h
+++ b/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h
@@ -130,7 +130,7 @@ benchmarkIterativeSolvers( Benchmark& benchmark,
    *cudaMatrixPointer = *matrixPointer;
 
    // synchronize shared pointers
-   Devices::Cuda::synchronizeDevice();
+   Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
 #endif
 
    using namespace Solvers::Linear;
diff --git a/src/TNL/Devices/Cuda.h b/src/TNL/Devices/Cuda.h
index 853cd2e038c669485781a1462cdb4a54a9670e35..fc924ccc9f054cd5b55c0e21ab836dbfa2b52687 100644
--- a/src/TNL/Devices/Cuda.h
+++ b/src/TNL/Devices/Cuda.h
@@ -10,13 +10,7 @@
 
 #pragma once
 
-#include <iostream>
-
 #include <TNL/String.h>
-#include <TNL/Assert.h>
-#include <TNL/Pointers/SmartPointersRegister.h>
-#include <TNL/Timer.h>
-#include <TNL/Cuda/CudaCallable.h>
 #include <TNL/Config/ConfigDescription.h>
 #include <TNL/Config/ParameterContainer.h>
 
@@ -33,16 +27,6 @@ public:
 
    static inline constexpr int getGPUTransferBufferSize();
 
-   static inline void insertSmartPointer( Pointers::SmartPointer* pointer );
-
-   static inline void removeSmartPointer( Pointers::SmartPointer* pointer );
-
-   // Negative deviceId means that CudaDeviceInfo::getActiveDevice will be
-   // called to get the device ID.
-   static inline bool synchronizeDevice( int deviceId = -1 );
-
-   static inline Timer& getSmartPointersSynchronizationTimer();
-
    ////
    // When we transfer data between the GPU and the CPU we use 5 MB buffer. This
    // size should ensure good performance -- see.
@@ -50,11 +34,6 @@ public:
    // We use the same buffer size even for retyping data during IO operations.
    //
    static constexpr std::size_t TransferBufferSize = 5 * 2<<20;
-
-
-   protected:
-
-   static inline Pointers::SmartPointersRegister& getSmartPointersRegister();
 };
 
 #ifdef HAVE_CUDA
diff --git a/src/TNL/Devices/Cuda_impl.h b/src/TNL/Devices/Cuda_impl.h
index 6d3daa356cb64f2373451a260a1403aabff83be6..7a4d59fcc0a87d224867f9eab930cc6bbf2c06d1 100644
--- a/src/TNL/Devices/Cuda_impl.h
+++ b/src/TNL/Devices/Cuda_impl.h
@@ -10,12 +10,15 @@
 
 #pragma once
 
+#include <iostream>
+
 #include <TNL/Math.h>
 #include <TNL/Devices/Cuda.h>
 #include <TNL/Cuda/DeviceInfo.h>
 #include <TNL/Exceptions/CudaBadAlloc.h>
 #include <TNL/Exceptions/CudaSupportMissing.h>
 #include <TNL/Exceptions/CudaRuntimeError.h>
+#include <TNL/Pointers/SmartPointersRegister.h>
 
 namespace TNL {
 namespace Devices {
@@ -42,8 +45,8 @@ Cuda::setup( const Config::ParameterContainer& parameters,
       std::cerr << "I cannot activate CUDA device number " << cudaDevice << "." << std::endl;
       return false;
    }
-   getSmartPointersSynchronizationTimer().reset();
-   getSmartPointersSynchronizationTimer().stop();
+   Pointers::getSmartPointersSynchronizationTimer< Devices::Cuda >().reset();
+   Pointers::getSmartPointersSynchronizationTimer< Devices::Cuda >().stop();
 #endif
    return true;
 }
@@ -53,42 +56,6 @@ inline constexpr int Cuda::getGPUTransferBufferSize()
    return 1 << 20;
 }
 
-inline void Cuda::insertSmartPointer( Pointers::SmartPointer* pointer )
-{
-   getSmartPointersRegister().insert( pointer, TNL::Cuda::DeviceInfo::getActiveDevice() );
-}
-
-inline void Cuda::removeSmartPointer( Pointers::SmartPointer* pointer )
-{
-   getSmartPointersRegister().remove( pointer, TNL::Cuda::DeviceInfo::getActiveDevice() );
-}
-
-inline bool Cuda::synchronizeDevice( int deviceId )
-{
-#ifdef HAVE_CUDA
-   if( deviceId < 0 )
-      deviceId = TNL::Cuda::DeviceInfo::getActiveDevice();
-   getSmartPointersSynchronizationTimer().start();
-   bool b = getSmartPointersRegister().synchronizeDevice( deviceId );
-   getSmartPointersSynchronizationTimer().stop();
-   return b;
-#else
-   return true;
-#endif
-}
-
-inline Timer& Cuda::getSmartPointersSynchronizationTimer()
-{
-   static Timer timer;
-   return timer;
-}
-
-inline Pointers::SmartPointersRegister& Cuda::getSmartPointersRegister()
-{
-   static Pointers::SmartPointersRegister reg;
-   return reg;
-}
-
 // double-precision atomicAdd function for Maxwell and older GPUs
 // copied from: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomic-functions
 #ifdef HAVE_CUDA
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase1D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase1D_impl.h
index 4dac64a23fc50d85186e2a1f118f818807d0fc87..49cda643cc73a6717f47d877315b980e5589d60c 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase1D_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase1D_impl.h
@@ -25,7 +25,7 @@ initInterface( const MeshFunctionPointer& _input,
       int numBlocksX = Cuda::getNumberOfBlocks( mesh.getDimensions().x(), cudaBlockSize );
       dim3 blockSize( cudaBlockSize );
       dim3 gridSize( numBlocksX );
-      Devices::Cuda::synchronizeDevice();
+      Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
       CudaInitCaller<<< gridSize, blockSize >>>( _input.template getData< Device >(),
                                                  _output.template modifyData< Device >(),
                                                  _interfaceMap.template modifyData< Device >() );
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase2D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase2D_impl.h
index 947a4be062a9b8763ec9a6042c41d3048aa8a361..b18252cb078b803d6e911ca130b9b161c241ff4d 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase2D_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase2D_impl.h
@@ -29,7 +29,7 @@ initInterface( const MeshFunctionPointer& _input,
       int numBlocksY = Cuda::getNumberOfBlocks( mesh.getDimensions().y(), cudaBlockSize );
       dim3 blockSize( cudaBlockSize, cudaBlockSize );
       dim3 gridSize( numBlocksX, numBlocksY );
-      Devices::Cuda::synchronizeDevice();
+      Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
       CudaInitCaller<<< gridSize, blockSize >>>( _input.template getData< Device >(),
                                                  _output.template modifyData< Device >(),
                                                  _interfaceMap.template modifyData< Device >(),
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase3D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase3D_impl.h
index eb0665c7e4a66a8424af4a312d68f0f3a039a934..fd7dc9381ec1325f108078271df343953be58ebd 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase3D_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase3D_impl.h
@@ -30,7 +30,7 @@ initInterface( const MeshFunctionPointer& _input,
         std::cout << "Invalid kernel call. Dimensions of grid are max: [1024,1024,64], and maximum threads per block are 1024!"
                   << std::endl;
       dim3 blockSize( cudaBlockSize, cudaBlockSize, cudaBlockSize );
       dim3 gridSize( numBlocksX, numBlocksY, numBlocksZ );
-      Devices::Cuda::synchronizeDevice();
+      Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
       CudaInitCaller3d<<< gridSize, blockSize >>>( _input.template getData< Device >(),
                                                    _output.template modifyData< Device >(),
                                                    _interfaceMap.template modifyData< Device >(), vLower, vUpper );
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
index c5a0f74cc53f89808f5308aa628c6c55802eb200..1b1666a02b627778bd8364a7e46f5e22718ff76f 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
@@ -316,7 +316,7 @@ solve( const MeshPointer& mesh,
 
         /** HERE IS FIM FOR MPI AND WITHOUT MPI **/
-        Devices::Cuda::synchronizeDevice();
+        Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
         CudaUpdateCellCaller<18><<< gridSize, blockSize >>>( ptr,
                 interfaceMapPtr.template getData< Device >(),
                 auxPtr.template getData< Device>(),
                 helpFunc.template modifyData< Device>(),
                 blockCalculationIndicator.getView(), vecLowerOverlaps, vecUpperOverlaps );
@@ -327,7 +327,7 @@ solve( const MeshPointer& mesh,
         auxPtr.swap( helpFunc );
 
         // Getting blocks that should calculate in next passage. These blocks are neighbours of those that were calculated now.
-        Devices::Cuda::synchronizeDevice();
+        Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
         GetNeighbours<<< nBlocksNeigh, 1024 >>>( blockCalculationIndicator.getView(), blockCalculationIndicatorHelp.getView(), numBlocksX, numBlocksY );
         cudaDeviceSynchronize();
         TNL_CHECK_CUDA_DEVICE;
@@ -349,7 +349,7 @@ solve( const MeshPointer& mesh,
         if( numIter%2 == 1 ) // Need to check parity for MPI overlaps to synchronize ( otherwise doesnt work )
         {
           helpFunc.swap( auxPtr );
-          Devices::Cuda::synchronizeDevice();
+          Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
           cudaDeviceSynchronize();
           TNL_CHECK_CUDA_DEVICE;
         }
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h
index 3fce5564efdf65ebaf45c62eaed8cae259d1c4b3..82185a937d832b0785b597188aeb0989ab751d47 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h
@@ -295,14 +295,14 @@ solve( const MeshPointer& mesh,
       //MeshFunctionPointer helpFunc1( mesh );
       MeshFunctionPointer helpFunc( mesh );
       helpFunc.template modifyData() = auxPtr.template getData();
-      Devices::Cuda::synchronizeDevice();
+      Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
 
       int numIter = 0; // number of passages of following while cycle
 
       while( BlockIterD ) //main body of cuda code
       {
-        Devices::Cuda::synchronizeDevice();
+        Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
        // main function that calculates all values in each blocks
        // calculated values are in helpFunc
         CudaUpdateCellCaller< 10 ><<< gridSize, blockSize >>>( ptr,
@@ -315,14 +315,14 @@ solve( const MeshPointer& mesh,
 
        // Switching pointers to helpFunc and auxPtr so real results are in memory of helpFunc but here under variable auxPtr
         auxPtr.swap( helpFunc );
-        Devices::Cuda::synchronizeDevice();
+        Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
 
        // Neighbours of blocks that calculatedBefore in this passage should calculate in the next!
        // BlockIterDevice contains blocks that calculatedBefore in this passage and BlockIterPom those that should calculate in next (are neighbours)
         GetNeighbours<<< nBlocksNeigh, 1024 >>>( BlockIterDevice.getView(), BlockIterPom.getView(), numBlocksX, numBlocksY, numBlocksZ );
         cudaDeviceSynchronize();
         TNL_CHECK_CUDA_DEVICE;
         BlockIterDevice = BlockIterPom;
-        Devices::Cuda::synchronizeDevice();
+        Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
 
        // .containsValue(1) is actually parallel reduction implemented in TNL
         BlockIterD = BlockIterDevice.containsValue(1);
@@ -340,7 +340,7 @@ solve( const MeshPointer& mesh,
       // We need auxPtr to point on memory of original auxPtr (not to helpFunc)
       // last passage of previous while cycle didnt calculate any number anyway so switching names doesnt effect values
         auxPtr.swap( helpFunc );
-        Devices::Cuda::synchronizeDevice();
+        Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
     }
     cudaDeviceSynchronize();
     TNL_CHECK_CUDA_DEVICE;
diff --git a/src/TNL/Matrices/SparseOperations_impl.h b/src/TNL/Matrices/SparseOperations_impl.h
index b6d118bf22e39c9ee345269c2a2cc28760b3a198..ce7caaf3297dc869b6b0ae969d567c939e5b9ebb 100644
--- a/src/TNL/Matrices/SparseOperations_impl.h
+++ b/src/TNL/Matrices/SparseOperations_impl.h
@@ -140,7 +140,7 @@ copySparseMatrix_impl( Matrix1& A, const Matrix2& B )
       const Pointers::DevicePointer< const Matrix2 > Bpointer( B );
 
       // set row lengths
-      Devices::Cuda::synchronizeDevice();
+      Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
       SparseMatrixSetRowLengthsVectorKernel<<< gridSize, blockSize >>>(
             rowLengths.getData(),
             &Bpointer.template getData< TNL::Devices::Cuda >(),
@@ -150,7 +150,7 @@ copySparseMatrix_impl( Matrix1& A, const Matrix2& B )
       Apointer->setCompressedRowLengths( rowLengths );
 
       // copy rows
-      Devices::Cuda::synchronizeDevice();
+      Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
       SparseMatrixCopyKernel<<< gridSize, blockSize >>>(
             &Apointer.template modifyData< TNL::Devices::Cuda >(),
             &Bpointer.template getData< TNL::Devices::Cuda >(),
diff --git a/src/TNL/Meshes/GridDetails/GridTraverser_1D.hpp b/src/TNL/Meshes/GridDetails/GridTraverser_1D.hpp
index 796ffe491d572c108f7b8072769e42e9d5c9571d..c1aab9660d50ee8fe6917ae08c6bb869e333c056 100644
--- a/src/TNL/Meshes/GridDetails/GridTraverser_1D.hpp
+++ b/src/TNL/Meshes/GridDetails/GridTraverser_1D.hpp
@@ -185,7 +185,7 @@ processEntities(
       auto& pool = Cuda::StreamPool::getInstance();
       const cudaStream_t& s = pool.getStream( stream );
 
-      Devices::Cuda::synchronizeDevice();
+      Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
       if( processOnlyBoundaryEntities )
       {
          dim3 cudaBlockSize( 2 );
diff --git a/src/TNL/Meshes/GridDetails/GridTraverser_2D.hpp b/src/TNL/Meshes/GridDetails/GridTraverser_2D.hpp
index 15c5a0eda13a10bd512fe69930bed1e9c90f068d..721ec96d2331c103cb0179e5bd77b224b700c28f 100644
--- a/src/TNL/Meshes/GridDetails/GridTraverser_2D.hpp
+++ b/src/TNL/Meshes/GridDetails/GridTraverser_2D.hpp
@@ -418,8 +418,8 @@ processEntities(
       Cuda::setupThreads( cudaBlockSize, cudaBlocksCountAlongY, cudaGridsCountAlongY, end.y() - begin.y() - 1 );
 
       auto& pool = Cuda::StreamPool::getInstance();
-      Devices::Cuda::synchronizeDevice();
-
+      Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
+
       const cudaStream_t& s1 = pool.getStream( stream );
       const cudaStream_t& s2 = pool.getStream( stream + 1 );
       dim3 gridIdx, cudaGridSize;
@@ -486,7 +486,7 @@ processEntities(
       //std::cerr << "blocksPerFace = " << blocksPerFace << "Threads count = " << cudaThreadsCount
       //          << "cudaBlockCount = " << cudaBlocksCount.x << std::endl;
       dim3 gridIdx, cudaGridSize;
-      Devices::Cuda::synchronizeDevice();
+      Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
       for( gridIdx.x = 0; gridIdx.x < cudaGridsCount.x; gridIdx.x++ )
       {
         Cuda::setupGrid( cudaBlocksCount, cudaGridsCount, gridIdx, cudaGridSize );
@@ -518,7 +518,7 @@ processEntities(
       auto& pool = Cuda::StreamPool::getInstance();
       const cudaStream_t& s = pool.getStream( stream );
 
-      Devices::Cuda::synchronizeDevice();
+      Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
       dim3 gridIdx, cudaGridSize;
       for( gridIdx.y = 0; gridIdx.y < cudaGridsCount.y; gridIdx.y ++ )
          for( gridIdx.x = 0; gridIdx.x < cudaGridsCount.x; gridIdx.x ++ )
diff --git a/src/TNL/Meshes/GridDetails/GridTraverser_3D.hpp b/src/TNL/Meshes/GridDetails/GridTraverser_3D.hpp
index 489e58d779b57e7b5afea4b2d99516fb23b04f31..a9aad8c9533dfecdc6e5410be51705d24438725c 100644
--- a/src/TNL/Meshes/GridDetails/GridTraverser_3D.hpp
+++ b/src/TNL/Meshes/GridDetails/GridTraverser_3D.hpp
@@ -346,8 +346,8 @@ processEntities(
       Cuda::setupThreads( cudaBlockSize, cudaBlocksCountAlongYZ, cudaGridsCountAlongYZ, entitiesAlongY - 2, entitiesAlongZ - 2 );
 
       auto& pool = Cuda::StreamPool::getInstance();
-      Devices::Cuda::synchronizeDevice();
-
+      Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
+
       const cudaStream_t& s1 = pool.getStream( stream );
       const cudaStream_t& s2 = pool.getStream( stream + 1 );
       const cudaStream_t& s3 = pool.getStream( stream + 2 );
@@ -458,7 +458,7 @@ processEntities(
       auto& pool = Cuda::StreamPool::getInstance();
       const cudaStream_t& s = pool.getStream( stream );
 
-      Devices::Cuda::synchronizeDevice();
+      Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
       dim3 gridIdx, gridSize;
       for( gridIdx.z = 0; gridIdx.z < cudaGridsCount.z; gridIdx.z ++ )
          for( gridIdx.y = 0; gridIdx.y < cudaGridsCount.y; gridIdx.y ++ )
diff --git a/src/TNL/Meshes/MeshDetails/Traverser_impl.h b/src/TNL/Meshes/MeshDetails/Traverser_impl.h
index 27cbba714160986d156c4c5e828274df53fb1bf3..33832d4f10944d51c5bb01646d2a239c86365f7f 100644
--- a/src/TNL/Meshes/MeshDetails/Traverser_impl.h
+++ b/src/TNL/Meshes/MeshDetails/Traverser_impl.h
@@ -164,7 +164,7 @@ processBoundaryEntities( const MeshPointer& meshPointer,
       const int desGridSize = 32 * Cuda::DeviceInfo::getCudaMultiprocessors( Cuda::DeviceInfo::getActiveDevice() );
       gridSize.x = min( desGridSize, Cuda::getNumberOfBlocks( entitiesCount, blockSize.x ) );
 
-      Devices::Cuda::synchronizeDevice();
+      Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
       MeshTraverserBoundaryEntitiesKernel< EntitiesDimension, EntitiesProcessor >
          <<< gridSize, blockSize >>>
         ( &meshPointer.template getData< Devices::Cuda >(),
@@ -195,7 +195,7 @@ processInteriorEntities( const MeshPointer& meshPointer,
       const int desGridSize = 32 * Cuda::DeviceInfo::getCudaMultiprocessors( Cuda::DeviceInfo::getActiveDevice() );
       gridSize.x = min( desGridSize, Cuda::getNumberOfBlocks( entitiesCount, blockSize.x ) );
 
-      Devices::Cuda::synchronizeDevice();
+      Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
       MeshTraverserInteriorEntitiesKernel< EntitiesDimension, EntitiesProcessor >
         <<< gridSize, blockSize >>>
         ( &meshPointer.template getData< Devices::Cuda >(),
@@ -226,7 +226,7 @@ processAllEntities( const MeshPointer& meshPointer,
       const int desGridSize = 32 * Cuda::DeviceInfo::getCudaMultiprocessors( Cuda::DeviceInfo::getActiveDevice() );
       gridSize.x = min( desGridSize, Cuda::getNumberOfBlocks( entitiesCount, blockSize.x ) );
 
-      Devices::Cuda::synchronizeDevice();
+      Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
       MeshTraverserAllEntitiesKernel< EntitiesDimension, EntitiesProcessor >
         <<< gridSize, blockSize >>>
         ( &meshPointer.template getData< Devices::Cuda >(),
diff --git a/src/TNL/Pointers/DevicePointer.h b/src/TNL/Pointers/DevicePointer.h
index d3bf9b07f9b2c59030433e1bca633a2601bb73d0..5276c3ed465938e7e7fcdfde2885dc8986cac3b5 100644
--- a/src/TNL/Pointers/DevicePointer.h
+++ b/src/TNL/Pointers/DevicePointer.h
@@ -16,6 +16,7 @@
 #include <TNL/Devices/Host.h>
 #include <TNL/Devices/Cuda.h>
 #include <TNL/Pointers/SmartPointer.h>
+#include <TNL/Pointers/SmartPointersRegister.h>
 #include <TNL/TypeInfo.h>
 #include <TNL/Cuda/MemoryHelpers.h>
 
@@ -406,7 +407,7 @@ class DevicePointer< Object, Devices::Cuda > : public SmartPointer
      ~DevicePointer()
      {
         this->free();
-        Devices::Cuda::removeSmartPointer( this );
+        getSmartPointersRegister< DeviceType >().remove( this );
      }
 
   protected:
@@ -426,7 +427,7 @@ class DevicePointer< Object, Devices::Cuda > : public SmartPointer
         this->cuda_pointer = Cuda::passToDevice( *this->pointer );
         // set last-sync state
         this->set_last_sync_state();
-        Devices::Cuda::insertSmartPointer( this );
+        getSmartPointersRegister< DeviceType >().insert( this );
         return true;
      }
diff --git a/src/TNL/Pointers/SharedPointerCuda.h b/src/TNL/Pointers/SharedPointerCuda.h
index 27a4aabef4ed7e1383ae5f141e1c7e56a6f5d350..54dd4ee3c71c7eb461de7d8c906fd42dc96af1c6 100644
--- a/src/TNL/Pointers/SharedPointerCuda.h
+++ b/src/TNL/Pointers/SharedPointerCuda.h
@@ -16,6 +16,7 @@
 
 #include <TNL/Devices/Cuda.h>
 #include <TNL/Pointers/SmartPointer.h>
+#include <TNL/Pointers/SmartPointersRegister.h>
 #include <TNL/Cuda/MemoryHelpers.h>
 
 #include <cstring>   // std::memcpy, std::memcmp
@@ -546,7 +547,7 @@ class SharedPointer< Object, Devices::Cuda > : public SmartPointer
      ~SharedPointer()
      {
         this->free();
-        Devices::Cuda::removeSmartPointer( this );
+        getSmartPointersRegister< DeviceType >().remove( this );
      }
 
   protected:
@@ -577,7 +578,7 @@ class SharedPointer< Object, Devices::Cuda > : public SmartPointer
 #ifdef TNL_DEBUG_SHARED_POINTERS
         std::cerr << "Created shared pointer to " << getType< ObjectType >() << " (cuda_pointer = " << this->cuda_pointer << ")" << std::endl;
 #endif
-        Devices::Cuda::insertSmartPointer( this );
+        getSmartPointersRegister< DeviceType >().insert( this );
         return true;
      }
diff --git a/src/TNL/Pointers/SmartPointersRegister.h b/src/TNL/Pointers/SmartPointersRegister.h
index ad716b9c036ee011dca583b2557247ce0d110453..5094c1c0ee26887262f02656be7efd9f6b923345 100644
--- a/src/TNL/Pointers/SmartPointersRegister.h
+++ b/src/TNL/Pointers/SmartPointersRegister.h
@@ -2,7 +2,7 @@
                           SmartPointersRegister.h  -  description
                              -------------------
     begin                : Apr 29, 2016
-    copyright            : (C) 2016 by Tomas Oberhuber
+    copyright            : (C) 2016 by Tomas Oberhuber et al.
     email                : tomas.oberhuber@fjfi.cvut.cz
  ***************************************************************************/
 
@@ -12,24 +12,43 @@
 
 #include <unordered_set>
 #include <unordered_map>
+
 #include <TNL/Pointers/SmartPointer.h>
-#include <TNL/Assert.h>
+#include <TNL/Timer.h>
+#include <TNL/Cuda/DeviceInfo.h>
 
 namespace TNL {
 namespace Pointers {
 
+// Since TNL currently supports only execution on host (which does not need
+// to register and synchronize smart pointers) and CUDA GPUs, the smart
+// pointers register is implemented only for CUDA. If more execution types
+// which need to register smart pointers are implemented in the future, this
+// should become a class template specialization.
 class SmartPointersRegister
 {
 
    public:
 
-      void insert( SmartPointer* pointer, int deviceId )
+      /**
+       * Negative deviceId means that \ref Cuda::DeviceInfo::getActiveDevice will be
+       * called to get the device ID.
+       */
+      void insert( SmartPointer* pointer, int deviceId = -1 )
       {
+         if( deviceId < 0 )
+            deviceId = Cuda::DeviceInfo::getActiveDevice();
          pointersOnDevices[ deviceId ].insert( pointer );
       }
 
-      void remove( SmartPointer* pointer, int deviceId )
+      /**
+       * Negative deviceId means that \ref Cuda::DeviceInfo::getActiveDevice will be
+       * called to get the device ID.
+       */
+      void remove( SmartPointer* pointer, int deviceId = -1 )
       {
+         if( deviceId < 0 )
+            deviceId = Cuda::DeviceInfo::getActiveDevice();
         try {
            pointersOnDevices.at( deviceId ).erase( pointer );
        }
@@ -41,8 +60,14 @@ class SmartPointersRegister
        }
     }
 
-      bool synchronizeDevice( int deviceId )
+      /**
+       * Negative deviceId means that \ref Cuda::DeviceInfo::getActiveDevice will be
+       * called to get the device ID.
+       */
+      bool synchronizeDevice( int deviceId = -1 )
      {
+         if( deviceId < 0 )
+            deviceId = Cuda::DeviceInfo::getActiveDevice();
         try {
            const auto & set = pointersOnDevices.at( deviceId );
            for( auto&& it : set )
@@ -61,5 +86,34 @@ class SmartPointersRegister
       std::unordered_map< int, SetType > pointersOnDevices;
 };
 
+
+// TODO: Device -> Allocator (in all smart pointers)
+template< typename Device >
+SmartPointersRegister& getSmartPointersRegister()
+{
+   static SmartPointersRegister reg;
+   return reg;
+}
+
+template< typename Device >
+Timer& getSmartPointersSynchronizationTimer()
+{
+   static Timer timer;
+   return timer;
+}
+
+/**
+ * Negative deviceId means that the ID of the currently active device will be
+ * determined automatically.
+ */
+template< typename Device >
+bool synchronizeSmartPointersOnDevice( int deviceId = -1 )
+{
+   getSmartPointersSynchronizationTimer< Device >().start();
+   bool b = getSmartPointersRegister< Device >().synchronizeDevice( deviceId );
+   getSmartPointersSynchronizationTimer< Device >().stop();
+   return b;
+}
+
 } // namespace Pointers
 } // namespace TNL
diff --git a/src/TNL/Pointers/UniquePointer.h b/src/TNL/Pointers/UniquePointer.h
index 9a7a2b677769fde29d49419141a76cd9cd6f4373..071de4d51132fa6b71e0e6b86ab16acd3c8269c8 100644
--- a/src/TNL/Pointers/UniquePointer.h
+++ b/src/TNL/Pointers/UniquePointer.h
@@ -16,6 +16,7 @@
 #include <TNL/Devices/Host.h>
 #include <TNL/Devices/Cuda.h>
 #include <TNL/Pointers/SmartPointer.h>
+#include <TNL/Pointers/SmartPointersRegister.h>
 #include <TNL/Cuda/MemoryHelpers.h>
 
 #include <cstring>   // std::memcpy, std::memcmp
@@ -250,7 +251,7 @@ class UniquePointer< Object, Devices::Cuda > : public SmartPointer
      ~UniquePointer()
      {
         this->free();
-        Devices::Cuda::removeSmartPointer( this );
+        getSmartPointersRegister< DeviceType >().remove( this );
      }
 
   protected:
@@ -276,7 +277,7 @@ class UniquePointer< Object, Devices::Cuda > : public SmartPointer
         this->cuda_pointer = Cuda::passToDevice( this->pd->data );
         // set last-sync state
         this->set_last_sync_state();
-        Devices::Cuda::insertSmartPointer( this );
+        getSmartPointersRegister< DeviceType >().insert( this );
         return true;
      }
diff --git a/src/TNL/Solvers/Linear/Preconditioners/ILU0_impl.h b/src/TNL/Solvers/Linear/Preconditioners/ILU0_impl.h
index 626469920ff9e08d7f935e13017086b7cd583081..be9e37f23d8d1fa7e1250d2ffc91ec0e8870e9fa 100644
--- a/src/TNL/Solvers/Linear/Preconditioners/ILU0_impl.h
+++ b/src/TNL/Solvers/Linear/Preconditioners/ILU0_impl.h
@@ -282,7 +282,7 @@ allocate_LU()
    U->setDimensions( N, N );
 
    // extract raw pointer
-   Devices::Cuda::synchronizeDevice();
+   Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
    const CSR* kernel_A = &A.template getData< DeviceType >();
 
    // copy row lengths
@@ -329,7 +329,7 @@ copy_triangular_factors()
    const int N = A->getRows();
 
    // extract raw pointers
-   Devices::Cuda::synchronizeDevice();
+   Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
    CSR* kernel_L = &L.template modifyData< DeviceType >();
    CSR* kernel_U = &U.template modifyData< DeviceType >();
    const CSR* kernel_A = &A.template getData< DeviceType >();
diff --git a/src/TNL/Solvers/SolverStarter_impl.h b/src/TNL/Solvers/SolverStarter_impl.h
index e52d03a4f7377c5fc80ade50d7981a3a07a48f08..8b323d5d745e08216bc3477adc03d3495bce8dce 100644
--- a/src/TNL/Solvers/SolverStarter_impl.h
+++ b/src/TNL/Solvers/SolverStarter_impl.h
@@ -406,7 +406,7 @@ bool SolverStarter< ConfigTag > :: writeEpilog( std::ostream& str, const Solver&
    if( std::is_same< typename Solver::DeviceType, TNL::Devices::Cuda >::value )
    {
       logger.writeParameter< const char* >( "GPU synchronization time:", "" );
-      TNL::Devices::Cuda::getSmartPointersSynchronizationTimer().writeLog( logger, 1 );
+      Pointers::getSmartPointersSynchronizationTimer< Devices::Cuda >().writeLog( logger, 1 );
   }
   logger.writeParameter< const char* >( "I/O time:", "" );
   this->ioTimer.writeLog( logger, 1 );
diff --git a/src/UnitTests/Pointers/SharedPointerCudaTest.cu b/src/UnitTests/Pointers/SharedPointerCudaTest.cu
index c0d76b2cc050d074831a4a6065d71b99ea24a7e9..83b6b4793bf6d5e17f6587b71c60261e8b80cea0 100644
--- a/src/UnitTests/Pointers/SharedPointerCudaTest.cu
+++ b/src/UnitTests/Pointers/SharedPointerCudaTest.cu
@@ -55,7 +55,7 @@ TEST( SharedPointerCudaTest, getDataTest )
    ASSERT_EQ( ptr1->y(), 2 );
 #else
-   Devices::Cuda::synchronizeDevice();
+   Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
 
    TestType aux;
 
@@ -89,7 +89,7 @@ TEST( SharedPointerCudaTest, getDataArrayTest )
    ptr->setElement( 0, 1 );
    ptr->setElement( 1, 2 );
 
-   Devices::Cuda::synchronizeDevice();
+   Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
 
    int *testArray_device, *testArray_host;
    cudaMalloc( ( void** ) &testArray_device, 2 * sizeof( int ) );
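
Usage note (an illustration, not part of the patch): the sketch below shows how a typical call site migrates from the removed Devices::Cuda::synchronizeDevice() to the new free functions in the TNL::Pointers namespace, following the same pattern as the kernel launches above. It relies only on API visible in this diff (SharedPointer, synchronizeSmartPointersOnDevice, getData, operator-> as used in SharedPointerCudaTest.cu, the synchronization timer); the Params struct, scaleKernel, and launch function are hypothetical names invented for the example, and the <TNL/Pointers/SharedPointer.h> header is assumed to be the public entry point for SharedPointerCuda.h.

#include <TNL/Devices/Cuda.h>
#include <TNL/Pointers/SharedPointer.h>
#include <TNL/Pointers/SmartPointersRegister.h>

using namespace TNL;

// Hypothetical parameter object with a host image and a device image.
struct Params
{
   double scale = 1.0;
};

#ifdef HAVE_CUDA
__global__ void scaleKernel( const Params* params, double* data, int size )
{
   const int i = blockIdx.x * blockDim.x + threadIdx.x;
   if( i < size )
      data[ i ] *= params->scale;
}
#endif

void launch( double* data, int size )
{
#ifdef HAVE_CUDA
   // Construction registers the pointer in
   // Pointers::getSmartPointersRegister< Devices::Cuda >() (see SharedPointerCuda.h above).
   Pointers::SharedPointer< Params, Devices::Cuda > params;

   // operator-> modifies the host image and marks the pointer for synchronization.
   params->scale = 0.5;

   // Before this patch: Devices::Cuda::synchronizeDevice();
   // Now a free function copies every modified host image to the GPU and
   // charges the time to getSmartPointersSynchronizationTimer< Devices::Cuda >().
   Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();

   // getData< Devices::Cuda >() yields the device image, as in the traverser kernels above.
   scaleKernel<<< ( size + 255 ) / 256, 256 >>>(
      &params.getData< Devices::Cuda >(), data, size );
   cudaDeviceSynchronize();
#endif
}

The design point of the refactoring is visible here: synchronization no longer goes through static members of Devices::Cuda, so the Pointers layer owns its own per-device registry and timer, and Devices/Cuda.h sheds its dependency on the smart-pointer headers.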