diff --git a/src/Benchmarks/BLAS/spmv.h b/src/Benchmarks/BLAS/spmv.h
index 9fe469509b7cb4aa4f2ca6a0c32251cf370b266c..d515d52d73d513d87b86d4b743d8b0e27b20e0ca 100644
--- a/src/Benchmarks/BLAS/spmv.h
+++ b/src/Benchmarks/BLAS/spmv.h
@@ -53,7 +53,7 @@ __global__ void setCudaTestMatrixKernel( Matrix* matrix,
                                          const int elementsPerRow,
                                          const int gridIdx )
 {
-   const int rowIdx = ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   const int rowIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
    if( rowIdx >= matrix->getRows() )
       return;
    int col = rowIdx - elementsPerRow / 2;
@@ -73,12 +73,12 @@ void setCudaTestMatrix( Matrix& matrix,
    typedef typename Matrix::IndexType IndexType;
    typedef typename Matrix::RealType RealType;
    Pointers::DevicePointer< Matrix > kernel_matrix( matrix );
-   dim3 cudaBlockSize( 256 ), cudaGridSize( Devices::Cuda::getMaxGridSize() );
+   dim3 cudaBlockSize( 256 ), cudaGridSize( Cuda::getMaxGridSize() );
    const IndexType cudaBlocks = roundUpDivision( matrix.getRows(), cudaBlockSize.x );
-   const IndexType cudaGrids = roundUpDivision( cudaBlocks, Devices::Cuda::getMaxGridSize() );
+   const IndexType cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() );
    for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) {
       if( gridIdx == cudaGrids - 1 )
-         cudaGridSize.x = cudaBlocks % Devices::Cuda::getMaxGridSize();
+         cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize();
       setCudaTestMatrixKernel< Matrix >
          <<< cudaGridSize, cudaBlockSize >>>
          ( &kernel_matrix.template modifyData< Devices::Cuda >(), elementsPerRow, gridIdx );
diff --git a/src/Benchmarks/Benchmarks.h b/src/Benchmarks/Benchmarks.h
index 683a18376276c4b0dbd194329226e9a517a4af12..3822fef28eec53e47a22c566a34c651b248daab5 100644
--- a/src/Benchmarks/Benchmarks.h
+++ b/src/Benchmarks/Benchmarks.h
@@ -24,7 +24,7 @@
 
 #include <TNL/Devices/Host.h>
 #include <TNL/Devices/SystemInfo.h>
-#include <TNL/Devices/CudaDeviceInfo.h>
+#include <TNL/Cuda/DeviceInfo.h>
 #include <TNL/Config/ConfigDescription.h>
 #include <TNL/Communicators/MpiCommunicator.h>
 
@@ -339,9 +339,9 @@ Benchmark::MetadataMap getHardwareMetadata()
                        + convertToString( cacheSizes.L2 ) + ", "
                        + convertToString( cacheSizes.L3 );
 #ifdef HAVE_CUDA
-   const int activeGPU = Devices::CudaDeviceInfo::getActiveDevice();
-   const String deviceArch = convertToString( Devices::CudaDeviceInfo::getArchitectureMajor( activeGPU ) ) + "." +
-                             convertToString( Devices::CudaDeviceInfo::getArchitectureMinor( activeGPU ) );
+   const int activeGPU = Cuda::DeviceInfo::getActiveDevice();
+   const String deviceArch = convertToString( Cuda::DeviceInfo::getArchitectureMajor( activeGPU ) ) + "." +
+                             convertToString( Cuda::DeviceInfo::getArchitectureMinor( activeGPU ) );
 #endif
    Benchmark::MetadataMap metadata {
        { "host name", Devices::SystemInfo::getHostname() },
@@ -362,13 +362,13 @@ Benchmark::MetadataMap getHardwareMetadata()
        { "CPU max frequency (MHz)", convertToString( Devices::SystemInfo::getCPUMaxFrequency( cpu_id ) / 1e3 ) },
        { "CPU cache sizes (L1d, L1i, L2, L3) (kiB)", cacheInfo },
 #ifdef HAVE_CUDA
-       { "GPU name", Devices::CudaDeviceInfo::getDeviceName( activeGPU ) },
+       { "GPU name", Cuda::DeviceInfo::getDeviceName( activeGPU ) },
        { "GPU architecture", deviceArch },
-       { "GPU CUDA cores", convertToString( Devices::CudaDeviceInfo::getCudaCores( activeGPU ) ) },
-       { "GPU clock rate (MHz)", convertToString( (double) Devices::CudaDeviceInfo::getClockRate( activeGPU ) / 1e3 ) },
-       { "GPU global memory (GB)", convertToString( (double) Devices::CudaDeviceInfo::getGlobalMemory( activeGPU ) / 1e9 ) },
-       { "GPU memory clock rate (MHz)", convertToString( (double) Devices::CudaDeviceInfo::getMemoryClockRate( activeGPU ) / 1e3 ) },
-       { "GPU memory ECC enabled", convertToString( Devices::CudaDeviceInfo::getECCEnabled( activeGPU ) ) },
+       { "GPU CUDA cores", convertToString( Cuda::DeviceInfo::getCudaCores( activeGPU ) ) },
+       { "GPU clock rate (MHz)", convertToString( (double) Cuda::DeviceInfo::getClockRate( activeGPU ) / 1e3 ) },
+       { "GPU global memory (GB)", convertToString( (double) Cuda::DeviceInfo::getGlobalMemory( activeGPU ) / 1e9 ) },
+       { "GPU memory clock rate (MHz)", convertToString( (double) Cuda::DeviceInfo::getMemoryClockRate( activeGPU ) / 1e3 ) },
+       { "GPU memory ECC enabled", convertToString( Cuda::DeviceInfo::getECCEnabled( activeGPU ) ) },
 #endif
    };
 
diff --git a/src/Benchmarks/HeatEquation/HeatEquationBenchmarkProblem_impl.h b/src/Benchmarks/HeatEquation/HeatEquationBenchmarkProblem_impl.h
index 53cd0ec362a028b6ab5f30ce55cf489d4eb5c9f0..e3f47292306bbb879bf2677223f2f4b63cc7513c 100644
--- a/src/Benchmarks/HeatEquation/HeatEquationBenchmarkProblem_impl.h
+++ b/src/Benchmarks/HeatEquation/HeatEquationBenchmarkProblem_impl.h
@@ -82,9 +82,9 @@ setup( const Config::ParameterContainer& parameters,
 
    if( std::is_same< DeviceType, Devices::Cuda >::value )
    {
-      this->cudaBoundaryConditions = Devices::Cuda::passToDevice( *this->boundaryConditionPointer );
-      this->cudaRightHandSide = Devices::Cuda::passToDevice( *this->rightHandSidePointer );
-      this->cudaDifferentialOperator = Devices::Cuda::passToDevice( *this->differentialOperatorPointer );
+      this->cudaBoundaryConditions = Cuda::passToDevice( *this->boundaryConditionPointer );
+      this->cudaRightHandSide = Cuda::passToDevice( *this->rightHandSidePointer );
+      this->cudaDifferentialOperator = Cuda::passToDevice( *this->differentialOperatorPointer );
    }
    this->explicitUpdater.setDifferentialOperator( this->differentialOperatorPointer );
    this->explicitUpdater.setBoundaryConditions( this->boundaryConditionPointer );
@@ -266,8 +266,8 @@ boundaryConditionsTemplatedCompact( const GridType* grid,
 {
    typename GridType::CoordinatesType coordinates;
 
-   coordinates.x() = begin.x() + ( gridXIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
-   coordinates.y() = begin.y() + ( gridYIdx * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y;        
+   coordinates.x() = begin.x() + ( gridXIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   coordinates.y() = begin.y() + ( gridYIdx * Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y;        
 
    if( coordinates.x() < end.x() &&
        coordinates.y() < end.y() )
@@ -357,8 +357,8 @@ heatEquationTemplatedCompact( const GridType* grid,
    typedef typename GridType::IndexType IndexType;
    typedef typename GridType::RealType RealType;
 
-   coordinates.x() = begin.x() + ( gridXIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
-   coordinates.y() = begin.y() + ( gridYIdx * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y;     
+   coordinates.x() = begin.x() + ( gridXIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   coordinates.y() = begin.y() + ( gridYIdx * Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y;     
       
    MeshFunction& u = *_u;
    MeshFunction& fu = *_fu;
@@ -483,10 +483,10 @@ getExplicitUpdate( const RealType& time,
          CellType cell( mesh.template getData< DeviceType >() );
          dim3 cudaBlockSize( 16, 16 );
          dim3 cudaBlocks;
-         cudaBlocks.x = Devices::Cuda::getNumberOfBlocks( end.x() - begin.x() + 1, cudaBlockSize.x );
-         cudaBlocks.y = Devices::Cuda::getNumberOfBlocks( end.y() - begin.y() + 1, cudaBlockSize.y );
-         const IndexType cudaXGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks.x );
-         const IndexType cudaYGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks.y );
+         cudaBlocks.x = Cuda::getNumberOfBlocks( end.x() - begin.x() + 1, cudaBlockSize.x );
+         cudaBlocks.y = Cuda::getNumberOfBlocks( end.y() - begin.y() + 1, cudaBlockSize.y );
+         const IndexType cudaXGrids = Cuda::getNumberOfGrids( cudaBlocks.x );
+         const IndexType cudaYGrids = Cuda::getNumberOfGrids( cudaBlocks.y );
          
          //std::cerr << "Setting boundary conditions..." << std::endl;
 
@@ -762,10 +762,10 @@ template< typename Mesh,
 HeatEquationBenchmarkProblem< Mesh, BoundaryCondition, RightHandSide, DifferentialOperator, Communicator >::
 ~HeatEquationBenchmarkProblem()
 {
-   if( this->cudaMesh ) Devices::Cuda::freeFromDevice( this->cudaMesh );
-   if( this->cudaBoundaryConditions )  Devices::Cuda::freeFromDevice( this->cudaBoundaryConditions );
-   if( this->cudaRightHandSide ) Devices::Cuda::freeFromDevice( this->cudaRightHandSide );
-   if( this->cudaDifferentialOperator ) Devices::Cuda::freeFromDevice( this->cudaDifferentialOperator );
+   if( this->cudaMesh ) Cuda::freeFromDevice( this->cudaMesh );
+   if( this->cudaBoundaryConditions )  Cuda::freeFromDevice( this->cudaBoundaryConditions );
+   if( this->cudaRightHandSide ) Cuda::freeFromDevice( this->cudaRightHandSide );
+   if( this->cudaDifferentialOperator ) Cuda::freeFromDevice( this->cudaDifferentialOperator );
 }
 
 
diff --git a/src/Benchmarks/HeatEquation/Tuning/GridTraverser.h b/src/Benchmarks/HeatEquation/Tuning/GridTraverser.h
index cdbc4922ca07eda7da0ba442340705f6646d8430..7e7e5369182ebc579ab98da55bbacef872284edf 100644
--- a/src/Benchmarks/HeatEquation/Tuning/GridTraverser.h
+++ b/src/Benchmarks/HeatEquation/Tuning/GridTraverser.h
@@ -12,7 +12,7 @@
 
 #include <TNL/Meshes/Grid.h>
 #include <TNL/Pointers/SharedPointer.h>
-#include <TNL/CudaStreamPool.h>
+#include <TNL/Cuda/StreamPool.h>
 
 namespace TNL {
 
diff --git a/src/Benchmarks/HeatEquation/Tuning/GridTraverser_impl.h b/src/Benchmarks/HeatEquation/Tuning/GridTraverser_impl.h
index 816ee5e2c4724137118d221a7487f75651bbc8ac..2a77f8bb5287a75cee1b38356c96dd49dbff5250 100644
--- a/src/Benchmarks/HeatEquation/Tuning/GridTraverser_impl.h
+++ b/src/Benchmarks/HeatEquation/Tuning/GridTraverser_impl.h
@@ -126,8 +126,8 @@ _GridTraverser2D(
    typedef Meshes::Grid< 2, Real, Devices::Cuda, Index > GridType;
    typename GridType::CoordinatesType coordinates;
 
-   coordinates.x() = begin.x() + Devices::Cuda::getGlobalThreadIdx_x( gridIdx );
-   coordinates.y() = begin.y() + Devices::Cuda::getGlobalThreadIdx_y( gridIdx );
+   coordinates.x() = begin.x() + Cuda::getGlobalThreadIdx_x( gridIdx );
+   coordinates.y() = begin.y() + Cuda::getGlobalThreadIdx_y( gridIdx );
    
    if( coordinates <= end )
    {
@@ -173,7 +173,7 @@ _GridTraverser2DBoundary(
    Index entitiesAlongX = endX - beginX + 1;
    Index entitiesAlongY = endY - beginY;
    
-   Index threadId = Devices::Cuda::getGlobalThreadIdx_x( gridIdx );
+   Index threadId = Cuda::getGlobalThreadIdx_x( gridIdx );
    if( threadId < entitiesAlongX )
    {
       GridEntity entity( *grid, 
@@ -244,12 +244,12 @@ processEntities(
       dim3 cudaBlockSize( 256 );      
       dim3 cudaBlocksCount, cudaGridsCount;
       IndexType cudaThreadsCount = 2 * ( end.x() - begin.x() + end.y() - begin.y() + 1 );
-      Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCount, cudaGridsCount, cudaThreadsCount );
+      Cuda::setupThreads( cudaBlockSize, cudaBlocksCount, cudaGridsCount, cudaThreadsCount );
       dim3 gridIdx, cudaGridSize;
       Devices::Cuda::synchronizeDevice();
       for( gridIdx.x = 0; gridIdx.x < cudaGridsCount.x; gridIdx.x++ )
       {
-         Devices::Cuda::setupGrid( cudaBlocksCount, cudaGridsCount, gridIdx, cudaGridSize );
+         Cuda::setupGrid( cudaBlocksCount, cudaGridsCount, gridIdx, cudaGridSize );
          _GridTraverser2DBoundary< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
                <<< cudaGridSize, cudaBlockSize >>>
                ( &gridPointer.template getData< Devices::Cuda >(),
@@ -266,11 +266,11 @@ processEntities(
    {
       dim3 cudaBlockSize( 16, 16 );
       dim3 cudaBlocksCount, cudaGridsCount;
-      Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCount, cudaGridsCount,
-                                   end.x() - begin.x() + 1,
-                                   end.y() - begin.y() + 1 );
+      Cuda::setupThreads( cudaBlockSize, cudaBlocksCount, cudaGridsCount,
+                          end.x() - begin.x() + 1,
+                          end.y() - begin.y() + 1 );
       
-      auto& pool = CudaStreamPool::getInstance();
+      auto& pool = Cuda::StreamPool::getInstance();
       const cudaStream_t& s = pool.getStream( stream );
 
       Devices::Cuda::synchronizeDevice();
@@ -278,8 +278,8 @@ processEntities(
       for( gridIdx.y = 0; gridIdx.y < cudaGridsCount.y; gridIdx.y ++ )
          for( gridIdx.x = 0; gridIdx.x < cudaGridsCount.x; gridIdx.x ++ )
          {
-            Devices::Cuda::setupGrid( cudaBlocksCount, cudaGridsCount, gridIdx, cudaGridSize );
-	    //Devices::Cuda::printThreadsSetup( cudaBlockSize, cudaBlocksCount, cudaGridSize, cudaGridsCount );
+            Cuda::setupGrid( cudaBlocksCount, cudaGridsCount, gridIdx, cudaGridSize );
+	    //Cuda::printThreadsSetup( cudaBlockSize, cudaBlocksCount, cudaGridSize, cudaGridsCount );
             TNL::_GridTraverser2D< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
                <<< cudaGridSize, cudaBlockSize, 0, s >>>
                ( &gridPointer.template getData< Devices::Cuda >(),
diff --git a/src/Benchmarks/ODESolvers/Euler.hpp b/src/Benchmarks/ODESolvers/Euler.hpp
index 22c0130419123500981c4b343e3b53e2186779bc..ab975ed078c470f4824d18e7848033e6fed73f2c 100644
--- a/src/Benchmarks/ODESolvers/Euler.hpp
+++ b/src/Benchmarks/ODESolvers/Euler.hpp
@@ -176,10 +176,10 @@ void Euler< Problem, SolverMonitor >::computeNewTimeLevel( DofVectorPointer& u,
    {
 #ifdef HAVE_CUDA
       dim3 cudaBlockSize( 512 );
-      const IndexType cudaBlocks = Devices::Cuda::getNumberOfBlocks( size, cudaBlockSize.x );
-      const IndexType cudaGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks );
-      this->cudaBlockResidue.setSize( min( cudaBlocks, Devices::Cuda::getMaxGridSize() ) );
-      const IndexType threadsPerGrid = Devices::Cuda::getMaxGridSize() * cudaBlockSize.x;
+      const IndexType cudaBlocks = Cuda::getNumberOfBlocks( size, cudaBlockSize.x );
+      const IndexType cudaGrids = Cuda::getNumberOfGrids( cudaBlocks );
+      this->cudaBlockResidue.setSize( min( cudaBlocks, Cuda::getMaxGridSize() ) );
+      const IndexType threadsPerGrid = Cuda::getMaxGridSize() * cudaBlockSize.x;
 
       localResidue = 0.0;
       for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx ++ )
@@ -187,7 +187,7 @@ void Euler< Problem, SolverMonitor >::computeNewTimeLevel( DofVectorPointer& u,
          const IndexType sharedMemory = cudaBlockSize.x * sizeof( RealType );
          const IndexType gridOffset = gridIdx * threadsPerGrid;
          const IndexType currentSize = min( size - gridOffset, threadsPerGrid );
-         const IndexType currentGridSize = Devices::Cuda::getNumberOfBlocks( currentSize, cudaBlockSize.x );
+         const IndexType currentGridSize = Cuda::getNumberOfBlocks( currentSize, cudaBlockSize.x );
 
          updateUEuler<<< currentGridSize, cudaBlockSize, sharedMemory >>>( currentSize,
                                                                       tau,
diff --git a/src/Benchmarks/ODESolvers/Merson.hpp b/src/Benchmarks/ODESolvers/Merson.hpp
index 1ea606c4c233e17b88bd6ba55ef0cfc593e91859..3c74bdf480c66025f44de201e18ad5368d050e6f 100644
--- a/src/Benchmarks/ODESolvers/Merson.hpp
+++ b/src/Benchmarks/ODESolvers/Merson.hpp
@@ -290,10 +290,10 @@ void Merson< Problem, SolverMonitor >::computeKFunctions( DofVectorPointer& u,
    {
 #ifdef HAVE_CUDA
       dim3 cudaBlockSize( 512 );
-      const IndexType cudaBlocks = Devices::Cuda::getNumberOfBlocks( size, cudaBlockSize.x );
-      const IndexType cudaGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks );
-      this->cudaBlockResidue.setSize( min( cudaBlocks, Devices::Cuda::getMaxGridSize() ) );
-      const IndexType threadsPerGrid = Devices::Cuda::getMaxGridSize() * cudaBlockSize.x;
+      const IndexType cudaBlocks = Cuda::getNumberOfBlocks( size, cudaBlockSize.x );
+      const IndexType cudaGrids = Cuda::getNumberOfGrids( cudaBlocks );
+      this->cudaBlockResidue.setSize( min( cudaBlocks, Cuda::getMaxGridSize() ) );
+      const IndexType threadsPerGrid = Cuda::getMaxGridSize() * cudaBlockSize.x;
 
       this->problem->getExplicitUpdate( time, tau, u, k1 );
       cudaDeviceSynchronize();
@@ -384,10 +384,10 @@ typename Problem :: RealType Merson< Problem, SolverMonitor >::computeError( con
    {
 #ifdef HAVE_CUDA
       dim3 cudaBlockSize( 512 );
-      const IndexType cudaBlocks = Devices::Cuda::getNumberOfBlocks( size, cudaBlockSize.x );
-      const IndexType cudaGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks );
-      this->cudaBlockResidue.setSize( min( cudaBlocks, Devices::Cuda::getMaxGridSize() ) );
-      const IndexType threadsPerGrid = Devices::Cuda::getMaxGridSize() * cudaBlockSize.x;
+      const IndexType cudaBlocks = Cuda::getNumberOfBlocks( size, cudaBlockSize.x );
+      const IndexType cudaGrids = Cuda::getNumberOfGrids( cudaBlocks );
+      this->cudaBlockResidue.setSize( min( cudaBlocks, Cuda::getMaxGridSize() ) );
+      const IndexType threadsPerGrid = Cuda::getMaxGridSize() * cudaBlockSize.x;
 
       for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx ++ )
       {
@@ -439,10 +439,10 @@ void Merson< Problem, SolverMonitor >::computeNewTimeLevel( const RealType time,
    {
 #ifdef HAVE_CUDA
       dim3 cudaBlockSize( 512 );
-      const IndexType cudaBlocks = Devices::Cuda::getNumberOfBlocks( size, cudaBlockSize.x );
-      const IndexType cudaGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks );
-      this->cudaBlockResidue.setSize( min( cudaBlocks, Devices::Cuda::getMaxGridSize() ) );
-      const IndexType threadsPerGrid = Devices::Cuda::getMaxGridSize() * cudaBlockSize.x;
+      const IndexType cudaBlocks = Cuda::getNumberOfBlocks( size, cudaBlockSize.x );
+      const IndexType cudaGrids = Cuda::getNumberOfGrids( cudaBlocks );
+      this->cudaBlockResidue.setSize( min( cudaBlocks, Cuda::getMaxGridSize() ) );
+      const IndexType threadsPerGrid = Cuda::getMaxGridSize() * cudaBlockSize.x;
 
       localResidue = 0.0;
       for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx ++ )
diff --git a/src/Examples/simple-examples/large-meshfunction-example.h b/src/Examples/simple-examples/large-meshfunction-example.h
index 2f9c70b859606f6f02b3689e9cd74e6e87edd7ae..d5520b69e1af3ce54952d49c463c9b85d345903a 100644
--- a/src/Examples/simple-examples/large-meshfunction-example.h
+++ b/src/Examples/simple-examples/large-meshfunction-example.h
@@ -10,7 +10,6 @@ using namespace TNL;
 using namespace TNL::Containers;
 using namespace TNL::Meshes;
 using namespace TNL::Functions;
-using namespace TNL::Devices;
 
 int main(int argc, char ** argv)
 {
@@ -28,9 +27,9 @@ int main(int argc, char ** argv)
     time.start();
 
 #ifdef HAVE_CUDA
-    using Device=Cuda;
+    using Device=Devices::Cuda;
 #else
-    using Device=Host;
+    using Device=Devices::Host;
 #endif
 
   using MeshType= Grid<2, double,Device,int>;
diff --git a/src/TNL/Allocators/Cuda.h b/src/TNL/Allocators/Cuda.h
index 74ebb840432136d9033a17a86684607098a80d86..1b648f1ce3818978e086f26e64128536f40a8806 100644
--- a/src/TNL/Allocators/Cuda.h
+++ b/src/TNL/Allocators/Cuda.h
@@ -12,7 +12,9 @@
 
 #pragma once
 
-#include <TNL/Devices/Cuda.h>
+#include <TNL/Exceptions/CudaBadAlloc.h>
+#include <TNL/Exceptions/CudaSupportMissing.h>
+#include <TNL/Cuda/CheckDevice.h>
 
 namespace TNL {
 namespace Allocators {
diff --git a/src/TNL/Allocators/CudaHost.h b/src/TNL/Allocators/CudaHost.h
index 284c91fe9b8dbc7abe8e3d4685ef1d7551d19a89..9047e0b9af632b9f6fd466352d2cd3659f67210a 100644
--- a/src/TNL/Allocators/CudaHost.h
+++ b/src/TNL/Allocators/CudaHost.h
@@ -12,7 +12,9 @@
 
 #pragma once
 
-#include <TNL/Devices/Cuda.h>
+#include <TNL/Exceptions/CudaBadAlloc.h>
+#include <TNL/Exceptions/CudaSupportMissing.h>
+#include <TNL/Cuda/CheckDevice.h>
 
 namespace TNL {
 namespace Allocators {
diff --git a/src/TNL/Allocators/CudaManaged.h b/src/TNL/Allocators/CudaManaged.h
index db29f86cb618bf79e4f1c0fa0ac1ad2750d476bc..bb878ca66bef97491c6db407128c7c3322fdce7a 100644
--- a/src/TNL/Allocators/CudaManaged.h
+++ b/src/TNL/Allocators/CudaManaged.h
@@ -12,7 +12,9 @@
 
 #pragma once
 
-#include <TNL/Devices/Cuda.h>
+#include <TNL/Exceptions/CudaBadAlloc.h>
+#include <TNL/Exceptions/CudaSupportMissing.h>
+#include <TNL/Cuda/CheckDevice.h>
 
 namespace TNL {
 namespace Allocators {
diff --git a/src/TNL/Assert.h b/src/TNL/Assert.h
index 3d91c8c763b8c9532713f90beee3a02fd2c64b6a..df862956219ac03e1d1e1fa27e1c67a0e1035ad5 100644
--- a/src/TNL/Assert.h
+++ b/src/TNL/Assert.h
@@ -120,7 +120,7 @@
 #include <iostream>
 #include <stdio.h>
 
-#include <TNL/Devices/CudaCallable.h>
+#include <TNL/Cuda/CudaCallable.h>
 #include <TNL/Debugging/StackBacktrace.h>
 
 namespace TNL {
diff --git a/src/TNL/Communicators/MpiCommunicator.h b/src/TNL/Communicators/MpiCommunicator.h
index 926fa329a6e88b3d84406f464aa3e82057e3ef24..0aa14a9ece5d518ce5e142898cea56593d375189 100644
--- a/src/TNL/Communicators/MpiCommunicator.h
+++ b/src/TNL/Communicators/MpiCommunicator.h
@@ -24,7 +24,7 @@
 #include <unistd.h>  // getpid
 
 #ifdef HAVE_CUDA
-    #include <TNL/Devices/Cuda.h>
+    #include <TNL/Cuda/CheckDevice.h>
 
     typedef struct __attribute__((__packed__))  {
        char name[MPI_MAX_PROCESSOR_NAME];
diff --git a/src/TNL/Containers/Algorithms/CudaMultireductionKernel.h b/src/TNL/Containers/Algorithms/CudaMultireductionKernel.h
index e67c11b419dca3ae83706d43c56f5f282aba4beb..c97e0a8aa8f12354b9dbe84e2858fafe235881b2 100644
--- a/src/TNL/Containers/Algorithms/CudaMultireductionKernel.h
+++ b/src/TNL/Containers/Algorithms/CudaMultireductionKernel.h
@@ -14,7 +14,8 @@
 
 #include <TNL/Assert.h>
 #include <TNL/Math.h>
-#include <TNL/Devices/CudaDeviceInfo.h>
+#include <TNL/Cuda/DeviceInfo.h>
+#include <TNL/Cuda/SharedMemory.h>
 #include <TNL/Containers/Algorithms/CudaReductionBuffer.h>
 #include <TNL/Exceptions/CudaSupportMissing.h>
 
@@ -52,7 +53,7 @@ CudaMultireductionKernel( const Result zero,
                           const int n,
                           Result* output )
 {
-   Result* sdata = Devices::Cuda::getSharedMemory< Result >();
+   Result* sdata = Cuda::getSharedMemory< Result >();
 
    // Get the thread id (tid), global thread id (gid) and gridSize.
    const Index tid = threadIdx.y * blockDim.x + threadIdx.x;
@@ -160,10 +161,10 @@ CudaMultireductionKernelLauncher( const Result zero,
    // where blocksPerMultiprocessor is determined according to the number of
    // available registers on the multiprocessor.
    // On Tesla K40c, desGridSize = 8 * 15 = 120.
-   const int activeDevice = Devices::CudaDeviceInfo::getActiveDevice();
-   const int blocksdPerMultiprocessor = Devices::CudaDeviceInfo::getRegistersPerMultiprocessor( activeDevice )
+   const int activeDevice = Cuda::DeviceInfo::getActiveDevice();
+   const int blocksdPerMultiprocessor = Cuda::DeviceInfo::getRegistersPerMultiprocessor( activeDevice )
                                       / ( Multireduction_maxThreadsPerBlock * Multireduction_registersPerThread );
-   const int desGridSizeX = blocksdPerMultiprocessor * Devices::CudaDeviceInfo::getCudaMultiprocessors( activeDevice );
+   const int desGridSizeX = blocksdPerMultiprocessor * Cuda::DeviceInfo::getCudaMultiprocessors( activeDevice );
    dim3 blockSize, gridSize;
 
    // version A: max 16 rows of threads
@@ -189,10 +190,10 @@ CudaMultireductionKernelLauncher( const Result zero,
    while( blockSize.x * blockSize.y > Multireduction_maxThreadsPerBlock )
       blockSize.x /= 2;
 
-   gridSize.x = TNL::min( Devices::Cuda::getNumberOfBlocks( size, blockSize.x ), desGridSizeX );
-   gridSize.y = Devices::Cuda::getNumberOfBlocks( n, blockSize.y );
+   gridSize.x = TNL::min( Cuda::getNumberOfBlocks( size, blockSize.x ), desGridSizeX );
+   gridSize.y = Cuda::getNumberOfBlocks( n, blockSize.y );
 
-   if( gridSize.y > (unsigned) Devices::Cuda::getMaxGridSize() ) {
+   if( gridSize.y > (unsigned) Cuda::getMaxGridSize() ) {
       std::cerr << "Maximum gridSize.y limit exceeded (limit is 65535, attempted " << gridSize.y << ")." << std::endl;
       throw 1;
    }
diff --git a/src/TNL/Containers/Algorithms/CudaReductionBuffer.h b/src/TNL/Containers/Algorithms/CudaReductionBuffer.h
index 2897c7280a6bc61f9b60a9cb3c7b44a94ad20de3..f873d7815157d9703364622f5d814a2a516af6bf 100644
--- a/src/TNL/Containers/Algorithms/CudaReductionBuffer.h
+++ b/src/TNL/Containers/Algorithms/CudaReductionBuffer.h
@@ -14,7 +14,7 @@
 
 #include <stdlib.h>
 
-#include <TNL/Devices/Cuda.h>
+#include <TNL/Cuda/CheckDevice.h>
 #include <TNL/Exceptions/CudaBadAlloc.h>
 #include <TNL/Exceptions/CudaSupportMissing.h>
 
diff --git a/src/TNL/Containers/Algorithms/CudaReductionKernel.h b/src/TNL/Containers/Algorithms/CudaReductionKernel.h
index 36bd5c88b5794c18f1bb9688cee9690c115e50d4..3e948a90673069cae11c336a7c5e75e9e13d5807 100644
--- a/src/TNL/Containers/Algorithms/CudaReductionKernel.h
+++ b/src/TNL/Containers/Algorithms/CudaReductionKernel.h
@@ -14,7 +14,8 @@
 
 #include <TNL/Assert.h>
 #include <TNL/Math.h>
-#include <TNL/Devices/CudaDeviceInfo.h>
+#include <TNL/Cuda/DeviceInfo.h>
+#include <TNL/Cuda/SharedMemory.h>
 #include <TNL/Containers/Algorithms/CudaReductionBuffer.h>
 #include <TNL/Containers/Algorithms/ArrayOperations.h>
 #include <TNL/Exceptions/CudaSupportMissing.h>
@@ -52,7 +53,7 @@ CudaReductionKernel( const Result zero,
                      const Index size,
                      Result* output )
 {
-   Result* sdata = Devices::Cuda::getSharedMemory< Result >();
+   Result* sdata = Cuda::getSharedMemory< Result >();
 
    // Get the thread id (tid), global thread id (gid) and gridSize.
    const Index tid = threadIdx.x;
@@ -147,7 +148,7 @@ CudaReductionWithArgumentKernel( const Result zero,
                                  Index* idxOutput,
                                  const Index* idxInput = nullptr )
 {
-   Result* sdata = Devices::Cuda::getSharedMemory< Result >();
+   Result* sdata = Cuda::getSharedMemory< Result >();
    Index* sidx = reinterpret_cast< Index* >( &sdata[ blockDim.x ] );
 
    // Get the thread id (tid), global thread id (gid) and gridSize.
@@ -282,11 +283,11 @@ struct CudaReductionKernelLauncher
    // It seems to be better to map only one CUDA block per one multiprocessor or maybe
    // just slightly more. Therefore we omit blocksdPerMultiprocessor in the following.
    CudaReductionKernelLauncher( const Index size )
-   : activeDevice( Devices::CudaDeviceInfo::getActiveDevice() ),
-     blocksdPerMultiprocessor( Devices::CudaDeviceInfo::getRegistersPerMultiprocessor( activeDevice )
+   : activeDevice( Cuda::DeviceInfo::getActiveDevice() ),
+     blocksdPerMultiprocessor( Cuda::DeviceInfo::getRegistersPerMultiprocessor( activeDevice )
                                / ( Reduction_maxThreadsPerBlock * Reduction_registersPerThread ) ),
-     //desGridSize( blocksdPerMultiprocessor * Devices::CudaDeviceInfo::getCudaMultiprocessors( activeDevice ) ),
-     desGridSize( Devices::CudaDeviceInfo::getCudaMultiprocessors( activeDevice ) ),
+     //desGridSize( blocksdPerMultiprocessor * Cuda::DeviceInfo::getCudaMultiprocessors( activeDevice ) ),
+     desGridSize( Cuda::DeviceInfo::getCudaMultiprocessors( activeDevice ) ),
      originalSize( size )
    {
    }
@@ -402,7 +403,7 @@ struct CudaReductionKernelLauncher
 #ifdef HAVE_CUDA
          dim3 blockSize, gridSize;
          blockSize.x = Reduction_maxThreadsPerBlock;
-         gridSize.x = TNL::min( Devices::Cuda::getNumberOfBlocks( size, blockSize.x ), desGridSize );
+         gridSize.x = TNL::min( Cuda::getNumberOfBlocks( size, blockSize.x ), desGridSize );
 
          // when there is only one warp per blockSize.x, we need to allocate two warps
          // worth of shared memory so that we don't index shared memory out of bounds
@@ -510,7 +511,7 @@ struct CudaReductionKernelLauncher
 #ifdef HAVE_CUDA
          dim3 blockSize, gridSize;
          blockSize.x = Reduction_maxThreadsPerBlock;
-         gridSize.x = TNL::min( Devices::Cuda::getNumberOfBlocks( size, blockSize.x ), desGridSize );
+         gridSize.x = TNL::min( Cuda::getNumberOfBlocks( size, blockSize.x ), desGridSize );
 
          // when there is only one warp per blockSize.x, we need to allocate two warps
          // worth of shared memory so that we don't index shared memory out of bounds
diff --git a/src/TNL/Containers/Algorithms/CudaScanKernel.h b/src/TNL/Containers/Algorithms/CudaScanKernel.h
index a8c3548757668df966b094a9da19e37b88ab7ed8..5b20164390b53d38034cc5b4b278eff417c2142d 100644
--- a/src/TNL/Containers/Algorithms/CudaScanKernel.h
+++ b/src/TNL/Containers/Algorithms/CudaScanKernel.h
@@ -13,7 +13,7 @@
 #include <iostream>
 
 #include <TNL/Math.h>
-#include <TNL/Devices/Cuda.h>
+#include <TNL/Cuda/SharedMemory.h>
 #include <TNL/Exceptions/CudaBadAlloc.h>
 #include <TNL/Containers/Array.h>
 
@@ -36,8 +36,8 @@ cudaFirstPhaseBlockScan( const ScanType scanType,
                          Real* output,
                          Real* auxArray )
 {
-   Real* sharedData = TNL::Devices::Cuda::getSharedMemory< Real >();
-   Real* auxData = &sharedData[ elementsInBlock + elementsInBlock / Devices::Cuda::getNumberOfSharedMemoryBanks() + 2 ];
+   Real* sharedData = TNL::Cuda::getSharedMemory< Real >();
+   Real* auxData = &sharedData[ elementsInBlock + elementsInBlock / Cuda::getNumberOfSharedMemoryBanks() + 2 ];
    Real* warpSums = &auxData[ blockDim.x ];
 
    const Index lastElementIdx = size - blockIdx.x * elementsInBlock;
@@ -54,7 +54,7 @@ cudaFirstPhaseBlockScan( const ScanType scanType,
          sharedData[ 0 ] = zero;
       while( idx < elementsInBlock && blockOffset + idx < size )
       {
-         sharedData[ Devices::Cuda::getInterleaving( idx + 1 ) ] = input[ blockOffset + idx ];
+         sharedData[ Cuda::getInterleaving( idx + 1 ) ] = input[ blockOffset + idx ];
          idx += blockDim.x;
       }
    }
@@ -62,7 +62,7 @@ cudaFirstPhaseBlockScan( const ScanType scanType,
    {
       while( idx < elementsInBlock && blockOffset + idx < size )
       {
-         sharedData[ Devices::Cuda::getInterleaving( idx ) ] = input[ blockOffset + idx ];
+         sharedData[ Cuda::getInterleaving( idx ) ] = input[ blockOffset + idx ];
          idx += blockDim.x;
       }
    }
@@ -78,33 +78,33 @@ cudaFirstPhaseBlockScan( const ScanType scanType,
    if( chunkOffset < lastElementInBlock )
    {
       auxData[ threadIdx.x ] =
-         sharedData[ Devices::Cuda::getInterleaving( chunkOffset ) ];
+         sharedData[ Cuda::getInterleaving( chunkOffset ) ];
    }
 
    int chunkPointer = 1;
    while( chunkPointer < chunkSize &&
           chunkOffset + chunkPointer < lastElementInBlock )
    {
-      sharedData[ Devices::Cuda::getInterleaving( chunkOffset + chunkPointer ) ] =
-         reduction( sharedData[ Devices::Cuda::getInterleaving( chunkOffset + chunkPointer ) ],
-                    sharedData[ Devices::Cuda::getInterleaving( chunkOffset + chunkPointer - 1 ) ] );
+      sharedData[ Cuda::getInterleaving( chunkOffset + chunkPointer ) ] =
+         reduction( sharedData[ Cuda::getInterleaving( chunkOffset + chunkPointer ) ],
+                    sharedData[ Cuda::getInterleaving( chunkOffset + chunkPointer - 1 ) ] );
       auxData[ threadIdx.x ] =
-         sharedData[ Devices::Cuda::getInterleaving( chunkOffset + chunkPointer ) ];
+         sharedData[ Cuda::getInterleaving( chunkOffset + chunkPointer ) ];
       chunkPointer++;
    }
 
    /***
     *  Perform the parallel prefix-sum inside warps.
     */
-   const int threadInWarpIdx = threadIdx.x % Devices::Cuda::getWarpSize();
-   const int warpIdx = threadIdx.x / Devices::Cuda::getWarpSize();
-   for( int stride = 1; stride < Devices::Cuda::getWarpSize(); stride *= 2 ) {
+   const int threadInWarpIdx = threadIdx.x % Cuda::getWarpSize();
+   const int warpIdx = threadIdx.x / Cuda::getWarpSize();
+   for( int stride = 1; stride < Cuda::getWarpSize(); stride *= 2 ) {
       if( threadInWarpIdx >= stride && threadIdx.x < numberOfChunks )
          auxData[ threadIdx.x ] = reduction( auxData[ threadIdx.x ], auxData[ threadIdx.x - stride ] );
       __syncwarp();
    }
 
-   if( threadInWarpIdx == Devices::Cuda::getWarpSize() - 1 )
+   if( threadInWarpIdx == Cuda::getWarpSize() - 1 )
       warpSums[ warpIdx ] = auxData[ threadIdx.x ];
    __syncthreads();
 
@@ -112,7 +112,7 @@ cudaFirstPhaseBlockScan( const ScanType scanType,
     * Compute prefix-sum of warp sums using one warp
     */
    if( warpIdx == 0 )
-      for( int stride = 1; stride < Devices::Cuda::getWarpSize(); stride *= 2 ) {
+      for( int stride = 1; stride < Cuda::getWarpSize(); stride *= 2 ) {
          if( threadInWarpIdx >= stride )
             warpSums[ threadIdx.x ] = reduction( warpSums[ threadIdx.x ], warpSums[ threadIdx.x - stride ] );
          __syncwarp();
@@ -136,9 +136,9 @@ cudaFirstPhaseBlockScan( const ScanType scanType,
       Real chunkShift( zero );
       if( chunkIdx > 0 )
          chunkShift = auxData[ chunkIdx - 1 ];
-      sharedData[ Devices::Cuda::getInterleaving( idx ) ] =
-         reduction( sharedData[ Devices::Cuda::getInterleaving( idx ) ], chunkShift );
-      output[ blockOffset + idx ] = sharedData[ Devices::Cuda::getInterleaving( idx ) ];
+      sharedData[ Cuda::getInterleaving( idx ) ] =
+         reduction( sharedData[ Cuda::getInterleaving( idx ) ], chunkShift );
+      output[ blockOffset + idx ] = sharedData[ Cuda::getInterleaving( idx ) ];
       idx += blockDim.x;
    }
    __syncthreads();
@@ -147,11 +147,11 @@ cudaFirstPhaseBlockScan( const ScanType scanType,
    {
       if( scanType == ScanType::Exclusive )
       {
-         auxArray[ blockIdx.x ] = reduction( sharedData[ Devices::Cuda::getInterleaving( lastElementInBlock - 1 ) ],
-                                             sharedData[ Devices::Cuda::getInterleaving( lastElementInBlock ) ] );
+         auxArray[ blockIdx.x ] = reduction( sharedData[ Cuda::getInterleaving( lastElementInBlock - 1 ) ],
+                                             sharedData[ Cuda::getInterleaving( lastElementInBlock ) ] );
       }
       else
-         auxArray[ blockIdx.x ] = sharedData[ Devices::Cuda::getInterleaving( lastElementInBlock - 1 ) ];
+         auxArray[ blockIdx.x ] = sharedData[ Cuda::getInterleaving( lastElementInBlock - 1 ) ];
    }
 }
 
@@ -245,7 +245,7 @@ struct CudaScanKernelLauncher
       // compute the number of grids
       const int elementsInBlock = 8 * blockSize;
       const Index numberOfBlocks = roundUpDivision( size, elementsInBlock );
-      const Index numberOfGrids = Devices::Cuda::getNumberOfGrids( numberOfBlocks, maxGridSize() );
+      const Index numberOfGrids = Cuda::getNumberOfGrids( numberOfBlocks, maxGridSize() );
       //std::cerr << "numberOfgrids =  " << numberOfGrids << std::endl;
 
       // allocate array for the block sums
@@ -268,8 +268,8 @@ struct CudaScanKernelLauncher
 
          // run the kernel
          const std::size_t sharedDataSize = elementsInBlock +
-                                            elementsInBlock / Devices::Cuda::getNumberOfSharedMemoryBanks() + 2;
-         const std::size_t sharedMemory = ( sharedDataSize + blockSize + Devices::Cuda::getWarpSize() ) * sizeof( Real );
+                                            elementsInBlock / Cuda::getNumberOfSharedMemoryBanks() + 2;
+         const std::size_t sharedMemory = ( sharedDataSize + blockSize + Cuda::getWarpSize() ) * sizeof( Real );
          cudaFirstPhaseBlockScan<<< cudaGridSize, cudaBlockSize, sharedMemory >>>
             ( scanType,
               reduction,
@@ -330,7 +330,7 @@ struct CudaScanKernelLauncher
       // compute the number of grids
       const int elementsInBlock = 8 * blockSize;
       const Index numberOfBlocks = roundUpDivision( size, elementsInBlock );
-      const Index numberOfGrids = Devices::Cuda::getNumberOfGrids( numberOfBlocks, maxGridSize() );
+      const Index numberOfGrids = Cuda::getNumberOfGrids( numberOfBlocks, maxGridSize() );
 
       // loop over all grids
       for( Index gridIdx = 0; gridIdx < numberOfGrids; gridIdx++ ) {
@@ -369,13 +369,13 @@ struct CudaScanKernelLauncher
     */
    static int& maxGridSize()
    {
-      static int maxGridSize = Devices::Cuda::getMaxGridSize();
+      static int maxGridSize = Cuda::getMaxGridSize();
       return maxGridSize;
    }
 
    static void resetMaxGridSize()
    {
-      maxGridSize() = Devices::Cuda::getMaxGridSize();
+      maxGridSize() = Cuda::getMaxGridSize();
    }
 
    static int& gridsCount()
diff --git a/src/TNL/Containers/Multimaps/EllpackIndexMultimapValues.h b/src/TNL/Containers/Multimaps/EllpackIndexMultimapValues.h
index fe7a0fb380230909be094042f69cf3ddabd24522..9be47980d1dbef78af8891ff50837d70fb851c22 100644
--- a/src/TNL/Containers/Multimaps/EllpackIndexMultimapValues.h
+++ b/src/TNL/Containers/Multimaps/EllpackIndexMultimapValues.h
@@ -13,7 +13,7 @@
 #include <type_traits>
 #include <ostream>
 
-#include <TNL/Devices/Cuda.h>
+#include <TNL/Cuda/CudaCallable.h>
 
 namespace TNL {
 namespace Containers {
diff --git a/src/TNL/Containers/Multimaps/StaticEllpackIndexMultimapValues.h b/src/TNL/Containers/Multimaps/StaticEllpackIndexMultimapValues.h
index 95ffade9fcad3674a7092bc69701e2a2500ab819..efae4f05173b9f0531cc12e96dd5644a5c72fefe 100644
--- a/src/TNL/Containers/Multimaps/StaticEllpackIndexMultimapValues.h
+++ b/src/TNL/Containers/Multimaps/StaticEllpackIndexMultimapValues.h
@@ -13,7 +13,7 @@
 #include <type_traits>
 #include <ostream>
 
-#include <TNL/Devices/Cuda.h>
+#include <TNL/Cuda/CudaCallable.h>
 
 namespace TNL {
 namespace Containers {
diff --git a/src/TNL/Containers/ndarray/SizesHolder.h b/src/TNL/Containers/ndarray/SizesHolder.h
index c3334e19b2c4e1ff5db6706fc25e264106fea691..72d61bf8123349a7d2807d4dd877b01f109d48a1 100644
--- a/src/TNL/Containers/ndarray/SizesHolder.h
+++ b/src/TNL/Containers/ndarray/SizesHolder.h
@@ -13,7 +13,7 @@
 #pragma once
 
 #include <TNL/Assert.h>
-#include <TNL/Devices/CudaCallable.h>
+#include <TNL/Cuda/CudaCallable.h>
 #include <TNL/TemplateStaticFor.h>
 
 #include <TNL/Containers/ndarray/Meta.h>
diff --git a/src/TNL/Cuda/CheckDevice.h b/src/TNL/Cuda/CheckDevice.h
new file mode 100644
index 0000000000000000000000000000000000000000..c857d8dd6ab8129fd2b1cac4e967831207296153
--- /dev/null
+++ b/src/TNL/Cuda/CheckDevice.h
@@ -0,0 +1,40 @@
+/***************************************************************************
+                          CheckDevice.h  -  description
+                             -------------------
+    begin                : Aug 18, 2019
+    copyright            : (C) 2019 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <TNL/Exceptions/CudaRuntimeError.h>
+
+namespace TNL {
+namespace Cuda {
+
+#ifdef HAVE_CUDA
+   /****
+    * I do not know why, but it is more reliable to pass the error code instead
+    * of calling cudaGetLastError() inside the function.
+    * We recommend using the macro 'TNL_CHECK_CUDA_DEVICE' defined below.
+    */
+   inline void checkDevice( const char* file_name, int line, cudaError error )
+   {
+      if( error != cudaSuccess )
+         throw Exceptions::CudaRuntimeError( error, file_name, line );
+   }
+#else
+   inline void checkDevice() {}
+#endif
+
+} // namespace Cuda
+} // namespace TNL
+
+#ifdef HAVE_CUDA
+#define TNL_CHECK_CUDA_DEVICE ::TNL::Cuda::checkDevice( __FILE__, __LINE__, cudaGetLastError() )
+#else
+#define TNL_CHECK_CUDA_DEVICE ::TNL::Cuda::checkDevice()
+#endif
diff --git a/src/TNL/Devices/CudaCallable.h b/src/TNL/Cuda/CudaCallable.h
similarity index 88%
rename from src/TNL/Devices/CudaCallable.h
rename to src/TNL/Cuda/CudaCallable.h
index f63e4e430e01e76df7833001a651e4f220e9bab6..5cd3e8fbbe51abe0bd7dc525c165990b734ef388 100644
--- a/src/TNL/Devices/CudaCallable.h
+++ b/src/TNL/Cuda/CudaCallable.h
@@ -12,9 +12,6 @@
 
 // The __cuda_callable__ macro has to be in a separate header file to avoid
 // infinite loops by the #include directives.
-//
-// For example, the implementation of Devices::Cuda needs TNL_ASSERT_*
-// macros, which need __cuda_callable__ functions.
 
 /***
  * This macro serves for definition of function which are supposed to be called
diff --git a/src/TNL/Cuda/DeviceInfo.h b/src/TNL/Cuda/DeviceInfo.h
new file mode 100644
index 0000000000000000000000000000000000000000..d53b46fecbf45c49d7c9d6723423c6951a456fef
--- /dev/null
+++ b/src/TNL/Cuda/DeviceInfo.h
@@ -0,0 +1,52 @@
+/***************************************************************************
+                          CudaDeviceInfo.h  -  description
+                             -------------------
+    begin                : Jun 21, 2015
+    copyright            : (C) 2007 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <TNL/String.h>
+
+namespace TNL {
+namespace Cuda {
+
+struct DeviceInfo
+{
+   static int getNumberOfDevices();
+
+   static int getActiveDevice();
+
+   static String getDeviceName( int deviceNum );
+
+   static int getArchitectureMajor( int deviceNum );
+
+   static int getArchitectureMinor( int deviceNum );
+
+   static int getClockRate( int deviceNum );
+
+   static std::size_t getGlobalMemory( int deviceNum );
+
+   static std::size_t getFreeGlobalMemory();
+
+   static int getMemoryClockRate( int deviceNum );
+
+   static bool getECCEnabled( int deviceNum );
+
+   static int getCudaMultiprocessors( int deviceNum );
+
+   static int getCudaCoresPerMultiprocessors( int deviceNum );
+
+   static int getCudaCores( int deviceNum );
+
+   static int getRegistersPerMultiprocessor( int deviceNum );
+};
+
+} // namespace Cuda
+} // namespace TNL
+
+#include <TNL/Cuda/DeviceInfo.hpp>
diff --git a/src/TNL/Devices/CudaDeviceInfo_impl.h b/src/TNL/Cuda/DeviceInfo.hpp
similarity index 86%
rename from src/TNL/Devices/CudaDeviceInfo_impl.h
rename to src/TNL/Cuda/DeviceInfo.hpp
index f29ecd8c91493edb538f11a8ced6c9ee6503983a..d10e6f05cbbb391d06e1287ad6f19db30c967ac0 100644
--- a/src/TNL/Devices/CudaDeviceInfo_impl.h
+++ b/src/TNL/Cuda/DeviceInfo.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************
-                          CudaDeviceInfo_impl.h  -  description
+                          DeviceInfo.hpp  -  description
                              -------------------
     begin                : Jun 21, 2015
     copyright            : (C) 2007 by Tomas Oberhuber
@@ -12,14 +12,14 @@
 
 #include <unordered_map>
 
-#include <TNL/Devices/CudaDeviceInfo.h>
+#include <TNL/Cuda/DeviceInfo.h>
 #include <TNL/Exceptions/CudaSupportMissing.h>
 
 namespace TNL {
-namespace Devices {
+namespace Cuda {
 
 inline int
-CudaDeviceInfo::
+DeviceInfo::
 getNumberOfDevices()
 {
 #ifdef HAVE_CUDA
@@ -32,7 +32,7 @@ getNumberOfDevices()
 }
 
 inline int
-CudaDeviceInfo::
+DeviceInfo::
 getActiveDevice()
 {
 #ifdef HAVE_CUDA
@@ -45,7 +45,7 @@ getActiveDevice()
 }
 
 inline String
-CudaDeviceInfo::
+DeviceInfo::
 getDeviceName( int deviceNum )
 {
 #ifdef HAVE_CUDA
@@ -58,7 +58,7 @@ getDeviceName( int deviceNum )
 }
 
 inline int
-CudaDeviceInfo::
+DeviceInfo::
 getArchitectureMajor( int deviceNum )
 {
 #ifdef HAVE_CUDA
@@ -71,7 +71,7 @@ getArchitectureMajor( int deviceNum )
 }
 
 inline int
-CudaDeviceInfo::
+DeviceInfo::
 getArchitectureMinor( int deviceNum )
 {
 #ifdef HAVE_CUDA
@@ -84,7 +84,7 @@ getArchitectureMinor( int deviceNum )
 }
 
 inline int
-CudaDeviceInfo::
+DeviceInfo::
 getClockRate( int deviceNum )
 {
 #ifdef HAVE_CUDA
@@ -96,8 +96,8 @@ getClockRate( int deviceNum )
 #endif
 }
 
-inline size_t
-CudaDeviceInfo::
+inline std::size_t
+DeviceInfo::
 getGlobalMemory( int deviceNum )
 {
 #ifdef HAVE_CUDA
@@ -109,13 +109,13 @@ getGlobalMemory( int deviceNum )
 #endif
 }
 
-inline size_t
-CudaDeviceInfo::
+inline std::size_t
+DeviceInfo::
 getFreeGlobalMemory()
 {
 #ifdef HAVE_CUDA
-   size_t free = 0;
-   size_t total = 0;
+   std::size_t free = 0;
+   std::size_t total = 0;
    cudaMemGetInfo( &free, &total );
    return free;
 #else
@@ -124,7 +124,7 @@ getFreeGlobalMemory()
 }
 
 inline int
-CudaDeviceInfo::
+DeviceInfo::
 getMemoryClockRate( int deviceNum )
 {
 #ifdef HAVE_CUDA
@@ -137,7 +137,7 @@ getMemoryClockRate( int deviceNum )
 }
 
 inline bool
-CudaDeviceInfo::
+DeviceInfo::
 getECCEnabled( int deviceNum )
 {
 #ifdef HAVE_CUDA
@@ -150,7 +150,7 @@ getECCEnabled( int deviceNum )
 }
 
 inline int
-CudaDeviceInfo::
+DeviceInfo::
 getCudaMultiprocessors( int deviceNum )
 {
 #ifdef HAVE_CUDA
@@ -169,12 +169,12 @@ getCudaMultiprocessors( int deviceNum )
 }
 
 inline int
-CudaDeviceInfo::
+DeviceInfo::
 getCudaCoresPerMultiprocessors( int deviceNum )
 {
 #ifdef HAVE_CUDA
-   int major = CudaDeviceInfo::getArchitectureMajor( deviceNum );
-   int minor = CudaDeviceInfo::getArchitectureMinor( deviceNum );
+   int major = DeviceInfo::getArchitectureMajor( deviceNum );
+   int minor = DeviceInfo::getArchitectureMinor( deviceNum );
    switch( major )
    {
       case 1:   // Tesla generation, G80, G8x, G9x classes
@@ -209,19 +209,19 @@ getCudaCoresPerMultiprocessors( int deviceNum )
 }
 
 inline int
-CudaDeviceInfo::
+DeviceInfo::
 getCudaCores( int deviceNum )
 {
 #ifdef HAVE_CUDA
-   return CudaDeviceInfo::getCudaMultiprocessors( deviceNum ) *
-          CudaDeviceInfo::getCudaCoresPerMultiprocessors( deviceNum );
+   return DeviceInfo::getCudaMultiprocessors( deviceNum ) *
+          DeviceInfo::getCudaCoresPerMultiprocessors( deviceNum );
 #else
    throw Exceptions::CudaSupportMissing();
 #endif
 }
 
 inline int
-CudaDeviceInfo::
+DeviceInfo::
 getRegistersPerMultiprocessor( int deviceNum )
 {
 #ifdef HAVE_CUDA
@@ -239,5 +239,5 @@ getRegistersPerMultiprocessor( int deviceNum )
 #endif
 }
 
-} // namespace Devices
+} // namespace Cuda
 } // namespace TNL
diff --git a/src/TNL/Cuda/LaunchHelpers.h b/src/TNL/Cuda/LaunchHelpers.h
new file mode 100644
index 0000000000000000000000000000000000000000..aaca4a67d8f228e593915b491bb771c8c89921c2
--- /dev/null
+++ b/src/TNL/Cuda/LaunchHelpers.h
@@ -0,0 +1,162 @@
+/***************************************************************************
+                          LaunchHelpers.h  -  description
+                             -------------------
+    begin                : Aug 19, 2019
+    copyright            : (C) 2019 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <TNL/Math.h>
+
+namespace TNL {
+namespace Cuda {
+
+inline constexpr int getMaxGridSize()
+{
+   return 65535;
+}
+
+inline constexpr int getMaxBlockSize()
+{
+   return 1024;
+}
+
+inline constexpr int getWarpSize()
+{
+   return 32;
+}
+
+#ifdef HAVE_CUDA
+__device__ inline int getGlobalThreadIdx( const int gridIdx = 0,
+                                          const int gridSize = getMaxGridSize() )
+{
+   return ( gridIdx * gridSize + blockIdx.x ) * blockDim.x + threadIdx.x;
+}
+
+__device__ inline int getGlobalThreadIdx_x( const dim3& gridIdx )
+{
+   return ( gridIdx.x * getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+}
+
+__device__ inline int getGlobalThreadIdx_y( const dim3& gridIdx )
+{
+   return ( gridIdx.y * getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y;
+}
+
+__device__ inline int getGlobalThreadIdx_z( const dim3& gridIdx )
+{
+   return ( gridIdx.z * getMaxGridSize() + blockIdx.z ) * blockDim.z + threadIdx.z;
+}
+#endif
+
+inline int getNumberOfBlocks( const int threads,
+                              const int blockSize )
+{
+   return roundUpDivision( threads, blockSize );
+}
+
+inline int getNumberOfGrids( const int blocks,
+                             const int gridSize = getMaxGridSize() )
+{
+   return roundUpDivision( blocks, gridSize );
+}
+
+#ifdef HAVE_CUDA
+inline void setupThreads( const dim3& blockSize,
+                          dim3& blocksCount,
+                          dim3& gridsCount,
+                          long long int xThreads,
+                          long long int yThreads = 0,
+                          long long int zThreads = 0 )
+{
+   blocksCount.x = max( 1, xThreads / blockSize.x + ( xThreads % blockSize.x != 0 ) );
+   blocksCount.y = max( 1, yThreads / blockSize.y + ( yThreads % blockSize.y != 0 ) );
+   blocksCount.z = max( 1, zThreads / blockSize.z + ( zThreads % blockSize.z != 0 ) );
+
+   /****
+    * TODO: Fix the following:
+    * I do not known how to get max grid size in kernels :(
+    *
+    * Also, this is very slow. */
+   /*int currentDevice( 0 );
+   cudaGetDevice( currentDevice );
+   cudaDeviceProp properties;
+   cudaGetDeviceProperties( &properties, currentDevice );
+   gridsCount.x = blocksCount.x / properties.maxGridSize[ 0 ] + ( blocksCount.x % properties.maxGridSize[ 0 ] != 0 );
+   gridsCount.y = blocksCount.y / properties.maxGridSize[ 1 ] + ( blocksCount.y % properties.maxGridSize[ 1 ] != 0 );
+   gridsCount.z = blocksCount.z / properties.maxGridSize[ 2 ] + ( blocksCount.z % properties.maxGridSize[ 2 ] != 0 );
+   */
+   gridsCount.x = blocksCount.x / getMaxGridSize() + ( blocksCount.x % getMaxGridSize() != 0 );
+   gridsCount.y = blocksCount.y / getMaxGridSize() + ( blocksCount.y % getMaxGridSize() != 0 );
+   gridsCount.z = blocksCount.z / getMaxGridSize() + ( blocksCount.z % getMaxGridSize() != 0 );
+}
+
+inline void setupGrid( const dim3& blocksCount,
+                       const dim3& gridsCount,
+                       const dim3& gridIdx,
+                       dim3& gridSize )
+{
+   /* TODO: this is extremely slow!!!!
+   int currentDevice( 0 );
+   cudaGetDevice( &currentDevice );
+   cudaDeviceProp properties;
+   cudaGetDeviceProperties( &properties, currentDevice );*/
+
+   /****
+    * TODO: fix the following
+   if( gridIdx.x < gridsCount.x )
+      gridSize.x = properties.maxGridSize[ 0 ];
+   else
+      gridSize.x = blocksCount.x % properties.maxGridSize[ 0 ];
+
+   if( gridIdx.y < gridsCount.y )
+      gridSize.y = properties.maxGridSize[ 1 ];
+   else
+      gridSize.y = blocksCount.y % properties.maxGridSize[ 1 ];
+
+   if( gridIdx.z < gridsCount.z )
+      gridSize.z = properties.maxGridSize[ 2 ];
+   else
+      gridSize.z = blocksCount.z % properties.maxGridSize[ 2 ];*/
+
+   if( gridIdx.x < gridsCount.x - 1 )
+      gridSize.x = getMaxGridSize();
+   else
+      gridSize.x = blocksCount.x % getMaxGridSize();
+
+   if( gridIdx.y < gridsCount.y - 1 )
+      gridSize.y = getMaxGridSize();
+   else
+      gridSize.y = blocksCount.y % getMaxGridSize();
+
+   if( gridIdx.z < gridsCount.z - 1 )
+      gridSize.z = getMaxGridSize();
+   else
+      gridSize.z = blocksCount.z % getMaxGridSize();
+}
+
+inline std::ostream& operator<<( std::ostream& str, const dim3& d )
+{
+   str << "( " << d.x << ", " << d.y << ", " << d.z << " )";
+   return str;
+}
+
+inline void printThreadsSetup( const dim3& blockSize,
+                               const dim3& blocksCount,
+                               const dim3& gridSize,
+                               const dim3& gridsCount,
+                               std::ostream& str = std::cout )
+{
+   str << "Block size: " << blockSize << std::endl
+       << " Blocks count: " << blocksCount << std::endl
+       << " Grid size: " << gridSize << std::endl
+       << " Grids count: " << gridsCount << std::endl;
+}
+#endif
+
+} // namespace Cuda
+} // namespace TNL
diff --git a/src/TNL/Cuda/MemoryHelpers.h b/src/TNL/Cuda/MemoryHelpers.h
new file mode 100644
index 0000000000000000000000000000000000000000..cb214f5d02ebaf6784d08e4c288de6ddd8638de7
--- /dev/null
+++ b/src/TNL/Cuda/MemoryHelpers.h
@@ -0,0 +1,103 @@
+/***************************************************************************
+                          MemoryHelpers.h  -  description
+                             -------------------
+    begin                : Aug 19, 2019
+    copyright            : (C) 2019 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <iostream>
+
+#include <TNL/Cuda/CheckDevice.h>
+#include <TNL/Exceptions/CudaSupportMissing.h>
+#include <TNL/Exceptions/CudaBadAlloc.h>
+
+namespace TNL {
+namespace Cuda {
+
+template< typename ObjectType >
+[[deprecated("Allocators and MemoryOperations should be used instead.")]]
+ObjectType* passToDevice( const ObjectType& object )
+{
+#ifdef HAVE_CUDA
+   ObjectType* deviceObject;
+   if( cudaMalloc( ( void** ) &deviceObject,
+                   ( size_t ) sizeof( ObjectType ) ) != cudaSuccess )
+      throw Exceptions::CudaBadAlloc();
+   if( cudaMemcpy( ( void* ) deviceObject,
+                   ( void* ) &object,
+                   sizeof( ObjectType ),
+                   cudaMemcpyHostToDevice ) != cudaSuccess )
+   {
+      TNL_CHECK_CUDA_DEVICE;
+      cudaFree( ( void* ) deviceObject );
+      TNL_CHECK_CUDA_DEVICE;
+      return 0;
+   }
+   return deviceObject;
+#else
+   throw Exceptions::CudaSupportMissing();
+#endif
+}
+
+template< typename ObjectType >
+[[deprecated("Allocators and MemoryOperations should be used instead.")]]
+ObjectType passFromDevice( const ObjectType* object )
+{
+#ifdef HAVE_CUDA
+   ObjectType aux;
+   cudaMemcpy( ( void* ) aux,
+               ( void* ) &object,
+               sizeof( ObjectType ),
+               cudaMemcpyDeviceToHost );
+   TNL_CHECK_CUDA_DEVICE;
+   return aux;
+#else
+   throw Exceptions::CudaSupportMissing();
+#endif
+}
+
+template< typename ObjectType >
+[[deprecated("Allocators and MemoryOperations should be used instead.")]]
+void passFromDevice( const ObjectType* deviceObject,
+                     ObjectType& hostObject )
+{
+#ifdef HAVE_CUDA
+   cudaMemcpy( ( void* ) &hostObject,
+               ( void* ) deviceObject,
+               sizeof( ObjectType ),
+               cudaMemcpyDeviceToHost );
+   TNL_CHECK_CUDA_DEVICE;
+#else
+   throw Exceptions::CudaSupportMissing();
+#endif
+}
+
+template< typename ObjectType >
+[[deprecated("Allocators and MemoryOperations should be used instead.")]]
+void freeFromDevice( ObjectType* deviceObject )
+{
+#ifdef HAVE_CUDA
+   cudaFree( ( void* ) deviceObject );
+   TNL_CHECK_CUDA_DEVICE;
+#else
+   throw Exceptions::CudaSupportMissing();
+#endif
+}
+
+template< typename ObjectType >
+void print( const ObjectType* deviceObject, std::ostream& str = std::cout )
+{
+#ifdef HAVE_CUDA
+   ObjectType hostObject;
+   passFromDevice( deviceObject, hostObject );
+   str << hostObject;
+#endif
+}
+
+} // namespace Cuda
+} // namespace TNL
diff --git a/src/TNL/CudaSharedMemory.h b/src/TNL/Cuda/SharedMemory.h
similarity index 78%
rename from src/TNL/CudaSharedMemory.h
rename to src/TNL/Cuda/SharedMemory.h
index ec9a43c207fc7962f36ffb40ad0af71973b76868..29851952c01c86356e6872511f90496358b55152 100644
--- a/src/TNL/CudaSharedMemory.h
+++ b/src/TNL/Cuda/SharedMemory.h
@@ -1,5 +1,5 @@
 /***************************************************************************
-                          CudaSharedMemory.h  -  description
+                          SharedMemory.h  -  description
                              -------------------
     begin                : Oct 18, 2017
     copyright            : (C) 2017 by Tomas Oberhuber et al.
@@ -26,11 +26,11 @@
  *
  * Until CUDA 8.0, it was possible to use reinterpret_cast this way:
  *
- *    template< typename Element, size_t Alignment >
- *    __device__ Element* Cuda::getSharedMemory()
+ *    template< typename T, size_t Alignment >
+ *    __device__ T* getSharedMemory()
  *    {
  *       extern __shared__ __align__ ( Alignment ) unsigned char __sdata[];
- *       return reinterpret_cast< Element* >( __sdata );
+ *       return reinterpret_cast< T* >( __sdata );
  *    }
  *
  * But since CUDA 9.0 there is a new restriction that the alignment of the
@@ -44,12 +44,13 @@
 #include <stdint.h>
 
 namespace TNL {
+namespace Cuda {
 
 template< typename T, std::size_t _alignment = CHAR_BIT * sizeof(T) >
-struct CudaSharedMemory {};
+struct SharedMemory;
 
 template< typename T >
-struct CudaSharedMemory< T, 8 >
+struct SharedMemory< T, 8 >
 {
    __device__ inline operator T* ()
    {
@@ -65,7 +66,7 @@ struct CudaSharedMemory< T, 8 >
 };
 
 template< typename T >
-struct CudaSharedMemory< T, 16 >
+struct SharedMemory< T, 16 >
 {
    __device__ inline operator T* ()
    {
@@ -81,7 +82,7 @@ struct CudaSharedMemory< T, 16 >
 };
 
 template< typename T >
-struct CudaSharedMemory< T, 32 >
+struct SharedMemory< T, 32 >
 {
    __device__ inline operator T* ()
    {
@@ -97,7 +98,7 @@ struct CudaSharedMemory< T, 32 >
 };
 
 template< typename T >
-struct CudaSharedMemory< T, 64 >
+struct SharedMemory< T, 64 >
 {
    __device__ inline operator T* ()
    {
@@ -112,6 +113,25 @@ struct CudaSharedMemory< T, 64 >
    }
 };
 
+template< typename T >
+__device__ inline T* getSharedMemory()
+{
+   return SharedMemory< T >{};
+}
+
+// helper functions for indexing shared memory
+inline constexpr int getNumberOfSharedMemoryBanks()
+{
+   return 32;
+}
+
+template< typename Index >
+__device__ Index getInterleaving( const Index index )
+{
+   return index + index / Cuda::getNumberOfSharedMemoryBanks();
+}
+
+} // namespace Cuda
 } // namespace TNL
 
 #endif
diff --git a/src/TNL/CudaStreamPool.h b/src/TNL/Cuda/StreamPool.h
similarity index 73%
rename from src/TNL/CudaStreamPool.h
rename to src/TNL/Cuda/StreamPool.h
index 1dd2b7907fe39b53e331b0147fff1cabe16424ef..59bf38a5791d2ae9f381e831559c5caa7788567a 100644
--- a/src/TNL/CudaStreamPool.h
+++ b/src/TNL/Cuda/StreamPool.h
@@ -1,5 +1,5 @@
 /***************************************************************************
-                          CudaStreamPool.h  -  description
+                          StreamPool.h  -  description
                              -------------------
     begin                : Oct 14, 2016
     copyright            : (C) 2016 by Tomas Oberhuber et al.
@@ -15,22 +15,20 @@
 #include <stdlib.h>
 #include <unordered_map>
 
-#include <TNL/Devices/Host.h>
-#include <TNL/Devices/Cuda.h>
-
 namespace TNL {
+namespace Cuda {
 
 #ifdef HAVE_CUDA
-class CudaStreamPool
+class StreamPool
 {
    public:
       // stop the compiler generating methods of copy the object
-      CudaStreamPool( CudaStreamPool const& copy ) = delete;
-      CudaStreamPool& operator=( CudaStreamPool const& copy ) = delete;
+      StreamPool( StreamPool const& copy ) = delete;
+      StreamPool& operator=( StreamPool const& copy ) = delete;
 
-      inline static CudaStreamPool& getInstance()
+      inline static StreamPool& getInstance()
       {
-         static CudaStreamPool instance;
+         static StreamPool instance;
          return instance;
       }
 
@@ -47,14 +45,14 @@ class CudaStreamPool
 
    private:
       // private constructor of the singleton
-      inline CudaStreamPool()
+      inline StreamPool()
       {
-         atexit( CudaStreamPool::free_atexit );
+         atexit( StreamPool::free_atexit );
       }
 
       inline static void free_atexit( void )
       {
-         CudaStreamPool::getInstance().free();
+         StreamPool::getInstance().free();
       }
 
    protected:
@@ -70,5 +68,6 @@ class CudaStreamPool
 };
 #endif
 
+} // namespace Cuda
 } // namespace TNL
 
diff --git a/src/TNL/Devices/Cuda.h b/src/TNL/Devices/Cuda.h
index 122c466c9f98778d3cbaa19a4853710284761495..853cd2e038c669485781a1462cdb4a54a9670e35 100644
--- a/src/TNL/Devices/Cuda.h
+++ b/src/TNL/Devices/Cuda.h
@@ -16,7 +16,7 @@
 #include <TNL/Assert.h>
 #include <TNL/Pointers/SmartPointersRegister.h>
 #include <TNL/Timer.h>
-#include <TNL/Devices/CudaCallable.h>
+#include <TNL/Cuda/CudaCallable.h>
 #include <TNL/Config/ConfigDescription.h>
 #include <TNL/Config/ParameterContainer.h>
 
@@ -31,131 +31,8 @@ public:
    static inline bool setup( const Config::ParameterContainer& parameters,
                              const String& prefix = "" );
 
-   __cuda_callable__ static inline constexpr int getMaxGridSize();
-
-   __cuda_callable__ static inline constexpr int getMaxBlockSize();
-
-   __cuda_callable__ static inline constexpr int getWarpSize();
-
-   __cuda_callable__ static inline constexpr int getNumberOfSharedMemoryBanks();
-
    static inline constexpr int getGPUTransferBufferSize();
 
-#ifdef HAVE_CUDA
-   /***
-    * This function is obsolete and should be replaced by the following functions.
-    */
-   __device__ static inline int
-   getGlobalThreadIdx( const int gridIdx = 0,
-                       const int gridSize = getMaxGridSize() );   
-
-   __device__ static inline int
-   getGlobalThreadIdx_x( const dim3& gridIdx );
-
-   __device__ static inline int
-   getGlobalThreadIdx_y( const dim3& gridIdx );
-
-   __device__ static inline int
-   getGlobalThreadIdx_z( const dim3& gridIdx );   
-#endif
-
-   /****
-    * This functions helps to count number of CUDA blocks depending on the 
-    * number of the CUDA threads and the block size.
-    * It is obsolete and it will be replaced by setupThreads.
-    */
-   static inline int getNumberOfBlocks( const int threads,
-                                        const int blockSize );
-
-   /****
-    * This functions helps to count number of CUDA grids depending on the 
-    * number of the CUDA blocks and maximum grid size.
-    * It is obsolete and it will be replaced by setupThreads.
-    */
-   static inline int getNumberOfGrids( const int blocks,
-                                       const int gridSize = getMaxGridSize() );
-   
-#ifdef HAVE_CUDA   
-   /*! This method sets up gridSize and computes number of grids depending
-    *  on total number of CUDA threads.
-    */
-   static void setupThreads( const dim3& blockSize,
-                             dim3& blocksCount,
-                             dim3& gridsCount,
-                             long long int xThreads,
-                             long long int yThreads = 0,
-                             long long int zThreads = 0 );
-   
-   /*! This method sets up grid size when one iterates over more grids.
-    * If gridIdx.? < gridsCount.? then the gridSize.? is set to maximum
-    * allowed by CUDA. Otherwise gridSize.? is set to the size of the grid
-    * in the last loop i.e. blocksCount.? % maxGridSize.?.
-    */
-   static void setupGrid( const dim3& blocksCount,
-                          const dim3& gridsCount,
-                          const dim3& gridIdx,
-                          dim3& gridSize );
-   
-   static void printThreadsSetup( const dim3& blockSize,
-                                  const dim3& blocksCount,
-                                  const dim3& gridSize,
-                                  const dim3& gridsCount,
-                                  std::ostream& str = std::cout );
-#endif   
-
-   template< typename ObjectType >
-   static ObjectType* passToDevice( const ObjectType& object );
-
-   template< typename ObjectType >
-   static ObjectType passFromDevice( const ObjectType* object );
-
-   template< typename ObjectType >
-   static void passFromDevice( const ObjectType* deviceObject,
-                               ObjectType& hostObject );
-
-   template< typename ObjectType >
-   static void freeFromDevice( ObjectType* object );
-
-   template< typename ObjectType >
-   static void print( const ObjectType* object, std::ostream& str = std::cout );
-
-#ifdef HAVE_CUDA
-   template< typename Index >
-   static __device__ Index getInterleaving( const Index index );
-
-   /****
-    * Declaration of variables for dynamic shared memory is difficult in
-    * templated functions. For example, the following does not work for
-    * different types T:
-    *
-    *    template< typename T >
-    *    void foo()
-    *    {
-    *        extern __shared__ T shx[];
-    *    }
-    *
-    * This is because extern variables must be declared exactly once. In
-    * templated functions we need to have same variable name with different
-    * type, which causes the conflict. In CUDA samples they solve the problem
-    * using template specialization via classes, but using one base type and
-    * reinterpret_cast works too.
-    * See http://stackoverflow.com/a/19339004/4180822 for reference.
-    */
-   template< typename Element >
-   static __device__ Element* getSharedMemory();
-#endif
-
-#ifdef HAVE_CUDA
-   /****
-    * I do not know why, but it is more reliable to pass the error code instead
-    * of calling cudaGetLastError() inside the method.
-    * We recommend to use macro 'TNL_CHECK_CUDA_DEVICE' defined bellow.
-    */
-   static inline void checkDevice( const char* file_name, int line, cudaError error );
-#else
-   static inline void checkDevice() {}
-#endif
-
    static inline void insertSmartPointer( Pointers::SmartPointer* pointer );
 
    static inline void removeSmartPointer( Pointers::SmartPointer* pointer );
@@ -180,18 +57,6 @@ public:
    static inline Pointers::SmartPointersRegister& getSmartPointersRegister();
 };
 
-#ifdef HAVE_CUDA
-#define TNL_CHECK_CUDA_DEVICE ::TNL::Devices::Cuda::checkDevice( __FILE__, __LINE__, cudaGetLastError() )
-#else
-#define TNL_CHECK_CUDA_DEVICE ::TNL::Devices::Cuda::checkDevice()
-#endif
-
-#ifdef HAVE_CUDA
-namespace {
-   std::ostream& operator << ( std::ostream& str, const dim3& d );
-}
-#endif
-
 #ifdef HAVE_CUDA
 #if __CUDA_ARCH__ < 600
 namespace {
diff --git a/src/TNL/Devices/CudaDeviceInfo.h b/src/TNL/Devices/CudaDeviceInfo.h
deleted file mode 100644
index 9eefe3bad8932670af271204e03f72b5eb501a95..0000000000000000000000000000000000000000
--- a/src/TNL/Devices/CudaDeviceInfo.h
+++ /dev/null
@@ -1,56 +0,0 @@
-/***************************************************************************
-                          CudaDeviceInfo.h  -  description
-                             -------------------
-    begin                : Jun 21, 2015
-    copyright            : (C) 2007 by Tomas Oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-#pragma once
-
-#include <stdlib.h>
-
-#include <TNL/String.h>
-
-namespace TNL {
-namespace Devices {
-
-class CudaDeviceInfo
-{
-   public:
-
-      static int getNumberOfDevices();
-
-      static int getActiveDevice();
-
-      static String getDeviceName( int deviceNum );
-
-      static int getArchitectureMajor( int deviceNum );
-
-      static int getArchitectureMinor( int deviceNum );
-
-      static int getClockRate( int deviceNum );
-
-      static size_t getGlobalMemory( int deviceNum );
-
-      static size_t getFreeGlobalMemory();
-
-      static int getMemoryClockRate( int deviceNum );
-
-      static bool getECCEnabled( int deviceNum );
-
-      static int getCudaMultiprocessors( int deviceNum );
-
-      static int getCudaCoresPerMultiprocessors( int deviceNum );
-
-      static int getCudaCores( int deviceNum );
-
-      static int getRegistersPerMultiprocessor( int deviceNum );
-};
-
-} // namespace Devices
-} // namespace TNL
-
-#include <TNL/Devices/CudaDeviceInfo_impl.h>
diff --git a/src/TNL/Devices/Cuda_impl.h b/src/TNL/Devices/Cuda_impl.h
index b758584bd55773de11b0f85f187bf9e64fc3befc..6d3daa356cb64f2373451a260a1403aabff83be6 100644
--- a/src/TNL/Devices/Cuda_impl.h
+++ b/src/TNL/Devices/Cuda_impl.h
@@ -12,11 +12,10 @@
 
 #include <TNL/Math.h>
 #include <TNL/Devices/Cuda.h>
-#include <TNL/Devices/CudaDeviceInfo.h>
+#include <TNL/Cuda/DeviceInfo.h>
 #include <TNL/Exceptions/CudaBadAlloc.h>
 #include <TNL/Exceptions/CudaSupportMissing.h>
 #include <TNL/Exceptions/CudaRuntimeError.h>
-#include <TNL/CudaSharedMemory.h>
 
 namespace TNL {
 namespace Devices {
@@ -49,279 +48,30 @@ Cuda::setup( const Config::ParameterContainer& parameters,
    return true;
 }
 
-__cuda_callable__
-inline constexpr int Cuda::getMaxGridSize()
-{
-   return 65535;
-}
-
-__cuda_callable__
-inline constexpr int Cuda::getMaxBlockSize()
-{
-   return 1024;
-}
-
-__cuda_callable__
-inline constexpr int Cuda::getWarpSize()
-{
-   return 32;
-}
-
-__cuda_callable__
-inline constexpr int Cuda::getNumberOfSharedMemoryBanks()
-{
-   return 32;
-}
-
 inline constexpr int Cuda::getGPUTransferBufferSize()
 {
    return 1 << 20;
 }
 
-#ifdef HAVE_CUDA
-__device__ inline int Cuda::getGlobalThreadIdx( const int gridIdx, const int gridSize )
-{
-   return ( gridIdx * gridSize + blockIdx.x ) * blockDim.x + threadIdx.x;
-}
-
-__device__ inline int Cuda::getGlobalThreadIdx_x( const dim3& gridIdx )
-{
-   return ( gridIdx.x * getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
-}
-
-__device__ inline int Cuda::getGlobalThreadIdx_y( const dim3& gridIdx )
-{
-   return ( gridIdx.y * getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y;
-}
-
-__device__ inline int Cuda::getGlobalThreadIdx_z( const dim3& gridIdx )
-{
-   return ( gridIdx.z * getMaxGridSize() + blockIdx.z ) * blockDim.z + threadIdx.z;
-}
-#endif
-
-inline int Cuda::getNumberOfBlocks( const int threads,
-                                    const int blockSize )
-{
-   return roundUpDivision( threads, blockSize );
-}
-
-inline int Cuda::getNumberOfGrids( const int blocks,
-                                   const int gridSize )
-{
-   return roundUpDivision( blocks, gridSize );
-}
-
-#ifdef HAVE_CUDA
-inline void Cuda::setupThreads( const dim3& blockSize,
-                                dim3& blocksCount,
-                                dim3& gridsCount,
-                                long long int xThreads,
-                                long long int yThreads,
-                                long long int zThreads )
-{
-   blocksCount.x = max( 1, xThreads / blockSize.x + ( xThreads % blockSize.x != 0 ) );
-   blocksCount.y = max( 1, yThreads / blockSize.y + ( yThreads % blockSize.y != 0 ) );
-   blocksCount.z = max( 1, zThreads / blockSize.z + ( zThreads % blockSize.z != 0 ) );
-   
-   /****
-    * TODO: Fix the following:
-    * I do not known how to get max grid size in kernels :(
-    * 
-    * Also, this is very slow. */
-   /*int currentDevice( 0 );
-   cudaGetDevice( currentDevice );
-   cudaDeviceProp properties;
-   cudaGetDeviceProperties( &properties, currentDevice );
-   gridsCount.x = blocksCount.x / properties.maxGridSize[ 0 ] + ( blocksCount.x % properties.maxGridSize[ 0 ] != 0 );
-   gridsCount.y = blocksCount.y / properties.maxGridSize[ 1 ] + ( blocksCount.y % properties.maxGridSize[ 1 ] != 0 );
-   gridsCount.z = blocksCount.z / properties.maxGridSize[ 2 ] + ( blocksCount.z % properties.maxGridSize[ 2 ] != 0 );
-   */
-   gridsCount.x = blocksCount.x / getMaxGridSize() + ( blocksCount.x % getMaxGridSize() != 0 );
-   gridsCount.y = blocksCount.y / getMaxGridSize() + ( blocksCount.y % getMaxGridSize() != 0 );
-   gridsCount.z = blocksCount.z / getMaxGridSize() + ( blocksCount.z % getMaxGridSize() != 0 );
-}
-
-inline void Cuda::setupGrid( const dim3& blocksCount,
-                             const dim3& gridsCount,
-                             const dim3& gridIdx,
-                             dim3& gridSize )
-{
-   /* TODO: this is extremely slow!!!!
-   int currentDevice( 0 );
-   cudaGetDevice( &currentDevice );
-   cudaDeviceProp properties;
-   cudaGetDeviceProperties( &properties, currentDevice );*/
- 
-   /****
-    * TODO: fix the following
-   if( gridIdx.x < gridsCount.x )
-      gridSize.x = properties.maxGridSize[ 0 ];
-   else
-      gridSize.x = blocksCount.x % properties.maxGridSize[ 0 ];
-   
-   if( gridIdx.y < gridsCount.y )
-      gridSize.y = properties.maxGridSize[ 1 ];
-   else
-      gridSize.y = blocksCount.y % properties.maxGridSize[ 1 ];
-
-   if( gridIdx.z < gridsCount.z )
-      gridSize.z = properties.maxGridSize[ 2 ];
-   else
-      gridSize.z = blocksCount.z % properties.maxGridSize[ 2 ];*/
-   
-   if( gridIdx.x < gridsCount.x - 1 )
-      gridSize.x = getMaxGridSize();
-   else
-      gridSize.x = blocksCount.x % getMaxGridSize();
-   
-   if( gridIdx.y < gridsCount.y - 1 )
-      gridSize.y = getMaxGridSize();
-   else
-      gridSize.y = blocksCount.y % getMaxGridSize();
-
-   if( gridIdx.z < gridsCount.z - 1 )
-      gridSize.z = getMaxGridSize();
-   else
-      gridSize.z = blocksCount.z % getMaxGridSize();
-}
-
-inline void Cuda::printThreadsSetup( const dim3& blockSize,
-                                     const dim3& blocksCount,
-                                     const dim3& gridSize,
-                                     const dim3& gridsCount,
-                                     std::ostream& str )
-{
-   str << "Block size: " << blockSize << std::endl
-       << " Blocks count: " << blocksCount << std::endl
-       << " Grid size: " << gridSize << std::endl
-       << " Grids count: " << gridsCount << std::endl;
-}
-#endif
-
-
-template< typename ObjectType >
-ObjectType* Cuda::passToDevice( const ObjectType& object )
-{
-#ifdef HAVE_CUDA
-   ObjectType* deviceObject;
-   if( cudaMalloc( ( void** ) &deviceObject,
-                   ( size_t ) sizeof( ObjectType ) ) != cudaSuccess )
-      throw Exceptions::CudaBadAlloc();
-   if( cudaMemcpy( ( void* ) deviceObject,
-                   ( void* ) &object,
-                   sizeof( ObjectType ),
-                   cudaMemcpyHostToDevice ) != cudaSuccess )
-   {
-      TNL_CHECK_CUDA_DEVICE;
-      cudaFree( ( void* ) deviceObject );
-      TNL_CHECK_CUDA_DEVICE;
-      return 0;
-   }
-   return deviceObject;
-#else
-   throw Exceptions::CudaSupportMissing();
-#endif
-}
-
-template< typename ObjectType >
-ObjectType Cuda::passFromDevice( const ObjectType* object )
-{
-#ifdef HAVE_CUDA
-   ObjectType aux;
-   cudaMemcpy( ( void* ) aux,
-               ( void* ) &object,
-               sizeof( ObjectType ),
-               cudaMemcpyDeviceToHost );
-   TNL_CHECK_CUDA_DEVICE;
-   return aux;
-#else
-   throw Exceptions::CudaSupportMissing();
-#endif
-}
-
-template< typename ObjectType >
-void Cuda::passFromDevice( const ObjectType* deviceObject,
-                           ObjectType& hostObject )
-{
-#ifdef HAVE_CUDA
-   cudaMemcpy( ( void* ) &hostObject,
-               ( void* ) deviceObject,
-               sizeof( ObjectType ),
-               cudaMemcpyDeviceToHost );
-   TNL_CHECK_CUDA_DEVICE;
-#else
-   throw Exceptions::CudaSupportMissing();
-#endif
-}
-
-template< typename ObjectType >
-void Cuda::print( const ObjectType* deviceObject, std::ostream& str )
-{
-#ifdef HAVE_CUDA
-   ObjectType hostObject;
-   passFromDevice( deviceObject, hostObject );
-   str << hostObject;
-#endif
-}
-
-
-template< typename ObjectType >
-void Cuda::freeFromDevice( ObjectType* deviceObject )
-{
-#ifdef HAVE_CUDA
-   cudaFree( ( void* ) deviceObject );
-   TNL_CHECK_CUDA_DEVICE;
-#else
-   throw Exceptions::CudaSupportMissing();
-#endif
-}
-
-#ifdef HAVE_CUDA
-template< typename Index >
-__device__ Index Cuda::getInterleaving( const Index index )
-{
-   return index + index / Cuda::getNumberOfSharedMemoryBanks();
-}
-
-template< typename Element >
-__device__ Element* Cuda::getSharedMemory()
-{
-   return CudaSharedMemory< Element >();
-}
-#endif
-
-#ifdef HAVE_CUDA
-inline void Cuda::checkDevice( const char* file_name, int line, cudaError error )
-{
-   if( error != cudaSuccess )
-      throw Exceptions::CudaRuntimeError( error, file_name, line );
-}
-#endif
-
 inline void Cuda::insertSmartPointer( Pointers::SmartPointer* pointer )
 {
-   getSmartPointersRegister().insert( pointer, Devices::CudaDeviceInfo::getActiveDevice() );
+   getSmartPointersRegister().insert( pointer, TNL::Cuda::DeviceInfo::getActiveDevice() );
 }
 
 inline void Cuda::removeSmartPointer( Pointers::SmartPointer* pointer )
 {
-   getSmartPointersRegister().remove( pointer, Devices::CudaDeviceInfo::getActiveDevice() );
+   getSmartPointersRegister().remove( pointer, TNL::Cuda::DeviceInfo::getActiveDevice() );
 }
 
 inline bool Cuda::synchronizeDevice( int deviceId )
 {
 #ifdef HAVE_CUDA
-#ifdef HAVE_CUDA_UNIFIED_MEMORY
-   return true;
-#else
    if( deviceId < 0 )
-      deviceId = Devices::CudaDeviceInfo::getActiveDevice();
+      deviceId = TNL::Cuda::DeviceInfo::getActiveDevice();
    getSmartPointersSynchronizationTimer().start();
    bool b = getSmartPointersRegister().synchronizeDevice( deviceId );
    getSmartPointersSynchronizationTimer().stop();
    return b;
-#endif
 #else
    return true;
 #endif
@@ -339,16 +89,6 @@ inline Pointers::SmartPointersRegister& Cuda::getSmartPointersRegister()
    return reg;
 }
 
-#ifdef HAVE_CUDA
-namespace {
-   std::ostream& operator << ( std::ostream& str, const dim3& d )
-   {
-      str << "( " << d.x << ", " << d.y << ", " << d.z << " )";
-      return str;
-   }
-}
-#endif
-
 // double-precision atomicAdd function for Maxwell and older GPUs
 // copied from: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomic-functions
 #ifdef HAVE_CUDA
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase1D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase1D_impl.h
index 55129c4e1008e69ef3b3d238acb8fc587cce4076..4dac64a23fc50d85186e2a1f118f818807d0fc87 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase1D_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase1D_impl.h
@@ -22,7 +22,7 @@ initInterface( const MeshFunctionPointer& _input,
     const MeshType& mesh = _input->getMesh();
     
     const int cudaBlockSize( 16 );
-    int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().x(), cudaBlockSize );
+    int numBlocksX = Cuda::getNumberOfBlocks( mesh.getDimensions().x(), cudaBlockSize );
     dim3 blockSize( cudaBlockSize );
     dim3 gridSize( numBlocksX );
     Devices::Cuda::synchronizeDevice();
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase2D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase2D_impl.h
index cddf4f9cb7a97f8a74eb94f7377b2cf740db03a5..947a4be062a9b8763ec9a6042c41d3048aa8a361 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase2D_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase2D_impl.h
@@ -25,8 +25,8 @@ initInterface( const MeshFunctionPointer& _input,
     const MeshType& mesh = _input->getMesh();
     
     const int cudaBlockSize( 16 );
-    int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().x(), cudaBlockSize );
-    int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().y(), cudaBlockSize );
+    int numBlocksX = Cuda::getNumberOfBlocks( mesh.getDimensions().x(), cudaBlockSize );
+    int numBlocksY = Cuda::getNumberOfBlocks( mesh.getDimensions().y(), cudaBlockSize );
     dim3 blockSize( cudaBlockSize, cudaBlockSize );
     dim3 gridSize( numBlocksX, numBlocksY );
     Devices::Cuda::synchronizeDevice();
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase3D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase3D_impl.h
index 32548abcfe66affd71a79d3ed5f2f21d67df644e..eb0665c7e4a66a8424af4a312d68f0f3a039a934 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase3D_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase3D_impl.h
@@ -23,9 +23,9 @@ initInterface( const MeshFunctionPointer& _input,
     const MeshType& mesh = _input->getMesh();
     
     const int cudaBlockSize( 8 );
-    int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().x(), cudaBlockSize );
-    int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().y(), cudaBlockSize );
-    int numBlocksZ = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().z(), cudaBlockSize );
+    int numBlocksX = Cuda::getNumberOfBlocks( mesh.getDimensions().x(), cudaBlockSize );
+    int numBlocksY = Cuda::getNumberOfBlocks( mesh.getDimensions().y(), cudaBlockSize );
+    int numBlocksZ = Cuda::getNumberOfBlocks( mesh.getDimensions().z(), cudaBlockSize );
     if( cudaBlockSize * cudaBlockSize * cudaBlockSize > 1024 || numBlocksX > 1024 || numBlocksY > 1024 || numBlocksZ > 64 )
       std::cout << "Invalid kernel call. Dimensions of grid are max: [1024,1024,64], and maximum threads per block are 1024!" << std::endl;
     dim3 blockSize( cudaBlockSize, cudaBlockSize, cudaBlockSize );
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod1D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod1D_impl.h
index f2f033ccbee3ffa5b71567ea1b54e2307ebe1713..52c2ebbee3f16fd39a2dfab738ac9bea6ffaf393 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod1D_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod1D_impl.h
@@ -105,7 +105,7 @@ solve( const MeshPointer& mesh,
          // TODO: CUDA code
 #ifdef HAVE_CUDA
           const int cudaBlockSize( 16 );
-          int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x(), cudaBlockSize );
+          int numBlocksX = Cuda::getNumberOfBlocks( mesh->getDimensions().x(), cudaBlockSize );
           dim3 blockSize( cudaBlockSize );
           dim3 gridSize( numBlocksX );
           
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
index e5638c11dd71d88d72d8d0590c8e51c0df6baaab..c5a0f74cc53f89808f5308aa628c6c55802eb200 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
@@ -251,8 +251,8 @@ solve( const MeshPointer& mesh,
         const int cudaBlockSize( 16 );
         
         // Setting number of threads and blocks for kernel
-        int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x() - vecLowerOverlaps[0] - vecUpperOverlaps[0], cudaBlockSize );
-        int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().y() - vecLowerOverlaps[1] - vecUpperOverlaps[1], cudaBlockSize );
+        int numBlocksX = Cuda::getNumberOfBlocks( mesh->getDimensions().x() - vecLowerOverlaps[0] - vecUpperOverlaps[0], cudaBlockSize );
+        int numBlocksY = Cuda::getNumberOfBlocks( mesh->getDimensions().y() - vecLowerOverlaps[1] - vecUpperOverlaps[1], cudaBlockSize );
         dim3 blockSize( cudaBlockSize, cudaBlockSize );
         dim3 gridSize( numBlocksX, numBlocksY );
         
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h
index 325b626f7bf5262637f8e1b43ec9e156bbeca26b..3fce5564efdf65ebaf45c62eaed8cae259d1c4b3 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h
@@ -263,9 +263,9 @@ solve( const MeshPointer& mesh,
         const int cudaBlockSize( 8 );
         
         // Getting the number of blocks in grid in each direction (without overlaps bcs we dont calculate on overlaps)
-        int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x() - vecLowerOverlaps[0] - vecUpperOverlaps[0], cudaBlockSize );
-        int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().y() - vecLowerOverlaps[1] - vecUpperOverlaps[1], cudaBlockSize );
-        int numBlocksZ = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().z() - vecLowerOverlaps[2] - vecUpperOverlaps[2], cudaBlockSize ); 
+        int numBlocksX = Cuda::getNumberOfBlocks( mesh->getDimensions().x() - vecLowerOverlaps[0] - vecUpperOverlaps[0], cudaBlockSize );
+        int numBlocksY = Cuda::getNumberOfBlocks( mesh->getDimensions().y() - vecLowerOverlaps[1] - vecUpperOverlaps[1], cudaBlockSize );
+        int numBlocksZ = Cuda::getNumberOfBlocks( mesh->getDimensions().z() - vecLowerOverlaps[2] - vecUpperOverlaps[2], cudaBlockSize ); 
         if( cudaBlockSize * cudaBlockSize * cudaBlockSize > 1024 || numBlocksX > 1024 || numBlocksY > 1024 || numBlocksZ > 64 )
           std::cout << "Invalid kernel call. Dimensions of grid are max: [1024,1024,64], and maximum threads per block are 1024!" << std::endl;
         
diff --git a/src/TNL/File.hpp b/src/TNL/File.hpp
index 19a9eaa06a3a88e264de3b4528730baa0304a897..a3eb66066e2a014fe16c70910db60212aecc8629 100644
--- a/src/TNL/File.hpp
+++ b/src/TNL/File.hpp
@@ -17,6 +17,7 @@
 
 #include <TNL/File.h>
 #include <TNL/Assert.h>
+#include <TNL/Cuda/CheckDevice.h>
 #include <TNL/Exceptions/CudaSupportMissing.h>
 #include <TNL/Exceptions/FileSerializationError.h>
 #include <TNL/Exceptions/FileDeserializationError.h>
diff --git a/src/TNL/Functions/FunctionAdapter.h b/src/TNL/Functions/FunctionAdapter.h
index b9c35886689bf254eccfd4154469be71462107d3..b763ee47631d10522038b8dad9f927e83bc37a88 100644
--- a/src/TNL/Functions/FunctionAdapter.h
+++ b/src/TNL/Functions/FunctionAdapter.h
@@ -10,7 +10,7 @@
 
 #pragma once
 
-#include <TNL/Devices/CudaCallable.h>
+#include <TNL/Cuda/CudaCallable.h>
 #include <TNL/Config/ParameterContainer.h>
 #include <TNL/Functions/Domain.h>
 
diff --git a/src/TNL/Functions/OperatorFunction.h b/src/TNL/Functions/OperatorFunction.h
index 1f1e89b029f5d5e816e6f6df4bc6d9e9d27bb377..cc46d557a10cbfc315d04275d613de0ce679dce0 100644
--- a/src/TNL/Functions/OperatorFunction.h
+++ b/src/TNL/Functions/OperatorFunction.h
@@ -11,7 +11,7 @@
 #pragma once
 
 #include <type_traits>
-#include <TNL/Devices/CudaCallable.h>
+#include <TNL/Cuda/CudaCallable.h>
 #include <TNL/Functions/MeshFunction.h>
 #include <TNL/Solvers/PDE/BoundaryConditionsSetter.h>
 
diff --git a/src/TNL/Functions/TestFunction_impl.h b/src/TNL/Functions/TestFunction_impl.h
index e2bdce1f1c4a72848e82f10d9c270099121c28b7..918f24107d0e4a27a24d414d6cdcbea4f885cb45 100644
--- a/src/TNL/Functions/TestFunction_impl.h
+++ b/src/TNL/Functions/TestFunction_impl.h
@@ -11,6 +11,8 @@
 #pragma once
 
 #include <TNL/Devices/Cuda.h>
+#include <TNL/Cuda/MemoryHelpers.h>
+
 #include <TNL/Functions/Analytic/Constant.h>
 #include <TNL/Functions/Analytic/ExpBump.h>
 #include <TNL/Functions/Analytic/SinBumps.h>
@@ -137,7 +139,7 @@ setupFunction( const Config::ParameterContainer& parameters,
    }
    if( std::is_same< Device, Devices::Cuda >::value )
    {
-      this->function = Devices::Cuda::passToDevice( *auxFunction );
+      this->function = Cuda::passToDevice( *auxFunction );
       delete auxFunction;
       TNL_CHECK_CUDA_DEVICE;
    }
@@ -166,7 +168,7 @@ setupOperator( const Config::ParameterContainer& parameters,
    }
    if( std::is_same< Device, Devices::Cuda >::value )
    {
-      this->operator_ = Devices::Cuda::passToDevice( *auxOperator );
+      this->operator_ = Cuda::passToDevice( *auxOperator );
       delete auxOperator;
       TNL_CHECK_CUDA_DEVICE;
    }
@@ -736,7 +738,7 @@ deleteFunction()
    if( std::is_same< Device, Devices::Cuda >::value )
    {
       if( function )
-         Devices::Cuda::freeFromDevice( ( FunctionType * ) function );
+         Cuda::freeFromDevice( ( FunctionType * ) function );
    }
 }
 
@@ -756,7 +758,7 @@ deleteOperator()
    if( std::is_same< Device, Devices::Cuda >::value )
    {
       if( operator_ )
-         Devices::Cuda::freeFromDevice( ( OperatorType * ) operator_ );
+         Cuda::freeFromDevice( ( OperatorType * ) operator_ );
    }
 }
 
@@ -912,7 +914,7 @@ printFunction( std::ostream& str ) const
    }
    if( std::is_same< Device, Devices::Cuda >::value )
    {
-      Devices::Cuda::print( f, str );
+      Cuda::print( f, str );
       return str;
    }
 }
diff --git a/src/TNL/Logger_impl.h b/src/TNL/Logger_impl.h
index 0e1dd8dc62434faf07b64a95f1f896ed9b8af940..6a3da0f9659e8688bca777d76d71b6947c62f99b 100644
--- a/src/TNL/Logger_impl.h
+++ b/src/TNL/Logger_impl.h
@@ -14,7 +14,7 @@
 #include <iomanip>
 
 #include <TNL/Logger.h>
-#include <TNL/Devices/CudaDeviceInfo.h>
+#include <TNL/Cuda/DeviceInfo.h>
 #include <TNL/Devices/SystemInfo.h>
 
 namespace TNL {
@@ -95,19 +95,19 @@ Logger::writeSystemInformation( const Config::ParameterContainer& parameters )
    //   for( int i = 0; i < devices; i++ )
    //   {
    //      logger.writeParameter< int >( "Device no.", i, 1 );
-         const int i = Devices::CudaDeviceInfo::getActiveDevice();
-         writeParameter< String >( "Name", Devices::CudaDeviceInfo::getDeviceName( i ), 2 );
-         const String deviceArch = convertToString( Devices::CudaDeviceInfo::getArchitectureMajor( i ) ) + "." +
-                                   convertToString( Devices::CudaDeviceInfo::getArchitectureMinor( i ) );
+         const int i = Cuda::DeviceInfo::getActiveDevice();
+         writeParameter< String >( "Name", Cuda::DeviceInfo::getDeviceName( i ), 2 );
+         const String deviceArch = convertToString( Cuda::DeviceInfo::getArchitectureMajor( i ) ) + "." +
+                                   convertToString( Cuda::DeviceInfo::getArchitectureMinor( i ) );
          writeParameter< String >( "Architecture", deviceArch, 2 );
-         writeParameter< int >( "CUDA cores", Devices::CudaDeviceInfo::getCudaCores( i ), 2 );
-         const double clockRate = ( double ) Devices::CudaDeviceInfo::getClockRate( i ) / 1.0e3;
+         writeParameter< int >( "CUDA cores", Cuda::DeviceInfo::getCudaCores( i ), 2 );
+         const double clockRate = ( double ) Cuda::DeviceInfo::getClockRate( i ) / 1.0e3;
          writeParameter< double >( "Clock rate (in MHz)", clockRate, 2 );
-         const double globalMemory = ( double ) Devices::CudaDeviceInfo::getGlobalMemory( i ) / 1.0e9;
+         const double globalMemory = ( double ) Cuda::DeviceInfo::getGlobalMemory( i ) / 1.0e9;
          writeParameter< double >( "Global memory (in GB)", globalMemory, 2 );
-         const double memoryClockRate = ( double ) Devices::CudaDeviceInfo::getMemoryClockRate( i ) / 1.0e3;
+         const double memoryClockRate = ( double ) Cuda::DeviceInfo::getMemoryClockRate( i ) / 1.0e3;
          writeParameter< double >( "Memory clock rate (in Mhz)", memoryClockRate, 2 );
-         writeParameter< bool >( "ECC enabled", Devices::CudaDeviceInfo::getECCEnabled( i ), 2 );
+         writeParameter< bool >( "ECC enabled", Cuda::DeviceInfo::getECCEnabled( i ), 2 );
    //   }
    }
    return true;
diff --git a/src/TNL/Math.h b/src/TNL/Math.h
index b7591bf65942b8070ec90f6d8a440cecab4807b6..321cc7ce39e0d0beb8c3c3c2a5ab3ac7bbfddbdd 100644
--- a/src/TNL/Math.h
+++ b/src/TNL/Math.h
@@ -15,7 +15,7 @@
 #include <algorithm>
 
 #include <TNL/TypeTraits.h>
-#include <TNL/Devices/CudaCallable.h>
+#include <TNL/Cuda/CudaCallable.h>
 
 namespace TNL {
 
diff --git a/src/TNL/Matrices/AdEllpack_impl.h b/src/TNL/Matrices/AdEllpack_impl.h
index e754eca68afe082c0958c4ab3bdcbb7f546bcdde..a0f293b3df94afcfeda8124f8e1d8173cb4c7718 100644
--- a/src/TNL/Matrices/AdEllpack_impl.h
+++ b/src/TNL/Matrices/AdEllpack_impl.h
@@ -936,14 +936,14 @@ void AdEllpack< Real, Device, Index >::spmvCuda2( const InVector& inVector,
                                                   OutVector& outVector,
                                                   const int gridIdx ) const
 {
-    IndexType globalIdx = ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+    IndexType globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
     IndexType warpIdx = globalIdx >> 5;
     IndexType inWarpIdx = globalIdx & ( this->warpSize - 1 );
     if( globalIdx >= this->reduceMap.getSize() )
 	return;
 
     const int blockSize = 256;
-    Real* temp = Devices::Cuda::getSharedMemory< Real >();
+    Real* temp = Cuda::getSharedMemory< Real >();
     __shared__ IndexType reduceMap[ blockSize ];
     reduceMap[ threadIdx.x ] = this->reduceMap[ globalIdx ];
     temp[ threadIdx.x ] = 0.0;
@@ -984,14 +984,14 @@ void AdEllpack< Real, Device, Index >::spmvCuda4( const InVector& inVector,
                                                            OutVector& outVector,
                                                            const int gridIdx ) const
 {
-    IndexType globalIdx = ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+    IndexType globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
     IndexType warpIdx = globalIdx >> 5;
     IndexType inWarpIdx = globalIdx & ( this->warpSize - 1 );
     if( globalIdx >= this->reduceMap.getSize() )
 	return;
 
     const int blockSize = 192;
-    Real* temp = Devices::Cuda::getSharedMemory< Real >();
+    Real* temp = Cuda::getSharedMemory< Real >();
     __shared__ IndexType reduceMap[ blockSize ];
     reduceMap[ threadIdx.x ] = this->reduceMap[ globalIdx ];
     temp[ threadIdx.x ] = 0.0;
@@ -1043,14 +1043,14 @@ void AdEllpack< Real, Device, Index >::spmvCuda8( const InVector& inVector,
                                                            OutVector& outVector,
                                                            const int gridIdx ) const
 {
-    IndexType globalIdx = ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+    IndexType globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
     IndexType warpIdx = globalIdx >> 5;
     IndexType inWarpIdx = globalIdx & ( this->warpSize - 1 );
     if( globalIdx >= this->reduceMap.getSize() )
 	return;
 
     const int blockSize = 128;
-    Real* temp = Devices::Cuda::getSharedMemory< Real >();
+    Real* temp = Cuda::getSharedMemory< Real >();
     __shared__ IndexType reduceMap[ blockSize ];
     reduceMap[ threadIdx.x ] = this->reduceMap[ globalIdx ];
     temp[ threadIdx.x ] = 0.0;
@@ -1101,14 +1101,14 @@ void AdEllpack< Real, Device, Index >::spmvCuda16( const InVector& inVector,
                                                             OutVector& outVector,
                                                             const int gridIdx ) const
 {
-    IndexType globalIdx = ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+    IndexType globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
     IndexType warpIdx = globalIdx >> 5;
     IndexType inWarpIdx = globalIdx & ( this->warpSize - 1 );
     if( globalIdx >= this->reduceMap.getSize() )
 	return;
 
     const int blockSize = 128;
-    Real* temp = Devices::Cuda::getSharedMemory< Real >();
+    Real* temp = Cuda::getSharedMemory< Real >();
     __shared__ IndexType reduceMap[ blockSize ];
     reduceMap[ threadIdx.x ] = this->reduceMap[ globalIdx ];
     temp[ threadIdx.x ] = 0.0;
@@ -1159,14 +1159,14 @@ void AdEllpack< Real, Device, Index >::spmvCuda32( const InVector& inVector,
                                                             OutVector& outVector,
                                                             const int gridIdx ) const
 {
-    IndexType globalIdx = ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+    IndexType globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
     IndexType warpIdx = globalIdx >> 5;
     IndexType inWarpIdx = globalIdx & ( this->warpSize - 1 );
     if( globalIdx >= this->reduceMap.getSize() )
 	return;
 
     const int blockSize = 96;
-    Real* temp = Devices::Cuda::getSharedMemory< Real >();
+    Real* temp = Cuda::getSharedMemory< Real >();
     __shared__ IndexType reduceMap[ blockSize ];
     reduceMap[ threadIdx.x ] = this->reduceMap[ globalIdx ];
     temp[ threadIdx.x ] = 0.0;
@@ -1292,18 +1292,18 @@ public:
     {
         typedef AdEllpack< Real, Devices::Cuda, Index > Matrix;
 	typedef typename Matrix::IndexType IndexType;
-	Matrix* kernel_this = Devices::Cuda::passToDevice( matrix );
-	InVector* kernel_inVector = Devices::Cuda::passToDevice( inVector );
-	OutVector* kernel_outVector = Devices::Cuda::passToDevice( outVector );
+	Matrix* kernel_this = Cuda::passToDevice( matrix );
+	InVector* kernel_inVector = Cuda::passToDevice( inVector );
+	OutVector* kernel_outVector = Cuda::passToDevice( outVector );
 	if( matrix.totalLoad < 2 )
 	{
-	    dim3 blockSize( 256 ), cudaGridSize( Devices::Cuda::getMaxGridSize() );
+	    dim3 blockSize( 256 ), cudaGridSize( Cuda::getMaxGridSize() );
 	    IndexType cudaBlocks = roundUpDivision( matrix.reduceMap.getSize(), blockSize.x );
-	    IndexType cudaGrids = roundUpDivision( cudaBlocks, Devices::Cuda::getMaxGridSize() );
+	    IndexType cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() );
 	    for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ )
 	    {
 	        if( gridIdx == cudaGrids - 1 )
-		    cudaGridSize.x = cudaBlocks % Devices::Cuda::getMaxGridSize();
+		    cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize();
 	        const int sharedMemory = blockSize.x * sizeof( Real );
 	        AdEllpackVectorProductCuda2< Real, Index, InVector, OutVector >
                                                     <<< cudaGridSize, blockSize, sharedMemory >>>
@@ -1313,20 +1313,20 @@ public:
                                                       gridIdx );
 	    }
 	    TNL_CHECK_CUDA_DEVICE;
-	    Devices::Cuda::freeFromDevice( kernel_this );
-	    Devices::Cuda::freeFromDevice( kernel_inVector );
-	    Devices::Cuda::freeFromDevice( kernel_outVector );
+	    Cuda::freeFromDevice( kernel_this );
+	    Cuda::freeFromDevice( kernel_inVector );
+	    Cuda::freeFromDevice( kernel_outVector );
 	    TNL_CHECK_CUDA_DEVICE;
 	}
 	else if( matrix.totalLoad < 4 )
 	{
-	    dim3 blockSize( 192 ), cudaGridSize( Devices::Cuda::getMaxGridSize() );
+	    dim3 blockSize( 192 ), cudaGridSize( Cuda::getMaxGridSize() );
 	    IndexType cudaBlocks = roundUpDivision( matrix.reduceMap.getSize(), blockSize.x );
-	    IndexType cudaGrids = roundUpDivision( cudaBlocks, Devices::Cuda::getMaxGridSize() );
+	    IndexType cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() );
 	    for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ )
 	    {
 	        if( gridIdx == cudaGrids - 1 )
-		    cudaGridSize.x = cudaBlocks % Devices::Cuda::getMaxGridSize();
+		    cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize();
 	        const int sharedMemory = blockSize.x * sizeof( Real );
 	        AdEllpackVectorProductCuda4< Real, Index, InVector, OutVector >
                                                     <<< cudaGridSize, blockSize, sharedMemory >>>
@@ -1336,20 +1336,20 @@ public:
                                                       gridIdx );
 	    }
 	    TNL_CHECK_CUDA_DEVICE;
-	    Devices::Cuda::freeFromDevice( kernel_this );
-	    Devices::Cuda::freeFromDevice( kernel_inVector );
-	    Devices::Cuda::freeFromDevice( kernel_outVector );
+	    Cuda::freeFromDevice( kernel_this );
+	    Cuda::freeFromDevice( kernel_inVector );
+	    Cuda::freeFromDevice( kernel_outVector );
 	    TNL_CHECK_CUDA_DEVICE;
 	}
 	else if( matrix.totalLoad < 8 )
 	{
-	    dim3 blockSize( 128 ), cudaGridSize( Devices::Cuda::getMaxGridSize() );
+	    dim3 blockSize( 128 ), cudaGridSize( Cuda::getMaxGridSize() );
 	    IndexType cudaBlocks = roundUpDivision( matrix.reduceMap.getSize(), blockSize.x );
-	    IndexType cudaGrids = roundUpDivision( cudaBlocks, Devices::Cuda::getMaxGridSize() );
+	    IndexType cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() );
 	    for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ )
 	    {
 	        if( gridIdx == cudaGrids - 1 )
-		    cudaGridSize.x = cudaBlocks % Devices::Cuda::getMaxGridSize();
+		    cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize();
 	        const int sharedMemory = blockSize.x * sizeof( Real );
 	        AdEllpackVectorProductCuda8< Real, Index, InVector, OutVector >
                                                     <<< cudaGridSize, blockSize, sharedMemory >>>
@@ -1359,20 +1359,20 @@ public:
                                                       gridIdx );
 	    }
 	    TNL_CHECK_CUDA_DEVICE;
-	    Devices::Cuda::freeFromDevice( kernel_this );
-	    Devices::Cuda::freeFromDevice( kernel_inVector );
-	    Devices::Cuda::freeFromDevice( kernel_outVector );
+	    Cuda::freeFromDevice( kernel_this );
+	    Cuda::freeFromDevice( kernel_inVector );
+	    Cuda::freeFromDevice( kernel_outVector );
 	    TNL_CHECK_CUDA_DEVICE;
 	}
 	else if( matrix.totalLoad < 16 )
 	{
-	    dim3 blockSize( 128 ), cudaGridSize( Devices::Cuda::getMaxGridSize() );
+	    dim3 blockSize( 128 ), cudaGridSize( Cuda::getMaxGridSize() );
 	    IndexType cudaBlocks = roundUpDivision( matrix.reduceMap.getSize(), blockSize.x );
-	    IndexType cudaGrids = roundUpDivision( cudaBlocks, Devices::Cuda::getMaxGridSize() );
+	    IndexType cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() );
 	    for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ )
 	    {
 	        if( gridIdx == cudaGrids - 1 )
-		    cudaGridSize.x = cudaBlocks % Devices::Cuda::getMaxGridSize();
+		    cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize();
 	        const int sharedMemory = blockSize.x * sizeof( Real );
 	        AdEllpackVectorProductCuda16< Real, Index, InVector, OutVector >
                                                      <<< cudaGridSize, blockSize, sharedMemory >>>
@@ -1382,20 +1382,20 @@ public:
                                                        gridIdx );
 	    }
 	    TNL_CHECK_CUDA_DEVICE;
-	    Devices::Cuda::freeFromDevice( kernel_this );
-	    Devices::Cuda::freeFromDevice( kernel_inVector );
-	    Devices::Cuda::freeFromDevice( kernel_outVector );
+	    Cuda::freeFromDevice( kernel_this );
+	    Cuda::freeFromDevice( kernel_inVector );
+	    Cuda::freeFromDevice( kernel_outVector );
 	    TNL_CHECK_CUDA_DEVICE;
 	}
 	else
 	{
-	    dim3 blockSize( 96 ), cudaGridSize( Devices::Cuda::getMaxGridSize() );
+	    dim3 blockSize( 96 ), cudaGridSize( Cuda::getMaxGridSize() );
 	    IndexType cudaBlocks = roundUpDivision( matrix.reduceMap.getSize(), blockSize.x );
-	    IndexType cudaGrids = roundUpDivision( cudaBlocks, Devices::Cuda::getMaxGridSize() );
+	    IndexType cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() );
 	    for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ )
 	    {
 	        if( gridIdx == cudaGrids - 1 )
-		    cudaGridSize.x = cudaBlocks % Devices::Cuda::getMaxGridSize();
+		    cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize();
 	        const int sharedMemory = blockSize.x * sizeof( Real );
 	        AdEllpackVectorProductCuda32< Real, Index, InVector, OutVector >
                                                      <<< cudaGridSize, blockSize, sharedMemory >>>
@@ -1405,9 +1405,9 @@ public:
                                                        gridIdx );
 	    }
 	    TNL_CHECK_CUDA_DEVICE;
-	    Devices::Cuda::freeFromDevice( kernel_this );
-	    Devices::Cuda::freeFromDevice( kernel_inVector );
-	    Devices::Cuda::freeFromDevice( kernel_outVector );
+	    Cuda::freeFromDevice( kernel_this );
+	    Cuda::freeFromDevice( kernel_inVector );
+	    Cuda::freeFromDevice( kernel_outVector );
 	    TNL_CHECK_CUDA_DEVICE;
 	}
     }
diff --git a/src/TNL/Matrices/BiEllpackSymmetric_impl.h b/src/TNL/Matrices/BiEllpackSymmetric_impl.h
index 47b34282822c48d8e7fc384698c164b0036b9fc6..0af180c0e8c2c54d2c4fdb304fa3e2813d76786c 100644
--- a/src/TNL/Matrices/BiEllpackSymmetric_impl.h
+++ b/src/TNL/Matrices/BiEllpackSymmetric_impl.h
@@ -1053,7 +1053,7 @@ void BiEllpackSymmetric< Real, Device, Index, StripSize >::spmvCuda( const InVec
     IndexType bisection = this->warpSize;
     IndexType groupBegin = strip * ( this->logWarpSize + 1 );
 
-    Real* temp = Devices::Cuda::getSharedMemory< Real >();
+    Real* temp = Cuda::getSharedMemory< Real >();
     __shared__ Real results[ cudaBlockSize ];
     results[ threadIdx.x ] = 0.0;
     IndexType elementPtr = ( this->groupPointers[ groupBegin ] << this->logWarpSize ) + inWarpIdx;
@@ -1274,7 +1274,7 @@ void BiEllpackSymmetricVectorProductCuda( const BiEllpackSymmetric< Real, Device
                                           int gridIdx,
                                           const int warpSize )
 {
-    Index globalIdx = ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+    Index globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
     matrix->spmvCuda( *inVector, *outVector, globalIdx );
 }
 #endif
@@ -1394,7 +1394,7 @@ void performRowBubbleSortCuda( BiEllpackSymmetric< Real, Devices::Cuda, Index, S
                                const typename BiEllpackSymmetric< Real, Devices::Cuda, Index, StripSize >::RowLengthsVector* rowLengths,
                                int gridIdx )
 {
-    const Index stripIdx = gridIdx * Devices::Cuda::getMaxGridSize() * blockDim.x + blockIdx.x * blockDim.x + threadIdx.x;
+    const Index stripIdx = gridIdx * Cuda::getMaxGridSize() * blockDim.x + blockIdx.x * blockDim.x + threadIdx.x;
     matrix->performRowBubbleSortCudaKernel( *rowLengths, stripIdx );
 }
 #endif
@@ -1409,7 +1409,7 @@ void computeColumnSizesCuda( BiEllpackSymmetric< Real, Devices::Cuda, Index, Str
                              const Index numberOfStrips,
                              int gridIdx )
 {
-    const Index stripIdx = gridIdx * Devices::Cuda::getMaxGridSize() * blockDim.x + blockIdx.x * blockDim.x + threadIdx.x;
+    const Index stripIdx = gridIdx * Cuda::getMaxGridSize() * blockDim.x + blockIdx.x * blockDim.x + threadIdx.x;
     matrix->computeColumnSizesCudaKernel( *rowLengths, numberOfStrips, stripIdx );
 }
 #endif
@@ -1513,23 +1513,23 @@ public:
         Index numberOfStrips = matrix.virtualRows / StripSize;
         typedef BiEllpackSymmetric< Real, Devices::Cuda, Index, StripSize > Matrix;
         typedef typename Matrix::RowLengthsVector CompressedRowLengthsVector;
-        Matrix* kernel_this = Devices::Cuda::passToDevice( matrix );
-        CompressedRowLengthsVector* kernel_rowLengths = Devices::Cuda::passToDevice( rowLengths );
-        dim3 cudaBlockSize( 256 ), cudaGridSize( Devices::Cuda::getMaxGridSize() );
+        Matrix* kernel_this = Cuda::passToDevice( matrix );
+        CompressedRowLengthsVector* kernel_rowLengths = Cuda::passToDevice( rowLengths );
+        dim3 cudaBlockSize( 256 ), cudaGridSize( Cuda::getMaxGridSize() );
         const Index cudaBlocks = roundUpDivision( numberOfStrips, cudaBlockSize.x );
-        const Index cudaGrids = roundUpDivision( cudaBlocks, Devices::Cuda::getMaxGridSize() );
+        const Index cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() );
         for( int gridIdx = 0; gridIdx < cudaGrids; gridIdx++ )
         {
              if( gridIdx == cudaGrids - 1 )
-                 cudaGridSize.x = cudaBlocks % Devices::Cuda::getMaxGridSize();
+                 cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize();
              performRowBubbleSortCuda< Real, Index, StripSize >
                                      <<< cudaGridSize, cudaBlockSize >>>
                                      ( kernel_this,
                                        kernel_rowLengths,
                                        gridIdx );
         }
-        Devices::Cuda::freeFromDevice( kernel_this );
-        Devices::Cuda::freeFromDevice( kernel_rowLengths );
+        Cuda::freeFromDevice( kernel_this );
+        Cuda::freeFromDevice( kernel_rowLengths );
         TNL_CHECK_CUDA_DEVICE;
 #endif
     }
@@ -1544,15 +1544,15 @@ public:
         const Index numberOfStrips = matrix.virtualRows / StripSize;
         typedef BiEllpackSymmetric< Real, Devices::Cuda, Index, StripSize > Matrix;
         typedef typename Matrix::RowLengthsVector CompressedRowLengthsVector;
-        Matrix* kernel_this = Devices::Cuda::passToDevice( matrix );
-        CompressedRowLengthsVector* kernel_rowLengths = Devices::Cuda::passToDevice( rowLengths );
-        dim3 cudaBlockSize( 256 ), cudaGridSize( Devices::Cuda::getMaxGridSize() );
+        Matrix* kernel_this = Cuda::passToDevice( matrix );
+        CompressedRowLengthsVector* kernel_rowLengths = Cuda::passToDevice( rowLengths );
+        dim3 cudaBlockSize( 256 ), cudaGridSize( Cuda::getMaxGridSize() );
         const Index cudaBlocks = roundUpDivision( numberOfStrips, cudaBlockSize.x );
-        const Index cudaGrids = roundUpDivision( cudaBlocks, Devices::Cuda::getMaxGridSize() );
+        const Index cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() );
         for( int gridIdx = 0; gridIdx < cudaGrids; gridIdx++ )
         {
              if( gridIdx == cudaGrids - 1 )
-                 cudaGridSize.x = cudaBlocks % Devices::Cuda::getMaxGridSize();
+                 cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize();
              computeColumnSizesCuda< Real, Index, StripSize >
                                    <<< cudaGridSize, cudaBlockSize >>>
                                    ( kernel_this,
@@ -1560,8 +1560,8 @@ public:
                                      numberOfStrips,
                                      gridIdx );
         }
-        Devices::Cuda::freeFromDevice( kernel_this );
-        Devices::Cuda::freeFromDevice( kernel_rowLengths );
+        Cuda::freeFromDevice( kernel_this );
+        Cuda::freeFromDevice( kernel_rowLengths );
         TNL_CHECK_CUDA_DEVICE;
 #endif
     }
@@ -1579,16 +1579,16 @@ public:
 #ifdef HAVE_CUDA
         typedef BiEllpackSymmetric< Real, Devices::Cuda, Index > Matrix;
         typedef typename Matrix::IndexType IndexType;
-        Matrix* kernel_this = Devices::Cuda::passToDevice( matrix );
-        InVector* kernel_inVector = Devices::Cuda::passToDevice( inVector );
-        OutVector* kernel_outVector = Devices::Cuda::passToDevice( outVector );
-        dim3 cudaBlockSize( 256 ), cudaGridSize( Devices::Cuda::getMaxGridSize() );
+        Matrix* kernel_this = Cuda::passToDevice( matrix );
+        InVector* kernel_inVector = Cuda::passToDevice( inVector );
+        OutVector* kernel_outVector = Cuda::passToDevice( outVector );
+        dim3 cudaBlockSize( 256 ), cudaGridSize( Cuda::getMaxGridSize() );
         const IndexType cudaBlocks = roundUpDivision( matrix.getRows(), cudaBlockSize.x );
-        const IndexType cudaGrids = roundUpDivision( cudaBlocks, Devices::Cuda::getMaxGridSize() );
+        const IndexType cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() );
         for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ )
         {
             if( gridIdx == cudaGrids - 1 )
-                cudaGridSize.x = cudaBlocks % Devices::Cuda::getMaxGridSize();
+                cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize();
             const int sharedMemory = cudaBlockSize.x * sizeof( Real );
             BiEllpackSymmetricVectorProductCuda< Real, Index, StripSize, InVector, OutVector >
                                                <<< cudaGridSize, cudaBlockSize, sharedMemory >>>
@@ -1598,9 +1598,9 @@ public:
                                                  gridIdx,
                                                  matrix.warpSize );
         }
-        Devices::Cuda::freeFromDevice( kernel_this );
-        Devices::Cuda::freeFromDevice( kernel_inVector );
-        Devices::Cuda::freeFromDevice( kernel_outVector );
+        Cuda::freeFromDevice( kernel_this );
+        Cuda::freeFromDevice( kernel_inVector );
+        Cuda::freeFromDevice( kernel_outVector );
         TNL_CHECK_CUDA_DEVICE;
 #endif
     }
diff --git a/src/TNL/Matrices/BiEllpack_impl.h b/src/TNL/Matrices/BiEllpack_impl.h
index ac02853616f17965458e9e14f8b225718d82354a..6af80899578528bd9d958c8515e3c228db4778d0 100644
--- a/src/TNL/Matrices/BiEllpack_impl.h
+++ b/src/TNL/Matrices/BiEllpack_impl.h
@@ -1057,7 +1057,7 @@ void BiEllpack< Real, Device, Index, StripSize >::spmvCuda( const InVector& inVe
     IndexType bisection = this->warpSize;
     IndexType groupBegin = strip * ( this->logWarpSize + 1 );
 
-    Real* temp = Devices::Cuda::getSharedMemory< Real >();
+    Real* temp = Cuda::getSharedMemory< Real >();
     __shared__ Real results[ cudaBlockSize ];
     results[ threadIdx.x ] = 0.0;
     IndexType elementPtr = ( this->groupPointers[ groupBegin ] << this->logWarpSize ) + inWarpIdx;
@@ -1277,7 +1277,7 @@ void BiEllpackVectorProductCuda( const BiEllpack< Real, Devices::Cuda, Index, St
 										  int gridIdx,
 										  const int warpSize )
 {
-	Index globalIdx = ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+	Index globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
 	matrix->spmvCuda( *inVector, *outVector, globalIdx );
 }
 #endif
@@ -1397,7 +1397,7 @@ void performRowBubbleSortCuda( BiEllpack< Real, Devices::Cuda, Index, StripSize
 							   const typename BiEllpack< Real, Devices::Cuda, Index, StripSize >::CompressedRowLengthsVector* rowLengths,
 							   int gridIdx )
 {
-	const Index stripIdx = gridIdx * Devices::Cuda::getMaxGridSize() * blockDim.x + blockIdx.x * blockDim.x + threadIdx.x;
+	const Index stripIdx = gridIdx * Cuda::getMaxGridSize() * blockDim.x + blockIdx.x * blockDim.x + threadIdx.x;
 	matrix->performRowBubbleSortCudaKernel( *rowLengths, stripIdx );
 }
 #endif
@@ -1412,7 +1412,7 @@ void computeColumnSizesCuda( BiEllpack< Real, Devices::Cuda, Index, StripSize >*
 							 const Index numberOfStrips,
 							 int gridIdx )
 {
-	const Index stripIdx = gridIdx * Devices::Cuda::getMaxGridSize() * blockDim.x + blockIdx.x * blockDim.x + threadIdx.x;
+	const Index stripIdx = gridIdx * Cuda::getMaxGridSize() * blockDim.x + blockIdx.x * blockDim.x + threadIdx.x;
 	matrix->computeColumnSizesCudaKernel( *rowLengths, numberOfStrips, stripIdx );
 }
 #endif
@@ -1516,23 +1516,23 @@ public:
 		Index numberOfStrips = matrix.virtualRows / StripSize;
 		typedef BiEllpack< Real, Devices::Cuda, Index, StripSize > Matrix;
 		typedef typename Matrix::CompressedRowLengthsVector CompressedRowLengthsVector;
-		Matrix* kernel_this = Devices::Cuda::passToDevice( matrix );
-		CompressedRowLengthsVector* kernel_rowLengths = Devices::Cuda::passToDevice( rowLengths );
-		dim3 cudaBlockSize( 256 ), cudaGridSize( Devices::Cuda::getMaxGridSize() );
+		Matrix* kernel_this = Cuda::passToDevice( matrix );
+		CompressedRowLengthsVector* kernel_rowLengths = Cuda::passToDevice( rowLengths );
+		dim3 cudaBlockSize( 256 ), cudaGridSize( Cuda::getMaxGridSize() );
 		const Index cudaBlocks = roundUpDivision( numberOfStrips, cudaBlockSize.x );
-		const Index cudaGrids = roundUpDivision( cudaBlocks, Devices::Cuda::getMaxGridSize() );
+		const Index cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() );
 		for( int gridIdx = 0; gridIdx < cudaGrids; gridIdx++ )
 		{
 		     if( gridIdx == cudaGrids - 1 )
-		         cudaGridSize.x = cudaBlocks % Devices::Cuda::getMaxGridSize();
+		         cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize();
 		     performRowBubbleSortCuda< Real, Index, StripSize >
 		     	 	 	 	 	 	 <<< cudaGridSize, cudaBlockSize >>>
 		                             ( kernel_this,
 		                               kernel_rowLengths,
 		                               gridIdx );
 		}
-		Devices::Cuda::freeFromDevice( kernel_this );
-		Devices::Cuda::freeFromDevice( kernel_rowLengths );
+		Cuda::freeFromDevice( kernel_this );
+		Cuda::freeFromDevice( kernel_rowLengths );
 		TNL_CHECK_CUDA_DEVICE;
 #endif
 	}
@@ -1547,15 +1547,15 @@ public:
 		const Index numberOfStrips = matrix.virtualRows / StripSize;
 		typedef BiEllpack< Real, Devices::Cuda, Index, StripSize > Matrix;
 		typedef typename Matrix::CompressedRowLengthsVector CompressedRowLengthsVector;
-		Matrix* kernel_this = Devices::Cuda::passToDevice( matrix );
-		CompressedRowLengthsVector* kernel_rowLengths = Devices::Cuda::passToDevice( rowLengths );
-		dim3 cudaBlockSize( 256 ), cudaGridSize( Devices::Cuda::getMaxGridSize() );
+		Matrix* kernel_this = Cuda::passToDevice( matrix );
+		CompressedRowLengthsVector* kernel_rowLengths = Cuda::passToDevice( rowLengths );
+		dim3 cudaBlockSize( 256 ), cudaGridSize( Cuda::getMaxGridSize() );
 		const Index cudaBlocks = roundUpDivision( numberOfStrips, cudaBlockSize.x );
-		const Index cudaGrids = roundUpDivision( cudaBlocks, Devices::Cuda::getMaxGridSize() );
+		const Index cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() );
 		for( int gridIdx = 0; gridIdx < cudaGrids; gridIdx++ )
 		{
 		     if( gridIdx == cudaGrids - 1 )
-		         cudaGridSize.x = cudaBlocks % Devices::Cuda::getMaxGridSize();
+		         cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize();
 		     computeColumnSizesCuda< Real, Index, StripSize >
 		     	 	 	 	 	   <<< cudaGridSize, cudaBlockSize >>>
 		                           ( kernel_this,
@@ -1563,8 +1563,8 @@ public:
 		                             numberOfStrips,
 		                             gridIdx );
         }
-		Devices::Cuda::freeFromDevice( kernel_this );
-		Devices::Cuda::freeFromDevice( kernel_rowLengths );
+		Cuda::freeFromDevice( kernel_this );
+		Cuda::freeFromDevice( kernel_rowLengths );
 		TNL_CHECK_CUDA_DEVICE;
 #endif
 	}
@@ -1582,16 +1582,16 @@ public:
 #ifdef HAVE_CUDA
 		typedef BiEllpack< Real, Devices::Cuda, Index > Matrix;
 		typedef typename Matrix::IndexType IndexType;
-		Matrix* kernel_this = Devices::Cuda::passToDevice( matrix );
-		InVector* kernel_inVector = Devices::Cuda::passToDevice( inVector );
-		OutVector* kernel_outVector = Devices::Cuda::passToDevice( outVector );
-		dim3 cudaBlockSize( 256 ), cudaGridSize( Devices::Cuda::getMaxGridSize() );
+		Matrix* kernel_this = Cuda::passToDevice( matrix );
+		InVector* kernel_inVector = Cuda::passToDevice( inVector );
+		OutVector* kernel_outVector = Cuda::passToDevice( outVector );
+		dim3 cudaBlockSize( 256 ), cudaGridSize( Cuda::getMaxGridSize() );
 		const IndexType cudaBlocks = roundUpDivision( matrix.getRows(), cudaBlockSize.x );
-		const IndexType cudaGrids = roundUpDivision( cudaBlocks, Devices::Cuda::getMaxGridSize() );
+		const IndexType cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() );
 		for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ )
 		{
 			if( gridIdx == cudaGrids - 1 )
-				cudaGridSize.x = cudaBlocks % Devices::Cuda::getMaxGridSize();
+				cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize();
 			const int sharedMemory = cudaBlockSize.x * sizeof( Real );
 			BiEllpackVectorProductCuda< Real, Index, StripSize, InVector, OutVector >
 			                                   <<< cudaGridSize, cudaBlockSize, sharedMemory >>>
@@ -1601,9 +1601,9 @@ public:
 			                                     gridIdx,
 			                                     matrix.warpSize );
 		}
-		Devices::Cuda::freeFromDevice( kernel_this );
-		Devices::Cuda::freeFromDevice( kernel_inVector );
-		Devices::Cuda::freeFromDevice( kernel_outVector );
+		Cuda::freeFromDevice( kernel_this );
+		Cuda::freeFromDevice( kernel_inVector );
+		Cuda::freeFromDevice( kernel_outVector );
 		TNL_CHECK_CUDA_DEVICE;
 #endif
     }
diff --git a/src/TNL/Matrices/CSR_impl.h b/src/TNL/Matrices/CSR_impl.h
index 1e28157f90dc21b49532dc717a37534df82a2c9d..86b10119c249860cd8469f12ec89e12cd3d70aa9 100644
--- a/src/TNL/Matrices/CSR_impl.h
+++ b/src/TNL/Matrices/CSR_impl.h
@@ -33,7 +33,7 @@ template< typename Real,
           typename Index >
 CSR< Real, Device, Index >::CSR()
 : spmvCudaKernel( hybrid ),
-  cudaWarpSize( 32 ), //Devices::Cuda::getWarpSize() )
+  cudaWarpSize( 32 ), //Cuda::getWarpSize() )
   hybridModeSplit( 4 )
 {
 };
@@ -145,16 +145,16 @@ Index CSR< Real, Device, Index >::getNonZeroRowLength( const IndexType row ) con
 //       //  (gdb) p rowPointers.getElement(0)
 //       //    Attempt to take address of value not located in memory.
 //       IndexType resultHost ( 0 );
-//       IndexType* resultCuda = Devices::Cuda::passToDevice( resultHost );
+//       IndexType* resultCuda = Cuda::passToDevice( resultHost );
 //       // PROBLEM: If the second parameter of getNonZeroRowLengthCudaKernel is '&resultCuda', the following issue is thrown:
 //       //          'error: no instance of function template "TNL::Matrices::getNonZeroRowLengthCudaKernel" matches the argument list'
 //       TNL::Matrices::getNonZeroRowLengthCudaKernel< ConstMatrixRow, IndexType ><<< 1, 1 >>>( matrixRow, resultCuda ); // matrixRow works fine, tested them both separately
 //       delete []cols;
 //       delete []vals;
 //       std::cout << "Checkpoint BEFORE passFromDevice" << std::endl;
-//       resultHost = Devices::Cuda::passFromDevice( resultCuda ); // This causes a crash: Illegal memory address in Cuda_impl.h at TNL_CHECK_CUDA_DEVICE
+//       resultHost = Cuda::passFromDevice( resultCuda ); // This causes a crash: Illegal memory address in Cuda_impl.h at TNL_CHECK_CUDA_DEVICE
 //       std::cout << "Checkpoint AFTER passFromDevice" << std::endl;
-//       Devices::Cuda::freeFromDevice( resultCuda );
+//       Cuda::freeFromDevice( resultCuda );
 //       return resultHost;
 //   }
 }
@@ -713,7 +713,7 @@ void CSR< Real, Device, Index >::spmvCudaVectorized( const InVector& inVector,
                                                               const IndexType warpEnd,
                                                               const IndexType inWarpIdx ) const
 {
-   volatile Real* aux = Devices::Cuda::getSharedMemory< Real >();
+   volatile Real* aux = Cuda::getSharedMemory< Real >();
    for( IndexType row = warpStart; row < warpEnd; row++ )
    {
       aux[ threadIdx.x ] = 0.0;
@@ -753,7 +753,7 @@ void CSR< Real, Device, Index >::vectorProductCuda( const InVector& inVector,
                                                              OutVector& outVector,
                                                              int gridIdx ) const
 {
-   IndexType globalIdx = ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   IndexType globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
    const IndexType warpStart = warpSize * ( globalIdx / warpSize );
    const IndexType warpEnd = min( warpStart + warpSize, this->getRows() );
    const IndexType inWarpIdx = globalIdx % warpSize;
@@ -764,7 +764,7 @@ void CSR< Real, Device, Index >::vectorProductCuda( const InVector& inVector,
    /****
     * Hybrid mode
     */
-   const Index firstRow = ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x;
+   const Index firstRow = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x;
    const IndexType lastRow = min( this->getRows(), firstRow + blockDim. x );
    const IndexType nonzerosPerRow = ( this->rowPointers[ lastRow ] - this->rowPointers[ firstRow ] ) /
                                     ( lastRow - firstRow );
@@ -828,7 +828,7 @@ __global__ void CSRVectorProductCudaKernel( const CSR< Real, Devices::Cuda, Inde
 {
    typedef CSR< Real, Devices::Cuda, Index > Matrix;
    static_assert( std::is_same< typename Matrix::DeviceType, Devices::Cuda >::value, "" );
-   const typename Matrix::IndexType rowIdx = ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   const typename Matrix::IndexType rowIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
    if( matrix->getCudaKernelType() == Matrix::scalar )
    {
       if( rowIdx < matrix->getRows() )
@@ -854,17 +854,17 @@ void CSRVectorProductCuda( const CSR< Real, Devices::Cuda, Index >& matrix,
 #ifdef HAVE_CUDA
    typedef CSR< Real, Devices::Cuda, Index > Matrix;
    typedef typename Matrix::IndexType IndexType;
-   Matrix* kernel_this = Devices::Cuda::passToDevice( matrix );
-   InVector* kernel_inVector = Devices::Cuda::passToDevice( inVector );
-   OutVector* kernel_outVector = Devices::Cuda::passToDevice( outVector );
+   Matrix* kernel_this = Cuda::passToDevice( matrix );
+   InVector* kernel_inVector = Cuda::passToDevice( inVector );
+   OutVector* kernel_outVector = Cuda::passToDevice( outVector );
    TNL_CHECK_CUDA_DEVICE;
-   dim3 cudaBlockSize( 256 ), cudaGridSize( Devices::Cuda::getMaxGridSize() );
+   dim3 cudaBlockSize( 256 ), cudaGridSize( Cuda::getMaxGridSize() );
    const IndexType cudaBlocks = roundUpDivision( matrix.getRows(), cudaBlockSize.x );
-   const IndexType cudaGrids = roundUpDivision( cudaBlocks, Devices::Cuda::getMaxGridSize() );
+   const IndexType cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() );
    for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ )
    {
       if( gridIdx == cudaGrids - 1 )
-         cudaGridSize.x = cudaBlocks % Devices::Cuda::getMaxGridSize();
+         cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize();
       const int sharedMemory = cudaBlockSize.x * sizeof( Real );
       if( matrix.getCudaWarpSize() == 32 )
          CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 32 >
@@ -911,9 +911,9 @@ void CSRVectorProductCuda( const CSR< Real, Devices::Cuda, Index >& matrix,
 
    }
    TNL_CHECK_CUDA_DEVICE;
-   Devices::Cuda::freeFromDevice( kernel_this );
-   Devices::Cuda::freeFromDevice( kernel_inVector );
-   Devices::Cuda::freeFromDevice( kernel_outVector );
+   Cuda::freeFromDevice( kernel_this );
+   Cuda::freeFromDevice( kernel_inVector );
+   Cuda::freeFromDevice( kernel_outVector );
    TNL_CHECK_CUDA_DEVICE;
 #endif
 }
diff --git a/src/TNL/Matrices/ChunkedEllpack_impl.h b/src/TNL/Matrices/ChunkedEllpack_impl.h
index 5bc0cfd27a2aa9e9519f57cc732cfe63605f6478..89e525e87bde4a752d2bd7cb4e17a4f27a79a79d 100644
--- a/src/TNL/Matrices/ChunkedEllpack_impl.h
+++ b/src/TNL/Matrices/ChunkedEllpack_impl.h
@@ -1123,7 +1123,7 @@ __device__ void ChunkedEllpack< Real, Device, Index >::computeSliceVectorProduct
 {
    static_assert( std::is_same < DeviceType, Devices::Cuda >::value, "" );
 
-   RealType* chunkProducts = Devices::Cuda::getSharedMemory< RealType >();
+   RealType* chunkProducts = Cuda::getSharedMemory< RealType >();
    ChunkedEllpackSliceInfo* sliceInfo = ( ChunkedEllpackSliceInfo* ) & chunkProducts[ blockDim.x ];
 
    if( threadIdx.x == 0 )
@@ -1403,7 +1403,7 @@ __global__ void ChunkedEllpackVectorProductCudaKernel( const ChunkedEllpack< Rea
                                                                 OutVector* outVector,
                                                                 int gridIdx )
 {
-   const Index sliceIdx = gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x;
+   const Index sliceIdx = gridIdx * Cuda::getMaxGridSize() + blockIdx.x;
    if( sliceIdx < matrix->getNumberOfSlices() )
       matrix->computeSliceVectorProduct( inVector, outVector, sliceIdx );
 
@@ -1456,19 +1456,19 @@ class ChunkedEllpackDeviceDependentCode< Devices::Cuda >
             typedef ChunkedEllpack< Real, Devices::Cuda, Index > Matrix;
             typedef Index IndexType;
             typedef Real RealType;
-            Matrix* kernel_this = Devices::Cuda::passToDevice( matrix );
-            InVector* kernel_inVector = Devices::Cuda::passToDevice( inVector );
-            OutVector* kernel_outVector = Devices::Cuda::passToDevice( outVector );
+            Matrix* kernel_this = Cuda::passToDevice( matrix );
+            InVector* kernel_inVector = Cuda::passToDevice( inVector );
+            OutVector* kernel_outVector = Cuda::passToDevice( outVector );
             dim3 cudaBlockSize( matrix.getNumberOfChunksInSlice() ),
-                 cudaGridSize( Devices::Cuda::getMaxGridSize() );
+                 cudaGridSize( Cuda::getMaxGridSize() );
             const IndexType cudaBlocks = matrix.getNumberOfSlices();
-            const IndexType cudaGrids = roundUpDivision( cudaBlocks, Devices::Cuda::getMaxGridSize() );
+            const IndexType cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() );
             const IndexType sharedMemory = cudaBlockSize.x * sizeof( RealType ) +
                                            sizeof( tnlChunkedEllpackSliceInfo< IndexType > );
             for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ )
             {
                if( gridIdx == cudaGrids - 1 )
-                  cudaGridSize.x = cudaBlocks % Devices::Cuda::getMaxGridSize();
+                  cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize();
                ChunkedEllpackVectorProductCudaKernel< Real, Index, InVector, OutVector >
                                                              <<< cudaGridSize, cudaBlockSize, sharedMemory  >>>
                                                              ( kernel_this,
@@ -1476,9 +1476,9 @@ class ChunkedEllpackDeviceDependentCode< Devices::Cuda >
                                                                kernel_outVector,
                                                                gridIdx );
             }
-            Devices::Cuda::freeFromDevice( kernel_this );
-            Devices::Cuda::freeFromDevice( kernel_inVector );
-            Devices::Cuda::freeFromDevice( kernel_outVector );
+            Cuda::freeFromDevice( kernel_this );
+            Cuda::freeFromDevice( kernel_inVector );
+            Cuda::freeFromDevice( kernel_outVector );
             TNL_CHECK_CUDA_DEVICE;
          #endif
       }
diff --git a/src/TNL/Matrices/Dense_impl.h b/src/TNL/Matrices/Dense_impl.h
index f5767119744ca2e24568223d5a679a7d2e340cdf..246bd09edb459e6df9749af9d1589f508c2c5806 100644
--- a/src/TNL/Matrices/Dense_impl.h
+++ b/src/TNL/Matrices/Dense_impl.h
@@ -586,20 +586,20 @@ void Dense< Real, Device, Index >::getMatrixProduct( const Matrix1& matrix1,
       const IndexType cudaBlockRows( matrixProductCudaBlockSize / tileDim );
       cudaBlockSize.x = cudaBlockColumns;
       cudaBlockSize.y = cudaBlockRows;
-      const IndexType rowGrids = roundUpDivision( rowTiles, Devices::Cuda::getMaxGridSize() );
-      const IndexType columnGrids = roundUpDivision( columnTiles, Devices::Cuda::getMaxGridSize() );
+      const IndexType rowGrids = roundUpDivision( rowTiles, Cuda::getMaxGridSize() );
+      const IndexType columnGrids = roundUpDivision( columnTiles, Cuda::getMaxGridSize() );
 
       for( IndexType gridIdx_x = 0; gridIdx_x < columnGrids; gridIdx_x++ )
          for( IndexType gridIdx_y = 0; gridIdx_y < rowGrids; gridIdx_y++ )
          {
-            cudaGridSize.x = cudaGridSize.y = Devices::Cuda::getMaxGridSize();
+            cudaGridSize.x = cudaGridSize.y = Cuda::getMaxGridSize();
             if( gridIdx_x == columnGrids - 1 )
-               cudaGridSize.x = columnTiles % Devices::Cuda::getMaxGridSize();
+               cudaGridSize.x = columnTiles % Cuda::getMaxGridSize();
             if( gridIdx_y == rowGrids - 1 )
-               cudaGridSize.y = rowTiles % Devices::Cuda::getMaxGridSize();
-            Dense* this_kernel = Devices::Cuda::passToDevice( *this );
-            Matrix1* matrix1_kernel = Devices::Cuda::passToDevice( matrix1 );
-            Matrix2* matrix2_kernel = Devices::Cuda::passToDevice( matrix2 );
+               cudaGridSize.y = rowTiles % Cuda::getMaxGridSize();
+            Dense* this_kernel = Cuda::passToDevice( *this );
+            Matrix1* matrix1_kernel = Cuda::passToDevice( matrix1 );
+            Matrix2* matrix2_kernel = Cuda::passToDevice( matrix2 );
             DenseMatrixProductKernel< Real,
                                                Index,
                                                Matrix1,
@@ -616,9 +616,9 @@ void Dense< Real, Device, Index >::getMatrixProduct( const Matrix1& matrix1,
                                                matrix2Multiplicator,
                                                gridIdx_x,
                                                gridIdx_y );
-            Devices::Cuda::freeFromDevice( this_kernel );
-            Devices::Cuda::freeFromDevice( matrix1_kernel );
-            Devices::Cuda::freeFromDevice( matrix2_kernel );
+            Cuda::freeFromDevice( this_kernel );
+            Cuda::freeFromDevice( matrix1_kernel );
+            Cuda::freeFromDevice( matrix2_kernel );
          }
 #endif
    }
@@ -669,7 +669,7 @@ __global__ void DenseTranspositionAlignedKernel( Dense< Real, Devices::Cuda, Ind
         rowBlock < tileDim;
         rowBlock += tileRowBlockSize )
    {
-      tile[ Devices::Cuda::getInterleaving( threadIdx.x*tileDim +  threadIdx.y + rowBlock ) ] =
+      tile[ Cuda::getInterleaving( threadIdx.x*tileDim +  threadIdx.y + rowBlock ) ] =
                inputMatrix->getElementFast( readColumnPosition,
                                             readRowPosition + rowBlock );
    }
@@ -688,7 +688,7 @@ __global__ void DenseTranspositionAlignedKernel( Dense< Real, Devices::Cuda, Ind
    {
       resultMatrix->setElementFast( writeColumnPosition,
                                     writeRowPosition + rowBlock,
-                                    matrixMultiplicator * tile[ Devices::Cuda::getInterleaving( ( threadIdx.y + rowBlock ) * tileDim + threadIdx.x ) ] );
+                                    matrixMultiplicator * tile[ Cuda::getInterleaving( ( threadIdx.y + rowBlock ) * tileDim + threadIdx.x ) ] );
 
    }
 
@@ -741,7 +741,7 @@ __global__ void DenseTranspositionNonAlignedKernel( Dense< Real, Devices::Cuda,
            rowBlock += tileRowBlockSize )
       {
          if( readRowPosition + rowBlock < rows )
-            tile[ Devices::Cuda::getInterleaving( threadIdx.x*tileDim +  threadIdx.y + rowBlock ) ] =
+            tile[ Cuda::getInterleaving( threadIdx.x*tileDim +  threadIdx.y + rowBlock ) ] =
                inputMatrix->getElementFast( readColumnPosition,
                                             readRowPosition + rowBlock );
       }
@@ -765,7 +765,7 @@ __global__ void DenseTranspositionNonAlignedKernel( Dense< Real, Devices::Cuda,
          if( writeRowPosition + rowBlock < columns )
             resultMatrix->setElementFast( writeColumnPosition,
                                           writeRowPosition + rowBlock,
-                                          matrixMultiplicator * tile[ Devices::Cuda::getInterleaving( ( threadIdx.y + rowBlock ) * tileDim + threadIdx.x ) ] );
+                                          matrixMultiplicator * tile[ Cuda::getInterleaving( ( threadIdx.y + rowBlock ) * tileDim + threadIdx.x ) ] );
       }
    }
 
@@ -809,21 +809,21 @@ void Dense< Real, Device, Index >::getTransposition( const Matrix& matrix,
       const IndexType cudaBlockRows( matrixProductCudaBlockSize / tileDim );
       cudaBlockSize.x = cudaBlockColumns;
       cudaBlockSize.y = cudaBlockRows;
-      const IndexType rowGrids = roundUpDivision( rowTiles, Devices::Cuda::getMaxGridSize() );
-      const IndexType columnGrids = roundUpDivision( columnTiles, Devices::Cuda::getMaxGridSize() );
-      const IndexType sharedMemorySize = tileDim*tileDim + tileDim*tileDim/Devices::Cuda::getNumberOfSharedMemoryBanks();
+      const IndexType rowGrids = roundUpDivision( rowTiles, Cuda::getMaxGridSize() );
+      const IndexType columnGrids = roundUpDivision( columnTiles, Cuda::getMaxGridSize() );
+      const IndexType sharedMemorySize = tileDim*tileDim + tileDim*tileDim/Cuda::getNumberOfSharedMemoryBanks();
 
-      Dense* this_device = Devices::Cuda::passToDevice( *this );
-      Matrix* matrix_device = Devices::Cuda::passToDevice( matrix );
+      Dense* this_device = Cuda::passToDevice( *this );
+      Matrix* matrix_device = Cuda::passToDevice( matrix );
 
       for( IndexType gridIdx_x = 0; gridIdx_x < columnGrids; gridIdx_x++ )
          for( IndexType gridIdx_y = 0; gridIdx_y < rowGrids; gridIdx_y++ )
          {
-            cudaGridSize.x = cudaGridSize.y = Devices::Cuda::getMaxGridSize();
+            cudaGridSize.x = cudaGridSize.y = Cuda::getMaxGridSize();
             if( gridIdx_x == columnGrids - 1)
-               cudaGridSize.x = columnTiles % Devices::Cuda::getMaxGridSize();
+               cudaGridSize.x = columnTiles % Cuda::getMaxGridSize();
             if( gridIdx_y == rowGrids - 1 )
-               cudaGridSize.y = rowTiles % Devices::Cuda::getMaxGridSize();
+               cudaGridSize.y = rowTiles % Cuda::getMaxGridSize();
             if( ( gridIdx_x < columnGrids - 1 || matrix.getColumns() % tileDim == 0 ) &&
                 ( gridIdx_y < rowGrids - 1 || matrix.getRows() % tileDim == 0 ) )
             {
@@ -859,8 +859,8 @@ void Dense< Real, Device, Index >::getTransposition( const Matrix& matrix,
             }
             TNL_CHECK_CUDA_DEVICE;
          }
-      Devices::Cuda::freeFromDevice( this_device );
-      Devices::Cuda::freeFromDevice( matrix_device );
+      Cuda::freeFromDevice( this_device );
+      Cuda::freeFromDevice( matrix_device );
 #endif
    }
 }
diff --git a/src/TNL/Matrices/EllpackSymmetricGraph_impl.h b/src/TNL/Matrices/EllpackSymmetricGraph_impl.h
index b817372dc5620971df6f64251fbd2f716c5416e7..b949292c5f1664562525a4ead8ca17b2ad9f343b 100644
--- a/src/TNL/Matrices/EllpackSymmetricGraph_impl.h
+++ b/src/TNL/Matrices/EllpackSymmetricGraph_impl.h
@@ -54,7 +54,7 @@ void EllpackSymmetricGraph< Real, Device, Index >::setDimensions( const IndexTyp
    this->rows = rows;
    this->columns = columns;   
    if( std::is_same< DeviceType, Devices::Cuda >::value )
-      this->alignedRows = roundToMultiple( columns, Devices::Cuda::getWarpSize() );
+      this->alignedRows = roundToMultiple( columns, Cuda::getWarpSize() );
    else this->alignedRows = rows;
    if( this->rowLengths != 0 )
    allocateElements();
@@ -917,7 +917,7 @@ void EllpackSymmetricGraphVectorProductCuda( const EllpackSymmetricGraph< Real,
                                              const int gridIdx,
                                              const int color )
 {
-   int globalIdx = ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   int globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
    matrix->spmvCuda( *inVector, *outVector, globalIdx, color );
 }
 #endif
@@ -966,19 +966,19 @@ class EllpackSymmetricGraphDeviceDependentCode< Devices::Cuda >
 #ifdef HAVE_CUDA
           typedef EllpackSymmetricGraph< Real, Devices::Cuda, Index > Matrix;
           typedef typename Matrix::IndexType IndexType;
-          Matrix* kernel_this = Devices::Cuda::passToDevice( matrix );
-          InVector* kernel_inVector = Devices::Cuda::passToDevice( inVector );
-          OutVector* kernel_outVector = Devices::Cuda::passToDevice( outVector );
-          dim3 cudaBlockSize( 256 ), cudaGridSize( Devices::Cuda::getMaxGridSize() );
+          Matrix* kernel_this = Cuda::passToDevice( matrix );
+          InVector* kernel_inVector = Cuda::passToDevice( inVector );
+          OutVector* kernel_outVector = Cuda::passToDevice( outVector );
+          dim3 cudaBlockSize( 256 ), cudaGridSize( Cuda::getMaxGridSize() );
           for( IndexType color = 0; color < matrix.getNumberOfColors(); color++ )
           {
               IndexType rows = matrix.getRowsOfColor( color );
               const IndexType cudaBlocks = roundUpDivision( rows, cudaBlockSize.x );
-              const IndexType cudaGrids = roundUpDivision( cudaBlocks, Devices::Cuda::getMaxGridSize() );
+              const IndexType cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() );
               for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ )
               {
                   if( gridIdx == cudaGrids - 1 )
-                      cudaGridSize.x = cudaBlocks % Devices::Cuda::getMaxGridSize();
+                      cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize();
                   EllpackSymmetricGraphVectorProductCuda< Real, Index, InVector, OutVector >
                                                       <<< cudaGridSize, cudaBlockSize >>>
                                                         ( kernel_this,
@@ -989,9 +989,9 @@ class EllpackSymmetricGraphDeviceDependentCode< Devices::Cuda >
               }
           }
 
-          Devices::Cuda::freeFromDevice( kernel_this );
-          Devices::Cuda::freeFromDevice( kernel_inVector );
-          Devices::Cuda::freeFromDevice( kernel_outVector );
+          Cuda::freeFromDevice( kernel_this );
+          Cuda::freeFromDevice( kernel_inVector );
+          Cuda::freeFromDevice( kernel_outVector );
           TNL_CHECK_CUDA_DEVICE;
 #endif
       }
diff --git a/src/TNL/Matrices/EllpackSymmetric_impl.h b/src/TNL/Matrices/EllpackSymmetric_impl.h
index 5890212a4b5c207087f35bdacbcf3d097bb75a67..90369f77af0f0085b140934c27fe3fe5a2d8f015 100644
--- a/src/TNL/Matrices/EllpackSymmetric_impl.h
+++ b/src/TNL/Matrices/EllpackSymmetric_impl.h
@@ -38,7 +38,7 @@ void EllpackSymmetric< Real, Device, Index >::setDimensions( const IndexType row
    this->rows = rows;
    this->columns = columns;   
    if( std::is_same< DeviceType, Devices::Cuda >::value )
-      this->alignedRows = roundToMultiple( columns, Devices::Cuda::getWarpSize() );
+      this->alignedRows = roundToMultiple( columns, Cuda::getWarpSize() );
    else this->alignedRows = rows;
    if( this->rowLengths != 0 )
       allocateElements();
@@ -708,7 +708,7 @@ void EllpackSymmetricVectorProductCuda( const EllpackSymmetric< Real, Devices::C
                                            OutVector* outVector,
                                            const int gridIdx )
 {
-    int globalIdx = ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+    int globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
     if( globalIdx >= matrix->getRows() )
         return;
     matrix->spmvCuda( *inVector, *outVector, globalIdx );
@@ -760,16 +760,16 @@ class EllpackSymmetricDeviceDependentCode< Devices::Cuda >
 #ifdef HAVE_CUDA
           typedef EllpackSymmetric< Real, Devices::Cuda, Index > Matrix;
           typedef typename Matrix::IndexType IndexType;
-          Matrix* kernel_this = Devices::Cuda::passToDevice( matrix );
-          InVector* kernel_inVector = Devices::Cuda::passToDevice( inVector );
-          OutVector* kernel_outVector = Devices::Cuda::passToDevice( outVector );
-          dim3 cudaBlockSize( 256 ), cudaGridSize( Devices::Cuda::getMaxGridSize() );
+          Matrix* kernel_this = Cuda::passToDevice( matrix );
+          InVector* kernel_inVector = Cuda::passToDevice( inVector );
+          OutVector* kernel_outVector = Cuda::passToDevice( outVector );
+          dim3 cudaBlockSize( 256 ), cudaGridSize( Cuda::getMaxGridSize() );
           const IndexType cudaBlocks = roundUpDivision( matrix.getRows(), cudaBlockSize.x );
-          const IndexType cudaGrids = roundUpDivision( cudaBlocks, Devices::Cuda::getMaxGridSize() );
+          const IndexType cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() );
           for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ )
           {
               if( gridIdx == cudaGrids - 1 )
-                  cudaGridSize.x = cudaBlocks % Devices::Cuda::getMaxGridSize();
+                  cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize();
               const int sharedMemory = cudaBlockSize.x * sizeof( Real );
               EllpackSymmetricVectorProductCuda< Real, Index, InVector, OutVector >
                                                 <<< cudaGridSize, cudaBlockSize, sharedMemory >>>
@@ -778,9 +778,9 @@ class EllpackSymmetricDeviceDependentCode< Devices::Cuda >
                                                     kernel_outVector,
                                                     gridIdx );
           }
-          Devices::Cuda::freeFromDevice( kernel_this );
-          Devices::Cuda::freeFromDevice( kernel_inVector );
-          Devices::Cuda::freeFromDevice( kernel_outVector );
+          Cuda::freeFromDevice( kernel_this );
+          Cuda::freeFromDevice( kernel_inVector );
+          Cuda::freeFromDevice( kernel_outVector );
           TNL_CHECK_CUDA_DEVICE;
 #endif
       }
diff --git a/src/TNL/Matrices/Ellpack_impl.h b/src/TNL/Matrices/Ellpack_impl.h
index 2b8675c0469d93bd33a54901e1324256c43b0e31..7651ea0d7af455351a731b29b768906114227945 100644
--- a/src/TNL/Matrices/Ellpack_impl.h
+++ b/src/TNL/Matrices/Ellpack_impl.h
@@ -60,7 +60,7 @@ void Ellpack< Real, Device, Index >::setDimensions( const IndexType rows,
    this->rows = rows;
    this->columns = columns;
    if( std::is_same< Device, Devices::Cuda >::value )
-      this->alignedRows = roundToMultiple( rows, Devices::Cuda::getWarpSize() );
+      this->alignedRows = roundToMultiple( rows, Cuda::getWarpSize() );
    else this->alignedRows = rows;
    if( this->rowLengths != 0 )
       allocateElements();
@@ -128,7 +128,7 @@ void Ellpack< Real, Device, Index >::setLike( const Ellpack< Real2, Device2, Ind
    Sparse< Real, Device, Index >::setLike( matrix );
    this->rowLengths = matrix.rowLengths;
    if( std::is_same< Device, Devices::Cuda >::value )
-      this->alignedRows = roundToMultiple( this->getRows(), Devices::Cuda::getWarpSize() );
+      this->alignedRows = roundToMultiple( this->getRows(), Cuda::getWarpSize() );
    else this->alignedRows = this->getRows();
 }
 
@@ -836,7 +836,7 @@ __global__ void EllpackVectorProductCudaKernel(
    Real multiplicator,
    const Index gridIdx )
 {
-   const Index rowIdx = ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   const Index rowIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
    if( rowIdx >= rows )
       return;
    Index i = rowIdx;
@@ -902,16 +902,16 @@ class EllpackDeviceDependentCode< Devices::Cuda >
          #ifdef HAVE_CUDA
             typedef Ellpack< Real, Device, Index > Matrix;
             typedef typename Matrix::IndexType IndexType;
-            //Matrix* kernel_this = Devices::Cuda::passToDevice( matrix );
-            //InVector* kernel_inVector = Devices::Cuda::passToDevice( inVector );
-            //OutVector* kernel_outVector = Devices::Cuda::passToDevice( outVector );
-            dim3 cudaBlockSize( 256 ), cudaGridSize( Devices::Cuda::getMaxGridSize() );
+            //Matrix* kernel_this = Cuda::passToDevice( matrix );
+            //InVector* kernel_inVector = Cuda::passToDevice( inVector );
+            //OutVector* kernel_outVector = Cuda::passToDevice( outVector );
+            dim3 cudaBlockSize( 256 ), cudaGridSize( Cuda::getMaxGridSize() );
             const IndexType cudaBlocks = roundUpDivision( matrix.getRows(), cudaBlockSize.x );
-            const IndexType cudaGrids = roundUpDivision( cudaBlocks, Devices::Cuda::getMaxGridSize() );
+            const IndexType cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() );
             for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ )
             {
                if( gridIdx == cudaGrids - 1 )
-                  cudaGridSize.x = cudaBlocks % Devices::Cuda::getMaxGridSize();
+                  cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize();
                EllpackVectorProductCudaKernel
                < Real, Index >
                 <<< cudaGridSize, cudaBlockSize >>>
@@ -928,9 +928,9 @@ class EllpackDeviceDependentCode< Devices::Cuda >
                   gridIdx );
                TNL_CHECK_CUDA_DEVICE;
             }
-            //Devices::Cuda::freeFromDevice( kernel_this );
-            //Devices::Cuda::freeFromDevice( kernel_inVector );
-            //Devices::Cuda::freeFromDevice( kernel_outVector );
+            //Cuda::freeFromDevice( kernel_this );
+            //Cuda::freeFromDevice( kernel_inVector );
+            //Cuda::freeFromDevice( kernel_outVector );
             TNL_CHECK_CUDA_DEVICE;
             cudaDeviceSynchronize();
          #endif
diff --git a/src/TNL/Matrices/MatrixOperations.h b/src/TNL/Matrices/MatrixOperations.h
index 07991a573662c7380d6fd2814b0a20db30e7dca8..a6ede3f7bd1dd593f22d055cf4ac2426cbc58d35 100644
--- a/src/TNL/Matrices/MatrixOperations.h
+++ b/src/TNL/Matrices/MatrixOperations.h
@@ -21,6 +21,8 @@
 #include <TNL/Devices/Host.h>
 #include <TNL/Devices/Cuda.h>
 #include <TNL/Math.h>
+#include <TNL/Cuda/DeviceInfo.h>
+#include <TNL/Cuda/SharedMemory.h>
 
 namespace TNL {
 namespace Matrices {
@@ -248,7 +250,7 @@ GemvCudaKernel( const IndexType m,
    IndexType elementIdx = blockIdx.x * blockDim.x + threadIdx.x;
    const IndexType gridSize = blockDim.x * gridDim.x;
 
-   RealType* shx = Devices::Cuda::getSharedMemory< RealType >();
+   RealType* shx = Cuda::getSharedMemory< RealType >();
 
    if( threadIdx.x < n )
       shx[ threadIdx.x ] = alpha * x[ threadIdx.x ];
@@ -344,10 +346,10 @@ public:
       Containers::Algorithms::ArrayOperations< Devices::Cuda, Devices::Host >::copy< RealType, RealType, IndexType >( xDevice.getData(), x, n );
 
       // desGridSize = blocksPerMultiprocessor * numberOfMultiprocessors
-      const int desGridSize = 32 * Devices::CudaDeviceInfo::getCudaMultiprocessors( Devices::CudaDeviceInfo::getActiveDevice() );
+      const int desGridSize = 32 * Cuda::DeviceInfo::getCudaMultiprocessors( Cuda::DeviceInfo::getActiveDevice() );
       dim3 blockSize, gridSize;
       blockSize.x = 256;
-      gridSize.x = min( desGridSize, Devices::Cuda::getNumberOfBlocks( m, blockSize.x ) );
+      gridSize.x = min( desGridSize, Cuda::getNumberOfBlocks( m, blockSize.x ) );
 
       GemvCudaKernel<<< gridSize, blockSize, n * sizeof( RealType ) >>>(
             m, n,
@@ -401,9 +403,9 @@ public:
          blockSize.x /= 2;
 
       // desGridSize = blocksPerMultiprocessor * numberOfMultiprocessors
-      const int desGridSize = 32 * Devices::CudaDeviceInfo::getCudaMultiprocessors( Devices::CudaDeviceInfo::getActiveDevice() );
-      gridSize.x = min( desGridSize, Devices::Cuda::getNumberOfBlocks( m, blockSize.x ) );
-      gridSize.y = Devices::Cuda::getNumberOfBlocks( n, blockSize.y );
+      const int desGridSize = 32 * Cuda::DeviceInfo::getCudaMultiprocessors( Cuda::DeviceInfo::getActiveDevice() );
+      gridSize.x = min( desGridSize, Cuda::getNumberOfBlocks( m, blockSize.x ) );
+      gridSize.y = Cuda::getNumberOfBlocks( n, blockSize.y );
 
       GeamCudaKernel<<< gridSize, blockSize >>>(
             m, n,
diff --git a/src/TNL/Matrices/Matrix_impl.h b/src/TNL/Matrices/Matrix_impl.h
index 7472760c23ec6d8df8a4295457cd6773e4ac80df..33c4d2e654cb32f9ba56516a1678b73d17ee3b96 100644
--- a/src/TNL/Matrices/Matrix_impl.h
+++ b/src/TNL/Matrices/Matrix_impl.h
@@ -12,6 +12,9 @@
 
 #include <TNL/Matrices/Matrix.h>
 #include <TNL/Assert.h>
+#include <TNL/Cuda/LaunchHelpers.h>
+#include <TNL/Cuda/MemoryHelpers.h>
+#include <TNL/Cuda/SharedMemory.h>
 
 namespace TNL {
 namespace Matrices {
@@ -240,7 +243,7 @@ __global__ void MatrixVectorProductCudaKernel( const Matrix* matrix,
                                                   int gridIdx )
 {
    static_assert( std::is_same< typename Matrix::DeviceType, Devices::Cuda >::value, "" );
-   const typename Matrix::IndexType rowIdx = ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   const typename Matrix::IndexType rowIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
    if( rowIdx < matrix->getRows() )
       ( *outVector )[ rowIdx ] = matrix->rowVectorProduct( rowIdx, *inVector );
 }
@@ -255,16 +258,16 @@ void MatrixVectorProductCuda( const Matrix& matrix,
 {
 #ifdef HAVE_CUDA
    typedef typename Matrix::IndexType IndexType;
-   Matrix* kernel_this = Devices::Cuda::passToDevice( matrix );
-   InVector* kernel_inVector = Devices::Cuda::passToDevice( inVector );
-   OutVector* kernel_outVector = Devices::Cuda::passToDevice( outVector );
-   dim3 cudaBlockSize( 256 ), cudaGridSize( Devices::Cuda::getMaxGridSize() );
+   Matrix* kernel_this = Cuda::passToDevice( matrix );
+   InVector* kernel_inVector = Cuda::passToDevice( inVector );
+   OutVector* kernel_outVector = Cuda::passToDevice( outVector );
+   dim3 cudaBlockSize( 256 ), cudaGridSize( Cuda::getMaxGridSize() );
    const IndexType cudaBlocks = roundUpDivision( matrix.getRows(), cudaBlockSize.x );
-   const IndexType cudaGrids = roundUpDivision( cudaBlocks, Devices::Cuda::getMaxGridSize() );
+   const IndexType cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() );
    for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ )
    {
       if( gridIdx == cudaGrids - 1 )
-         cudaGridSize.x = cudaBlocks % Devices::Cuda::getMaxGridSize();
+         cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize();
       MatrixVectorProductCudaKernel<<< cudaGridSize, cudaBlockSize >>>
                                      ( kernel_this,
                                        kernel_inVector,
@@ -272,9 +275,9 @@ void MatrixVectorProductCuda( const Matrix& matrix,
                                        gridIdx );
       TNL_CHECK_CUDA_DEVICE;
    }
-   Devices::Cuda::freeFromDevice( kernel_this );
-   Devices::Cuda::freeFromDevice( kernel_inVector );
-   Devices::Cuda::freeFromDevice( kernel_outVector );
+   Cuda::freeFromDevice( kernel_this );
+   Cuda::freeFromDevice( kernel_inVector );
+   Cuda::freeFromDevice( kernel_outVector );
    TNL_CHECK_CUDA_DEVICE;
 #endif
 }
diff --git a/src/TNL/Matrices/SlicedEllpackSymmetricGraph_impl.h b/src/TNL/Matrices/SlicedEllpackSymmetricGraph_impl.h
index f9ef284dac30cb59a973314719bfe3481cdc80e3..bfe73f231092a0e4ea90c3011b823c6ab8c17d95 100644
--- a/src/TNL/Matrices/SlicedEllpackSymmetricGraph_impl.h
+++ b/src/TNL/Matrices/SlicedEllpackSymmetricGraph_impl.h
@@ -1095,7 +1095,7 @@ __global__ void SlicedEllpackSymmetricGraph_computeMaximalRowLengthInSlices_Cuda
                                                                                         typename SlicedEllpackSymmetricGraph< Real, Devices::Cuda, Index, SliceSize >::ConstCompressedRowLengthsVector rowLengths,
                                                                                         int gridIdx )
 {
-   const Index sliceIdx = gridIdx * Devices::Cuda::getMaxGridSize() * blockDim.x + blockIdx.x * blockDim.x + threadIdx.x;
+   const Index sliceIdx = gridIdx * Cuda::getMaxGridSize() * blockDim.x + blockIdx.x * blockDim.x + threadIdx.x;
    matrix->computeMaximalRowLengthInSlicesCuda( rowLengths, sliceIdx );
 }
 #endif
@@ -1152,7 +1152,7 @@ void SlicedEllpackSymmetricGraphVectorProductCuda( const SlicedEllpackSymmetricG
                                                    const int color,
                                                    const int sliceOffset )
 {
-    int globalIdx = ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x + sliceOffset;
+    int globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x + sliceOffset;
     matrix->smvCuda( *inVector, *outVector, globalIdx, color );
 }
 #endif
@@ -1213,21 +1213,21 @@ class SlicedEllpackSymmetricGraphDeviceDependentCode< Devices::Cuda >
 #ifdef HAVE_CUDA
          typedef SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize > Matrix;
          typedef typename Matrix::RowLengthsVector CompressedRowLengthsVector;
-         Matrix* kernel_matrix = Devices::Cuda::passToDevice( matrix );
+         Matrix* kernel_matrix = Cuda::passToDevice( matrix );
          const Index numberOfSlices = roundUpDivision( matrix.getRows(), SliceSize );
-         dim3 cudaBlockSize( 256 ), cudaGridSize( Devices::Cuda::getMaxGridSize() );
+         dim3 cudaBlockSize( 256 ), cudaGridSize( Cuda::getMaxGridSize() );
          const Index cudaBlocks = roundUpDivision( numberOfSlices, cudaBlockSize.x );
-         const Index cudaGrids = roundUpDivision( cudaBlocks, Devices::Cuda::getMaxGridSize() );
+         const Index cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() );
          for( int gridIdx = 0; gridIdx < cudaGrids; gridIdx++ )
          {
             if( gridIdx == cudaGrids - 1 )
-               cudaGridSize.x = cudaBlocks % Devices::Cuda::getMaxGridSize();
+               cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize();
             SlicedEllpackSymmetricGraph_computeMaximalRowLengthInSlices_CudaKernel< Real, Index, SliceSize ><<< cudaGridSize, cudaBlockSize >>>
                                                                              ( kernel_matrix,
                                                                                rowLengths,
                                                                                gridIdx );
          }
-         Devices::Cuda::freeFromDevice( kernel_matrix );
+         Cuda::freeFromDevice( kernel_matrix );
          TNL_CHECK_CUDA_DEVICE;
 #endif
       }
@@ -1245,10 +1245,10 @@ class SlicedEllpackSymmetricGraphDeviceDependentCode< Devices::Cuda >
 #ifdef HAVE_CUDA
          typedef SlicedEllpackSymmetricGraph< Real, Devices::Cuda, Index, SliceSize > Matrix;
          typedef typename Matrix::IndexType IndexType;
-         Matrix* kernel_this = Devices::Cuda::passToDevice( matrix );
-         InVector* kernel_inVector = Devices::Cuda::passToDevice( inVector );
-         OutVector* kernel_outVector = Devices::Cuda::passToDevice( outVector );
-         dim3 cudaBlockSize( 256 ), cudaGridSize( Devices::Cuda::getMaxGridSize() );
+         Matrix* kernel_this = Cuda::passToDevice( matrix );
+         InVector* kernel_inVector = Cuda::passToDevice( inVector );
+         OutVector* kernel_outVector = Cuda::passToDevice( outVector );
+         dim3 cudaBlockSize( 256 ), cudaGridSize( Cuda::getMaxGridSize() );
          for( IndexType color = 0; color < matrix.getNumberOfColors(); color++ )
          {
             IndexType offset = matrix.colorPointers.getElement( color ); //can be computed in kernel
@@ -1258,11 +1258,11 @@ class SlicedEllpackSymmetricGraphDeviceDependentCode< Devices::Cuda >
             //IndexType rows = matrix.colorPointers.getElement( color + 1 ) - matrix.colorPointers.getElement( color ) + inSliceIdx;
             // TODO: rows id undefined
             /*const IndexType cudaBlocks = roundUpDivision( rows, cudaBlockSize.x );
-            const IndexType cudaGrids = rondUpDivision( cudaBlocks, Devices::Cuda::getMaxGridSize );
+            const IndexType cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() );
             for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ )
             {
                if( gridIdx == cudaGrids - 1 )
-                  cudaGridSize.x = cudaBlocks % Devices::Cuda::getMaxGridSize();
+                  cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize();
                // TODO: this cannot be used here and i is undefined
                //IndexType offset = this->colorPointers[ i ];
                IndexType inSliceIdx = offset % SliceSize;
@@ -1277,9 +1277,9 @@ class SlicedEllpackSymmetricGraphDeviceDependentCode< Devices::Cuda >
                                                              sliceOffset );
             }*/
          }
-         Devices::Cuda::freeFromDevice( kernel_this );
-         Devices::Cuda::freeFromDevice( kernel_inVector );
-         Devices::Cuda::freeFromDevice( kernel_outVector );
+         Cuda::freeFromDevice( kernel_this );
+         Cuda::freeFromDevice( kernel_inVector );
+         Cuda::freeFromDevice( kernel_outVector );
          TNL_CHECK_CUDA_DEVICE;
 #endif
       }
diff --git a/src/TNL/Matrices/SlicedEllpackSymmetric_impl.h b/src/TNL/Matrices/SlicedEllpackSymmetric_impl.h
index edc645688dc8ad5fedbeb74b1c6f55dd32fb1af8..f53efdc0c0db21c9d0ab12de2776a53e9e2f803c 100644
--- a/src/TNL/Matrices/SlicedEllpackSymmetric_impl.h
+++ b/src/TNL/Matrices/SlicedEllpackSymmetric_impl.h
@@ -512,7 +512,7 @@ const SlicedEllpackSymmetric< Real, Devices::Cuda, Index, SliceSize >* matrix,
                                                        OutVector* outVector,
                                                        int gridIdx )
 {
-   int rowIdx = ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   int rowIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
    matrix->spmvCuda( *inVector, *outVector, rowIdx );
 }
 #endif
@@ -784,7 +784,7 @@ __global__ void SlicedEllpackSymmetric_computeMaximalRowLengthInSlices_CudaKerne
                                                                                    typename SlicedEllpackSymmetric< Real, Devices::Cuda, Index, SliceSize >::ConstCompressedRowLengthsVectorView rowLengths,
                                                                                    int gridIdx )
 {
-   const Index sliceIdx = gridIdx * Devices::Cuda::getMaxGridSize() * blockDim.x + blockIdx.x * blockDim.x + threadIdx.x;
+   const Index sliceIdx = gridIdx * Cuda::getMaxGridSize() * blockDim.x + blockIdx.x * blockDim.x + threadIdx.x;
    matrix->computeMaximalRowLengthInSlicesCuda( rowLengths, sliceIdx );
 }
 #endif
@@ -843,21 +843,21 @@ class SlicedEllpackSymmetricDeviceDependentCode< Devices::Cuda >
 #ifdef HAVE_CUDA
          typedef SlicedEllpackSymmetric< Real, Device, Index, SliceSize > Matrix;
          typedef typename Matrix::RowLengthsVector CompressedRowLengthsVector;
-         Matrix* kernel_matrix = Devices::Cuda::passToDevice( matrix );
+         Matrix* kernel_matrix = Cuda::passToDevice( matrix );
          const Index numberOfSlices = roundUpDivision( matrix.getRows(), SliceSize );
-         dim3 cudaBlockSize( 256 ), cudaGridSize( Devices::Cuda::getMaxGridSize() );
+         dim3 cudaBlockSize( 256 ), cudaGridSize( Cuda::getMaxGridSize() );
          const Index cudaBlocks = roundUpDivision( numberOfSlices, cudaBlockSize.x );
-         const Index cudaGrids = roundUpDivision( cudaBlocks, Devices::Cuda::getMaxGridSize() );
+         const Index cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() );
          for( int gridIdx = 0; gridIdx < cudaGrids; gridIdx++ )
          {
             if( gridIdx == cudaGrids - 1 )
-               cudaGridSize.x = cudaBlocks % Devices::Cuda::getMaxGridSize();
+               cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize();
             SlicedEllpackSymmetric_computeMaximalRowLengthInSlices_CudaKernel< Real, Index, SliceSize ><<< cudaGridSize, cudaBlockSize >>>
                                                                              ( kernel_matrix,
                                                                                rowLengths,
                                                                                gridIdx );
          }
-         Devices::Cuda::freeFromDevice( kernel_matrix );
+         Cuda::freeFromDevice( kernel_matrix );
          TNL_CHECK_CUDA_DEVICE;
 #endif
       }
@@ -874,16 +874,16 @@ class SlicedEllpackSymmetricDeviceDependentCode< Devices::Cuda >
 #ifdef HAVE_CUDA
          typedef SlicedEllpackSymmetric< Real, Device, Index, SliceSize > Matrix;
          typedef typename Matrix::IndexType IndexType;
-         Matrix* kernel_this = Devices::Cuda::passToDevice( matrix );
-         InVector* kernel_inVector = Devices::Cuda::passToDevice( inVector );
-         OutVector* kernel_outVector = Devices::Cuda::passToDevice( outVector );
-         dim3 cudaBlockSize( 256 ), cudaGridSize( Devices::Cuda::getMaxGridSize() );
+         Matrix* kernel_this = Cuda::passToDevice( matrix );
+         InVector* kernel_inVector = Cuda::passToDevice( inVector );
+         OutVector* kernel_outVector = Cuda::passToDevice( outVector );
+         dim3 cudaBlockSize( 256 ), cudaGridSize( Cuda::getMaxGridSize() );
          const IndexType cudaBlocks = roundUpDivision( matrix.getRows(), cudaBlockSize.x );
-         const IndexType cudaGrids = roundUpDivision( cudaBlocks, Devices::Cuda::getMaxGridSize() );
+         const IndexType cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() );
          for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ )
          {
             if( gridIdx == cudaGrids - 1 )
-               cudaGridSize.x = cudaBlocks % Devices::Cuda::getMaxGridSize();
+               cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize();
             SlicedEllpackSymmetricVectorProductCudaKernel< Real, Index, SliceSize, InVector, OutVector >
                                                             <<< cudaGridSize, cudaBlockSize >>>
                                                               ( kernel_this,
@@ -891,9 +891,9 @@ class SlicedEllpackSymmetricDeviceDependentCode< Devices::Cuda >
                                                                 kernel_outVector,
                                                                 gridIdx );
          }
-         Devices::Cuda::freeFromDevice( kernel_this );
-         Devices::Cuda::freeFromDevice( kernel_inVector );
-         Devices::Cuda::freeFromDevice( kernel_outVector );
+         Cuda::freeFromDevice( kernel_this );
+         Cuda::freeFromDevice( kernel_inVector );
+         Cuda::freeFromDevice( kernel_outVector );
          TNL_CHECK_CUDA_DEVICE;
 #endif
       }
diff --git a/src/TNL/Matrices/SlicedEllpack_impl.h b/src/TNL/Matrices/SlicedEllpack_impl.h
index c6caa56393d265931c6a8e12f4df3400b8e560b2..9f5875f1717e80e971ee0c735a9713dea5eb3973 100644
--- a/src/TNL/Matrices/SlicedEllpack_impl.h
+++ b/src/TNL/Matrices/SlicedEllpack_impl.h
@@ -876,7 +876,7 @@ __global__ void SlicedEllpack_computeMaximalRowLengthInSlices_CudaKernel( Sliced
                                                                           typename SlicedEllpack< Real, Devices::Cuda, Index, SliceSize >::ConstCompressedRowLengthsVectorView rowLengths,
                                                                           int gridIdx )
 {
-   const Index sliceIdx = gridIdx * Devices::Cuda::getMaxGridSize() * blockDim.x + blockIdx.x * blockDim.x + threadIdx.x;
+   const Index sliceIdx = gridIdx * Cuda::getMaxGridSize() * blockDim.x + blockIdx.x * blockDim.x + threadIdx.x;
    matrix->computeMaximalRowLengthInSlicesCuda( rowLengths, sliceIdx );
 }
 #endif
@@ -899,7 +899,7 @@ __global__ void SlicedEllpackVectorProductCudaKernel(
    Real multiplicator,
    const Index gridIdx )
 {
-   const Index rowIdx = ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   const Index rowIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
    if( rowIdx >= rows )
       return;
    const Index sliceIdx = rowIdx / SliceSize;
@@ -975,21 +975,21 @@ class SlicedEllpackDeviceDependentCode< Devices::Cuda >
 #ifdef HAVE_CUDA
          typedef SlicedEllpack< Real, Device, Index, SliceSize > Matrix;
          typedef typename Matrix::CompressedRowLengthsVector CompressedRowLengthsVector;
-         Matrix* kernel_matrix = Devices::Cuda::passToDevice( matrix );
+         Matrix* kernel_matrix = Cuda::passToDevice( matrix );
          const Index numberOfSlices = roundUpDivision( matrix.getRows(), SliceSize );
-         dim3 cudaBlockSize( 256 ), cudaGridSize( Devices::Cuda::getMaxGridSize() );
+         dim3 cudaBlockSize( 256 ), cudaGridSize( Cuda::getMaxGridSize() );
          const Index cudaBlocks = roundUpDivision( numberOfSlices, cudaBlockSize.x );
-         const Index cudaGrids = roundUpDivision( cudaBlocks, Devices::Cuda::getMaxGridSize() );
+         const Index cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() );
          for( int gridIdx = 0; gridIdx < cudaGrids; gridIdx++ )
          {
             if( gridIdx == cudaGrids - 1 )
-               cudaGridSize.x = cudaBlocks % Devices::Cuda::getMaxGridSize();
+               cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize();
             SlicedEllpack_computeMaximalRowLengthInSlices_CudaKernel< Real, Index, SliceSize ><<< cudaGridSize, cudaBlockSize >>>
                                                                              ( kernel_matrix,
                                                                                rowLengths,
                                                                                gridIdx );
          }
-         Devices::Cuda::freeFromDevice( kernel_matrix );
+         Cuda::freeFromDevice( kernel_matrix );
          TNL_CHECK_CUDA_DEVICE;
 #endif
          return true;
@@ -1009,16 +1009,16 @@ class SlicedEllpackDeviceDependentCode< Devices::Cuda >
          #ifdef HAVE_CUDA
             typedef SlicedEllpack< Real, Device, Index, SliceSize > Matrix;
             typedef typename Matrix::IndexType IndexType;
-            //Matrix* kernel_this = Devices::Cuda::passToDevice( matrix );
-            //InVector* kernel_inVector = Devices::Cuda::passToDevice( inVector );
-            //OutVector* kernel_outVector = Devices::Cuda::passToDevice( outVector );
-            dim3 cudaBlockSize( 256 ), cudaGridSize( Devices::Cuda::getMaxGridSize() );
+            //Matrix* kernel_this = Cuda::passToDevice( matrix );
+            //InVector* kernel_inVector = Cuda::passToDevice( inVector );
+            //OutVector* kernel_outVector = Cuda::passToDevice( outVector );
+            dim3 cudaBlockSize( 256 ), cudaGridSize( Cuda::getMaxGridSize() );
             const IndexType cudaBlocks = roundUpDivision( matrix.getRows(), cudaBlockSize.x );
-            const IndexType cudaGrids = roundUpDivision( cudaBlocks, Devices::Cuda::getMaxGridSize() );
+            const IndexType cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() );
             for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ )
             {
                if( gridIdx == cudaGrids - 1 )
-                  cudaGridSize.x = cudaBlocks % Devices::Cuda::getMaxGridSize();
+                  cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize();
                SlicedEllpackVectorProductCudaKernel
                < Real, Index, SliceSize >
                 <<< cudaGridSize, cudaBlockSize >>>
@@ -1035,9 +1035,9 @@ class SlicedEllpackDeviceDependentCode< Devices::Cuda >
                   gridIdx );
                TNL_CHECK_CUDA_DEVICE;
             }
-            //Devices::Cuda::freeFromDevice( kernel_this );
-            //Devices::Cuda::freeFromDevice( kernel_inVector );
-            //Devices::Cuda::freeFromDevice( kernel_outVector );
+            //Cuda::freeFromDevice( kernel_this );
+            //Cuda::freeFromDevice( kernel_inVector );
+            //Cuda::freeFromDevice( kernel_outVector );
             TNL_CHECK_CUDA_DEVICE;
             cudaDeviceSynchronize();
          #endif
diff --git a/src/TNL/Matrices/SparseOperations_impl.h b/src/TNL/Matrices/SparseOperations_impl.h
index ccc8930f9bb99825d89de1f7df6de5ce31fa427d..b6d118bf22e39c9ee345269c2a2cc28760b3a198 100644
--- a/src/TNL/Matrices/SparseOperations_impl.h
+++ b/src/TNL/Matrices/SparseOperations_impl.h
@@ -130,8 +130,8 @@ copySparseMatrix_impl( Matrix1& A, const Matrix2& B )
 #ifdef HAVE_CUDA
       dim3 blockSize( 256 );
       dim3 gridSize;
-      const IndexType desGridSize = 32 * Devices::CudaDeviceInfo::getCudaMultiprocessors( Devices::CudaDeviceInfo::getActiveDevice() );
-      gridSize.x = min( desGridSize, Devices::Cuda::getNumberOfBlocks( rows, blockSize.x ) );
+      const IndexType desGridSize = 32 * Cuda::DeviceInfo::getCudaMultiprocessors( Cuda::DeviceInfo::getActiveDevice() );
+      gridSize.x = min( desGridSize, Cuda::getNumberOfBlocks( rows, blockSize.x ) );
 
       typename Matrix1::CompressedRowLengthsVector rowLengths;
       rowLengths.setSize( rows );
diff --git a/src/TNL/Matrices/SparseRow.h b/src/TNL/Matrices/SparseRow.h
index c7ebd07039061fdb775a583ac065e0a71c5f4869..f66cd2ceaf1c6f0cd882bb962a78c6649816aa75 100644
--- a/src/TNL/Matrices/SparseRow.h
+++ b/src/TNL/Matrices/SparseRow.h
@@ -14,7 +14,7 @@
 #include <type_traits>
 #include <ostream>
 
-#include <TNL/Devices/CudaCallable.h>
+#include <TNL/Cuda/CudaCallable.h>
 
 namespace TNL {
 namespace Matrices {
diff --git a/src/TNL/Matrices/Tridiagonal_impl.h b/src/TNL/Matrices/Tridiagonal_impl.h
index 78e798c2bf0a43213d14a89af6f6a922d3b23ca0..62575f1776144e374b560a65e213248a1177de80 100644
--- a/src/TNL/Matrices/Tridiagonal_impl.h
+++ b/src/TNL/Matrices/Tridiagonal_impl.h
@@ -452,7 +452,7 @@ __global__ void TridiagonalTranspositionCudaKernel( const Tridiagonal< Real2, De
                                                              const Real matrixMultiplicator,
                                                              const Index gridIdx )
 {
-   const Index rowIdx = ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   const Index rowIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
    if( rowIdx < inMatrix->getRows() )
    {
       if( rowIdx > 0 )
@@ -494,24 +494,24 @@ void Tridiagonal< Real, Device, Index >::getTransposition( const Tridiagonal< Re
    if( std::is_same< Device, Devices::Cuda >::value )
    {
 #ifdef HAVE_CUDA
-      Tridiagonal* kernel_this = Devices::Cuda::passToDevice( *this );
+      Tridiagonal* kernel_this = Cuda::passToDevice( *this );
       typedef  Tridiagonal< Real2, Device, Index2 > InMatrixType;
-      InMatrixType* kernel_inMatrix = Devices::Cuda::passToDevice( matrix );
-      dim3 cudaBlockSize( 256 ), cudaGridSize( Devices::Cuda::getMaxGridSize() );
+      InMatrixType* kernel_inMatrix = Cuda::passToDevice( matrix );
+      dim3 cudaBlockSize( 256 ), cudaGridSize( Cuda::getMaxGridSize() );
       const IndexType cudaBlocks = roundUpDivision( matrix.getRows(), cudaBlockSize.x );
-      const IndexType cudaGrids = roundUpDivision( cudaBlocks, Devices::Cuda::getMaxGridSize() );
+      const IndexType cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() );
       for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ )
       {
          if( gridIdx == cudaGrids - 1 )
-            cudaGridSize.x = cudaBlocks % Devices::Cuda::getMaxGridSize();
+            cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize();
          TridiagonalTranspositionCudaKernel<<< cudaGridSize, cudaBlockSize >>>
                                                     ( kernel_inMatrix,
                                                       kernel_this,
                                                       matrixMultiplicator,
                                                       gridIdx );
       }
-      Devices::Cuda::freeFromDevice( kernel_this );
-      Devices::Cuda::freeFromDevice( kernel_inMatrix );
+      Cuda::freeFromDevice( kernel_this );
+      Cuda::freeFromDevice( kernel_inMatrix );
       TNL_CHECK_CUDA_DEVICE;
 #endif
    }
diff --git a/src/TNL/Meshes/Geometry/getEntityCenter.h b/src/TNL/Meshes/Geometry/getEntityCenter.h
index 59cd950ca180cdbf8095382536544e9fe0ffde51..a37c27acf00523341382a7f44c5ef74adfd14d45 100644
--- a/src/TNL/Meshes/Geometry/getEntityCenter.h
+++ b/src/TNL/Meshes/Geometry/getEntityCenter.h
@@ -10,7 +10,7 @@
 
 #pragma once
 
-#include <TNL/Devices/CudaCallable.h>
+#include <TNL/Cuda/CudaCallable.h>
 #include <TNL/Meshes/GridEntity.h>
 #include <TNL/Meshes/Mesh.h>
 #include <TNL/Meshes/MeshEntity.h>
diff --git a/src/TNL/Meshes/Geometry/getEntityMeasure.h b/src/TNL/Meshes/Geometry/getEntityMeasure.h
index 7402e4f6db3226b534caf6753958fc10e4efdb1a..a3381ed96b1e72ddc2f0ccf25bbbdccd7e71739a 100644
--- a/src/TNL/Meshes/Geometry/getEntityMeasure.h
+++ b/src/TNL/Meshes/Geometry/getEntityMeasure.h
@@ -10,7 +10,7 @@
 
 #pragma once
 
-#include <TNL/Devices/CudaCallable.h>
+#include <TNL/Cuda/CudaCallable.h>
 #include <TNL/Meshes/GridEntity.h>
 #include <TNL/Meshes/Mesh.h>
 #include <TNL/Meshes/MeshEntity.h>
diff --git a/src/TNL/Meshes/GridDetails/GridTraverser.h b/src/TNL/Meshes/GridDetails/GridTraverser.h
index 7ce106f5d9c9e05160490d59434b7d170298a993..e8702153fb03343f85badc160676b1c788eaa948 100644
--- a/src/TNL/Meshes/GridDetails/GridTraverser.h
+++ b/src/TNL/Meshes/GridDetails/GridTraverser.h
@@ -12,7 +12,6 @@
 
 #include <TNL/Meshes/Grid.h>
 #include <TNL/Pointers/SharedPointer.h>
-#include <TNL/CudaStreamPool.h>
 
 namespace TNL {
 namespace Meshes {
diff --git a/src/TNL/Meshes/GridDetails/GridTraverser_1D.hpp b/src/TNL/Meshes/GridDetails/GridTraverser_1D.hpp
index 53370853824a8fde7f47bf28c0a68de76f84a2b0..796ffe491d572c108f7b8072769e42e9d5c9571d 100644
--- a/src/TNL/Meshes/GridDetails/GridTraverser_1D.hpp
+++ b/src/TNL/Meshes/GridDetails/GridTraverser_1D.hpp
@@ -16,7 +16,7 @@
 
 #include <TNL/Meshes/Grid.h>
 #include <TNL/Pointers/SharedPointer.h>
-#include <TNL/CudaStreamPool.h>
+#include <TNL/Cuda/StreamPool.h>
 #include <TNL/Exceptions/CudaSupportMissing.h>
 #include <TNL/Meshes/GridDetails/GridTraverser.h>
 #include <TNL/Exceptions/NotImplementedError.h>
@@ -120,7 +120,7 @@ GridTraverser1D(
    typedef Meshes::Grid< 1, Real, Devices::Cuda, Index > GridType;
    typename GridType::CoordinatesType coordinates;
  
-   coordinates.x() = begin.x() + ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   coordinates.x() = begin.x() + ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
    if( coordinates <= end )
    {   
       GridEntity entity( *grid, coordinates );
@@ -182,7 +182,7 @@ processEntities(
    const int& stream )
 {
 #ifdef HAVE_CUDA
-   auto& pool = CudaStreamPool::getInstance();
+   auto& pool = Cuda::StreamPool::getInstance();
    const cudaStream_t& s = pool.getStream( stream );
 
    Devices::Cuda::synchronizeDevice();
@@ -200,7 +200,7 @@ processEntities(
    else
    {
       dim3 blockSize( 256 ), blocksCount, gridsCount;
-      Devices::Cuda::setupThreads(
+      Cuda::setupThreads(
          blockSize,
          blocksCount,
          gridsCount,
@@ -209,7 +209,7 @@ processEntities(
       for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ )
       {
          dim3 gridSize;
-         Devices::Cuda::setupGrid(
+         Cuda::setupGrid(
             blocksCount,
             gridsCount,
             gridIdx,
@@ -225,8 +225,8 @@ processEntities(
 
       /*dim3 cudaBlockSize( 256 );
       dim3 cudaBlocks;
-      cudaBlocks.x = Devices::Cuda::getNumberOfBlocks( end.x() - begin.x() + 1, cudaBlockSize.x );
-      const IndexType cudaXGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks.x );
+      cudaBlocks.x = Cuda::getNumberOfBlocks( end.x() - begin.x() + 1, cudaBlockSize.x );
+      const IndexType cudaXGrids = Cuda::getNumberOfGrids( cudaBlocks.x );
 
       for( IndexType gridXIdx = 0; gridXIdx < cudaXGrids; gridXIdx ++ )
          GridTraverser1D< Real, Index, GridEntity, UserData, EntitiesProcessor >
diff --git a/src/TNL/Meshes/GridDetails/GridTraverser_2D.hpp b/src/TNL/Meshes/GridDetails/GridTraverser_2D.hpp
index 3efdb478fd9d8874a730d48deff8849733f0b13e..15c5a0eda13a10bd512fe69930bed1e9c90f068d 100644
--- a/src/TNL/Meshes/GridDetails/GridTraverser_2D.hpp
+++ b/src/TNL/Meshes/GridDetails/GridTraverser_2D.hpp
@@ -12,7 +12,7 @@
 
 #include <TNL/Meshes/Grid.h>
 #include <TNL/Pointers/SharedPointer.h>
-#include <TNL/CudaStreamPool.h>
+#include <TNL/Cuda/StreamPool.h>
 #include <TNL/Exceptions/CudaSupportMissing.h>
 #include <TNL/Meshes/GridDetails/GridTraverser.h>
 
@@ -148,8 +148,8 @@ GridTraverser2D(
    typedef Meshes::Grid< 2, Real, Devices::Cuda, Index > GridType;
    typename GridType::CoordinatesType coordinates;
 
-   coordinates.x() = begin.x() + Devices::Cuda::getGlobalThreadIdx_x( gridIdx );
-   coordinates.y() = begin.y() + Devices::Cuda::getGlobalThreadIdx_y( gridIdx );
+   coordinates.x() = begin.x() + Cuda::getGlobalThreadIdx_x( gridIdx );
+   coordinates.y() = begin.y() + Cuda::getGlobalThreadIdx_y( gridIdx );
    
    if( coordinates <= end )
    {
@@ -186,7 +186,7 @@ GridTraverser2DBoundaryAlongX(
    typedef Meshes::Grid< 2, Real, Devices::Cuda, Index > GridType;
    typename GridType::CoordinatesType coordinates;
 
-   coordinates.x() = beginX + Devices::Cuda::getGlobalThreadIdx_x( gridIdx );
+   coordinates.x() = beginX + Cuda::getGlobalThreadIdx_x( gridIdx );
    coordinates.y() = fixedY;
    
    if( coordinates.x() <= endX )
@@ -222,7 +222,7 @@ GridTraverser2DBoundaryAlongY(
    typename GridType::CoordinatesType coordinates;
 
    coordinates.x() = fixedX;
-   coordinates.y() = beginY + Devices::Cuda::getGlobalThreadIdx_x( gridIdx );
+   coordinates.y() = beginY + Cuda::getGlobalThreadIdx_x( gridIdx );
    
    if( coordinates.y() <= endY )
    {
@@ -291,10 +291,10 @@ GridTraverser2DBoundary(
    
    
    /*const Index aux = max( entitiesAlongX, entitiesAlongY );
-   const Index& warpSize = Devices::Cuda::getWarpSize();
+   const Index& warpSize = Cuda::getWarpSize();
    const Index threadsPerAxis = warpSize * ( aux / warpSize + ( aux % warpSize != 0 ) );
    
-   Index threadId = Devices::Cuda::getGlobalThreadIdx_x( gridIdx );
+   Index threadId = Cuda::getGlobalThreadIdx_x( gridIdx );
    GridEntity entity( *grid, 
          CoordinatesType( 0, 0 ),
          gridEntityParameters... );
@@ -414,10 +414,10 @@ processEntities(
       dim3 cudaBlockSize( 256 );
       dim3 cudaBlocksCountAlongX, cudaGridsCountAlongX,
            cudaBlocksCountAlongY, cudaGridsCountAlongY;
-      Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCountAlongX, cudaGridsCountAlongX, end.x() - begin.x() + 1 );
-      Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCountAlongY, cudaGridsCountAlongY, end.y() - begin.y() - 1 );
+      Cuda::setupThreads( cudaBlockSize, cudaBlocksCountAlongX, cudaGridsCountAlongX, end.x() - begin.x() + 1 );
+      Cuda::setupThreads( cudaBlockSize, cudaBlocksCountAlongY, cudaGridsCountAlongY, end.y() - begin.y() - 1 );
             
-      auto& pool = CudaStreamPool::getInstance();
+      auto& pool = Cuda::StreamPool::getInstance();
       Devices::Cuda::synchronizeDevice();
       
       const cudaStream_t& s1 = pool.getStream( stream );
@@ -425,8 +425,8 @@ processEntities(
       dim3 gridIdx, cudaGridSize;
       for( gridIdx.x = 0; gridIdx.x < cudaGridsCountAlongX.x; gridIdx.x++ )
       {
-         Devices::Cuda::setupGrid( cudaBlocksCountAlongX, cudaGridsCountAlongX, gridIdx, cudaGridSize );
-         //Devices::Cuda::printThreadsSetup( cudaBlockSize, cudaBlocksCountAlongX, cudaGridSize, cudaGridsCountAlongX );
+         Cuda::setupGrid( cudaBlocksCountAlongX, cudaGridsCountAlongX, gridIdx, cudaGridSize );
+         //Cuda::printThreadsSetup( cudaBlockSize, cudaBlocksCountAlongX, cudaGridSize, cudaGridsCountAlongX );
          GridTraverser2DBoundaryAlongX< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
                <<< cudaGridSize, cudaBlockSize, 0, s1 >>>
                ( &gridPointer.template getData< Devices::Cuda >(),
@@ -450,7 +450,7 @@ processEntities(
       const cudaStream_t& s4 = pool.getStream( stream + 3 );
       for( gridIdx.x = 0; gridIdx.x < cudaGridsCountAlongY.x; gridIdx.x++ )
       {
-         Devices::Cuda::setupGrid( cudaBlocksCountAlongY, cudaGridsCountAlongY, gridIdx, cudaGridSize );
+         Cuda::setupGrid( cudaBlocksCountAlongY, cudaGridsCountAlongY, gridIdx, cudaGridSize );
          GridTraverser2DBoundaryAlongY< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
                <<< cudaGridSize, cudaBlockSize, 0, s3 >>>
                ( &gridPointer.template getData< Devices::Cuda >(),
@@ -482,15 +482,15 @@ processEntities(
       const IndexType maxFaceSize = max( entitiesAlongX, entitiesAlongY );
       const IndexType blocksPerFace = maxFaceSize / cudaBlockSize.x + ( maxFaceSize % cudaBlockSize.x != 0 );
       IndexType cudaThreadsCount = 4 * cudaBlockSize.x * blocksPerFace;
-      Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCount, cudaGridsCount, cudaThreadsCount );
+      Cuda::setupThreads( cudaBlockSize, cudaBlocksCount, cudaGridsCount, cudaThreadsCount );
       //std::cerr << "blocksPerFace = " << blocksPerFace << "Threads count = " << cudaThreadsCount 
       //          << "cudaBlockCount = " << cudaBlocksCount.x << std::endl;      
       dim3 gridIdx, cudaGridSize;
       Devices::Cuda::synchronizeDevice();
       for( gridIdx.x = 0; gridIdx.x < cudaGridsCount.x; gridIdx.x++ )
       {
-         Devices::Cuda::setupGrid( cudaBlocksCount, cudaGridsCount, gridIdx, cudaGridSize );
-         //Devices::Cuda::printThreadsSetup( cudaBlockSize, cudaBlocksCountAlongX, cudaGridSize, cudaGridsCountAlongX );
+         Cuda::setupGrid( cudaBlocksCount, cudaGridsCount, gridIdx, cudaGridSize );
+         //Cuda::printThreadsSetup( cudaBlockSize, cudaBlocksCountAlongX, cudaGridSize, cudaGridsCountAlongX );
          GridTraverser2DBoundary< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
                <<< cudaGridSize, cudaBlockSize >>>
                ( &gridPointer.template getData< Devices::Cuda >(),
@@ -511,11 +511,11 @@ processEntities(
    {
       dim3 cudaBlockSize( 16, 16 );
       dim3 cudaBlocksCount, cudaGridsCount;
-      Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCount, cudaGridsCount,
-                                   end.x() - begin.x() + 1,
-                                   end.y() - begin.y() + 1 );
+      Cuda::setupThreads( cudaBlockSize, cudaBlocksCount, cudaGridsCount,
+                          end.x() - begin.x() + 1,
+                          end.y() - begin.y() + 1 );
       
-      auto& pool = CudaStreamPool::getInstance();
+      auto& pool = Cuda::StreamPool::getInstance();
       const cudaStream_t& s = pool.getStream( stream );
 
       Devices::Cuda::synchronizeDevice();
@@ -523,8 +523,8 @@ processEntities(
       for( gridIdx.y = 0; gridIdx.y < cudaGridsCount.y; gridIdx.y ++ )
          for( gridIdx.x = 0; gridIdx.x < cudaGridsCount.x; gridIdx.x ++ )
          {
-            Devices::Cuda::setupGrid( cudaBlocksCount, cudaGridsCount, gridIdx, cudaGridSize );
-	    //Devices::Cuda::printThreadsSetup( cudaBlockSize, cudaBlocksCount, cudaGridSize, cudaGridsCount );
+            Cuda::setupGrid( cudaBlocksCount, cudaGridsCount, gridIdx, cudaGridSize );
+	    //Cuda::printThreadsSetup( cudaBlockSize, cudaBlocksCount, cudaGridSize, cudaGridsCount );
             GridTraverser2D< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
                <<< cudaGridSize, cudaBlockSize, 0, s >>>
                ( &gridPointer.template getData< Devices::Cuda >(),
diff --git a/src/TNL/Meshes/GridDetails/GridTraverser_3D.hpp b/src/TNL/Meshes/GridDetails/GridTraverser_3D.hpp
index 24200c15dd89661492142c5f74f3166feb5d7ed6..489e58d779b57e7b5afea4b2d99516fb23b04f31 100644
--- a/src/TNL/Meshes/GridDetails/GridTraverser_3D.hpp
+++ b/src/TNL/Meshes/GridDetails/GridTraverser_3D.hpp
@@ -12,7 +12,7 @@
 
 #include <TNL/Meshes/Grid.h>
 #include <TNL/Pointers/SharedPointer.h>
-#include <TNL/CudaStreamPool.h>
+#include <TNL/Cuda/StreamPool.h>
 #include <TNL/Exceptions/CudaSupportMissing.h>
 #include <TNL/Meshes/GridDetails/GridTraverser.h>
 #include <TNL/Exceptions/NotImplementedError.h>
@@ -177,9 +177,9 @@ GridTraverser3D(
    typedef Meshes::Grid< 3, Real, Devices::Cuda, Index > GridType;
    typename GridType::CoordinatesType coordinates;
 
-   coordinates.x() = begin.x() + Devices::Cuda::getGlobalThreadIdx_x( gridIdx );
-   coordinates.y() = begin.y() + Devices::Cuda::getGlobalThreadIdx_y( gridIdx );
-   coordinates.z() = begin.z() + Devices::Cuda::getGlobalThreadIdx_z( gridIdx );
+   coordinates.x() = begin.x() + Cuda::getGlobalThreadIdx_x( gridIdx );
+   coordinates.y() = begin.y() + Cuda::getGlobalThreadIdx_y( gridIdx );
+   coordinates.z() = begin.z() + Cuda::getGlobalThreadIdx_z( gridIdx );
 
    if( coordinates <= end )
    {
@@ -217,8 +217,8 @@ GridTraverser3DBoundaryAlongXY(
    typedef Meshes::Grid< 3, Real, Devices::Cuda, Index > GridType;
    typename GridType::CoordinatesType coordinates;
 
-   coordinates.x() = beginX + Devices::Cuda::getGlobalThreadIdx_x( gridIdx );
-   coordinates.y() = beginY + Devices::Cuda::getGlobalThreadIdx_y( gridIdx );
+   coordinates.x() = beginX + Cuda::getGlobalThreadIdx_x( gridIdx );
+   coordinates.y() = beginY + Cuda::getGlobalThreadIdx_y( gridIdx );
    coordinates.z() = fixedZ;  
    
    if( coordinates.x() <= endX && coordinates.y() <= endY )
@@ -254,9 +254,9 @@ GridTraverser3DBoundaryAlongXZ(
    typedef Meshes::Grid< 3, Real, Devices::Cuda, Index > GridType;
    typename GridType::CoordinatesType coordinates;
 
-   coordinates.x() = beginX + Devices::Cuda::getGlobalThreadIdx_x( gridIdx );
+   coordinates.x() = beginX + Cuda::getGlobalThreadIdx_x( gridIdx );
    coordinates.y() = fixedY;
-   coordinates.z() = beginZ + Devices::Cuda::getGlobalThreadIdx_y( gridIdx );
+   coordinates.z() = beginZ + Cuda::getGlobalThreadIdx_y( gridIdx );
    
    if( coordinates.x() <= endX && coordinates.z() <= endZ )
    {
@@ -292,8 +292,8 @@ GridTraverser3DBoundaryAlongYZ(
    typename GridType::CoordinatesType coordinates;
 
    coordinates.x() = fixedX;
-   coordinates.y() = beginY + Devices::Cuda::getGlobalThreadIdx_x( gridIdx );
-   coordinates.z() = beginZ + Devices::Cuda::getGlobalThreadIdx_y( gridIdx );
+   coordinates.y() = beginY + Cuda::getGlobalThreadIdx_x( gridIdx );
+   coordinates.z() = beginZ + Cuda::getGlobalThreadIdx_y( gridIdx );
    
    if( coordinates.y() <= endY && coordinates.z() <= endZ )
    {
@@ -341,11 +341,11 @@ processEntities(
       dim3 cudaBlocksCountAlongXY, cudaBlocksCountAlongXZ, cudaBlocksCountAlongYZ,
            cudaGridsCountAlongXY, cudaGridsCountAlongXZ, cudaGridsCountAlongYZ;
       
-      Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCountAlongXY, cudaGridsCountAlongXY, entitiesAlongX, entitiesAlongY );
-      Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCountAlongXZ, cudaGridsCountAlongXZ, entitiesAlongX, entitiesAlongZ - 2 );
-      Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCountAlongYZ, cudaGridsCountAlongYZ, entitiesAlongY - 2, entitiesAlongZ - 2 );
+      Cuda::setupThreads( cudaBlockSize, cudaBlocksCountAlongXY, cudaGridsCountAlongXY, entitiesAlongX, entitiesAlongY );
+      Cuda::setupThreads( cudaBlockSize, cudaBlocksCountAlongXZ, cudaGridsCountAlongXZ, entitiesAlongX, entitiesAlongZ - 2 );
+      Cuda::setupThreads( cudaBlockSize, cudaBlocksCountAlongYZ, cudaGridsCountAlongYZ, entitiesAlongY - 2, entitiesAlongZ - 2 );
 
-      auto& pool = CudaStreamPool::getInstance();
+      auto& pool = Cuda::StreamPool::getInstance();
       Devices::Cuda::synchronizeDevice();
       
       const cudaStream_t& s1 = pool.getStream( stream );
@@ -359,7 +359,7 @@ processEntities(
       for( gridIdx.y = 0; gridIdx.y < cudaGridsCountAlongXY.y; gridIdx.y++ )
          for( gridIdx.x = 0; gridIdx.x < cudaGridsCountAlongXY.x; gridIdx.x++ )
          {
-            Devices::Cuda::setupGrid( cudaBlocksCountAlongXY, cudaGridsCountAlongXY, gridIdx, gridSize );
+            Cuda::setupGrid( cudaBlocksCountAlongXY, cudaGridsCountAlongXY, gridIdx, gridSize );
             GridTraverser3DBoundaryAlongXY< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
                   <<< cudaBlocksCountAlongXY, cudaBlockSize, 0 , s1 >>>
                   ( &gridPointer.template getData< Devices::Cuda >(),
@@ -386,7 +386,7 @@ processEntities(
       for( gridIdx.y = 0; gridIdx.y < cudaGridsCountAlongXZ.y; gridIdx.y++ )
          for( gridIdx.x = 0; gridIdx.x < cudaGridsCountAlongXZ.x; gridIdx.x++ )
          {
-            Devices::Cuda::setupGrid( cudaBlocksCountAlongXZ, cudaGridsCountAlongXZ, gridIdx, gridSize );
+            Cuda::setupGrid( cudaBlocksCountAlongXZ, cudaGridsCountAlongXZ, gridIdx, gridSize );
             GridTraverser3DBoundaryAlongXZ< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
                   <<< cudaBlocksCountAlongXZ, cudaBlockSize, 0, s3 >>>
                   ( &gridPointer.template getData< Devices::Cuda >(),
@@ -413,7 +413,7 @@ processEntities(
       for( gridIdx.y = 0; gridIdx.y < cudaGridsCountAlongYZ.y; gridIdx.y++ )
          for( gridIdx.x = 0; gridIdx.x < cudaGridsCountAlongYZ.x; gridIdx.x++ )
          {
-            Devices::Cuda::setupGrid( cudaBlocksCountAlongYZ, cudaGridsCountAlongYZ, gridIdx, gridSize );
+            Cuda::setupGrid( cudaBlocksCountAlongYZ, cudaGridsCountAlongYZ, gridIdx, gridSize );
             GridTraverser3DBoundaryAlongYZ< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
                   <<< cudaBlocksCountAlongYZ, cudaBlockSize, 0, s5 >>>
                   ( &gridPointer.template getData< Devices::Cuda >(),
@@ -450,12 +450,12 @@ processEntities(
       dim3 cudaBlockSize( 8, 8, 8 );
       dim3 cudaBlocksCount, cudaGridsCount;
       
-      Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCount, cudaGridsCount,
-                                   end.x() - begin.x() + 1,
-                                   end.y() - begin.y() + 1,
-                                   end.z() - begin.z() + 1 );
+      Cuda::setupThreads( cudaBlockSize, cudaBlocksCount, cudaGridsCount,
+                          end.x() - begin.x() + 1,
+                          end.y() - begin.y() + 1,
+                          end.z() - begin.z() + 1 );
 
-      auto& pool = CudaStreamPool::getInstance();
+      auto& pool = Cuda::StreamPool::getInstance();
       const cudaStream_t& s = pool.getStream( stream );
 
       Devices::Cuda::synchronizeDevice();
@@ -464,7 +464,7 @@ processEntities(
          for( gridIdx.y = 0; gridIdx.y < cudaGridsCount.y; gridIdx.y ++ )
             for( gridIdx.x = 0; gridIdx.x < cudaGridsCount.x; gridIdx.x ++ )
             {
-               Devices::Cuda::setupGrid( cudaBlocksCount, cudaGridsCount, gridIdx, gridSize );
+               Cuda::setupGrid( cudaBlocksCount, cudaGridsCount, gridIdx, gridSize );
                GridTraverser3D< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
                   <<< gridSize, cudaBlockSize, 0, s >>>
                   ( &gridPointer.template getData< Devices::Cuda >(),
diff --git a/src/TNL/Meshes/GridDetails/NeighborGridEntitiesStorage.h b/src/TNL/Meshes/GridDetails/NeighborGridEntitiesStorage.h
index dd9562add377b02c3ab9ca91fa4804762182b46c..29fe8ffd67d0e7e0550abd6540391d24d5a0205d 100644
--- a/src/TNL/Meshes/GridDetails/NeighborGridEntitiesStorage.h
+++ b/src/TNL/Meshes/GridDetails/NeighborGridEntitiesStorage.h
@@ -10,7 +10,7 @@
 
 #pragma once
 
-#include <TNL/Devices/Cuda.h>
+#include <TNL/Cuda/CudaCallable.h>
 #include <TNL/Meshes/DimensionTag.h>
 #include <TNL/Meshes/GridEntityConfig.h>
 #include <TNL/Meshes/GridDetails/NeighborGridEntityGetter.h>
diff --git a/src/TNL/Meshes/GridDetails/NeighborGridEntityGetter.h b/src/TNL/Meshes/GridDetails/NeighborGridEntityGetter.h
index 84a9c56d9389f31c013206c10f63fceb81ec2e0c..f7a3cc180fa3da51b8830f8001e8d616e621f891 100644
--- a/src/TNL/Meshes/GridDetails/NeighborGridEntityGetter.h
+++ b/src/TNL/Meshes/GridDetails/NeighborGridEntityGetter.h
@@ -11,7 +11,7 @@
 #pragma once
 
 #include <TNL/Assert.h>
-#include <TNL/Devices/Cuda.h>
+#include <TNL/Cuda/CudaCallable.h>
 #include <TNL/Meshes/GridEntityConfig.h>
 
 namespace TNL {
diff --git a/src/TNL/Meshes/MeshDetails/MeshEntityIndex.h b/src/TNL/Meshes/MeshDetails/MeshEntityIndex.h
index 64485dc3c7ecb31ebdeb9891830244776550e315..110fa9eefc1ca498435ce2fd11d1d84df2ad4410 100644
--- a/src/TNL/Meshes/MeshDetails/MeshEntityIndex.h
+++ b/src/TNL/Meshes/MeshDetails/MeshEntityIndex.h
@@ -17,7 +17,7 @@
 #pragma once
 
 #include <TNL/Assert.h>
-#include <TNL/Devices/Cuda.h>
+#include <TNL/Cuda/CudaCallable.h>
 
 namespace TNL {
 namespace Meshes {
diff --git a/src/TNL/Meshes/MeshDetails/Traverser_impl.h b/src/TNL/Meshes/MeshDetails/Traverser_impl.h
index 5dedf58fd2ceea521e51ece53e42d0efd65caec1..27cbba714160986d156c4c5e828274df53fb1bf3 100644
--- a/src/TNL/Meshes/MeshDetails/Traverser_impl.h
+++ b/src/TNL/Meshes/MeshDetails/Traverser_impl.h
@@ -13,6 +13,8 @@
 #include <TNL/Meshes/Traverser.h>
 
 #include <TNL/Exceptions/CudaSupportMissing.h>
+#include <TNL/Cuda/DeviceInfo.h>
+#include <TNL/Cuda/LaunchHelpers.h>
 
 namespace TNL {
 namespace Meshes {
@@ -159,8 +161,8 @@ processBoundaryEntities( const MeshPointer& meshPointer,
 
    dim3 blockSize( 256 );
    dim3 gridSize;
-   const int desGridSize = 32 * Devices::CudaDeviceInfo::getCudaMultiprocessors( Devices::CudaDeviceInfo::getActiveDevice() );
-   gridSize.x = min( desGridSize, Devices::Cuda::getNumberOfBlocks( entitiesCount, blockSize.x ) );
+   const int desGridSize = 32 * Cuda::DeviceInfo::getCudaMultiprocessors( Cuda::DeviceInfo::getActiveDevice() );
+   gridSize.x = min( desGridSize, Cuda::getNumberOfBlocks( entitiesCount, blockSize.x ) );
 
    Devices::Cuda::synchronizeDevice();
    MeshTraverserBoundaryEntitiesKernel< EntitiesDimension, EntitiesProcessor >
@@ -190,8 +192,8 @@ processInteriorEntities( const MeshPointer& meshPointer,
 
    dim3 blockSize( 256 );
    dim3 gridSize;
-   const int desGridSize = 32 * Devices::CudaDeviceInfo::getCudaMultiprocessors( Devices::CudaDeviceInfo::getActiveDevice() );
-   gridSize.x = min( desGridSize, Devices::Cuda::getNumberOfBlocks( entitiesCount, blockSize.x ) );
+   const int desGridSize = 32 * Cuda::DeviceInfo::getCudaMultiprocessors( Cuda::DeviceInfo::getActiveDevice() );
+   gridSize.x = min( desGridSize, Cuda::getNumberOfBlocks( entitiesCount, blockSize.x ) );
 
    Devices::Cuda::synchronizeDevice();
    MeshTraverserInteriorEntitiesKernel< EntitiesDimension, EntitiesProcessor >
@@ -221,8 +223,8 @@ processAllEntities( const MeshPointer& meshPointer,
 
    dim3 blockSize( 256 );
    dim3 gridSize;
-   const int desGridSize = 32 * Devices::CudaDeviceInfo::getCudaMultiprocessors( Devices::CudaDeviceInfo::getActiveDevice() );
-   gridSize.x = min( desGridSize, Devices::Cuda::getNumberOfBlocks( entitiesCount, blockSize.x ) );
+   const int desGridSize = 32 * Cuda::DeviceInfo::getCudaMultiprocessors( Cuda::DeviceInfo::getActiveDevice() );
+   gridSize.x = min( desGridSize, Cuda::getNumberOfBlocks( entitiesCount, blockSize.x ) );
 
    Devices::Cuda::synchronizeDevice();
    MeshTraverserAllEntitiesKernel< EntitiesDimension, EntitiesProcessor >
diff --git a/src/TNL/Object.h b/src/TNL/Object.h
index ba41510953c976ab4730084802a014d80108d267..ff7432635a13379c83bf49af758f1da7dd8f6b28 100644
--- a/src/TNL/Object.h
+++ b/src/TNL/Object.h
@@ -12,7 +12,6 @@
 
 #include <vector>
 
-#include <TNL/Devices/CudaCallable.h>
 #include <TNL/String.h>
 #include <TNL/File.h>
 
diff --git a/src/TNL/ParallelFor.h b/src/TNL/ParallelFor.h
index 04af2740807b9139ca0b8452b9c1b7bc52f5a8c8..cc9ce7080679a2a6fbdddd067095eb110a3d1c2a 100644
--- a/src/TNL/ParallelFor.h
+++ b/src/TNL/ParallelFor.h
@@ -12,7 +12,9 @@
 
 #include <TNL/Devices/Host.h>
 #include <TNL/Devices/Cuda.h>
-#include <TNL/Devices/CudaDeviceInfo.h>
+#include <TNL/Cuda/CheckDevice.h>
+#include <TNL/Cuda/DeviceInfo.h>
+#include <TNL/Cuda/LaunchHelpers.h>
 #include <TNL/Math.h>
 
 /****
@@ -203,14 +205,14 @@ struct ParallelFor< Devices::Cuda, Mode >
       if( end > start ) {
          dim3 blockSize( 256 );
          dim3 gridSize;
-         gridSize.x = TNL::min( Devices::Cuda::getMaxGridSize(), Devices::Cuda::getNumberOfBlocks( end - start, blockSize.x ) );
+         gridSize.x = TNL::min( Cuda::getMaxGridSize(), Cuda::getNumberOfBlocks( end - start, blockSize.x ) );
 
          if( (std::size_t) blockSize.x * gridSize.x >= (std::size_t) end - start )
             ParallelForKernel< false ><<< gridSize, blockSize >>>( start, end, f, args... );
          else {
             // decrease the grid size and align to the number of multiprocessors
-            const int desGridSize = 32 * Devices::CudaDeviceInfo::getCudaMultiprocessors( Devices::CudaDeviceInfo::getActiveDevice() );
-            gridSize.x = TNL::min( desGridSize, Devices::Cuda::getNumberOfBlocks( end - start, blockSize.x ) );
+            const int desGridSize = 32 * Cuda::DeviceInfo::getCudaMultiprocessors( Cuda::DeviceInfo::getActiveDevice() );
+            gridSize.x = TNL::min( desGridSize, Cuda::getNumberOfBlocks( end - start, blockSize.x ) );
             ParallelForKernel< true ><<< gridSize, blockSize >>>( start, end, f, args... );
          }
 
@@ -253,8 +255,8 @@ struct ParallelFor2D< Devices::Cuda, Mode >
             blockSize.y = TNL::min( 8, sizeY );
          }
          dim3 gridSize;
-         gridSize.x = TNL::min( Devices::Cuda::getMaxGridSize(), Devices::Cuda::getNumberOfBlocks( sizeX, blockSize.x ) );
-         gridSize.y = TNL::min( Devices::Cuda::getMaxGridSize(), Devices::Cuda::getNumberOfBlocks( sizeY, blockSize.y ) );
+         gridSize.x = TNL::min( Cuda::getMaxGridSize(), Cuda::getNumberOfBlocks( sizeX, blockSize.x ) );
+         gridSize.y = TNL::min( Cuda::getMaxGridSize(), Cuda::getNumberOfBlocks( sizeY, blockSize.y ) );
 
          dim3 gridCount;
          gridCount.x = roundUpDivision( sizeX, blockSize.x * gridSize.x );
@@ -337,9 +339,9 @@ struct ParallelFor3D< Devices::Cuda, Mode >
             blockSize.z = TNL::min( 4, sizeZ );
          }
          dim3 gridSize;
-         gridSize.x = TNL::min( Devices::Cuda::getMaxGridSize(), Devices::Cuda::getNumberOfBlocks( sizeX, blockSize.x ) );
-         gridSize.y = TNL::min( Devices::Cuda::getMaxGridSize(), Devices::Cuda::getNumberOfBlocks( sizeY, blockSize.y ) );
-         gridSize.z = TNL::min( Devices::Cuda::getMaxGridSize(), Devices::Cuda::getNumberOfBlocks( sizeZ, blockSize.z ) );
+         gridSize.x = TNL::min( Cuda::getMaxGridSize(), Cuda::getNumberOfBlocks( sizeX, blockSize.x ) );
+         gridSize.y = TNL::min( Cuda::getMaxGridSize(), Cuda::getNumberOfBlocks( sizeY, blockSize.y ) );
+         gridSize.z = TNL::min( Cuda::getMaxGridSize(), Cuda::getNumberOfBlocks( sizeZ, blockSize.z ) );
 
          dim3 gridCount;
          gridCount.x = roundUpDivision( sizeX, blockSize.x * gridSize.x );
diff --git a/src/TNL/Pointers/DevicePointer.h b/src/TNL/Pointers/DevicePointer.h
index 136c809cda7b8cbd3ad87957eca4f37f5bc6ff88..d3bf9b07f9b2c59030433e1bca633a2601bb73d0 100644
--- a/src/TNL/Pointers/DevicePointer.h
+++ b/src/TNL/Pointers/DevicePointer.h
@@ -17,6 +17,7 @@
 #include <TNL/Devices/Cuda.h>
 #include <TNL/Pointers/SmartPointer.h>
 #include <TNL/TypeInfo.h>
+#include <TNL/Cuda/MemoryHelpers.h>
 
 #include <cstring>  // std::memcpy, std::memcmp
 
@@ -422,7 +423,7 @@ class DevicePointer< Object, Devices::Cuda > : public SmartPointer
          this->pointer = &obj;
          this->pd = new PointerData();
          // pass to device
-         this->cuda_pointer = Devices::Cuda::passToDevice( *this->pointer );
+         this->cuda_pointer = Cuda::passToDevice( *this->pointer );
          // set last-sync state
          this->set_last_sync_state();
          Devices::Cuda::insertSmartPointer( this );
@@ -456,7 +457,7 @@ class DevicePointer< Object, Devices::Cuda > : public SmartPointer
                delete this->pd;
                this->pd = nullptr;
                if( this->cuda_pointer )
-                  Devices::Cuda::freeFromDevice( this->cuda_pointer );
+                  Cuda::freeFromDevice( this->cuda_pointer );
             }
          }
       }
diff --git a/src/TNL/Pointers/SharedPointerCuda.h b/src/TNL/Pointers/SharedPointerCuda.h
index 9c883c23a7c5f1c73e6861beb96bfafd61bd7571..27a4aabef4ed7e1383ae5f141e1c7e56a6f5d350 100644
--- a/src/TNL/Pointers/SharedPointerCuda.h
+++ b/src/TNL/Pointers/SharedPointerCuda.h
@@ -16,6 +16,7 @@
 
 #include <TNL/Devices/Cuda.h>
 #include <TNL/Pointers/SmartPointer.h>
+#include <TNL/Cuda/MemoryHelpers.h>
 
 #include <cstring>   // std::memcpy, std::memcmp
 #include <cstddef>   // std::nullptr_t
@@ -570,7 +571,7 @@ class SharedPointer< Object, Devices::Cuda > : public SmartPointer
       {
          this->pd = new PointerData( args... );
          // pass to device
-         this->cuda_pointer = Devices::Cuda::passToDevice( this->pd->data );
+         this->cuda_pointer = Cuda::passToDevice( this->pd->data );
          // set last-sync state
          this->set_last_sync_state();
 #ifdef TNL_DEBUG_SHARED_POINTERS
@@ -608,7 +609,7 @@ class SharedPointer< Object, Devices::Cuda > : public SmartPointer
                delete this->pd;
                this->pd = nullptr;
                if( this->cuda_pointer )
-                  Devices::Cuda::freeFromDevice( this->cuda_pointer );
+                  Cuda::freeFromDevice( this->cuda_pointer );
 #ifdef TNL_DEBUG_SHARED_POINTERS
                std::cerr << "...deleted data." << std::endl;
 #endif
diff --git a/src/TNL/Pointers/SharedPointerHost.h b/src/TNL/Pointers/SharedPointerHost.h
index 087cfd79e9fbe77ac7e4c9a5a2e3a8c2cce5fe5c..39a6d4da4a2b8ab8b964110173d1716fade1ac71 100644
--- a/src/TNL/Pointers/SharedPointerHost.h
+++ b/src/TNL/Pointers/SharedPointerHost.h
@@ -15,7 +15,7 @@
 #include "SharedPointer.h"
 
 #include <TNL/Devices/Host.h>
-#include <TNL/Devices/CudaCallable.h>
+#include <TNL/Cuda/CudaCallable.h>
 #include <TNL/Pointers/SmartPointer.h>
 
 #include <cstddef>   // std::nullptr_t
diff --git a/src/TNL/Pointers/UniquePointer.h b/src/TNL/Pointers/UniquePointer.h
index 53f2dac5b87a012354fe06675efafcecdab48dfc..9a7a2b677769fde29d49419141a76cd9cd6f4373 100644
--- a/src/TNL/Pointers/UniquePointer.h
+++ b/src/TNL/Pointers/UniquePointer.h
@@ -16,6 +16,7 @@
 #include <TNL/Devices/Host.h>
 #include <TNL/Devices/Cuda.h>
 #include <TNL/Pointers/SmartPointer.h>
+#include <TNL/Cuda/MemoryHelpers.h>
 
 #include <cstring>  // std::memcpy, std::memcmp
 #include <cstddef>  // std::nullptr_t
@@ -272,7 +273,7 @@ class UniquePointer< Object, Devices::Cuda > : public SmartPointer
       {
          this->pd = new PointerData( args... );
          // pass to device
-         this->cuda_pointer = Devices::Cuda::passToDevice( this->pd->data );
+         this->cuda_pointer = Cuda::passToDevice( this->pd->data );
          // set last-sync state
          this->set_last_sync_state();
          Devices::Cuda::insertSmartPointer( this );
@@ -300,7 +301,7 @@ class UniquePointer< Object, Devices::Cuda > : public SmartPointer
          if( this->pd )
             delete this->pd;
          if( this->cuda_pointer )
-            Devices::Cuda::freeFromDevice( this->cuda_pointer );
+            Cuda::freeFromDevice( this->cuda_pointer );
       }
 
       PointerData* pd;
diff --git a/src/TNL/Solvers/PDE/BoundaryConditionsSetter.h b/src/TNL/Solvers/PDE/BoundaryConditionsSetter.h
index a28a64cf5567eaee40f5d4efca3fd24af3dd2819..d1b871c25c0a998b68a770d4c0629ad76d20dfb5 100644
--- a/src/TNL/Solvers/PDE/BoundaryConditionsSetter.h
+++ b/src/TNL/Solvers/PDE/BoundaryConditionsSetter.h
@@ -11,7 +11,7 @@
 
 #pragma once
 
-#include <TNL/Devices/Cuda.h>
+#include <TNL/Cuda/CudaCallable.h>
 #include <TNL/Functions/FunctionAdapter.h>
 #include <TNL/Pointers/SharedPointer.h>
 #include <TNL/Meshes/Traverser.h>
diff --git a/src/TNL/StaticFor.h b/src/TNL/StaticFor.h
index 990036dfc0090708851468e03e991a30a07cc835..c37763aaa9fb5ae740755b1b8a3750c71b92ae65 100644
--- a/src/TNL/StaticFor.h
+++ b/src/TNL/StaticFor.h
@@ -10,7 +10,7 @@
 
 #pragma once
 
-#include <TNL/Devices/Cuda.h>
+#include <TNL/Cuda/CudaCallable.h>
 
 namespace TNL {
 
diff --git a/src/TNL/TemplateStaticFor.h b/src/TNL/TemplateStaticFor.h
index 88ad764fd9b78d0348469a115ee1cb83ecb7993b..efd9d1ad9c686a6ff9780656ce3cd0d9d85727e0 100644
--- a/src/TNL/TemplateStaticFor.h
+++ b/src/TNL/TemplateStaticFor.h
@@ -13,7 +13,7 @@
 #include <utility>
 #include <type_traits>
 
-#include <TNL/Devices/CudaCallable.h>
+#include <TNL/Cuda/CudaCallable.h>
 
 namespace TNL {
 namespace detail {
diff --git a/src/UnitTests/AssertCudaTest.cu b/src/UnitTests/AssertCudaTest.cu
index 9d4865eb9c8ba3b7aaa6f8bf5506fabe17be4483..8f42da6772dc2b4b6dbe3f185ca96bc8efde9290 100644
--- a/src/UnitTests/AssertCudaTest.cu
+++ b/src/UnitTests/AssertCudaTest.cu
@@ -13,7 +13,7 @@
 #endif
 
 #include <TNL/Assert.h>
-#include <TNL/Devices/Cuda.h>
+#include <TNL/Cuda/CheckDevice.h>
 #include <TNL/Exceptions/CudaRuntimeError.h>
 
 #ifdef HAVE_GTEST
diff --git a/src/UnitTests/Containers/ArrayTest.h b/src/UnitTests/Containers/ArrayTest.h
index 25c7fda4935dbfcd5d064d22bcce86adfddc4b2e..69dd2d25227202a4391aebb0054dae8acf8dfa26 100644
--- a/src/UnitTests/Containers/ArrayTest.h
+++ b/src/UnitTests/Containers/ArrayTest.h
@@ -15,6 +15,7 @@
 
 #include <TNL/Containers/Array.h>
 #include <TNL/Containers/Vector.h>
+#include <TNL/Pointers/DevicePointer.h>
 
 #include "gtest/gtest.h"
 
@@ -312,9 +313,9 @@ void testArrayElementwiseAccess( Array< Value, Devices::Cuda, Index >&& u )
 #ifdef HAVE_CUDA
    u.setSize( 10 );
    using ArrayType = Array< Value, Devices::Cuda, Index >;
-   ArrayType* kernel_u = Devices::Cuda::passToDevice( u );
-   testSetGetElementKernel<<< 1, 16 >>>( kernel_u );
-   Devices::Cuda::freeFromDevice( kernel_u );
+   Pointers::DevicePointer< ArrayType > kernel_u( u );
+   testSetGetElementKernel<<< 1, 16 >>>( &kernel_u.template modifyData< Devices::Cuda >() );
+   cudaDeviceSynchronize();
    TNL_CHECK_CUDA_DEVICE;
    for( int i = 0; i < 10; i++ ) {
       EXPECT_EQ( u.getElement( i ), i );