diff --git a/src/Benchmarks/SpMV/spmv.h b/src/Benchmarks/SpMV/spmv.h
index 45f715a5b3be31bbb9052a73db392ceeeff4425d..d8b20898310a1353cad1bced491410be620abc08 100644
--- a/src/Benchmarks/SpMV/spmv.h
+++ b/src/Benchmarks/SpMV/spmv.h
@@ -232,8 +232,10 @@ benchmarkSpMV( Benchmark& benchmark,
     resultcuSPARSEDeviceVector2 = deviceVector2;
     
     // Difference between GPU (curent format) and GPU-cuSPARSE results
-    Real cuSparseDifferenceAbsMax = resultDeviceVector2.differenceAbsMax( resultcuSPARSEDeviceVector2 );
-    Real cuSparseDifferenceLpNorm = resultDeviceVector2.differenceLpNorm( resultcuSPARSEDeviceVector2, 1 );
+    //Real cuSparseDifferenceAbsMax = resultDeviceVector2.differenceAbsMax( resultcuSPARSEDeviceVector2 );
+    Real cuSparseDifferenceAbsMax = max( abs( resultDeviceVector2 - resultcuSPARSEDeviceVector2 ) );
+    //Real cuSparseDifferenceLpNorm = resultDeviceVector2.differenceLpNorm( resultcuSPARSEDeviceVector2, 1 );
+    Real cuSparseDifferenceLpNorm = lpNorm( resultDeviceVector2 - resultcuSPARSEDeviceVector2, 1 );
     
     std::string GPUxGPUcuSparse_resultDifferenceAbsMax = "GPUxGPUcuSPARSE differenceAbsMax = " + std::to_string( cuSparseDifferenceAbsMax );
     std::string GPUxGPUcuSparse_resultDifferenceLpNorm = "GPUxGPUcuSPARSE differenceLpNorm = " + std::to_string( cuSparseDifferenceLpNorm );
@@ -243,8 +245,10 @@ benchmarkSpMV( Benchmark& benchmark,
     
     
     // Difference between CPU and GPU results for the current format
-    Real differenceAbsMax = resultHostVector2.differenceAbsMax( resultDeviceVector2 );
-    Real differenceLpNorm = resultHostVector2.differenceLpNorm( resultDeviceVector2, 1 );
+    //Real differenceAbsMax = resultHostVector2.differenceAbsMax( resultDeviceVector2 );
+    Real differenceAbsMax = max( abs( resultHostVector2 - resultDeviceVector2 ) );
+    //Real differenceLpNorm = resultHostVector2.differenceLpNorm( resultDeviceVector2, 1 );
+    Real differenceLpNorm = lpNorm( resultHostVector2 - resultDeviceVector2, 1 );
     
     std::string CPUxGPU_resultDifferenceAbsMax = "CPUxGPU differenceAbsMax = " + std::to_string( differenceAbsMax );
     std::string CPUxGPU_resultDifferenceLpNorm = "CPUxGPU differenceLpNorm = " + std::to_string( differenceLpNorm );
diff --git a/src/TNL/Matrices/AdEllpack_impl.h b/src/TNL/Matrices/AdEllpack_impl.h
index bea4a1b4fb4d7d093c70456686f66b77d00aec7d..b7b97ff93550ef8c7289b749156e1fd5973e2f7d 100644
--- a/src/TNL/Matrices/AdEllpack_impl.h
+++ b/src/TNL/Matrices/AdEllpack_impl.h
@@ -1064,7 +1064,7 @@ void AdEllpack< Real, Device, Index >::spmvCuda4( const InVector& inVector,
                                                            OutVector& outVector,
                                                            const int gridIdx ) const
 {
-    IndexType globalIdx = ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+    IndexType globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
     IndexType warpIdx = globalIdx >> 5;
     IndexType inWarpIdx = globalIdx & ( this->warpSize - 1 );
     if( globalIdx >= this->reduceMap.getSize() )
@@ -1129,14 +1129,14 @@ void AdEllpack< Real, Device, Index >::spmvCuda8( const InVector& inVector,
                                                            OutVector& outVector,
                                                            const int gridIdx ) const
 {
-    IndexType globalIdx = ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+    IndexType globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
     IndexType warpIdx = globalIdx >> 5;
     IndexType inWarpIdx = globalIdx & ( this->warpSize - 1 );
     if( globalIdx >= this->reduceMap.getSize() )
         return;
 
     const int blockSize = 128;
-    Real* temp = Devices::Cuda::getSharedMemory< Real >();
+    Real* temp = Cuda::getSharedMemory< Real >();
     __shared__ IndexType reduceMap[ blockSize ];
     reduceMap[ threadIdx.x ] = this->reduceMap[ globalIdx ];
     temp[ threadIdx.x ] = 0.0;
@@ -1207,14 +1207,14 @@ void AdEllpack< Real, Device, Index >::spmvCuda16( const InVector& inVector,
                                                          OutVector& outVector,
                                                          const int gridIdx ) const
 {
-    IndexType globalIdx = ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+    IndexType globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
     IndexType warpIdx = globalIdx >> 5;
     IndexType inWarpIdx = globalIdx & ( this->warpSize - 1 );
     if( globalIdx >= this->reduceMap.getSize() )
         return;
 
     const int blockSize = 128;
-    Real* temp = Devices::Cuda::getSharedMemory< Real >();
+    Real* temp = Cuda::getSharedMemory< Real >();
     __shared__ IndexType reduceMap[ blockSize ];
     reduceMap[ threadIdx.x ] = this->reduceMap[ globalIdx ];
     temp[ threadIdx.x ] = 0.0;
@@ -1293,7 +1293,7 @@ void AdEllpack< Real, Device, Index >::spmvCuda32( const InVector& inVector,
 	return;
 
     const int blockSize = 96;
-    Real* temp = Devices::Cuda::getSharedMemory< Real >();
+    Real* temp = Cuda::getSharedMemory< Real >();
     __shared__ IndexType reduceMap[ blockSize ];
     reduceMap[ threadIdx.x ] = this->reduceMap[ globalIdx ];
     temp[ threadIdx.x ] = 0.0;
@@ -1441,9 +1441,9 @@ public:
 #ifdef HAVE_CUDA
       typedef AdEllpack< Real, Devices::Cuda, Index > Matrix;
       typedef typename Matrix::IndexType IndexType;
-	   Matrix* kernel_this = Devices::Cuda::passToDevice( matrix );
-	   InVector* kernel_inVector = Devices::Cuda::passToDevice( inVector );
-	   OutVector* kernel_outVector = Devices::Cuda::passToDevice( outVector );
+	   Matrix* kernel_this = Cuda::passToDevice( matrix );
+	   InVector* kernel_inVector = Cuda::passToDevice( inVector );
+	   OutVector* kernel_outVector = Cuda::passToDevice( outVector );
       TNL_CHECK_CUDA_DEVICE;
 
       if( matrix.totalLoad < 2 )
@@ -1510,16 +1510,16 @@ public:
                                                       gridIdx );
 	    }
 	    TNL_CHECK_CUDA_DEVICE;
-	    Devices::Cuda::freeFromDevice( kernel_this );
-	    Devices::Cuda::freeFromDevice( kernel_inVector );
-	    Devices::Cuda::freeFromDevice( kernel_outVector );
+	    Cuda::freeFromDevice( kernel_this );
+	    Cuda::freeFromDevice( kernel_inVector );
+	    Cuda::freeFromDevice( kernel_outVector );
 	    TNL_CHECK_CUDA_DEVICE;
 	}
 	else if( matrix.totalLoad < 16 )
 	{
 	    dim3 blockSize( 128 ), cudaGridSize( Cuda::getMaxGridSize() );
 	    IndexType cudaBlocks = roundUpDivision( matrix.reduceMap.getSize(), blockSize.x );
-	    IndexType cudaGrids = roundUpDivision( cudaBlocks, Devices::Cuda::getMaxGridSize() );
+	    IndexType cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() );
             for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ )
 	    {
 	        if( gridIdx == cudaGrids - 1 )
@@ -1556,9 +1556,9 @@ public:
                                                        gridIdx );
 	    }
 	    TNL_CHECK_CUDA_DEVICE;
-	    Devices::Cuda::freeFromDevice( kernel_this );
-	    Devices::Cuda::freeFromDevice( kernel_inVector );
-	    Devices::Cuda::freeFromDevice( kernel_outVector );
+	    Cuda::freeFromDevice( kernel_this );
+	    Cuda::freeFromDevice( kernel_inVector );
+	    Cuda::freeFromDevice( kernel_outVector );
 	    TNL_CHECK_CUDA_DEVICE;
 	}
 #endif // HAVE_CUDA
diff --git a/src/TNL/Matrices/BiEllpack_impl.h b/src/TNL/Matrices/BiEllpack_impl.h
index 2789c92ebd2fd0f60935aa3cb20b09487d63dd4f..c659b758e9cffe531a101baf8fe3cd812436fe2c 100644
--- a/src/TNL/Matrices/BiEllpack_impl.h
+++ b/src/TNL/Matrices/BiEllpack_impl.h
@@ -1406,7 +1406,7 @@ public:
 		for( int gridIdx = 0; gridIdx < cudaGrids; gridIdx++ )
 		{
 		     if( gridIdx == cudaGrids - 1 )
-		         cudaGridSize.x = cudaBlocks % Devices::Cuda::getMaxGridSize();
+		         cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize();
 		     performRowBubbleSortCuda< Real, Index >
 		     	 	 	 	 	 	 <<< cudaGridSize, cudaBlockSize >>>
 		                             ( kernel_this,
@@ -1436,7 +1436,7 @@ public:
 		for( int gridIdx = 0; gridIdx < cudaGrids; gridIdx++ )
 		{
 		     if( gridIdx == cudaGrids - 1 )
-		         cudaGridSize.x = cudaBlocks % Devices::Cuda::getMaxGridSize();
+		         cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize();
 		     computeColumnSizesCuda< Real, Index >
 		     	 	 	 	 	   <<< cudaGridSize, cudaBlockSize >>>
 		                           ( kernel_this,
diff --git a/src/TNL/Matrices/ChunkedEllpack_impl.h b/src/TNL/Matrices/ChunkedEllpack_impl.h
index a77b4a7667ae8bc990f91f35d472e6952840208d..23ba2ed5e012e5f3e6cfdef43441031b28d79f5c 100644
--- a/src/TNL/Matrices/ChunkedEllpack_impl.h
+++ b/src/TNL/Matrices/ChunkedEllpack_impl.h
@@ -1230,8 +1230,8 @@ ChunkedEllpack< Real, Device, Index >::operator=( const ChunkedEllpack< Real2, D
    
    // host -> cuda
    if( std::is_same< Device, Devices::Cuda >::value ) {
-       typename ValuesVector::HostType tmpValues;
-       typename ColumnIndexesVector::HostType tmpColumnIndexes;
+       typename ValuesVector::Self< typename ValuesVector::RealType, Devices::Host > tmpValues;
+       typename ColumnIndexesVector::Self< typename ColumnIndexesVector::RealType, Devices::Host > tmpColumnIndexes;
        tmpValues.setLike( matrix.values );
        tmpColumnIndexes.setLike( matrix.columnIndexes );