diff --git a/src/TNL/Matrices/CMakeLists.txt b/src/TNL/Matrices/CMakeLists.txt
index 82c8924bf762b89b8b3378c52acbb51e1ae1772f..f60a43c87d951152cda5491c23e6626a25d5bd2e 100644
--- a/src/TNL/Matrices/CMakeLists.txt
+++ b/src/TNL/Matrices/CMakeLists.txt
@@ -14,6 +14,8 @@ SET( headers AdEllpack.h
              Multidiagonal_impl.h
              Sparse.h
              Sparse_impl.h
+             SparseOperations.h
+             SparseOperations_impl.h
              Ellpack.h
              Ellpack_impl.h
              SlicedEllpack.h
diff --git a/src/TNL/Matrices/Sparse.h b/src/TNL/Matrices/Sparse.h
index 110dd7a40a57372c9fc65ea94c6bac1adf3c0f54..2ee49219ee2d8fa8be6662c417dd68c3c3a6c690 100644
--- a/src/TNL/Matrices/Sparse.h
+++ b/src/TNL/Matrices/Sparse.h
@@ -14,7 +14,7 @@
 #include <TNL/Matrices/SparseRow.h>
 
 namespace TNL {
-namespace Matrices {   
+namespace Matrices {
 
 template< typename Real,
           typename Device,
@@ -62,14 +62,8 @@ class Sparse : public Matrix< Real, Device, Index >
    Index maxRowLength;
 };
 
-
-// This cannot be a method of the Sparse class, because the implementation uses
-// methods (marked with __cuda_callable__) which are defined only on the
-// subclasses, but are not virtual methods of Sparse.
-template< typename Matrix1, typename Matrix2 >
-void copySparseMatrix( Matrix1& A, const Matrix2& B );
-
 } // namespace Matrices
 } // namespace TNL
 
 #include <TNL/Matrices/Sparse_impl.h>
+#include <TNL/Matrices/SparseOperations.h>
diff --git a/src/TNL/Matrices/SparseOperations.h b/src/TNL/Matrices/SparseOperations.h
new file mode 100644
index 0000000000000000000000000000000000000000..e375d596ad439520289bbf6d5e3a0006dc34d2c6
--- /dev/null
+++ b/src/TNL/Matrices/SparseOperations.h
@@ -0,0 +1,28 @@
+/***************************************************************************
+                          SparseOperations.h  -  description
+                             -------------------
+    begin                : Oct 4, 2018
+    copyright            : (C) 2018 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Jakub KlinkovskĂ˝
+
+// Note that these functions cannot be methods of the Sparse class, because
+// their implementation uses methods (marked with __cuda_callable__) which are
+// defined only on the subclasses, but are not virtual methods of Sparse.
+
+#pragma once
+
+namespace TNL {
+namespace Matrices {
+
+template< typename Matrix1, typename Matrix2 >
+void copySparseMatrix( Matrix1& A, const Matrix2& B );
+
+} // namespace Matrices
+} // namespace TNL
+
+#include "SparseOperations_impl.h"
diff --git a/src/TNL/Matrices/SparseOperations_impl.h b/src/TNL/Matrices/SparseOperations_impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..0fbb02005d1fe070087d5eceb180efee0daea166
--- /dev/null
+++ b/src/TNL/Matrices/SparseOperations_impl.h
@@ -0,0 +1,161 @@
+/***************************************************************************
+                          SparseOperations_impl.h  -  description
+                             -------------------
+    begin                : Oct 4, 2018
+    copyright            : (C) 2018 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Jakub KlinkovskĂ˝
+
+#pragma once
+
+#include <TNL/Pointers/DevicePointer.h>
+
+namespace TNL {
+namespace Matrices {
+
+#ifdef HAVE_CUDA
+template< typename Vector, typename Matrix >
+__global__ void
+SparseMatrixSetRowLengthsVectorKernel( Vector* rowLengths,
+                                       const Matrix* matrix,
+                                       typename Matrix::IndexType rows,
+                                       typename Matrix::IndexType cols )
+{
+   using IndexType = typename Matrix::IndexType;
+
+   IndexType rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
+   const IndexType gridSize = blockDim.x * gridDim.x;
+
+   while( rowIdx < rows ) {
+      const auto max_length = matrix->getRowLengthFast( rowIdx );
+      const auto row = matrix->getRow( rowIdx );
+      IndexType length = 0;
+      for( IndexType c_j = 0; c_j < max_length; c_j++ )
+         if( row.getElementColumn( c_j ) < cols )
+            length++;
+         else
+            break;
+      rowLengths[ rowIdx ] = length;
+      rowIdx += gridSize;
+   }
+}
+
+template< typename Matrix1, typename Matrix2 >
+__global__ void
+SparseMatrixCopyKernel( Matrix1* A,
+                        const Matrix2* B,
+                        const typename Matrix2::IndexType* rowLengths,
+                        typename Matrix2::IndexType rows )
+{
+   using IndexType = typename Matrix2::IndexType;
+
+   IndexType rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
+   const IndexType gridSize = blockDim.x * gridDim.x;
+
+   while( rowIdx < rows ) {
+      const auto length = rowLengths[ rowIdx ];
+      const auto rowB = B->getRow( rowIdx );
+      auto rowA = A->getRow( rowIdx );
+      for( IndexType c = 0; c < length; c++ )
+         rowA.setElement( c, rowB.getElementColumn( c ), rowB.getElementValue( c ) );
+      rowIdx += gridSize;
+   }
+}
+#endif
+
+
+template< typename Matrix1, typename Matrix2 >
+void
+copySparseMatrix( Matrix1& A, const Matrix2& B )
+{
+   static_assert( std::is_same< typename Matrix1::RealType, typename Matrix2::RealType >::value,
+                  "The matrices must have the same RealType." );
+   static_assert( std::is_same< typename Matrix1::DeviceType, typename Matrix2::DeviceType >::value,
+                  "The matrices must be allocated on the same device." );
+   static_assert( std::is_same< typename Matrix1::IndexType, typename Matrix2::IndexType >::value,
+                  "The matrices must have the same IndexType." );
+
+   using RealType = typename Matrix1::RealType;
+   using DeviceType = typename Matrix1::DeviceType;
+   using IndexType = typename Matrix1::IndexType;
+
+   const IndexType rows = B.getRows();
+   const IndexType cols = B.getColumns();
+
+   A.setDimensions( rows, cols );
+
+   if( std::is_same< DeviceType, Devices::Host >::value ) {
+      // set row lengths
+      typename Matrix1::CompressedRowLengthsVector rowLengths;
+      rowLengths.setSize( rows );
+#ifdef HAVE_OPENMP
+#pragma omp parallel for if( Devices::Host::isOMPEnabled() )
+#endif
+      for( IndexType i = 0; i < rows; i++ ) {
+         const auto max_length = B.getRowLength( i );
+         const auto row = B.getRow( i );
+         IndexType length = 0;
+         for( IndexType c_j = 0; c_j < max_length; c_j++ )
+            if( row.getElementColumn( c_j ) < cols )
+               length++;
+            else
+               break;
+         rowLengths[ i ] = length;
+      }
+      A.setCompressedRowLengths( rowLengths );
+
+#ifdef HAVE_OPENMP
+#pragma omp parallel for if( Devices::Host::isOMPEnabled() )
+#endif
+      for( IndexType i = 0; i < rows; i++ ) {
+         const auto length = rowLengths[ i ];
+         const auto rowB = B.getRow( i );
+         auto rowA = A.getRow( i );
+         for( IndexType c = 0; c < length; c++ )
+            rowA.setElement( c, rowB.getElementColumn( c ), rowB.getElementValue( c ) );
+      }
+   }
+
+   if( std::is_same< DeviceType, Devices::Cuda >::value ) {
+#ifdef HAVE_CUDA
+      dim3 blockSize( 256 );
+      dim3 gridSize;
+      const IndexType desGridSize = 32 * Devices::CudaDeviceInfo::getCudaMultiprocessors( Devices::CudaDeviceInfo::getActiveDevice() );
+      gridSize.x = min( desGridSize, Devices::Cuda::getNumberOfBlocks( rows, blockSize.x ) );
+
+      typename Matrix1::CompressedRowLengthsVector rowLengths;
+      rowLengths.setSize( rows );
+
+      Pointers::DevicePointer< Matrix1 > Apointer( A );
+      const Pointers::DevicePointer< const Matrix2 > Bpointer( B );
+
+      // set row lengths
+      Devices::Cuda::synchronizeDevice();
+      SparseMatrixSetRowLengthsVectorKernel<<< gridSize, blockSize >>>(
+            rowLengths.getData(),
+            &Bpointer.template getData< TNL::Devices::Cuda >(),
+            rows,
+            cols );
+      TNL_CHECK_CUDA_DEVICE;
+      Apointer->setCompressedRowLengths( rowLengths );
+
+      // copy rows
+      Devices::Cuda::synchronizeDevice();
+      SparseMatrixCopyKernel<<< gridSize, blockSize >>>(
+            &Apointer.template modifyData< TNL::Devices::Cuda >(),
+            &Bpointer.template getData< TNL::Devices::Cuda >(),
+            rowLengths.getData(),
+            rows );
+      TNL_CHECK_CUDA_DEVICE;
+#else
+      throw Exceptions::CudaSupportMissing();
+#endif
+   }
+}
+
+} // namespace Matrices
+} // namespace TNL
diff --git a/src/TNL/Matrices/Sparse_impl.h b/src/TNL/Matrices/Sparse_impl.h
index ba4558d3f989374f65c12ace41391323e5a6adba..cd57637cbd773dc2ce25a13ad9b39191a7d2507b 100644
--- a/src/TNL/Matrices/Sparse_impl.h
+++ b/src/TNL/Matrices/Sparse_impl.h
@@ -11,7 +11,6 @@
 #pragma once
 
 #include "Sparse.h"
-#include <TNL/Pointers/DevicePointer.h>
 
 namespace TNL {
 namespace Matrices {
@@ -134,145 +133,5 @@ void Sparse< Real, Device, Index >::printStructure( std::ostream& str ) const
    TNL_ASSERT_TRUE( false, "Not implemented yet." );
 }
 
-
-#ifdef HAVE_CUDA
-template< typename Vector, typename Matrix >
-__global__ void
-SparseMatrixSetRowLengthsVectorKernel( Vector* rowLengths,
-                                       const Matrix* matrix,
-                                       typename Matrix::IndexType rows,
-                                       typename Matrix::IndexType cols )
-{
-   using IndexType = typename Matrix::IndexType;
-
-   IndexType rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
-   const IndexType gridSize = blockDim.x * gridDim.x;
-
-   while( rowIdx < rows ) {
-      const auto max_length = matrix->getRowLengthFast( rowIdx );
-      const auto row = matrix->getRow( rowIdx );
-      IndexType length = 0;
-      for( IndexType c_j = 0; c_j < max_length; c_j++ )
-         if( row.getElementColumn( c_j ) < cols )
-            length++;
-         else
-            break;
-      rowLengths[ rowIdx ] = length;
-      rowIdx += gridSize;
-   }
-}
-
-template< typename Matrix1, typename Matrix2 >
-__global__ void
-SparseMatrixCopyKernel( Matrix1* A,
-                        const Matrix2* B,
-                        const typename Matrix2::IndexType* rowLengths,
-                        typename Matrix2::IndexType rows )
-{
-   using IndexType = typename Matrix2::IndexType;
-
-   IndexType rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
-   const IndexType gridSize = blockDim.x * gridDim.x;
-
-   while( rowIdx < rows ) {
-      const auto length = rowLengths[ rowIdx ];
-      const auto rowB = B->getRow( rowIdx );
-      auto rowA = A->getRow( rowIdx );
-      for( IndexType c = 0; c < length; c++ )
-         rowA.setElement( c, rowB.getElementColumn( c ), rowB.getElementValue( c ) );
-      rowIdx += gridSize;
-   }
-}
-#endif
-
-template< typename Matrix1, typename Matrix2 >
-void
-copySparseMatrix( Matrix1& A, const Matrix2& B )
-{
-   static_assert( std::is_same< typename Matrix1::RealType, typename Matrix2::RealType >::value,
-                  "The matrices must have the same RealType." );
-   static_assert( std::is_same< typename Matrix1::DeviceType, typename Matrix2::DeviceType >::value,
-                  "The matrices must be allocated on the same device." );
-   static_assert( std::is_same< typename Matrix1::IndexType, typename Matrix2::IndexType >::value,
-                  "The matrices must have the same IndexType." );
-
-   using RealType = typename Matrix1::RealType;
-   using DeviceType = typename Matrix1::DeviceType;
-   using IndexType = typename Matrix1::IndexType;
-
-   const IndexType rows = B.getRows();
-   const IndexType cols = B.getColumns();
-
-   A.setDimensions( rows, cols );
-
-   if( std::is_same< DeviceType, Devices::Host >::value ) {
-      // set row lengths
-      typename Matrix1::CompressedRowLengthsVector rowLengths;
-      rowLengths.setSize( rows );
-#ifdef HAVE_OPENMP
-#pragma omp parallel for if( Devices::Host::isOMPEnabled() )
-#endif
-      for( IndexType i = 0; i < rows; i++ ) {
-         const auto max_length = B.getRowLength( i );
-         const auto row = B.getRow( i );
-         IndexType length = 0;
-         for( IndexType c_j = 0; c_j < max_length; c_j++ )
-            if( row.getElementColumn( c_j ) < cols )
-               length++;
-            else
-               break;
-         rowLengths[ i ] = length;
-      }
-      A.setCompressedRowLengths( rowLengths );
-
-#ifdef HAVE_OPENMP
-#pragma omp parallel for if( Devices::Host::isOMPEnabled() )
-#endif
-      for( IndexType i = 0; i < rows; i++ ) {
-         const auto length = rowLengths[ i ];
-         const auto rowB = B.getRow( i );
-         auto rowA = A.getRow( i );
-         for( IndexType c = 0; c < length; c++ )
-            rowA.setElement( c, rowB.getElementColumn( c ), rowB.getElementValue( c ) );
-      }
-   }
-
-   if( std::is_same< DeviceType, Devices::Cuda >::value ) {
-#ifdef HAVE_CUDA
-      dim3 blockSize( 256 );
-      dim3 gridSize;
-      const IndexType desGridSize = 32 * Devices::CudaDeviceInfo::getCudaMultiprocessors( Devices::CudaDeviceInfo::getActiveDevice() );
-      gridSize.x = min( desGridSize, Devices::Cuda::getNumberOfBlocks( rows, blockSize.x ) );
-
-      typename Matrix1::CompressedRowLengthsVector rowLengths;
-      rowLengths.setSize( rows );
-
-      Pointers::DevicePointer< Matrix1 > Apointer( A );
-      const Pointers::DevicePointer< const Matrix2 > Bpointer( B );
-
-      // set row lengths
-      Devices::Cuda::synchronizeDevice();
-      SparseMatrixSetRowLengthsVectorKernel<<< gridSize, blockSize >>>(
-            rowLengths.getData(),
-            &Bpointer.template getData< TNL::Devices::Cuda >(),
-            rows,
-            cols );
-      TNL_CHECK_CUDA_DEVICE;
-      Apointer->setCompressedRowLengths( rowLengths );
-
-      // copy rows
-      Devices::Cuda::synchronizeDevice();
-      SparseMatrixCopyKernel<<< gridSize, blockSize >>>(
-            &Apointer.template modifyData< TNL::Devices::Cuda >(),
-            &Bpointer.template getData< TNL::Devices::Cuda >(),
-            rowLengths.getData(),
-            rows );
-      TNL_CHECK_CUDA_DEVICE;
-#else
-      throw Exceptions::CudaSupportMissing();
-#endif
-   }
-}
-
 } // namespace Matrices
 } // namespace TNL