diff --git a/src/TNL/Containers/Segments/Ellpack.h b/src/TNL/Containers/Segments/Ellpack.h
index 772566f5181d2de3c3dfa84b5067b8cfe0c857cc..dc1a717b3a0822e60c9e098795fdcfc2507e5885 100644
--- a/src/TNL/Containers/Segments/Ellpack.h
+++ b/src/TNL/Containers/Segments/Ellpack.h
@@ -28,6 +28,7 @@ class Ellpack
       using IndexType = Index;
       using OffsetsHolder = Containers::Vector< IndexType, DeviceType, IndexType >;
       static constexpr int getAlignment() { return Alignment; }
+      static constexpr bool getRowMajorOrder() { return RowMajorOrder; }
 
       Ellpack();
 
diff --git a/src/TNL/Containers/Segments/Ellpack.hpp b/src/TNL/Containers/Segments/Ellpack.hpp
index 42d7eb8c1c5a83f69e49fd7655e6904ad953a340..8a23693ec8f3d38effac6a3de3d0fb24a8838531 100644
--- a/src/TNL/Containers/Segments/Ellpack.hpp
+++ b/src/TNL/Containers/Segments/Ellpack.hpp
@@ -63,7 +63,7 @@ setSizes( const SizesHolder& sizes )
    if( RowMajorOrder )
       this->alignedSize = this->size;
    else
-      this->alignedSize = roundUpDivision( size / this->getAlignment() ) * this->getAlignment();
+      this->alignedSize = roundUpDivision( size, this->getAlignment() ) * this->getAlignment();
 }
 
 template< typename Device,
@@ -186,17 +186,35 @@ void
 Ellpack< Device, Index, RowMajorOrder, Alignment >::
 segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
 {
-   using RealType = decltype( fetch( IndexType(), IndexType() ) );
-   const auto offsetsView = this->offsets.getConstView();
-   auto l = [=] __cuda_callable__ ( const IndexType i, Args... args ) mutable {
-      const IndexType begin = offsetsView[ i ];
-      const IndexType end = offsetsView[ i + 1 ];
-      RealType aux( zero );
-      for( IndexType j = begin; j < end; j++  )
-         reduction( aux, fetch( i, j, args... ) );
-      keeper( i, aux );
-   };
-   Algorithms::ParallelFor< Device >::exec( first, last, l, args... );
+   if( RowMajorOrder )
+   {
+      using RealType = decltype( fetch( IndexType(), IndexType() ) );
+      const IndexType segmentSize = this->segmentSize;
+      auto l = [=] __cuda_callable__ ( const IndexType i, Args... args ) mutable {
+         const IndexType begin = i * segmentSize;
+         const IndexType end = begin + segmentSize;
+         RealType aux( zero );
+         for( IndexType j = begin; j < end; j++  )
+            reduction( aux, fetch( i, j, args... ) );
+         keeper( i, aux );
+      };
+      Algorithms::ParallelFor< Device >::exec( first, last, l, args... );
+   }
+   else
+   {
+      using RealType = decltype( fetch( IndexType(), IndexType() ) );
+      const IndexType storageSize = this->getStorageSize();
+      const IndexType alignedSize = this->alignedSize;
+      auto l = [=] __cuda_callable__ ( const IndexType i, Args... args ) mutable {
+         const IndexType begin = i;
+         const IndexType end = storageSize;
+         RealType aux( zero );
+         for( IndexType j = begin; j < end; j += alignedSize  )
+            reduction( aux, fetch( i, j, args... ) );
+         keeper( i, aux );
+      };
+      Algorithms::ParallelFor< Device >::exec( first, last, l, args... );
+   }
 }
 
 template< typename Device,
@@ -219,7 +237,9 @@ void
 Ellpack< Device, Index, RowMajorOrder, Alignment >::
 save( File& file ) const
 {
-   file << this->offsets;
+   file.save( &segmentSize );
+   file.save( &size );
+   file.save( &alignedSize );
 }
 
 template< typename Device,
@@ -230,7 +250,9 @@ void
 Ellpack< Device, Index, RowMajorOrder, Alignment >::
 load( File& file )
 {
-   file >> this->offsets;
+   file.load( &segmentSize );
+   file.load( &size );
+   file.load( &alignedSize );
 }
 
       } // namespace Segments
diff --git a/src/TNL/Matrices/SparseMatrix.hpp b/src/TNL/Matrices/SparseMatrix.hpp
index a43ddba8297eb1624db2b8113ce4349bfda42c8c..9bc8d7fb7355130847edbf3441b6856f98ed402e 100644
--- a/src/TNL/Matrices/SparseMatrix.hpp
+++ b/src/TNL/Matrices/SparseMatrix.hpp
@@ -489,12 +489,16 @@ vectorProduct( const InVector& inVector,
                const RealType& matrixMultiplicator,
                const RealType& inVectorAddition ) const
 {
-   auto inVectorView = inVector.getConstView();
+   const auto inVectorView = inVector.getConstView();
    auto outVectorView = outVector.getView();
-   auto valuesView = this->values.getConstView();
-   auto columnIndexesView = this->columnIndexes.getConstView();
+   const auto valuesView = this->values.getConstView();
+   const auto columnIndexesView = this->columnIndexes.getConstView();
+   const IndexType paddingIndex = this->getPaddingIndex();
    auto fetch = [=] __cuda_callable__ ( IndexType row, IndexType offset ) -> RealType {
-      return valuesView[ offset ] * inVectorView[ columnIndexesView[ offset ] ];
+      const IndexType column = columnIndexesView[ offset ];
+      if( column == paddingIndex )
+         return 0.0;
+      return valuesView[ offset ] * inVectorView[ column ];
    };
    auto reduction = [] __cuda_callable__ ( RealType& sum, const RealType& value ) {
       sum += value;
diff --git a/src/UnitTests/Matrices/CMakeLists.txt b/src/UnitTests/Matrices/CMakeLists.txt
index 996dd0430c4dbeec7b4131c9e748836c26d3495e..ef1f043710a792b30d74cad335e6659643d3554b 100644
--- a/src/UnitTests/Matrices/CMakeLists.txt
+++ b/src/UnitTests/Matrices/CMakeLists.txt
@@ -31,6 +31,9 @@ IF( BUILD_CUDA )
    CUDA_ADD_EXECUTABLE( SparseMatrixTest_CSR_segments SparseMatrixTest_CSR_segments.cu OPTIONS ${CXX_TESTS_FLAGS} )
    TARGET_LINK_LIBRARIES( SparseMatrixTest_CSR_segments ${GTEST_BOTH_LIBRARIES} )
 
+   CUDA_ADD_EXECUTABLE( SparseMatrixTest_Ellpack_segments SparseMatrixTest_Ellpack_segments.cu OPTIONS ${CXX_TESTS_FLAGS} )
+   TARGET_LINK_LIBRARIES( SparseMatrixTest_Ellpack_segments ${GTEST_BOTH_LIBRARIES} )
+
 ELSE(  BUILD_CUDA )
    ADD_EXECUTABLE( SparseMatrixCopyTest SparseMatrixCopyTest.cpp )
    TARGET_COMPILE_OPTIONS( SparseMatrixCopyTest PRIVATE ${CXX_TESTS_FLAGS} )
diff --git a/src/UnitTests/Matrices/SparseMatrixTest_Ellpack_segments.h b/src/UnitTests/Matrices/SparseMatrixTest_Ellpack_segments.h
index 79cdf06cff8e65723b7c0224ff2310b1dd0621a2..c54aab9486fdecc8836faa5f4e2e5e11a7ce36f3 100644
--- a/src/UnitTests/Matrices/SparseMatrixTest_Ellpack_segments.h
+++ b/src/UnitTests/Matrices/SparseMatrixTest_Ellpack_segments.h
@@ -26,34 +26,45 @@ protected:
    using EllpackMatrixType = Matrix;
 };
 
+////
+// Row-major format is used for the host system
+template< typename Device, typename Index >
+using RowMajorEllpack = TNL::Containers::Segments::Ellpack< Device, Index, true, 32 >;
+
+
+////
+// Column-major format is used for GPUs
+template< typename Device, typename Index >
+using ColumnMajorEllpack = TNL::Containers::Segments::Ellpack< Device, Index, false, 32 >;
+
 // types for which MatrixTest is instantiated
 using EllpackMatrixTypes = ::testing::Types
 <
-    TNL::Matrices::SparseMatrix< int,     TNL::Containers::Segments::Ellpack, TNL::Devices::Host, short >,
-    TNL::Matrices::SparseMatrix< long,    TNL::Containers::Segments::Ellpack, TNL::Devices::Host, short >,
-    TNL::Matrices::SparseMatrix< float,   TNL::Containers::Segments::Ellpack, TNL::Devices::Host, short >,
-    TNL::Matrices::SparseMatrix< double,  TNL::Containers::Segments::Ellpack, TNL::Devices::Host, short >,
-    TNL::Matrices::SparseMatrix< int,     TNL::Containers::Segments::Ellpack, TNL::Devices::Host, int   >,
-    TNL::Matrices::SparseMatrix< long,    TNL::Containers::Segments::Ellpack, TNL::Devices::Host, int   >,
-    TNL::Matrices::SparseMatrix< float,   TNL::Containers::Segments::Ellpack, TNL::Devices::Host, int   >,
-    TNL::Matrices::SparseMatrix< double,  TNL::Containers::Segments::Ellpack, TNL::Devices::Host, int   >,
-    TNL::Matrices::SparseMatrix< int,     TNL::Containers::Segments::Ellpack, TNL::Devices::Host, long  >,
-    TNL::Matrices::SparseMatrix< long,    TNL::Containers::Segments::Ellpack, TNL::Devices::Host, long  >,
-    TNL::Matrices::SparseMatrix< float,   TNL::Containers::Segments::Ellpack, TNL::Devices::Host, long  >,
-    TNL::Matrices::SparseMatrix< double,  TNL::Containers::Segments::Ellpack, TNL::Devices::Host, long  >
+    TNL::Matrices::SparseMatrix< int,     RowMajorEllpack, TNL::Devices::Host, short >,
+    TNL::Matrices::SparseMatrix< long,    RowMajorEllpack, TNL::Devices::Host, short >,
+    TNL::Matrices::SparseMatrix< float,   RowMajorEllpack, TNL::Devices::Host, short >,
+    TNL::Matrices::SparseMatrix< double,  RowMajorEllpack, TNL::Devices::Host, short >,
+    TNL::Matrices::SparseMatrix< int,     RowMajorEllpack, TNL::Devices::Host, int   >,
+    TNL::Matrices::SparseMatrix< long,    RowMajorEllpack, TNL::Devices::Host, int   >,
+    TNL::Matrices::SparseMatrix< float,   RowMajorEllpack, TNL::Devices::Host, int   >,
+    TNL::Matrices::SparseMatrix< double,  RowMajorEllpack, TNL::Devices::Host, int   >,
+    TNL::Matrices::SparseMatrix< int,     RowMajorEllpack, TNL::Devices::Host, long  >,
+    TNL::Matrices::SparseMatrix< long,    RowMajorEllpack, TNL::Devices::Host, long  >,
+    TNL::Matrices::SparseMatrix< float,   RowMajorEllpack, TNL::Devices::Host, long  >,
+    TNL::Matrices::SparseMatrix< double,  RowMajorEllpack, TNL::Devices::Host, long  >
 #ifdef HAVE_CUDA
-   ,TNL::Matrices::SparseMatrix< int,     TNL::Containers::Segments::Ellpack, TNL::Devices::Cuda, short >,
-    TNL::Matrices::SparseMatrix< long,    TNL::Containers::Segments::Ellpack, TNL::Devices::Cuda, short >,
-    TNL::Matrices::SparseMatrix< float,   TNL::Containers::Segments::Ellpack, TNL::Devices::Cuda, short >,
-    TNL::Matrices::SparseMatrix< double,  TNL::Containers::Segments::Ellpack, TNL::Devices::Cuda, short >,
-    TNL::Matrices::SparseMatrix< int,     TNL::Containers::Segments::Ellpack, TNL::Devices::Cuda, int   >,
-    TNL::Matrices::SparseMatrix< long,    TNL::Containers::Segments::Ellpack, TNL::Devices::Cuda, int   >,
-    TNL::Matrices::SparseMatrix< float,   TNL::Containers::Segments::Ellpack, TNL::Devices::Cuda, int   >,
-    TNL::Matrices::SparseMatrix< double,  TNL::Containers::Segments::Ellpack, TNL::Devices::Cuda, int   >,
-    TNL::Matrices::SparseMatrix< int,     TNL::Containers::Segments::Ellpack, TNL::Devices::Cuda, long  >,
-    TNL::Matrices::SparseMatrix< long,    TNL::Containers::Segments::Ellpack, TNL::Devices::Cuda, long  >,
-    TNL::Matrices::SparseMatrix< float,   TNL::Containers::Segments::Ellpack, TNL::Devices::Cuda, long  >,
-    TNL::Matrices::SparseMatrix< double,  TNL::Containers::Segments::Ellpack, TNL::Devices::Cuda, long  >
+   ,TNL::Matrices::SparseMatrix< int,     ColumnMajorEllpack, TNL::Devices::Cuda, short >,
+    TNL::Matrices::SparseMatrix< long,    ColumnMajorEllpack, TNL::Devices::Cuda, short >,
+    TNL::Matrices::SparseMatrix< float,   ColumnMajorEllpack, TNL::Devices::Cuda, short >,
+    TNL::Matrices::SparseMatrix< double,  ColumnMajorEllpack, TNL::Devices::Cuda, short >,
+    TNL::Matrices::SparseMatrix< int,     ColumnMajorEllpack, TNL::Devices::Cuda, int   >,
+    TNL::Matrices::SparseMatrix< long,    ColumnMajorEllpack, TNL::Devices::Cuda, int   >,
+    TNL::Matrices::SparseMatrix< float,   ColumnMajorEllpack, TNL::Devices::Cuda, int   >,
+    TNL::Matrices::SparseMatrix< double,  ColumnMajorEllpack, TNL::Devices::Cuda, int   >,
+    TNL::Matrices::SparseMatrix< int,     ColumnMajorEllpack, TNL::Devices::Cuda, long  >,
+    TNL::Matrices::SparseMatrix< long,    ColumnMajorEllpack, TNL::Devices::Cuda, long  >,
+    TNL::Matrices::SparseMatrix< float,   ColumnMajorEllpack, TNL::Devices::Cuda, long  >,
+    TNL::Matrices::SparseMatrix< double,  ColumnMajorEllpack, TNL::Devices::Cuda, long  >
 #endif
 >;