diff --git a/src/TNL/Containers/Segments/CSR.hpp b/src/TNL/Containers/Segments/CSR.hpp
index ecd52190c7c8c41db7d9ea55423f5ce93d669b2b..b40524e5e90141c3f0c55a3d534f294ebfc6cbe6 100644
--- a/src/TNL/Containers/Segments/CSR.hpp
+++ b/src/TNL/Containers/Segments/CSR.hpp
@@ -204,6 +204,6 @@ load( File& file )
    file >> this->offsets;
 }
 
-      } // namespace Segements
+      } // namespace Segments
    }  // namespace Conatiners
 } // namespace TNL
diff --git a/src/TNL/Containers/Segments/Ellpack.h b/src/TNL/Containers/Segments/Ellpack.h
index 49f859afb465acb6c1395379bd395342c8cc2011..772566f5181d2de3c3dfa84b5067b8cfe0c857cc 100644
--- a/src/TNL/Containers/Segments/Ellpack.h
+++ b/src/TNL/Containers/Segments/Ellpack.h
@@ -18,6 +18,7 @@ namespace TNL {
 
 template< typename Device,
           typename Index,
+          bool RowMajorOrder = std::is_same< Device, Devices::Host >::value,
           int Alignment = 32 >
 class Ellpack
 {
diff --git a/src/TNL/Containers/Segments/Ellpack.hpp b/src/TNL/Containers/Segments/Ellpack.hpp
index 0b6240514626ee1fdbf86c2cf84d0977eacc9544..42d7eb8c1c5a83f69e49fd7655e6904ad953a340 100644
--- a/src/TNL/Containers/Segments/Ellpack.hpp
+++ b/src/TNL/Containers/Segments/Ellpack.hpp
@@ -20,128 +20,170 @@ namespace TNL {
 
 
 template< typename Device,
-          typename Index >
-Ellpack< Device, Index >::
-Ellpack() : size( 0 ), rowLength( 0 )
+          typename Index,
+          bool RowMajorOrder,
+          int Alignment >
+Ellpack< Device, Index, RowMajorOrder, Alignment >::
+Ellpack()
+   : segmentSize( 0 ), size( 0 ), alignedSize( 0 )
 {
 }
 
 template< typename Device,
-          typename Index >
-Ellpack< Device, Index >::
-Ellpack( const Ellpack& ellpack ) : offsets( ellpack.offsets )
+          typename Index,
+          bool RowMajorOrder,
+          int Alignment >
+Ellpack< Device, Index, RowMajorOrder, Alignment >::
+Ellpack( const Ellpack& ellpack )
+   : segmentSize( ellpack.segmentSize ), size( ellpack.size ), alignedSize( ellpack.alignedSize )
 {
 }
 
 template< typename Device,
-          typename Index >
-Ellpack< Device, Index >::
-Ellpack( const Ellpack&& ellpack ) : offsets( std::move( ellpack.offsets ) )
+          typename Index,
+          bool RowMajorOrder,
+          int Alignment >
+Ellpack< Device, Index, RowMajorOrder, Alignment >::
+Ellpack( const Ellpack&& ellpack )
+   : segmentSize( ellpack.segmentSize ), size( ellpack.size ), alignedSize( ellpack.alignedSize )
 {
-
 }
 
 template< typename Device,
-          typename Index >
+          typename Index,
+          bool RowMajorOrder,
+          int Alignment >
    template< typename SizesHolder >
 void
-Ellpack< Device, Index >::
+Ellpack< Device, Index, RowMajorOrder, Alignment >::
 setSizes( const SizesHolder& sizes )
 {
    this->segmentSize = max( sizes );
    this->size = sizes.getSize();
+   if( RowMajorOrder )  // row-major: no padding, alignedSize equals the number of segments
+      this->alignedSize = this->size;
+   else  // otherwise round the number of segments up to the next multiple of getAlignment()
+      this->alignedSize = roundUpDivision( this->size, this->getAlignment() ) * this->getAlignment();
 }
 
 template< typename Device,
-          typename Index >
+          typename Index,
+          bool RowMajorOrder,
+          int Alignment >
 __cuda_callable__
 Index
-Ellpack< Device, Index >::
+Ellpack< Device, Index, RowMajorOrder, Alignment >::
 getSize() const
 {
-   return this->offsets.getSize() - 1;
+   return this->size;
 }
 
 template< typename Device,
-          typename Index >
+          typename Index,
+          bool RowMajorOrder,
+          int Alignment >
 __cuda_callable__
 Index
-Ellpack< Device, Index >::
+Ellpack< Device, Index, RowMajorOrder, Alignment >::
 getSegmentSize( const IndexType segmentIdx ) const
 {
    return this->segmentSize;
 }
 
 template< typename Device,
-          typename Index >
+          typename Index,
+          bool RowMajorOrder,
+          int Alignment >
 __cuda_callable__
 Index
-Ellpack< Device, Index >::
+Ellpack< Device, Index, RowMajorOrder, Alignment >::
 getStorageSize() const
 {
-   return this->size * this->segmentSize;
+   return this->alignedSize * this->segmentSize;
 }
 
 template< typename Device,
-          typename Index >
+          typename Index,
+          bool RowMajorOrder,
+          int Alignment >
 __cuda_callable__
 Index
-Ellpack< Device, Index >::
+Ellpack< Device, Index, RowMajorOrder, Alignment >::
 getGlobalIndex( const Index segmentIdx, const Index localIdx ) const
 {
-   if( ! std::is_same< DeviceType, Devices::Host >::value )
-   {
-#ifdef __CUDA_ARCH__
-      return offsets[ segmentIdx ] + localIdx;
-#else
-      return offsets.getElement( segmentIdx ) + localIdx;
-#endif
-   }
-   return offsets[ segmentIdx ] + localIdx;
+   if( RowMajorOrder )  // elements of one segment are adjacent: segment s starts at s * segmentSize
+      return segmentIdx * this->segmentSize + localIdx;
+   else  // elements of one segment are alignedSize apart; the localIdx-th slice starts at alignedSize * localIdx
+      return segmentIdx + this->alignedSize * localIdx;
 }
 
 template< typename Device,
-          typename Index >
+          typename Index,
+          bool RowMajorOrder,
+          int Alignment >
 __cuda_callable__
 void
-Ellpack< Device, Index >::
+Ellpack< Device, Index, RowMajorOrder, Alignment >::
 getSegmentAndLocalIndex( const Index globalIdx, Index& segmentIdx, Index& localIdx ) const
 {
 }
 
 template< typename Device,
-          typename Index >
+          typename Index,
+          bool RowMajorOrder,
+          int Alignment >
    template< typename Function, typename... Args >
 void
-Ellpack< Device, Index >::
+Ellpack< Device, Index, RowMajorOrder, Alignment >::
 forSegments( IndexType first, IndexType last, Function& f, Args... args ) const
 {
    const auto offsetsView = this->offsets.getView();
-   auto l = [=] __cuda_callable__ ( const IndexType i, Args... args ) {
-      const IndexType begin = offsetsView[ i ];
-      const IndexType end = offsetsView[ i + 1 ];
-      for( IndexType j = begin; j < end; j++  )
-         if( ! f( i, j, args... ) )
-            break;
-   };
-   Algorithms::ParallelFor< Device >::exec( first, last, l, args... );
+   if( RowMajorOrder )  // NOTE(review): offsetsView declared above is not referenced by either branch
+   {
+      const IndexType segmentSize = this->segmentSize;  // local copy so the lambda captures a value, not `this`
+      auto l = [=] __cuda_callable__ ( const IndexType i, Args... args ) {
+         const IndexType begin = i * segmentSize;  // global index of the first element of segment i
+         const IndexType end = begin + segmentSize;  // one past the last element of segment i
+         for( IndexType j = begin; j < end; j++  )
+            if( ! f( i, j, args... ) )  // f returning false stops the traversal of segment i
+               break;
+      };
+      Algorithms::ParallelFor< Device >::exec( first, last, l, args... );  // lambda l is run for segment indices first..last
+   }
+   else
+   {
+      const IndexType storageSize = this->getStorageSize();  // alignedSize * segmentSize
+      const IndexType alignedSize = this->alignedSize;  // distance between consecutive elements of one segment
+      auto l = [=] __cuda_callable__ ( const IndexType i, Args... args ) {
+         const IndexType begin = i;  // first element of segment i sits at global index i
+         const IndexType end = storageSize;  // stepping by alignedSize from i visits segmentSize elements before reaching this bound
+         for( IndexType j = begin; j < end; j += alignedSize )
+            if( ! f( i, j, args... ) )  // f returning false stops the traversal of segment i
+               break;
+      };
+      Algorithms::ParallelFor< Device >::exec( first, last, l, args... );  // lambda l is run for segment indices first..last
+   }
 }
 
 template< typename Device,
-          typename Index >
+          typename Index,
+          bool RowMajorOrder,
+          int Alignment >
    template< typename Function, typename... Args >
 void
-Ellpack< Device, Index >::
+Ellpack< Device, Index, RowMajorOrder, Alignment >::
 forAll( Function& f, Args... args ) const
 {
    this->forSegments( 0, this->getSize(), f, args... );
 }
 
 template< typename Device,
-          typename Index >
+          typename Index,
+          bool RowMajorOrder,
+          int Alignment >
    template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
 void
-Ellpack< Device, Index >::
+Ellpack< Device, Index, RowMajorOrder, Alignment >::
 segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
 {
    using RealType = decltype( fetch( IndexType(), IndexType() ) );
@@ -158,33 +200,39 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& red
 }
 
 template< typename Device,
-          typename Index >
+          typename Index,
+          bool RowMajorOrder,
+          int Alignment >
    template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
 void
-Ellpack< Device, Index >::
+Ellpack< Device, Index, RowMajorOrder, Alignment >::
 allReduction( Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
 {
    this->segmentsReduction( 0, this->getSize(), fetch, reduction, keeper, zero, args... );
 }
 
 template< typename Device,
-          typename Index >
+          typename Index,
+          bool RowMajorOrder,
+          int Alignment >
 void
-Ellpack< Device, Index >::
+Ellpack< Device, Index, RowMajorOrder, Alignment >::
 save( File& file ) const
 {
    file << this->offsets;
 }
 
 template< typename Device,
-          typename Index >
+          typename Index,
+          bool RowMajorOrder,
+          int Alignment >
 void
-Ellpack< Device, Index >::
+Ellpack< Device, Index, RowMajorOrder, Alignment >::
 load( File& file )
 {
    file >> this->offsets;
 }
 
-      } // namespace Segements
+      } // namespace Segments
    }  // namespace Conatiners
 } // namespace TNL
diff --git a/src/UnitTests/Matrices/CMakeLists.txt b/src/UnitTests/Matrices/CMakeLists.txt
index f278934a6a38af68f1ed9a577e718225d8a2ccbe..996dd0430c4dbeec7b4131c9e748836c26d3495e 100644
--- a/src/UnitTests/Matrices/CMakeLists.txt
+++ b/src/UnitTests/Matrices/CMakeLists.txt
@@ -74,6 +74,11 @@ ELSE(  BUILD_CUDA )
    TARGET_COMPILE_OPTIONS( SparseMatrixTest_CSR_segments PRIVATE ${CXX_TESTS_FLAGS} )
    TARGET_LINK_LIBRARIES( SparseMatrixTest_CSR_segments ${GTEST_BOTH_LIBRARIES} )
 
+   ADD_EXECUTABLE( SparseMatrixTest_Ellpack_segments SparseMatrixTest_Ellpack_segments.cpp )
+   TARGET_COMPILE_OPTIONS( SparseMatrixTest_Ellpack_segments PRIVATE ${CXX_TESTS_FLAGS} )
+   TARGET_LINK_LIBRARIES( SparseMatrixTest_Ellpack_segments ${GTEST_BOTH_LIBRARIES} )
+
+
 ENDIF( BUILD_CUDA )
 
 
@@ -92,6 +97,7 @@ ADD_TEST( SparseMatrixTest_SlicedEllpack ${EXECUTABLE_OUTPUT_PATH}/SparseMatrixT
 ####
 # Segments tests
 ADD_TEST( SparseMatrixTest_CSR_segments ${EXECUTABLE_OUTPUT_PATH}/SparseMatrixTest_CSR_segments${CMAKE_EXECUTABLE_SUFFIX} )
+ADD_TEST( SparseMatrixTest_Ellpack_segments ${EXECUTABLE_OUTPUT_PATH}/SparseMatrixTest_Ellpack_segments${CMAKE_EXECUTABLE_SUFFIX} )
 
 if( ${BUILD_MPI} )
    if( BUILD_CUDA )
diff --git a/src/UnitTests/Matrices/SparseMatrixTest_CSR_segments.h b/src/UnitTests/Matrices/SparseMatrixTest_CSR_segments.h
index a738af0e2aff3bb5bc9c30a16d59f406669b1983..b5335846990dec98c6d45b1100334ee998a13562 100644
--- a/src/UnitTests/Matrices/SparseMatrixTest_CSR_segments.h
+++ b/src/UnitTests/Matrices/SparseMatrixTest_CSR_segments.h
@@ -1,8 +1,8 @@
 /***************************************************************************
                           SparseMatrixTest_CSR.h -  description
                              -------------------
-    begin                : Nov 2, 2018
-    copyright            : (C) 2018 by Tomas Oberhuber et al.
+    begin                : Dec 2, 2019
+    copyright            : (C) 2019 by Tomas Oberhuber et al.
     email                : tomas.oberhuber@fjfi.cvut.cz
  ***************************************************************************/
 
diff --git a/src/UnitTests/Matrices/SparseMatrixTest_Ellpack_segments.cpp b/src/UnitTests/Matrices/SparseMatrixTest_Ellpack_segments.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..63219e9b075b6334346e397f8ad3949ae6a5781d
--- /dev/null
+++ b/src/UnitTests/Matrices/SparseMatrixTest_Ellpack_segments.cpp
@@ -0,0 +1 @@
+#include "SparseMatrixTest_Ellpack_segments.h"
diff --git a/src/UnitTests/Matrices/SparseMatrixTest_Ellpack_segments.cu b/src/UnitTests/Matrices/SparseMatrixTest_Ellpack_segments.cu
new file mode 100644
index 0000000000000000000000000000000000000000..63219e9b075b6334346e397f8ad3949ae6a5781d
--- /dev/null
+++ b/src/UnitTests/Matrices/SparseMatrixTest_Ellpack_segments.cu
@@ -0,0 +1 @@
+#include "SparseMatrixTest_Ellpack_segments.h"
diff --git a/src/UnitTests/Matrices/SparseMatrixTest_Ellpack_segments.h b/src/UnitTests/Matrices/SparseMatrixTest_Ellpack_segments.h
new file mode 100644
index 0000000000000000000000000000000000000000..79cdf06cff8e65723b7c0224ff2310b1dd0621a2
--- /dev/null
+++ b/src/UnitTests/Matrices/SparseMatrixTest_Ellpack_segments.h
@@ -0,0 +1,141 @@
+/***************************************************************************
+                          SparseMatrixTest_Ellpack_segments.h -  description
+                             -------------------
+    begin                : Dec 3, 2019
+    copyright            : (C) 2019 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#include <TNL/Containers/Segments/Ellpack.h>
+#include <TNL/Matrices/SparseMatrix.h>
+
+
+#include "SparseMatrixTest.hpp"
+#include <iostream>
+
+#ifdef HAVE_GTEST
+#include <gtest/gtest.h>
+
+// test fixture for typed tests
+template< typename Matrix >
+class EllpackMatrixTest : public ::testing::Test
+{
+protected:
+   using EllpackMatrixType = Matrix;
+};
+
+// types for which MatrixTest is instantiated
+using EllpackMatrixTypes = ::testing::Types
+<
+    TNL::Matrices::SparseMatrix< int,     TNL::Containers::Segments::Ellpack, TNL::Devices::Host, short >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Containers::Segments::Ellpack, TNL::Devices::Host, short >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Containers::Segments::Ellpack, TNL::Devices::Host, short >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Containers::Segments::Ellpack, TNL::Devices::Host, short >,
+    TNL::Matrices::SparseMatrix< int,     TNL::Containers::Segments::Ellpack, TNL::Devices::Host, int   >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Containers::Segments::Ellpack, TNL::Devices::Host, int   >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Containers::Segments::Ellpack, TNL::Devices::Host, int   >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Containers::Segments::Ellpack, TNL::Devices::Host, int   >,
+    TNL::Matrices::SparseMatrix< int,     TNL::Containers::Segments::Ellpack, TNL::Devices::Host, long  >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Containers::Segments::Ellpack, TNL::Devices::Host, long  >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Containers::Segments::Ellpack, TNL::Devices::Host, long  >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Containers::Segments::Ellpack, TNL::Devices::Host, long  >
+#ifdef HAVE_CUDA
+   ,TNL::Matrices::SparseMatrix< int,     TNL::Containers::Segments::Ellpack, TNL::Devices::Cuda, short >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Containers::Segments::Ellpack, TNL::Devices::Cuda, short >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Containers::Segments::Ellpack, TNL::Devices::Cuda, short >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Containers::Segments::Ellpack, TNL::Devices::Cuda, short >,
+    TNL::Matrices::SparseMatrix< int,     TNL::Containers::Segments::Ellpack, TNL::Devices::Cuda, int   >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Containers::Segments::Ellpack, TNL::Devices::Cuda, int   >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Containers::Segments::Ellpack, TNL::Devices::Cuda, int   >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Containers::Segments::Ellpack, TNL::Devices::Cuda, int   >,
+    TNL::Matrices::SparseMatrix< int,     TNL::Containers::Segments::Ellpack, TNL::Devices::Cuda, long  >,
+    TNL::Matrices::SparseMatrix< long,    TNL::Containers::Segments::Ellpack, TNL::Devices::Cuda, long  >,
+    TNL::Matrices::SparseMatrix< float,   TNL::Containers::Segments::Ellpack, TNL::Devices::Cuda, long  >,
+    TNL::Matrices::SparseMatrix< double,  TNL::Containers::Segments::Ellpack, TNL::Devices::Cuda, long  >
+#endif
+>;
+
+TYPED_TEST_SUITE( EllpackMatrixTest, EllpackMatrixTypes);
+
+TYPED_TEST( EllpackMatrixTest, setDimensionsTest )
+{
+    using EllpackMatrixType = typename TestFixture::EllpackMatrixType;
+
+    test_SetDimensions< EllpackMatrixType >();
+}
+
+//TYPED_TEST( EllpackMatrixTest, setCompressedRowLengthsTest )
+//{
+////    using EllpackMatrixType = typename TestFixture::EllpackMatrixType;
+//
+////    test_SetCompressedRowLengths< EllpackMatrixType >();
+//
+//    bool testRan = false;
+//    EXPECT_TRUE( testRan );
+//    std::cout << "\nTEST DID NOT RUN. NOT WORKING.\n\n";
+//    std::cout << "      This test is dependent on the input format. \n";
+//    std::cout << "      Almost every format allocates elements per row differently.\n\n";
+//    std::cout << "\n    TODO: Finish implementation of getNonZeroRowLength (Only non-zero elements, not the number of allocated elements.)\n\n";
+//}
+
+TYPED_TEST( EllpackMatrixTest, setLikeTest )
+{
+    using EllpackMatrixType = typename TestFixture::EllpackMatrixType;
+
+    test_SetLike< EllpackMatrixType, EllpackMatrixType >();
+}
+
+TYPED_TEST( EllpackMatrixTest, resetTest )
+{
+    using EllpackMatrixType = typename TestFixture::EllpackMatrixType;
+
+    test_Reset< EllpackMatrixType >();
+}
+
+TYPED_TEST( EllpackMatrixTest, setElementTest )
+{
+    using EllpackMatrixType = typename TestFixture::EllpackMatrixType;
+
+    test_SetElement< EllpackMatrixType >();
+}
+
+TYPED_TEST( EllpackMatrixTest, addElementTest )
+{
+    using EllpackMatrixType = typename TestFixture::EllpackMatrixType;
+
+    test_AddElement< EllpackMatrixType >();
+}
+
+TYPED_TEST( EllpackMatrixTest, setRowTest )
+{
+    using EllpackMatrixType = typename TestFixture::EllpackMatrixType;
+
+    test_SetRow< EllpackMatrixType >();
+}
+
+TYPED_TEST( EllpackMatrixTest, vectorProductTest )
+{
+    using EllpackMatrixType = typename TestFixture::EllpackMatrixType;
+
+    test_VectorProduct< EllpackMatrixType >();
+}
+
+TYPED_TEST( EllpackMatrixTest, saveAndLoadTest )
+{
+    using EllpackMatrixType = typename TestFixture::EllpackMatrixType;
+
+    test_SaveAndLoad< EllpackMatrixType >( "test_SparseMatrixTest_Ellpack_segments" );
+}
+
+TYPED_TEST( EllpackMatrixTest, printTest )
+{
+    using EllpackMatrixType = typename TestFixture::EllpackMatrixType;
+
+    test_Print< EllpackMatrixType >();
+}
+
+#endif
+
+#include "../main.h"