diff --git a/src/TNL/Algorithms/AtomicOperations.h b/src/TNL/Algorithms/AtomicOperations.h
index 456839b5ea2ba69fcb333c814a6d6ff3af4f5d11..6da10e2acab1c1429f854053ce06361bda540726 100644
--- a/src/TNL/Algorithms/AtomicOperations.h
+++ b/src/TNL/Algorithms/AtomicOperations.h
@@ -8,19 +8,13 @@
 
 #pragma once
 
-#ifdef HAVE_CUDA
-   #include <cuda.h>
-#endif
-#include <TNL/Devices/Sequential.h>
-#include <TNL/Devices/Host.h>
-#include <TNL/Devices/Cuda.h>
+#include <TNL/Atomic.h>
 
 namespace TNL {
 namespace Algorithms {
 
 template< typename Device >
-struct AtomicOperations
-{};
+struct AtomicOperations;
 
 template<>
 struct AtomicOperations< Devices::Host >
@@ -31,11 +25,16 @@ struct AtomicOperations< Devices::Host >
    TNL_NVCC_HD_WARNING_DISABLE
    template< typename Value >
    __cuda_callable__
-   static void
+   static Value
    add( Value& v, const Value& a )
    {
-      #pragma omp atomic update
-      v += a;
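+      // atomic fetch-and-add: capture the old value of v before the update and return it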
+      Value old;
+      #pragma omp atomic capture
+      {
+         old = v;
+         v += a;
+      }
+      return old;
    }
 };
 
@@ -48,10 +47,12 @@ struct AtomicOperations< Devices::Sequential >
    TNL_NVCC_HD_WARNING_DISABLE
    template< typename Value >
    __cuda_callable__
-   static void
+   static Value
    add( Value& v, const Value& a )
    {
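+      // sequential device: a plain read-modify-write, no synchronization needed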
+      const Value old = v;
       v += a;
+      return old;
    }
 };
 
@@ -60,56 +61,26 @@ struct AtomicOperations< Devices::Cuda >
 {
    template< typename Value >
    __cuda_callable__
-   static void
+   static Value
    add( Value& v, const Value& a )
    {
 #ifdef HAVE_CUDA
-      atomicAdd( &v, a );
-#endif  // HAVE_CUDA
-   }
-
-#ifdef HAVE_CUDA
-   __device__
-   static void
-   add( double& v, const double& a )
-   {
-   #if __CUDA_ARCH__ < 600
-      unsigned long long int* v_as_ull = (unsigned long long int*) &v;
-      unsigned long long int old = *v_as_ull, assumed;
-
-      do {
-         assumed = old;
-         old = atomicCAS( v_as_ull, assumed, __double_as_longlong( a + __longlong_as_double( assumed ) ) );
-
-         // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
-      } while( assumed != old );
-   #else   // __CUDA_ARCH__ < 600
-      atomicAdd( &v, a );
-   #endif  //__CUDA_ARCH__ < 600
-   }
-#else   // HAVE_CUDA
-   static void
-   add( double& v, const double& a )
-   {}
-#endif  // HAVE_CUDA
-
-   __cuda_callable__
-   static void
-   add( long int& v, const long int& a )
-   {
-#ifdef HAVE_CUDA
-      TNL_ASSERT_TRUE( false, "Atomic add for long int is not supported on CUDA." );
-#endif  // HAVE_CUDA
+      return atomicAdd( &v, a );
+#else
+      return 0;
+#endif
    }
 
    __cuda_callable__
-   static void
+   static short int
    add( short int& v, const short int& a )
    {
 #ifdef HAVE_CUDA
       TNL_ASSERT_TRUE( false, "Atomic add for short int is not supported on CUDA." );
-#endif  // HAVE_CUDA
+#endif
+      return 0;
    }
 };
+
 }  // namespace Algorithms
 }  // namespace TNL
diff --git a/src/TNL/Atomic.h b/src/TNL/Atomic.h
index 94ac5dfc8cf4993f35a03f81a5d3e1ccbcd3c344..2f457d74c81f30fcccf8bef5fbe4e4c4a50869ea 100644
--- a/src/TNL/Atomic.h
+++ b/src/TNL/Atomic.h
@@ -14,11 +14,12 @@
 #include <TNL/Devices/Sequential.h>
 #include <TNL/Devices/Cuda.h>
 
-// double-precision atomicAdd function for Maxwell and older GPUs
-// copied from: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomic-functions
 #ifdef HAVE_CUDA
-   #if __CUDA_ARCH__ < 600
 namespace {
+
+// double-precision atomicAdd function for Maxwell and older GPUs
+// copied from: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomic-functions
+   #if defined( __CUDA_ARCH__ ) && __CUDA_ARCH__ < 600
 __device__
 double
 atomicAdd( double* address, double val )
@@ -35,8 +36,28 @@ atomicAdd( double* address, double val )
 
    return __longlong_as_double( old );
 }
-}  // namespace
    #endif
+
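+// atomicAdd overload for long int, which CUDA does not provide natively;
+// emulated with an atomicCAS loop on the address reinterpreted as unsigned long long int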
+__device__
+long int
+atomicAdd( long int* address, long int val )
+{
+   unsigned long long int* address_as_unsigned = reinterpret_cast< unsigned long long int* >( address );
+   long int old = *address;
+   long int assumed;
+
+   do {
+      assumed = old;
+      long int sum = val + assumed;
+      old = atomicCAS( address_as_unsigned,
+                       *reinterpret_cast< unsigned long long int* >( &assumed ),
+                       *reinterpret_cast< unsigned long long int* >( &sum ) );
+   } while( assumed != old );
+
+   return old;
+}
+
+}  // namespace
 #endif
 
 namespace TNL {
diff --git a/src/TNL/Matrices/Sandbox/SparseSandboxMatrix.h b/src/TNL/Matrices/Sandbox/SparseSandboxMatrix.h
index 27a9dfaa83fb06c2539269e8f21b1f61fa5585ca..2e4d55643cc7995cc1fcb7a9e2f9445d286b9deb 100644
--- a/src/TNL/Matrices/Sandbox/SparseSandboxMatrix.h
+++ b/src/TNL/Matrices/Sandbox/SparseSandboxMatrix.h
@@ -1054,11 +1054,12 @@ public:
    void addMatrix( const SparseMatrix< Real2, Segments, Device, Index2 >& matrix,
                    const RealType& matrixMultiplicator = 1.0,
                    const RealType& thisMatrixMultiplicator = 1.0 );
+   */
 
    template< typename Real2, typename Index2 >
-   void getTransposition( const SparseMatrix< Real2, Segments, Device, Index2 >& matrix,
-                          const RealType& matrixMultiplicator = 1.0 );
-    */
+   void
+   getTransposition( const SparseSandboxMatrix< Real2, Device, Index2, MatrixType >& matrix,
+                     const RealType& matrixMultiplicator = 1.0 );
 
    template< typename Vector1, typename Vector2 >
    bool
diff --git a/src/TNL/Matrices/Sandbox/SparseSandboxMatrix.hpp b/src/TNL/Matrices/Sandbox/SparseSandboxMatrix.hpp
index abb963b984984877a25d6b29ed4f53203fe5f794..f84f79f920953136d6b0cb96f2841e1176195536 100644
--- a/src/TNL/Matrices/Sandbox/SparseSandboxMatrix.hpp
+++ b/src/TNL/Matrices/Sandbox/SparseSandboxMatrix.hpp
@@ -112,7 +112,7 @@ SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAlloca
                          this->getColumns(),
                          this->getValues().getConstView(),
                          this->columnIndexes.getConstView(),
-                         this->segments.getConstView() );
+                         const_cast< SparseSandboxMatrix* >( this )->rowPointers.getView() );
 }
 
 template< typename Real, typename Device, typename Index, typename MatrixType, typename RealAllocator, typename IndexAllocator >
@@ -479,7 +479,7 @@ template< typename Function >
 void
 SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::forAllRows( Function&& function ) const
 {
-   this->getConsView().forAllRows( function );
+   this->getConstView().forAllRows( function );
 }
 
 template< typename Real, typename Device, typename Index, typename MatrixType, typename RealAllocator, typename IndexAllocator >
@@ -521,7 +521,8 @@ SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAlloca
    this->sequentialForRows( 0, this->getRows(), function );
 }
 
-/*template< typename Real,
+/*
+template< typename Real,
           template< typename, typename, typename > class Segments,
           typename Device,
           typename Index,
@@ -532,23 +533,67 @@ IndexAllocator2 > void SparseSandboxMatrix< Real, Device, Index, MatrixType, Rea
 SparseSandboxMatrix< Real2, Segments2, Device, Index2, RealAllocator2, IndexAllocator2 >& matrix, const RealType&
 matrixMultiplicator, const RealType& thisMatrixMultiplicator )
 {
-
 }
+*/
 
-template< typename Real,
-          template< typename, typename, typename > class Segments,
-          typename Device,
-          typename Index,
-          typename RealAllocator,
-          typename IndexAllocator >
+template< typename Real, typename Device, typename Index, typename MatrixType, typename RealAllocator, typename IndexAllocator >
 template< typename Real2, typename Index2 >
 void
-SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::
-getTransposition( const SparseSandboxMatrix< Real2, Device, Index2 >& matrix,
-                  const RealType& matrixMultiplicator )
+SparseSandboxMatrix< Real, Device, Index, MatrixType, RealAllocator, IndexAllocator >::getTransposition(
+   const SparseSandboxMatrix< Real2, Device, Index2, MatrixType >& matrix,
+   const RealType& matrixMultiplicator )
 {
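+   // Transposition is done in two passes: the first pass counts the elements in each column
+   // of the input to obtain the row capacities of the transpose, the second pass scatters the
+   // elements into the transpose using atomic per-row offsets.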
+   // set transposed dimensions
+   setDimensions( matrix.getColumns(), matrix.getRows() );
 
-}*/
+   // stage 1: compute row capacities for the transposition
+   RowsCapacitiesType capacities;
+   capacities.resize( this->getRows(), 0 );
+   auto capacities_view = capacities.getView();
+   using MatrixRowView = typename SparseSandboxMatrix< Real2, Device, Index2, MatrixType >::ConstRowView;
+   matrix.forAllRows(
+      [ = ] __cuda_callable__( const MatrixRowView& row ) mutable
+      {
+         for( IndexType c = 0; c < row.getSize(); c++ ) {
+            // row index of the transpose = column index of the input
+            const IndexType& transRowIdx = row.getColumnIndex( c );
+            if( transRowIdx < 0 )
+               continue;
+            // increment the capacity for the row in the transpose
+            Algorithms::AtomicOperations< DeviceType >::add( capacities_view[ transRowIdx ], IndexType( 1 ) );
+         }
+      } );
+
+   // set the row capacities
+   setRowCapacities( capacities );
+   capacities.reset();
+
+   // index of the first unwritten element per row
+   RowsCapacitiesType offsets;
+   offsets.resize( this->getRows(), 0 );
+   auto offsets_view = offsets.getView();
+
+   // stage 2: copy and transpose the data
+   auto trans_view = getView();
+   matrix.forAllRows(
+      [ = ] __cuda_callable__( const MatrixRowView& row ) mutable
+      {
+         // row index of the input = column index of the transpose
+         const IndexType& rowIdx = row.getRowIndex();
+         for( IndexType c = 0; c < row.getSize(); c++ ) {
+            // row index of the transpose = column index of the input
+            const IndexType& transRowIdx = row.getColumnIndex( c );
+            if( transRowIdx < 0 )
+               continue;
+            // local index in the row of the transpose
+            const IndexType transLocalIdx =
+               Algorithms::AtomicOperations< DeviceType >::add( offsets_view[ transRowIdx ], IndexType( 1 ) );
+            // get the row in the transposed matrix and set the value
+            auto transRow = trans_view.getRow( transRowIdx );
+            transRow.setElement( transLocalIdx, rowIdx, row.getValue( c ) * matrixMultiplicator );
+         }
+      } );
+}
 
 template< typename Real, typename Device, typename Index, typename MatrixType, typename RealAllocator, typename IndexAllocator >
 template< typename Vector1, typename Vector2 >
diff --git a/src/TNL/Matrices/Sandbox/SparseSandboxMatrixView.h b/src/TNL/Matrices/Sandbox/SparseSandboxMatrixView.h
index 58e4b77dc87c8bf078bb3180afe1ea494b17bff8..45840f487bb14b50bc8be715e1012722ad98be52 100644
--- a/src/TNL/Matrices/Sandbox/SparseSandboxMatrixView.h
+++ b/src/TNL/Matrices/Sandbox/SparseSandboxMatrixView.h
@@ -83,7 +83,7 @@ public:
    /**
     * \brief The type of matrix elements.
     */
-   using RealType = Real;
+   using RealType = std::remove_const_t< Real >;
 
    // using ComputeRealType = ComputeReal;
 
diff --git a/src/TNL/Matrices/Sandbox/SparseSandboxMatrixView.hpp b/src/TNL/Matrices/Sandbox/SparseSandboxMatrixView.hpp
index a80f766d4ea9078828cc36cadbf2cfc4a49fce6e..506793a264ac004acc7f1611132345f8d9c9a889 100644
--- a/src/TNL/Matrices/Sandbox/SparseSandboxMatrixView.hpp
+++ b/src/TNL/Matrices/Sandbox/SparseSandboxMatrixView.hpp
@@ -793,7 +793,7 @@ template< typename Real, typename Device, typename Index, typename MatrixType >
 void
 SparseSandboxMatrixView< Real, Device, Index, MatrixType >::save( File& file ) const
 {
-   MatrixView< RealType, DeviceType, IndexType >::save( file );
+   MatrixView< Real, Device, Index >::save( file );
    file << this->columnIndexes << this->rowPointers;  // SANDBOX_TODO: Replace this with metadata required by your format
 }
 
diff --git a/src/TNL/Matrices/SparseMatrix.h b/src/TNL/Matrices/SparseMatrix.h
index 76aee6dfb4498c782f9d99c6281e9c264749b5b8..b47ef9c853d991774ac384e2f5cf2f2fb25466e8 100644
--- a/src/TNL/Matrices/SparseMatrix.h
+++ b/src/TNL/Matrices/SparseMatrix.h
@@ -1088,12 +1088,13 @@ public:
    void addMatrix( const SparseMatrix< Real2, Segments, Device, Index2 >& matrix,
                    const RealType& matrixMultiplicator = 1.0,
                    const RealType& thisMatrixMultiplicator = 1.0 );
-
-   template< typename Real2, typename Index2 >
-   void getTransposition( const SparseMatrix< Real2, Segments, Device, Index2 >& matrix,
-                          const RealType& matrixMultiplicator = 1.0 );
     */
 
+   template< typename Real2, typename Index2, template< typename, typename, typename > class Segments2 >
+   void
+   getTransposition( const SparseMatrix< Real2, Device, Index2, MatrixType, Segments2 >& matrix,
+                     const ComputeRealType& matrixMultiplicator = 1.0 );
+
    /**
     * \brief Copy-assignment of exactly the same matrix type.
     *
diff --git a/src/TNL/Matrices/SparseMatrix.hpp b/src/TNL/Matrices/SparseMatrix.hpp
index f53e8c3cbe11e850f43365afcd1d5d5c11868757..a2d497f1e51cb96c9a9bfb4bb49bbbe631ecb8f3 100644
--- a/src/TNL/Matrices/SparseMatrix.hpp
+++ b/src/TNL/Matrices/SparseMatrix.hpp
@@ -795,7 +795,7 @@ void
 SparseMatrix< Real, Device, Index, MatrixType, Segments, ComputeReal, RealAllocator, IndexAllocator >::forAllRows(
    Function&& function ) const
 {
-   this->getConsView().forAllRows( function );
+   this->getConstView().forAllRows( function );
 }
 
 template< typename Real,
@@ -870,7 +870,8 @@ SparseMatrix< Real, Device, Index, MatrixType, Segments, ComputeReal, RealAlloca
    this->sequentialForRows( (IndexType) 0, this->getRows(), function );
 }
 
-/*template< typename Real,
+/*
+template< typename Real,
           template< typename, typename, typename > class Segments,
           typename Device,
           typename Index,
@@ -882,23 +883,75 @@ addMatrix( const SparseMatrix< Real2, Segments2, Device, Index2, RealAllocator2,
            const RealType& matrixMultiplicator,
            const RealType& thisMatrixMultiplicator )
 {
-
 }
+*/
 
 template< typename Real,
-          template< typename, typename, typename > class Segments,
           typename Device,
           typename Index,
+          typename MatrixType,
+          template< typename, typename, typename >
+          class Segments,
+          typename ComputeReal,
           typename RealAllocator,
           typename IndexAllocator >
-template< typename Real2, typename Index2 >
+template< typename Real2, typename Index2, template< typename, typename, typename > class Segments2 >
 void
-SparseMatrix< Real, Device, Index, MatrixType, Segments, ComputeReal, RealAllocator, IndexAllocator >::
-getTransposition( const SparseMatrix< Real2, Device, Index2 >& matrix,
-                  const RealType& matrixMultiplicator )
+SparseMatrix< Real, Device, Index, MatrixType, Segments, ComputeReal, RealAllocator, IndexAllocator >::getTransposition(
+   const SparseMatrix< Real2, Device, Index2, MatrixType, Segments2 >& matrix,
+   const ComputeRealType& matrixMultiplicator )
 {
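+   // Two-pass transposition: stage 1 counts the elements per column of the input (= row
+   // capacities of the transpose), stage 2 scatters the elements using atomic per-row offsets.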
+   // set transposed dimensions
+   setDimensions( matrix.getColumns(), matrix.getRows() );
+
+   // stage 1: compute row capacities for the transposition
+   RowsCapacitiesType capacities;
+   capacities.resize( this->getRows(), 0 );
+   auto capacities_view = capacities.getView();
+   using MatrixRowView = typename SparseMatrix< Real2, Device, Index2, MatrixType, Segments2 >::ConstRowView;
+   matrix.forAllRows(
+      [ = ] __cuda_callable__( const MatrixRowView& row ) mutable
+      {
+         for( IndexType c = 0; c < row.getSize(); c++ ) {
+            // row index of the transpose = column index of the input
+            const IndexType& transRowIdx = row.getColumnIndex( c );
+            if( transRowIdx == row.getPaddingIndex() )
+               continue;
+            // increment the capacity for the row in the transpose
+            Algorithms::AtomicOperations< DeviceType >::add( capacities_view[ transRowIdx ], IndexType( 1 ) );
+         }
+      } );
+
+   // set the row capacities
+   setRowCapacities( capacities );
+   capacities.reset();
+
+   // index of the first unwritten element per row
+   RowsCapacitiesType offsets;
+   offsets.resize( this->getRows(), 0 );
+   auto offsets_view = offsets.getView();
 
-}*/
+   // stage 2: copy and transpose the data
+   auto trans_view = getView();
+   matrix.forAllRows(
+      [ = ] __cuda_callable__( const MatrixRowView& row ) mutable
+      {
+         // row index of the input = column index of the transpose
+         const IndexType& rowIdx = row.getRowIndex();
+         for( IndexType c = 0; c < row.getSize(); c++ ) {
+            // row index of the transpose = column index of the input
+            const IndexType& transRowIdx = row.getColumnIndex( c );
+            if( transRowIdx == row.getPaddingIndex() )
+               continue;
+            // local index in the row of the transpose
+            const IndexType transLocalIdx =
+               Algorithms::AtomicOperations< DeviceType >::add( offsets_view[ transRowIdx ], IndexType( 1 ) );
+            // get the row in the transposed matrix and set the value
+            auto transRow = trans_view.getRow( transRowIdx );
+            transRow.setElement( transLocalIdx, rowIdx, row.getValue( c ) * matrixMultiplicator );
+         }
+      } );
+}
 
 // copy assignment
 template< typename Real,
diff --git a/src/TNL/Matrices/SparseMatrixView.hpp b/src/TNL/Matrices/SparseMatrixView.hpp
index 9420363b82be57eee4995bffa783ce02fac09114..9ee552fbd6e88be04f9247148690707a78f22f1c 100644
--- a/src/TNL/Matrices/SparseMatrixView.hpp
+++ b/src/TNL/Matrices/SparseMatrixView.hpp
@@ -820,35 +820,6 @@ SparseMatrixView< Real, Device, Index, MatrixType, SegmentsView, ComputeReal >::
    this->sequentialForRows( (IndexType) 0, this->getRows(), function );
 }
 
-/*template< typename Real,
-          template< typename, typename > class SegmentsView,
-          typename Device,
-          typename Index,
-          typename RealAllocator,
-          typename IndexAllocator >
-template< typename Real2, template< typename, typename > class Segments2, typename Index2, typename RealAllocator2, typename
-IndexAllocator2 > void SparseMatrixView< Real, Device, Index, MatrixType, SegmentsView, ComputeReal >:: addMatrix( const
-SparseMatrixView< Real2, Segments2, Device, Index2, RealAllocator2, IndexAllocator2 >& matrix, const RealType&
-matrixMultiplicator, const RealType& thisMatrixMultiplicator )
-{
-
-}
-
-template< typename Real,
-          template< typename, typename > class SegmentsView,
-          typename Device,
-          typename Index,
-          typename RealAllocator,
-          typename IndexAllocator >
-template< typename Real2, typename Index2 >
-void
-SparseMatrixView< Real, Device, Index, MatrixType, SegmentsView, ComputeReal >::
-getTransposition( const SparseMatrixView< Real2, Device, Index2 >& matrix,
-                  const RealType& matrixMultiplicator )
-{
-
-}*/
-
 template< typename Real,
           typename Device,
           typename Index,
diff --git a/src/TNL/Solvers/Linear/Hypre.h b/src/TNL/Solvers/Linear/Hypre.h
index a5d24aeecf89eae75eb08cb002ecb42d99f2924a..a300595822ccadfad29741a4be7a794c89441ca0 100644
--- a/src/TNL/Solvers/Linear/Hypre.h
+++ b/src/TNL/Solvers/Linear/Hypre.h
@@ -80,7 +80,8 @@ public:
     *                    will not be updated for the new matrix when calling
     *                    the \ref solve method.
     */
-   virtual void setMatrix( const Matrices::HypreParCSRMatrix& op, bool reuse_setup = false )
+   virtual void
+   setMatrix( const Matrices::HypreParCSRMatrix& op, bool reuse_setup = false )
    {
       A = &op;
       if( setup_called && reuse_setup )
diff --git a/src/TNL/Solvers/Linear/Hypre.hpp b/src/TNL/Solvers/Linear/Hypre.hpp
index 16226a469e6cb1ffb797571b94f470c1ff887965..37089d4d798e8187622447812189ad2b2e668fe6 100644
--- a/src/TNL/Solvers/Linear/Hypre.hpp
+++ b/src/TNL/Solvers/Linear/Hypre.hpp
@@ -248,7 +248,7 @@ HypreFlexGMRES::setMatrix( const Matrices::HypreParCSRMatrix& op, bool reuse_set
 {
    HypreSolver::setMatrix( op, reuse_setup );
    if( precond != nullptr )
-     precond->setMatrix( *A, reuse_setup );
+      precond->setMatrix( *A, reuse_setup );
 }
 
 void
@@ -286,7 +286,7 @@ HypreParaSails::HypreParaSails( MPI_Comm comm )
 
 HypreParaSails::HypreParaSails( const Matrices::HypreParCSRMatrix& A ) : HypreSolver( A )
 {
-   HYPRE_ParaSailsCreate( A.getCommunicator() , &solver );
+   HYPRE_ParaSailsCreate( A.getCommunicator(), &solver );
 }
 
 HypreParaSails::~HypreParaSails()
diff --git a/src/UnitTests/Algorithms/distributedScanTest.h b/src/UnitTests/Algorithms/distributedScanTest.h
index 1ba5277d4bbbeef1b0e741e981a0a3dd270f18db..765e2427c44370c8f69380dc748ac6b0468d7e90 100644
--- a/src/UnitTests/Algorithms/distributedScanTest.h
+++ b/src/UnitTests/Algorithms/distributedScanTest.h
@@ -17,6 +17,12 @@ using namespace TNL::Algorithms;
 using namespace TNL::Algorithms::detail;
 using namespace TNL::MPI;
 
+// this is a workaround for an nvcc 11.7 bug: it drops the scope of enum class members in template function calls
+#ifdef HAVE_CUDA
+static constexpr auto Inclusive = TNL::Algorithms::detail::ScanType::Inclusive;
+static constexpr auto Exclusive = TNL::Algorithms::detail::ScanType::Exclusive;
+#endif
+
 /*
  * Light check of DistributedArray.
  *
diff --git a/src/UnitTests/Algorithms/scanTest.h b/src/UnitTests/Algorithms/scanTest.h
index 9611d7acd66c9b02c86bda2d4f91747a2f9f5687..fe794ef53ff75ac82fbd1d032bbbd278decfa072 100644
--- a/src/UnitTests/Algorithms/scanTest.h
+++ b/src/UnitTests/Algorithms/scanTest.h
@@ -15,6 +15,12 @@ using namespace TNL::Arithmetics;
 using namespace TNL::Algorithms;
 using namespace TNL::Algorithms::detail;
 
+// this is a workaround for an nvcc 11.7 bug: it drops the scope of enum class members in template function calls
+#ifdef HAVE_CUDA
+static constexpr auto Inclusive = TNL::Algorithms::detail::ScanType::Inclusive;
+static constexpr auto Exclusive = TNL::Algorithms::detail::ScanType::Exclusive;
+#endif
+
 // test fixture for typed tests
 template< typename Array >
 class ScanTest : public ::testing::Test
diff --git a/src/UnitTests/Matrices/SparseMatrixTest.h b/src/UnitTests/Matrices/SparseMatrixTest.h
index 2281d229709f53214d14c420dd92ff67d093801c..c4086e634807b842b8aa67d23e7fac303854a958 100644
--- a/src/UnitTests/Matrices/SparseMatrixTest.h
+++ b/src/UnitTests/Matrices/SparseMatrixTest.h
@@ -99,10 +99,17 @@ TYPED_TEST( MatrixTest, reduceRows )
     test_reduceRows< MatrixType >();
 }
 
-TYPED_TEST( MatrixTest, saveAndLoadTest )
+TYPED_TEST( MatrixTest, saveAndLoad )
 {
     using MatrixType = typename TestFixture::MatrixType;
 
     test_SaveAndLoad< MatrixType >( saveAndLoadFileName );
 }
+
+TYPED_TEST( MatrixTest, getTransposition )
+{
+    using MatrixType = typename TestFixture::MatrixType;
+
+    test_getTransposition< MatrixType >();
+}
 #endif
diff --git a/src/UnitTests/Matrices/SparseMatrixTest.hpp b/src/UnitTests/Matrices/SparseMatrixTest.hpp
index 0e7884859d63a07c6fd97cd41354fbda590dde79..6a74e06be6eb7eb5dae680ed8db2a99f7cc6840b 100644
--- a/src/UnitTests/Matrices/SparseMatrixTest.hpp
+++ b/src/UnitTests/Matrices/SparseMatrixTest.hpp
@@ -1356,4 +1356,68 @@ void test_SaveAndLoad( const char* filename )
    EXPECT_EQ( std::remove( filename ), 0 );
 }
 
+template< typename Matrix >
+void test_getTransposition()
+{
+   using RealType = typename Matrix::RealType;
+   using IndexType = typename Matrix::IndexType;
+
+   /*
+    * Sets up the following 5x4 sparse matrix:
+    *
+    *    /  1  2  3  0 \
+    *    |  0  4  0  5 |
+    *    |  6  7  8  0 |
+    *    |  0  9 10 11 |
+    *    \ 12  0  0  0 /
+    */
+
+   const IndexType m_rows = 5;
+   const IndexType m_cols = 4;
+
+   Matrix matrix( m_rows, m_cols );
+   typename Matrix::RowsCapacitiesType capacities( m_rows, 4 );
+   matrix.setRowCapacities( capacities );
+
+   RealType value = 1;
+   for( IndexType i = 0; i < m_cols - 1; i++ )   // 0th row
+      matrix.setElement( 0, i, value++ );
+
+   matrix.setElement( 1, 1, value++ );      // 1st row
+   matrix.setElement( 1, 3, value++ );
+
+   for( IndexType i = 0; i < m_cols - 1; i++ )   // 2nd row
+      matrix.setElement( 2, i, value++ );
+
+   for( IndexType i = 1; i < m_cols; i++ )       // 3rd row
+      matrix.setElement( 3, i, value++ );
+
+   matrix.setElement( 4, 0, value++ );      // 4th row
+
+   Matrix transposed;
+   transposed.getTransposition( matrix );
+
+   EXPECT_EQ( transposed.getElement( 0, 0 ),  1 );
+   EXPECT_EQ( transposed.getElement( 1, 0 ),  2 );
+   EXPECT_EQ( transposed.getElement( 2, 0 ),  3 );
+   EXPECT_EQ( transposed.getElement( 3, 0 ),  0 );
+
+   EXPECT_EQ( transposed.getElement( 0, 1 ),  0 );
+   EXPECT_EQ( transposed.getElement( 1, 1 ),  4 );
+   EXPECT_EQ( transposed.getElement( 2, 1 ),  0 );
+   EXPECT_EQ( transposed.getElement( 3, 1 ),  5 );
+
+   EXPECT_EQ( transposed.getElement( 0, 2 ),  6 );
+   EXPECT_EQ( transposed.getElement( 1, 2 ),  7 );
+   EXPECT_EQ( transposed.getElement( 2, 2 ),  8 );
+   EXPECT_EQ( transposed.getElement( 3, 2 ),  0 );
+
+   EXPECT_EQ( transposed.getElement( 0, 3 ),  0 );
+   EXPECT_EQ( transposed.getElement( 1, 3 ),  9 );
+   EXPECT_EQ( transposed.getElement( 2, 3 ), 10 );
+   EXPECT_EQ( transposed.getElement( 3, 3 ), 11 );
+
+   EXPECT_EQ( transposed.getElement( 0, 4 ), 12 );
+}
+
 #endif