From 190cc1a7c3579240746c5f95cf398669852132ae Mon Sep 17 00:00:00 2001
From: Tomas Oberhuber <tomas.oberhuber@fjfi.cvut.cz>
Date: Wed, 16 Apr 2014 15:21:13 +0200
Subject: [PATCH] Implementing tridiagonal matrix in CUDA.

---
 .gitignore                                    |   2 +
 CMakeLists.txt                                |   1 +
 install                                       |   2 +-
 .../matrices/tnlTridiagonalMatrix_impl.h      | 282 ++++++++++++++----
 src/matrices/tnlTridiagonalMatrix.h           |  35 ++-
 tests/unit-tests/CMakeLists.txt               |   6 +-
 tests/unit-tests/matrices/CMakeLists.txt      |  20 +-
 .../matrices/tnlTridiagonalMatrixTest.cu      |   8 +-
 .../matrices/tnlTridiagonalMatrixTester.h     |  20 +-
 9 files changed, 276 insertions(+), 100 deletions(-)

diff --git a/.gitignore b/.gitignore
index de822c5284..5dae13a3d4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -14,6 +14,8 @@
 /Debug
 /Release
 
+.settings
+
 # /src/
 /src/Makefile.in
 
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5b7515f5f9..f0ac89d075 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -34,6 +34,7 @@ endif()
 
 if( WITH_CUDA STREQUAL "yes" )
    AddCompilerFlag( "-DHAVE_NOT_CXX11 -U_GLIBCXX_ATOMIC_BUILTINS -U_GLIBCXX_USE_INT128" )
+   set( CUDA_LINKER_OPTIONS "-arch sm_20 -shared" ) # TODO: fix this in the rest of cmake files
 else()
    AddCompilerFlag( "-std=gnu++0x" )
 endif()      
diff --git a/install b/install
index 72f4cc8fd6..62a2a978e2 100755
--- a/install
+++ b/install
@@ -2,7 +2,7 @@
 
 TARGET=TNL
 INSTALL_PREFIX=${HOME}/local
-WITH_CUDA=no
+WITH_CUDA=yes
 WITH_CUSPARSE=no
 CUDA_ARCHITECTURE=2.0
 TEMPLATE_EXPLICIT_INSTANTIATION=yes
diff --git a/src/implementation/matrices/tnlTridiagonalMatrix_impl.h b/src/implementation/matrices/tnlTridiagonalMatrix_impl.h
index 88b893db60..84e2b57603 100644
--- a/src/implementation/matrices/tnlTridiagonalMatrix_impl.h
+++ b/src/implementation/matrices/tnlTridiagonalMatrix_impl.h
@@ -21,6 +21,85 @@
 #include <core/tnlAssert.h>
 #include <matrices/tnlTridiagonalMatrix.h>
 
+template< typename Device >
+class tnlTridiagonalMatrixDeviceDependentCode;
+
+template<>
+class tnlTridiagonalMatrixDeviceDependentCode< tnlHost >
+{
+   public:
+
+      template< typename Index >
+      static Index getElementIndex( const Index rows,
+                                    const Index row,
+                                    const Index column )
+      {
+         return 3*row + column - row;
+      }
+
+      template< typename Vector,
+                typename Index,
+                typename ValuesType  >
+      static typename Vector::RealType rowVectorProduct( const Index rows,
+                                                         const ValuesType& values,
+                                                         const Index row,
+                                                         const Vector& vector )
+      {
+         if( row == 0 )
+            return vector[ 0 ] * values[ 0 ] +
+                   vector[ 1 ] * values[ 1 ];
+         Index i = 3 * row - 1;
+         if( row == rows - 1 )
+            return vector[ row - 1 ] * values[ i++ ] +
+                   vector[ row ] * values[ i ];
+         return vector[ row - 1 ] * values[ i++ ] +
+                vector[ row ] * values[ i++ ] +
+                vector[ row + 1 ] * values[ i ];
+      }
+};
+
+template<>
+class tnlTridiagonalMatrixDeviceDependentCode< tnlCuda >
+{
+   public:
+
+      template< typename Index >
+#ifdef HAVE_CUDA
+      __device__ __host__
+#endif
+      static Index getElementIndex( const Index rows,
+                                    const Index row,
+                                    const Index column )
+      {
+         return ( column - row + 1 )*rows + row - 1;
+      }
+      
+      template< typename Vector,
+                typename Index,
+                typename ValuesType >
+#ifdef HAVE_CUDA
+      __device__
+#endif
+      static typename Vector::RealType rowVectorProduct( const Index rows,
+                                                         const ValuesType& values,
+                                                         const Index row,
+                                                         const Vector& vector )
+      {
+         if( row == 0 )
+            return vector[ 0 ] * values[ 0 ] +
+                   vector[ 1 ] * values[ rows - 1 ];
+         Index i = row - 1;
+         if( row == rows - 1 )
+            return vector[ row - 1 ] * values[ i ] +
+                   vector[ row ] * values[ i + rows ];
+         return vector[ row - 1 ] * values[ i ] +
+                vector[ row ] * values[ i + rows ] +
+                vector[ row + 1 ] * values[ i + 2*rows ];
+      }
+};
+
+
+
 template< typename Real,
           typename Device,
           typename Index >
@@ -58,6 +137,7 @@ bool tnlTridiagonalMatrix< Real, Device, Index >::setDimensions( const IndexType
    if( ! values.setSize( 3*Min( rows, columns ) ) )
       return false;
    this->values.setValue( 0.0 );
+   return true;
 }
 
 template< typename Real,
@@ -174,7 +254,7 @@ bool tnlTridiagonalMatrix< Real, Device, Index >::setElementFast( const IndexTyp
                                                                   const IndexType column,
                                                                   const RealType& value )
 {
-   this->values.setElement( this->getElementIndex( row, column ), value );
+   this->values[ this->getElementIndex( row, column ) ] = value;
    return true;
 }
 
@@ -189,7 +269,6 @@ bool tnlTridiagonalMatrix< Real, Device, Index >::setElement( const IndexType ro
    return true;
 }
 
-
 template< typename Real,
           typename Device,
           typename Index >
@@ -201,7 +280,8 @@ bool tnlTridiagonalMatrix< Real, Device, Index >::addElementFast( const IndexTyp
                                                                   const RealType& value,
                                                                   const RealType& thisElementMultiplicator )
 {
-   this->values.addElement( this->getElementIndex( row, column ), value, thisElementMultiplicator );
+   const Index i = this->getElementIndex( row, column );
+   this->values[ i ] = thisElementMultiplicator*this->values[ i ] + value;
 }
 
 template< typename Real,
@@ -212,10 +292,10 @@ bool tnlTridiagonalMatrix< Real, Device, Index >::addElement( const IndexType ro
                                                               const RealType& value,
                                                               const RealType& thisElementMultiplicator )
 {
-   //this->values.addElement( this->getElementIndex( row, column ), value, thisElementMultiplicator );
+   const Index i = this->getElementIndex( row, column );
+   this->values.setElement( i, thisElementMultiplicator * this->values.getElement( i ) + value );
 }
 
-
 template< typename Real,
           typename Device,
           typename Index >
@@ -231,7 +311,7 @@ bool tnlTridiagonalMatrix< Real, Device, Index >::setRowFast( const IndexType ro
             cerr << " elements = " << elements
                  << " this->columns = " << this->columns
                  << " this->getName() = " << this->getName() );
-   return this->addRow( row, columns, values, elements, 0.0 );
+   return this->addRowFast( row, columns, values, elements, 0.0 );
 }
 
 template< typename Real,
@@ -246,10 +326,9 @@ bool tnlTridiagonalMatrix< Real, Device, Index >::setRow( const IndexType row,
             cerr << " elements = " << elements
                  << " this->columns = " << this->columns
                  << " this->getName() = " << this->getName() );
-   //return this->addRow( row, columns, values, elements, 0.0 );
+   return this->addRow( row, columns, values, elements, 0.0 );
 }
 
-
 template< typename Real,
           typename Device,
           typename Index >
@@ -273,7 +352,7 @@ bool tnlTridiagonalMatrix< Real, Device, Index >::addRowFast( const IndexType ro
       const IndexType& column = columns[ i ];
       if( column < row - 1 || column > row + 1 )
          return false;
-      addElement( row, column, values[ i ], thisRowMultiplicator );
+      addElementFast( row, column, values[ i ], thisRowMultiplicator );
    }
    return true;
 }
@@ -291,16 +370,16 @@ bool tnlTridiagonalMatrix< Real, Device, Index >::addRow( const IndexType row,
             cerr << " elements = " << elements
                  << " this->columns = " << this->columns
                  << " this->getName() = " << this->getName() );
-   /*if( elements > 3 )
+   if( elements > 3 )
       return false;
    for( IndexType i = 0; i < elements; i++ )
    {
-      const IndexType& column = columns[ i ];
+      const IndexType column = columns[ i ];
       if( column < row - 1 || column > row + 1 )
          return false;
       addElement( row, column, values[ i ], thisRowMultiplicator );
    }
-   return true;*/
+   return true;
 }
 
 template< typename Real,
@@ -314,7 +393,7 @@ Real tnlTridiagonalMatrix< Real, Device, Index >::getElementFast( const IndexTyp
 {
    if( abs( column - row ) > 1 )
       return 0.0;
-   return this->values.getElement( this->getElementIndex( row, column ) );
+   return this->values[ this->getElementIndex( row, column ) ];
 }
 
 template< typename Real,
@@ -323,10 +402,11 @@ template< typename Real,
 Real tnlTridiagonalMatrix< Real, Device, Index >::getElement( const IndexType row,
                                                               const IndexType column ) const
 {
-   return this->getElementFast( row, column );
+   if( abs( column - row ) > 1 )
+      return 0.0;
+   return this->values.getElement( this->getElementIndex( row, column ) );
 }
 
-
 template< typename Real,
           typename Device,
           typename Index >
@@ -357,56 +437,50 @@ void tnlTridiagonalMatrix< Real, Device, Index >::getRow( const IndexType row,
                                                           IndexType* columns,
                                                           RealType* values ) const
 {
-   /*IndexType elementPointer( 0 );
+   IndexType elementPointer( 0 );
    for( IndexType i = -1; i <= 1; i++ )
    {
       const IndexType column = row + 1;
       if( column >= 0 && column < this->getColumns() )
       {
          columns[ elementPointer ] = column;
-         values[ elementPointer ] = this->values[ this->getElementIndex( row, column ) ];
+         values[ elementPointer ] = this->values.getElement( this->getElementIndex( row, column ) );
          elementPointer++;
       }
-   }*/
-}
-
-
-template< typename Real,
-          typename Device,
-          typename Index >
-Real& tnlTridiagonalMatrix< Real, Device, Index >::operator()( const IndexType row,
-                                                               const IndexType column )
-{
-   return this->values[ this->getElementIndex( row, column ) ];
+   }
 }
 
 template< typename Real,
           typename Device,
           typename Index >
-const Real& tnlTridiagonalMatrix< Real, Device, Index >::operator()( const IndexType row,
-                                                                     const IndexType column ) const
+template< typename Vector >
+#ifdef HAVE_CUDA
+   __device__ __host__
+#endif
+typename Vector::RealType tnlTridiagonalMatrix< Real, Device, Index >::rowVectorProduct( const IndexType row,
+                                                                                         const Vector& vector ) const
 {
-   return this->values[ this->getElementIndex( row, column ) ];
+   return tnlTridiagonalMatrixDeviceDependentCode< Device >::
+             rowVectorProduct( this->rows,
+                               this->values,
+                               row,
+                               vector );
 }
 
+#ifdef HAVE_CUDA
 template< typename Real,
-          typename Device,
-          typename Index >
-template< typename Vector >
-typename Vector::RealType tnlTridiagonalMatrix< Real, Device, Index >::rowVectorProduct( const IndexType row,
-                                                                                         const Vector& vector ) const
+          typename Index,
+          typename Vector >
+__global__ void tnlTridiagonalMatrixVectorProductCudaKernel( tnlTridiagonalMatrix< Real, tnlCuda, Index >* matrix,
+                                                             const Vector* inVector,
+                                                             Vector* outVector,
+                                                             const Index gridIdx )
 {
-   if( row == 0 )
-      return vector[ 0 ] * this->values[ 0 ] +
-             vector[ 1 ] * this->values[ 1 ];
-   IndexType i = 3 * row - 1;
-   if( row == this->rows - 1 )
-      return vector[ row - 1 ] * this->values[ i++ ] +
-             vector[ row ] * this->values[ i ];
-   return vector[ row - 1 ] * this->values[ i++ ] +
-          vector[ row ] * this->values[ i++ ] +
-          vector[ row + 1 ] * this->values[ i ];
+   const Index rowIdx = ( gridIdx * tnlCuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   if( rowIdx < matrix->getRows() )
+      ( *outVector )[ rowIdx ] = matrix->rowVectorProduct( rowIdx, *inVector );
 }
+#endif
 
 template< typename Real,
           typename Device,
@@ -426,8 +500,32 @@ void tnlTridiagonalMatrix< Real, Device, Index >::vectorProduct( const Vector& i
                     << "Vector size: " << outVector.getSize() << endl
                     << "Vector name: " << outVector.getName() << endl );
 
-   for( IndexType row = 0; row < this->getRows(); row++ )
-      outVector[ row ] = this->rowVectorProduct( row, inVector );
+   if( Device::getDevice() == tnlHostDevice )
+      for( IndexType row = 0; row < this->getRows(); row++ )
+         outVector[ row ] = this->rowVectorProduct( row, inVector );
+
+   if( Device::getDevice() == tnlCudaDevice )
+   {
+#ifdef HAVE_CUDA
+      ThisType* kernel_this = tnlCuda::passToDevice( *this );
+      Vector* kernel_inVector = tnlCuda::passToDevice( inVector );
+      Vector* kernel_outVector = tnlCuda::passToDevice( outVector );
+      dim3 cudaBlockSize( 256 ), cudaGridSize( tnlCuda::getMaxGridSize() );
+      const IndexType cudaBlocks = roundUpDivision( this->getRows(), cudaBlockSize.x );
+      const IndexType cudaGrids = roundUpDivision( cudaBlocks, tnlCuda::getMaxGridSize() );
+      for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ )
+      {
+         if( gridIdx == cudaGrids - 1 )
+            cudaGridSize.x = cudaBlocks % tnlCuda::getMaxGridSize();
+         tnlTridiagonalMatrixVectorProductCudaKernel<<< cudaGridSize, cudaBlockSize >>>
+                                                    ( kernel_this, kernel_inVector, kernel_outVector, gridIdx );
+      }
+      tnlCuda::freeFromDevice( kernel_this );
+      tnlCuda::freeFromDevice( kernel_inVector );
+      tnlCuda::freeFromDevice( kernel_outVector );
+      checkCudaDevice;
+#endif
+   }
 }
 
 template< typename Real,
@@ -443,38 +541,84 @@ void tnlTridiagonalMatrix< Real, Device, Index >::addMatrix( const tnlTridiagona
                  << "This matrix rows: " << this->getRows() << endl
                  << "This matrix name: " << this->getName() << endl );
 
-   const IndexType lastI = this->values.getSize();
    if( thisMatrixMultiplicator == 1.0 )
-   {
-      for( IndexType i = 0; i < lastI; i++ )
-         this->values[ i ] += matrixMultiplicator*matrix.values[ i ];
-   }
+      this->values.alphaXPlusY( matrixMultiplicator, matrix.values );
    else
+      this->values.alphaXPlusBetaY( matrixMultiplicator, matrix.values, thisMatrixMultiplicator );
+}
+
+#ifdef HAVE_CUDA
+template< typename Real,
+          typename Real2,
+          typename Index,
+          typename Index2 >          
+__global__ void tnlTridiagonalMatrixTranspositionCudaKernel( const tnlTridiagonalMatrix< Real2, tnlCuda, Index2 >* inMatrix,
+                                                             tnlTridiagonalMatrix< Real, tnlCuda, Index >* outMatrix,
+                                                             const Real matrixMultiplicator,
+                                                             const Index gridIdx )
+{
+   const Index rowIdx = ( gridIdx * tnlCuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   if( rowIdx < inMatrix->getRows() )
    {
-      for( IndexType i = 0; i < lastI; i++ )
-         this->values[ i ] = thisMatrixMultiplicator * this->values[ i ] +
-            matrixMultiplicator*matrix.values[ i ];
+      if( rowIdx > 0 )
+        outMatrix->setElementFast( rowIdx-1,
+                                   rowIdx,
+                                   matrixMultiplicator * inMatrix->getElementFast( rowIdx, rowIdx-1 ) );
+      outMatrix->setElementFast( rowIdx,
+                                 rowIdx,
+                                 matrixMultiplicator * inMatrix->getElementFast( rowIdx, rowIdx ) );
+      if( rowIdx < inMatrix->getRows()-1 )
+         outMatrix->setElementFast( rowIdx+1,
+                                    rowIdx,
+                                    matrixMultiplicator * inMatrix->getElementFast( rowIdx, rowIdx+1 ) );
    }
 }
+#endif
 
 template< typename Real,
           typename Device,
           typename Index >
-   template< typename Real2, typename Index2, int tileDim >
+   template< typename Real2, typename Index2 >
 void tnlTridiagonalMatrix< Real, Device, Index >::getTransposition( const tnlTridiagonalMatrix< Real2, Device, Index2 >& matrix,
                                                                     const RealType& matrixMultiplicator )
 {
    tnlAssert( this->getRows() == matrix.getRows(),
                cerr << "This matrix rows: " << this->getRows() << endl
                     << "That matrix rows: " << matrix.getRows() << endl );
-
-   const IndexType& rows = matrix.getRows();
-   for( IndexType i = 1; i < rows; i++ )
+   if( Device::getDevice() == tnlHostDevice )
    {
-      RealType aux = matrix. getElement( i, i - 1 );
-      this->setElement( i, i - 1, matrix.getElement( i - 1, i ) );
-      this->setElement( i, i, matrix.getElement( i, i ) );
-      this->setElement( i - 1, i, aux );
+      const IndexType& rows = matrix.getRows();
+      for( IndexType i = 1; i < rows; i++ )
+      {
+         RealType aux = matrix. getElement( i, i - 1 );
+         this->setElement( i, i - 1, matrix.getElement( i - 1, i ) );
+         this->setElement( i, i, matrix.getElement( i, i ) );
+         this->setElement( i - 1, i, aux );
+      }
+   }
+   if( Device::getDevice() == tnlCudaDevice )
+   {
+#ifdef HAVE_CUDA
+      ThisType* kernel_this = tnlCuda::passToDevice( *this );
+      typedef  tnlTridiagonalMatrix< Real2, Device, Index2 > InMatrixType;
+      InMatrixType* kernel_inMatrix = tnlCuda::passToDevice( matrix );
+      dim3 cudaBlockSize( 256 ), cudaGridSize( tnlCuda::getMaxGridSize() );
+      const IndexType cudaBlocks = roundUpDivision( matrix.getRows(), cudaBlockSize.x );
+      const IndexType cudaGrids = roundUpDivision( cudaBlocks, tnlCuda::getMaxGridSize() );
+      for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ )
+      {
+         if( gridIdx == cudaGrids - 1 )
+            cudaGridSize.x = cudaBlocks % tnlCuda::getMaxGridSize();
+         tnlTridiagonalMatrixTranspositionCudaKernel<<< cudaGridSize, cudaBlockSize >>>
+                                                    ( kernel_inMatrix,
+                                                      kernel_this,
+                                                      matrixMultiplicator,
+                                                      gridIdx );
+      }
+      tnlCuda::freeFromDevice( kernel_this );
+      tnlCuda::freeFromDevice( kernel_inMatrix );
+      checkCudaDevice;
+#endif
    }
 }
 
@@ -547,7 +691,7 @@ void tnlTridiagonalMatrix< Real, Device, Index >::print( ostream& str ) const
       str <<"Row: " << row << " -> ";
       for( IndexType column = row - 1; column < row + 2; column++ )
          if( column >= 0 && column < this->columns )
-            str << " Col:" << column << "->" << this->operator()( row, column ) << "\t";
+            str << " Col:" << column << "->" << this->getElement( row, column ) << "\t";
       str << endl;
    }
 }
@@ -555,15 +699,21 @@ void tnlTridiagonalMatrix< Real, Device, Index >::print( ostream& str ) const
 template< typename Real,
           typename Device,
           typename Index >
+#ifdef HAVE_CUDA
+   __device__ __host__
+#endif
 Index tnlTridiagonalMatrix< Real, Device, Index >::getElementIndex( const IndexType row,
                                                                     const IndexType column ) const
 {
+   // TODO: remove the #ifndef when CUDA supports std:cerr
+#ifndef HAVE_CUDA   
    tnlAssert( row >= 0 && column >= 0 && row < this->rows && column < this->rows,
               cerr << " this->rows = " << this->rows
                    << " row = " << row << " column = " << column );
    tnlAssert( abs( row - column ) < 2,
               cerr << "row = " << row << " column = " << column << endl );
-   return 3*row + column - row;
+#endif   
+   return tnlTridiagonalMatrixDeviceDependentCode< Device >::getElementIndex( this->rows, row, column );
 }
 
 
diff --git a/src/matrices/tnlTridiagonalMatrix.h b/src/matrices/tnlTridiagonalMatrix.h
index 5ca97720da..cba9cd70b6 100644
--- a/src/matrices/tnlTridiagonalMatrix.h
+++ b/src/matrices/tnlTridiagonalMatrix.h
@@ -32,6 +32,7 @@ class tnlTridiagonalMatrix : public tnlMatrix< Real, Device, Index >
    typedef Device DeviceType;
    typedef Index IndexType;
    typedef typename tnlMatrix< Real, Device, Index >::RowLengthsVector RowLengthsVector;
+   typedef tnlTridiagonalMatrix< Real, Device, Index > ThisType;
 
    tnlTridiagonalMatrix();
 
@@ -82,6 +83,11 @@ class tnlTridiagonalMatrix : public tnlMatrix< Real, Device, Index >
                         const RealType& value,
                         const RealType& thisElementMultiplicator = 1.0 );
 
+   bool addElement( const IndexType row,
+                    const IndexType column,
+                    const RealType& value,
+                    const RealType& thisElementMultiplicator = 1.0 );
+
 #ifdef HAVE_CUDA
    __device__ __host__
 #endif
@@ -90,6 +96,11 @@ class tnlTridiagonalMatrix : public tnlMatrix< Real, Device, Index >
                     const RealType* values,
                     const IndexType elements );
 
+   bool setRow( const IndexType row,
+                const IndexType* columns,
+                const RealType* values,
+                const IndexType elements );
+
 #ifdef HAVE_CUDA
    __device__ __host__
 #endif
@@ -99,6 +110,12 @@ class tnlTridiagonalMatrix : public tnlMatrix< Real, Device, Index >
                     const IndexType elements,
                     const RealType& thisRowMultiplicator = 1.0 );
 
+   bool addRow( const IndexType row,
+                const IndexType* columns,
+                const RealType* values,
+                const IndexType elements,
+                const RealType& thisRowMultiplicator = 1.0 );
+
 #ifdef HAVE_CUDA
    __device__ __host__
 #endif
@@ -115,13 +132,14 @@ class tnlTridiagonalMatrix : public tnlMatrix< Real, Device, Index >
                     IndexType* columns,
                     RealType* values ) const;
 
-   RealType& operator()( const IndexType row,
-                         const IndexType column );
-
-   const RealType& operator()( const IndexType row,
-                               const IndexType column ) const;
+   void getRow( const IndexType row,
+                IndexType* columns,
+                RealType* values ) const;
 
    template< typename Vector >
+#ifdef HAVE_CUDA
+   __device__ __host__
+#endif
    typename Vector::RealType rowVectorProduct( const IndexType row,
                                                const Vector& vector ) const;
 
@@ -135,11 +153,11 @@ class tnlTridiagonalMatrix : public tnlMatrix< Real, Device, Index >
                    const RealType& thisMatrixMultiplicator = 1.0 );
 
 #ifdef HAVE_NOT_CXX11
-   template< typename Real2, typename Index2, int tileDim >
+   template< typename Real2, typename Index2 >
    void getTransposition( const tnlTridiagonalMatrix< Real2, Device, Index2 >& matrix,
                           const RealType& matrixMultiplicator = 1.0 );
 #else
-   template< typename Real2, typename Index2, int tileDim = 32 >
+   template< typename Real2, typename Index2 >
    void getTransposition( const tnlTridiagonalMatrix< Real2, Device, Index2 >& matrix,
                           const RealType& matrixMultiplicator = 1.0 );
 #endif   
@@ -162,6 +180,9 @@ class tnlTridiagonalMatrix : public tnlMatrix< Real, Device, Index >
 
    protected:
 
+#ifdef HAVE_CUDA
+   __device__ __host__
+#endif
    IndexType getElementIndex( const IndexType row,
                               const IndexType column ) const;
 
diff --git a/tests/unit-tests/CMakeLists.txt b/tests/unit-tests/CMakeLists.txt
index b13a994863..5741c78742 100755
--- a/tests/unit-tests/CMakeLists.txt
+++ b/tests/unit-tests/CMakeLists.txt
@@ -66,7 +66,11 @@ if( BUILD_CUDA )
    SET_TESTS_PROPERTIES ( core/vectors/tnlVectorTest-cuda${mpiExt}${debugExt} PROPERTIES DEPENDS core/cuda/tnlVectorOperationsTest-cuda${mpiExt}${debugExt} )
    
    ADD_TEST( matrices/tnlDenseMatrixTest-cuda${mpiExt}${debugExt} ${EXECUTABLE_OUTPUT_PATH}/tnlDenseMatrixTest-cuda${mpiExt}${debugExt} )
-   SET_TESTS_PROPERTIES ( matrices/tnlDenseMatrixTest-cuda${mpiExt}${debugExt} PROPERTIES DEPENDS core/cuda/tnlVectorTest-cuda${mpiExt}${debugExt} )         
+   SET_TESTS_PROPERTIES ( matrices/tnlDenseMatrixTest-cuda${mpiExt}${debugExt} PROPERTIES DEPENDS core/cuda/tnlVectorTest-cuda${mpiExt}${debugExt} )
+   
+   ADD_TEST( matrices/tnlTridiagonalMatrixTest-cuda${mpiExt}${debugExt} ${EXECUTABLE_OUTPUT_PATH}/tnlTridiagonalMatrixTest-cuda${mpiExt}${debugExt} )
+   SET_TESTS_PROPERTIES ( matrices/tnlTridiagonalMatrixTest-cuda${mpiExt}${debugExt} PROPERTIES DEPENDS core/cuda/tnlDenseMatrixTest-cuda${mpiExt}${debugExt} )         
+            
             
 endif()
 
diff --git a/tests/unit-tests/matrices/CMakeLists.txt b/tests/unit-tests/matrices/CMakeLists.txt
index 4d9da43c63..466a1c99bd 100755
--- a/tests/unit-tests/matrices/CMakeLists.txt
+++ b/tests/unit-tests/matrices/CMakeLists.txt
@@ -16,16 +16,16 @@ if( BUILD_CUDA )
                                                                       tnl${mpiExt}${debugExt}-0.1 )
 endif()                                                                      
 
-#ADD_EXECUTABLE( tnlTridiagonalMatrixTest${mpiExt}${debugExt} ${headers} tnlTridiagonalMatrixTest.cpp )
-#TARGET_LINK_LIBRARIES( tnlTridiagonalMatrixTest${mpiExt}${debugExt} ${CPPUNIT_LIBRARIES}
-#                                                              tnl${mpiExt}${debugExt}-0.1 )
-#
-#if( BUILD_CUDA )                                                           
-#   CUDA_ADD_EXECUTABLE( tnlTridiagonalMatrixTest-cuda${mpiExt}${debugExt} ${headers} tnlTridiagonalMatrixTest.cu
-#                        OPTIONS -arch sm_20 )
-#   TARGET_LINK_LIBRARIES( tnlTridiagonalMatrixTest-cuda${mpiExt}${debugExt} ${CPPUNIT_LIBRARIES}
-#                                                                      tnl${mpiExt}${debugExt}-0.1 )
-#endif()                                                                      
+ADD_EXECUTABLE( tnlTridiagonalMatrixTest${mpiExt}${debugExt} ${headers} tnlTridiagonalMatrixTest.cpp )
+TARGET_LINK_LIBRARIES( tnlTridiagonalMatrixTest${mpiExt}${debugExt} ${CPPUNIT_LIBRARIES}
+                                                              tnl${mpiExt}${debugExt}-0.1 )
+
+if( BUILD_CUDA )                                                           
+   CUDA_ADD_EXECUTABLE( tnlTridiagonalMatrixTest-cuda${mpiExt}${debugExt} ${headers} tnlTridiagonalMatrixTest.cu
+                        OPTIONS -arch sm_20 )
+   TARGET_LINK_LIBRARIES( tnlTridiagonalMatrixTest-cuda${mpiExt}${debugExt} ${CPPUNIT_LIBRARIES}
+                                                                      tnl${mpiExt}${debugExt}-0.1 )
+endif()                                                                      
 #                                                                          
 #ADD_EXECUTABLE( tnlMultidiagonalMatrixTest${mpiExt}${debugExt} ${headers} tnlMultidiagonalMatrixTest.cpp )
 #TARGET_LINK_LIBRARIES( tnlMultidiagonalMatrixTest${mpiExt}${debugExt} ${CPPUNIT_LIBRARIES}
diff --git a/tests/unit-tests/matrices/tnlTridiagonalMatrixTest.cu b/tests/unit-tests/matrices/tnlTridiagonalMatrixTest.cu
index d8ddc787ae..119ffe0d59 100644
--- a/tests/unit-tests/matrices/tnlTridiagonalMatrixTest.cu
+++ b/tests/unit-tests/matrices/tnlTridiagonalMatrixTest.cu
@@ -25,10 +25,10 @@
 int main( int argc, char* argv[] )
 {
 #ifdef HAVE_CPPUNIT
-   if( ! tnlUnitTestStarter :: run< tnlTridiagonalMatrixTester< float, tnlHost, int > >() ||
-       ! tnlUnitTestStarter :: run< tnlTridiagonalMatrixTester< double, tnlHost, int > >() ||
-       ! tnlUnitTestStarter :: run< tnlTridiagonalMatrixTester< float, tnlHost, long int > >() ||
-       ! tnlUnitTestStarter :: run< tnlTridiagonalMatrixTester< double, tnlHost, long int > >()
+   if( ! tnlUnitTestStarter :: run< tnlTridiagonalMatrixTester< float, tnlCuda, int > >() ||
+       ! tnlUnitTestStarter :: run< tnlTridiagonalMatrixTester< double, tnlCuda, int > >() ||
+       ! tnlUnitTestStarter :: run< tnlTridiagonalMatrixTester< float, tnlCuda, long int > >() ||
+       ! tnlUnitTestStarter :: run< tnlTridiagonalMatrixTester< double, tnlCuda, long int > >()
        )
      return EXIT_FAILURE;
    return EXIT_SUCCESS;
diff --git a/tests/unit-tests/matrices/tnlTridiagonalMatrixTester.h b/tests/unit-tests/matrices/tnlTridiagonalMatrixTester.h
index 98d55a2ae2..3b9a5cfc0a 100644
--- a/tests/unit-tests/matrices/tnlTridiagonalMatrixTester.h
+++ b/tests/unit-tests/matrices/tnlTridiagonalMatrixTester.h
@@ -99,9 +99,9 @@ class tnlTridiagonalMatrixTester : public CppUnit :: TestCase
          CPPUNIT_ASSERT( m.getElement( i, i ) == i );
 
       for( int i = 0; i < 10; i++ )
-         m( i, i ) = i;
+         m.setElement( i, i, i );
       for( int i = 0; i < 10; i++ )
-         CPPUNIT_ASSERT( m( i, i ) == i );
+         CPPUNIT_ASSERT( m.getElement( i, i ) == i );
    }
 
    void addElementTest()
@@ -134,14 +134,14 @@ class tnlTridiagonalMatrixTester : public CppUnit :: TestCase
       for( int i = 0; i < 10; i++ )
          m.setElement( i, i, i );
 
-      tnlVector< IndexType, Device, IndexType > columns;
-      VectorType values;
+      tnlVector< IndexType, tnlHost, IndexType > columns;
+      tnlVector< RealType, tnlHost, IndexType > values;
       columns.setSize( 3 );
       values.setSize( 3 );
       for( IndexType i = 4; i <= 6; i++ )
       {
-         columns[ i - 4 ] = i;
-         values[ i - 4 ] = i;
+         columns.setElement( i - 4, i );
+         values.setElement( i - 4, i );
       }
       m.setRow( 5, columns.getData(), values.getData(), 3 );
 
@@ -172,9 +172,8 @@ class tnlTridiagonalMatrixTester : public CppUnit :: TestCase
          m.setElement( i, i, i );
       }
       m.vectorProduct( v, w );
-
       for( int i = 0; i < size; i++ )
-         CPPUNIT_ASSERT( w[ i ] == i*i );
+         CPPUNIT_ASSERT( w.getElement( i ) == i*i );
    }
 
    void addMatrixTest()
@@ -185,7 +184,7 @@ class tnlTridiagonalMatrixTester : public CppUnit :: TestCase
       for( int i = 0; i < size; i++ )
          for( int j = 0; j < size; j++ )
             if( abs( i - j ) <= 1 )
-               m( i, j ) = i*size + j;
+               m.setElement( i, j, i*size + j );
 
       MatrixType m2;
       m2.setLike( m );
@@ -222,8 +221,7 @@ class tnlTridiagonalMatrixTester : public CppUnit :: TestCase
       MatrixType mTransposed;
       mTransposed.setLike( m );
       mTransposed.template getTransposition< typename MatrixType::RealType,
-                                             typename MatrixType::IndexType,
-                                             32 >( m );
+                                             typename MatrixType::IndexType >( m );
 
       for( int i = 0; i < size; i++ )
          for( int j = 0; j < size; j++ )
-- 
GitLab