diff --git a/src/implementation/matrices/tnlCSRMatrix_impl.h b/src/implementation/matrices/tnlCSRMatrix_impl.h
index 26e5d52ab1950da26e7b871aa532f53fd0aff366..c8d383bdbf9997af54637d1789141c7de471cfb6 100644
--- a/src/implementation/matrices/tnlCSRMatrix_impl.h
+++ b/src/implementation/matrices/tnlCSRMatrix_impl.h
@@ -27,7 +27,8 @@ template< typename Real,
           typename Device,
           typename Index >
 tnlCSRMatrix< Real, Device, Index >::tnlCSRMatrix()
-: spmvCudaKernel( scalar )
+: spmvCudaKernel( vector ),
+  cudaWarpSize( 32 ) // TODO: use tnlCuda::getWarpSize() instead of the hard-coded 32
 {
 };
 
@@ -557,11 +558,78 @@ void tnlCSRMatrix< Real, Device, Index >::setCudaKernelType( const SPMVCudaKerne
 template< typename Real,
           typename Device,
           typename Index >
+#ifdef HAVE_CUDA
+   __device__ __host__
+#endif
 typename tnlCSRMatrix< Real, Device, Index >::SPMVCudaKernel tnlCSRMatrix< Real, Device, Index >::getCudaKernelType() const
 {
    return this->spmvCudaKernel;
 }
 
+template< typename Real,
+          typename Device,
+          typename Index >
+void tnlCSRMatrix< Real, Device, Index >::setCudaWarpSize( const int warpSize )
+{
+   this->cudaWarpSize = warpSize; // NOTE(review): only 1, 2, 4, 8, 16 and 32 are dispatched by tnlCSRMatrixVectorProductCuda -- any other value silently launches no kernel
+}
+
+template< typename Real,
+          typename Device,
+          typename Index >
+int tnlCSRMatrix< Real, Device, Index >::getCudaWarpSize() const
+{
+   return this->cudaWarpSize; // warp size used to select the templated SpMV kernel instantiation
+}
+
+
+#ifdef HAVE_CUDA
+// SpMV "vector" kernel: one warp handles warpSize consecutive rows; the lanes
+// stride over each row's nonzeros and partial sums are tree-reduced in shared memory.
+template< typename Real,
+          typename Device,
+          typename Index >
+   template< typename Vector,
+             int warpSize >
+__device__
+void tnlCSRMatrix< Real, Device, Index >::vectorProductCuda( const Vector& inVector,
+                                                             Vector& outVector,
+                                                             int gridIdx ) const
+{
+   volatile Real* aux = getSharedMemory< Real >(); // volatile: lanes read slots written by other lanes in the reduction below
+   IndexType globalIdx = ( gridIdx * tnlCuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   const IndexType warpStart = warpSize * ( globalIdx / warpSize );
+   const IndexType warpEnd = Min( warpStart + warpSize, this->getRows() ); // the last warp may cover fewer rows
+   const IndexType inWarpIdx = globalIdx % warpSize;
+   for( IndexType row = warpStart; row < warpEnd; row++ )
+   {
+      aux[ threadIdx.x ] = Real( 0.0 );
+      IndexType elementPtr = this->rowPointers[ row ] + inWarpIdx; // lane i starts at the i-th nonzero of the row
+      const IndexType rowEnd = this->rowPointers[ row + 1 ];
+      IndexType column;
+      while( elementPtr < rowEnd &&
+             ( column = this->columnIndexes[ elementPtr ] ) < this->getColumns() ) // entries with column == getColumns() appear to be padding -- TODO confirm
+      {
+         aux[ threadIdx.x ] += inVector[ column ] * this->values[ elementPtr ];
+         elementPtr += warpSize;
+      }
+      if( warpSize == 32 ) // warp-synchronous tree reduction (pre-Volta idiom; requires the volatile aux above)
+         if( inWarpIdx < 16 ) aux[ threadIdx.x ] += aux[ threadIdx.x + 16 ];
+      if( warpSize >= 16 )
+         if( inWarpIdx < 8 ) aux[ threadIdx.x ] += aux[ threadIdx.x + 8 ];
+      if( warpSize >= 8 )
+         if( inWarpIdx < 4 ) aux[ threadIdx.x ] += aux[ threadIdx.x + 4 ];
+      if( warpSize >= 4 )
+         if( inWarpIdx < 2 ) aux[ threadIdx.x ] += aux[ threadIdx.x + 2 ];
+      if( warpSize >= 2 )
+         if( inWarpIdx < 1 ) aux[ threadIdx.x ] += aux[ threadIdx.x + 1 ];
+      // fixed: removed the original __syncthreads() -- warps in a block can run different trip counts of this loop, so a block-wide barrier here risks deadlock; only intra-warp ordering is needed and the volatile buffer provides it (pre-Volta)
+      if( inWarpIdx == 0 )
+         outVector[ row ] = aux[ threadIdx.x ];
+   }
+}
+#endif
+
 template<>
 class tnlCSRMatrixDeviceDependentCode< tnlHost >
 {
@@ -571,10 +639,11 @@ class tnlCSRMatrixDeviceDependentCode< tnlHost >
 
       template< typename Real,
                 typename Index,
-                typename Vector >
+                typename InVector,
+                typename OutVector >
       static void vectorProduct( const tnlCSRMatrix< Real, Device, Index >& matrix,      
-                                 const Vector& inVector,
-                                 Vector& outVector )
+                                 const InVector& inVector,
+                                 OutVector& outVector )
       {
          for( Index row = 0; row < matrix.getRows(); row ++ )
             outVector[ row ] = matrix.rowVectorProduct( row, inVector );
@@ -585,17 +654,25 @@ class tnlCSRMatrixDeviceDependentCode< tnlHost >
 #ifdef HAVE_CUDA
 template< typename Real,
           typename Index,
-          typename Vector >
-__global__ void tnlCSRMatrixVectorProductCudaScalarKernel( const tnlCSRMatrix< Real, tnlCuda, Index >* matrix,
-                                                           const Vector* inVector,
-                                                           Vector* outVector,
-                                                           int gridIdx )
+          typename Vector,
+          int warpSize >
+__global__ void tnlCSRMatrixVectorProductCudaKernel( const tnlCSRMatrix< Real, tnlCuda, Index >* matrix,
+                                                     const Vector* inVector,
+                                                     Vector* outVector,
+                                                     int gridIdx )
 {
    typedef tnlCSRMatrix< Real, tnlCuda, Index > Matrix;
    tnlStaticAssert( Matrix::DeviceType::DeviceType == tnlCudaDevice, );
-    const typename Matrix::IndexType rowIdx = ( gridIdx * tnlCuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
-    if( rowIdx < matrix->getRows() )
-       ( *outVector )[ rowIdx ] = matrix->rowVectorProduct( rowIdx, *inVector );
+   const typename Matrix::IndexType rowIdx = ( gridIdx * tnlCuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; // used by the scalar branch only
+   if( matrix->getCudaKernelType() == Matrix::scalar ) // scalar: one thread per row
+   {
+      if( rowIdx < matrix->getRows() )
+         ( *outVector )[ rowIdx ] = matrix->rowVectorProduct( rowIdx, *inVector );
+   }
+   if( matrix->getCudaKernelType() == Matrix::vector ) // vector: one warp per warpSize rows; mutually exclusive with the branch above
+   {
+      matrix->template vectorProductCuda< Vector, warpSize >( *inVector, *outVector, gridIdx );
+   }
 }
 #endif
 
@@ -613,20 +690,56 @@ void tnlCSRMatrixVectorProductCuda( const tnlCSRMatrix< Real, tnlCuda, Index >&
    Vector* kernel_inVector = tnlCuda::passToDevice( inVector );
    Vector* kernel_outVector = tnlCuda::passToDevice( outVector );
    dim3 cudaBlockSize( 256 ), cudaGridSize( tnlCuda::getMaxGridSize() );
-   if( matrix.getCudaKernelType() == Matrix::scalar )
+   const IndexType cudaBlocks = roundUpDivision( matrix.getRows(), cudaBlockSize.x );
+   const IndexType cudaGrids = roundUpDivision( cudaBlocks, tnlCuda::getMaxGridSize() );
+   for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ )
    {
-      const IndexType cudaBlocks = roundUpDivision( matrix.getRows(), cudaBlockSize.x );
-      const IndexType cudaGrids = roundUpDivision( cudaBlocks, tnlCuda::getMaxGridSize() );
-      for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ )
-      {
-         if( gridIdx == cudaGrids - 1 )
-            cudaGridSize.x = cudaBlocks % tnlCuda::getMaxGridSize();
-         tnlCSRMatrixVectorProductCudaScalarKernel<<< cudaGridSize, cudaBlockSize >>>
-                                                  ( kernel_this,
-                                                    kernel_inVector,
-                                                    kernel_outVector,
-                                                    gridIdx );
-      }
+      if( gridIdx == cudaGrids - 1 )
+         cudaGridSize.x = cudaBlocks - gridIdx * tnlCuda::getMaxGridSize(); // fixed: 'cudaBlocks % getMaxGridSize()' gave 0 blocks when cudaBlocks is an exact multiple of getMaxGridSize()
+      const int sharedMemory = cudaBlockSize.x * sizeof( Real );
+      if( matrix.getCudaWarpSize() == 32 )
+         tnlCSRMatrixVectorProductCudaKernel< Real, Index, Vector, 32 >
+                                            <<< cudaGridSize, cudaBlockSize, sharedMemory >>>
+                                            ( kernel_this,
+                                              kernel_inVector,
+                                              kernel_outVector,
+                                              gridIdx );
+      if( matrix.getCudaWarpSize() == 16 )
+         tnlCSRMatrixVectorProductCudaKernel< Real, Index, Vector, 16 >
+                                            <<< cudaGridSize, cudaBlockSize, sharedMemory >>>
+                                            ( kernel_this,
+                                              kernel_inVector,
+                                              kernel_outVector,
+                                              gridIdx );
+      if( matrix.getCudaWarpSize() == 8 )
+         tnlCSRMatrixVectorProductCudaKernel< Real, Index, Vector, 8 >
+                                            <<< cudaGridSize, cudaBlockSize, sharedMemory >>>
+                                            ( kernel_this,
+                                              kernel_inVector,
+                                              kernel_outVector,
+                                              gridIdx );
+      if( matrix.getCudaWarpSize() == 4 )
+         tnlCSRMatrixVectorProductCudaKernel< Real, Index, Vector, 4 >
+                                            <<< cudaGridSize, cudaBlockSize, sharedMemory >>>
+                                            ( kernel_this,
+                                              kernel_inVector,
+                                              kernel_outVector,
+                                              gridIdx );
+      if( matrix.getCudaWarpSize() == 2 )
+         tnlCSRMatrixVectorProductCudaKernel< Real, Index, Vector, 2 >
+                                            <<< cudaGridSize, cudaBlockSize, sharedMemory >>>
+                                            ( kernel_this,
+                                              kernel_inVector,
+                                              kernel_outVector,
+                                              gridIdx );
+      if( matrix.getCudaWarpSize() == 1 )
+         tnlCSRMatrixVectorProductCudaKernel< Real, Index, Vector, 1 >
+                                            <<< cudaGridSize, cudaBlockSize, sharedMemory >>>
+                                            ( kernel_this,
+                                              kernel_inVector,
+                                              kernel_outVector,
+                                              gridIdx );
+      // NOTE(review): warp sizes other than 1, 2, 4, 8, 16 and 32 fall through every branch above and launch nothing
    }
    tnlCuda::freeFromDevice( kernel_this );
    tnlCuda::freeFromDevice( kernel_inVector );
@@ -645,10 +758,11 @@ class tnlCSRMatrixDeviceDependentCode< tnlCuda >
 
       template< typename Real,
                 typename Index,
-                typename Vector >
+                typename InVector,
+                typename OutVector >
       static void vectorProduct( const tnlCSRMatrix< Real, Device, Index >& matrix,
-                                 const Vector& inVector,
-                                 Vector& outVector )
+                                 const InVector& inVector,
+                                 OutVector& outVector )
       {
          tnlCSRMatrixVectorProductCuda( matrix, inVector, outVector );
       }
diff --git a/src/implementation/matrices/tnlChunkedEllpackMatrix_impl.h b/src/implementation/matrices/tnlChunkedEllpackMatrix_impl.h
index c178338e6dd3f8179ca037e650f30e8a1a1109ae..38742ae43d526ee8bb02946577dfe60fa54eb514 100644
--- a/src/implementation/matrices/tnlChunkedEllpackMatrix_impl.h
+++ b/src/implementation/matrices/tnlChunkedEllpackMatrix_impl.h
@@ -1098,9 +1098,10 @@ __device__ void tnlChunkedEllpackMatrix< Real, Device, Index >::computeSliceVect
 template< typename Real,
           typename Device,
           typename Index >
-   template< typename Vector >
-void tnlChunkedEllpackMatrix< Real, Device, Index >::vectorProduct( const Vector& inVector,
-                                                                    Vector& outVector ) const
+   template< typename InVector,
+             typename OutVector >
+void tnlChunkedEllpackMatrix< Real, Device, Index >::vectorProduct( const InVector& inVector,
+                                                                    OutVector& outVector ) const
 {
    DeviceDependentCode::vectorProduct( *this, inVector, outVector );
 }
@@ -1295,10 +1296,11 @@ class tnlChunkedEllpackMatrixDeviceDependentCode< tnlHost >
 
       template< typename Real,
                 typename Index,
-                typename Vector >
+                typename InVector,
+                typename OutVector >
       static void vectorProduct( const tnlChunkedEllpackMatrix< Real, Device, Index >& matrix,
-                                 const Vector& inVector,
-                                 Vector& outVector )
+                                 const InVector& inVector,
+                                 OutVector& outVector )
       {
          for( Index row = 0; row < matrix.getRows(); row ++ )
             outVector[ row ] = matrix.rowVectorProduct( row, inVector );
@@ -1308,10 +1310,11 @@ class tnlChunkedEllpackMatrixDeviceDependentCode< tnlHost >
 #ifdef HAVE_CUDA
 template< typename Real,
           typename Index,
-          typename Vector >
+          typename InVector,
+          typename OutVector >
 __global__ void tnlChunkedEllpackMatrixVectorProductCudaKernel( const tnlChunkedEllpackMatrix< Real, tnlCuda, Index >* matrix,
-                                                                const Vector* inVector,
-                                                                Vector* outVector,
+                                                                const InVector* inVector,
+                                                                OutVector* outVector,
                                                                 int gridIdx )
 {
    const Index sliceIdx = gridIdx * tnlCuda::getMaxGridSize() + blockIdx.x;
@@ -1359,18 +1362,19 @@ class tnlChunkedEllpackMatrixDeviceDependentCode< tnlCuda >
 
       template< typename Real,
                 typename Index, 
-                typename Vector >
+                typename InVector,
+                typename OutVector >
       static void vectorProduct( const tnlChunkedEllpackMatrix< Real, Device, Index >& matrix,
-                                 const Vector& inVector,
-                                 Vector& outVector )
+                                 const InVector& inVector,
+                                 OutVector& outVector )
       {
          #ifdef HAVE_CUDA
             typedef tnlChunkedEllpackMatrix< Real, tnlCuda, Index > Matrix;
             typedef Index IndexType;
             typedef Real RealType;
             Matrix* kernel_this = tnlCuda::passToDevice( matrix );
-            Vector* kernel_inVector = tnlCuda::passToDevice( inVector );
-            Vector* kernel_outVector = tnlCuda::passToDevice( outVector );
+            InVector* kernel_inVector = tnlCuda::passToDevice( inVector );
+            OutVector* kernel_outVector = tnlCuda::passToDevice( outVector );
             dim3 cudaBlockSize( matrix.getNumberOfChunksInSlice() ),
                  cudaGridSize( tnlCuda::getMaxGridSize() );
             const IndexType cudaBlocks = matrix.getNumberOfSlices();
@@ -1381,7 +1385,7 @@ class tnlChunkedEllpackMatrixDeviceDependentCode< tnlCuda >
             {
                if( gridIdx == cudaGrids - 1 )
                   cudaGridSize.x = cudaBlocks % tnlCuda::getMaxGridSize();
-               tnlChunkedEllpackMatrixVectorProductCudaKernel< Real, Index, Vector >
+               tnlChunkedEllpackMatrixVectorProductCudaKernel< Real, Index, InVector, OutVector >
                                                              <<< cudaGridSize, cudaBlockSize, sharedMemory  >>>
                                                              ( kernel_this,
                                                                kernel_inVector,
diff --git a/src/implementation/matrices/tnlDenseMatrix_impl.h b/src/implementation/matrices/tnlDenseMatrix_impl.h
index 3871dbff8b9039ad909ba6b7726b186fb6c352c8..9a10ad6d25317f768dbd94d85986abf0d0062ba5 100644
--- a/src/implementation/matrices/tnlDenseMatrix_impl.h
+++ b/src/implementation/matrices/tnlDenseMatrix_impl.h
@@ -352,27 +352,13 @@ typename Vector::RealType tnlDenseMatrix< Real, Device, Index >::rowVectorProduc
    return sum;
 }
 
-/*#ifdef HAVE_CUDA
-template< typename Real,
-          typename Index,
-          typename Vector >
-__global__ void tnlDenseMatrixVectorProductCudaKernel( tnlDenseMatrix< Real, tnlCuda, Index >* matrix,
-                                                       const Vector* inVector,
-                                                       Vector* outVector,
-                                                       const Index gridIdx )
-{
-   const Index rowIdx = ( gridIdx * tnlCuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
-   if( rowIdx < matrix->getRows() )
-      ( *outVector )[ rowIdx ] = matrix->rowVectorProduct( rowIdx, *inVector );
-}
-#endif*/
-
 template< typename Real,
           typename Device,
           typename Index >
-   template< typename Vector >
-void tnlDenseMatrix< Real, Device, Index >::vectorProduct( const Vector& inVector,
-                                                           Vector& outVector ) const
+   template< typename InVector,
+             typename OutVector >
+void tnlDenseMatrix< Real, Device, Index >::vectorProduct( const InVector& inVector,
+                                                           OutVector& outVector ) const
 {
    tnlAssert( this->getColumns() == inVector.getSize(),
             cerr << "Matrix columns: " << this->getColumns() << endl
@@ -385,11 +371,7 @@ void tnlDenseMatrix< Real, Device, Index >::vectorProduct( const Vector& inVecto
                     << "Vector size: " << outVector.getSize() << endl
                     << "Vector name: " << outVector.getName() << endl );
 
-   if( Device::getDevice() == tnlHostDevice )
-      for( IndexType row = 0; row < this->getRows(); row++ )
-         outVector[ row ] = rowVectorProduct( row, inVector );
-   if( Device::getDevice() == tnlCudaDevice )
-      tnlMatrixVectorProductCuda( *this, inVector, outVector );
+   DeviceDependentCode::vectorProduct( *this, inVector, outVector );
 }
 
 template< typename Real,
@@ -928,5 +910,43 @@ Index tnlDenseMatrix< Real, Device, Index >::getElementIndex( const IndexType ro
       return column * this->rows + row;
 }
 
+template<>
+class tnlDenseMatrixDeviceDependentCode< tnlHost >
+{
+   public:
+
+      typedef tnlHost Device;
+
+      template< typename Real,
+                typename Index,
+                typename InVector,
+                typename OutVector >
+      static void vectorProduct( const tnlDenseMatrix< Real, Device, Index >& matrix,
+                                 const InVector& inVector,
+                                 OutVector& outVector )
+      {
+         for( Index row = 0; row < matrix.getRows(); row ++ ) // host: sequential row-by-row dot products
+            outVector[ row ] = matrix.rowVectorProduct( row, inVector );
+      }
+};
+
+template<>
+class tnlDenseMatrixDeviceDependentCode< tnlCuda >
+{
+   public:
+
+      typedef tnlCuda Device;
+
+      template< typename Real,
+                typename Index,
+                typename InVector,
+                typename OutVector >
+      static void vectorProduct( const tnlDenseMatrix< Real, Device, Index >& matrix,
+                                 const InVector& inVector,
+                                 OutVector& outVector )
+      {
+         tnlMatrixVectorProductCuda( matrix, inVector, outVector ); // generic CUDA launcher defined in tnlMatrix_impl.h
+      }
+};
 
 #endif /* TNLDENSEMATRIX_IMPL_H_ */
diff --git a/src/implementation/matrices/tnlEllpackMatrix_impl.h b/src/implementation/matrices/tnlEllpackMatrix_impl.h
index d08bd09a150d4abfff73eb8916ebc2510409ac34..f9163ceeddb6e8f0eafad85672f03c5d7bd44743 100644
--- a/src/implementation/matrices/tnlEllpackMatrix_impl.h
+++ b/src/implementation/matrices/tnlEllpackMatrix_impl.h
@@ -489,9 +489,10 @@ typename Vector::RealType tnlEllpackMatrix< Real, Device, Index >::rowVectorProd
 template< typename Real,
           typename Device,
           typename Index >
-   template< typename Vector >
-void tnlEllpackMatrix< Real, Device, Index >::vectorProduct( const Vector& inVector,
-                                                                   Vector& outVector ) const
+   template< typename InVector,
+             typename OutVector >
+void tnlEllpackMatrix< Real, Device, Index >::vectorProduct( const InVector& inVector,
+                                                                   OutVector& outVector ) const
 {
    DeviceDependentCode::vectorProduct( *this, inVector, outVector );
 }
@@ -665,10 +666,11 @@ class tnlEllpackMatrixDeviceDependentCode< tnlHost >
 
       template< typename Real,
                 typename Index,
-                typename Vector >
+                typename InVector,
+                typename OutVector >
       static void vectorProduct( const tnlEllpackMatrix< Real, Device, Index >& matrix,
-                                 const Vector& inVector,
-                                 Vector& outVector )
+                                 const InVector& inVector,
+                                 OutVector& outVector )
       {
          for( Index row = 0; row < matrix.getRows(); row ++ )
             outVector[ row ] = matrix.rowVectorProduct( row, inVector );
@@ -716,10 +718,11 @@ class tnlEllpackMatrixDeviceDependentCode< tnlCuda >
 
       template< typename Real,
                 typename Index,
-                typename Vector >
+                typename InVector,
+                typename OutVector >
       static void vectorProduct( const tnlEllpackMatrix< Real, Device, Index >& matrix,
-                                 const Vector& inVector,
-                                 Vector& outVector )
+                                 const InVector& inVector,
+                                 OutVector& outVector )
       {
          tnlMatrixVectorProductCuda( matrix, inVector, outVector );
       }
diff --git a/src/implementation/matrices/tnlMatrix_impl.h b/src/implementation/matrices/tnlMatrix_impl.h
index 73fea5e04fc34858238f0e9c5e2d1ddb747b712e..ff8ee34e7eb251ed0694947692c0f4f2f5b4fe64 100644
--- a/src/implementation/matrices/tnlMatrix_impl.h
+++ b/src/implementation/matrices/tnlMatrix_impl.h
@@ -225,10 +225,11 @@ void tnlMatrix< Real, Device, Index >::print( ostream& str ) const
 
 #ifdef HAVE_CUDA
 template< typename Matrix,
-          typename Vector >
+          typename InVector,
+          typename OutVector >
 __global__ void tnlMatrixVectorProductCudaKernel( const Matrix* matrix,
-                                                  const Vector* inVector,
-                                                  Vector* outVector,
+                                                  const InVector* inVector,
+                                                  OutVector* outVector,
                                                   int gridIdx )
 {
    tnlStaticAssert( Matrix::DeviceType::DeviceType == tnlCudaDevice, );
@@ -239,16 +240,17 @@ __global__ void tnlMatrixVectorProductCudaKernel( const Matrix* matrix,
 #endif
 
 template< typename Matrix,
-          typename Vector >
+          typename InVector,
+          typename OutVector >
 void tnlMatrixVectorProductCuda( const Matrix& matrix,
-                                 const Vector& inVector,
-                                 Vector& outVector )
+                                 const InVector& inVector,
+                                 OutVector& outVector )
 {
 #ifdef HAVE_CUDA
    typedef typename Matrix::IndexType IndexType;
    Matrix* kernel_this = tnlCuda::passToDevice( matrix );
-   Vector* kernel_inVector = tnlCuda::passToDevice( inVector );
-   Vector* kernel_outVector = tnlCuda::passToDevice( outVector );
+   InVector* kernel_inVector = tnlCuda::passToDevice( inVector );
+   OutVector* kernel_outVector = tnlCuda::passToDevice( outVector );
    dim3 cudaBlockSize( 256 ), cudaGridSize( tnlCuda::getMaxGridSize() );
    const IndexType cudaBlocks = roundUpDivision( matrix.getRows(), cudaBlockSize.x );
    const IndexType cudaGrids = roundUpDivision( cudaBlocks, tnlCuda::getMaxGridSize() );
diff --git a/src/implementation/matrices/tnlMultidiagonalMatrix_impl.h b/src/implementation/matrices/tnlMultidiagonalMatrix_impl.h
index 86ce92d62779846da43017ac58fdfbe2ba9c4b68..46f3d2084d0bd65b00a877d9b640470436114a24 100644
--- a/src/implementation/matrices/tnlMultidiagonalMatrix_impl.h
+++ b/src/implementation/matrices/tnlMultidiagonalMatrix_impl.h
@@ -468,17 +468,23 @@ typename Vector::RealType tnlMultidiagonalMatrix< Real, Device, Index >::rowVect
 template< typename Real,
           typename Device,
           typename Index >
-   template< typename Vector >
-void tnlMultidiagonalMatrix< Real, Device, Index >::vectorProduct( const Vector& inVector,
-                                                                   Vector& outVector ) const
+   template< typename InVector,
+             typename OutVector >
+void tnlMultidiagonalMatrix< Real, Device, Index >::vectorProduct( const InVector& inVector,
+                                                                   OutVector& outVector ) const
 {
-   if( Device::getDevice() == tnlHostDevice )
-   {
-      for( Index row = 0; row < this->getRows(); row ++ )
-         outVector[ row ] = this->rowVectorProduct( row, inVector );
-   }
-   if( Device::getDevice() == tnlCudaDevice )
-      tnlMatrixVectorProductCuda( *this, inVector, outVector );
+   tnlAssert( this->getColumns() == inVector.getSize(),
+            cerr << "Matrix columns: " << this->getColumns() << endl
+                 << "Matrix name: " << this->getName() << endl
+                 << "Vector size: " << inVector.getSize() << endl
+                 << "Vector name: " << inVector.getName() << endl );
+   tnlAssert( this->getRows() == outVector.getSize(),
+               cerr << "Matrix rows: " << this->getRows() << endl
+                    << "Matrix name: " << this->getName() << endl
+                    << "Vector size: " << outVector.getSize() << endl
+                    << "Vector name: " << outVector.getName() << endl );
+
+   DeviceDependentCode::vectorProduct( *this, inVector, outVector );
 }
 
 template< typename Real,
@@ -680,6 +686,8 @@ class tnlMultidiagonalMatrixDeviceDependentCode< tnlHost >
 {
    public:
 
+      typedef tnlHost Device;
+
       template< typename Index >
       static Index getElementIndex( const Index rows,
                                     const Index diagonals,
@@ -688,6 +696,18 @@ class tnlMultidiagonalMatrixDeviceDependentCode< tnlHost >
       {
          return row*diagonals + diagonal;
       }
+
+      template< typename Real,
+                typename Index,
+                typename InVector,
+                typename OutVector >
+      static void vectorProduct( const tnlMultidiagonalMatrix< Real, Device, Index >& matrix,
+                                 const InVector& inVector,
+                                 OutVector& outVector )
+      {
+         for( Index row = 0; row < matrix.getRows(); row ++ )
+            outVector[ row ] = matrix.rowVectorProduct( row, inVector );
+      }
 };
 
 template<>
@@ -695,6 +715,8 @@ class tnlMultidiagonalMatrixDeviceDependentCode< tnlCuda >
 {
    public:
 
+      typedef tnlCuda Device;
+
       template< typename Index >
 #ifdef HAVE_CUDA
       __device__ __host__
@@ -706,6 +728,17 @@ class tnlMultidiagonalMatrixDeviceDependentCode< tnlCuda >
       {
          return diagonal*rows + row;
       }
+
+      template< typename Real,
+                typename Index,
+                typename InVector,
+                typename OutVector >
+      static void vectorProduct( const tnlMultidiagonalMatrix< Real, Device, Index >& matrix,
+                                 const InVector& inVector,
+                                 OutVector& outVector )
+      {
+         tnlMatrixVectorProductCuda( matrix, inVector, outVector );
+      }
 };
 
 
diff --git a/src/implementation/matrices/tnlSlicedEllpackMatrix_impl.h b/src/implementation/matrices/tnlSlicedEllpackMatrix_impl.h
index 31a01c47ca623cec1499ba5a629ec966dc7a5ef3..db4a6edeb37cd4373c446a54246e33d854c18c9e 100644
--- a/src/implementation/matrices/tnlSlicedEllpackMatrix_impl.h
+++ b/src/implementation/matrices/tnlSlicedEllpackMatrix_impl.h
@@ -492,9 +492,10 @@ template< typename Real,
           typename Device,
           typename Index,
           int SliceSize >
-   template< typename Vector >
-void tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >::vectorProduct( const Vector& inVector,
-                                                                              Vector& outVector ) const
+   template< typename InVector,
+             typename OutVector >
+void tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >::vectorProduct( const InVector& inVector,
+                                                                              OutVector& outVector ) const
 {
    DeviceDependentCode::vectorProduct( *this, inVector, outVector );
 }
@@ -732,11 +733,12 @@ class tnlSlicedEllpackMatrixDeviceDependentCode< tnlHost >
 
       template< typename Real,
                 typename Index,
-                typename Vector,
+                typename InVector,
+                typename OutVector,
                 int SliceSize >
       static void vectorProduct( const tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >& matrix,
-                                 const Vector& inVector,
-                                 Vector& outVector )
+                                 const InVector& inVector,
+                                 OutVector& outVector )
       {
          for( Index row = 0; row < matrix.getRows(); row ++ )
             outVector[ row ] = matrix.rowVectorProduct( row, inVector );
@@ -836,11 +838,12 @@ class tnlSlicedEllpackMatrixDeviceDependentCode< tnlCuda >
 
       template< typename Real,
                 typename Index,
-                typename Vector,
+                typename InVector,
+                typename OutVector,
                 int SliceSize >
       static void vectorProduct( const tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >& matrix,
-                                 const Vector& inVector,
-                                 Vector& outVector )
+                                 const InVector& inVector,
+                                 OutVector& outVector )
       {
          tnlMatrixVectorProductCuda( matrix, inVector, outVector );
       }
diff --git a/src/implementation/matrices/tnlTridiagonalMatrix_impl.h b/src/implementation/matrices/tnlTridiagonalMatrix_impl.h
index f37ca4ba3ff4b81f552c42d287ef9dff1db1e7e0..e493a475695029e6c8439582a0f5056f3b3fd66d 100644
--- a/src/implementation/matrices/tnlTridiagonalMatrix_impl.h
+++ b/src/implementation/matrices/tnlTridiagonalMatrix_impl.h
@@ -394,9 +394,10 @@ typename Vector::RealType tnlTridiagonalMatrix< Real, Device, Index >::rowVector
 template< typename Real,
           typename Device,
           typename Index >
-   template< typename Vector >
-void tnlTridiagonalMatrix< Real, Device, Index >::vectorProduct( const Vector& inVector,
-                                                                 Vector& outVector ) const
+   template< typename InVector,
+             typename OutVector >
+void tnlTridiagonalMatrix< Real, Device, Index >::vectorProduct( const InVector& inVector,
+                                                                 OutVector& outVector ) const
 {
    tnlAssert( this->getColumns() == inVector.getSize(),
             cerr << "Matrix columns: " << this->getColumns() << endl
@@ -409,12 +410,7 @@ void tnlTridiagonalMatrix< Real, Device, Index >::vectorProduct( const Vector& i
                     << "Vector size: " << outVector.getSize() << endl
                     << "Vector name: " << outVector.getName() << endl );
 
-   if( Device::getDevice() == tnlHostDevice )
-      for( IndexType row = 0; row < this->getRows(); row++ )
-         outVector[ row ] = this->rowVectorProduct( row, inVector );
-
-   if( Device::getDevice() == tnlCudaDevice )
-      tnlMatrixVectorProductCuda( *this, inVector, outVector );
+   DeviceDependentCode::vectorProduct( *this, inVector, outVector );
 }
 
 template< typename Real,
@@ -610,6 +606,8 @@ class tnlTridiagonalMatrixDeviceDependentCode< tnlHost >
 {
    public:
 
+      typedef tnlHost Device;
+
       template< typename Index >
       static Index getElementIndex( const Index rows,
                                     const Index row,
@@ -637,12 +635,26 @@ class tnlTridiagonalMatrixDeviceDependentCode< tnlHost >
                 vector[ row ] * values[ i++ ] +
                 vector[ row + 1 ] * values[ i ];
       }
+
+      template< typename Real,
+                typename Index,
+                typename InVector,
+                typename OutVector >
+      static void vectorProduct( const tnlTridiagonalMatrix< Real, Device, Index >& matrix,
+                                 const InVector& inVector,
+                                 OutVector& outVector )
+      {
+         for( Index row = 0; row < matrix.getRows(); row ++ )
+            outVector[ row ] = matrix.rowVectorProduct( row, inVector );
+      }
 };
 
 template<>
 class tnlTridiagonalMatrixDeviceDependentCode< tnlCuda >
 {
    public:
+      
+      typedef tnlCuda Device;
 
       template< typename Index >
 #ifdef HAVE_CUDA
@@ -677,6 +689,17 @@ class tnlTridiagonalMatrixDeviceDependentCode< tnlCuda >
                 vector[ row ] * values[ i + rows ] +
                 vector[ row + 1 ] * values[ i + 2*rows ];
       }
+
+      template< typename Real,
+                typename Index,
+                typename InVector,
+                typename OutVector >
+      static void vectorProduct( const tnlTridiagonalMatrix< Real, Device, Index >& matrix,
+                                 const InVector& inVector,
+                                 OutVector& outVector )
+      {
+         tnlMatrixVectorProductCuda( matrix, inVector, outVector );
+      }
 };
 
 
diff --git a/src/implementation/solvers/linear/krylov/tnlGMRESSolver_impl.h b/src/implementation/solvers/linear/krylov/tnlGMRESSolver_impl.h
index 1b9a5947a25fea617a420c5bfeebfb5d479a8c5b..97142cca8fd503941fb23e0c877e25522fb138f2 100644
--- a/src/implementation/solvers/linear/krylov/tnlGMRESSolver_impl.h
+++ b/src/implementation/solvers/linear/krylov/tnlGMRESSolver_impl.h
@@ -31,7 +31,7 @@ tnlGMRESSolver< Matrix, Preconditioner > :: tnlGMRESSolver()
   _sn( "tnlGMRESSolver::_sn" ),
   _H( "tnlGMRESSolver:_H" ),
   size( 0 ),
-  restarting( 0 ),
+  restarting( 10 ),
   matrix( 0 ),
   preconditioner( 0 )
 {
@@ -164,7 +164,7 @@ bool tnlGMRESSolver< Matrix, Preconditioner > :: solve( const Vector& b, Vector&
           */
          if( preconditioner )
          {
-            matrix -> vectorProduct( vi, _M_tmp );
+            matrix->vectorProduct( vi, _M_tmp );
             this->preconditioner->solve( _M_tmp, _w );
          }
          else
diff --git a/src/matrices/tnlCSRMatrix.h b/src/matrices/tnlCSRMatrix.h
index 4c0552325149dd4b6b5d7d0b2edce7fec9870aa2..7031b88cbc60991f3e53746167c11709e4f94b62 100644
--- a/src/matrices/tnlCSRMatrix.h
+++ b/src/matrices/tnlCSRMatrix.h
@@ -136,7 +136,8 @@ class tnlCSRMatrix : public tnlSparseMatrix< Real, Device, Index >
    typename Vector::RealType rowVectorProduct( const IndexType row,
                                                const Vector& vector ) const;
 
-   template< typename InVector, typename OutVector >
+   template< typename InVector,
+             typename OutVector >
    void vectorProduct( const InVector& inVector,
                        OutVector& outVector ) const;
 
@@ -167,14 +168,32 @@ class tnlCSRMatrix : public tnlSparseMatrix< Real, Device, Index >
 
    void setCudaKernelType( const SPMVCudaKernel kernel );
 
+#ifdef HAVE_CUDA
+   __device__ __host__
+#endif
    SPMVCudaKernel getCudaKernelType() const;
 
+   void setCudaWarpSize( const int warpSize );
+
+   int getCudaWarpSize() const;
+
+#ifdef HAVE_CUDA
+   template< typename Vector,
+             int warpSize >
+   __device__
+   void vectorProductCuda( const Vector& inVector,
+                           Vector& outVector,
+                           int gridIdx ) const;
+#endif
+
    protected:
 
    tnlVector< Index, Device, Index > rowPointers;
 
    SPMVCudaKernel spmvCudaKernel;
 
+   int cudaWarpSize;
+
    typedef tnlCSRMatrixDeviceDependentCode< DeviceType > DeviceDependentCode;
    friend class tnlCSRMatrixDeviceDependentCode< DeviceType >;
 
diff --git a/src/matrices/tnlChunkedEllpackMatrix.h b/src/matrices/tnlChunkedEllpackMatrix.h
index 007b1e58c30b78a9453770ae223aa83ce2c47f4c..87cace2d49db4ca9b1ed9a7eff1ff6de2b624d21 100644
--- a/src/matrices/tnlChunkedEllpackMatrix.h
+++ b/src/matrices/tnlChunkedEllpackMatrix.h
@@ -191,9 +191,10 @@ class tnlChunkedEllpackMatrix : public tnlSparseMatrix< Real, Device, Index >
 #endif
 
 
-   template< typename Vector >
-   void vectorProduct( const Vector& inVector,
-                       Vector& outVector ) const;
+   template< typename InVector,
+             typename OutVector >
+   void vectorProduct( const InVector& inVector,
+                       OutVector& outVector ) const;
 
    template< typename Real2, typename Index2 >
    void addMatrix( const tnlChunkedEllpackMatrix< Real2, Device, Index2 >& matrix,
diff --git a/src/matrices/tnlDenseMatrix.h b/src/matrices/tnlDenseMatrix.h
index 80cd8f6dfaf9155627d5440d9bd43828daef97de..d991c5aefa3644ea1b8d3324ec3f42833afe8120 100644
--- a/src/matrices/tnlDenseMatrix.h
+++ b/src/matrices/tnlDenseMatrix.h
@@ -22,6 +22,9 @@
 #include <matrices/tnlMatrix.h>
 #include <core/arrays/tnlArray.h>
 
+template< typename Device >
+class tnlDenseMatrixDeviceDependentCode;
+
 template< typename Real = double,
           typename Device = tnlHost,
           typename Index = int >
@@ -145,9 +148,9 @@ class tnlDenseMatrix : public tnlMatrix< Real, Device, Index >
    typename Vector::RealType rowVectorProduct( const IndexType row,
                                                const Vector& vector ) const;
 
-   template< typename Vector >
-   void vectorProduct( const Vector& inVector,
-                       Vector& outVector ) const;
+   template< typename InVector, typename OutVector >
+   void vectorProduct( const InVector& inVector,
+                       OutVector& outVector ) const;
 
    template< typename Matrix >
    void addMatrix( const Matrix& matrix,
@@ -202,6 +205,8 @@ class tnlDenseMatrix : public tnlMatrix< Real, Device, Index >
    IndexType getElementIndex( const IndexType row,
                               const IndexType column ) const;
 
+   typedef tnlDenseMatrixDeviceDependentCode< DeviceType > DeviceDependentCode;
+   friend class tnlDenseMatrixDeviceDependentCode< DeviceType >;
 
 };
 
diff --git a/src/matrices/tnlEllpackMatrix.h b/src/matrices/tnlEllpackMatrix.h
index 646c0219c67d0dd7b2caaa8d939de8ab43f88402..be90ae6f711e6f041067dfc45c4980a6dc420bec 100644
--- a/src/matrices/tnlEllpackMatrix.h
+++ b/src/matrices/tnlEllpackMatrix.h
@@ -148,9 +148,10 @@ template< typename Vector >
    typename Vector::RealType rowVectorProduct( const IndexType row,
                                                const Vector& vector ) const;
 
-   template< typename Vector >
-   void vectorProduct( const Vector& inVector,
-                       Vector& outVector ) const;
+   template< typename InVector,
+             typename OutVector >
+   void vectorProduct( const InVector& inVector,
+                       OutVector& outVector ) const;
 
    template< typename Real2, typename Index2 >
    void addMatrix( const tnlEllpackMatrix< Real2, Device, Index2 >& matrix,
diff --git a/src/matrices/tnlMatrix.h b/src/matrices/tnlMatrix.h
index e927ca66ab6b24912cf18d06f7d0da159761301d..a80d2843a4a9917b6cf30e9d2e03b0f35bb395a4 100644
--- a/src/matrices/tnlMatrix.h
+++ b/src/matrices/tnlMatrix.h
@@ -133,10 +133,11 @@ ostream& operator << ( ostream& str, const tnlMatrix< Real, Device, Index >& m )
 }
 
 template< typename Matrix,
-          typename Vector >
+          typename InVector,
+          typename OutVector >
 void tnlMatrixVectorProductCuda( const Matrix& matrix,
-                                 const Vector& inVector,
-                                 Vector& outVector );
+                                 const InVector& inVector,
+                                 OutVector& outVector );
 
 
 #include <implementation/matrices/tnlMatrix_impl.h>
diff --git a/src/matrices/tnlMultidiagonalMatrix.h b/src/matrices/tnlMultidiagonalMatrix.h
index 831a4fbc2b320a683235b6f596172f87d3fd82ae..d0ecf615c4d9f6c15a506250e031e90804376501 100644
--- a/src/matrices/tnlMultidiagonalMatrix.h
+++ b/src/matrices/tnlMultidiagonalMatrix.h
@@ -21,6 +21,9 @@
 #include <matrices/tnlMatrix.h>
 #include <core/vectors/tnlVector.h>
 
+template< typename Device >
+class tnlMultidiagonalMatrixDeviceDependentCode;
+
 template< typename Real, typename Device = tnlHost, typename Index = int >
 class tnlMultidiagonalMatrix : public tnlMatrix< Real, Device, Index >
 {
@@ -148,9 +151,10 @@ class tnlMultidiagonalMatrix : public tnlMatrix< Real, Device, Index >
    typename Vector::RealType rowVectorProduct( const IndexType row,
                                                const Vector& vector ) const;
 
-   template< typename Vector >
-   void vectorProduct( const Vector& inVector,
-                       Vector& outVector ) const;
+   template< typename InVector,
+             typename OutVector >
+   void vectorProduct( const InVector& inVector,
+                       OutVector& outVector ) const;
 
    template< typename Real2, typename Index2 >
    void addMatrix( const tnlMultidiagonalMatrix< Real2, Device, Index2 >& matrix,
@@ -183,7 +187,6 @@ class tnlMultidiagonalMatrix : public tnlMatrix< Real, Device, Index >
                          const IndexType column,
                          IndexType& index ) const;
 
-
 #ifdef HAVE_CUDA
    __device__ __host__
 #endif
@@ -195,6 +198,10 @@ class tnlMultidiagonalMatrix : public tnlMatrix< Real, Device, Index >
 
    tnlVector< Index, Device, Index > diagonalsShift;
 
+   typedef tnlMultidiagonalMatrixDeviceDependentCode< DeviceType > DeviceDependentCode;
+   friend class tnlMultidiagonalMatrixDeviceDependentCode< DeviceType >;
+
+
 };
 
 #include <implementation/matrices/tnlMultidiagonalMatrix_impl.h>
diff --git a/src/matrices/tnlSlicedEllpackMatrix.h b/src/matrices/tnlSlicedEllpackMatrix.h
index d3dfc76b74f2434021823bc03a87166bbcedb31a..2530868d274a8c944a06095ff24fb293f0bb2f4f 100644
--- a/src/matrices/tnlSlicedEllpackMatrix.h
+++ b/src/matrices/tnlSlicedEllpackMatrix.h
@@ -158,9 +158,10 @@ class tnlSlicedEllpackMatrix : public tnlSparseMatrix< Real, Device, Index >
    typename Vector::RealType rowVectorProduct( const IndexType row,
                                                const Vector& vector ) const;
 
-   template< typename Vector >
-   void vectorProduct( const Vector& inVector,
-                       Vector& outVector ) const;
+   template< typename InVector,
+             typename OutVector >
+   void vectorProduct( const InVector& inVector,
+                       OutVector& outVector ) const;
 
    template< typename Real2, typename Index2 >
    void addMatrix( const tnlSlicedEllpackMatrix< Real2, Device, Index2 >& matrix,
diff --git a/src/matrices/tnlTridiagonalMatrix.h b/src/matrices/tnlTridiagonalMatrix.h
index cba9cd70b677739fba7d8665af19d14bfdb4a69f..7bcafd79e0ae21684b87a1b3789a0ca5a492d58d 100644
--- a/src/matrices/tnlTridiagonalMatrix.h
+++ b/src/matrices/tnlTridiagonalMatrix.h
@@ -21,6 +21,9 @@
 #include <matrices/tnlMatrix.h>
 #include <core/vectors/tnlVector.h>
 
+template< typename Device >
+class tnlTridiagonalMatrixDeviceDependentCode;
+
 template< typename Real = double,
           typename Device = tnlHost,
           typename Index = int >
@@ -143,9 +146,10 @@ class tnlTridiagonalMatrix : public tnlMatrix< Real, Device, Index >
    typename Vector::RealType rowVectorProduct( const IndexType row,
                                                const Vector& vector ) const;
 
-   template< typename Vector >
-   void vectorProduct( const Vector& inVector,
-                       Vector& outVector ) const;
+   template< typename InVector,
+             typename OutVector >
+   void vectorProduct( const InVector& inVector,
+                       OutVector& outVector ) const;
 
    template< typename Real2, typename Index2 >
    void addMatrix( const tnlTridiagonalMatrix< Real2, Device, Index2 >& matrix,
@@ -187,6 +191,9 @@ class tnlTridiagonalMatrix : public tnlMatrix< Real, Device, Index >
                               const IndexType column ) const;
 
    tnlVector< RealType, DeviceType, IndexType > values;
+
+   typedef tnlTridiagonalMatrixDeviceDependentCode< DeviceType > DeviceDependentCode;
+   friend class tnlTridiagonalMatrixDeviceDependentCode< DeviceType >;
 };
 
 #include <implementation/matrices/tnlTridiagonalMatrix_impl.h>
diff --git a/tests/benchmarks/CMakeLists.txt b/tests/benchmarks/CMakeLists.txt
index fbbc8f18becd1dd325c2a777f14cc5434130bf4d..7eacc115c2072d74bbc77cc4a8305882c2292e0f 100755
--- a/tests/benchmarks/CMakeLists.txt
+++ b/tests/benchmarks/CMakeLists.txt
@@ -1,5 +1,31 @@
 ADD_SUBDIRECTORY( share )
 
+IF( BUILD_CUDA )
+    CUDA_ADD_EXECUTABLE( tnl-benchmark-spmv${debugExt} tnl-benchmark-spmv.cu
+                         OPTIONS -arch sm_20 -shared )
+    SET_TARGET_PROPERTIES( tnl-benchmark-spmv${debugExt} PROPERTIES CUDA_COMPILE_FLAGS "${CXX_OPTIMIZE_FLAGS}" )
+    CUDA_ADD_EXECUTABLE( tnl-benchmark-linear-solvers${debugExt} tnl-benchmark-linear-solvers.cu
+                         OPTIONS -arch sm_20 -shared )
+    SET_TARGET_PROPERTIES( tnl-benchmark-spmv${debugExt} PROPERTIES CUDA_COMPILE_FLAGS "${CXX_OPTIMIZE_FLAGS}" )
+    SET_TARGET_PROPERTIES( tnl-benchmark-linear-solvers${debugExt} PROPERTIES CUDA_COMPILE_FLAGS "${CXX_OPTIMIZE_FLAGS}" )                    
+ELSE()
+    ADD_EXECUTABLE( tnl-benchmark-spmv${debugExt} tnl-benchmark-spmv.cpp )
+    ADD_EXECUTABLE( tnl-benchmark-linear-solvers${debugExt} tnl-benchmark-linear-solvers.cpp )
+    SET_TARGET_PROPERTIES( tnl-benchmark-spmv${debugExt} PROPERTIES COMPILE_FLAGS "${CXX_OPTIMIZE_FLAGS}" )
+    SET_TARGET_PROPERTIES( tnl-benchmark-linear-solvers${debugExt} PROPERTIES COMPILE_FLAGS "${CXX_OPTIMIZE_FLAGS}" )
+ENDIF()
+TARGET_LINK_LIBRARIES( tnl-benchmark-spmv${debugExt} tnl${debugExt}-${tnlVersion} )
+TARGET_LINK_LIBRARIES( tnl-benchmark-linear-solvers${debugExt} tnl${debugExt}-${tnlVersion} )
+                                                              
+INSTALL( TARGETS tnl-benchmark-spmv${debugExt}
+                 tnl-benchmark-linear-solvers${debugExt}
+         RUNTIME DESTINATION bin )
+
+
+
+
+
+####################################################
 SET( tnlSpmvBenchmark_headers sparse-matrix-benchmark.h             
                               tnlSpmvBenchmark.h
                               tnlSpmvBenchmarkAdaptiveRgCSRMatrix.h
@@ -19,21 +45,7 @@ ELSE()
 ENDIF()
 TARGET_LINK_LIBRARIES( tnl-sparse-matrix-benchmark${debugExt} tnl${debugExt}-${tnlVersion}
                                                               ${CUSPARSE_LIBRARY} )
-                                                              
-                                                              
-IF( BUILD_CUDA )
-    CUDA_ADD_EXECUTABLE( tnl-benchmark-spmv${debugExt} tnl-benchmark-spmv.cu
-                         OPTIONS -arch sm_20 -shared )
-    SET_TARGET_PROPERTIES( tnl-benchmark-spmv${debugExt} PROPERTIES CUDA_COMPILE_FLAGS "${CXX_OPTIMIZE_FLAGS}" )        
-ELSE()
-    ADD_EXECUTABLE( tnl-benchmark-spmv${debugExt} tnl-benchmark-spmv.cpp )
-    SET_TARGET_PROPERTIES( tnl-benchmark-spmv${debugExt} PROPERTIES COMPILE_FLAGS "${CXX_OPTIMIZE_FLAGS}" )
-ENDIF()
-TARGET_LINK_LIBRARIES( tnl-benchmark-spmv${debugExt} tnl${debugExt}-${tnlVersion} )
-                                                              
-INSTALL( TARGETS tnl-sparse-matrix-benchmark${debugExt}
-                 tnl-benchmark-spmv${debugExt}
-         RUNTIME DESTINATION bin )
+
 
 
 #IF( BUILD_CUDA )
diff --git a/tests/benchmarks/share/CMakeLists.txt b/tests/benchmarks/share/CMakeLists.txt
index 3734e0f0a7a9f2c1b79cf57bc8d4e167ca356f05..e8bb6840d9ef9b75fa1c6faef90785611898c5e6 100755
--- a/tests/benchmarks/share/CMakeLists.txt
+++ b/tests/benchmarks/share/CMakeLists.txt
@@ -1,6 +1,7 @@
 INSTALL( FILES tnl-sparse-matrix-benchmark.cfg.desc
                tnl-matrix-solvers-benchmark.cfg.desc
                tnl-benchmark-spmv.cfg.desc
+               tnl-benchmark-linear-solvers.cfg.desc
          DESTINATION share/tnl-${tnlVersion} )
 INSTALL( FILES matrix-market
                florida-matrix-market
@@ -9,6 +10,7 @@ INSTALL( FILES matrix-market
                draw-matrices
                run-matrix-solvers-benchmark
                run-tnl-benchmark-spmv
+               run-tnl-benchmark-linear-solvers
                cuda-profiler.conf
                process-cuda-profile.pl 
                tnl-log-to-html.py
diff --git a/tests/benchmarks/share/run-tnl-benchmark-linear-solvers b/tests/benchmarks/share/run-tnl-benchmark-linear-solvers
new file mode 100644
index 0000000000000000000000000000000000000000..d0cc977f610951a2e52258dfb26ea6ed2840012d
--- /dev/null
+++ b/tests/benchmarks/share/run-tnl-benchmark-linear-solvers
@@ -0,0 +1,68 @@
+#!/usr/bin/env bash
+                
+DEBUG="no"
+STOP_TIME="1"
+export CUDA_PROFILE=0
+
+PWD=`pwd`
+IWD="$PWD"
+BASE="ftp://math.nist.gov/pub/MatrixMarket2/Harwell-Boeing/"
+BENCHMARK="tnl-benchmark-linear-solvers"
+BENCHMARK_DBG="tnl-benchmark-linear-solvers-dbg"
+
+export CUDA_PROFILE_CONFIG="$IWD/cuda-profiler.conf"
+PROCESS_CUDA_PROFILE="$IWD/process-cuda-profile.pl"
+source matrix-market
+source florida-matrix-market
+
+for link in $MM_MATRICES;
+do
+   echo "======================================================================================================"
+   matrix=matrices`echo $link | sed 's/ftp:\/\/math.nist.gov\/pub//'`
+   unzipped_matrix=`echo $matrix | sed 's/.gz//'`
+   if test ! -e $matrix;
+   then
+      echo "Matrix $matrix is missing !!! Run the script 'get-matrices' first."
+      #echo "Matrix $matrix is missing !!! Run the script 'get-matrices' first." >> sparse-matrix-benchmark.log            
+   else
+      gunzip -c ${matrix} > ${unzipped_matrix}      
+      echo "Benchmarking with the matrix $unzipped_matrix ..."
+      export CUDA_PROFILE_LOG=$unzipped_matrix.float.log
+      if test x$DEBUG = xyes;
+      then
+         gdb --args ${BENCHMARK_DBG} --test mtx --input-file $unzipped_matrix --log-file sparse-matrix-benchmark.log --verbose 1
+      else
+         $BENCHMARK --test mtx --input-file $unzipped_matrix --log-file sparse-matrix-benchmark.log --verbose 1
+      fi
+      #perl $PROCESS_CUDA_PROFILE $unzipped_matrix.float.log sparse-matrix-profiling-float.log          
+   fi
+done
+
+for link in $FLORIDA_MM_MATRICES;
+do
+   matrix=matrices`echo $link | sed 's/http:\/\/www.cise.ufl.edu\/research\/sparse//'`
+   if test ! -e $matrix;
+   then      
+      echo "Matrix $matrix is missing !!! Run the script 'get-matrices' first."
+      #echo "Matrix $matrix is missing !!! Run the script 'get-matrices' first." >> sparse-matrix-benchmark.log
+   else
+     DIRNAME=`dirname $matrix`
+     FILENAME=`basename $matrix`
+     cd $DIRNAME
+     tar zxvf $FILENAME
+     cd $IWD
+     SUBDIRNAME=`echo $FILENAME | sed 's/.tar.gz//'`
+     rm -f $DIRNAME/$SUBDIRNAME/*_b.mtx # these are usually in array format
+     for file in $DIRNAME/$SUBDIRNAME/*.mtx;
+     do
+         echo "======================================================================================================"
+         echo "Benchmarking with the matrix $file ..."
+         if test x$DEBUG = xyes;
+         then
+            gdb --args $BENCHMARK --test mtx --input-file $file --log-file sparse-matrix-benchmark.log --verbose 1
+         else
+            $BENCHMARK --test mtx --input-file $file --log-file sparse-matrix-benchmark.log --verbose 1                        
+         fi
+     done
+   fi
+done
diff --git a/tests/benchmarks/share/run-tnl-benchmark-spmv b/tests/benchmarks/share/run-tnl-benchmark-spmv
index 6abb115919bf2c7a50c26d9fee5f2052eabb65ed..75ea08219bb454533bd882594d44772586ef50cb 100644
--- a/tests/benchmarks/share/run-tnl-benchmark-spmv
+++ b/tests/benchmarks/share/run-tnl-benchmark-spmv
@@ -7,8 +7,8 @@ export CUDA_PROFILE=0
 PWD=`pwd`
 IWD="$PWD"
 BASE="ftp://math.nist.gov/pub/MatrixMarket2/Harwell-Boeing/"
-SPARSE_MATRIX_BENCHMARK="tnl-benchmark-spmv"
-SPARSE_MATRIX_BENCHMARK_DBG="tnl-benchmark-spmv-dbg"
+BENCHMARK="tnl-benchmark-spmv"
+BENCHMARK_DBG="tnl-benchmark-spmv-dbg"
 
 export CUDA_PROFILE_CONFIG="$IWD/cuda-profiler.conf"
 PROCESS_CUDA_PROFILE="$IWD/process-cuda-profile.pl"
@@ -17,7 +17,7 @@ source florida-matrix-market
 
 for link in $MM_MATRICES;
 do
-   echo "###############################################################################################"
+   echo "======================================================================================================"
    matrix=matrices`echo $link | sed 's/ftp:\/\/math.nist.gov\/pub//'`
    unzipped_matrix=`echo $matrix | sed 's/.gz//'`
    if test ! -e $matrix;
@@ -30,9 +30,9 @@ do
       export CUDA_PROFILE_LOG=$unzipped_matrix.float.log
       if test x$DEBUG = xyes;
       then
-         gdb --args ${SPARSE_MATRIX_BENCHMARK_DBG} --test mtx --input-file $unzipped_matrix --log-file sparse-matrix-benchmark.log --stop-time $STOP_TIME --verbose 1
+         gdb --args ${BENCHMARK_DBG} --test mtx --input-file $unzipped_matrix --log-file sparse-matrix-benchmark.log --stop-time $STOP_TIME --verbose 1
       else
-         $SPARSE_MATRIX_BENCHMARK --test mtx --input-file $unzipped_matrix --pdf-file $unzipped_matrix.pdf --log-file sparse-matrix-benchmark.log --stop-time $STOP_TIME --verbose 1
+         $BENCHMARK --test mtx --input-file $unzipped_matrix --pdf-file $unzipped_matrix.pdf --log-file sparse-matrix-benchmark.log --stop-time $STOP_TIME --verbose 1
       fi
       #perl $PROCESS_CUDA_PROFILE $unzipped_matrix.float.log sparse-matrix-profiling-float.log          
    fi
@@ -54,14 +54,14 @@ do
      SUBDIRNAME=`echo $FILENAME | sed 's/.tar.gz//'`
      rm -f $DIRNAME/$SUBDIRNAME/*_b.mtx # these are usualy in array format
      for file in $DIRNAME/$SUBDIRNAME/*.mtx;
-     do
-         echo "###############################################################################################"
+     do        
+         echo "======================================================================================================"
          echo "Benchmarking with the matrix $file ..."
          if test x$DEBUG = xyes;
          then
-            gdb --args $SPARSE_MATRIX_BENCHMARK --test mtx --input-file $file --pdf-file $file.pdf --log-file sparse-matrix-benchmark.log --stop-time $STOP_TIME --verbose 1
+            gdb --args $BENCHMARK --test mtx --input-file $file --pdf-file $file.pdf --log-file sparse-matrix-benchmark.log --stop-time $STOP_TIME --verbose 1
          else
-            $SPARSE_MATRIX_BENCHMARK --test mtx --input-file $file --pdf-file $file.pdf --log-file sparse-matrix-benchmark.log --stop-time $STOP_TIME --verbose 1                        
+            $BENCHMARK --test mtx --input-file $file --pdf-file $file.pdf --log-file sparse-matrix-benchmark.log --stop-time $STOP_TIME --verbose 1                        
          fi
      done
    fi
diff --git a/tests/benchmarks/share/tnl-benchmark-linear-solvers.cfg.desc b/tests/benchmarks/share/tnl-benchmark-linear-solvers.cfg.desc
new file mode 100644
index 0000000000000000000000000000000000000000..9f104892d592ce0e90dd5baacf555ea3748b3a69
--- /dev/null
+++ b/tests/benchmarks/share/tnl-benchmark-linear-solvers.cfg.desc
@@ -0,0 +1,11 @@
+group benchmarkSetup
+{
+   string test(!)                                       [Can be tridiagonal, multidiagonal, multidiagonal-with-long-rows, mtx, tnl.];
+   string input-file(!)                                 [Input binary file name.];
+   string log-file("tnl-benchmark-linear-solvers.log")  [Log file name.];
+   string precision("double")                           [Precision of the arithmetics.];
+   string matrix-format( "csr" )                        [Matrix format can be dense, tridiagonal, multidiagonal, ellpack, sliced-ellpack, chunked-ellpack, csr.];
+   string solver( "gmres" )                             [Solver can be sor, cg, gmres, ... ];
+   string device( "host" )                              [Device can be host or cuda.]; 
+   integer verbose(1)                                   [Verbose mode.];
+},[Arguments to set-up the benchmark.];
\ No newline at end of file
diff --git a/tests/benchmarks/tnl-benchmark-linear-solvers.cpp b/tests/benchmarks/tnl-benchmark-linear-solvers.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..6e050dccd0c42656d756b6dc1083f700779a6950
--- /dev/null
+++ b/tests/benchmarks/tnl-benchmark-linear-solvers.cpp
@@ -0,0 +1,18 @@
+/***************************************************************************
+                          tnl-benchmark-linear-solvers.cpp  -  description
+                             -------------------
+    begin                : Jun 22, 2014
+    copyright            : (C) 2014 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/***************************************************************************
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ ***************************************************************************/
+
+#include "tnl-benchmark-linear-solvers.h"
diff --git a/tests/benchmarks/tnl-benchmark-linear-solvers.cu b/tests/benchmarks/tnl-benchmark-linear-solvers.cu
new file mode 100644
index 0000000000000000000000000000000000000000..317bbc9aa6c18ab9b6fa98e1b8ba7a60e7d32b47
--- /dev/null
+++ b/tests/benchmarks/tnl-benchmark-linear-solvers.cu
@@ -0,0 +1,18 @@
+/***************************************************************************
+                          tnl-benchmark-linear-solvers.cu  -  description
+                             -------------------
+    begin                : Jun 22, 2014
+    copyright            : (C) 2014 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/***************************************************************************
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ ***************************************************************************/
+
+#include "tnl-benchmark-linear-solvers.h"
\ No newline at end of file
diff --git a/tests/benchmarks/tnl-benchmark-linear-solvers.h b/tests/benchmarks/tnl-benchmark-linear-solvers.h
new file mode 100644
index 0000000000000000000000000000000000000000..42d53e1900bdaf042c0f5a54f0ce4863764e5982
--- /dev/null
+++ b/tests/benchmarks/tnl-benchmark-linear-solvers.h
@@ -0,0 +1,195 @@
+/***************************************************************************
+                          tnl-benchmark-linear-solvers.h  -  description
+                             -------------------
+    begin                : Jun 22, 2014
+    copyright            : (C) 2014 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/***************************************************************************
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ ***************************************************************************/
+
+#ifndef TNL_BENCHMARK_LINEAR_SOLVERS_H_
+#define TNL_BENCHMARK_LINEAR_SOLVERS_H_
+
+#include <fstream>
+#include <iomanip>
+#include <unistd.h>
+
+#include <config/tnlConfigDescription.h>
+#include <config/tnlParameterContainer.h>
+#include <core/tnlTimerRT.h>
+#include <matrices/tnlDenseMatrix.h>
+#include <matrices/tnlTridiagonalMatrix.h>
+#include <matrices/tnlMultidiagonalMatrix.h>
+#include <matrices/tnlCSRMatrix.h>
+#include <matrices/tnlEllpackMatrix.h>
+#include <matrices/tnlSlicedEllpackMatrix.h>
+#include <matrices/tnlChunkedEllpackMatrix.h>
+#include <matrices/tnlMatrixReader.h>
+#include <solvers/linear/krylov/tnlGMRESSolver.h>
+#include <solvers/linear/tnlLinearResidueGetter.h>
+#include <solvers/tnlIterativeSolverMonitor.h>
+
+using namespace std;
+
+#include "tnlConfig.h"
+const char configFile[] = TNL_CONFIG_DIRECTORY "tnl-benchmark-linear-solvers.cfg.desc";
+
+template< typename Solver >
+bool benchmarkSolver( const tnlParameterContainer& parameters,
+                      const typename Solver::MatrixType& matrix)
+{
+   typedef typename Solver::MatrixType MatrixType;
+   typedef typename MatrixType::RealType RealType;
+   typedef typename MatrixType::DeviceType DeviceType;
+   typedef typename MatrixType::IndexType IndexType;
+   typedef tnlVector< RealType, DeviceType, IndexType > VectorType;
+
+   VectorType x, y, b;
+   x.setSize( matrix.getColumns() );
+   x.setValue( 1.0 / ( RealType ) matrix.getColumns() );
+   y.setSize( matrix.getColumns() );
+   b.setSize( matrix.getRows() );
+   matrix.vectorProduct( x, b );
+
+   Solver solver;
+   tnlIterativeSolverMonitor< RealType, IndexType > monitor;
+   monitor.setVerbose( 1 );
+   solver.setSolverMonitor( monitor );
+   solver.setMatrix( matrix );
+   solver.setMaxResidue( 1.0e-6 );
+   solver.template solve< VectorType, tnlLinearResidueGetter< MatrixType, VectorType > >( b, y );
+   cout << endl;
+   return true;
+}
+
+template< typename Matrix >
+bool readMatrix( const tnlParameterContainer& parameters,
+                 Matrix& matrix )
+{
+   const tnlString fileName = parameters.GetParameter< tnlString >( "input-file" );
+
+   Matrix* hostMatrix;
+   if( Matrix::DeviceType::DeviceType == tnlCudaDevice )
+   {
+
+   }
+   if( Matrix::DeviceType::DeviceType == tnlHostDevice )
+   {
+      hostMatrix = &matrix;
+      try
+      {
+         if( ! tnlMatrixReader< Matrix >::readMtxFile( fileName, *hostMatrix ) )
+         {
+            cerr << "I am not able to read the matrix file " << fileName << "." << endl;
+            /*logFile << endl;
+            logFile << inputFileName << endl;
+            logFile << "Benchmark failed: Unable to read the matrix." << endl;*/
+            return false;
+         }
+      }
+      catch( std::bad_alloc )
+      {
+         cerr << "Not enough memory to read the matrix." << endl;
+         /*logFile << endl;
+         logFile << inputFileName << endl;
+         logFile << "Benchmark failed: Not enough memory." << endl;*/
+         return false;
+      }
+   }
+   return true;
+}
+
+template< typename Matrix >
+bool resolveLinearSolver( const tnlParameterContainer& parameters )
+{
+   const tnlString& solver = parameters.GetParameter< tnlString >( "solver" );
+
+   Matrix matrix;
+   if( ! readMatrix( parameters, matrix ) )
+      return false;
+
+   if( solver == "gmres" )
+      return benchmarkSolver< tnlGMRESSolver< Matrix > >( parameters, matrix );
+
+   cerr << "Unknown solver " << solver << "." << endl;
+   return false;
+}
+
+template< typename Real,
+          typename Device >
+bool resolveMatrixFormat( const tnlParameterContainer& parameters )
+{
+   const tnlString& matrixFormat = parameters.GetParameter< tnlString >( "matrix-format" );
+
+   if( matrixFormat == "dense" )
+      return resolveLinearSolver< tnlDenseMatrix< Real, Device, int > >( parameters );
+
+   if( matrixFormat == "tridiagonal" )
+      return resolveLinearSolver< tnlTridiagonalMatrix< Real, Device, int > >( parameters );
+
+   if( matrixFormat == "multidiagonal" )
+      return resolveLinearSolver< tnlMultidiagonalMatrix< Real, Device, int > >( parameters );
+
+   if( matrixFormat == "ellpack" )
+      return resolveLinearSolver< tnlEllpackMatrix< Real, Device, int > >( parameters );
+
+   if( matrixFormat == "sliced-ellpack" )
+      return resolveLinearSolver< tnlSlicedEllpackMatrix< Real, Device, int > >( parameters );
+
+   if( matrixFormat == "chunked-ellpack" )
+      return resolveLinearSolver< tnlChunkedEllpackMatrix< Real, Device, int > >( parameters );
+
+   if( matrixFormat == "csr" )
+      return resolveLinearSolver< tnlCSRMatrix< Real, Device, int > >( parameters );
+
+   cerr << "Unknown matrix format " << matrixFormat << "." << endl;
+   return false;
+}
+
+template< typename Real >
+bool resolveDevice( const tnlParameterContainer& parameters )
+{
+   const tnlString& device = parameters.GetParameter< tnlString >( "device" );
+
+   if( device == "host" )
+      return resolveMatrixFormat< Real, tnlHost >( parameters );
+
+   //if( device == "cuda" )
+   //   return resolveMatrixFormat< Real, tnlCuda >( parameters );
+
+   cerr << "Uknown device " << device << "." << endl;
+   return false;
+}
+
+int main( int argc, char* argv[] )
+{
+   tnlParameterContainer parameters;
+   tnlConfigDescription conf_desc;
+
+   if( conf_desc. ParseConfigDescription( configFile ) != 0 )
+      return 1;
+   if( ! ParseCommandLine( argc, argv, conf_desc, parameters ) )
+   {
+      conf_desc. PrintUsage( argv[ 0 ] );
+      return 1;
+   }
+   const tnlString& precision = parameters.GetParameter< tnlString >( "precision" );
+   if( precision == "float" )
+      if( ! resolveDevice< float >( parameters ) )
+         return EXIT_FAILURE;
+   if( precision == "double" )
+      if( ! resolveDevice< double >( parameters ) )
+         return EXIT_FAILURE;
+   return EXIT_SUCCESS;
+}
+
+
+#endif /* TNL_BENCHMARK_LINEAR_SOLVERS_H_ */
diff --git a/tests/benchmarks/tnl-benchmark-spmv.h b/tests/benchmarks/tnl-benchmark-spmv.h
index 0a7a994adb8b8d7b53c2f1eb07efc9642fdb37cf..eb431fe5596d7f78a817282cdadce5a90df3e86a 100644
--- a/tests/benchmarks/tnl-benchmark-spmv.h
+++ b/tests/benchmarks/tnl-benchmark-spmv.h
@@ -58,9 +58,35 @@ bool initLogFile( fstream& logFile, const tnlString& fileName )
       logFile << "#  Speedup" << speedupColoring << endl;
 #ifdef HAVE_CUDA
       logFile << "# CUDA" << endl;
-      logFile << "#  Gflops" << endl;
-      logFile << "#  Throughput" << endl;
-      logFile << "#  Speedup" << speedupColoring << " SORT - csr-cuda-speedup.txt" << endl;
+      logFile << "#  Scalar" << endl;
+      logFile << "#   Gflops" << endl;
+      logFile << "#   Throughput" << endl;
+      logFile << "#   Speedup" << speedupColoring << " SORT - csr-scalar-cuda-speedup.txt" << endl;
+      logFile << "#  Vector" << endl;
+      logFile << "#   Warp Size 1" << endl;
+      logFile << "#    Gflops" << endl;
+      logFile << "#    Throughput" << endl;
+      logFile << "#    Speedup" << speedupColoring << " SORT - csr-vector-cuda-speedup.txt" << endl;
+      logFile << "#   Warp Size 2" << endl;
+      logFile << "#    Gflops" << endl;
+      logFile << "#    Throughput" << endl;
+      logFile << "#    Speedup" << speedupColoring << " SORT - csr-vector-cuda-speedup.txt" << endl;
+      logFile << "#   Warp Size 4" << endl;
+      logFile << "#    Gflops" << endl;
+      logFile << "#    Throughput" << endl;
+      logFile << "#    Speedup" << speedupColoring << " SORT - csr-vector-cuda-speedup.txt" << endl;
+      logFile << "#   Warp Size 8" << endl;
+      logFile << "#    Gflops" << endl;
+      logFile << "#    Throughput" << endl;
+      logFile << "#    Speedup" << speedupColoring << " SORT - csr-vector-cuda-speedup.txt" << endl;
+      logFile << "#   Warp Size 16" << endl;
+      logFile << "#    Gflops" << endl;
+      logFile << "#    Throughput" << endl;
+      logFile << "#    Speedup" << speedupColoring << " SORT - csr-vector-cuda-speedup.txt" << endl;
+      logFile << "#   Warp Size 32" << endl;
+      logFile << "#    Gflops" << endl;
+      logFile << "#    Throughput" << endl;
+      logFile << "#    Speedup" << speedupColoring << " SORT - csr-vector-cuda-speedup.txt" << endl;
 #endif
       logFile << "#Ellpack Format" << endl;
       logFile << "# Padding (in %)" << paddingColoring << endl;
@@ -235,12 +261,23 @@ bool setupBenchmark( const tnlParameterContainer& parameters )
    {
       typedef tnlCSRMatrix< Real, tnlHost, int > CSRMatrixType;
       CSRMatrixType csrMatrix;
-      if( ! tnlMatrixReader< CSRMatrixType >::readMtxFile( inputFileName, csrMatrix ) )
+      try
       {
-         cerr << "I am not able to read the matrix file " << inputFileName << "." << endl;
+         if( ! tnlMatrixReader< CSRMatrixType >::readMtxFile( inputFileName, csrMatrix ) )
+         {
+            cerr << "I am not able to read the matrix file " << inputFileName << "." << endl;
+            logFile << endl;
+            logFile << inputFileName << endl;
+            logFile << "Benchmark failed: Unable to read the matrix." << endl;
+            return false;
+         }
+      }
+      catch( std::bad_alloc )
+      {
+         cerr << "Not enough memory to read the matrix." << endl;
          logFile << endl;
          logFile << inputFileName << endl;
-         logFile << "Benchmark failed." << endl;
+         logFile << "Benchmark failed: Not enough memory." << endl;
          return false;
       }
       if( verbose )
@@ -289,16 +326,78 @@ bool setupBenchmark( const tnlParameterContainer& parameters )
       if( ! cudaCSRMatrix.copyFrom( csrMatrix, rowLengthsCuda ) )
       {
          cerr << "I am not able to transfer the matrix on GPU." << endl;
-         writeTestFailed( logFile, 3 );
+         writeTestFailed( logFile, 21 );
       }
       else
       {
          cout << " done.   \r";
+         cudaCSRMatrix.setCudaKernelType( CSRMatrixCudaType::scalar );
+         benchmarkMatrix( cudaCSRMatrix,
+                          cudaX,
+                          cudaB,
+                          nonzeroElements,
+                          "CSR Cuda Scalar",
+                          stopTime,
+                          baseline,
+                          verbose,
+                          logFile );
+         cudaCSRMatrix.setCudaKernelType( CSRMatrixCudaType::vector );
+         cudaCSRMatrix.setCudaWarpSize( 1 );
+         benchmarkMatrix( cudaCSRMatrix,
+                          cudaX,
+                          cudaB,
+                          nonzeroElements,
+                          "CSR Cuda Vector 1",
+                          stopTime,
+                          baseline,
+                          verbose,
+                          logFile );
+         cudaCSRMatrix.setCudaWarpSize( 2 );
+         benchmarkMatrix( cudaCSRMatrix,
+                          cudaX,
+                          cudaB,
+                          nonzeroElements,
+                          "CSR Cuda Vector 2",
+                          stopTime,
+                          baseline,
+                          verbose,
+                          logFile );
+         cudaCSRMatrix.setCudaWarpSize( 4 );
+         benchmarkMatrix( cudaCSRMatrix,
+                          cudaX,
+                          cudaB,
+                          nonzeroElements,
+                          "CSR Cuda Vector 4",
+                          stopTime,
+                          baseline,
+                          verbose,
+                          logFile );
+         cudaCSRMatrix.setCudaWarpSize( 8 );
+         benchmarkMatrix( cudaCSRMatrix,
+                          cudaX,
+                          cudaB,
+                          nonzeroElements,
+                          "CSR Cuda Vector 8",
+                          stopTime,
+                          baseline,
+                          verbose,
+                          logFile );
+         cudaCSRMatrix.setCudaWarpSize( 16 );
+         benchmarkMatrix( cudaCSRMatrix,
+                          cudaX,
+                          cudaB,
+                          nonzeroElements,
+                          "CSR Cuda Vector 16",
+                          stopTime,
+                          baseline,
+                          verbose,
+                          logFile );
+         cudaCSRMatrix.setCudaWarpSize( 32 );
          benchmarkMatrix( cudaCSRMatrix,
                           cudaX,
                           cudaB,
                           nonzeroElements,
-                          "CSR Cuda",
+                          "CSR Cuda Vector 32",
                           stopTime,
                           baseline,
                           verbose,