diff --git a/src/core/vectors/tnlSharedVector.h b/src/core/vectors/tnlSharedVector.h
index 828490c426dfbe7459439927c73b6d8b2518afa6..8dfe5575c2c38dcf02cedae88311a300d80af270 100644
--- a/src/core/vectors/tnlSharedVector.h
+++ b/src/core/vectors/tnlSharedVector.h
@@ -143,20 +143,4 @@ class tnlSharedVector : public tnlSharedArray< Real, Device, Index >
 
 #include <implementation/core/vectors/tnlSharedVector_impl.h>
 
-#ifdef TEMPLATE_EXPLICIT_INSTANTIATION
-
-extern template class tnlSharedVector< float, tnlHost, int >;
-extern template class tnlSharedVector< double, tnlHost, int >;
-extern template class tnlSharedVector< float, tnlHost, long int >;
-extern template class tnlSharedVector< double, tnlHost, long int >;
-
-#ifdef HAVE_CUDA
-extern template class tnlSharedVector< float, tnlCuda, int >;
-extern template class tnlSharedVector< double, tnlCuda, int >;
-extern template class tnlSharedVector< float, tnlCuda, long int >;
-extern template class tnlSharedVector< double, tnlCuda, long int >;
-#endif
-
-#endif
-
 #endif /* TNLSHAREDVECTOR_H_ */
diff --git a/src/implementation/core/arrays/CMakeLists.txt b/src/implementation/core/arrays/CMakeLists.txt
index 69b3be5c4f469c5b45965e9eebed8bfacbbcccd0..90bb09f5495c8c3b0c24701df093c2814e3d4fd9 100755
--- a/src/implementation/core/arrays/CMakeLists.txt
+++ b/src/implementation/core/arrays/CMakeLists.txt
@@ -19,6 +19,7 @@ IF( BUILD_CUDA )
         ${CURRENT_DIR}/tnlArrayOperationsHost_impl.cu
         ${CURRENT_DIR}/tnlArrayOperationsCuda_impl.cu
         ${CURRENT_DIR}/tnlArray_impl.cu
+        ${CURRENT_DIR}/tnlSharedArray_impl.cu
         ${CURRENT_DIR}/tnlMultiArray_impl.cu
         PARENT_SCOPE )
 ELSE()
diff --git a/src/implementation/core/arrays/tnlSharedArray_impl.cu b/src/implementation/core/arrays/tnlSharedArray_impl.cu
new file mode 100644
index 0000000000000000000000000000000000000000..51d2eae92b17f2e597ba3754df9264389ad7c634
--- /dev/null
+++ b/src/implementation/core/arrays/tnlSharedArray_impl.cu
@@ -0,0 +1,29 @@
+/***************************************************************************
+                          tnlSharedArray_impl.cu  -  description
+                             -------------------
+    begin                : Jan 20, 2013
+    copyright            : (C) 2013 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/***************************************************************************
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ ***************************************************************************/
+
+#include <core/arrays/tnlSharedArray.h>
+
+#ifdef TEMPLATE_EXPLICIT_INSTANTIATION  // explicit instantiation definitions are emitted only when this macro is defined
+
+#ifdef HAVE_CUDA  // the tnlCuda instantiations additionally require HAVE_CUDA
+template class tnlSharedArray< float, tnlCuda, int >;
+template class tnlSharedArray< double, tnlCuda, int >;
+template class tnlSharedArray< float, tnlCuda, long int >;
+template class tnlSharedArray< double, tnlCuda, long int >;
+#endif  // HAVE_CUDA
+
+#endif  // TEMPLATE_EXPLICIT_INSTANTIATION
diff --git a/src/implementation/core/arrays/tnlSharedArray_impl.h b/src/implementation/core/arrays/tnlSharedArray_impl.h
index 134b4c984c7129da9965d8adfbadc2a0f0d666a8..1b3372ae8a89ffd7755bb9b47a435f5b8024dbed 100644
--- a/src/implementation/core/arrays/tnlSharedArray_impl.h
+++ b/src/implementation/core/arrays/tnlSharedArray_impl.h
@@ -409,16 +409,18 @@ ostream& operator << ( ostream& str, const tnlSharedArray< Element, Device, Inde
 
 #ifdef TEMPLATE_EXPLICIT_INSTANTIATION
 
-extern template class tnlSharedArray< float, tnlHost, int >;
+// TODO: the extern template declarations below do not work with CUDA 5.5, so they are commented out for now - fix it later
+
+/*extern template class tnlSharedArray< float, tnlHost, int >;
 extern template class tnlSharedArray< double, tnlHost, int >;
 extern template class tnlSharedArray< float, tnlHost, long int >;
-extern template class tnlSharedArray< double, tnlHost, long int >;
+extern template class tnlSharedArray< double, tnlHost, long int >;*/
 
 #ifdef HAVE_CUDA
-extern template class tnlSharedArray< float, tnlCuda, int >;
+/*extern template class tnlSharedArray< float, tnlCuda, int >;
 extern template class tnlSharedArray< double, tnlCuda, int >;
 extern template class tnlSharedArray< float, tnlCuda, long int >;
-extern template class tnlSharedArray< double, tnlCuda, long int >;
+extern template class tnlSharedArray< double, tnlCuda, long int >;*/
 #endif
 
 #endif
diff --git a/src/implementation/core/vectors/tnlSharedVector_impl.h b/src/implementation/core/vectors/tnlSharedVector_impl.h
index 152d03a3bfe480915db20b51a85901e93d6489a3..c345157971eb12b4d24c1971c510319fe307d40e 100644
--- a/src/implementation/core/vectors/tnlSharedVector_impl.h
+++ b/src/implementation/core/vectors/tnlSharedVector_impl.h
@@ -372,4 +372,22 @@ void tnlSharedVector< Real, Device, Index > :: computeExclusivePrefixSum( const
    tnlVectorOperations< Device >::computeExclusivePrefixSum( *this, begin, end );
 }
 
+
+#ifdef TEMPLATE_EXPLICIT_INSTANTIATION
+
+extern template class tnlSharedVector< float, tnlHost, int >;
+extern template class tnlSharedVector< double, tnlHost, int >;
+extern template class tnlSharedVector< float, tnlHost, long int >;
+extern template class tnlSharedVector< double, tnlHost, long int >;
+
+#ifdef HAVE_CUDA
+// TODO: fix this - the tnlCuda extern template declarations below do not work with CUDA 5.5, so they are commented out
+/*extern template class tnlSharedVector< float, tnlCuda, int >;
+extern template class tnlSharedVector< double, tnlCuda, int >;
+extern template class tnlSharedVector< float, tnlCuda, long int >;
+extern template class tnlSharedVector< double, tnlCuda, long int >;*/
+#endif
+
+#endif
+
 #endif /* TNLSHAREDVECTOR_H_IMPLEMENTATION */
diff --git a/src/implementation/matrices/tnlChunkedEllpackMatrix_impl.h b/src/implementation/matrices/tnlChunkedEllpackMatrix_impl.h
index 38742ae43d526ee8bb02946577dfe60fa54eb514..9d97fe4d2401d33a35aae74fa075e8f76e28a89f 100644
--- a/src/implementation/matrices/tnlChunkedEllpackMatrix_impl.h
+++ b/src/implementation/matrices/tnlChunkedEllpackMatrix_impl.h
@@ -1062,9 +1062,10 @@ typename Vector::RealType tnlChunkedEllpackMatrix< Real, Device, Index >::chunkV
 template< typename Real,
           typename Device,
           typename Index >
-   template< typename Vector >
-__device__ void tnlChunkedEllpackMatrix< Real, Device, Index >::computeSliceVectorProduct( const Vector* inVector,
-                                                                                           Vector* outVector,
+   template< typename InVector,
+             typename OutVector >
+__device__ void tnlChunkedEllpackMatrix< Real, Device, Index >::computeSliceVectorProduct( const InVector* inVector,
+                                                                                           OutVector* outVector,
                                                                                            int sliceIdx  ) const
 {
    tnlStaticAssert( DeviceType::DeviceType == tnlCudaDevice, );
diff --git a/src/implementation/matrices/tnlMatrixReader_impl.h b/src/implementation/matrices/tnlMatrixReader_impl.h
index 994f8a0b25f8f506b14bb49e4ef787a6b5c91486..c5ce0d7c6b3cb43884e77e7fa05306fb74ad1937 100644
--- a/src/implementation/matrices/tnlMatrixReader_impl.h
+++ b/src/implementation/matrices/tnlMatrixReader_impl.h
@@ -44,6 +44,15 @@ template< typename Matrix >
 bool tnlMatrixReader< Matrix >::readMtxFile( std::istream& file,
                                              Matrix& matrix,
                                              bool verbose )
+{
+   return tnlMatrixReaderDeviceDependentCode< typename Matrix::DeviceType >::readMtxFile( file, matrix, verbose );  // forwards to the specialization selected by Matrix::DeviceType and returns its result
+}
+
+template< typename Matrix >
+bool tnlMatrixReader< Matrix >::readMtxFileHostMatrix( std::istream& file,
+                                                       Matrix& matrix,
+                                                       typename Matrix::RowLengthsVector& rowLengths,
+                                                       bool verbose )
 {
    IndexType rows, columns;
    bool symmetricMatrix( false );
@@ -51,7 +60,7 @@ bool tnlMatrixReader< Matrix >::readMtxFile( std::istream& file,
    if( ! readMtxHeader( file, rows, columns, symmetricMatrix, verbose ) )
       return false;
 
-   tnlVector< int, tnlHost, int > rowLengths;
+
    if( ! matrix.setDimensions( rows, columns ) ||
        ! rowLengths.setSize( rows ) )
    {
@@ -371,5 +380,47 @@ bool tnlMatrixReader< Matrix >::parseMtxLineWithElement( const tnlString& line,
    return true;
 }
 
+template<>
+class tnlMatrixReaderDeviceDependentCode< tnlHost >  // tnlHost specialization: reads straight into the given matrix
+{
+   public:
+
+   template< typename Matrix >
+   static bool readMtxFile( std::istream& file,  // stream handed on unchanged to readMtxFileHostMatrix
+                            Matrix& matrix,      // matrix handed on unchanged to readMtxFileHostMatrix
+                            bool verbose )
+   {
+      typename Matrix::RowLengthsVector rowLengths;  // local scratch vector; readMtxFileHostMatrix receives it by reference
+      return tnlMatrixReader< Matrix >::readMtxFileHostMatrix( file, matrix, rowLengths, verbose );  // result of the host reader is returned as is
+   }
+};
+
+template<>
+class tnlMatrixReaderDeviceDependentCode< tnlCuda >  // tnlCuda specialization: reads into a Matrix::HostType object, then calls matrix.copyFrom()
+{
+   public:
+
+   template< typename Matrix >
+   static bool readMtxFile( std::istream& file,
+                            Matrix& matrix,  // destination; only written through copyFrom() below
+                            bool verbose )
+   {
+      typedef typename Matrix::HostType HostMatrixType;  // for the matrix classes in this patch, HostType is the same format with Device = tnlHost
+      typedef typename HostMatrixType::RowLengthsVector RowLengthsVector;
+
+      HostMatrixType hostMatrix;
+      RowLengthsVector rowLengthsVector;
+      if( ! tnlMatrixReader< HostMatrixType >::readMtxFileHostMatrix( file, hostMatrix, rowLengthsVector, verbose ) )  // step 1: read the file into the host-side matrix
+         return false;
+
+      typename Matrix::RowLengthsVector cudaRowLengthsVector;
+      cudaRowLengthsVector.setLike( rowLengthsVector );  // NOTE(review): the result of setLike() is not checked - confirm whether it can fail here
+      cudaRowLengthsVector = rowLengthsVector;  // step 2: copy the row lengths into the RowLengthsVector type of the target matrix
+      if( ! matrix.copyFrom( hostMatrix, cudaRowLengthsVector ) )  // step 3: fill the target matrix from the host matrix
+         return false;
+      return true;  // reached only when both the host read and copyFrom() reported success
+   }
+};
+
 
 #endif /* TNLMATRIXREADER_IMPL_H_ */
diff --git a/src/matrices/tnlCSRMatrix.h b/src/matrices/tnlCSRMatrix.h
index d3d67c91c2f39388db0601ed5184027469d4c3a7..efdf54026601e629f381ed9a8cf84797eb368793 100644
--- a/src/matrices/tnlCSRMatrix.h
+++ b/src/matrices/tnlCSRMatrix.h
@@ -33,6 +33,10 @@ class tnlCSRMatrix : public tnlSparseMatrix< Real, Device, Index >
    typedef Device DeviceType;
    typedef Index IndexType;
    typedef typename tnlSparseMatrix< RealType, DeviceType, IndexType >:: RowLengthsVector RowLengthsVector;
+   typedef tnlCSRMatrix< Real, Device, Index > ThisType;
+   typedef tnlCSRMatrix< Real, tnlHost, Index > HostType;
+   typedef tnlCSRMatrix< Real, tnlCuda, Index > CudaType;
+
 
    enum SPMVCudaKernel { scalar, vector, hybrid };
 
diff --git a/src/matrices/tnlChunkedEllpackMatrix.h b/src/matrices/tnlChunkedEllpackMatrix.h
index 87cace2d49db4ca9b1ed9a7eff1ff6de2b624d21..1ee65c1ef76d7725a2b2f4f58f08e56b9ce39dcc 100644
--- a/src/matrices/tnlChunkedEllpackMatrix.h
+++ b/src/matrices/tnlChunkedEllpackMatrix.h
@@ -62,6 +62,9 @@ class tnlChunkedEllpackMatrix : public tnlSparseMatrix< Real, Device, Index >
    typedef Index IndexType;
    typedef tnlChunkedEllpackSliceInfo< IndexType > ChunkedEllpackSliceInfo;
    typedef typename tnlSparseMatrix< RealType, DeviceType, IndexType >:: RowLengthsVector RowLengthsVector;
+   typedef tnlChunkedEllpackMatrix< Real, Device, Index > ThisType;
+   typedef tnlChunkedEllpackMatrix< Real, tnlHost, Index > HostType;
+   typedef tnlChunkedEllpackMatrix< Real, tnlCuda, Index > CudaType;
 
    tnlChunkedEllpackMatrix();
 
@@ -184,9 +187,10 @@ class tnlChunkedEllpackMatrix : public tnlSparseMatrix< Real, Device, Index >
                                                const Vector& vector ) const;
 
 #ifdef HAVE_CUDA
-   template< typename Vector >
-   __device__ void computeSliceVectorProduct( const Vector* inVector,
-                                              Vector* outVector,
+   template< typename InVector,
+             typename OutVector >
+   __device__ void computeSliceVectorProduct( const InVector* inVector,
+                                              OutVector* outVector,
                                               int gridIdx  ) const;
 #endif
 
diff --git a/src/matrices/tnlDenseMatrix.h b/src/matrices/tnlDenseMatrix.h
index d991c5aefa3644ea1b8d3324ec3f42833afe8120..4b3989803fa9ecaaa94a01e28136caafb0fa3968 100644
--- a/src/matrices/tnlDenseMatrix.h
+++ b/src/matrices/tnlDenseMatrix.h
@@ -37,6 +37,9 @@ class tnlDenseMatrix : public tnlMatrix< Real, Device, Index >
    typedef Index IndexType;
    typedef typename tnlMatrix< Real, Device, Index >::RowLengthsVector RowLengthsVector;
    typedef tnlDenseMatrix< Real, Device, Index > ThisType;
+   typedef tnlDenseMatrix< Real, tnlHost, Index > HostType;
+   typedef tnlDenseMatrix< Real, tnlCuda, Index > CudaType;
+
 
    tnlDenseMatrix();
 
diff --git a/src/matrices/tnlEllpackMatrix.h b/src/matrices/tnlEllpackMatrix.h
index be90ae6f711e6f041067dfc45c4980a6dc420bec..e56b1922ff21e0ab8ac0032615b82d95001081cb 100644
--- a/src/matrices/tnlEllpackMatrix.h
+++ b/src/matrices/tnlEllpackMatrix.h
@@ -36,6 +36,9 @@ class tnlEllpackMatrix : public tnlSparseMatrix< Real, Device, Index >
    typedef typename tnlSparseMatrix< RealType, DeviceType, IndexType >::ValuesVector ValuesVector;
    typedef typename tnlSparseMatrix< RealType, DeviceType, IndexType >::ColumnIndexesVector ColumnIndexesVector;
    typedef tnlEllpackMatrix< Real, Device, Index > ThisType;
+   typedef tnlEllpackMatrix< Real, tnlHost, Index > HostType;
+   typedef tnlEllpackMatrix< Real, tnlCuda, Index > CudaType;
+
 
    tnlEllpackMatrix();
 
diff --git a/src/matrices/tnlMatrixReader.h b/src/matrices/tnlMatrixReader.h
index 011def34683b5e1c369556e90424267d7403c150..a2ba34efea5f7fee523ae16226ea931d5c0f0268 100644
--- a/src/matrices/tnlMatrixReader.h
+++ b/src/matrices/tnlMatrixReader.h
@@ -22,6 +22,10 @@
 #include <core/tnlString.h>
 #include <core/vectors/tnlVector.h>
 
+template< typename Device >
+class tnlMatrixReaderDeviceDependentCode  // primary template has no members; readMtxFile comes from the tnlHost and tnlCuda specializations in tnlMatrixReader_impl.h
+{};
+
 template< typename Matrix >
 class tnlMatrixReader
 {
@@ -38,6 +42,12 @@ class tnlMatrixReader
                             Matrix& matrix,
                             bool verbose = false );
 
+   static bool readMtxFileHostMatrix( std::istream& file,
+                                      Matrix& matrix,
+                                      typename Matrix::RowLengthsVector& rowLengths,
+                                      bool verbose );
+
+
    static bool verifyMtxFile( std::istream& file,
                               const Matrix& matrix,
                               bool verbose = false );
@@ -74,10 +84,10 @@ class tnlMatrixReader
                                         IndexType& row,
                                         IndexType& column,
                                         RealType& value );
-
 };
 
 
+
 #include <implementation/matrices/tnlMatrixReader_impl.h>
 
 #endif /* TNLMATRIXREADER_H_ */
diff --git a/src/matrices/tnlMultidiagonalMatrix.h b/src/matrices/tnlMultidiagonalMatrix.h
index d0ecf615c4d9f6c15a506250e031e90804376501..702b4481002262773ade66a5bf32a662dd13ee5e 100644
--- a/src/matrices/tnlMultidiagonalMatrix.h
+++ b/src/matrices/tnlMultidiagonalMatrix.h
@@ -34,6 +34,9 @@ class tnlMultidiagonalMatrix : public tnlMatrix< Real, Device, Index >
    typedef Index IndexType;
    typedef typename tnlMatrix< Real, Device, Index >::RowLengthsVector RowLengthsVector;
    typedef tnlMultidiagonalMatrix< Real, Device, Index > ThisType;
+   typedef tnlMultidiagonalMatrix< Real, tnlHost, Index > HostType;
+   typedef tnlMultidiagonalMatrix< Real, tnlCuda, Index > CudaType;
+
 
    tnlMultidiagonalMatrix();
 
diff --git a/src/matrices/tnlSlicedEllpackMatrix.h b/src/matrices/tnlSlicedEllpackMatrix.h
index 2530868d274a8c944a06095ff24fb293f0bb2f4f..6184473d8e386787dc4af60e9492cf6f14e59ac8 100644
--- a/src/matrices/tnlSlicedEllpackMatrix.h
+++ b/src/matrices/tnlSlicedEllpackMatrix.h
@@ -54,6 +54,9 @@ class tnlSlicedEllpackMatrix : public tnlSparseMatrix< Real, Device, Index >
    typedef typename tnlSparseMatrix< RealType, DeviceType, IndexType >::ValuesVector ValuesVector;
    typedef typename tnlSparseMatrix< RealType, DeviceType, IndexType >::ColumnIndexesVector ColumnIndexesVector;
    typedef tnlSlicedEllpackMatrix< Real, Device, Index > ThisType;
+   typedef tnlSlicedEllpackMatrix< Real, tnlHost, Index > HostType;
+   typedef tnlSlicedEllpackMatrix< Real, tnlCuda, Index > CudaType;
+
 
    tnlSlicedEllpackMatrix();
 
diff --git a/src/matrices/tnlTridiagonalMatrix.h b/src/matrices/tnlTridiagonalMatrix.h
index 7bcafd79e0ae21684b87a1b3789a0ca5a492d58d..b7f77ba2fdd9dfd802ff66ac61c85cb4b7b6c729 100644
--- a/src/matrices/tnlTridiagonalMatrix.h
+++ b/src/matrices/tnlTridiagonalMatrix.h
@@ -36,6 +36,9 @@ class tnlTridiagonalMatrix : public tnlMatrix< Real, Device, Index >
    typedef Index IndexType;
    typedef typename tnlMatrix< Real, Device, Index >::RowLengthsVector RowLengthsVector;
    typedef tnlTridiagonalMatrix< Real, Device, Index > ThisType;
+   typedef tnlTridiagonalMatrix< Real, tnlHost, Index > HostType;
+   typedef tnlTridiagonalMatrix< Real, tnlCuda, Index > CudaType;
+
 
    tnlTridiagonalMatrix();
 
diff --git a/src/solvers/linear/krylov/tnlGMRESSolver.h b/src/solvers/linear/krylov/tnlGMRESSolver.h
index 0fbf069db5f47202a1dc5c8369722dc6a5c2cf5c..4ccee98ee4981307b96e099461611323f3b7c2a8 100644
--- a/src/solvers/linear/krylov/tnlGMRESSolver.h
+++ b/src/solvers/linear/krylov/tnlGMRESSolver.h
@@ -122,10 +122,11 @@ extern template class tnlGMRESSolver< tnlMultiDiagonalMatrix< double, tnlHost, l
 
 
 #ifdef HAVE_CUDA
-extern template class tnlGMRESSolver< tnlCSRMatrix< float,  tnlCuda, int > >;
+// TODO: fix this - the tnlCuda extern template declarations of tnlGMRESSolver below do not work with CUDA 5.5, so they are commented out
+/*extern template class tnlGMRESSolver< tnlCSRMatrix< float,  tnlCuda, int > >;
 extern template class tnlGMRESSolver< tnlCSRMatrix< double, tnlCuda, int > >;
 extern template class tnlGMRESSolver< tnlCSRMatrix< float,  tnlCuda, long int > >;
-extern template class tnlGMRESSolver< tnlCSRMatrix< double, tnlCuda, long int > >;
+extern template class tnlGMRESSolver< tnlCSRMatrix< double, tnlCuda, long int > >;*/
 
 /*extern template class tnlGMRESSolver< tnlEllpackMatrix< float,  tnlCuda, int > >;
 extern template class tnlGMRESSolver< tnlEllpackMatrix< double, tnlCuda, int > >;
diff --git a/tests/benchmarks/tnl-benchmark-linear-solvers.h b/tests/benchmarks/tnl-benchmark-linear-solvers.h
index 82f506dee8bf94c1b63074f9855136f146f1b8bb..1d5fa65232cb6b1f7ce24d7392796730111dc7a2 100644
--- a/tests/benchmarks/tnl-benchmark-linear-solvers.h
+++ b/tests/benchmarks/tnl-benchmark-linear-solvers.h
@@ -34,6 +34,9 @@
 #include <matrices/tnlChunkedEllpackMatrix.h>
 #include <matrices/tnlMatrixReader.h>
 #include <solvers/linear/krylov/tnlGMRESSolver.h>
+#include <solvers/linear/krylov/tnlCGSolver.h>
+#include <solvers/linear/krylov/tnlBICGStabSolver.h>
+#include <solvers/linear/krylov/tnlTFQMRSolver.h>
 #include <solvers/linear/tnlLinearResidueGetter.h>
 #include <solvers/tnlIterativeSolverMonitor.h>
 
@@ -119,6 +122,15 @@ bool resolveLinearSolver( const tnlParameterContainer& parameters )
    if( solver == "gmres" )
       return benchmarkSolver< tnlGMRESSolver< Matrix > >( parameters, matrix );
 
+   if( solver == "cg" )
+      return benchmarkSolver< tnlCGSolver< Matrix > >( parameters, matrix );
+
+   if( solver == "bicgstab" )
+      return benchmarkSolver< tnlBICGStabSolver< Matrix > >( parameters, matrix );
+
+   if( solver == "tfqmr" )
+      return benchmarkSolver< tnlTFQMRSolver< Matrix > >( parameters, matrix );
+
    cerr << "Unknown solver " << solver << "." << endl;
    return false;
 }
@@ -162,8 +174,8 @@ bool resolveDevice( const tnlParameterContainer& parameters )
    if( device == "host" )
       return resolveMatrixFormat< Real, tnlHost >( parameters );
 
-   //if( device == "cuda" )
-   //   return resolveMatrixFormat< Real, tnlCuda >( parameters );
+   if( device == "cuda" )
+      return resolveMatrixFormat< Real, tnlCuda >( parameters );
 
    cerr << "Uknown device " << device << "." << endl;
    return false;