From a950ebc32cf351d48abe726ec9de9aa3811866c4 Mon Sep 17 00:00:00 2001
From: Tomas Oberhuber <tomas.oberhuber@fjfi.cvut.cz>
Date: Tue, 4 Feb 2014 19:59:11 +0100
Subject: [PATCH] Debuging arrays for CUDA.

---
 CMakeLists.txt                                |   2 +-
 install                                       |  20 +-
 src/core/arrays/tnlMultiArray.h               |  54 +++--
 src/core/tnlTuple.h                           | 189 +++++++++++++++++-
 src/implementation/core/arrays/CMakeLists.txt |   1 +
 .../core/arrays/tnlArray_impl.cpp             |   7 +
 .../core/arrays/tnlMultiArray1D_impl.h        |  12 +-
 .../core/arrays/tnlMultiArray2D_impl.h        |  19 +-
 .../core/arrays/tnlMultiArray3D_impl.h        |  13 +-
 .../core/arrays/tnlMultiArray4D_impl.h        |  13 +-
 .../core/arrays/tnlMultiArray_impl.cpp        |   6 +-
 .../core/arrays/tnlMultiArray_impl.cu         |  43 ++++
 .../core/arrays/tnlMultiArrayTester.h         | 143 ++++++++-----
 13 files changed, 402 insertions(+), 120 deletions(-)
 create mode 100644 src/implementation/core/arrays/tnlMultiArray_impl.cu

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5b7515f5f9..620e46a82d 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -49,7 +49,7 @@ if( NOT WITH_CUDA STREQUAL "no" )
         set(BUILD_SHARED_LIBS ON)
         set(CUDA_SEPARABLE_COMPILATION ON)
         
-        set( CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-DHAVE_CUDA)
+        set( CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-DHAVE_CUDA;-keep)
         if( CUDA_ARCHITECTURE STREQUAL "2.0" )
             set( CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-DCUDA_ARCH=20)            
         endif()    
diff --git a/install b/install
index 887b902967..48296a4c21 100755
--- a/install
+++ b/install
@@ -36,14 +36,14 @@ make -j${CPUS} ${VERBOSE}
 make -j${CPUS} test
 make -j${CPUS} install
 
-#cd ../Release
-#${CMAKE} .. -DCMAKE_INSTALL_PREFIX=${HOME}/local \
-#            -DCUDA_ARCHITECTURE=${CUDA_ARCHITECTURE} \
-#            -DWITH_CUDA=${WITH_CUDA} \
-#            -DWITH_CUSPARSE=${WITH_CUSPARSE} \
-#            -DPETSC_DIR=${PETSC_DIR} \
-#            -DWITH_TEMPLATE_EXPLICIT_INSTANTIATION=${TEMPLATE_EXPLICIT_INSTANTIATION}
-#make -j${CPUS} ${VERBOSE}
-#make -j${CPUS} test
-#make -j${CPUS} install
+cd ../Release
+${CMAKE} .. -DCMAKE_INSTALL_PREFIX=${HOME}/local \
+            -DCUDA_ARCHITECTURE=${CUDA_ARCHITECTURE} \
+            -DWITH_CUDA=${WITH_CUDA} \
+            -DWITH_CUSPARSE=${WITH_CUSPARSE} \
+            -DPETSC_DIR=${PETSC_DIR} \
+            -DWITH_TEMPLATE_EXPLICIT_INSTANTIATION=${TEMPLATE_EXPLICIT_INSTANTIATION}
+make -j${CPUS} ${VERBOSE}
+make -j${CPUS} test
+make -j${CPUS} install
 
diff --git a/src/core/arrays/tnlMultiArray.h b/src/core/arrays/tnlMultiArray.h
index 5656805c66..252b5f9f5a 100644
--- a/src/core/arrays/tnlMultiArray.h
+++ b/src/core/arrays/tnlMultiArray.h
@@ -38,6 +38,9 @@ class tnlMultiArray< 1, Element, Device, Index > : public tnlArray< Element, Dev
    typedef Device DeviceType;
    typedef Index IndexType;
 
+#ifdef HAVE_CUDA
+   //__device__ __host__
+#endif
    tnlMultiArray();
 
    tnlMultiArray( const tnlString& name );
@@ -49,7 +52,7 @@ class tnlMultiArray< 1, Element, Device, Index > : public tnlArray< Element, Dev
    bool setDimensions( const tnlTuple< 1, Index >& dimensions );
 
 #ifdef HAVE_CUDA
-   __device__ __host__
+   //__device__ __host__
 #endif
    void getDimensions( Index& iSize ) const;
 
@@ -63,7 +66,7 @@ class tnlMultiArray< 1, Element, Device, Index > : public tnlArray< Element, Dev
    bool setLike( const MultiArray& v );
    
 #ifdef HAVE_CUDA
-   __device__ __host__
+   //__device__ __host__
 #endif
    Index getElementIndex( const Index i ) const;
 
@@ -123,6 +126,9 @@ class tnlMultiArray< 2, Element, Device, Index > : public tnlArray< Element, Dev
    typedef Device DeviceType;
    typedef Index IndexType;
 
+#ifdef HAVE_CUDA
+   //__device__ __host__
+#endif
    tnlMultiArray();
 
    tnlMultiArray( const tnlString& name );
@@ -134,12 +140,12 @@ class tnlMultiArray< 2, Element, Device, Index > : public tnlArray< Element, Dev
    bool setDimensions( const tnlTuple< 2, Index >& dimensions );
 
 #ifdef HAVE_CUDA
-   __device__ __host__
+   //__device__ __host__
 #endif
    void getDimensions( Index& jSize, Index& iSize ) const;
 
 #ifdef HAVE_CUDA
-   __device__ __host__
+   //__device__ __host__
 #endif
    const tnlTuple< 2, Index >& getDimensions() const;
 
@@ -148,7 +154,7 @@ class tnlMultiArray< 2, Element, Device, Index > : public tnlArray< Element, Dev
    bool setLike( const MultiArray& v );
 
 #ifdef HAVE_CUDA
-   __device__ __host__
+   //__device__ __host__
 #endif
    Index getElementIndex( const Index j, const Index i ) const;
 
@@ -167,12 +173,12 @@ class tnlMultiArray< 2, Element, Device, Index > : public tnlArray< Element, Dev
     *  (GPU device usually).
     */
 #ifdef HAVE_CUDA
-   __device__ __host__
+   //__device__ __host__
 #endif
    Element& operator()( const Index j, const Index i );
 
 #ifdef HAVE_CUDA
-   __device__ __host__
+   //__device__ __host__
 #endif
    const Element& operator()( const Index j, const Index i ) const;
 
@@ -212,6 +218,9 @@ class tnlMultiArray< 3, Element, Device, Index > : public tnlArray< Element, Dev
    typedef Device DeviceType;
    typedef Index IndexType;
 
+#ifdef HAVE_CUDA
+   //__device__ __host__
+#endif
    tnlMultiArray();
 
    tnlMultiArray( const tnlString& name );
@@ -223,12 +232,12 @@ class tnlMultiArray< 3, Element, Device, Index > : public tnlArray< Element, Dev
    bool setDimensions( const tnlTuple< 3, Index >& dimensions );
 
 #ifdef HAVE_CUDA
-   __device__ __host__
+   //__device__ __host__
 #endif
    void getDimensions( Index& k, Index& j, Index& iSize ) const;
 
 #ifdef HAVE_CUDA
-   __device__ __host__
+   //__device__ __host__
 #endif
    const tnlTuple< 3, Index >& getDimensions() const;
 
@@ -237,7 +246,7 @@ class tnlMultiArray< 3, Element, Device, Index > : public tnlArray< Element, Dev
    bool setLike( const MultiArray& v );
 
 #ifdef HAVE_CUDA
-   __device__ __host__
+   //__device__ __host__
 #endif
    Index getElementIndex( const Index k, const Index j, const Index i ) const;
 
@@ -256,12 +265,12 @@ class tnlMultiArray< 3, Element, Device, Index > : public tnlArray< Element, Dev
     *  (GPU device usualy).
     */
 #ifdef HAVE_CUDA
-   __device__ __host__
+   //__device__ __host__
 #endif
    Element& operator()( const Index k, const Index j, const Index i );
 
 #ifdef HAVE_CUDA
-   __device__ __host__
+   //__device__ __host__
 #endif
    const Element& operator()( const Index k, const Index j, const Index i ) const;
 
@@ -301,6 +310,9 @@ class tnlMultiArray< 4, Element, Device, Index > : public tnlArray< Element, Dev
    typedef Device DeviceType;
    typedef Index IndexType;
 
+#ifdef HAVE_CUDA
+   //__device__ __host__
+#endif
    tnlMultiArray();
 
    tnlMultiArray( const tnlString& name );
@@ -312,12 +324,12 @@ class tnlMultiArray< 4, Element, Device, Index > : public tnlArray< Element, Dev
    bool setDimensions( const tnlTuple< 4, Index >& dimensions );
 
 #ifdef HAVE_CUDA
-   __device__ __host__
+   //__device__ __host__
 #endif
    void getDimensions( Index& l, Index& k, Index& j, Index& iSize ) const;
 
 #ifdef HAVE_CUDA
-   __device__ __host__
+   //__device__ __host__
 #endif
    const tnlTuple< 4, Index >& getDimensions() const;
 
@@ -326,7 +338,7 @@ class tnlMultiArray< 4, Element, Device, Index > : public tnlArray< Element, Dev
    bool setLike( const MultiArray& v );
 
 #ifdef HAVE_CUDA
-   __device__ __host__
+  // __device__ __host__
 #endif
    Index getElementIndex( const Index l, const Index k, const Index j, const Index i ) const;
 
@@ -345,12 +357,12 @@ class tnlMultiArray< 4, Element, Device, Index > : public tnlArray< Element, Dev
     *  (GPU device usualy).
     */
 #ifdef HAVE_CUDA
-   __device__ __host__
+   //__device__ __host__
 #endif
    Element& operator()( const Index l, const Index k, const Index j, const Index i );
 
 #ifdef HAVE_CUDA
-   __device__ __host__
+   //__device__ __host__
 #endif
    const Element& operator()( const Index l, const Index k, const Index j, const Index i ) const;
 
@@ -417,10 +429,8 @@ extern template class tnlMultiArray< 4, double, tnlHost, int >;
 extern template class tnlMultiArray< 4, float,  tnlHost, long int >;
 extern template class tnlMultiArray< 4, double, tnlHost, long int >;
 
-#ifdef HAVE_CUDA
-#endif
-
-extern template class tnlMultiArray< 1, float,  tnlCuda, int >;
+// TODO: There are problems with nvlink - it maght be better in later versions
+/*extern template class tnlMultiArray< 1, float,  tnlCuda, int >;
 extern template class tnlMultiArray< 1, double, tnlCuda, int >;
 extern template class tnlMultiArray< 1, float,  tnlCuda, long int >;
 extern template class tnlMultiArray< 1, double, tnlCuda, long int >;
@@ -435,7 +445,7 @@ extern template class tnlMultiArray< 3, double, tnlCuda, long int >;
 extern template class tnlMultiArray< 4, float,  tnlCuda, int >;
 extern template class tnlMultiArray< 4, double, tnlCuda, int >;
 extern template class tnlMultiArray< 4, float,  tnlCuda, long int >;
-extern template class tnlMultiArray< 4, double, tnlCuda, long int >;
+extern template class tnlMultiArray< 4, double, tnlCuda, long int >;*/
 
 #endif
 
diff --git a/src/core/tnlTuple.h b/src/core/tnlTuple.h
index b971634bf9..c3137d79cd 100644
--- a/src/core/tnlTuple.h
+++ b/src/core/tnlTuple.h
@@ -33,87 +33,171 @@ class tnlTuple
    public:
    typedef Real RealType;
 
+#ifdef HAVE_CUDA
+   __host__ __device__
+#endif
    tnlTuple();
 
+#ifdef HAVE_CUDA
+   __host__ __device__
+#endif
    tnlTuple( const Real v[ Size ] );
 
    //! This sets all vector components to v
+#ifdef HAVE_CUDA
+   __host__ __device__
+#endif
    tnlTuple( const Real& v );
 
    //! Copy constructor
+#ifdef HAVE_CUDA
+   __host__ __device__
+#endif
    tnlTuple( const tnlTuple< Size, Real >& v );
 
    //! This is constructore of vector with Size = 2.
+#ifdef HAVE_CUDA
+   __host__ __device__
+#endif
    tnlTuple( const Real& v1,
               const Real& v2 );
 
-   //! This is constructore of vector with Size = 3.
+   //! This is constructore of vector with Size = 3
+#ifdef HAVE_CUDA
+   __host__ __device__
+#endif
    tnlTuple( const Real& v1,
              const Real& v2,
              const Real& v3 );
 
    static tnlString getType();
 
+#ifdef HAVE_CUDA
+   __host__ __device__
+#endif
    const Real& operator[]( int i ) const;
 
+#ifdef HAVE_CUDA
+   __host__ __device__
+#endif
    Real& operator[]( int i );
    
    //! Returns the first coordinate
+#ifdef HAVE_CUDA
+   __host__ __device__
+#endif
    Real& x();
 
    //! Returns the first coordinate
+#ifdef HAVE_CUDA
+   __host__ __device__
+#endif
    const Real& x() const;
 
    //! Returns the second coordinate
+#ifdef HAVE_CUDA
+   __host__ __device__
+#endif
    Real& y();
 
    //! Returns the second coordinate
+#ifdef HAVE_CUDA
+   __host__ __device__
+#endif
    const Real& y() const;
 
    //! Returns the third coordinate
+#ifdef HAVE_CUDA
+   __host__ __device__
+#endif
    Real& z();
 
    //! Returns the third coordinate
+#ifdef HAVE_CUDA
+   __host__ __device__
+#endif
    const Real& z() const;
 
    //! Adding operator
+#ifdef HAVE_CUDA
+   __host__ __device__
+#endif
    tnlTuple& operator += ( const tnlTuple& v );
 
    //! Subtracting operator
+#ifdef HAVE_CUDA
+   __host__ __device__
+#endif
    tnlTuple& operator -= ( const tnlTuple& v );
 
    //! Multiplication with number
+#ifdef HAVE_CUDA
+   __host__ __device__
+#endif
    tnlTuple& operator *= ( const Real& c );
 
    //! Adding operator
+#ifdef HAVE_CUDA
+   __host__ __device__
+#endif
    tnlTuple operator + ( const tnlTuple& u ) const;
 
    //! Subtracting operator
+#ifdef HAVE_CUDA
+   __host__ __device__
+#endif
    tnlTuple operator - ( const tnlTuple& u ) const;
 
    //! Multiplication with number
+#ifdef HAVE_CUDA
+   __host__ __device__
+#endif
    tnlTuple operator * ( const Real& c ) const;
 
    //! 
+#ifdef HAVE_CUDA
+   __host__ __device__
+#endif
    tnlTuple& operator = ( const tnlTuple& v );
 
    //! Scalar product
+#ifdef HAVE_CUDA
+   __host__ __device__
+#endif
    Real operator * ( const tnlTuple& u ) const;
 
    //! Comparison operator
    template< typename Real2 >
+#ifdef HAVE_CUDA
+   __host__ __device__
+#endif
    bool operator == ( const tnlTuple< Size, Real2 >& v ) const;
 
    //! Comparison operator
    template< typename Real2 >
+#ifdef HAVE_CUDA
+   __host__ __device__
+#endif
    bool operator != ( const tnlTuple< Size, Real2 >& v ) const;
 
+#ifdef HAVE_CUDA
+   __host__ __device__
+#endif
    bool operator < ( const tnlTuple& v ) const;
 
+#ifdef HAVE_CUDA
+   __host__ __device__
+#endif
    bool operator <= ( const tnlTuple& v ) const;
 
+#ifdef HAVE_CUDA
+   __host__ __device__
+#endif
    bool operator > ( const tnlTuple& v ) const;
 
+#ifdef HAVE_CUDA
+   __host__ __device__
+#endif
    bool operator >= ( const tnlTuple& v ) const;
 
    bool save( tnlFile& file ) const;
@@ -132,12 +216,18 @@ template< int Size, typename Real >
 ostream& operator << ( ostream& str, const tnlTuple< Size, Real >& v );
 
 template< int Size, typename Real >
+#ifdef HAVE_CUDA
+   __host__ __device__
+#endif
 tnlTuple< Size, Real > :: tnlTuple()
 {
    bzero( data, Size * sizeof( Real ) );
 };
 
 template< int Size, typename Real >
+#ifdef HAVE_CUDA
+   __host__ __device__
+#endif
 tnlTuple< Size, Real > :: tnlTuple( const Real v[ Size ] )
 {
    if( Size == 1 )
@@ -161,6 +251,9 @@ tnlTuple< Size, Real > :: tnlTuple( const Real v[ Size ] )
 };
 
 template< int Size, typename Real >
+#ifdef HAVE_CUDA
+   __host__ __device__
+#endif
 tnlTuple< Size, Real > :: tnlTuple( const Real& v )
 {
    if( Size == 1 )
@@ -181,6 +274,9 @@ tnlTuple< Size, Real > :: tnlTuple( const Real& v )
 };
 
 template< int Size, typename Real >
+#ifdef HAVE_CUDA
+   __host__ __device__
+#endif
 tnlTuple< Size, Real > :: tnlTuple( const tnlTuple< Size, Real >& v )
 {
    if( Size == 1 )
@@ -205,28 +301,37 @@ tnlTuple< Size, Real > :: tnlTuple( const tnlTuple< Size, Real >& v )
 
 
 template< int Size, typename Real >
+#ifdef HAVE_CUDA
+   __host__ __device__
+#endif
 tnlTuple< Size, Real > :: tnlTuple( const Real& v1,
                                     const Real& v2 )
 {
    tnlAssert( Size == 2,
-              cerr << "Using this constructor does not makes sense for Size different then 2.")
+              printf( "Using this constructor does not makes sense for Size different then 2.\n") );
    data[ 0 ] = v1;
    data[ 1 ] = v2;
 }
 
 template< int Size, typename Real >
+#ifdef HAVE_CUDA
+   __host__ __device__
+#endif
 tnlTuple< Size, Real > :: tnlTuple( const Real& v1,
                                       const Real& v2,
                                       const Real& v3 )
 {
    tnlAssert( Size == 3,
-              cerr << "Using this constructor does not makes sense for Size different then 3.")
+              printf( "Using this constructor does not makes sense for Size different then 3.\n") );
    data[ 0 ] = v1;
    data[ 1 ] = v2;
    data[ 2 ] = v3;
 }
 
 template< int Size, typename Real >
+#ifdef HAVE_CUDA
+   __host__ __device__
+#endif
 tnlString tnlTuple< Size, Real > :: getType()
 {
    return tnlString( "tnlTuple< " ) +
@@ -237,6 +342,9 @@ tnlString tnlTuple< Size, Real > :: getType()
 }
 
 template< int Size, typename Real >
+#ifdef HAVE_CUDA
+   __host__ __device__
+#endif
 const Real& tnlTuple< Size, Real > :: operator[]( int i ) const
 {
    assert( i >= 0 && i < Size );
@@ -244,6 +352,9 @@ const Real& tnlTuple< Size, Real > :: operator[]( int i ) const
 };
 
 template< int Size, typename Real >
+#ifdef HAVE_CUDA
+   __host__ __device__
+#endif
 Real& tnlTuple< Size, Real > :: operator[]( int i )
 {
    assert( i < Size );
@@ -251,48 +362,60 @@ Real& tnlTuple< Size, Real > :: operator[]( int i )
 };
 
 template< int Size, typename Real >
+#ifdef HAVE_CUDA
+   __host__ __device__
+#endif
 Real& tnlTuple< Size, Real > :: x()
 {
    tnlAssert( Size > 0, cerr << "Size = " << Size << endl; );
    if( Size < 1 )
    {
-      cerr << "The size of the tnlTuple is too small to get x coordinate." << endl;
+      printf( "The size of the tnlTuple is too small to get x coordinate.\n" );
       abort();
    }
    return data[ 0 ];
 };
 
 template< int Size, typename Real >
+#ifdef HAVE_CUDA
+   __host__ __device__
+#endif
 const Real& tnlTuple< Size, Real > :: x() const
 {
    tnlAssert( Size > 0, cerr << "Size = " << Size << endl; );
    if( Size < 1 )
    {
-      cerr << "The size of the tnlTuple is too small to get x coordinate." << endl;
+      printf( "The size of the tnlTuple is too small to get x coordinate.\n" );
       abort();
    }
    return data[ 0 ];
 };
 
 template< int Size, typename Real >
+#ifdef HAVE_CUDA
+   __host__ __device__
+#endif
 Real& tnlTuple< Size, Real > :: y()
 {
    tnlAssert( Size > 1, cerr << "Size = " << Size << endl; );
    if( Size < 2 )
    {
-      cerr << "The size of the tnlTuple is too small to get y coordinate." << endl;
+      printf( "The size of the tnlTuple is too small to get y coordinate.\n" );
       abort();
    }
    return data[ 1 ];
 };
 
 template< int Size, typename Real >
+#ifdef HAVE_CUDA
+   __host__ __device__
+#endif
 const Real& tnlTuple< Size, Real > :: y() const
 {
    tnlAssert( Size > 1, cerr << "Size = " << Size << endl; );
    if( Size < 2 )
    {
-      cerr << "The size of the tnlTuple is too small to get y coordinate." << endl;
+      printf( "The size of the tnlTuple is too small to get y coordinate.\n" );
       abort();
    }
    return data[ 1 ];
@@ -300,30 +423,39 @@ const Real& tnlTuple< Size, Real > :: y() const
 };
 
 template< int Size, typename Real >
+#ifdef HAVE_CUDA
+   __host__ __device__
+#endif
 Real& tnlTuple< Size, Real > :: z()
 {
    tnlAssert( Size > 2, cerr << "Size = " << Size << endl; );
    if( Size < 3 )
    {
-      cerr << "The size of the tnlTuple is too small to get z coordinate." << endl;
+      printf( "The size of the tnlTuple is too small to get z coordinate.\n" );
       abort();
    }
    return data[ 2 ];
 };
 
 template< int Size, typename Real >
+#ifdef HAVE_CUDA
+   __host__ __device__
+#endif
 const Real& tnlTuple< Size, Real > :: z() const
 {
    tnlAssert( Size > 2, cerr << "Size = " << Size << endl; );
    if( Size < 3 )
    {
-      cerr << "The size of the tnlTuple is too small to get z coordinate." << endl;
+      printf( "The size of the tnlTuple is too small to get z coordinate.\n" );
       abort();
    }
    return data[ 2 ];
 };
 
 template< int Size, typename Real >
+#ifdef HAVE_CUDA
+   __host__ __device__
+#endif
 tnlTuple< Size, Real >& tnlTuple< Size, Real > :: operator += ( const tnlTuple& v )
 {
    if( Size == 1 )
@@ -348,6 +480,9 @@ tnlTuple< Size, Real >& tnlTuple< Size, Real > :: operator += ( const tnlTuple&
 };
 
 template< int Size, typename Real >
+#ifdef HAVE_CUDA
+   __host__ __device__
+#endif
 tnlTuple< Size, Real >& tnlTuple< Size, Real > :: operator -= ( const tnlTuple& v )
 {
    if( Size == 1 )
@@ -372,6 +507,9 @@ tnlTuple< Size, Real >& tnlTuple< Size, Real > :: operator -= ( const tnlTuple&
 };
 
 template< int Size, typename Real >
+#ifdef HAVE_CUDA
+   __host__ __device__
+#endif
 tnlTuple< Size, Real >& tnlTuple< Size, Real > :: operator *= ( const Real& c )
 {
    if( Size == 1 )
@@ -396,6 +534,9 @@ tnlTuple< Size, Real >& tnlTuple< Size, Real > :: operator *= ( const Real& c )
 };
 
 template< int Size, typename Real >
+#ifdef HAVE_CUDA
+   __host__ __device__
+#endif
 tnlTuple< Size, Real > tnlTuple< Size, Real > :: operator + ( const tnlTuple& u ) const
 {
    // TODO: Leads to sigsegv
@@ -403,6 +544,9 @@ tnlTuple< Size, Real > tnlTuple< Size, Real > :: operator + ( const tnlTuple& u
 };
 
 template< int Size, typename Real >
+#ifdef HAVE_CUDA
+   __host__ __device__
+#endif
 tnlTuple< Size, Real > tnlTuple< Size, Real > :: operator - ( const tnlTuple& u ) const
 {
    // TODO: Leads to sigsegv
@@ -410,12 +554,18 @@ tnlTuple< Size, Real > tnlTuple< Size, Real > :: operator - ( const tnlTuple& u
 };
 
 template< int Size, typename Real >
+#ifdef HAVE_CUDA
+   __host__ __device__
+#endif
 tnlTuple< Size, Real > tnlTuple< Size, Real > :: operator * ( const Real& c ) const
 {
    return tnlTuple( * this ) *= c;
 };
 
 template< int Size, typename Real >
+#ifdef HAVE_CUDA
+   __host__ __device__
+#endif
 tnlTuple< Size, Real >& tnlTuple< Size, Real > :: operator = ( const tnlTuple& v )
 {
    memcpy( &data[ 0 ], &v. data[ 0 ], Size * sizeof( Real ) );
@@ -426,6 +576,9 @@ tnlTuple< Size, Real >& tnlTuple< Size, Real > :: operator = ( const tnlTuple& v
 };
 
 template< int Size, typename Real >
+#ifdef HAVE_CUDA
+   __host__ __device__
+#endif
 Real tnlTuple< Size, Real > :: operator * ( const tnlTuple& u ) const
 {
    if( Size == 1 )
@@ -448,6 +601,9 @@ Real tnlTuple< Size, Real > :: operator * ( const tnlTuple& u ) const
 
 template< int Size, typename Real >
 template< typename Real2 >
+#ifdef HAVE_CUDA
+   __host__ __device__
+#endif
 bool tnlTuple< Size, Real > :: operator == ( const tnlTuple< Size, Real2 >& u ) const
 {
    if( Size == 1 )
@@ -468,6 +624,9 @@ bool tnlTuple< Size, Real > :: operator == ( const tnlTuple< Size, Real2 >& u )
 
 template< int Size, typename Real >
 template< typename Real2 >
+#ifdef HAVE_CUDA
+   __host__ __device__
+#endif
 bool tnlTuple< Size, Real > :: operator != ( const tnlTuple< Size, Real2 >& u ) const
 {
    if( Size == 1 )
@@ -486,6 +645,9 @@ bool tnlTuple< Size, Real > :: operator != ( const tnlTuple< Size, Real2 >& u )
 };
 
 template< int Size, typename Real >
+#ifdef HAVE_CUDA
+   __host__ __device__
+#endif
 bool tnlTuple< Size, Real > :: operator < ( const tnlTuple& u ) const
 {
    if( Size == 1 )
@@ -507,6 +669,9 @@ bool tnlTuple< Size, Real > :: operator < ( const tnlTuple& u ) const
 }
 
 template< int Size, typename Real >
+#ifdef HAVE_CUDA
+   __host__ __device__
+#endif
 bool tnlTuple< Size, Real > :: operator <= ( const tnlTuple& u ) const
 {
    if( Size == 1 )
@@ -525,6 +690,9 @@ bool tnlTuple< Size, Real > :: operator <= ( const tnlTuple& u ) const
 }
 
 template< int Size, typename Real >
+#ifdef HAVE_CUDA
+   __host__ __device__
+#endif
 bool tnlTuple< Size, Real > :: operator > ( const tnlTuple& u ) const
 {
    if( Size == 1 )
@@ -543,6 +711,9 @@ bool tnlTuple< Size, Real > :: operator > ( const tnlTuple& u ) const
 }
 
 template< int Size, typename Real >
+#ifdef HAVE_CUDA
+   __host__ __device__
+#endif
 bool tnlTuple< Size, Real > :: operator >= ( const tnlTuple& u ) const
 {
    if( Size == 1 )
diff --git a/src/implementation/core/arrays/CMakeLists.txt b/src/implementation/core/arrays/CMakeLists.txt
index 7ffaae7c81..69b3be5c4f 100755
--- a/src/implementation/core/arrays/CMakeLists.txt
+++ b/src/implementation/core/arrays/CMakeLists.txt
@@ -19,6 +19,7 @@ IF( BUILD_CUDA )
         ${CURRENT_DIR}/tnlArrayOperationsHost_impl.cu
         ${CURRENT_DIR}/tnlArrayOperationsCuda_impl.cu
         ${CURRENT_DIR}/tnlArray_impl.cu
+        ${CURRENT_DIR}/tnlMultiArray_impl.cu
         PARENT_SCOPE )
 ELSE()
    set( common_SOURCES
diff --git a/src/implementation/core/arrays/tnlArray_impl.cpp b/src/implementation/core/arrays/tnlArray_impl.cpp
index b60b601703..6a42960582 100644
--- a/src/implementation/core/arrays/tnlArray_impl.cpp
+++ b/src/implementation/core/arrays/tnlArray_impl.cpp
@@ -24,4 +24,11 @@ template class tnlArray< double, tnlHost, int >;
 template class tnlArray< float, tnlHost, long int >;
 template class tnlArray< double, tnlHost, long int >;
 
+#ifndef HAVE_CUDA
+template class tnlArray< float, tnlCuda, int >;
+template class tnlArray< double, tnlCuda, int >;
+template class tnlArray< float, tnlCuda, long int >;
+template class tnlArray< double, tnlCuda, long int >;
+#endif
+
 #endif
diff --git a/src/implementation/core/arrays/tnlMultiArray1D_impl.h b/src/implementation/core/arrays/tnlMultiArray1D_impl.h
index 5be9b62c64..d1489cf3c1 100644
--- a/src/implementation/core/arrays/tnlMultiArray1D_impl.h
+++ b/src/implementation/core/arrays/tnlMultiArray1D_impl.h
@@ -18,9 +18,10 @@
 #ifndef TNLMULTIARRAY1D_IMPL_H_
 #define TNLMULTIARRAY1D_IMPL_H_
 
-
-
 template< typename Element, typename Device, typename Index >
+#ifdef HAVE_CUDA
+   //__device__ __host__
+#endif
 tnlMultiArray< 1, Element, Device, Index > :: tnlMultiArray()
 {
 }
@@ -72,7 +73,7 @@ bool tnlMultiArray< 1, Element, Device, Index > :: setLike( const MultiArray& mu
 
 template< typename Element, typename Device, typename Index >
 #ifdef HAVE_CUDA
-   __device__ __host__
+   //__device__ __host__
 #endif
 void tnlMultiArray< 1, Element, Device, Index > :: getDimensions( Index& xSize ) const
 {
@@ -90,13 +91,12 @@ const tnlTuple< 1, Index >& tnlMultiArray< 1, Element, Device, Index > :: getDim
 
 template< typename Element, typename Device, typename Index >
 #ifdef HAVE_CUDA
-   __device__ __host__
+   //__device__ __host__
 #endif
 Index tnlMultiArray< 1, Element, Device, Index > :: getElementIndex( const Index i ) const
 {
    tnlAssert( i >= 0 && i < this -> dimensions[ 0 ],
-              cerr << "i = " << i
-                   << "this -> dimensions[ 0 ] " << this -> dimensions[ 0 ] );
+              printf( "i = %d this -> dimensions[ 0 ] = %d \n", i, this -> dimensions[ 0 ] ) );
    return i;
 }
 
diff --git a/src/implementation/core/arrays/tnlMultiArray2D_impl.h b/src/implementation/core/arrays/tnlMultiArray2D_impl.h
index 07b89e10dd..2134dd10b9 100644
--- a/src/implementation/core/arrays/tnlMultiArray2D_impl.h
+++ b/src/implementation/core/arrays/tnlMultiArray2D_impl.h
@@ -21,6 +21,9 @@
 
 
 template< typename Element, typename Device, typename Index >
+#ifdef HAVE_CUDA
+   //__device__ __host__
+#endif
 tnlMultiArray< 2, Element, Device, Index > :: tnlMultiArray()
 {
 }
@@ -81,7 +84,7 @@ bool tnlMultiArray< 2, Element, Device, Index > :: setLike( const MultiArray& mu
 
 template< typename Element, typename Device, typename Index >
 #ifdef HAVE_CUDA
-   __device__ __host__
+   //__device__ __host__
 #endif
 void tnlMultiArray< 2, Element, Device, Index > :: getDimensions( Index& jSize, Index& iSize ) const
 {
@@ -91,7 +94,7 @@ void tnlMultiArray< 2, Element, Device, Index > :: getDimensions( Index& jSize,
 
 template< typename Element, typename Device, typename Index >
 #ifdef HAVE_CUDA
-   __device__ __host__
+   //__device__ __host__
 #endif
 const tnlTuple< 2, Index >& tnlMultiArray< 2, Element, Device, Index > :: getDimensions() const
 {
@@ -100,15 +103,13 @@ const tnlTuple< 2, Index >& tnlMultiArray< 2, Element, Device, Index > :: getDim
 
 template< typename Element, typename Device, typename Index >
 #ifdef HAVE_CUDA
-   __device__ __host__
+   //__device__ __host__
 #endif
 Index tnlMultiArray< 2, Element, Device, Index > :: getElementIndex( const Index j, const Index i ) const
 {
    tnlAssert( i >= 0 && i < this -> dimensions[ 0 ] && j >= 0 && j < this -> dimensions[ 1 ],
-              cerr << "i = " << i
-                   << "j = " << j
-                   << "this -> dimensions[ 0 ] = " << this -> dimensions[ 0 ]
-                   << "this -> dimensions[ 1 ] = " << this -> dimensions[ 1 ] );
+              printf( "i = %d j = %d this -> dimensions[ 0 ] = %d this -> dimensions[ 1 ] = %d \n",
+               i, j, this -> dimensions[ 0 ], this -> dimensions[ 1 ] ) );
    return j * this -> dimensions[ 0 ] + i;
 }
 
@@ -126,7 +127,7 @@ void tnlMultiArray< 2, Element, Device, Index > :: setElement( const Index j, co
 
 template< typename Element, typename Device, typename Index >
 #ifdef HAVE_CUDA
-   __device__ __host__
+   //__device__ __host__
 #endif
 Element& tnlMultiArray< 2, Element, Device, Index > :: operator()( const Index j, const Index i )
 {
@@ -135,7 +136,7 @@ Element& tnlMultiArray< 2, Element, Device, Index > :: operator()( const Index j
 
 template< typename Element, typename Device, typename Index >
 #ifdef HAVE_CUDA
-   __device__ __host__
+   //__device__ __host__
 #endif
 const Element& tnlMultiArray< 2, Element, Device, Index > :: operator()( const Index j, const Index i ) const
 {
diff --git a/src/implementation/core/arrays/tnlMultiArray3D_impl.h b/src/implementation/core/arrays/tnlMultiArray3D_impl.h
index 7df7e75f78..8ecbbd0086 100644
--- a/src/implementation/core/arrays/tnlMultiArray3D_impl.h
+++ b/src/implementation/core/arrays/tnlMultiArray3D_impl.h
@@ -21,6 +21,9 @@
 
 
 template< typename Element, typename Device, typename Index >
+#ifdef HAVE_CUDA
+   //__device__ __host__
+#endif
 tnlMultiArray< 3, Element, Device, Index > :: tnlMultiArray()
 {
 }
@@ -86,7 +89,7 @@ bool tnlMultiArray< 3, Element, Device, Index > :: setLike( const MultiArray& mu
 
 template< typename Element, typename Device, typename Index >
 #ifdef HAVE_CUDA
-   __device__ __host__
+   //__device__ __host__
 #endif
 void tnlMultiArray< 3, Element, Device, Index > :: getDimensions( Index& kSize,
                                                                   Index& jSize,
@@ -99,7 +102,7 @@ void tnlMultiArray< 3, Element, Device, Index > :: getDimensions( Index& kSize,
 
 template< typename Element, typename Device, typename Index >
 #ifdef HAVE_CUDA
-   __device__ __host__
+   //__device__ __host__
 #endif
 const tnlTuple< 3, Index >& tnlMultiArray< 3, Element, Device, Index > :: getDimensions() const
 {
@@ -108,7 +111,7 @@ const tnlTuple< 3, Index >& tnlMultiArray< 3, Element, Device, Index > :: getDim
 
 template< typename Element, typename Device, typename Index >
 #ifdef HAVE_CUDA
-   __device__ __host__
+   //__device__ __host__
 #endif
 Index tnlMultiArray< 3, Element, Device, Index > :: getElementIndex( const Index k,
                                                                      const Index j,
@@ -143,7 +146,7 @@ void tnlMultiArray< 3, Element, Device, Index > :: setElement( const Index k,
 
 template< typename Element, typename Device, typename Index >
 #ifdef HAVE_CUDA
-   __device__ __host__
+   //__device__ __host__
 #endif
 Element& tnlMultiArray< 3, Element, Device, Index > :: operator()( const Index k,
                                                                         const Index j,
@@ -154,7 +157,7 @@ Element& tnlMultiArray< 3, Element, Device, Index > :: operator()( const Index k
 
 template< typename Element, typename Device, typename Index >
 #ifdef HAVE_CUDA
-   __device__ __host__
+   //__device__ __host__
 #endif
 const Element& tnlMultiArray< 3, Element, Device, Index > :: operator()( const Index k,
                                                                                const Index j,
diff --git a/src/implementation/core/arrays/tnlMultiArray4D_impl.h b/src/implementation/core/arrays/tnlMultiArray4D_impl.h
index c041b5eb46..b70f78f934 100644
--- a/src/implementation/core/arrays/tnlMultiArray4D_impl.h
+++ b/src/implementation/core/arrays/tnlMultiArray4D_impl.h
@@ -21,6 +21,9 @@
 
 
 template< typename Element, typename Device, typename Index >
+#ifdef HAVE_CUDA
+   //__device__ __host__
+#endif
 tnlMultiArray< 4, Element, Device, Index > :: tnlMultiArray()
 {
 }
@@ -91,7 +94,7 @@ bool tnlMultiArray< 4, Element, Device, Index > :: setLike( const MultiArray& mu
 
 template< typename Element, typename Device, typename Index >
 #ifdef HAVE_CUDA
-   __device__ __host__
+   //__device__ __host__
 #endif
 void tnlMultiArray< 4, Element, Device, Index > :: getDimensions( Index& lSize,
                                                                        Index& kSize,
@@ -106,7 +109,7 @@ void tnlMultiArray< 4, Element, Device, Index > :: getDimensions( Index& lSize,
 
 template< typename Element, typename Device, typename Index >
 #ifdef HAVE_CUDA
-   __device__ __host__
+   //__device__ __host__
 #endif
 const tnlTuple< 4, Index >& tnlMultiArray< 4, Element, Device, Index > :: getDimensions() const
 {
@@ -115,7 +118,7 @@ const tnlTuple< 4, Index >& tnlMultiArray< 4, Element, Device, Index > :: getDim
 
 template< typename Element, typename Device, typename Index >
 #ifdef HAVE_CUDA
-   __device__ __host__
+   //__device__ __host__
 #endif
 Index tnlMultiArray< 4, Element, Device, Index > :: getElementIndex( const Index l,
                                                                      const Index k,
@@ -155,7 +158,7 @@ void tnlMultiArray< 4, Element, Device, Index > :: setElement( const Index l,
 
 template< typename Element, typename Device, typename Index >
 #ifdef HAVE_CUDA
-   __device__ __host__
+   //__device__ __host__
 #endif
 Element& tnlMultiArray< 4, Element, Device, Index > :: operator()( const Index l,
                                                                         const Index k,
@@ -167,7 +170,7 @@ Element& tnlMultiArray< 4, Element, Device, Index > :: operator()( const Index l
 
 template< typename Element, typename Device, typename Index >
 #ifdef HAVE_CUDA
-   __device__ __host__
+   //__device__ __host__
 #endif
 const Element& tnlMultiArray< 4, Element, Device, Index > :: operator()( const Index l,
                                                                                const Index k,
diff --git a/src/implementation/core/arrays/tnlMultiArray_impl.cpp b/src/implementation/core/arrays/tnlMultiArray_impl.cpp
index 5b1b46c3a4..8d6efafd08 100644
--- a/src/implementation/core/arrays/tnlMultiArray_impl.cpp
+++ b/src/implementation/core/arrays/tnlMultiArray_impl.cpp
@@ -36,8 +36,7 @@ template class tnlMultiArray< 4, double, tnlHost, int >;
 template class tnlMultiArray< 4, float,  tnlHost, long int >;
 template class tnlMultiArray< 4, double, tnlHost, long int >;
 
-#ifdef HAVE_CUDA
-#endif
+#ifndef HAVE_CUDA
 
 template class tnlMultiArray< 1, float,  tnlCuda, int >;
 template class tnlMultiArray< 1, double, tnlCuda, int >;
@@ -59,4 +58,7 @@ template class tnlMultiArray< 4, double, tnlCuda, long int >;
 #endif
 
 
+#endif
+
+
 
diff --git a/src/implementation/core/arrays/tnlMultiArray_impl.cu b/src/implementation/core/arrays/tnlMultiArray_impl.cu
new file mode 100644
index 0000000000..29ff2431e5
--- /dev/null
+++ b/src/implementation/core/arrays/tnlMultiArray_impl.cu
@@ -0,0 +1,43 @@
+/***************************************************************************
+                          tnlMultiArray_impl.cu  -  description
+                             -------------------
+    begin                : Feb 4, 2014
+    copyright            : (C) 2014 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/***************************************************************************
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ ***************************************************************************/
+
+#include <core/arrays/tnlMultiArray.h>
+
+#ifdef TEMPLATE_EXPLICIT_INSTANTIATION
+
+#ifdef HAVE_CUDA
+// TODO: There are problems with nvlink - it maght be better in later versions
+/*template class tnlMultiArray< 1, float,  tnlCuda, int >;
+template class tnlMultiArray< 1, double, tnlCuda, int >;
+template class tnlMultiArray< 1, float,  tnlCuda, long int >;
+template class tnlMultiArray< 1, double, tnlCuda, long int >;
+template class tnlMultiArray< 2, float,  tnlCuda, int >;
+template class tnlMultiArray< 2, double, tnlCuda, int >;
+template class tnlMultiArray< 2, float,  tnlCuda, long int >;
+template class tnlMultiArray< 2, double, tnlCuda, long int >;
+template class tnlMultiArray< 3, float,  tnlCuda, int >;
+template class tnlMultiArray< 3, double, tnlCuda, int >;
+template class tnlMultiArray< 3, float,  tnlCuda, long int >;
+template class tnlMultiArray< 3, double, tnlCuda, long int >;
+template class tnlMultiArray< 4, float,  tnlCuda, int >;
+template class tnlMultiArray< 4, double, tnlCuda, int >;
+template class tnlMultiArray< 4, float,  tnlCuda, long int >;
+template class tnlMultiArray< 4, double, tnlCuda, long int >;*/
+
+#endif
+
+#endif
\ No newline at end of file
diff --git a/tests/unit-tests/core/arrays/tnlMultiArrayTester.h b/tests/unit-tests/core/arrays/tnlMultiArrayTester.h
index a0b473cd7b..ca421a1ecb 100644
--- a/tests/unit-tests/core/arrays/tnlMultiArrayTester.h
+++ b/tests/unit-tests/core/arrays/tnlMultiArrayTester.h
@@ -18,6 +18,8 @@
 #ifndef TNLMULTIARRAYTESTER_H_
 #define TNLMULTIARRAYTESTER_H_
 
+#ifdef HAVE_CPPUNIT
+
 #include <cppunit/TestSuite.h>
 #include <cppunit/TestResult.h>
 #include <cppunit/TestCaller.h>
@@ -27,12 +29,35 @@
 #include <core/tnlFile.h>
 
 #ifdef HAVE_CUDA
-template< int Dimensions, typename ElementType, typename IndexType >
-__global__ void testSetGetElementKernel( tnlMultiArray< Dimensions, ElementType, tnlCuda, IndexType >* u );
-#endif
+template< typename ElementType, typename IndexType >
+__global__ void testSetGetElementKernel( tnlMultiArray< 1, ElementType, tnlCuda, IndexType >* u )
+{
+   if( threadIdx.x < ( *u ).getDimensions().x() )
+      ( *u )( threadIdx.x ) = threadIdx.x;
+}
 
+template< typename ElementType, typename IndexType >
+__global__ void testSetGetElementKernel( tnlMultiArray< 2, ElementType, tnlCuda, IndexType >* u )
+{
+   /*if( threadIdx.x < ( *u ).getDimensions().x() &&
+       threadIdx.x < ( *u ).getDimensions().y() )
+      ( *u )( threadIdx.x, threadIdx.x ) = threadIdx.x;
+      */
+}
 
-template< int Dimension, typename ElementType, typename Device, typename IndexType >
+template< typename ElementType, typename IndexType >
+__global__ void testSetGetElementKernel( tnlMultiArray< 3, ElementType, tnlCuda, IndexType >* u )
+{
+   /*if( threadIdx.x < ( *u ).getDimensions().x() &&
+       threadIdx.x < ( *u ).getDimensions().y() &&
+       threadIdx.x < ( *u ).getDimensions().z() )
+      ( *u )( threadIdx.x, threadIdx.x, threadIdx.x ) = threadIdx.x;
+      */
+}
+
+#endif /* HAVE_CUDA */
+
+template< int Dimensions, typename ElementType, typename Device, typename IndexType >
 class tnlMultiArrayTester : public CppUnit :: TestCase
 {
    public:
@@ -63,29 +88,69 @@ class tnlMultiArrayTester : public CppUnit :: TestCase
 
    void testConstructorDestructor()
    {
-      tnlMultiArray< Dimension, ElementType, Device, IndexType > u;
+      tnlMultiArray< Dimensions, ElementType, Device, IndexType > u;
    }
 
    void testSetSize()
    {
-      tnlMultiArray< Dimension, ElementType, Device, IndexType > u, v;
+      tnlMultiArray< Dimensions, ElementType, Device, IndexType > u, v;
       u. setSize( 10 );
       v. setSize( 10 );
    }
 
+   void setDiagonalElement( tnlMultiArray< 1, ElementType, Device, IndexType >& u,
+                            const IndexType& i,
+                            const ElementType& v )
+   {
+      u.setElement( i, v );
+   }
+
+   void setDiagonalElement( tnlMultiArray< 2, ElementType, Device, IndexType >& u,
+                            const IndexType& i,
+                            const ElementType& v )
+   {
+      u.setElement( i, i, v );
+   }
+
+   void setDiagonalElement( tnlMultiArray< 3, ElementType, Device, IndexType >& u,
+                            const IndexType& i,
+                            const ElementType& v )
+   {
+      u.setElement( i, i, i, v );
+   }
+   
+   IndexType getDiagonalElement( tnlMultiArray< 1, ElementType, Device, IndexType >& u,
+                                 const IndexType& i )
+   {
+      return u.getElement( i );
+   }
+   
+   IndexType getDiagonalElement( tnlMultiArray< 2, ElementType, Device, IndexType >& u,
+                                 const IndexType& i )
+   {
+      return u.getElement( i, i );
+   }
+   
+   IndexType getDiagonalElement( tnlMultiArray< 3, ElementType, Device, IndexType >& u,
+                                 const IndexType& i )
+   {
+      return u.getElement( i, i, i );
+   }
+
+
    void testSetGetElement()
    {
-      tnlMultiArray< Dimension, ElementType, Device, IndexType > u( "tnlMultiArrayTester :: u" );
+      tnlMultiArray< Dimensions, ElementType, Device, IndexType > u( "tnlMultiArrayTester :: u" );
       u. setSize( 10 );
-      if( Device::getDevice() == tnlDeviceHost )
+      if( Device::getDevice() == tnlHostDevice )
       {
          for( int i = 0; i < 10; i ++ )
-            u. setElement( i, i );
+            this->setDiagonalElement( u, i, i  );
       }
-      if( Device::getDevice() == tnlDeviceCuda )
+      if( Device::getDevice() == tnlCudaDevice )
       {
 #ifdef HAVE_CUDA
-         tnlArray< ElementType, Device, IndexType >* kernel_u =
+         tnlMultiArray< Dimensions, ElementType, Device, IndexType >* kernel_u =
                   tnlCuda::passToDevice( u );
          testSetGetElementKernel<<< 1, 16 >>>( kernel_u );
          tnlCuda::freeFromDevice( kernel_u );
@@ -93,22 +158,22 @@ class tnlMultiArrayTester : public CppUnit :: TestCase
 #endif
       }
       for( int i = 0; i < 10; i ++ )
-         CPPUNIT_ASSERT( u. getElement( i ) == i );
+         CPPUNIT_ASSERT( getDiagonalElement( u, i ) == i );
    };
 
    void testComparisonOperator()
    {
-      tnlMultiArray< Dimension, ElementType, Device, IndexType > u( "tnlMultiArrayTester :: u" );
-      tnlMultiArray< Dimension, ElementType, Device, IndexType > v( "tnlMultiArrayTester :: v" );
-      tnlMultiArray< Dimension, ElementType, Device, IndexType > w( "tnlMultiArrayTester :: w" );
+      tnlMultiArray< Dimensions, ElementType, Device, IndexType > u( "tnlMultiArrayTester :: u" );
+      tnlMultiArray< Dimensions, ElementType, Device, IndexType > v( "tnlMultiArrayTester :: v" );
+      tnlMultiArray< Dimensions, ElementType, Device, IndexType > w( "tnlMultiArrayTester :: w" );
       u. setSize( 10 );
       v. setSize( 10 );
       w. setSize( 10 );
       for( int i = 0; i < 10; i ++ )
       {
-         u. setElement( i, i );
-         v. setElement( i, i );
-         w. setElement( i, 2*1 );
+         setDiagonalElement( u, i, i );
+         setDiagonalElement( v, i, i );
+         setDiagonalElement( w, i, 2*1 );
       }
       CPPUNIT_ASSERT( u == v );
       CPPUNIT_ASSERT( ! ( u != v ) );
@@ -118,14 +183,14 @@ class tnlMultiArrayTester : public CppUnit :: TestCase
 
    void testEquivalenceOperator()
    {
-      tnlMultiArray< Dimension, ElementType, Device, IndexType > u;
-      tnlMultiArray< Dimension, ElementType, Device, IndexType > v;
+      tnlMultiArray< Dimensions, ElementType, Device, IndexType > u;
+      tnlMultiArray< Dimensions, ElementType, Device, IndexType > v;
       u. setName( "tnlMultiArrayTester :: testEquivalenceOperator :: u" );
       v. setName( "tnlMultiArrayTester :: testEquivalenceOperator :: v" );
       u. setSize( 10 );
       v. setSize( 10 );
       for( int i = 0; i < 10; i ++ )
-         u. setElement( i, i );
+         setDiagonalElement( u, i, i );
       v = u;
       //CPPUNIT_ASSERT( u == v );
       //CPPUNIT_ASSERT( ! ( u != v ) );
@@ -133,7 +198,7 @@ class tnlMultiArrayTester : public CppUnit :: TestCase
 
    void testGetSize()
    {
-      tnlMultiArray< Dimension, ElementType, Device, IndexType > u( "tnlMultiArrayTester :: testSetSize - u" );
+      tnlMultiArray< Dimensions, ElementType, Device, IndexType > u( "tnlMultiArrayTester :: testSetSize - u" );
       const int maxSize = 10;
       for( int i = 0; i < maxSize; i ++ )
          u. setSize( i );
@@ -143,7 +208,7 @@ class tnlMultiArrayTester : public CppUnit :: TestCase
 
    void testReset()
    {
-      tnlMultiArray< Dimension, ElementType, Device, IndexType > u( "tnlMultiArrayTester :: testReset - u" );
+      tnlMultiArray< Dimensions, ElementType, Device, IndexType > u( "tnlMultiArrayTester :: testReset - u" );
       u. setSize( 100 );
       CPPUNIT_ASSERT( u. getSize() == 100 );
       u. reset();
@@ -159,22 +224,22 @@ class tnlMultiArrayTester : public CppUnit :: TestCase
    {
       for( int i = 0; i < 100; i ++ )
       {
-         tnlMultiArray< Dimension, ElementType, Device, IndexType > u( "tnlMultiArrayTester :: testSetSizeAndDestructor - u" );
+         tnlMultiArray< Dimensions, ElementType, Device, IndexType > u( "tnlMultiArrayTester :: testSetSizeAndDestructor - u" );
          u. setSize( i );
       }
    }
 
    void testSaveAndLoad()
    {
-      tnlMultiArray< Dimension, ElementType, Device, IndexType > v( "test-array-v" );
+      tnlMultiArray< Dimensions, ElementType, Device, IndexType > v( "test-array-v" );
       v. setSize( 100 );
       for( int i = 0; i < 100; i ++ )
-         v. setElement( i, 3.14147 );
+         setDiagonalElement( v, i, 3.14147 );
       tnlFile file;
       file. open( "test-file.tnl", tnlWriteMode );
       v. save( file );
       file. close();
-      tnlMultiArray< Dimension, ElementType, Device, IndexType > u( "test-array-u" );
+      tnlMultiArray< Dimensions, ElementType, Device, IndexType > u( "test-array-u" );
       file. open( "test-file.tnl", tnlReadMode );
       u. load( file );
       file. close();
@@ -182,30 +247,6 @@ class tnlMultiArrayTester : public CppUnit :: TestCase
    }
 };
 
-#ifdef HAVE_CUDA
-template< typename ElementType, typename IndexType >
-__global__ void testSetGetElementKernel( tnlMultiArray< 1, ElementType, tnlCuda, IndexType >* u )
-{
-   if( threadIdx.x < ( *u ).getDimensions().x() )
-      ( *u )( threadIdx.x ) = threadIdx.x;
-}
-
-__global__ void testSetGetElementKernel( tnlMultiArray< 2, ElementType, tnlCuda, IndexType >* u )
-{
-   if( threadIdx.x < ( *u ).getDimensions().x() &&
-       threadIdx.x < ( *u ).getDimensions().y() )
-      ( *u )( threadIdx.x, threadIdx.x ) = threadIdx.x;
-}
-
-__global__ void testSetGetElementKernel( tnlMultiArray< 3, ElementType, tnlCuda, IndexType >* u )
-{
-   if( threadIdx.x < ( *u ).getDimensions().x() &&
-       threadIdx.x < ( *u ).getDimensions().y() &&
-       threadIdx.x < ( *u ).getDimensions().z() )
-      ( *u )( threadIdx.x, threadIdx.x, threadIdx.x ) = threadIdx.x;
-}
-
-#endif /* HAVE_CUDA */
 
 #else /* HAVE_CPPUNIT */
 template< int, Dimensions, typename ElementType, typename Device, typename IndexType >
-- 
GitLab