From 2b7be771b644dd5ab70d758791adf573073b3ccb Mon Sep 17 00:00:00 2001
From: Tomas Oberhuber <tomas.oberhuber@fjfi.cvut.cz>
Date: Tue, 7 May 2019 13:37:49 +0200
Subject: [PATCH] Implementing BLAS benchmark.

---
 CMakeLists.txt                             |  3 +-
 src/Benchmarks/BLAS/CMakeLists.txt         |  4 +-
 src/Benchmarks/BLAS/blasWrappers.h         | 75 +++++++-----------
 src/Benchmarks/BLAS/vector-operations.h    | 91 +++++++++++++++++++---
 src/TNL/Containers/VectorViewExpressions.h | 62 ---------------
 5 files changed, 111 insertions(+), 124 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index ccc0d9750d..2447f85403 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -320,7 +320,7 @@ endif()
 find_package( BLAS )
 if( BLAS_FOUND )
    set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DHAVE_BLAS" )
-   #message( "BLAS: ${BLAS_LIBRARIES}" )
+   set( HAVE_BLAS TRUE)
 endif()
 #if( BUILD_MPI )
 #   FIND_PATH( PETSC_INCLUDE_DIR petsc.h
@@ -455,3 +455,4 @@ message( "   CMAKE_SHARED_LINKER_FLAGS_DEBUG = ${CMAKE_SHARED_LINKER_FLAGS_DEBUG
 message( "   CMAKE_SHARED_LINKER_FLAGS_RELEASE = ${CMAKE_SHARED_LINKER_FLAGS_RELEASE}" )
 message( "   CUDA_NVCC_FLAGS = ${CUDA_NVCC_FLAGS}" )
 message( "   GMP_LIBRARIES = ${GMP_LIBRARIES}" )
+message( "   BLAS_libraries = ${BLAS_LIBRARIES}" )
diff --git a/src/Benchmarks/BLAS/CMakeLists.txt b/src/Benchmarks/BLAS/CMakeLists.txt
index 4e5a1199b7..96611d0031 100644
--- a/src/Benchmarks/BLAS/CMakeLists.txt
+++ b/src/Benchmarks/BLAS/CMakeLists.txt
@@ -4,6 +4,8 @@ if( BUILD_CUDA )
 else()
     ADD_EXECUTABLE( tnl-benchmark-blas tnl-benchmark-blas.cpp )
 endif()
-target_link_libraries( tnl-benchmark-blas ${BLAS_LIBRARIES} )
+if( HAVE_BLAS )
+   target_link_libraries( tnl-benchmark-blas ${BLAS_LIBRARIES} )
+endif()
 
 install( TARGETS tnl-benchmark-blas RUNTIME DESTINATION bin )
diff --git a/src/Benchmarks/BLAS/blasWrappers.h b/src/Benchmarks/BLAS/blasWrappers.h
index c54981af45..d1e0edff11 100644
--- a/src/Benchmarks/BLAS/blasWrappers.h
+++ b/src/Benchmarks/BLAS/blasWrappers.h
@@ -1,7 +1,5 @@
 #pragma once
 
-#ifdef HAVE_CUDA
-
 #ifdef HAVE_BLAS
 
 #include <cblas.h>
@@ -17,15 +15,15 @@ inline int blasIgamax( int n, const double *x, int incx )
 }
 
 
-inline int blasIgamin( int n, const float *x, int incx )
+/*inline int blasIgamin( int n, const float *x, int incx )
 {
-   return cblas_Isamin( n, x, incx );
+   return cblas_isamin( n, x, incx );
 }
 
 inline int blasIgamin( int n, const double *x, int incx )
 {
-   return cblas_Idamin( n, x, incx );
-}
+   return cblas_idamin( n, x, incx );
+}*/
 
 
 inline float blasGasum( int n, const float *x, int incx )
@@ -39,73 +37,54 @@ inline double blasGasum( int n, const double *x, int incx )
 }
 
 
-inline void
-blasGaxpy( int n, const float *alpha, 
-           const float *x, int incx,
-           float *y, int incy )
+inline void blasGaxpy( int n, const float alpha,
+                       const float *x, int incx,
+                       float *y, int incy )
 {
    cblas_saxpy( n, alpha, x, incx, y, incy );
 }
 
-inline blasStatus_t
-blasGaxpy( blasHandle_t int n,
-             const double          *alpha,
-             const double          *x, int incx,
-             double                *y, int incy )
+inline void blasGaxpy( int n, const double alpha,
+                       const double* x, int incx,
+                       double *y, int incy )
 {
-   return cblas_Daxpy( n, alpha, x, incx, y, incy );
+   cblas_daxpy( n, alpha, x, incx, y, incy );
 }
 
 
-inline blasStatus_t
-blasGdot( blasHandle_t int n,
-            const float        *x, int incx,
-            const float        *y, int incy,
-            float         *result )
+inline float blasGdot( int n, const float* x, int incx,
+                       const float* y, int incy )
 {
-   return cblas_Sdot( n, x, incx, y, incy, result );
+   return cblas_sdot( n, x, incx, y, incy );
 }
 
-inline blasStatus_t
-blasGdot( blasHandle_t int n,
-            const double       *x, int incx,
-            const double       *y, int incy,
-            double        *result )
+inline double blasGdot( int n, const double* x, int incx,
+                        const double* y, int incy )
 {
-   return cblas_Ddot( n, x, incx, y, incy, result );
+   return cblas_ddot( n, x, incx, y, incy );
 }
 
 
-inline blasStatus_t
-blasGnrm2( blasHandle_t int n,
-             const float           *x, int incx, float  *result )
+inline float blasGnrm2( int n, const float* x, int incx )
 {
-   return cblas_Snrm2( n, x, incx, result );
+   return cblas_snrm2( n, x, incx );
 }
 
-inline blasStatus_t
-blasGnrm2( blasHandle_t int n,
-             const double          *x, int incx, double *result )
+inline double blasGnrm2( int n, const double* x, int incx )
 {
-   return cblas_Dnrm2( n, x, incx, result );
+   return cblas_dnrm2( n, x, incx );
 }
 
 
-inline blasStatus_t
-blasGscal( blasHandle_t int n,
-             const float           *alpha,
-             float           *x, int incx )
+inline void blasGscal( int n, const float alpha,
+                       float* x, int incx )
 {
-   return cblas_Sscal( n, alpha, x, incx );
+   cblas_sscal( n, alpha, x, incx );
 }
 
-inline blasStatus_t
-blasGscal( blasHandle_t int n,
-             const double          *alpha,
-             double          *x, int incx )
+inline void blasGscal( int n, const double alpha,
+                       double* x, int incx )
 {
-   return cblas_Dscal( n, alpha, x, incx );
+   cblas_dscal( n, alpha, x, incx );
 }
-
-#endif
 #endif
diff --git a/src/Benchmarks/BLAS/vector-operations.h b/src/Benchmarks/BLAS/vector-operations.h
index a732726db8..32c843b432 100644
--- a/src/Benchmarks/BLAS/vector-operations.h
+++ b/src/Benchmarks/BLAS/vector-operations.h
@@ -18,7 +18,9 @@
 
 #include <TNL/Containers/Vector.h>
 
+#ifdef HAVE_BLAS
 #include "blasWrappers.h"
+#endif
 
 #ifdef HAVE_CUDA
 #include "cublasWrappers.h"
@@ -87,10 +89,10 @@ benchmarkVectorOperations( Benchmark & benchmark,
       reset2();
    };
 
-
    reset12();
 
-
+   ////
+   // Max
    auto maxHost = [&]() {
       resultHost = hostVector.max();
    };
@@ -112,7 +114,8 @@ benchmarkVectorOperations( Benchmark & benchmark,
    benchmark.time< Devices::Cuda >( reset1, "GPU ET", maxCudaET );
 #endif
 
-
+   ////
+   // Min
    auto minHost = [&]() {
       resultHost = hostVector.min();
    };
@@ -133,7 +136,8 @@ benchmarkVectorOperations( Benchmark & benchmark,
    benchmark.time< Devices::Cuda >( reset1, "GPU", minCudaET );
 #endif
 
-
+   ////
+   // Absmax
    auto absMaxHost = [&]() {
       resultHost = hostVector.absMax();
    };
@@ -146,6 +150,12 @@ benchmarkVectorOperations( Benchmark & benchmark,
    auto absMaxCudaET = [&]() {
       resultDevice = max( abs( deviceView ) );
    };
+#ifdef HAVE_BLAS
+   auto absMaxBlas = [&]() {
+      int index = blasIgamax( size, hostVector.getData(), 1 );
+      resultHost = hostVector.getElement( index );
+   };
+#endif
 #ifdef HAVE_CUDA
    auto absMaxCublas = [&]() {
       int index = 0;
@@ -158,13 +168,15 @@ benchmarkVectorOperations( Benchmark & benchmark,
    benchmark.setOperation( "absMax", datasetSize );
    benchmark.time< Devices::Host >( reset1, "CPU", absMaxHost );
    benchmark.time< Devices::Host >( reset1, "CPU ET", absMaxHostET );
+   benchmark.time< Devices::Host >( reset1, "BLAS", absMaxBlas );
 #ifdef HAVE_CUDA
    benchmark.time< Devices::Cuda >( reset1, "GPU", absMaxCuda );
    benchmark.time< Devices::Cuda >( reset1, "GPU ET", absMaxCudaET );
    benchmark.time< Devices::Cuda >( reset1, "cuBLAS", absMaxCublas );
 #endif
 
-
+   ////
+   // Absmin
    auto absMinHost = [&]() {
       resultHost = hostVector.absMin();
    };
@@ -177,6 +189,12 @@ benchmarkVectorOperations( Benchmark & benchmark,
    auto absMinCudaET = [&]() {
       resultDevice = min( abs( deviceView ) );
    };
+/*#ifdef HAVE_BLAS
+   auto absMinBlas = [&]() {
+      int index = blasIgamin( size, hostVector.getData(), 1 );
+      resultHost = hostVector.getElement( index );
+   };
+#endif*/
 #ifdef HAVE_CUDA
    auto absMinCublas = [&]() {
       int index = 0;
@@ -188,14 +206,16 @@ benchmarkVectorOperations( Benchmark & benchmark,
 #endif
    benchmark.setOperation( "absMin", datasetSize );
    benchmark.time< Devices::Host >( reset1, "CPU", absMinHost );
-   benchmark.time< Devices::Host >( reset1, "CPU", absMinHostET );
+   benchmark.time< Devices::Host >( reset1, "CPU ET", absMinHostET );
+   //benchmark.time< Devices::Host >( reset1, "BLAS", absMinBlas );
 #ifdef HAVE_CUDA
    benchmark.time< Devices::Cuda >( reset1, "GPU", absMinCuda );
    benchmark.time< Devices::Cuda >( reset1, "GPU", absMinCudaET );
    benchmark.time< Devices::Cuda >( reset1, "cuBLAS", absMinCublas );
 #endif
 
-
+   ////
+   // Sum
    auto sumHost = [&]() {
       resultHost = hostVector.sum();
    };
@@ -216,7 +236,8 @@ benchmarkVectorOperations( Benchmark & benchmark,
    benchmark.time< Devices::Cuda >( reset1, "GPU", sumCudaET );
 #endif
 
-
+   ////
+   // L1 norm
    auto l1normHost = [&]() {
       resultHost = hostVector.lpNorm( 1.0 );
    };
@@ -245,7 +266,8 @@ benchmarkVectorOperations( Benchmark & benchmark,
    benchmark.time< Devices::Cuda >( reset1, "cuBLAS", l1normCublas );
 #endif
 
-
+   ////
+   // L2 norm
    auto l2normHost = [&]() {
       resultHost = hostVector.lpNorm( 2.0 );
    };
@@ -258,6 +280,11 @@ benchmarkVectorOperations( Benchmark & benchmark,
    auto l2normCudaET = [&]() {
       resultDevice = lpNorm( deviceView, 2.0 );
    };
+#ifdef HAVE_BLAS
+   auto l2normBlas = [&]() {
+      resultHost = blasGnrm2( size, hostVector.getData(), 1 );
+   };
+#endif
 #ifdef HAVE_CUDA
    auto l2normCublas = [&]() {
       cublasGnrm2( cublasHandle, size,
@@ -268,13 +295,15 @@ benchmarkVectorOperations( Benchmark & benchmark,
    benchmark.setOperation( "l2 norm", datasetSize );
    benchmark.time< Devices::Host >( reset1, "CPU", l2normHost );
    benchmark.time< Devices::Host >( reset1, "CPU", l2normHostET );
+   benchmark.time< Devices::Host >( reset1, "BLAS", l2normBlas );
 #ifdef HAVE_CUDA
    benchmark.time< Devices::Cuda >( reset1, "GPU", l2normCuda );
    benchmark.time< Devices::Cuda >( reset1, "GPU", l2normCudaET );
    benchmark.time< Devices::Cuda >( reset1, "cuBLAS", l2normCublas );
 #endif
 
-
+   ////
+   // L3 norm
    auto l3normHost = [&]() {
       resultHost = hostVector.lpNorm( 3.0 );
    };
@@ -296,13 +325,26 @@ benchmarkVectorOperations( Benchmark & benchmark,
    benchmark.time< Devices::Cuda >( reset1, "GPU", l3normCudaET );
 #endif
 
-
+   ////
+   // Scalar product
    auto scalarProductHost = [&]() {
       resultHost = hostVector.scalarProduct( hostVector2 );
    };
    auto scalarProductCuda = [&]() {
       resultDevice = deviceVector.scalarProduct( deviceVector2 );
    };
+   auto scalarProductHostET = [&]() {
+      resultHost = ( hostView, hostView2 );
+   };
+   auto scalarProductCudaET = [&]() {
+      resultDevice = ( deviceView, deviceView2 );
+   };
+
+#ifdef HAVE_BLAS
+   auto scalarProductBlas = [&]() {
+      resultHost = blasGdot( size, hostVector.getData(), 1, hostVector2.getData(), 1 );
+   };
+#endif
 #ifdef HAVE_CUDA
    auto scalarProductCublas = [&]() {
       cublasGdot( cublasHandle, size,
@@ -313,11 +355,16 @@ benchmarkVectorOperations( Benchmark & benchmark,
 #endif
    benchmark.setOperation( "scalar product", 2 * datasetSize );
    benchmark.time< Devices::Host >( reset1, "CPU", scalarProductHost );
+   benchmark.time< Devices::Host >( reset1, "CPU ET", scalarProductHostET );
+   benchmark.time< Devices::Host >( reset1, "BLAS", scalarProductBlas );
 #ifdef HAVE_CUDA
    benchmark.time< Devices::Cuda >( reset1, "GPU", scalarProductCuda );
+   benchmark.time< Devices::Cuda >( reset1, "GPU ET", scalarProductCudaET );
    benchmark.time< Devices::Cuda >( reset1, "cuBLAS", scalarProductCublas );
 #endif
 
+   ////
+   // Prefix sum
    /*
    std::cout << "Benchmarking prefix-sum:" << std::endl;
    timer.reset();
@@ -348,6 +395,8 @@ benchmarkVectorOperations( Benchmark & benchmark,
    */
 
 
+   ////
+   // Scalar multiplication
    auto multiplyHost = [&]() {
       hostVector *= 0.5;
    };
@@ -369,13 +418,28 @@ benchmarkVectorOperations( Benchmark & benchmark,
    benchmark.time< Devices::Cuda >( reset1, "cuBLAS", multiplyCublas );
 #endif
 
-
+   ////
+   // Vector addition
    auto addVectorHost = [&]() {
       hostVector.addVector( hostVector2 );
    };
    auto addVectorCuda = [&]() {
       deviceVector.addVector( deviceVector2 );
    };
+   auto addVectorHostET = [&]() {
+      hostView += hostView2;
+   };
+   auto addVectorCudaET = [&]() {
+      deviceView += deviceView2;
+   };
+#ifdef HAVE_CUDA
+   auto addVectorBlas = [&]() {
+      const Real alpha = 1.0;
+      blasGaxpy( size, alpha,
+                 deviceVector2.getData(), 1,
+                  deviceVector.getData(), 1 );
+   };
+#endif
 #ifdef HAVE_CUDA
    auto addVectorCublas = [&]() {
       const Real alpha = 1.0;
@@ -387,8 +451,11 @@ benchmarkVectorOperations( Benchmark & benchmark,
 #endif
    benchmark.setOperation( "vector addition", 3 * datasetSize );
    benchmark.time< Devices::Host >( reset1, "CPU", addVectorHost );
+   benchmark.time< Devices::Host >( reset1, "CPU ET", addVectorHostET );
+   benchmark.time< Devices::Host >( reset1, "BLAS", addVectorBlas );
 #ifdef HAVE_CUDA
    benchmark.time< Devices::Cuda >( reset1, "GPU", addVectorCuda );
+   benchmark.time< Devices::Cuda >( reset1, "GPU ET", addVectorCudaET );
    benchmark.time< Devices::Cuda >( reset1, "cuBLAS", addVectorCublas );
 #endif
 
diff --git a/src/TNL/Containers/VectorViewExpressions.h b/src/TNL/Containers/VectorViewExpressions.h
index f83641f2f2..b57089fd7b 100644
--- a/src/TNL/Containers/VectorViewExpressions.h
+++ b/src/TNL/Containers/VectorViewExpressions.h
@@ -24,7 +24,6 @@ namespace TNL {
 ////
 // Addition
 template< typename Real, typename Device, typename Index, typename ET >
-__cuda_callable__
 const Containers::Expressions::BinaryExpressionTemplate< Containers::VectorView< Real, Device, Index >, ET, Containers::Expressions::Addition >
 operator+( const Containers::VectorView< Real, Device, Index >& a, const ET& b )
 {
@@ -32,7 +31,6 @@ operator+( const Containers::VectorView< Real, Device, Index >& a, const ET& b )
 }
 
 template< typename ET, typename Real, typename Device, typename Index >
-__cuda_callable__
 const Containers::Expressions::BinaryExpressionTemplate< ET, Containers::VectorView< Real, Device, Index >, Containers::Expressions::Addition >
 operator+( const ET& a, const Containers::VectorView< Real, Device, Index >& b )
 {
@@ -40,7 +38,6 @@ operator+( const ET& a, const Containers::VectorView< Real, Device, Index >& b )
 }
 
 template< typename Real1, typename Real2, typename Device, typename Index >
-__cuda_callable__
 const Containers::Expressions::BinaryExpressionTemplate< Containers::VectorView< Real1, Device, Index >, Containers::VectorView< Real2, Device, Index >, Containers::Expressions::Addition >
 operator+( const Containers::VectorView< Real1, Device, Index >& a, const Containers::VectorView< Real2, Device, Index >& b )
 {
@@ -50,7 +47,6 @@ operator+( const Containers::VectorView< Real1, Device, Index >& a, const Contai
 ////
 // Subtraction
 template< typename Real, typename Device, typename Index, typename ET >
-__cuda_callable__
 const Containers::Expressions::BinaryExpressionTemplate< Containers::VectorView< Real, Device, Index >, ET, Containers::Expressions::Subtraction >
 operator-( const Containers::VectorView< Real, Device, Index >& a, const ET& b )
 {
@@ -58,7 +54,6 @@ operator-( const Containers::VectorView< Real, Device, Index >& a, const ET& b )
 }
 
 template< typename ET, typename Real, typename Device, typename Index >
-__cuda_callable__
 const Containers::Expressions::BinaryExpressionTemplate< ET, Containers::VectorView< Real, Device, Index >, Containers::Expressions::Subtraction >
 operator-( const ET& a, const Containers::VectorView< Real, Device, Index >& b )
 {
@@ -66,7 +61,6 @@ operator-( const ET& a, const Containers::VectorView< Real, Device, Index >& b )
 }
 
 template< typename Real1, typename Real2, typename Device, typename Index >
-__cuda_callable__
 const Containers::Expressions::BinaryExpressionTemplate< Containers::VectorView< Real1, Device, Index >, Containers::VectorView< Real2, Device, Index >, Containers::Expressions::Subtraction >
 operator-( const Containers::VectorView< Real1, Device, Index >& a, const Containers::VectorView< Real2, Device, Index >& b )
 {
@@ -76,7 +70,6 @@ operator-( const Containers::VectorView< Real1, Device, Index >& a, const Contai
 ////
 // Multiplication
 template< typename Real, typename Device, typename Index, typename ET >
-__cuda_callable__
 const Containers::Expressions::BinaryExpressionTemplate< Containers::VectorView< Real, Device, Index >, ET, Containers::Expressions::Multiplication >
 operator*( const Containers::VectorView< Real, Device, Index >& a, const ET& b )
 {
@@ -84,7 +77,6 @@ operator*( const Containers::VectorView< Real, Device, Index >& a, const ET& b )
 }
 
 template< typename ET, typename Real, typename Device, typename Index >
-__cuda_callable__
 const Containers::Expressions::BinaryExpressionTemplate< ET, Containers::VectorView< Real, Device, Index >, Containers::Expressions::Multiplication >
 operator*( const ET& a, const Containers::VectorView< Real, Device, Index >& b )
 {
@@ -92,7 +84,6 @@ operator*( const ET& a, const Containers::VectorView< Real, Device, Index >& b )
 }
 
 template< typename Real1, typename Real2, typename Device, typename Index >
-__cuda_callable__
 const Containers::Expressions::BinaryExpressionTemplate< Containers::VectorView< Real1, Device, Index >, Containers::VectorView< Real2, Device, Index >, Containers::Expressions::Multiplication >
 operator*( const Containers::VectorView< Real1, Device, Index >& a, const Containers::VectorView< Real2, Device, Index >& b )
 {
@@ -102,7 +93,6 @@ operator*( const Containers::VectorView< Real1, Device, Index >& a, const Contai
 ////
 // Division
 template< typename Real, typename Device, typename Index, typename ET >
-__cuda_callable__
 const Containers::Expressions::BinaryExpressionTemplate< Containers::VectorView< Real, Device, Index >, ET, Containers::Expressions::Division >
 operator/( const Containers::VectorView< Real, Device, Index >& a, const ET& b )
 {
@@ -110,7 +100,6 @@ operator/( const Containers::VectorView< Real, Device, Index >& a, const ET& b )
 }
 
 template< typename ET, typename Real, typename Device, typename Index >
-__cuda_callable__
 const Containers::Expressions::BinaryExpressionTemplate< ET, Containers::VectorView< Real, Device, Index >, Containers::Expressions::Division >
 operator/( const ET& a, const Containers::VectorView< Real, Device, Index >& b )
 {
@@ -118,7 +107,6 @@ operator/( const ET& a, const Containers::VectorView< Real, Device, Index >& b )
 }
 
 template< typename Real1, typename Real2, typename Device, typename Index >
-__cuda_callable__
 const Containers::Expressions::BinaryExpressionTemplate< Containers::VectorView< Real1, Device, Index >, Containers::VectorView< Real2, Device, Index >, Containers::Expressions::Division >
 operator/( const Containers::VectorView< Real1, Device, Index >& a, const Containers::VectorView< Real2, Device, Index >& b )
 {
@@ -128,7 +116,6 @@ operator/( const Containers::VectorView< Real1, Device, Index >& a, const Contai
 ////
 // Min
 template< typename Real, typename Device, typename Index, typename ET >
-__cuda_callable__
 const Containers::Expressions::BinaryExpressionTemplate< Containers::VectorView< Real, Device, Index >, ET, Containers::Expressions::Min >
 min( const Containers::VectorView< Real, Device, Index >& a, const ET& b )
 {
@@ -136,7 +123,6 @@ min( const Containers::VectorView< Real, Device, Index >& a, const ET& b )
 }
 
 template< typename ET, typename Real, typename Device, typename Index >
-__cuda_callable__
 const Containers::Expressions::BinaryExpressionTemplate< ET, Containers::VectorView< Real, Device, Index >, Containers::Expressions::Min >
 min( const ET& a, const Containers::VectorView< Real, Device, Index >& b )
 {
@@ -144,7 +130,6 @@ min( const ET& a, const Containers::VectorView< Real, Device, Index >& b )
 }
 
 template< typename Real1, typename Real2, typename Device, typename Index >
-__cuda_callable__
 const Containers::Expressions::BinaryExpressionTemplate< Containers::VectorView< Real1, Device, Index >, Containers::VectorView< Real2, Device, Index >, Containers::Expressions::Min >
 min( const Containers::VectorView< Real1, Device, Index >& a, const Containers::VectorView< Real2, Device, Index >& b )
 {
@@ -154,7 +139,6 @@ min( const Containers::VectorView< Real1, Device, Index >& a, const Containers::
 ////
 // Max
 template< typename Real, typename Device, typename Index, typename ET >
-__cuda_callable__
 const Containers::Expressions::BinaryExpressionTemplate< Containers::VectorView< Real, Device, Index >, ET, Containers::Expressions::Max >
 max( const Containers::VectorView< Real, Device, Index >& a, const ET& b )
 {
@@ -162,7 +146,6 @@ max( const Containers::VectorView< Real, Device, Index >& a, const ET& b )
 }
 
 template< typename ET, typename Real, typename Device, typename Index >
-__cuda_callable__
 const Containers::Expressions::BinaryExpressionTemplate< ET, Containers::VectorView< Real, Device, Index >, Containers::Expressions::Max >
 max( const ET& a, const Containers::VectorView< Real, Device, Index >& b )
 {
@@ -170,7 +153,6 @@ max( const ET& a, const Containers::VectorView< Real, Device, Index >& b )
 }
 
 template< typename Real1, typename Real2, typename Device, typename Index >
-__cuda_callable__
 const Containers::Expressions::BinaryExpressionTemplate< Containers::VectorView< Real1, Device, Index >, Containers::VectorView< Real2, Device, Index >, Containers::Expressions::Max >
 max( const Containers::VectorView< Real1, Device, Index >& a, const Containers::VectorView< Real2, Device, Index >& b )
 {
@@ -180,21 +162,18 @@ max( const Containers::VectorView< Real1, Device, Index >& a, const Containers::
 ////
 // Comparison operations - operator ==
 template< typename Real, typename Device, typename Index, typename ET >
-__cuda_callable__
 bool operator==( const Containers::VectorView< Real, Device, Index >& a, const ET& b )
 {
    return Containers::Expressions::ComparisonEQ( a, b );
 }
 
 template< typename ET, typename Real, typename Device, typename Index >
-__cuda_callable__
 bool operator==( const ET& a, const Containers::VectorView< Real, Device, Index >& b )
 {
    return Containers::Expressions::ComparisonEQ( a, b );
 }
 
 template< typename Real1, typename Real2, typename Device, typename Index >
-__cuda_callable__
 bool operator==( const Containers::VectorView< Real1, Device, Index >& a, const Containers::VectorView< Real2, Device, Index >& b )
 {
    return Containers::Expressions::ComparisonEQ( a, b );
@@ -203,21 +182,18 @@ bool operator==( const Containers::VectorView< Real1, Device, Index >& a, const
 ////
 // Comparison operations - operator !=
 template< typename Real, typename Device, typename Index, typename ET >
-__cuda_callable__
 bool operator!=( const Containers::VectorView< Real, Device, Index >& a, const ET& b )
 {
    return Containers::Expressions::ComparisonNE( a, b );
 }
 
 template< typename ET, typename Real, typename Device, typename Index >
-__cuda_callable__
 bool operator!=( const ET& a, const Containers::VectorView< Real, Device, Index >& b )
 {
    return Containers::Expressions::ComparisonNE( a, b );
 }
 
 template< typename Real1, typename Real2, typename Device, typename Index >
-__cuda_callable__
 bool operator!=( const Containers::VectorView< Real1, Device, Index >& a, const Containers::VectorView< Real2, Device, Index >& b )
 {
    return Containers::Expressions::ComparisonNE( a, b );
@@ -226,21 +202,18 @@ bool operator!=( const Containers::VectorView< Real1, Device, Index >& a, const
 ////
 // Comparison operations - operator <
 template< typename Real, typename Device, typename Index, typename ET >
-__cuda_callable__
 bool operator<( const Containers::VectorView< Real, Device, Index >& a, const ET& b )
 {
    return Containers::Expressions::ComparisonLT( a, b );
 }
 
 template< typename ET, typename Real, typename Device, typename Index >
-__cuda_callable__
 bool operator<( const ET& a, const Containers::VectorView< Real, Device, Index >& b )
 {
    return Containers::Expressions::ComparisonLT( a, b );
 }
 
 template< typename Real1, typename Real2, typename Device, typename Index >
-__cuda_callable__
 bool operator<( const Containers::VectorView< Real1, Device, Index >& a, const Containers::VectorView< Real2, Device, Index >& b )
 {
    return Containers::Expressions::ComparisonLT( a, b );
@@ -249,21 +222,18 @@ bool operator<( const Containers::VectorView< Real1, Device, Index >& a, const C
 ////
 // Comparison operations - operator <=
 template< typename Real, typename Device, typename Index, typename ET >
-__cuda_callable__
 bool operator<=( const Containers::VectorView< Real, Device, Index >& a, const ET& b )
 {
    return Containers::Expressions::ComparisonLE( a, b );
 }
 
 template< typename ET, typename Real, typename Device, typename Index >
-__cuda_callable__
 bool operator<=( const ET& a, const Containers::VectorView< Real, Device, Index >& b )
 {
    return Containers::Expressions::ComparisonLE( a, b );
 }
 
 template< typename Real1, typename Real2, typename Device, typename Index >
-__cuda_callable__
 bool operator<=( const Containers::VectorView< Real1, Device, Index >& a, const Containers::VectorView< Real2, Device, Index >& b )
 {
    return Containers::Expressions::ComparisonLE( a, b );
@@ -272,21 +242,18 @@ bool operator<=( const Containers::VectorView< Real1, Device, Index >& a, const
 ////
 // Comparison operations - operator >
 template< typename Real, typename Device, typename Index, typename ET >
-__cuda_callable__
 bool operator>( const Containers::VectorView< Real, Device, Index >& a, const ET& b )
 {
    return Containers::Expressions::ComparisonGT( a, b );
 }
 
 template< typename ET, typename Real, typename Device, typename Index >
-__cuda_callable__
 bool operator>( const ET& a, const Containers::VectorView< Real, Device, Index >& b )
 {
    return Containers::Expressions::ComparisonGT( a, b );
 }
 
 template< typename Real1, typename Real2, typename Device, typename Index >
-__cuda_callable__
 bool operator>( const Containers::VectorView< Real1, Device, Index >& a, const Containers::VectorView< Real2, Device, Index >& b )
 {
    return Containers::Expressions::ComparisonGT( a, b );
@@ -295,21 +262,18 @@ bool operator>( const Containers::VectorView< Real1, Device, Index >& a, const C
 ////
 // Comparison operations - operator >=
 template< typename Real, typename Device, typename Index, typename ET >
-__cuda_callable__
 bool operator>=( const Containers::VectorView< Real, Device, Index >& a, const ET& b )
 {
    return Containers::Expressions::ComparisonGE( a, b );
 }
 
 template< typename ET, typename Real, typename Device, typename Index >
-__cuda_callable__
 bool operator>=( const ET& a, const Containers::VectorView< Real, Device, Index >& b )
 {
    return Containers::Expressions::ComparisonGE( a, b );
 }
 
 template< typename Real1, typename Real2, typename Device, typename Index >
-__cuda_callable__
 bool operator>=( const Containers::VectorView< Real1, Device, Index >& a, const Containers::VectorView< Real2, Device, Index >& b )
 {
    return Containers::Expressions::ComparisonGE( a, b );
@@ -318,7 +282,6 @@ bool operator>=( const Containers::VectorView< Real1, Device, Index >& a, const
 ////
 // Minus
 template< typename Real, typename Device, typename Index >
-__cuda_callable__
 const Containers::Expressions::UnaryExpressionTemplate< Containers::VectorView< Real, Device, Index >, Containers::Expressions::Minus >
 operator-( const Containers::VectorView< Real, Device, Index >& a )
 {
@@ -328,7 +291,6 @@ operator-( const Containers::VectorView< Real, Device, Index >& a )
 ////
 // Abs
 template< typename Real, typename Device, typename Index >
-__cuda_callable__
 const Containers::Expressions::UnaryExpressionTemplate< Containers::VectorView< Real, Device, Index >, Containers::Expressions::Abs >
 abs( const Containers::VectorView< Real, Device, Index >& a )
 {
@@ -338,7 +300,6 @@ abs( const Containers::VectorView< Real, Device, Index >& a )
 ////
 // Sine
 template< typename Real, typename Device, typename Index >
-__cuda_callable__
 const Containers::Expressions::UnaryExpressionTemplate< Containers::VectorView< Real, Device, Index >, Containers::Expressions::Sin >
 sin( const Containers::VectorView< Real, Device, Index >& a )
 {
@@ -348,7 +309,6 @@ sin( const Containers::VectorView< Real, Device, Index >& a )
 ////
 // Cosine
 template< typename Real, typename Device, typename Index >
-__cuda_callable__
 const Containers::Expressions::UnaryExpressionTemplate< Containers::VectorView< Real, Device, Index >, Containers::Expressions::Cos >
 cos( const Containers::VectorView< Real, Device, Index >& a )
 {
@@ -358,7 +318,6 @@ cos( const Containers::VectorView< Real, Device, Index >& a )
 ////
 // Tangent
 template< typename Real, typename Device, typename Index >
-__cuda_callable__
 const Containers::Expressions::UnaryExpressionTemplate< Containers::VectorView< Real, Device, Index >, Containers::Expressions::Tan >
 tan( const Containers::VectorView< Real, Device, Index >& a )
 {
@@ -368,7 +327,6 @@ tan( const Containers::VectorView< Real, Device, Index >& a )
 ////
 // Sqrt
 template< typename Real, typename Device, typename Index >
-__cuda_callable__
 const Containers::Expressions::UnaryExpressionTemplate< Containers::VectorView< Real, Device, Index >, Containers::Expressions::Sqrt >
 sqrt( const Containers::VectorView< Real, Device, Index >& a )
 {
@@ -378,7 +336,6 @@ sqrt( const Containers::VectorView< Real, Device, Index >& a )
 ////
 // Cbrt
 template< typename Real, typename Device, typename Index >
-__cuda_callable__
 const Containers::Expressions::UnaryExpressionTemplate< Containers::VectorView< Real, Device, Index >, Containers::Expressions::Cbrt >
 cbrt( const Containers::VectorView< Real, Device, Index >& a )
 {
@@ -388,7 +345,6 @@ cbrt( const Containers::VectorView< Real, Device, Index >& a )
 ////
 // Power
 template< typename Real, typename Device, typename Index, typename ExpType >
-__cuda_callable__
 const Containers::Expressions::UnaryExpressionTemplate< Containers::VectorView< Real, Device, Index >, Containers::Expressions::Pow, ExpType >
 pow( const Containers::VectorView< Real, Device, Index >& a, const ExpType& exp )
 {
@@ -398,7 +354,6 @@ pow( const Containers::VectorView< Real, Device, Index >& a, const ExpType& exp
 ////
 // Floor
 template< typename Real, typename Device, typename Index >
-__cuda_callable__
 const Containers::Expressions::UnaryExpressionTemplate< Containers::VectorView< Real, Device, Index >, Containers::Expressions::Floor >
 floor( const Containers::VectorView< Real, Device, Index >& a )
 {
@@ -408,7 +363,6 @@ floor( const Containers::VectorView< Real, Device, Index >& a )
 ////
 // Ceil
 template< typename Real, typename Device, typename Index >
-__cuda_callable__
 const Containers::Expressions::UnaryExpressionTemplate< Containers::VectorView< Real, Device, Index >, Containers::Expressions::Ceil >
 ceil( const Containers::VectorView< Real, Device, Index >& a )
 {
@@ -418,7 +372,6 @@ ceil( const Containers::VectorView< Real, Device, Index >& a )
 ////
 // Acos
 template< typename Real, typename Device, typename Index >
-__cuda_callable__
 const Containers::Expressions::UnaryExpressionTemplate< Containers::VectorView< Real, Device, Index >, Containers::Expressions::Acos >
 acos( const Containers::VectorView< Real, Device, Index >& a )
 {
@@ -428,7 +381,6 @@ acos( const Containers::VectorView< Real, Device, Index >& a )
 ////
 // Asin
 template< typename Real, typename Device, typename Index >
-__cuda_callable__
 const Containers::Expressions::UnaryExpressionTemplate< Containers::VectorView< Real, Device, Index >, Containers::Expressions::Asin >
 asin( const Containers::VectorView< Real, Device, Index >& a )
 {
@@ -438,7 +390,6 @@ asin( const Containers::VectorView< Real, Device, Index >& a )
 ////
 // Atan
 template< typename Real, typename Device, typename Index >
-__cuda_callable__
 const Containers::Expressions::UnaryExpressionTemplate< Containers::VectorView< Real, Device, Index >, Containers::Expressions::Atan >
 atan( const Containers::VectorView< Real, Device, Index >& a )
 {
@@ -448,7 +399,6 @@ atan( const Containers::VectorView< Real, Device, Index >& a )
 ////
 // Cosh
 template< typename Real, typename Device, typename Index >
-__cuda_callable__
 const Containers::Expressions::UnaryExpressionTemplate< Containers::VectorView< Real, Device, Index >, Containers::Expressions::Cosh >
 cosh( const Containers::VectorView< Real, Device, Index >& a )
 {
@@ -458,7 +408,6 @@ cosh( const Containers::VectorView< Real, Device, Index >& a )
 ////
 // Tanh
 template< typename Real, typename Device, typename Index >
-__cuda_callable__
 const Containers::Expressions::UnaryExpressionTemplate< Containers::VectorView< Real, Device, Index >, Containers::Expressions::Tanh >
 tanh( const Containers::VectorView< Real, Device, Index >& a )
 {
@@ -468,7 +417,6 @@ tanh( const Containers::VectorView< Real, Device, Index >& a )
 ////
 // Log
 template< typename Real, typename Device, typename Index >
-__cuda_callable__
 const Containers::Expressions::UnaryExpressionTemplate< Containers::VectorView< Real, Device, Index >, Containers::Expressions::Log >
 log( const Containers::VectorView< Real, Device, Index >& a )
 {
@@ -478,7 +426,6 @@ log( const Containers::VectorView< Real, Device, Index >& a )
 ////
 // Log10
 template< typename Real, typename Device, typename Index >
-__cuda_callable__
 const Containers::Expressions::UnaryExpressionTemplate< Containers::VectorView< Real, Device, Index >, Containers::Expressions::Log10 >
 log10( const Containers::VectorView< Real, Device, Index >& a )
 {
@@ -488,7 +435,6 @@ log10( const Containers::VectorView< Real, Device, Index >& a )
 ////
 // Log2
 template< typename Real, typename Device, typename Index >
-__cuda_callable__
 const Containers::Expressions::UnaryExpressionTemplate< Containers::VectorView< Real, Device, Index >, Containers::Expressions::Log2 >
 log2( const Containers::VectorView< Real, Device, Index >& a )
 {
@@ -498,7 +444,6 @@ log2( const Containers::VectorView< Real, Device, Index >& a )
 ////
 // Exp
 template< typename Real, typename Device, typename Index >
-__cuda_callable__
 const Containers::Expressions::UnaryExpressionTemplate< Containers::VectorView< Real, Device, Index >, Containers::Expressions::Exp >
 exp( const Containers::VectorView< Real, Device, Index >& a )
 {
@@ -508,7 +453,6 @@ exp( const Containers::VectorView< Real, Device, Index >& a )
 ////
 // Sign
 template< typename Real, typename Device, typename Index >
-__cuda_callable__
 const Containers::Expressions::UnaryExpressionTemplate< Containers::VectorView< Real, Device, Index >, Containers::Expressions::Sign >
 sign( const Containers::VectorView< Real, Device, Index >& a )
 {
@@ -602,21 +546,18 @@ binaryAnd( const Containers::VectorView< Real, Device, Index >& a )
 ////
 // Scalar product
 template< typename Real, typename Device, typename Index, typename ET >
-__cuda_callable__
 Real operator,( const Containers::VectorView< Real, Device, Index >& a, const ET& b )
 {
    return TNL::sum( a * b );
 }
 
 template< typename ET, typename Real, typename Device, typename Index >
-__cuda_callable__
 Real operator,( const ET& a, const Containers::VectorView< Real, Device, Index >& b )
 {
    return TNL::sum( a * b );
 }
 
 template< typename Real1, typename Real2, typename Device, typename Index >
-__cuda_callable__
 auto operator,( const Containers::VectorView< Real1, Device, Index >& a, const Containers::VectorView< Real2, Device, Index >& b )
 ->decltype( TNL::sum( a * b ) )
 {
@@ -626,7 +567,6 @@ auto operator,( const Containers::VectorView< Real1, Device, Index >& a, const C
 ////
 // TODO: Replace this with multiplication when its safe
 template< typename Real, typename Device, typename Index, typename ET >
-__cuda_callable__
 Containers::VectorView< Real, Device, Index >
 Scale( const Containers::VectorView< Real, Device, Index >& a, const ET& b )
 {
@@ -635,7 +575,6 @@ Scale( const Containers::VectorView< Real, Device, Index >& a, const ET& b )
 }
 
 template< typename ET, typename Real, typename Device, typename Index >
-__cuda_callable__
 Containers::Expressions::BinaryExpressionTemplate< ET, Containers::VectorView< Real, Device, Index >, Containers::Expressions::Multiplication >
 Scale( const ET& a, const Containers::VectorView< Real, Device, Index >& b )
 {
@@ -644,7 +583,6 @@ Scale( const ET& a, const Containers::VectorView< Real, Device, Index >& b )
 }
 
 template< typename Real1, typename Real2, typename Device, typename Index >
-__cuda_callable__
 Containers::Expressions::BinaryExpressionTemplate< Containers::VectorView< Real1, Device, Index >, Containers::VectorView< Real2, Device, Index >, Containers::Expressions::Multiplication >
 Scale( const Containers::VectorView< Real1, Device, Index >& a, const Containers::VectorView< Real2, Device, Index >& b )
 {
-- 
GitLab