From 7390a03b163ce4c26f310612d8700c7c842ffdb4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Tue, 27 Aug 2019 09:56:04 +0200
Subject: [PATCH] Avoid compiler warnings in builds without CUDA
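
Lambdas capturing CUDA objects were defined unconditionally but used
only inside #ifdef HAVE_CUDA blocks, so builds without CUDA emitted
unused-variable warnings. Such lambdas are now defined inside the
corresponding #ifdef HAVE_CUDA blocks, next to the calls that use
them.

Related fixes: SizesHolderHelpers.h avoids a signed/unsigned
comparison with an explicit negativity check and a std::size_t cast;
tnlFastSweepingMethod2D_impl.h and the distributed array/ndarray tests
get the braces that their indentation already implied, silencing
misleading-indentation and ambiguous-else warnings; Cuda_impl.h guards
device-only code with #ifdef HAVE_CUDA and returns a value on the
CUDA-less path of synchronizeDevice; SparseMatrixCopyTest.h gets
consistent indentation.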

---
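Notes:
    A representative excerpt of the recurring change, taken from the
    comparison benchmark in array-operations.h (the same movement is
    applied to every CUDA-only lambda in the diff):

        benchmark.time< Devices::Host >( reset1, "CPU", compareHost );
        #ifdef HAVE_CUDA
        // defined here so that no unused-variable warning is emitted
        // when HAVE_CUDA is undefined
        auto compareCuda = [&]() {
           resultDevice = (int) ( deviceArray == deviceArray2 );
        };
        benchmark.time< Devices::Cuda >( reset1, "GPU", compareCuda );
        #endif
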
 src/Benchmarks/BLAS/array-operations.h        |  32 +-
 src/Benchmarks/BLAS/spmv.h                    |   7 +-
 src/Benchmarks/BLAS/vector-operations.h       | 287 ++++++++----------
 .../Containers/ndarray/SizesHolderHelpers.h   |   6 +-
 src/TNL/Devices/Cuda_impl.h                   |   7 +-
 .../tnlFastSweepingMethod2D_impl.h            |   6 +-
 .../Containers/DistributedArrayTest.h         |   6 +-
 .../ndarray/DistributedNDArray_1D_test.h      |   3 +-
 src/UnitTests/Matrices/SparseMatrixCopyTest.h |  26 +-
 9 files changed, 185 insertions(+), 195 deletions(-)

diff --git a/src/Benchmarks/BLAS/array-operations.h b/src/Benchmarks/BLAS/array-operations.h
index b689c7e196..cff60c8cca 100644
--- a/src/Benchmarks/BLAS/array-operations.h
+++ b/src/Benchmarks/BLAS/array-operations.h
@@ -69,12 +69,12 @@ benchmarkArrayOperations( Benchmark & benchmark,
    auto compareHost = [&]() {
       resultHost = (int) ( hostArray == hostArray2 );
    };
-   auto compareCuda = [&]() {
-      resultDevice = (int) ( deviceArray == deviceArray2 );
-   };
    benchmark.setOperation( "comparison (operator==)", 2 * datasetSize );
    benchmark.time< Devices::Host >( reset1, "CPU", compareHost );
 #ifdef HAVE_CUDA
+   auto compareCuda = [&]() {
+      resultDevice = (int) ( deviceArray == deviceArray2 );
+   };
    benchmark.time< Devices::Cuda >( reset1, "GPU", compareCuda );
 #endif
 
@@ -82,25 +82,25 @@ benchmarkArrayOperations( Benchmark & benchmark,
    auto copyAssignHostHost = [&]() {
       hostArray = hostArray2;
    };
-   auto copyAssignCudaCuda = [&]() {
-      deviceArray = deviceArray2;
-   };
    benchmark.setOperation( "copy (operator=)", 2 * datasetSize );
    // copyBasetime is used later only inside a HAVE_CUDA guard, so the
    // compiler will warn about an unused variable when compiling without CUDA
    const double copyBasetime = benchmark.time< Devices::Host >( reset1, "CPU", copyAssignHostHost );
 #ifdef HAVE_CUDA
+   auto copyAssignCudaCuda = [&]() {
+      deviceArray = deviceArray2;
+   };
    benchmark.time< Devices::Cuda >( reset1, "GPU", copyAssignCudaCuda );
 #endif
 
 
+#ifdef HAVE_CUDA
    auto copyAssignHostCuda = [&]() {
       deviceArray = hostArray;
    };
    auto copyAssignCudaHost = [&]() {
       hostArray = deviceArray;
    };
-#ifdef HAVE_CUDA
    benchmark.setOperation( "copy (operator=)", datasetSize, copyBasetime );
    benchmark.time< Devices::Cuda >( reset1, "CPU->GPU", copyAssignHostCuda );
    benchmark.time< Devices::Cuda >( reset1, "GPU->CPU", copyAssignCudaHost );
@@ -110,12 +110,12 @@ benchmarkArrayOperations( Benchmark & benchmark,
    auto setValueHost = [&]() {
       hostArray.setValue( 3.0 );
    };
-   auto setValueCuda = [&]() {
-      deviceArray.setValue( 3.0 );
-   };
    benchmark.setOperation( "setValue", datasetSize );
    benchmark.time< Devices::Host >( reset1, "CPU", setValueHost );
 #ifdef HAVE_CUDA
+   auto setValueCuda = [&]() {
+      deviceArray.setValue( 3.0 );
+   };
    benchmark.time< Devices::Cuda >( reset1, "GPU", setValueCuda );
 #endif
 
@@ -123,9 +123,6 @@ benchmarkArrayOperations( Benchmark & benchmark,
    auto setSizeHost = [&]() {
       hostArray.setSize( size );
    };
-   auto setSizeCuda = [&]() {
-      deviceArray.setSize( size );
-   };
    auto resetSize1 = [&]() {
       hostArray.reset();
 #ifdef HAVE_CUDA
@@ -135,6 +132,9 @@ benchmarkArrayOperations( Benchmark & benchmark,
    benchmark.setOperation( "allocation (setSize)", datasetSize );
    benchmark.time< Devices::Host >( resetSize1, "CPU", setSizeHost );
 #ifdef HAVE_CUDA
+   auto setSizeCuda = [&]() {
+      deviceArray.setSize( size );
+   };
    benchmark.time< Devices::Cuda >( resetSize1, "GPU", setSizeCuda );
 #endif
 
@@ -142,9 +142,6 @@ benchmarkArrayOperations( Benchmark & benchmark,
    auto resetSizeHost = [&]() {
       hostArray.reset();
    };
-   auto resetSizeCuda = [&]() {
-      deviceArray.reset();
-   };
    auto setSize1 = [&]() {
       hostArray.setSize( size );
 #ifdef HAVE_CUDA
@@ -154,6 +151,9 @@ benchmarkArrayOperations( Benchmark & benchmark,
    benchmark.setOperation( "deallocation (reset)", datasetSize );
    benchmark.time< Devices::Host >( setSize1, "CPU", resetSizeHost );
 #ifdef HAVE_CUDA
+   auto resetSizeCuda = [&]() {
+      deviceArray.reset();
+   };
    benchmark.time< Devices::Cuda >( setSize1, "GPU", resetSizeCuda );
 #endif
 }
diff --git a/src/Benchmarks/BLAS/spmv.h b/src/Benchmarks/BLAS/spmv.h
index 7f114e514a..b6c91a2470 100644
--- a/src/Benchmarks/BLAS/spmv.h
+++ b/src/Benchmarks/BLAS/spmv.h
@@ -155,13 +155,12 @@ benchmarkSpMV( Benchmark & benchmark,
    auto spmvHost = [&]() {
       hostMatrix.vectorProduct( hostVector, hostVector2 );
    };
-   auto spmvCuda = [&]() {
-      deviceMatrix.vectorProduct( deviceVector, deviceVector2 );
-   };
-
    benchmark.setOperation( datasetSize );
    benchmark.time< Devices::Host >( reset, "CPU", spmvHost );
 #ifdef HAVE_CUDA
+   auto spmvCuda = [&]() {
+      deviceMatrix.vectorProduct( deviceVector, deviceVector2 );
+   };
    benchmark.time< Devices::Cuda >( reset, "GPU", spmvCuda );
 #endif
 }
diff --git a/src/Benchmarks/BLAS/vector-operations.h b/src/Benchmarks/BLAS/vector-operations.h
index 80c63020dc..f2b22c7dfa 100644
--- a/src/Benchmarks/BLAS/vector-operations.h
+++ b/src/Benchmarks/BLAS/vector-operations.h
@@ -114,20 +114,19 @@ benchmarkVectorOperations( Benchmark & benchmark,
    auto maxHost = [&]() {
       resultHost = Benchmarks::CommonVectorOperations< Devices::Host >::getVectorMax( hostVector );
    };
-   auto maxCuda = [&]() {
-      resultDevice = Benchmarks::CommonVectorOperations< Devices::Cuda >::getVectorMax( deviceVector );
-   };
    auto maxHostET = [&]() {
       resultHost = max( hostView );
    };
-   auto maxCudaET = [&]() {
-      resultDevice = max( deviceView );
-   };
-
    benchmark.setOperation( "max", datasetSize );
    benchmark.time< Devices::Host >( reset1, "CPU legacy", maxHost );
    benchmark.time< Devices::Host >( reset1, "CPU ET", maxHostET );
 #ifdef HAVE_CUDA
+   auto maxCuda = [&]() {
+      resultDevice = Benchmarks::CommonVectorOperations< Devices::Cuda >::getVectorMax( deviceVector );
+   };
+   auto maxCudaET = [&]() {
+      resultDevice = max( deviceView );
+   };
    benchmark.time< Devices::Cuda >( reset1, "GPU legacy", maxCuda );
    benchmark.time< Devices::Cuda >( reset1, "GPU ET", maxCudaET );
 #endif
@@ -137,19 +136,19 @@ benchmarkVectorOperations( Benchmark & benchmark,
    auto minHost = [&]() {
       resultHost = Benchmarks::CommonVectorOperations< Devices::Host >::getVectorMin( hostVector );
    };
-   auto minCuda = [&]() {
-      resultDevice = Benchmarks::CommonVectorOperations< Devices::Cuda >::getVectorMin( deviceVector );
-   };
    auto minHostET = [&]() {
       resultHost = min( hostView );
    };
-   auto minCudaET = [&]() {
-      resultDevice = min( deviceView );
-   };
    benchmark.setOperation( "min", datasetSize );
    benchmark.time< Devices::Host >( reset1, "CPU legacy", minHost );
    benchmark.time< Devices::Host >( reset1, "CPU ET", minHostET );
 #ifdef HAVE_CUDA
+   auto minCuda = [&]() {
+      resultDevice = Benchmarks::CommonVectorOperations< Devices::Cuda >::getVectorMin( deviceVector );
+   };
+   auto minCudaET = [&]() {
+      resultDevice = min( deviceView );
+   };
    benchmark.time< Devices::Cuda >( reset1, "GPU legacy", minCuda );
    benchmark.time< Devices::Cuda >( reset1, "GPU ET", minCudaET );
 #endif
@@ -159,22 +158,28 @@ benchmarkVectorOperations( Benchmark & benchmark,
    auto absMaxHost = [&]() {
       resultHost = Benchmarks::CommonVectorOperations< Devices::Host >::getVectorAbsMax( hostVector );
    };
-   auto absMaxCuda = [&]() {
-      resultDevice = Benchmarks::CommonVectorOperations< Devices::Cuda >::getVectorAbsMax( deviceVector );
-   };
    auto absMaxHostET = [&]() {
       resultHost = max( abs( hostView ) );
    };
-   auto absMaxCudaET = [&]() {
-      resultDevice = max( abs( deviceView ) );
-   };
 #ifdef HAVE_BLAS
    auto absMaxBlas = [&]() {
       int index = blasIgamax( size, hostVector.getData(), 1 );
       resultHost = hostVector.getElement( index );
    };
+#endif
+   benchmark.setOperation( "absMax", datasetSize );
+   benchmark.time< Devices::Host >( reset1, "CPU legacy", absMaxHost );
+   benchmark.time< Devices::Host >( reset1, "CPU ET", absMaxHostET );
+#ifdef HAVE_BLAS
+   benchmark.time< Devices::Host >( reset1, "CPU BLAS", absMaxBlas );
 #endif
 #ifdef HAVE_CUDA
+   auto absMaxCuda = [&]() {
+      resultDevice = Benchmarks::CommonVectorOperations< Devices::Cuda >::getVectorAbsMax( deviceVector );
+   };
+   auto absMaxCudaET = [&]() {
+      resultDevice = max( abs( deviceView ) );
+   };
    auto absMaxCublas = [&]() {
       int index = 0;
       cublasIgamax( cublasHandle, size,
@@ -182,14 +187,6 @@ benchmarkVectorOperations( Benchmark & benchmark,
                     &index );
       resultDevice = deviceVector.getElement( index );
    };
-#endif
-   benchmark.setOperation( "absMax", datasetSize );
-   benchmark.time< Devices::Host >( reset1, "CPU legacy", absMaxHost );
-   benchmark.time< Devices::Host >( reset1, "CPU ET", absMaxHostET );
-#ifdef HAVE_BLAS
-   benchmark.time< Devices::Host >( reset1, "CPU BLAS", absMaxBlas );
-#endif
-#ifdef HAVE_CUDA
    benchmark.time< Devices::Cuda >( reset1, "GPU legacy", absMaxCuda );
    benchmark.time< Devices::Cuda >( reset1, "GPU ET", absMaxCudaET );
    benchmark.time< Devices::Cuda >( reset1, "cuBLAS", absMaxCublas );
@@ -200,22 +197,26 @@ benchmarkVectorOperations( Benchmark & benchmark,
    auto absMinHost = [&]() {
       resultHost = Benchmarks::CommonVectorOperations< Devices::Host >::getVectorAbsMin( hostVector );
    };
-   auto absMinCuda = [&]() {
-      resultDevice = Benchmarks::CommonVectorOperations< Devices::Cuda >::getVectorAbsMin( deviceVector );
-   };
    auto absMinHostET = [&]() {
       resultHost = min( abs( hostView ) );
    };
-   auto absMinCudaET = [&]() {
-      resultDevice = min( abs( deviceView ) );
-   };
 /*#ifdef HAVE_BLAS
    auto absMinBlas = [&]() {
       int index = blasIgamin( size, hostVector.getData(), 1 );
       resultHost = hostVector.getElement( index );
    };
 #endif*/
+   benchmark.setOperation( "absMin", datasetSize );
+   benchmark.time< Devices::Host >( reset1, "CPU legacy", absMinHost );
+   benchmark.time< Devices::Host >( reset1, "CPU ET", absMinHostET );
+   //benchmark.time< Devices::Host >( reset1, "CPU BLAS", absMinBlas );
 #ifdef HAVE_CUDA
+   auto absMinCuda = [&]() {
+      resultDevice = Benchmarks::CommonVectorOperations< Devices::Cuda >::getVectorAbsMin( deviceVector );
+   };
+   auto absMinCudaET = [&]() {
+      resultDevice = min( abs( deviceView ) );
+   };
    auto absMinCublas = [&]() {
       int index = 0;
       cublasIgamin( cublasHandle, size,
@@ -223,12 +224,6 @@ benchmarkVectorOperations( Benchmark & benchmark,
                     &index );
       resultDevice = deviceVector.getElement( index );
    };
-#endif
-   benchmark.setOperation( "absMin", datasetSize );
-   benchmark.time< Devices::Host >( reset1, "CPU legacy", absMinHost );
-   benchmark.time< Devices::Host >( reset1, "CPU ET", absMinHostET );
-   //benchmark.time< Devices::Host >( reset1, "CPU BLAS", absMinBlas );
-#ifdef HAVE_CUDA
    benchmark.time< Devices::Cuda >( reset1, "GPU legacy", absMinCuda );
    benchmark.time< Devices::Cuda >( reset1, "GPU ET", absMinCudaET );
    benchmark.time< Devices::Cuda >( reset1, "cuBLAS", absMinCublas );
@@ -239,19 +234,19 @@ benchmarkVectorOperations( Benchmark & benchmark,
    auto sumHost = [&]() {
       resultHost = Benchmarks::CommonVectorOperations< Devices::Host >::getVectorSum( hostVector );
    };
-   auto sumCuda = [&]() {
-      resultDevice = Benchmarks::CommonVectorOperations< Devices::Cuda >::getVectorSum( deviceVector );
-   };
    auto sumHostET = [&]() {
       resultHost = sum( hostView );
    };
-   auto sumCudaET = [&]() {
-      resultDevice = sum( deviceView );
-   };
    benchmark.setOperation( "sum", datasetSize );
    benchmark.time< Devices::Host >( reset1, "CPU legacy", sumHost );
    benchmark.time< Devices::Host >( reset1, "CPU ET", sumHostET );
 #ifdef HAVE_CUDA
+   auto sumCuda = [&]() {
+      resultDevice = Benchmarks::CommonVectorOperations< Devices::Cuda >::getVectorSum( deviceVector );
+   };
+   auto sumCudaET = [&]() {
+      resultDevice = sum( deviceView );
+   };
    benchmark.time< Devices::Cuda >( reset1, "GPU legacy", sumCuda );
    benchmark.time< Devices::Cuda >( reset1, "GPU ET", sumCudaET );
 #endif
@@ -261,26 +256,13 @@ benchmarkVectorOperations( Benchmark & benchmark,
    auto l1normHost = [&]() {
       resultHost = Benchmarks::CommonVectorOperations< Devices::Host >::getVectorLpNorm( hostVector, 1.0 );
    };
-   auto l1normCuda = [&]() {
-      resultDevice = Benchmarks::CommonVectorOperations< Devices::Cuda >::getVectorLpNorm( deviceVector, 1.0 );
-   };
    auto l1normHostET = [&]() {
       resultHost = lpNorm( hostView, 1.0 );
    };
-   auto l1normCudaET = [&]() {
-      resultDevice = lpNorm( deviceView, 1.0 );
-   };
 #ifdef HAVE_BLAS
    auto l1normBlas = [&]() {
       resultHost = blasGasum( size, hostVector.getData(), 1 );
    };
-#endif
-#ifdef HAVE_CUDA
-   auto l1normCublas = [&]() {
-      cublasGasum( cublasHandle, size,
-                   deviceVector.getData(), 1,
-                   &resultDevice );
-   };
 #endif
    benchmark.setOperation( "l1 norm", datasetSize );
    benchmark.time< Devices::Host >( reset1, "CPU legacy", l1normHost );
@@ -289,6 +271,17 @@ benchmarkVectorOperations( Benchmark & benchmark,
    benchmark.time< Devices::Host >( reset1, "CPU BLAS", l1normBlas );
 #endif
 #ifdef HAVE_CUDA
+   auto l1normCuda = [&]() {
+      resultDevice = Benchmarks::CommonVectorOperations< Devices::Cuda >::getVectorLpNorm( deviceVector, 1.0 );
+   };
+   auto l1normCudaET = [&]() {
+      resultDevice = lpNorm( deviceView, 1.0 );
+   };
+   auto l1normCublas = [&]() {
+      cublasGasum( cublasHandle, size,
+                   deviceVector.getData(), 1,
+                   &resultDevice );
+   };
    benchmark.time< Devices::Cuda >( reset1, "GPU legacy", l1normCuda );
    benchmark.time< Devices::Cuda >( reset1, "GPU ET", l1normCudaET );
    benchmark.time< Devices::Cuda >( reset1, "cuBLAS", l1normCublas );
@@ -299,26 +292,13 @@ benchmarkVectorOperations( Benchmark & benchmark,
    auto l2normHost = [&]() {
       resultHost = Benchmarks::CommonVectorOperations< Devices::Host >::getVectorLpNorm( hostVector, 2.0 );
    };
-   auto l2normCuda = [&]() {
-      resultDevice = Benchmarks::CommonVectorOperations< Devices::Cuda >::getVectorLpNorm( deviceVector, 2.0 );
-   };
    auto l2normHostET = [&]() {
       resultHost = lpNorm( hostView, 2.0 );
    };
-   auto l2normCudaET = [&]() {
-      resultDevice = lpNorm( deviceView, 2.0 );
-   };
 #ifdef HAVE_BLAS
    auto l2normBlas = [&]() {
       resultHost = blasGnrm2( size, hostVector.getData(), 1 );
    };
-#endif
-#ifdef HAVE_CUDA
-   auto l2normCublas = [&]() {
-      cublasGnrm2( cublasHandle, size,
-                   deviceVector.getData(), 1,
-                   &resultDevice );
-   };
 #endif
    benchmark.setOperation( "l2 norm", datasetSize );
    benchmark.time< Devices::Host >( reset1, "CPU legacy", l2normHost );
@@ -327,6 +307,17 @@ benchmarkVectorOperations( Benchmark & benchmark,
    benchmark.time< Devices::Host >( reset1, "CPU BLAS", l2normBlas );
 #endif
 #ifdef HAVE_CUDA
+   auto l2normCuda = [&]() {
+      resultDevice = Benchmarks::CommonVectorOperations< Devices::Cuda >::getVectorLpNorm( deviceVector, 2.0 );
+   };
+   auto l2normCudaET = [&]() {
+      resultDevice = lpNorm( deviceView, 2.0 );
+   };
+   auto l2normCublas = [&]() {
+      cublasGnrm2( cublasHandle, size,
+                   deviceVector.getData(), 1,
+                   &resultDevice );
+   };
    benchmark.time< Devices::Cuda >( reset1, "GPU legacy", l2normCuda );
    benchmark.time< Devices::Cuda >( reset1, "GPU ET", l2normCudaET );
    benchmark.time< Devices::Cuda >( reset1, "cuBLAS", l2normCublas );
@@ -337,19 +328,19 @@ benchmarkVectorOperations( Benchmark & benchmark,
    auto l3normHost = [&]() {
       resultHost = Benchmarks::CommonVectorOperations< Devices::Host >::getVectorLpNorm( hostVector, 3.0 );
    };
-   auto l3normCuda = [&]() {
-      resultDevice = Benchmarks::CommonVectorOperations< Devices::Cuda >::getVectorLpNorm( deviceVector, 3.0 );
-   };
    auto l3normHostET = [&]() {
       resultHost = lpNorm( hostView, 3.0 );
    };
-   auto l3normCudaET = [&]() {
-      resultDevice = lpNorm( deviceView, 3.0 );
-   };
    benchmark.setOperation( "l3 norm", datasetSize );
    benchmark.time< Devices::Host >( reset1, "CPU legacy", l3normHost );
    benchmark.time< Devices::Host >( reset1, "CPU ET", l3normHostET );
 #ifdef HAVE_CUDA
+   auto l3normCuda = [&]() {
+      resultDevice = Benchmarks::CommonVectorOperations< Devices::Cuda >::getVectorLpNorm( deviceVector, 3.0 );
+   };
+   auto l3normCudaET = [&]() {
+      resultDevice = lpNorm( deviceView, 3.0 );
+   };
    benchmark.time< Devices::Cuda >( reset1, "GPU legacy", l3normCuda );
    benchmark.time< Devices::Cuda >( reset1, "GPU ET", l3normCudaET );
 #endif
@@ -359,27 +350,13 @@ benchmarkVectorOperations( Benchmark & benchmark,
    auto scalarProductHost = [&]() {
       resultHost = Benchmarks::CommonVectorOperations< Devices::Host >::getScalarProduct( hostVector, hostVector2 );
    };
-   auto scalarProductCuda = [&]() {
-      resultDevice = Benchmarks::CommonVectorOperations< Devices::Cuda >::getScalarProduct( deviceVector, deviceVector2 );
-   };
    auto scalarProductHostET = [&]() {
       resultHost = ( hostVector, hostVector2 );
    };
-   auto scalarProductCudaET = [&]() {
-      resultDevice = ( deviceView, deviceView2 );
-   };
 #ifdef HAVE_BLAS
    auto scalarProductBlas = [&]() {
       resultHost = blasGdot( size, hostVector.getData(), 1, hostVector2.getData(), 1 );
    };
-#endif
-#ifdef HAVE_CUDA
-   auto scalarProductCublas = [&]() {
-      cublasGdot( cublasHandle, size,
-                  deviceVector.getData(), 1,
-                  deviceVector2.getData(), 1,
-                  &resultDevice );
-   };
 #endif
    benchmark.setOperation( "scalar product", 2 * datasetSize );
    benchmark.time< Devices::Host >( reset1, "CPU legacy", scalarProductHost );
@@ -388,6 +365,18 @@ benchmarkVectorOperations( Benchmark & benchmark,
    benchmark.time< Devices::Host >( reset1, "CPU BLAS", scalarProductBlas );
 #endif
 #ifdef HAVE_CUDA
+   auto scalarProductCuda = [&]() {
+      resultDevice = Benchmarks::CommonVectorOperations< Devices::Cuda >::getScalarProduct( deviceVector, deviceVector2 );
+   };
+   auto scalarProductCudaET = [&]() {
+      resultDevice = ( deviceView, deviceView2 );
+   };
+   auto scalarProductCublas = [&]() {
+      cublasGdot( cublasHandle, size,
+                  deviceVector.getData(), 1,
+                  deviceVector2.getData(), 1,
+                  &resultDevice );
+   };
    benchmark.time< Devices::Cuda >( reset1, "GPU legacy", scalarProductCuda );
    benchmark.time< Devices::Cuda >( reset1, "GPU ET", scalarProductCudaET );
    benchmark.time< Devices::Cuda >( reset1, "cuBLAS", scalarProductCublas );
@@ -398,28 +387,26 @@ benchmarkVectorOperations( Benchmark & benchmark,
    auto multiplyHost = [&]() {
       hostVector *= 0.5;
    };
-   auto multiplyCuda = [&]() {
-      deviceVector *= 0.5;
-   };
 #ifdef HAVE_BLAS
    auto multiplyBlas = [&]() {
       blasGscal( hostVector.getSize(), (Real) 0.5, hostVector.getData(), 1 );
    };
+#endif
+   benchmark.setOperation( "scalar multiplication", 2 * datasetSize );
+   benchmark.time< Devices::Host >( reset1, "CPU ET", multiplyHost );
+#ifdef HAVE_BLAS
+   benchmark.time< Devices::Host >( reset1, "CPU BLAS", multiplyBlas );
 #endif
 #ifdef HAVE_CUDA
+   auto multiplyCuda = [&]() {
+      deviceVector *= 0.5;
+   };
    auto multiplyCublas = [&]() {
       const Real alpha = 0.5;
       cublasGscal( cublasHandle, size,
                    &alpha,
                    deviceVector.getData(), 1 );
    };
-#endif
-   benchmark.setOperation( "scalar multiplication", 2 * datasetSize );
-   benchmark.time< Devices::Host >( reset1, "CPU ET", multiplyHost );
-#ifdef HAVE_BLAS
-   benchmark.time< Devices::Host >( reset1, "CPU BLAS", multiplyBlas );
-#endif
-#ifdef HAVE_CUDA
    benchmark.time< Devices::Cuda >( reset1, "GPU ET", multiplyCuda );
    benchmark.time< Devices::Cuda >( reset1, "cuBLAS", multiplyCublas );
 #endif
@@ -429,15 +416,9 @@ benchmarkVectorOperations( Benchmark & benchmark,
    auto addVectorHost = [&]() {
       Benchmarks::VectorOperations< Devices::Host >::addVector( hostVector, hostVector2, (Real) 1.0, (Real) 1.0 );
    };
-   auto addVectorCuda = [&]() {
-      Benchmarks::VectorOperations< Devices::Cuda >::addVector( deviceVector, deviceVector2, (Real) 1.0, (Real) 1.0 );
-   };
    auto addVectorHostET = [&]() {
       hostView += hostView2;
    };
-   auto addVectorCudaET = [&]() {
-      deviceView += deviceView2;
-   };
 #ifdef HAVE_BLAS
    auto addVectorBlas = [&]() {
       const Real alpha = 1.0;
@@ -445,8 +426,20 @@ benchmarkVectorOperations( Benchmark & benchmark,
                  hostVector2.getData(), 1,
                  hostVector.getData(), 1 );
    };
+#endif
+   benchmark.setOperation( "vector addition", 3 * datasetSize );
+   benchmark.time< Devices::Host >( resetAll, "CPU legacy", addVectorHost );
+   benchmark.time< Devices::Host >( resetAll, "CPU ET", addVectorHostET );
+#ifdef HAVE_BLAS
+   benchmark.time< Devices::Host >( resetAll, "CPU BLAS", addVectorBlas );
 #endif
 #ifdef HAVE_CUDA
+   auto addVectorCuda = [&]() {
+      Benchmarks::VectorOperations< Devices::Cuda >::addVector( deviceVector, deviceVector2, (Real) 1.0, (Real) 1.0 );
+   };
+   auto addVectorCudaET = [&]() {
+      deviceView += deviceView2;
+   };
    auto addVectorCublas = [&]() {
       const Real alpha = 1.0;
       cublasGaxpy( cublasHandle, size,
@@ -454,14 +447,6 @@ benchmarkVectorOperations( Benchmark & benchmark,
                    deviceVector2.getData(), 1,
                    deviceVector.getData(), 1 );
    };
-#endif
-   benchmark.setOperation( "vector addition", 3 * datasetSize );
-   benchmark.time< Devices::Host >( resetAll, "CPU legacy", addVectorHost );
-   benchmark.time< Devices::Host >( resetAll, "CPU ET", addVectorHostET );
-#ifdef HAVE_BLAS
-   benchmark.time< Devices::Host >( resetAll, "CPU BLAS", addVectorBlas );
-#endif
-#ifdef HAVE_CUDA
    benchmark.time< Devices::Cuda >( resetAll, "GPU legacy", addVectorCuda );
    benchmark.time< Devices::Cuda >( resetAll, "GPU ET", addVectorCudaET );
    benchmark.time< Devices::Cuda >( resetAll, "cuBLAS", addVectorCublas );
@@ -473,16 +458,9 @@ benchmarkVectorOperations( Benchmark & benchmark,
       Benchmarks::VectorOperations< Devices::Host >::addVector( hostVector, hostVector2, (Real) 1.0, (Real) 1.0 );
       Benchmarks::VectorOperations< Devices::Host >::addVector( hostVector, hostVector3, (Real) 1.0, (Real) 1.0 );
    };
-   auto addTwoVectorsCuda = [&]() {
-      Benchmarks::VectorOperations< Devices::Cuda >::addVector( deviceVector, deviceVector2, (Real) 1.0, (Real) 1.0 );
-      Benchmarks::VectorOperations< Devices::Cuda >::addVector( deviceVector, deviceVector3, (Real) 1.0, (Real) 1.0 );
-   };
    auto addTwoVectorsHostET = [&]() {
       hostView += hostView2 + hostView3;
    };
-   auto addTwoVectorsCudaET = [&]() {
-      deviceView += deviceView2 + deviceView3;
-   };
 #ifdef HAVE_BLAS
    auto addTwoVectorsBlas = [&]() {
       const Real alpha = 1.0;
@@ -493,8 +471,21 @@ benchmarkVectorOperations( Benchmark & benchmark,
                  hostVector3.getData(), 1,
                  hostVector.getData(), 1 );
    };
+#endif
+   benchmark.setOperation( "two vectors addition", 4 * datasetSize );
+   benchmark.time< Devices::Host >( resetAll, "CPU legacy", addTwoVectorsHost );
+   benchmark.time< Devices::Host >( resetAll, "CPU ET", addTwoVectorsHostET );
+#ifdef HAVE_BLAS
+   benchmark.time< Devices::Host >( resetAll, "CPU BLAS", addTwoVectorsBlas );
 #endif
 #ifdef HAVE_CUDA
+   auto addTwoVectorsCuda = [&]() {
+      Benchmarks::VectorOperations< Devices::Cuda >::addVector( deviceVector, deviceVector2, (Real) 1.0, (Real) 1.0 );
+      Benchmarks::VectorOperations< Devices::Cuda >::addVector( deviceVector, deviceVector3, (Real) 1.0, (Real) 1.0 );
+   };
+   auto addTwoVectorsCudaET = [&]() {
+      deviceView += deviceView2 + deviceView3;
+   };
    auto addTwoVectorsCublas = [&]() {
       const Real alpha = 1.0;
       cublasGaxpy( cublasHandle, size,
@@ -506,14 +497,6 @@ benchmarkVectorOperations( Benchmark & benchmark,
                    deviceVector3.getData(), 1,
                    deviceVector.getData(), 1 );
    };
-#endif
-   benchmark.setOperation( "two vectors addition", 4 * datasetSize );
-   benchmark.time< Devices::Host >( resetAll, "CPU legacy", addTwoVectorsHost );
-   benchmark.time< Devices::Host >( resetAll, "CPU ET", addTwoVectorsHostET );
-#ifdef HAVE_BLAS
-   benchmark.time< Devices::Host >( resetAll, "CPU BLAS", addTwoVectorsBlas );
-#endif
-#ifdef HAVE_CUDA
    benchmark.time< Devices::Cuda >( resetAll, "GPU legacy", addTwoVectorsCuda );
    benchmark.time< Devices::Cuda >( resetAll, "GPU ET", addTwoVectorsCudaET );
    benchmark.time< Devices::Cuda >( resetAll, "cuBLAS", addTwoVectorsCublas );
@@ -526,17 +509,9 @@ benchmarkVectorOperations( Benchmark & benchmark,
       Benchmarks::VectorOperations< Devices::Host >::addVector( hostVector, hostVector3, (Real) 1.0, (Real) 1.0 );
       Benchmarks::VectorOperations< Devices::Host >::addVector( hostVector, hostVector4, (Real) 1.0, (Real) 1.0 );
    };
-   auto addThreeVectorsCuda = [&]() {
-      Benchmarks::VectorOperations< Devices::Cuda >::addVector( deviceVector, deviceVector2, (Real) 1.0, (Real) 1.0 );
-      Benchmarks::VectorOperations< Devices::Cuda >::addVector( deviceVector, deviceVector3, (Real) 1.0, (Real) 1.0 );
-      Benchmarks::VectorOperations< Devices::Cuda >::addVector( deviceVector, deviceVector4, (Real) 1.0, (Real) 1.0 );
-   };
    auto addThreeVectorsHostET = [&]() {
       hostView += hostView2 + hostView3 + hostView4;
    };
-   auto addThreeVectorsCudaET = [&]() {
-      deviceView += deviceView2 + deviceView3 + deviceView4;
-   };
 #ifdef HAVE_BLAS
    auto addThreeVectorsBlas = [&]() {
       const Real alpha = 1.0;
@@ -546,12 +521,26 @@ benchmarkVectorOperations( Benchmark & benchmark,
       blasGaxpy( size, alpha,
                  hostVector3.getData(), 1,
                  hostVector.getData(), 1 );
-       blasGaxpy( size, alpha,
+      blasGaxpy( size, alpha,
                  hostVector4.getData(), 1,
                  hostVector.getData(), 1 );
    };
+#endif
+   benchmark.setOperation( "three vectors addition", 5 * datasetSize );
+   benchmark.time< Devices::Host >( resetAll, "CPU legacy", addThreeVectorsHost );
+   benchmark.time< Devices::Host >( resetAll, "CPU ET", addThreeVectorsHostET );
+#ifdef HAVE_BLAS
+   benchmark.time< Devices::Host >( resetAll, "CPU BLAS", addThreeVectorsBlas );
 #endif
 #ifdef HAVE_CUDA
+   auto addThreeVectorsCuda = [&]() {
+      Benchmarks::VectorOperations< Devices::Cuda >::addVector( deviceVector, deviceVector2, (Real) 1.0, (Real) 1.0 );
+      Benchmarks::VectorOperations< Devices::Cuda >::addVector( deviceVector, deviceVector3, (Real) 1.0, (Real) 1.0 );
+      Benchmarks::VectorOperations< Devices::Cuda >::addVector( deviceVector, deviceVector4, (Real) 1.0, (Real) 1.0 );
+   };
+   auto addThreeVectorsCudaET = [&]() {
+      deviceView += deviceView2 + deviceView3 + deviceView4;
+   };
    auto addThreeVectorsCublas = [&]() {
       const Real alpha = 1.0;
       cublasGaxpy( cublasHandle, size,
@@ -567,14 +556,6 @@ benchmarkVectorOperations( Benchmark & benchmark,
                    deviceVector4.getData(), 1,
                    deviceVector.getData(), 1 );
    };
-#endif
-   benchmark.setOperation( "three vectors addition", 5 * datasetSize );
-   benchmark.time< Devices::Host >( resetAll, "CPU legacy", addThreeVectorsHost );
-   benchmark.time< Devices::Host >( resetAll, "CPU ET", addThreeVectorsHostET );
-#ifdef HAVE_BLAS
-   benchmark.time< Devices::Host >( resetAll, "CPU BLAS", addThreeVectorsBlas );
-#endif
-#ifdef HAVE_CUDA
    benchmark.time< Devices::Cuda >( resetAll, "GPU legacy", addThreeVectorsCuda );
    benchmark.time< Devices::Cuda >( resetAll, "GPU ET", addThreeVectorsCudaET );
    benchmark.time< Devices::Cuda >( resetAll, "cuBLAS", addThreeVectorsCublas );
@@ -585,12 +566,12 @@ benchmarkVectorOperations( Benchmark & benchmark,
    auto inclusivePrefixSumHost = [&]() {
       hostVector.prefixSum();
    };
-   auto inclusivePrefixSumCuda = [&]() {
-      deviceVector.prefixSum();
-   };
    benchmark.setOperation( "inclusive prefix sum", 2 * datasetSize );
    benchmark.time< Devices::Host >( reset1, "CPU ET", inclusivePrefixSumHost );
 #ifdef HAVE_CUDA
+   auto inclusivePrefixSumCuda = [&]() {
+      deviceVector.prefixSum();
+   };
    benchmark.time< Devices::Cuda >( reset1, "GPU ET", inclusivePrefixSumCuda );
 #endif
 
@@ -599,12 +580,12 @@ benchmarkVectorOperations( Benchmark & benchmark,
    auto exclusivePrefixSumHost = [&]() {
       hostVector.template prefixSum< Containers::Algorithms::PrefixSumType::Exclusive >();
    };
-   auto exclusivePrefixSumCuda = [&]() {
-      deviceVector.template prefixSum< Containers::Algorithms::PrefixSumType::Exclusive >();
-   };
    benchmark.setOperation( "exclusive prefix sum", 2 * datasetSize );
    benchmark.time< Devices::Host >( reset1, "CPU ET", exclusivePrefixSumHost );
 #ifdef HAVE_CUDA
+   auto exclusivePrefixSumCuda = [&]() {
+      deviceVector.template prefixSum< Containers::Algorithms::PrefixSumType::Exclusive >();
+   };
    benchmark.time< Devices::Cuda >( reset1, "GPU ET", exclusivePrefixSumCuda );
 #endif
 
diff --git a/src/TNL/Containers/ndarray/SizesHolderHelpers.h b/src/TNL/Containers/ndarray/SizesHolderHelpers.h
index 2e92ed43df..9d1c0d439f 100644
--- a/src/TNL/Containers/ndarray/SizesHolderHelpers.h
+++ b/src/TNL/Containers/ndarray/SizesHolderHelpers.h
@@ -184,7 +184,8 @@ struct SetSizesCopyHelper
          target.template setSize< level >( source.template getSize< level >() );
          SetSizesCopyHelper< TargetHolder, SourceHolder, level - 1 >::copy( target, source );
       }
-      else if( target.template getStaticSize< level >() != source.template getSize< level >() )
+      else if( source.template getSize< level >() < 0 ||
+               target.template getStaticSize< level >() != (std::size_t) source.template getSize< level >() )
          throw std::logic_error( "Cannot copy sizes due to inconsistent underlying types (static sizes don't match)." );
    }
 };
@@ -198,7 +199,8 @@ struct SetSizesCopyHelper< TargetHolder, SourceHolder, 0 >
    {
       if( target.template getStaticSize< 0 >() == 0 )
          target.template setSize< 0 >( source.template getSize< 0 >() );
-      else if( target.template getStaticSize< 0 >() != source.template getSize< 0 >() )
+      else if( source.template getSize< 0 >() < 0 ||
+               target.template getStaticSize< 0 >() != (std::size_t) source.template getSize< 0 >() )
          throw std::logic_error( "Cannot copy sizes due to inconsistent underlying types (static sizes don't match)." );
    }
 };
diff --git a/src/TNL/Devices/Cuda_impl.h b/src/TNL/Devices/Cuda_impl.h
index 234f45b720..07e2c1ddcb 100644
--- a/src/TNL/Devices/Cuda_impl.h
+++ b/src/TNL/Devices/Cuda_impl.h
@@ -294,6 +294,7 @@ __device__ Element* Cuda::getSharedMemory()
 {
    return CudaSharedMemory< Element >();
 }
+#endif
 
 #ifdef HAVE_CUDA
 inline void Cuda::checkDevice( const char* file_name, int line, cudaError error )
@@ -326,6 +327,8 @@ inline bool Cuda::synchronizeDevice( int deviceId )
    getSmartPointersSynchronizationTimer().stop();
    return b;
 #endif
+#else
+   return true;
 #endif
 }
 
@@ -353,6 +356,7 @@ namespace {
 
 // double-precision atomicAdd function for Maxwell and older GPUs
 // copied from: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomic-functions
+#ifdef HAVE_CUDA
 #if __CUDA_ARCH__ < 600
 namespace {
    __device__ double atomicAdd(double* address, double val)
@@ -374,8 +378,7 @@ namespace {
    }
 } // namespace
 #endif
-
-#endif /* HAVE_CUDA */
+#endif
 
 } // namespace Devices
 } // namespace TNL
diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
index 8fff004495..89cb608810 100644
--- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
+++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
@@ -87,13 +87,14 @@ solve( const MeshPointer& mesh,
   int i = Communicators::MpiCommunicator::GetRank( Communicators::MpiCommunicator::AllGroup );
   //printf( "Hello world from rank: %d ", i );
   //Communicators::MpiCommunicator::Request r = Communicators::MpiCommunicator::ISend( auxPtr, 0, 0, Communicators::MpiCommunicator::AllGroup );
-  if( i == 1 )
+  if( i == 1 ) {
     /*for( int k = 0; k < 16*16; k++ )
       aux[ k ] = 10;*/
     printf( "1: mesh x: %d\n", mesh->getDimensions().x() );
     printf( "1: mesh y: %d\n", mesh->getDimensions().y() );
     //aux.save("aux_proc1.tnl");
-  if( i == 0 )
+  }
+  if( i == 0 ) {
     printf( "0: mesh x: %d\n", mesh->getDimensions().x() );
     printf( "0: mesh y: %d\n", mesh->getDimensions().y() );
     //aux.save("aux_proc0.tnl");
@@ -104,6 +105,7 @@ solve( const MeshPointer& mesh,
         printf("%f.2\t",aux[ k * 16 + l ] );
     printf("\n");
     }*/
+  }
     
   /*bool a = Communicators::MpiCommunicator::IsInitialized();
   if( a )
diff --git a/src/UnitTests/Containers/DistributedArrayTest.h b/src/UnitTests/Containers/DistributedArrayTest.h
index 851bd95811..f4bd358303 100644
--- a/src/UnitTests/Containers/DistributedArrayTest.h
+++ b/src/UnitTests/Containers/DistributedArrayTest.h
@@ -146,8 +146,9 @@ TYPED_TEST( DistributedArrayTest, elementwiseAccess )
       const IndexType gi = localRange.getGlobalIndex( i );
       EXPECT_EQ( localArrayView.getElement( i ), 0 );
       EXPECT_EQ( this->distributedArray.getElement( gi ), 0 );
-      if( std::is_same< typename TestFixture::DeviceType, Devices::Host >::value )
+      if( std::is_same< typename TestFixture::DeviceType, Devices::Host >::value ) {
          EXPECT_EQ( this->distributedArray[ gi ], 0 );
+      }
    }
 
    // use setValue
@@ -161,8 +162,9 @@ TYPED_TEST( DistributedArrayTest, elementwiseAccess )
       const IndexType gi = localRange.getGlobalIndex( i );
       EXPECT_EQ( localArrayView.getElement( i ), i + 1 );
       EXPECT_EQ( this->distributedArray.getElement( gi ), i + 1 );
-      if( std::is_same< typename TestFixture::DeviceType, Devices::Host >::value )
+      if( std::is_same< typename TestFixture::DeviceType, Devices::Host >::value ) {
          EXPECT_EQ( this->distributedArray[ gi ], i + 1 );
+      }
    }
 
    this->distributedArray.setValue( 0 );
diff --git a/src/UnitTests/Containers/ndarray/DistributedNDArray_1D_test.h b/src/UnitTests/Containers/ndarray/DistributedNDArray_1D_test.h
index 3dda2d1b47..2faf5ba040 100644
--- a/src/UnitTests/Containers/ndarray/DistributedNDArray_1D_test.h
+++ b/src/UnitTests/Containers/ndarray/DistributedNDArray_1D_test.h
@@ -172,8 +172,9 @@ TYPED_TEST( DistributedNDArray_1D_test, elementwiseAccess )
    for( IndexType gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) {
 //      EXPECT_EQ( localArrayView.getElement( i ), 0 );
       EXPECT_EQ( this->distributedNDArray.getElement( gi ), 0 );
-      if( std::is_same< typename TestFixture::DeviceType, Devices::Host >::value )
+      if( std::is_same< typename TestFixture::DeviceType, Devices::Host >::value ) {
          EXPECT_EQ( this->distributedNDArray[ gi ], 0 );
+      }
    }
 
    // use operator()
diff --git a/src/UnitTests/Matrices/SparseMatrixCopyTest.h b/src/UnitTests/Matrices/SparseMatrixCopyTest.h
index a38f294979..9b09ef4d45 100644
--- a/src/UnitTests/Matrices/SparseMatrixCopyTest.h
+++ b/src/UnitTests/Matrices/SparseMatrixCopyTest.h
@@ -54,34 +54,34 @@ void setupUnevenRowSizeMatrix( Matrix& m )
     rowLengths.setElement( 7,  1 );
     rowLengths.setElement( 9,  1 );
     m.setCompressedRowLengths( rowLengths );
-    
+
     int value = 1;
     for( int i = 0; i < cols - 4; i++ )  // 0th row
         m.setElement( 0, i, value++ );
-    
+
     for( int i = 3; i < cols; i++ )      // 1st row
         m.setElement( 1, i, value++ );
-    
+
     for( int i = 0; i < cols - 3; i++ )  // 2nd row
         m.setElement( 2, i, value++ );
-    
+
     for( int i = 1; i < cols; i++ )      // 3rd row
         m.setElement( 3, i, value++ );
-    
+
     for( int i = 0; i < cols - 1; i++ )  // 4th row
         m.setElement( 4, i, value++ );
-    
+
     for( int i = 0; i < cols - 4; i++ )  // 5th row
         m.setElement( 5, i, value++ );
-    
-        m.setElement( 6, 0, value++ );   // 6th row
-        
-        m.setElement( 7, 0, value++ );   // 7th row
-    
+
+    m.setElement( 6, 0, value++ );   // 6th row
+
+    m.setElement( 7, 0, value++ );   // 7th row
+
     for( int i = 0; i < cols - 1; i++ )  // 8th row 
         m.setElement( 8, i, value++ );
-    
-        m.setElement( 9, 5, value++ );   // 9th row
+
+    m.setElement( 9, 5, value++ );   // 9th row
 }
 
 template< typename Matrix >
-- 
GitLab