From 7390a03b163ce4c26f310612d8700c7c842ffdb4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz> Date: Tue, 27 Aug 2019 09:56:04 +0200 Subject: [PATCH] Avoiding compiler warnings for builds without CUDA --- src/Benchmarks/BLAS/array-operations.h | 32 +- src/Benchmarks/BLAS/spmv.h | 7 +- src/Benchmarks/BLAS/vector-operations.h | 287 ++++++++---------- .../Containers/ndarray/SizesHolderHelpers.h | 6 +- src/TNL/Devices/Cuda_impl.h | 7 +- .../tnlFastSweepingMethod2D_impl.h | 6 +- .../Containers/DistributedArrayTest.h | 6 +- .../ndarray/DistributedNDArray_1D_test.h | 3 +- src/UnitTests/Matrices/SparseMatrixCopyTest.h | 26 +- 9 files changed, 185 insertions(+), 195 deletions(-) diff --git a/src/Benchmarks/BLAS/array-operations.h b/src/Benchmarks/BLAS/array-operations.h index b689c7e196..cff60c8cca 100644 --- a/src/Benchmarks/BLAS/array-operations.h +++ b/src/Benchmarks/BLAS/array-operations.h @@ -69,12 +69,12 @@ benchmarkArrayOperations( Benchmark & benchmark, auto compareHost = [&]() { resultHost = (int) ( hostArray == hostArray2 ); }; - auto compareCuda = [&]() { - resultDevice = (int) ( deviceArray == deviceArray2 ); - }; benchmark.setOperation( "comparison (operator==)", 2 * datasetSize ); benchmark.time< Devices::Host >( reset1, "CPU", compareHost ); #ifdef HAVE_CUDA + auto compareCuda = [&]() { + resultDevice = (int) ( deviceArray == deviceArray2 ); + }; benchmark.time< Devices::Cuda >( reset1, "GPU", compareCuda ); #endif @@ -82,25 +82,25 @@ benchmarkArrayOperations( Benchmark & benchmark, auto copyAssignHostHost = [&]() { hostArray = hostArray2; }; - auto copyAssignCudaCuda = [&]() { - deviceArray = deviceArray2; - }; benchmark.setOperation( "copy (operator=)", 2 * datasetSize ); // copyBasetime is used later inside HAVE_CUDA guard, so the compiler will // complain when compiling without CUDA const double copyBasetime = benchmark.time< Devices::Host >( reset1, "CPU", copyAssignHostHost ); #ifdef HAVE_CUDA + auto 
copyAssignCudaCuda = [&]() { + deviceArray = deviceArray2; + }; benchmark.time< Devices::Cuda >( reset1, "GPU", copyAssignCudaCuda ); #endif +#ifdef HAVE_CUDA auto copyAssignHostCuda = [&]() { deviceArray = hostArray; }; auto copyAssignCudaHost = [&]() { hostArray = deviceArray; }; -#ifdef HAVE_CUDA benchmark.setOperation( "copy (operator=)", datasetSize, copyBasetime ); benchmark.time< Devices::Cuda >( reset1, "CPU->GPU", copyAssignHostCuda ); benchmark.time< Devices::Cuda >( reset1, "GPU->CPU", copyAssignCudaHost ); @@ -110,12 +110,12 @@ benchmarkArrayOperations( Benchmark & benchmark, auto setValueHost = [&]() { hostArray.setValue( 3.0 ); }; - auto setValueCuda = [&]() { - deviceArray.setValue( 3.0 ); - }; benchmark.setOperation( "setValue", datasetSize ); benchmark.time< Devices::Host >( reset1, "CPU", setValueHost ); #ifdef HAVE_CUDA + auto setValueCuda = [&]() { + deviceArray.setValue( 3.0 ); + }; benchmark.time< Devices::Cuda >( reset1, "GPU", setValueCuda ); #endif @@ -123,9 +123,6 @@ benchmarkArrayOperations( Benchmark & benchmark, auto setSizeHost = [&]() { hostArray.setSize( size ); }; - auto setSizeCuda = [&]() { - deviceArray.setSize( size ); - }; auto resetSize1 = [&]() { hostArray.reset(); #ifdef HAVE_CUDA @@ -135,6 +132,9 @@ benchmarkArrayOperations( Benchmark & benchmark, benchmark.setOperation( "allocation (setSize)", datasetSize ); benchmark.time< Devices::Host >( resetSize1, "CPU", setSizeHost ); #ifdef HAVE_CUDA + auto setSizeCuda = [&]() { + deviceArray.setSize( size ); + }; benchmark.time< Devices::Cuda >( resetSize1, "GPU", setSizeCuda ); #endif @@ -142,9 +142,6 @@ benchmarkArrayOperations( Benchmark & benchmark, auto resetSizeHost = [&]() { hostArray.reset(); }; - auto resetSizeCuda = [&]() { - deviceArray.reset(); - }; auto setSize1 = [&]() { hostArray.setSize( size ); #ifdef HAVE_CUDA @@ -154,6 +151,9 @@ benchmarkArrayOperations( Benchmark & benchmark, benchmark.setOperation( "deallocation (reset)", datasetSize ); benchmark.time< 
Devices::Host >( setSize1, "CPU", resetSizeHost ); #ifdef HAVE_CUDA + auto resetSizeCuda = [&]() { + deviceArray.reset(); + }; benchmark.time< Devices::Cuda >( setSize1, "GPU", resetSizeCuda ); #endif } diff --git a/src/Benchmarks/BLAS/spmv.h b/src/Benchmarks/BLAS/spmv.h index 7f114e514a..b6c91a2470 100644 --- a/src/Benchmarks/BLAS/spmv.h +++ b/src/Benchmarks/BLAS/spmv.h @@ -155,13 +155,12 @@ benchmarkSpMV( Benchmark & benchmark, auto spmvHost = [&]() { hostMatrix.vectorProduct( hostVector, hostVector2 ); }; - auto spmvCuda = [&]() { - deviceMatrix.vectorProduct( deviceVector, deviceVector2 ); - }; - benchmark.setOperation( datasetSize ); benchmark.time< Devices::Host >( reset, "CPU", spmvHost ); #ifdef HAVE_CUDA + auto spmvCuda = [&]() { + deviceMatrix.vectorProduct( deviceVector, deviceVector2 ); + }; benchmark.time< Devices::Cuda >( reset, "GPU", spmvCuda ); #endif } diff --git a/src/Benchmarks/BLAS/vector-operations.h b/src/Benchmarks/BLAS/vector-operations.h index 80c63020dc..f2b22c7dfa 100644 --- a/src/Benchmarks/BLAS/vector-operations.h +++ b/src/Benchmarks/BLAS/vector-operations.h @@ -114,20 +114,19 @@ benchmarkVectorOperations( Benchmark & benchmark, auto maxHost = [&]() { resultHost = Benchmarks::CommonVectorOperations< Devices::Host >::getVectorMax( hostVector ); }; - auto maxCuda = [&]() { - resultDevice = Benchmarks::CommonVectorOperations< Devices::Cuda >::getVectorMax( deviceVector ); - }; auto maxHostET = [&]() { resultHost = max( hostView ); }; - auto maxCudaET = [&]() { - resultDevice = max( deviceView ); - }; - benchmark.setOperation( "max", datasetSize ); benchmark.time< Devices::Host >( reset1, "CPU legacy", maxHost ); benchmark.time< Devices::Host >( reset1, "CPU ET", maxHostET ); #ifdef HAVE_CUDA + auto maxCuda = [&]() { + resultDevice = Benchmarks::CommonVectorOperations< Devices::Cuda >::getVectorMax( deviceVector ); + }; + auto maxCudaET = [&]() { + resultDevice = max( deviceView ); + }; benchmark.time< Devices::Cuda >( reset1, "GPU 
legacy", maxCuda ); benchmark.time< Devices::Cuda >( reset1, "GPU ET", maxCudaET ); #endif @@ -137,19 +136,19 @@ benchmarkVectorOperations( Benchmark & benchmark, auto minHost = [&]() { resultHost = Benchmarks::CommonVectorOperations< Devices::Host >::getVectorMin( hostVector ); }; - auto minCuda = [&]() { - resultDevice = Benchmarks::CommonVectorOperations< Devices::Cuda >::getVectorMin( deviceVector ); - }; auto minHostET = [&]() { resultHost = min( hostView ); }; - auto minCudaET = [&]() { - resultDevice = min( deviceView ); - }; benchmark.setOperation( "min", datasetSize ); benchmark.time< Devices::Host >( reset1, "CPU legacy", minHost ); benchmark.time< Devices::Host >( reset1, "CPU ET", minHostET ); #ifdef HAVE_CUDA + auto minCuda = [&]() { + resultDevice = Benchmarks::CommonVectorOperations< Devices::Cuda >::getVectorMin( deviceVector ); + }; + auto minCudaET = [&]() { + resultDevice = min( deviceView ); + }; benchmark.time< Devices::Cuda >( reset1, "GPU legacy", minCuda ); benchmark.time< Devices::Cuda >( reset1, "GPU ET", minCudaET ); #endif @@ -159,22 +158,28 @@ benchmarkVectorOperations( Benchmark & benchmark, auto absMaxHost = [&]() { resultHost = Benchmarks::CommonVectorOperations< Devices::Host >::getVectorAbsMax( hostVector ); }; - auto absMaxCuda = [&]() { - resultDevice = Benchmarks::CommonVectorOperations< Devices::Cuda >::getVectorAbsMax( deviceVector ); - }; auto absMaxHostET = [&]() { resultHost = max( abs( hostView ) ); }; - auto absMaxCudaET = [&]() { - resultDevice = max( abs( deviceView ) ); - }; #ifdef HAVE_BLAS auto absMaxBlas = [&]() { int index = blasIgamax( size, hostVector.getData(), 1 ); resultHost = hostVector.getElement( index ); }; +#endif + benchmark.setOperation( "absMax", datasetSize ); + benchmark.time< Devices::Host >( reset1, "CPU legacy", absMaxHost ); + benchmark.time< Devices::Host >( reset1, "CPU ET", absMaxHostET ); +#ifdef HAVE_BLAS + benchmark.time< Devices::Host >( reset1, "CPU BLAS", absMaxBlas ); #endif #ifdef 
HAVE_CUDA + auto absMaxCuda = [&]() { + resultDevice = Benchmarks::CommonVectorOperations< Devices::Cuda >::getVectorAbsMax( deviceVector ); + }; + auto absMaxCudaET = [&]() { + resultDevice = max( abs( deviceView ) ); + }; auto absMaxCublas = [&]() { int index = 0; cublasIgamax( cublasHandle, size, @@ -182,14 +187,6 @@ benchmarkVectorOperations( Benchmark & benchmark, &index ); resultDevice = deviceVector.getElement( index ); }; -#endif - benchmark.setOperation( "absMax", datasetSize ); - benchmark.time< Devices::Host >( reset1, "CPU legacy", absMaxHost ); - benchmark.time< Devices::Host >( reset1, "CPU ET", absMaxHostET ); -#ifdef HAVE_BLAS - benchmark.time< Devices::Host >( reset1, "CPU BLAS", absMaxBlas ); -#endif -#ifdef HAVE_CUDA benchmark.time< Devices::Cuda >( reset1, "GPU legacy", absMaxCuda ); benchmark.time< Devices::Cuda >( reset1, "GPU ET", absMaxCudaET ); benchmark.time< Devices::Cuda >( reset1, "cuBLAS", absMaxCublas ); @@ -200,22 +197,26 @@ benchmarkVectorOperations( Benchmark & benchmark, auto absMinHost = [&]() { resultHost = Benchmarks::CommonVectorOperations< Devices::Host >::getVectorAbsMin( hostVector ); }; - auto absMinCuda = [&]() { - resultDevice = Benchmarks::CommonVectorOperations< Devices::Cuda >::getVectorAbsMin( deviceVector ); - }; auto absMinHostET = [&]() { resultHost = min( abs( hostView ) ); }; - auto absMinCudaET = [&]() { - resultDevice = min( abs( deviceView ) ); - }; /*#ifdef HAVE_BLAS auto absMinBlas = [&]() { int index = blasIgamin( size, hostVector.getData(), 1 ); resultHost = hostVector.getElement( index ); }; #endif*/ + benchmark.setOperation( "absMin", datasetSize ); + benchmark.time< Devices::Host >( reset1, "CPU legacy", absMinHost ); + benchmark.time< Devices::Host >( reset1, "CPU ET", absMinHostET ); + //benchmark.time< Devices::Host >( reset1, "CPU BLAS", absMinBlas ); #ifdef HAVE_CUDA + auto absMinCuda = [&]() { + resultDevice = Benchmarks::CommonVectorOperations< Devices::Cuda >::getVectorAbsMin( deviceVector ); + 
}; + auto absMinCudaET = [&]() { + resultDevice = min( abs( deviceView ) ); + }; auto absMinCublas = [&]() { int index = 0; cublasIgamin( cublasHandle, size, @@ -223,12 +224,6 @@ benchmarkVectorOperations( Benchmark & benchmark, &index ); resultDevice = deviceVector.getElement( index ); }; -#endif - benchmark.setOperation( "absMin", datasetSize ); - benchmark.time< Devices::Host >( reset1, "CPU legacy", absMinHost ); - benchmark.time< Devices::Host >( reset1, "CPU ET", absMinHostET ); - //benchmark.time< Devices::Host >( reset1, "CPU BLAS", absMinBlas ); -#ifdef HAVE_CUDA benchmark.time< Devices::Cuda >( reset1, "GPU legacy", absMinCuda ); benchmark.time< Devices::Cuda >( reset1, "GPU ET", absMinCudaET ); benchmark.time< Devices::Cuda >( reset1, "cuBLAS", absMinCublas ); @@ -239,19 +234,19 @@ benchmarkVectorOperations( Benchmark & benchmark, auto sumHost = [&]() { resultHost = Benchmarks::CommonVectorOperations< Devices::Host >::getVectorSum( hostVector ); }; - auto sumCuda = [&]() { - resultDevice = Benchmarks::CommonVectorOperations< Devices::Cuda >::getVectorSum( deviceVector ); - }; auto sumHostET = [&]() { resultHost = sum( hostView ); }; - auto sumCudaET = [&]() { - resultDevice = sum( deviceView ); - }; benchmark.setOperation( "sum", datasetSize ); benchmark.time< Devices::Host >( reset1, "CPU legacy", sumHost ); benchmark.time< Devices::Host >( reset1, "CPU ET", sumHostET ); #ifdef HAVE_CUDA + auto sumCuda = [&]() { + resultDevice = Benchmarks::CommonVectorOperations< Devices::Cuda >::getVectorSum( deviceVector ); + }; + auto sumCudaET = [&]() { + resultDevice = sum( deviceView ); + }; benchmark.time< Devices::Cuda >( reset1, "GPU legacy", sumCuda ); benchmark.time< Devices::Cuda >( reset1, "GPU ET", sumCudaET ); #endif @@ -261,26 +256,13 @@ benchmarkVectorOperations( Benchmark & benchmark, auto l1normHost = [&]() { resultHost = Benchmarks::CommonVectorOperations< Devices::Host >::getVectorLpNorm( hostVector, 1.0 ); }; - auto l1normCuda = [&]() { - 
resultDevice = Benchmarks::CommonVectorOperations< Devices::Cuda >::getVectorLpNorm( deviceVector, 1.0 ); - }; auto l1normHostET = [&]() { resultHost = lpNorm( hostView, 1.0 ); }; - auto l1normCudaET = [&]() { - resultDevice = lpNorm( deviceView, 1.0 ); - }; #ifdef HAVE_BLAS auto l1normBlas = [&]() { resultHost = blasGasum( size, hostVector.getData(), 1 ); }; -#endif -#ifdef HAVE_CUDA - auto l1normCublas = [&]() { - cublasGasum( cublasHandle, size, - deviceVector.getData(), 1, - &resultDevice ); - }; #endif benchmark.setOperation( "l1 norm", datasetSize ); benchmark.time< Devices::Host >( reset1, "CPU legacy", l1normHost ); @@ -289,6 +271,17 @@ benchmarkVectorOperations( Benchmark & benchmark, benchmark.time< Devices::Host >( reset1, "CPU BLAS", l1normBlas ); #endif #ifdef HAVE_CUDA + auto l1normCuda = [&]() { + resultDevice = Benchmarks::CommonVectorOperations< Devices::Cuda >::getVectorLpNorm( deviceVector, 1.0 ); + }; + auto l1normCudaET = [&]() { + resultDevice = lpNorm( deviceView, 1.0 ); + }; + auto l1normCublas = [&]() { + cublasGasum( cublasHandle, size, + deviceVector.getData(), 1, + &resultDevice ); + }; benchmark.time< Devices::Cuda >( reset1, "GPU legacy", l1normCuda ); benchmark.time< Devices::Cuda >( reset1, "GPU ET", l1normCudaET ); benchmark.time< Devices::Cuda >( reset1, "cuBLAS", l1normCublas ); @@ -299,26 +292,13 @@ benchmarkVectorOperations( Benchmark & benchmark, auto l2normHost = [&]() { resultHost = Benchmarks::CommonVectorOperations< Devices::Host >::getVectorLpNorm( hostVector, 2.0 ); }; - auto l2normCuda = [&]() { - resultDevice = Benchmarks::CommonVectorOperations< Devices::Cuda >::getVectorLpNorm( deviceVector, 2.0 ); - }; auto l2normHostET = [&]() { resultHost = lpNorm( hostView, 2.0 ); }; - auto l2normCudaET = [&]() { - resultDevice = lpNorm( deviceView, 2.0 ); - }; #ifdef HAVE_BLAS auto l2normBlas = [&]() { resultHost = blasGnrm2( size, hostVector.getData(), 1 ); }; -#endif -#ifdef HAVE_CUDA - auto l2normCublas = [&]() { - 
cublasGnrm2( cublasHandle, size, - deviceVector.getData(), 1, - &resultDevice ); - }; #endif benchmark.setOperation( "l2 norm", datasetSize ); benchmark.time< Devices::Host >( reset1, "CPU legacy", l2normHost ); @@ -327,6 +307,17 @@ benchmarkVectorOperations( Benchmark & benchmark, benchmark.time< Devices::Host >( reset1, "CPU BLAS", l2normBlas ); #endif #ifdef HAVE_CUDA + auto l2normCuda = [&]() { + resultDevice = Benchmarks::CommonVectorOperations< Devices::Cuda >::getVectorLpNorm( deviceVector, 2.0 ); + }; + auto l2normCudaET = [&]() { + resultDevice = lpNorm( deviceView, 2.0 ); + }; + auto l2normCublas = [&]() { + cublasGnrm2( cublasHandle, size, + deviceVector.getData(), 1, + &resultDevice ); + }; benchmark.time< Devices::Cuda >( reset1, "GPU legacy", l2normCuda ); benchmark.time< Devices::Cuda >( reset1, "GPU ET", l2normCudaET ); benchmark.time< Devices::Cuda >( reset1, "cuBLAS", l2normCublas ); @@ -337,19 +328,19 @@ benchmarkVectorOperations( Benchmark & benchmark, auto l3normHost = [&]() { resultHost = Benchmarks::CommonVectorOperations< Devices::Host >::getVectorLpNorm( hostVector, 3.0 ); }; - auto l3normCuda = [&]() { - resultDevice = Benchmarks::CommonVectorOperations< Devices::Cuda >::getVectorLpNorm( deviceVector, 3.0 ); - }; auto l3normHostET = [&]() { resultHost = lpNorm( hostView, 3.0 ); }; - auto l3normCudaET = [&]() { - resultDevice = lpNorm( deviceView, 3.0 ); - }; benchmark.setOperation( "l3 norm", datasetSize ); benchmark.time< Devices::Host >( reset1, "CPU legacy", l3normHost ); benchmark.time< Devices::Host >( reset1, "CPU ET", l3normHostET ); #ifdef HAVE_CUDA + auto l3normCuda = [&]() { + resultDevice = Benchmarks::CommonVectorOperations< Devices::Cuda >::getVectorLpNorm( deviceVector, 3.0 ); + }; + auto l3normCudaET = [&]() { + resultDevice = lpNorm( deviceView, 3.0 ); + }; benchmark.time< Devices::Cuda >( reset1, "GPU legacy", l3normCuda ); benchmark.time< Devices::Cuda >( reset1, "GPU ET", l3normCudaET ); #endif @@ -359,27 +350,13 @@ 
benchmarkVectorOperations( Benchmark & benchmark, auto scalarProductHost = [&]() { resultHost = Benchmarks::CommonVectorOperations< Devices::Host >::getScalarProduct( hostVector, hostVector2 ); }; - auto scalarProductCuda = [&]() { - resultDevice = Benchmarks::CommonVectorOperations< Devices::Cuda >::getScalarProduct( deviceVector, deviceVector2 ); - }; auto scalarProductHostET = [&]() { resultHost = ( hostVector, hostVector2 ); }; - auto scalarProductCudaET = [&]() { - resultDevice = ( deviceView, deviceView2 ); - }; #ifdef HAVE_BLAS auto scalarProductBlas = [&]() { resultHost = blasGdot( size, hostVector.getData(), 1, hostVector2.getData(), 1 ); }; -#endif -#ifdef HAVE_CUDA - auto scalarProductCublas = [&]() { - cublasGdot( cublasHandle, size, - deviceVector.getData(), 1, - deviceVector2.getData(), 1, - &resultDevice ); - }; #endif benchmark.setOperation( "scalar product", 2 * datasetSize ); benchmark.time< Devices::Host >( reset1, "CPU legacy", scalarProductHost ); @@ -388,6 +365,18 @@ benchmarkVectorOperations( Benchmark & benchmark, benchmark.time< Devices::Host >( reset1, "CPU BLAS", scalarProductBlas ); #endif #ifdef HAVE_CUDA + auto scalarProductCuda = [&]() { + resultDevice = Benchmarks::CommonVectorOperations< Devices::Cuda >::getScalarProduct( deviceVector, deviceVector2 ); + }; + auto scalarProductCudaET = [&]() { + resultDevice = ( deviceView, deviceView2 ); + }; + auto scalarProductCublas = [&]() { + cublasGdot( cublasHandle, size, + deviceVector.getData(), 1, + deviceVector2.getData(), 1, + &resultDevice ); + }; benchmark.time< Devices::Cuda >( reset1, "GPU legacy", scalarProductCuda ); benchmark.time< Devices::Cuda >( reset1, "GPU ET", scalarProductCudaET ); benchmark.time< Devices::Cuda >( reset1, "cuBLAS", scalarProductCublas ); @@ -398,28 +387,26 @@ benchmarkVectorOperations( Benchmark & benchmark, auto multiplyHost = [&]() { hostVector *= 0.5; }; - auto multiplyCuda = [&]() { - deviceVector *= 0.5; - }; #ifdef HAVE_BLAS auto multiplyBlas = [&]() 
{ blasGscal( hostVector.getSize(), (Real) 0.5, hostVector.getData(), 1 ); }; +#endif + benchmark.setOperation( "scalar multiplication", 2 * datasetSize ); + benchmark.time< Devices::Host >( reset1, "CPU ET", multiplyHost ); +#ifdef HAVE_BLAS + benchmark.time< Devices::Host >( reset1, "CPU BLAS", multiplyBlas ); #endif #ifdef HAVE_CUDA + auto multiplyCuda = [&]() { + deviceVector *= 0.5; + }; auto multiplyCublas = [&]() { const Real alpha = 0.5; cublasGscal( cublasHandle, size, &alpha, deviceVector.getData(), 1 ); }; -#endif - benchmark.setOperation( "scalar multiplication", 2 * datasetSize ); - benchmark.time< Devices::Host >( reset1, "CPU ET", multiplyHost ); -#ifdef HAVE_BLAS - benchmark.time< Devices::Host >( reset1, "CPU BLAS", multiplyBlas ); -#endif -#ifdef HAVE_CUDA benchmark.time< Devices::Cuda >( reset1, "GPU ET", multiplyCuda ); benchmark.time< Devices::Cuda >( reset1, "cuBLAS", multiplyCublas ); #endif @@ -429,15 +416,9 @@ benchmarkVectorOperations( Benchmark & benchmark, auto addVectorHost = [&]() { Benchmarks::VectorOperations< Devices::Host >::addVector( hostVector, hostVector2, (Real) 1.0, (Real) 1.0 ); }; - auto addVectorCuda = [&]() { - Benchmarks::VectorOperations< Devices::Cuda >::addVector( deviceVector, deviceVector2, (Real) 1.0, (Real) 1.0 ); - }; auto addVectorHostET = [&]() { hostView += hostView2; }; - auto addVectorCudaET = [&]() { - deviceView += deviceView2; - }; #ifdef HAVE_BLAS auto addVectorBlas = [&]() { const Real alpha = 1.0; @@ -445,8 +426,20 @@ benchmarkVectorOperations( Benchmark & benchmark, hostVector2.getData(), 1, hostVector.getData(), 1 ); }; +#endif + benchmark.setOperation( "vector addition", 3 * datasetSize ); + benchmark.time< Devices::Host >( resetAll, "CPU legacy", addVectorHost ); + benchmark.time< Devices::Host >( resetAll, "CPU ET", addVectorHostET ); +#ifdef HAVE_BLAS + benchmark.time< Devices::Host >( resetAll, "CPU BLAS", addVectorBlas ); #endif #ifdef HAVE_CUDA + auto addVectorCuda = [&]() { + 
Benchmarks::VectorOperations< Devices::Cuda >::addVector( deviceVector, deviceVector2, (Real) 1.0, (Real) 1.0 ); + }; + auto addVectorCudaET = [&]() { + deviceView += deviceView2; + }; auto addVectorCublas = [&]() { const Real alpha = 1.0; cublasGaxpy( cublasHandle, size, @@ -454,14 +447,6 @@ benchmarkVectorOperations( Benchmark & benchmark, deviceVector2.getData(), 1, deviceVector.getData(), 1 ); }; -#endif - benchmark.setOperation( "vector addition", 3 * datasetSize ); - benchmark.time< Devices::Host >( resetAll, "CPU legacy", addVectorHost ); - benchmark.time< Devices::Host >( resetAll, "CPU ET", addVectorHostET ); -#ifdef HAVE_BLAS - benchmark.time< Devices::Host >( resetAll, "CPU BLAS", addVectorBlas ); -#endif -#ifdef HAVE_CUDA benchmark.time< Devices::Cuda >( resetAll, "GPU legacy", addVectorCuda ); benchmark.time< Devices::Cuda >( resetAll, "GPU ET", addVectorCudaET ); benchmark.time< Devices::Cuda >( resetAll, "cuBLAS", addVectorCublas ); @@ -473,16 +458,9 @@ benchmarkVectorOperations( Benchmark & benchmark, Benchmarks::VectorOperations< Devices::Host >::addVector( hostVector, hostVector2, (Real) 1.0, (Real) 1.0 ); Benchmarks::VectorOperations< Devices::Host >::addVector( hostVector, hostVector3, (Real) 1.0, (Real) 1.0 ); }; - auto addTwoVectorsCuda = [&]() { - Benchmarks::VectorOperations< Devices::Cuda >::addVector( deviceVector, deviceVector2, (Real) 1.0, (Real) 1.0 ); - Benchmarks::VectorOperations< Devices::Cuda >::addVector( deviceVector, deviceVector3, (Real) 1.0, (Real) 1.0 ); - }; auto addTwoVectorsHostET = [&]() { hostView += hostView2 + hostView3; }; - auto addTwoVectorsCudaET = [&]() { - deviceView += deviceView2 + deviceView3; - }; #ifdef HAVE_BLAS auto addTwoVectorsBlas = [&]() { const Real alpha = 1.0; @@ -493,8 +471,21 @@ benchmarkVectorOperations( Benchmark & benchmark, hostVector3.getData(), 1, hostVector.getData(), 1 ); }; +#endif + benchmark.setOperation( "two vectors addition", 4 * datasetSize ); + benchmark.time< Devices::Host >( 
resetAll, "CPU legacy", addTwoVectorsHost ); + benchmark.time< Devices::Host >( resetAll, "CPU ET", addTwoVectorsHostET ); +#ifdef HAVE_BLAS + benchmark.time< Devices::Host >( resetAll, "CPU BLAS", addTwoVectorsBlas ); #endif #ifdef HAVE_CUDA + auto addTwoVectorsCuda = [&]() { + Benchmarks::VectorOperations< Devices::Cuda >::addVector( deviceVector, deviceVector2, (Real) 1.0, (Real) 1.0 ); + Benchmarks::VectorOperations< Devices::Cuda >::addVector( deviceVector, deviceVector3, (Real) 1.0, (Real) 1.0 ); + }; + auto addTwoVectorsCudaET = [&]() { + deviceView += deviceView2 + deviceView3; + }; auto addTwoVectorsCublas = [&]() { const Real alpha = 1.0; cublasGaxpy( cublasHandle, size, @@ -506,14 +497,6 @@ benchmarkVectorOperations( Benchmark & benchmark, deviceVector3.getData(), 1, deviceVector.getData(), 1 ); }; -#endif - benchmark.setOperation( "two vectors addition", 4 * datasetSize ); - benchmark.time< Devices::Host >( resetAll, "CPU legacy", addTwoVectorsHost ); - benchmark.time< Devices::Host >( resetAll, "CPU ET", addTwoVectorsHostET ); -#ifdef HAVE_BLAS - benchmark.time< Devices::Host >( resetAll, "CPU BLAS", addTwoVectorsBlas ); -#endif -#ifdef HAVE_CUDA benchmark.time< Devices::Cuda >( resetAll, "GPU legacy", addTwoVectorsCuda ); benchmark.time< Devices::Cuda >( resetAll, "GPU ET", addTwoVectorsCudaET ); benchmark.time< Devices::Cuda >( resetAll, "cuBLAS", addTwoVectorsCublas ); @@ -526,17 +509,9 @@ benchmarkVectorOperations( Benchmark & benchmark, Benchmarks::VectorOperations< Devices::Host >::addVector( hostVector, hostVector3, (Real) 1.0, (Real) 1.0 ); Benchmarks::VectorOperations< Devices::Host >::addVector( hostVector, hostVector4, (Real) 1.0, (Real) 1.0 ); }; - auto addThreeVectorsCuda = [&]() { - Benchmarks::VectorOperations< Devices::Cuda >::addVector( deviceVector, deviceVector2, (Real) 1.0, (Real) 1.0 ); - Benchmarks::VectorOperations< Devices::Cuda >::addVector( deviceVector, deviceVector3, (Real) 1.0, (Real) 1.0 ); - Benchmarks::VectorOperations< 
Devices::Cuda >::addVector( deviceVector, deviceVector4, (Real) 1.0, (Real) 1.0 ); - }; auto addThreeVectorsHostET = [&]() { hostView += hostView2 + hostView3 + hostView4; }; - auto addThreeVectorsCudaET = [&]() { - deviceView += deviceView2 + deviceView3 + deviceView4; - }; #ifdef HAVE_BLAS auto addThreeVectorsBlas = [&]() { const Real alpha = 1.0; @@ -546,12 +521,26 @@ benchmarkVectorOperations( Benchmark & benchmark, blasGaxpy( size, alpha, hostVector3.getData(), 1, hostVector.getData(), 1 ); - blasGaxpy( size, alpha, + blasGaxpy( size, alpha, hostVector4.getData(), 1, hostVector.getData(), 1 ); }; +#endif + benchmark.setOperation( "three vectors addition", 5 * datasetSize ); + benchmark.time< Devices::Host >( resetAll, "CPU legacy", addThreeVectorsHost ); + benchmark.time< Devices::Host >( resetAll, "CPU ET", addThreeVectorsHostET ); +#ifdef HAVE_BLAS + benchmark.time< Devices::Host >( resetAll, "CPU BLAS", addThreeVectorsBlas ); #endif #ifdef HAVE_CUDA + auto addThreeVectorsCuda = [&]() { + Benchmarks::VectorOperations< Devices::Cuda >::addVector( deviceVector, deviceVector2, (Real) 1.0, (Real) 1.0 ); + Benchmarks::VectorOperations< Devices::Cuda >::addVector( deviceVector, deviceVector3, (Real) 1.0, (Real) 1.0 ); + Benchmarks::VectorOperations< Devices::Cuda >::addVector( deviceVector, deviceVector4, (Real) 1.0, (Real) 1.0 ); + }; + auto addThreeVectorsCudaET = [&]() { + deviceView += deviceView2 + deviceView3 + deviceView4; + }; auto addThreeVectorsCublas = [&]() { const Real alpha = 1.0; cublasGaxpy( cublasHandle, size, @@ -567,14 +556,6 @@ benchmarkVectorOperations( Benchmark & benchmark, deviceVector4.getData(), 1, deviceVector.getData(), 1 ); }; -#endif - benchmark.setOperation( "three vectors addition", 5 * datasetSize ); - benchmark.time< Devices::Host >( resetAll, "CPU legacy", addThreeVectorsHost ); - benchmark.time< Devices::Host >( resetAll, "CPU ET", addThreeVectorsHostET ); -#ifdef HAVE_BLAS - benchmark.time< Devices::Host >( resetAll, "CPU 
BLAS", addThreeVectorsBlas ); -#endif -#ifdef HAVE_CUDA benchmark.time< Devices::Cuda >( resetAll, "GPU legacy", addThreeVectorsCuda ); benchmark.time< Devices::Cuda >( resetAll, "GPU ET", addThreeVectorsCudaET ); benchmark.time< Devices::Cuda >( resetAll, "cuBLAS", addThreeVectorsCublas ); @@ -585,12 +566,12 @@ benchmarkVectorOperations( Benchmark & benchmark, auto inclusivePrefixSumHost = [&]() { hostVector.prefixSum(); }; - auto inclusivePrefixSumCuda = [&]() { - deviceVector.prefixSum(); - }; benchmark.setOperation( "inclusive prefix sum", 2 * datasetSize ); benchmark.time< Devices::Host >( reset1, "CPU ET", inclusivePrefixSumHost ); #ifdef HAVE_CUDA + auto inclusivePrefixSumCuda = [&]() { + deviceVector.prefixSum(); + }; benchmark.time< Devices::Cuda >( reset1, "GPU ET", inclusivePrefixSumCuda ); #endif @@ -599,12 +580,12 @@ benchmarkVectorOperations( Benchmark & benchmark, auto exclusivePrefixSumHost = [&]() { hostVector.template prefixSum< Containers::Algorithms::PrefixSumType::Exclusive >(); }; - auto exclusivePrefixSumCuda = [&]() { - deviceVector.template prefixSum< Containers::Algorithms::PrefixSumType::Exclusive >(); - }; benchmark.setOperation( "exclusive prefix sum", 2 * datasetSize ); benchmark.time< Devices::Host >( reset1, "CPU ET", exclusivePrefixSumHost ); #ifdef HAVE_CUDA + auto exclusivePrefixSumCuda = [&]() { + deviceVector.template prefixSum< Containers::Algorithms::PrefixSumType::Exclusive >(); + }; benchmark.time< Devices::Cuda >( reset1, "GPU ET", exclusivePrefixSumCuda ); #endif diff --git a/src/TNL/Containers/ndarray/SizesHolderHelpers.h b/src/TNL/Containers/ndarray/SizesHolderHelpers.h index 2e92ed43df..9d1c0d439f 100644 --- a/src/TNL/Containers/ndarray/SizesHolderHelpers.h +++ b/src/TNL/Containers/ndarray/SizesHolderHelpers.h @@ -184,7 +184,8 @@ struct SetSizesCopyHelper target.template setSize< level >( source.template getSize< level >() ); SetSizesCopyHelper< TargetHolder, SourceHolder, level - 1 >::copy( target, source ); } - else 
if( target.template getStaticSize< level >() != source.template getSize< level >() + else if( source.template getSize< level >() < 0 || + target.template getStaticSize< level >() != (std::size_t) source.template getSize< level >() ) throw std::logic_error( "Cannot copy sizes due to inconsistent underlying types (static sizes don't match)." ); } }; @@ -198,7 +199,8 @@ struct SetSizesCopyHelper< TargetHolder, SourceHolder, 0 > { if( target.template getStaticSize< 0 >() == 0 ) target.template setSize< 0 >( source.template getSize< 0 >() ); - else if( target.template getStaticSize< 0 >() != source.template getSize< 0 >() ) + else if( source.template getSize< 0 >() < 0 || + target.template getStaticSize< 0 >() != (std::size_t) source.template getSize< 0 >() ) throw std::logic_error( "Cannot copy sizes due to inconsistent underlying types (static sizes don't match)." ); } }; diff --git a/src/TNL/Devices/Cuda_impl.h b/src/TNL/Devices/Cuda_impl.h index 234f45b720..07e2c1ddcb 100644 --- a/src/TNL/Devices/Cuda_impl.h +++ b/src/TNL/Devices/Cuda_impl.h @@ -294,6 +294,7 @@ __device__ Element* Cuda::getSharedMemory() { return CudaSharedMemory< Element >(); } +#endif #ifdef HAVE_CUDA inline void Cuda::checkDevice( const char* file_name, int line, cudaError error ) @@ -326,6 +327,8 @@ inline bool Cuda::synchronizeDevice( int deviceId ) getSmartPointersSynchronizationTimer().stop(); return b; #endif +#else + return true; #endif } @@ -353,6 +356,7 @@ namespace { // double-precision atomicAdd function for Maxwell and older GPUs // copied from: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomic-functions +#ifdef HAVE_CUDA #if __CUDA_ARCH__ < 600 namespace { __device__ double atomicAdd(double* address, double val) @@ -374,8 +378,7 @@ namespace { } } // namespace #endif - -#endif /* HAVE_CUDA */ +#endif } // namespace Devices } // namespace TNL diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h 
b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h index 8fff004495..89cb608810 100644 --- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h +++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h @@ -87,13 +87,14 @@ solve( const MeshPointer& mesh, int i = Communicators::MpiCommunicator::GetRank( Communicators::MpiCommunicator::AllGroup ); //printf( "Hello world from rank: %d ", i ); //Communicators::MpiCommunicator::Request r = Communicators::MpiCommunicator::ISend( auxPtr, 0, 0, Communicators::MpiCommunicator::AllGroup ); - if( i == 1 ) + if( i == 1 ) { /*for( int k = 0; k < 16*16; k++ ) aux[ k ] = 10;*/ printf( "1: mesh x: %d\n", mesh->getDimensions().x() ); printf( "1: mesh y: %d\n", mesh->getDimensions().y() ); //aux.save("aux_proc1.tnl"); - if( i == 0 ) + } + if( i == 0 ) { printf( "0: mesh x: %d\n", mesh->getDimensions().x() ); printf( "0: mesh y: %d\n", mesh->getDimensions().y() ); //aux.save("aux_proc0.tnl"); @@ -104,6 +105,7 @@ solve( const MeshPointer& mesh, printf("%f.2\t",aux[ k * 16 + l ] ); printf("\n"); }*/ + } /*bool a = Communicators::MpiCommunicator::IsInitialized(); if( a ) diff --git a/src/UnitTests/Containers/DistributedArrayTest.h b/src/UnitTests/Containers/DistributedArrayTest.h index 851bd95811..f4bd358303 100644 --- a/src/UnitTests/Containers/DistributedArrayTest.h +++ b/src/UnitTests/Containers/DistributedArrayTest.h @@ -146,8 +146,9 @@ TYPED_TEST( DistributedArrayTest, elementwiseAccess ) const IndexType gi = localRange.getGlobalIndex( i ); EXPECT_EQ( localArrayView.getElement( i ), 0 ); EXPECT_EQ( this->distributedArray.getElement( gi ), 0 ); - if( std::is_same< typename TestFixture::DeviceType, Devices::Host >::value ) + if( std::is_same< typename TestFixture::DeviceType, Devices::Host >::value ) { EXPECT_EQ( this->distributedArray[ gi ], 0 ); + } } // use setValue @@ -161,8 +162,9 @@ TYPED_TEST( 
DistributedArrayTest, elementwiseAccess ) const IndexType gi = localRange.getGlobalIndex( i ); EXPECT_EQ( localArrayView.getElement( i ), i + 1 ); EXPECT_EQ( this->distributedArray.getElement( gi ), i + 1 ); - if( std::is_same< typename TestFixture::DeviceType, Devices::Host >::value ) + if( std::is_same< typename TestFixture::DeviceType, Devices::Host >::value ) { EXPECT_EQ( this->distributedArray[ gi ], i + 1 ); + } } this->distributedArray.setValue( 0 ); diff --git a/src/UnitTests/Containers/ndarray/DistributedNDArray_1D_test.h b/src/UnitTests/Containers/ndarray/DistributedNDArray_1D_test.h index 3dda2d1b47..2faf5ba040 100644 --- a/src/UnitTests/Containers/ndarray/DistributedNDArray_1D_test.h +++ b/src/UnitTests/Containers/ndarray/DistributedNDArray_1D_test.h @@ -172,8 +172,9 @@ TYPED_TEST( DistributedNDArray_1D_test, elementwiseAccess ) for( IndexType gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) { // EXPECT_EQ( localArrayView.getElement( i ), 0 ); EXPECT_EQ( this->distributedNDArray.getElement( gi ), 0 ); - if( std::is_same< typename TestFixture::DeviceType, Devices::Host >::value ) + if( std::is_same< typename TestFixture::DeviceType, Devices::Host >::value ) { EXPECT_EQ( this->distributedNDArray[ gi ], 0 ); + } } // use operator() diff --git a/src/UnitTests/Matrices/SparseMatrixCopyTest.h b/src/UnitTests/Matrices/SparseMatrixCopyTest.h index a38f294979..9b09ef4d45 100644 --- a/src/UnitTests/Matrices/SparseMatrixCopyTest.h +++ b/src/UnitTests/Matrices/SparseMatrixCopyTest.h @@ -54,34 +54,34 @@ void setupUnevenRowSizeMatrix( Matrix& m ) rowLengths.setElement( 7, 1 ); rowLengths.setElement( 9, 1 ); m.setCompressedRowLengths( rowLengths ); - + int value = 1; for( int i = 0; i < cols - 4; i++ ) // 0th row m.setElement( 0, i, value++ ); - + for( int i = 3; i < cols; i++ ) // 1st row m.setElement( 1, i, value++ ); - + for( int i = 0; i < cols - 3; i++ ) // 2nd row m.setElement( 2, i, value++ ); - + for( int i = 1; i < cols; i++ ) // 3rd row 
m.setElement( 3, i, value++ ); - + for( int i = 0; i < cols - 1; i++ ) // 4th row m.setElement( 4, i, value++ ); - + for( int i = 0; i < cols - 4; i++ ) // 5th row m.setElement( 5, i, value++ ); - - m.setElement( 6, 0, value++ ); // 6th row - - m.setElement( 7, 0, value++ ); // 7th row - + + m.setElement( 6, 0, value++ ); // 6th row + + m.setElement( 7, 0, value++ ); // 7th row + for( int i = 0; i < cols - 1; i++ ) // 8th row m.setElement( 8, i, value++ ); - - m.setElement( 9, 5, value++ ); // 9th row + + m.setElement( 9, 5, value++ ); // 9th row } template< typename Matrix > -- GitLab