Loading CMakeLists.txt +7 −0 Original line number Diff line number Diff line Loading @@ -21,6 +21,7 @@ set(WITH_CUDA_ARCH "auto" CACHE STRING "Build for these CUDA architectures") option(WITH_OPENMP "Build with OpenMP support" ON) option(WITH_GMP "Build with GMP support" OFF) option(WITH_TESTS "Build tests" ON) option(WITH_PROFILING "Enable code profiling compiler flags" OFF ) option(WITH_COVERAGE "Enable code coverage reports from unit tests" OFF) option(WITH_EXAMPLES "Compile the 'examples' directory" ON) option(WITH_TOOLS "Compile the 'src/Tools' directory" ON) Loading Loading @@ -243,6 +244,11 @@ if( OPENMP_FOUND AND ${WITH_OPENMP} ) set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DHAVE_OPENMP ${OpenMP_CXX_FLAGS}" ) endif() if( ${WITH_PROFILING} ) set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g" ) set( CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --generate-line-info") endif() find_package( DCMTK ) if( DCMTK_FOUND ) set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DHAVE_DCMTK_H" ) Loading Loading @@ -414,6 +420,7 @@ message( " WITH_CUDA_ARCH = ${WITH_CUDA_ARCH}" ) message( " WITH_OPENMP = ${WITH_OPENMP}" ) message( " WITH_GMP = ${WITH_GMP}" ) message( " WITH_TESTS = ${WITH_TESTS}" ) message( " WITH_PROFILING = ${WITH_PROFILING}" ) message( " WITH_COVERAGE = ${WITH_COVERAGE}" ) message( " WITH_EXAMPLES = ${WITH_EXAMPLES}" ) message( " WITH_TOOLS = ${WITH_TOOLS}" ) Loading build +4 −0 Original line number Diff line number Diff line Loading @@ -22,6 +22,7 @@ WITH_CUDA_ARCH="auto" WITH_OPENMP="yes" WITH_GMP="no" WITH_TESTS="yes" WITH_PROFILING="no" WITH_COVERAGE="no" WITH_EXAMPLES="yes" WITH_PYTHON="yes" Loading Loading @@ -57,6 +58,7 @@ do --with-openmp=* ) WITH_OPENMP="${option#*=}" ;; --with-gmp=* ) WITH_GMP="${option#*=}" ;; --with-tests=* ) WITH_TESTS="${option#*=}" ;; --with-profiling=* ) WITH_PROFILING="${option#*=}" ;; --with-coverage=* ) WITH_COVERAGE="${option#*=}" ;; --with-examples=* ) WITH_EXAMPLES="${option#*=}" ;; --with-tools=* ) WITH_TOOLS="${option#*=}" ;; Loading Loading 
@@ -95,6 +97,7 @@ if [[ ${HELP} == "yes" ]]; then echo " --with-openmp=yes/no Enables OpenMP. 'yes' by default." echo " --with-gmp=yes/no Enables the wrapper for GNU Multiple Precision Arithmetic Library. 'no' by default." echo " --with-tests=yes/no Enables unit tests. 'yes' by default." echo " --with-profiling=yes/no Enables code profiling compiler flags. 'no' by default." echo " --with-coverage=yes/no Enables code coverage reports for unit tests. 'no' by default (lcov is required)." echo " --with-examples=yes/no Compile the 'examples' directory. 'yes' by default." echo " --with-tools=yes/no Compile the 'src/Tools' directory. 'yes' by default." Loading Loading @@ -165,6 +168,7 @@ cmake_command=( -DWITH_OPENMP=${WITH_OPENMP} -DWITH_GMP=${WITH_GMP} -DWITH_TESTS=${WITH_TESTS} -DWITH_PROFILING=${WITH_PROFILING} -DWITH_COVERAGE=${WITH_COVERAGE} -DWITH_EXAMPLES=${WITH_EXAMPLES} -DWITH_TOOLS=${WITH_TOOLS} Loading src/Benchmarks/BLAS/array-operations.h +12 −12 Original line number Diff line number Diff line Loading @@ -72,9 +72,9 @@ benchmarkArrayOperations( Benchmark & benchmark, resultDevice = (int) deviceArray == deviceArray2; }; benchmark.setOperation( "comparison (operator==)", 2 * datasetSize ); benchmark.time( reset1, "CPU", compareHost ); benchmark.time< Devices::Host >( reset1, "CPU", compareHost ); #ifdef HAVE_CUDA benchmark.time( reset1, "GPU", compareCuda ); benchmark.time< Devices::Cuda >( reset1, "GPU", compareCuda ); #endif Loading @@ -87,9 +87,9 @@ benchmarkArrayOperations( Benchmark & benchmark, benchmark.setOperation( "copy (operator=)", 2 * datasetSize ); // copyBasetime is used later inside HAVE_CUDA guard, so the compiler will // complain when compiling without CUDA const double copyBasetime = benchmark.time( reset1, "CPU", copyAssignHostHost ); const double copyBasetime = benchmark.time< Devices::Host >( reset1, "CPU", copyAssignHostHost ); #ifdef HAVE_CUDA benchmark.time( reset1, "GPU", copyAssignCudaCuda ); benchmark.time< Devices::Cuda >( reset1, 
"GPU", copyAssignCudaCuda ); #endif Loading @@ -101,8 +101,8 @@ benchmarkArrayOperations( Benchmark & benchmark, }; #ifdef HAVE_CUDA benchmark.setOperation( "copy (operator=)", datasetSize, copyBasetime ); benchmark.time( reset1, "CPU->GPU", copyAssignHostCuda ); benchmark.time( reset1, "GPU->CPU", copyAssignCudaHost ); benchmark.time< Devices::Cuda >( reset1, "CPU->GPU", copyAssignHostCuda ); benchmark.time< Devices::Cuda >( reset1, "GPU->CPU", copyAssignCudaHost ); #endif Loading @@ -113,9 +113,9 @@ benchmarkArrayOperations( Benchmark & benchmark, deviceArray.setValue( 3.0 ); }; benchmark.setOperation( "setValue", datasetSize ); benchmark.time( reset1, "CPU", setValueHost ); benchmark.time< Devices::Host >( reset1, "CPU", setValueHost ); #ifdef HAVE_CUDA benchmark.time( reset1, "GPU", setValueCuda ); benchmark.time< Devices::Cuda >( reset1, "GPU", setValueCuda ); #endif Loading @@ -132,9 +132,9 @@ benchmarkArrayOperations( Benchmark & benchmark, #endif }; benchmark.setOperation( "allocation (setSize)", datasetSize ); benchmark.time( resetSize1, "CPU", setSizeHost ); benchmark.time< Devices::Host >( resetSize1, "CPU", setSizeHost ); #ifdef HAVE_CUDA benchmark.time( resetSize1, "GPU", setSizeCuda ); benchmark.time< Devices::Cuda >( resetSize1, "GPU", setSizeCuda ); #endif Loading @@ -151,9 +151,9 @@ benchmarkArrayOperations( Benchmark & benchmark, #endif }; benchmark.setOperation( "deallocation (reset)", datasetSize ); benchmark.time( setSize1, "CPU", resetSizeHost ); benchmark.time< Devices::Host >( setSize1, "CPU", resetSizeHost ); #ifdef HAVE_CUDA benchmark.time( setSize1, "GPU", resetSizeCuda ); benchmark.time< Devices::Cuda >( setSize1, "GPU", resetSizeCuda ); #endif return true; Loading src/Benchmarks/BLAS/spmv.h +2 −2 Original line number Diff line number Diff line Loading @@ -161,9 +161,9 @@ benchmarkSpMV( Benchmark & benchmark, }; benchmark.setOperation( datasetSize ); benchmark.time( reset, "CPU", spmvHost ); benchmark.time< Devices::Host >( reset, "CPU", 
spmvHost ); #ifdef HAVE_CUDA benchmark.time( reset, "GPU", spmvCuda ); benchmark.time< Devices::Cuda >( reset, "GPU", spmvCuda ); #endif return true; Loading src/Benchmarks/BLAS/vector-operations.h +30 −30 Original line number Diff line number Diff line Loading @@ -64,7 +64,7 @@ benchmarkVectorOperations( Benchmark & benchmark, deviceVector.setValue( 1.0 ); #endif // A relatively harmless call to keep the compiler from realizing we // don't actually do any useful work with the result of the reduciton. // don't actually do any useful work with the result of the reduction. srand48(resultHost); resultHost = resultDevice = 0.0; }; Loading @@ -90,9 +90,9 @@ benchmarkVectorOperations( Benchmark & benchmark, resultDevice = deviceVector.max(); }; benchmark.setOperation( "max", datasetSize ); benchmark.time( reset1, "CPU", maxHost ); benchmark.time< Devices::Host >( reset1, "CPU", maxHost ); #ifdef HAVE_CUDA benchmark.time( reset1, "GPU", maxCuda ); benchmark.time< Devices::Cuda >( reset1, "GPU", maxCuda ); #endif Loading @@ -103,9 +103,9 @@ benchmarkVectorOperations( Benchmark & benchmark, resultDevice = deviceVector.min(); }; benchmark.setOperation( "min", datasetSize ); benchmark.time( reset1, "CPU", minHost ); benchmark.time< Devices::Host >( reset1, "CPU", minHost ); #ifdef HAVE_CUDA benchmark.time( reset1, "GPU", minCuda ); benchmark.time< Devices::Cuda >( reset1, "GPU", minCuda ); #endif Loading @@ -125,10 +125,10 @@ benchmarkVectorOperations( Benchmark & benchmark, }; #endif benchmark.setOperation( "absMax", datasetSize ); benchmark.time( reset1, "CPU", absMaxHost ); benchmark.time< Devices::Host >( reset1, "CPU", absMaxHost ); #ifdef HAVE_CUDA benchmark.time( reset1, "GPU", absMaxCuda ); benchmark.time( reset1, "cuBLAS", absMaxCublas ); benchmark.time< Devices::Cuda >( reset1, "GPU", absMaxCuda ); benchmark.time< Devices::Cuda >( reset1, "cuBLAS", absMaxCublas ); #endif Loading @@ -148,10 +148,10 @@ benchmarkVectorOperations( Benchmark & benchmark, }; #endif 
benchmark.setOperation( "absMin", datasetSize ); benchmark.time( reset1, "CPU", absMinHost ); benchmark.time< Devices::Host >( reset1, "CPU", absMinHost ); #ifdef HAVE_CUDA benchmark.time( reset1, "GPU", absMinCuda ); benchmark.time( reset1, "cuBLAS", absMinCublas ); benchmark.time< Devices::Cuda >( reset1, "GPU", absMinCuda ); benchmark.time< Devices::Cuda >( reset1, "cuBLAS", absMinCublas ); #endif Loading @@ -162,9 +162,9 @@ benchmarkVectorOperations( Benchmark & benchmark, resultDevice = deviceVector.sum(); }; benchmark.setOperation( "sum", datasetSize ); benchmark.time( reset1, "CPU", sumHost ); benchmark.time< Devices::Host >( reset1, "CPU", sumHost ); #ifdef HAVE_CUDA benchmark.time( reset1, "GPU", sumCuda ); benchmark.time< Devices::Cuda >( reset1, "GPU", sumCuda ); #endif Loading @@ -182,10 +182,10 @@ benchmarkVectorOperations( Benchmark & benchmark, }; #endif benchmark.setOperation( "l1 norm", datasetSize ); benchmark.time( reset1, "CPU", l1normHost ); benchmark.time< Devices::Host >( reset1, "CPU", l1normHost ); #ifdef HAVE_CUDA benchmark.time( reset1, "GPU", l1normCuda ); benchmark.time( reset1, "cuBLAS", l1normCublas ); benchmark.time< Devices::Cuda >( reset1, "GPU", l1normCuda ); benchmark.time< Devices::Cuda >( reset1, "cuBLAS", l1normCublas ); #endif Loading @@ -203,10 +203,10 @@ benchmarkVectorOperations( Benchmark & benchmark, }; #endif benchmark.setOperation( "l2 norm", datasetSize ); benchmark.time( reset1, "CPU", l2normHost ); benchmark.time< Devices::Host >( reset1, "CPU", l2normHost ); #ifdef HAVE_CUDA benchmark.time( reset1, "GPU", l2normCuda ); benchmark.time( reset1, "cuBLAS", l2normCublas ); benchmark.time< Devices::Cuda >( reset1, "GPU", l2normCuda ); benchmark.time< Devices::Cuda >( reset1, "cuBLAS", l2normCublas ); #endif Loading @@ -217,9 +217,9 @@ benchmarkVectorOperations( Benchmark & benchmark, resultDevice = deviceVector.lpNorm( 3.0 ); }; benchmark.setOperation( "l3 norm", datasetSize ); benchmark.time( reset1, "CPU", l3normHost 
); benchmark.time< Devices::Host >( reset1, "CPU", l3normHost ); #ifdef HAVE_CUDA benchmark.time( reset1, "GPU", l3normCuda ); benchmark.time< Devices::Cuda >( reset1, "GPU", l3normCuda ); #endif Loading @@ -238,10 +238,10 @@ benchmarkVectorOperations( Benchmark & benchmark, }; #endif benchmark.setOperation( "scalar product", 2 * datasetSize ); benchmark.time( reset1, "CPU", scalarProductHost ); benchmark.time< Devices::Host >( reset1, "CPU", scalarProductHost ); #ifdef HAVE_CUDA benchmark.time( reset1, "GPU", scalarProductCuda ); benchmark.time( reset1, "cuBLAS", scalarProductCublas ); benchmark.time< Devices::Cuda >( reset1, "GPU", scalarProductCuda ); benchmark.time< Devices::Cuda >( reset1, "cuBLAS", scalarProductCublas ); #endif /* Loading Loading @@ -289,10 +289,10 @@ benchmarkVectorOperations( Benchmark & benchmark, }; #endif benchmark.setOperation( "scalar multiplication", 2 * datasetSize ); benchmark.time( reset1, "CPU", multiplyHost ); benchmark.time< Devices::Host >( reset1, "CPU", multiplyHost ); #ifdef HAVE_CUDA benchmark.time( reset1, "GPU", multiplyCuda ); benchmark.time( reset1, "cuBLAS", multiplyCublas ); benchmark.time< Devices::Cuda >( reset1, "GPU", multiplyCuda ); benchmark.time< Devices::Cuda >( reset1, "cuBLAS", multiplyCublas ); #endif Loading @@ -312,10 +312,10 @@ benchmarkVectorOperations( Benchmark & benchmark, }; #endif benchmark.setOperation( "vector addition", 3 * datasetSize ); benchmark.time( reset1, "CPU", addVectorHost ); benchmark.time< Devices::Host >( reset1, "CPU", addVectorHost ); #ifdef HAVE_CUDA benchmark.time( reset1, "GPU", addVectorCuda ); benchmark.time( reset1, "cuBLAS", addVectorCublas ); benchmark.time< Devices::Cuda >( reset1, "GPU", addVectorCuda ); benchmark.time< Devices::Cuda >( reset1, "cuBLAS", addVectorCublas ); #endif Loading Loading
CMakeLists.txt +7 −0 Original line number Diff line number Diff line Loading @@ -21,6 +21,7 @@ set(WITH_CUDA_ARCH "auto" CACHE STRING "Build for these CUDA architectures") option(WITH_OPENMP "Build with OpenMP support" ON) option(WITH_GMP "Build with GMP support" OFF) option(WITH_TESTS "Build tests" ON) option(WITH_PROFILING "Enable code profiling compiler flags" OFF ) option(WITH_COVERAGE "Enable code coverage reports from unit tests" OFF) option(WITH_EXAMPLES "Compile the 'examples' directory" ON) option(WITH_TOOLS "Compile the 'src/Tools' directory" ON) Loading Loading @@ -243,6 +244,11 @@ if( OPENMP_FOUND AND ${WITH_OPENMP} ) set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DHAVE_OPENMP ${OpenMP_CXX_FLAGS}" ) endif() if( ${WITH_PROFILING} ) set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g" ) set( CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --generate-line-info") endif() find_package( DCMTK ) if( DCMTK_FOUND ) set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DHAVE_DCMTK_H" ) Loading Loading @@ -414,6 +420,7 @@ message( " WITH_CUDA_ARCH = ${WITH_CUDA_ARCH}" ) message( " WITH_OPENMP = ${WITH_OPENMP}" ) message( " WITH_GMP = ${WITH_GMP}" ) message( " WITH_TESTS = ${WITH_TESTS}" ) message( " WITH_PROFILING = ${WITH_PROFILING}" ) message( " WITH_COVERAGE = ${WITH_COVERAGE}" ) message( " WITH_EXAMPLES = ${WITH_EXAMPLES}" ) message( " WITH_TOOLS = ${WITH_TOOLS}" ) Loading
build +4 −0 Original line number Diff line number Diff line Loading @@ -22,6 +22,7 @@ WITH_CUDA_ARCH="auto" WITH_OPENMP="yes" WITH_GMP="no" WITH_TESTS="yes" WITH_PROFILING="no" WITH_COVERAGE="no" WITH_EXAMPLES="yes" WITH_PYTHON="yes" Loading Loading @@ -57,6 +58,7 @@ do --with-openmp=* ) WITH_OPENMP="${option#*=}" ;; --with-gmp=* ) WITH_GMP="${option#*=}" ;; --with-tests=* ) WITH_TESTS="${option#*=}" ;; --with-profiling=* ) WITH_PROFILING="${option#*=}" ;; --with-coverage=* ) WITH_COVERAGE="${option#*=}" ;; --with-examples=* ) WITH_EXAMPLES="${option#*=}" ;; --with-tools=* ) WITH_TOOLS="${option#*=}" ;; Loading Loading @@ -95,6 +97,7 @@ if [[ ${HELP} == "yes" ]]; then echo " --with-openmp=yes/no Enables OpenMP. 'yes' by default." echo " --with-gmp=yes/no Enables the wrapper for GNU Multiple Precision Arithmetic Library. 'no' by default." echo " --with-tests=yes/no Enables unit tests. 'yes' by default." echo " --with-profiling=yes/no Enables code profiling compiler flags. 'no' by default." echo " --with-coverage=yes/no Enables code coverage reports for unit tests. 'no' by default (lcov is required)." echo " --with-examples=yes/no Compile the 'examples' directory. 'yes' by default." echo " --with-tools=yes/no Compile the 'src/Tools' directory. 'yes' by default." Loading Loading @@ -165,6 +168,7 @@ cmake_command=( -DWITH_OPENMP=${WITH_OPENMP} -DWITH_GMP=${WITH_GMP} -DWITH_TESTS=${WITH_TESTS} -DWITH_PROFILING=${WITH_PROFILING} -DWITH_COVERAGE=${WITH_COVERAGE} -DWITH_EXAMPLES=${WITH_EXAMPLES} -DWITH_TOOLS=${WITH_TOOLS} Loading
src/Benchmarks/BLAS/array-operations.h +12 −12 Original line number Diff line number Diff line Loading @@ -72,9 +72,9 @@ benchmarkArrayOperations( Benchmark & benchmark, resultDevice = (int) deviceArray == deviceArray2; }; benchmark.setOperation( "comparison (operator==)", 2 * datasetSize ); benchmark.time( reset1, "CPU", compareHost ); benchmark.time< Devices::Host >( reset1, "CPU", compareHost ); #ifdef HAVE_CUDA benchmark.time( reset1, "GPU", compareCuda ); benchmark.time< Devices::Cuda >( reset1, "GPU", compareCuda ); #endif Loading @@ -87,9 +87,9 @@ benchmarkArrayOperations( Benchmark & benchmark, benchmark.setOperation( "copy (operator=)", 2 * datasetSize ); // copyBasetime is used later inside HAVE_CUDA guard, so the compiler will // complain when compiling without CUDA const double copyBasetime = benchmark.time( reset1, "CPU", copyAssignHostHost ); const double copyBasetime = benchmark.time< Devices::Host >( reset1, "CPU", copyAssignHostHost ); #ifdef HAVE_CUDA benchmark.time( reset1, "GPU", copyAssignCudaCuda ); benchmark.time< Devices::Cuda >( reset1, "GPU", copyAssignCudaCuda ); #endif Loading @@ -101,8 +101,8 @@ benchmarkArrayOperations( Benchmark & benchmark, }; #ifdef HAVE_CUDA benchmark.setOperation( "copy (operator=)", datasetSize, copyBasetime ); benchmark.time( reset1, "CPU->GPU", copyAssignHostCuda ); benchmark.time( reset1, "GPU->CPU", copyAssignCudaHost ); benchmark.time< Devices::Cuda >( reset1, "CPU->GPU", copyAssignHostCuda ); benchmark.time< Devices::Cuda >( reset1, "GPU->CPU", copyAssignCudaHost ); #endif Loading @@ -113,9 +113,9 @@ benchmarkArrayOperations( Benchmark & benchmark, deviceArray.setValue( 3.0 ); }; benchmark.setOperation( "setValue", datasetSize ); benchmark.time( reset1, "CPU", setValueHost ); benchmark.time< Devices::Host >( reset1, "CPU", setValueHost ); #ifdef HAVE_CUDA benchmark.time( reset1, "GPU", setValueCuda ); benchmark.time< Devices::Cuda >( reset1, "GPU", setValueCuda ); #endif Loading @@ -132,9 +132,9 @@ 
benchmarkArrayOperations( Benchmark & benchmark, #endif }; benchmark.setOperation( "allocation (setSize)", datasetSize ); benchmark.time( resetSize1, "CPU", setSizeHost ); benchmark.time< Devices::Host >( resetSize1, "CPU", setSizeHost ); #ifdef HAVE_CUDA benchmark.time( resetSize1, "GPU", setSizeCuda ); benchmark.time< Devices::Cuda >( resetSize1, "GPU", setSizeCuda ); #endif Loading @@ -151,9 +151,9 @@ benchmarkArrayOperations( Benchmark & benchmark, #endif }; benchmark.setOperation( "deallocation (reset)", datasetSize ); benchmark.time( setSize1, "CPU", resetSizeHost ); benchmark.time< Devices::Host >( setSize1, "CPU", resetSizeHost ); #ifdef HAVE_CUDA benchmark.time( setSize1, "GPU", resetSizeCuda ); benchmark.time< Devices::Cuda >( setSize1, "GPU", resetSizeCuda ); #endif return true; Loading
src/Benchmarks/BLAS/spmv.h +2 −2 Original line number Diff line number Diff line Loading @@ -161,9 +161,9 @@ benchmarkSpMV( Benchmark & benchmark, }; benchmark.setOperation( datasetSize ); benchmark.time( reset, "CPU", spmvHost ); benchmark.time< Devices::Host >( reset, "CPU", spmvHost ); #ifdef HAVE_CUDA benchmark.time( reset, "GPU", spmvCuda ); benchmark.time< Devices::Cuda >( reset, "GPU", spmvCuda ); #endif return true; Loading
src/Benchmarks/BLAS/vector-operations.h +30 −30 Original line number Diff line number Diff line Loading @@ -64,7 +64,7 @@ benchmarkVectorOperations( Benchmark & benchmark, deviceVector.setValue( 1.0 ); #endif // A relatively harmless call to keep the compiler from realizing we // don't actually do any useful work with the result of the reduciton. // don't actually do any useful work with the result of the reduction. srand48(resultHost); resultHost = resultDevice = 0.0; }; Loading @@ -90,9 +90,9 @@ benchmarkVectorOperations( Benchmark & benchmark, resultDevice = deviceVector.max(); }; benchmark.setOperation( "max", datasetSize ); benchmark.time( reset1, "CPU", maxHost ); benchmark.time< Devices::Host >( reset1, "CPU", maxHost ); #ifdef HAVE_CUDA benchmark.time( reset1, "GPU", maxCuda ); benchmark.time< Devices::Cuda >( reset1, "GPU", maxCuda ); #endif Loading @@ -103,9 +103,9 @@ benchmarkVectorOperations( Benchmark & benchmark, resultDevice = deviceVector.min(); }; benchmark.setOperation( "min", datasetSize ); benchmark.time( reset1, "CPU", minHost ); benchmark.time< Devices::Host >( reset1, "CPU", minHost ); #ifdef HAVE_CUDA benchmark.time( reset1, "GPU", minCuda ); benchmark.time< Devices::Cuda >( reset1, "GPU", minCuda ); #endif Loading @@ -125,10 +125,10 @@ benchmarkVectorOperations( Benchmark & benchmark, }; #endif benchmark.setOperation( "absMax", datasetSize ); benchmark.time( reset1, "CPU", absMaxHost ); benchmark.time< Devices::Host >( reset1, "CPU", absMaxHost ); #ifdef HAVE_CUDA benchmark.time( reset1, "GPU", absMaxCuda ); benchmark.time( reset1, "cuBLAS", absMaxCublas ); benchmark.time< Devices::Cuda >( reset1, "GPU", absMaxCuda ); benchmark.time< Devices::Cuda >( reset1, "cuBLAS", absMaxCublas ); #endif Loading @@ -148,10 +148,10 @@ benchmarkVectorOperations( Benchmark & benchmark, }; #endif benchmark.setOperation( "absMin", datasetSize ); benchmark.time( reset1, "CPU", absMinHost ); benchmark.time< Devices::Host >( reset1, "CPU", absMinHost ); #ifdef 
HAVE_CUDA benchmark.time( reset1, "GPU", absMinCuda ); benchmark.time( reset1, "cuBLAS", absMinCublas ); benchmark.time< Devices::Cuda >( reset1, "GPU", absMinCuda ); benchmark.time< Devices::Cuda >( reset1, "cuBLAS", absMinCublas ); #endif Loading @@ -162,9 +162,9 @@ benchmarkVectorOperations( Benchmark & benchmark, resultDevice = deviceVector.sum(); }; benchmark.setOperation( "sum", datasetSize ); benchmark.time( reset1, "CPU", sumHost ); benchmark.time< Devices::Host >( reset1, "CPU", sumHost ); #ifdef HAVE_CUDA benchmark.time( reset1, "GPU", sumCuda ); benchmark.time< Devices::Cuda >( reset1, "GPU", sumCuda ); #endif Loading @@ -182,10 +182,10 @@ benchmarkVectorOperations( Benchmark & benchmark, }; #endif benchmark.setOperation( "l1 norm", datasetSize ); benchmark.time( reset1, "CPU", l1normHost ); benchmark.time< Devices::Host >( reset1, "CPU", l1normHost ); #ifdef HAVE_CUDA benchmark.time( reset1, "GPU", l1normCuda ); benchmark.time( reset1, "cuBLAS", l1normCublas ); benchmark.time< Devices::Cuda >( reset1, "GPU", l1normCuda ); benchmark.time< Devices::Cuda >( reset1, "cuBLAS", l1normCublas ); #endif Loading @@ -203,10 +203,10 @@ benchmarkVectorOperations( Benchmark & benchmark, }; #endif benchmark.setOperation( "l2 norm", datasetSize ); benchmark.time( reset1, "CPU", l2normHost ); benchmark.time< Devices::Host >( reset1, "CPU", l2normHost ); #ifdef HAVE_CUDA benchmark.time( reset1, "GPU", l2normCuda ); benchmark.time( reset1, "cuBLAS", l2normCublas ); benchmark.time< Devices::Cuda >( reset1, "GPU", l2normCuda ); benchmark.time< Devices::Cuda >( reset1, "cuBLAS", l2normCublas ); #endif Loading @@ -217,9 +217,9 @@ benchmarkVectorOperations( Benchmark & benchmark, resultDevice = deviceVector.lpNorm( 3.0 ); }; benchmark.setOperation( "l3 norm", datasetSize ); benchmark.time( reset1, "CPU", l3normHost ); benchmark.time< Devices::Host >( reset1, "CPU", l3normHost ); #ifdef HAVE_CUDA benchmark.time( reset1, "GPU", l3normCuda ); benchmark.time< Devices::Cuda >( 
reset1, "GPU", l3normCuda ); #endif Loading @@ -238,10 +238,10 @@ benchmarkVectorOperations( Benchmark & benchmark, }; #endif benchmark.setOperation( "scalar product", 2 * datasetSize ); benchmark.time( reset1, "CPU", scalarProductHost ); benchmark.time< Devices::Host >( reset1, "CPU", scalarProductHost ); #ifdef HAVE_CUDA benchmark.time( reset1, "GPU", scalarProductCuda ); benchmark.time( reset1, "cuBLAS", scalarProductCublas ); benchmark.time< Devices::Cuda >( reset1, "GPU", scalarProductCuda ); benchmark.time< Devices::Cuda >( reset1, "cuBLAS", scalarProductCublas ); #endif /* Loading Loading @@ -289,10 +289,10 @@ benchmarkVectorOperations( Benchmark & benchmark, }; #endif benchmark.setOperation( "scalar multiplication", 2 * datasetSize ); benchmark.time( reset1, "CPU", multiplyHost ); benchmark.time< Devices::Host >( reset1, "CPU", multiplyHost ); #ifdef HAVE_CUDA benchmark.time( reset1, "GPU", multiplyCuda ); benchmark.time( reset1, "cuBLAS", multiplyCublas ); benchmark.time< Devices::Cuda >( reset1, "GPU", multiplyCuda ); benchmark.time< Devices::Cuda >( reset1, "cuBLAS", multiplyCublas ); #endif Loading @@ -312,10 +312,10 @@ benchmarkVectorOperations( Benchmark & benchmark, }; #endif benchmark.setOperation( "vector addition", 3 * datasetSize ); benchmark.time( reset1, "CPU", addVectorHost ); benchmark.time< Devices::Host >( reset1, "CPU", addVectorHost ); #ifdef HAVE_CUDA benchmark.time( reset1, "GPU", addVectorCuda ); benchmark.time( reset1, "cuBLAS", addVectorCublas ); benchmark.time< Devices::Cuda >( reset1, "GPU", addVectorCuda ); benchmark.time< Devices::Cuda >( reset1, "cuBLAS", addVectorCublas ); #endif Loading