Commit 75a4ea7a authored by Jakub Klinkovský's avatar Jakub Klinkovský
Browse files

Merge branch 'traversers-optimizations' into 'develop'

Traversers optimizations

See merge request !20
parents 3b73e43e 5f4d085f
Loading
Loading
Loading
Loading
+7 −0
Original line number Diff line number Diff line
@@ -21,6 +21,7 @@ set(WITH_CUDA_ARCH "auto" CACHE STRING "Build for these CUDA architectures")
option(WITH_OPENMP "Build with OpenMP support" ON)
option(WITH_GMP "Build with GMP support" OFF)
option(WITH_TESTS "Build tests" ON)
option(WITH_PROFILING "Enable code profiling compiler flags" OFF)
option(WITH_COVERAGE "Enable code coverage reports from unit tests" OFF)
option(WITH_EXAMPLES "Compile the 'examples' directory" ON)
option(WITH_TOOLS "Compile the 'src/Tools' directory" ON)
@@ -243,6 +244,11 @@ if( OPENMP_FOUND AND ${WITH_OPENMP} )
   set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DHAVE_OPENMP ${OpenMP_CXX_FLAGS}" )
endif()

if( ${WITH_PROFILING} )
    set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g" )
    set( CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --generate-line-info")
endif()

find_package( DCMTK )
if( DCMTK_FOUND )
   set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DHAVE_DCMTK_H" )
@@ -414,6 +420,7 @@ message( " WITH_CUDA_ARCH = ${WITH_CUDA_ARCH}" )
message( "   WITH_OPENMP = ${WITH_OPENMP}" )
message( "   WITH_GMP = ${WITH_GMP}" )
message( "   WITH_TESTS = ${WITH_TESTS}" )
message( "   WITH_PROFILING = ${WITH_PROFILING}" )
message( "   WITH_COVERAGE = ${WITH_COVERAGE}" )
message( "   WITH_EXAMPLES = ${WITH_EXAMPLES}" )
message( "   WITH_TOOLS = ${WITH_TOOLS}" )
+4 −0
Original line number Diff line number Diff line
@@ -22,6 +22,7 @@ WITH_CUDA_ARCH="auto"
WITH_OPENMP="yes"
WITH_GMP="no"
WITH_TESTS="yes"
WITH_PROFILING="no"
WITH_COVERAGE="no"
WITH_EXAMPLES="yes"
WITH_PYTHON="yes"
@@ -57,6 +58,7 @@ do
        --with-openmp=*                  ) WITH_OPENMP="${option#*=}" ;;
        --with-gmp=*                     ) WITH_GMP="${option#*=}" ;;
        --with-tests=*                   ) WITH_TESTS="${option#*=}" ;;
        --with-profiling=*               ) WITH_PROFILING="${option#*=}" ;;
        --with-coverage=*                ) WITH_COVERAGE="${option#*=}" ;;
        --with-examples=*                ) WITH_EXAMPLES="${option#*=}" ;;
        --with-tools=*                   ) WITH_TOOLS="${option#*=}" ;;
@@ -95,6 +97,7 @@ if [[ ${HELP} == "yes" ]]; then
    echo "   --with-openmp=yes/no                  Enables OpenMP. 'yes' by default."
    echo "   --with-gmp=yes/no                     Enables the wrapper for GNU Multiple Precision Arithmetic Library. 'no' by default."
    echo "   --with-tests=yes/no                   Enables unit tests. 'yes' by default."
    echo "   --with-profiling=yes/no               Enables code profiling compiler flags. 'no' by default."
    echo "   --with-coverage=yes/no                Enables code coverage reports for unit tests. 'no' by default (lcov is required)."
    echo "   --with-examples=yes/no                Compile the 'examples' directory. 'yes' by default."
    echo "   --with-tools=yes/no                   Compile the 'src/Tools' directory. 'yes' by default."
@@ -165,6 +168,7 @@ cmake_command=(
         -DWITH_OPENMP=${WITH_OPENMP}
         -DWITH_GMP=${WITH_GMP}
         -DWITH_TESTS=${WITH_TESTS}
         -DWITH_PROFILING=${WITH_PROFILING}
         -DWITH_COVERAGE=${WITH_COVERAGE}
         -DWITH_EXAMPLES=${WITH_EXAMPLES}
         -DWITH_TOOLS=${WITH_TOOLS}
+12 −12
Original line number Diff line number Diff line
@@ -72,9 +72,9 @@ benchmarkArrayOperations( Benchmark & benchmark,
      resultDevice = (int) deviceArray == deviceArray2;
   };
   benchmark.setOperation( "comparison (operator==)", 2 * datasetSize );
   benchmark.time( reset1, "CPU", compareHost );
   benchmark.time< Devices::Host >( reset1, "CPU", compareHost );
#ifdef HAVE_CUDA
   benchmark.time( reset1, "GPU", compareCuda );
   benchmark.time< Devices::Cuda >( reset1, "GPU", compareCuda );
#endif


@@ -87,9 +87,9 @@ benchmarkArrayOperations( Benchmark & benchmark,
   benchmark.setOperation( "copy (operator=)", 2 * datasetSize );
   // copyBasetime is used later inside HAVE_CUDA guard, so the compiler will
   // complain when compiling without CUDA
   const double copyBasetime = benchmark.time( reset1, "CPU", copyAssignHostHost );
   const double copyBasetime = benchmark.time< Devices::Host >( reset1, "CPU", copyAssignHostHost );
#ifdef HAVE_CUDA
   benchmark.time( reset1, "GPU", copyAssignCudaCuda );
   benchmark.time< Devices::Cuda >( reset1, "GPU", copyAssignCudaCuda );
#endif


@@ -101,8 +101,8 @@ benchmarkArrayOperations( Benchmark & benchmark,
   };
#ifdef HAVE_CUDA
   benchmark.setOperation( "copy (operator=)", datasetSize, copyBasetime );
   benchmark.time( reset1, "CPU->GPU", copyAssignHostCuda );
   benchmark.time( reset1, "GPU->CPU", copyAssignCudaHost );
   benchmark.time< Devices::Cuda >( reset1, "CPU->GPU", copyAssignHostCuda );
   benchmark.time< Devices::Cuda >( reset1, "GPU->CPU", copyAssignCudaHost );
#endif


@@ -113,9 +113,9 @@ benchmarkArrayOperations( Benchmark & benchmark,
      deviceArray.setValue( 3.0 );
   };
   benchmark.setOperation( "setValue", datasetSize );
   benchmark.time( reset1, "CPU", setValueHost );
   benchmark.time< Devices::Host >( reset1, "CPU", setValueHost );
#ifdef HAVE_CUDA
   benchmark.time( reset1, "GPU", setValueCuda );
   benchmark.time< Devices::Cuda >( reset1, "GPU", setValueCuda );
#endif


@@ -132,9 +132,9 @@ benchmarkArrayOperations( Benchmark & benchmark,
#endif
   };
   benchmark.setOperation( "allocation (setSize)", datasetSize );
   benchmark.time( resetSize1, "CPU", setSizeHost );
   benchmark.time< Devices::Host >( resetSize1, "CPU", setSizeHost );
#ifdef HAVE_CUDA
   benchmark.time( resetSize1, "GPU", setSizeCuda );
   benchmark.time< Devices::Cuda >( resetSize1, "GPU", setSizeCuda );
#endif


@@ -151,9 +151,9 @@ benchmarkArrayOperations( Benchmark & benchmark,
#endif
   };
   benchmark.setOperation( "deallocation (reset)", datasetSize );
   benchmark.time( setSize1, "CPU", resetSizeHost );
   benchmark.time< Devices::Host >( setSize1, "CPU", resetSizeHost );
#ifdef HAVE_CUDA
   benchmark.time( setSize1, "GPU", resetSizeCuda );
   benchmark.time< Devices::Cuda >( setSize1, "GPU", resetSizeCuda );
#endif

   return true;
+2 −2
Original line number Diff line number Diff line
@@ -161,9 +161,9 @@ benchmarkSpMV( Benchmark & benchmark,
   };

   benchmark.setOperation( datasetSize );
   benchmark.time( reset, "CPU", spmvHost );
   benchmark.time< Devices::Host >( reset, "CPU", spmvHost );
#ifdef HAVE_CUDA
   benchmark.time( reset, "GPU", spmvCuda );
   benchmark.time< Devices::Cuda >( reset, "GPU", spmvCuda );
#endif

   return true;
+30 −30
Original line number Diff line number Diff line
@@ -64,7 +64,7 @@ benchmarkVectorOperations( Benchmark & benchmark,
      deviceVector.setValue( 1.0 );
#endif
      // A relatively harmless call to keep the compiler from realizing we
      // don't actually do any useful work with the result of the reduciton.
      // don't actually do any useful work with the result of the reduction.
      srand48(resultHost);
      resultHost = resultDevice = 0.0;
   };
@@ -90,9 +90,9 @@ benchmarkVectorOperations( Benchmark & benchmark,
      resultDevice = deviceVector.max();
   };
   benchmark.setOperation( "max", datasetSize );
   benchmark.time( reset1, "CPU", maxHost );
   benchmark.time< Devices::Host >( reset1, "CPU", maxHost );
#ifdef HAVE_CUDA
   benchmark.time( reset1, "GPU", maxCuda );
   benchmark.time< Devices::Cuda >( reset1, "GPU", maxCuda );
#endif


@@ -103,9 +103,9 @@ benchmarkVectorOperations( Benchmark & benchmark,
      resultDevice = deviceVector.min();
   };
   benchmark.setOperation( "min", datasetSize );
   benchmark.time( reset1, "CPU", minHost );
   benchmark.time< Devices::Host >( reset1, "CPU", minHost );
#ifdef HAVE_CUDA
   benchmark.time( reset1, "GPU", minCuda );
   benchmark.time< Devices::Cuda >( reset1, "GPU", minCuda );
#endif


@@ -125,10 +125,10 @@ benchmarkVectorOperations( Benchmark & benchmark,
   };
#endif
   benchmark.setOperation( "absMax", datasetSize );
   benchmark.time( reset1, "CPU", absMaxHost );
   benchmark.time< Devices::Host >( reset1, "CPU", absMaxHost );
#ifdef HAVE_CUDA
   benchmark.time( reset1, "GPU", absMaxCuda );
   benchmark.time( reset1, "cuBLAS", absMaxCublas );
   benchmark.time< Devices::Cuda >( reset1, "GPU", absMaxCuda );
   benchmark.time< Devices::Cuda >( reset1, "cuBLAS", absMaxCublas );
#endif


@@ -148,10 +148,10 @@ benchmarkVectorOperations( Benchmark & benchmark,
   };
#endif
   benchmark.setOperation( "absMin", datasetSize );
   benchmark.time( reset1, "CPU", absMinHost );
   benchmark.time< Devices::Host >( reset1, "CPU", absMinHost );
#ifdef HAVE_CUDA
   benchmark.time( reset1, "GPU", absMinCuda );
   benchmark.time( reset1, "cuBLAS", absMinCublas );
   benchmark.time< Devices::Cuda >( reset1, "GPU", absMinCuda );
   benchmark.time< Devices::Cuda >( reset1, "cuBLAS", absMinCublas );
#endif


@@ -162,9 +162,9 @@ benchmarkVectorOperations( Benchmark & benchmark,
      resultDevice = deviceVector.sum();
   };
   benchmark.setOperation( "sum", datasetSize );
   benchmark.time( reset1, "CPU", sumHost );
   benchmark.time< Devices::Host >( reset1, "CPU", sumHost );
#ifdef HAVE_CUDA
   benchmark.time( reset1, "GPU", sumCuda );
   benchmark.time< Devices::Cuda >( reset1, "GPU", sumCuda );
#endif


@@ -182,10 +182,10 @@ benchmarkVectorOperations( Benchmark & benchmark,
   };
#endif
   benchmark.setOperation( "l1 norm", datasetSize );
   benchmark.time( reset1, "CPU", l1normHost );
   benchmark.time< Devices::Host >( reset1, "CPU", l1normHost );
#ifdef HAVE_CUDA
   benchmark.time( reset1, "GPU", l1normCuda );
   benchmark.time( reset1, "cuBLAS", l1normCublas );
   benchmark.time< Devices::Cuda >( reset1, "GPU", l1normCuda );
   benchmark.time< Devices::Cuda >( reset1, "cuBLAS", l1normCublas );
#endif


@@ -203,10 +203,10 @@ benchmarkVectorOperations( Benchmark & benchmark,
   };
#endif
   benchmark.setOperation( "l2 norm", datasetSize );
   benchmark.time( reset1, "CPU", l2normHost );
   benchmark.time< Devices::Host >( reset1, "CPU", l2normHost );
#ifdef HAVE_CUDA
   benchmark.time( reset1, "GPU", l2normCuda );
   benchmark.time( reset1, "cuBLAS", l2normCublas );
   benchmark.time< Devices::Cuda >( reset1, "GPU", l2normCuda );
   benchmark.time< Devices::Cuda >( reset1, "cuBLAS", l2normCublas );
#endif


@@ -217,9 +217,9 @@ benchmarkVectorOperations( Benchmark & benchmark,
      resultDevice = deviceVector.lpNorm( 3.0 );
   };
   benchmark.setOperation( "l3 norm", datasetSize );
   benchmark.time( reset1, "CPU", l3normHost );
   benchmark.time< Devices::Host >( reset1, "CPU", l3normHost );
#ifdef HAVE_CUDA
   benchmark.time( reset1, "GPU", l3normCuda );
   benchmark.time< Devices::Cuda >( reset1, "GPU", l3normCuda );
#endif


@@ -238,10 +238,10 @@ benchmarkVectorOperations( Benchmark & benchmark,
   };
#endif
   benchmark.setOperation( "scalar product", 2 * datasetSize );
   benchmark.time( reset1, "CPU", scalarProductHost );
   benchmark.time< Devices::Host >( reset1, "CPU", scalarProductHost );
#ifdef HAVE_CUDA
   benchmark.time( reset1, "GPU", scalarProductCuda );
   benchmark.time( reset1, "cuBLAS", scalarProductCublas );
   benchmark.time< Devices::Cuda >( reset1, "GPU", scalarProductCuda );
   benchmark.time< Devices::Cuda >( reset1, "cuBLAS", scalarProductCublas );
#endif

   /*
@@ -289,10 +289,10 @@ benchmarkVectorOperations( Benchmark & benchmark,
   };
#endif
   benchmark.setOperation( "scalar multiplication", 2 * datasetSize );
   benchmark.time( reset1, "CPU", multiplyHost );
   benchmark.time< Devices::Host >( reset1, "CPU", multiplyHost );
#ifdef HAVE_CUDA
   benchmark.time( reset1, "GPU", multiplyCuda );
   benchmark.time( reset1, "cuBLAS", multiplyCublas );
   benchmark.time< Devices::Cuda >( reset1, "GPU", multiplyCuda );
   benchmark.time< Devices::Cuda >( reset1, "cuBLAS", multiplyCublas );
#endif


@@ -312,10 +312,10 @@ benchmarkVectorOperations( Benchmark & benchmark,
   };
#endif
   benchmark.setOperation( "vector addition", 3 * datasetSize );
   benchmark.time( reset1, "CPU", addVectorHost );
   benchmark.time< Devices::Host >( reset1, "CPU", addVectorHost );
#ifdef HAVE_CUDA
   benchmark.time( reset1, "GPU", addVectorCuda );
   benchmark.time( reset1, "cuBLAS", addVectorCublas );
   benchmark.time< Devices::Cuda >( reset1, "GPU", addVectorCuda );
   benchmark.time< Devices::Cuda >( reset1, "cuBLAS", addVectorCublas );
#endif


Loading