Commit 75a4ea7a authored by Jakub Klinkovský's avatar Jakub Klinkovský
Browse files

Merge branch 'traversers-optimizations' into 'develop'

Traversers optimizations

See merge request !20
parents 3b73e43e 5f4d085f
Loading
Loading
Loading
Loading
+7 −0
Original line number Diff line number Diff line
@@ -21,6 +21,7 @@ set(WITH_CUDA_ARCH "auto" CACHE STRING "Build for these CUDA architectures")
option(WITH_OPENMP "Build with OpenMP support" ON)
option(WITH_GMP "Build with GMP support" OFF)
option(WITH_TESTS "Build tests" ON)
option(WITH_PROFILING "Enable code profiling compiler flags" OFF)
option(WITH_COVERAGE "Enable code coverage reports from unit tests" OFF)
option(WITH_EXAMPLES "Compile the 'examples' directory" ON)
option(WITH_TOOLS "Compile the 'src/Tools' directory" ON)
@@ -243,6 +244,11 @@ if( OPENMP_FOUND AND ${WITH_OPENMP} )
   set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DHAVE_OPENMP ${OpenMP_CXX_FLAGS}" )
endif()

if( ${WITH_PROFILING} )
    set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g" )
    set( CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --generate-line-info")
endif()

find_package( DCMTK )
if( DCMTK_FOUND )
   set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DHAVE_DCMTK_H" )
@@ -414,6 +420,7 @@ message( " WITH_CUDA_ARCH = ${WITH_CUDA_ARCH}" )
message( "   WITH_OPENMP = ${WITH_OPENMP}" )
message( "   WITH_GMP = ${WITH_GMP}" )
message( "   WITH_TESTS = ${WITH_TESTS}" )
message( "   WITH_PROFILING = ${WITH_PROFILING}" )
message( "   WITH_COVERAGE = ${WITH_COVERAGE}" )
message( "   WITH_EXAMPLES = ${WITH_EXAMPLES}" )
message( "   WITH_TOOLS = ${WITH_TOOLS}" )
+4 −0
Original line number Diff line number Diff line
@@ -22,6 +22,7 @@ WITH_CUDA_ARCH="auto"
WITH_OPENMP="yes"
WITH_GMP="no"
WITH_TESTS="yes"
WITH_PROFILING="no"
WITH_COVERAGE="no"
WITH_EXAMPLES="yes"
WITH_PYTHON="yes"
@@ -57,6 +58,7 @@ do
        --with-openmp=*                  ) WITH_OPENMP="${option#*=}" ;;
        --with-gmp=*                     ) WITH_GMP="${option#*=}" ;;
        --with-tests=*                   ) WITH_TESTS="${option#*=}" ;;
        --with-profiling=*               ) WITH_PROFILING="${option#*=}" ;;
        --with-coverage=*                ) WITH_COVERAGE="${option#*=}" ;;
        --with-examples=*                ) WITH_EXAMPLES="${option#*=}" ;;
        --with-tools=*                   ) WITH_TOOLS="${option#*=}" ;;
@@ -95,6 +97,7 @@ if [[ ${HELP} == "yes" ]]; then
    echo "   --with-openmp=yes/no                  Enables OpenMP. 'yes' by default."
    echo "   --with-gmp=yes/no                     Enables the wrapper for GNU Multiple Precision Arithmetic Library. 'no' by default."
    echo "   --with-tests=yes/no                   Enables unit tests. 'yes' by default."
    echo "   --with-profiling=yes/no               Enables code profiling compiler flags. 'no' by default."
    echo "   --with-coverage=yes/no                Enables code coverage reports for unit tests. 'no' by default (lcov is required)."
    echo "   --with-examples=yes/no                Compile the 'examples' directory. 'yes' by default."
    echo "   --with-tools=yes/no                   Compile the 'src/Tools' directory. 'yes' by default."
@@ -165,6 +168,7 @@ cmake_command=(
         -DWITH_OPENMP=${WITH_OPENMP}
         -DWITH_GMP=${WITH_GMP}
         -DWITH_TESTS=${WITH_TESTS}
         -DWITH_PROFILING=${WITH_PROFILING}
         -DWITH_COVERAGE=${WITH_COVERAGE}
         -DWITH_EXAMPLES=${WITH_EXAMPLES}
         -DWITH_TOOLS=${WITH_TOOLS}
+12 −12
Original line number Diff line number Diff line
@@ -72,9 +72,9 @@ benchmarkArrayOperations( Benchmark & benchmark,
      resultDevice = (int) deviceArray == deviceArray2;
   };
   benchmark.setOperation( "comparison (operator==)", 2 * datasetSize );
   benchmark.time( reset1, "CPU", compareHost );
   benchmark.time< Devices::Host >( reset1, "CPU", compareHost );
#ifdef HAVE_CUDA
   benchmark.time( reset1, "GPU", compareCuda );
   benchmark.time< Devices::Cuda >( reset1, "GPU", compareCuda );
#endif


@@ -87,9 +87,9 @@ benchmarkArrayOperations( Benchmark & benchmark,
   benchmark.setOperation( "copy (operator=)", 2 * datasetSize );
   // copyBasetime is used later inside HAVE_CUDA guard, so the compiler will
   // complain when compiling without CUDA
   const double copyBasetime = benchmark.time( reset1, "CPU", copyAssignHostHost );
   const double copyBasetime = benchmark.time< Devices::Host >( reset1, "CPU", copyAssignHostHost );
#ifdef HAVE_CUDA
   benchmark.time( reset1, "GPU", copyAssignCudaCuda );
   benchmark.time< Devices::Cuda >( reset1, "GPU", copyAssignCudaCuda );
#endif


@@ -101,8 +101,8 @@ benchmarkArrayOperations( Benchmark & benchmark,
   };
#ifdef HAVE_CUDA
   benchmark.setOperation( "copy (operator=)", datasetSize, copyBasetime );
   benchmark.time( reset1, "CPU->GPU", copyAssignHostCuda );
   benchmark.time( reset1, "GPU->CPU", copyAssignCudaHost );
   benchmark.time< Devices::Cuda >( reset1, "CPU->GPU", copyAssignHostCuda );
   benchmark.time< Devices::Cuda >( reset1, "GPU->CPU", copyAssignCudaHost );
#endif


@@ -113,9 +113,9 @@ benchmarkArrayOperations( Benchmark & benchmark,
      deviceArray.setValue( 3.0 );
   };
   benchmark.setOperation( "setValue", datasetSize );
   benchmark.time( reset1, "CPU", setValueHost );
   benchmark.time< Devices::Host >( reset1, "CPU", setValueHost );
#ifdef HAVE_CUDA
   benchmark.time( reset1, "GPU", setValueCuda );
   benchmark.time< Devices::Cuda >( reset1, "GPU", setValueCuda );
#endif


@@ -132,9 +132,9 @@ benchmarkArrayOperations( Benchmark & benchmark,
#endif
   };
   benchmark.setOperation( "allocation (setSize)", datasetSize );
   benchmark.time( resetSize1, "CPU", setSizeHost );
   benchmark.time< Devices::Host >( resetSize1, "CPU", setSizeHost );
#ifdef HAVE_CUDA
   benchmark.time( resetSize1, "GPU", setSizeCuda );
   benchmark.time< Devices::Cuda >( resetSize1, "GPU", setSizeCuda );
#endif


@@ -151,9 +151,9 @@ benchmarkArrayOperations( Benchmark & benchmark,
#endif
   };
   benchmark.setOperation( "deallocation (reset)", datasetSize );
   benchmark.time( setSize1, "CPU", resetSizeHost );
   benchmark.time< Devices::Host >( setSize1, "CPU", resetSizeHost );
#ifdef HAVE_CUDA
   benchmark.time( setSize1, "GPU", resetSizeCuda );
   benchmark.time< Devices::Cuda >( setSize1, "GPU", resetSizeCuda );
#endif

   return true;
+2 −2
Original line number Diff line number Diff line
@@ -161,9 +161,9 @@ benchmarkSpMV( Benchmark & benchmark,
   };

   benchmark.setOperation( datasetSize );
   benchmark.time( reset, "CPU", spmvHost );
   benchmark.time< Devices::Host >( reset, "CPU", spmvHost );
#ifdef HAVE_CUDA
   benchmark.time( reset, "GPU", spmvCuda );
   benchmark.time< Devices::Cuda >( reset, "GPU", spmvCuda );
#endif

   return true;
+30 −30
Original line number Diff line number Diff line
@@ -64,7 +64,7 @@ benchmarkVectorOperations( Benchmark & benchmark,
      deviceVector.setValue( 1.0 );
#endif
      // A relatively harmless call to keep the compiler from realizing we
      // don't actually do any useful work with the result of the reduciton.
      // don't actually do any useful work with the result of the reduction.
      srand48(resultHost);
      resultHost = resultDevice = 0.0;
   };
@@ -90,9 +90,9 @@ benchmarkVectorOperations( Benchmark & benchmark,
      resultDevice = deviceVector.max();
   };
   benchmark.setOperation( "max", datasetSize );
   benchmark.time( reset1, "CPU", maxHost );
   benchmark.time< Devices::Host >( reset1, "CPU", maxHost );
#ifdef HAVE_CUDA
   benchmark.time( reset1, "GPU", maxCuda );
   benchmark.time< Devices::Cuda >( reset1, "GPU", maxCuda );
#endif


@@ -103,9 +103,9 @@ benchmarkVectorOperations( Benchmark & benchmark,
      resultDevice = deviceVector.min();
   };
   benchmark.setOperation( "min", datasetSize );
   benchmark.time( reset1, "CPU", minHost );
   benchmark.time< Devices::Host >( reset1, "CPU", minHost );
#ifdef HAVE_CUDA
   benchmark.time( reset1, "GPU", minCuda );
   benchmark.time< Devices::Cuda >( reset1, "GPU", minCuda );
#endif


@@ -125,10 +125,10 @@ benchmarkVectorOperations( Benchmark & benchmark,
   };
#endif
   benchmark.setOperation( "absMax", datasetSize );
   benchmark.time( reset1, "CPU", absMaxHost );
   benchmark.time< Devices::Host >( reset1, "CPU", absMaxHost );
#ifdef HAVE_CUDA
   benchmark.time( reset1, "GPU", absMaxCuda );
   benchmark.time( reset1, "cuBLAS", absMaxCublas );
   benchmark.time< Devices::Cuda >( reset1, "GPU", absMaxCuda );
   benchmark.time< Devices::Cuda >( reset1, "cuBLAS", absMaxCublas );
#endif


@@ -148,10 +148,10 @@ benchmarkVectorOperations( Benchmark & benchmark,
   };
#endif
   benchmark.setOperation( "absMin", datasetSize );
   benchmark.time( reset1, "CPU", absMinHost );
   benchmark.time< Devices::Host >( reset1, "CPU", absMinHost );
#ifdef HAVE_CUDA
   benchmark.time( reset1, "GPU", absMinCuda );
   benchmark.time( reset1, "cuBLAS", absMinCublas );
   benchmark.time< Devices::Cuda >( reset1, "GPU", absMinCuda );
   benchmark.time< Devices::Cuda >( reset1, "cuBLAS", absMinCublas );
#endif


@@ -162,9 +162,9 @@ benchmarkVectorOperations( Benchmark & benchmark,
      resultDevice = deviceVector.sum();
   };
   benchmark.setOperation( "sum", datasetSize );
   benchmark.time( reset1, "CPU", sumHost );
   benchmark.time< Devices::Host >( reset1, "CPU", sumHost );
#ifdef HAVE_CUDA
   benchmark.time( reset1, "GPU", sumCuda );
   benchmark.time< Devices::Cuda >( reset1, "GPU", sumCuda );
#endif


@@ -182,10 +182,10 @@ benchmarkVectorOperations( Benchmark & benchmark,
   };
#endif
   benchmark.setOperation( "l1 norm", datasetSize );
   benchmark.time( reset1, "CPU", l1normHost );
   benchmark.time< Devices::Host >( reset1, "CPU", l1normHost );
#ifdef HAVE_CUDA
   benchmark.time( reset1, "GPU", l1normCuda );
   benchmark.time( reset1, "cuBLAS", l1normCublas );
   benchmark.time< Devices::Cuda >( reset1, "GPU", l1normCuda );
   benchmark.time< Devices::Cuda >( reset1, "cuBLAS", l1normCublas );
#endif


@@ -203,10 +203,10 @@ benchmarkVectorOperations( Benchmark & benchmark,
   };
#endif
   benchmark.setOperation( "l2 norm", datasetSize );
   benchmark.time( reset1, "CPU", l2normHost );
   benchmark.time< Devices::Host >( reset1, "CPU", l2normHost );
#ifdef HAVE_CUDA
   benchmark.time( reset1, "GPU", l2normCuda );
   benchmark.time( reset1, "cuBLAS", l2normCublas );
   benchmark.time< Devices::Cuda >( reset1, "GPU", l2normCuda );
   benchmark.time< Devices::Cuda >( reset1, "cuBLAS", l2normCublas );
#endif


@@ -217,9 +217,9 @@ benchmarkVectorOperations( Benchmark & benchmark,
      resultDevice = deviceVector.lpNorm( 3.0 );
   };
   benchmark.setOperation( "l3 norm", datasetSize );
   benchmark.time( reset1, "CPU", l3normHost );
   benchmark.time< Devices::Host >( reset1, "CPU", l3normHost );
#ifdef HAVE_CUDA
   benchmark.time( reset1, "GPU", l3normCuda );
   benchmark.time< Devices::Cuda >( reset1, "GPU", l3normCuda );
#endif


@@ -238,10 +238,10 @@ benchmarkVectorOperations( Benchmark & benchmark,
   };
#endif
   benchmark.setOperation( "scalar product", 2 * datasetSize );
   benchmark.time( reset1, "CPU", scalarProductHost );
   benchmark.time< Devices::Host >( reset1, "CPU", scalarProductHost );
#ifdef HAVE_CUDA
   benchmark.time( reset1, "GPU", scalarProductCuda );
   benchmark.time( reset1, "cuBLAS", scalarProductCublas );
   benchmark.time< Devices::Cuda >( reset1, "GPU", scalarProductCuda );
   benchmark.time< Devices::Cuda >( reset1, "cuBLAS", scalarProductCublas );
#endif

   /*
@@ -289,10 +289,10 @@ benchmarkVectorOperations( Benchmark & benchmark,
   };
#endif
   benchmark.setOperation( "scalar multiplication", 2 * datasetSize );
   benchmark.time( reset1, "CPU", multiplyHost );
   benchmark.time< Devices::Host >( reset1, "CPU", multiplyHost );
#ifdef HAVE_CUDA
   benchmark.time( reset1, "GPU", multiplyCuda );
   benchmark.time( reset1, "cuBLAS", multiplyCublas );
   benchmark.time< Devices::Cuda >( reset1, "GPU", multiplyCuda );
   benchmark.time< Devices::Cuda >( reset1, "cuBLAS", multiplyCublas );
#endif


@@ -312,10 +312,10 @@ benchmarkVectorOperations( Benchmark & benchmark,
   };
#endif
   benchmark.setOperation( "vector addition", 3 * datasetSize );
   benchmark.time( reset1, "CPU", addVectorHost );
   benchmark.time< Devices::Host >( reset1, "CPU", addVectorHost );
#ifdef HAVE_CUDA
   benchmark.time( reset1, "GPU", addVectorCuda );
   benchmark.time( reset1, "cuBLAS", addVectorCublas );
   benchmark.time< Devices::Cuda >( reset1, "GPU", addVectorCuda );
   benchmark.time< Devices::Cuda >( reset1, "cuBLAS", addVectorCublas );
#endif


Loading