Merge branch 'mpi' into anselm-special-log (5b42f5bc) · Commits · TNL / tnl-dev

CMakeLists.txt

+0 −4

Original line number	Diff line number	Diff line
		@@ -409,10 +409,6 @@ endif()
		# endif()
		#endif()

		if( OPTIMIZED_VECTOR_HOST_OPERATIONS STREQUAL "yes" )
		AddCompilerFlag( "-DOPTIMIZED_VECTOR_HOST_OPERATIONS " )
		endif()

		CONFIGURE_FILE( "tnlConfig.h.in" "${PROJECT_BUILD_PATH}/TNL/tnlConfig.h" )
		INSTALL( FILES ${PROJECT_BUILD_PATH}/TNL/tnlConfig.h DESTINATION ${TNL_TARGET_INCLUDE_DIRECTORY} )

build

+0 −3

Original line number	Diff line number	Diff line
		@@ -34,7 +34,6 @@ INSTANTIATE_INT="yes"
		INSTANTIATE_LONG_DOUBLE="no"
		INSTANTIATE_DOUBLE="yes"
		INSTANTIATE_FLOAT="no"
		OPTIMIZED_VECTOR_HOST_OPERATIONS="no"

		for option in "$@"
		do
		@@ -75,7 +74,6 @@ do
		INSTANTIATE_DOUBLE="yes"
		INSTANTIATE_FLOAT="no"
		WITH_CUDA_ARCH="auto" ;;
		--optimize-vector-host-operations=* ) OPTIMIZED_VECTOR_HOST_OPERATIONS="yes" ;;
		* )
		echo "Unknown option ${option}. Use --help for more information."
		exit 1 ;;
		@@ -175,7 +173,6 @@ cmake_command=(
		-DINSTANTIATE_LONG_DOUBLE=${INSTANTIATE_LONG_DOUBLE}
		-DINSTANTIATE_INT=${INSTANTIATE_INT}
		-DINSTANTIATE_LONG_INT=${INSTANTIATE_LONG_INT}
		-DOPTIMIZED_VECTOR_HOST_OPERATIONS=${OPTIMIZED_VECTOR_HOST_OPERATIONS}
		)

		# Skip running cmake if it was already run and the cmake command is the same.

src/Benchmarks/BLAS/vector-operations.h

+0 −109

Original line number	Diff line number	Diff line
		@@ -87,23 +87,11 @@ benchmarkVectorOperations( Benchmark & benchmark,
		auto maxHost = [&]() {
		resultHost = hostVector.max();
		};
		auto maxHostGeneral = [&]() {
		Real result( 0 );
		Containers::Algorithms::ParallelReductionMax< Real > operation;
		Containers::Algorithms::Reduction< Devices::Host >::reduce(
		operation,
		hostVector.getSize(),
		hostVector.getData(),
		( Real* ) 0,
		result );
		return result;
		};
		auto maxCuda = [&]() {
		resultDevice = deviceVector.max();
		};
		benchmark.setOperation( "max", datasetSize );
		benchmark.time( reset1, "CPU", maxHost );
		benchmark.time( reset1, "CPU (general)", maxHostGeneral );
		#ifdef HAVE_CUDA
		benchmark.time( reset1, "GPU", maxCuda );
		#endif
		@@ -112,23 +100,11 @@ benchmarkVectorOperations( Benchmark & benchmark,
		auto minHost = [&]() {
		resultHost = hostVector.min();
		};
		auto minHostGeneral = [&]() {
		Real result( 0 );
		Containers::Algorithms::ParallelReductionMin< Real > operation;
		Containers::Algorithms::Reduction< Devices::Host >::reduce(
		operation,
		hostVector.getSize(),
		hostVector.getData(),
		( Real* ) 0,
		result );
		return result;
		};
		auto minCuda = [&]() {
		resultDevice = deviceVector.min();
		};
		benchmark.setOperation( "min", datasetSize );
		benchmark.time( reset1, "CPU", minHost );
		benchmark.time( reset1, "CPU (general)", minHostGeneral );
		#ifdef HAVE_CUDA
		benchmark.time( reset1, "GPU", minCuda );
		#endif
		@@ -137,17 +113,6 @@ benchmarkVectorOperations( Benchmark & benchmark,
		auto absMaxHost = [&]() {
		resultHost = hostVector.absMax();
		};
		auto absMaxHostGeneral = [&]() {
		Real result( 0 );
		Containers::Algorithms::ParallelReductionAbsMax< Real > operation;
		Containers::Algorithms::Reduction< Devices::Host >::reduce(
		operation,
		hostVector.getSize(),
		hostVector.getData(),
		( Real* ) 0,
		result );
		return result;
		};
		auto absMaxCuda = [&]() {
		resultDevice = deviceVector.absMax();
		};
		@@ -162,7 +127,6 @@ benchmarkVectorOperations( Benchmark & benchmark,
		#endif
		benchmark.setOperation( "absMax", datasetSize );
		benchmark.time( reset1, "CPU", absMaxHost );
		benchmark.time( reset1, "CPU (general)", absMaxHostGeneral );
		#ifdef HAVE_CUDA
		benchmark.time( reset1, "GPU", absMaxCuda );
		benchmark.time( reset1, "cuBLAS", absMaxCublas );
		@@ -172,17 +136,6 @@ benchmarkVectorOperations( Benchmark & benchmark,
		auto absMinHost = [&]() {
		resultHost = hostVector.absMin();
		};
		auto absMinHostGeneral = [&]() {
		Real result( 0 );
		Containers::Algorithms::ParallelReductionAbsMin< Real > operation;
		Containers::Algorithms::Reduction< Devices::Host >::reduce(
		operation,
		hostVector.getSize(),
		hostVector.getData(),
		( Real* ) 0,
		result );
		return result;
		};
		auto absMinCuda = [&]() {
		resultDevice = deviceVector.absMin();
		};
		@@ -197,7 +150,6 @@ benchmarkVectorOperations( Benchmark & benchmark,
		#endif
		benchmark.setOperation( "absMin", datasetSize );
		benchmark.time( reset1, "CPU", absMinHost );
		benchmark.time( reset1, "CPU (general)", absMinHostGeneral );
		#ifdef HAVE_CUDA
		benchmark.time( reset1, "GPU", absMinCuda );
		benchmark.time( reset1, "cuBLAS", absMinCublas );
		@@ -207,23 +159,11 @@ benchmarkVectorOperations( Benchmark & benchmark,
		auto sumHost = [&]() {
		resultHost = hostVector.sum();
		};
		auto sumHostGeneral = [&]() {
		Real result( 0 );
		Containers::Algorithms::ParallelReductionSum< Real > operation;
		Containers::Algorithms::Reduction< Devices::Host >::reduce(
		operation,
		hostVector.getSize(),
		hostVector.getData(),
		( Real* ) 0,
		result );
		return result;
		};
		auto sumCuda = [&]() {
		resultDevice = deviceVector.sum();
		};
		benchmark.setOperation( "sum", datasetSize );
		benchmark.time( reset1, "CPU", sumHost );
		benchmark.time( reset1, "CPU (general)", sumHostGeneral );
		#ifdef HAVE_CUDA
		benchmark.time( reset1, "GPU", sumCuda );
		#endif
		@@ -232,17 +172,6 @@ benchmarkVectorOperations( Benchmark & benchmark,
		auto l1normHost = [&]() {
		resultHost = hostVector.lpNorm( 1.0 );
		};
		auto l1normHostGeneral = [&]() {
		Real result( 0 );
		Containers::Algorithms::ParallelReductionAbsSum< Real > operation;
		Containers::Algorithms::Reduction< Devices::Host >::reduce(
		operation,
		hostVector.getSize(),
		hostVector.getData(),
		( Real* ) 0,
		result );
		return result;
		};
		auto l1normCuda = [&]() {
		resultDevice = deviceVector.lpNorm( 1.0 );
		};
		@@ -255,7 +184,6 @@ benchmarkVectorOperations( Benchmark & benchmark,
		#endif
		benchmark.setOperation( "l1 norm", datasetSize );
		benchmark.time( reset1, "CPU", l1normHost );
		benchmark.time( reset1, "CPU (general)", l1normHostGeneral );
		#ifdef HAVE_CUDA
		benchmark.time( reset1, "GPU", l1normCuda );
		benchmark.time( reset1, "cuBLAS", l1normCublas );
		@@ -265,17 +193,6 @@ benchmarkVectorOperations( Benchmark & benchmark,
		auto l2normHost = [&]() {
		resultHost = hostVector.lpNorm( 2.0 );
		};
		auto l2normHostGeneral = [&]() {
		Real result( 0 );
		Containers::Algorithms::ParallelReductionL2Norm< Real > operation;
		Containers::Algorithms::Reduction< Devices::Host >::reduce(
		operation,
		hostVector.getSize(),
		hostVector.getData(),
		( Real* ) 0,
		result );
		return result;
		};
		auto l2normCuda = [&]() {
		resultDevice = deviceVector.lpNorm( 2.0 );
		};
		@@ -288,7 +205,6 @@ benchmarkVectorOperations( Benchmark & benchmark,
		#endif
		benchmark.setOperation( "l2 norm", datasetSize );
		benchmark.time( reset1, "CPU", l2normHost );
		benchmark.time( reset1, "CPU (general)", l2normHostGeneral );
		#ifdef HAVE_CUDA
		benchmark.time( reset1, "GPU", l2normCuda );
		benchmark.time( reset1, "cuBLAS", l2normCublas );
		@@ -298,24 +214,11 @@ benchmarkVectorOperations( Benchmark & benchmark,
		auto l3normHost = [&]() {
		resultHost = hostVector.lpNorm( 3.0 );
		};
		auto l3normHostGeneral = [&]() {
		Real result( 0 );
		Containers::Algorithms::ParallelReductionLpNorm< Real > operation;
		operation.setPower( 3.0 );
		Containers::Algorithms::Reduction< Devices::Host >::reduce(
		operation,
		hostVector.getSize(),
		hostVector.getData(),
		( Real* ) 0,
		result );
		return result;
		};
		auto l3normCuda = [&]() {
		resultDevice = deviceVector.lpNorm( 3.0 );
		};
		benchmark.setOperation( "l3 norm", datasetSize );
		benchmark.time( reset1, "CPU", l3normHost );
		benchmark.time( reset1, "CPU (general)", l3normHostGeneral );
		#ifdef HAVE_CUDA
		benchmark.time( reset1, "GPU", l3normCuda );
		#endif
		@@ -324,17 +227,6 @@ benchmarkVectorOperations( Benchmark & benchmark,
		auto scalarProductHost = [&]() {
		resultHost = hostVector.scalarProduct( hostVector2 );
		};
		auto scalarProductHostGeneral = [&]() {
		Real result( 0 );
		Containers::Algorithms::ParallelReductionScalarProduct< Real, Real > operation;
		Containers::Algorithms::Reduction< Devices::Host >::reduce(
		operation,
		hostVector.getSize(),
		hostVector.getData(),
		hostVector2.getData(),
		result );
		return result;
		};
		auto scalarProductCuda = [&]() {
		resultDevice = deviceVector.scalarProduct( deviceVector2 );
		};
		@@ -348,7 +240,6 @@ benchmarkVectorOperations( Benchmark & benchmark,
		#endif
		benchmark.setOperation( "scalar product", 2 * datasetSize );
		benchmark.time( reset1, "CPU", scalarProductHost );
		benchmark.time( reset1, "CPU (general)", scalarProductHostGeneral );
		#ifdef HAVE_CUDA
		benchmark.time( reset1, "GPU", scalarProductCuda );
		benchmark.time( reset1, "cuBLAS", scalarProductCublas );

src/Benchmarks/HeatEquation/HeatEquationBenchmarkProblem_impl.h

+2 −2

Original line number	Diff line number	Diff line
		@@ -514,7 +514,7 @@ getExplicitUpdate( const RealType& time,
		cell.getBasis(),
		gridXIdx,
		gridYIdx );
		cudaThreadSynchronize();
		cudaDeviceSynchronize();
		TNL_CHECK_CUDA_DEVICE;

		//std::cerr << "Computing the heat equation ..." << std::endl;
		@@ -534,7 +534,7 @@ getExplicitUpdate( const RealType& time,
		cell.getBasis(),
		gridXIdx,
		gridYIdx );
		cudaThreadSynchronize();
		cudaDeviceSynchronize();
		TNL_CHECK_CUDA_DEVICE;
		}
		if( this->cudaKernelType == "templated" )

src/Benchmarks/HeatEquation/tnl-benchmark-simple-heat-equation-bug.h

+1 −1

Original line number	Diff line number	Diff line
		@@ -47,7 +47,7 @@ int main( int argc, char* argv[] )
		while( iteration < 10000 )
		{
		testKernel< GridEntity ><<< cudaGridSize, cudaBlockSize >>>();
		cudaThreadSynchronize();
		cudaDeviceSynchronize();
		iteration++;
		}
		auto t_stop = std::chrono::high_resolution_clock::now();