Refactoring BLAS benchmarks (75b18f59) · Commits · TNL / tnl-dev

CMakeLists.txt

+2 −2

Original line number	Diff line number	Diff line
		@@ -44,8 +44,8 @@ endif()
		# set Debug/Release options
		set( CMAKE_CXX_FLAGS "-std=c++11 -Wall -Wno-unused-local-typedefs -Wno-unused-variable" )
		set( CMAKE_CXX_FLAGS_DEBUG "-g" )
		set( CMAKE_CXX_FLAGS_RELEASE "-O3 -march=native -DNDEBUG" )
		#set( CMAKE_CXX_FLAGS_RELEASE "-O3 -march=native -DNDEBUG -ftree-vectorizer-verbose=1 -ftree-vectorize -fopt-info-vec-missed -funroll-loops" )
		set( CMAKE_CXX_FLAGS_RELEASE "-O3 -march=native -mtune=native -DNDEBUG" )
		#set( CMAKE_CXX_FLAGS_RELEASE "-O3 -march=native -mtune=native -DNDEBUG -ftree-vectorizer-verbose=1 -ftree-vectorize -fopt-info-vec-missed -funroll-loops" )
		# pass -rdynamic only in Debug mode
		set( CMAKE_SHARED_LIBRARY_LINK_C_FLAGS "" )
		set( CMAKE_SHARED_LIBRARY_LINK_C_FLAGS_DEBUG "-rdynamic" )

src/TNL/String.cpp

+6 −0

Original line number	Diff line number	Diff line
		@@ -55,6 +55,12 @@ String :: String( int number )
		this->setString( convertToString( number ).getString() );
		}

		// Constructs a String from an unsigned long int.
		// The members are first initialised to string = 0 and length = 0; the
		// contents are then set from convertToString( number ).getString().
		String :: String( unsigned long int number )
		: string( 0 ), length( 0 )
		{
		this->setString( convertToString( number ).getString() );
		}

		String :: String( long int number )
		: string( 0 ), length( 0 )
		{

src/TNL/String.h

+2 −0

Original line number	Diff line number	Diff line
		@@ -55,6 +55,8 @@ class String

		String( int number );

		// Constructs a String from an unsigned long int value.
		String( unsigned long int number );

		String( long int number );

		String( float number );

tests/benchmarks/CMakeLists.txt

+14 −19

Original line number	Diff line number	Diff line
		@@ -2,11 +2,11 @@ ADD_SUBDIRECTORY( share )
		ADD_SUBDIRECTORY( heat-equation-benchmark )

		IF( BUILD_CUDA )
		CUDA_ADD_EXECUTABLE( tnl-cuda-benchmarks${debugExt} tnl-cuda-benchmarks.cu )
		CUDA_ADD_EXECUTABLE( tnl-benchmark-blas${debugExt} tnl-benchmark-blas.cu )
		if( HAVE_CUBLAS STREQUAL "yes" )
		CUDA_ADD_CUBLAS_TO_TARGET( tnl-cuda-benchmarks${debugExt} )
		CUDA_ADD_CUBLAS_TO_TARGET( tnl-benchmark-blas${debugExt} )
		endif()
		TARGET_LINK_LIBRARIES( tnl-cuda-benchmarks${debugExt} tnl${debugExt}-${tnlVersion} ${CUSPARSE_LIBRARY} )
		TARGET_LINK_LIBRARIES( tnl-benchmark-blas${debugExt} tnl${debugExt}-${tnlVersion} ${CUSPARSE_LIBRARY} )

		CUDA_ADD_EXECUTABLE( tnl-benchmark-spmv${debugExt} tnl-benchmark-spmv.cu )
		TARGET_LINK_LIBRARIES( tnl-benchmark-spmv${debugExt} tnl${debugExt}-${tnlVersion} ${CUSPARSE_LIBRARY} )
		@@ -14,6 +14,9 @@ IF( BUILD_CUDA )
		CUDA_ADD_EXECUTABLE( tnl-benchmark-linear-solvers${debugExt} tnl-benchmark-linear-solvers.cu )
		TARGET_LINK_LIBRARIES( tnl-benchmark-linear-solvers${debugExt} tnl${debugExt}-${tnlVersion} ${CUSPARSE_LIBRARY} )
		ELSE()
		ADD_EXECUTABLE( tnl-benchmark-blas${debugExt} tnl-benchmark-blas.cpp )
		TARGET_LINK_LIBRARIES( tnl-benchmark-blas${debugExt} tnl${debugExt}-${tnlVersion} )

		ADD_EXECUTABLE( tnl-benchmark-spmv${debugExt} tnl-benchmark-spmv.cpp )
		TARGET_LINK_LIBRARIES( tnl-benchmark-spmv${debugExt} tnl${debugExt}-${tnlVersion} )

		@@ -21,16 +24,8 @@ ELSE()
		TARGET_LINK_LIBRARIES( tnl-benchmark-linear-solvers${debugExt} tnl${debugExt}-${tnlVersion} )
		ENDIF()

		if( BUILD_CUDA )
		INSTALL( TARGETS
		tnl-cuda-benchmarks${debugExt}
		RUNTIME DESTINATION bin )
		endif()

		INSTALL( TARGETS
		tnl-benchmark-blas${debugExt}
		tnl-benchmark-spmv${debugExt}
		tnl-benchmark-linear-solvers${debugExt}
		RUNTIME DESTINATION bin )

tests/benchmarks/array-operations.h

+38 −18

Original line number	Diff line number	Diff line
		@@ -14,7 +14,7 @@ template< typename Real = double,
		bool
		benchmarkArrayOperations( Benchmark & benchmark,
		const int & loops,
		const int & size )
		const long & size )
		{
		typedef Containers::Array< Real, Devices::Host, Index > HostArray;
		typedef Containers::Array< Real, Devices::Cuda, Index > CudaArray;
		@@ -25,9 +25,14 @@ benchmarkArrayOperations( Benchmark & benchmark,
		HostArray hostArray, hostArray2;
		CudaArray deviceArray, deviceArray2;
		if( ! hostArray.setSize( size ) ||
		! hostArray2.setSize( size ) ||
		! hostArray2.setSize( size )
		#ifdef HAVE_CUDA
		||
		! deviceArray.setSize( size ) \|\|
		! deviceArray2.setSize( size ) )
		! deviceArray2.setSize( size )
		#endif
		)

		{
		const char* msg = "error: allocation of arrays failed";
		std::cerr << msg << std::endl;
		@@ -41,11 +46,15 @@ benchmarkArrayOperations( Benchmark & benchmark,
		// reset functions
		auto reset1 = [&]() {
		hostArray.setValue( 1.0 );
		#ifdef HAVE_CUDA
		deviceArray.setValue( 1.0 );
		#endif
		};
		auto reset2 = [&]() {
		hostArray2.setValue( 1.0 );
		#ifdef HAVE_CUDA
		deviceArray2.setValue( 1.0 );
		#endif
		};
		auto reset12 = [&]() {
		reset1();
		@@ -63,9 +72,10 @@ benchmarkArrayOperations( Benchmark & benchmark,
		resultDevice = (int) deviceArray == deviceArray2;
		};
		benchmark.setOperation( "comparison (operator==)", 2 * datasetSize );
		benchmark.time( reset1,
		"CPU", compareHost,
		"GPU", compareCuda );
		benchmark.time( reset1, "CPU", compareHost );
		#ifdef HAVE_CUDA
		benchmark.time( reset1, "GPU", compareCuda );
		#endif


		auto copyAssignHostHost = [&]() {
		@@ -75,9 +85,10 @@ benchmarkArrayOperations( Benchmark & benchmark,
		deviceArray = deviceArray2;
		};
		benchmark.setOperation( "copy (operator=)", 2 * datasetSize );
		double basetime = benchmark.time( reset1,
		"CPU", copyAssignHostHost,
		"GPU", copyAssignCudaCuda );
		benchmark.time( reset1, "CPU", copyAssignHostHost );
		#ifdef HAVE_CUDA
		benchmark.time( reset1, "GPU", copyAssignCudaCuda );
		#endif


		auto copyAssignHostCuda = [&]() {
		@@ -86,10 +97,12 @@ benchmarkArrayOperations( Benchmark & benchmark,
		auto copyAssignCudaHost = [&]() {
		hostArray = deviceArray;
		};
		#ifdef HAVE_CUDA
		benchmark.setOperation( "copy (operator=)", datasetSize, basetime );
		benchmark.time( reset1,
		"CPU->GPU", copyAssignHostCuda,
		"GPU->CPU", copyAssignCudaHost );
		#endif


		auto setValueHost = [&]() {
		@@ -99,9 +112,10 @@ benchmarkArrayOperations( Benchmark & benchmark,
		deviceArray.setValue( 3.0 );
		};
		benchmark.setOperation( "setValue", datasetSize );
		benchmark.time( reset1,
		"CPU", setValueHost,
		"GPU", setValueCuda );
		benchmark.time( reset1, "CPU", setValueHost );
		#ifdef HAVE_CUDA
		benchmark.time( reset1, "GPU", setValueCuda );
		#endif


		auto setSizeHost = [&]() {
		@@ -112,12 +126,15 @@ benchmarkArrayOperations( Benchmark & benchmark,
		};
		auto resetSize1 = [&]() {
		hostArray.reset();
		#ifdef HAVE_CUDA
		deviceArray.reset();
		#endif
		};
		benchmark.setOperation( "allocation (setSize)", datasetSize );
		benchmark.time( resetSize1,
		"CPU", setSizeHost,
		"GPU", setSizeCuda );
		benchmark.time( resetSize1, "CPU", setSizeHost );
		#ifdef HAVE_CUDA
		benchmark.time( resetSize1, "GPU", setSizeCuda );
		#endif


		auto resetSizeHost = [&]() {
		@@ -128,12 +145,15 @@ benchmarkArrayOperations( Benchmark & benchmark,
		};
		auto setSize1 = [&]() {
		hostArray.setSize( size );
		#ifdef HAVE_CUDA
		deviceArray.setSize( size );
		#endif
		};
		benchmark.setOperation( "deallocation (reset)", datasetSize );
		benchmark.time( setSize1,
		"CPU", resetSizeHost,
		"GPU", resetSizeCuda );
		benchmark.time( setSize1, "CPU", resetSizeHost );
		#ifdef HAVE_CUDA
		benchmark.time( setSize1, "GPU", resetSizeCuda );
		#endif

		return true;
		}