Merge branch 'ndarray' into 'develop' (3e785f01) · Commits · TNL / tnl-dev

src/Benchmarks/CMakeLists.txt

+1 −0

Original line number	Diff line number	Diff line
		add_subdirectory( HeatEquation )
		add_subdirectory( BLAS )
		add_subdirectory( NDArray )
		add_subdirectory( SpMV )
		add_subdirectory( DistSpMV )
		add_subdirectory( LinearSolvers )

src/Benchmarks/NDArray/CMakeLists.txt

0 → 100644

+17 −0

Original line number	Diff line number	Diff line
		add_executable( tnl-benchmark-ndarray tnl-benchmark-ndarray.cpp )
		target_compile_options( tnl-benchmark-ndarray PRIVATE ${CXX_TESTS_FLAGS} )
		install( TARGETS tnl-benchmark-ndarray RUNTIME DESTINATION bin )

		add_executable( tnl-benchmark-ndarray-boundary tnl-benchmark-ndarray-boundary.cpp )
		target_compile_options( tnl-benchmark-ndarray-boundary PRIVATE ${CXX_TESTS_FLAGS} )
		install( TARGETS tnl-benchmark-ndarray-boundary RUNTIME DESTINATION bin )

		if( BUILD_CUDA )
		cuda_add_executable( tnl-benchmark-ndarray-cuda tnl-benchmark-ndarray-cuda.cu
		OPTIONS ${CXX_TESTS_FLAGS} )
		install( TARGETS tnl-benchmark-ndarray-cuda RUNTIME DESTINATION bin )

		cuda_add_executable( tnl-benchmark-ndarray-boundary-cuda tnl-benchmark-ndarray-boundary-cuda.cu
		OPTIONS ${CXX_TESTS_FLAGS} )
		install( TARGETS tnl-benchmark-ndarray-boundary-cuda RUNTIME DESTINATION bin )
		endif()

src/Benchmarks/NDArray/tnl-benchmark-ndarray-boundary-cuda.cu

0 → 100644

+1 −0

Original line number	Diff line number	Diff line
		#include "tnl-benchmark-ndarray-boundary.h"

src/Benchmarks/NDArray/tnl-benchmark-ndarray-boundary.cpp

0 → 100644

+1 −0

Original line number	Diff line number	Diff line
		#include "tnl-benchmark-ndarray-boundary.h"

src/Benchmarks/NDArray/tnl-benchmark-ndarray-boundary.h

0 → 100644

+466 −0

Original line number	Diff line number	Diff line
		/***************************************************************************
		tnl-benchmark-ndarray-boundary.h - description
		-------------------
		begin : Feb 9, 2019
		copyright : (C) 2019 by Tomas Oberhuber et al.
		email : tomas.oberhuber@fjfi.cvut.cz
		***************************************************************************/

		/* See Copyright Notice in tnl/Copyright */

		// Implemented by: Jakub Klinkovsky

		#pragma once

		#include <TNL/Assert.h>
		#include <TNL/Math.h>
		#include <TNL/ParallelFor.h>

		#include <TNL/Containers/NDArray.h>

		#include "../Benchmarks.h"

		using namespace TNL;
		using namespace TNL::Benchmarks;
		using namespace TNL::Containers;
		using std::index_sequence;

		using value_type = float;
		//using index_type = std::size_t;
		using index_type = unsigned;

		template< typename Array >
		void expect_eq_chunked( Array& a, Array& b )
		{
		// TODO: use something like EXPECT_EQ
		TNL_ASSERT_EQ( a.getSize(), b.getSize(), "array sizes don't match" );
		if( a.getSize() != b.getSize() )
		return;

		using IndexType = typename Array::IndexType;

		const IndexType chunk_size = 4096;
		for( IndexType c = 0; c < (IndexType) roundUpDivision( a.getSize(), chunk_size ); c++ ) {
		const typename Array::IndexType this_chunk_size = TNL::min( chunk_size, a.getSize() - c * chunk_size );
		Array a_chunk( &a[ c * chunk_size ], this_chunk_size );
		Array b_chunk( &b[ c * chunk_size ], this_chunk_size );
		// TODO: use something like EXPECT_EQ
		TNL_ASSERT_EQ( a_chunk, b_chunk, "chunks are not equal" );
		}
		}

		template< typename Array >
		void expect_eq( Array& a, Array& b )
		{
		if( std::is_same< typename Array::DeviceType, TNL::Devices::Cuda >::value ) {
		typename Array::HostType a_host, b_host;
		a_host = a;
		b_host = b;
		expect_eq_chunked( a_host, b_host );
		}
		else {
		expect_eq_chunked( a, b );
		}
		}

		template< typename Device >
		const char* performer()
		{
		if( std::is_same< Device, Devices::Host >::value )
		return "CPU";
		else if( std::is_same< Device, Devices::Cuda >::value )
		return "GPU";
		else
		return "unknown";
		}

		void reset() {}

		// NOTE: having the sizes as function parameters keeps the compiler from treating them
		// as "compile-time constants" and thus e.g. optimizing the 1D iterations with memcpy

		template< typename Device >
		void benchmark_1D( Benchmark& benchmark, index_type size = 500000000 )
		{
		NDArray< value_type,
		SizesHolder< index_type, 0 >,
		std::make_index_sequence< 1 >,
		Device > a, b;
		a.setSizes( size );
		b.setSizes( size );
		a.getStorageArray().setValue( -1 );
		b.getStorageArray().setValue( 1 );

		auto a_view = a.getView();
		auto b_view = b.getView();

		auto f = [&]() {
		a.forBoundary( [=] __cuda_callable__ ( index_type i ) mutable { a_view( i ) = b_view( i ); } );
		a.forInternal( [=] __cuda_callable__ ( index_type i ) mutable { a_view( i ) = b_view( i ); } );
		};

		const double datasetSize = 2 * size * sizeof(value_type) / oneGB;
		benchmark.setOperation( "1D", datasetSize );
		benchmark.time< Device >( reset, performer< Device >(), f );

		expect_eq( a.getStorageArray(), b.getStorageArray() );
		}

		template< typename Device >
		void benchmark_2D( Benchmark& benchmark, index_type size = 22333 )
		{
		NDArray< value_type,
		SizesHolder< index_type, 0, 0 >,
		std::make_index_sequence< 2 >,
		Device > a, b;
		a.setSizes( size, size );
		b.setSizes( size, size );
		a.getStorageArray().setValue( -1 );
		b.getStorageArray().setValue( 1 );

		auto a_view = a.getView();
		auto b_view = b.getView();

		auto f = [&]() {
		a.forBoundary( [=] __cuda_callable__ ( index_type i, index_type j ) mutable { a_view( i, j ) = b_view( i, j ); } );
		a.forInternal( [=] __cuda_callable__ ( index_type i, index_type j ) mutable { a_view( i, j ) = b_view( i, j ); } );
		};

		const double datasetSize = 2 * std::pow( size, 2 ) * sizeof(value_type) / oneGB;
		benchmark.setOperation( "2D", datasetSize );
		benchmark.time< Device >( reset, performer< Device >(), f );

		expect_eq( a.getStorageArray(), b.getStorageArray() );
		}

		template< typename Device >
		void benchmark_3D( Benchmark& benchmark, index_type size = 800 )
		{
		NDArray< value_type,
		SizesHolder< index_type, 0, 0, 0 >,
		std::make_index_sequence< 3 >,
		Device > a, b;
		a.setSizes( size, size, size );
		b.setSizes( size, size, size );
		a.getStorageArray().setValue( -1 );
		b.getStorageArray().setValue( 1 );

		auto a_view = a.getView();
		auto b_view = b.getView();

		auto f = [&]() {
		a.forBoundary( [=] __cuda_callable__ ( index_type i, index_type j, index_type k ) mutable { a_view( i, j, k ) = b_view( i, j, k ); } );
		a.forInternal( [=] __cuda_callable__ ( index_type i, index_type j, index_type k ) mutable { a_view( i, j, k ) = b_view( i, j, k ); } );
		};

		const double datasetSize = 2 * std::pow( size, 3 ) * sizeof(value_type) / oneGB;
		benchmark.setOperation( "3D", datasetSize );
		benchmark.time< Device >( reset, performer< Device >(), f );

		expect_eq( a.getStorageArray(), b.getStorageArray() );
		}

		// TODO: implement general ParallelBoundaryExecutor
		//template< typename Device >
		//void benchmark_4D( Benchmark& benchmark, index_type size = 150 )
		//{
		// NDArray< value_type,
		// SizesHolder< index_type, 0, 0, 0, 0 >,
		// std::make_index_sequence< 4 >,
		// Device > a, b;
		// a.setSizes( size, size, size, size );
		// b.setSizes( size, size, size, size );
		// a.getStorageArray().setValue( -1 );
		// b.getStorageArray().setValue( 1 );
		//
		// auto a_view = a.getView();
		// auto b_view = b.getView();
		//
		// auto f = [&]() {
		// a.forBoundary( [=] __cuda_callable__ ( index_type i, index_type j, index_type k, index_type l ) mutable { a_view( i, j, k, l ) = b_view( i, j, k, l ); } );
		// a.forInternal( [=] __cuda_callable__ ( index_type i, index_type j, index_type k, index_type l ) mutable { a_view( i, j, k, l ) = b_view( i, j, k, l ); } );
		// };
		//
		// const double datasetSize = 2 * std::pow( size, 4 ) * sizeof(value_type) / oneGB;
		// benchmark.setOperation( "4D", datasetSize );
		// benchmark.time< Device >( reset, performer< Device >(), f );
		//
		// expect_eq( a.getStorageArray(), b.getStorageArray() );
		//}
		//
		//template< typename Device >
		//void benchmark_5D( Benchmark& benchmark, index_type size = 56 )
		//{
		// NDArray< value_type,
		// SizesHolder< index_type, 0, 0, 0, 0, 0 >,
		// std::make_index_sequence< 5 >,
		// Device > a, b;
		// a.setSizes( size, size, size, size, size );
		// b.setSizes( size, size, size, size, size );
		// a.getStorageArray().setValue( -1 );
		// b.getStorageArray().setValue( 1 );
		//
		// auto a_view = a.getView();
		// auto b_view = b.getView();
		//
		// auto f = [&]() {
		// a.forBoundary( [=] __cuda_callable__ ( index_type i, index_type j, index_type k, index_type l, index_type m ) mutable { a_view( i, j, k, l, m ) = b_view( i, j, k, l, m ); } );
		// a.forInternal( [=] __cuda_callable__ ( index_type i, index_type j, index_type k, index_type l, index_type m ) mutable { a_view( i, j, k, l, m ) = b_view( i, j, k, l, m ); } );
		// };
		//
		// const double datasetSize = 2 * std::pow( size, 5 ) * sizeof(value_type) / oneGB;
		// benchmark.setOperation( "5D", datasetSize );
		// benchmark.time< Device >( reset, performer< Device >(), f );
		//
		// expect_eq( a.getStorageArray(), b.getStorageArray() );
		//}
		//
		//template< typename Device >
		//void benchmark_6D( Benchmark& benchmark, index_type size = 28 )
		//{
		// NDArray< value_type,
		// SizesHolder< index_type, 0, 0, 0, 0, 0, 0 >,
		// std::make_index_sequence< 6 >,
		// Device > a, b;
		// a.setSizes( size, size, size, size, size, size );
		// b.setSizes( size, size, size, size, size, size );
		// a.getStorageArray().setValue( -1 );
		// b.getStorageArray().setValue( 1 );
		//
		// auto a_view = a.getView();
		// auto b_view = b.getView();
		//
		// auto f = [&]() {
		// a.forBoundary( [=] __cuda_callable__ ( index_type i, index_type j, index_type k, index_type l, index_type m, index_type n ) mutable { a_view( i, j, k, l, m, n ) = b_view( i, j, k, l, m, n ); } );
		// a.forInternal( [=] __cuda_callable__ ( index_type i, index_type j, index_type k, index_type l, index_type m, index_type n ) mutable { a_view( i, j, k, l, m, n ) = b_view( i, j, k, l, m, n ); } );
		// };
		//
		// const double datasetSize = 2 * std::pow( size, 6 ) * sizeof(value_type) / oneGB;
		// benchmark.setOperation( "6D", datasetSize );
		// benchmark.time< Device >( reset, performer< Device >(), f );
		//
		// expect_eq( a.getStorageArray(), b.getStorageArray() );
		//}


		template< typename Device >
		void benchmark_2D_perm( Benchmark& benchmark, index_type size = 22333 )
		{
		NDArray< value_type,
		SizesHolder< index_type, 0, 0 >,
		std::index_sequence< 1, 0 >,
		Device > a, b;
		a.setSizes( size, size );
		b.setSizes( size, size );
		a.getStorageArray().setValue( -1 );
		b.getStorageArray().setValue( 1 );

		auto a_view = a.getView();
		auto b_view = b.getView();

		auto f = [&]() {
		a.forBoundary( [=] __cuda_callable__ ( index_type i, index_type j ) mutable { a_view( i, j ) = b_view( i, j ); } );
		a.forInternal( [=] __cuda_callable__ ( index_type i, index_type j ) mutable { a_view( i, j ) = b_view( i, j ); } );
		};

		const double datasetSize = 2 * std::pow( size, 2 ) * sizeof(value_type) / oneGB;
		benchmark.setOperation( "2D permuted", datasetSize );
		benchmark.time< Device >( reset, performer< Device >(), f );

		expect_eq( a.getStorageArray(), b.getStorageArray() );
		}

		template< typename Device >
		void benchmark_3D_perm( Benchmark& benchmark, index_type size = 800 )
		{
		NDArray< value_type,
		SizesHolder< index_type, 0, 0, 0 >,
		std::index_sequence< 2, 1, 0 >,
		Device > a, b;
		a.setSizes( size, size, size );
		b.setSizes( size, size, size );
		a.getStorageArray().setValue( -1 );
		b.getStorageArray().setValue( 1 );

		auto a_view = a.getView();
		auto b_view = b.getView();

		auto f = [&]() {
		a.forBoundary( [=] __cuda_callable__ ( index_type i, index_type j, index_type k ) mutable { a_view( i, j, k ) = b_view( i, j, k ); } );
		a.forInternal( [=] __cuda_callable__ ( index_type i, index_type j, index_type k ) mutable { a_view( i, j, k ) = b_view( i, j, k ); } );
		};

		const double datasetSize = 2 * std::pow( size, 3 ) * sizeof(value_type) / oneGB;
		benchmark.setOperation( "3D permuted", datasetSize );
		benchmark.time< Device >( reset, performer< Device >(), f );

		expect_eq( a.getStorageArray(), b.getStorageArray() );
		}

		// TODO: implement general ParallelBoundaryExecutor
		//template< typename Device >
		//void benchmark_4D_perm( Benchmark& benchmark, index_type size = 150 )
		//{
		// NDArray< value_type,
		// SizesHolder< index_type, 0, 0, 0, 0 >,
		// std::index_sequence< 3, 2, 1, 0 >,
		// Device > a, b;
		// a.setSizes( size, size, size, size );
		// b.setSizes( size, size, size, size );
		// a.getStorageArray().setValue( -1 );
		// b.getStorageArray().setValue( 1 );
		//
		// auto a_view = a.getView();
		// auto b_view = b.getView();
		//
		// auto f = [&]() {
		// a.forBoundary( [=] __cuda_callable__ ( index_type i, index_type j, index_type k, index_type l ) mutable { a_view( i, j, k, l ) = b_view( i, j, k, l ); } );
		// a.forInternal( [=] __cuda_callable__ ( index_type i, index_type j, index_type k, index_type l ) mutable { a_view( i, j, k, l ) = b_view( i, j, k, l ); } );
		// };
		//
		// const double datasetSize = 2 * std::pow( size, 4 ) * sizeof(value_type) / oneGB;
		// benchmark.setOperation( "4D permuted", datasetSize );
		// benchmark.time< Device >( reset, performer< Device >(), f );
		//
		// expect_eq( a.getStorageArray(), b.getStorageArray() );
		//}
		//
		//template< typename Device >
		//void benchmark_5D_perm( Benchmark& benchmark, index_type size = 56 )
		//{
		// NDArray< value_type,
		// SizesHolder< index_type, 0, 0, 0, 0, 0 >,
		// std::index_sequence< 4, 3, 2, 1, 0 >,
		// Device > a, b;
		// a.setSizes( size, size, size, size, size );
		// b.setSizes( size, size, size, size, size );
		// a.getStorageArray().setValue( -1 );
		// b.getStorageArray().setValue( 1 );
		//
		// auto a_view = a.getView();
		// auto b_view = b.getView();
		//
		// auto f = [&]() {
		// a.forBoundary( [=] __cuda_callable__ ( index_type i, index_type j, index_type k, index_type l, index_type m ) mutable { a_view( i, j, k, l, m ) = b_view( i, j, k, l, m ); } );
		// a.forInternal( [=] __cuda_callable__ ( index_type i, index_type j, index_type k, index_type l, index_type m ) mutable { a_view( i, j, k, l, m ) = b_view( i, j, k, l, m ); } );
		// };
		//
		// const double datasetSize = 2 * std::pow( size, 5 ) * sizeof(value_type) / oneGB;
		// benchmark.setOperation( "5D permuted", datasetSize );
		// benchmark.time< Device >( reset, performer< Device >(), f );
		//
		// expect_eq( a.getStorageArray(), b.getStorageArray() );
		//}
		//
		//template< typename Device >
		//void benchmark_6D_perm( Benchmark& benchmark, index_type size = 28 )
		//{
		// NDArray< value_type,
		// SizesHolder< index_type, 0, 0, 0, 0, 0, 0 >,
		// std::index_sequence< 5, 4, 3, 2, 1, 0 >,
		// Device > a, b;
		// a.setSizes( size, size, size, size, size, size );
		// b.setSizes( size, size, size, size, size, size );
		// a.getStorageArray().setValue( -1 );
		// b.getStorageArray().setValue( 1 );
		//
		// auto a_view = a.getView();
		// auto b_view = b.getView();
		//
		// auto f = [&]() {
		// a.forBoundary( [=] __cuda_callable__ ( index_type i, index_type j, index_type k, index_type l, index_type m, index_type n ) mutable { a_view( i, j, k, l, m, n ) = b_view( i, j, k, l, m, n ); } );
		// a.forInternal( [=] __cuda_callable__ ( index_type i, index_type j, index_type k, index_type l, index_type m, index_type n ) mutable { a_view( i, j, k, l, m, n ) = b_view( i, j, k, l, m, n ); } );
		// };
		//
		// const double datasetSize = 2 * std::pow( size, 6 ) * sizeof(value_type) / oneGB;
		// benchmark.setOperation( "6D permuted", datasetSize );
		// benchmark.time< Device >( reset, performer< Device >(), f );
		//
		// expect_eq( a.getStorageArray(), b.getStorageArray() );
		//}

		template< typename Device >
		void run_benchmarks( Benchmark& benchmark )
		{
		benchmark_1D< Device >( benchmark );
		benchmark_2D< Device >( benchmark );
		benchmark_3D< Device >( benchmark );
		// benchmark_4D< Device >( benchmark );
		// benchmark_5D< Device >( benchmark );
		// benchmark_6D< Device >( benchmark );
		benchmark_2D_perm< Device >( benchmark );
		benchmark_3D_perm< Device >( benchmark );
		// benchmark_4D_perm< Device >( benchmark );
		// benchmark_5D_perm< Device >( benchmark );
		// benchmark_6D_perm< Device >( benchmark );
		}

		void setupConfig( Config::ConfigDescription & config )
		{
		config.addDelimiter( "Benchmark settings:" );
		config.addEntry< String >( "log-file", "Log file name.", "tnl-benchmark-ndarray-boundary.log");
		config.addEntry< String >( "output-mode", "Mode for opening the log file.", "overwrite" );
		config.addEntryEnum( "append" );
		config.addEntryEnum( "overwrite" );
		config.addEntry< int >( "loops", "Number of iterations for every computation.", 10 );
		config.addEntry< int >( "verbose", "Verbose mode.", 1 );
		config.addEntry< String >( "devices", "Run benchmarks on these devices.", "all" );
		config.addEntryEnum( "all" );
		config.addEntryEnum( "host" );
		#ifdef HAVE_CUDA
		config.addEntryEnum( "cuda" );
		#endif

		config.addDelimiter( "Device settings:" );
		Devices::Host::configSetup( config );
		Devices::Cuda::configSetup( config );
		}

		int main( int argc, char* argv[] )
		{
		Config::ParameterContainer parameters;
		Config::ConfigDescription conf_desc;

		setupConfig( conf_desc );

		if( ! parseCommandLine( argc, argv, conf_desc, parameters ) ) {
		conf_desc.printUsage( argv[ 0 ] );
		return EXIT_FAILURE;
		}

		if( ! Devices::Host::setup( parameters ) \|\|
		! Devices::Cuda::setup( parameters ) )
		return EXIT_FAILURE;

		const String & logFileName = parameters.getParameter< String >( "log-file" );
		const String & outputMode = parameters.getParameter< String >( "output-mode" );
		const int loops = parameters.getParameter< int >( "loops" );
		const int verbose = parameters.getParameter< int >( "verbose" );

		// open log file
		auto mode = std::ios::out;
		if( outputMode == "append" )
		mode \|= std::ios::app;
		std::ofstream logFile( logFileName.getString(), mode );

		// init benchmark and common metadata
		Benchmark benchmark( loops, verbose );

		// prepare global metadata
		Benchmark::MetadataMap metadata = getHardwareMetadata();

		const String devices = parameters.getParameter< String >( "devices" );
		if( devices == "all" \|\| devices == "host" )
		run_benchmarks< Devices::Host >( benchmark );
		#ifdef HAVE_CUDA
		if( devices == "all" \|\| devices == "cuda" )
		run_benchmarks< Devices::Cuda >( benchmark );
		#endif

		if( ! benchmark.save( logFile ) ) {
		std::cerr << "Failed to write the benchmark results to file '" << parameters.getParameter< String >( "log-file" ) << "'." << std::endl;
		return EXIT_FAILURE;
		}

		return EXIT_SUCCESS;
		}