Commit 2ae9e97e authored by Jakub Klinkovský's avatar Jakub Klinkovský
Browse files

Switching to "ExecutionType" instead of "DeviceType"

This continues the split of Device into Execution and Allocator. The
execution types in TNL/Execution are: Sequential, OpenMP and Cuda
(Execution::OpenMP replaces Devices::Host).

TODO:

- smart pointers: replace Device with Allocator (methods getData() and
  modifyData() should be removed, instead there should be getHostData()
  and getImageData() in both const and non-const variants)
- serialization: use a placeholder string (like "any") because data from files should be loadable with any Execution type or Allocator
- revise BuildConfigTags for problem-solvers
- compatibility of Execution types with Allocators
- dynamic execution policy - to specify runtime parameters for a
  specific (parallel) algorithm
   - implementation:
      - some hierarchy of class templates which have the static execution policy as a template parameter
      - specific classes for certain algorithms (like Reduction or PrefixSum),
        e.g. `DefaultExecutionParameters<CUDA>` → `ReductionExecutionParameters<CUDA>`
                                                → `PrefixSumExecutionParameters<CUDA>`
      most algorithms should use `DefaultExecutionParameters<DeviceType>`
   - then:
      - extend tests:
         ParallelFor: achieve full coverage with small array size
         finishing reduction and multireduction on host/GPU
         prefix-sum: specify suitable maxGridSize, blockSize, elementsInBlock and decrease VECTOR_TEST_SIZE
      - cuda reduction: profiling + probably change "finish" to launch only 1 block of threads
         - try zero-copy buffer on the host instead of CudaReductionBuffer
      - try using `ParallelFor` with a specific block size in LBM
      - custom kernel launch configuration for traversers
parent a556f79e
Loading
Loading
Loading
Loading
+2 −2
Original line number Diff line number Diff line
@@ -13,10 +13,10 @@
namespace TNL {
namespace Benchmarks {

template< typename Device >
template< typename Execution >
struct CommonVectorOperations
{
   using DeviceType = Device;
   using ExecutionType = Execution;

   template< typename Vector, typename ResultType = typename Vector::RealType >
   static ResultType getVectorMax( const Vector& v );
+51 −51
Original line number Diff line number Diff line
@@ -16,10 +16,10 @@
namespace TNL {
namespace Benchmarks {

template< typename Device >
template< typename Execution >
   template< typename Vector, typename ResultType >
ResultType
CommonVectorOperations< Device >::
CommonVectorOperations< Execution >::
getVectorMax( const Vector& v )
{
   TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." );
@@ -30,13 +30,13 @@ getVectorMax( const Vector& v )
   const auto* data = v.getData();
   auto fetch = [=] __cuda_callable__ ( IndexType i ) -> ResultType { return data[ i ]; };
   auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::max( a, b ); };
   return Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, std::numeric_limits< ResultType >::lowest() );
   return Algorithms::Reduction< ExecutionType >::reduce( v.getSize(), reduction, fetch, std::numeric_limits< ResultType >::lowest() );
}

template< typename Device >
template< typename Execution >
   template< typename Vector, typename ResultType >
ResultType
CommonVectorOperations< Device >::
CommonVectorOperations< Execution >::
getVectorMin( const Vector& v )
{
   TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." );
@@ -47,13 +47,13 @@ getVectorMin( const Vector& v )
   const auto* data = v.getData();
   auto fetch = [=] __cuda_callable__ ( IndexType i ) -> RealType { return data[ i ]; };
   auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::min( a, b ); };
   return Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, std::numeric_limits< ResultType >::max() );
   return Algorithms::Reduction< ExecutionType >::reduce( v.getSize(), reduction, fetch, std::numeric_limits< ResultType >::max() );
}

template< typename Device >
template< typename Execution >
   template< typename Vector, typename ResultType >
ResultType
CommonVectorOperations< Device >::
CommonVectorOperations< Execution >::
getVectorAbsMax( const Vector& v )
{
   TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." );
@@ -64,13 +64,13 @@ getVectorAbsMax( const Vector& v )
   const auto* data = v.getData();
   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data[ i ] ); };
   auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::max( a, b ); };
   return Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, std::numeric_limits< ResultType >::lowest() );
   return Algorithms::Reduction< ExecutionType >::reduce( v.getSize(), reduction, fetch, std::numeric_limits< ResultType >::lowest() );
}

template< typename Device >
template< typename Execution >
   template< typename Vector, typename ResultType >
ResultType
CommonVectorOperations< Device >::
CommonVectorOperations< Execution >::
getVectorAbsMin( const Vector& v )
{
   TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." );
@@ -81,13 +81,13 @@ getVectorAbsMin( const Vector& v )
   const auto* data = v.getData();
   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data[ i ] ); };
   auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::min( a, b ); };
   return Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, std::numeric_limits< ResultType >::max() );
   return Algorithms::Reduction< ExecutionType >::reduce( v.getSize(), reduction, fetch, std::numeric_limits< ResultType >::max() );
}

template< typename Device >
template< typename Execution >
   template< typename Vector, typename ResultType >
ResultType
CommonVectorOperations< Device >::
CommonVectorOperations< Execution >::
getVectorL1Norm( const Vector& v )
{
   TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." );
@@ -97,13 +97,13 @@ getVectorL1Norm( const Vector& v )

   const auto* data = v.getData();
   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data[ i ] ); };
   return Algorithms::Reduction< DeviceType >::reduce( v.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 );
   return Algorithms::Reduction< ExecutionType >::reduce( v.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 );
}

template< typename Device >
template< typename Execution >
   template< typename Vector, typename ResultType >
ResultType
CommonVectorOperations< Device >::
CommonVectorOperations< Execution >::
getVectorL2Norm( const Vector& v )
{
   TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." );
@@ -113,13 +113,13 @@ getVectorL2Norm( const Vector& v )

   const auto* data = v.getData();
   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data[ i ] * data[ i ]; };
   return std::sqrt( Algorithms::Reduction< DeviceType >::reduce( v.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 ) );
   return std::sqrt( Algorithms::Reduction< ExecutionType >::reduce( v.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 ) );
}

template< typename Device >
template< typename Execution >
   template< typename Vector, typename ResultType, typename Scalar >
ResultType
CommonVectorOperations< Device >::
CommonVectorOperations< Execution >::
getVectorLpNorm( const Vector& v,
                 const Scalar p )
{
@@ -136,13 +136,13 @@ getVectorLpNorm( const Vector& v,

   const auto* data = v.getData();
   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::pow( TNL::abs( data[ i ] ), p ); };
   return std::pow( Algorithms::Reduction< DeviceType >::reduce( v.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 ), 1.0 / p );
   return std::pow( Algorithms::Reduction< ExecutionType >::reduce( v.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 ), 1.0 / p );
}

template< typename Device >
template< typename Execution >
   template< typename Vector, typename ResultType >
ResultType
CommonVectorOperations< Device >::
CommonVectorOperations< Execution >::
getVectorSum( const Vector& v )
{
   TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." );
@@ -155,13 +155,13 @@ getVectorSum( const Vector& v )

   const auto* data = v.getData();
   auto fetch = [=] __cuda_callable__ ( IndexType i )  -> ResultType { return data[ i ]; };
   return Algorithms::Reduction< DeviceType >::reduce( v.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 );
   return Algorithms::Reduction< ExecutionType >::reduce( v.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 );
}

template< typename Device >
template< typename Execution >
   template< typename Vector1, typename Vector2, typename ResultType >
ResultType
CommonVectorOperations< Device >::
CommonVectorOperations< Execution >::
getVectorDifferenceMax( const Vector1& v1,
                        const Vector2& v2 )
{
@@ -175,13 +175,13 @@ getVectorDifferenceMax( const Vector1& v1,
   const auto* data2 = v2.getData();
   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data1[ i ] - data2[ i ]; };
   auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::max( a, b ); };
   return Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, std::numeric_limits< ResultType >::lowest() );
   return Algorithms::Reduction< ExecutionType >::reduce( v1.getSize(), reduction, fetch, std::numeric_limits< ResultType >::lowest() );
}

template< typename Device >
template< typename Execution >
   template< typename Vector1, typename Vector2, typename ResultType >
ResultType
CommonVectorOperations< Device >::
CommonVectorOperations< Execution >::
getVectorDifferenceMin( const Vector1& v1,
                        const Vector2& v2 )
{
@@ -195,13 +195,13 @@ getVectorDifferenceMin( const Vector1& v1,
   const auto* data2 = v2.getData();
   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data1[ i ] - data2[ i ]; };
   auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::min( a, b ); };
   return Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, std::numeric_limits< ResultType >::max() );
   return Algorithms::Reduction< ExecutionType >::reduce( v1.getSize(), reduction, fetch, std::numeric_limits< ResultType >::max() );
}

template< typename Device >
template< typename Execution >
   template< typename Vector1, typename Vector2, typename ResultType >
ResultType
CommonVectorOperations< Device >::
CommonVectorOperations< Execution >::
getVectorDifferenceAbsMax( const Vector1& v1,
                           const Vector2& v2 )
{
@@ -215,13 +215,13 @@ getVectorDifferenceAbsMax( const Vector1& v1,
   const auto* data2 = v2.getData();
   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data1[ i ] - data2[ i ] ); };
   auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::max( a, b ); };
   return Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, std::numeric_limits< ResultType >::lowest() );
   return Algorithms::Reduction< ExecutionType >::reduce( v1.getSize(), reduction, fetch, std::numeric_limits< ResultType >::lowest() );
}

template< typename Device >
template< typename Execution >
   template< typename Vector1, typename Vector2, typename ResultType >
ResultType
CommonVectorOperations< Device >::
CommonVectorOperations< Execution >::
getVectorDifferenceAbsMin( const Vector1& v1,
                           const Vector2& v2 )
{
@@ -235,13 +235,13 @@ getVectorDifferenceAbsMin( const Vector1& v1,
   const auto* data2 = v2.getData();
   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data1[ i ] - data2[ i ] ); };
   auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::min( a, b ); };
   return Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, std::numeric_limits< ResultType >::max() );
   return Algorithms::Reduction< ExecutionType >::reduce( v1.getSize(), reduction, fetch, std::numeric_limits< ResultType >::max() );
}

template< typename Device >
template< typename Execution >
   template< typename Vector1, typename Vector2, typename ResultType >
ResultType
CommonVectorOperations< Device >::
CommonVectorOperations< Execution >::
getVectorDifferenceL1Norm( const Vector1& v1,
                           const Vector2& v2 )
{
@@ -254,13 +254,13 @@ getVectorDifferenceL1Norm( const Vector1& v1,
   const auto* data1 = v1.getData();
   const auto* data2 = v2.getData();
   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data1[ i ] - data2[ i ] ); };
   return Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 );
   return Algorithms::Reduction< ExecutionType >::reduce( v1.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 );
}

template< typename Device >
template< typename Execution >
   template< typename Vector1, typename Vector2, typename ResultType >
ResultType
CommonVectorOperations< Device >::
CommonVectorOperations< Execution >::
getVectorDifferenceL2Norm( const Vector1& v1,
                           const Vector2& v2 )
{
@@ -276,13 +276,13 @@ getVectorDifferenceL2Norm( const Vector1& v1,
      auto diff = data1[ i ] - data2[ i ];
      return diff * diff;
   };
   return std::sqrt( Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 ) );
   return std::sqrt( Algorithms::Reduction< ExecutionType >::reduce( v1.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 ) );
}

template< typename Device >
template< typename Execution >
   template< typename Vector1, typename Vector2, typename ResultType, typename Scalar >
ResultType
CommonVectorOperations< Device >::
CommonVectorOperations< Execution >::
getVectorDifferenceLpNorm( const Vector1& v1,
                           const Vector2& v2,
                           const Scalar p )
@@ -302,13 +302,13 @@ getVectorDifferenceLpNorm( const Vector1& v1,
   const auto* data1 = v1.getData();
   const auto* data2 = v2.getData();
   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::pow( TNL::abs( data1[ i ] - data2[ i ] ), p ); };
   return std::pow( Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 ), 1.0 / p );
   return std::pow( Algorithms::Reduction< ExecutionType >::reduce( v1.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 ), 1.0 / p );
}

template< typename Device >
template< typename Execution >
   template< typename Vector1, typename Vector2, typename ResultType >
ResultType
CommonVectorOperations< Device >::
CommonVectorOperations< Execution >::
getVectorDifferenceSum( const Vector1& v1,
                        const Vector2& v2 )
{
@@ -321,13 +321,13 @@ getVectorDifferenceSum( const Vector1& v1,
   const auto* data1 = v1.getData();
   const auto* data2 = v2.getData();
   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data1[ i ] - data2[ i ]; };
   return Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 );
   return Algorithms::Reduction< ExecutionType >::reduce( v1.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 );
}

template< typename Device >
template< typename Execution >
   template< typename Vector1, typename Vector2, typename ResultType >
ResultType
CommonVectorOperations< Device >::
CommonVectorOperations< Execution >::
getScalarProduct( const Vector1& v1,
                  const Vector2& v2 )
{
@@ -340,7 +340,7 @@ getScalarProduct( const Vector1& v1,
   const auto* data1 = v1.getData();
   const auto* data2 = v2.getData();
   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data1[ i ] * data2[ i ]; };
   return Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 );
   return Algorithms::Reduction< ExecutionType >::reduce( v1.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 );
}

} // namespace Benchmarks
+13 −13
Original line number Diff line number Diff line
@@ -10,18 +10,18 @@

#pragma once

#include <TNL/Devices/Host.h>
#include <TNL/Devices/Cuda.h>
#include <TNL/Execution/OpenMP.h>
#include <TNL/Execution/Cuda.h>
#include <TNL/Algorithms/ParallelFor.h>

namespace TNL {
namespace Benchmarks {

template< typename Device >
template< typename Execution >
struct VectorOperations;

template<>
struct VectorOperations< Devices::Host >
struct VectorOperations< Execution::OpenMP >
{
   static constexpr int OpenMPVectorOperationsThreshold = 512;
   static constexpr int PrefetchDistance = 128;
@@ -41,13 +41,13 @@ struct VectorOperations< Devices::Host >

      if( thisMultiplicator == 1.0 )
         #ifdef HAVE_OPENMP
         #pragma omp parallel for if( TNL::Devices::Host::isOMPEnabled() && n > OpenMPVectorOperationsThreshold )
         #pragma omp parallel for if( TNL::Execution::OpenMP::isEnabled() && n > OpenMPVectorOperationsThreshold )
         #endif
         for( Index i = 0; i < n; i ++ )
            y[ i ] += alpha * x[ i ];
      else
         #ifdef HAVE_OPENMP
         #pragma omp parallel for if( TNL::Devices::Host::isOMPEnabled() && n > OpenMPVectorOperationsThreshold )
         #pragma omp parallel for if( TNL::Execution::OpenMP::isEnabled() && n > OpenMPVectorOperationsThreshold )
         #endif
         for( Index i = 0; i < n; i ++ )
            y[ i ] = thisMultiplicator * y[ i ] + alpha * x[ i ];
@@ -70,13 +70,13 @@ struct VectorOperations< Devices::Host >
      const Index n = v.getSize();
      if( thisMultiplicator == 1.0 )
         #ifdef HAVE_OPENMP
         #pragma omp parallel for if( TNL::Devices::Host::isOMPEnabled() && n > OpenMPVectorOperationsThreshold )
         #pragma omp parallel for if( TNL::Execution::OpenMP::isEnabled() && n > OpenMPVectorOperationsThreshold )
         #endif
         for( Index i = 0; i < n; i ++ )
            v[ i ] += multiplicator1 * v1[ i ] + multiplicator2 * v2[ i ];
      else
         #ifdef HAVE_OPENMP
         #pragma omp parallel for if( TNL::Devices::Host::isOMPEnabled() && n > OpenMPVectorOperationsThreshold )
         #pragma omp parallel for if( TNL::Execution::OpenMP::isEnabled() && n > OpenMPVectorOperationsThreshold )
         #endif
         for( Index i = 0; i < n; i ++ )
            v[ i ] = thisMultiplicator * v[ i ] + multiplicator1 * v1[ i ] + multiplicator2 * v2[ i ];
@@ -84,7 +84,7 @@ struct VectorOperations< Devices::Host >
};

template<>
struct VectorOperations< Devices::Cuda >
struct VectorOperations< Execution::Cuda >
{
   template< typename Vector1, typename Vector2, typename Scalar1, typename Scalar2 >
   static void addVector( Vector1& _y,
@@ -104,9 +104,9 @@ struct VectorOperations< Devices::Cuda >
      auto add2 = [=] __cuda_callable__ ( IndexType i ) { y[ i ] = thisMultiplicator * y[ i ] + alpha * x[ i ]; };

      if( thisMultiplicator == 1.0 )
         Algorithms::ParallelFor< Devices::Cuda >::exec( (IndexType) 0, _y.getSize(), add1 );
         Algorithms::ParallelFor< Execution::Cuda >::exec( (IndexType) 0, _y.getSize(), add1 );
      else
         Algorithms::ParallelFor< Devices::Cuda >::exec( (IndexType) 0, _y.getSize(), add2 );
         Algorithms::ParallelFor< Execution::Cuda >::exec( (IndexType) 0, _y.getSize(), add2 );
   }

   template< typename Vector1, typename Vector2, typename Vector3, typename Scalar1, typename Scalar2, typename Scalar3 >
@@ -131,9 +131,9 @@ struct VectorOperations< Devices::Cuda >
      auto add2 = [=] __cuda_callable__ ( IndexType i ) { v[ i ] = thisMultiplicator * v[ i ] + multiplicator1 * v1[ i ] + multiplicator2 * v2[ i ]; };

      if( thisMultiplicator == 1.0 )
         Algorithms::ParallelFor< Devices::Cuda >::exec( (IndexType) 0, _v.getSize(), add1 );
         Algorithms::ParallelFor< Execution::Cuda >::exec( (IndexType) 0, _v.getSize(), add1 );
      else
         Algorithms::ParallelFor< Devices::Cuda >::exec( (IndexType) 0, _v.getSize(), add2 );
         Algorithms::ParallelFor< Execution::Cuda >::exec( (IndexType) 0, _v.getSize(), add2 );
   }
};

+19 −19
Original line number Diff line number Diff line
@@ -23,14 +23,14 @@ namespace Benchmarks {

template< typename Real = double,
          typename Index = int,
          template<typename> class HostAllocator = Allocators::Default< Devices::Host >::Allocator,
          template<typename> class CudaAllocator = Allocators::Default< Devices::Cuda >::Allocator >
          template<typename> class HostAllocator = Allocators::Default< Execution::OpenMP >::Allocator,
          template<typename> class CudaAllocator = Allocators::Default< Execution::Cuda >::Allocator >
void
benchmarkArrayOperations( Benchmark & benchmark,
                          const long & size )
{
   using HostArray = Containers::Array< Real, Devices::Host, Index, HostAllocator< Real > >;
   using CudaArray = Containers::Array< Real, Devices::Cuda, Index, CudaAllocator< Real > >;
   using HostArray = Containers::Array< Real, Execution::OpenMP, Index, HostAllocator< Real > >;
   using CudaArray = Containers::Array< Real, Execution::Cuda, Index, CudaAllocator< Real > >;

   double datasetSize = (double) size * sizeof( Real ) / oneGB;

@@ -77,14 +77,14 @@ benchmarkArrayOperations( Benchmark & benchmark,
            resultHost = false;
      };
      benchmark.setOperation( "comparison (memcmp)", 2 * datasetSize );
      benchmark.time< Devices::Host >( reset12, "CPU", compareHost );
      benchmark.time< Execution::OpenMP >( reset12, "CPU", compareHost );

      // std::memcpy and cudaMemcpy
      auto copyHost = [&]() {
         std::memcpy( hostArray.getData(), hostArray2.getData(), hostArray.getSize() * sizeof(Real) );
      };
      benchmark.setOperation( "copy (memcpy)", 2 * datasetSize );
      benchmark.time< Devices::Host >( reset12, "CPU", copyHost );
      benchmark.time< Execution::OpenMP >( reset12, "CPU", copyHost );
#ifdef HAVE_CUDA
      auto copyCuda = [&]() {
         cudaMemcpy( deviceArray.getData(),
@@ -93,7 +93,7 @@ benchmarkArrayOperations( Benchmark & benchmark,
                     cudaMemcpyDeviceToDevice );
         TNL_CHECK_CUDA_DEVICE;
      };
      benchmark.time< Devices::Cuda >( reset12, "GPU", copyCuda );
      benchmark.time< Execution::Cuda >( reset12, "GPU", copyCuda );
#endif
   }

@@ -102,12 +102,12 @@ benchmarkArrayOperations( Benchmark & benchmark,
      resultHost = (int) ( hostArray == hostArray2 );
   };
   benchmark.setOperation( "comparison (operator==)", 2 * datasetSize );
   benchmark.time< Devices::Host >( reset1, "CPU", compareHost );
   benchmark.time< Execution::OpenMP >( reset1, "CPU", compareHost );
#ifdef HAVE_CUDA
   auto compareCuda = [&]() {
      resultDevice = (int) ( deviceArray == deviceArray2 );
   };
   benchmark.time< Devices::Cuda >( reset1, "GPU", compareCuda );
   benchmark.time< Execution::Cuda >( reset1, "GPU", compareCuda );
#endif


@@ -117,12 +117,12 @@ benchmarkArrayOperations( Benchmark & benchmark,
   benchmark.setOperation( "copy (operator=)", 2 * datasetSize );
   // copyBasetime is used later inside HAVE_CUDA guard, so the compiler will
   // complain when compiling without CUDA
   const double copyBasetime = benchmark.time< Devices::Host >( reset1, "CPU", copyAssignHostHost );
   const double copyBasetime = benchmark.time< Execution::OpenMP >( reset1, "CPU", copyAssignHostHost );
#ifdef HAVE_CUDA
   auto copyAssignCudaCuda = [&]() {
      deviceArray = deviceArray2;
   };
   benchmark.time< Devices::Cuda >( reset1, "GPU", copyAssignCudaCuda );
   benchmark.time< Execution::Cuda >( reset1, "GPU", copyAssignCudaCuda );
#endif


@@ -134,8 +134,8 @@ benchmarkArrayOperations( Benchmark & benchmark,
      hostArray = deviceArray;
   };
   benchmark.setOperation( "copy (operator=)", datasetSize, copyBasetime );
   benchmark.time< Devices::Cuda >( reset1, "CPU->GPU", copyAssignHostCuda );
   benchmark.time< Devices::Cuda >( reset1, "GPU->CPU", copyAssignCudaHost );
   benchmark.time< Execution::Cuda >( reset1, "CPU->GPU", copyAssignHostCuda );
   benchmark.time< Execution::Cuda >( reset1, "GPU->CPU", copyAssignCudaHost );
#endif


@@ -143,12 +143,12 @@ benchmarkArrayOperations( Benchmark & benchmark,
      hostArray.setValue( 3.0 );
   };
   benchmark.setOperation( "setValue", datasetSize );
   benchmark.time< Devices::Host >( reset1, "CPU", setValueHost );
   benchmark.time< Execution::OpenMP >( reset1, "CPU", setValueHost );
#ifdef HAVE_CUDA
   auto setValueCuda = [&]() {
      deviceArray.setValue( 3.0 );
   };
   benchmark.time< Devices::Cuda >( reset1, "GPU", setValueCuda );
   benchmark.time< Execution::Cuda >( reset1, "GPU", setValueCuda );
#endif


@@ -162,12 +162,12 @@ benchmarkArrayOperations( Benchmark & benchmark,
#endif
   };
   benchmark.setOperation( "allocation (setSize)", datasetSize );
   benchmark.time< Devices::Host >( resetSize1, "CPU", setSizeHost );
   benchmark.time< Execution::OpenMP >( resetSize1, "CPU", setSizeHost );
#ifdef HAVE_CUDA
   auto setSizeCuda = [&]() {
      deviceArray.setSize( size );
   };
   benchmark.time< Devices::Cuda >( resetSize1, "GPU", setSizeCuda );
   benchmark.time< Execution::Cuda >( resetSize1, "GPU", setSizeCuda );
#endif


@@ -181,12 +181,12 @@ benchmarkArrayOperations( Benchmark & benchmark,
#endif
   };
   benchmark.setOperation( "deallocation (reset)", datasetSize );
   benchmark.time< Devices::Host >( setSize1, "CPU", resetSizeHost );
   benchmark.time< Execution::OpenMP >( setSize1, "CPU", resetSizeHost );
#ifdef HAVE_CUDA
   auto resetSizeCuda = [&]() {
      deviceArray.reset();
   };
   benchmark.time< Devices::Cuda >( setSize1, "GPU", resetSizeCuda );
   benchmark.time< Execution::Cuda >( setSize1, "GPU", resetSizeCuda );
#endif
}

+11 −11
Original line number Diff line number Diff line
@@ -24,8 +24,8 @@ namespace TNL {
namespace Benchmarks {

// silly alias to match the number of template parameters with other formats
template< typename Real, typename Device, typename Index >
using SlicedEllpack = Matrices::SlicedEllpack< Real, Device, Index >;
template< typename Real, typename Execution, typename Index >
using SlicedEllpack = Matrices::SlicedEllpack< Real, Execution, Index >;

template< typename Matrix >
int setHostTestMatrix( Matrix& matrix,
@@ -81,7 +81,7 @@ void setCudaTestMatrix( Matrix& matrix,
         cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize();
      setCudaTestMatrixKernel< Matrix >
         <<< cudaGridSize, cudaBlockSize >>>
         ( &kernel_matrix.template modifyData< Devices::Cuda >(), elementsPerRow, gridIdx );
         ( &kernel_matrix.template modifyData< Execution::Cuda >(), elementsPerRow, gridIdx );
        TNL_CHECK_CUDA_DEVICE;
   }
#endif
@@ -96,15 +96,15 @@ benchmarkSpMV( Benchmark & benchmark,
               const int & size,
               const int elementsPerRow = 5 )
{
   typedef Matrix< Real, Devices::Host, int > HostMatrix;
   typedef Matrix< Real, Devices::Cuda, int > DeviceMatrix;
   typedef Containers::Vector< Real, Devices::Host, int > HostVector;
   typedef Containers::Vector< Real, Devices::Cuda, int > CudaVector;
   typedef Matrix< Real, Execution::OpenMP, int > HostMatrix;
   typedef Matrix< Real, Execution::Cuda, int > DeviceMatrix;
   typedef Containers::Vector< Real, Execution::OpenMP, int > HostVector;
   typedef Containers::Vector< Real, Execution::Cuda, int > CudaVector;

   HostMatrix hostMatrix;
   DeviceMatrix deviceMatrix;
   Containers::Vector< int, Devices::Host, int > hostRowLengths;
   Containers::Vector< int, Devices::Cuda, int > deviceRowLengths;
   Containers::Vector< int, Execution::OpenMP, int > hostRowLengths;
   Containers::Vector< int, Execution::Cuda, int > deviceRowLengths;
   HostVector hostVector, hostVector2;
   CudaVector deviceVector, deviceVector2;

@@ -156,12 +156,12 @@ benchmarkSpMV( Benchmark & benchmark,
      hostMatrix.vectorProduct( hostVector, hostVector2 );
   };
   benchmark.setOperation( datasetSize );
   benchmark.time< Devices::Host >( reset, "CPU", spmvHost );
   benchmark.time< Execution::OpenMP >( reset, "CPU", spmvHost );
#ifdef HAVE_CUDA
   auto spmvCuda = [&]() {
      deviceMatrix.vectorProduct( deviceVector, deviceVector2 );
   };
   benchmark.time< Devices::Cuda >( reset, "GPU", spmvCuda );
   benchmark.time< Execution::Cuda >( reset, "GPU", spmvCuda );
#endif
}

Loading