Commit 2ae9e97e authored by Jakub Klinkovský's avatar Jakub Klinkovský
Browse files

Switching to "ExecutionType" instead of "DeviceType"

This continues the split of Device into Execution and Allocator. The
execution types in TNL/Execution are: Sequential, OpenMP and Cuda
(Execution::OpenMP replaces Devices::Host).

TODO:

- smart pointers: replace Device with Allocator (methods getData() and
  modifyData() should be removed, instead there should be getHostData()
  and getImageData() in both const and non-const variants)
- serialization: use a placeholder string (like "any") because data from files should be loadable with any Execution type or Allocator
- revise BuildConfigTags for problem-solvers
- compatibility of Execution types with Allocators
- dynamic execution policy - to specify runtime parameters for a
  specific (parallel) algorithm
   - implementation:
      - some hierarchy of class templates which have the static execution policy as a template parameter
      - specific classes for certain algorithms (like Reduction or PrefixSum),
        e.g. `DefaultExecutionParameters<CUDA>` → `ReductionExecutionParameters<CUDA>`
                                                → `PrefixSumExecutionParameters<CUDA>`
      most algorithms should use `DefaultExecutionParameters<DeviceType>`
   - then:
      - extend tests:
         ParallelFor: achieve full coverage with small array size
         finishing reduction and multireduction on host/GPU
         prefix-sum: specify suitable maxGridSize, blockSize, elementsInBlock and decrease VECTOR_TEST_SIZE
      - cuda reduction: profiling + probably change "finish" to launch only 1 block of threads
         - try zero-copy buffer on the host instead of CudaReductionBuffer
      - try using `ParallelFor` with a specific block size in LBM
      - custom kernel launch configuration for traversers
parent a556f79e
Loading
Loading
Loading
Loading
+2 −2
Original line number Diff line number Diff line
@@ -13,10 +13,10 @@
namespace TNL {
namespace Benchmarks {

template< typename Device >
template< typename Execution >
struct CommonVectorOperations
{
   using DeviceType = Device;
   using ExecutionType = Execution;

   template< typename Vector, typename ResultType = typename Vector::RealType >
   static ResultType getVectorMax( const Vector& v );
+51 −51
Original line number Diff line number Diff line
@@ -16,10 +16,10 @@
namespace TNL {
namespace Benchmarks {

template< typename Device >
template< typename Execution >
   template< typename Vector, typename ResultType >
ResultType
CommonVectorOperations< Device >::
CommonVectorOperations< Execution >::
getVectorMax( const Vector& v )
{
   TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." );
@@ -30,13 +30,13 @@ getVectorMax( const Vector& v )
   const auto* data = v.getData();
   auto fetch = [=] __cuda_callable__ ( IndexType i ) -> ResultType { return data[ i ]; };
   auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::max( a, b ); };
   return Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, std::numeric_limits< ResultType >::lowest() );
   return Algorithms::Reduction< ExecutionType >::reduce( v.getSize(), reduction, fetch, std::numeric_limits< ResultType >::lowest() );
}

template< typename Device >
template< typename Execution >
   template< typename Vector, typename ResultType >
ResultType
CommonVectorOperations< Device >::
CommonVectorOperations< Execution >::
getVectorMin( const Vector& v )
{
   TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." );
@@ -47,13 +47,13 @@ getVectorMin( const Vector& v )
   const auto* data = v.getData();
   auto fetch = [=] __cuda_callable__ ( IndexType i ) -> RealType { return data[ i ]; };
   auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::min( a, b ); };
   return Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, std::numeric_limits< ResultType >::max() );
   return Algorithms::Reduction< ExecutionType >::reduce( v.getSize(), reduction, fetch, std::numeric_limits< ResultType >::max() );
}

template< typename Device >
template< typename Execution >
   template< typename Vector, typename ResultType >
ResultType
CommonVectorOperations< Device >::
CommonVectorOperations< Execution >::
getVectorAbsMax( const Vector& v )
{
   TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." );
@@ -64,13 +64,13 @@ getVectorAbsMax( const Vector& v )
   const auto* data = v.getData();
   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data[ i ] ); };
   auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::max( a, b ); };
   return Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, std::numeric_limits< ResultType >::lowest() );
   return Algorithms::Reduction< ExecutionType >::reduce( v.getSize(), reduction, fetch, std::numeric_limits< ResultType >::lowest() );
}

template< typename Device >
template< typename Execution >
   template< typename Vector, typename ResultType >
ResultType
CommonVectorOperations< Device >::
CommonVectorOperations< Execution >::
getVectorAbsMin( const Vector& v )
{
   TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." );
@@ -81,13 +81,13 @@ getVectorAbsMin( const Vector& v )
   const auto* data = v.getData();
   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data[ i ] ); };
   auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::min( a, b ); };
   return Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, std::numeric_limits< ResultType >::max() );
   return Algorithms::Reduction< ExecutionType >::reduce( v.getSize(), reduction, fetch, std::numeric_limits< ResultType >::max() );
}

template< typename Device >
template< typename Execution >
   template< typename Vector, typename ResultType >
ResultType
CommonVectorOperations< Device >::
CommonVectorOperations< Execution >::
getVectorL1Norm( const Vector& v )
{
   TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." );
@@ -97,13 +97,13 @@ getVectorL1Norm( const Vector& v )

   const auto* data = v.getData();
   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data[ i ] ); };
   return Algorithms::Reduction< DeviceType >::reduce( v.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 );
   return Algorithms::Reduction< ExecutionType >::reduce( v.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 );
}

template< typename Device >
template< typename Execution >
   template< typename Vector, typename ResultType >
ResultType
CommonVectorOperations< Device >::
CommonVectorOperations< Execution >::
getVectorL2Norm( const Vector& v )
{
   TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." );
@@ -113,13 +113,13 @@ getVectorL2Norm( const Vector& v )

   const auto* data = v.getData();
   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data[ i ] * data[ i ]; };
   return std::sqrt( Algorithms::Reduction< DeviceType >::reduce( v.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 ) );
   return std::sqrt( Algorithms::Reduction< ExecutionType >::reduce( v.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 ) );
}

template< typename Device >
template< typename Execution >
   template< typename Vector, typename ResultType, typename Scalar >
ResultType
CommonVectorOperations< Device >::
CommonVectorOperations< Execution >::
getVectorLpNorm( const Vector& v,
                 const Scalar p )
{
@@ -136,13 +136,13 @@ getVectorLpNorm( const Vector& v,

   const auto* data = v.getData();
   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::pow( TNL::abs( data[ i ] ), p ); };
   return std::pow( Algorithms::Reduction< DeviceType >::reduce( v.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 ), 1.0 / p );
   return std::pow( Algorithms::Reduction< ExecutionType >::reduce( v.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 ), 1.0 / p );
}

template< typename Device >
template< typename Execution >
   template< typename Vector, typename ResultType >
ResultType
CommonVectorOperations< Device >::
CommonVectorOperations< Execution >::
getVectorSum( const Vector& v )
{
   TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." );
@@ -155,13 +155,13 @@ getVectorSum( const Vector& v )

   const auto* data = v.getData();
   auto fetch = [=] __cuda_callable__ ( IndexType i )  -> ResultType { return data[ i ]; };
   return Algorithms::Reduction< DeviceType >::reduce( v.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 );
   return Algorithms::Reduction< ExecutionType >::reduce( v.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 );
}

template< typename Device >
template< typename Execution >
   template< typename Vector1, typename Vector2, typename ResultType >
ResultType
CommonVectorOperations< Device >::
CommonVectorOperations< Execution >::
getVectorDifferenceMax( const Vector1& v1,
                        const Vector2& v2 )
{
@@ -175,13 +175,13 @@ getVectorDifferenceMax( const Vector1& v1,
   const auto* data2 = v2.getData();
   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data1[ i ] - data2[ i ]; };
   auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::max( a, b ); };
   return Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, std::numeric_limits< ResultType >::lowest() );
   return Algorithms::Reduction< ExecutionType >::reduce( v1.getSize(), reduction, fetch, std::numeric_limits< ResultType >::lowest() );
}

template< typename Device >
template< typename Execution >
   template< typename Vector1, typename Vector2, typename ResultType >
ResultType
CommonVectorOperations< Device >::
CommonVectorOperations< Execution >::
getVectorDifferenceMin( const Vector1& v1,
                        const Vector2& v2 )
{
@@ -195,13 +195,13 @@ getVectorDifferenceMin( const Vector1& v1,
   const auto* data2 = v2.getData();
   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data1[ i ] - data2[ i ]; };
   auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::min( a, b ); };
   return Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, std::numeric_limits< ResultType >::max() );
   return Algorithms::Reduction< ExecutionType >::reduce( v1.getSize(), reduction, fetch, std::numeric_limits< ResultType >::max() );
}

template< typename Device >
template< typename Execution >
   template< typename Vector1, typename Vector2, typename ResultType >
ResultType
CommonVectorOperations< Device >::
CommonVectorOperations< Execution >::
getVectorDifferenceAbsMax( const Vector1& v1,
                           const Vector2& v2 )
{
@@ -215,13 +215,13 @@ getVectorDifferenceAbsMax( const Vector1& v1,
   const auto* data2 = v2.getData();
   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data1[ i ] - data2[ i ] ); };
   auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::max( a, b ); };
   return Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, std::numeric_limits< ResultType >::lowest() );
   return Algorithms::Reduction< ExecutionType >::reduce( v1.getSize(), reduction, fetch, std::numeric_limits< ResultType >::lowest() );
}

template< typename Device >
template< typename Execution >
   template< typename Vector1, typename Vector2, typename ResultType >
ResultType
CommonVectorOperations< Device >::
CommonVectorOperations< Execution >::
getVectorDifferenceAbsMin( const Vector1& v1,
                           const Vector2& v2 )
{
@@ -235,13 +235,13 @@ getVectorDifferenceAbsMin( const Vector1& v1,
   const auto* data2 = v2.getData();
   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data1[ i ] - data2[ i ] ); };
   auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::min( a, b ); };
   return Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, std::numeric_limits< ResultType >::max() );
   return Algorithms::Reduction< ExecutionType >::reduce( v1.getSize(), reduction, fetch, std::numeric_limits< ResultType >::max() );
}

template< typename Device >
template< typename Execution >
   template< typename Vector1, typename Vector2, typename ResultType >
ResultType
CommonVectorOperations< Device >::
CommonVectorOperations< Execution >::
getVectorDifferenceL1Norm( const Vector1& v1,
                           const Vector2& v2 )
{
@@ -254,13 +254,13 @@ getVectorDifferenceL1Norm( const Vector1& v1,
   const auto* data1 = v1.getData();
   const auto* data2 = v2.getData();
   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data1[ i ] - data2[ i ] ); };
   return Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 );
   return Algorithms::Reduction< ExecutionType >::reduce( v1.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 );
}

template< typename Device >
template< typename Execution >
   template< typename Vector1, typename Vector2, typename ResultType >
ResultType
CommonVectorOperations< Device >::
CommonVectorOperations< Execution >::
getVectorDifferenceL2Norm( const Vector1& v1,
                           const Vector2& v2 )
{
@@ -276,13 +276,13 @@ getVectorDifferenceL2Norm( const Vector1& v1,
      auto diff = data1[ i ] - data2[ i ];
      return diff * diff;
   };
   return std::sqrt( Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 ) );
   return std::sqrt( Algorithms::Reduction< ExecutionType >::reduce( v1.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 ) );
}

template< typename Device >
template< typename Execution >
   template< typename Vector1, typename Vector2, typename ResultType, typename Scalar >
ResultType
CommonVectorOperations< Device >::
CommonVectorOperations< Execution >::
getVectorDifferenceLpNorm( const Vector1& v1,
                           const Vector2& v2,
                           const Scalar p )
@@ -302,13 +302,13 @@ getVectorDifferenceLpNorm( const Vector1& v1,
   const auto* data1 = v1.getData();
   const auto* data2 = v2.getData();
   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::pow( TNL::abs( data1[ i ] - data2[ i ] ), p ); };
   return std::pow( Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 ), 1.0 / p );
   return std::pow( Algorithms::Reduction< ExecutionType >::reduce( v1.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 ), 1.0 / p );
}

template< typename Device >
template< typename Execution >
   template< typename Vector1, typename Vector2, typename ResultType >
ResultType
CommonVectorOperations< Device >::
CommonVectorOperations< Execution >::
getVectorDifferenceSum( const Vector1& v1,
                        const Vector2& v2 )
{
@@ -321,13 +321,13 @@ getVectorDifferenceSum( const Vector1& v1,
   const auto* data1 = v1.getData();
   const auto* data2 = v2.getData();
   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data1[ i ] - data2[ i ]; };
   return Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 );
   return Algorithms::Reduction< ExecutionType >::reduce( v1.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 );
}

template< typename Device >
template< typename Execution >
   template< typename Vector1, typename Vector2, typename ResultType >
ResultType
CommonVectorOperations< Device >::
CommonVectorOperations< Execution >::
getScalarProduct( const Vector1& v1,
                  const Vector2& v2 )
{
@@ -340,7 +340,7 @@ getScalarProduct( const Vector1& v1,
   const auto* data1 = v1.getData();
   const auto* data2 = v2.getData();
   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data1[ i ] * data2[ i ]; };
   return Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 );
   return Algorithms::Reduction< ExecutionType >::reduce( v1.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 );
}

} // namespace Benchmarks
+13 −13
Original line number Diff line number Diff line
@@ -10,18 +10,18 @@

#pragma once

#include <TNL/Devices/Host.h>
#include <TNL/Devices/Cuda.h>
#include <TNL/Execution/OpenMP.h>
#include <TNL/Execution/Cuda.h>
#include <TNL/Algorithms/ParallelFor.h>

namespace TNL {
namespace Benchmarks {

template< typename Device >
template< typename Execution >
struct VectorOperations;

template<>
struct VectorOperations< Devices::Host >
struct VectorOperations< Execution::OpenMP >
{
   static constexpr int OpenMPVectorOperationsThreshold = 512;
   static constexpr int PrefetchDistance = 128;
@@ -41,13 +41,13 @@ struct VectorOperations< Devices::Host >

      if( thisMultiplicator == 1.0 )
         #ifdef HAVE_OPENMP
         #pragma omp parallel for if( TNL::Devices::Host::isOMPEnabled() && n > OpenMPVectorOperationsThreshold )
         #pragma omp parallel for if( TNL::Execution::OpenMP::isEnabled() && n > OpenMPVectorOperationsThreshold )
         #endif
         for( Index i = 0; i < n; i ++ )
            y[ i ] += alpha * x[ i ];
      else
         #ifdef HAVE_OPENMP
         #pragma omp parallel for if( TNL::Devices::Host::isOMPEnabled() && n > OpenMPVectorOperationsThreshold )
         #pragma omp parallel for if( TNL::Execution::OpenMP::isEnabled() && n > OpenMPVectorOperationsThreshold )
         #endif
         for( Index i = 0; i < n; i ++ )
            y[ i ] = thisMultiplicator * y[ i ] + alpha * x[ i ];
@@ -70,13 +70,13 @@ struct VectorOperations< Devices::Host >
      const Index n = v.getSize();
      if( thisMultiplicator == 1.0 )
         #ifdef HAVE_OPENMP
         #pragma omp parallel for if( TNL::Devices::Host::isOMPEnabled() && n > OpenMPVectorOperationsThreshold )
         #pragma omp parallel for if( TNL::Execution::OpenMP::isEnabled() && n > OpenMPVectorOperationsThreshold )
         #endif
         for( Index i = 0; i < n; i ++ )
            v[ i ] += multiplicator1 * v1[ i ] + multiplicator2 * v2[ i ];
      else
         #ifdef HAVE_OPENMP
         #pragma omp parallel for if( TNL::Devices::Host::isOMPEnabled() && n > OpenMPVectorOperationsThreshold )
         #pragma omp parallel for if( TNL::Execution::OpenMP::isEnabled() && n > OpenMPVectorOperationsThreshold )
         #endif
         for( Index i = 0; i < n; i ++ )
            v[ i ] = thisMultiplicator * v[ i ] + multiplicator1 * v1[ i ] + multiplicator2 * v2[ i ];
@@ -84,7 +84,7 @@ struct VectorOperations< Devices::Host >
};

template<>
struct VectorOperations< Devices::Cuda >
struct VectorOperations< Execution::Cuda >
{
   template< typename Vector1, typename Vector2, typename Scalar1, typename Scalar2 >
   static void addVector( Vector1& _y,
@@ -104,9 +104,9 @@ struct VectorOperations< Devices::Cuda >
      auto add2 = [=] __cuda_callable__ ( IndexType i ) { y[ i ] = thisMultiplicator * y[ i ] + alpha * x[ i ]; };

      if( thisMultiplicator == 1.0 )
         Algorithms::ParallelFor< Devices::Cuda >::exec( (IndexType) 0, _y.getSize(), add1 );
         Algorithms::ParallelFor< Execution::Cuda >::exec( (IndexType) 0, _y.getSize(), add1 );
      else
         Algorithms::ParallelFor< Devices::Cuda >::exec( (IndexType) 0, _y.getSize(), add2 );
         Algorithms::ParallelFor< Execution::Cuda >::exec( (IndexType) 0, _y.getSize(), add2 );
   }

   template< typename Vector1, typename Vector2, typename Vector3, typename Scalar1, typename Scalar2, typename Scalar3 >
@@ -131,9 +131,9 @@ struct VectorOperations< Devices::Cuda >
      auto add2 = [=] __cuda_callable__ ( IndexType i ) { v[ i ] = thisMultiplicator * v[ i ] + multiplicator1 * v1[ i ] + multiplicator2 * v2[ i ]; };

      if( thisMultiplicator == 1.0 )
         Algorithms::ParallelFor< Devices::Cuda >::exec( (IndexType) 0, _v.getSize(), add1 );
         Algorithms::ParallelFor< Execution::Cuda >::exec( (IndexType) 0, _v.getSize(), add1 );
      else
         Algorithms::ParallelFor< Devices::Cuda >::exec( (IndexType) 0, _v.getSize(), add2 );
         Algorithms::ParallelFor< Execution::Cuda >::exec( (IndexType) 0, _v.getSize(), add2 );
   }
};

+19 −19
Original line number Diff line number Diff line
@@ -23,14 +23,14 @@ namespace Benchmarks {

template< typename Real = double,
          typename Index = int,
          template<typename> class HostAllocator = Allocators::Default< Devices::Host >::Allocator,
          template<typename> class CudaAllocator = Allocators::Default< Devices::Cuda >::Allocator >
          template<typename> class HostAllocator = Allocators::Default< Execution::OpenMP >::Allocator,
          template<typename> class CudaAllocator = Allocators::Default< Execution::Cuda >::Allocator >
void
benchmarkArrayOperations( Benchmark & benchmark,
                          const long & size )
{
   using HostArray = Containers::Array< Real, Devices::Host, Index, HostAllocator< Real > >;
   using CudaArray = Containers::Array< Real, Devices::Cuda, Index, CudaAllocator< Real > >;
   using HostArray = Containers::Array< Real, Execution::OpenMP, Index, HostAllocator< Real > >;
   using CudaArray = Containers::Array< Real, Execution::Cuda, Index, CudaAllocator< Real > >;

   double datasetSize = (double) size * sizeof( Real ) / oneGB;

@@ -77,14 +77,14 @@ benchmarkArrayOperations( Benchmark & benchmark,
            resultHost = false;
      };
      benchmark.setOperation( "comparison (memcmp)", 2 * datasetSize );
      benchmark.time< Devices::Host >( reset12, "CPU", compareHost );
      benchmark.time< Execution::OpenMP >( reset12, "CPU", compareHost );

      // std::memcpy and cudaMemcpy
      auto copyHost = [&]() {
         std::memcpy( hostArray.getData(), hostArray2.getData(), hostArray.getSize() * sizeof(Real) );
      };
      benchmark.setOperation( "copy (memcpy)", 2 * datasetSize );
      benchmark.time< Devices::Host >( reset12, "CPU", copyHost );
      benchmark.time< Execution::OpenMP >( reset12, "CPU", copyHost );
#ifdef HAVE_CUDA
      auto copyCuda = [&]() {
         cudaMemcpy( deviceArray.getData(),
@@ -93,7 +93,7 @@ benchmarkArrayOperations( Benchmark & benchmark,
                     cudaMemcpyDeviceToDevice );
         TNL_CHECK_CUDA_DEVICE;
      };
      benchmark.time< Devices::Cuda >( reset12, "GPU", copyCuda );
      benchmark.time< Execution::Cuda >( reset12, "GPU", copyCuda );
#endif
   }

@@ -102,12 +102,12 @@ benchmarkArrayOperations( Benchmark & benchmark,
      resultHost = (int) ( hostArray == hostArray2 );
   };
   benchmark.setOperation( "comparison (operator==)", 2 * datasetSize );
   benchmark.time< Devices::Host >( reset1, "CPU", compareHost );
   benchmark.time< Execution::OpenMP >( reset1, "CPU", compareHost );
#ifdef HAVE_CUDA
   auto compareCuda = [&]() {
      resultDevice = (int) ( deviceArray == deviceArray2 );
   };
   benchmark.time< Devices::Cuda >( reset1, "GPU", compareCuda );
   benchmark.time< Execution::Cuda >( reset1, "GPU", compareCuda );
#endif


@@ -117,12 +117,12 @@ benchmarkArrayOperations( Benchmark & benchmark,
   benchmark.setOperation( "copy (operator=)", 2 * datasetSize );
   // copyBasetime is used later inside HAVE_CUDA guard, so the compiler will
   // complain when compiling without CUDA
   const double copyBasetime = benchmark.time< Devices::Host >( reset1, "CPU", copyAssignHostHost );
   const double copyBasetime = benchmark.time< Execution::OpenMP >( reset1, "CPU", copyAssignHostHost );
#ifdef HAVE_CUDA
   auto copyAssignCudaCuda = [&]() {
      deviceArray = deviceArray2;
   };
   benchmark.time< Devices::Cuda >( reset1, "GPU", copyAssignCudaCuda );
   benchmark.time< Execution::Cuda >( reset1, "GPU", copyAssignCudaCuda );
#endif


@@ -134,8 +134,8 @@ benchmarkArrayOperations( Benchmark & benchmark,
      hostArray = deviceArray;
   };
   benchmark.setOperation( "copy (operator=)", datasetSize, copyBasetime );
   benchmark.time< Devices::Cuda >( reset1, "CPU->GPU", copyAssignHostCuda );
   benchmark.time< Devices::Cuda >( reset1, "GPU->CPU", copyAssignCudaHost );
   benchmark.time< Execution::Cuda >( reset1, "CPU->GPU", copyAssignHostCuda );
   benchmark.time< Execution::Cuda >( reset1, "GPU->CPU", copyAssignCudaHost );
#endif


@@ -143,12 +143,12 @@ benchmarkArrayOperations( Benchmark & benchmark,
      hostArray.setValue( 3.0 );
   };
   benchmark.setOperation( "setValue", datasetSize );
   benchmark.time< Devices::Host >( reset1, "CPU", setValueHost );
   benchmark.time< Execution::OpenMP >( reset1, "CPU", setValueHost );
#ifdef HAVE_CUDA
   auto setValueCuda = [&]() {
      deviceArray.setValue( 3.0 );
   };
   benchmark.time< Devices::Cuda >( reset1, "GPU", setValueCuda );
   benchmark.time< Execution::Cuda >( reset1, "GPU", setValueCuda );
#endif


@@ -162,12 +162,12 @@ benchmarkArrayOperations( Benchmark & benchmark,
#endif
   };
   benchmark.setOperation( "allocation (setSize)", datasetSize );
   benchmark.time< Devices::Host >( resetSize1, "CPU", setSizeHost );
   benchmark.time< Execution::OpenMP >( resetSize1, "CPU", setSizeHost );
#ifdef HAVE_CUDA
   auto setSizeCuda = [&]() {
      deviceArray.setSize( size );
   };
   benchmark.time< Devices::Cuda >( resetSize1, "GPU", setSizeCuda );
   benchmark.time< Execution::Cuda >( resetSize1, "GPU", setSizeCuda );
#endif


@@ -181,12 +181,12 @@ benchmarkArrayOperations( Benchmark & benchmark,
#endif
   };
   benchmark.setOperation( "deallocation (reset)", datasetSize );
   benchmark.time< Devices::Host >( setSize1, "CPU", resetSizeHost );
   benchmark.time< Execution::OpenMP >( setSize1, "CPU", resetSizeHost );
#ifdef HAVE_CUDA
   auto resetSizeCuda = [&]() {
      deviceArray.reset();
   };
   benchmark.time< Devices::Cuda >( setSize1, "GPU", resetSizeCuda );
   benchmark.time< Execution::Cuda >( setSize1, "GPU", resetSizeCuda );
#endif
}

+11 −11
Original line number Diff line number Diff line
@@ -24,8 +24,8 @@ namespace TNL {
namespace Benchmarks {

// silly alias to match the number of template parameters with other formats
template< typename Real, typename Device, typename Index >
using SlicedEllpack = Matrices::SlicedEllpack< Real, Device, Index >;
template< typename Real, typename Execution, typename Index >
using SlicedEllpack = Matrices::SlicedEllpack< Real, Execution, Index >;

template< typename Matrix >
int setHostTestMatrix( Matrix& matrix,
@@ -81,7 +81,7 @@ void setCudaTestMatrix( Matrix& matrix,
         cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize();
      setCudaTestMatrixKernel< Matrix >
         <<< cudaGridSize, cudaBlockSize >>>
         ( &kernel_matrix.template modifyData< Devices::Cuda >(), elementsPerRow, gridIdx );
         ( &kernel_matrix.template modifyData< Execution::Cuda >(), elementsPerRow, gridIdx );
        TNL_CHECK_CUDA_DEVICE;
   }
#endif
@@ -96,15 +96,15 @@ benchmarkSpMV( Benchmark & benchmark,
               const int & size,
               const int elementsPerRow = 5 )
{
   typedef Matrix< Real, Devices::Host, int > HostMatrix;
   typedef Matrix< Real, Devices::Cuda, int > DeviceMatrix;
   typedef Containers::Vector< Real, Devices::Host, int > HostVector;
   typedef Containers::Vector< Real, Devices::Cuda, int > CudaVector;
   typedef Matrix< Real, Execution::OpenMP, int > HostMatrix;
   typedef Matrix< Real, Execution::Cuda, int > DeviceMatrix;
   typedef Containers::Vector< Real, Execution::OpenMP, int > HostVector;
   typedef Containers::Vector< Real, Execution::Cuda, int > CudaVector;

   HostMatrix hostMatrix;
   DeviceMatrix deviceMatrix;
   Containers::Vector< int, Devices::Host, int > hostRowLengths;
   Containers::Vector< int, Devices::Cuda, int > deviceRowLengths;
   Containers::Vector< int, Execution::OpenMP, int > hostRowLengths;
   Containers::Vector< int, Execution::Cuda, int > deviceRowLengths;
   HostVector hostVector, hostVector2;
   CudaVector deviceVector, deviceVector2;

@@ -156,12 +156,12 @@ benchmarkSpMV( Benchmark & benchmark,
      hostMatrix.vectorProduct( hostVector, hostVector2 );
   };
   benchmark.setOperation( datasetSize );
   benchmark.time< Devices::Host >( reset, "CPU", spmvHost );
   benchmark.time< Execution::OpenMP >( reset, "CPU", spmvHost );
#ifdef HAVE_CUDA
   auto spmvCuda = [&]() {
      deviceMatrix.vectorProduct( deviceVector, deviceVector2 );
   };
   benchmark.time< Devices::Cuda >( reset, "GPU", spmvCuda );
   benchmark.time< Execution::Cuda >( reset, "GPU", spmvCuda );
#endif
}

Loading