Commit 7dce286c authored by Tomáš Oberhuber's avatar Tomáš Oberhuber Committed by Jakub Klinkovský
Browse files

Fixed grid traversers benchmark.

parent 56f0c672
Loading
Loading
Loading
Loading
+0 −109
Original line number Diff line number Diff line
/***************************************************************************
                          WriteOne.h  -  description
                             -------------------
    begin                : Dec 19, 2018
    copyright            : (C) 2018 by oberhuber
    email                : tomas.oberhuber@fjfi.cvut.cz
 ***************************************************************************/

/* See Copyright Notice in tnl/Copyright */

// Implemented by: Tomas Oberhuber

#pragma once

#include <TNL/ParallelFor.h>
#include <TNL/Devices/Host.h>
#include <TNL/Devices/Cuda.h>
#include <TNL/Containers/Vector.h>

namespace TNL {
   namespace Benchmarks {
      

/**
 * \brief Benchmark helper that writes the value 1 into every cell of a
 * \e Dimensions-dimensional grid stored as one contiguous vector.
 *
 * The primary template is intentionally empty; only the specializations
 * for dimensions 1, 2 and 3 below provide an implementation.
 *
 * \tparam Dimensions grid dimension (1, 2 or 3).
 * \tparam Device     execution device (Devices::Host or Devices::Cuda).
 * \tparam Real       value type of the grid cells.
 * \tparam Index      index type used by the traversal lambda.
 */
template< int Dimensions,
          typename Device,
          typename Real,
          typename Index >
class WriteOne{};

template< typename Device,
          typename Real,
          typename Index >
class WriteOne< 1, Device, Real, Index >
{
   public:
      
      using Vector = Containers::Vector< Real, Device, Index >;
      
      static void run( std::size_t size )
      {
         Vector v( size );
         auto writeOne = [] __cuda_callable__ ( Index i, Real* data )
         {
            data[ i ] = 1.0;
         };
         
         ParallelFor< Device >::exec( ( std::size_t ) 0, size, writeOne, v.getData() );
      }
};


template< typename Device,
          typename Real,
          typename Index >
class WriteOne< 2, Device, Real, Index >
{
   public:
      
      using Vector = Containers::Vector< Real, Device, Index >;
      
      static void run( std::size_t size )
      {
         Vector v( size * size );
         auto writeOne = [=] __cuda_callable__ ( Index i, Index j,  Real* data )
         {
            data[ i * size + j ] = 1.0;
         };
         
         ParallelFor2D< Device >::exec( ( std::size_t ) 0,
                                        ( std::size_t ) 0,
                                        size,
                                        size,
                                        writeOne, v.getData() );         
      }
};

template< typename Device,
          typename Real,
          typename Index >
class WriteOne< 3, Device, Real, Index >
{
   public:
      
      using Vector = Containers::Vector< Real, Device, Index >;
      
      static void run( std::size_t size )
      {
         Vector v( size * size * size );
         auto writeOne = [=] __cuda_callable__ ( Index i, Index j, Index k, Real* data )
         {
            data[ ( i * size + j ) * size + k ] = 1.0;
         };
         
         ParallelFor3D< Device >::exec( ( std::size_t ) 0, 
                                        ( std::size_t ) 0, 
                                        ( std::size_t ) 0, 
                                        size,
                                        size,
                                        size,
                                        writeOne, v.getData() );         
      }
};


   } // namespace Benchmarks
} // namespace TNL


+1 −19
Original line number Diff line number Diff line
@@ -13,7 +13,7 @@
#pragma once

#include "../Benchmarks.h"
#include "WriteOne.h"


#include <TNL/Containers/Vector.h>

@@ -29,24 +29,6 @@ class benchmarkTraversingFullGrid

      static void run ( Benchmark& benchmark, std::size_t size )
      {
         auto reset = [&]()
         {};
         
         auto testHost = [&] ()
         {
            WriteOne< Dimension, Devices::Host, Real, Index >::run( size );
         }; 
         
         auto testCuda = [&] ()
         {
            WriteOne< Dimension, Devices::Cuda, Real, Index >::run( size );
         }; 
         
         benchmark.setOperation( "writeOne", size * sizeof( Real ) );
         benchmark.time( reset, "CPU", testHost );
#ifdef HAVE_CUDA
         benchmark.time( reset, "GPU", testCuda );
#endif

      }
};
+56 −30
Original line number Diff line number Diff line
@@ -13,7 +13,8 @@
#pragma once

#include "../Benchmarks.h"
#include "grid-traversing.h"
//#include "grid-traversing.h"
#include "GridTraversersBenchmark.h"

#include <TNL/Config/ConfigDescription.h>
#include <TNL/Devices/Host.h>
@@ -23,29 +24,10 @@
using namespace TNL;
using namespace TNL::Benchmarks;

void setupConfig( Config::ConfigDescription& config )
{
   config.addEntry< String >( "log-file", "Log file name.", "tnl-benchmark-blas.log");
   config.addEntry< String >( "output-mode", "Mode for opening the log file.", "overwrite" );
   config.addEntryEnum( "append" );
   config.addEntryEnum( "overwrite" );
   config.addEntry< String >( "precision", "Precision of the arithmetics.", "double" );
   config.addEntryEnum( "float" );
   config.addEntryEnum( "double" );
   config.addEntryEnum( "all" );
   config.addEntry< int >( "dimension", "Set the problem dimension. 0 means all dimensions 1,2 and 3.", 0 );   
   config.addEntry< int >( "min-size", "Minimum size of arrays/vectors used in the benchmark.", 10 );
   config.addEntry< int >( "max-size", "Maximum size of arrays/vectors used in the benchmark.", 1000 );
   config.addEntry< int >( "size-step-factor", "Factor determining the size of arrays/vectors used in the benchmark. First size is min-size and each following size is stepFactor*previousSize, up to max-size.", 2 );
   config.addEntry< int >( "loops", "Number of iterations for every computation.", 10 );   
   config.addEntry< int >( "verbose", "Verbose mode.", 1 );

   config.addDelimiter( "Device settings:" );
   Devices::Host::configSetup( config );
   Devices::Cuda::configSetup( config );   
}

template< int Dimension >
template< int Dimension,
          typename Real = float,
          typename Index = int >
bool runBenchmark( const Config::ParameterContainer& parameters,
                   Benchmark& benchmark,
                   Benchmark::MetadataMap& metadata )
@@ -62,14 +44,59 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
   benchmark.newBenchmark( String("Full grid traversing " + convertToString( Dimension ) + "D" ), metadata );
   for( std::size_t size = minSize; size <= maxSize; size *= 2 )
   {
      benchmark.setMetadataColumns( Benchmark::MetadataColumns({
         {"size", convertToString( size ) },
      } ));
      benchmarkTraversingFullGrid< Dimension >::run( benchmark, size );

      GridTraversersBenchmark< Dimension, Devices::Host, Real, Index > hostTraverserBenchmark( size );
      GridTraversersBenchmark< Dimension, Devices::Cuda, Real, Index > cudaTraverserBenchmark( size );         

      auto reset = [&]() {};
      
      benchmark.setMetadataColumns(
         Benchmark::MetadataColumns( 
            {  {"size", convertToString( size ) }, } ) );

      auto hostWriteOne = [&] ()
      {
         hostTraverserBenchmark.writeOne();
      }; 

      auto cudaWriteOne = [&] ()
      {
         cudaTraverserBenchmark.writeOne();
      }; 

      benchmark.setOperation( "writeOne", size * sizeof( Real ) );
      benchmark.time( reset, "CPU", hostWriteOne );
#ifdef HAVE_CUDA
      benchmark.time( reset, "GPU", cudaWriteOne );
#endif
      
   }   
   return true;
}

void setupConfig( Config::ConfigDescription& config )
{
   config.addEntry< String >( "log-file", "Log file name.", "tnl-benchmark-blas.log");
   config.addEntry< String >( "output-mode", "Mode for opening the log file.", "overwrite" );
   config.addEntryEnum( "append" );
   config.addEntryEnum( "overwrite" );
   config.addEntry< String >( "precision", "Precision of the arithmetics.", "double" );
   config.addEntryEnum( "float" );
   config.addEntryEnum( "double" );
   config.addEntryEnum( "all" );
   config.addEntry< int >( "dimension", "Set the problem dimension. 0 means all dimensions 1,2 and 3.", 0 );   
   config.addEntry< int >( "min-size", "Minimum size of arrays/vectors used in the benchmark.", 10 );
   config.addEntry< int >( "max-size", "Maximum size of arrays/vectors used in the benchmark.", 1000 );
   config.addEntry< int >( "size-step-factor", "Factor determining the size of arrays/vectors used in the benchmark. First size is min-size and each following size is stepFactor*previousSize, up to max-size.", 2 );
   config.addEntry< bool >( "verbose", "Verbose mode.", true );

   Benchmark::configSetup( config );
   
   config.addDelimiter( "Device settings:" );
   Devices::Host::configSetup( config );
   Devices::Cuda::configSetup( config );   
}

template< int Dimension >
bool setupBenchmark( const Config::ParameterContainer& parameters )
{
@@ -77,10 +104,9 @@ bool setupBenchmark( const Config::ParameterContainer& parameters )
   const String & outputMode = parameters.getParameter< String >( "output-mode" );
   const String & precision = parameters.getParameter< String >( "precision" );
   const unsigned sizeStepFactor = parameters.getParameter< unsigned >( "size-step-factor" );
   const unsigned loops = parameters.getParameter< unsigned >( "loops" );
   const unsigned verbose = parameters.getParameter< unsigned >( "verbose" );
   
   Benchmark benchmark( loops, verbose );

   Benchmark benchmark; //( loops, verbose );
   Benchmark::MetadataMap metadata = getHardwareMetadata();
   runBenchmark< Dimension >( parameters, benchmark, metadata );
   
+6 −9
Original line number Diff line number Diff line
INSTALL( FILES matrix-market
               florida-matrix-market
               get-matrices
               convert-matrices
               draw-matrices
INSTALL( FILES tnl-run-heat-equation-benchmark
               run-tnl-benchmark-spmv
               run-tnl-benchmark-traversers
               run-matrix-solvers-benchmark
               run-tnl-benchmark-spmv
               run-tnl-benchmark-linear-solvers
               tnl-run-heat-equation-benchmark
               cuda-profiler.conf
               process-cuda-profile.pl 
               
               DESTINATION ${TNL_TARGET_DATA_DIRECTORY}/benchmark-scripts )

INSTALL( FILES tnl-run-spmv-benchmark
INSTALL( FILES run-tnl-benchmark-spmv
               run-tnl-benchmark-traversers
         DESTINATION bin
         PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ WORLD_EXECUTE )