Commit e444116e authored by Jakub Klinkovský's avatar Jakub Klinkovský Committed by Jakub Klinkovský
Browse files

Added NDArray

parent 84b7a213
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
add_subdirectory( HeatEquation )
add_subdirectory( BLAS )
add_subdirectory( NDArray )
add_subdirectory( SpMV )
add_subdirectory( DistSpMV )
add_subdirectory( LinearSolvers )
+9 −0
Original line number Diff line number Diff line
add_executable( tnl-benchmark-ndarray tnl-benchmark-ndarray.cpp )
target_compile_options( tnl-benchmark-ndarray PRIVATE ${CXX_TESTS_FLAGS} )
install( TARGETS tnl-benchmark-ndarray RUNTIME DESTINATION bin )

if( BUILD_CUDA )
   cuda_add_executable( tnl-benchmark-ndarray-cuda tnl-benchmark-ndarray-cuda.cu
                        OPTIONS ${CXX_TESTS_FLAGS} )
   install( TARGETS tnl-benchmark-ndarray-cuda RUNTIME DESTINATION bin )
endif()
+1 −0
Original line number Diff line number Diff line
#include "tnl-benchmark-ndarray.h"
+1 −0
Original line number Diff line number Diff line
#include "tnl-benchmark-ndarray.h"
+464 −0
Original line number Diff line number Diff line
/***************************************************************************
                          tnl-benchmark-ndarray.h  -  description
                             -------------------
    begin                : Dec 24, 2018
    copyright            : (C) 2018 by Tomas Oberhuber et al.
    email                : tomas.oberhuber@fjfi.cvut.cz
 ***************************************************************************/

/* See Copyright Notice in tnl/Copyright */

// Implemented by: Jakub Klinkovsky

#pragma once

#include <TNL/Assert.h>
#include <TNL/Math.h>
#include <TNL/ParallelFor.h>

#include <TNL/Containers/NDArray.h>
#include <TNL/Containers/ndarray/Operations.h>

#include "../Benchmarks.h"

using namespace TNL;
using namespace TNL::Benchmarks;
using namespace TNL::Containers;
using std::index_sequence;

using value_type = float;
//using index_type = std::size_t;
using index_type = unsigned;

template< typename Array >
void expect_eq_chunked( Array& a, Array& b )
{
   // TODO: use something like EXPECT_EQ
   TNL_ASSERT_EQ( a.getSize(), b.getSize(), "array sizes don't match" );
   if( a.getSize() != b.getSize() )
      return;

   using IndexType = typename Array::IndexType;

   const IndexType chunk_size = 4096;
   for( IndexType c = 0; c < (IndexType) roundUpDivision( a.getSize(), chunk_size ); c++ ) {
      const typename Array::IndexType this_chunk_size = TNL::min( chunk_size, a.getSize() - c * chunk_size );
      Array a_chunk( &a[ c * chunk_size ], this_chunk_size );
      Array b_chunk( &b[ c * chunk_size ], this_chunk_size );
      // TODO: use something like EXPECT_EQ
      TNL_ASSERT_EQ( a_chunk, b_chunk, "chunks are not equal" );
   }
}

template< typename Array >
void expect_eq( Array& a, Array& b )
{
   if( std::is_same< typename Array::DeviceType, TNL::Devices::Cuda >::value ) {
      typename Array::HostType a_host, b_host;
      a_host = a;
      b_host = b;
      expect_eq_chunked( a_host, b_host );
   }
   else {
      expect_eq_chunked( a, b );
   }
}

template< typename Device >
const char* performer()
{
   if( std::is_same< Device, Devices::Host >::value )
      return "CPU";
   else if( std::is_same< Device, Devices::Cuda >::value )
      return "GPU";
   else
      return "unknown";
}

void reset() {}

// NOTE: having the sizes as function parameters keeps the compiler from treating them
// as "compile-time constants" and thus e.g. optimizing the 1D iterations with memcpy

template< typename Device >
void benchmark_array( Benchmark& benchmark, index_type size = 500000000 )
{
   Array< value_type, Device > a, b;
   a.setSize( size );
   b.setSize( size );
   a.setValue( -1 );
   b.setValue( 1 );

   auto kernel = [] __cuda_callable__
      ( int i,
        value_type* a,
        const value_type* b )
   {
      a[ i ] = b[ i ];
   };

   auto f = [&]() {
      TNL::ParallelFor< Device >::exec( 0, (int) size, kernel, a.getData(), b.getData() );
   };

   // warm-up for all benchmarks
   f();

   const double datasetSize = 2 * size * sizeof(value_type) / oneGB;
   benchmark.setOperation( "array", datasetSize );
   benchmark.time< Device >( reset, performer< Device >(), f );

   expect_eq( a, b );
}

template< typename Device >
void benchmark_1D( Benchmark& benchmark, index_type size = 500000000 )
{
   NDArray< value_type,
            SizesHolder< index_type, 0 >,
            std::make_index_sequence< 1 >,
            std::make_index_sequence< 1 >,
            Device > a, b;
   a.setSizes( size );
   b.setSizes( size );
   a.getStorageArray().setValue( -1 );
   b.getStorageArray().setValue( 1 );

   auto f = [&]() {
      nd_map( a, [] __cuda_callable__ (value_type v1) { return v1; }, b );
   };

   const double datasetSize = 2 * size * sizeof(value_type) / oneGB;
   benchmark.setOperation( "1D", datasetSize );
   benchmark.time< Device >( reset, performer< Device >(), f );

   expect_eq( a.getStorageArray(), b.getStorageArray() );
}

template< typename Device >
void benchmark_2D( Benchmark& benchmark, index_type size = 22333 )
{
   NDArray< value_type,
            SizesHolder< index_type, 0, 0 >,
            std::make_index_sequence< 2 >,
            std::make_index_sequence< 2 >,
            Device > a, b;
   a.setSizes( size, size );
   b.setSizes( size, size );
   a.getStorageArray().setValue( -1 );
   b.getStorageArray().setValue( 1 );

   auto f = [&]() {
      nd_map( a, [] __cuda_callable__ (value_type v1) { return v1; }, b );
   };

   const double datasetSize = 2 * std::pow( size, 2 ) * sizeof(value_type) / oneGB;
   benchmark.setOperation( "2D", datasetSize );
   benchmark.time< Device >( reset, performer< Device >(), f );

   expect_eq( a.getStorageArray(), b.getStorageArray() );
}

template< typename Device >
void benchmark_3D( Benchmark& benchmark, index_type size = 800 )
{
   NDArray< value_type,
            SizesHolder< index_type, 0, 0, 0 >,
            std::make_index_sequence< 3 >,
            std::make_index_sequence< 3 >,
            Device > a, b;
   a.setSizes( size, size, size );
   b.setSizes( size, size, size );
   a.getStorageArray().setValue( -1 );
   b.getStorageArray().setValue( 1 );

   auto f = [&]() {
      nd_map( a, [] __cuda_callable__ (value_type v1) { return v1; }, b );
   };

   const double datasetSize = 2 * std::pow( size, 3 ) * sizeof(value_type) / oneGB;
   benchmark.setOperation( "3D", datasetSize );
   benchmark.time< Device >( reset, performer< Device >(), f );

   expect_eq( a.getStorageArray(), b.getStorageArray() );
}

template< typename Device >
void benchmark_4D( Benchmark& benchmark, index_type size = 150 )
{
   NDArray< value_type,
            SizesHolder< index_type, 0, 0, 0, 0 >,
            std::make_index_sequence< 4 >,
            std::make_index_sequence< 4 >,
            Device > a, b;
   a.setSizes( size, size, size, size );
   b.setSizes( size, size, size, size );
   a.getStorageArray().setValue( -1 );
   b.getStorageArray().setValue( 1 );

   auto f = [&]() {
      nd_map( a, [] __cuda_callable__ (value_type v1) { return v1; }, b );
   };

   const double datasetSize = 2 * std::pow( size, 4 ) * sizeof(value_type) / oneGB;
   benchmark.setOperation( "4D", datasetSize );
   benchmark.time< Device >( reset, performer< Device >(), f );

   expect_eq( a.getStorageArray(), b.getStorageArray() );
}

template< typename Device >
void benchmark_5D( Benchmark& benchmark, index_type size = 56 )
{
   NDArray< value_type,
            SizesHolder< index_type, 0, 0, 0, 0, 0 >,
            std::make_index_sequence< 5 >,
            std::make_index_sequence< 5 >,
            Device > a, b;
   a.setSizes( size, size, size, size, size );
   b.setSizes( size, size, size, size, size );
   a.getStorageArray().setValue( -1 );
   b.getStorageArray().setValue( 1 );

   auto f = [&]() {
      nd_map( a, [] __cuda_callable__ (value_type v1) { return v1; }, b );
   };

   const double datasetSize = 2 * std::pow( size, 5 ) * sizeof(value_type) / oneGB;
   benchmark.setOperation( "5D", datasetSize );
   benchmark.time< Device >( reset, performer< Device >(), f );

   expect_eq( a.getStorageArray(), b.getStorageArray() );
}

template< typename Device >
void benchmark_6D( Benchmark& benchmark, index_type size = 28 )
{
   NDArray< value_type,
            SizesHolder< index_type, 0, 0, 0, 0, 0, 0 >,
            std::make_index_sequence< 6 >,
            std::make_index_sequence< 6 >,
            Device > a, b;
   a.setSizes( size, size, size, size, size, size );
   b.setSizes( size, size, size, size, size, size );
   a.getStorageArray().setValue( -1 );
   b.getStorageArray().setValue( 1 );

   auto f = [&]() {
      nd_map( a, [] __cuda_callable__ (value_type v1) { return v1; }, b );
   };

   const double datasetSize = 2 * std::pow( size, 6 ) * sizeof(value_type) / oneGB;
   benchmark.setOperation( "6D", datasetSize );
   benchmark.time< Device >( reset, performer< Device >(), f );

   expect_eq( a.getStorageArray(), b.getStorageArray() );
}


template< typename Device >
void benchmark_2D_perm( Benchmark& benchmark, index_type size = 22333 )
{
   NDArray< value_type,
            SizesHolder< index_type, 0, 0 >,
            std::index_sequence< 1, 0 >,
            std::index_sequence< 1, 0 >,
            Device > a, b;
   a.setSizes( size, size );
   b.setSizes( size, size );
   a.getStorageArray().setValue( -1 );
   b.getStorageArray().setValue( 1 );

   auto f = [&]() {
      nd_map( a, [] __cuda_callable__ (value_type v1) { return v1; }, b );
   };

   const double datasetSize = 2 * std::pow( size, 2 ) * sizeof(value_type) / oneGB;
   benchmark.setOperation( "2D permuted", datasetSize );
   benchmark.time< Device >( reset, performer< Device >(), f );

   expect_eq( a.getStorageArray(), b.getStorageArray() );
}

template< typename Device >
void benchmark_3D_perm( Benchmark& benchmark, index_type size = 800 )
{
   NDArray< value_type,
            SizesHolder< index_type, 0, 0, 0 >,
            std::index_sequence< 2, 1, 0 >,
            std::index_sequence< 2, 1, 0 >,
            Device > a, b;
   a.setSizes( size, size, size );
   b.setSizes( size, size, size );
   a.getStorageArray().setValue( -1 );
   b.getStorageArray().setValue( 1 );

   auto f = [&]() {
      nd_map( a, [] __cuda_callable__ (value_type v1) { return v1; }, b );
   };

   const double datasetSize = 2 * std::pow( size, 3 ) * sizeof(value_type) / oneGB;
   benchmark.setOperation( "3D permuted", datasetSize );
   benchmark.time< Device >( reset, performer< Device >(), f );

   expect_eq( a.getStorageArray(), b.getStorageArray() );
}

template< typename Device >
void benchmark_4D_perm( Benchmark& benchmark, index_type size = 150 )
{
   NDArray< value_type,
            SizesHolder< index_type, 0, 0, 0, 0 >,
            std::index_sequence< 3, 2, 1, 0 >,
            std::index_sequence< 3, 2, 1, 0 >,
            Device > a, b;
   a.setSizes( size, size, size, size );
   b.setSizes( size, size, size, size );
   a.getStorageArray().setValue( -1 );
   b.getStorageArray().setValue( 1 );

   auto f = [&]() {
      nd_map( a, [] __cuda_callable__ (value_type v1) { return v1; }, b );
   };

   const double datasetSize = 2 * std::pow( size, 4 ) * sizeof(value_type) / oneGB;
   benchmark.setOperation( "4D permuted", datasetSize );
   benchmark.time< Device >( reset, performer< Device >(), f );

   expect_eq( a.getStorageArray(), b.getStorageArray() );
}

template< typename Device >
void benchmark_5D_perm( Benchmark& benchmark, index_type size = 56 )
{
   NDArray< value_type,
            SizesHolder< index_type, 0, 0, 0, 0, 0 >,
            std::index_sequence< 4, 3, 2, 1, 0 >,
            std::index_sequence< 4, 3, 2, 1, 0 >,
            Device > a, b;
   a.setSizes( size, size, size, size, size );
   b.setSizes( size, size, size, size, size );
   a.getStorageArray().setValue( -1 );
   b.getStorageArray().setValue( 1 );

   auto f = [&]() {
      nd_map( a, [] __cuda_callable__ (value_type v1) { return v1; }, b );
   };

   const double datasetSize = 2 * std::pow( size, 5 ) * sizeof(value_type) / oneGB;
   benchmark.setOperation( "5D permuted", datasetSize );
   benchmark.time< Device >( reset, performer< Device >(), f );

   expect_eq( a.getStorageArray(), b.getStorageArray() );
}

template< typename Device >
void benchmark_6D_perm( Benchmark& benchmark, index_type size = 28 )
{
   NDArray< value_type,
            SizesHolder< index_type, 0, 0, 0, 0, 0, 0 >,
            std::index_sequence< 5, 4, 3, 2, 1, 0 >,
            std::index_sequence< 5, 4, 3, 2, 1, 0 >,
            Device > a, b;
   a.setSizes( size, size, size, size, size, size );
   b.setSizes( size, size, size, size, size, size );
   a.getStorageArray().setValue( -1 );
   b.getStorageArray().setValue( 1 );

   auto f = [&]() {
      nd_map( a, [] __cuda_callable__ (value_type v1) { return v1; }, b );
   };

   const double datasetSize = 2 * std::pow( size, 6 ) * sizeof(value_type) / oneGB;
   benchmark.setOperation( "6D permuted", datasetSize );
   benchmark.time< Device >( reset, performer< Device >(), f );

   expect_eq( a.getStorageArray(), b.getStorageArray() );
}

template< typename Device >
void run_benchmarks( Benchmark& benchmark )
{
   benchmark_array< Device >( benchmark );
   benchmark_1D< Device >( benchmark );
   benchmark_2D< Device >( benchmark );
   benchmark_3D< Device >( benchmark );
   benchmark_4D< Device >( benchmark );
   benchmark_5D< Device >( benchmark );
   benchmark_6D< Device >( benchmark );
   benchmark_2D_perm< Device >( benchmark );
   benchmark_3D_perm< Device >( benchmark );
   benchmark_4D_perm< Device >( benchmark );
   benchmark_5D_perm< Device >( benchmark );
   benchmark_6D_perm< Device >( benchmark );
}

void setupConfig( Config::ConfigDescription & config )
{
   config.addDelimiter( "Benchmark settings:" );
   config.addEntry< String >( "log-file", "Log file name.", "tnl-benchmark-ndarray.log");
   config.addEntry< String >( "output-mode", "Mode for opening the log file.", "overwrite" );
   config.addEntryEnum( "append" );
   config.addEntryEnum( "overwrite" );
   config.addEntry< int >( "loops", "Number of iterations for every computation.", 10 );
   config.addEntry< int >( "verbose", "Verbose mode.", 1 );
   config.addEntry< String >( "devices", "Run benchmarks on these devices.", "all" );
   config.addEntryEnum( "all" );
   config.addEntryEnum( "host" );
   #ifdef HAVE_CUDA
   config.addEntryEnum( "cuda" );
   #endif

   config.addDelimiter( "Device settings:" );
   Devices::Host::configSetup( config );
   Devices::Cuda::configSetup( config );
}

int main( int argc, char* argv[] )
{
   Config::ParameterContainer parameters;
   Config::ConfigDescription conf_desc;

   setupConfig( conf_desc );

   if( ! parseCommandLine( argc, argv, conf_desc, parameters ) ) {
      conf_desc.printUsage( argv[ 0 ] );
      return EXIT_FAILURE;
   }

   if( ! Devices::Host::setup( parameters ) ||
       ! Devices::Cuda::setup( parameters ) )
      return EXIT_FAILURE;

   const String & logFileName = parameters.getParameter< String >( "log-file" );
   const String & outputMode = parameters.getParameter< String >( "output-mode" );
   const int loops = parameters.getParameter< int >( "loops" );
   const int verbose = parameters.getParameter< int >( "verbose" );

   // open log file
   auto mode = std::ios::out;
   if( outputMode == "append" )
       mode |= std::ios::app;
   std::ofstream logFile( logFileName.getString(), mode );

   // init benchmark and common metadata
   Benchmark benchmark( loops, verbose );

   // prepare global metadata
   Benchmark::MetadataMap metadata = getHardwareMetadata();

   const String devices = parameters.getParameter< String >( "devices" );
   if( devices == "all" || devices == "host" )
      run_benchmarks< Devices::Host >( benchmark );
#ifdef HAVE_CUDA
   if( devices == "all" || devices == "cuda" )
      run_benchmarks< Devices::Cuda >( benchmark );
#endif

   if( ! benchmark.save( logFile ) ) {
      std::cerr << "Failed to write the benchmark results to file '" << parameters.getParameter< String >( "log-file" ) << "'." << std::endl;
      return EXIT_FAILURE;
   }

   return EXIT_SUCCESS;
}
Loading