From 09467575801555fe35a275763980c0e07ebb0558 Mon Sep 17 00:00:00 2001
From: Tomas Oberhuber <tomas.oberhuber@fjfi.cvut.cz>
Date: Tue, 25 Dec 2018 13:11:48 +0100
Subject: [PATCH] Added pure-C test to traversers benchmark.

---
 src/Benchmarks/Benchmarks.h                   |   2 +-
 .../Traversers/GridTraversersBenchmark.h      | 174 ++++++++++++++++--
 .../Traversers/tnl-benchmark-traversers.h     |  66 +++++--
 3 files changed, 208 insertions(+), 34 deletions(-)

diff --git a/src/Benchmarks/Benchmarks.h b/src/Benchmarks/Benchmarks.h
index 7a6b12676d..c371e2dfb3 100644
--- a/src/Benchmarks/Benchmarks.h
+++ b/src/Benchmarks/Benchmarks.h
@@ -48,7 +48,7 @@ timeFunction( ComputeFunction compute,
    Timer timer;
 
    // set timer to the monitor
-   //monitor.setTimer( timer );
+   monitor.setTimer( timer );
 
    // warm up
    reset();
diff --git a/src/Benchmarks/Traversers/GridTraversersBenchmark.h b/src/Benchmarks/Traversers/GridTraversersBenchmark.h
index 0190532c31..ee18adfa6e 100644
--- a/src/Benchmarks/Traversers/GridTraversersBenchmark.h
+++ b/src/Benchmarks/Traversers/GridTraversersBenchmark.h
@@ -53,9 +53,37 @@ class WriteOneUserData
       using MeshType = typename MeshFunctionPointer::ObjectType::MeshType;
 
       MeshFunctionPointer u;
-
 };
-
+
+template< typename Real,
+          typename Index >
+__global__ void simpleCudaKernel1D( const Index size, const dim3 gridIdx, Real* v_data )
+{
+   const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   if( threadIdx_x < size )
+      v_data[ threadIdx_x ] = 1.0;
+}
+
+template< typename Real,
+          typename Index >
+__global__ void simpleCudaKernel2D( const Index size, const dim3 gridIdx, Real* v_data )
+{
+   const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   const Index threadIdx_y = ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y;
+   if( threadIdx_x < size && threadIdx_y < size )
+      v_data[ threadIdx_y * size + threadIdx_x ] = 1.0;
+}
+
+template< typename Real,
+          typename Index >
+__global__ void simpleCudaKernel3D( const Index size, const dim3 gridIdx, Real* v_data )
+{
+   const Index threadIdx_x = ( gridIdx.x * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   const Index threadIdx_y = ( gridIdx.y * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y;
+   const Index threadIdx_z = ( gridIdx.z * Devices::Cuda::getMaxGridSize() + blockIdx.z ) * blockDim.z + threadIdx.z;
+   if( threadIdx_x < size && threadIdx_y < size && threadIdx_z < size )
+      v_data[ ( threadIdx_z * size + threadIdx_y ) * size + threadIdx_x ] = 1.0;
+}
 
 template< int Dimension,
           typename Device,
@@ -85,19 +113,55 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
       :v( size ), size( size ), grid( size ), u( grid )
       {
          userData.u = this->u;
+         v_data = v.getData();
+      }
+
+      void reset()
+      {
+         v.setValue( 0.0 );
+         u->getData().setValue( 0.0 );
+      };
+
+      void writeOneUsingPureC()
+      {
+         if( std::is_same< Device, Devices::Host >::value )
+         {
+            for( int i = 0; i < size; i++ )
+               v_data[ i ] = 1.0;
+         }
+         else // Device == Devices::Cuda
+         {
+#ifdef HAVE_CUDA
+            dim3 blockSize( 256 ), blocksCount, gridsCount;
+            Devices::Cuda::setupThreads(
+               blockSize,
+               blocksCount,
+               gridsCount,
+               size );
+            dim3 gridIdx;
+            for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ )
+            {
+               dim3 gridSize;
+               Devices::Cuda::setupGrid(
+                  blocksCount,
+                  gridsCount,
+                  gridIdx,
+                  gridSize );
+               simpleCudaKernel1D<<< gridSize, blockSize >>>( size, gridIdx, v_data );
+            }
+#endif
+         }
       }
 
       void 
       writeOneUsingParallelFor()
       {
-         auto f = [] __cuda_callable__ ( Index i, Real* data ) { data[ i ] = 1.0; };
-         ParallelFor< Device >::exec( ( Index ) 0, size, f, v.getData() );
       }
-
+
       void writeOneUsingTraverser()
       {
         traverser.template processAllEntities< WriteOneTraverserUserDataType,
                                                 WriteOneEntitiesProcessorType >
@@ -108,6 +172,7 @@ class GridTraversersBenchmark< 1, Device, Real, Index >
 
       Index size;
       Vector v;
+      Real* v_data;
       GridPointer grid;
       MeshFunctionPointer u;
       Traverser traverser;
@@ -133,11 +198,52 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
       using TraverserUserData = WriteOneUserData< MeshFunctionPointer >;
       using WriteOneTraverserUserDataType = WriteOneUserData< MeshFunctionPointer >;
       using WriteOneEntitiesProcessorType = WriteOneEntitiesProcessor< WriteOneTraverserUserDataType >;
-
+
       GridTraversersBenchmark( Index size )
       :size( size ), v( size * size ), grid( size, size ), u( grid )
       {
         userData.u = this->u;
+        v_data = v.getData();
+      }
+
+      void reset()
+      {
+         v.setValue( 0.0 );
+         u->getData().setValue( 0.0 );
+      };
+
+      void writeOneUsingPureC()
+      {
+         if( std::is_same< Device, Devices::Host >::value )
+         {
+            for( int i = 0; i < size; i++ )
+               for( int j = 0; j < size; j++ )
+                  v_data[ i * size + j ] = 1.0;
+         }
+         else // Device == Devices::Cuda
+         {
+#ifdef HAVE_CUDA
+            dim3 blockSize( 256 ), blocksCount, gridsCount;
+            Devices::Cuda::setupThreads(
+               blockSize,
+               blocksCount,
+               gridsCount,
+               size,
+               size );
+            dim3 gridIdx;
+            for( gridIdx.y = 0; gridIdx.y < gridsCount.y; gridIdx.y++ )
+               for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ )
+               {
+                  dim3 gridSize;
+                  Devices::Cuda::setupGrid(
+                     blocksCount,
+                     gridsCount,
+                     gridIdx,
+                     gridSize );
+                  simpleCudaKernel2D<<< gridSize, blockSize >>>( size, gridIdx, v_data );
+               }
+#endif
+         }
       }
 
       void writeOneUsingParallelFor()
@@ -154,18 +260,18 @@ class GridTraversersBenchmark< 2, Device, Real, Index >
                this->size, f, v.getData() );
       }
-
+
       void writeOneUsingTraverser()
       {
         traverser.template processAllEntities< WriteOneTraverserUserDataType,
                                                 WriteOneEntitiesProcessorType >
            ( grid, userData );
       }
-
    protected:
 
       Index size;
       Vector v;
+      Real* v_data;
       GridPointer grid;
       MeshFunctionPointer u;
       Traverser traverser;
@@ -178,7 +284,7 @@ template< typename Device,
 class GridTraversersBenchmark< 3, Device, Real, Index >
 {
    public:
-
+
       using Vector = Containers::Vector< Real, Device, Index >;
       using Grid = Meshes::Grid< 3, Real, Device, Index >;
       using GridPointer = Pointers::SharedPointer< Grid >;
@@ -198,6 +304,50 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
        u( grid )
       {
         userData.u = this->u;
+        v_data = v.getData();
+      }
+
+      void reset()
+      {
+         v.setValue( 0.0 );
+         u->getData().setValue( 0.0 );
+      };
+
+      void writeOneUsingPureC()
+      {
+         if( std::is_same< Device, Devices::Host >::value )
+         {
+            for( int i = 0; i < size; i++ )
+               for( int j = 0; j < size; j++ )
+                  for( int k = 0; k < size; k++ )
+                     v_data[ ( i * size + j ) * size + k ] = 1.0;
+         }
+         else // Device == Devices::Cuda
+         {
+#ifdef HAVE_CUDA
+            dim3 blockSize( 256 ), blocksCount, gridsCount;
+            Devices::Cuda::setupThreads(
+               blockSize,
+               blocksCount,
+               gridsCount,
+               size,
+               size,
+               size );
+            dim3 gridIdx;
+            for( gridIdx.z = 0; gridIdx.z < gridsCount.z; gridIdx.z++ )
+               for( gridIdx.y = 0; gridIdx.y < gridsCount.y; gridIdx.y++ )
+                  for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ )
+                  {
+                     dim3 gridSize;
+                     Devices::Cuda::setupGrid(
+                        blocksCount,
+                        gridsCount,
+                        gridIdx,
+                        gridSize );
+                     simpleCudaKernel3D<<< gridSize, blockSize >>>( size, gridIdx, v_data );
+                  }
+#endif
+         }
       }
 
       void 
       writeOneUsingParallelFor()
@@ -227,6 +377,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
 
       Index size;
       Vector v;
+      Real* v_data;
       GridPointer grid;
       MeshFunctionPointer u;
       Traverser traverser;
@@ -235,7 +386,4 @@ class GridTraversersBenchmark< 3, Device, Real, Index >
 
 } // namespace Benchmarks
 
-} // namespace TNL
-
-
-
+} // namespace TNL
\ No newline at end of file
diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
index d9958e29cc..f1c4efeed6 100644
--- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
+++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h
@@ -39,21 +39,50 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
 //   const std::size_t maxSize = parameters.getParameter< std::size_t >( "max-size" );
    const int minSize = parameters.getParameter< int >( "min-size" );
    const int maxSize = parameters.getParameter< int >( "max-size" );
-
+
    // Full grid traversing
-   benchmark.newBenchmark( String("Full grid traversing " + convertToString( Dimension ) + "D" ), metadata );
+   benchmark.newBenchmark( String("Full grid traversing - write 1 " + convertToString( Dimension ) + "D" ), metadata );
    for( std::size_t size = minSize; size <= maxSize; size *= 2 )
    {
      GridTraversersBenchmark< Dimension, Devices::Host, Real, Index > hostTraverserBenchmark( size );
      GridTraversersBenchmark< Dimension, Devices::Cuda, Real, Index > cudaTraverserBenchmark( size );
 
-     auto reset = [&]() {};
+     auto noReset = []() {};
+
+     auto hostReset = [&]()
+     {
+        hostTraverserBenchmark.reset();
+     };
+
+     auto cudaReset = [&]()
+     {
+        cudaTraverserBenchmark.reset();
+     };
 
      benchmark.setMetadataColumns( Benchmark::MetadataColumns(
        {
          {"size", convertToString( size ) },
        } ) );
 
+     /****
+      * Write one using C for
+      */
+     auto hostWriteOneUsingPureC = [&] ()
+     {
+        hostTraverserBenchmark.writeOneUsingPureC();
+     };
+
+     auto cudaWriteOneUsingPureC = [&] ()
+     {
+        cudaTraverserBenchmark.writeOneUsingPureC();
+     };
+
+     benchmark.setOperation( "Pure C", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+     benchmark.time( hostReset, "CPU", hostWriteOneUsingPureC );
+#ifdef HAVE_CUDA
+     benchmark.time( cudaReset, "GPU", cudaWriteOneUsingPureC );
+#endif
+
      /****
       * Write one using parallel for
       */
@@ -67,10 +96,10 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
         cudaTraverserBenchmark.writeOneUsingParallelFor();
      };
 
-     benchmark.setOperation( "write 1 using parallel for", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
-     benchmark.time( reset, "CPU", hostWriteOneUsingParallelFor );
+     benchmark.setOperation( "parallel for", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+     benchmark.time( hostReset, "CPU", hostWriteOneUsingParallelFor );
 #ifdef HAVE_CUDA
-     benchmark.time( reset, "GPU", cudaWriteOneUsingParallelFor );
+     benchmark.time( cudaReset, "GPU", cudaWriteOneUsingParallelFor );
 #endif
 
      /****
@@ -84,16 +113,14 @@ bool runBenchmark( const Config::ParameterContainer& parameters,
      auto cudaWriteOneUsingTraverser = [&] ()
      {
        cudaTraverserBenchmark.writeOneUsingTraverser();
-     };
-
-     benchmark.setOperation( "write 1 using traverser", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
-     benchmark.time( reset, "CPU", hostWriteOneUsingTraverser );
+     };
+
+     benchmark.setOperation( "traverser", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB );
+     benchmark.time( hostReset, "CPU", hostWriteOneUsingTraverser );
 #ifdef HAVE_CUDA
-     benchmark.time( reset, "GPU", cudaWriteOneUsingTraverser );
+     benchmark.time( cudaReset, "GPU", cudaWriteOneUsingTraverser );
 #endif
-
-
-     }
+   }
 
    return true;
 }
@@ -107,16 +134,16 @@ void setupConfig( Config::ConfigDescription& config )
    config.addEntryEnum( "float" );
    config.addEntryEnum( "double" );
    config.addEntryEnum( "all" );
-   config.addEntry< int >( "dimension", "Set the problem dimension. 0 means all dimensions 1,2 and 3.", 0 );
+   config.addEntry< int >( "dimension", "Set the problem dimension. 0 means all dimensions 1,2 and 3.", 0 );
    config.addEntry< int >( "min-size", "Minimum size of arrays/vectors used in the benchmark.", 10 );
    config.addEntry< int >( "max-size", "Minimum size of arrays/vectors used in the benchmark.", 1000 );
    config.addEntry< int >( "size-step-factor", "Factor determining the size of arrays/vectors used in the benchmark. First size is min-size and each following size is stepFactor*previousSize, up to max-size.", 2 );
    Benchmark::configSetup( config );
-
+
    config.addDelimiter( "Device settings:" );
    Devices::Host::configSetup( config );
-   Devices::Cuda::configSetup( config );
+   Devices::Cuda::configSetup( config );
 }
 
 template< int Dimension >
@@ -126,18 +153,17 @@ bool setupBenchmark( const Config::ParameterContainer& parameters )
    const String & outputMode = parameters.getParameter< String >( "output-mode" );
    const String & precision = parameters.getParameter< String >( "precision" );
    const unsigned sizeStepFactor = parameters.getParameter< unsigned >( "size-step-factor" );
-
    Benchmark benchmark; //( loops, verbose );
    benchmark.setup( parameters );
    Benchmark::MetadataMap metadata = getHardwareMetadata();
    runBenchmark< Dimension >( parameters, benchmark, metadata );
-
+
    auto mode = std::ios::out;
    if( outputMode == "append" )
       mode |= std::ios::app;
    std::ofstream logFile( logFileName.getString(), mode );
-
+
    if( ! benchmark.save( logFile ) )
    {
      std::cerr << "Failed to write the benchmark results to file '" << parameters.getParameter< String >( "log-file" ) << "'." << std::endl;
-- 
GitLab
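
Note on the pure-C CUDA path: a single CUDA grid dimension is limited (TNL queries the limit through Devices::Cuda::getMaxGridSize()), so writeOneUsingPureC launches the kernel once per grid and passes the current gridIdx so that each launch covers a different slice of the vector. The following standalone sketch illustrates the same pattern outside TNL; the names MAX_GRID_SIZE and writeOneKernel, the problem size and the grid limit value are illustrative assumptions, not part of this patch.

// Standalone sketch (assumed names, not TNL code) of the multi-grid launch
// pattern used by simpleCudaKernel1D / writeOneUsingPureC above.
#include <cuda_runtime.h>
#include <algorithm>

#define MAX_GRID_SIZE 65535   // assumed per-dimension limit on the number of blocks

__global__ void writeOneKernel( const int size, const int gridIdx, float* data )
{
   // global element index: grid offset, then block index, then thread index
   const int i = ( gridIdx * MAX_GRID_SIZE + blockIdx.x ) * blockDim.x + threadIdx.x;
   if( i < size )
      data[ i ] = 1.0f;
}

int main()
{
   const int size = 1 << 20;           // illustrative vector size
   float* d_data = nullptr;
   cudaMalloc( &d_data, size * sizeof( float ) );

   const int blockSize = 256;
   const int blocksCount = ( size + blockSize - 1 ) / blockSize;
   // when blocksCount exceeds the per-grid limit, launch several grids in a loop
   const int gridsCount = ( blocksCount + MAX_GRID_SIZE - 1 ) / MAX_GRID_SIZE;
   for( int gridIdx = 0; gridIdx < gridsCount; gridIdx++ )
   {
      const int gridSize = std::min( MAX_GRID_SIZE, blocksCount - gridIdx * MAX_GRID_SIZE );
      writeOneKernel<<< gridSize, blockSize >>>( size, gridIdx, d_data );
   }
   cudaDeviceSynchronize();
   cudaFree( d_data );
   return 0;
}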