/***************************************************************************
                          vector-operations.h  -  description
                             -------------------
    begin                : Dec 30, 2015
    copyright            : (C) 2015 by Tomas Oberhuber et al.
    email                : tomas.oberhuber@fjfi.cvut.cz
 ***************************************************************************/
/* See Copyright Notice in tnl/Copyright */
// Implemented by: Jakub Klinkovsky
#pragma once
#include <stdlib.h> // srand48
#include "../Benchmarks.h"
#include <TNL/Containers/Vector.h>
#include <TNL/Containers/VectorView.h>
#ifdef HAVE_CUDA
#include "cublasWrappers.h"
#endif
namespace TNL {
namespace Benchmarks {
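// Benchmarks elementary vector operations (reductions, norms and BLAS level-1
// style updates) on the host and on CUDA devices, comparing TNL's
// implementations with cuBLAS where a counterpart exists.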
template< typename Real = double,
typename Index = int >
bool
benchmarkVectorOperations( Benchmark & benchmark,
const long & size )
{
using HostVector = Containers::Vector< Real, Devices::Host, Index >;
using CudaVector = Containers::Vector< Real, Devices::Cuda, Index >;
using HostView = Containers::VectorView< Real, Devices::Host, Index >;
using CudaView = Containers::VectorView< Real, Devices::Cuda, Index >;
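// dataset size in GB; the Benchmark object presumably derives the reported
// effective memory bandwidth of each operation from this value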
double datasetSize = (double) size * sizeof( Real ) / oneGB;
HostVector hostVector, hostVector2;
CudaVector deviceVector, deviceVector2;
hostVector.setSize( size );
hostVector2.setSize( size );
#ifdef HAVE_CUDA
deviceVector.setSize( size );
deviceVector2.setSize( size );
#endif
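// views bound to the vectors; the "ET" variants below apply the free functions
// (max, abs, ...) to these views, which are evaluated via expression templates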
HostView hostView( hostVector ), hostView2( hostVector2 );
CudaView deviceView( deviceVector ), deviceView2( deviceVector2 );
Real resultHost, resultDevice;
#ifdef HAVE_CUDA
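// single cuBLAS handle shared by all cuBLAS lambdas below; it should
// eventually be released with cublasDestroy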
cublasHandle_t cublasHandle;
cublasCreate( &cublasHandle );
#endif
// reset functions
// (Make sure to always use a reset function in the benchmarks, even when it is
// not necessary for a correct result: it helps to clear the cache and prevents
// the compiler from optimizing away the benchmark loop.)
auto reset1 = [&]() {
hostVector.setValue( 1.0 );
#ifdef HAVE_CUDA
deviceVector.setValue( 1.0 );
#endif
// A relatively harmless call to keep the compiler from realizing we
// don't actually do any useful work with the result of the reduction.
srand48(resultHost);
resultHost = resultDevice = 0.0;
};
auto reset2 = [&]() {
hostVector2.setValue( 1.0 );
#ifdef HAVE_CUDA
deviceVector2.setValue( 1.0 );
#endif
};
auto reset12 = [&]() {
reset1();
reset2();
};
reset12();
auto maxHost = [&]() {
resultHost = hostVector.max();
};
auto maxCuda = [&]() {
resultDevice = deviceVector.max();
};
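// the "ET" variants call the free functions on vector views, exercising the
// expression-template code path instead of the member methods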
auto maxHostET = [&]() {
resultHost = max( hostView );
};
auto maxCudaET = [&]() {
resultDevice = max( deviceView );
};
benchmark.setOperation( "max", datasetSize );
benchmark.time< Devices::Host >( reset1, "CPU", maxHost );
benchmark.time< Devices::Host >( reset1, "CPU ET", maxHostET );
benchmark.time< Devices::Cuda >( reset1, "GPU", maxCuda );
benchmark.time< Devices::Cuda >( reset1, "GPU ET", maxCudaET );
#endif
auto minHost = [&]() {
resultHost = hostVector.min();
};
auto minCuda = [&]() {
resultDevice = deviceVector.min();
};
benchmark.setOperation( "min", datasetSize );
benchmark.time< Devices::Host >( reset1, "CPU", minHost );
benchmark.time< Devices::Cuda >( reset1, "GPU", minCuda );
#endif
auto absMaxHost = [&]() {
resultHost = hostVector.absMax();
};
auto absMaxCuda = [&]() {
resultDevice = deviceVector.absMax();
};
auto absMaxHostET = [&]() {
resultHost = max( abs( hostView ) );
};
auto absMaxCudaET = [&]() {
resultDevice = max( abs( deviceView ) );
};
#ifdef HAVE_CUDA
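// cublasIgamax/cublasIgamin/cublasGasum/... below are presumably the
// type-dispatching wrappers from cublasWrappers.h, which select the float or
// double cuBLAS routine according to Real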
auto absMaxCublas = [&]() {
int index = 0;
cublasIgamax( cublasHandle, size,
deviceVector.getData(), 1,
&index );
// cuBLAS amax returns a 1-based index, so shift it before accessing the element
resultDevice = deviceVector.getElement( index - 1 );
};
#endif
benchmark.setOperation( "absMax", datasetSize );
benchmark.time< Devices::Host >( reset1, "CPU", absMaxHost );
benchmark.time< Devices::Host >( reset1, "CPU ET", absMaxHostET );
benchmark.time< Devices::Cuda >( reset1, "GPU", absMaxCuda );
benchmark.time< Devices::Cuda >( reset1, "GPU ET", absMaxCudaET );
benchmark.time< Devices::Cuda >( reset1, "cuBLAS", absMaxCublas );
#endif
auto absMinHost = [&]() {
resultHost = hostVector.absMin();
};
auto absMinCuda = [&]() {
resultDevice = deviceVector.absMin();
};
#ifdef HAVE_CUDA
auto absMinCublas = [&]() {
int index = 0;
cublasIgamin( cublasHandle, size,
deviceVector.getData(), 1,
&index );
// cuBLAS amin returns a 1-based index, so shift it before accessing the element
resultDevice = deviceVector.getElement( index - 1 );
};
#endif
benchmark.setOperation( "absMin", datasetSize );
benchmark.time< Devices::Host >( reset1, "CPU", absMinHost );
benchmark.time< Devices::Cuda >( reset1, "GPU", absMinCuda );
benchmark.time< Devices::Cuda >( reset1, "cuBLAS", absMinCublas );
#endif
auto sumHost = [&]() {
resultHost = hostVector.sum();
};
auto sumCuda = [&]() {
resultDevice = deviceVector.sum();
};
benchmark.setOperation( "sum", datasetSize );
benchmark.time< Devices::Host >( reset1, "CPU", sumHost );
benchmark.time< Devices::Cuda >( reset1, "GPU", sumCuda );
#endif
auto l1normHost = [&]() {
resultHost = hostVector.lpNorm( 1.0 );
};
auto l1normCuda = [&]() {
resultDevice = deviceVector.lpNorm( 1.0 );
};
#ifdef HAVE_CUDA
auto l1normCublas = [&]() {
cublasGasum( cublasHandle, size,
deviceVector.getData(), 1,
&resultDevice );
};
#endif
benchmark.setOperation( "l1 norm", datasetSize );
benchmark.time< Devices::Host >( reset1, "CPU", l1normHost );
benchmark.time< Devices::Cuda >( reset1, "GPU", l1normCuda );
benchmark.time< Devices::Cuda >( reset1, "cuBLAS", l1normCublas );
#endif
auto l2normHost = [&]() {
resultHost = hostVector.lpNorm( 2.0 );
};
auto l2normCuda = [&]() {
resultDevice = deviceVector.lpNorm( 2.0 );
};
#ifdef HAVE_CUDA
auto l2normCublas = [&]() {
cublasGnrm2( cublasHandle, size,
deviceVector.getData(), 1,
&resultDevice );
};
#endif
benchmark.setOperation( "l2 norm", datasetSize );
benchmark.time< Devices::Host >( reset1, "CPU", l2normHost );
benchmark.time< Devices::Cuda >( reset1, "GPU", l2normCuda );
benchmark.time< Devices::Cuda >( reset1, "cuBLAS", l2normCublas );
#endif
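// a general p-norm (here p = 3) has no direct cuBLAS counterpart, so only the
// TNL host and GPU variants are measured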
auto l3normHost = [&]() {
resultHost = hostVector.lpNorm( 3.0 );
};
auto l3normCuda = [&]() {
resultDevice = deviceVector.lpNorm( 3.0 );
};
benchmark.setOperation( "l3 norm", datasetSize );
benchmark.time< Devices::Host >( reset1, "CPU", l3normHost );
benchmark.time< Devices::Cuda >( reset1, "GPU", l3normCuda );
#endif
auto scalarProductHost = [&]() {
resultHost = hostVector.scalarProduct( hostVector2 );
};
auto scalarProductCuda = [&]() {
resultDevice = deviceVector.scalarProduct( deviceVector2 );
};
#ifdef HAVE_CUDA
auto scalarProductCublas = [&]() {
cublasGdot( cublasHandle, size,
deviceVector.getData(), 1,
deviceVector2.getData(), 1,
&resultDevice );
};
#endif
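// the scalar product reads both input vectors, hence the 2 * datasetSize estimate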
benchmark.setOperation( "scalar product", 2 * datasetSize );
benchmark.time< Devices::Host >( reset1, "CPU", scalarProductHost );
benchmark.time< Devices::Cuda >( reset1, "GPU", scalarProductCuda );
benchmark.time< Devices::Cuda >( reset1, "cuBLAS", scalarProductCublas );
#endif
/*
std::cout << "Benchmarking prefix-sum:" << std::endl;
timer.reset();
timer.start();
hostVector.computePrefixSum();
timer.stop();
timeHost = timer.getTime();
std::cout << " CPU: bandwidth: " << bandwidth << " GB/sec, time: " << timer.getTime() << " sec." << std::endl;
timer.reset();
timer.start();
deviceVector.computePrefixSum();
timer.stop();
timeDevice = timer.getTime();
std::cout << " GPU: bandwidth: " << bandwidth << " GB/sec, time: " << timer.getTime() << " sec." << std::endl;
std::cout << " CPU/GPU speedup: " << timeHost / timeDevice << std::endl;
HostVector auxHostVector;
auxHostVector.setLike( deviceVector );
auxHostVector = deviceVector;
for( int i = 0; i < size; i++ )
if( hostVector.getElement( i ) != auxHostVector.getElement( i ) )
{
std::cerr << "Error in prefix sum at position " << i << ": " << hostVector.getElement( i ) << " != " << auxHostVector.getElement( i ) << std::endl;
}
*/
auto multiplyHost = [&]() {
hostVector *= 0.5;
};
auto multiplyCuda = [&]() {
deviceVector *= 0.5;
};
#ifdef HAVE_CUDA
auto multiplyCublas = [&]() {
const Real alpha = 0.5;
cublasGscal( cublasHandle, size,
&alpha,
deviceVector.getData(), 1 );
};
#endif
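// scalar multiplication reads and writes the vector once, hence 2 * datasetSize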
benchmark.setOperation( "scalar multiplication", 2 * datasetSize );
benchmark.time< Devices::Host >( reset1, "CPU", multiplyHost );
benchmark.time< Devices::Cuda >( reset1, "GPU", multiplyCuda );
benchmark.time< Devices::Cuda >( reset1, "cuBLAS", multiplyCublas );
#endif
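// addVector performs an in-place update, i.e. hostVector += hostVector2
// (assuming the default multiplicator of 1.0)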
auto addVectorHost = [&]() {
hostVector.addVector( hostVector2 );
};
auto addVectorCuda = [&]() {
deviceVector.addVector( deviceVector2 );
};
#ifdef HAVE_CUDA
auto addVectorCublas = [&]() {
const Real alpha = 1.0;
cublasGaxpy( cublasHandle, size,
&alpha,
deviceVector2.getData(), 1,
deviceVector.getData(), 1 );
};
#endif
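// vector addition reads both vectors and writes one of them back, hence 3 * datasetSize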
benchmark.setOperation( "vector addition", 3 * datasetSize );
benchmark.time< Devices::Host >( reset1, "CPU", addVectorHost );
benchmark.time< Devices::Cuda >( reset1, "GPU", addVectorCuda );
benchmark.time< Devices::Cuda >( reset1, "cuBLAS", addVectorCublas );