Skip to content
Snippets Groups Projects
Commit 8775d239 authored by Tomáš Oberhuber's avatar Tomáš Oberhuber
Browse files

Merge branch 'develop' of mmg-gitlab.fjfi.cvut.cz:tnl/tnl-dev into develop

parents b499df1e 2da7e995
No related branches found
No related tags found
No related merge requests found
/***************************************************************************
SpmvBenchmarkResult.h - description
-------------------
begin : Mar 5, 2020
copyright : (C) 2020 by Tomas Oberhuber
email : tomas.oberhuber@fjfi.cvut.cz
***************************************************************************/
/* See Copyright Notice in tnl/Copyright */
#pragma once
#include "../Benchmarks.h"
namespace TNL {
namespace Benchmarks {
template< typename Real = double,
typename Index = int >
struct SpmvBenchmarkResult
: public BenchmarkResult
{
using RealType = Real;
using IndexType = Index;
using HostVector = Containers::Vector< Real, Devices::Host, Index >;
using CudaVector = Containers::Vector< Real, Devices::Cuda, Index >;
SpmvBenchmarkResult( CudaVector& cudaResult, HostVector& hostResult, CudaVector& cusparseResult )
: hostResult( hostResult ), cudaResult( cudaResult), cusparseResult( cusparseResult ){};
virtual HeaderElements getTableHeader() const override
{
return HeaderElements({"time", "stddev", "stddev/time", "speedup", "Host.Diff.Max", "Host.Diff.L2", "Cusparse.Diff.Max", "Cusparse.Diff.L2"});
}
virtual RowElements getRowElements() const override
{
HostVector cudaCopy, cusparseCopy, a, b;
cudaCopy = cudaResult;
cusparseCopy = cusparseResult;
a = cudaCopy - hostResult;
b = cudaCopy - cusparseCopy;
return RowElements({ time, stddev, stddev/time, speedup, max( abs( a ) ), lpNorm( a, 2.0 ), max( abs( b ) ), lpNorm( b, 2.0 ) });
}
HostVector &hostResult;
CudaVector &cudaResult, &cusparseResult;
};
} //namespace Benchmarks
} //namespace TNL
......@@ -15,6 +15,7 @@
#pragma once
#include "../Benchmarks.h"
#include "SpmvBenchmarkResult.h"
#include <TNL/Pointers/DevicePointer.h>
#include <TNL/Matrices/Legacy/CSR.h>
......@@ -113,168 +114,137 @@ benchmarkSpMV( Benchmark& benchmark,
const String& inputFileName,
bool verboseMR )
{
// Setup CSR for cuSPARSE. It will compared to the format given as a template parameter to this function
typedef Matrices::Legacy::CSR< Real, Devices::Host, int > CSR_HostMatrix;
typedef Matrices::Legacy::CSR< Real, Devices::Cuda, int > CSR_DeviceMatrix;
// Setup CSR for cuSPARSE. It will compared to the format given as a template parameter to this function
using CSR_HostMatrix = Matrices::Legacy::CSR< Real, Devices::Host, int >;
using CSR_DeviceMatrix = Matrices::Legacy::CSR< Real, Devices::Cuda, int >;
CSR_HostMatrix CSRhostMatrix;
CSR_DeviceMatrix CSRdeviceMatrix;
CSR_HostMatrix CSRhostMatrix;
CSR_DeviceMatrix CSRdeviceMatrix;
// Read the matrix for CSR, to set up cuSPARSE
MatrixReader< CSR_HostMatrix >::readMtxFile( inputFileName, CSRhostMatrix, verboseMR );
// Read the matrix for CSR, to set up cuSPARSE
MatrixReader< CSR_HostMatrix >::readMtxFile( inputFileName, CSRhostMatrix, verboseMR );
#ifdef HAVE_CUDA
// cuSPARSE handle setup
cusparseHandle_t cusparseHandle;
cusparseCreate( &cusparseHandle );
// cuSPARSE handle setup
cusparseHandle_t cusparseHandle;
cusparseCreate( &cusparseHandle );
// cuSPARSE (in TNL's CSR) only works for device, copy the matrix from host to device
CSRdeviceMatrix = CSRhostMatrix;
// cuSPARSE (in TNL's CSR) only works for device, copy the matrix from host to device
CSRdeviceMatrix = CSRhostMatrix;
// Delete the CSRhostMatrix, so it doesn't take up unnecessary space
CSRhostMatrix.reset();
// Delete the CSRhostMatrix, so it doesn't take up unnecessary space
CSRhostMatrix.reset();
// Initialize the cusparseCSR matrix.
TNL::CusparseCSR< Real > cusparseCSR;
cusparseCSR.init( CSRdeviceMatrix, &cusparseHandle );
// Initialize the cusparseCSR matrix.
TNL::CusparseCSR< Real > cusparseCSR;
cusparseCSR.init( CSRdeviceMatrix, &cusparseHandle );
#endif
// Setup the format which is given as a template parameter to this function
typedef Matrix< Real, Devices::Host, int > HostMatrix;
typedef Matrix< Real, Devices::Cuda, int > DeviceMatrix;
typedef Containers::Vector< Real, Devices::Host, int > HostVector;
typedef Containers::Vector< Real, Devices::Cuda, int > CudaVector;
// Setup the format which is given as a template parameter to this function
typedef Matrix< Real, Devices::Host, int > HostMatrix;
typedef Matrix< Real, Devices::Cuda, int > DeviceMatrix;
typedef Containers::Vector< Real, Devices::Host, int > HostVector;
typedef Containers::Vector< Real, Devices::Cuda, int > CudaVector;
HostMatrix hostMatrix;
DeviceMatrix deviceMatrix;
HostVector hostVector, hostVector2;
CudaVector deviceVector, deviceVector2;
HostMatrix hostMatrix;
DeviceMatrix deviceMatrix;
HostVector hostVector, hostVector2;
CudaVector deviceVector, deviceVector2, cusparseVector;
// Load the format
MatrixReader< HostMatrix >::readMtxFile( inputFileName, hostMatrix, verboseMR );
// Load the format
MatrixReader< HostMatrix >::readMtxFile( inputFileName, hostMatrix, verboseMR );
// Setup MetaData here (not in tnl-benchmark-spmv.h, as done in Benchmarks/BLAS),
// because we need the matrix loaded first to get the rows and columns
benchmark.setMetadataColumns( Benchmark::MetadataColumns({
{ "matrix name", convertToString( getMatrixFileName( inputFileName ) ) },
{ "non-zeros", convertToString( hostMatrix.getNumberOfNonzeroMatrixElements() ) },
{ "rows", convertToString( hostMatrix.getRows() ) },
{ "columns", convertToString( hostMatrix.getColumns() ) },
{ "matrix format", MatrixInfo< HostMatrix >::getFormat() } //convertToString( getType( hostMatrix ) ) }
} ));
// Setup MetaData here (not in tnl-benchmark-spmv.h, as done in Benchmarks/BLAS),
// because we need the matrix loaded first to get the rows and columns
benchmark.setMetadataColumns( Benchmark::MetadataColumns({
{ "matrix name", convertToString( getMatrixFileName( inputFileName ) ) },
{ "non-zeros", convertToString( hostMatrix.getNumberOfNonzeroMatrixElements() ) },
{ "rows", convertToString( hostMatrix.getRows() ) },
{ "columns", convertToString( hostMatrix.getColumns() ) },
{ "matrix format", MatrixInfo< HostMatrix >::getFormat() } //convertToString( getType( hostMatrix ) ) }
} ));
hostVector.setSize( hostMatrix.getColumns() );
hostVector2.setSize( hostMatrix.getRows() );
hostVector.setSize( hostMatrix.getColumns() );
hostVector2.setSize( hostMatrix.getRows() );
#ifdef HAVE_CUDA
deviceMatrix = hostMatrix;
deviceVector.setSize( hostMatrix.getColumns() );
deviceVector2.setSize( hostMatrix.getRows() );
deviceMatrix = hostMatrix;
deviceVector.setSize( hostMatrix.getColumns() );
deviceVector2.setSize( hostMatrix.getRows() );
cusparseVector.setSize( hostMatrix.getRows() );
#endif
// reset function
auto reset = [&]() {
hostVector.setValue( 1.0 );
hostVector2.setValue( 0.0 );
#ifdef HAVE_CUDA
deviceVector.setValue( 1.0 );
deviceVector2.setValue( 0.0 );
// reset function
auto resetHostVectors = [&]() {
hostVector = 1.0;
hostVector2 = 0.0;
};
#ifdef HAVE_CUDA
auto resetCudaVectors = [&]() {
deviceVector = 1.0;
deviceVector2 = 0.0;
};
auto resetCusparseVectors = [&]() {
deviceVector = 1.0;
cusparseVector == 0.0;
};
#endif
};
const int elements = hostMatrix.getNumberOfNonzeroMatrixElements();
const double datasetSize = (double) elements * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB;
const int elements = hostMatrix.getNumberOfNonzeroMatrixElements();
const double datasetSize = (double) elements * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB;
// compute functions
auto spmvHost = [&]() {
hostMatrix.vectorProduct( hostVector, hostVector2 );
};
auto spmvHost = [&]() {
hostMatrix.vectorProduct( hostVector, hostVector2 );
};
#ifdef HAVE_CUDA
auto spmvCuda = [&]() {
deviceMatrix.vectorProduct( deviceVector, deviceVector2 );
};
auto spmvCuda = [&]() {
deviceMatrix.vectorProduct( deviceVector, deviceVector2 );
};
auto spmvCusparse = [&]() {
cusparseCSR.vectorProduct( deviceVector, deviceVector2 );
};
auto spmvCusparse = [&]() {
cusparseCSR.vectorProduct( deviceVector, cusparseVector );
};
#endif
benchmark.setOperation( datasetSize );
benchmark.time< Devices::Host >( reset, "CPU", spmvHost );
benchmark.setOperation( datasetSize );
benchmark.time< Devices::Host >( resetHostVectors, "CPU", spmvHost );
// Initialize the host vector to be compared.
// (The values in hostVector2 will be reset when spmvCuda starts)
HostVector resultHostVector2;
resultHostVector2.setSize( hostVector2.getSize() );
resultHostVector2.setValue( 0.0 );
// Initialize the host vector to be compared.
// (The values in hostVector2 will be reset when spmvCuda starts)
HostVector resultHostVector2;
resultHostVector2.setSize( hostVector2.getSize() );
resultHostVector2.setValue( 0.0 );
// Copy the values
resultHostVector2 = hostVector2;
// Copy the values
resultHostVector2 = hostVector2;
#ifdef HAVE_CUDA
benchmark.time< Devices::Cuda >( reset, "GPU", spmvCuda );
// Initialize the device vector to be compared.
// (The values in deviceVector2 will be reset when spmvCusparse starts)
HostVector resultDeviceVector2;
resultDeviceVector2.setSize( deviceVector2.getSize() );
resultDeviceVector2.setValue( 0.0 );
resultDeviceVector2 = deviceVector2;
// Setup cuSPARSE MetaData, since it has the same header as CSR,
// and therefore will not get its own headers (rows, cols, speedup etc.) in log.
// * Not setting this up causes (among other undiscovered errors) the speedup from CPU to GPU on the input format to be overwritten.
benchmark.setMetadataColumns( Benchmark::MetadataColumns({
{ "matrix name", convertToString( getMatrixFileName( inputFileName ) ) },
{ "non-zeros", convertToString( hostMatrix.getNumberOfNonzeroMatrixElements() ) },
{ "rows", convertToString( hostMatrix.getRows() ) },
{ "columns", convertToString( hostMatrix.getColumns() ) },
{ "matrix format", convertToString( "CSR-cuSPARSE-" + getFormatShort( hostMatrix ) ) }
} ));
benchmark.time< Devices::Cuda >( reset, "GPU", spmvCusparse );
HostVector resultcuSPARSEDeviceVector2;
resultcuSPARSEDeviceVector2.setSize( deviceVector2.getSize() );
resultcuSPARSEDeviceVector2.setValue( 0.0 );
resultcuSPARSEDeviceVector2 = deviceVector2;
// Difference between GPU (current format) and GPU-cuSPARSE results
//Real cuSparseDifferenceAbsMax = resultDeviceVector2.differenceAbsMax( resultcuSPARSEDeviceVector2 );
Real cuSparseDifferenceAbsMax = max( abs( resultDeviceVector2 - resultcuSPARSEDeviceVector2 ) );
//Real cuSparseDifferenceLpNorm = resultDeviceVector2.differenceLpNorm( resultcuSPARSEDeviceVector2, 1 );
Real cuSparseDifferenceLpNorm = lpNorm( resultDeviceVector2 - resultcuSPARSEDeviceVector2, 1 );
std::string GPUxGPUcuSparse_resultDifferenceAbsMax = "GPUxGPUcuSPARSE differenceAbsMax = " + std::to_string( cuSparseDifferenceAbsMax );
std::string GPUxGPUcuSparse_resultDifferenceLpNorm = "GPUxGPUcuSPARSE differenceLpNorm = " + std::to_string( cuSparseDifferenceLpNorm );
char *GPUcuSparse_absMax = &GPUxGPUcuSparse_resultDifferenceAbsMax[ 0u ];
char *GPUcuSparse_lpNorm = &GPUxGPUcuSparse_resultDifferenceLpNorm[ 0u ];
// Difference between CPU and GPU results for the current format
//Real differenceAbsMax = resultHostVector2.differenceAbsMax( resultDeviceVector2 );
Real differenceAbsMax = max( abs( resultHostVector2 - resultDeviceVector2 ) );
//Real differenceLpNorm = resultHostVector2.differenceLpNorm( resultDeviceVector2, 1 );
Real differenceLpNorm = lpNorm( resultHostVector2 - resultDeviceVector2, 1 );
std::string CPUxGPU_resultDifferenceAbsMax = "CPUxGPU differenceAbsMax = " + std::to_string( differenceAbsMax );
std::string CPUxGPU_resultDifferenceLpNorm = "CPUxGPU differenceLpNorm = " + std::to_string( differenceLpNorm );
char *CPUxGPU_absMax = &CPUxGPU_resultDifferenceAbsMax[ 0u ];
char *CPUxGPU_lpNorm = &CPUxGPU_resultDifferenceLpNorm[ 0u ];
// Print result differences of CPU and GPU of current format
std::cout << CPUxGPU_absMax << std::endl;
std::cout << CPUxGPU_lpNorm << std::endl;
// Print result differences of GPU of current format and GPU with cuSPARSE.
std::cout << GPUcuSparse_absMax << std::endl;
std::cout << GPUcuSparse_lpNorm << std::endl;
benchmark.time< Devices::Cuda >( resetCudaVectors, "GPU", spmvCuda );
// Initialize the device vector to be compared.
// (The values in deviceVector2 will be reset when spmvCusparse starts)
HostVector resultDeviceVector2;
resultDeviceVector2.setSize( deviceVector2.getSize() );
resultDeviceVector2.setValue( 0.0 );
resultDeviceVector2 = deviceVector2;
// Setup cuSPARSE MetaData, since it has the same header as CSR,
// and therefore will not get its own headers (rows, cols, speedup etc.) in log.
// * Not setting this up causes (among other undiscovered errors) the speedup from CPU to GPU on the input format to be overwritten.
benchmark.setMetadataColumns( Benchmark::MetadataColumns({
{ "matrix name", convertToString( getMatrixFileName( inputFileName ) ) },
{ "non-zeros", convertToString( hostMatrix.getNumberOfNonzeroMatrixElements() ) },
{ "rows", convertToString( hostMatrix.getRows() ) },
{ "columns", convertToString( hostMatrix.getColumns() ) },
{ "matrix format", convertToString( "CSR-cuSPARSE-" + getFormatShort( hostMatrix ) ) }
} ));
SpmvBenchmarkResult< Real, int > benchmarkResult( deviceVector2, hostVector2, cusparseVector );
benchmark.time< Devices::Cuda >( resetCusparseVectors, "GPU", spmvCusparse, benchmarkResult );
#endif
std::cout << std::endl;
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment