Newer
Older
Lukas Cejka
committed
/***************************************************************************
spmv.h - description
-------------------
begin : Dec 30, 2015
copyright : (C) 2015 by Tomas Oberhuber et al.
email : tomas.oberhuber@fjfi.cvut.cz
***************************************************************************/
/* See Copyright Notice in tnl/Copyright */
// Implemented by: Jakub Klinkovsky
#pragma once
#include "../Benchmarks.h"
#include <TNL/Pointers/DevicePointer.h>
#include <TNL/Matrices/CSR.h>
#include <TNL/Matrices/Ellpack.h>
#include <TNL/Matrices/SlicedEllpack.h>
#include <TNL/Matrices/ChunkedEllpack.h>
Lukas Cejka
committed
#include <TNL/Matrices/MatrixReader.h>
using namespace TNL::Matrices;
Lukas Cejka
committed
#include <cusparse.h>
#include "cusparseCSRMatrix.h"
using namespace TNL;
Lukas Cejka
committed
namespace TNL {
namespace Benchmarks {
// Three-parameter alias for Matrices::SlicedEllpack, so that it can be passed
// as the `template< typename, typename, typename > class Matrix` argument of
// benchmarkSpMV in the same way as the other matrix formats.
// NOTE(review): presumably Matrices::SlicedEllpack itself takes an additional
// (defaulted) template parameter -- confirm against its declaration.
template< typename Real, typename Device, typename Index >
using SlicedEllpack = Matrices::SlicedEllpack< Real, Device, Index >;
Lukas Cejka
committed
// Get only the name of the format from getType().
//
// Takes the string returned by matrix.getType(), drops the template argument
// list (everything from the first '<') and then drops the leading namespace
// qualifier (everything up to and including the first "::").
// For a type string of the form "Matrices::CSR< ... >" the result is "CSR".
// A type name that contains no "::" is returned as is (without its template
// arguments) instead of losing its first character.
template< typename Matrix >
std::string getMatrixFormat( const Matrix& matrix )
{
   const std::string fullType = matrix.getType();
   // find() returning npos makes substr() keep the whole string.
   const std::string typeName = fullType.substr( 0, fullType.find( '<' ) );

   const std::string::size_type scopePos = typeName.find( "::" );
   if( scopePos == std::string::npos )
      return typeName;
   return typeName.substr( scopePos + 2 );
}
// Writes a short summary of `matrix` to the stream `str`: the format name
// (as produced by getMatrixFormat), the number of rows, the number of columns
// and the number of non-zero elements, each on its own line.
template< typename Matrix >
void printMatrixInfo( const Matrix& matrix,
                      std::ostream& str )
{
   const std::string formatName = getMatrixFormat( matrix );

   // One chained statement; every line still ends with a flushing std::endl.
   str << "\n Format: " << formatName << std::endl
       << " Rows: " << matrix.getRows() << std::endl
       << " Cols: " << matrix.getColumns() << std::endl
       << " Nonzero Elements: " << matrix.getNumberOfNonzeroMatrixElements() << std::endl;
}
template< typename Real,
template< typename, typename, typename > class Matrix,
template< typename, typename, typename > class Vector = Containers::Vector >
bool
benchmarkSpMV( Benchmark & benchmark,
Lukas Cejka
committed
const String & inputFileName )
Lukas Cejka
committed
{
Lukas Cejka
committed
typedef Matrix< Real, Devices::Host, int > HostMatrix;
typedef Matrix< Real, Devices::Cuda, int > DeviceMatrix;
typedef Containers::Vector< Real, Devices::Host, int > HostVector;
typedef Containers::Vector< Real, Devices::Cuda, int > CudaVector;
HostMatrix hostMatrix;
DeviceMatrix deviceMatrix;
HostVector hostVector, hostVector2;
CudaVector deviceVector, deviceVector2;
Lukas Cejka
committed
// Start a buffer to capture the output of MatrixReader
std::stringstream buffer;
std::streambuf * old = std::cerr.rdbuf( buffer.rdbuf() );
Lukas Cejka
committed
if( ! MatrixReader< HostMatrix >::readMtxFile( inputFileName, hostMatrix ) )
{
Lukas Cejka
committed
// Capture the original output of MatrixReader, so it isn't printed by console.
std::string errorMsgBuffer = buffer.str();
// Reset the buffer
std::cerr.rdbuf( old );
Lukas Cejka
committed
// WHY DID I CAPTURE THE ERROR MESSAGE ONLY TO RUN MatrixReader again? Use the above capture to print into log and console?
Lukas Cejka
committed
std::string matrixFormat = getMatrixFormat( hostMatrix );
Lukas Cejka
committed
//https://stackoverflow.com/questions/5419356/redirect-stdout-stderr-to-a-string
std::stringstream buffer;
std::streambuf * old = std::cerr.rdbuf( buffer.rdbuf() );
Lukas Cejka
committed
MatrixReader< HostMatrix >::readMtxFile( inputFileName, hostMatrix );
errorMsgBuffer = buffer.str();
// Reset the buffer
std::cerr.rdbuf( old );
std::string stringErrorMsg = "Benchmark failed: Unable to read the matrix.\n"
"matrix format: " + matrixFormat +
"\nFailed to read the matrix file " +
( std::string )inputFileName + ".\n" +
Lukas Cejka
committed
errorMsgBuffer;
//https://stackoverflow.com/questions/1488775/c-remove-new-line-from-multiline-string
if ( ! stringErrorMsg.empty() && stringErrorMsg[ stringErrorMsg.length() - 1 ] == '\n' )
stringErrorMsg.erase( stringErrorMsg.length() - 1 );
// https://stackoverflow.com/questions/7352099/stdstring-to-char
char* errorMsg = &stringErrorMsg[ 0u ];
Lukas Cejka
committed
Lukas Cejka
committed
// FIXME: Every other benchmark, the errorMsg doesn't have a "!" as
// a prefix in the log file.
// (Try adding more benchmarks in benchmarkSpmvSynthetic(...)
// and you'll see)
benchmark.addErrorMessage( errorMsg, 1 );
std::cout << std::endl;
return false;
}
Lukas Cejka
committed
std::cerr.rdbuf( old );
}
catch( std::bad_alloc )
{
std::string matrixFormat = getMatrixFormat( hostMatrix );
Lukas Cejka
committed
//https://stackoverflow.com/questions/5419356/redirect-stdout-stderr-to-a-string
std::stringstream buffer;
std::streambuf * old = std::cerr.rdbuf(buffer.rdbuf());
MatrixReader< HostMatrix >::readMtxFile( inputFileName, hostMatrix );
std::string errorMsgBuffer = buffer.str();
// Reset the buffer
std::cerr.rdbuf( old );
std::string stringErrorMsg = "Benchmark failed: Not enough memory.\n"
"matrix format: " + matrixFormat +
Lukas Cejka
committed
"\nFailed to allocate memory to read the matrix file " +
( std::string )inputFileName + ".\n" +
errorMsgBuffer;
//https://stackoverflow.com/questions/1488775/c-remove-new-line-from-multiline-string
if ( ! stringErrorMsg.empty() && stringErrorMsg[ stringErrorMsg.length() - 1 ] == '\n' )
stringErrorMsg.erase( stringErrorMsg.length() - 1 );
Lukas Cejka
committed
// https://stackoverflow.com/questions/7352099/stdstring-to-char
char *errorMsg = &stringErrorMsg[ 0u ];
Lukas Cejka
committed
// FIXME: Every other benchmark, the errorMsg doesn't have a "!" as
// a prefix in the log file.
// (Try adding more benchmarks in benchmarkSpmvSynthetic(...)
// and you'll see)
benchmark.addErrorMessage( errorMsg, 1 );
std::cout << std::endl;
return false;
}
Lukas Cejka
committed
// printMatrixInfo is redundant, because all the information is in the Benchmark's MetadataColumns
// printMatrixInfo( hostMatrix, std::cout );
#ifdef HAVE_CUDA
// FIXME: This doesn't work for ChunkedEllpack, because
Lukas Cejka
committed
// its cross-device assignment is not implemented yet
deviceMatrix = hostMatrix;
#endif
benchmark.setMetadataColumns( Benchmark::MetadataColumns({
{ "matrix format", convertToString( getMatrixFormat( hostMatrix ) ) },
{ "non-zeros", convertToString( hostMatrix.getNumberOfNonzeroMatrixElements() ) },
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
{ "rows", convertToString( hostMatrix.getRows() ) },
{ "columns", convertToString( hostMatrix.getColumns() ) }
} ));
hostVector.setSize( hostMatrix.getColumns() );
hostVector2.setSize( hostMatrix.getRows() );
#ifdef HAVE_CUDA
deviceVector.setSize( hostMatrix.getColumns() );
deviceVector2.setSize( hostMatrix.getRows() );
#endif
// reset function
auto reset = [&]() {
hostVector.setValue( 1.0 );
hostVector2.setValue( 0.0 );
#ifdef HAVE_CUDA
deviceVector.setValue( 1.0 );
deviceVector2.setValue( 0.0 );
#endif
};
const int elements = hostMatrix.getNumberOfNonzeroMatrixElements();
const double datasetSize = (double) elements * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB;
// compute functions
auto spmvHost = [&]() {
hostMatrix.vectorProduct( hostVector, hostVector2 );
};
auto spmvCuda = [&]() {
deviceMatrix.vectorProduct( deviceVector, deviceVector2 );
};
benchmark.setOperation( datasetSize );
benchmark.time< Devices::Host >( reset, "CPU", spmvHost );
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
// Initialize the host vector to be compared. (The values in hostVector2 will be reset when spmvCuda starts)
HostVector resultHostVector2;
resultHostVector2.setSize( hostVector2.getSize() );
resultHostVector2.setValue( 0.0 );
// Copy the values
for( int i = 0; i < hostVector2.getSize(); i++ )
resultHostVector2.setElement( i, hostVector2.getElement( i ) );
#ifdef HAVE_CUDA
benchmark.time< Devices::Cuda >( reset, "GPU", spmvCuda );
#endif
// Setup the device vector to be compared
HostVector resultDeviceVector2;
resultDeviceVector2.setSize( hostVector2.getSize() );
resultDeviceVector2.setValue( 0.0 );
// resultDeviceVector2 += deviceVector2; // Throws a segfault.
// Copy the values
for( int i = 0; i < deviceVector2.getSize(); i++ )
resultDeviceVector2.setElement( i, deviceVector2.getElement( i ) );
Real differenceAbsMax = resultHostVector2.differenceAbsMax( resultDeviceVector2 );
Real differenceLpNorm = resultHostVector2.differenceLpNorm( resultDeviceVector2, 1 );
std::string resultDifferenceAbsMax = "differenceAbsMax = " + std::to_string( differenceAbsMax );
std::string resultDifferenceLpNorm = "differenceLpNorm = " + std::to_string( differenceLpNorm );
char *absMax = &resultDifferenceAbsMax[ 0u ];
char *lpNorm = &resultDifferenceLpNorm[ 0u ];
// FIXME: THIS ISN'T AN ELEGANT SOLUTION, IT MAKES THE LOG FILE VERY LONG
// benchmark.addErrorMessage( absMax, 1 );
// benchmark.addErrorMessage( lpNorm, 1 );
std::cout << std::endl;
return true;
}
// Compares only CSR on GPU and Cusparse on GPU.
//
// Reads the matrix from the file `inputFileName` into a host CSR matrix and
// registers its format name, number of non-zeros, rows and columns as
// metadata columns of `benchmark`. When HAVE_CUDA is defined, it then times
// the CSR vectorProduct() on the device ("GPU") and the product computed
// through the CusparseCSR wrapper ("GPU-Cusparse").
//
// Template parameters:
//   Real   - element type of the matrices and vectors
//   Vector - not used by the body; kept so existing callers keep compiling
//
// Returns false, after adding an error message to `benchmark`, when the
// reader reports a failure, std::bad_alloc is thrown while reading, or the
// cuSPARSE handle cannot be created. Returns true otherwise.
template< typename Real,
          template< typename, typename, typename > class Vector = Containers::Vector >
bool
benchmarkCusparseSpMV( Benchmark & benchmark,
                       const String & inputFileName )
{
   using CSR_HostMatrix = Matrices::CSR< Real, Devices::Host, int >;
   using CSR_DeviceMatrix = Matrices::CSR< Real, Devices::Cuda, int >;
   using HostVector = Containers::Vector< Real, Devices::Host, int >;
   using CudaVector = Containers::Vector< Real, Devices::Cuda, int >;

   CSR_HostMatrix CSRhostMatrix;
   CSR_DeviceMatrix CSRdeviceMatrix;
   CudaVector deviceVector, deviceVector2;

   // Redirects std::cerr into a caller-owned stream and puts the previous
   // stream buffer back when the scope is left, also during stack unwinding.
   // Without this, an exception thrown by the reader would leave std::cerr
   // pointing at the buffer of an already destroyed stringstream.
   struct CerrRedirect
   {
      explicit CerrRedirect( std::stringstream& sink )
      : previous( std::cerr.rdbuf( sink.rdbuf() ) )
      {}

      ~CerrRedirect() { std::cerr.rdbuf( previous ); }

      CerrRedirect( const CerrRedirect& ) = delete;
      CerrRedirect& operator=( const CerrRedirect& ) = delete;

      std::streambuf* previous;
   };

   // Everything MatrixReader prints to std::cerr is collected here, so it
   // goes to the benchmark log instead of the console. The stream is declared
   // outside the try block so that its content survives an exception.
   std::stringstream readerOutput;
   bool matrixLoaded = false;
   bool outOfMemory = false;
   try
   {
      CerrRedirect redirect( readerOutput );
      matrixLoaded = MatrixReader< CSR_HostMatrix >::readMtxFile( inputFileName, CSRhostMatrix );
   }
   catch( const std::bad_alloc& )
   {
      outOfMemory = true;
   }

   if( ! matrixLoaded )
   {
      // The file is read only once; the text captured during that single
      // attempt is reused for the report.
      std::string stringErrorMsg = outOfMemory
                                   ? "Benchmark failed: Not enough memory.\n"
                                   : "Benchmark failed: Unable to read the matrix.\n";
      stringErrorMsg += "matrix format: " + getMatrixFormat( CSRhostMatrix );
      stringErrorMsg += outOfMemory
                        ? "\nFailed to allocate memory to read the matrix file "
                        : "\nFailed to read the matrix file ";
      stringErrorMsg += static_cast< std::string >( inputFileName ) + ".\n";
      stringErrorMsg += readerOutput.str();

      // Drop a single trailing newline so the log entry does not end with an empty line.
      if( ! stringErrorMsg.empty() && stringErrorMsg.back() == '\n' )
         stringErrorMsg.pop_back();

      // FIXME: Every other benchmark, the errorMsg doesn't have a "!" as
      //        a prefix in the log file.
      //        (Try adding more benchmarks in benchmarkSpmvSynthetic(...)
      //        and you'll see)
      // The message is handed over as a mutable char pointer, as before.
      benchmark.addErrorMessage( &stringErrorMsg[ 0 ], 1 );
      std::cout << std::endl;
      return false;
   }

   benchmark.setMetadataColumns( Benchmark::MetadataColumns({
         { "matrix format", convertToString( getMatrixFormat( CSRhostMatrix ) ) },
         { "non-zeros", convertToString( CSRhostMatrix.getNumberOfNonzeroMatrixElements() ) },
         { "rows", convertToString( CSRhostMatrix.getRows() ) },
         { "columns", convertToString( CSRhostMatrix.getColumns() ) }
      } ));

#ifdef HAVE_CUDA
   cusparseHandle_t cusparseHandle;
   if( cusparseCreate( &cusparseHandle ) != CUSPARSE_STATUS_SUCCESS )
   {
      // Without a valid handle the wrapper below cannot be initialised.
      std::string handleErrorMsg = "Benchmark failed: Unable to create the cuSPARSE handle.";
      benchmark.addErrorMessage( &handleErrorMsg[ 0 ], 1 );
      std::cout << std::endl;
      return false;
   }

   CSRdeviceMatrix = CSRhostMatrix;

   TNL::CusparseCSR< Real > cusparseCSR;
   cusparseCSR.init( CSRdeviceMatrix, &cusparseHandle );

   // Input vector has one entry per column, output vector one per row.
   deviceVector.setSize( CSRhostMatrix.getColumns() );
   deviceVector2.setSize( CSRhostMatrix.getRows() );
#endif

   // reset function
   auto reset = [&]() {
#ifdef HAVE_CUDA
      deviceVector.setValue( 1.0 );
      deviceVector2.setValue( 0.0 );
#endif
   };

   const int elements = CSRhostMatrix.getNumberOfNonzeroMatrixElements();
   const double datasetSize = static_cast< double >( elements ) * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB;

   // compute functions
   auto spmvCuda = [&]() {
      CSRdeviceMatrix.vectorProduct( deviceVector, deviceVector2 );
   };
#ifdef HAVE_CUDA
   // cusparseCSR only exists in CUDA builds, so the lambda using it is guarded too.
   auto spmvCusparse = [&]() {
      cusparseCSR.vectorProduct( deviceVector, deviceVector2 );
   };
#endif

   benchmark.setOperation( datasetSize );

#ifdef HAVE_CUDA
   benchmark.time< Devices::Cuda >( reset, "GPU", spmvCuda );

   // Keep a host copy of the CSR result; deviceVector2 is reset before the
   // cuSPARSE run starts.
   HostVector csrResult;
   csrResult.setSize( deviceVector2.getSize() );
   csrResult.setValue( 0.0 );
   for( int i = 0; i < deviceVector2.getSize(); i++ )
      csrResult.setElement( i, deviceVector2.getElement( i ) );

   benchmark.time< Devices::Cuda >( reset, "GPU-Cusparse", spmvCusparse );

   // Host copy of the cuSPARSE result.
   HostVector cusparseResult;
   cusparseResult.setSize( csrResult.getSize() );
   cusparseResult.setValue( 0.0 );
   for( int i = 0; i < deviceVector2.getSize(); i++ )
      cusparseResult.setElement( i, deviceVector2.getElement( i ) );

   const Real differenceAbsMax = csrResult.differenceAbsMax( cusparseResult );
   const Real differenceLpNorm = csrResult.differenceLpNorm( cusparseResult, 1 );

   // FIXME: Writing "differenceAbsMax = ..." and "differenceLpNorm = ..."
   //        through benchmark.addErrorMessage( ..., 1 ) is not an elegant
   //        solution, it makes the log file very long. Until there is a
   //        better place for them, the two values are computed but not reported.
   static_cast< void >( differenceAbsMax );
   static_cast< void >( differenceLpNorm );

   cusparseDestroy( cusparseHandle );
#endif

   std::cout << std::endl;
   // The function is declared bool; falling off the end would be undefined behaviour.
   return true;
}
template< typename Real = double,
typename Index = int >
bool
benchmarkSpmvSynthetic( Benchmark & benchmark,
Lukas Cejka
committed
const String& inputFileName )
Lukas Cejka
committed
{
bool result = true;
// TODO: benchmark all formats from tnl-benchmark-spmv (different parameters of the base formats)
Lukas Cejka
committed
result |= benchmarkSpMV< Real, Matrices::CSR >( benchmark, inputFileName );
// This doesn't have a titles (matrix format, rows, cols, etc.) in the output, because the header is the same as before (CSR).
result |= benchmarkCusparseSpMV< Real, Matrices::CSR >( benchmark, inputFileName );
result |= benchmarkSpMV< Real, Matrices::Ellpack >( benchmark, inputFileName );
result |= benchmarkSpMV< Real, SlicedEllpack >( benchmark, inputFileName );
// result |= benchmarkSpMV< Real, Matrices::ChunkedEllpack >( benchmark, inputFileName );
Lukas Cejka
committed
return result;
}
} // namespace Benchmarks
} // namespace TNL