diff --git a/src/Benchmarks/SpMV/OldSpMV/tnl-benchmark-old-spmv.cpp b/src/Benchmarks/SpMV/OldSpMV/tnl-benchmark-old-spmv.cpp new file mode 100644 index 0000000000000000000000000000000000000000..c9cd17cda0312c07bed4bcaa92c4ef4273704b35 --- /dev/null +++ b/src/Benchmarks/SpMV/OldSpMV/tnl-benchmark-old-spmv.cpp @@ -0,0 +1,14 @@ +/*************************************************************************** + tnl-benchmark-spmv.cpp - description + ------------------- + begin : Jun 5, 2014 + copyright : (C) 2014 by Tomas Oberhuber + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + + +#include "tnl-benchmark-old-spmv.h" + + diff --git a/src/Benchmarks/SpMV/OldSpMV/tnl-benchmark-old-spmv.cu b/src/Benchmarks/SpMV/OldSpMV/tnl-benchmark-old-spmv.cu new file mode 100644 index 0000000000000000000000000000000000000000..433af970b6058e1ae03f480296da566a3cbb79b5 --- /dev/null +++ b/src/Benchmarks/SpMV/OldSpMV/tnl-benchmark-old-spmv.cu @@ -0,0 +1,12 @@ +/*************************************************************************** + tnl-benchmark-spmv.cu - description + ------------------- + begin : Jun 5, 2014 + copyright : (C) 2014 by Tomas Oberhuber + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + + +#include "tnl-benchmark-old-spmv.h" diff --git a/src/Benchmarks/SpMV/OldSpMV/tnl-benchmark-old-spmv.h b/src/Benchmarks/SpMV/OldSpMV/tnl-benchmark-old-spmv.h new file mode 100644 index 0000000000000000000000000000000000000000..455c7d412f4f8ae4cc4af7bbd15ba0e47dda978a --- /dev/null +++ b/src/Benchmarks/SpMV/OldSpMV/tnl-benchmark-old-spmv.h @@ -0,0 +1,925 @@ +/*************************************************************************** + tnl-benchmark-spmv.h - description + ------------------- + begin : Jun 5, 2014 + copyright : (C) 2014 by Tomas Oberhuber + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +#ifdef NOT_USED_ANYMORE + +#pragma once + +#include +#include +#include +#ifdef HAVE_CUDA +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "tnlCusparseCSRMatrix.h" + +using namespace std; +using namespace TNL; +using namespace TNL::Matrices; + +void setupConfig( Config::ConfigDescription& config ) +{ + config.addDelimiter ( "General settings:" ); + config.addRequiredEntry< String >( "test" , "Test to be performed." ); + config.addEntryEnum< String >( "mtx" ); + config.addEntryEnum< String >( "tnl" ); + config.addRequiredEntry< String >( "input-file" , "Input file name." ); + config.addEntry< String >( "log-file", "Log file name.", "tnl-benchmark-spmv.log"); + config.addEntry< String >( "precision", "Precision of the arithmetics.", "double" ); + config.addEntry< double >( "stop-time", "Seconds to iterate the SpMV operation.", 1.0 ); + config.addEntry< int >( "verbose", "Verbose mode.", 1 ); +} + +bool initLogFile( std::fstream& logFile, const String& fileName ) +{ + if( access( fileName.getString(), F_OK ) == -1 ) + { + logFile.open( fileName.getString(), std::ios::out ); + if( ! logFile ) + return false; + const String fillingColoring = " : COLORING 0 #FFF8DC 20 #FFFF00 40 #FFD700 60 #FF8C0 80 #FF0000 100"; + const String speedupColoring = " : COLORING #0099FF 1 #FFFFFF 2 #00FF99 4 #33FF99 8 #33FF22 16 #FF9900"; + const String paddingColoring = " : COLORING #FFFFFF 1 #FFFFCC 10 #FFFF99 100 #FFFF66 1000 #FFFF33 10000 #FFFF00"; + logFile << "#Matrix file " << std::endl; + logFile << "#Rows" << std::endl; + logFile << "#Columns" << std::endl; + logFile << "#Non-zero elements" << std::endl; + logFile << "#Filling (in %)" << fillingColoring << std::endl; + logFile << "#CSR Format" << std::endl; + logFile << "# CPU" << std::endl; + logFile << "# Gflops" << std::endl; + logFile << "# Throughput" << std::endl; + logFile << "# Speedup" << speedupColoring << std::endl; +#ifdef HAVE_CUDA + logFile << "# Cusparse CSR" << std::endl; + logFile << "# Gflops" << std::endl; + logFile << "# Throughput" << std::endl; + logFile << "# Speedup" << speedupColoring << " SORT - cusparse-csr-speedup.txt" << std::endl; + logFile << "# CUDA" << std::endl; + logFile << "# Scalar" << std::endl; + logFile << "# Gflops" << std::endl; + logFile << "# Throughput" << std::endl; + logFile << "# Speedup" << speedupColoring << " SORT - csr-scalar-cuda-speedup.txt" << std::endl; + logFile << "# Vector" << std::endl; + logFile << "# Warp Size 1" << std::endl; + logFile << "# Gflops" << std::endl; + logFile << "# Throughput" << std::endl; + logFile << "# Speedup" << speedupColoring << " SORT - csr-vector-1-cuda-speedup.txt" << std::endl; + logFile << "# Warp Size 2" << std::endl; + logFile << "# Gflops" << std::endl; + logFile << "# Throughput" << std::endl; + logFile << "# Speedup" << speedupColoring << " SORT - csr-vector-2-cuda-speedup.txt" << std::endl; + logFile << "# Warp Size 4" << std::endl; + logFile << "# Gflops" << std::endl; + logFile << "# Throughput" << std::endl; + logFile << "# Speedup" << speedupColoring << " SORT - csr-vector-4-cuda-speedup.txt" << std::endl; + logFile << "# Warp Size 8" << std::endl; + logFile << "# Gflops" << std::endl; + logFile << "# Throughput" << std::endl; + logFile << "# Speedup" << speedupColoring << " SORT - csr-vector-8-cuda-speedup.txt" << std::endl; + logFile << "# Warp Size 16" << std::endl; + logFile << "# Gflops" << std::endl; + logFile << "# Throughput" << std::endl; + logFile << "# Speedup" << speedupColoring << " SORT - csr-vector-16-cuda-speedup.txt" << std::endl; + logFile << "# Warp Size 32" << std::endl; + logFile << "# Gflops" << std::endl; + logFile << "# Throughput" << std::endl; + logFile << "# Speedup" << speedupColoring << " SORT - csr-vector-32-cuda-speedup.txt" << std::endl; + logFile << "# Hybrid" << std::endl; + logFile << "# Split 2" << std::endl; + logFile << "# Gflops" << std::endl; + logFile << "# Throughput" << std::endl; + logFile << "# Speedup" << speedupColoring << " SORT - csr-hybrid-2-cuda-speedup.txt" << std::endl; + logFile << "# Split 4" << std::endl; + logFile << "# Gflops" << std::endl; + logFile << "# Throughput" << std::endl; + logFile << "# Speedup" << speedupColoring << " SORT - csr-hybrid-4-cuda-speedup.txt" << std::endl; + logFile << "# Split 8" << std::endl; + logFile << "# Gflops" << std::endl; + logFile << "# Throughput" << std::endl; + logFile << "# Speedup" << speedupColoring << " SORT - csr-hybrid-8-cuda-speedup.txt" << std::endl; + logFile << "# Split 16" << std::endl; + logFile << "# Gflops" << std::endl; + logFile << "# Throughput" << std::endl; + logFile << "# Speedup" << speedupColoring << " SORT - csr-hybrid-16-cuda-speedup.txt" << std::endl; + logFile << "# Split 32" << std::endl; + logFile << "# Gflops" << std::endl; + logFile << "# Throughput" << std::endl; + logFile << "# Speedup" << speedupColoring << " SORT - csr-hybrid-32-cuda-speedup.txt" << std::endl; + logFile << "# Split 64" << std::endl; + logFile << "# Gflops" << std::endl; + logFile << "# Throughput" << std::endl; + logFile << "# Speedup" << speedupColoring << " SORT - csr-hybrid-64-cuda-speedup.txt" << std::endl; +#endif + logFile << "#Ellpack Format" << std::endl; + logFile << "# Padding (in %)" << paddingColoring << std::endl; + logFile << "# CPU" << std::endl; + logFile << "# Gflops" << std::endl; + logFile << "# Throughput" << std::endl; + logFile << "# Speedup" << speedupColoring << " SORT - ellpack-host-speedup.txt" << std::endl; +#ifdef HAVE_CUDA + logFile << "# CUDA" << std::endl; + logFile << "# Gflops" << std::endl; + logFile << "# Throughput" << std::endl; + logFile << "# Speedup" << speedupColoring << " SORT - ellpack-cuda-speedup.txt" << std::endl; +#endif + logFile << "#SlicedEllpack Format" << std::endl; + logFile << "# Padding (in %)" << paddingColoring << std::endl; + logFile << "# CPU" << std::endl; + logFile << "# Gflops" << std::endl; + logFile << "# Throughput" << std::endl; + logFile << "# Speedup" << speedupColoring << " SORT - sliced-ellpack-host-speedup.txt" << std::endl; +#ifdef HAVE_CUDA + logFile << "# CUDA" << std::endl; + logFile << "# Gflops" << std::endl; + logFile << "# Throughput" << std::endl; + logFile << "# Speedup" << speedupColoring << " SORT - sliced-ellpack-cuda-speedup.txt" << std::endl; +#endif + logFile << "#ChunkedEllpack Format" << std::endl; + logFile << "# Padding (in %)" << paddingColoring << std::endl; + logFile << "# CPU" << std::endl; + logFile << "# Gflops" << std::endl; + logFile << "# Throughput" << std::endl; + logFile << "# Speedup" << speedupColoring << " SORT - chunked-ellpack-host-speedup.txt" << std::endl; +#ifdef HAVE_CUDA + logFile << "# CUDA" << std::endl; + logFile << "# Gflops" << std::endl; + logFile << "# Throughput" << std::endl; + logFile << "# Speedup" << speedupColoring << " SORT - chunked-ellpack-cuda-speedup.txt" << std::endl; +#endif + return true; + } + logFile.open( fileName.getString(), std::ios::out | std::ios::app ); + //logFile << std::setprecision( 2 ); + if( ! logFile ) + return false; + return true; +} + +template< typename Matrix > +void printMatrixInfo( const String& inputFileName, + const Matrix& matrix, + std::ostream& str ) +{ + str << " Rows: " << std::setw( 8 ) << matrix.getRows(); + str << " Columns: " << std::setw( 8 ) << matrix.getColumns(); + str << " Nonzero Elements: " << std::setw( 10 ) << matrix.getNumberOfNonzeroMatrixElements(); + const double fillingRatio = ( double ) matrix.getNumberOfNonzeroMatrixElements() / ( double ) matrix.getNumberOfMatrixElements(); + str << " Filling: " << std::setw( 5 ) << 100.0 * fillingRatio << "%" << std::endl; + str << std::setw( 25 ) << "Format" + << std::setw( 15 ) << "Padding" + << std::setw( 15 ) << "Time" + << std::setw( 15 ) << "GFLOPS" + << std::setw( 15 ) << "Throughput" + << std::setw( 15 ) << "Speedup" << std::endl; +} + +template< typename Matrix > +bool writeMatrixInfo( const String& inputFileName, + const Matrix& matrix, + std::ostream& logFile ) +{ + logFile << std::endl; + logFile << inputFileName << std::endl; + logFile << " " << matrix.getRows() << std::endl; + logFile << " " << matrix.getColumns() << std::endl; + logFile << " " << matrix.getNumberOfNonzeroMatrixElements() << std::endl; + const double fillingRatio = ( double ) matrix.getNumberOfNonzeroMatrixElements() / ( double ) matrix.getNumberOfMatrixElements(); + logFile << " " << 100.0 * fillingRatio << std::endl; + logFile << std::flush; + if( ! logFile.good() ) + return false; + return true; +} + +double computeGflops( const long int nonzeroElements, + const int iterations, + const double& time ) +{ + return ( double ) ( 2 * iterations * nonzeroElements ) / time * 1.0e-9; +} + +template< typename Real > +double computeThroughput( const long int nonzeroElements, + const int iterations, + const int rows, + const double& time ) +{ + return ( double ) ( ( 2 * nonzeroElements + rows ) * iterations ) * sizeof( Real ) / time * 1.0e-9; +} + +template< typename Matrix, + typename Vector > +double benchmarkMatrix( const Matrix& matrix, + const Vector& x, + Vector& b, + const long int nonzeroElements, + const char* format, + const double& stopTime, + const double& baseline, + int verbose, + std::fstream& logFile ) +{ + Timer timer; + timer.start(); + double time( 0.0 ); + int iterations( 0 ); + while( time < stopTime ) + { + matrix.vectorProduct( x, b ); +#ifdef HAVE_CUDA + if( std::is_same< typename Matrix::DeviceType, Devices::Cuda >::value ) + cudaDeviceSynchronize(); +#endif + time = timer.getRealTime(); + iterations++; + } + const double gflops = computeGflops( nonzeroElements, iterations, time ); + const double throughput = computeThroughput< typename Matrix::RealType >( nonzeroElements, iterations, matrix.getRows(), time ); + const long int allocatedElements = matrix.getNumberOfMatrixElements(); + const double padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0; + if( verbose ) + { + std::cout << std::setw( 25 ) << format + << std::setw( 15 ) << padding + << std::setw( 15 ) << time + << std::setw( 15 ) << gflops + << std::setw( 15 ) << throughput; + if( baseline ) + std::cout << std::setw( 15 ) << gflops / baseline << std::endl; + else + std::cout << std::setw( 15 ) << "N/A" << std::endl; + } + logFile << " " << gflops << std::endl; + logFile << " " << throughput << std::endl; + if( baseline ) + logFile << gflops / baseline << std::endl; + else + logFile << "N/A" << std::endl; + return gflops; +} + +void writeTestFailed( std::fstream& logFile, + int repeat ) +{ + for( int i = 0; i < repeat; i++ ) + logFile << "N/A" << std::endl; +} + +template< typename Real > +bool setupBenchmark( const Config::ParameterContainer& parameters ) +{ + const String& test = parameters.getParameter< String >( "test" ); + const String& inputFileName = parameters.getParameter< String >( "input-file" ); + const String& logFileName = parameters.getParameter< String >( "log-file" ); + const int verbose = parameters.getParameter< int >( "verbose" ); + const double stopTime = parameters.getParameter< double >( "stop-time" ); + std::fstream logFile; + if( ! initLogFile( logFile, logFileName ) ) + { + std::cerr << "I am not able to open the file " << logFileName << "." << std::endl; + return false; + } + if( test == "mtx" ) + { + typedef Matrices::CSR< Real, Devices::Host, int > CSRType; + CSRType csrMatrix; + try + { + if( ! MatrixReader< CSRType >::readMtxFile( inputFileName, csrMatrix ) ) + { + std::cerr << "I am not able to read the matrix file " << inputFileName << "." << std::endl; + logFile << std::endl; + logFile << inputFileName << std::endl; + logFile << "Benchmark failed: Unable to read the matrix." << std::endl; + return false; + } + } + catch( std::bad_alloc ) + { + std::cerr << "Not enough memory to read the matrix." << std::endl; + logFile << std::endl; + logFile << inputFileName << std::endl; + logFile << "Benchmark failed: Not enough memory." << std::endl; + return false; + } + if( verbose ) + printMatrixInfo( inputFileName, csrMatrix,std::cout ); + if( ! writeMatrixInfo( inputFileName, csrMatrix, logFile ) ) + { + std::cerr << "I am not able to write new matrix to the log file." << std::endl; + return false; + } + const int rows = csrMatrix.getRows(); + const long int nonzeroElements = csrMatrix.getNumberOfMatrixElements(); + Containers::Vector< int, Devices::Host, int > rowLengthsHost; + rowLengthsHost.setSize( rows ); + for( int row = 0; row < rows; row++ ) + rowLengthsHost[ row ] = csrMatrix.getRowLength( row ); + + typedef Containers::Vector< Real, Devices::Host, int > HostVector; + HostVector hostX, hostB; + hostX.setSize( csrMatrix.getColumns() ); + hostX.setValue( 1.0 ); + hostB.setSize( csrMatrix.getRows() ); +#ifdef HAVE_CUDA + typedef Containers::Vector< Real, Devices::Cuda, int > CudaVector; + CudaVector cudaX, cudaB; + Containers::Vector< int, Devices::Cuda, int > rowLengthsCuda; + cudaX.setSize( csrMatrix.getColumns() ); + cudaX.setValue( 1.0 ); + cudaB.setSize( csrMatrix.getRows() ); + rowLengthsCuda.setSize( csrMatrix.getRows() ); + rowLengthsCuda = rowLengthsHost; + cusparseHandle_t cusparseHandle; + cusparseCreate( &cusparseHandle ); +#endif + const double baseline = benchmarkMatrix( csrMatrix, + hostX, + hostB, + nonzeroElements, + "CSR Host", + stopTime, + 0.0, + verbose, + logFile ); +#ifdef HAVE_CUDA + typedef CSR< Real, Devices::Cuda, int > CSRCudaType; + CSRCudaType cudaCSR; + //cout << "Copying matrix to GPU... "; + cudaCSR = csrMatrix; + TNL::CusparseCSR< Real > cusparseCSR; + cusparseCSR.init( cudaCSR, &cusparseHandle ); + benchmarkMatrix( cusparseCSR, + cudaX, + cudaB, + nonzeroElements, + "Cusparse CSR", + stopTime, + baseline, + verbose, + logFile ); + cusparseDestroy( cusparseHandle ); + + std::cout << " done. \r"; + /*cudaCSR.setCudaKernelType( CSRCudaType::scalar ); + benchmarkMatrix( cudaCSR, + cudaX, + cudaB, + nonzeroElements, + "CSR Cuda Scalar", + stopTime, + baseline, + verbose, + logFile ); + cudaCSR.setCudaKernelType( CSRCudaType::vector ); + cudaCSR.setCudaWarpSize( 1 ); + benchmarkMatrix( cudaCSR, + cudaX, + cudaB, + nonzeroElements, + "CSR Cuda Vector 1", + stopTime, + baseline, + verbose, + logFile ); + cudaCSR.setCudaWarpSize( 2 ); + benchmarkMatrix( cudaCSR, + cudaX, + cudaB, + nonzeroElements, + "CSR Cuda Vector 2", + stopTime, + baseline, + verbose, + logFile ); + cudaCSR.setCudaWarpSize( 4 ); + benchmarkMatrix( cudaCSR, + cudaX, + cudaB, + nonzeroElements, + "CSR Cuda Vector 4", + stopTime, + baseline, + verbose, + logFile ); + cudaCSR.setCudaWarpSize( 8 ); + benchmarkMatrix( cudaCSR, + cudaX, + cudaB, + nonzeroElements, + "CSR Cuda Vector 8", + stopTime, + baseline, + verbose, + logFile ); + cudaCSR.setCudaWarpSize( 16 ); + benchmarkMatrix( cudaCSR, + cudaX, + cudaB, + nonzeroElements, + "CSR Cuda Vector 16", + stopTime, + baseline, + verbose, + logFile ); + cudaCSR.setCudaWarpSize( 32 ); + benchmarkMatrix( cudaCSR, + cudaX, + cudaB, + nonzeroElements, + "CSR Cuda Vector 32", + stopTime, + baseline, + verbose, + logFile ); + cudaCSR.setCudaKernelType( CSRCudaType::hybrid ); + cudaCSR.setHybridModeSplit( 2 ); + benchmarkMatrix( cudaCSR, + cudaX, + cudaB, + nonzeroElements, + "CSR Cuda Hyrbid 2", + stopTime, + baseline, + verbose, + logFile ); + cudaCSR.setHybridModeSplit( 4 ); + benchmarkMatrix( cudaCSR, + cudaX, + cudaB, + nonzeroElements, + "CSR Cuda Hyrbid 4", + stopTime, + baseline, + verbose, + logFile ); + cudaCSR.setHybridModeSplit( 8 ); + benchmarkMatrix( cudaCSR, + cudaX, + cudaB, + nonzeroElements, + "CSR Cuda Hyrbid 8", + stopTime, + baseline, + verbose, + logFile ); + cudaCSR.setHybridModeSplit( 16 ); + benchmarkMatrix( cudaCSR, + cudaX, + cudaB, + nonzeroElements, + "CSR Cuda Hyrbid 16", + stopTime, + baseline, + verbose, + logFile ); + cudaCSR.setHybridModeSplit( 32 ); + benchmarkMatrix( cudaCSR, + cudaX, + cudaB, + nonzeroElements, + "CSR Cuda Hyrbid 32", + stopTime, + baseline, + verbose, + logFile ); + cudaCSR.setHybridModeSplit( 64 ); + benchmarkMatrix( cudaCSR, + cudaX, + cudaB, + nonzeroElements, + "CSR Cuda Hyrbid 64", + stopTime, + baseline, + verbose, + logFile );*/ + cudaCSR.reset(); +#endif + + long int allocatedElements; + double padding; + typedef Ellpack< Real, Devices::Host, int > EllpackType; + EllpackType ellpackMatrix; + Matrices::copySparseMatrix( ellpackMatrix, csrMatrix ); + allocatedElements = ellpackMatrix.getNumberOfMatrixElements(); + padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0; + logFile << " " << padding << std::endl; + benchmarkMatrix( ellpackMatrix, + hostX, + hostB, + nonzeroElements, + "Ellpack Host", + stopTime, + baseline, + verbose, + logFile ); +#ifdef HAVE_CUDA + typedef Ellpack< Real, Devices::Cuda, int > EllpackCudaType; + EllpackCudaType cudaEllpack; + std::cout << "Copying matrix to GPU... "; + cudaEllpack = ellpackMatrix; + std::cout << " done. \r"; + benchmarkMatrix( cudaEllpack, + cudaX, + cudaB, + nonzeroElements, + "Ellpack Cuda", + stopTime, + baseline, + verbose, + logFile ); + cudaEllpack.reset(); +#endif + ellpackMatrix.reset(); + + typedef Matrices::EllpackSymmetric< Real, Devices::Host, int > EllpackSymmetricType; + EllpackSymmetricType EllpackSymmetric; + if( ! MatrixReader< EllpackSymmetricType >::readMtxFile( inputFileName, EllpackSymmetric, verbose, true ) ) + writeTestFailed( logFile, 7 ); + else + { + allocatedElements = EllpackSymmetric.getNumberOfMatrixElements(); + padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0; + logFile << " " << padding < EllpackSymmetricCudaType; + EllpackSymmetricCudaType cudaEllpackSymmetric; + std::cout << "Copying matrix to GPU... "; + for( int i = 0; i < rowLengthsHost.getSize(); i++ ) + rowLengthsHost[ i ] = EllpackSymmetric.getRowLength( i ); + rowLengthsCuda = rowLengthsHost; + + // TODO: fix this + //if( ! cudaEllpackSymmetric.copyFrom( EllpackSymmetric, rowLengthsCuda ) ) + { + std::cerr << "I am not able to transfer the matrix on GPU." < SlicedEllpackMatrixType; + SlicedEllpackMatrixType slicedEllpackMatrix; + if( ! Matrices::MatrixReader< SlicedEllpackMatrixType >::readMtxFile( inputFileName, slicedEllpackMatrix, verbose ) ) + writeTestFailed( logFile, 7 ); + else + { + allocatedElements = slicedEllpackMatrix.getNumberOfMatrixElements(); + padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100; + logFile << " " << padding < SlicedEllpackMatrixCudaType; + SlicedEllpackMatrixCudaType cudaSlicedEllpackMatrix; + for( int i = 0; i < rowLengthsHost.getSize(); i++ ) + rowLengthsHost[ i ] = slicedEllpackMatrix.getRowLength( i ); + rowLengthsCuda = rowLengthsHost; + // TODO: fix + //if( ! cudaSlicedEllpackMatrix.copyFrom( slicedEllpackMatrix, rowLengthsCuda ) ) + { + std::cerr << "Nejde zkopirovat" < ChunkedEllpackType; + ChunkedEllpackType chunkedEllpack; + Matrices::copySparseMatrix( chunkedEllpack, csrMatrix ); + allocatedElements = chunkedEllpack.getNumberOfMatrixElements(); + padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0; + logFile << " " << padding << std::endl; + benchmarkMatrix( chunkedEllpack, + hostX, + hostB, + nonzeroElements, + "ChunkedEllpack Host", + stopTime, + baseline, + verbose, + logFile ); + +#ifdef HAVE_CUDA + typedef Matrices::ChunkedEllpack< Real, Devices::Cuda, int > ChunkedEllpackCudaType; + ChunkedEllpackCudaType cudaChunkedEllpack; + std::cout << "Copying matrix to GPU... "; + cudaChunkedEllpack = chunkedEllpack; + std::cout << " done. \r"; + benchmarkMatrix( cudaChunkedEllpack, + cudaX, + cudaB, + nonzeroElements, + "ChunkedEllpack Cuda", + stopTime, + baseline, + verbose, + logFile ); + cudaChunkedEllpack.reset(); +#endif + + typedef Matrices::BiEllpack< Real, Devices::Host, int > BiEllpackMatrixType; + BiEllpackMatrixType biEllpackMatrix; + // TODO: I did not check this during git merging, but I hope its gonna work + // Tomas Oberhuber + // copySparseMatrix( biEllpackMatrix, csrMatrix ); // TODO:Fix the getRow method to be compatible with othr formats + /*if( ! biEllpackMatrix.copyFrom( csrMatrix, rowLengthsHost ) ) + writeTestFailed( logFile, 7 ); + else*/ + { + allocatedElements = biEllpackMatrix.getNumberOfMatrixElements(); + padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0; + logFile << " " << padding < BiEllpackMatrixCudaType; + BiEllpackMatrixCudaType cudaBiEllpackMatrix; + // TODO: I did not check this during git merging, but I hope its gonna work + // Tomas Oberhuber + // copySparseMatrix( biEllpackMatrix, csrMatrix ); // TODO:Fix the getRow method to be compatible with othr formats + std::cout << "Copying matrix to GPU... "; + /*if( ! cudaBiEllpackMatrix.copyFrom( biEllpackMatrix, rowLengthsCuda ) ) + { + std::cerr << "I am not able to transfer the matrix on GPU." < SlicedEllpackSymmetricType; + SlicedEllpackSymmetricType slicedEllpackSymmetric; + if( ! Matrices::MatrixReader< SlicedEllpackSymmetricType >::readMtxFile( inputFileName, slicedEllpackSymmetric, verbose, true ) ) + writeTestFailed( logFile, 7 ); + else + { + allocatedElements = slicedEllpackSymmetric.getNumberOfMatrixElements(); + padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0; + logFile << " " << padding < SlicedEllpackSymmetricCudaType; + SlicedEllpackSymmetricCudaType cudaSlicedEllpackSymmetric; + std::cout << "Copying matrix to GPU... "; + for( int i = 0; i < rowLengthsHost.getSize(); i++ ) + rowLengthsHost[ i ] = slicedEllpackSymmetric.getRowLength( i ); + rowLengthsCuda = rowLengthsHost; + // TODO: fiox the nest line + //if( ! cudaSlicedEllpackSymmetric.copyFrom( slicedEllpackSymmetric, rowLengthsCuda ) ) + { + std::cerr << "I am not able to transfer the matrix on GPU." < EllpackSymmetricGraphMatrixType; + EllpackSymmetricGraphMatrixType EllpackSymmetricGraphMatrix; + if( ! Matrices::MatrixReader< EllpackSymmetricGraphMatrixType >::readMtxFile( inputFileName, EllpackSymmetricGraphMatrix, verbose, true ) || + ! EllpackSymmetricGraphMatrix.help() ) + writeTestFailed( logFile, 7 ); + else + { + allocatedElements = EllpackSymmetricGraphMatrix.getNumberOfMatrixElements(); + padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0; + logFile << " " << padding < EllpackSymmetricGraphMatrixCudaType; + EllpackSymmetricGraphMatrixCudaType cudaEllpackSymmetricGraphMatrix; + std::cout << "Copying matrix to GPU... "; + for( int i = 0; i < rowLengthsHost.getSize(); i++ ) + rowLengthsHost[ i ] = EllpackSymmetricGraphMatrix.getRowLength( i ); + rowLengthsCuda = rowLengthsHost; + // TODO: fix it + //if( ! cudaEllpackSymmetricGraphMatrix.copyFrom( EllpackSymmetricGraphMatrix, rowLengthsCuda ) ) + { + writeTestFailed( logFile, 3 ); + } + //else if( ! cudaEllpackSymmetricGraphMatrix.help() ) + { + writeTestFailed( logFile, 3 ); + } + //else + { + std::cout << " done. \r"; + benchmarkMatrix( cudaEllpackSymmetricGraphMatrix, + cudaX, + cudaB, + nonzeroElements, + "Ellpack Graph Cuda", + stopTime, + baseline, + verbose, + logFile ); + } + cudaEllpackSymmetricGraphMatrix.reset(); +#endif + } + + + typedef Matrices::AdEllpack< Real, Devices::Host, int > AdEllpackMatrixType; + AdEllpackMatrixType adEllpackMatrix; + // TODO: I did not check this during git merging, but I hope its gonna work + // Tomas Oberhuber + //copySparseMatrix( adEllpackMatrix, csrMatrix ); // TODO:Fix the getRow method to be compatible with othr formats + /*if( ! adEllpackMatrix.copyFrom( csrMatrix, rowLengthsHost ) ) + writeTestFailed( logFile, 7 ); + else*/ + { + allocatedElements = adEllpackMatrix.getNumberOfMatrixElements(); + padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0; + logFile << " " << padding < AdEllpackMatrixCudaType; + AdEllpackMatrixCudaType cudaAdEllpackMatrix; + // TODO: I did not check this during git merging, but I hope its gonna work + // Tomas Oberhuber + //copySparseMatrix( adEllpackMatrix, csrMatrix ); // TODO:Fix the getRow method to be compatible with othr formats + std::cout << "Copying matrix to GPU... "; + /*if( ! cudaAdEllpackMatrix.copyFrom( csrMatrix, rowLengthsCuda ) ) + { + std::cerr << "I am not able to transfer the matrix on GPU." <( "precision" ); + if( precision == "float" ) + if( ! setupBenchmark< float >( parameters ) ) + return EXIT_FAILURE; + if( precision == "double" ) + if( ! setupBenchmark< double >( parameters ) ) + return EXIT_FAILURE; + return EXIT_SUCCESS; +} + +#endif \ No newline at end of file diff --git a/src/Benchmarks/SpMV/tnlCusparseCSRMatrix.h b/src/Benchmarks/SpMV/OldSpMV/tnlCusparseCSRMatrix.h similarity index 99% rename from src/Benchmarks/SpMV/tnlCusparseCSRMatrix.h rename to src/Benchmarks/SpMV/OldSpMV/tnlCusparseCSRMatrix.h index 8f6d376fe27ebed3cd67307bf8f24ea2c5d630d4..fbef4f9a2410669f8c91ef51bf6de404ab1bb7fc 100644 --- a/src/Benchmarks/SpMV/tnlCusparseCSRMatrix.h +++ b/src/Benchmarks/SpMV/OldSpMV/tnlCusparseCSRMatrix.h @@ -8,6 +8,8 @@ /* See Copyright Notice in tnl/Copyright */ +#ifdef NOT_USED_ANYMORE + #include #include #ifdef HAVE_CUDA @@ -157,3 +159,4 @@ class CusparseCSR< float > : public CusparseCSRBase< float > } // namespace TNL +#endif \ No newline at end of file diff --git a/src/Benchmarks/SpMV/cusparseCSRMatrix.h b/src/Benchmarks/SpMV/cusparseCSRMatrix.h new file mode 100644 index 0000000000000000000000000000000000000000..8ed210d9ad2802cf56237c2a02ecda6913c5a352 --- /dev/null +++ b/src/Benchmarks/SpMV/cusparseCSRMatrix.h @@ -0,0 +1,158 @@ +/*************************************************************************** + tnlCusparseCSR.h - description + ------------------- + begin : Jul 3, 2014 + copyright : (C) 2014 by Tomas Oberhuber + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +#include +#include +#ifdef HAVE_CUDA +#include +#endif + +namespace TNL { + +template< typename Real > +class CusparseCSRBase +{ + public: + typedef Real RealType; + typedef Devices::Cuda DeviceType; + typedef Matrices::CSR< RealType, Devices::Cuda, int > MatrixType; + + CusparseCSRBase() + : matrix( 0 ) + { + }; + +#ifdef HAVE_CUDA + void init( const MatrixType& matrix, + cusparseHandle_t* cusparseHandle ) + { + this->matrix = &matrix; + this->cusparseHandle = cusparseHandle; + cusparseCreateMatDescr( & this->matrixDescriptor ); + }; +#endif + + int getRows() const + { + return matrix->getRows(); + } + + int getColumns() const + { + return matrix->getColumns(); + } + + int getNumberOfMatrixElements() const + { + return matrix->getNumberOfMatrixElements(); + } + + + template< typename InVector, + typename OutVector > + void vectorProduct( const InVector& inVector, + OutVector& outVector ) const + { + TNL_ASSERT_TRUE( matrix, "matrix was not initialized" ); +#ifdef HAVE_CUDA + cusparseDcsrmv( *( this->cusparseHandle ), + CUSPARSE_OPERATION_NON_TRANSPOSE, + this->matrix->getRows(), + this->matrix->getColumns(), + this->matrix->values.getSize(), + 1.0, + this->matrixDescriptor, + this->matrix->values.getData(), + this->matrix->rowPointers.getData(), + this->matrix->columnIndexes.getData(), + inVector.getData(), + 1.0, + outVector.getData() ); +#endif + } + + protected: + + const MatrixType* matrix; +#ifdef HAVE_CUDA + cusparseHandle_t* cusparseHandle; + + cusparseMatDescr_t matrixDescriptor; +#endif +}; + + +template< typename Real > +class CusparseCSR +{}; + +template<> +class CusparseCSR< double > : public CusparseCSRBase< double > +{ + public: + + template< typename InVector, + typename OutVector > + void vectorProduct( const InVector& inVector, + OutVector& outVector ) const + { + TNL_ASSERT_TRUE( matrix, "matrix was not initialized" ); +#ifdef HAVE_CUDA + double d = 1.0; + double* alpha = &d; + cusparseDcsrmv( *( this->cusparseHandle ), + CUSPARSE_OPERATION_NON_TRANSPOSE, + this->matrix->getRows(), + this->matrix->getColumns(), + this->matrix->getValues().getSize(), + alpha, + this->matrixDescriptor, + this->matrix->getValues().getData(), + this->matrix->getRowPointers().getData(), + this->matrix->getColumnIndexes().getData(), + inVector.getData(), + alpha, + outVector.getData() ); +#endif + } +}; + +template<> +class CusparseCSR< float > : public CusparseCSRBase< float > +{ + public: + + template< typename InVector, + typename OutVector > + void vectorProduct( const InVector& inVector, + OutVector& outVector ) const + { + TNL_ASSERT_TRUE( matrix, "matrix was not initialized" ); +#ifdef HAVE_CUDA + float d = 1.0; + float* alpha = &d; + cusparseScsrmv( *( this->cusparseHandle ), + CUSPARSE_OPERATION_NON_TRANSPOSE, + this->matrix->getRows(), + this->matrix->getColumns(), + this->matrix->getValues().getSize(), + alpha, + this->matrixDescriptor, + this->matrix->getValues().getData(), + this->matrix->getRowPointers().getData(), + this->matrix->getColumnIndexes().getData(), + inVector.getData(), + alpha, + outVector.getData() ); +#endif + } +}; + +} // namespace TNL \ No newline at end of file diff --git a/src/Benchmarks/SpMV/spmv.h b/src/Benchmarks/SpMV/spmv.h new file mode 100644 index 0000000000000000000000000000000000000000..408bcae2976bfd8b84233c22a93153d3ba22d1ac --- /dev/null +++ b/src/Benchmarks/SpMV/spmv.h @@ -0,0 +1,293 @@ +/*************************************************************************** + spmv.h - description + ------------------- + begin : Dec 30, 2018 + copyright : (C) 2015 by Tomas Oberhuber et al. + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Lukas Cejka +// Original implemented by J. Klinkovsky in Benchmarks/BLAS +// This is an edited copy of Benchmarks/BLAS/spmv.h by: Lukas Cejka + +#pragma once + +#include "../Benchmarks.h" + +#include +#include +#include +#include +#include +#include +#include + +#include +using namespace TNL::Matrices; + +#include "cusparseCSRMatrix.h" + +namespace TNL { +namespace Benchmarks { + +// Alias to match the number of template parameters with other formats +template< typename Real, typename Device, typename Index > +using SlicedEllpackAlias = Matrices::SlicedEllpack< Real, Device, Index >; + +// Get the name (with extension) of input matrix file +std::string getMatrixFileName( const String& InputFileName ) +{ + std::string fileName = InputFileName; + + const size_t last_slash_idx = fileName.find_last_of( "/\\" ); + if( std::string::npos != last_slash_idx ) + fileName.erase( 0, last_slash_idx + 1 ); + + return fileName; +} + +// Get only the name of the format from getType() +template< typename Matrix > +std::string getMatrixFormat( const Matrix& matrix ) +{ + std::string mtrxFullType = getType( matrix ); + std::string mtrxType = mtrxFullType.substr( 0, mtrxFullType.find( "<" ) ); + std::string format = mtrxType.substr( mtrxType.find( ':' ) + 2 ); + + return format; +} + +// Print information about the matrix. +template< typename Matrix > +void printMatrixInfo( const Matrix& matrix, + std::ostream& str ) +{ + str << "\n Format: " << getMatrixFormat( matrix ) << std::endl; + str << " Rows: " << matrix.getRows() << std::endl; + str << " Cols: " << matrix.getColumns() << std::endl; + str << " Nonzero Elements: " << matrix.getNumberOfNonzeroMatrixElements() << std::endl; +} + +template< typename Real, + template< typename, typename, typename > class Matrix, + template< typename, typename, typename, typename > class Vector = Containers::Vector > +bool +benchmarkSpMV( Benchmark& benchmark, + const String& inputFileName, + bool verboseMR ) +{ + // Setup CSR for cuSPARSE. It will compared to the format given as a template parameter to this function + typedef Matrices::CSR< Real, Devices::Host, int > CSR_HostMatrix; + typedef Matrices::CSR< Real, Devices::Cuda, int > CSR_DeviceMatrix; + + CSR_HostMatrix CSRhostMatrix; + CSR_DeviceMatrix CSRdeviceMatrix; + + // Read the matrix for CSR, to set up cuSPARSE + try + { + if( ! MatrixReader< CSR_HostMatrix >::readMtxFile( inputFileName, CSRhostMatrix, verboseMR ) ) + { + throw std::bad_alloc(); + return false; + } + } + catch( std::bad_alloc& e ) + { + e.what(); + return false; + } + +#ifdef HAVE_CUDA + // cuSPARSE handle setup + cusparseHandle_t cusparseHandle; + cusparseCreate( &cusparseHandle ); + + // cuSPARSE (in TNL's CSR) only works for device, copy the matrix from host to device + CSRdeviceMatrix = CSRhostMatrix; + + // Delete the CSRhostMatrix, so it doesn't take up unnecessary space + CSRhostMatrix.reset(); + + // Initialize the cusparseCSR matrix. + TNL::CusparseCSR< Real > cusparseCSR; + cusparseCSR.init( CSRdeviceMatrix, &cusparseHandle ); +#endif + + // Setup the format which is given as a template parameter to this function + typedef Matrix< Real, Devices::Host, int > HostMatrix; + typedef Matrix< Real, Devices::Cuda, int > DeviceMatrix; + typedef Containers::Vector< Real, Devices::Host, int > HostVector; + typedef Containers::Vector< Real, Devices::Cuda, int > CudaVector; + + HostMatrix hostMatrix; + DeviceMatrix deviceMatrix; + HostVector hostVector, hostVector2; + CudaVector deviceVector, deviceVector2; + + // Load the format + try + { + if( ! MatrixReader< HostMatrix >::readMtxFile( inputFileName, hostMatrix, verboseMR ) ) + { + throw std::bad_alloc(); + return false; + } + } + catch( std::bad_alloc& e ) + { + e.what(); + return false; + } + + + // Setup MetaData here (not in tnl-benchmark-spmv.h, as done in Benchmarks/BLAS), + // because we need the matrix loaded first to get the rows and columns + benchmark.setMetadataColumns( Benchmark::MetadataColumns({ + { "matrix name", convertToString( getMatrixFileName( inputFileName ) ) }, + { "non-zeros", convertToString( hostMatrix.getNumberOfNonzeroMatrixElements() ) }, + { "rows", convertToString( hostMatrix.getRows() ) }, + { "columns", convertToString( hostMatrix.getColumns() ) }, + { "matrix format", convertToString( getMatrixFormat( hostMatrix ) ) } + } )); + + hostVector.setSize( hostMatrix.getColumns() ); + hostVector2.setSize( hostMatrix.getRows() ); + +#ifdef HAVE_CUDA + deviceMatrix = hostMatrix; + deviceVector.setSize( hostMatrix.getColumns() ); + deviceVector2.setSize( hostMatrix.getRows() ); +#endif + + // reset function + auto reset = [&]() { + hostVector.setValue( 1.0 ); + hostVector2.setValue( 0.0 ); + #ifdef HAVE_CUDA + deviceVector.setValue( 1.0 ); + deviceVector2.setValue( 0.0 ); + #endif + }; + + const int elements = hostMatrix.getNumberOfNonzeroMatrixElements(); + + const double datasetSize = (double) elements * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB; + + // compute functions + auto spmvHost = [&]() { + hostMatrix.vectorProduct( hostVector, hostVector2 ); + }; +#ifdef HAVE_CUDA + auto spmvCuda = [&]() { + deviceMatrix.vectorProduct( deviceVector, deviceVector2 ); + }; + + auto spmvCusparse = [&]() { + cusparseCSR.vectorProduct( deviceVector, deviceVector2 ); + }; +#endif + + benchmark.setOperation( datasetSize ); + benchmark.time< Devices::Host >( reset, "CPU", spmvHost ); + + // Initialize the host vector to be compared. + // (The values in hostVector2 will be reset when spmvCuda starts) + HostVector resultHostVector2; + resultHostVector2.setSize( hostVector2.getSize() ); + resultHostVector2.setValue( 0.0 ); + + // Copy the values + resultHostVector2 = hostVector2; + + // Setup cuSPARSE MetaData, since it has the same header as CSR, + // and therefore will not get its own headers (rows, cols, speedup etc.) in log. + // * Not setting this up causes (among other undiscovered errors) the speedup from CPU to GPU on the input format to be overwritten. + benchmark.setMetadataColumns( Benchmark::MetadataColumns({ + { "matrix name", convertToString( getMatrixFileName( inputFileName ) ) }, + { "non-zeros", convertToString( hostMatrix.getNumberOfNonzeroMatrixElements() ) }, + { "rows", convertToString( hostMatrix.getRows() ) }, + { "columns", convertToString( hostMatrix.getColumns() ) }, + { "matrix format", convertToString( "CSR-cuSPARSE" ) } + } )); + +#ifdef HAVE_CUDA + benchmark.time< Devices::Cuda >( reset, "GPU", spmvCuda ); + + // Initialize the device vector to be compared. + // (The values in deviceVector2 will be reset when spmvCusparse starts) + HostVector resultDeviceVector2; + resultDeviceVector2.setSize( deviceVector2.getSize() ); + resultDeviceVector2.setValue( 0.0 ); + + resultDeviceVector2 = deviceVector2; + + benchmark.time< Devices::Cuda >( reset, "GPU", spmvCusparse ); + + HostVector resultcuSPARSEDeviceVector2; + resultcuSPARSEDeviceVector2.setSize( deviceVector2.getSize() ); + resultcuSPARSEDeviceVector2.setValue( 0.0 ); + + resultcuSPARSEDeviceVector2 = deviceVector2; + + // Difference between GPU (curent format) and GPU-cuSPARSE results + //Real cuSparseDifferenceAbsMax = resultDeviceVector2.differenceAbsMax( resultcuSPARSEDeviceVector2 ); + Real cuSparseDifferenceAbsMax = max( abs( resultDeviceVector2 - resultcuSPARSEDeviceVector2 ) ); + //Real cuSparseDifferenceLpNorm = resultDeviceVector2.differenceLpNorm( resultcuSPARSEDeviceVector2, 1 ); + Real cuSparseDifferenceLpNorm = lpNorm( resultDeviceVector2 - resultcuSPARSEDeviceVector2, 1 ); + + std::string GPUxGPUcuSparse_resultDifferenceAbsMax = "GPUxGPUcuSPARSE differenceAbsMax = " + std::to_string( cuSparseDifferenceAbsMax ); + std::string GPUxGPUcuSparse_resultDifferenceLpNorm = "GPUxGPUcuSPARSE differenceLpNorm = " + std::to_string( cuSparseDifferenceLpNorm ); + + char *GPUcuSparse_absMax = &GPUxGPUcuSparse_resultDifferenceAbsMax[ 0u ]; + char *GPUcuSparse_lpNorm = &GPUxGPUcuSparse_resultDifferenceLpNorm[ 0u ]; + + + // Difference between CPU and GPU results for the current format + //Real differenceAbsMax = resultHostVector2.differenceAbsMax( resultDeviceVector2 ); + Real differenceAbsMax = max( abs( resultHostVector2 - resultDeviceVector2 ) ); + //Real differenceLpNorm = resultHostVector2.differenceLpNorm( resultDeviceVector2, 1 ); + Real differenceLpNorm = lpNorm( resultHostVector2 - resultDeviceVector2, 1 ); + + std::string CPUxGPU_resultDifferenceAbsMax = "CPUxGPU differenceAbsMax = " + std::to_string( differenceAbsMax ); + std::string CPUxGPU_resultDifferenceLpNorm = "CPUxGPU differenceLpNorm = " + std::to_string( differenceLpNorm ); + + char *CPUxGPU_absMax = &CPUxGPU_resultDifferenceAbsMax[ 0u ]; + char *CPUxGPU_lpNorm = &CPUxGPU_resultDifferenceLpNorm[ 0u ]; + + // Print result differences of CPU and GPU of current format + std::cout << CPUxGPU_absMax << std::endl; + std::cout << CPUxGPU_lpNorm << std::endl; + + // Print result differences of GPU of current format and GPU with cuSPARSE. + std::cout << GPUcuSparse_absMax << std::endl; + std::cout << GPUcuSparse_lpNorm << std::endl; + #endif + + std::cout << std::endl; + return true; +} + +template< typename Real = double, + typename Index = int > +bool +benchmarkSpmvSynthetic( Benchmark& benchmark, + const String& inputFileName, + bool verboseMR ) +{ + bool result = true; + result |= benchmarkSpMV< Real, Matrices::CSR >( benchmark, inputFileName, verboseMR ); + result |= benchmarkSpMV< Real, Matrices::Ellpack >( benchmark, inputFileName, verboseMR ); + result |= benchmarkSpMV< Real, SlicedEllpackAlias >( benchmark, inputFileName, verboseMR ); + result |= benchmarkSpMV< Real, Matrices::ChunkedEllpack >( benchmark, inputFileName, verboseMR ); + + // AdEllpack is broken +// result |= benchmarkSpMV< Real, Matrices::AdEllpack >( benchmark, inputFileName, verboseMR ); + result |= benchmarkSpMV< Real, Matrices::BiEllpack >( benchmark, inputFileName, verboseMR ); + return result; +} + +} // namespace Benchmarks +} // namespace TNL diff --git a/src/Benchmarks/SpMV/tnl-benchmark-spmv.cpp b/src/Benchmarks/SpMV/tnl-benchmark-spmv.cpp index fadbcca0ce9d04b01103d4f24b0df03c169fc1c7..466a56914e9097fc5f467332ef91290b481ca293 100644 --- a/src/Benchmarks/SpMV/tnl-benchmark-spmv.cpp +++ b/src/Benchmarks/SpMV/tnl-benchmark-spmv.cpp @@ -1,14 +1,11 @@ /*************************************************************************** tnl-benchmark-spmv.cpp - description ------------------- - begin : Jun 5, 2014 - copyright : (C) 2014 by Tomas Oberhuber + begin : March 3, 2019 + copyright : (C) 2019 by Tomas Oberhuber email : tomas.oberhuber@fjfi.cvut.cz ***************************************************************************/ /* See Copyright Notice in tnl/Copyright */ - #include "tnl-benchmark-spmv.h" - - diff --git a/src/Benchmarks/SpMV/tnl-benchmark-spmv.cu b/src/Benchmarks/SpMV/tnl-benchmark-spmv.cu index fed383d86776e521dd31f299e3ba8baa9b0afdae..5a3a711ad22412b2998674cc820a80f2035d7fdc 100644 --- a/src/Benchmarks/SpMV/tnl-benchmark-spmv.cu +++ b/src/Benchmarks/SpMV/tnl-benchmark-spmv.cu @@ -1,12 +1,11 @@ /*************************************************************************** tnl-benchmark-spmv.cu - description ------------------- - begin : Jun 5, 2014 - copyright : (C) 2014 by Tomas Oberhuber + begin : March 3, 2019 + copyright : (C) 2019 by Tomas Oberhuber email : tomas.oberhuber@fjfi.cvut.cz ***************************************************************************/ /* See Copyright Notice in tnl/Copyright */ - #include "tnl-benchmark-spmv.h" diff --git a/src/Benchmarks/SpMV/tnl-benchmark-spmv.h b/src/Benchmarks/SpMV/tnl-benchmark-spmv.h index c707018ad7e8da62e47249315eb80da37280bbbb..77c079c4c562408a63182ca910c9ebfc5d68e111 100644 --- a/src/Benchmarks/SpMV/tnl-benchmark-spmv.h +++ b/src/Benchmarks/SpMV/tnl-benchmark-spmv.h @@ -1,921 +1,149 @@ /*************************************************************************** tnl-benchmark-spmv.h - description ------------------- - begin : Jun 5, 2014 - copyright : (C) 2014 by Tomas Oberhuber + begin : March 3, 2019 + copyright : (C) 2019 by Tomas Oberhuber et al. email : tomas.oberhuber@fjfi.cvut.cz ***************************************************************************/ /* See Copyright Notice in tnl/Copyright */ -#pragma once +// Implemented by: Lukas Cejka +// Original implemented by J. Klinkovsky in Benchmarks/BLAS +// This is an edited copy of Benchmarks/BLAS/spmv.h by: Lukas Cejka -#include -#include -#include -#ifdef HAVE_CUDA -#include -#endif +#pragma once +#include +#include #include #include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "tnlCusparseCSRMatrix.h" -using namespace std; -using namespace TNL; -using namespace TNL::Matrices; - -void setupConfig( Config::ConfigDescription& config ) -{ - config.addDelimiter ( "General settings:" ); - config.addRequiredEntry< String >( "test" , "Test to be performed." ); - config.addEntryEnum< String >( "mtx" ); - config.addEntryEnum< String >( "tnl" ); - config.addRequiredEntry< String >( "input-file" , "Input file name." ); - config.addEntry< String >( "log-file", "Log file name.", "tnl-benchmark-spmv.log"); - config.addEntry< String >( "precision", "Precision of the arithmetics.", "double" ); - config.addEntry< double >( "stop-time", "Seconds to iterate the SpMV operation.", 1.0 ); - config.addEntry< int >( "verbose", "Verbose mode.", 1 ); -} - -bool initLogFile( std::fstream& logFile, const String& fileName ) -{ - if( access( fileName.getString(), F_OK ) == -1 ) - { - logFile.open( fileName.getString(), std::ios::out ); - if( ! logFile ) - return false; - const String fillingColoring = " : COLORING 0 #FFF8DC 20 #FFFF00 40 #FFD700 60 #FF8C0 80 #FF0000 100"; - const String speedupColoring = " : COLORING #0099FF 1 #FFFFFF 2 #00FF99 4 #33FF99 8 #33FF22 16 #FF9900"; - const String paddingColoring = " : COLORING #FFFFFF 1 #FFFFCC 10 #FFFF99 100 #FFFF66 1000 #FFFF33 10000 #FFFF00"; - logFile << "#Matrix file " << std::endl; - logFile << "#Rows" << std::endl; - logFile << "#Columns" << std::endl; - logFile << "#Non-zero elements" << std::endl; - logFile << "#Filling (in %)" << fillingColoring << std::endl; - logFile << "#CSR Format" << std::endl; - logFile << "# CPU" << std::endl; - logFile << "# Gflops" << std::endl; - logFile << "# Throughput" << std::endl; - logFile << "# Speedup" << speedupColoring << std::endl; -#ifdef HAVE_CUDA - logFile << "# Cusparse CSR" << std::endl; - logFile << "# Gflops" << std::endl; - logFile << "# Throughput" << std::endl; - logFile << "# Speedup" << speedupColoring << " SORT - cusparse-csr-speedup.txt" << std::endl; - logFile << "# CUDA" << std::endl; - logFile << "# Scalar" << std::endl; - logFile << "# Gflops" << std::endl; - logFile << "# Throughput" << std::endl; - logFile << "# Speedup" << speedupColoring << " SORT - csr-scalar-cuda-speedup.txt" << std::endl; - logFile << "# Vector" << std::endl; - logFile << "# Warp Size 1" << std::endl; - logFile << "# Gflops" << std::endl; - logFile << "# Throughput" << std::endl; - logFile << "# Speedup" << speedupColoring << " SORT - csr-vector-1-cuda-speedup.txt" << std::endl; - logFile << "# Warp Size 2" << std::endl; - logFile << "# Gflops" << std::endl; - logFile << "# Throughput" << std::endl; - logFile << "# Speedup" << speedupColoring << " SORT - csr-vector-2-cuda-speedup.txt" << std::endl; - logFile << "# Warp Size 4" << std::endl; - logFile << "# Gflops" << std::endl; - logFile << "# Throughput" << std::endl; - logFile << "# Speedup" << speedupColoring << " SORT - csr-vector-4-cuda-speedup.txt" << std::endl; - logFile << "# Warp Size 8" << std::endl; - logFile << "# Gflops" << std::endl; - logFile << "# Throughput" << std::endl; - logFile << "# Speedup" << speedupColoring << " SORT - csr-vector-8-cuda-speedup.txt" << std::endl; - logFile << "# Warp Size 16" << std::endl; - logFile << "# Gflops" << std::endl; - logFile << "# Throughput" << std::endl; - logFile << "# Speedup" << speedupColoring << " SORT - csr-vector-16-cuda-speedup.txt" << std::endl; - logFile << "# Warp Size 32" << std::endl; - logFile << "# Gflops" << std::endl; - logFile << "# Throughput" << std::endl; - logFile << "# Speedup" << speedupColoring << " SORT - csr-vector-32-cuda-speedup.txt" << std::endl; - logFile << "# Hybrid" << std::endl; - logFile << "# Split 2" << std::endl; - logFile << "# Gflops" << std::endl; - logFile << "# Throughput" << std::endl; - logFile << "# Speedup" << speedupColoring << " SORT - csr-hybrid-2-cuda-speedup.txt" << std::endl; - logFile << "# Split 4" << std::endl; - logFile << "# Gflops" << std::endl; - logFile << "# Throughput" << std::endl; - logFile << "# Speedup" << speedupColoring << " SORT - csr-hybrid-4-cuda-speedup.txt" << std::endl; - logFile << "# Split 8" << std::endl; - logFile << "# Gflops" << std::endl; - logFile << "# Throughput" << std::endl; - logFile << "# Speedup" << speedupColoring << " SORT - csr-hybrid-8-cuda-speedup.txt" << std::endl; - logFile << "# Split 16" << std::endl; - logFile << "# Gflops" << std::endl; - logFile << "# Throughput" << std::endl; - logFile << "# Speedup" << speedupColoring << " SORT - csr-hybrid-16-cuda-speedup.txt" << std::endl; - logFile << "# Split 32" << std::endl; - logFile << "# Gflops" << std::endl; - logFile << "# Throughput" << std::endl; - logFile << "# Speedup" << speedupColoring << " SORT - csr-hybrid-32-cuda-speedup.txt" << std::endl; - logFile << "# Split 64" << std::endl; - logFile << "# Gflops" << std::endl; - logFile << "# Throughput" << std::endl; - logFile << "# Speedup" << speedupColoring << " SORT - csr-hybrid-64-cuda-speedup.txt" << std::endl; -#endif - logFile << "#Ellpack Format" << std::endl; - logFile << "# Padding (in %)" << paddingColoring << std::endl; - logFile << "# CPU" << std::endl; - logFile << "# Gflops" << std::endl; - logFile << "# Throughput" << std::endl; - logFile << "# Speedup" << speedupColoring << " SORT - ellpack-host-speedup.txt" << std::endl; -#ifdef HAVE_CUDA - logFile << "# CUDA" << std::endl; - logFile << "# Gflops" << std::endl; - logFile << "# Throughput" << std::endl; - logFile << "# Speedup" << speedupColoring << " SORT - ellpack-cuda-speedup.txt" << std::endl; -#endif - logFile << "#SlicedEllpack Format" << std::endl; - logFile << "# Padding (in %)" << paddingColoring << std::endl; - logFile << "# CPU" << std::endl; - logFile << "# Gflops" << std::endl; - logFile << "# Throughput" << std::endl; - logFile << "# Speedup" << speedupColoring << " SORT - sliced-ellpack-host-speedup.txt" << std::endl; -#ifdef HAVE_CUDA - logFile << "# CUDA" << std::endl; - logFile << "# Gflops" << std::endl; - logFile << "# Throughput" << std::endl; - logFile << "# Speedup" << speedupColoring << " SORT - sliced-ellpack-cuda-speedup.txt" << std::endl; -#endif - logFile << "#ChunkedEllpack Format" << std::endl; - logFile << "# Padding (in %)" << paddingColoring << std::endl; - logFile << "# CPU" << std::endl; - logFile << "# Gflops" << std::endl; - logFile << "# Throughput" << std::endl; - logFile << "# Speedup" << speedupColoring << " SORT - chunked-ellpack-host-speedup.txt" << std::endl; -#ifdef HAVE_CUDA - logFile << "# CUDA" << std::endl; - logFile << "# Gflops" << std::endl; - logFile << "# Throughput" << std::endl; - logFile << "# Speedup" << speedupColoring << " SORT - chunked-ellpack-cuda-speedup.txt" << std::endl; -#endif - return true; - } - logFile.open( fileName.getString(), std::ios::out | std::ios::app ); - //logFile << std::setprecision( 2 ); - if( ! logFile ) - return false; - return true; -} +#include +#include +#include "spmv.h" -template< typename Matrix > -void printMatrixInfo( const String& inputFileName, - const Matrix& matrix, - std::ostream& str ) -{ - str << " Rows: " << std::setw( 8 ) << matrix.getRows(); - str << " Columns: " << std::setw( 8 ) << matrix.getColumns(); - str << " Nonzero Elements: " << std::setw( 10 ) << matrix.getNumberOfNonzeroMatrixElements(); - const double fillingRatio = ( double ) matrix.getNumberOfNonzeroMatrixElements() / ( double ) matrix.getNumberOfMatrixElements(); - str << " Filling: " << std::setw( 5 ) << 100.0 * fillingRatio << "%" << std::endl; - str << std::setw( 25 ) << "Format" - << std::setw( 15 ) << "Padding" - << std::setw( 15 ) << "Time" - << std::setw( 15 ) << "GFLOPS" - << std::setw( 15 ) << "Throughput" - << std::setw( 15 ) << "Speedup" << std::endl; -} +#include +using namespace TNL::Matrices; -template< typename Matrix > -bool writeMatrixInfo( const String& inputFileName, - const Matrix& matrix, - std::ostream& logFile ) -{ - logFile << std::endl; - logFile << inputFileName << std::endl; - logFile << " " << matrix.getRows() << std::endl; - logFile << " " << matrix.getColumns() << std::endl; - logFile << " " << matrix.getNumberOfNonzeroMatrixElements() << std::endl; - const double fillingRatio = ( double ) matrix.getNumberOfNonzeroMatrixElements() / ( double ) matrix.getNumberOfMatrixElements(); - logFile << " " << 100.0 * fillingRatio << std::endl; - logFile << std::flush; - if( ! logFile.good() ) - return false; - return true; -} +#include // Used for file naming, so logs don't get overwritten. -double computeGflops( const long int nonzeroElements, - const int iterations, - const double& time ) -{ - return ( double ) ( 2 * iterations * nonzeroElements ) / time * 1.0e-9; -} +using namespace TNL; +using namespace TNL::Benchmarks; template< typename Real > -double computeThroughput( const long int nonzeroElements, - const int iterations, - const int rows, - const double& time ) +void +runSpMVBenchmarks( Benchmark & benchmark, + Benchmark::MetadataMap metadata, + const String & inputFileName, + bool verboseMR = false ) { - return ( double ) ( ( 2 * nonzeroElements + rows ) * iterations ) * sizeof( Real ) / time * 1.0e-9; + const String precision = getType< Real >(); + metadata["precision"] = precision; + + // Sparse matrix-vector multiplication + benchmark.newBenchmark( String("Sparse matrix-vector multiplication (") + precision + ")", + metadata ); + // Start the actual benchmark in spmv.h + benchmarkSpmvSynthetic< Real >( benchmark, inputFileName, verboseMR ); } -template< typename Matrix, - typename Vector > -double benchmarkMatrix( const Matrix& matrix, - const Vector& x, - Vector& b, - const long int nonzeroElements, - const char* format, - const double& stopTime, - const double& baseline, - int verbose, - std::fstream& logFile ) +// Get current date time to have different log files names and avoid overwriting. +std::string getCurrDateTime() { - Timer timer; - timer.start(); - double time( 0.0 ); - int iterations( 0 ); - while( time < stopTime ) - { - matrix.vectorProduct( x, b ); -#ifdef HAVE_CUDA - if( std::is_same< typename Matrix::DeviceType, Devices::Cuda >::value ) - cudaDeviceSynchronize(); -#endif - time = timer.getRealTime(); - iterations++; - } - const double gflops = computeGflops( nonzeroElements, iterations, time ); - const double throughput = computeThroughput< typename Matrix::RealType >( nonzeroElements, iterations, matrix.getRows(), time ); - const long int allocatedElements = matrix.getNumberOfMatrixElements(); - const double padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0; - if( verbose ) - { - std::cout << std::setw( 25 ) << format - << std::setw( 15 ) << padding - << std::setw( 15 ) << time - << std::setw( 15 ) << gflops - << std::setw( 15 ) << throughput; - if( baseline ) - std::cout << std::setw( 15 ) << gflops / baseline << std::endl; - else - std::cout << std::setw( 15 ) << "N/A" << std::endl; - } - logFile << " " << gflops << std::endl; - logFile << " " << throughput << std::endl; - if( baseline ) - logFile << gflops / baseline << std::endl; - else - logFile << "N/A" << std::endl; - return gflops; + time_t rawtime; + struct tm * timeinfo; + char buffer[ 80 ]; + time( &rawtime ); + timeinfo = localtime( &rawtime ); + strftime( buffer, sizeof( buffer ), "%d-%m-%Y--%H:%M:%S", timeinfo ); + std::string curr_date_time( buffer ); + + return curr_date_time; } -void writeTestFailed( std::fstream& logFile, - int repeat ) +void +setupConfig( Config::ConfigDescription & config ) { - for( int i = 0; i < repeat; i++ ) - logFile << "N/A" << std::endl; -} - -template< typename Real > -bool setupBenchmark( const Config::ParameterContainer& parameters ) -{ - const String& test = parameters.getParameter< String >( "test" ); - const String& inputFileName = parameters.getParameter< String >( "input-file" ); - const String& logFileName = parameters.getParameter< String >( "log-file" ); - const int verbose = parameters.getParameter< int >( "verbose" ); - const double stopTime = parameters.getParameter< double >( "stop-time" ); - std::fstream logFile; - if( ! initLogFile( logFile, logFileName ) ) - { - std::cerr << "I am not able to open the file " << logFileName << "." << std::endl; - return false; - } - if( test == "mtx" ) - { - typedef Matrices::CSR< Real, Devices::Host, int > CSRType; - CSRType csrMatrix; - try - { - if( ! MatrixReader< CSRType >::readMtxFile( inputFileName, csrMatrix ) ) - { - std::cerr << "I am not able to read the matrix file " << inputFileName << "." << std::endl; - logFile << std::endl; - logFile << inputFileName << std::endl; - logFile << "Benchmark failed: Unable to read the matrix." << std::endl; - return false; - } - } - catch( const std::bad_alloc& ) - { - std::cerr << "Not enough memory to read the matrix." << std::endl; - logFile << std::endl; - logFile << inputFileName << std::endl; - logFile << "Benchmark failed: Not enough memory." << std::endl; - return false; - } - if( verbose ) - printMatrixInfo( inputFileName, csrMatrix,std::cout ); - if( ! writeMatrixInfo( inputFileName, csrMatrix, logFile ) ) - { - std::cerr << "I am not able to write new matrix to the log file." << std::endl; - return false; - } - const int rows = csrMatrix.getRows(); - const long int nonzeroElements = csrMatrix.getNumberOfMatrixElements(); - Containers::Vector< int, Devices::Host, int > rowLengthsHost; - rowLengthsHost.setSize( rows ); - for( int row = 0; row < rows; row++ ) - rowLengthsHost[ row ] = csrMatrix.getRowLength( row ); - - typedef Containers::Vector< Real, Devices::Host, int > HostVector; - HostVector hostX, hostB; - hostX.setSize( csrMatrix.getColumns() ); - hostX.setValue( 1.0 ); - hostB.setSize( csrMatrix.getRows() ); -#ifdef HAVE_CUDA - typedef Containers::Vector< Real, Devices::Cuda, int > CudaVector; - CudaVector cudaX, cudaB; - Containers::Vector< int, Devices::Cuda, int > rowLengthsCuda; - cudaX.setSize( csrMatrix.getColumns() ); - cudaX.setValue( 1.0 ); - cudaB.setSize( csrMatrix.getRows() ); - rowLengthsCuda.setSize( csrMatrix.getRows() ); - rowLengthsCuda = rowLengthsHost; - cusparseHandle_t cusparseHandle; - cusparseCreate( &cusparseHandle ); -#endif - const double baseline = benchmarkMatrix( csrMatrix, - hostX, - hostB, - nonzeroElements, - "CSR Host", - stopTime, - 0.0, - verbose, - logFile ); -#ifdef HAVE_CUDA - typedef CSR< Real, Devices::Cuda, int > CSRCudaType; - CSRCudaType cudaCSR; - //cout << "Copying matrix to GPU... "; - cudaCSR = csrMatrix; - TNL::CusparseCSR< Real > cusparseCSR; - cusparseCSR.init( cudaCSR, &cusparseHandle ); - benchmarkMatrix( cusparseCSR, - cudaX, - cudaB, - nonzeroElements, - "Cusparse CSR", - stopTime, - baseline, - verbose, - logFile ); - cusparseDestroy( cusparseHandle ); - - std::cout << " done. \r"; - /*cudaCSR.setCudaKernelType( CSRCudaType::scalar ); - benchmarkMatrix( cudaCSR, - cudaX, - cudaB, - nonzeroElements, - "CSR Cuda Scalar", - stopTime, - baseline, - verbose, - logFile ); - cudaCSR.setCudaKernelType( CSRCudaType::vector ); - cudaCSR.setCudaWarpSize( 1 ); - benchmarkMatrix( cudaCSR, - cudaX, - cudaB, - nonzeroElements, - "CSR Cuda Vector 1", - stopTime, - baseline, - verbose, - logFile ); - cudaCSR.setCudaWarpSize( 2 ); - benchmarkMatrix( cudaCSR, - cudaX, - cudaB, - nonzeroElements, - "CSR Cuda Vector 2", - stopTime, - baseline, - verbose, - logFile ); - cudaCSR.setCudaWarpSize( 4 ); - benchmarkMatrix( cudaCSR, - cudaX, - cudaB, - nonzeroElements, - "CSR Cuda Vector 4", - stopTime, - baseline, - verbose, - logFile ); - cudaCSR.setCudaWarpSize( 8 ); - benchmarkMatrix( cudaCSR, - cudaX, - cudaB, - nonzeroElements, - "CSR Cuda Vector 8", - stopTime, - baseline, - verbose, - logFile ); - cudaCSR.setCudaWarpSize( 16 ); - benchmarkMatrix( cudaCSR, - cudaX, - cudaB, - nonzeroElements, - "CSR Cuda Vector 16", - stopTime, - baseline, - verbose, - logFile ); - cudaCSR.setCudaWarpSize( 32 ); - benchmarkMatrix( cudaCSR, - cudaX, - cudaB, - nonzeroElements, - "CSR Cuda Vector 32", - stopTime, - baseline, - verbose, - logFile ); - cudaCSR.setCudaKernelType( CSRCudaType::hybrid ); - cudaCSR.setHybridModeSplit( 2 ); - benchmarkMatrix( cudaCSR, - cudaX, - cudaB, - nonzeroElements, - "CSR Cuda Hyrbid 2", - stopTime, - baseline, - verbose, - logFile ); - cudaCSR.setHybridModeSplit( 4 ); - benchmarkMatrix( cudaCSR, - cudaX, - cudaB, - nonzeroElements, - "CSR Cuda Hyrbid 4", - stopTime, - baseline, - verbose, - logFile ); - cudaCSR.setHybridModeSplit( 8 ); - benchmarkMatrix( cudaCSR, - cudaX, - cudaB, - nonzeroElements, - "CSR Cuda Hyrbid 8", - stopTime, - baseline, - verbose, - logFile ); - cudaCSR.setHybridModeSplit( 16 ); - benchmarkMatrix( cudaCSR, - cudaX, - cudaB, - nonzeroElements, - "CSR Cuda Hyrbid 16", - stopTime, - baseline, - verbose, - logFile ); - cudaCSR.setHybridModeSplit( 32 ); - benchmarkMatrix( cudaCSR, - cudaX, - cudaB, - nonzeroElements, - "CSR Cuda Hyrbid 32", - stopTime, - baseline, - verbose, - logFile ); - cudaCSR.setHybridModeSplit( 64 ); - benchmarkMatrix( cudaCSR, - cudaX, - cudaB, - nonzeroElements, - "CSR Cuda Hyrbid 64", - stopTime, - baseline, - verbose, - logFile );*/ - cudaCSR.reset(); -#endif - - long int allocatedElements; - double padding; - typedef Ellpack< Real, Devices::Host, int > EllpackType; - EllpackType ellpackMatrix; - Matrices::copySparseMatrix( ellpackMatrix, csrMatrix ); - allocatedElements = ellpackMatrix.getNumberOfMatrixElements(); - padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0; - logFile << " " << padding << std::endl; - benchmarkMatrix( ellpackMatrix, - hostX, - hostB, - nonzeroElements, - "Ellpack Host", - stopTime, - baseline, - verbose, - logFile ); -#ifdef HAVE_CUDA - typedef Ellpack< Real, Devices::Cuda, int > EllpackCudaType; - EllpackCudaType cudaEllpack; - std::cout << "Copying matrix to GPU... "; - cudaEllpack = ellpackMatrix; - std::cout << " done. \r"; - benchmarkMatrix( cudaEllpack, - cudaX, - cudaB, - nonzeroElements, - "Ellpack Cuda", - stopTime, - baseline, - verbose, - logFile ); - cudaEllpack.reset(); -#endif - ellpackMatrix.reset(); - - typedef Matrices::EllpackSymmetric< Real, Devices::Host, int > EllpackSymmetricType; - EllpackSymmetricType EllpackSymmetric; - if( ! MatrixReader< EllpackSymmetricType >::readMtxFile( inputFileName, EllpackSymmetric, verbose, true ) ) - writeTestFailed( logFile, 7 ); - else - { - allocatedElements = EllpackSymmetric.getNumberOfMatrixElements(); - padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0; - logFile << " " << padding < EllpackSymmetricCudaType; - EllpackSymmetricCudaType cudaEllpackSymmetric; - std::cout << "Copying matrix to GPU... "; - for( int i = 0; i < rowLengthsHost.getSize(); i++ ) - rowLengthsHost[ i ] = EllpackSymmetric.getRowLength( i ); - rowLengthsCuda = rowLengthsHost; - - // TODO: fix this - //if( ! cudaEllpackSymmetric.copyFrom( EllpackSymmetric, rowLengthsCuda ) ) - { - std::cerr << "I am not able to transfer the matrix on GPU." < SlicedEllpackMatrixType; - SlicedEllpackMatrixType slicedEllpackMatrix; - if( ! Matrices::MatrixReader< SlicedEllpackMatrixType >::readMtxFile( inputFileName, slicedEllpackMatrix, verbose ) ) - writeTestFailed( logFile, 7 ); - else - { - allocatedElements = slicedEllpackMatrix.getNumberOfMatrixElements(); - padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100; - logFile << " " << padding < SlicedEllpackMatrixCudaType; - SlicedEllpackMatrixCudaType cudaSlicedEllpackMatrix; - for( int i = 0; i < rowLengthsHost.getSize(); i++ ) - rowLengthsHost[ i ] = slicedEllpackMatrix.getRowLength( i ); - rowLengthsCuda = rowLengthsHost; - // TODO: fix - //if( ! cudaSlicedEllpackMatrix.copyFrom( slicedEllpackMatrix, rowLengthsCuda ) ) - { - std::cerr << "Nejde zkopirovat" < ChunkedEllpackType; - ChunkedEllpackType chunkedEllpack; - Matrices::copySparseMatrix( chunkedEllpack, csrMatrix ); - allocatedElements = chunkedEllpack.getNumberOfMatrixElements(); - padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0; - logFile << " " << padding << std::endl; - benchmarkMatrix( chunkedEllpack, - hostX, - hostB, - nonzeroElements, - "ChunkedEllpack Host", - stopTime, - baseline, - verbose, - logFile ); - -#ifdef HAVE_CUDA - typedef Matrices::ChunkedEllpack< Real, Devices::Cuda, int > ChunkedEllpackCudaType; - ChunkedEllpackCudaType cudaChunkedEllpack; - std::cout << "Copying matrix to GPU... "; - cudaChunkedEllpack = chunkedEllpack; - std::cout << " done. \r"; - benchmarkMatrix( cudaChunkedEllpack, - cudaX, - cudaB, - nonzeroElements, - "ChunkedEllpack Cuda", - stopTime, - baseline, - verbose, - logFile ); - cudaChunkedEllpack.reset(); -#endif - - typedef Matrices::BiEllpack< Real, Devices::Host, int > BiEllpackMatrixType; - BiEllpackMatrixType biEllpackMatrix; - // TODO: I did not check this during git merging, but I hope its gonna work - // Tomas Oberhuber - // copySparseMatrix( biEllpackMatrix, csrMatrix ); // TODO:Fix the getRow method to be compatible with othr formats - /*if( ! biEllpackMatrix.copyFrom( csrMatrix, rowLengthsHost ) ) - writeTestFailed( logFile, 7 ); - else*/ - { - allocatedElements = biEllpackMatrix.getNumberOfMatrixElements(); - padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0; - logFile << " " << padding < BiEllpackMatrixCudaType; - BiEllpackMatrixCudaType cudaBiEllpackMatrix; - // TODO: I did not check this during git merging, but I hope its gonna work - // Tomas Oberhuber - // copySparseMatrix( biEllpackMatrix, csrMatrix ); // TODO:Fix the getRow method to be compatible with othr formats - std::cout << "Copying matrix to GPU... "; - /*if( ! cudaBiEllpackMatrix.copyFrom( biEllpackMatrix, rowLengthsCuda ) ) - { - std::cerr << "I am not able to transfer the matrix on GPU." < SlicedEllpackSymmetricType; - SlicedEllpackSymmetricType slicedEllpackSymmetric; - if( ! Matrices::MatrixReader< SlicedEllpackSymmetricType >::readMtxFile( inputFileName, slicedEllpackSymmetric, verbose, true ) ) - writeTestFailed( logFile, 7 ); - else - { - allocatedElements = slicedEllpackSymmetric.getNumberOfMatrixElements(); - padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0; - logFile << " " << padding < SlicedEllpackSymmetricCudaType; - SlicedEllpackSymmetricCudaType cudaSlicedEllpackSymmetric; - std::cout << "Copying matrix to GPU... "; - for( int i = 0; i < rowLengthsHost.getSize(); i++ ) - rowLengthsHost[ i ] = slicedEllpackSymmetric.getRowLength( i ); - rowLengthsCuda = rowLengthsHost; - // TODO: fiox the nest line - //if( ! cudaSlicedEllpackSymmetric.copyFrom( slicedEllpackSymmetric, rowLengthsCuda ) ) - { - std::cerr << "I am not able to transfer the matrix on GPU." < EllpackSymmetricGraphMatrixType; - EllpackSymmetricGraphMatrixType EllpackSymmetricGraphMatrix; - if( ! Matrices::MatrixReader< EllpackSymmetricGraphMatrixType >::readMtxFile( inputFileName, EllpackSymmetricGraphMatrix, verbose, true ) || - ! EllpackSymmetricGraphMatrix.help() ) - writeTestFailed( logFile, 7 ); - else - { - allocatedElements = EllpackSymmetricGraphMatrix.getNumberOfMatrixElements(); - padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0; - logFile << " " << padding < EllpackSymmetricGraphMatrixCudaType; - EllpackSymmetricGraphMatrixCudaType cudaEllpackSymmetricGraphMatrix; - std::cout << "Copying matrix to GPU... "; - for( int i = 0; i < rowLengthsHost.getSize(); i++ ) - rowLengthsHost[ i ] = EllpackSymmetricGraphMatrix.getRowLength( i ); - rowLengthsCuda = rowLengthsHost; - // TODO: fix it - //if( ! cudaEllpackSymmetricGraphMatrix.copyFrom( EllpackSymmetricGraphMatrix, rowLengthsCuda ) ) - { - writeTestFailed( logFile, 3 ); - } - //else if( ! cudaEllpackSymmetricGraphMatrix.help() ) - { - writeTestFailed( logFile, 3 ); - } - //else - { - std::cout << " done. \r"; - benchmarkMatrix( cudaEllpackSymmetricGraphMatrix, - cudaX, - cudaB, - nonzeroElements, - "Ellpack Graph Cuda", - stopTime, - baseline, - verbose, - logFile ); - } - cudaEllpackSymmetricGraphMatrix.reset(); -#endif - } + config.addDelimiter( "Benchmark settings:" ); + config.addRequiredEntry< String >( "input-file", "Input file name." ); + config.addEntry< String >( "log-file", "Log file name.", "tnl-benchmark-spmv::" + getCurrDateTime() + ".log"); + config.addEntry< String >( "output-mode", "Mode for opening the log file.", "overwrite" ); + config.addEntryEnum( "append" ); + config.addEntryEnum( "overwrite" ); + config.addEntry< String >( "precision", "Precision of the arithmetics.", "double" ); + config.addEntryEnum( "float" ); + config.addEntryEnum( "double" ); + config.addEntryEnum( "all" ); + config.addEntry< int >( "loops", "Number of iterations for every computation.", 10 ); + config.addEntry< int >( "verbose", "Verbose mode.", 1 ); + config.addEntry< int >( "verbose-MReader", "Verbose mode for Matrix Reader.", 0 ); - - typedef Matrices::AdEllpack< Real, Devices::Host, int > AdEllpackMatrixType; - AdEllpackMatrixType adEllpackMatrix; - // TODO: I did not check this during git merging, but I hope its gonna work - // Tomas Oberhuber - //copySparseMatrix( adEllpackMatrix, csrMatrix ); // TODO:Fix the getRow method to be compatible with othr formats - /*if( ! adEllpackMatrix.copyFrom( csrMatrix, rowLengthsHost ) ) - writeTestFailed( logFile, 7 ); - else*/ - { - allocatedElements = adEllpackMatrix.getNumberOfMatrixElements(); - padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0; - logFile << " " << padding < AdEllpackMatrixCudaType; - AdEllpackMatrixCudaType cudaAdEllpackMatrix; - // TODO: I did not check this during git merging, but I hope its gonna work - // Tomas Oberhuber - //copySparseMatrix( adEllpackMatrix, csrMatrix ); // TODO:Fix the getRow method to be compatible with othr formats - std::cout << "Copying matrix to GPU... "; - /*if( ! cudaAdEllpackMatrix.copyFrom( csrMatrix, rowLengthsCuda ) ) - { - std::cerr << "I am not able to transfer the matrix on GPU." <( "input-file" ); + const String & logFileName = parameters.getParameter< String >( "log-file" ); + const String & outputMode = parameters.getParameter< String >( "output-mode" ); + const String & precision = parameters.getParameter< String >( "precision" ); + const int loops = parameters.getParameter< int >( "loops" ); + const int verbose = parameters.getParameter< int >( "verbose" ); + const int verboseMR = parameters.getParameter< int >( "verbose-MReader" ); + + // open log file + auto mode = std::ios::out; + if( outputMode == "append" ) + mode |= std::ios::app; + std::ofstream logFile( logFileName.getString(), mode ); + + // init benchmark and common metadata + Benchmark benchmark( loops, verbose ); + + // prepare global metadata + Benchmark::MetadataMap metadata = getHardwareMetadata(); + + + // Initiate setup of benchmarks + if( precision == "all" || precision == "float" ) + runSpMVBenchmarks< float >( benchmark, metadata, inputFileName, verboseMR ); + if( precision == "all" || precision == "double" ) + runSpMVBenchmarks< double >( benchmark, metadata, inputFileName, verboseMR ); + + if( ! benchmark.save( logFile ) ) { + std::cerr << "Failed to write the benchmark results to file '" << parameters.getParameter< String >( "log-file" ) << "'." << std::endl; + return EXIT_FAILURE; } - const String& precision = parameters.getParameter< String >( "precision" ); - if( precision == "float" ) - if( ! setupBenchmark< float >( parameters ) ) - return EXIT_FAILURE; - if( precision == "double" ) - if( ! setupBenchmark< double >( parameters ) ) - return EXIT_FAILURE; + + // Confirm that the benchmark has finished + std::cout << "\n== BENCHMARK FINISHED ==" << std::endl; return EXIT_SUCCESS; } diff --git a/src/Benchmarks/scripts/run-tnl-benchmark-spmv b/src/Benchmarks/scripts/run-tnl-benchmark-spmv index 75ea08219bb454533bd882594d44772586ef50cb..88b4d70d02104c74e250b0e80b3e7b22dd02f131 100755 --- a/src/Benchmarks/scripts/run-tnl-benchmark-spmv +++ b/src/Benchmarks/scripts/run-tnl-benchmark-spmv @@ -12,31 +12,32 @@ BENCHMARK_DBG="tnl-benchmark-spmv-dbg" export CUDA_PROFILE_CONFIG="$IWD/cuda-profiler.conf" PROCESS_CUDA_PROFILE="$IWD/process-cuda-profile.pl" -source matrix-market +#source matrix-market source florida-matrix-market -for link in $MM_MATRICES; -do - echo "======================================================================================================" - matrix=matrices`echo $link | sed 's/ftp:\/\/math.nist.gov\/pub//'` - unzipped_matrix=`echo $matrix | sed 's/.gz//'` - if test ! -e $matrix; - then - echo "Matrix $matrix is missing !!! Run the script 'get-matrices' first." - #echo "Matrix $matrix is missing !!! Run the script 'get-matrices' first." >> sparse-matrix-benchmark.log - else - gunzip -c ${matrix} > ${unzipped_matrix} - echo "Benchmarking with the matrix $unzipped_matrix ..." - export CUDA_PROFILE_LOG=$unzipped_matrix.float.log - if test x$DEBUG = xyes; - then - gdb --args ${BENCHMARK_DBG} --test mtx --input-file $unzipped_matrix --log-file sparse-matrix-benchmark.log --stop-time $STOP_TIME --verbose 1 - else - $BENCHMARK --test mtx --input-file $unzipped_matrix --pdf-file $unzipped_matrix.pdf --log-file sparse-matrix-benchmark.log --stop-time $STOP_TIME --verbose 1 - fi - #perl $PROCESS_CUDA_PROFILE $unzipped_matrix.float.log sparse-matrix-profiling-float.log - fi -done +# !!!Matrices in MatrixMarket2 don't load properly, formatting issues with every file. MatrixReader fails. +#for link in $MM_MATRICES; +#do +# echo "======================================================================================================" +# matrix=matrices`echo $link | sed 's/ftp:\/\/math.nist.gov\/pub//'` +# unzipped_matrix=`echo $matrix | sed 's/.gz//'` +# if test ! -e $matrix; +# then +# echo "Matrix $matrix is missing !!! Run the script 'get-matrices' first." +# #echo "Matrix $matrix is missing !!! Run the script 'get-matrices' first." >> sparse-matrix-benchmark.log +# else +# gunzip -c ${matrix} > ${unzipped_matrix} +# echo "Benchmarking with the matrix $unzipped_matrix ..." +# export CUDA_PROFILE_LOG=$unzipped_matrix.float.log +# if test x$DEBUG = xyes; +# then +# gdb --args ${BENCHMARK_DBG} --input-file $unzipped_matrix --log-file sparse-matrix-benchmark.log --verbose 1 +# else +# $BENCHMARK --input-file $unzipped_matrix --log-file sparse-matrix-benchmark.log --verbose 1 +# fi +# #perl $PROCESS_CUDA_PROFILE $unzipped_matrix.float.log sparse-matrix-profiling-float.log +# fi +#done for link in $FLORIDA_MM_MATRICES; do @@ -51,17 +52,23 @@ do cd $DIRNAME tar zxvf $FILENAME cd $IWD + if [ ! -d "log-files" ]; + then + mkdir log-files + fi SUBDIRNAME=`echo $FILENAME | sed 's/.tar.gz//'` rm -f $DIRNAME/$SUBDIRNAME/*_b.mtx # these are usualy in array format for file in $DIRNAME/$SUBDIRNAME/*.mtx; do echo "======================================================================================================" echo "Benchmarking with the matrix $file ..." + mtx_file_name=`basename $file` + mtx_file_name=${mtx_file_name%.mtx} if test x$DEBUG = xyes; then - gdb --args $BENCHMARK --test mtx --input-file $file --pdf-file $file.pdf --log-file sparse-matrix-benchmark.log --stop-time $STOP_TIME --verbose 1 + gdb --args $BENCHMARK --input-file $file --log-file log-files/sparse-matrix-benchmark.log --output-mode append --verbose 1 else - $BENCHMARK --test mtx --input-file $file --pdf-file $file.pdf --log-file sparse-matrix-benchmark.log --stop-time $STOP_TIME --verbose 1 + $BENCHMARK --input-file $file --log-file log-files/sparse-matrix-benchmark.log --output-mode append --verbose 1 fi done fi diff --git a/src/TNL/Matrices/AdEllpack.h b/src/TNL/Matrices/AdEllpack.h index a50a17232fb184086930e69834ea4d498d9ab5a2..f011e6c804429b4059b972b5249feaa1de5f8922 100644 --- a/src/TNL/Matrices/AdEllpack.h +++ b/src/TNL/Matrices/AdEllpack.h @@ -27,56 +27,94 @@ namespace Matrices { template< typename Device > class AdEllpackDeviceDependentCode; +template< typename MatrixType > struct warpInfo { - int offset; - int rowOffset; - int localLoad; - int reduceMap[ 32 ]; - - warpInfo* next; - warpInfo* previous; + using RealType = typename MatrixType::RealType; + using DeviceType = typename MatrixType::DeviceType; + using IndexType = typename MatrixType::IndexType; + + IndexType offset; + IndexType rowOffset; + IndexType localLoad; + IndexType reduceMap[ 32 ]; + + warpInfo< MatrixType >* next; + warpInfo< MatrixType >* previous; }; +template< typename MatrixType > class warpList { public: + + using RealType = typename MatrixType::RealType; + using DeviceType = typename MatrixType::DeviceType; + using IndexType = typename MatrixType::IndexType; warpList(); - bool addWarp( const int offset, - const int rowOffset, - const int localLoad, - const int* reduceMap ); + bool addWarp( const IndexType offset, + const IndexType rowOffset, + const IndexType localLoad, + const IndexType* reduceMap ); - warpInfo* splitInHalf( warpInfo* warp ); + warpInfo< MatrixType >* splitInHalf( warpInfo< MatrixType >* warp ); - int getNumberOfWarps() + IndexType getNumberOfWarps() { return this->numberOfWarps; } - warpInfo* getNextWarp( warpInfo* warp ) + warpInfo< MatrixType >* getNextWarp( warpInfo< MatrixType >* warp ) { return warp->next; } - warpInfo* getHead() + warpInfo< MatrixType >* getHead() { return this->head; } - warpInfo* getTail() + warpInfo< MatrixType >* getTail() { return this->tail; } ~warpList(); + + void printList() + { + if( this->getHead() == this->getTail() ) + std::cout << "HEAD==TAIL" << std::endl; + else + { + for( warpInfo< MatrixType >* i = this->getHead(); i != this->getTail()->next; i = i->next ) + { + if( i == this->getHead() ) + std::cout << "Head:" << "\ti->localLoad = " << i->localLoad << "\ti->offset = " << i->offset << "\ti->rowOffset = " << i->rowOffset << std::endl; + else if( i == this->getTail() ) + std::cout << "Tail:" << "\ti->localLoad = " << i->localLoad << "\ti->offset = " << i->offset << "\ti->rowOffset = " << i->rowOffset << std::endl; + else + std::cout << "\ti->localLoad = " << i->localLoad << "\ti->offset = " << i->offset << "\ti->rowOffset = " << i->rowOffset << std::endl; + } + std::cout << std::endl; + } + } private: - int numberOfWarps; + IndexType numberOfWarps; - warpInfo* head; - warpInfo* tail; + warpInfo< MatrixType >* head; + warpInfo< MatrixType >* tail; }; template< typename Real, typename Device, typename Index > class AdEllpack : public Sparse< Real, Device, Index > { +private: + // convenient template alias for controlling the selection of copy-assignment operator + template< typename Device2 > + using Enabler = std::enable_if< ! std::is_same< Device2, Device >::value >; + + // friend class will be needed for templated assignment operators + template< typename Real2, typename Device2, typename Index2 > + friend class AdEllpack; + public: typedef Real RealType; @@ -102,9 +140,15 @@ public: IndexType getRowLength( const IndexType row ) const; template< typename Real2, typename Device2, typename Index2 > - bool setLike( const AdEllpack< Real2, Device2, Index2 >& matrix ); + void setLike( const AdEllpack< Real2, Device2, Index2 >& matrix ); void reset(); + + template< typename Real2, typename Device2, typename Index2 > + bool operator == ( const AdEllpack< Real2, Device2, Index2 >& matrix ) const; + + template< typename Real2, typename Device2, typename Index2 > + bool operator != ( const AdEllpack< Real2, Device2, Index2 >& matrix ) const; bool setElement( const IndexType row, const IndexType column, @@ -142,7 +186,15 @@ public: typename OutVector > void vectorProduct( const InVector& inVector, OutVector& outVector ) const; - + + // copy assignment + AdEllpack& operator=( const AdEllpack& matrix ); + + // cross-device copy assignment + template< typename Real2, typename Device2, typename Index2, + typename = typename Enabler< Device2 >::type > + AdEllpack& operator=( const AdEllpack< Real2, Device2, Index2 >& matrix ); + void save( File& file ) const; void load( File& file ); @@ -155,13 +207,13 @@ public: bool balanceLoad( const RealType average, ConstCompressedRowLengthsVectorView rowLengths, - warpList* list ); + warpList< AdEllpack >* list ); void computeWarps( const IndexType SMs, const IndexType threadsPerSM, - warpList* list ); + warpList< AdEllpack >* list ); - bool createArrays( warpList* list ); + bool createArrays( warpList< AdEllpack >* list ); void performRowTest(); diff --git a/src/TNL/Matrices/AdEllpack_impl.h b/src/TNL/Matrices/AdEllpack_impl.h index a0f293b3df94afcfeda8124f8e1d8173cb4c7718..b7b97ff93550ef8c7289b749156e1fd5973e2f7d 100644 --- a/src/TNL/Matrices/AdEllpack_impl.h +++ b/src/TNL/Matrices/AdEllpack_impl.h @@ -21,10 +21,11 @@ namespace Matrices { /* * Auxiliary list implementation */ -warpList::warpList() +template< typename MatrixType > +warpList< MatrixType >::warpList() { - this->head = new warpInfo; - this->tail = new warpInfo; + this->head = new warpInfo< MatrixType >; + this->tail = new warpInfo< MatrixType >; this->head->previous = NULL; this->head->next = this->tail; this->tail->previous = this->head; @@ -33,12 +34,13 @@ warpList::warpList() this->numberOfWarps = 0; } -bool warpList::addWarp( const int offset, - const int rowOffset, - const int localLoad, - const int* reduceMap ) +template< typename MatrixType > +bool warpList< MatrixType >::addWarp( const IndexType offset, + const IndexType rowOffset, + const IndexType localLoad, + const IndexType* reduceMap ) { - warpInfo* temp = new warpInfo(); + warpInfo< MatrixType >* temp = new warpInfo< MatrixType >(); if( !temp ) return false; temp->offset = offset; @@ -56,13 +58,15 @@ bool warpList::addWarp( const int offset, return true; } -warpInfo* warpList::splitInHalf( warpInfo* warp ) +template< typename MatrixType > +warpInfo< MatrixType >* warpList< MatrixType >::splitInHalf( warpInfo< MatrixType >* warp ) { - warpInfo* firstHalf = new warpInfo(); - warpInfo* secondHalf = new warpInfo(); - int localLoad = ( warp->localLoad / 2 ) + ( warp->localLoad % 2 == 0 ? 0 : 1 ); + warpInfo< MatrixType >* firstHalf = new warpInfo< MatrixType >(); + warpInfo< MatrixType >* secondHalf = new warpInfo< MatrixType >(); - int rowOffset = warp->rowOffset; + IndexType localLoad = ( warp->localLoad / 2 ) + ( warp->localLoad % 2 == 0 ? 0 : 1 ); + + IndexType rowOffset = warp->rowOffset; // first half split firstHalf->localLoad = localLoad; @@ -132,11 +136,12 @@ warpInfo* warpList::splitInHalf( warpInfo* warp ) return firstHalf; } -warpList::~warpList() +template< typename MatrixType > +warpList< MatrixType >::~warpList() { while( this->head->next != NULL ) { - warpInfo* temp = new warpInfo; + warpInfo< MatrixType >* temp = new warpInfo< MatrixType >; temp = this->head->next; this->head->next = temp->next; delete temp; @@ -164,17 +169,20 @@ void AdEllpack< Real, Device, Index >:: setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths ) { + TNL_ASSERT( this->getRows() > 0, ); TNL_ASSERT( this->getColumns() > 0, ); + if( std::is_same< DeviceType, Devices::Host >::value ) { - RealType average = 0.0; - for( IndexType row = 0; row < this->getRows(); row++ ) - average += rowLengths.getElement( row ); - average /= ( RealType ) this->getRows(); - this->totalLoad = average; - warpList* list = new warpList(); + RealType average = 0.0; + for( IndexType row = 0; row < this->getRows(); row++ ) + average += rowLengths.getElement( row ); + average /= ( RealType ) this->getRows(); + this->totalLoad = average; + + warpList< AdEllpack >* list = new warpList< AdEllpack >(); if( !this->balanceLoad( average, rowLengths, list ) ) throw 0; // TODO: Make better exception @@ -186,18 +194,11 @@ setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths ) if( !this->createArrays( list ) ) throw 0; // TODO: Make better excpetion - - - - //this->performRowTest(); - //cout << "========================" << std::endl; - //cout << "Testing row lengths" << std::endl; - //cout << "========================" << std::endl; - //this->performRowLengthsTest( rowLengths ); } if( std::is_same< DeviceType, Devices::Cuda >::value ) { + AdEllpack< RealType, Devices::Host, IndexType > hostMatrix; hostMatrix.setDimensions( this->getRows(), this->getColumns() ); Containers::Vector< IndexType, Devices::Host, IndexType > hostRowLengths; @@ -213,7 +214,7 @@ setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths ) this->localLoad = hostMatrix.localLoad; this->reduceMap.setLike( hostMatrix.reduceMap ); this->reduceMap = hostMatrix.reduceMap; - this->totalLoad = hostMatrix.getTotalLoad(); + this->totalLoad = hostMatrix.getTotalLoad(); this->allocateMatrixElements( this->offset.getElement( this->offset.getSize() - 1 ) ); } @@ -279,7 +280,7 @@ void AdEllpack< Real, Device, Index >::performRowTest() } if( row == this->rowOffset.getElement( warp + 1 ) || row + 1 == this->rowOffset.getElement( warp + 1 ) ) ; - else + else { std::cout << "Error warp = " << warp << std::endl; std::cout << "Row: " << row << ", Row offset: " << this->rowOffset.getElement( warp + 1 ) << std::endl; @@ -357,15 +358,13 @@ template< typename Real, template< typename Real2, typename Device2, typename Index2 > -bool AdEllpack< Real, Device, Index >::setLike( const AdEllpack< Real2, Device2, Index2 >& matrix ) +void AdEllpack< Real, Device, Index >::setLike( const AdEllpack< Real2, Device2, Index2 >& matrix ) { - if( !Sparse< Real, Device, Index >::setLike( matrix ) || - !this->offset.setLike( matrix.offset ) || - !this->rowOffset.setLike( matrix.rowOffset ) || - !this->localLoad.setLike( matrix.localLoad ) || - !this->reduceMap.setLike( matrix.reduceMap ) ) - return false; - return true; + Sparse< Real, Device, Index >::setLike( matrix ); + this->offset.setLike( matrix.offset ); + this->rowOffset.setLike( matrix.rowOffset ); + this->localLoad.setLike( matrix.localLoad ); + this->reduceMap.setLike( matrix.reduceMap ); } template< typename Real, @@ -380,6 +379,38 @@ void AdEllpack< Real, Device, Index >::reset() this->reduceMap.reset(); } +template< typename Real, + typename Device, + typename Index > + template< typename Real2, + typename Device2, + typename Index2 > +bool AdEllpack< Real, Device, Index >::operator == ( const AdEllpack< Real2, Device2, Index2 >& matrix ) const +{ + TNL_ASSERT( this->getRows() == matrix.getRows() && + this->getColumns() == matrix.getColumns(), + std::cerr << "this->getRows() = " << this->getRows() + << " matrix.getRows() = " << matrix.getRows() + << " this->getColumns() = " << this->getColumns() + << " matrix.getColumns() = " << matrix.getColumns() ); + + TNL_ASSERT_TRUE( false, "operator == is not yet implemented for AdEllpack."); + + // TODO: implement this + return false; +} + +template< typename Real, + typename Device, + typename Index > + template< typename Real2, + typename Device2, + typename Index2 > +bool AdEllpack< Real, Device, Index >::operator != ( const AdEllpack< Real2, Device2, Index2 >& matrix ) const +{ + return ! ( ( *this ) == matrix ); +} + template< typename Real, typename Device, typename Index > @@ -581,11 +612,57 @@ template< typename Real, template< typename InVector, typename OutVector > void AdEllpack< Real, Device, Index >::vectorProduct( const InVector& inVector, - OutVector& outVector ) const + OutVector& outVector ) const { DeviceDependentCode::vectorProduct( *this, inVector, outVector ); } +// copy assignment +template< typename Real, + typename Device, + typename Index > +AdEllpack< Real, Device, Index >& +AdEllpack< Real, Device, Index >::operator=( const AdEllpack& matrix ) +{ + this->setLike( matrix ); + this->values = matrix.values; + this->columnIndexes = matrix.columnIndexes; + this->offset = matrix.offset; + this->rowOffset = matrix.rowOffset; + this->localLoad = matrix.localLoad; + this->reduceMap = matrix.reduceMap; + this->totalLoad = matrix.totalLoad; + this->warpSize = matrix.warpSize; + return *this; +} + + +// cross-device copy assignment +template< typename Real, + typename Device, + typename Index > + template< typename Real2, typename Device2, typename Index2, typename > +AdEllpack< Real, Device, Index >& +AdEllpack< Real, Device, Index >::operator=( const AdEllpack< Real2, Device2, Index2 >& matrix ) +{ + static_assert( std::is_same< Device, Devices::Host >::value || std::is_same< Device, Devices::Cuda >::value, + "unknown device" ); + static_assert( std::is_same< Device2, Devices::Host >::value || std::is_same< Device2, Devices::Cuda >::value, + "unknown device" ); + + this->setLike( matrix ); + this->values = matrix.values; + this->columnIndexes = matrix.columnIndexes; + this->offset = matrix.offset; + this->rowOffset = matrix.rowOffset; + this->localLoad = matrix.localLoad; + this->reduceMap = matrix.reduceMap; + this->totalLoad = matrix.totalLoad; + this->warpSize = matrix.warpSize; + + return *this; +} + template< typename Real, typename Device, typename Index > @@ -627,7 +704,7 @@ void AdEllpack< Real, Device, Index >::print( std::ostream& str ) const { for( IndexType row = 0; row < this->getRows(); row++ ) { - str << "Row: " << row << " -> \t"; + str << "Row: " << row << " -> "; IndexType warp = this->getWarp( row ); IndexType inWarpOffset = this->getInWarpOffset( row, warp ); @@ -642,8 +719,8 @@ void AdEllpack< Real, Device, Index >::print( std::ostream& str ) const for( IndexType i = 0; i < this->localLoad.getElement( warp ); i++ ) { if( this->columnIndexes.getElement( elementPtr ) != this->getPaddingIndex() ) - str << " column: " << this->columnIndexes.getElement( elementPtr ) << " -> " - << " value: " << this->values.getElement( elementPtr ) << std::endl; + str << " Col:" << this->columnIndexes.getElement( elementPtr ) << "->" + << this->values.getElement( elementPtr ) << "\t"; elementPtr += this->warpSize; } if( ( inWarpOffset < this->warpSize - 1 ) && @@ -658,6 +735,7 @@ void AdEllpack< Real, Device, Index >::print( std::ostream& str ) const else found = true; } + str << std::endl; } } @@ -666,7 +744,7 @@ template< typename Real, typename Index > bool AdEllpack< Real, Device, Index >::balanceLoad( const RealType average, ConstCompressedRowLengthsVectorView rowLengths, - warpList* list ) + warpList< AdEllpack >* list ) { IndexType offset, rowOffset, localLoad, reduceMap[ 32 ]; @@ -749,7 +827,7 @@ bool AdEllpack< Real, Device, Index >::balanceLoad( const RealType average, else { threadsPerRow = ( IndexType ) ( rowLength / ave ) + ( rowLength % ave == 0 ? 0 : 1 ); - if( threadsPerRow < this->warpSize ) + if( threadsPerRow < this->warpSize ) break; localLoad = ave; @@ -781,17 +859,18 @@ template< typename Real, typename Device, typename Index > void AdEllpack< Real, Device, Index >::computeWarps( const IndexType SMs, - const IndexType threadsPerSM, - warpList* list ) + const IndexType threadsPerSM, + warpList< AdEllpack >* list ) { IndexType averageLoad = 0; - warpInfo* temp = list->getHead()->next; - while( temp->next != list->getTail() ) + warpInfo< AdEllpack >* temp = list->getHead()->next; + + while( temp/*->next*/ != list->getTail() ) { averageLoad += temp->localLoad; temp = temp->next; } - averageLoad /= list->getNumberOfWarps(); + averageLoad = roundUpDivision( averageLoad, list->getNumberOfWarps() ); IndexType totalWarps = SMs * ( threadsPerSM / this->warpSize ); IndexType remainingThreads = list->getNumberOfWarps(); @@ -807,7 +886,6 @@ void AdEllpack< Real, Device, Index >::computeWarps( const IndexType SMs, { temp = list->splitInHalf( temp ); warpsToSplit = true; - } temp = temp->next; } @@ -818,7 +896,7 @@ void AdEllpack< Real, Device, Index >::computeWarps( const IndexType SMs, template< typename Real, typename Device, typename Index > -bool AdEllpack< Real, Device, Index >::createArrays( warpList* list ) +bool AdEllpack< Real, Device, Index >::createArrays( warpList< AdEllpack >* list ) { IndexType length = list->getNumberOfWarps(); @@ -828,7 +906,7 @@ bool AdEllpack< Real, Device, Index >::createArrays( warpList* list ) this->reduceMap.setSize( length * this->warpSize ); IndexType iteration = 0; - warpInfo* warp = list->getHead()->next; + warpInfo< AdEllpack >* warp = list->getHead()->next; while( warp != list->getTail() ) { this->offset.setElement( iteration, warp->offset ); @@ -871,7 +949,7 @@ public: OutVector& outVector ) { // parallel vector product simulation - const Index blockSize = 256; + const Index blockSize = 256; const Index blocks = ( Index ) ( matrix.reduceMap.getSize() / blockSize ) + ( matrix.reduceMap.getSize() % blockSize != 0 ); for( Index block = 0; block < blocks; block++ ) { @@ -948,31 +1026,33 @@ void AdEllpack< Real, Device, Index >::spmvCuda2( const InVector& inVector, reduceMap[ threadIdx.x ] = this->reduceMap[ globalIdx ]; temp[ threadIdx.x ] = 0.0; - IndexType i = 0; - IndexType elementPtr = this->offset[ warpIdx ] + inWarpIdx; - for( ; i < this->localLoad[ warpIdx ]; i++ ) + IndexType i = 0; + IndexType elementPtr = this->offset[ warpIdx ] + inWarpIdx; + const IndexType warpLoad = this->localLoad[ warpIdx ]; + + for( ; i < warpLoad; i++ ) + { + if( this->columnIndexes[ elementPtr ] < this->getColumns() ) { - if( this->columnIndexes[ elementPtr ] < this->getColumns() ) - { - temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ]; - elementPtr += this->warpSize; - } + temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ]; + elementPtr += this->warpSize; } + } + if( ( inWarpIdx == 0 ) || ( reduceMap[ threadIdx.x ] > reduceMap[ threadIdx.x - 1 ] ) ) { IndexType elementPtr = threadIdx.x + 1; globalIdx++; - while( //elementPtr < this->reduceMap.getSize() && - globalIdx < ( ( warpIdx + 1 ) << 5 ) && + while( globalIdx < ( ( warpIdx + 1 ) << 5 ) && reduceMap[ elementPtr ] == reduceMap[ threadIdx.x ] ) { - temp[ threadIdx.x ] += temp[ elementPtr ]; - elementPtr++; - globalIdx++; + temp[ threadIdx.x ] += temp[ elementPtr ]; + elementPtr++; + globalIdx++; } outVector[ reduceMap[ threadIdx.x] ] += temp[ threadIdx.x ]; } -} +} template< typename Real, typename Device, @@ -996,40 +1076,46 @@ void AdEllpack< Real, Device, Index >::spmvCuda4( const InVector& inVector, reduceMap[ threadIdx.x ] = this->reduceMap[ globalIdx ]; temp[ threadIdx.x ] = 0.0; - IndexType i = 0; - IndexType elementPtr = this->offset[ warpIdx ] + inWarpIdx; + IndexType i = 0; + IndexType elementPtr = this->offset[ warpIdx ] + inWarpIdx; + const IndexType warpLoad = this->localLoad[ warpIdx ]; - if( ( this->localLoad[ warpIdx ] & 1 ) == 1 ) - if( this->columnIndexes[ elementPtr ] < this->getColumns() ) - { - temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ]; - elementPtr += this->warpSize; - i++; - } - for( ; i < this->localLoad[ warpIdx ]; i += 2 ) + if( ( warpLoad & 1 ) == 1 ) + { + if( this->columnIndexes[ elementPtr ] < this->getColumns() ) { - #pragma unroll - for( IndexType j = 0; j < 2; j++ ) - if( this->columnIndexes[ elementPtr ] < this->getColumns() ) - { - temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ]; - elementPtr += this->warpSize; - } + temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ]; + elementPtr += this->warpSize; + i++; } + } + + for( ; i < warpLoad; i += 2 ) + { + #pragma unroll + for( IndexType j = 0; j < 2; j++ ) + { + if( this->columnIndexes[ elementPtr ] < this->getColumns() ) + { + temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ]; + elementPtr += this->warpSize; + } + } + } if( ( inWarpIdx == 0 ) || ( reduceMap[ threadIdx.x ] > reduceMap[ threadIdx.x - 1 ] ) ) { + IndexType end = ( warpIdx + 1 ) << 5; IndexType elementPtr = threadIdx.x + 1; globalIdx++; - while( //elementPtr < this->reduceMap.getSize() && - globalIdx < ( ( warpIdx + 1 ) << 5 ) && + while( globalIdx < end && reduceMap[ elementPtr ] == reduceMap[ threadIdx.x ] ) { - temp[ threadIdx.x ] += temp[ elementPtr ]; - elementPtr++; - globalIdx++; + temp[ threadIdx.x ] += temp[ elementPtr ]; + elementPtr++; + globalIdx++; } - outVector[ reduceMap[ threadIdx.x] ] += temp[ threadIdx.x ]; + outVector[ reduceMap[ threadIdx.x ] ] += temp[ threadIdx.x ]; } } @@ -1047,7 +1133,7 @@ void AdEllpack< Real, Device, Index >::spmvCuda8( const InVector& inVector, IndexType warpIdx = globalIdx >> 5; IndexType inWarpIdx = globalIdx & ( this->warpSize - 1 ); if( globalIdx >= this->reduceMap.getSize() ) - return; + return; const int blockSize = 128; Real* temp = Cuda::getSharedMemory< Real >(); @@ -1055,39 +1141,59 @@ void AdEllpack< Real, Device, Index >::spmvCuda8( const InVector& inVector, reduceMap[ threadIdx.x ] = this->reduceMap[ globalIdx ]; temp[ threadIdx.x ] = 0.0; - IndexType i = 0; - IndexType elementPtr = this->offset[ warpIdx ] + inWarpIdx; + IndexType i = 0; + IndexType elementPtr = this->offset[ warpIdx ] + inWarpIdx; + const IndexType warpLoad = this->localLoad[ warpIdx ]; - while( ( this->localLoad[ warpIdx ] & 7 ) != 0 ) - if( this->columnIndexes[ elementPtr ] < this->getColumns() ) - { - temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ]; + if( warpLoad < 4 ) + { + while( i < warpLoad && + this->columnIndexes[ elementPtr ] < this->getColumns() ) + { + temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ]; + elementPtr += this->warpSize; + i++; + } + } + else + { + IndexType alignUnroll = warpLoad & 3; + + while( alignUnroll != 0 && + this->columnIndexes[ elementPtr ] < this->getColumns() ) + { + temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ]; elementPtr += this->warpSize; i++; - } - for( ; i < this->localLoad[ warpIdx ]; i += 4 ) + alignUnroll--; + } + } + for( ; i < this->localLoad[ warpIdx ]; i += 4 ) + { + #pragma unroll + for( IndexType j = 0; j < 4; j++ ) { - #pragma unroll - for( IndexType j = 0; j < 4; j++ ) - if( this->columnIndexes[ elementPtr ] < this->getColumns() ) - { - temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ]; - elementPtr += this->warpSize; - } + if( this->columnIndexes[ elementPtr ] < this->getColumns() ) + { + temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ]; + elementPtr += this->warpSize; + } } + } + if( ( inWarpIdx == 0 ) || ( reduceMap[ threadIdx.x ] > reduceMap[ threadIdx.x - 1 ] ) ) { + IndexType end = ( warpIdx + 1 ) << 5; IndexType elementPtr = threadIdx.x + 1; globalIdx++; - while( //elementPtr < this->reduceMap.getSize() && - globalIdx < ( ( warpIdx + 1 ) << 5 ) && + while( globalIdx < end && reduceMap[ elementPtr ] == reduceMap[ threadIdx.x ] ) { - temp[ threadIdx.x ] += temp[ elementPtr ]; - elementPtr++; - globalIdx++; + temp[ threadIdx.x ] += temp[ elementPtr ]; + elementPtr++; + globalIdx++; } - outVector[ reduceMap[ threadIdx.x] ] += temp[ threadIdx.x ]; + outVector[ reduceMap[ threadIdx.x ] ] += temp[ threadIdx.x ]; } } @@ -1098,14 +1204,14 @@ template< typename InVector, typename OutVector > __device__ void AdEllpack< Real, Device, Index >::spmvCuda16( const InVector& inVector, - OutVector& outVector, - const int gridIdx ) const + OutVector& outVector, + const int gridIdx ) const { IndexType globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; IndexType warpIdx = globalIdx >> 5; IndexType inWarpIdx = globalIdx & ( this->warpSize - 1 ); if( globalIdx >= this->reduceMap.getSize() ) - return; + return; const int blockSize = 128; Real* temp = Cuda::getSharedMemory< Real >(); @@ -1113,39 +1219,60 @@ void AdEllpack< Real, Device, Index >::spmvCuda16( const InVector& inVector, reduceMap[ threadIdx.x ] = this->reduceMap[ globalIdx ]; temp[ threadIdx.x ] = 0.0; - IndexType i = 0; - IndexType elementPtr = this->offset[ warpIdx ] + inWarpIdx; + IndexType i = 0; + IndexType elementPtr = this->offset[ warpIdx ] + inWarpIdx; + const IndexType warpLoad = this->localLoad[ warpIdx ]; - while( ( this->localLoad[ warpIdx ] & 15 ) != 0 ) - if( this->columnIndexes[ elementPtr ] < this->getColumns() ) - { - temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ]; - elementPtr += this->warpSize; - i++; - } - for( ; i < this->localLoad[ warpIdx ]; i += 8 ) + if( warpLoad < 8 ) + { + while( i < warpLoad && + this->columnIndexes[ elementPtr ] < this->getColumns() ) + { + temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ]; + elementPtr += this->warpSize; + i++; + } + } + else + { + IndexType alignUnroll = warpLoad & 7; + + while( alignUnroll != 0 && + this->columnIndexes[ elementPtr ] < this->getColumns() ) { - #pragma unroll - for( IndexType j = 0; j < 8; j++ ) - if( this->columnIndexes[ elementPtr ] < this->getColumns() ) - { - temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ]; - elementPtr += this->warpSize; - } + temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ]; + elementPtr += this->warpSize; + i++; + alignUnroll--; + } + } + + for( ; i < warpLoad; i += 8 ) + { + #pragma unroll + for( IndexType j = 0; j < 8; j++ ) + { + if( this->columnIndexes[ elementPtr ] < this->getColumns() ) + { + temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ]; + elementPtr += this->warpSize; + } } + } + if( ( inWarpIdx == 0 ) || ( reduceMap[ threadIdx.x ] > reduceMap[ threadIdx.x - 1 ] ) ) { + IndexType end = ( warpIdx + 1 ) << 5; IndexType elementPtr = threadIdx.x + 1; globalIdx++; - while( //elementPtr < this->reduceMap.getSize() && - globalIdx < ( ( warpIdx + 1 ) << 5 ) && + while( globalIdx < end && reduceMap[ elementPtr ] == reduceMap[ threadIdx.x ] ) { - temp[ threadIdx.x ] += temp[ elementPtr ]; - elementPtr++; - globalIdx++; + temp[ threadIdx.x ] += temp[ elementPtr ]; + elementPtr++; + globalIdx++; } - outVector[ reduceMap[ threadIdx.x] ] += temp[ threadIdx.x ]; + outVector[ reduceMap[ threadIdx.x ] ] += temp[ threadIdx.x ]; } } @@ -1170,37 +1297,59 @@ void AdEllpack< Real, Device, Index >::spmvCuda32( const InVector& inVector, __shared__ IndexType reduceMap[ blockSize ]; reduceMap[ threadIdx.x ] = this->reduceMap[ globalIdx ]; temp[ threadIdx.x ] = 0.0; - IndexType i = 0; - IndexType elementPtr = this->offset[ warpIdx ] + inWarpIdx; - while( ( this->localLoad[ warpIdx ] & 31 ) != 0 ) - if( this->columnIndexes[ elementPtr ] < this->getColumns() ) - { - temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ]; - elementPtr += this->warpSize; - i++; - } - for( ; i < this->localLoad[ warpIdx ]; i += 16 ) + IndexType i = 0; + IndexType elementPtr = this->offset[ warpIdx ] + inWarpIdx; + const IndexType warpLoad = this->localLoad[ warpIdx ]; + + if( warpLoad < 16 ) + { + while( i < warpLoad && + this->columnIndexes[ elementPtr ] < this->getColumns() ) { - #pragma unroll - for( IndexType j = 0; j < 16; j++ ) - if( this->columnIndexes[ elementPtr ] < this->getColumns() ) - { - temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ]; - elementPtr += this->warpSize; - } + temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ]; + elementPtr += this->warpSize; + i++; } + } + else + { + IndexType alignUnroll = warpLoad & 15; + + while( alignUnroll != 0 && + this->columnIndexes[ elementPtr ] < this->getColumns() ) + { + temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ]; + elementPtr += this->warpSize; + i++; + alignUnroll--; + } + } + + for( ; i < warpLoad; i += 16 ) + { + #pragma unroll + for( IndexType j = 0; j < 16; j++ ) + { + if( this->columnIndexes[ elementPtr ] < this->getColumns() ) + { + temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ]; + elementPtr += this->warpSize; + } + } + } + if( ( inWarpIdx == 0 ) || ( reduceMap[ threadIdx.x ] > reduceMap[ threadIdx.x - 1 ] ) ) { + IndexType end = ( warpIdx + 1 ) << 5; IndexType elementPtr = threadIdx.x + 1; globalIdx++; - while( //elementPtr < this->reduceMap.getSize() && - globalIdx < ( ( warpIdx + 1 ) << 5 ) && + while( globalIdx < end && reduceMap[ elementPtr ] == reduceMap[ threadIdx.x ] ) { - temp[ threadIdx.x ] += temp[ elementPtr ]; - elementPtr++; - globalIdx++; + temp[ threadIdx.x ] += temp[ elementPtr ]; + elementPtr++; + globalIdx++; } outVector[ reduceMap[ threadIdx.x] ] += temp[ threadIdx.x ]; } @@ -1274,7 +1423,6 @@ void AdEllpackVectorProductCuda32( const AdEllpack< Real, Devices::Cuda, Index > } #endif -#ifdef HAVE_CUDA template<> class AdEllpackDeviceDependentCode< Devices::Cuda > { @@ -1290,13 +1438,16 @@ public: const InVector& inVector, OutVector& outVector ) { - typedef AdEllpack< Real, Devices::Cuda, Index > Matrix; - typedef typename Matrix::IndexType IndexType; - Matrix* kernel_this = Cuda::passToDevice( matrix ); - InVector* kernel_inVector = Cuda::passToDevice( inVector ); - OutVector* kernel_outVector = Cuda::passToDevice( outVector ); - if( matrix.totalLoad < 2 ) - { +#ifdef HAVE_CUDA + typedef AdEllpack< Real, Devices::Cuda, Index > Matrix; + typedef typename Matrix::IndexType IndexType; + Matrix* kernel_this = Cuda::passToDevice( matrix ); + InVector* kernel_inVector = Cuda::passToDevice( inVector ); + OutVector* kernel_outVector = Cuda::passToDevice( outVector ); + TNL_CHECK_CUDA_DEVICE; + + if( matrix.totalLoad < 2 ) + { dim3 blockSize( 256 ), cudaGridSize( Cuda::getMaxGridSize() ); IndexType cudaBlocks = roundUpDivision( matrix.reduceMap.getSize(), blockSize.x ); IndexType cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() ); @@ -1369,7 +1520,7 @@ public: dim3 blockSize( 128 ), cudaGridSize( Cuda::getMaxGridSize() ); IndexType cudaBlocks = roundUpDivision( matrix.reduceMap.getSize(), blockSize.x ); IndexType cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() ); - for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) + for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) { if( gridIdx == cudaGrids - 1 ) cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize(); @@ -1410,11 +1561,11 @@ public: Cuda::freeFromDevice( kernel_outVector ); TNL_CHECK_CUDA_DEVICE; } - } - +#endif // HAVE_CUDA + } }; -#endif + } // namespace Matrices } // namespace TNL diff --git a/src/TNL/Matrices/BiEllpack.h b/src/TNL/Matrices/BiEllpack.h index cfc132ccd56318a0c160d6eab943fc5de90b7c7c..3ec4b662fe19979939006a5cd011d037501fdb10 100644 --- a/src/TNL/Matrices/BiEllpack.h +++ b/src/TNL/Matrices/BiEllpack.h @@ -28,9 +28,19 @@ namespace TNL { template< typename Device > class BiEllpackDeviceDependentCode; -template< typename Real, typename Device = Devices::Cuda, typename Index = int, int StripSize = 32 > +template< typename Real, typename Device, typename Index > class BiEllpack : public Sparse< Real, Device, Index > { +private: + + // convenient template alias for controlling the selection of copy-assignment operator + template< typename Device2 > + using Enabler = std::enable_if< ! std::is_same< Device2, Device >::value >; + + // friend class will be needed for templated assignment operators + template< typename Real2, typename Device2, typename Index2 > + friend class BiEllpack; + public: typedef Real RealType; typedef Device DeviceType; @@ -57,7 +67,15 @@ public: template< typename Real2, typename Device2, typename Index2 > - bool setLike( const BiEllpack< Real2, Device2, Index2, StripSize >& matrix ); + void setLike( const BiEllpack< Real2, Device2, Index2 >& matrix ); + + void reset(); + + template< typename Real2, typename Device2, typename Index2 > + bool operator == ( const BiEllpack< Real2, Device2, Index2 >& matrix ) const; + + template< typename Real2, typename Device2, typename Index2 > + bool operator != ( const BiEllpack< Real2, Device2, Index2 >& matrix ) const; void getRowLengths( CompressedRowLengthsVector& rowLengths ) const; @@ -124,8 +142,14 @@ public: IndexType getNumberOfGroups( const IndexType row ) const; bool vectorProductTest() const; + + // copy assignment + BiEllpack& operator=( const BiEllpack& matrix ); - void reset(); + // cross-device copy assignment + template< typename Real2, typename Device2, typename Index2, + typename = typename Enabler< Device2 >::type > + BiEllpack& operator=( const BiEllpack< Real2, Device2, Index2 >& matrix ); void save( File& file ) const; @@ -136,11 +160,13 @@ public: void load( const String& fileName ); void print( std::ostream& str ) const; + + void printValues() const; void performRowBubbleSort( Containers::Vector< Index, Device, Index >& tempRowLengths ); void computeColumnSizes( Containers::Vector< Index, Device, Index >& tempRowLengths ); -// void verifyRowLengths( const typename BiEllpack< Real, Device, Index, StripSize >::CompressedRowLengthsVector& rowLengths ); +// void verifyRowLengths( const typename BiEllpack< Real, Device, Index >::CompressedRowLengthsVector& rowLengths ); template< typename InVector, typename OutVector > @@ -157,11 +183,11 @@ public: IndexType getStripLength( const IndexType strip ) const; __cuda_callable__ - void performRowBubbleSortCudaKernel( const typename BiEllpack< Real, Device, Index, StripSize >::CompressedRowLengthsVector& rowLengths, + void performRowBubbleSortCudaKernel( const typename BiEllpack< Real, Device, Index >::CompressedRowLengthsVector& rowLengths, const IndexType strip ); __cuda_callable__ - void computeColumnSizesCudaKernel( const typename BiEllpack< Real, Device, Index, StripSize >::CompressedRowLengthsVector& rowLengths, + void computeColumnSizesCudaKernel( const typename BiEllpack< Real, Device, Index >::CompressedRowLengthsVector& rowLengths, const IndexType numberOfStrips, const IndexType strip ); @@ -171,6 +197,8 @@ public: typedef BiEllpackDeviceDependentCode< DeviceType > DeviceDependentCode; friend class BiEllpackDeviceDependentCode< DeviceType >; + friend class BiEllpack< RealType, Devices::Host, IndexType >; + friend class BiEllpack< RealType, Devices::Cuda, IndexType >; private: diff --git a/src/TNL/Matrices/BiEllpackSymmetric_impl.h b/src/TNL/Matrices/BiEllpackSymmetric_impl.h index 0af180c0e8c2c54d2c4fdb304fa3e2813d76786c..a27497b3f67b97ae33394a957775c30abd6281d2 100644 --- a/src/TNL/Matrices/BiEllpackSymmetric_impl.h +++ b/src/TNL/Matrices/BiEllpackSymmetric_impl.h @@ -45,6 +45,30 @@ BiEllpackSymmetric< Real, Device, Index, StripSize >::BiEllpackSymmetric() logWarpSize( 5 ) {} +template< typename Real, + typename Device, + typename Index, + int StripSize > +String BiEllpackSymmetric< Real, Device, Index, StripSize >::getType() +{ + return String( "Matrices::BiEllpackMatrix< ") + + String( TNL::getType< Real >() ) + + String( ", " ) + + String( Device :: getDeviceType() ) + + String( ", " ) + + String( TNL::getType< Index >() ) + + String( " >" ); +} + +template< typename Real, + typename Device, + typename Index, + int StripSize > +String BiEllpackSymmetric< Real, Device, Index, StripSize >::getTypeVirtual() const +{ + return this->getType(); +} + template< typename Real, typename Device, typename Index, diff --git a/src/TNL/Matrices/BiEllpack_impl.h b/src/TNL/Matrices/BiEllpack_impl.h index 51646152e8e62d8c26fb55a961869b8acef7826e..c659b758e9cffe531a101baf8fe3cd812436fe2c 100644 --- a/src/TNL/Matrices/BiEllpack_impl.h +++ b/src/TNL/Matrices/BiEllpack_impl.h @@ -22,11 +22,10 @@ namespace TNL { template< typename Real, typename Device, - typename Index, - int StripSize > + typename Index> __cuda_callable__ -Index BiEllpack< Real, Device, Index, StripSize >::power( const IndexType number, - const IndexType exponent ) const +Index BiEllpack< Real, Device, Index >::power( const IndexType number, + const IndexType exponent ) const { if( exponent >= 0 ) { @@ -40,19 +39,17 @@ Index BiEllpack< Real, Device, Index, StripSize >::power( const IndexType number template< typename Real, typename Device, - typename Index, - int StripSize > -BiEllpack< Real, Device, Index, StripSize >::BiEllpack() + typename Index > +BiEllpack< Real, Device, Index >::BiEllpack() : warpSize( 32 ), logWarpSize( 5 ) {} template< typename Real, typename Device, - typename Index, - int StripSize > + typename Index > void -BiEllpack< Real, Device, Index, StripSize >:: +BiEllpack< Real, Device, Index >:: setDimensions( const IndexType rows, const IndexType columns ) { TNL_ASSERT( rows >= 0 && columns >= 0, std::cerr << "rows = " << rows << "columns = " << columns << std::endl ); @@ -73,47 +70,46 @@ setDimensions( const IndexType rows, const IndexType columns ) template< typename Real, typename Device, - typename Index, - int StripSize > + typename Index > void -BiEllpack< Real, Device, Index, StripSize >:: -setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths ) +BiEllpack< Real, Device, Index >:: +setCompressedRowLengths( ConstCompressedRowLengthsVectorView constRowLengths ) { - if( this->getRows() % this->warpSize != 0 ) - this->setVirtualRows( this->getRows() + this->warpSize - ( this->getRows() % this->warpSize ) ); - else - this->setVirtualRows( this->getRows() ); - IndexType strips = this->virtualRows / this->warpSize; - this->rowPermArray.setSize( this->rows ); - this->groupPointers.setSize( strips * ( this->logWarpSize + 1 ) + 1 ); - - for( IndexType i = 0; i < this->groupPointers.getSize(); i++ ) - this->groupPointers.setElement( i, 0 ); - - // FIXME: cannot sort a const vector! - //DeviceDependentCode::performRowBubbleSort( *this, rowLengths ); - //DeviceDependentCode::computeColumnSizes( *this, rowLengths ); - - this->groupPointers.template scan< Algorithms::ScanType::Exclusive >(); - - // uncomment to perform structure test - //DeviceDependentCode::verifyRowPerm( *this, rowLengths ); - //DeviceDependentCode::verifyRowLengths( *this, rowLengths ); - - return - this->allocateMatrixElements( this->warpSize * this->groupPointers.getElement( strips * ( this->logWarpSize + 1 ) ) ); + CompressedRowLengthsVector rowLengths; + rowLengths.reset(); + rowLengths.setLike( constRowLengths ); + + rowLengths = constRowLengths; + + if( this->getRows() % this->warpSize != 0 ) + this->setVirtualRows( this->getRows() + this->warpSize - ( this->getRows() % this->warpSize ) ); + else + this->setVirtualRows( this->getRows() ); + IndexType strips = this->virtualRows / this->warpSize; + this->rowPermArray.setSize( this->rows ); + this->groupPointers.setSize( strips * ( this->logWarpSize + 1 ) + 1 ); + + this->groupPointers.setValue( 0 ); + + DeviceDependentCode::performRowBubbleSort( *this, rowLengths ); + DeviceDependentCode::computeColumnSizes( *this, rowLengths ); + + //this->groupPointers.computeExclusivePrefixSum(); + this->groupPointers.template scan< Algorithms::ScanType::Exclusive >(); + + DeviceDependentCode::verifyRowPerm( *this, rowLengths ); + DeviceDependentCode::verifyRowLengths( *this, rowLengths ); + + return this->allocateMatrixElements( this->warpSize * this->groupPointers.getElement( strips * ( this->logWarpSize + 1 ) ) ); } template< typename Real, typename Device, - typename Index, - int StripSize > + typename Index > __cuda_callable__ -Index BiEllpack< Real, Device, Index, StripSize >::getStripLength( const IndexType strip ) const +Index BiEllpack< Real, Device, Index >::getStripLength( const IndexType strip ) const { - TNL_ASSERT( strip >= 0, - "strip = " << strip - << " this->getName() = " << this->getName() ); + TNL_ASSERT( strip >= 0, std::cerr << "strip = " << strip ); return this->groupPointers.getElement( ( strip + 1 ) * ( this->logWarpSize + 1 ) ) - this->groupPointers.getElement( strip * ( this->logWarpSize + 1 ) ); @@ -121,20 +117,18 @@ Index BiEllpack< Real, Device, Index, StripSize >::getStripLength( const IndexTy template< typename Real, typename Device, - typename Index, - int StripSize > + typename Index > __cuda_callable__ -Index BiEllpack< Real, Device, Index, StripSize >::getNumberOfGroups( const IndexType row ) const +Index BiEllpack< Real, Device, Index >::getNumberOfGroups( const IndexType row ) const { - TNL_ASSERT( row >=0 && row < this->getRows(), + TNL_ASSERT( row >= 0 && row < this->getRows(), std::cerr << "row = " << row - << " this->getRows() = " << this->getRows() - << " this->getName() = " << std::endl; ); + << " this->getRows() = " << this->getRows() ); IndexType strip = row / this->warpSize; - IndexType rowStripPermutation = this->rowPermArray[ row ] - this->warpSize * strip; + IndexType rowStripPermutation = this->rowPermArray.getElement( row ) - this->warpSize * strip; IndexType numberOfGroups = this->logWarpSize + 1; - IndexType bisection = 1; + IndexType bisection = 1; for( IndexType i = 0; i < this->logWarpSize + 1; i++ ) { if( rowStripPermutation < bisection ) @@ -151,13 +145,11 @@ Index BiEllpack< Real, Device, Index, StripSize >::getNumberOfGroups( const Inde template< typename Real, typename Device, - typename Index, - int StripSize > -Index BiEllpack< Real, Device, Index, StripSize >::getRowLength( const IndexType row ) const + typename Index > +Index BiEllpack< Real, Device, Index >::getRowLength( const IndexType row ) const { TNL_ASSERT( row >= 0 && row < this->getRows(), - std::cerr << "row = " << row << " this->getRows() = " << this->getRows() - << " this->getName() = " << std::endl; ); + std::cerr << "row = " << row << " this->getRows() = " << this->getRows() ); const IndexType strip = row / this->warpSize; const IndexType groupBegin = strip * ( this->logWarpSize + 1 ); @@ -185,84 +177,117 @@ Index BiEllpack< Real, Device, Index, StripSize >::getRowLength( const IndexType template< typename Real, typename Device, - typename Index, - int StripSize > + typename Index > template< typename Real2, typename Device2, typename Index2 > -bool BiEllpack< Real, Device, Index, StripSize >::setLike( const BiEllpack< Real2, Device2, Index2, StripSize >& matrix ) +void BiEllpack< Real, Device, Index >::setLike( const BiEllpack< Real2, Device2, Index2 >& matrix ) +{ + Sparse< Real, Device, Index >::setLike( matrix ); + this->rowPermArray.setLike( matrix.rowPermArray ); + this->groupPointers.setLike( matrix.groupPointers ); +} + +template< typename Real, + typename Device, + typename Index > +void BiEllpack< Real, Device, Index >::reset() +{ + Sparse< Real, Device, Index >::reset(); + this->rowPermArray.reset(); + this->groupPointers.reset(); +} + +template< typename Real, + typename Device, + typename Index > + template< typename Real2, + typename Device2, + typename Index2 > +bool BiEllpack< Real, Device, Index >::operator == ( const BiEllpack< Real2, Device2, Index2 >& matrix ) const { - std::cout << "setLike" << std::endl; - std::cout << "settingLike" << std::endl; - if( ! Sparse< Real, Device, Index >::setLike( matrix ) || - ! this->rowPermArray.setLike( matrix.rowPermArray ) || - ! this->groupPointers.setLike( matrix.groupPointers ) ) - return false; - return true; + TNL_ASSERT( this->getRows() == matrix.getRows() && + this->getColumns() == matrix.getColumns(), + std::cerr << "this->getRows() = " << this->getRows() + << " matrix.getRows() = " << matrix.getRows() + << " this->getColumns() = " << this->getColumns() + << " matrix.getColumns() = " << matrix.getColumns() ); + + TNL_ASSERT_TRUE( false, "operator == is not yet implemented for BiEllpack."); + + // TODO: implement this + return false; +} + +template< typename Real, + typename Device, + typename Index > + template< typename Real2, + typename Device2, + typename Index2 > +bool BiEllpack< Real, Device, Index >::operator != ( const BiEllpack< Real2, Device2, Index2 >& matrix ) const +{ + return ! ( ( *this ) == matrix ); } template< typename Real, typename Device, - typename Index, - int StripSize > -void BiEllpack< Real, Device, Index, StripSize >::getRowLengths( CompressedRowLengthsVector& rowLengths) const + typename Index > +void BiEllpack< Real, Device, Index >::getRowLengths( CompressedRowLengthsVector& rowLengths) const { + // WHAT IS THIS??! + // It's called getRowLengths, but takes an argument that it fill up with this matrix's row lengths??? for( IndexType row = 0; row < this->getRows(); row++ ) rowLengths.setElement( row, this->getRowLength( row ) ); } template< typename Real, typename Device, - typename Index, - int StripSize > + typename Index > bool -BiEllpack< Real, Device, Index, StripSize >:: +BiEllpack< Real, Device, Index >:: setElement( const IndexType row, const IndexType column, const RealType& value ) { - TNL_ASSERT( ( row >= 0 && row < this->getRows() ) || - ( column >= 0 && column < this->getColumns() ), - std::cerr << "row = " << row - << " this->getRows() = " << this->getRows() - << " this->getColumns() = " << this->getColumns() - << " this->getName() = " << std::endl; ); + TNL_ASSERT( ( row >= 0 && row < this->getRows() ) || + ( column >= 0 && column < this->getColumns() ), + std::cerr << "row = " << row + << " this->getRows() = " << this->getRows() + << " this->getColumns() = " << this->getColumns() ); - return this->addElement( row, column, value, 0.0 ); + return this->addElement( row, column, value, 0.0 ); } template< typename Real, typename Device, - typename Index, - int StripSize > + typename Index > __cuda_callable__ -bool BiEllpack< Real, Device, Index, StripSize >::setElementFast( const IndexType row, - const IndexType column, - const RealType& value ) +bool BiEllpack< Real, Device, Index >::setElementFast( const IndexType row, + const IndexType column, + const RealType& value ) { TNL_ASSERT( ( row >= 0 && row < this->getRows() ) || ( column >= 0 && column < this->getColumns() ), std::cerr << "row = " << row << " this->getRows() = " << this->getRows() - << " this->getColumns() = " << this->getColumns() - << " this->getName() = " << this->getName() << std::endl ); + << " this->getColumns() = " << this->getColumns() ); return this->addElementFast( row, column, value, 0.0 ); } template< typename Real, typename Device, - typename Index, - int StripSize > -bool BiEllpack< Real, Device, Index, StripSize >::addElement( const IndexType row, - const IndexType column, - const RealType& value, - const RealType& thisElementMultiplicator ) + typename Index > +bool BiEllpack< Real, Device, Index >::addElement( const IndexType row, + const IndexType column, + const RealType& value, + const RealType& thisElementMultiplicator ) { - const IndexType strip = row / this->warpSize; - const IndexType groupBegin = strip * ( this->logWarpSize + 1 ); - const IndexType rowStripPerm = this->rowPermArray.getElement( row ) - strip * this->warpSize; - IndexType elementPtr = this->groupPointers.getElement( groupBegin ) * this->warpSize + rowStripPerm; + const IndexType strip = row / this->warpSize; + const IndexType groupBegin = strip * ( this->logWarpSize + 1 ); + const IndexType rowStripPerm = this->rowPermArray.getElement( row ) - strip * this->warpSize; + IndexType elementPtr = this->groupPointers.getElement( groupBegin ) * this->warpSize + rowStripPerm; IndexType rowMultiplicator = 1; IndexType step = this->warpSize; @@ -278,7 +303,7 @@ bool BiEllpack< Real, Device, Index, StripSize >::addElement( const IndexType ro } if( this->columnIndexes.getElement( elementPtr ) == column ) { - this->values.setElement( elementPtr, this->values.getElement( elementPtr ) + value * thisElementMultiplicator ); + this->values.setElement( elementPtr, value + thisElementMultiplicator * this->values.getElement( elementPtr ) ); return true; } elementPtr += step; @@ -291,13 +316,12 @@ bool BiEllpack< Real, Device, Index, StripSize >::addElement( const IndexType ro template< typename Real, typename Device, - typename Index, - int StripSize > + typename Index > __cuda_callable__ -bool BiEllpack< Real, Device, Index, StripSize >::addElementFast( const IndexType row, - const IndexType column, - const RealType& value, - const RealType& thisElementMultiplicator ) +bool BiEllpack< Real, Device, Index >::addElementFast( const IndexType row, + const IndexType column, + const RealType& value, + const RealType& thisElementMultiplicator ) { const IndexType strip = row / this->warpSize; const IndexType groupBegin = strip * ( this->logWarpSize + 1 ); @@ -333,7 +357,7 @@ bool BiEllpack< Real, Device, Index, StripSize >::addElementFast( const IndexTyp } if( this->columnIndexes[ elementPtr ] == column ) { - this->values[ elementPtr ] += value * thisElementMultiplicator ; + this->values[ elementPtr ] = thisElementMultiplicator * this->values[ elementPtr ] + value ; return true; } elementPtr += step; @@ -346,18 +370,16 @@ bool BiEllpack< Real, Device, Index, StripSize >::addElementFast( const IndexTyp template< typename Real, typename Device, - typename Index, - int StripSize > + typename Index > bool -BiEllpack< Real, Device, Index, StripSize >:: +BiEllpack< Real, Device, Index >:: setRow( const IndexType row, const IndexType* columns, const RealType* values, const IndexType numberOfElements ) { TNL_ASSERT( row >= 0 && row < this->getRows(), - std::cerr <<"row = " << row << " this->getRows() = " << this->getRows() - << " this->getName() = " << std::endl; ); + std::cerr <<"row = " << row << " this->getRows() = " << this->getRows() ); const IndexType strip = row / this->warpSize; const IndexType groupBegin = strip * ( this->logWarpSize + 1 ); @@ -386,10 +408,9 @@ setRow( const IndexType row, template< typename Real, typename Device, - typename Index, - int StripSize > + typename Index > bool -BiEllpack< Real, Device, Index, StripSize >:: +BiEllpack< Real, Device, Index >:: addRow( const IndexType row, const IndexType* columns, const RealType* values, @@ -397,8 +418,7 @@ addRow( const IndexType row, const RealType& thisElementMultiplicator ) { TNL_ASSERT( row >=0 && row < this->getRows(), - std::cerr << "row = " << row << " this->getRows() = " << this->getRows() - << " this->getName() = " << std::endl ); + std::cerr << "row = " << row << " this->getRows() = " << this->getRows() ); const IndexType strip = row / this->warpSize; const IndexType groupBegin = strip * ( this->logWarpSize + 1 ); @@ -431,17 +451,15 @@ addRow( const IndexType row, template< typename Real, typename Device, - typename Index, - int StripSize > -Real BiEllpack< Real, Device, Index, StripSize >::getElement( const IndexType row, - const IndexType column ) const + typename Index > +Real BiEllpack< Real, Device, Index >::getElement( const IndexType row, + const IndexType column ) const { TNL_ASSERT( ( row >= 0 && row < this->getRows() ) || ( column >= 0 && column < this->getColumns() ), std::cerr << "row = " << row << " this->getRows() = " << this->getRows() - << " this->getColumns() = " << this->getColumns() - << "this->getName() = " << std::endl ); + << " this->getColumns() = " << this->getColumns() ); const IndexType strip = row / this->warpSize; const IndexType groupBegin = strip * ( this->logWarpSize + 1 ); @@ -466,11 +484,10 @@ Real BiEllpack< Real, Device, Index, StripSize >::getElement( const IndexType ro template< typename Real, typename Device, - typename Index, - int StripSize > + typename Index > __cuda_callable__ -Real BiEllpack< Real, Device, Index, StripSize >::getElementFast( const IndexType row, - const IndexType column ) const +Real BiEllpack< Real, Device, Index >::getElementFast( const IndexType row, + const IndexType column ) const { const IndexType strip = row / this->warpSize; const IndexType groupBegin = strip * ( this->logWarpSize + 1 ); @@ -510,16 +527,14 @@ Real BiEllpack< Real, Device, Index, StripSize >::getElementFast( const IndexTyp template< typename Real, typename Device, - typename Index, - int StripSize > -void BiEllpack< Real, Device, Index, StripSize >::getRow( const IndexType row, - IndexType* columns, - RealType* values ) const + typename Index > +void BiEllpack< Real, Device, Index >::getRow( const IndexType row, + IndexType* columns, + RealType* values ) const { TNL_ASSERT( row >=0 && row < this->getRows(), std::cerr << "row = " << row - << " this->getRows() = " << this->getRows() - << " this->getName() = " << this->getName() << std::endl ); + << " this->getRows() = " << this->getRows() ); bool padding = false; const IndexType strip = row / this->warpSize; @@ -551,45 +566,41 @@ void BiEllpack< Real, Device, Index, StripSize >::getRow( const IndexType row, template< typename Real, typename Device, - typename Index, - int StripSize > -void BiEllpack< Real, Device, Index, StripSize >::setVirtualRows(const IndexType rows) + typename Index > +void BiEllpack< Real, Device, Index >::setVirtualRows(const IndexType rows) { this->virtualRows = rows; } template< typename Real, typename Device, - typename Index, - int StripSize > + typename Index > __cuda_callable__ -Index BiEllpack< Real, Device, Index, StripSize >::getGroupLength( const Index strip, - const Index group ) const +Index BiEllpack< Real, Device, Index >::getGroupLength( const Index strip, + const Index group ) const { - return this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group + 1 ] - - this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group ]; + return this->groupPointers.getElement( strip * ( this->logWarpSize + 1 ) + group + 1 ) + - this->groupPointers.getElement( strip * ( this->logWarpSize + 1 ) + group ); } template< typename Real, typename Device, - typename Index, - int StripSize > + typename Index > template< typename InVector, typename OutVector > -void BiEllpack< Real, Device, Index, StripSize >::vectorProduct( const InVector& inVector, - OutVector& outVector ) const +void BiEllpack< Real, Device, Index >::vectorProduct( const InVector& inVector, + OutVector& outVector ) const { DeviceDependentCode::vectorProduct( *this, inVector, outVector ); } template< typename Real, typename Device, - typename Index, - int StripSize > + typename Index > template< typename InVector, typename OutVector > -void BiEllpack< Real, Device, Index, StripSize >::vectorProductHost( const InVector& inVector, - OutVector& outVector ) const +void BiEllpack< Real, Device, Index >::vectorProductHost( const InVector& inVector, + OutVector& outVector ) const { const IndexType cudaBlockSize = 256; const IndexType cudaBlocks = roundUpDivision( this->getRows(), cudaBlockSize ); @@ -644,22 +655,52 @@ void BiEllpack< Real, Device, Index, StripSize >::vectorProductHost( const InVec } } +// copy assignment template< typename Real, - typename Device, - typename Index, - int StripSize > -void BiEllpack< Real, Device, Index, StripSize >::reset() + typename Device, + typename Index > +BiEllpack< Real, Device, Index >& +BiEllpack< Real, Device, Index >::operator=( const BiEllpack& matrix ) { - Sparse< Real, Device, Index >::reset(); - this->rowPermArray.reset(); - this->groupPointers.reset(); + this->setLike( matrix ); + this->values = matrix.values; + this->columnIndexes = matrix.columnIndexes; + this->warpSize = matrix.warpSize; + this->logWarpSize = matrix.logWarpSize; + this->virtualRows = matrix.virtualRows; + this->rowPermArray = matrix.rowPermArray; + this->groupPointers = matrix.groupPointers; + return *this; +} + +// cross-device copy assignment +template< typename Real, + typename Device, + typename Index > + template< typename Real2, typename Device2, typename Index2, typename > +BiEllpack< Real, Device, Index >& +BiEllpack< Real, Device, Index >::operator=( const BiEllpack< Real2, Device2, Index2 >& matrix ) +{ + static_assert( std::is_same< Device, Devices::Host >::value || std::is_same< Device, Devices::Cuda >::value, + "unknown device" ); + static_assert( std::is_same< Device2, Devices::Host >::value || std::is_same< Device2, Devices::Cuda >::value, + "unknown device" ); + + this->setLike( matrix ); + this->values = matrix.values; + this->columnIndexes = matrix.columnIndexes; + this->warpSize = matrix.warpSize; + this->logWarpSize = matrix.logWarpSize; + this->virtualRows = matrix.virtualRows; + this->rowPermArray = matrix.rowPermArray; + this->groupPointers = matrix.groupPointers; + return *this; } template< typename Real, typename Device, - typename Index, - int StripSize > -void BiEllpack< Real, Device, Index, StripSize >::save( File& file ) const + typename Index > +void BiEllpack< Real, Device, Index >::save( File& file ) const { Sparse< Real, Device, Index >::save( file ); file << this->groupPointers << this->rowPermArray; @@ -667,9 +708,8 @@ void BiEllpack< Real, Device, Index, StripSize >::save( File& file ) const template< typename Real, typename Device, - typename Index, - int StripSize > -void BiEllpack< Real, Device, Index, StripSize >::load( File& file ) + typename Index > +void BiEllpack< Real, Device, Index >::load( File& file ) { Sparse< Real, Device, Index >::load( file ); file >> this->groupPointers >> this->rowPermArray; @@ -677,31 +717,29 @@ void BiEllpack< Real, Device, Index, StripSize >::load( File& file ) template< typename Real, typename Device, - typename Index, - int StripSize > -void BiEllpack< Real, Device, Index, StripSize >::save( const String& fileName ) const + typename Index > +void BiEllpack< Real, Device, Index >::save( const String& fileName ) const { Object::save( fileName ); } template< typename Real, typename Device, - typename Index, - int StripSize > -void BiEllpack< Real, Device, Index, StripSize >::load( const String& fileName ) + typename Index > +void BiEllpack< Real, Device, Index >::load( const String& fileName ) { Object::load( fileName ); } template< typename Real, typename Device, - typename Index, - int StripSize > -void BiEllpack< Real, Device, Index, StripSize >::print( std::ostream& str ) const + typename Index > +void BiEllpack< Real, Device, Index >::print( std::ostream& str ) const { for( IndexType row = 0; row < this->getRows(); row++ ) { str <<"Row: " << row << " -> "; +// str << row << ": "; bool padding = false; const IndexType strip = row / this->warpSize; const IndexType groupBegin = strip * ( this->logWarpSize + 1 ); @@ -722,6 +760,7 @@ void BiEllpack< Real, Device, Index, StripSize >::print( std::ostream& str ) con RealType value = this->values.getElement( elementPtr ); IndexType column = this->columnIndexes.getElement( elementPtr ); str << " Col:" << column << "->" << value << "\t"; +// str << value << " "; elementPtr += step; } step /= 2; @@ -731,11 +770,30 @@ void BiEllpack< Real, Device, Index, StripSize >::print( std::ostream& str ) con } } +template< typename Real, + typename Device, + typename Index > +void BiEllpack< Real, Device, Index >::printValues() const +{ + for( Index i = 0; i < this->values.getSize(); i++ ) { + if( this->columnIndexes.getElement( i ) != this->getColumns() ) + std::cout << "values.getElement( " << i << " ) = " << this->values.getElement( i ) + << "\tcolumnIndexes.getElement( " << i << " ) = " << this->columnIndexes.getElement( i ) << std::endl; + } + + for( Index i = 0; i < this->rowPermArray.getSize(); i++ ) { + std::cout << "rowPermArray[ " << i << " ] = " << this->rowPermArray.getElement( i ) << std::endl; + } + + for( Index i = 0; i < this->groupPointers.getSize(); i++ ) { + std::cout << "groupPointers[ " << i << " ] = " << this->groupPointers.getElement( i ) << std::endl; + } +} + template< typename Real, typename Device, - typename Index, - int StripSize > -void BiEllpack< Real, Device, Index, StripSize >::performRowBubbleSort( Containers::Vector< Index, Device, Index >& tempRowLengths ) + typename Index > +void BiEllpack< Real, Device, Index >::performRowBubbleSort( Containers::Vector< Index, Device, Index >& tempRowLengths ) { Index strips = this->virtualRows / this->warpSize; for( Index i = 0; i < strips; i++ ) @@ -792,9 +850,8 @@ void BiEllpack< Real, Device, Index, StripSize >::performRowBubbleSort( Containe template< typename Real, typename Device, - typename Index, - int StripSize > -void BiEllpack< Real, Device, Index, StripSize >::computeColumnSizes( Containers::Vector< Index, Device, Index >& tempRowLengths ) + typename Index > +void BiEllpack< Real, Device, Index >::computeColumnSizes( Containers::Vector< Index, Device, Index >& tempRowLengths ) { Index numberOfStrips = this->virtualRows / this->warpSize; for( Index strip = 0; strip < numberOfStrips; strip++ ) @@ -838,10 +895,9 @@ public: typedef Devices::Host Device; template< typename Real, - typename Index, - int StripSize > - static void verifyRowLengths( const BiEllpack< Real, Device, Index, StripSize >& matrix, - const typename BiEllpack< Real, Device, Index, StripSize >::CompressedRowLengthsVector& rowLengths ) + typename Index > + static void verifyRowLengths( const BiEllpack< Real, Device, Index >& matrix, + const typename BiEllpack< Real, Device, Index >::CompressedRowLengthsVector& rowLengths ) { bool ok = true; for( Index row = 0; row < matrix.getRows(); row++ ) @@ -870,14 +926,15 @@ public: ok = false; } if( ok ) - std::cout << "row lengths OK" << std::endl; + { +// std::cout << "row lengths OK" << std::endl; + } } template< typename Real, - typename Index, - int StripSize > - static void verifyRowPerm( const BiEllpack< Real, Device, Index, StripSize >& matrix, - const typename BiEllpack< Real, Device, Index, StripSize >::CompressedRowLengthsVector& rowLengths ) + typename Index > + static void verifyRowPerm( const BiEllpack< Real, Device, Index >& matrix, + const typename BiEllpack< Real, Device, Index >::CompressedRowLengthsVector& rowLengths ) { bool ok = true; Index numberOfStrips = matrix.virtualRows / matrix.warpSize; @@ -914,26 +971,26 @@ public: } } if( ok ) - std::cout << "Permutation OK" << std::endl; + { +// std::cout << "Permutation OK" << std::endl; + } } template< typename Real, typename Index, - int StripSize, typename InVector, typename OutVector > - static void vectorProduct( const BiEllpack< Real, Device, Index, StripSize >& matrix, - const InVector& inVector, - OutVector& outVector ) + static void vectorProduct( const BiEllpack< Real, Device, Index >& matrix, + const InVector& inVector, + OutVector& outVector ) { matrix.vectorProductHost( inVector, outVector ); } template< typename Real, - typename Index, - int StripSize > - static void computeColumnSizes( BiEllpack< Real, Device, Index, StripSize >& matrix, - const typename BiEllpack< Real, Device, Index, StripSize >::CompressedRowLengthsVector& rowLengths ) + typename Index > + static void computeColumnSizes( BiEllpack< Real, Device, Index >& matrix, + const typename BiEllpack< Real, Device, Index >::CompressedRowLengthsVector& rowLengths ) { Index numberOfStrips = matrix.virtualRows / matrix.warpSize; for( Index strip = 0; strip < numberOfStrips; strip++ ) @@ -976,11 +1033,10 @@ public: } template< typename Real, - typename Index, - int StripSize > - static void performRowBubbleSort( BiEllpack< Real, Device, Index, StripSize >& matrix, - const typename BiEllpack< Real, Device, Index, StripSize >::CompressedRowLengthsVector& rowLengths - /*Containers::Vector< Index, Device, Index >& tempRowLengths*/ ) + typename Index > + static void performRowBubbleSort( BiEllpack< Real, Device, Index >& matrix, + const typename BiEllpack< Real, Device, Index >::CompressedRowLengthsVector& rowLengths + /*Containers::Vector< Index, Device, Index >& tempRowLengths*/ ) { Index strips = matrix.virtualRows / matrix.warpSize; for( Index i = 0; i < strips; i++ ) @@ -1037,14 +1093,13 @@ public: #ifdef HAVE_CUDA template< typename Real, typename Device, - typename Index, - int StripSize > + typename Index > template< typename InVector, typename OutVector > __device__ -void BiEllpack< Real, Device, Index, StripSize >::spmvCuda( const InVector& inVector, - OutVector& outVector, - int globalIdx ) const +void BiEllpack< Real, Device, Index >::spmvCuda( const InVector& inVector, + OutVector& outVector, + int globalIdx ) const { const IndexType strip = globalIdx >> this->logWarpSize; const IndexType warpStart = strip << this->logWarpSize; @@ -1064,218 +1119,49 @@ void BiEllpack< Real, Device, Index, StripSize >::spmvCuda( const InVector& inVe for( IndexType group = 0; group < this->logWarpSize + 1; group++ ) { - temp[ threadIdx.x ] = 0.0; - IndexType groupLength = this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group + 1 ] - - this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group ]; + temp[ threadIdx.x ] = 0.0; + IndexType groupLength = this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group + 1 ] + - this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group ]; - if( groupLength > 0 ) - { - for( IndexType i = 0; i < groupLength; i++ ) - { - if( this->columnIndexes[ elementPtr ] < this->getColumns() ) - temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ]; - elementPtr += this->warpSize; - } - IndexType bisection2 = this->warpSize; - for( IndexType i = 0; i < group; i++ ) + if( groupLength > 0 ) { - bisection2 >>= 1; - if( inWarpIdx < bisection2 ) - temp[ threadIdx.x ] += temp[ threadIdx.x + bisection2 ]; + for( IndexType i = 0; i < groupLength; i++ ) + { + if( this->columnIndexes[ elementPtr ] < this->getColumns() ) + temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ]; + elementPtr += this->warpSize; + } + IndexType bisection2 = this->warpSize; + for( IndexType i = 0; i < group; i++ ) + { + bisection2 >>= 1; + if( inWarpIdx < bisection2 ) + temp[ threadIdx.x ] += temp[ threadIdx.x + bisection2 ]; + } + if( inWarpIdx < bisection ) + results[ threadIdx.x ] += temp[ threadIdx.x ]; } - if( inWarpIdx < bisection ) - results[ threadIdx.x ] += temp[ threadIdx.x ]; - } - bisection >>= 1; + bisection >>= 1; } __syncthreads(); - if( warpStart + inWarpIdx >= this->getRows() ) - return; - outVector[ warpStart + inWarpIdx ] = results[ this->rowPermArray[ warpStart + inWarpIdx ] & ( cudaBlockSize - 1 ) ]; -} -#endif - -/*#ifdef HAVE_CUDA -template< typename Real, - typename Device, - typename Index, - int StripSize > -template< typename InVector, - typename OutVector > -__device__ -void BiEllpack< Real, Device, Index, StripSize >::spmvCuda( const InVector& inVector, - OutVector& outVector, - int globalIdx ) const -{ - // Loop unrolling test - const IndexType strip = globalIdx >> this->logWarpSize; - const IndexType warpStart = strip << this->logWarpSize; - const IndexType inWarpIdx = globalIdx & ( this->warpSize - 1 ); - - if( warpStart >= this->getRows() ) - return; - - const IndexType cudaBlockSize = 256; - - volatile Real* temp = getSharedMemory< Real >(); - __shared__ Real results[ cudaBlockSize ]; - results[ threadIdx.x ] = 0.0; - IndexType elementPtr = ( this->groupPointers[ strip * ( this->logWarpSize + 1 ) ] << this->logWarpSize ) + inWarpIdx; - - //Loop Unroll #1 - IndexType group = 0; - IndexType groupLength = this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group + 1 ] - - this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group ]; - - if( groupLength > 0 ) - { - for( IndexType i = 0; i < groupLength; i++ ) - { - if( this->columnIndexes[ elementPtr ] < this->getColumns() ) - results[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ]; - elementPtr += this->warpSize; - } - } - - group++; - temp[ threadIdx.x ] = 0.0; - groupLength = this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group + 1 ] - - this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group ]; - - if( groupLength > 0 ) - { - for( IndexType i = 0; i < groupLength; i++ ) - { - if( this->columnIndexes[ elementPtr ] < this->getColumns() ) - temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ]; - elementPtr += this->warpSize; - } - //Loop Unroll #2 - if( inWarpIdx < 16 ) - temp[ threadIdx.x ] += temp[ threadIdx.x + 16 ]; - if( inWarpIdx < 16 ) - results[ threadIdx.x ] += temp[ threadIdx.x ]; - } - - - //group == 2; - group++; - temp[ threadIdx.x ] = 0.0; - groupLength = this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group + 1 ] - - this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group ]; - if( groupLength > 0 ) - { - for( IndexType i = 0; i < groupLength; i++ ) - { - if( this->columnIndexes[ elementPtr ] < this->getColumns() ) - temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ]; - elementPtr += this->warpSize; - } - //Loop Unroll #3 - if( inWarpIdx < 16 ) - temp[ threadIdx.x ] += temp[ threadIdx.x + 16 ]; - if( inWarpIdx < 8 ) - temp[ threadIdx.x ] += temp[ threadIdx.x + 8 ]; - if( inWarpIdx < 8 ) - results[ threadIdx.x ] += temp[ threadIdx.x ]; - } - - //group == 3; - group++; - temp[ threadIdx.x ] = 0.0; - groupLength = this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group + 1 ] - - this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group ]; - if( groupLength > 0 ) - { - for( IndexType i = 0; i < groupLength; i++ ) - { - if( this->columnIndexes[ elementPtr ] < this->getColumns() ) - temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ]; - elementPtr += this->warpSize; - } - //Loop Unroll #4 - if( inWarpIdx < 16 ) - temp[ threadIdx.x ] += temp[ threadIdx.x + 16 ]; - if( inWarpIdx < 8 ) - temp[ threadIdx.x ] += temp[ threadIdx.x + 8 ]; - if( inWarpIdx < 4 ) - temp[ threadIdx.x ] += temp[ threadIdx.x + 4 ]; - if( inWarpIdx < 4 ) - results[ threadIdx.x ] += temp[ threadIdx.x ]; - } - - //group == 4; - group++; - temp[ threadIdx.x ] = 0.0; - groupLength = this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group + 1 ] - - this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group ]; - if( groupLength > 0 ) - { - for( IndexType i = 0; i < groupLength; i++ ) - { - if( this->columnIndexes[ elementPtr ] < this->getColumns() ) - temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ]; - elementPtr += this->warpSize; - } - //Loop Unroll #5 - if( inWarpIdx < 16 ) - temp[ threadIdx.x ] += temp[ threadIdx.x + 16 ]; - if( inWarpIdx < 8 ) - temp[ threadIdx.x ] += temp[ threadIdx.x + 8 ]; - if( inWarpIdx < 4 ) - temp[ threadIdx.x ] += temp[ threadIdx.x + 4 ]; - if( inWarpIdx < 2 ) - temp[ threadIdx.x ] += temp[ threadIdx.x + 2 ]; - if( inWarpIdx < 2 ) - results[ threadIdx.x ] += temp[ threadIdx.x ]; - } - - //group == 5 - group++; - temp[ threadIdx.x ] = 0.0; - groupLength = this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group + 1 ] - - this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group ]; - if( groupLength > 0 ) - { - for( IndexType i = 0; i < groupLength; i++ ) - { - if( this->columnIndexes[ elementPtr ] < this->getColumns() ) - temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ]; - elementPtr += this->warpSize; - } - //Loop Unroll #6 - if( inWarpIdx < 16 ) - temp[ threadIdx.x ] += temp[ threadIdx.x + 16 ]; - if( inWarpIdx < 8 ) - temp[ threadIdx.x ] += temp[ threadIdx.x + 8 ]; - if( inWarpIdx < 4 ) - temp[ threadIdx.x ] += temp[ threadIdx.x + 4 ]; - if( inWarpIdx < 2 ) - temp[ threadIdx.x ] += temp[ threadIdx.x + 2 ]; - if( inWarpIdx < 1 ) - temp[ threadIdx.x ] += temp[ threadIdx.x + 1 ]; - if( inWarpIdx < 1 ) - results[ threadIdx.x ] += temp[ threadIdx.x ]; - } - if( warpStart + inWarpIdx >= this->getRows() ) return; + outVector[ warpStart + inWarpIdx ] = results[ this->rowPermArray[ warpStart + inWarpIdx ] & ( cudaBlockSize - 1 ) ]; } -#endif*/ +#endif #ifdef HAVE_CUDA template< typename Real, typename Index, - int StripSize, typename InVector, typename OutVector > __global__ -void BiEllpackVectorProductCuda( const BiEllpack< Real, Devices::Cuda, Index, StripSize >* matrix, - const InVector* inVector, - OutVector* outVector, - int gridIdx, - const int warpSize ) +void BiEllpackVectorProductCuda( const BiEllpack< Real, Devices::Cuda, Index >* matrix, + const InVector* inVector, + OutVector* outVector, + int gridIdx, + const int warpSize ) { Index globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; matrix->spmvCuda( *inVector, *outVector, globalIdx ); @@ -1285,11 +1171,10 @@ void BiEllpackVectorProductCuda( const BiEllpack< Real, Devices::Cuda, Index, St #ifdef HAVE_CUDA template< typename Real, typename Device, - typename Index, - int StripSize > + typename Index > __cuda_callable__ -void BiEllpack< Real, Device, Index, StripSize >::performRowBubbleSortCudaKernel( const typename BiEllpack< Real, Device, Index, StripSize >::CompressedRowLengthsVector& rowLengths, - const IndexType strip ) +void BiEllpack< Real, Device, Index >::performRowBubbleSortCudaKernel( const typename BiEllpack< Real, Device, Index >::CompressedRowLengthsVector& rowLengths, + const IndexType strip ) { IndexType begin = strip * this->warpSize; IndexType end = ( strip + 1 ) * this->warpSize - 1; @@ -1342,12 +1227,11 @@ void BiEllpack< Real, Device, Index, StripSize >::performRowBubbleSortCudaKernel #ifdef HAVE_CUDA template< typename Real, typename Device, - typename Index, - int StripSize > + typename Index > __cuda_callable__ -void BiEllpack< Real, Device, Index, StripSize >::computeColumnSizesCudaKernel( const typename BiEllpack< Real, Device, Index, StripSize >::CompressedRowLengthsVector& rowLengths, - const IndexType numberOfStrips, - const IndexType strip ) +void BiEllpack< Real, Device, Index >::computeColumnSizesCudaKernel( const typename BiEllpack< Real, Device, Index >::CompressedRowLengthsVector& rowLengths, + const IndexType numberOfStrips, + const IndexType strip ) { if( strip >= numberOfStrips ) return; @@ -1390,12 +1274,11 @@ void BiEllpack< Real, Device, Index, StripSize >::computeColumnSizesCudaKernel( #ifdef HAVE_CUDA template< typename Real, - typename Index, - int StripSize > + typename Index > __global__ -void performRowBubbleSortCuda( BiEllpack< Real, Devices::Cuda, Index, StripSize >* matrix, - const typename BiEllpack< Real, Devices::Cuda, Index, StripSize >::CompressedRowLengthsVector* rowLengths, - int gridIdx ) +void performRowBubbleSortCuda( BiEllpack< Real, Devices::Cuda, Index >* matrix, + const typename BiEllpack< Real, Devices::Cuda, Index >::CompressedRowLengthsVector* rowLengths, + int gridIdx ) { const Index stripIdx = gridIdx * Cuda::getMaxGridSize() * blockDim.x + blockIdx.x * blockDim.x + threadIdx.x; matrix->performRowBubbleSortCudaKernel( *rowLengths, stripIdx ); @@ -1404,13 +1287,12 @@ void performRowBubbleSortCuda( BiEllpack< Real, Devices::Cuda, Index, StripSize #ifdef HAVE_CUDA template< typename Real, - typename Index, - int StripSize > + typename Index > __global__ -void computeColumnSizesCuda( BiEllpack< Real, Devices::Cuda, Index, StripSize >* matrix, - const typename BiEllpack< Real, Devices::Cuda, Index, StripSize >::CompressedRowLengthsVector* rowLengths, - const Index numberOfStrips, - int gridIdx ) +void computeColumnSizesCuda( BiEllpack< Real, Devices::Cuda, Index >* matrix, + const typename BiEllpack< Real, Devices::Cuda, Index >::CompressedRowLengthsVector* rowLengths, + const Index numberOfStrips, + int gridIdx ) { const Index stripIdx = gridIdx * Cuda::getMaxGridSize() * blockDim.x + blockIdx.x * blockDim.x + threadIdx.x; matrix->computeColumnSizesCudaKernel( *rowLengths, numberOfStrips, stripIdx ); @@ -1425,48 +1307,47 @@ public: typedef Devices::Cuda Device; template< typename Real, - typename Index, - int StripSize > - static void verifyRowLengths( const BiEllpack< Real, Device, Index, StripSize >& matrix, - const typename BiEllpack< Real, Device, Index, StripSize >::CompressedRowLengthsVector& rowLengths ) + typename Index > + static void verifyRowLengths( const BiEllpack< Real, Device, Index >& matrix, + const typename BiEllpack< Real, Device, Index >::CompressedRowLengthsVector& rowLengths ) { bool ok = true; - std::cout << "inside method" << std::endl; for( Index row = 0; row < matrix.getRows(); row++ ) { - const Index strip = row / matrix.warpSize; - const Index stripLength = matrix.getStripLength( strip ); - const Index groupBegin = ( matrix.logWarpSize + 1 ) * strip; - const Index rowStripPerm = matrix.rowPermArray.getElement( row ) - strip * matrix.warpSize; - const Index begin = matrix.groupPointers.getElement( groupBegin ) * matrix.warpSize + rowStripPerm * stripLength; - Index elementPtr = begin; - Index rowLength = 0; - - for( Index group = 0; group < matrix.getNumberOfGroups( row ); group++ ) - { - for( Index i = 0; i < matrix.getGroupLength( strip, group ); i++ ) - { - Index biElementPtr = elementPtr; - for( Index j = 0; j < matrix.power( 2, group ); j++ ) - { - rowLength++; - biElementPtr += matrix.power( 2, matrix.logWarpSize - group ) * stripLength; - } - elementPtr++; - } - } - if( rowLengths.getElement( row ) > rowLength ) - ok = false; + const Index strip = row / matrix.warpSize; + const Index stripLength = matrix.getStripLength( strip ); + const Index groupBegin = ( matrix.logWarpSize + 1 ) * strip; + const Index rowStripPerm = matrix.rowPermArray.getElement( row ) - strip * matrix.warpSize; + const Index begin = matrix.groupPointers.getElement( groupBegin ) * matrix.warpSize + rowStripPerm * stripLength; + Index elementPtr = begin; + Index rowLength = 0; + + for( Index group = 0; group < matrix.getNumberOfGroups( row ); group++ ) + { + for( Index i = 0; i < matrix.getGroupLength( strip, group ); i++ ) + { + Index biElementPtr = elementPtr; + for( Index j = 0; j < matrix.power( 2, group ); j++ ) + { + rowLength++; + biElementPtr += matrix.power( 2, matrix.logWarpSize - group ) * stripLength; + } + elementPtr++; + } + } + if( rowLengths.getElement( row ) > rowLength ) + ok = false; } if( ok ) - std::cout << "row lengths OK" << std::endl; + { +// std::cout << "row lengths OK" << std::endl; + } } template< typename Real, - typename Index, - int StripSize > - static void verifyRowPerm( const BiEllpack< Real, Device, Index, StripSize >& matrix, - const typename BiEllpack< Real, Device, Index, StripSize >::CompressedRowLengthsVector& rowLengths ) + typename Index > + static void verifyRowPerm( const BiEllpack< Real, Device, Index >& matrix, + const typename BiEllpack< Real, Device, Index >::CompressedRowLengthsVector& rowLengths ) { bool ok = true; Index numberOfStrips = matrix.virtualRows / matrix.warpSize; @@ -1503,18 +1384,19 @@ public: } } if( ok ) - std::cout << "perm OK" << std::endl; + { +// std::cout << "perm OK" << std::endl; + } } template< typename Real, - typename Index, - int StripSize > - static void performRowBubbleSort( BiEllpack< Real, Device, Index, StripSize >& matrix, - const typename BiEllpack< Real, Device, Index, StripSize >::CompressedRowLengthsVector& rowLengths ) + typename Index > + static void performRowBubbleSort( BiEllpack< Real, Device, Index >& matrix, + const typename BiEllpack< Real, Device, Index >::CompressedRowLengthsVector& rowLengths ) { #ifdef HAVE_CUDA - Index numberOfStrips = matrix.virtualRows / StripSize; - typedef BiEllpack< Real, Devices::Cuda, Index, StripSize > Matrix; + Index numberOfStrips = matrix.virtualRows / matrix.warpSize; + typedef BiEllpack< Real, Devices::Cuda, Index > Matrix; typedef typename Matrix::CompressedRowLengthsVector CompressedRowLengthsVector; Matrix* kernel_this = Cuda::passToDevice( matrix ); CompressedRowLengthsVector* kernel_rowLengths = Cuda::passToDevice( rowLengths ); @@ -1525,7 +1407,7 @@ public: { if( gridIdx == cudaGrids - 1 ) cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize(); - performRowBubbleSortCuda< Real, Index, StripSize > + performRowBubbleSortCuda< Real, Index > <<< cudaGridSize, cudaBlockSize >>> ( kernel_this, kernel_rowLengths, @@ -1538,14 +1420,13 @@ public: } template< typename Real, - typename Index, - int StripSize > - static void computeColumnSizes( BiEllpack< Real, Device, Index, StripSize >& matrix, - const typename BiEllpack< Real, Device, Index, StripSize >::CompressedRowLengthsVector& rowLengths ) + typename Index > + static void computeColumnSizes( BiEllpack< Real, Device, Index >& matrix, + const typename BiEllpack< Real, Device, Index >::CompressedRowLengthsVector& rowLengths ) { #ifdef HAVE_CUDA - const Index numberOfStrips = matrix.virtualRows / StripSize; - typedef BiEllpack< Real, Devices::Cuda, Index, StripSize > Matrix; + const Index numberOfStrips = matrix.virtualRows / matrix.warpSize; + typedef BiEllpack< Real, Devices::Cuda, Index > Matrix; typedef typename Matrix::CompressedRowLengthsVector CompressedRowLengthsVector; Matrix* kernel_this = Cuda::passToDevice( matrix ); CompressedRowLengthsVector* kernel_rowLengths = Cuda::passToDevice( rowLengths ); @@ -1556,7 +1437,7 @@ public: { if( gridIdx == cudaGrids - 1 ) cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize(); - computeColumnSizesCuda< Real, Index, StripSize > + computeColumnSizesCuda< Real, Index > <<< cudaGridSize, cudaBlockSize >>> ( kernel_this, kernel_rowLengths, @@ -1572,10 +1453,9 @@ public: template< typename Real, typename Index, - int StripSize, typename InVector, typename OutVector > - static void vectorProduct( const BiEllpack< Real, Device, Index, StripSize >& matrix, + static void vectorProduct( const BiEllpack< Real, Device, Index >& matrix, const InVector& inVector, OutVector& outVector ) { @@ -1593,7 +1473,7 @@ public: if( gridIdx == cudaGrids - 1 ) cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize(); const int sharedMemory = cudaBlockSize.x * sizeof( Real ); - BiEllpackVectorProductCuda< Real, Index, StripSize, InVector, OutVector > + BiEllpackVectorProductCuda< Real, Index, InVector, OutVector > <<< cudaGridSize, cudaBlockSize, sharedMemory >>> ( kernel_this, kernel_inVector, diff --git a/src/TNL/Matrices/COOMatrix_impl.h b/src/TNL/Matrices/COOMatrix_impl.h index bbdd36002ee4af0ca59da81815ed2527c0c0c828..2f9b49d30833982122ac62eb20ab65e265f79ce7 100644 --- a/src/TNL/Matrices/COOMatrix_impl.h +++ b/src/TNL/Matrices/COOMatrix_impl.h @@ -27,6 +27,28 @@ COOMatrix< Real, Device, Index >::COOMatrix() { }; +template< typename Real, + typename Device, + typename Index > +String COOMatrix< Real, Device, Index >::getType() +{ + return String( "Matrices::COOMatrix< " ) + + String( TNL::getType< Real>() ) + + String( ", " ) + + String( Device :: getDeviceType() ) + + String( ", " ) + + String( TNL::getType< Index >() ) + + String( " >" ); +} + +template< typename Real, + typename Device, + typename Index > +String COOMatrix< Real, Device, Index >::getTypeVirtual() const +{ + return this->getType(); +} + template< typename Real, typename Device, typename Index > diff --git a/src/TNL/Matrices/CSR_impl.h b/src/TNL/Matrices/CSR_impl.h index 327d250028acca4349495bd663340f999f55024e..db31d6dcde6a07cd8b19e87f843f3b6e8b994c5c 100644 --- a/src/TNL/Matrices/CSR_impl.h +++ b/src/TNL/Matrices/CSR_impl.h @@ -45,8 +45,8 @@ String CSR< Real, Device, Index >::getSerializationType() { return String( "Matrices::CSR< ") + TNL::getType< Real>() + - String( ", " ) + - getType< Devices::Host >() + + ", [any_device], " + + String( TNL::getType< Index >() ) + String( " >" ); } @@ -122,41 +122,8 @@ template< typename Real, Index CSR< Real, Device, Index >::getNonZeroRowLength( const IndexType row ) const { // TODO: Fix/Implement - throw Exceptions::NotImplementedError( "CSR::getNonZeroRowLength is not implemented." ); -// if( std::is_same< DeviceType, Devices::Host >::value ) -// { -// ConstMatrixRow matrixRow = this->getRow( row ); -// return matrixRow.getNonZeroElementsCount(); -// } -// if( std::is_same< DeviceType, Devices::Cuda >::value ) -// { -// IndexType *cols = new IndexType[4]; -// RealType *vals = new RealType[4]; -// for( int i = 0; i < 4; i++ ) -// { -// cols[i] = i; -// vals[i] = 1.0; -// } -// ConstMatrixRow matrixRow(cols, vals, 4, 1); -// // ConstMatrixRow matrixRow = this->getRow( row );// If the program even compiles, this line fails because a segfault is thrown on the first line of getRow() -// // WHEN debugging with GDB: -// // (gdb) p this->rowPointers[0] -// // Could not find operator[]. -// // (gdb) p rowPointers.getElement(0) -// // Attempt to take address of value not located in memory. -// IndexType resultHost ( 0 ); -// IndexType* resultCuda = Cuda::passToDevice( resultHost ); -// // PROBLEM: If the second parameter of getNonZeroRowLengthCudaKernel is '&resultCuda', the following issue is thrown: -// // 'error: no instance of function template "TNL::Matrices::getNonZeroRowLengthCudaKernel" matches the argument list' -// TNL::Matrices::getNonZeroRowLengthCudaKernel< ConstMatrixRow, IndexType ><<< 1, 1 >>>( matrixRow, resultCuda ); // matrixRow works fine, tested them both separately -// delete []cols; -// delete []vals; -// std::cout << "Checkpoint BEFORE passFromDevice" << std::endl; -// resultHost = Cuda::passFromDevice( resultCuda ); // This causes a crash: Illegal memory address in Cuda_impl.h at TNL_CHECK_CUDA_DEVICE -// std::cout << "Checkpoint AFTER passFromDevice" << std::endl; -// Cuda::freeFromDevice( resultCuda ); -// return resultHost; -// } + TNL_ASSERT( false, std::cerr << "TODO: Fix/Implement" ); + return 0; } template< typename Real, @@ -221,13 +188,6 @@ bool CSR< Real, Device, Index >::addElementFast( const IndexType row, const RealType& value, const RealType& thisElementMultiplicator ) { - /*TNL_ASSERT( row >= 0 && row < this->rows && - column >= 0 && column <= this->rows, - std::cerr << " row = " << row - << " column = " << column - << " this->rows = " << this->rows - << " this->columns = " << this-> columns );*/ - IndexType elementPtr = this->rowPointers[ row ]; const IndexType rowEnd = this->rowPointers[ row + 1 ]; IndexType col = 0; diff --git a/src/TNL/Matrices/ChunkedEllpack.h b/src/TNL/Matrices/ChunkedEllpack.h index a66e1283ab6b8df5e763de35e6a6ac2c14a70bf9..9d422079608f52e7e89a9954496cbd22c0786c06 100644 --- a/src/TNL/Matrices/ChunkedEllpack.h +++ b/src/TNL/Matrices/ChunkedEllpack.h @@ -75,6 +75,11 @@ public: typedef tnlChunkedEllpackSliceInfo< IndexType > ChunkedEllpackSliceInfo; typedef typename Sparse< RealType, DeviceType, IndexType >:: CompressedRowLengthsVector CompressedRowLengthsVector; typedef typename Sparse< RealType, DeviceType, IndexType >::ConstCompressedRowLengthsVectorView ConstCompressedRowLengthsVectorView; + typedef typename Sparse< RealType, DeviceType, IndexType >::ValuesVector ValuesVector; + typedef typename Sparse< RealType, DeviceType, IndexType >::ColumnIndexesVector ColumnIndexesVector; + typedef ChunkedEllpack< Real, Device, Index > ThisType; + typedef ChunkedEllpack< Real, Devices::Host, Index > HostType; + typedef ChunkedEllpack< Real, Devices::Cuda, Index > CudaType; typedef Sparse< Real, Device, Index > BaseType; typedef typename BaseType::MatrixRow MatrixRow; typedef SparseRow< const RealType, const IndexType > ConstMatrixRow; diff --git a/src/TNL/Matrices/ChunkedEllpack_impl.h b/src/TNL/Matrices/ChunkedEllpack_impl.h index 48119c659163d9f57bb3feefe58210f48666b224..3b1fd9c8f9fce07344115282ba98411d364d95e3 100644 --- a/src/TNL/Matrices/ChunkedEllpack_impl.h +++ b/src/TNL/Matrices/ChunkedEllpack_impl.h @@ -16,7 +16,7 @@ #include namespace TNL { -namespace Matrices { +namespace Matrices { template< typename Real, typename Index, @@ -43,8 +43,8 @@ String ChunkedEllpack< Real, Device, Index >::getSerializationType() { return String( "Matrices::ChunkedEllpack< ") + getType< Real >() + - String( ", " ) + - getType< Device >() + + String( ", [any device], " ) + + String( TNL::getType< Index >() ) + String( " >" ); } @@ -164,24 +164,7 @@ bool ChunkedEllpack< Real, Device, Index >::setSlice( ConstCompressedRowLengthsV */ IndexType maxChunkInSlice( 0 ); for( IndexType i = sliceBegin; i < sliceEnd; i++ ) - { -// ALL OF THE FOLLOWING std::couts are for troubleshooting purposes, can be deleted. -// std::cout << "Troubleshooting invalid ceil operation: " << std::endl; -// std::cout << "maxChunkInSlice = " << maxChunkInSlice << std::endl; -// std::cout << "( RealType ) rowLengths[ i ] = " << ( RealType ) rowLengths[ i ] << std::endl; -// std::cout << "( RealType ) this->rowToChunkMapping[ i ] = " << ( RealType ) this->rowToChunkMapping[ i ] << std::endl; -// std::cout << " ceil( RealType / RealType ) = " << ceil( ( RealType ) rowLengths[ i ] / ( RealType ) this->rowToChunkMapping[ i ] ) << std::endl; -// std::cout << "( int ) rowLengths[ i ] = " << ( int ) rowLengths[ i ] << std::endl; -// std::cout << "( int ) this->rowToChunkMapping[ i ] = " << ( int ) this->rowToChunkMapping[ i ] << std::endl; -// std::cout << " ceil( int / int ) = " << ceil( ( int ) rowLengths[ i ] / ( int ) this->rowToChunkMapping[ i ] ) << std::endl; -// std::cout << "( float ) rowLengths[ i ] = " << ( float ) rowLengths[ i ] << std::endl; -// std::cout << "( float ) this->rowToChunkMapping[ i ] = " << ( float ) this->rowToChunkMapping[ i ] << std::endl; -// std::cout << " ceil( float / float ) = " << ceil( ( float ) rowLengths[ i ] / ( float ) this->rowToChunkMapping[ i ] ) << std::endl; -// The ceil function doesn't work when rowLengths and the other this.->... is -// typecasted into ( RealType ), because when RealType is int, it will perform -// an integer division and return the int as a double, which in this case -// will be zero and make the assertion fail ( https://stackoverflow.com/questions/33273359/in-c-using-the-ceil-a-division-is-not-working ). -// To fix this, typecast them to ( float ), instead of ( RealType ) + { maxChunkInSlice = max( maxChunkInSlice, roundUpDivision( rowLengths[ i ], this->rowToChunkMapping[ i ] ) ); } @@ -234,13 +217,6 @@ void ChunkedEllpack< Real, Device, Index >::setCompressedRowLengths( ConstCompre this->setSlice( rowLengths, sliceIndex, elementsToAllocation ); this->rowPointers.scan(); } - -// std::cout << "\ngetRowLength after first if: " << std::endl; -// for( IndexType i = 0; i < rowLengths.getSize(); i++ ) -// { -// std::cout << getRowLength( i ) << std::endl; -// } -// std::cout << "\n"; if( std::is_same< Device, Devices::Cuda >::value ) { @@ -265,7 +241,6 @@ void ChunkedEllpack< Real, Device, Index >::setCompressedRowLengths( ConstCompre elementsToAllocation = hostMatrix.values.getSize(); } this->maxRowLength = max( rowLengths ); -// std::cout << "\nrowLengths.max() = " << rowLengths.max() << std::endl; Sparse< Real, Device, Index >::allocateMatrixElements( elementsToAllocation ); } @@ -298,22 +273,7 @@ template< typename Real, Index ChunkedEllpack< Real, Device, Index >::getNonZeroRowLength( const IndexType row ) const { ConstMatrixRow matrixRow = getRow( row ); - return matrixRow.getNonZeroElementsCount( getType< Device >() ); - -// IndexType elementCount ( 0 ); -// ConstMatrixRow matrixRow = this->getRow( row ); -// -// auto computeNonZeros = [&] /*__cuda_callable__*/ ( IndexType i ) mutable -// { -// std::cout << "matrixRow.getElementValue( i ) = " << matrixRow.getElementValue( i ) << " != 0.0" << std::endl; -// if( matrixRow.getElementValue( i ) != 0.0 ) -// elementCount++; -// -// std::cout << "End of lambda elementCount = " << elementCount << std::endl; -// }; -// -// ParallelFor< DeviceType >::exec( ( IndexType ) 0, matrixRow.getLength(), computeNonZeros ); -// return elementCount; + return matrixRow.getNonZeroElementsCount( Device::getDeviceType() ); } template< typename Real, @@ -1260,8 +1220,63 @@ ChunkedEllpack< Real, Device, Index >::operator=( const ChunkedEllpack< Real2, D "unknown device" ); this->setLike( matrix ); + this->chunksInSlice = matrix.chunksInSlice; + this->desiredChunkSize = matrix.desiredChunkSize; + this->rowToChunkMapping = matrix.rowToChunkMapping; + this->rowToSliceMapping = matrix.rowToSliceMapping; + this->rowPointers = matrix.rowPointers; + this->slices = matrix.slices; + this->numberOfSlices = matrix.numberOfSlices; + + // host -> cuda + if( std::is_same< Device, Devices::Cuda >::value ) { + typename ValuesVector::template Self< typename ValuesVector::RealType, Devices::Host > tmpValues; + typename ColumnIndexesVector::template Self< typename ColumnIndexesVector::RealType, Devices::Host > tmpColumnIndexes; + tmpValues.setLike( matrix.values ); + tmpColumnIndexes.setLike( matrix.columnIndexes ); - throw Exceptions::NotImplementedError("Cross-device assignment for the ChunkedEllpack format is not implemented yet."); +#ifdef HAVE_OPENMP +#pragma omp parallel for if( Devices::Host::isOMPEnabled() ) +#endif + for( Index sliceIdx = 0; sliceIdx < matrix.numberOfSlices; sliceIdx++ ) { + const Index chunkSize = matrix.slices.getElement( sliceIdx ).chunkSize; + const Index offset = matrix.slices.getElement( sliceIdx ).pointer; + + for( Index j = 0; j < chunkSize; j++ ) + for( Index i = 0; i < matrix.chunksInSlice; i++ ) { + tmpValues[ offset + j * matrix.chunksInSlice + i ] = matrix.values[ offset + i * chunkSize + j ]; + tmpColumnIndexes[ offset + j * matrix.chunksInSlice + i ] = matrix.columnIndexes[ offset + i * chunkSize + j ]; + } + } + + this->values = tmpValues; + this->columnIndexes = tmpColumnIndexes; + } + + // cuda -> host + if( std::is_same< Device, Devices::Host >::value ) { + ValuesVector tmpValues; + ColumnIndexesVector tmpColumnIndexes; + tmpValues.setLike( matrix.values ); + tmpColumnIndexes.setLike( matrix.columnIndexes ); + tmpValues = matrix.values; + tmpColumnIndexes = matrix.columnIndexes; + +#ifdef HAVE_OPENMP +#pragma omp parallel for if( Devices::Host::isOMPEnabled() ) +#endif + for( Index sliceIdx = 0; sliceIdx < matrix.numberOfSlices; sliceIdx++ ) { + const Index chunkSize = matrix.slices.getElement( sliceIdx ).chunkSize; + const Index offset = matrix.slices.getElement( sliceIdx ).pointer; + + for( Index j = 0; j < chunkSize; j++ ) + for( Index i = 0; i < matrix.chunksInSlice; i++ ) { + this->values[ offset + i * chunkSize + j ] = tmpValues[ offset + j * matrix.chunksInSlice + i ]; + this->columnIndexes[ offset + i * chunkSize + j ] = tmpColumnIndexes[ offset + j * matrix.chunksInSlice + i ]; + } + } + } + return *this; } @@ -1417,14 +1432,14 @@ class ChunkedEllpackDeviceDependentCode< Devices::Cuda > public: typedef Devices::Cuda Device; - + template< typename Real, typename Index > static void resolveSliceSizes( ChunkedEllpack< Real, Device, Index >& matrix, typename ChunkedEllpack< Real, Device, Index >::ConstCompressedRowLengthsVectorView rowLengths ) { } - + template< typename Index > __cuda_callable__ static void initChunkTraverse( const Index sliceOffset, diff --git a/src/TNL/Matrices/EllpackSymmetricGraph_impl.h b/src/TNL/Matrices/EllpackSymmetricGraph_impl.h index b949292c5f1664562525a4ead8ca17b2ad9f343b..1aa9b51a6cbdbaa8c596db148b8dbceae62066fd 100644 --- a/src/TNL/Matrices/EllpackSymmetricGraph_impl.h +++ b/src/TNL/Matrices/EllpackSymmetricGraph_impl.h @@ -42,6 +42,28 @@ Index EllpackSymmetricGraph< Real, Device, Index >::getAlignedRows() const return this->alignedRows; } +template< typename Real, + typename Device, + typename Index > +String EllpackSymmetricGraph< Real, Device, Index > :: getType() +{ + return String( "Matrices::EllpackSymmetricGraph< ") + + String( TNL::getType< Real >() ) + + String( ", " ) + + String( Device::getDeviceType() ) + + String( ", " ) + + String( TNL::getType< Index >() ) + + String( " >" ); +} + +template< typename Real, + typename Device, + typename Index > +String EllpackSymmetricGraph< Real, Device, Index >::getTypeVirtual() const +{ + return this->getType(); +} + template< typename Real, typename Device, typename Index > @@ -51,13 +73,27 @@ void EllpackSymmetricGraph< Real, Device, Index >::setDimensions( const IndexTyp TNL_ASSERT( rows > 0 && columns > 0, std::cerr << "rows = " << rows << " columns = " << columns << std::endl ); + this->rows = rows; - this->columns = columns; + this->columns = columns; + if( std::is_same< DeviceType, Devices::Cuda >::value ) - this->alignedRows = roundToMultiple( columns, Cuda::getWarpSize() ); + { + this->alignedRows = roundToMultiple( columns, Devices::Cuda::getWarpSize() ); + + if( this->rows - this->alignedRows > 0 ) + { + IndexType missingRows = this->rows - this->alignedRows; + missingRows = roundToMultiple( missingRows, Devices::Cuda::getWarpSize() ); + this->alignedRows += missingRows; + +// this->alignedRows += roundToMultiple( this->rows - this->alignedRows, Devices::Cuda::getWarpSize() ); + } + } else this->alignedRows = rows; + if( this->rowLengths != 0 ) - allocateElements(); + allocateElements(); } template< typename Real, @@ -791,6 +827,11 @@ template< typename Real, typename Index > void EllpackSymmetricGraph< Real, Device, Index >::allocateElements() { + IndexType numberOfMatrixElements = this->alignedRows * this->rowLengths; + + TNL_ASSERT_TRUE( this->alignedRows != 0 && numberOfMatrixElements / this->alignedRows == this->rowLengths, + "Ellpack cannot store this matrix. The number of matrix elements has overflown the value that IndexType is capable of storing" ); + Sparse< Real, Device, Index >::allocateMatrixElements( this->alignedRows * this->rowLengths ); } diff --git a/src/TNL/Matrices/EllpackSymmetric_impl.h b/src/TNL/Matrices/EllpackSymmetric_impl.h index 90369f77af0f0085b140934c27fe3fe5a2d8f015..f64cef4c5b909428511e006d15d820c5c83d0f27 100644 --- a/src/TNL/Matrices/EllpackSymmetric_impl.h +++ b/src/TNL/Matrices/EllpackSymmetric_impl.h @@ -26,6 +26,28 @@ EllpackSymmetric< Real, Device, Index > :: EllpackSymmetric() { }; +template< typename Real, + typename Device, + typename Index > +String EllpackSymmetric< Real, Device, Index > :: getType() +{ + return String( "Matrices::EllpackSymmetric< ") + + String( TNL::getType< Real >() ) + + String( ", " ) + + String( Device::getDeviceType() ) + + String( ", " ) + + String( TNL::getType< Index >() ) + + String( " >" ); +} + +template< typename Real, + typename Device, + typename Index > +String EllpackSymmetric< Real, Device, Index >::getTypeVirtual() const +{ + return this->getType(); +} + template< typename Real, typename Device, typename Index > @@ -35,13 +57,27 @@ void EllpackSymmetric< Real, Device, Index >::setDimensions( const IndexType row TNL_ASSERT( rows > 0 && columns > 0, std::cerr << "rows = " << rows << " columns = " << columns <rows = rows; - this->columns = columns; + this->columns = columns; + if( std::is_same< DeviceType, Devices::Cuda >::value ) - this->alignedRows = roundToMultiple( columns, Cuda::getWarpSize() ); + { + this->alignedRows = roundToMultiple( columns, Devices::Cuda::getWarpSize() ); + + if( this->rows - this->alignedRows > 0 ) + { + IndexType missingRows = this->rows - this->alignedRows; + missingRows = roundToMultiple( missingRows, Devices::Cuda::getWarpSize() ); + this->alignedRows += missingRows; + +// this->alignedRows += roundToMultiple( this->rows - this->alignedRows, Devices::Cuda::getWarpSize() ); + } + } else this->alignedRows = rows; + if( this->rowLengths != 0 ) - allocateElements(); + allocateElements(); } template< typename Real, @@ -577,6 +613,11 @@ template< typename Real, typename Index > void EllpackSymmetric< Real, Device, Index >::allocateElements() { + IndexType numberOfMatrixElements = this->alignedRows * this->rowLengths; + + TNL_ASSERT_TRUE( this->alignedRows != 0 && numberOfMatrixElements / this->alignedRows == this->rowLengths, + "Ellpack cannot store this matrix. The number of matrix elements has overflown the value that IndexType is capable of storing" ); + Sparse< Real, Device, Index >::allocateMatrixElements( this->alignedRows * this->rowLengths ); } diff --git a/src/TNL/Matrices/Ellpack_impl.h b/src/TNL/Matrices/Ellpack_impl.h index 5ac812cf2101e7f13bafbfd871ac168429be49cd..5ae12f408727bd1ae2f087f69fcb5bae2458fd55 100644 --- a/src/TNL/Matrices/Ellpack_impl.h +++ b/src/TNL/Matrices/Ellpack_impl.h @@ -16,7 +16,7 @@ #include namespace TNL { -namespace Matrices { +namespace Matrices { template< typename Real, typename Device, @@ -31,11 +31,9 @@ template< typename Real, typename Index > String Ellpack< Real, Device, Index >::getSerializationType() { - return String( "Matrices::Ellpack< ") + - getType< Real >() + - String( ", " ) + - getType< Device >() + - String( ", " ) + + return String( "Matrices::Ellpack< " ) + + String( TNL::getType< Real >() ) + + ", [any device], " + getType< Index >() + String( " >" ); } @@ -59,9 +57,21 @@ void Ellpack< Real, Device, Index >::setDimensions( const IndexType rows, << " columns = " << columns << std::endl ); this->rows = rows; this->columns = columns; + if( std::is_same< Device, Devices::Cuda >::value ) - this->alignedRows = roundToMultiple( rows, Cuda::getWarpSize() ); + { + this->alignedRows = roundToMultiple( columns, Cuda::getWarpSize() ); + if( this->rows - this->alignedRows > 0 ) + { + IndexType missingRows = this->rows - this->alignedRows; + + missingRows = roundToMultiple( missingRows, Cuda::getWarpSize() ); + + this->alignedRows += missingRows; + } + } else this->alignedRows = rows; + if( this->rowLengths != 0 ) allocateElements(); } @@ -76,6 +86,7 @@ void Ellpack< Real, Device, Index >::setCompressedRowLengths( ConstCompressedRow TNL_ASSERT_EQ( this->getRows(), rowLengths.getSize(), "wrong size of the rowLengths vector" ); this->rowLengths = this->maxRowLength = max( rowLengths ); + allocateElements(); } @@ -757,6 +768,14 @@ template< typename Real, typename Index > void Ellpack< Real, Device, Index >::allocateElements() { + IndexType numMtxElmnts = this->alignedRows * this->rowLengths; + + if( this->alignedRows != 0 ) + { + TNL_ASSERT_EQ( numMtxElmnts / this->alignedRows, this->rowLengths, + "Ellpack cannot store this matrix. The number of matrix elements has overflown the value that IndexType is capable of storing" ); + } + Sparse< Real, Device, Index >::allocateMatrixElements( this->alignedRows * this->rowLengths ); } diff --git a/src/TNL/Matrices/MatrixReader_impl.h b/src/TNL/Matrices/MatrixReader_impl.h index 418e6f5b3eda29a659c4487dfaf34d88c12fea1d..dd6ddc07245150c859946e97b2fde3a66021aaa3 100644 --- a/src/TNL/Matrices/MatrixReader_impl.h +++ b/src/TNL/Matrices/MatrixReader_impl.h @@ -68,7 +68,7 @@ bool MatrixReader< Matrix >::readMtxFileHostMatrix( std::istream& file, if( ! computeCompressedRowLengthsFromMtxFile( file, rowLengths, columns, rows, symmetricMatrix, verbose ) ) return false; - + matrix.setCompressedRowLengths( rowLengths ); if( ! readMatrixElementsFromMtxFile( file, matrix, symmetricMatrix, verbose, symReader ) ) @@ -340,6 +340,7 @@ bool MatrixReader< Matrix >::readMatrixElementsFromMtxFile( std::istream& file, IndexType processedElements( 0 ); Timer timer; timer.start(); + while( std::getline( file, line ) ) { if( line[ 0 ] == '%' ) continue; @@ -370,6 +371,7 @@ bool MatrixReader< Matrix >::readMatrixElementsFromMtxFile( std::istream& file, processedElements++; } } + file.clear(); long int fileSize = file.tellg(); timer.stop(); @@ -377,6 +379,7 @@ bool MatrixReader< Matrix >::readMatrixElementsFromMtxFile( std::istream& file, std::cout << " Reading the matrix elements ... " << processedElements << " / " << matrix.getNumberOfMatrixElements() << " -> " << timer.getRealTime() << " sec. i.e. " << fileSize / ( timer.getRealTime() * ( 1 << 20 )) << "MB/s." << std::endl; + return true; } diff --git a/src/TNL/Matrices/Matrix_impl.h b/src/TNL/Matrices/Matrix_impl.h index 33c4d2e654cb32f9ba56516a1678b73d17ee3b96..3371ee4ec453d0c2d6af294ed6ab2df9d3623b32 100644 --- a/src/TNL/Matrices/Matrix_impl.h +++ b/src/TNL/Matrices/Matrix_impl.h @@ -70,6 +70,19 @@ void Matrix< Real, Device, Index >::setLike( const Matrix< Real2, Device2, Index setDimensions( matrix.getRows(), matrix.getColumns() ); } +template< typename Real, + typename Device, + typename Index > +Index Matrix< Real, Device, Index >::getNumberOfNonzeroMatrixElements() const +{ + IndexType nonZeroElements( 0 ); + for( IndexType i = 0; this->values.getSize(); i++ ) + if( this->values.getElement( i ) != 0.0 ) + nonZeroElements++; + + return nonZeroElements; +} + template< typename Real, typename Device, typename Index > diff --git a/src/TNL/Matrices/Multidiagonal_impl.h b/src/TNL/Matrices/Multidiagonal_impl.h index ff1ac384a3a1a95a170f491de8a56dae09651b3c..76f54f748c0744d810518cd9dde5872a894099ad 100644 --- a/src/TNL/Matrices/Multidiagonal_impl.h +++ b/src/TNL/Matrices/Multidiagonal_impl.h @@ -36,7 +36,9 @@ String Multidiagonal< Real, Device, Index >::getSerializationType() return String( "Matrices::Multidiagonal< ") + getType< Real >() + String( ", " ) + - getType< Device >() + + String( Device :: getDeviceType() ) + + String( ", " ) + + String( TNL::getType< Index >() ) + String( " >" ); } diff --git a/src/TNL/Matrices/SlicedEllpack.h b/src/TNL/Matrices/SlicedEllpack.h index 5051fc21868b13a5644ebbdb190371bc50c77224..7176019d2979c57007062e10f02b263047e58157 100644 --- a/src/TNL/Matrices/SlicedEllpack.h +++ b/src/TNL/Matrices/SlicedEllpack.h @@ -25,7 +25,7 @@ #include namespace TNL { -namespace Matrices { +namespace Matrices { template< typename Device > class SlicedEllpackDeviceDependentCode; @@ -93,7 +93,7 @@ public: __cuda_callable__ IndexType getRowLengthFast( const IndexType row ) const; - + IndexType getNonZeroRowLength( const IndexType row ) const; template< typename Real2, typename Device2, typename Index2 > diff --git a/src/TNL/Matrices/SlicedEllpackSymmetricGraph_impl.h b/src/TNL/Matrices/SlicedEllpackSymmetricGraph_impl.h index bfe73f231092a0e4ea90c3011b823c6ab8c17d95..39cb81c6839efe21c90bcba58aff87ad96a5e447 100644 --- a/src/TNL/Matrices/SlicedEllpackSymmetricGraph_impl.h +++ b/src/TNL/Matrices/SlicedEllpackSymmetricGraph_impl.h @@ -25,6 +25,30 @@ template< typename Real, SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::SlicedEllpackSymmetricGraph() : rearranged( false ) { +}; + +template< typename Real, + typename Device, + typename Index, + int SliceSize > +String SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::getType() +{ + return String( "Matrices::SlicedEllpackSymmetricGraph< ") + + String( TNL::getType< Real >() ) + + String( ", " ) + + String( Device::getDeviceType() ) + + String( ", " ) + + String( TNL::getType< Index >() ) + + String( " >" ); +} + +template< typename Real, + typename Device, + typename Index, + int SliceSize > +String SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::getTypeVirtual() const +{ + return this->getType(); } template< typename Real, diff --git a/src/TNL/Matrices/SlicedEllpackSymmetric_impl.h b/src/TNL/Matrices/SlicedEllpackSymmetric_impl.h index c403fd4c84f09a59883f14f2fc5c23e79c1c65cb..324cc74bc056efcd489e2ac2ba88ee2113ccea0d 100644 --- a/src/TNL/Matrices/SlicedEllpackSymmetric_impl.h +++ b/src/TNL/Matrices/SlicedEllpackSymmetric_impl.h @@ -24,6 +24,30 @@ template< typename Real, int SliceSize > SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::SlicedEllpackSymmetric() { +}; + +template< typename Real, + typename Device, + typename Index, + int SliceSize > +String SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::getType() +{ + return String( "Matrices::SlicedEllpackSymmetric< ") + + String( TNL::getType< Real >() ) + + String( ", " ) + + String( Device :: getDeviceType() ) + + String( ", " ) + + String( TNL::getType< Index >() ) + + String( " >" ); +} + +template< typename Real, + typename Device, + typename Index, + int SliceSize > +String SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::getTypeVirtual() const +{ + return this->getType(); } template< typename Real, diff --git a/src/TNL/Matrices/SlicedEllpack_impl.h b/src/TNL/Matrices/SlicedEllpack_impl.h index 45e8cdee77fbda670d2e3b23a3844ad0bb53d071..8c629b563cfe47f258f44f0705cf7b8b5b6d2435 100644 --- a/src/TNL/Matrices/SlicedEllpack_impl.h +++ b/src/TNL/Matrices/SlicedEllpack_impl.h @@ -34,8 +34,8 @@ String SlicedEllpack< Real, Device, Index, SliceSize >::getSerializationType() { return String( "Matrices::SlicedEllpack< ") + TNL::getType< Real >() + - String( ", " ) + - getType< Device >() + + ", [any_device], " + + String( TNL::getType< Index >() ) + String( " >" ); } diff --git a/src/TNL/Matrices/Sparse_impl.h b/src/TNL/Matrices/Sparse_impl.h index 5886681752ec12e3b6a713331860f371dc8bd18f..d1643db19a48dbf078fe04389e9cb2d061b28a26 100644 --- a/src/TNL/Matrices/Sparse_impl.h +++ b/src/TNL/Matrices/Sparse_impl.h @@ -109,6 +109,8 @@ template< typename Real, typename Index > void Sparse< Real, Device, Index >::allocateMatrixElements( const IndexType& numberOfMatrixElements ) { + TNL_ASSERT_GE( numberOfMatrixElements, 0, "Number of matrix elements must be non-negative." ); + this->values.setSize( numberOfMatrixElements ); this->columnIndexes.setSize( numberOfMatrixElements ); diff --git a/src/TNL/Matrices/Tridiagonal_impl.h b/src/TNL/Matrices/Tridiagonal_impl.h index 62575f1776144e374b560a65e213248a1177de80..2752f6850320035dca48169c5e1ae2806aa47ff5 100644 --- a/src/TNL/Matrices/Tridiagonal_impl.h +++ b/src/TNL/Matrices/Tridiagonal_impl.h @@ -27,6 +27,28 @@ Tridiagonal< Real, Device, Index >::Tridiagonal() { } +template< typename Real, + typename Device, + typename Index > +String Tridiagonal< Real, Device, Index >::getType() +{ + return String( "Matrices::Tridiagonal< " ) + + String( TNL::getType< Real >() ) + + String( ", " ) + + String( Device :: getDeviceType() ) + + String( ", " ) + + String( TNL::getType< Index >() ) + + String( " >" ); +} + +template< typename Real, + typename Device, + typename Index > +String Tridiagonal< Real, Device, Index >::getTypeVirtual() const +{ + return this->getType(); +} + template< typename Real, typename Device, typename Index > diff --git a/src/UnitTests/Matrices/CMakeLists.txt b/src/UnitTests/Matrices/CMakeLists.txt index adc2c6dbbfcaaf9dcf616e3c0f1c65af24cfe8b2..2a08be2198e1dcbff5de4ccacccae38e2f52f17b 100644 --- a/src/UnitTests/Matrices/CMakeLists.txt +++ b/src/UnitTests/Matrices/CMakeLists.txt @@ -66,7 +66,8 @@ ENDIF( BUILD_CUDA ) ADD_TEST( SparseMatrixCopyTest ${EXECUTABLE_OUTPUT_PATH}/SparseMatrixCopyTest${CMAKE_EXECUTABLE_SUFFIX} ) ADD_TEST( SparseMatrixTest ${EXECUTABLE_OUTPUT_PATH}/SparseMatrixTest${CMAKE_EXECUTABLE_SUFFIX} ) -ADD_TEST( SparseMatrixTest_AdEllpack ${EXECUTABLE_OUTPUT_PATH}/SparseMatrixTest_AdEllpack${CMAKE_EXECUTABLE_SUFFIX} ) +# TODO: Uncomment the following when AdEllpack works +#ADD_TEST( SparseMatrixTest_AdEllpack ${EXECUTABLE_OUTPUT_PATH}/SparseMatrixTest_AdEllpack${CMAKE_EXECUTABLE_SUFFIX} ) ADD_TEST( SparseMatrixTest_BiEllpack ${EXECUTABLE_OUTPUT_PATH}/SparseMatrixTest_BiEllpack${CMAKE_EXECUTABLE_SUFFIX} ) ADD_TEST( SparseMatrixTest_ChunkedEllpack ${EXECUTABLE_OUTPUT_PATH}/SparseMatrixTest_ChunkedEllpack${CMAKE_EXECUTABLE_SUFFIX} ) ADD_TEST( SparseMatrixTest_CSR ${EXECUTABLE_OUTPUT_PATH}/SparseMatrixTest_CSR${CMAKE_EXECUTABLE_SUFFIX} ) diff --git a/src/UnitTests/Matrices/DenseMatrixTest.h b/src/UnitTests/Matrices/DenseMatrixTest.h index e8279e575e7c0ab09ef3a705759e7f1c1d522c64..8d9e9c727a0d88c40b90f0623d8b2ec8808e3f95 100644 --- a/src/UnitTests/Matrices/DenseMatrixTest.h +++ b/src/UnitTests/Matrices/DenseMatrixTest.h @@ -8,70 +8,6 @@ /* See Copyright Notice in tnl/Copyright */ -// TODO -/* - * getType() ::HOW? How to test this for each format? edit string how? - * MISTAKE! found it for Cuda instead of Devices::Cuda. Incorrect String in src/TNL/Devices/Cuda.cpp - * getTypeVirtual() ::TEST? This just calls getType(). - * getSerializationType() ::TEST? This just calls getType(). - * getSerializationTypeVirtual() ::TEST? This just calls getSerializationType(). - * setDimensions() ::DONE - * setLike() ::DONE - * setCompressedRowLengths() ::NOT IMPLEMENTED! The function body is empty. - * getRowLength() ::DONE - * getRowLengthFast() ::TEST? How to test __cuda_callable__? ONLY TEST ON CPU FOR NOW - * getMaxRowLength() ::TEST? This function is identical to getRowLength(). - * getNumberOfMatrixElements() ::DONE - * getNumberOfNonZeroMatrixElements() ::DONE - * reset() ::DONE - * setValue() ::DONE - * operator() ::TEST? How to test __cuda_callable__? ONLY TEST ON CPU FOR NOW - * const operator() ::TEST? How to test __cuda_callable__? ONLY TEST ON CPU FOR NOW - * setElementFast() ::TEST? How to test __cuda_callable__? ONLY TEST ON CPU FOR NOW - * setElement() ::DONE ; USED! in any test with individual value assignment. - * addElementFast() ::TEST? How to test __cuda_callable__? ONLY TEST ON CPU FOR NOW - * addElement() ::DONE - * setRowFast() ::TEST? How to test __cuda_callable__? ONLY TEST ON CPU FOR NOW - * setRow() ::DONE - * MISTAKE! This function unlike the setRow() for CSR, doesn't replace all the elements of a row, it only replaces the elements it has values for in its arrays. - * addRowFast() ::TEST? How to test __cuda_callable__? ONLY TEST ON CPU FOR NOW - * addRow() ::DONE - * getElementFast() ::TEST? How to test __cuda_callable__? ONLY TEST ON CPU FOR NOW - * getElement() ::USED! in any test with individual value reading. - * getRowFast() ::TEST? How to test __cuda_callable__? ONLY TEST ON CPU FOR NOW - * getRow() ::TEST? How to test __cuda_callable__? ONLY TEST ON CPU FOR NOW - * const getRow() ::TEST? How to test __cuda_callable__? ONLY TEST ON CPU FOR NOW - * MatrixRow getRow() ::TEST? How to test __cuda_callable__? ONLY TEST ON CPU FOR NOW - * ConstMatrixRow getRow() ::TEST? How to test __cuda_callable__? ONLY TEST ON CPU FOR NOW - * rowVectorProduct() ::TEST? How to test __cuda_callable__? ONLY TEST ON CPU FOR NOW - * vectorProduct() ::DONE - * This used to throw illegal memory access, but instead of using ints for vectors, using Types, helped. - * addMatrix() ::DONE - * DenseMatrixProductKernel() ::HOW? How to test __global__? - * getMatrixProdut() ::HOW? It won't build: When testing CPU: no parameters match function DenseMatrixProductKernel(); when testing GPU: identifier tnlCudaMin is undefined. - * DenseTranspositionAlignedKernel() ::HOW? How to test __global__? - * DenseTranspositionNonAlignedKernel() ::HOW? How to test __global__? - * getTransposition() ::HOW? It won't build when testing CPU: no parameters match functions DenseTranspositionAlignedKernel() and DenseTranspositionNonAlignedKernel(). On GPU if will throw terminate and (core dumped). - * MISTAKE! For GPU it works completely fine, when rows == cols. Otherwise it throws assertion failed. - * performSORIteration() ::HOW? Throws segmentation fault CUDA. - * operator=() ::HOW? What is this supposed to enable? Overloading operators? - * save( String& fileName ) ::DONE - * load( String& fileName ) ::DONE - * save( File& file) ::USED! In save( String& fileName ) - * load( File& file ) ::USED! In load( String& fileName ) - * print() ::DONE - * getElementIndex() ::TEST? How to test __cuda_callable__? ONLY TEST ON CPU FOR NOW - */ - -// GENERAL TODO -/* - * Template tests for all formats. - * Figure out __cuda_callable_. When trying to call __cuda_callable__ functions - * a segmentation fault (core dumped) is thrown. - * ==>__cuda_callable__ works only for CPU at the moment. (for loops vs thread kernel assignment) - */ - - #include #include #include @@ -105,17 +41,14 @@ void host_test_GetType() EXPECT_EQ( mtrxHostInt.getType(), TNL::String( "Matrices::Dense< int, Devices::Host, int >" ) ); } -// QUESITON: Cant these two functions be combined into one? Because if no CUDA is present and we were to call -// CUDA into the function in the TEST, to be tested, then we could have a problem. - template< typename MatrixCudaFloat, typename MatrixCudaInt > void cuda_test_GetType() { MatrixCudaFloat mtrxCudaFloat; MatrixCudaInt mtrxCudaInt; - EXPECT_EQ( mtrxCudaFloat.getType(), TNL::String( "Matrices::Dense< float, Cuda, int >" ) ); // This is mistakenly labeled in /src/TNL/Devices/Cuda.cpp - EXPECT_EQ( mtrxCudaInt.getType(), TNL::String( "Matrices::Dense< int, Cuda, int >" ) ); // Should be Devices::Cuda + EXPECT_EQ( mtrxCudaFloat.getType(), TNL::String( "Matrices::Dense< float, Devices::Cuda, int >" ) ); + EXPECT_EQ( mtrxCudaInt.getType(), TNL::String( "Matrices::Dense< int, Devices::Cuda, int >" ) ); } template< typename Matrix > @@ -832,7 +765,7 @@ void test_AddMatrix() const IndexType rows = 5; const IndexType cols = 4; - Matrix m; // We need this matrix to preserve the values for EXPECT_EQ statements comparing the actual operation; + Matrix m; m.reset(); m.setDimensions( rows, cols ); @@ -1096,88 +1029,6 @@ void test_GetTransposition() EXPECT_EQ( mTransposed.getElement( 1, 0 ), 2 ); EXPECT_EQ( mTransposed.getElement( 1, 1 ), 4 ); EXPECT_EQ( mTransposed.getElement( 1, 2 ), 6 ); - -/* - * Sets up the following 5x5 dense matrix: - * - * / 1 2 3 4 5 \ - * | 6 7 8 9 10 | - * | 11 12 13 14 15 | - * | 16 17 18 19 20 | - * \ 21 22 23 24 25 / - */ - // const IndexType rows = 5; - // const IndexType cols = 5; - // - // Matrix m; - // m.reset(); - // m.setDimensions( rows, cols ); - // - // RealType value = 1; - // for( IndexType i = 0; i < rows; i++ ) - // for( IndexType j = 0; j < cols; j++) - // m.setElement( i, j, value++ ); - -/* - * Sets up the following 5x5 dense matrix: - * - * / 2 12 22 32 42 \ - * | 4 14 24 34 44 | - * | 6 16 26 36 46 | - * | 8 18 28 38 48 | - * \ 10 20 30 40 50 / - */ - // const IndexType resultRows = cols; - // const IndexType resultCols = rows; - // - // Matrix mResult; - // mResult.reset(); - // mResult.setDimensions( resultRows, resultCols ); - // mResult.setValue( 0 ); - // - // RealType matrixMultiplicator = 2; - // - // mResult.getTransposition( m, matrixMultiplicator ); - -/* - * Should result in the following 5x5 resulting dense matrix: - * - * / 0 0 0 0 0 \ - * | 0 0 0 0 0 | - * | 0 0 0 0 0 | - * | 0 0 0 0 0 | - * \ 0 0 0 0 0 / - */ - // - // EXPECT_EQ( mResult.getElement( 0, 0 ), 2 ); - // EXPECT_EQ( mResult.getElement( 0, 1 ), 12 ); - // EXPECT_EQ( mResult.getElement( 0, 2 ), 22 ); - // EXPECT_EQ( mResult.getElement( 0, 3 ), 32 ); - // EXPECT_EQ( mResult.getElement( 0, 4 ), 42 ); - // - // EXPECT_EQ( mResult.getElement( 1, 0 ), 4 ); - // EXPECT_EQ( mResult.getElement( 1, 1 ), 14 ); - // EXPECT_EQ( mResult.getElement( 1, 2 ), 24 ); - // EXPECT_EQ( mResult.getElement( 1, 3 ), 34 ); - // EXPECT_EQ( mResult.getElement( 1, 4 ), 44 ); - // - // EXPECT_EQ( mResult.getElement( 2, 0 ), 6 ); - // EXPECT_EQ( mResult.getElement( 2, 1 ), 16 ); - // EXPECT_EQ( mResult.getElement( 2, 2 ), 26 ); - // EXPECT_EQ( mResult.getElement( 2, 3 ), 36 ); - // EXPECT_EQ( mResult.getElement( 2, 4 ), 46 ); - // - // EXPECT_EQ( mResult.getElement( 3, 0 ), 8 ); - // EXPECT_EQ( mResult.getElement( 3, 1 ), 18 ); - // EXPECT_EQ( mResult.getElement( 3, 2 ), 28 ); - // EXPECT_EQ( mResult.getElement( 3, 3 ), 38 ); - // EXPECT_EQ( mResult.getElement( 3, 4 ), 48 ); - // - // EXPECT_EQ( mResult.getElement( 4, 0 ), 10 ); - // EXPECT_EQ( mResult.getElement( 4, 1 ), 20 ); - // EXPECT_EQ( mResult.getElement( 4, 2 ), 30 ); - // EXPECT_EQ( mResult.getElement( 4, 3 ), 40 ); - // EXPECT_EQ( mResult.getElement( 4, 4 ), 50 ); } @@ -1361,12 +1212,10 @@ void test_Print() for( IndexType j = 0; j < cols; j++) m.setElement( i, j, value++ ); - // This is from: https://stackoverflow.com/questions/5193173/getting-cout-output-to-a-stdstring #include std::stringstream printed; std::stringstream couted; - // This is from: https://stackoverflow.com/questions/19485536/redirect-output-of-an-function-printing-to-console-to-string //change the underlying buffer and save the old buffer auto old_buf = std::cout.rdbuf(printed.rdbuf()); @@ -1374,7 +1223,6 @@ void test_Print() std::cout.rdbuf(old_buf); //reset - //printed << printed.str() << std::endl; couted << "Row: 0 -> Col:0->1 Col:1->2 Col:2->3 Col:3->4\t\n" "Row: 1 -> Col:0->5 Col:1->6 Col:2->7 Col:3->8\t\n" "Row: 2 -> Col:0->9 Col:1->10 Col:2->11 Col:3->12\t\n" @@ -1546,7 +1394,6 @@ TYPED_TEST( MatrixTest, printTest ) TEST( DenseMatrixTest, Dense_getMatrixProductTest_Host ) { -// test_GetMatrixProduct< Dense_host_int >(); bool testRan = false; EXPECT_TRUE( testRan ); std::cout << "\nTEST DID NOT RUN. NOT WORKING.\n\n"; @@ -1563,7 +1410,6 @@ TEST( DenseMatrixTest, Dense_getMatrixProductTest_Host ) #ifdef HAVE_CUDA TEST( DenseMatrixTest, Dense_getMatrixProductTest_Cuda ) { -// test_GetMatrixProduct< Dense_cuda_int >(); bool testRan = false; EXPECT_TRUE( testRan ); std::cout << "\nTEST DID NOT RUN. NOT WORKING.\n\n"; diff --git a/src/UnitTests/Matrices/SparseMatrixTest.h b/src/UnitTests/Matrices/SparseMatrixTest.h index c3716c116fedbcb3ba5f90a48039298d06c5b09c..5baeb42791a526731277adfaa20715a533ab956c 100644 --- a/src/UnitTests/Matrices/SparseMatrixTest.h +++ b/src/UnitTests/Matrices/SparseMatrixTest.h @@ -22,20 +22,6 @@ using CSR_host_int = TNL::Matrices::CSR< int, TNL::Devices::Host, int >; using CSR_cuda_float = TNL::Matrices::CSR< float, TNL::Devices::Cuda, int >; using CSR_cuda_int = TNL::Matrices::CSR< int, TNL::Devices::Cuda, int >; -//// test_getType is not general enough yet. DO NOT TEST IT YET. - -//TEST( SparseMatrixTest, CSR_GetTypeTest_Host ) -//{ -// host_test_GetType< CSR_host_float, CSR_host_int >(); -//} -// -//#ifdef HAVE_CUDA -//TEST( SparseMatrixTest, CSR_GetTypeTest_Cuda ) -//{ -// cuda_test_GetType< CSR_cuda_float, CSR_cuda_int >(); -//} -//#endif - TEST( SparseMatrixTest, CSR_perforSORIterationTest_Host ) { test_PerformSORIteration< CSR_host_float >(); diff --git a/src/UnitTests/Matrices/SparseMatrixTest.hpp b/src/UnitTests/Matrices/SparseMatrixTest.hpp index 03b80259d502cf43eb05f7c6b14053aa2e4ed7d7..ef5b28d240a65c5e26eb987c42b76688c59a8d87 100644 --- a/src/UnitTests/Matrices/SparseMatrixTest.hpp +++ b/src/UnitTests/Matrices/SparseMatrixTest.hpp @@ -8,87 +8,35 @@ /* See Copyright Notice in tnl/Copyright */ -// TODO -/* - * setDimensions() ::DONE - * setCompressedRowLengths() ::DONE - * getRowLength() ::USED! In test_SetCompressedRowLengths() to verify the test itself. - * getRowLengthFast() ::TEST? How to test __cuda_callable__? ONLY TEST ON CPU FOR NOW - * setLike() ::DONE - * reset() ::DONE - * setElementFast() ::TEST? How to test __cuda_callable__? ONLY TEST ON CPU FOR NOW - * setElement() ::DONE - * addElementFast() ::TEST? How to test __cuda_callable__? ONLY TEST ON CPU FOR NOW - * addElement() ::DONE - * setRowFast() ::TEST? How to test __cuda_callable__? ONLY TEST ON CPU FOR NOW - * setRow() ::DONE - * MISTAKE!!! In SlicedEllpack: addElement(), line 263, "column <= this->rows" shouldn't it be: "column <= this->columns", otherwise test_SetRow causes the assertion to fail. - * addRowFast() ::TEST? How to test __cuda_callable__? ONLY TEST ON CPU FOR NOW - * addRow() ::NOT IMPLEMENTED! This calls addRowFast() which isn't implemented. Implement? Is it supposed to add an extra row to the matrix or add elements of a row to another row in the matrix? - * getElementFast() ::TEST? How to test __cuda_callable__? ONLY TEST ON CPU FOR NOW - * getElement() ::USED! In test_SetElement(), test_AddElement() and test_setRow() to verify the test itself. - * getRowFast() ::TEST? How to test __cuda_callable__? ONLY TEST ON CPU FOR NOW - * MatrixRow getRow() ::TEST? How to test __cuda_callable__? ONLY TEST ON CPU FOR NOW - * ConstMatrixRow getRow() ::TEST? How to test __cuda_callable__? ONLY TEST ON CPU FOR NOW - * rowVectorProduct() ::TEST? How to test __cuda_callable__? ONLY TEST ON CPU FOR NOW - * vectorProduct() ::DONE - * This used to throw illegal memory access, but instead of using ints for vectors, using Types, helped. - * addMatrix() ::NOT IMPLEMENTED! - * getTransposition() ::NOT IMPLMENETED! - * performSORIteration() ::HOW? Throws segmentation fault CUDA. - * operator=() ::HOW? What is this supposed to enable? Overloading operators? - * save( File& file) ::USED! In save( String& fileName ) - * load( File& file ) ::USED! In load( String& fileName ) - * save( String& fileName ) ::DONE - * load( String& fileName ) ::DONE - * print() ::DONE - * setCudaKernelType() ::NOT SUPPOSED TO TEST! via notes from 1.11.2018 supervisor meeting. - * getCudaKernelType() ::NOT SUPPOSED TO TEST! via notes from 1.11.2018 supervisor meeting. - * setCudaWarpSize() ::NOT SUPPOSED TO TEST! via notes from 1.11.2018 supervisor meeting. - * getCudaWarpSize() ::NOT SUPPOSED TO TEST! via notes from 1.11.2018 supervisor meeting. - * setHybridModeSplit() ::NOT SUPPOSED TO TEST! via notes from 1.11.2018 supervisor meeting. - * getHybridModeSplit() ::NOT SUPPOSED TO TEST! via notes from 1.11.2018 supervisor meeting. - * spmvCudaVectorized() ::TEST? How to test __device__? - * vectorProductCuda() ::TEST? How to test __device__? - */ - -// GENERAL TODO -/* - * For every function, EXPECT_EQ needs to be done, even for zeros in matrices. - * Figure out __cuda_callable_. When trying to call __cuda_callable__ functions - * a segmentation fault (core dumped) is thrown. - * ==>__cuda_callable__ works only for CPU at the moment. (for loops vs thread kernel assignment). - * If we want to use __cuda_callable__ on the GPU, we need to call it as a kernel. - */ - #include #include #include #include +// Temporary, until test_OperatorEquals doesn't work for all formats. +#include +#include +#include + #ifdef HAVE_GTEST #include template< typename MatrixHostFloat, typename MatrixHostInt > void host_test_GetType() { - MatrixHostFloat mtrxHostFloat; - MatrixHostInt mtrxHostInt; - - - EXPECT_EQ( mtrxHostFloat.getType(), TNL::String( "Matrices::CSR< float, Devices::Host >" ) ); - EXPECT_EQ( mtrxHostInt.getType(), TNL::String( "Matrices::CSR< int, Devices::Host >" ) ); + bool testRan = false; + EXPECT_TRUE( testRan ); + std::cout << "\nTEST DID NOT RUN. NOT WORKING.\n\n"; + std::cerr << "This test has not been implemented properly yet.\n" << std::endl; } template< typename MatrixCudaFloat, typename MatrixCudaInt > void cuda_test_GetType() { - MatrixCudaFloat mtrxCudaFloat; - MatrixCudaInt mtrxCudaInt; - - - EXPECT_EQ( mtrxCudaFloat.getType(), TNL::String( "Matrices::CSR< float, Cuda >" ) ); // This is mistakenly labeled in /src/TNL/Devices/Cuda.cpp - EXPECT_EQ( mtrxCudaInt.getType(), TNL::String( "Matrices::CSR< int, Cuda >" ) ); // Should be Devices::Cuda + bool testRan = false; + EXPECT_TRUE( testRan ); + std::cout << "\nTEST DID NOT RUN. NOT WORKING.\n\n"; + std::cerr << "This test has not been implemented properly yet.\n" << std::endl; } template< typename Matrix > @@ -175,92 +123,6 @@ void test_SetCompressedRowLengths() EXPECT_EQ( m.getNonZeroRowLength( 7 ), 6 ); EXPECT_EQ( m.getNonZeroRowLength( 8 ), 7 ); EXPECT_EQ( m.getNonZeroRowLength( 9 ), 8 ); - -// if( m.getType() == TNL::String( TNL::String( "Matrices::CSR< ") + -// TNL::String( TNL::getType< RealType >() ) + -// TNL::String( ", " ) + -// TNL::String( Matrix::DeviceType::getDeviceType() ) + -// //TNL::String( ", " ) + -// //TNL::String( TNL::getType< IndexType >() ) + -// TNL::String( " >" ) ) -// ) -// { -// EXPECT_EQ( m.getRowLength( 0 ), 3 ); -// EXPECT_EQ( m.getRowLength( 1 ), 3 ); -// EXPECT_EQ( m.getRowLength( 2 ), 1 ); -// EXPECT_EQ( m.getRowLength( 3 ), 2 ); -// EXPECT_EQ( m.getRowLength( 4 ), 3 ); -// EXPECT_EQ( m.getRowLength( 5 ), 4 ); -// EXPECT_EQ( m.getRowLength( 6 ), 5 ); -// EXPECT_EQ( m.getRowLength( 7 ), 6 ); -// EXPECT_EQ( m.getRowLength( 8 ), 7 ); -// EXPECT_EQ( m.getRowLength( 9 ), 8 ); -// } -// else if( m.getType() == TNL::String( TNL::String( "Matrices::AdEllpack< ") + -// TNL::String( TNL::getType< RealType >() ) + -// TNL::String( ", " ) + -// TNL::String( Matrix::DeviceType::getDeviceType() ) + -// TNL::String( ", " ) + -// TNL::String( TNL::getType< IndexType >() ) + -// TNL::String( " >" ) ) -// || -// m.getType() == TNL::String( TNL::String( "Matrices::SlicedEllpack< ") + -// TNL::String( TNL::getType< RealType >() ) + -// TNL::String( ", " ) + -// TNL::String( Matrix::DeviceType::getDeviceType() ) + -// TNL::String( " >" ) ) -// ) -// { -// EXPECT_EQ( m.getRowLength( 0 ), 8 ); -// EXPECT_EQ( m.getRowLength( 1 ), 8 ); -// EXPECT_EQ( m.getRowLength( 2 ), 8 ); -// EXPECT_EQ( m.getRowLength( 3 ), 8 ); -// EXPECT_EQ( m.getRowLength( 4 ), 8 ); -// EXPECT_EQ( m.getRowLength( 5 ), 8 ); -// EXPECT_EQ( m.getRowLength( 6 ), 8 ); -// EXPECT_EQ( m.getRowLength( 7 ), 8 ); -// EXPECT_EQ( m.getRowLength( 8 ), 8 ); -// EXPECT_EQ( m.getRowLength( 9 ), 8 ); -// } -// else if( m.getType() == TNL::String( TNL::String( "Matrices::Ellpack< ") + -// TNL::String( TNL::getType< RealType >() ) + -// TNL::String( ", " ) + -// TNL::String( Matrix::DeviceType::getDeviceType() ) + -// TNL::String( ", " ) + -// TNL::String( TNL::getType< IndexType >() ) + -// TNL::String( " >" ) ) -// || -// m.getType() == TNL::String( TNL::String( "Matrices::ChunkedEllpack< ") + -// TNL::String( TNL::getType< RealType >() ) + -// TNL::String( ", " ) + -// TNL::String( Matrix::DeviceType::getDeviceType() ) + -// TNL::String( " >" ) ) -// ) -// { -// EXPECT_EQ( m.getNonZeroRowLength( 0 ), 3 ); -// EXPECT_EQ( m.getNonZeroRowLength( 1 ), 3 ); -// EXPECT_EQ( m.getNonZeroRowLength( 2 ), 1 ); -// EXPECT_EQ( m.getNonZeroRowLength( 3 ), 2 ); -// EXPECT_EQ( m.getNonZeroRowLength( 4 ), 3 ); -// EXPECT_EQ( m.getNonZeroRowLength( 5 ), 4 ); -// EXPECT_EQ( m.getNonZeroRowLength( 6 ), 5 ); -// EXPECT_EQ( m.getNonZeroRowLength( 7 ), 6 ); -// EXPECT_EQ( m.getNonZeroRowLength( 8 ), 7 ); -// EXPECT_EQ( m.getNonZeroRowLength( 9 ), 8 ); -// } -// else -// { -// EXPECT_EQ( m.getRowLength( 0 ), 3 ); -// EXPECT_EQ( m.getRowLength( 1 ), 3 ); -// EXPECT_EQ( m.getRowLength( 2 ), 1 ); -// EXPECT_EQ( m.getRowLength( 3 ), 2 ); -// EXPECT_EQ( m.getRowLength( 4 ), 3 ); -// EXPECT_EQ( m.getRowLength( 5 ), 4 ); -// EXPECT_EQ( m.getRowLength( 6 ), 5 ); -// EXPECT_EQ( m.getRowLength( 7 ), 6 ); -// EXPECT_EQ( m.getRowLength( 8 ), 7 ); -// EXPECT_EQ( m.getRowLength( 9 ), 8 ); -// } } template< typename Matrix1, typename Matrix2 > @@ -326,60 +188,173 @@ void test_SetElement() using IndexType = typename Matrix::IndexType; /* - * Sets up the following 5x5 sparse matrix: + * Sets up the following 10x10 sparse matrix: * - * / 1 0 0 0 0 \ - * | 0 2 0 0 0 | - * | 0 0 3 0 0 | - * | 0 0 0 4 0 | - * \ 0 0 0 0 5 / + * / 1 0 2 0 3 0 4 0 0 0 \ + * | 5 6 7 0 0 0 0 0 0 0 | + * | 8 9 10 11 12 13 14 15 0 0 | + * | 16 17 0 0 0 0 0 0 0 0 | + * | 18 0 0 0 0 0 0 0 0 0 | + * | 19 0 0 0 0 0 0 0 0 0 | + * | 20 0 0 0 0 0 0 0 0 0 | + * | 21 0 0 0 0 0 0 0 0 0 | + * | 22 23 24 25 26 27 28 29 30 31 | + * \ 32 33 34 35 36 37 38 39 40 41 / */ - const IndexType rows = 5; - const IndexType cols = 5; + const IndexType rows = 10; + const IndexType cols = 10; Matrix m; m.reset(); + m.setDimensions( rows, cols ); + typename Matrix::CompressedRowLengthsVector rowLengths; rowLengths.setSize( rows ); - rowLengths.setValue( 1 ); - m.setCompressedRowLengths( rowLengths ); + rowLengths.setElement( 0, 4 ); + rowLengths.setElement( 1, 3 ); + rowLengths.setElement( 2, 8 ); + rowLengths.setElement( 3, 2 ); + for( IndexType i = 4; i < rows - 2; i++ ) + { + rowLengths.setElement( i, 1 ); + } + rowLengths.setElement( 8, 10 ); + rowLengths.setElement( 9, 10 ); + m.setCompressedRowLengths( rowLengths ); RealType value = 1; - for( IndexType i = 0; i < rows; i++ ) - m.setElement( i, i, value++ ); - - - EXPECT_EQ( m.getElement( 0, 0 ), 1 ); - EXPECT_EQ( m.getElement( 0, 1 ), 0 ); - EXPECT_EQ( m.getElement( 0, 2 ), 0 ); - EXPECT_EQ( m.getElement( 0, 3 ), 0 ); - EXPECT_EQ( m.getElement( 0, 4 ), 0 ); - - EXPECT_EQ( m.getElement( 1, 0 ), 0 ); - EXPECT_EQ( m.getElement( 1, 1 ), 2 ); - EXPECT_EQ( m.getElement( 1, 2 ), 0 ); - EXPECT_EQ( m.getElement( 1, 3 ), 0 ); - EXPECT_EQ( m.getElement( 1, 4 ), 0 ); - - EXPECT_EQ( m.getElement( 2, 0 ), 0 ); - EXPECT_EQ( m.getElement( 2, 1 ), 0 ); - EXPECT_EQ( m.getElement( 2, 2 ), 3 ); - EXPECT_EQ( m.getElement( 2, 3 ), 0 ); - EXPECT_EQ( m.getElement( 2, 4 ), 0 ); - - EXPECT_EQ( m.getElement( 3, 0 ), 0 ); - EXPECT_EQ( m.getElement( 3, 1 ), 0 ); - EXPECT_EQ( m.getElement( 3, 2 ), 0 ); - EXPECT_EQ( m.getElement( 3, 3 ), 4 ); - EXPECT_EQ( m.getElement( 3, 4 ), 0 ); - - EXPECT_EQ( m.getElement( 4, 0 ), 0 ); - EXPECT_EQ( m.getElement( 4, 1 ), 0 ); - EXPECT_EQ( m.getElement( 4, 2 ), 0 ); - EXPECT_EQ( m.getElement( 4, 3 ), 0 ); - EXPECT_EQ( m.getElement( 4, 4 ), 5 ); + for( IndexType i = 0; i < 4; i++ ) + m.setElement( 0, 2 * i, value++ ); + + for( IndexType i = 0; i < 3; i++ ) + m.setElement( 1, i, value++ ); + + for( IndexType i = 0; i < 8; i++ ) + m.setElement( 2, i, value++ ); + + for( IndexType i = 0; i < 2; i++ ) + m.setElement( 3, i, value++ ); + + for( IndexType i = 4; i < 8; i++ ) + m.setElement( i, 0, value++ ); + + for( IndexType j = 8; j < rows; j++) + { + for( IndexType i = 0; i < cols; i++ ) + m.setElement( j, i, value++ ); + } + + EXPECT_EQ( m.getElement( 0, 0 ), 1 ); + EXPECT_EQ( m.getElement( 0, 1 ), 0 ); + EXPECT_EQ( m.getElement( 0, 2 ), 2 ); + EXPECT_EQ( m.getElement( 0, 3 ), 0 ); + EXPECT_EQ( m.getElement( 0, 4 ), 3 ); + EXPECT_EQ( m.getElement( 0, 5 ), 0 ); + EXPECT_EQ( m.getElement( 0, 6 ), 4 ); + EXPECT_EQ( m.getElement( 0, 7 ), 0 ); + EXPECT_EQ( m.getElement( 0, 8 ), 0 ); + EXPECT_EQ( m.getElement( 0, 9 ), 0 ); + + EXPECT_EQ( m.getElement( 1, 0 ), 5 ); + EXPECT_EQ( m.getElement( 1, 1 ), 6 ); + EXPECT_EQ( m.getElement( 1, 2 ), 7 ); + EXPECT_EQ( m.getElement( 1, 3 ), 0 ); + EXPECT_EQ( m.getElement( 1, 4 ), 0 ); + EXPECT_EQ( m.getElement( 1, 5 ), 0 ); + EXPECT_EQ( m.getElement( 1, 6 ), 0 ); + EXPECT_EQ( m.getElement( 1, 7 ), 0 ); + EXPECT_EQ( m.getElement( 1, 8 ), 0 ); + EXPECT_EQ( m.getElement( 1, 9 ), 0 ); + + EXPECT_EQ( m.getElement( 2, 0 ), 8 ); + EXPECT_EQ( m.getElement( 2, 1 ), 9 ); + EXPECT_EQ( m.getElement( 2, 2 ), 10 ); + EXPECT_EQ( m.getElement( 2, 3 ), 11 ); + EXPECT_EQ( m.getElement( 2, 4 ), 12 ); + EXPECT_EQ( m.getElement( 2, 5 ), 13 ); + EXPECT_EQ( m.getElement( 2, 6 ), 14 ); + EXPECT_EQ( m.getElement( 2, 7 ), 15 ); + EXPECT_EQ( m.getElement( 2, 8 ), 0 ); + EXPECT_EQ( m.getElement( 2, 9 ), 0 ); + + EXPECT_EQ( m.getElement( 3, 0 ), 16 ); + EXPECT_EQ( m.getElement( 3, 1 ), 17 ); + EXPECT_EQ( m.getElement( 3, 2 ), 0 ); + EXPECT_EQ( m.getElement( 3, 3 ), 0 ); + EXPECT_EQ( m.getElement( 3, 4 ), 0 ); + EXPECT_EQ( m.getElement( 3, 5 ), 0 ); + EXPECT_EQ( m.getElement( 3, 6 ), 0 ); + EXPECT_EQ( m.getElement( 3, 7 ), 0 ); + EXPECT_EQ( m.getElement( 3, 8 ), 0 ); + EXPECT_EQ( m.getElement( 3, 9 ), 0 ); + + EXPECT_EQ( m.getElement( 4, 0 ), 18 ); + EXPECT_EQ( m.getElement( 4, 1 ), 0 ); + EXPECT_EQ( m.getElement( 4, 2 ), 0 ); + EXPECT_EQ( m.getElement( 4, 3 ), 0 ); + EXPECT_EQ( m.getElement( 4, 4 ), 0 ); + EXPECT_EQ( m.getElement( 4, 5 ), 0 ); + EXPECT_EQ( m.getElement( 4, 6 ), 0 ); + EXPECT_EQ( m.getElement( 4, 7 ), 0 ); + EXPECT_EQ( m.getElement( 4, 8 ), 0 ); + EXPECT_EQ( m.getElement( 4, 9 ), 0 ); + + EXPECT_EQ( m.getElement( 5, 0 ), 19 ); + EXPECT_EQ( m.getElement( 5, 1 ), 0 ); + EXPECT_EQ( m.getElement( 5, 2 ), 0 ); + EXPECT_EQ( m.getElement( 5, 3 ), 0 ); + EXPECT_EQ( m.getElement( 5, 4 ), 0 ); + EXPECT_EQ( m.getElement( 5, 5 ), 0 ); + EXPECT_EQ( m.getElement( 5, 6 ), 0 ); + EXPECT_EQ( m.getElement( 5, 7 ), 0 ); + EXPECT_EQ( m.getElement( 5, 8 ), 0 ); + EXPECT_EQ( m.getElement( 5, 9 ), 0 ); + + EXPECT_EQ( m.getElement( 6, 0 ), 20 ); + EXPECT_EQ( m.getElement( 6, 1 ), 0 ); + EXPECT_EQ( m.getElement( 6, 2 ), 0 ); + EXPECT_EQ( m.getElement( 6, 3 ), 0 ); + EXPECT_EQ( m.getElement( 6, 4 ), 0 ); + EXPECT_EQ( m.getElement( 6, 5 ), 0 ); + EXPECT_EQ( m.getElement( 6, 6 ), 0 ); + EXPECT_EQ( m.getElement( 6, 7 ), 0 ); + EXPECT_EQ( m.getElement( 6, 8 ), 0 ); + EXPECT_EQ( m.getElement( 6, 9 ), 0 ); + + EXPECT_EQ( m.getElement( 7, 0 ), 21 ); + EXPECT_EQ( m.getElement( 7, 1 ), 0 ); + EXPECT_EQ( m.getElement( 7, 2 ), 0 ); + EXPECT_EQ( m.getElement( 7, 3 ), 0 ); + EXPECT_EQ( m.getElement( 7, 4 ), 0 ); + EXPECT_EQ( m.getElement( 7, 5 ), 0 ); + EXPECT_EQ( m.getElement( 7, 6 ), 0 ); + EXPECT_EQ( m.getElement( 7, 7 ), 0 ); + EXPECT_EQ( m.getElement( 7, 8 ), 0 ); + EXPECT_EQ( m.getElement( 7, 9 ), 0 ); + + EXPECT_EQ( m.getElement( 8, 0 ), 22 ); + EXPECT_EQ( m.getElement( 8, 1 ), 23 ); + EXPECT_EQ( m.getElement( 8, 2 ), 24 ); + EXPECT_EQ( m.getElement( 8, 3 ), 25 ); + EXPECT_EQ( m.getElement( 8, 4 ), 26 ); + EXPECT_EQ( m.getElement( 8, 5 ), 27 ); + EXPECT_EQ( m.getElement( 8, 6 ), 28 ); + EXPECT_EQ( m.getElement( 8, 7 ), 29 ); + EXPECT_EQ( m.getElement( 8, 8 ), 30 ); + EXPECT_EQ( m.getElement( 8, 9 ), 31 ); + + EXPECT_EQ( m.getElement( 9, 0 ), 32 ); + EXPECT_EQ( m.getElement( 9, 1 ), 33 ); + EXPECT_EQ( m.getElement( 9, 2 ), 34 ); + EXPECT_EQ( m.getElement( 9, 3 ), 35 ); + EXPECT_EQ( m.getElement( 9, 4 ), 36 ); + EXPECT_EQ( m.getElement( 9, 5 ), 37 ); + EXPECT_EQ( m.getElement( 9, 6 ), 38 ); + EXPECT_EQ( m.getElement( 9, 7 ), 39 ); + EXPECT_EQ( m.getElement( 9, 8 ), 40 ); + EXPECT_EQ( m.getElement( 9, 9 ), 41 ); } template< typename Matrix > @@ -467,6 +442,17 @@ void test_AddElement() // Add new elements to the old elements with a multiplying factor applied to the old elements. +/* + * Sets up the following 6x5 sparse matrix: + * + * / 1 2 3 0 0 \ + * | 0 4 5 6 0 | + * | 0 0 7 8 9 | + * | 10 0 0 0 0 | + * | 0 11 0 0 0 | + * \ 0 0 0 12 0 / + */ + /* * The following setup results in the following 6x5 sparse matrix: * @@ -613,64 +599,331 @@ void test_VectorProduct() using RealType = typename Matrix::RealType; using DeviceType = typename Matrix::DeviceType; using IndexType = typename Matrix::IndexType; + using VectorType = TNL::Containers::Vector< RealType, DeviceType, IndexType >; /* - * Sets up the following 5x4 sparse matrix: + * Sets up the following 4x4 sparse matrix: + * + * / 1 0 0 0 \ + * | 0 2 0 3 | + * | 0 4 0 0 | + * \ 0 0 5 0 / + */ + + const IndexType m_rows_1 = 4; + const IndexType m_cols_1 = 4; + + Matrix m_1; + m_1.reset(); + m_1.setDimensions( m_rows_1, m_cols_1 ); + typename Matrix::CompressedRowLengthsVector rowLengths_1; + rowLengths_1.setSize( m_rows_1 ); + rowLengths_1.setElement( 0, 1 ); + rowLengths_1.setElement( 1, 2 ); + rowLengths_1.setElement( 2, 1 ); + rowLengths_1.setElement( 3, 1 ); + m_1.setCompressedRowLengths( rowLengths_1 ); + + RealType value_1 = 1; + m_1.setElement( 0, 0, value_1++ ); // 0th row + + m_1.setElement( 1, 1, value_1++ ); // 1st row + m_1.setElement( 1, 3, value_1++ ); + + m_1.setElement( 2, 1, value_1++ ); // 2nd row + + m_1.setElement( 3, 2, value_1++ ); // 3rd row + + VectorType inVector_1; + inVector_1.setSize( m_cols_1 ); + for( IndexType i = 0; i < inVector_1.getSize(); i++ ) + inVector_1.setElement( i, 2 ); + + VectorType outVector_1; + outVector_1.setSize( m_rows_1 ); + for( IndexType j = 0; j < outVector_1.getSize(); j++ ) + outVector_1.setElement( j, 0 ); + + + m_1.vectorProduct( inVector_1, outVector_1 ); + + + EXPECT_EQ( outVector_1.getElement( 0 ), 2 ); + EXPECT_EQ( outVector_1.getElement( 1 ), 10 ); + EXPECT_EQ( outVector_1.getElement( 2 ), 8 ); + EXPECT_EQ( outVector_1.getElement( 3 ), 10 ); + + +/* + * Sets up the following 4x4 sparse matrix: * * / 1 2 3 0 \ * | 0 0 0 4 | * | 5 6 7 0 | - * | 0 8 9 10 | - * \ 0 0 11 12 / + * \ 0 8 0 0 / */ - const IndexType m_rows = 5; - const IndexType m_cols = 4; + const IndexType m_rows_2 = 4; + const IndexType m_cols_2 = 4; - Matrix m; - m.reset(); - m.setDimensions( m_rows, m_cols ); - typename Matrix::CompressedRowLengthsVector rowLengths; - rowLengths.setSize( m_rows ); - rowLengths.setValue( 3 ); - m.setCompressedRowLengths( rowLengths ); + Matrix m_2; + m_2.reset(); + m_2.setDimensions( m_rows_2, m_cols_2 ); + typename Matrix::CompressedRowLengthsVector rowLengths_2; + rowLengths_2.setSize( m_rows_2 ); + rowLengths_2.setValue( 3 ); + rowLengths_2.setElement( 1, 1 ); + rowLengths_2.setElement( 3, 1 ); + m_2.setCompressedRowLengths( rowLengths_2 ); - RealType value = 1; - for( IndexType i = 0; i < m_cols - 1; i++ ) // 0th row - m.setElement( 0, i, value++ ); + RealType value_2 = 1; + for( IndexType i = 0; i < 3; i++ ) // 0th row + m_2.setElement( 0, i, value_2++ ); - m.setElement( 1, 3, value++ ); // 1st row + m_2.setElement( 1, 3, value_2++ ); // 1st row - for( IndexType i = 0; i < m_cols - 1; i++ ) // 2nd row - m.setElement( 2, i, value++ ); + for( IndexType i = 0; i < 3; i++ ) // 2nd row + m_2.setElement( 2, i, value_2++ ); - for( IndexType i = 1; i < m_cols; i++ ) // 3rd row - m.setElement( 3, i, value++ ); - - for( IndexType i = 2; i < m_cols; i++ ) // 4th row - m.setElement( 4, i, value++ ); + for( IndexType i = 1; i < 2; i++ ) // 3rd row + m_2.setElement( 3, i, value_2++ ); + + VectorType inVector_2; + inVector_2.setSize( m_cols_2 ); + for( IndexType i = 0; i < inVector_2.getSize(); i++ ) + inVector_2.setElement( i, 2 ); - using VectorType = TNL::Containers::Vector< RealType, DeviceType, IndexType >; + VectorType outVector_2; + outVector_2.setSize( m_rows_2 ); + for( IndexType j = 0; j < outVector_2.getSize(); j++ ) + outVector_2.setElement( j, 0 ); + + + m_2.vectorProduct( inVector_2, outVector_2 ); + + + EXPECT_EQ( outVector_2.getElement( 0 ), 12 ); + EXPECT_EQ( outVector_2.getElement( 1 ), 8 ); + EXPECT_EQ( outVector_2.getElement( 2 ), 36 ); + EXPECT_EQ( outVector_2.getElement( 3 ), 16 ); - VectorType inVector; - inVector.setSize( 4 ); - for( IndexType i = 0; i < inVector.getSize(); i++ ) - inVector.setElement( i, 2 ); + +/* + * Sets up the following 4x4 sparse matrix: + * + * / 1 2 3 0 \ + * | 0 4 5 6 | + * | 7 8 9 0 | + * \ 0 10 11 12 / + */ + + const IndexType m_rows_3 = 4; + const IndexType m_cols_3 = 4; + + Matrix m_3; + m_3.reset(); + m_3.setDimensions( m_rows_3, m_cols_3 ); + typename Matrix::CompressedRowLengthsVector rowLengths_3; + rowLengths_3.setSize( m_rows_3 ); + rowLengths_3.setValue( 3 ); + m_3.setCompressedRowLengths( rowLengths_3 ); + + RealType value_3 = 1; + for( IndexType i = 0; i < 3; i++ ) // 0th row + m_3.setElement( 0, i, value_3++ ); + + for( IndexType i = 1; i < 4; i++ ) + m_3.setElement( 1, i, value_3++ ); // 1st row + + for( IndexType i = 0; i < 3; i++ ) // 2nd row + m_3.setElement( 2, i, value_3++ ); + + for( IndexType i = 1; i < 4; i++ ) // 3rd row + m_3.setElement( 3, i, value_3++ ); + + VectorType inVector_3; + inVector_3.setSize( m_cols_3 ); + for( IndexType i = 0; i < inVector_3.getSize(); i++ ) + inVector_3.setElement( i, 2 ); - VectorType outVector; - outVector.setSize( 5 ); - for( IndexType j = 0; j < outVector.getSize(); j++ ) - outVector.setElement( j, 0 ); + VectorType outVector_3; + outVector_3.setSize( m_rows_3 ); + for( IndexType j = 0; j < outVector_3.getSize(); j++ ) + outVector_3.setElement( j, 0 ); - m.vectorProduct( inVector, outVector); + m_3.vectorProduct( inVector_3, outVector_3 ); - EXPECT_EQ( outVector.getElement( 0 ), 12 ); - EXPECT_EQ( outVector.getElement( 1 ), 8 ); - EXPECT_EQ( outVector.getElement( 2 ), 36 ); - EXPECT_EQ( outVector.getElement( 3 ), 54 ); - EXPECT_EQ( outVector.getElement( 4 ), 46 ); + EXPECT_EQ( outVector_3.getElement( 0 ), 12 ); + EXPECT_EQ( outVector_3.getElement( 1 ), 30 ); + EXPECT_EQ( outVector_3.getElement( 2 ), 48 ); + EXPECT_EQ( outVector_3.getElement( 3 ), 66 ); + + +/* + * Sets up the following 8x8 sparse matrix: + * + * / 1 2 3 0 0 4 0 0 \ + * | 0 5 6 7 8 0 0 0 | + * | 9 10 11 12 13 0 0 0 | + * | 0 14 15 16 17 0 0 0 | + * | 0 0 18 19 20 21 0 0 | + * | 0 0 0 22 23 24 25 0 | + * | 26 27 28 29 30 0 0 0 | + * \ 31 32 33 34 35 0 0 0 / + */ + + const IndexType m_rows_4 = 8; + const IndexType m_cols_4 = 8; + + Matrix m_4; + m_4.reset(); + m_4.setDimensions( m_rows_4, m_cols_4 ); + typename Matrix::CompressedRowLengthsVector rowLengths_4; + rowLengths_4.setSize( m_rows_4 ); + rowLengths_4.setValue( 4 ); + rowLengths_4.setElement( 2, 5 ); + rowLengths_4.setElement( 6, 5 ); + rowLengths_4.setElement( 7, 5 ); + m_4.setCompressedRowLengths( rowLengths_4 ); + + RealType value_4 = 1; + for( IndexType i = 0; i < 3; i++ ) // 0th row + m_4.setElement( 0, i, value_4++ ); + + m_4.setElement( 0, 5, value_4++ ); + + for( IndexType i = 1; i < 5; i++ ) // 1st row + m_4.setElement( 1, i, value_4++ ); + + for( IndexType i = 0; i < 5; i++ ) // 2nd row + m_4.setElement( 2, i, value_4++ ); + + for( IndexType i = 1; i < 5; i++ ) // 3rd row + m_4.setElement( 3, i, value_4++ ); + + for( IndexType i = 2; i < 6; i++ ) // 4th row + m_4.setElement( 4, i, value_4++ ); + + for( IndexType i = 3; i < 7; i++ ) // 5th row + m_4.setElement( 5, i, value_4++ ); + + for( IndexType i = 0; i < 5; i++ ) // 6th row + m_4.setElement( 6, i, value_4++ ); + + for( IndexType i = 0; i < 5; i++ ) // 7th row + m_4.setElement( 7, i, value_4++ ); + + VectorType inVector_4; + inVector_4.setSize( m_cols_4 ); + for( IndexType i = 0; i < inVector_4.getSize(); i++ ) + inVector_4.setElement( i, 2 ); + + VectorType outVector_4; + outVector_4.setSize( m_rows_4 ); + for( IndexType j = 0; j < outVector_4.getSize(); j++ ) + outVector_4.setElement( j, 0 ); + + + m_4.vectorProduct( inVector_4, outVector_4 ); + + + EXPECT_EQ( outVector_4.getElement( 0 ), 20 ); + EXPECT_EQ( outVector_4.getElement( 1 ), 52 ); + EXPECT_EQ( outVector_4.getElement( 2 ), 110 ); + EXPECT_EQ( outVector_4.getElement( 3 ), 124 ); + EXPECT_EQ( outVector_4.getElement( 4 ), 156 ); + EXPECT_EQ( outVector_4.getElement( 5 ), 188 ); + EXPECT_EQ( outVector_4.getElement( 6 ), 280 ); + EXPECT_EQ( outVector_4.getElement( 7 ), 330 ); + + +/* + * Sets up the following 8x8 sparse matrix: + * + * / 1 2 3 0 4 5 0 1 \ 6 + * | 0 6 0 7 0 0 0 1 | 3 + * | 0 8 9 0 10 0 0 1 | 4 + * | 0 11 12 13 14 0 0 1 | 5 + * | 0 15 0 0 0 0 0 1 | 2 + * | 0 16 17 18 19 20 21 1 | 7 + * | 22 23 24 25 26 27 28 1 | 8 + * \ 29 30 31 32 33 34 35 36 / 8 + */ + + const IndexType m_rows_5 = 8; + const IndexType m_cols_5 = 8; + + Matrix m_5; + m_5.reset(); + m_5.setDimensions( m_rows_5, m_cols_5 ); + typename Matrix::CompressedRowLengthsVector rowLengths_5; + rowLengths_5.setSize( m_rows_5 ); + rowLengths_5.setElement(0, 6); + rowLengths_5.setElement(1, 3); + rowLengths_5.setElement(2, 4); + rowLengths_5.setElement(3, 5); + rowLengths_5.setElement(4, 2); + rowLengths_5.setElement(5, 7); + rowLengths_5.setElement(6, 8); + rowLengths_5.setElement(7, 8); + m_5.setCompressedRowLengths( rowLengths_5 ); + + RealType value_5 = 1; + for( IndexType i = 0; i < 3; i++ ) // 0th row + m_5.setElement( 0, i, value_5++ ); + + m_5.setElement( 0, 4, value_5++ ); // 0th row + m_5.setElement( 0, 5, value_5++ ); + + m_5.setElement( 1, 1, value_5++ ); // 1st row + m_5.setElement( 1, 3, value_5++ ); + + for( IndexType i = 1; i < 3; i++ ) // 2nd row + m_5.setElement( 2, i, value_5++ ); + + m_5.setElement( 2, 4, value_5++ ); // 2nd row + + for( IndexType i = 1; i < 5; i++ ) // 3rd row + m_5.setElement( 3, i, value_5++ ); + + m_5.setElement( 4, 1, value_5++ ); // 4th row + + for( IndexType i = 1; i < 7; i++ ) // 5th row + m_5.setElement( 5, i, value_5++ ); + + for( IndexType i = 0; i < 7; i++ ) // 6th row + m_5.setElement( 6, i, value_5++ ); + + for( IndexType i = 0; i < 8; i++ ) // 7th row + m_5.setElement( 7, i, value_5++ ); + + for( IndexType i = 0; i < 7; i++ ) // 1s at the end of rows + m_5.setElement( i, 7, 1); + + VectorType inVector_5; + inVector_5.setSize( m_cols_5 ); + for( IndexType i = 0; i < inVector_5.getSize(); i++ ) + inVector_5.setElement( i, 2 ); + + VectorType outVector_5; + outVector_5.setSize( m_rows_5 ); + for( IndexType j = 0; j < outVector_5.getSize(); j++ ) + outVector_5.setElement( j, 0 ); + + + m_5.vectorProduct( inVector_5, outVector_5 ); + + + EXPECT_EQ( outVector_5.getElement( 0 ), 32 ); + EXPECT_EQ( outVector_5.getElement( 1 ), 28 ); + EXPECT_EQ( outVector_5.getElement( 2 ), 56 ); + EXPECT_EQ( outVector_5.getElement( 3 ), 102 ); + EXPECT_EQ( outVector_5.getElement( 4 ), 32 ); + EXPECT_EQ( outVector_5.getElement( 5 ), 224 ); + EXPECT_EQ( outVector_5.getElement( 6 ), 352 ); + EXPECT_EQ( outVector_5.getElement( 7 ), 520 ); } template< typename Matrix > @@ -753,6 +1006,268 @@ void test_PerformSORIteration() EXPECT_EQ( xVector[ 3 ], 0.25 ); } +// This test is only for AdEllpack +template< typename Matrix > +void test_OperatorEquals() +{ + using RealType = typename Matrix::RealType; + using DeviceType = typename Matrix::DeviceType; + using IndexType = typename Matrix::IndexType; + + if( std::is_same< DeviceType, TNL::Devices::Cuda >::value ) + return; + else + { + using AdELL_host = TNL::Matrices::AdEllpack< RealType, TNL::Devices::Host, IndexType >; + using AdELL_cuda = TNL::Matrices::AdEllpack< RealType, TNL::Devices::Cuda, IndexType >; + + /* + * Sets up the following 8x8 sparse matrix: + * + * / 1 2 3 0 4 5 0 1 \ 6 + * | 0 6 0 7 0 0 0 1 | 3 + * | 0 8 9 0 10 0 0 1 | 4 + * | 0 11 12 13 14 0 0 1 | 5 + * | 0 15 0 0 0 0 0 1 | 2 + * | 0 16 17 18 19 20 21 1 | 7 + * | 22 23 24 25 26 27 28 1 | 8 + * \ 29 30 31 32 33 34 35 36 / 8 + */ + + const IndexType m_rows = 8; + const IndexType m_cols = 8; + + AdELL_host m_host; + + m_host.reset(); + m_host.setDimensions( m_rows, m_cols ); + typename AdELL_host::CompressedRowLengthsVector rowLengths; + rowLengths.setSize( m_rows ); + rowLengths.setElement(0, 6); + rowLengths.setElement(1, 3); + rowLengths.setElement(2, 4); + rowLengths.setElement(3, 5); + rowLengths.setElement(4, 2); + rowLengths.setElement(5, 7); + rowLengths.setElement(6, 8); + rowLengths.setElement(7, 8); + m_host.setCompressedRowLengths( rowLengths ); + + RealType value = 1; + for( IndexType i = 0; i < 3; i++ ) // 0th row + m_host.setElement( 0, i, value++ ); + + m_host.setElement( 0, 4, value++ ); // 0th row + m_host.setElement( 0, 5, value++ ); + + m_host.setElement( 1, 1, value++ ); // 1st row + m_host.setElement( 1, 3, value++ ); + + for( IndexType i = 1; i < 3; i++ ) // 2nd row + m_host.setElement( 2, i, value++ ); + + m_host.setElement( 2, 4, value++ ); // 2nd row + + + for( IndexType i = 1; i < 5; i++ ) // 3rd row + m_host.setElement( 3, i, value++ ); + + m_host.setElement( 4, 1, value++ ); // 4th row + + for( IndexType i = 1; i < 7; i++ ) // 5th row + m_host.setElement( 5, i, value++ ); + + for( IndexType i = 0; i < 7; i++ ) // 6th row + m_host.setElement( 6, i, value++ ); + + for( IndexType i = 0; i < 8; i++ ) // 7th row + m_host.setElement( 7, i, value++ ); + + for( IndexType i = 0; i < 7; i++ ) // 1s at the end or rows: 5, 6 + m_host.setElement( i, 7, 1); + + EXPECT_EQ( m_host.getElement( 0, 0 ), 1 ); + EXPECT_EQ( m_host.getElement( 0, 1 ), 2 ); + EXPECT_EQ( m_host.getElement( 0, 2 ), 3 ); + EXPECT_EQ( m_host.getElement( 0, 3 ), 0 ); + EXPECT_EQ( m_host.getElement( 0, 4 ), 4 ); + EXPECT_EQ( m_host.getElement( 0, 5 ), 5 ); + EXPECT_EQ( m_host.getElement( 0, 6 ), 0 ); + EXPECT_EQ( m_host.getElement( 0, 7 ), 1 ); + + EXPECT_EQ( m_host.getElement( 1, 0 ), 0 ); + EXPECT_EQ( m_host.getElement( 1, 1 ), 6 ); + EXPECT_EQ( m_host.getElement( 1, 2 ), 0 ); + EXPECT_EQ( m_host.getElement( 1, 3 ), 7 ); + EXPECT_EQ( m_host.getElement( 1, 4 ), 0 ); + EXPECT_EQ( m_host.getElement( 1, 5 ), 0 ); + EXPECT_EQ( m_host.getElement( 1, 6 ), 0 ); + EXPECT_EQ( m_host.getElement( 1, 7 ), 1 ); + + EXPECT_EQ( m_host.getElement( 2, 0 ), 0 ); + EXPECT_EQ( m_host.getElement( 2, 1 ), 8 ); + EXPECT_EQ( m_host.getElement( 2, 2 ), 9 ); + EXPECT_EQ( m_host.getElement( 2, 3 ), 0 ); + EXPECT_EQ( m_host.getElement( 2, 4 ), 10 ); + EXPECT_EQ( m_host.getElement( 2, 5 ), 0 ); + EXPECT_EQ( m_host.getElement( 2, 6 ), 0 ); + EXPECT_EQ( m_host.getElement( 2, 7 ), 1 ); + + EXPECT_EQ( m_host.getElement( 3, 0 ), 0 ); + EXPECT_EQ( m_host.getElement( 3, 1 ), 11 ); + EXPECT_EQ( m_host.getElement( 3, 2 ), 12 ); + EXPECT_EQ( m_host.getElement( 3, 3 ), 13 ); + EXPECT_EQ( m_host.getElement( 3, 4 ), 14 ); + EXPECT_EQ( m_host.getElement( 3, 5 ), 0 ); + EXPECT_EQ( m_host.getElement( 3, 6 ), 0 ); + EXPECT_EQ( m_host.getElement( 3, 7 ), 1 ); + + EXPECT_EQ( m_host.getElement( 4, 0 ), 0 ); + EXPECT_EQ( m_host.getElement( 4, 1 ), 15 ); + EXPECT_EQ( m_host.getElement( 4, 2 ), 0 ); + EXPECT_EQ( m_host.getElement( 4, 3 ), 0 ); + EXPECT_EQ( m_host.getElement( 4, 4 ), 0 ); + EXPECT_EQ( m_host.getElement( 4, 5 ), 0 ); + EXPECT_EQ( m_host.getElement( 4, 6 ), 0 ); + EXPECT_EQ( m_host.getElement( 4, 7 ), 1 ); + + EXPECT_EQ( m_host.getElement( 5, 0 ), 0 ); + EXPECT_EQ( m_host.getElement( 5, 1 ), 16 ); + EXPECT_EQ( m_host.getElement( 5, 2 ), 17 ); + EXPECT_EQ( m_host.getElement( 5, 3 ), 18 ); + EXPECT_EQ( m_host.getElement( 5, 4 ), 19 ); + EXPECT_EQ( m_host.getElement( 5, 5 ), 20 ); + EXPECT_EQ( m_host.getElement( 5, 6 ), 21 ); + EXPECT_EQ( m_host.getElement( 5, 7 ), 1 ); + + EXPECT_EQ( m_host.getElement( 6, 0 ), 22 ); + EXPECT_EQ( m_host.getElement( 6, 1 ), 23 ); + EXPECT_EQ( m_host.getElement( 6, 2 ), 24 ); + EXPECT_EQ( m_host.getElement( 6, 3 ), 25 ); + EXPECT_EQ( m_host.getElement( 6, 4 ), 26 ); + EXPECT_EQ( m_host.getElement( 6, 5 ), 27 ); + EXPECT_EQ( m_host.getElement( 6, 6 ), 28 ); + EXPECT_EQ( m_host.getElement( 6, 7 ), 1 ); + + EXPECT_EQ( m_host.getElement( 7, 0 ), 29 ); + EXPECT_EQ( m_host.getElement( 7, 1 ), 30 ); + EXPECT_EQ( m_host.getElement( 7, 2 ), 31 ); + EXPECT_EQ( m_host.getElement( 7, 3 ), 32 ); + EXPECT_EQ( m_host.getElement( 7, 4 ), 33 ); + EXPECT_EQ( m_host.getElement( 7, 5 ), 34 ); + EXPECT_EQ( m_host.getElement( 7, 6 ), 35 ); + EXPECT_EQ( m_host.getElement( 7, 7 ), 36 ); + + AdELL_cuda m_cuda; + + // Copy the host matrix into the cuda matrix + m_cuda = m_host; + + // Reset the host matrix + m_host.reset(); + + // Copy the cuda matrix back into the host matrix + m_host = m_cuda; + + // Check the newly created double-copy host matrix + EXPECT_EQ( m_host.getElement( 0, 0 ), 1 ); + EXPECT_EQ( m_host.getElement( 0, 1 ), 2 ); + EXPECT_EQ( m_host.getElement( 0, 2 ), 3 ); + EXPECT_EQ( m_host.getElement( 0, 3 ), 0 ); + EXPECT_EQ( m_host.getElement( 0, 4 ), 4 ); + EXPECT_EQ( m_host.getElement( 0, 5 ), 5 ); + EXPECT_EQ( m_host.getElement( 0, 6 ), 0 ); + EXPECT_EQ( m_host.getElement( 0, 7 ), 1 ); + + EXPECT_EQ( m_host.getElement( 1, 0 ), 0 ); + EXPECT_EQ( m_host.getElement( 1, 1 ), 6 ); + EXPECT_EQ( m_host.getElement( 1, 2 ), 0 ); + EXPECT_EQ( m_host.getElement( 1, 3 ), 7 ); + EXPECT_EQ( m_host.getElement( 1, 4 ), 0 ); + EXPECT_EQ( m_host.getElement( 1, 5 ), 0 ); + EXPECT_EQ( m_host.getElement( 1, 6 ), 0 ); + EXPECT_EQ( m_host.getElement( 1, 7 ), 1 ); + + EXPECT_EQ( m_host.getElement( 2, 0 ), 0 ); + EXPECT_EQ( m_host.getElement( 2, 1 ), 8 ); + EXPECT_EQ( m_host.getElement( 2, 2 ), 9 ); + EXPECT_EQ( m_host.getElement( 2, 3 ), 0 ); + EXPECT_EQ( m_host.getElement( 2, 4 ), 10 ); + EXPECT_EQ( m_host.getElement( 2, 5 ), 0 ); + EXPECT_EQ( m_host.getElement( 2, 6 ), 0 ); + EXPECT_EQ( m_host.getElement( 2, 7 ), 1 ); + + EXPECT_EQ( m_host.getElement( 3, 0 ), 0 ); + EXPECT_EQ( m_host.getElement( 3, 1 ), 11 ); + EXPECT_EQ( m_host.getElement( 3, 2 ), 12 ); + EXPECT_EQ( m_host.getElement( 3, 3 ), 13 ); + EXPECT_EQ( m_host.getElement( 3, 4 ), 14 ); + EXPECT_EQ( m_host.getElement( 3, 5 ), 0 ); + EXPECT_EQ( m_host.getElement( 3, 6 ), 0 ); + EXPECT_EQ( m_host.getElement( 3, 7 ), 1 ); + + EXPECT_EQ( m_host.getElement( 4, 0 ), 0 ); + EXPECT_EQ( m_host.getElement( 4, 1 ), 15 ); + EXPECT_EQ( m_host.getElement( 4, 2 ), 0 ); + EXPECT_EQ( m_host.getElement( 4, 3 ), 0 ); + EXPECT_EQ( m_host.getElement( 4, 4 ), 0 ); + EXPECT_EQ( m_host.getElement( 4, 5 ), 0 ); + EXPECT_EQ( m_host.getElement( 4, 6 ), 0 ); + EXPECT_EQ( m_host.getElement( 4, 7 ), 1 ); + + EXPECT_EQ( m_host.getElement( 5, 0 ), 0 ); + EXPECT_EQ( m_host.getElement( 5, 1 ), 16 ); + EXPECT_EQ( m_host.getElement( 5, 2 ), 17 ); + EXPECT_EQ( m_host.getElement( 5, 3 ), 18 ); + EXPECT_EQ( m_host.getElement( 5, 4 ), 19 ); + EXPECT_EQ( m_host.getElement( 5, 5 ), 20 ); + EXPECT_EQ( m_host.getElement( 5, 6 ), 21 ); + EXPECT_EQ( m_host.getElement( 5, 7 ), 1 ); + + EXPECT_EQ( m_host.getElement( 6, 0 ), 22 ); + EXPECT_EQ( m_host.getElement( 6, 1 ), 23 ); + EXPECT_EQ( m_host.getElement( 6, 2 ), 24 ); + EXPECT_EQ( m_host.getElement( 6, 3 ), 25 ); + EXPECT_EQ( m_host.getElement( 6, 4 ), 26 ); + EXPECT_EQ( m_host.getElement( 6, 5 ), 27 ); + EXPECT_EQ( m_host.getElement( 6, 6 ), 28 ); + EXPECT_EQ( m_host.getElement( 6, 7 ), 1 ); + + EXPECT_EQ( m_host.getElement( 7, 0 ), 29 ); + EXPECT_EQ( m_host.getElement( 7, 1 ), 30 ); + EXPECT_EQ( m_host.getElement( 7, 2 ), 31 ); + EXPECT_EQ( m_host.getElement( 7, 3 ), 32 ); + EXPECT_EQ( m_host.getElement( 7, 4 ), 33 ); + EXPECT_EQ( m_host.getElement( 7, 5 ), 34 ); + EXPECT_EQ( m_host.getElement( 7, 6 ), 35 ); + EXPECT_EQ( m_host.getElement( 7, 7 ), 36 ); + + // Try vectorProduct with copied cuda matrix to see if it works correctly. + using VectorType = TNL::Containers::Vector< RealType, TNL::Devices::Cuda, IndexType >; + + VectorType inVector; + inVector.setSize( m_cols ); + for( IndexType i = 0; i < inVector.getSize(); i++ ) + inVector.setElement( i, 2 ); + + VectorType outVector; + outVector.setSize( m_rows ); + for( IndexType j = 0; j < outVector.getSize(); j++ ) + outVector.setElement( j, 0 ); + + m_cuda.vectorProduct( inVector, outVector ); + + EXPECT_EQ( outVector.getElement( 0 ), 32 ); + EXPECT_EQ( outVector.getElement( 1 ), 28 ); + EXPECT_EQ( outVector.getElement( 2 ), 56 ); + EXPECT_EQ( outVector.getElement( 3 ), 102 ); + EXPECT_EQ( outVector.getElement( 4 ), 32 ); + EXPECT_EQ( outVector.getElement( 5 ), 224 ); + EXPECT_EQ( outVector.getElement( 6 ), 352 ); + EXPECT_EQ( outVector.getElement( 7 ), 520 ); + } +} + template< typename Matrix > void test_SaveAndLoad( const char* filename ) { @@ -893,12 +1408,10 @@ void test_Print() for( IndexType i = 2; i < m_cols; i++ ) // 4th row m.setElement( 4, i, value++ ); - // This is from: https://stackoverflow.com/questions/5193173/getting-cout-output-to-a-stdstring #include std::stringstream printed; std::stringstream couted; - // This is from: https://stackoverflow.com/questions/19485536/redirect-output-of-an-function-printing-to-console-to-string //change the underlying buffer and save the old buffer auto old_buf = std::cout.rdbuf(printed.rdbuf()); @@ -906,7 +1419,6 @@ void test_Print() std::cout.rdbuf(old_buf); //reset - //printed << printed.str() << std::endl; couted << "Row: 0 -> Col:0->1 Col:1->2 Col:2->3\t\n" "Row: 1 -> Col:3->4\t\n" "Row: 2 -> Col:0->5 Col:1->6 Col:2->7\t\n" diff --git a/src/UnitTests/Matrices/SparseMatrixTest_AdEllpack.h b/src/UnitTests/Matrices/SparseMatrixTest_AdEllpack.h index 13e2a1b6c0a0fd529ec511093f99ee4575d78237..7effb52cd864fc61c6cc27345694c00c487c0328 100644 --- a/src/UnitTests/Matrices/SparseMatrixTest_AdEllpack.h +++ b/src/UnitTests/Matrices/SparseMatrixTest_AdEllpack.h @@ -16,7 +16,6 @@ #ifdef HAVE_GTEST #include -#ifdef NOT_WORKING // test fixture for typed tests template< typename Matrix > class AdEllpackMatrixTest : public ::testing::Test @@ -39,9 +38,9 @@ using AdEllpackMatrixTypes = ::testing::Types TNL::Matrices::AdEllpack< int, TNL::Devices::Host, long >, TNL::Matrices::AdEllpack< long, TNL::Devices::Host, long >, TNL::Matrices::AdEllpack< float, TNL::Devices::Host, long >, - TNL::Matrices::AdEllpack< double, TNL::Devices::Host, long >, + TNL::Matrices::AdEllpack< double, TNL::Devices::Host, long > #ifdef HAVE_CUDA - TNL::Matrices::AdEllpack< int, TNL::Devices::Cuda, short >, + ,TNL::Matrices::AdEllpack< int, TNL::Devices::Cuda, short >, TNL::Matrices::AdEllpack< long, TNL::Devices::Cuda, short >, TNL::Matrices::AdEllpack< float, TNL::Devices::Cuda, short >, TNL::Matrices::AdEllpack< double, TNL::Devices::Cuda, short >, @@ -65,20 +64,6 @@ TYPED_TEST( AdEllpackMatrixTest, setDimensionsTest ) test_SetDimensions< AdEllpackMatrixType >(); } -TYPED_TEST( AdEllpackMatrixTest, setCompressedRowLengthsTest ) -{ -// using AdEllpackMatrixType = typename TestFixture::AdEllpackMatrixType; - -// test_SetCompressedRowLengths< AdEllpackMatrixType >(); - - bool testRan = false; - EXPECT_TRUE( testRan ); - std::cout << "\nTEST DID NOT RUN. NOT WORKING.\n\n"; - std::cout << " This test is dependent on the input format. \n"; - std::cout << " Almost every format allocates elements per row differently.\n\n"; - std::cout << "\n TODO: Finish implementation of getNonZeroRowLength (Only non-zero elements, not the number of allocated elements.)\n\n"; -} - TYPED_TEST( AdEllpackMatrixTest, setLikeTest ) { using AdEllpackMatrixType = typename TestFixture::AdEllpackMatrixType; @@ -94,7 +79,7 @@ TYPED_TEST( AdEllpackMatrixTest, resetTest ) } TYPED_TEST( AdEllpackMatrixTest, setElementTest ) -{ +{ using AdEllpackMatrixType = typename TestFixture::AdEllpackMatrixType; test_SetElement< AdEllpackMatrixType >(); @@ -121,6 +106,13 @@ TYPED_TEST( AdEllpackMatrixTest, vectorProductTest ) test_VectorProduct< AdEllpackMatrixType >(); } +TYPED_TEST( AdEllpackMatrixTest, operatorEqualsTest ) +{ + using AdEllpackMatrixType = typename TestFixture::AdEllpackMatrixType; + + test_OperatorEquals< AdEllpackMatrixType >(); +} + TYPED_TEST( AdEllpackMatrixTest, saveAndLoadTest ) { using AdEllpackMatrixType = typename TestFixture::AdEllpackMatrixType; @@ -134,6 +126,8 @@ TYPED_TEST( AdEllpackMatrixTest, printTest ) test_Print< AdEllpackMatrixType >(); } + +#ifdef NOT_WORKING #endif #endif diff --git a/src/UnitTests/Matrices/SparseMatrixTest_BiEllpack.h b/src/UnitTests/Matrices/SparseMatrixTest_BiEllpack.h index 326a87ccb76ccd450be3d539ee2a7e07458e7650..33e530be57e6675bac01f735547a79b4731b57a9 100644 --- a/src/UnitTests/Matrices/SparseMatrixTest_BiEllpack.h +++ b/src/UnitTests/Matrices/SparseMatrixTest_BiEllpack.h @@ -13,10 +13,9 @@ #include "SparseMatrixTest.hpp" #include -#ifdef HAVE_GTEST +#ifdef HAVE_GTEST #include -#ifdef NOT_WORKING // test fixture for typed tests template< typename Matrix > class BiEllpackMatrixTest : public ::testing::Test @@ -39,21 +38,21 @@ using BiEllpackMatrixTypes = ::testing::Types TNL::Matrices::BiEllpack< int, TNL::Devices::Host, long >, TNL::Matrices::BiEllpack< long, TNL::Devices::Host, long >, TNL::Matrices::BiEllpack< float, TNL::Devices::Host, long >, - TNL::Matrices::BiEllpack< double, TNL::Devices::Host, long >//, -//#ifdef HAVE_CUDA -// TNL::Matrices::BiEllpack< int, TNL::Devices::Cuda, short >, -// TNL::Matrices::BiEllpack< long, TNL::Devices::Cuda, short >, -// TNL::Matrices::BiEllpack< float, TNL::Devices::Cuda, short >, -// TNL::Matrices::BiEllpack< double, TNL::Devices::Cuda, short >, -// TNL::Matrices::BiEllpack< int, TNL::Devices::Cuda, int >, -// TNL::Matrices::BiEllpack< long, TNL::Devices::Cuda, int >, -// TNL::Matrices::BiEllpack< float, TNL::Devices::Cuda, int >, -// TNL::Matrices::BiEllpack< double, TNL::Devices::Cuda, int >, -// TNL::Matrices::BiEllpack< int, TNL::Devices::Cuda, long >, -// TNL::Matrices::BiEllpack< long, TNL::Devices::Cuda, long >, -// TNL::Matrices::BiEllpack< float, TNL::Devices::Cuda, long >, -// TNL::Matrices::BiEllpack< double, TNL::Devices::Cuda, long > -//#endif + TNL::Matrices::BiEllpack< double, TNL::Devices::Host, long > +#ifdef HAVE_CUDA + ,TNL::Matrices::BiEllpack< int, TNL::Devices::Cuda, short >, + TNL::Matrices::BiEllpack< long, TNL::Devices::Cuda, short >, + TNL::Matrices::BiEllpack< float, TNL::Devices::Cuda, short >, + TNL::Matrices::BiEllpack< double, TNL::Devices::Cuda, short >, + TNL::Matrices::BiEllpack< int, TNL::Devices::Cuda, int >, + TNL::Matrices::BiEllpack< long, TNL::Devices::Cuda, int >, + TNL::Matrices::BiEllpack< float, TNL::Devices::Cuda, int >, + TNL::Matrices::BiEllpack< double, TNL::Devices::Cuda, int >, + TNL::Matrices::BiEllpack< int, TNL::Devices::Cuda, long >, + TNL::Matrices::BiEllpack< long, TNL::Devices::Cuda, long >, + TNL::Matrices::BiEllpack< float, TNL::Devices::Cuda, long >, + TNL::Matrices::BiEllpack< double, TNL::Devices::Cuda, long > +#endif >; TYPED_TEST_SUITE( BiEllpackMatrixTest, BiEllpackMatrixTypes); @@ -61,81 +60,86 @@ TYPED_TEST_SUITE( BiEllpackMatrixTest, BiEllpackMatrixTypes); TYPED_TEST( BiEllpackMatrixTest, setDimensionsTest ) { using BiEllpackMatrixType = typename TestFixture::BiEllpackMatrixType; - + test_SetDimensions< BiEllpackMatrixType >(); } -TYPED_TEST( BiEllpackMatrixTest, setCompressedRowLengthsTest ) -{ -// using BiEllpackMatrixType = typename TestFixture::BiEllpackMatrixType; - -// test_SetCompressedRowLengths< BiEllpackMatrixType >(); - - bool testRan = false; - EXPECT_TRUE( testRan ); - std::cout << "\nTEST DID NOT RUN. NOT WORKING.\n\n"; - std::cout << " This test is dependent on the input format. \n"; - std::cout << " Almost every format allocates elements per row differently.\n\n"; - std::cout << "\n TODO: Finish implementation of getNonZeroRowLength (Only non-zero elements, not the number of allocated elements.)\n\n"; -} +//TYPED_TEST( BiEllpackMatrixTest, setCompressedRowLengthsTest ) +//{ +//// using BiEllpackMatrixType = typename TestFixture::BiEllpackMatrixType; +// +//// test_SetCompressedRowLengths< BiEllpackMatrixType >(); +// +// bool testRan = false; +// EXPECT_TRUE( testRan ); +// std::cout << "\nTEST DID NOT RUN. NOT WORKING.\n\n"; +// std::cout << " This test is dependent on the input format. \n"; +// std::cout << " Almost every format allocates elements per row differently.\n\n"; +// std::cout << "\n TODO: Finish implementation of getNonZeroRowLength (Only non-zero elements, not the number of allocated elements.)\n\n"; +//} TYPED_TEST( BiEllpackMatrixTest, setLikeTest ) { using BiEllpackMatrixType = typename TestFixture::BiEllpackMatrixType; - + test_SetLike< BiEllpackMatrixType, BiEllpackMatrixType >(); } TYPED_TEST( BiEllpackMatrixTest, resetTest ) { using BiEllpackMatrixType = typename TestFixture::BiEllpackMatrixType; - + test_Reset< BiEllpackMatrixType >(); } TYPED_TEST( BiEllpackMatrixTest, setElementTest ) { using BiEllpackMatrixType = typename TestFixture::BiEllpackMatrixType; - + test_SetElement< BiEllpackMatrixType >(); } TYPED_TEST( BiEllpackMatrixTest, addElementTest ) { using BiEllpackMatrixType = typename TestFixture::BiEllpackMatrixType; - + test_AddElement< BiEllpackMatrixType >(); } TYPED_TEST( BiEllpackMatrixTest, setRowTest ) { using BiEllpackMatrixType = typename TestFixture::BiEllpackMatrixType; - + test_SetRow< BiEllpackMatrixType >(); } TYPED_TEST( BiEllpackMatrixTest, vectorProductTest ) { using BiEllpackMatrixType = typename TestFixture::BiEllpackMatrixType; - + test_VectorProduct< BiEllpackMatrixType >(); } +//TYPED_TEST( BiEllpackMatrixTest, operatorEqualsTest ) +//{ +// using BiEllpackMatrixType = typename TestFixture::BiEllpackMatrixType; +// +// test_OperatorEquals< BiEllpackMatrixType >(); +//} + TYPED_TEST( BiEllpackMatrixTest, saveAndLoadTest ) { using BiEllpackMatrixType = typename TestFixture::BiEllpackMatrixType; - + test_SaveAndLoad< BiEllpackMatrixType >( "test_SparseMatrixTest_BiEllpack" ); } TYPED_TEST( BiEllpackMatrixTest, printTest ) { using BiEllpackMatrixType = typename TestFixture::BiEllpackMatrixType; - + test_Print< BiEllpackMatrixType >(); } -#endif - -#endif +#endif // HAVE_GTEST #include "../main.h" diff --git a/src/UnitTests/Matrices/SparseMatrixTest_CSR.h b/src/UnitTests/Matrices/SparseMatrixTest_CSR.h index d63441381300d030674f64ae49ab65b732b793fe..3530db46c18753102a09b15908fcc5d34fa66026 100644 --- a/src/UnitTests/Matrices/SparseMatrixTest_CSR.h +++ b/src/UnitTests/Matrices/SparseMatrixTest_CSR.h @@ -13,7 +13,7 @@ #include "SparseMatrixTest.hpp" #include -#ifdef HAVE_GTEST +#ifdef HAVE_GTEST #include // test fixture for typed tests @@ -40,7 +40,7 @@ using CSRMatrixTypes = ::testing::Types TNL::Matrices::CSR< float, TNL::Devices::Host, long >, TNL::Matrices::CSR< double, TNL::Devices::Host, long > #ifdef HAVE_CUDA - ,TNL::Matrices::CSR< int, TNL::Devices::Cuda, short >, + ,TNL::Matrices::CSR< int, TNL::Devices::Cuda, short >, TNL::Matrices::CSR< long, TNL::Devices::Cuda, short >, TNL::Matrices::CSR< float, TNL::Devices::Cuda, short >, TNL::Matrices::CSR< double, TNL::Devices::Cuda, short >, @@ -60,16 +60,16 @@ TYPED_TEST_SUITE( CSRMatrixTest, CSRMatrixTypes); TYPED_TEST( CSRMatrixTest, setDimensionsTest ) { using CSRMatrixType = typename TestFixture::CSRMatrixType; - + test_SetDimensions< CSRMatrixType >(); } //TYPED_TEST( CSRMatrixTest, setCompressedRowLengthsTest ) //{ //// using CSRMatrixType = typename TestFixture::CSRMatrixType; -// +// //// test_SetCompressedRowLengths< CSRMatrixType >(); -// +// // bool testRan = false; // EXPECT_TRUE( testRan ); // std::cout << "\nTEST DID NOT RUN. NOT WORKING.\n\n"; @@ -81,56 +81,56 @@ TYPED_TEST( CSRMatrixTest, setDimensionsTest ) TYPED_TEST( CSRMatrixTest, setLikeTest ) { using CSRMatrixType = typename TestFixture::CSRMatrixType; - + test_SetLike< CSRMatrixType, CSRMatrixType >(); } TYPED_TEST( CSRMatrixTest, resetTest ) { using CSRMatrixType = typename TestFixture::CSRMatrixType; - + test_Reset< CSRMatrixType >(); } TYPED_TEST( CSRMatrixTest, setElementTest ) { using CSRMatrixType = typename TestFixture::CSRMatrixType; - + test_SetElement< CSRMatrixType >(); } TYPED_TEST( CSRMatrixTest, addElementTest ) { using CSRMatrixType = typename TestFixture::CSRMatrixType; - + test_AddElement< CSRMatrixType >(); } TYPED_TEST( CSRMatrixTest, setRowTest ) { using CSRMatrixType = typename TestFixture::CSRMatrixType; - + test_SetRow< CSRMatrixType >(); } TYPED_TEST( CSRMatrixTest, vectorProductTest ) { using CSRMatrixType = typename TestFixture::CSRMatrixType; - + test_VectorProduct< CSRMatrixType >(); } TYPED_TEST( CSRMatrixTest, saveAndLoadTest ) { using CSRMatrixType = typename TestFixture::CSRMatrixType; - + test_SaveAndLoad< CSRMatrixType >( "test_SparseMatrixTest_CSR" ); } TYPED_TEST( CSRMatrixTest, printTest ) { using CSRMatrixType = typename TestFixture::CSRMatrixType; - + test_Print< CSRMatrixType >(); } diff --git a/src/UnitTests/Matrices/SparseMatrixTest_ChunkedEllpack.h b/src/UnitTests/Matrices/SparseMatrixTest_ChunkedEllpack.h index 24826e73cc810dd4bc97381eba1fd2bc26040a45..6909b53a5304df75aa021484402f1c3986ec9b5f 100644 --- a/src/UnitTests/Matrices/SparseMatrixTest_ChunkedEllpack.h +++ b/src/UnitTests/Matrices/SparseMatrixTest_ChunkedEllpack.h @@ -24,10 +24,6 @@ protected: using ChunkedEllpackMatrixType = Matrix; }; -// columnIndexes of ChunkedEllpack appear to be broken, when printed, it prints out a bunch of 4s. -// rowPointers have interesting elements? 0 18 36 42 54 72 96 126 162 204 256 when rows = 10, cols = 11; rowLengths = 3 3 1 2 3 4 5 6 7 8 -// and 0 52 103 154 205 256 when rows = 5, cols = 4; rowLengths = 3 3 3 3 3 - // types for which MatrixTest is instantiated using ChEllpackMatrixTypes = ::testing::Types @@ -45,7 +41,7 @@ using ChEllpackMatrixTypes = ::testing::Types TNL::Matrices::ChunkedEllpack< float, TNL::Devices::Host, long >, TNL::Matrices::ChunkedEllpack< double, TNL::Devices::Host, long > #ifdef HAVE_CUDA - ,TNL::Matrices::ChunkedEllpack< int, TNL::Devices::Cuda, short >, + ,TNL::Matrices::ChunkedEllpack< int, TNL::Devices::Cuda, short >, TNL::Matrices::ChunkedEllpack< long, TNL::Devices::Cuda, short >, TNL::Matrices::ChunkedEllpack< float, TNL::Devices::Cuda, short >, TNL::Matrices::ChunkedEllpack< double, TNL::Devices::Cuda, short >, @@ -125,6 +121,13 @@ TYPED_TEST( ChunkedEllpackMatrixTest, vectorProductTest ) test_VectorProduct< ChunkedEllpackMatrixType >(); } +//TYPED_TEST( ChunkedEllpackMatrixTest, operatorEqualsTest ) +//{ +// using ChunkedEllpackMatrixType = typename TestFixture::ChunkedEllpackMatrixType; +// +// test_OperatorEquals< ChunkedEllpackMatrixType >(); +//} + TYPED_TEST( ChunkedEllpackMatrixTest, saveAndLoadTest ) { using ChunkedEllpackMatrixType = typename TestFixture::ChunkedEllpackMatrixType; diff --git a/src/UnitTests/Matrices/SparseMatrixTest_Ellpack.h b/src/UnitTests/Matrices/SparseMatrixTest_Ellpack.h index c5e547613799a75c53e4f2575fd4dbf8b479650f..979068e02ea2d5b4ed5c3dc4f4db2a566c027934 100644 --- a/src/UnitTests/Matrices/SparseMatrixTest_Ellpack.h +++ b/src/UnitTests/Matrices/SparseMatrixTest_Ellpack.h @@ -13,7 +13,7 @@ #include "SparseMatrixTest.hpp" #include -#ifdef HAVE_GTEST +#ifdef HAVE_GTEST #include // test fixture for typed tests @@ -40,7 +40,7 @@ using EllpackMatrixTypes = ::testing::Types TNL::Matrices::Ellpack< float, TNL::Devices::Host, long >, TNL::Matrices::Ellpack< double, TNL::Devices::Host, long > #ifdef HAVE_CUDA - ,TNL::Matrices::Ellpack< int, TNL::Devices::Cuda, short >, + ,TNL::Matrices::Ellpack< int, TNL::Devices::Cuda, short >, TNL::Matrices::Ellpack< long, TNL::Devices::Cuda, short >, TNL::Matrices::Ellpack< float, TNL::Devices::Cuda, short >, TNL::Matrices::Ellpack< double, TNL::Devices::Cuda, short >, @@ -60,16 +60,16 @@ TYPED_TEST_SUITE( EllpackMatrixTest, EllpackMatrixTypes ); TYPED_TEST( EllpackMatrixTest, setDimensionsTest ) { using EllpackMatrixType = typename TestFixture::EllpackMatrixType; - + test_SetDimensions< EllpackMatrixType >(); } //TYPED_TEST( EllpackMatrixTest, setCompressedRowLengthsTest ) //{ //// using EllpackMatrixType = typename TestFixture::EllpackMatrixType; -// +// //// test_SetCompressedRowLengths< EllpackMatrixType >(); -// +// // bool testRan = false; // EXPECT_TRUE( testRan ); // std::cout << "\nTEST DID NOT RUN. NOT WORKING.\n\n"; @@ -81,56 +81,56 @@ TYPED_TEST( EllpackMatrixTest, setDimensionsTest ) TYPED_TEST( EllpackMatrixTest, setLikeTest ) { using EllpackMatrixType = typename TestFixture::EllpackMatrixType; - + test_SetLike< EllpackMatrixType, EllpackMatrixType >(); } TYPED_TEST( EllpackMatrixTest, resetTest ) { using EllpackMatrixType = typename TestFixture::EllpackMatrixType; - + test_Reset< EllpackMatrixType >(); } TYPED_TEST( EllpackMatrixTest, setElementTest ) { using EllpackMatrixType = typename TestFixture::EllpackMatrixType; - + test_SetElement< EllpackMatrixType >(); } TYPED_TEST( EllpackMatrixTest, addElementTest ) { using EllpackMatrixType = typename TestFixture::EllpackMatrixType; - + test_AddElement< EllpackMatrixType >(); } TYPED_TEST( EllpackMatrixTest, setRowTest ) { using EllpackMatrixType = typename TestFixture::EllpackMatrixType; - + test_SetRow< EllpackMatrixType >(); } TYPED_TEST( EllpackMatrixTest, vectorProductTest ) { using EllpackMatrixType = typename TestFixture::EllpackMatrixType; - + test_VectorProduct< EllpackMatrixType >(); } TYPED_TEST( EllpackMatrixTest, saveAndLoadTest ) { using EllpackMatrixType = typename TestFixture::EllpackMatrixType; - + test_SaveAndLoad< EllpackMatrixType >( "test_SparseMatrixTest_Ellpack" ); } TYPED_TEST( EllpackMatrixTest, printTest ) { using EllpackMatrixType = typename TestFixture::EllpackMatrixType; - + test_Print< EllpackMatrixType >(); } diff --git a/src/UnitTests/Matrices/SparseMatrixTest_SlicedEllpack.h b/src/UnitTests/Matrices/SparseMatrixTest_SlicedEllpack.h index 073abb59ab4449a085de679e467d3708bf256804..0798f59dc49fbb5ada03d975fe60a61ae3e85fcc 100644 --- a/src/UnitTests/Matrices/SparseMatrixTest_SlicedEllpack.h +++ b/src/UnitTests/Matrices/SparseMatrixTest_SlicedEllpack.h @@ -13,7 +13,7 @@ #include "SparseMatrixTest.hpp" #include -#ifdef HAVE_GTEST +#ifdef HAVE_GTEST #include // test fixture for typed tests @@ -40,7 +40,7 @@ using SlicedEllpackMatrixTypes = ::testing::Types TNL::Matrices::SlicedEllpack< float, TNL::Devices::Host, long >, TNL::Matrices::SlicedEllpack< double, TNL::Devices::Host, long > #ifdef HAVE_CUDA - ,TNL::Matrices::SlicedEllpack< int, TNL::Devices::Cuda, short >, + ,TNL::Matrices::SlicedEllpack< int, TNL::Devices::Cuda, short >, TNL::Matrices::SlicedEllpack< long, TNL::Devices::Cuda, short >, TNL::Matrices::SlicedEllpack< float, TNL::Devices::Cuda, short >, TNL::Matrices::SlicedEllpack< double, TNL::Devices::Cuda, short >, @@ -60,16 +60,16 @@ TYPED_TEST_SUITE( SlicedEllpackMatrixTest, SlicedEllpackMatrixTypes ); TYPED_TEST( SlicedEllpackMatrixTest, setDimensionsTest ) { using SlicedEllpackMatrixType = typename TestFixture::SlicedEllpackMatrixType; - + test_SetDimensions< SlicedEllpackMatrixType >(); } //TYPED_TEST( SlicedEllpackMatrixTest, setCompressedRowLengthsTest ) //{ //// using SlicedEllpackMatrixType = typename TestFixture::SlicedEllpackMatrixType; -// +// //// test_SetCompressedRowLengths< SlicedEllpackMatrixType >(); -// +// // bool testRan = false; // EXPECT_TRUE( testRan ); // std::cout << "\nTEST DID NOT RUN. NOT WORKING.\n\n"; @@ -81,56 +81,56 @@ TYPED_TEST( SlicedEllpackMatrixTest, setDimensionsTest ) TYPED_TEST( SlicedEllpackMatrixTest, setLikeTest ) { using SlicedEllpackMatrixType = typename TestFixture::SlicedEllpackMatrixType; - + test_SetLike< SlicedEllpackMatrixType, SlicedEllpackMatrixType >(); } TYPED_TEST( SlicedEllpackMatrixTest, resetTest ) { using SlicedEllpackMatrixType = typename TestFixture::SlicedEllpackMatrixType; - + test_Reset< SlicedEllpackMatrixType >(); } TYPED_TEST( SlicedEllpackMatrixTest, setElementTest ) { using SlicedEllpackMatrixType = typename TestFixture::SlicedEllpackMatrixType; - + test_SetElement< SlicedEllpackMatrixType >(); } TYPED_TEST( SlicedEllpackMatrixTest, addElementTest ) { using SlicedEllpackMatrixType = typename TestFixture::SlicedEllpackMatrixType; - + test_AddElement< SlicedEllpackMatrixType >(); } TYPED_TEST( SlicedEllpackMatrixTest, setRowTest ) { using SlicedEllpackMatrixType = typename TestFixture::SlicedEllpackMatrixType; - + test_SetRow< SlicedEllpackMatrixType >(); } TYPED_TEST( SlicedEllpackMatrixTest, vectorProductTest ) { using SlicedEllpackMatrixType = typename TestFixture::SlicedEllpackMatrixType; - + test_VectorProduct< SlicedEllpackMatrixType >(); } TYPED_TEST( SlicedEllpackMatrixTest, saveAndLoadTest ) { using SlicedEllpackMatrixType = typename TestFixture::SlicedEllpackMatrixType; - + test_SaveAndLoad< SlicedEllpackMatrixType >( "test_SparseMatrixTest_SlicedEllpack" ); } TYPED_TEST( SlicedEllpackMatrixTest, printTest ) { using SlicedEllpackMatrixType = typename TestFixture::SlicedEllpackMatrixType; - + test_Print< SlicedEllpackMatrixType >(); }