diff --git a/src/Benchmarks/BLAS/CMakeLists.txt b/src/Benchmarks/BLAS/CMakeLists.txt index 81d83753329658c960b47a08d3f1ac58881bd9ff..9743b3eaea20d0d0f975d77c88326becbe8b2172 100644 --- a/src/Benchmarks/BLAS/CMakeLists.txt +++ b/src/Benchmarks/BLAS/CMakeLists.txt @@ -1,6 +1,8 @@ if( BUILD_CUDA ) - cuda_add_executable( tnl-benchmark-blas tnl-benchmark-blas.cu ) - cuda_add_cublas_to_target( tnl-benchmark-blas ) + #find_library( CUDADEVRT NAMES cudadevrt ) + cuda_add_executable( tnl-benchmark-blas tnl-benchmark-blas.cu ) + cuda_add_cublas_to_target( tnl-benchmark-blas ) + #target_link_libraries( tnl-benchmark-blas ${CUDADEVRT} )#${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcudadevrt.a ) else() add_executable( tnl-benchmark-blas tnl-benchmark-blas.cpp ) endif() diff --git a/src/Benchmarks/BLAS/spmv.h b/src/Benchmarks/BLAS/spmv.h index c013e6bfeb8847f3abc546bfdb5a5ec49441ec13..85cb4b7314d87eed342daf4a2e0196f0c7a752d1 100644 --- a/src/Benchmarks/BLAS/spmv.h +++ b/src/Benchmarks/BLAS/spmv.h @@ -16,9 +16,9 @@ #include #include -#include -#include -#include +#include +#include +#include namespace TNL { namespace Benchmarks { diff --git a/src/Benchmarks/SpMV/CMakeLists.txt b/src/Benchmarks/SpMV/CMakeLists.txt index 7cb9c4fcd976e88ec61fc4579aaa612c9da7b656..6af6965345eeacee224edd7b44dc55f389cd7fbe 100644 --- a/src/Benchmarks/SpMV/CMakeLists.txt +++ b/src/Benchmarks/SpMV/CMakeLists.txt @@ -1,6 +1,6 @@ if( BUILD_CUDA ) CUDA_ADD_EXECUTABLE( tnl-benchmark-spmv tnl-benchmark-spmv.cu ) - TARGET_LINK_LIBRARIES( tnl-benchmark-spmv ${CUDA_cusparse_LIBRARY} ) + TARGET_LINK_LIBRARIES( tnl-benchmark-spmv ${CUDA_cusparse_LIBRARY} ${CUDA_cudadevrt_LIBRARY} ) else() ADD_EXECUTABLE( tnl-benchmark-spmv tnl-benchmark-spmv.cpp ) endif() diff --git a/src/Benchmarks/SpMV/OldSpMV/tnl-benchmark-old-spmv.cpp b/src/Benchmarks/SpMV/OldSpMV/tnl-benchmark-old-spmv.cpp deleted file mode 100644 index c9cd17cda0312c07bed4bcaa92c4ef4273704b35..0000000000000000000000000000000000000000 --- a/src/Benchmarks/SpMV/OldSpMV/tnl-benchmark-old-spmv.cpp +++ /dev/null @@ -1,14 +0,0 @@ -/*************************************************************************** - tnl-benchmark-spmv.cpp - description - ------------------- - begin : Jun 5, 2014 - copyright : (C) 2014 by Tomas Oberhuber - email : tomas.oberhuber@fjfi.cvut.cz - ***************************************************************************/ - -/* See Copyright Notice in tnl/Copyright */ - - -#include "tnl-benchmark-old-spmv.h" - - diff --git a/src/Benchmarks/SpMV/OldSpMV/tnl-benchmark-old-spmv.cu b/src/Benchmarks/SpMV/OldSpMV/tnl-benchmark-old-spmv.cu deleted file mode 100644 index 433af970b6058e1ae03f480296da566a3cbb79b5..0000000000000000000000000000000000000000 --- a/src/Benchmarks/SpMV/OldSpMV/tnl-benchmark-old-spmv.cu +++ /dev/null @@ -1,12 +0,0 @@ -/*************************************************************************** - tnl-benchmark-spmv.cu - description - ------------------- - begin : Jun 5, 2014 - copyright : (C) 2014 by Tomas Oberhuber - email : tomas.oberhuber@fjfi.cvut.cz - ***************************************************************************/ - -/* See Copyright Notice in tnl/Copyright */ - - -#include "tnl-benchmark-old-spmv.h" diff --git a/src/Benchmarks/SpMV/OldSpMV/tnl-benchmark-old-spmv.h b/src/Benchmarks/SpMV/OldSpMV/tnl-benchmark-old-spmv.h deleted file mode 100644 index 455c7d412f4f8ae4cc4af7bbd15ba0e47dda978a..0000000000000000000000000000000000000000 --- a/src/Benchmarks/SpMV/OldSpMV/tnl-benchmark-old-spmv.h +++ /dev/null @@ -1,925 +0,0 @@ -/*************************************************************************** - tnl-benchmark-spmv.h - description - ------------------- - begin : Jun 5, 2014 - copyright : (C) 2014 by Tomas Oberhuber - email : tomas.oberhuber@fjfi.cvut.cz - ***************************************************************************/ - -/* See Copyright Notice in tnl/Copyright */ - -#ifdef NOT_USED_ANYMORE - -#pragma once - -#include -#include -#include -#ifdef HAVE_CUDA -#include -#endif - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "tnlCusparseCSRMatrix.h" - -using namespace std; -using namespace TNL; -using namespace TNL::Matrices; - -void setupConfig( Config::ConfigDescription& config ) -{ - config.addDelimiter ( "General settings:" ); - config.addRequiredEntry< String >( "test" , "Test to be performed." ); - config.addEntryEnum< String >( "mtx" ); - config.addEntryEnum< String >( "tnl" ); - config.addRequiredEntry< String >( "input-file" , "Input file name." ); - config.addEntry< String >( "log-file", "Log file name.", "tnl-benchmark-spmv.log"); - config.addEntry< String >( "precision", "Precision of the arithmetics.", "double" ); - config.addEntry< double >( "stop-time", "Seconds to iterate the SpMV operation.", 1.0 ); - config.addEntry< int >( "verbose", "Verbose mode.", 1 ); -} - -bool initLogFile( std::fstream& logFile, const String& fileName ) -{ - if( access( fileName.getString(), F_OK ) == -1 ) - { - logFile.open( fileName.getString(), std::ios::out ); - if( ! logFile ) - return false; - const String fillingColoring = " : COLORING 0 #FFF8DC 20 #FFFF00 40 #FFD700 60 #FF8C0 80 #FF0000 100"; - const String speedupColoring = " : COLORING #0099FF 1 #FFFFFF 2 #00FF99 4 #33FF99 8 #33FF22 16 #FF9900"; - const String paddingColoring = " : COLORING #FFFFFF 1 #FFFFCC 10 #FFFF99 100 #FFFF66 1000 #FFFF33 10000 #FFFF00"; - logFile << "#Matrix file " << std::endl; - logFile << "#Rows" << std::endl; - logFile << "#Columns" << std::endl; - logFile << "#Non-zero elements" << std::endl; - logFile << "#Filling (in %)" << fillingColoring << std::endl; - logFile << "#CSR Format" << std::endl; - logFile << "# CPU" << std::endl; - logFile << "# Gflops" << std::endl; - logFile << "# Throughput" << std::endl; - logFile << "# Speedup" << speedupColoring << std::endl; -#ifdef HAVE_CUDA - logFile << "# Cusparse CSR" << std::endl; - logFile << "# Gflops" << std::endl; - logFile << "# Throughput" << std::endl; - logFile << "# Speedup" << speedupColoring << " SORT - cusparse-csr-speedup.txt" << std::endl; - logFile << "# CUDA" << std::endl; - logFile << "# Scalar" << std::endl; - logFile << "# Gflops" << std::endl; - logFile << "# Throughput" << std::endl; - logFile << "# Speedup" << speedupColoring << " SORT - csr-scalar-cuda-speedup.txt" << std::endl; - logFile << "# Vector" << std::endl; - logFile << "# Warp Size 1" << std::endl; - logFile << "# Gflops" << std::endl; - logFile << "# Throughput" << std::endl; - logFile << "# Speedup" << speedupColoring << " SORT - csr-vector-1-cuda-speedup.txt" << std::endl; - logFile << "# Warp Size 2" << std::endl; - logFile << "# Gflops" << std::endl; - logFile << "# Throughput" << std::endl; - logFile << "# Speedup" << speedupColoring << " SORT - csr-vector-2-cuda-speedup.txt" << std::endl; - logFile << "# Warp Size 4" << std::endl; - logFile << "# Gflops" << std::endl; - logFile << "# Throughput" << std::endl; - logFile << "# Speedup" << speedupColoring << " SORT - csr-vector-4-cuda-speedup.txt" << std::endl; - logFile << "# Warp Size 8" << std::endl; - logFile << "# Gflops" << std::endl; - logFile << "# Throughput" << std::endl; - logFile << "# Speedup" << speedupColoring << " SORT - csr-vector-8-cuda-speedup.txt" << std::endl; - logFile << "# Warp Size 16" << std::endl; - logFile << "# Gflops" << std::endl; - logFile << "# Throughput" << std::endl; - logFile << "# Speedup" << speedupColoring << " SORT - csr-vector-16-cuda-speedup.txt" << std::endl; - logFile << "# Warp Size 32" << std::endl; - logFile << "# Gflops" << std::endl; - logFile << "# Throughput" << std::endl; - logFile << "# Speedup" << speedupColoring << " SORT - csr-vector-32-cuda-speedup.txt" << std::endl; - logFile << "# Hybrid" << std::endl; - logFile << "# Split 2" << std::endl; - logFile << "# Gflops" << std::endl; - logFile << "# Throughput" << std::endl; - logFile << "# Speedup" << speedupColoring << " SORT - csr-hybrid-2-cuda-speedup.txt" << std::endl; - logFile << "# Split 4" << std::endl; - logFile << "# Gflops" << std::endl; - logFile << "# Throughput" << std::endl; - logFile << "# Speedup" << speedupColoring << " SORT - csr-hybrid-4-cuda-speedup.txt" << std::endl; - logFile << "# Split 8" << std::endl; - logFile << "# Gflops" << std::endl; - logFile << "# Throughput" << std::endl; - logFile << "# Speedup" << speedupColoring << " SORT - csr-hybrid-8-cuda-speedup.txt" << std::endl; - logFile << "# Split 16" << std::endl; - logFile << "# Gflops" << std::endl; - logFile << "# Throughput" << std::endl; - logFile << "# Speedup" << speedupColoring << " SORT - csr-hybrid-16-cuda-speedup.txt" << std::endl; - logFile << "# Split 32" << std::endl; - logFile << "# Gflops" << std::endl; - logFile << "# Throughput" << std::endl; - logFile << "# Speedup" << speedupColoring << " SORT - csr-hybrid-32-cuda-speedup.txt" << std::endl; - logFile << "# Split 64" << std::endl; - logFile << "# Gflops" << std::endl; - logFile << "# Throughput" << std::endl; - logFile << "# Speedup" << speedupColoring << " SORT - csr-hybrid-64-cuda-speedup.txt" << std::endl; -#endif - logFile << "#Ellpack Format" << std::endl; - logFile << "# Padding (in %)" << paddingColoring << std::endl; - logFile << "# CPU" << std::endl; - logFile << "# Gflops" << std::endl; - logFile << "# Throughput" << std::endl; - logFile << "# Speedup" << speedupColoring << " SORT - ellpack-host-speedup.txt" << std::endl; -#ifdef HAVE_CUDA - logFile << "# CUDA" << std::endl; - logFile << "# Gflops" << std::endl; - logFile << "# Throughput" << std::endl; - logFile << "# Speedup" << speedupColoring << " SORT - ellpack-cuda-speedup.txt" << std::endl; -#endif - logFile << "#SlicedEllpack Format" << std::endl; - logFile << "# Padding (in %)" << paddingColoring << std::endl; - logFile << "# CPU" << std::endl; - logFile << "# Gflops" << std::endl; - logFile << "# Throughput" << std::endl; - logFile << "# Speedup" << speedupColoring << " SORT - sliced-ellpack-host-speedup.txt" << std::endl; -#ifdef HAVE_CUDA - logFile << "# CUDA" << std::endl; - logFile << "# Gflops" << std::endl; - logFile << "# Throughput" << std::endl; - logFile << "# Speedup" << speedupColoring << " SORT - sliced-ellpack-cuda-speedup.txt" << std::endl; -#endif - logFile << "#ChunkedEllpack Format" << std::endl; - logFile << "# Padding (in %)" << paddingColoring << std::endl; - logFile << "# CPU" << std::endl; - logFile << "# Gflops" << std::endl; - logFile << "# Throughput" << std::endl; - logFile << "# Speedup" << speedupColoring << " SORT - chunked-ellpack-host-speedup.txt" << std::endl; -#ifdef HAVE_CUDA - logFile << "# CUDA" << std::endl; - logFile << "# Gflops" << std::endl; - logFile << "# Throughput" << std::endl; - logFile << "# Speedup" << speedupColoring << " SORT - chunked-ellpack-cuda-speedup.txt" << std::endl; -#endif - return true; - } - logFile.open( fileName.getString(), std::ios::out | std::ios::app ); - //logFile << std::setprecision( 2 ); - if( ! logFile ) - return false; - return true; -} - -template< typename Matrix > -void printMatrixInfo( const String& inputFileName, - const Matrix& matrix, - std::ostream& str ) -{ - str << " Rows: " << std::setw( 8 ) << matrix.getRows(); - str << " Columns: " << std::setw( 8 ) << matrix.getColumns(); - str << " Nonzero Elements: " << std::setw( 10 ) << matrix.getNumberOfNonzeroMatrixElements(); - const double fillingRatio = ( double ) matrix.getNumberOfNonzeroMatrixElements() / ( double ) matrix.getNumberOfMatrixElements(); - str << " Filling: " << std::setw( 5 ) << 100.0 * fillingRatio << "%" << std::endl; - str << std::setw( 25 ) << "Format" - << std::setw( 15 ) << "Padding" - << std::setw( 15 ) << "Time" - << std::setw( 15 ) << "GFLOPS" - << std::setw( 15 ) << "Throughput" - << std::setw( 15 ) << "Speedup" << std::endl; -} - -template< typename Matrix > -bool writeMatrixInfo( const String& inputFileName, - const Matrix& matrix, - std::ostream& logFile ) -{ - logFile << std::endl; - logFile << inputFileName << std::endl; - logFile << " " << matrix.getRows() << std::endl; - logFile << " " << matrix.getColumns() << std::endl; - logFile << " " << matrix.getNumberOfNonzeroMatrixElements() << std::endl; - const double fillingRatio = ( double ) matrix.getNumberOfNonzeroMatrixElements() / ( double ) matrix.getNumberOfMatrixElements(); - logFile << " " << 100.0 * fillingRatio << std::endl; - logFile << std::flush; - if( ! logFile.good() ) - return false; - return true; -} - -double computeGflops( const long int nonzeroElements, - const int iterations, - const double& time ) -{ - return ( double ) ( 2 * iterations * nonzeroElements ) / time * 1.0e-9; -} - -template< typename Real > -double computeThroughput( const long int nonzeroElements, - const int iterations, - const int rows, - const double& time ) -{ - return ( double ) ( ( 2 * nonzeroElements + rows ) * iterations ) * sizeof( Real ) / time * 1.0e-9; -} - -template< typename Matrix, - typename Vector > -double benchmarkMatrix( const Matrix& matrix, - const Vector& x, - Vector& b, - const long int nonzeroElements, - const char* format, - const double& stopTime, - const double& baseline, - int verbose, - std::fstream& logFile ) -{ - Timer timer; - timer.start(); - double time( 0.0 ); - int iterations( 0 ); - while( time < stopTime ) - { - matrix.vectorProduct( x, b ); -#ifdef HAVE_CUDA - if( std::is_same< typename Matrix::DeviceType, Devices::Cuda >::value ) - cudaDeviceSynchronize(); -#endif - time = timer.getRealTime(); - iterations++; - } - const double gflops = computeGflops( nonzeroElements, iterations, time ); - const double throughput = computeThroughput< typename Matrix::RealType >( nonzeroElements, iterations, matrix.getRows(), time ); - const long int allocatedElements = matrix.getNumberOfMatrixElements(); - const double padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0; - if( verbose ) - { - std::cout << std::setw( 25 ) << format - << std::setw( 15 ) << padding - << std::setw( 15 ) << time - << std::setw( 15 ) << gflops - << std::setw( 15 ) << throughput; - if( baseline ) - std::cout << std::setw( 15 ) << gflops / baseline << std::endl; - else - std::cout << std::setw( 15 ) << "N/A" << std::endl; - } - logFile << " " << gflops << std::endl; - logFile << " " << throughput << std::endl; - if( baseline ) - logFile << gflops / baseline << std::endl; - else - logFile << "N/A" << std::endl; - return gflops; -} - -void writeTestFailed( std::fstream& logFile, - int repeat ) -{ - for( int i = 0; i < repeat; i++ ) - logFile << "N/A" << std::endl; -} - -template< typename Real > -bool setupBenchmark( const Config::ParameterContainer& parameters ) -{ - const String& test = parameters.getParameter< String >( "test" ); - const String& inputFileName = parameters.getParameter< String >( "input-file" ); - const String& logFileName = parameters.getParameter< String >( "log-file" ); - const int verbose = parameters.getParameter< int >( "verbose" ); - const double stopTime = parameters.getParameter< double >( "stop-time" ); - std::fstream logFile; - if( ! initLogFile( logFile, logFileName ) ) - { - std::cerr << "I am not able to open the file " << logFileName << "." << std::endl; - return false; - } - if( test == "mtx" ) - { - typedef Matrices::CSR< Real, Devices::Host, int > CSRType; - CSRType csrMatrix; - try - { - if( ! MatrixReader< CSRType >::readMtxFile( inputFileName, csrMatrix ) ) - { - std::cerr << "I am not able to read the matrix file " << inputFileName << "." << std::endl; - logFile << std::endl; - logFile << inputFileName << std::endl; - logFile << "Benchmark failed: Unable to read the matrix." << std::endl; - return false; - } - } - catch( std::bad_alloc ) - { - std::cerr << "Not enough memory to read the matrix." << std::endl; - logFile << std::endl; - logFile << inputFileName << std::endl; - logFile << "Benchmark failed: Not enough memory." << std::endl; - return false; - } - if( verbose ) - printMatrixInfo( inputFileName, csrMatrix,std::cout ); - if( ! writeMatrixInfo( inputFileName, csrMatrix, logFile ) ) - { - std::cerr << "I am not able to write new matrix to the log file." << std::endl; - return false; - } - const int rows = csrMatrix.getRows(); - const long int nonzeroElements = csrMatrix.getNumberOfMatrixElements(); - Containers::Vector< int, Devices::Host, int > rowLengthsHost; - rowLengthsHost.setSize( rows ); - for( int row = 0; row < rows; row++ ) - rowLengthsHost[ row ] = csrMatrix.getRowLength( row ); - - typedef Containers::Vector< Real, Devices::Host, int > HostVector; - HostVector hostX, hostB; - hostX.setSize( csrMatrix.getColumns() ); - hostX.setValue( 1.0 ); - hostB.setSize( csrMatrix.getRows() ); -#ifdef HAVE_CUDA - typedef Containers::Vector< Real, Devices::Cuda, int > CudaVector; - CudaVector cudaX, cudaB; - Containers::Vector< int, Devices::Cuda, int > rowLengthsCuda; - cudaX.setSize( csrMatrix.getColumns() ); - cudaX.setValue( 1.0 ); - cudaB.setSize( csrMatrix.getRows() ); - rowLengthsCuda.setSize( csrMatrix.getRows() ); - rowLengthsCuda = rowLengthsHost; - cusparseHandle_t cusparseHandle; - cusparseCreate( &cusparseHandle ); -#endif - const double baseline = benchmarkMatrix( csrMatrix, - hostX, - hostB, - nonzeroElements, - "CSR Host", - stopTime, - 0.0, - verbose, - logFile ); -#ifdef HAVE_CUDA - typedef CSR< Real, Devices::Cuda, int > CSRCudaType; - CSRCudaType cudaCSR; - //cout << "Copying matrix to GPU... "; - cudaCSR = csrMatrix; - TNL::CusparseCSR< Real > cusparseCSR; - cusparseCSR.init( cudaCSR, &cusparseHandle ); - benchmarkMatrix( cusparseCSR, - cudaX, - cudaB, - nonzeroElements, - "Cusparse CSR", - stopTime, - baseline, - verbose, - logFile ); - cusparseDestroy( cusparseHandle ); - - std::cout << " done. \r"; - /*cudaCSR.setCudaKernelType( CSRCudaType::scalar ); - benchmarkMatrix( cudaCSR, - cudaX, - cudaB, - nonzeroElements, - "CSR Cuda Scalar", - stopTime, - baseline, - verbose, - logFile ); - cudaCSR.setCudaKernelType( CSRCudaType::vector ); - cudaCSR.setCudaWarpSize( 1 ); - benchmarkMatrix( cudaCSR, - cudaX, - cudaB, - nonzeroElements, - "CSR Cuda Vector 1", - stopTime, - baseline, - verbose, - logFile ); - cudaCSR.setCudaWarpSize( 2 ); - benchmarkMatrix( cudaCSR, - cudaX, - cudaB, - nonzeroElements, - "CSR Cuda Vector 2", - stopTime, - baseline, - verbose, - logFile ); - cudaCSR.setCudaWarpSize( 4 ); - benchmarkMatrix( cudaCSR, - cudaX, - cudaB, - nonzeroElements, - "CSR Cuda Vector 4", - stopTime, - baseline, - verbose, - logFile ); - cudaCSR.setCudaWarpSize( 8 ); - benchmarkMatrix( cudaCSR, - cudaX, - cudaB, - nonzeroElements, - "CSR Cuda Vector 8", - stopTime, - baseline, - verbose, - logFile ); - cudaCSR.setCudaWarpSize( 16 ); - benchmarkMatrix( cudaCSR, - cudaX, - cudaB, - nonzeroElements, - "CSR Cuda Vector 16", - stopTime, - baseline, - verbose, - logFile ); - cudaCSR.setCudaWarpSize( 32 ); - benchmarkMatrix( cudaCSR, - cudaX, - cudaB, - nonzeroElements, - "CSR Cuda Vector 32", - stopTime, - baseline, - verbose, - logFile ); - cudaCSR.setCudaKernelType( CSRCudaType::hybrid ); - cudaCSR.setHybridModeSplit( 2 ); - benchmarkMatrix( cudaCSR, - cudaX, - cudaB, - nonzeroElements, - "CSR Cuda Hyrbid 2", - stopTime, - baseline, - verbose, - logFile ); - cudaCSR.setHybridModeSplit( 4 ); - benchmarkMatrix( cudaCSR, - cudaX, - cudaB, - nonzeroElements, - "CSR Cuda Hyrbid 4", - stopTime, - baseline, - verbose, - logFile ); - cudaCSR.setHybridModeSplit( 8 ); - benchmarkMatrix( cudaCSR, - cudaX, - cudaB, - nonzeroElements, - "CSR Cuda Hyrbid 8", - stopTime, - baseline, - verbose, - logFile ); - cudaCSR.setHybridModeSplit( 16 ); - benchmarkMatrix( cudaCSR, - cudaX, - cudaB, - nonzeroElements, - "CSR Cuda Hyrbid 16", - stopTime, - baseline, - verbose, - logFile ); - cudaCSR.setHybridModeSplit( 32 ); - benchmarkMatrix( cudaCSR, - cudaX, - cudaB, - nonzeroElements, - "CSR Cuda Hyrbid 32", - stopTime, - baseline, - verbose, - logFile ); - cudaCSR.setHybridModeSplit( 64 ); - benchmarkMatrix( cudaCSR, - cudaX, - cudaB, - nonzeroElements, - "CSR Cuda Hyrbid 64", - stopTime, - baseline, - verbose, - logFile );*/ - cudaCSR.reset(); -#endif - - long int allocatedElements; - double padding; - typedef Ellpack< Real, Devices::Host, int > EllpackType; - EllpackType ellpackMatrix; - Matrices::copySparseMatrix( ellpackMatrix, csrMatrix ); - allocatedElements = ellpackMatrix.getNumberOfMatrixElements(); - padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0; - logFile << " " << padding << std::endl; - benchmarkMatrix( ellpackMatrix, - hostX, - hostB, - nonzeroElements, - "Ellpack Host", - stopTime, - baseline, - verbose, - logFile ); -#ifdef HAVE_CUDA - typedef Ellpack< Real, Devices::Cuda, int > EllpackCudaType; - EllpackCudaType cudaEllpack; - std::cout << "Copying matrix to GPU... "; - cudaEllpack = ellpackMatrix; - std::cout << " done. \r"; - benchmarkMatrix( cudaEllpack, - cudaX, - cudaB, - nonzeroElements, - "Ellpack Cuda", - stopTime, - baseline, - verbose, - logFile ); - cudaEllpack.reset(); -#endif - ellpackMatrix.reset(); - - typedef Matrices::EllpackSymmetric< Real, Devices::Host, int > EllpackSymmetricType; - EllpackSymmetricType EllpackSymmetric; - if( ! MatrixReader< EllpackSymmetricType >::readMtxFile( inputFileName, EllpackSymmetric, verbose, true ) ) - writeTestFailed( logFile, 7 ); - else - { - allocatedElements = EllpackSymmetric.getNumberOfMatrixElements(); - padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0; - logFile << " " << padding < EllpackSymmetricCudaType; - EllpackSymmetricCudaType cudaEllpackSymmetric; - std::cout << "Copying matrix to GPU... "; - for( int i = 0; i < rowLengthsHost.getSize(); i++ ) - rowLengthsHost[ i ] = EllpackSymmetric.getRowLength( i ); - rowLengthsCuda = rowLengthsHost; - - // TODO: fix this - //if( ! cudaEllpackSymmetric.copyFrom( EllpackSymmetric, rowLengthsCuda ) ) - { - std::cerr << "I am not able to transfer the matrix on GPU." < SlicedEllpackMatrixType; - SlicedEllpackMatrixType slicedEllpackMatrix; - if( ! Matrices::MatrixReader< SlicedEllpackMatrixType >::readMtxFile( inputFileName, slicedEllpackMatrix, verbose ) ) - writeTestFailed( logFile, 7 ); - else - { - allocatedElements = slicedEllpackMatrix.getNumberOfMatrixElements(); - padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100; - logFile << " " << padding < SlicedEllpackMatrixCudaType; - SlicedEllpackMatrixCudaType cudaSlicedEllpackMatrix; - for( int i = 0; i < rowLengthsHost.getSize(); i++ ) - rowLengthsHost[ i ] = slicedEllpackMatrix.getRowLength( i ); - rowLengthsCuda = rowLengthsHost; - // TODO: fix - //if( ! cudaSlicedEllpackMatrix.copyFrom( slicedEllpackMatrix, rowLengthsCuda ) ) - { - std::cerr << "Nejde zkopirovat" < ChunkedEllpackType; - ChunkedEllpackType chunkedEllpack; - Matrices::copySparseMatrix( chunkedEllpack, csrMatrix ); - allocatedElements = chunkedEllpack.getNumberOfMatrixElements(); - padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0; - logFile << " " << padding << std::endl; - benchmarkMatrix( chunkedEllpack, - hostX, - hostB, - nonzeroElements, - "ChunkedEllpack Host", - stopTime, - baseline, - verbose, - logFile ); - -#ifdef HAVE_CUDA - typedef Matrices::ChunkedEllpack< Real, Devices::Cuda, int > ChunkedEllpackCudaType; - ChunkedEllpackCudaType cudaChunkedEllpack; - std::cout << "Copying matrix to GPU... "; - cudaChunkedEllpack = chunkedEllpack; - std::cout << " done. \r"; - benchmarkMatrix( cudaChunkedEllpack, - cudaX, - cudaB, - nonzeroElements, - "ChunkedEllpack Cuda", - stopTime, - baseline, - verbose, - logFile ); - cudaChunkedEllpack.reset(); -#endif - - typedef Matrices::BiEllpack< Real, Devices::Host, int > BiEllpackMatrixType; - BiEllpackMatrixType biEllpackMatrix; - // TODO: I did not check this during git merging, but I hope its gonna work - // Tomas Oberhuber - // copySparseMatrix( biEllpackMatrix, csrMatrix ); // TODO:Fix the getRow method to be compatible with othr formats - /*if( ! biEllpackMatrix.copyFrom( csrMatrix, rowLengthsHost ) ) - writeTestFailed( logFile, 7 ); - else*/ - { - allocatedElements = biEllpackMatrix.getNumberOfMatrixElements(); - padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0; - logFile << " " << padding < BiEllpackMatrixCudaType; - BiEllpackMatrixCudaType cudaBiEllpackMatrix; - // TODO: I did not check this during git merging, but I hope its gonna work - // Tomas Oberhuber - // copySparseMatrix( biEllpackMatrix, csrMatrix ); // TODO:Fix the getRow method to be compatible with othr formats - std::cout << "Copying matrix to GPU... "; - /*if( ! cudaBiEllpackMatrix.copyFrom( biEllpackMatrix, rowLengthsCuda ) ) - { - std::cerr << "I am not able to transfer the matrix on GPU." < SlicedEllpackSymmetricType; - SlicedEllpackSymmetricType slicedEllpackSymmetric; - if( ! Matrices::MatrixReader< SlicedEllpackSymmetricType >::readMtxFile( inputFileName, slicedEllpackSymmetric, verbose, true ) ) - writeTestFailed( logFile, 7 ); - else - { - allocatedElements = slicedEllpackSymmetric.getNumberOfMatrixElements(); - padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0; - logFile << " " << padding < SlicedEllpackSymmetricCudaType; - SlicedEllpackSymmetricCudaType cudaSlicedEllpackSymmetric; - std::cout << "Copying matrix to GPU... "; - for( int i = 0; i < rowLengthsHost.getSize(); i++ ) - rowLengthsHost[ i ] = slicedEllpackSymmetric.getRowLength( i ); - rowLengthsCuda = rowLengthsHost; - // TODO: fiox the nest line - //if( ! cudaSlicedEllpackSymmetric.copyFrom( slicedEllpackSymmetric, rowLengthsCuda ) ) - { - std::cerr << "I am not able to transfer the matrix on GPU." < EllpackSymmetricGraphMatrixType; - EllpackSymmetricGraphMatrixType EllpackSymmetricGraphMatrix; - if( ! Matrices::MatrixReader< EllpackSymmetricGraphMatrixType >::readMtxFile( inputFileName, EllpackSymmetricGraphMatrix, verbose, true ) || - ! EllpackSymmetricGraphMatrix.help() ) - writeTestFailed( logFile, 7 ); - else - { - allocatedElements = EllpackSymmetricGraphMatrix.getNumberOfMatrixElements(); - padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0; - logFile << " " << padding < EllpackSymmetricGraphMatrixCudaType; - EllpackSymmetricGraphMatrixCudaType cudaEllpackSymmetricGraphMatrix; - std::cout << "Copying matrix to GPU... "; - for( int i = 0; i < rowLengthsHost.getSize(); i++ ) - rowLengthsHost[ i ] = EllpackSymmetricGraphMatrix.getRowLength( i ); - rowLengthsCuda = rowLengthsHost; - // TODO: fix it - //if( ! cudaEllpackSymmetricGraphMatrix.copyFrom( EllpackSymmetricGraphMatrix, rowLengthsCuda ) ) - { - writeTestFailed( logFile, 3 ); - } - //else if( ! cudaEllpackSymmetricGraphMatrix.help() ) - { - writeTestFailed( logFile, 3 ); - } - //else - { - std::cout << " done. \r"; - benchmarkMatrix( cudaEllpackSymmetricGraphMatrix, - cudaX, - cudaB, - nonzeroElements, - "Ellpack Graph Cuda", - stopTime, - baseline, - verbose, - logFile ); - } - cudaEllpackSymmetricGraphMatrix.reset(); -#endif - } - - - typedef Matrices::AdEllpack< Real, Devices::Host, int > AdEllpackMatrixType; - AdEllpackMatrixType adEllpackMatrix; - // TODO: I did not check this during git merging, but I hope its gonna work - // Tomas Oberhuber - //copySparseMatrix( adEllpackMatrix, csrMatrix ); // TODO:Fix the getRow method to be compatible with othr formats - /*if( ! adEllpackMatrix.copyFrom( csrMatrix, rowLengthsHost ) ) - writeTestFailed( logFile, 7 ); - else*/ - { - allocatedElements = adEllpackMatrix.getNumberOfMatrixElements(); - padding = ( double ) allocatedElements / ( double ) nonzeroElements * 100.0 - 100.0; - logFile << " " << padding < AdEllpackMatrixCudaType; - AdEllpackMatrixCudaType cudaAdEllpackMatrix; - // TODO: I did not check this during git merging, but I hope its gonna work - // Tomas Oberhuber - //copySparseMatrix( adEllpackMatrix, csrMatrix ); // TODO:Fix the getRow method to be compatible with othr formats - std::cout << "Copying matrix to GPU... "; - /*if( ! cudaAdEllpackMatrix.copyFrom( csrMatrix, rowLengthsCuda ) ) - { - std::cerr << "I am not able to transfer the matrix on GPU." <( "precision" ); - if( precision == "float" ) - if( ! setupBenchmark< float >( parameters ) ) - return EXIT_FAILURE; - if( precision == "double" ) - if( ! setupBenchmark< double >( parameters ) ) - return EXIT_FAILURE; - return EXIT_SUCCESS; -} - -#endif \ No newline at end of file diff --git a/src/Benchmarks/SpMV/OldSpMV/tnlCusparseCSRMatrix.h b/src/Benchmarks/SpMV/OldSpMV/tnlCusparseCSRMatrix.h deleted file mode 100644 index fbef4f9a2410669f8c91ef51bf6de404ab1bb7fc..0000000000000000000000000000000000000000 --- a/src/Benchmarks/SpMV/OldSpMV/tnlCusparseCSRMatrix.h +++ /dev/null @@ -1,162 +0,0 @@ -/*************************************************************************** - tnlCusparseCSR.h - description - ------------------- - begin : Jul 3, 2014 - copyright : (C) 2014 by Tomas Oberhuber - email : tomas.oberhuber@fjfi.cvut.cz - ***************************************************************************/ - -/* See Copyright Notice in tnl/Copyright */ - -#ifdef NOT_USED_ANYMORE - -#include -#include -#ifdef HAVE_CUDA -#include -#endif - -namespace TNL { - -template< typename Real > -class CusparseCSRBase -{ - public: - typedef Real RealType; - typedef Devices::Cuda DeviceType; - typedef Matrices::CSR< RealType, Devices::Cuda, int > MatrixType; - - CusparseCSRBase() - : matrix( 0 ) - { - }; - -#ifdef HAVE_CUDA - void init( const MatrixType& matrix, - cusparseHandle_t* cusparseHandle ) - { - this->matrix = &matrix; - this->cusparseHandle = cusparseHandle; - cusparseCreateMatDescr( & this->matrixDescriptor ); - }; -#endif - - int getRows() const - { - return matrix->getRows(); - } - - int getColumns() const - { - return matrix->getColumns(); - } - - int getNumberOfMatrixElements() const - { - return matrix->getNumberOfMatrixElements(); - } - - - template< typename InVector, - typename OutVector > - void vectorProduct( const InVector& inVector, - OutVector& outVector ) const - { - TNL_ASSERT_TRUE( matrix, "matrix was not initialized" ); -#ifdef HAVE_CUDA - cusparseDcsrmv( *( this->cusparseHandle ), - CUSPARSE_OPERATION_NON_TRANSPOSE, - this->matrix->getRows(), - this->matrix->getColumns(), - this->matrix->values.getSize(), - 1.0, - this->matrixDescriptor, - this->matrix->values.getData(), - this->matrix->rowPointers.getData(), - this->matrix->columnIndexes.getData(), - inVector.getData(), - 1.0, - outVector.getData() ); -#endif - } - - protected: - - const MatrixType* matrix; -#ifdef HAVE_CUDA - cusparseHandle_t* cusparseHandle; - - cusparseMatDescr_t matrixDescriptor; -#endif -}; - - -template< typename Real > -class CusparseCSR -{}; - -template<> -class CusparseCSR< double > : public CusparseCSRBase< double > -{ - public: - - template< typename InVector, - typename OutVector > - void vectorProduct( const InVector& inVector, - OutVector& outVector ) const - { - TNL_ASSERT_TRUE( matrix, "matrix was not initialized" ); -#ifdef HAVE_CUDA - double d = 1.0; - double* alpha = &d; - cusparseDcsrmv( *( this->cusparseHandle ), - CUSPARSE_OPERATION_NON_TRANSPOSE, - this->matrix->getRows(), - this->matrix->getColumns(), - this->matrix->getValues().getSize(), - alpha, - this->matrixDescriptor, - this->matrix->getValues().getData(), - this->matrix->getRowPointers().getData(), - this->matrix->getColumnIndexes().getData(), - inVector.getData(), - alpha, - outVector.getData() ); -#endif - } -}; - -template<> -class CusparseCSR< float > : public CusparseCSRBase< float > -{ - public: - - template< typename InVector, - typename OutVector > - void vectorProduct( const InVector& inVector, - OutVector& outVector ) const - { - TNL_ASSERT_TRUE( matrix, "matrix was not initialized" ); -#ifdef HAVE_CUDA - float d = 1.0; - float* alpha = &d; - cusparseScsrmv( *( this->cusparseHandle ), - CUSPARSE_OPERATION_NON_TRANSPOSE, - this->matrix->getRows(), - this->matrix->getColumns(), - this->matrix->getValues().getSize(), - alpha, - this->matrixDescriptor, - this->matrix->getValues().getData(), - this->matrix->getRowPointers().getData(), - this->matrix->getColumnIndexes().getData(), - inVector.getData(), - alpha, - outVector.getData() ); -#endif - } -}; - -} // namespace TNL - -#endif \ No newline at end of file diff --git a/src/TNL/Matrices/Legacy/BiEllpack.h b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/BiEllpack.h similarity index 98% rename from src/TNL/Matrices/Legacy/BiEllpack.h rename to src/Benchmarks/SpMV/ReferenceFormats/Legacy/BiEllpack.h index 3f7b06a58f680607f0c5f53914efc9aeb15f9c22..dd173cea11719a0deb6005b14a9dbc0920c2b99a 100644 --- a/src/TNL/Matrices/Legacy/BiEllpack.h +++ b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/BiEllpack.h @@ -18,7 +18,7 @@ #pragma once -#include +#include #include namespace TNL { @@ -221,5 +221,5 @@ private: } //namespace Matrices } // namespace TNL -#include +#include diff --git a/src/TNL/Matrices/Legacy/BiEllpack_impl.h b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/BiEllpack_impl.h similarity index 99% rename from src/TNL/Matrices/Legacy/BiEllpack_impl.h rename to src/Benchmarks/SpMV/ReferenceFormats/Legacy/BiEllpack_impl.h index 1bb393bb939aed770f4a3878ab3fca895920243f..afda8c2a5a497aa70947271aec943d21c5a437de 100644 --- a/src/TNL/Matrices/Legacy/BiEllpack_impl.h +++ b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/BiEllpack_impl.h @@ -11,7 +11,7 @@ #pragma once -#include +#include #include #include #include diff --git a/src/TNL/Matrices/Legacy/ChunkedEllpack.h b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/ChunkedEllpack.h similarity index 98% rename from src/TNL/Matrices/Legacy/ChunkedEllpack.h rename to src/Benchmarks/SpMV/ReferenceFormats/Legacy/ChunkedEllpack.h index 93ba63ebf908697f9b716b4989538a4e9e513f0d..10fce9f71b7ee0ce036d5ebd1e33b4ea4792ce7e 100644 --- a/src/TNL/Matrices/Legacy/ChunkedEllpack.h +++ b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/ChunkedEllpack.h @@ -22,7 +22,7 @@ #pragma once -#include +#include #include namespace TNL { @@ -354,5 +354,5 @@ protected: } // namespace Matrices } // namespace TNL -#include +#include diff --git a/src/TNL/Matrices/Legacy/ChunkedEllpack_impl.h b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/ChunkedEllpack_impl.h similarity index 99% rename from src/TNL/Matrices/Legacy/ChunkedEllpack_impl.h rename to src/Benchmarks/SpMV/ReferenceFormats/Legacy/ChunkedEllpack_impl.h index ec05515fdf128c32b9f43547f015757ff7689a83..99c3ef547c78f09ff293c70e0585e7352b8de5de 100644 --- a/src/TNL/Matrices/Legacy/ChunkedEllpack_impl.h +++ b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/ChunkedEllpack_impl.h @@ -10,7 +10,7 @@ #pragma once -#include +#include #include #include #include diff --git a/src/TNL/Matrices/Legacy/Ellpack.h b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/Ellpack.h similarity index 98% rename from src/TNL/Matrices/Legacy/Ellpack.h rename to src/Benchmarks/SpMV/ReferenceFormats/Legacy/Ellpack.h index af730ccd22f864da7ade15f1f134cdc3393037f4..7ddb4bb04e7063b328c340a3e0f0fb3760c45da3 100644 --- a/src/TNL/Matrices/Legacy/Ellpack.h +++ b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/Ellpack.h @@ -10,7 +10,7 @@ #pragma once -#include +#include #include namespace TNL { @@ -212,4 +212,4 @@ protected: } // namespace Matrices } // namespace TNL -#include +#include diff --git a/src/TNL/Matrices/Legacy/Ellpack_impl.h b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/Ellpack_impl.h similarity index 99% rename from src/TNL/Matrices/Legacy/Ellpack_impl.h rename to src/Benchmarks/SpMV/ReferenceFormats/Legacy/Ellpack_impl.h index 39e27f8f9eb6e42417bedf7d8ed9b48b2435968e..1ca524701268dede22277e829f3d3ea587c0f8e6 100644 --- a/src/TNL/Matrices/Legacy/Ellpack_impl.h +++ b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/Ellpack_impl.h @@ -10,7 +10,7 @@ #pragma once -#include +#include #include #include #include diff --git a/src/TNL/Matrices/Legacy/SlicedEllpack.h b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/SlicedEllpack.h similarity index 98% rename from src/TNL/Matrices/Legacy/SlicedEllpack.h rename to src/Benchmarks/SpMV/ReferenceFormats/Legacy/SlicedEllpack.h index 88ab6ae32fe02a0609d84d58e3f630dbbdf6271d..e0bcd3c75d79fe579b05da00974bc0b9217ebc46 100644 --- a/src/TNL/Matrices/Legacy/SlicedEllpack.h +++ b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/SlicedEllpack.h @@ -21,7 +21,7 @@ #pragma once -#include +#include #include namespace TNL { @@ -240,4 +240,4 @@ public: } // namespace Matrices } // namespace TNL -#include +#include diff --git a/src/TNL/Matrices/Legacy/SlicedEllpack_impl.h b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/SlicedEllpack_impl.h similarity index 99% rename from src/TNL/Matrices/Legacy/SlicedEllpack_impl.h rename to src/Benchmarks/SpMV/ReferenceFormats/Legacy/SlicedEllpack_impl.h index fa99206e22fe7f9b09b9c1be83cd60f386c3feb7..6bd8b87aad66a73768fd5f17dfdcee848c5451dc 100644 --- a/src/TNL/Matrices/Legacy/SlicedEllpack_impl.h +++ b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/SlicedEllpack_impl.h @@ -10,7 +10,7 @@ #pragma once -#include +#include #include #include #include diff --git a/src/TNL/Matrices/Legacy/Sparse.h b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/Sparse.h similarity index 93% rename from src/TNL/Matrices/Legacy/Sparse.h rename to src/Benchmarks/SpMV/ReferenceFormats/Legacy/Sparse.h index 275c7a9bc79959103df5cc41ff889fe1c8db26eb..5f75efe1849889ab7a9189961241ebdfd1c9f6e4 100644 --- a/src/TNL/Matrices/Legacy/Sparse.h +++ b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/Sparse.h @@ -11,7 +11,7 @@ #pragma once #include -#include +#include namespace TNL { namespace Matrices { @@ -66,5 +66,5 @@ class Sparse : public Matrix< Real, Device, Index > } // namespace Matrices } // namespace TNL -#include +#include #include diff --git a/src/TNL/Matrices/Legacy/SparseRow.h b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/SparseRow.h similarity index 97% rename from src/TNL/Matrices/Legacy/SparseRow.h rename to src/Benchmarks/SpMV/ReferenceFormats/Legacy/SparseRow.h index eb7a461fba5d59763bfad608dbef3ea3327aa5d1..0b5ff29d9925fdc288ac72a54deebe5d8d72fa46 100644 --- a/src/TNL/Matrices/Legacy/SparseRow.h +++ b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/SparseRow.h @@ -100,4 +100,4 @@ std::ostream& operator<<( std::ostream& str, const SparseRow< Real, Index >& row } // namespace Matrices } // namespace TNL -#include +#include diff --git a/src/TNL/Matrices/Legacy/SparseRow_impl.h b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/SparseRow_impl.h similarity index 98% rename from src/TNL/Matrices/Legacy/SparseRow_impl.h rename to src/Benchmarks/SpMV/ReferenceFormats/Legacy/SparseRow_impl.h index e34f3a8478d20149101553bd45cddd997beda0a1..f538bbb86285fb210c931e7475817dcd447189e6 100644 --- a/src/TNL/Matrices/Legacy/SparseRow_impl.h +++ b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/SparseRow_impl.h @@ -10,7 +10,7 @@ #pragma once -#include +#include #include // Following includes are here to enable usage of std::vector and std::cout. To avoid having to include Device type (HOW would this be done anyway) diff --git a/src/TNL/Matrices/Legacy/Sparse_impl.h b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/Sparse_impl.h similarity index 100% rename from src/TNL/Matrices/Legacy/Sparse_impl.h rename to src/Benchmarks/SpMV/ReferenceFormats/Legacy/Sparse_impl.h diff --git a/src/Benchmarks/SpMV/cusparseCSRMatrix.h b/src/Benchmarks/SpMV/ReferenceFormats/cusparseCSRMatrix.h similarity index 100% rename from src/Benchmarks/SpMV/cusparseCSRMatrix.h rename to src/Benchmarks/SpMV/ReferenceFormats/cusparseCSRMatrix.h diff --git a/src/Benchmarks/SpMV/spmv-legacy.h b/src/Benchmarks/SpMV/spmv-legacy.h index ff1cdacafd0b5a3c5a2c1c67f40567f6a05cb33e..ec0fd001860959efa0492e3a4c8497948ab5c010 100644 --- a/src/Benchmarks/SpMV/spmv-legacy.h +++ b/src/Benchmarks/SpMV/spmv-legacy.h @@ -19,11 +19,11 @@ #include #include -#include -#include -#include +#include +#include +#include #include -#include +#include #include #include @@ -37,7 +37,7 @@ #include using namespace TNL::Matrices; -#include "cusparseCSRMatrix.h" +#include namespace TNL { namespace Benchmarks { @@ -85,11 +85,29 @@ using SparseMatrixLegacy_CSR_Vector = Matrices::Legacy::CSR< Real, Device, Index template< typename Real, typename Device, typename Index > using SparseMatrixLegacy_CSR_Light = Matrices::Legacy::CSR< Real, Device, Index, Matrices::Legacy::CSRLight >; +template< typename Real, typename Device, typename Index > +using SparseMatrixLegacy_CSR_Light2 = Matrices::Legacy::CSR< Real, Device, Index, Matrices::Legacy::CSRLight2 >; + +template< typename Real, typename Device, typename Index > +using SparseMatrixLegacy_CSR_Light3 = Matrices::Legacy::CSR< Real, Device, Index, Matrices::Legacy::CSRLight3 >; + +template< typename Real, typename Device, typename Index > +using SparseMatrixLegacy_CSR_Light4 = Matrices::Legacy::CSR< Real, Device, Index, Matrices::Legacy::CSRLight4 >; + +template< typename Real, typename Device, typename Index > +using SparseMatrixLegacy_CSR_Light5 = Matrices::Legacy::CSR< Real, Device, Index, Matrices::Legacy::CSRLight5 >; + +template< typename Real, typename Device, typename Index > +using SparseMatrixLegacy_CSR_Light6 = Matrices::Legacy::CSR< Real, Device, Index, Matrices::Legacy::CSRLight6 >; + template< typename Real, typename Device, typename Index > using SparseMatrixLegacy_CSR_Adaptive = Matrices::Legacy::CSR< Real, Device, Index, Matrices::Legacy::CSRAdaptive >; template< typename Real, typename Device, typename Index > -using SparseMatrixLegacy_CSR_Stream = Matrices::Legacy::CSR< Real, Device, Index, Matrices::Legacy::CSRStream >; +using SparseMatrixLegacy_CSR_MultiVector = Matrices::Legacy::CSR< Real, Device, Index, Matrices::Legacy::CSRMultiVector >; + +template< typename Real, typename Device, typename Index > +using SparseMatrixLegacy_CSR_LightWithoutAtomic = Matrices::Legacy::CSR< Real, Device, Index, Matrices::Legacy::CSRLightWithoutAtomic >; // Get the name (with extension) of input matrix file std::string getMatrixFileName( const String& InputFileName ) @@ -292,10 +310,16 @@ benchmarkSpmvSynthetic( Benchmark& benchmark, #endif benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Scalar >( benchmark, hostOutVector, inputFileName, verboseMR ); - //benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Vector >( benchmark, hostOutVector, inputFileName, verboseMR ); - //benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Light >( benchmark, hostOutVector, inputFileName, verboseMR ); - //benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Adaptive >( benchmark, hostOutVector, inputFileName, verboseMR ); - //benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Stream >( benchmark, hostOutVector, inputFileName, verboseMR ); + benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Vector >( benchmark, hostOutVector, inputFileName, verboseMR ); + benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Light >( benchmark, hostOutVector, inputFileName, verboseMR ); + benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Light2 >( benchmark, hostOutVector, inputFileName, verboseMR ); + benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Light3 >( benchmark, hostOutVector, inputFileName, verboseMR ); + benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Light4 >( benchmark, hostOutVector, inputFileName, verboseMR ); + benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Light5 >( benchmark, hostOutVector, inputFileName, verboseMR ); + benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Light6 >( benchmark, hostOutVector, inputFileName, verboseMR ); + benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Adaptive >( benchmark, hostOutVector, inputFileName, verboseMR ); + benchmarkSpMV< Real, SparseMatrixLegacy_CSR_MultiVector>( benchmark, hostOutVector, inputFileName, verboseMR ); + benchmarkSpMV< Real, SparseMatrixLegacy_CSR_LightWithoutAtomic>( benchmark, hostOutVector, inputFileName, verboseMR ); benchmarkSpMV< Real, SparseMatrix_CSR >( benchmark, hostOutVector, inputFileName, verboseMR ); benchmarkSpMV< Real, Matrices::Legacy::Ellpack >( benchmark, hostOutVector, inputFileName, verboseMR ); benchmarkSpMV< Real, SparseMatrix_Ellpack >( benchmark, hostOutVector, inputFileName, verboseMR ); diff --git a/src/Benchmarks/SpMV/tnl-benchmark-spmv.h b/src/Benchmarks/SpMV/tnl-benchmark-spmv.h index d8e2003fb5f9e3932d0964696ebf828b429f8f01..82e1f12cde656caf38f45bafa09f8dd38028f126 100644 --- a/src/Benchmarks/SpMV/tnl-benchmark-spmv.h +++ b/src/Benchmarks/SpMV/tnl-benchmark-spmv.h @@ -25,6 +25,7 @@ #include using namespace TNL::Matrices; +#include #include // Used for file naming, so logs don't get overwritten. using namespace TNL; @@ -44,7 +45,12 @@ runSpMVBenchmarks( Benchmark & benchmark, benchmark.newBenchmark( String("Sparse matrix-vector multiplication (") + precision + ")", metadata ); // Start the actual benchmark in spmv.h - SpMVLegacy::benchmarkSpmvSynthetic< Real >( benchmark, inputFileName, verboseMR ); + try { + SpMVLegacy::benchmarkSpmvSynthetic< Real >( benchmark, inputFileName, verboseMR ); + } + catch( const std::exception& ex ) { + std::cerr << ex.what() << std::endl; + } } // Get current date time to have different log files names and avoid overwriting. diff --git a/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables.py b/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables.py index 229e32cc2519ad6a2ea817285ac07c81a6028569..2af4b9ffc65b06476054858228b4b7d19b68c48f 100755 --- a/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables.py +++ b/src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables.py @@ -5,13 +5,52 @@ import re import math import pandas +from collections import defaultdict from TNL.LogParser import LogParser +""" +Sparse matrix formats as they appear in the log file. +""" +cpu_matrix_formats = [ 'CSR', + 'Ellpack', 'Ellpack Legacy', + 'SlicedEllpack', 'SlicedEllpack Legacy', + 'ChunkedEllpack', 'ChunkedEllpack Legacy', + 'BiEllpack', 'BiEllpack Legacy' ] + +gpu_matrix_formats = [ 'CSR Legacy Scalar', 'CSR Legacy Vector', 'CSR Legacy MultiVector', + 'CSR Legacy Light', 'CSR Legacy Light2', 'CSR Legacy Light3', 'CSR Legacy Light4', 'CSR Legacy Light5', 'CSR Legacy Light6', 'CSR Legacy LightWithoutAtomic', + 'CSR Legacy Adaptive', + 'Ellpack', 'Ellpack Legacy', + 'SlicedEllpack', 'SlicedEllpack Legacy', + 'ChunkedEllpack', 'ChunkedEllpack Legacy', + 'BiEllpack', 'BiEllpack Legacy' ] +""" +CPU formats to be compared +""" +cpu_comparison_formats = { 'CSR' : 'CSR Legacy Scalar', + 'Ellpack' : 'Ellpack Legacy', + 'SlicedEllpack' : 'SlicedEllpack Legacy', + 'BiEllpack' : 'BiEllpack Legacy' + } + +""" +GPU formats to be compared +""" +gpu_comparison_formats = { #'CSR' : 'CSR Legacy Scalar', + 'Ellpack' : 'Ellpack Legacy', + 'SlicedEllpack' : 'SlicedEllpack Legacy', + 'BiEllpack' : 'BiEllpack Legacy' + } #pandas.options.display.float_format = "{:.2f}".format pandas.options.display.float_format = "{:.2e}".format pandas.options.display.width = 0 # auto-detect terminal width for formatting pandas.options.display.max_rows = None +def slugify(s): + s = str(s).strip().replace(' ', '_') + return re.sub(r'(?u)[^-\w.]', '', s) + + def parse_file(fname): parser = LogParser() for metadata, df in parser.readFile(fname): @@ -59,22 +98,8 @@ df = df.reorder_levels([2, 0, 1], axis=1) df.sort_index(axis=1, inplace=True) # Drop CPU speedup -df.drop(columns=('BiEllpack Legacy', 'CPU','speedup'), axis=1, inplace=True ) -df.drop(columns=('BiEllpack', 'CPU','speedup'), axis=1, inplace=True ) -df.drop(columns=('CSR', 'CPU','speedup'), axis=1, inplace=True ) -#df.drop(columns=('CSR Legacy Adaptive', 'CPU','speedup'), axis=1, inplace=True ) -#df.drop(columns=('CSR Legacy Light', 'CPU','speedup'), axis=1, inplace=True ) -#df.drop(columns=('CSR Legacy LightWithoutAtomic', 'CPU','speedup'), axis=1, inplace=True ) -#df.drop(columns=('CSR Legacy Scalar', 'CPU','speedup'), axis=1, inplace=True ) -#df.drop(columns=('CSR Legacy Stream', 'CPU','speedup'), axis=1, inplace=True ) -#df.drop(columns=('CSR Legacy Vector', 'CPU','speedup'), axis=1, inplace=True ) -#df.drop(columns=('CSR Legacy MultiVector', 'CPU','speedup'), axis=1, inplace=True ) -df.drop(columns=('ChunkedEllpack Legacy', 'CPU','speedup'), axis=1, inplace=True ) -df.drop(columns=('Ellpack', 'CPU','speedup'), axis=1, inplace=True ) -df.drop(columns=('Ellpack Legacy', 'CPU','speedup'), axis=1, inplace=True ) -df.drop(columns=('SlicedEllpack', 'CPU','speedup'), axis=1, inplace=True ) -df.drop(columns=('SlicedEllpack Legacy', 'CPU','speedup'), axis=1, inplace=True ) -#df.drop(columns=('cuSparse', 'CPU'), axis=1, inplace=True ) +for cpu_format in cpu_matrix_formats: + df.drop(columns=( cpu_format, 'CPU','speedup'), axis=1, inplace=True ) #print( "Exporting data frame to log.html..." ) #pandas.options.display.float_format = '{:,.4f}'.format @@ -82,285 +107,147 @@ df.drop(columns=('SlicedEllpack Legacy', 'CPU','speedup'), axis=1, inplace=True print( "Computing speed-up of formats...") # Add speedup compared to CSR and cuSparse -df["BiEllpack Legacy", "CPU", "CSR speedup"] = df["BiEllpack Legacy", "CPU", "time"] / df["CSR", "CPU", "time"] -df["BiEllpack Legacy", "GPU", "cuSparse speedup"] = df["BiEllpack Legacy", "GPU", "time"] / df["cuSparse", "GPU", "time"] -df["BiEllpack", "CPU", "CSR speedup"] = df["BiEllpack", "CPU", "time"] / df["CSR", "CPU", "time"] -df["BiEllpacky", "GPU", "cuSparse speedup"] = df["BiEllpack", "GPU", "time"] / df["cuSparse", "GPU", "time"] -df["CSR", "GPU", "cuSparse speedup"] = df["CSR", "GPU", "time"] / df["cuSparse", "GPU", "time"] -#df["CSR Legacy Adaptive", "GPU", "cuSparse speedup"] = df["CSR Legacy Adaptive", "GPU", "time"] / df["cuSparse", "GPU", "time"] -#df["CSR Legacy Light", "GPU", "cuSparse speedup"] = df["CSR Legacy Light", "GPU", "time"] / df["cuSparse", "GPU", "time"] -#df["CSR Legacy LightWithoutAtomic", "GPU", "cuSparse speedup"] = df["CSR Legacy LightWithoutAtomic", "GPU", "time"] / df["cuSparse", "GPU", "time"] -#df["CSR Legacy Scalar", "GPU", "cuSparse speedup"] = df["CSR Legacy Scalar", "GPU", "time"] / df["cuSparse", "GPU", "time"] -#df["CSR Legacy Vector", "GPU", "cuSparse speedup"] = df["CSR Legacy Vector", "GPU", "time"] / df["cuSparse", "GPU", "time"] -#df["CSR Legacy MultiVector", "GPU", "cuSparse speedup"] = df["CSR Legacy MultiVector", "GPU", "time"] / df["cuSparse", "GPU", "time"] -df["ChunkedEllpack Legacy", "CPU", "CSR speedup"] = df["ChunkedEllpack Legacy", "CPU", "time"] / df["CSR", "CPU", "time"] -df["ChunkedEllpack Legacy", "GPU", "cuSparse speedup"] = df["ChunkedEllpack Legacy", "GPU", "time"] / df["cuSparse", "GPU", "time"] -df["Ellpack Legacy", "CPU", "CSR speedup"] = df["Ellpack Legacy", "CPU", "time"] / df["CSR", "CPU", "time"] -df["Ellpack Legacy", "GPU", "cuSparse speedup"] = df["Ellpack Legacy", "GPU", "time"] / df["cuSparse", "GPU", "time"] -df["Ellpack", "CPU", "CSR speedup"] = df["Ellpack", "CPU", "time"] / df["CSR", "CPU", "time"] -df["Ellpack", "GPU", "cuSparse speedup"] = df["Ellpack", "GPU", "time"] / df["cuSparse", "GPU", "time"] -df["SlicedEllpack Legacy", "CPU", "CSR speedup"] = df["SlicedEllpack Legacy", "CPU", "time"] / df["CSR", "CPU", "time"] -df["SlicedEllpack Legacy", "GPU", "cuSparse speedup"] = df["SlicedEllpack Legacy", "GPU", "time"] / df["cuSparse", "GPU", "time"] -df["SlicedEllpack", "CPU", "CSR speedup"] = df["SlicedEllpack", "CPU", "time"] / df["CSR", "CPU", "time"] -df["SlicedEllpack", "GPU", "cuSparse speedup"] = df["SlicedEllpack", "GPU", "time"] / df["cuSparse", "GPU", "time"] +for cpu_format in cpu_matrix_formats: + if cpu_format != 'CSR': + df[cpu_format, "CPU", "CSR speedup"] = df[cpu_format, "CPU", "time"] / df["CSR","CPU", "time"] + +for gpu_format in gpu_matrix_formats: + df[ gpu_format, "GPU", "cuSparse speedup"] = df[ gpu_format,"GPU", "time"] / df["cuSparse", "GPU", "time"] # Add speedup compared to legacy formats -df["CSR", "GPU", "Legacy speedup"] = df["CSR", "GPU", "time"] / df["CSR Legacy Scalar", "GPU", "time"] -df["CSR", "CPU", "Legacy speedup"] = df["CSR", "CPU", "time"] / df["CSR Legacy Scalar", "CPU", "time"] -df["Ellpack", "GPU", "Legacy speedup"] = df["Ellpack", "GPU", "time"] / df["Ellpack Legacy", "GPU", "time"] -df["Ellpack", "CPU", "Legacy speedup"] = df["Ellpack", "CPU", "time"] / df["Ellpack Legacy", "CPU", "time"] -df["SlicedEllpack", "GPU", "Legacy speedup"] = df["SlicedEllpack", "GPU", "time"] / df["SlicedEllpack Legacy", "GPU", "time"] -df["SlicedEllpack", "CPU", "Legacy speedup"] = df["SlicedEllpack", "CPU", "time"] / df["SlicedEllpack Legacy", "CPU", "time"] -df["BiEllpack", "GPU", "Legacy speedup"] = df["BiEllpack", "GPU", "time"] / df["BiEllpack Legacy", "GPU", "time"] -df["BiEllpack", "CPU", "Legacy speedup"] = df["BiEllpack", "CPU", "time"] / df["BiEllpack Legacy", "CPU", "time"] +for format in cpu_comparison_formats: + other_format = cpu_comparison_formats[ format ] + df[ format, "CPU", f"{other_format} speedup"] = df[ format, "CPU", "time"] / df[ other_format, "CPU", "time"] + +for format in gpu_comparison_formats: + other_format = gpu_comparison_formats[ format ] + df[ format, "GPU", f"{other_format} speedup"] = df[ format, "GPU", "time"] / df[ other_format, "GPU", "time"] print( "Exporting data frame to log.html..." ) pandas.options.display.float_format = '{:,.4f}'.format df.to_html("log.html") -# extract columns of reference formats on GPU +""" +Extract columns of reference formats on GPU +""" print( "Preparing data for graph analysis..." ) df['cuSparse-bandwidth' ] = df[ 'cuSparse','GPU','bandwidth'] -#df['csr-legacy-adaptive-bandwidth' ] = df[ 'CSR Legacy Adaptive','GPU','bandwidth'] -#df['csr-legacy-light-bandwidth' ] = df[ 'CSR Legacy Light','GPU','bandwidth'] -#df['csr-legacy-light-without-atomic-bandwidth' ] = df[ 'CSR Legacy LightWithoutAtomic','GPU','bandwidth'] -#df['csr-legacy-scalar-bandwidth' ] = df[ 'CSR Legacy Scalar','GPU','bandwidth'] -#df['csr-legacy-vector-bandwidth' ] = df[ 'CSR Legacy Vector','GPU','bandwidth'] -#df['csr-legacy-multi-vector-bandwidth' ] = df[ 'CSR Legacy MultiVector','GPU','bandwidth'] -df['ellpack-bandwidth' ] = df[ 'Ellpack','GPU','bandwidth'] -df['sliced-ellpack-bandwidth' ] = df[ 'SlicedEllpack','GPU','bandwidth'] -df['chunked-ellpack-bandwidth' ] = df[ 'ChunkedEllpack','GPU','bandwidth'] -df['bi-ellpack-bandwidth' ] = df[ 'BiEllpack','GPU','bandwidth'] - -# sort by cuSparse +for gpu_format in gpu_matrix_formats: + df[ gpu_format + ' Bandwidth' ] = df[ gpu_format,'GPU','bandwidth'] + +""" +Sort by cuSparse +""" df.sort_values(by=["cuSparse-bandwidth"],inplace=True,ascending=False) cuSparse_list = df['cuSparse-bandwidth'].tolist() -#cuSparse_csr_legacy_adaptive_gpu_list = df[ "CSR Legacy Adaptive", "GPU", "bandwidth"].tolist(); -#cuSparse_csr_legacy_light_gpu_list = df[ "CSR Legacy Light", "GPU", "bandwidth"].tolist(); -#cuSparse_csr_legacy_light_without_atomic_gpu_list = df[ "CSR Legacy LightWithoutAtomic", "GPU", "bandwidth"].tolist(); -#cuSparse_csr_legacy_scalar_gpu_list = df[ "CSR Legacy Scalar", "GPU", "bandwidth"].tolist(); -#cuSparse_csr_legacy_vector_gpu_list = df[ "CSR Legacy Vector", "GPU", "bandwidth"].tolist(); -#cuSparse_csr_legacy_multivector_gpu_list = df[ "CSR Legacy MultiVector", "GPU", "bandwidth"].tolist(); -cuSparse_ellpack_gpu_list = df[ "Ellpack", "GPU", "bandwidth"].tolist(); -cuSparse_ellpack_legacy_gpu_list = df[ "Ellpack Legacy", "GPU", "bandwidth"].tolist(); -cuSparse_sliced_ellpack_gpu_list = df[ "SlicedEllpack", "GPU", "bandwidth"].tolist(); -cuSparse_sliced_ellpack_legacy_gpu_list = df[ "SlicedEllpack Legacy", "GPU", "bandwidth"].tolist(); -cuSparse_chunked_ellpack_legacy_gpu_list = df[ "ChunkedEllpack Legacy", "GPU", "bandwidth"].tolist(); -cuSparse_chunked_ellpack_gpu_list = df[ "ChunkedEllpack", "GPU", "bandwidth"].tolist(); -cuSparse_bi_ellpack_legacy_gpu_list = df[ "BiEllpack Legacy", "GPU", "bandwidth"].tolist(); -cuSparse_bi_ellpack_gpu_list = df[ "BiEllpack", "GPU", "bandwidth"].tolist(); - -# sort by Ellpack -df.sort_values(by=["ellpack-bandwidth"],inplace=True,ascending=False) -ellpack_gpu_list = df["Ellpack", "GPU", "bandwidth"].tolist(); -ellpack_legacy_gpu_list = df["Ellpack Legacy", "GPU", "bandwidth"].tolist(); - -# sort by SlicedEllpack -df.sort_values(by=["sliced-ellpack-bandwidth"],inplace=True,ascending=False) -df.sort_values(by=["sliced-ellpack-bandwidth"],inplace=True,ascending=False) -sliced_ellpack_gpu_list = df["SlicedEllpack", "GPU", "bandwidth"].tolist(); -sliced_ellpack_legacy_gpu_list = df["SlicedEllpack Legacy", "GPU", "bandwidth"].tolist(); - -# sort by ChunkedEllpack -df.sort_values(by=["chunked-ellpack-bandwidth"],inplace=True,ascending=False) -df.sort_values(by=["chunked-ellpack-bandwidth"],inplace=True,ascending=False) -chunked_ellpack_gpu_list = df["ChunkedEllpack", "GPU", "bandwidth"].tolist(); -chunked_ellpack_legacy_gpu_list = df["ChunkedEllpack Legacy", "GPU", "bandwidth"].tolist(); - -# sort by BiEllpack -df.sort_values(by=["bi-ellpack-bandwidth"],inplace=True,ascending=False) -df.sort_values(by=["bi-ellpack-bandwidth"],inplace=True,ascending=False) -bi_ellpack_gpu_list = df["BiEllpack", "GPU", "bandwidth"].tolist(); -bi_ellpack_legacy_gpu_list = df["BiEllpack Legacy", "GPU", "bandwidth"].tolist(); - +cusparse_comparison = defaultdict( list ) +for gpu_format in gpu_matrix_formats: + cusparse_comparison[ gpu_format ] = df[ gpu_format, "GPU", "bandwidth" ].tolist() + +""" +Sort by comparison formats +""" +formats_comparison = defaultdict( list ) +for format in gpu_comparison_formats: + df.sort_values(by=[f"{format} Bandwidth"],inplace=True,ascending=False) + formats_comparison[ format ] = df[format, "GPU", "bandwidth"].tolist(); + formats_comparison[ gpu_comparison_formats[ format ] ] = df[gpu_comparison_formats[ format ], "GPU", "bandwidth"].tolist(); + +""" +Writting gnuplot source files +""" print( "Writing gnuplot files..." ) -cuSparse_file = open( "cusparse.gplt", "w" ) -i = 0 -for x in cuSparse_list: - if str( x ) != "nan": - if ( #str( cuSparse_csr_legacy_adaptive_gpu_list[ i ] ) != "nan" and - #str( cuSparse_csr_legacy_light_gpu_list[ i ] ) != "nan" and - #str( cuSparse_csr_legacy_light_without_atomic_gpu_list[ i ] ) != "nan" and - #str( cuSparse_csr_legacy_scalar_gpu_list[ i ] ) != "nan" and - #str( cuSparse_csr_legacy_vector_gpu_list[ i ] ) != "nan" and - #str( cuSparse_csr_legacy_multivector_gpu_list[ i ] ) != "nan" and - str( cuSparse_ellpack_gpu_list[ i ] ) != "nan" and - str( cuSparse_ellpack_legacy_gpu_list[ i ] ) != "nan" and - str( cuSparse_sliced_ellpack_gpu_list[ i ] ) != "nan" and - str( cuSparse_sliced_ellpack_legacy_gpu_list[ i ] ) != "nan" and - str( cuSparse_chunked_ellpack_gpu_list[ i ] ) != "nan" and - str( cuSparse_chunked_ellpack_legacy_gpu_list[ i ] ) != "nan" and - str( cuSparse_bi_ellpack_gpu_list[ i ] ) != "nan" and - str( cuSparse_bi_ellpack_legacy_gpu_list[ i ] ) != "nan" ): - cuSparse_file.write( f"{i+1} {x} " ) # 1 2 - cuSparse_file.write( f"0 " ) #{cuSparse_csr_legacy_adaptive_gpu_list[ i ]} " ) # 3 - cuSparse_file.write( f"0 " ) #{cuSparse_csr_legacy_light_gpu_list[ i ]} " ) # 4 - cuSparse_file.write( f"0 " ) #{cuSparse_csr_legacy_light_without_atomic_gpu_list[ i ]} " ) # 5 - cuSparse_file.write( f"0 " ) #{cuSparse_csr_legacy_scalar_gpu_list[ i ]} " ) # 6 - cuSparse_file.write( f"0 " ) #{cuSparse_csr_legacy_vector_gpu_list[ i ]} " ) # 7 - cuSparse_file.write( f"0 " ) #{cuSparse_csr_legacy_multivector_gpu_list[ i ]} " ) # 8 - cuSparse_file.write( f"{cuSparse_ellpack_gpu_list[ i ]} {cuSparse_ellpack_legacy_gpu_list[ i ]} " ) # 9 10 - cuSparse_file.write( f"{cuSparse_sliced_ellpack_gpu_list[ i ]} {cuSparse_sliced_ellpack_legacy_gpu_list[ i ]} " ) # 11 12 - cuSparse_file.write( f"{cuSparse_chunked_ellpack_gpu_list[ i ]} {cuSparse_chunked_ellpack_legacy_gpu_list[ i ]} " ) # 13 14 - cuSparse_file.write( f"{cuSparse_bi_ellpack_gpu_list[ i ]} {cuSparse_bi_ellpack_legacy_gpu_list[ i ]}\n" ) # 15 16 - i = i + 1 -cuSparse_file.close() - -ellpack_file = open( "ellpack.gplt", "w" ) -i = 0; -for x in ellpack_gpu_list: - if str( x ) != "nan": - if str( ellpack_legacy_gpu_list[ i ] ) != "nan": - ellpack_file.write( f"{i+1} {x} {ellpack_legacy_gpu_list[ i ]}\n" ) - i = i + 1 -ellpack_file.close() - -sliced_ellpack_file = open( "sliced-ellpack.gplt", "w" ) -i = 0; -for x in sliced_ellpack_gpu_list: - if str( x ) != "nan": - if str( sliced_ellpack_legacy_gpu_list[ i ] ) != "nan": - sliced_ellpack_file.write( f"{i+1} {x} {sliced_ellpack_legacy_gpu_list[ i ]}\n" ) - i = i + 1 -sliced_ellpack_file.close() - -chunked_ellpack_file = open( "chunked-ellpack.gplt", "w" ) -i = 0; -for x in chunked_ellpack_gpu_list: - if str( x ) != "nan": - if str( chunked_ellpack_legacy_gpu_list[ i ] ) != "nan": - chunked_ellpack_file.write( f"{i+1} {x} {chunked_ellpack_legacy_gpu_list[ i ]}\n" ) - i = i + 1 -chunked_ellpack_file.close() - -bi_ellpack_file = open( "bi-ellpack.gplt", "w" ) -i = 0; -for x in bi_ellpack_gpu_list: - if str( x ) != "nan": - if str( bi_ellpack_legacy_gpu_list[ i ] ) != "nan": - bi_ellpack_file.write( f"{i+1} {x} {bi_ellpack_legacy_gpu_list[ i ]}\n" ) - i = i + 1 -bi_ellpack_file.close() - -print( "Generating Gnuplot file..." ) +for gpu_format in gpu_matrix_formats: + filename = "cusparse-" + slugify( gpu_format ) + ".gplt" + data = cusparse_comparison[ gpu_format ] + out_file = open( filename, "w" ) + i = 0 + for x in cuSparse_list: + if str( x ) != "nan": + if ( str(cusparse_comparison[ gpu_format ][ i ] ) != "nan" ): + out_file.write( f"{i+1} {x} {data[ i ]} \n" ) + i = i + 1; + out_file.close() + +for format in gpu_comparison_formats: + out_file = open( f"{slugify(format)}-gpu-comparison.gplt", "w" ) + data = formats_comparison[ format ] + other_data = formats_comparison[ gpu_comparison_formats[ format ] ] + i = 0 + for x in data: + if str( x ) != "nan": + if str( other_data[ i ] ) != "nan": + out_file.write( f"{i+1} {x} {other_data[ i ]}\n" ) + i = i + 1 + out_file.close() + +""" +Generating gnuplot script +""" +print( "Generating Gnuplot script..." ) gnuplot_file = open( "gnuplot.gplt", "w" ) -# NOTE: """...""" allows multi-line strings, r"..." disables backslash-escaping (so a single \ is just a \ in the output) gnuplot_file.write( r""" set terminal postscript lw 3 20 color set grid set xlabel 'Matrix' set xtics 250 set ylabel 'Bandwidth GB/sec' -#set output 'csr-legacy-adaptive-vs-cusparse.eps' -#plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red', \ -# 'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red', \ -# 'cusparse.gplt' using 1:3 title '' with dots linewidth 2 lt rgb 'green', \ -# 'cusparse.gplt' using 1:3 title 'CSR Legacy Adaptive' with lines linewidth 0.5 lt rgb 'green', -#set output 'csr-legacy-light-vs-cusparse.eps' -#plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red', \ -# 'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red', \ -# 'cusparse.gplt' using 1:4 title '' with dots linewidth 2 lt rgb 'green', \ -# 'cusparse.gplt' using 1:4 title 'CSR Legacy Light' with lines linewidth 0.5 lt rgb 'green', -#set output 'csr-legacy-light-without-atomic-vs-cusparse.eps' -#plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red', \ -# 'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red', \ -# 'cusparse.gplt' using 1:5 title '' with dots linewidth 2 lt rgb 'green', \ -# 'cusparse.gplt' using 1:5 title 'CSR Legacy LightWithoutAtomic' with lines linewidth 0.5 lt rgb 'green', -#set output 'csr-legacy-scalar-vs-cusparse.eps' -#plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red', \ -# 'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red', \ -# 'cusparse.gplt' using 1:6 title '' with dots linewidth 2 lt rgb 'green', \ -# 'cusparse.gplt' using 1:6 title 'CSR Legacy Scalar' with lines linewidth 0.5 lt rgb 'green', -#set output 'csr-legacy-vector-vs-cusparse.eps' -#plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red', \ -# 'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red', \ -# 'cusparse.gplt' using 1:7 title '' with dots linewidth 2 lt rgb 'green', \ -# 'cusparse.gplt' using 1:7 title 'CSR Legacy Vector' with lines linewidth 0.5 lt rgb 'green', -#set output 'csr-legacy-multivector-vs-cusparse.eps' -#plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red', \ -# 'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red', \ -# 'cusparse.gplt' using 1:8 title '' with dots linewidth 2 lt rgb 'green', \ -# 'cusparse.gplt' using 1:8 title 'CSR Legacy MultiVector' with lines linewidth 0.5 lt rgb 'green', -set output 'ellpack-vs-cusparse.eps' -plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red', \ - 'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red', \ - 'cusparse.gplt' using 1:9 title '' with dots linewidth 2 lt rgb 'green', \ - 'cusparse.gplt' using 1:9 title 'Ellpack' with lines linewidth 0.5 lt rgb 'green', \ - 'cusparse.gplt' using 1:10 title '' with dots linewidth 2 lt rgb 'blue', \ - 'cusparse.gplt' using 1:10 title 'Ellpack Legacy' with lines linewidth 0.5 lt rgb 'blue' -set output 'sliced-ellpack-vs-cusparse.eps' -plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red', \ - 'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red', \ - 'cusparse.gplt' using 1:11 title '' with dots linewidth 2 lt rgb 'green', \ - 'cusparse.gplt' using 1:11 title 'Sliced Ellpack' with lines linewidth 0.5 lt rgb 'green', \ - 'cusparse.gplt' using 1:12 title '' with dots linewidth 2 lt rgb 'blue', \ - 'cusparse.gplt' using 1:12 title 'Sliced Ellpack Legacy' with lines linewidth 0.5 lt rgb 'blue' -set output 'chunked-ellpack-vs-cusparse.eps' -plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red', \ - 'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red', \ - 'cusparse.gplt' using 1:13 title '' with dots linewidth 2 lt rgb 'green', \ - 'cusparse.gplt' using 1:13 title 'Chunked Ellpack' with lines linewidth 0.5 lt rgb 'green', \ - 'cusparse.gplt' using 1:14 title '' with dots linewidth 2 lt rgb 'blue', \ - 'cusparse.gplt' using 1:14 title 'Chunked Ellpack Legacy' with lines linewidth 0.5 lt rgb 'blue' -set output 'bi-ellpack-vs-cusparse.eps' -plot 'cusparse.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red', \ - 'cusparse.gplt' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red', \ - 'cusparse.gplt' using 1:15 title '' with dots linewidth 2 lt rgb 'green', \ - 'cusparse.gplt' using 1:15 title 'BiEllpack' with lines linewidth 0.5 lt rgb 'green', \ - 'cusparse.gplt' using 1:16 title '' with dots linewidth 2 lt rgb 'blue', \ - 'cusparse.gplt' using 1:16 title 'BiEllpack Legacy' with lines linewidth 0.5 lt rgb 'blue' -set output 'ellpack-vs-ellpack-legacy.eps' -plot 'ellpack.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red', \ - 'ellpack.gplt' using 1:2 title 'Ellpack' with lines linewidth 0.5 lt rgb 'red', \ - 'ellpack.gplt' using 1:3 title '' with dots linewidth 2 lt rgb 'blue', \ - 'ellpack.gplt' using 1:3 title 'Ellpack Legacy' with lines linewidth 0.5 lt rgb 'blue' -set output 'sliced-ellpack-vs-sliced-ellpack-legacy.eps' -plot 'sliced-ellpack.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red', \ - 'sliced-ellpack.gplt' using 1:2 title 'SlicedEllpack' with lines linewidth 0.5 lt rgb 'red', \ - 'sliced-ellpack.gplt' using 1:3 title '' with dots linewidth 2 lt rgb 'blue', \ - 'sliced-ellpack.gplt' using 1:3 title 'SlicedEllpack Legacy' with lines linewidth 0.5 lt rgb 'blue' -set output 'chunked-ellpack-vs-chunked-ellpack-legacy.eps' -plot 'chunked-ellpack.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red', \ - 'chunked-ellpack.gplt' using 1:2 title 'ChunkedEllpack' with lines linewidth 0.5 lt rgb 'red', \ - 'chunked-ellpack.gplt' using 1:3 title '' with dots linewidth 2 lt rgb 'blue', \ - 'chunked-ellpack.gplt' using 1:3 title 'ChunkedEllpack Legacy' with lines linewidth 0.5 lt rgb 'blue' -set output 'bi-ellpack-vs-bi-ellpack-legacy.eps' -plot 'bi-ellpack.gplt' using 1:2 title '' with dots linewidth 2 lt rgb 'red', \ - 'bi-ellpack.gplt' using 1:2 title 'BiEllpack' with lines linewidth 0.5 lt rgb 'red', \ - 'bi-ellpack.gplt' using 1:3 title '' with dots linewidth 2 lt rgb 'blue', \ - 'bi-ellpack.gplt' using 1:3 title 'BiEllpack Legacy' with lines linewidth 0.5 lt rgb 'blue' -""") +""" ) +for gpu_format in gpu_matrix_formats: + filename = "cusparse-" + slugify( gpu_format ) + ".gplt" + gnuplot_file.write( f"set output 'cusparse-vs-{slugify(gpu_format)}.eps' \n" ) + gnuplot_file.write( f"plot '{filename}' using 1:2 title '' with dots linewidth 2 lt rgb 'red', " ) + gnuplot_file.write( f" '{filename}' using 1:2 title 'cuSparse' with lines linewidth 0.5 lt rgb 'red', " ) + gnuplot_file.write( f" '{filename}' using 1:3 title '' with dots linewidth 2 lt rgb 'green', " ) + gnuplot_file.write( f" '{filename}' using 1:3 title '{gpu_format}' with lines linewidth 0.5 lt rgb 'green' \n" ) + + +for format in gpu_comparison_formats: + filename = f"{slugify(format)}-gpu-comparison.gplt" + data = formats_comparison[ format ] + other_data = formats_comparison[ gpu_comparison_formats[ format ] ] + gnuplot_file.write( f"set output '{slugify(format)}-vs-{slugify(gpu_comparison_formats[ format ])}.eps' \n" ) + gnuplot_file.write( f"plot '{filename}' using 1:2 title '' with dots linewidth 2 lt rgb 'red', " ) + gnuplot_file.write( f" '{filename}' using 1:2 title '{format}' with lines linewidth 0.5 lt rgb 'red'," ) + gnuplot_file.write( f" '{filename}' using 1:3 title '' with dots linewidth 2 lt rgb 'blue', " ) + gnuplot_file.write( f" '{filename}' using 1:3 title '{gpu_comparison_formats[ format ]}' with lines linewidth 0.5 lt rgb 'blue' \n" ) + gnuplot_file.close() +""" +Executing Gnuplot +""" + print( "Executing Gnuplot ..." ) os.system( "gnuplot gnuplot.gplt" ) +""" +Converting files to PDF +""" print( "Converting files to PDF ..." ) -#os.system( "epstopdf --autorotate All csr-legacy-adaptive-vs-cusparse.eps" ) -#os.system( "epstopdf --autorotate All csr-legacy-light-vs-cusparse.eps" ) -#os.system( "epstopdf --autorotate All csr-legacy-light-without-atomic-vs-cusparse.eps" ) -#os.system( "epstopdf --autorotate All csr-legacy-scalar-vs-cusparse.eps" ) -#os.system( "epstopdf --autorotate All csr-legacy-vector-vs-cusparse.eps" ) -#os.system( "epstopdf --autorotate All csr-legacy-multivector-vs-cusparse.eps" ) -os.system( "epstopdf --autorotate All ellpack-vs-cusparse.eps" ) -os.system( "epstopdf --autorotate All sliced-ellpack-vs-cusparse.eps" ) -os.system( "epstopdf --autorotate All chunked-ellpack-vs-cusparse.eps" ) -os.system( "epstopdf --autorotate All bi-ellpack-vs-cusparse.eps" ) -os.system( "epstopdf --autorotate All ellpack-vs-ellpack-legacy.eps" ) -os.system( "epstopdf --autorotate All sliced-ellpack-vs-sliced-ellpack-legacy.eps" ) -os.system( "epstopdf --autorotate All chunked-ellpack-vs-chunked-ellpack-legacy.eps" ) -os.system( "epstopdf --autorotate All bi-ellpack-vs-bi-ellpack-legacy.eps" ) +for gpu_format in gpu_matrix_formats: + filename = "cusparse-vs-" + slugify( gpu_format ) + ".eps" + os.system( f"epstopdf --autorotate All {filename}" ) + +for format in gpu_comparison_formats: + filename = slugify(format) + "-vs-" + slugify(gpu_comparison_formats[ format ]) + ".eps" + os.system( f"epstopdf --autorotate All {filename}" ) +""" +Deleting temporary files +""" print( "Deleting temprary files..." ) -#os.system( "rm cusparse.gplt" ) -#os.system( "rm ellpack.gplt" ) -#os.system( "rm sliced-ellpack.gplt" ) -#os.system( "rm gnuplot.gplt" ) -#os.system( "rm ellpack-vs-cusparse.eps" ) -#os.system( "rm sliced-ellpack-vs-cusparse.eps" ) -#os.system( "rm chunked-ellpack-vs-cusparse.eps" ) -#os.system( "rm bi-ellpack-vs-cusparse.eps" ) -#os.system( "rm ellpack-vs-ellpack-legacy.eps" ) -#os.system( "rm sliced-ellpack-vs-sliced-ellpack-legacy.eps" ) +for gpu_format in gpu_matrix_formats: + filename = "cusparse-" + slugify( gpu_format ) + ".gplt" + os.system( f"rm {filename}" ) + filename = "cusparse-vs-" + slugify( gpu_format ) + ".eps" + os.system( f"rm {filename}" ) + +for format in gpu_comparison_formats: + filename = f"{slugify(format)}-gpu-comparison.gplt" + os.system( f"rm {filename}" ) + filename = slugify(format) + "-vs-" + slugify(gpu_comparison_formats[ format ]) + ".eps" + os.system( f"rm {filename}" ) +os.system( "rm gnuplot.gplt" ) diff --git a/src/Python/pytnl/tnl/SparseMatrix.cpp b/src/Python/pytnl/tnl/SparseMatrix.cpp index f4b1772a706bbfd8d7171cc5a50f93e765b4169d..b5e99c27577af8ee9741c480ff1824634eeb9a35 100644 --- a/src/Python/pytnl/tnl/SparseMatrix.cpp +++ b/src/Python/pytnl/tnl/SparseMatrix.cpp @@ -4,8 +4,8 @@ #include "SparseMatrix.h" #include -#include -#include +#include +#include using CSR_host = TNL::Matrices::Legacy::CSR< double, TNL::Devices::Host, int >; using CSR_cuda = TNL::Matrices::Legacy::CSR< double, TNL::Devices::Cuda, int >; diff --git a/src/TNL/Config/ConfigEntryType.h b/src/TNL/Config/ConfigEntryType.h index 28f57a58228d561bc424584c60084bc2ff2111b8..4e6544639add2786ef40c58346770ba67614ca0f 100644 --- a/src/TNL/Config/ConfigEntryType.h +++ b/src/TNL/Config/ConfigEntryType.h @@ -12,6 +12,8 @@ #pragma once +#include +#include #include #include #include diff --git a/src/TNL/Matrices/Legacy/AdEllpack.h b/src/TNL/Matrices/Legacy/AdEllpack.h index 260bdc4ac1f6e9cec000886f5a3124ee0d583210..f1a023007230ce5d5a8dfadb3434dab1bdd09bae 100644 --- a/src/TNL/Matrices/Legacy/AdEllpack.h +++ b/src/TNL/Matrices/Legacy/AdEllpack.h @@ -18,7 +18,7 @@ #pragma once -#include +#include #include namespace TNL { diff --git a/src/TNL/Matrices/Legacy/BiEllpackSymmetric.h b/src/TNL/Matrices/Legacy/BiEllpackSymmetric.h deleted file mode 100644 index 09fe7c4e55b8247ef77846356dd20110f2d7eac6..0000000000000000000000000000000000000000 --- a/src/TNL/Matrices/Legacy/BiEllpackSymmetric.h +++ /dev/null @@ -1,184 +0,0 @@ -/*************************************************************************** - BiEllpackSymmetric.h - description - ------------------- - begin : Aug 30, 2018 - copyright : (C) 2018 by Tomas Oberhuber - email : tomas.oberhuber@fjfi.cvut.cz - ***************************************************************************/ - -/* See Copyright Notice in tnl/Copyright */ - -#pragma once - -#include -#include - -namespace TNL { -namespace Matrices { - namespace Legacy { - -template< typename Device > -class BiEllpackSymmetricDeviceDependentCode; - -template< typename Real, typename Device = Devices::Cuda, typename Index = int, int StripSize = 32 > -class BiEllpackSymmetric : public Sparse< Real, Device, Index > -{ -public: - typedef Real RealType; - typedef Device DeviceType; - typedef Index IndexType; - typedef typename Sparse< RealType, DeviceType, IndexType >::CompressedRowLengthsVector CompressedRowLengthsVector; - typedef typename Sparse< RealType, DeviceType, IndexType >::ConstCompressedRowLengthsVectorView ConstCompressedRowLengthsVectorView; - typedef typename Sparse< RealType, DeviceType, IndexType >::ValuesVector ValuesVector; - typedef typename Sparse< RealType, DeviceType, IndexType >::ColumnIndexesVector ColumnIndexesVector; - - template< typename _Real = Real, - typename _Device = Device, - typename _Index = Index > - using Self = BiEllpackSymmetric< _Real, _Device, _Index >; - - BiEllpackSymmetric(); - - void setDimensions( const IndexType rows, const IndexType columns ); - - void setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths ); - - IndexType getRowLength( const IndexType row ) const; - - template< typename Real2, - typename Device2, - typename Index2 > - bool setLike( const BiEllpackSymmetric< Real2, Device2, Index2, StripSize >& matrix ); - - void getRowLengths( Containers::Vector< IndexType, DeviceType, IndexType >& rowLengths ) const; - - bool setElement( const IndexType row, - const IndexType column, - const RealType& value ); - - __cuda_callable__ - bool setElementFast( const IndexType row, - const IndexType column, - const RealType& value ); - - bool addElement( const IndexType row, - const IndexType column, - const RealType& value, - const RealType& thisElementMultiplicator = 1.0 ); - - __cuda_callable__ - bool addElementFast( const IndexType row, - const IndexType column, - const RealType& value, - const RealType& thisElementMultiplicator = 1.0 ); - - bool setRow( const IndexType row, - const IndexType* columns, - const RealType* values, - const IndexType numberOfElements ); - - bool addRow( const IndexType row, - const IndexType* columns, - const RealType* values, - const IndexType numberOfElements, - const RealType& thisElementMultiplicator = 1.0 ); - - RealType getElement( const IndexType row, - const IndexType column ) const; - - __cuda_callable__ - RealType getElementFast( const IndexType row, - const IndexType column ) const; - - void getRow( const IndexType row, - IndexType* columns, - RealType* values ) const; - - __cuda_callable__ - IndexType getGroupLength( const IndexType strip, - const IndexType group ) const; - - template< typename InVector, - typename OutVector > - void vectorProduct( const InVector& inVector, - OutVector& outVector ) const; - - template< typename InVector, - typename OutVector > - void vectorProductHost( const InVector& inVector, - OutVector& outVector ) const; - - void setVirtualRows(const IndexType rows); - - __cuda_callable__ - IndexType getNumberOfGroups( const IndexType row ) const; - - bool vectorProductTest() const; - - void reset(); - - void save( File& file ) const; - - void load( File& file ); - - void save( const String& fileName ) const; - - void load( const String& fileName ); - - void print( std::ostream& str ) const; - - void performRowBubbleSort( Containers::Vector< Index, Device, Index >& tempRowLengths ); - void computeColumnSizes( Containers::Vector< Index, Device, Index >& tempRowLengths ); - -// void verifyRowLengths( const typename BiEllpackSymmetric< Real, Device, Index, StripSize >::RowLengthsVector& rowLengths ); - - template< typename InVector, - typename OutVector > -#ifdef HAVE_CUDA - __device__ -#endif - void spmvCuda( const InVector& inVector, - OutVector& outVector, - /*const IndexType warpStart, - const IndexType inWarpIdx*/ - int globalIdx ) const; - - __cuda_callable__ - IndexType getStripLength( const IndexType strip ) const; - - __cuda_callable__ - void performRowBubbleSortCudaKernel( const typename BiEllpackSymmetric< Real, Device, Index, StripSize >::CompressedRowLengthsVector& rowLengths, - const IndexType strip ); - - __cuda_callable__ - void computeColumnSizesCudaKernel( const typename BiEllpackSymmetric< Real, Device, Index, StripSize >::CompressedRowLengthsVector& rowLengths, - const IndexType numberOfStrips, - const IndexType strip ); - - __cuda_callable__ - IndexType power( const IndexType number, - const IndexType exponent ) const; - - typedef BiEllpackSymmetricDeviceDependentCode< DeviceType > DeviceDependentCode; - friend class BiEllpackSymmetricDeviceDependentCode< DeviceType >; - -private: - - IndexType warpSize; - - IndexType logWarpSize; - - IndexType virtualRows; - - Containers::Vector< Index, Device, Index > rowPermArray; - - Containers::Vector< Index, Device, Index > groupPointers; - -}; - -} //namespace Legacy -} // namespace Matrices -} // namespace TNL - -#include - diff --git a/src/TNL/Matrices/Legacy/BiEllpackSymmetric_impl.h b/src/TNL/Matrices/Legacy/BiEllpackSymmetric_impl.h deleted file mode 100644 index 61dde63343dfe178889ecea73b4bdc3bb7ccb3fe..0000000000000000000000000000000000000000 --- a/src/TNL/Matrices/Legacy/BiEllpackSymmetric_impl.h +++ /dev/null @@ -1,1637 +0,0 @@ -/*************************************************************************** - BiEllpackSymmetric.h - description - ------------------- - begin : Aug 30, 2018 - copyright : (C) 2018 by Tomas Oberhuber - email : tomas.oberhuber@fjfi.cvut.cz - ***************************************************************************/ - -/* See Copyright Notice in tnl/Copyright */ - -#pragma once - -#include -#include -#include -#include - -namespace TNL { -namespace Matrices { - namespace Legacy { - -template< typename Real, - typename Device, - typename Index, - int StripSize > - __cuda_callable__ -Index BiEllpackSymmetric< Real, Device, Index, StripSize >::power( const IndexType number, - const IndexType exponent ) const -{ - if( exponent >= 0 ) - { - IndexType result = 1; - for( IndexType i = 0; i < exponent; i++ ) - result *= number; - return result; - } - return 0; -} - -template< typename Real, - typename Device, - typename Index, - int StripSize > -BiEllpackSymmetric< Real, Device, Index, StripSize >::BiEllpackSymmetric() -: warpSize( 32 ), - logWarpSize( 5 ) -{} - -template< typename Real, - typename Device, - typename Index, - int StripSize > -String BiEllpackSymmetric< Real, Device, Index, StripSize >::getType() -{ - return String( "Matrices::BiEllpackMatrix< ") + - String( TNL::getType< Real >() ) + - String( ", " ) + - String( Device :: getDeviceType() ) + - String( ", " ) + - String( TNL::getType< Index >() ) + - String( " >" ); -} - -template< typename Real, - typename Device, - typename Index, - int StripSize > -String BiEllpackSymmetric< Real, Device, Index, StripSize >::getTypeVirtual() const -{ - return this->getType(); -} - -template< typename Real, - typename Device, - typename Index, - int StripSize > -void BiEllpackSymmetric< Real, Device, Index, StripSize >::setDimensions( const IndexType rows, - const IndexType columns ) -{ - TNL_ASSERT( rows >= 0 && columns >= 0, - std::cerr << "rows = " << rows - << "columns = " << columns << std::endl ); - - if( this->getRows() % this->warpSize != 0 ) - this->setVirtualRows( this->getRows() + this->warpSize - ( this->getRows() % this->warpSize ) ); - else - this->setVirtualRows( this->getRows() ); - IndexType strips = this->virtualRows / this->warpSize; - - Sparse< Real, Device, Index >::setDimensions( rows, columns ); - this->rowPermArray.setSize( this->rows ); - this->groupPointers.setSize( strips * ( this->logWarpSize + 1 ) + 1 ); - - for( IndexType row = 0; row < this->getRows(); row++ ) - this->rowPermArray.setElement(row, row); -} - -template< typename Real, - typename Device, - typename Index, - int StripSize > -void BiEllpackSymmetric< Real, Device, Index, StripSize >::setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths ) -{ - if( this->getRows() % this->warpSize != 0 ) - this->setVirtualRows( this->getRows() + this->warpSize - ( this->getRows() % this->warpSize ) ); - else - this->setVirtualRows( this->getRows() ); - IndexType strips = this->virtualRows / this->warpSize; - this->rowPermArray.setSize( this->rows ); - this->groupPointers.setSize( strips * ( this->logWarpSize + 1 ) + 1 ); - for( IndexType i = 0; i < this->groupPointers.getSize(); i++ ) - this->groupPointers.setElement( i, 0 ); - - // FIXME: cannot sort a const vector! - //DeviceDependentCode::performRowBubbleSort( *this, rowLengths ); - //DeviceDependentCode::computeColumnSizes( *this, rowLengths ); - - this->groupPointers.computeExclusivePrefixSum(); - - // uncomment to perform structure test - //DeviceDependentCode::verifyRowPerm( *this, rowLengths ); - //DeviceDependentCode::verifyRowLengths( *this, rowLengths ); - - this->allocateMatrixElements( this->warpSize * this->groupPointers.getElement( strips * ( this->logWarpSize + 1 ) ) ); -} - -template< typename Real, - typename Device, - typename Index, - int StripSize > -__cuda_callable__ -Index BiEllpackSymmetric< Real, Device, Index, StripSize >::getStripLength( const IndexType strip ) const -{ - TNL_ASSERT( strip >= 0, - std::cerr << "strip = " << strip - << " this->getName() = " << std::endl ); - - return this->groupPointers.getElement( ( strip + 1 ) * ( this->logWarpSize + 1 ) ) - - this->groupPointers.getElement( strip * ( this->logWarpSize + 1 ) ); -} - -template< typename Real, - typename Device, - typename Index, - int StripSize > -__cuda_callable__ -Index BiEllpackSymmetric< Real, Device, Index, StripSize >::getNumberOfGroups( const IndexType row ) const -{ - TNL_ASSERT( row >=0 && row < this->getRows(), - std::cerr << "row = " << row - << " this->getRows() = " << this->getRows() - << " this->getName() = " << std::endl ); - - IndexType strip = row / this->warpSize; - IndexType rowStripPermutation = this->rowPermArray[ row ] - this->warpSize * strip; - IndexType numberOfGroups = this->logWarpSize + 1; - IndexType bisection = 1; - for( IndexType i = 0; i < this->logWarpSize + 1; i++ ) - { - if( rowStripPermutation < bisection ) - return ( numberOfGroups - i ); - bisection *= 2; - } - // FIXME: non-void function always has to return something sensible -#ifndef __CUDA_ARCH__ - throw "bug - row was not found"; -#else - TNL_ASSERT_TRUE( false, "bug - row was not found" ); -#endif -} - -template< typename Real, - typename Device, - typename Index, - int StripSize > -Index BiEllpackSymmetric< Real, Device, Index, StripSize >::getRowLength( const IndexType row ) const -{ - TNL_ASSERT( row >= 0 && row < this->getRows(), - std::cerr << "row = " << row - << " this->getRows() = " << this->getRows() - << " this->getName() = " << std::endl ); - - const IndexType strip = row / this->warpSize; - const IndexType groupBegin = strip * ( this->logWarpSize + 1 ); - const IndexType rowStripPerm = this->rowPermArray.getElement( row ) - strip * this->warpSize; - IndexType elementPtr = this->groupPointers.getElement( groupBegin ) * this->warpSize + rowStripPerm; - IndexType rowMultiplicator = 1; - IndexType step = this->warpSize; - IndexType rowLength = 0; - - for( IndexType group = 0; group < this->getNumberOfGroups( row ); group++ ) - { - for( IndexType i = 0; i < rowMultiplicator * this->getGroupLength( strip, group ); i++ ) - { - if( this->values.getElement( elementPtr ) == 0.0 ) - return rowLength; - else - rowLength++; - elementPtr += step; - } - rowMultiplicator *= 2; - step /= 2; - } - return rowLength; -} - -template< typename Real, - typename Device, - typename Index, - int StripSize > - template< typename Real2, - typename Device2, - typename Index2 > -bool BiEllpackSymmetric< Real, Device, Index, StripSize >::setLike( const BiEllpackSymmetric< Real2, Device2, Index2, StripSize >& matrix ) -{ - std::cout << "setLike" << std::endl; - std::cout << "settingLike" << std::endl; - if( ! Sparse< Real, Device, Index >::setLike( matrix ) || - ! this->rowPermArray.setLike( matrix.rowPermArray ) || - ! this->groupPointers.setLike( matrix.groupPointers ) ) - return false; - return true; -} - -template< typename Real, - typename Device, - typename Index, - int StripSize > -void BiEllpackSymmetric< Real, Device, Index, StripSize >::getRowLengths( Containers::Vector< IndexType, DeviceType, IndexType >& rowLengths) const -{ - for( IndexType row = 0; row < this->getRows(); row++ ) - rowLengths.setElement( row, this->getRowLength( row ) ); -} - -template< typename Real, - typename Device, - typename Index, - int StripSize > -bool BiEllpackSymmetric< Real, Device, Index, StripSize >::setElement( const IndexType row, - const IndexType column, - const RealType& value ) -{ - TNL_ASSERT( ( row >= 0 && row < this->getRows() ) || - ( column >= 0 && column < this->getColumns() ), - std::cerr << "row = " << row - << " this->getRows() = " << this->getRows() - << " this->getColumns() = " << this->getColumns() - << " this->getName() = " << std::endl ); - - return this->addElement( row, column, value, 0.0 ); -} - -template< typename Real, - typename Device, - typename Index, - int StripSize > -__cuda_callable__ -bool BiEllpackSymmetric< Real, Device, Index, StripSize >::setElementFast( const IndexType row, - const IndexType column, - const RealType& value ) -{ - TNL_ASSERT( ( row >= 0 && row < this->getRows() ) || - ( column >= 0 && column < this->getColumns() ), - std::cerr << "row = " << row - << " this->getRows() = " << this->getRows() - << " this->getColumns() = " << this->getColumns() - << " this->getName() = " << this->getName() <addElementFast( row, column, value, 0.0 ); -} - -template< typename Real, - typename Device, - typename Index, - int StripSize > -bool BiEllpackSymmetric< Real, Device, Index, StripSize >::addElement( const IndexType row, - const IndexType column, - const RealType& value, - const RealType& thisElementMultiplicator ) -{ - const IndexType strip = row / this->warpSize; - const IndexType groupBegin = strip * ( this->logWarpSize + 1 ); - const IndexType rowStripPerm = this->rowPermArray.getElement( row ) - strip * this->warpSize; - IndexType elementPtr = this->groupPointers.getElement( groupBegin ) * this->warpSize + rowStripPerm; - IndexType rowMultiplicator = 1; - IndexType step = this->warpSize; - - for( IndexType group = 0; group < this->getNumberOfGroups( row ); group++ ) - { - for( IndexType i = 0; i < rowMultiplicator * this->getGroupLength( strip, group ); i++ ) - { - if( this->columnIndexes.getElement( elementPtr ) == this->getPaddingIndex() ) - { - this->columnIndexes.setElement( elementPtr, column ); - this->values.setElement( elementPtr, value ); - return true; - } - if( this->columnIndexes.getElement( elementPtr ) == column ) - { - this->values.setElement( elementPtr, this->values.getElement( elementPtr ) + value * thisElementMultiplicator ); - return true; - } - elementPtr += step; - } - step /= 2; - rowMultiplicator *= 2; - } - return false; -} - -template< typename Real, - typename Device, - typename Index, - int StripSize > -__cuda_callable__ -bool BiEllpackSymmetric< Real, Device, Index, StripSize >::addElementFast( const IndexType row, - const IndexType column, - const RealType& value, - const RealType& thisElementMultiplicator ) -{ - const IndexType strip = row / this->warpSize; - const IndexType groupBegin = strip * ( this->logWarpSize + 1 ); - const IndexType rowStripPerm = this->rowPermArray[ row ] - strip * this->warpSize; - IndexType elementPtr = this->groupPointers[ groupBegin ] * this->warpSize + rowStripPerm; - IndexType rowMultiplicator = 1; - IndexType step = this->warpSize; - - IndexType numberOfGroups = this->logWarpSize + 1; - IndexType bisection = 1; - for( IndexType i = 0; i < this->logWarpSize + 1; i++ ) - { - if( rowStripPerm < bisection ) - { - numberOfGroups -= i; - break; - } - bisection *= 2; - } - - for( IndexType group = 0; group < numberOfGroups; group++ ) - { - IndexType groupLength = this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group + 1 ] - - this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group ]; - - for( IndexType i = 0; i < rowMultiplicator * groupLength; i++ ) - { - if( this->columnIndexes[ elementPtr ] == this->getPaddingIndex() ) - { - this->columnIndexes[ elementPtr ] = column ; - this->values[ elementPtr ] = value; - return true; - } - if( this->columnIndexes[ elementPtr ] == column ) - { - this->values[ elementPtr ] += value * thisElementMultiplicator ; - return true; - } - elementPtr += step; - } - step /= 2; - rowMultiplicator *= 2; - } - return false; -} - -template< typename Real, - typename Device, - typename Index, - int StripSize > -bool BiEllpackSymmetric< Real, Device, Index, StripSize >::setRow( const IndexType row, - const IndexType* columns, - const RealType* values, - const IndexType numberOfElements ) -{ - TNL_ASSERT( row >= 0 && row < this->getRows(), - std::cerr << "row = " << row - << " this->getRows() = " << this->getRows() - << " this->getName() = " << std::endl ); - - const IndexType strip = row / this->warpSize; - const IndexType groupBegin = strip * ( this->logWarpSize + 1 ); - const IndexType rowStripPerm = this->rowPermArray.getElement( row ) - strip * this->warpSize; - IndexType elementPtr = this->groupPointers.getElement( groupBegin ) * this->warpSize + rowStripPerm; - IndexType thisElementPtr = 0; - IndexType rowMultiplicator = 1; - IndexType step = this->warpSize; - - for( IndexType group = 0; ( group < this->getNumberOfGroups( row ) ) && ( thisElementPtr < numberOfElements ); group++ ) - { - for( IndexType i = 0; ( i < rowMultiplicator * this->getGroupLength( strip, group ) ) && ( thisElementPtr < numberOfElements ); i++ ) - { - this->columnIndexes.setElement( elementPtr, columns[ thisElementPtr ] ); - this->values.setElement( elementPtr, values[ thisElementPtr ] ); - thisElementPtr++; - elementPtr += step; - } - step /= 2; - rowMultiplicator *= 2; - } - if( thisElementPtr == numberOfElements ) - return true; - return false; -} - -template< typename Real, - typename Device, - typename Index, - int StripSize > -bool BiEllpackSymmetric< Real, Device, Index, StripSize >::addRow( const IndexType row, - const IndexType* columns, - const RealType* values, - const IndexType numberOfElements, - const RealType& thisElementMultiplicator ) -{ - TNL_ASSERT( row >=0 && row < this->getRows(), - std::cerr << "row = " << row - << " this->getRows() = " << this->getRows() - << " this->getName() = " << std::endl ); - - const IndexType strip = row / this->warpSize; - const IndexType groupBegin = strip * ( this->logWarpSize + 1 ); - const IndexType rowStripPerm = this->rowPermArray.getElement( row ) - this->warpSize * strip; - IndexType elementPtr = this->groupPointers.getElement( groupBegin ) * this->warpSize + rowStripPerm; - IndexType rowMultiplicator = 1; - IndexType step = this->warpSize; - IndexType thisElementPtr = 0; - - while( thisElementPtr < numberOfElements ) - { - for( IndexType group = 0; group < this->getNumberOfGroups( row ); group++ ) - { - for( IndexType i = 0; ( i < rowMultiplicator * this->getGroupLength( strip, group ) ) && ( thisElementPtr < numberOfElements ); i++ ) - { - if( this->columnIndexes.getElement( elementPtr ) == columns[ thisElementPtr ] ) - { - RealType result = this->values.getElement( elementPtr ) + values[ thisElementPtr ] * thisElementMultiplicator; - this->values.setElement( elementPtr, result ); - thisElementPtr++; - } - elementPtr += step; - } - step /= 2; - rowMultiplicator *= 2; - } - } - return ( thisElementPtr == numberOfElements ); -} - -template< typename Real, - typename Device, - typename Index, - int StripSize > -Real BiEllpackSymmetric< Real, Device, Index, StripSize >::getElement( const IndexType row, - const IndexType column ) const -{ - TNL_ASSERT( ( row >= 0 && row < this->getRows() ) || - ( column >= 0 && column < this->getColumns() ), - std::cerr << "row = " << row - << " this->getRows() = " << this->getRows() - << " this->getColumns() = " << this->getColumns() - << "this->getName() = " << std::endl ); - - if( row > column ) - return this->getElement( column, row ); - - const IndexType strip = row / this->warpSize; - const IndexType groupBegin = strip * ( this->logWarpSize + 1 ); - const IndexType rowStripPerm = this->rowPermArray.getElement( row ) - strip * this->warpSize; - IndexType elementPtr = this->groupPointers.getElement( groupBegin ) * this->warpSize + rowStripPerm; - IndexType rowMultiplicator = 1; - IndexType step = this->warpSize; - - for( IndexType group = 0; group < this->getNumberOfGroups( row ); group++ ) - { - for( IndexType i = 0; i < rowMultiplicator * this->getGroupLength( strip, group ); i++ ) - { - if( this->columnIndexes.getElement( elementPtr ) == column ) - return this->values.getElement( elementPtr ); - elementPtr += step; - } - step /= 2; - rowMultiplicator *= 2; - } - return 0.0; -} - -template< typename Real, - typename Device, - typename Index, - int StripSize > -__cuda_callable__ -Real BiEllpackSymmetric< Real, Device, Index, StripSize >::getElementFast( const IndexType row, - const IndexType column ) const -{ - const IndexType strip = row / this->warpSize; - const IndexType groupBegin = strip * ( this->logWarpSize + 1 ); - const IndexType rowStripPerm = this->rowPermArray[ row ] - strip * this->warpSize; - IndexType elementPtr = this->groupPointers[ groupBegin ] * this->warpSize + rowStripPerm; - IndexType rowMultiplicator = 1; - IndexType step = this->warpSize; - - IndexType numberOfGroups = this->logWarpSize + 1; - IndexType bisection = 1; - for( IndexType i = 0; i < this->logWarpSize + 1; i++ ) - { - if( rowStripPerm < bisection ) - { - numberOfGroups -= i; - break; - } - bisection *= 2; - } - - for( IndexType group = 0; group < numberOfGroups; group++ ) - { - IndexType groupLength = this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group + 1 ] - - this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group ]; - - for( IndexType i = 0; i < rowMultiplicator * groupLength; i++ ) - { - if( this->columnIndexes[ elementPtr ] == column ) - return this->values[ elementPtr ]; - elementPtr += step; - } - step /= 2; - rowMultiplicator *= 2; - } - return false; -} - -template< typename Real, - typename Device, - typename Index, - int StripSize > -void BiEllpackSymmetric< Real, Device, Index, StripSize >::getRow( const IndexType row, - IndexType* columns, - RealType* values ) const -{ - TNL_ASSERT( row >=0 && row < this->getRows(), - std::cerr << "row = " << row - << " this->getRows() = " << this->getRows() - << " this->getName() = " << this->getName() <warpSize; - const IndexType groupBegin = strip * ( this->logWarpSize + 1 ); - const IndexType rowStripPerm = this->rowPermArray.getElement( row ) - this->warpSize * strip; - IndexType elementPtr = this->groupPointers.getElement( groupBegin ) * this->warpSize + rowStripPerm; - IndexType rowMultiplicator = 1; - IndexType step = this->warpSize; - IndexType thisElementPtr = 0; - - for( IndexType group = 0; group < this->getNumberOfGroups( row ) && !padding; group++ ) - { - for( IndexType i = 0; ( i < rowMultiplicator * this->getGroupLength( strip, group ) ) && !padding; i++ ) - { - if( this->columnIndexes.getElement( elementPtr ) == this->getPaddingIndex() ) - { - padding = true; - break; - } - values[ thisElementPtr ] = this->values.getElement( elementPtr ); - columns[ thisElementPtr ] = this->columnIndexes.getElement( elementPtr ); - thisElementPtr++; - elementPtr += step; - } - step /= 2; - rowMultiplicator *= 2; - } -} - -template< typename Real, - typename Device, - typename Index, - int StripSize > -void BiEllpackSymmetric< Real, Device, Index, StripSize >::setVirtualRows(const IndexType rows) -{ - this->virtualRows = rows; -} - -template< typename Real, - typename Device, - typename Index, - int StripSize > -__cuda_callable__ -Index BiEllpackSymmetric< Real, Device, Index, StripSize >::getGroupLength( const Index strip, - const Index group ) const -{ - return this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group + 1 ] - - this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group ]; -} - -template< typename Real, - typename Device, - typename Index, - int StripSize > -template< typename InVector, - typename OutVector > -void BiEllpackSymmetric< Real, Device, Index, StripSize >::vectorProduct( const InVector& inVector, - OutVector& outVector ) const -{ - DeviceDependentCode::vectorProduct( *this, inVector, outVector ); -} - -template< typename Real, - typename Device, - typename Index, - int StripSize > -template< typename InVector, - typename OutVector > -void BiEllpackSymmetric< Real, Device, Index, StripSize >::vectorProductHost( const InVector& inVector, - OutVector& outVector ) const -{ - const IndexType cudaBlockSize = 256; - const IndexType cudaBlocks = roundUpDivision( this->getRows(), cudaBlockSize ); - for( IndexType blockIdx = 0; blockIdx < cudaBlocks; blockIdx++ ) - { - Containers::Vector< Real, Device, Index > tempStripOutVector; - tempStripOutVector.setSize( cudaBlockSize ); - for( IndexType i = 0; i < tempStripOutVector.getSize(); i++ ) - tempStripOutVector.setElement( i, 0 ); - - for( IndexType threadIdx = 0; threadIdx < cudaBlockSize; threadIdx++ ) - { - IndexType globalIdx = cudaBlockSize * blockIdx + threadIdx; - IndexType warpStart = this->warpSize * ( globalIdx / this->warpSize ); - IndexType inWarpIdx = globalIdx % this->warpSize; - if( warpStart >= this->getRows() ) - break; - IndexType strip = warpStart / this->warpSize; - const IndexType groupBegin = strip * ( this->logWarpSize + 1 ); - - IndexType row = warpStart + inWarpIdx; - IndexType currentRow = row; - IndexType elementPtr = this->groupPointers.getElement( groupBegin ) * this->warpSize + ( row - warpStart ); - IndexType bisection = this->warpSize; - for( IndexType group = 0; group < this->logWarpSize + 1; group++ ) - { - if( !( currentRow - warpStart < bisection ) ) - currentRow -= bisection; - IndexType groupLength = this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group + 1 ] - - this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group ]; - for( IndexType i = 0; i < groupLength; i++ ) - { - if( this->columnIndexes.getElement( elementPtr ) == this->getPaddingIndex() ) - { - elementPtr += this->warpSize; - continue; - } - RealType result = tempStripOutVector.getElement( currentRow % cudaBlockSize ); - result += inVector[ this->columnIndexes.getElement( elementPtr ) ] * this->values.getElement( elementPtr ); - outVector[ this->columnIndexes[ elementPtr ] ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ]; - tempStripOutVector.setElement( currentRow % cudaBlockSize, result ); - elementPtr += this->warpSize; - } - bisection /= 2; - } - } - IndexType end = cudaBlockSize * ( blockIdx + 1 ); - if( end > this->getRows() ) - end = this->getRows(); - for( IndexType i = cudaBlockSize * blockIdx; i < end; i++ ) - outVector[ i ] = tempStripOutVector.getElement( this->rowPermArray.getElement( i ) % cudaBlockSize ); - tempStripOutVector.reset(); - } -} - -template< typename Real, - typename Device, - typename Index, - int StripSize > -void BiEllpackSymmetric< Real, Device, Index, StripSize >::reset() -{ - Sparse< Real, Device, Index >::reset(); - this->rowPermArray.reset(); - this->groupPointers.reset(); -} - -template< typename Real, - typename Device, - typename Index, - int StripSize > -void BiEllpackSymmetric< Real, Device, Index, StripSize >::save( File& file ) const -{ - Sparse< Real, Device, Index >::save( file ); - file << this->groupPointers << this->rowPermArray; -} - -template< typename Real, - typename Device, - typename Index, - int StripSize > -void BiEllpackSymmetric< Real, Device, Index, StripSize >::load( File& file ) -{ - Sparse< Real, Device, Index >::load( file ); - file >> this->groupPointers >> this->rowPermArray; -} - -template< typename Real, - typename Device, - typename Index, - int StripSize > -void BiEllpackSymmetric< Real, Device, Index, StripSize >::save( const String& fileName ) const -{ - Object::save( fileName ); -} - -template< typename Real, - typename Device, - typename Index, - int StripSize > -void BiEllpackSymmetric< Real, Device, Index, StripSize >::load( const String& fileName ) -{ - Object::load( fileName ); -} - -template< typename Real, - typename Device, - typename Index, - int StripSize > -void BiEllpackSymmetric< Real, Device, Index, StripSize >::print( std::ostream& str ) const -{ - for( IndexType row = 0; row < this->getRows(); row++ ) - { - str <<"Row: " << row << " -> "; - bool padding = false; - const IndexType strip = row / this->warpSize; - const IndexType groupBegin = strip * ( this->logWarpSize + 1 ); - const IndexType rowStripPerm = this->rowPermArray.getElement( row ) - this->warpSize * strip; - IndexType elementPtr = this->groupPointers.getElement( groupBegin ) * this->warpSize + rowStripPerm; - IndexType rowMultiplicator = 1; - IndexType step = this->warpSize; - - for( IndexType group = 0; group < this->getNumberOfGroups( row ) && !padding; group++ ) - { - for( IndexType i = 0; ( i < rowMultiplicator * this->getGroupLength( strip, group ) ) && !padding; i++ ) - { - if( this->columnIndexes.getElement( elementPtr ) == this->getPaddingIndex() ) - { - padding = true; - break; - } - RealType value = this->values.getElement( elementPtr ); - IndexType column = this->columnIndexes.getElement( elementPtr ); - str << " Col:" << column << "->" << value << "\t"; - elementPtr += step; - } - step /= 2; - rowMultiplicator *= 2; - } - str < -void BiEllpackSymmetric< Real, Device, Index, StripSize >::performRowBubbleSort( Containers::Vector< Index, Device, Index >& tempRowLengths ) -{ - Index strips = this->virtualRows / this->warpSize; - for( Index i = 0; i < strips; i++ ) - { - Index begin = i * this->warpSize; - Index end = ( i + 1 ) * this->warpSize - 1; - if( this->getRows() - 1 < end) - end = this->getRows() - 1; - bool sorted = false; - Index permIndex1, permIndex2, offset = 0; - while( !sorted ) - { - sorted = true; - for( Index j = begin + offset; j < end - offset; j++ ) - if( tempRowLengths.getElement( j ) < tempRowLengths.getElement( j + 1 ) ) - { - for( Index k = begin; k < end + 1; k++ ) - { - if( this->rowPermArray.getElement( k ) == j ) - permIndex1 = k; - if( this->rowPermArray.getElement( k ) == j + 1 ) - permIndex2 = k; - } - Index temp = tempRowLengths.getElement( j ); - tempRowLengths.setElement( j, tempRowLengths.getElement( j + 1 ) ); - tempRowLengths.setElement( j + 1, temp ); - temp = this->rowPermArray.getElement( permIndex1 ); - this->rowPermArray.setElement( permIndex1, this->rowPermArray.getElement( permIndex2 ) ); - this->rowPermArray.setElement( permIndex2, temp ); - sorted = false; - } - for( Index j = end - 1 - offset; j > begin + offset; j-- ) - if( tempRowLengths.getElement( j ) > tempRowLengths.getElement( j - 1 ) ) - { - for( Index k = begin; k < end + 1; k++ ) - { - if( this->rowPermArray.getElement( k ) == j ) - permIndex1 = k; - if( this->rowPermArray.getElement( k ) == j - 1 ) - permIndex2 = k; - } - Index temp = tempRowLengths.getElement( j ); - tempRowLengths.setElement( j, tempRowLengths.getElement( j - 1 ) ); - tempRowLengths.setElement( j - 1, temp ); - temp = this->rowPermArray.getElement( permIndex1 ); - this->rowPermArray.setElement( permIndex1, this->rowPermArray.getElement( permIndex2 ) ); - this->rowPermArray.setElement( permIndex2, temp ); - sorted = false; - } - offset++; - } - } -} - -template< typename Real, - typename Device, - typename Index, - int StripSize > -void BiEllpackSymmetric< Real, Device, Index, StripSize >::computeColumnSizes( Containers::Vector< Index, Device, Index >& tempRowLengths ) -{ - Index numberOfStrips = this->virtualRows / this->warpSize; - for( Index strip = 0; strip < numberOfStrips; strip++ ) - { - Index i = 0; - Index rowBegin = strip * this->warpSize; - Index groupBegin = strip * ( this->logWarpSize + 1 ); - Index emptyGroups = 0; - if( strip == numberOfStrips - 1 ) - { - Index lastRows = this->getRows() - rowBegin; - while( !( lastRows > this->power( 2, this->logWarpSize - 1 - emptyGroups ) ) ) - emptyGroups++; - for( Index group = groupBegin; group < groupBegin + emptyGroups; group++ ) - this->groupPointers.setElement( group, 0 ); - } - i += emptyGroups; - for( Index group = groupBegin + emptyGroups; group < groupBegin + this->logWarpSize; group++ ) - { - Index row = this->power( 2, 4 - i ); - Index temp = tempRowLengths.getElement( row + rowBegin ); - for( Index prevGroups = groupBegin; prevGroups < group; prevGroups++ ) - temp -= this->power( 2, prevGroups - groupBegin ) * this->groupPointers.getElement( prevGroups ); - temp = ceil( ( float ) temp / this->power( 2, i ) ); - this->groupPointers.setElement( group, temp ); - i++; - } - Index temp = tempRowLengths.getElement( rowBegin ); - for( Index prevGroups = groupBegin; prevGroups < groupBegin + this->logWarpSize; prevGroups++ ) - temp -= this->power( 2, prevGroups - groupBegin ) * this->groupPointers.getElement( prevGroups ); - temp = ceil( ( float ) temp / this->power( 2, this->logWarpSize ) ); - this->groupPointers.setElement( groupBegin + this->logWarpSize, temp ); - } -} - -template<> -class BiEllpackSymmetricDeviceDependentCode< Devices::Host > -{ -public: - - typedef Devices::Host Device; - - template< typename Real, - typename Index, - int StripSize > - static void verifyRowLengths( const BiEllpackSymmetric< Real, Device, Index, StripSize >& matrix, - const typename BiEllpackSymmetric< Real, Device, Index, StripSize >::RowLengthsVector& rowLengths ) - { - bool ok = true; - for( Index row = 0; row < matrix.getRows(); row++ ) - { - const Index strip = row / matrix.warpSize; - const Index stripLength = matrix.getStripLength( strip ); - const Index groupBegin = ( matrix.logWarpSize + 1 ) * strip; - const Index rowStripPerm = matrix.rowPermArray.getElement( row ) - strip * matrix.warpSize; - const Index begin = matrix.groupPointers.getElement( groupBegin ) * matrix.warpSize + rowStripPerm * stripLength; - Index elementPtr = begin; - Index rowLength = 0; - for( Index group = 0; group < matrix.getNumberOfGroups( row ); group++ ) - { - for( Index i = 0; i < matrix.getGroupLength( strip, group ); i++ ) - { - Index biElementPtr = elementPtr; - for( Index j = 0; j < matrix.power( 2, group ); j++ ) - { - rowLength++; - biElementPtr += matrix.power( 2, matrix.logWarpSize - group ) * stripLength; - } - elementPtr++; - } - } - if( rowLengths.getElement( row ) > rowLength ) - ok = false; - } - if( ok ) - std::cout << "row lengths OK" < - static void verifyRowPerm( const BiEllpackSymmetric< Real, Device, Index, StripSize >& matrix, - const typename BiEllpackSymmetric< Real, Device, Index, StripSize >::RowLengthsVector& rowLengths ) - { - bool ok = true; - Index numberOfStrips = matrix.virtualRows / matrix.warpSize; - for( Index strip = 0; strip < numberOfStrips; strip++ ) - { - Index begin = strip * matrix.warpSize; - Index end = ( strip + 1 ) * matrix.warpSize; - if( matrix.getRows() < end ) - end = matrix.getRows(); - for( Index i = begin; i < end - 1; i++ ) - { - Index permIndex1, permIndex2; - bool first = false; - bool second = false; - for( Index j = begin; j < end; j++ ) - { - if( matrix.rowPermArray.getElement( j ) == i ) - { - permIndex1 = j; - first = true; - } - if( matrix.rowPermArray.getElement( j ) == i + 1 ) - { - permIndex2 = j; - second = true; - } - } - if( !first || !second ) - std::cout << "Wrong permutation!" <= rowLengths.getElement( permIndex2 ) ) - continue; - else - ok = false; - } - } - if( ok ) - std::cout << "Permutation OK" < - static void vectorProduct( const BiEllpackSymmetric< Real, Device, Index, StripSize >& matrix, - const InVector& inVector, - OutVector& outVector ) - { - matrix.vectorProductHost( inVector, outVector ); - } - - template< typename Real, - typename Index, - int StripSize > - static void computeColumnSizes( BiEllpackSymmetric< Real, Device, Index, StripSize >& matrix, - const typename BiEllpackSymmetric< Real, Device, Index, StripSize >::RowLengthsVector& rowLengths ) - { - Index numberOfStrips = matrix.virtualRows / matrix.warpSize; - for( Index strip = 0; strip < numberOfStrips; strip++ ) - { - Index i = 0; - Index rowBegin = strip * matrix.warpSize; - Index groupBegin = strip * ( matrix.logWarpSize + 1 ); - Index emptyGroups = 0; - if( strip == numberOfStrips - 1 ) - { - Index lastRows = matrix.getRows() - rowBegin; - while( !( lastRows > matrix.power( 2, matrix.logWarpSize - 1 - emptyGroups ) ) ) - emptyGroups++; - for( Index group = groupBegin; group < groupBegin + emptyGroups; group++ ) - matrix.groupPointers.setElement( group, 0 ); - } - i += emptyGroups; - for( Index group = groupBegin + emptyGroups; group < groupBegin + matrix.logWarpSize; group++ ) - { - Index row = matrix.power( 2, 4 - i ); - Index permRow = 0; - while( matrix.rowPermArray.getElement( permRow + rowBegin ) != row + rowBegin ) - permRow++; - Index temp = rowLengths.getElement( permRow + rowBegin ); - for( Index prevGroups = groupBegin; prevGroups < group; prevGroups++ ) - temp -= matrix.power( 2, prevGroups - groupBegin ) * matrix.groupPointers.getElement( prevGroups ); - temp = ceil( ( float ) temp / matrix.power( 2, i ) ); - matrix.groupPointers.setElement( group, temp ); - i++; - } - Index permRow = rowBegin; - while( matrix.rowPermArray.getElement( permRow ) != rowBegin ) - permRow++; - Index temp = rowLengths.getElement( permRow ); - for( Index prevGroups = groupBegin; prevGroups < groupBegin + matrix.logWarpSize; prevGroups++ ) - temp -= matrix.power( 2, prevGroups - groupBegin ) * matrix.groupPointers.getElement( prevGroups ); - temp = ceil( ( float ) temp / matrix.power( 2, matrix.logWarpSize ) ); - matrix.groupPointers.setElement( groupBegin + matrix.logWarpSize, temp ); - } - } - - template< typename Real, - typename Index, - int StripSize > - static void performRowBubbleSort( BiEllpackSymmetric< Real, Device, Index, StripSize >& matrix, - const typename BiEllpackSymmetric< Real, Device, Index, StripSize >::RowLengthsVector& rowLengths - /*Containers::Vector< Index, Device, Index >& tempRowLengths*/ ) - { - Index strips = matrix.virtualRows / matrix.warpSize; - for( Index i = 0; i < strips; i++ ) - { - Index begin = i * matrix.warpSize; - Index end = ( i + 1 ) * matrix.warpSize - 1; - if(matrix.getRows() - 1 < end) - end = matrix.getRows() - 1; - bool sorted = false; - Index permIndex1, permIndex2, offset = 0; - while( !sorted ) - { - sorted = true; - for( Index j = begin + offset; j < end - offset; j++ ) - { - for( Index k = begin; k < end + 1; k++ ) - { - if( matrix.rowPermArray.getElement( k ) == j ) - permIndex1 = k; - if( matrix.rowPermArray.getElement( k ) == j + 1 ) - permIndex2 = k; - } - if( rowLengths.getElement( permIndex1 ) < rowLengths.getElement( permIndex2 ) ) - { - Index temp = matrix.rowPermArray.getElement( permIndex1 ); - matrix.rowPermArray.setElement( permIndex1, matrix.rowPermArray.getElement( permIndex2 ) ); - matrix.rowPermArray.setElement( permIndex2, temp ); - sorted = false; - } - } - for( Index j = end - 1 - offset; j > begin + offset; j-- ) - { - for( Index k = begin; k < end + 1; k++ ) - { - if( matrix.rowPermArray.getElement( k ) == j ) - permIndex1 = k; - if( matrix.rowPermArray.getElement( k ) == j - 1 ) - permIndex2 = k; - } - if( rowLengths.getElement( permIndex2 ) < rowLengths.getElement( permIndex1 ) ) - { - Index temp = matrix.rowPermArray.getElement( permIndex1 ); - matrix.rowPermArray.setElement( permIndex1, matrix.rowPermArray.getElement( permIndex2 ) ); - matrix.rowPermArray.setElement( permIndex2, temp ); - sorted = false; - } - } - offset++; - } - } - } -}; - -#ifdef HAVE_CUDA -template< typename Real, - typename Device, - typename Index, - int StripSize > -template< typename InVector, - typename OutVector > -__device__ -void BiEllpackSymmetric< Real, Device, Index, StripSize >::spmvCuda( const InVector& inVector, - OutVector& outVector, - int globalIdx ) const -{ - const IndexType strip = globalIdx >> this->logWarpSize; - const IndexType warpStart = strip << this->logWarpSize; - const IndexType inWarpIdx = globalIdx & ( this->warpSize - 1 ); - - if( warpStart >= this->getRows() ) - return; - - const IndexType cudaBlockSize = 256; - IndexType bisection = this->warpSize; - IndexType groupBegin = strip * ( this->logWarpSize + 1 ); - - Real* temp = Cuda::getSharedMemory< Real >(); - __shared__ Real results[ cudaBlockSize ]; - results[ threadIdx.x ] = 0.0; - IndexType elementPtr = ( this->groupPointers[ groupBegin ] << this->logWarpSize ) + inWarpIdx; - - for( IndexType group = 0; group < this->logWarpSize + 1; group++ ) - { - temp[ threadIdx.x ] = 0.0; - IndexType groupLength = this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group + 1 ] - - this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group ]; - - if( groupLength > 0 ) - { - for( IndexType i = 0; i < groupLength; i++ ) - { - if( this->columnIndexes[ elementPtr ] < this->getColumns() ) - temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ]; - outVector.add( this->columnIndexes[ elementPtr ], inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ] ); - elementPtr += this->warpSize; - } - IndexType bisection2 = this->warpSize; - for( IndexType i = 0; i < group; i++ ) - { - bisection2 >>= 1; - if( inWarpIdx < bisection2 ) - temp[ threadIdx.x ] += temp[ threadIdx.x + bisection2 ]; - } - if( inWarpIdx < bisection ) - results[ threadIdx.x ] += temp[ threadIdx.x ]; - } - bisection >>= 1; - } - __syncthreads(); - if( warpStart + inWarpIdx >= this->getRows() ) - return; - outVector[ warpStart + inWarpIdx ] = results[ this->rowPermArray[ warpStart + inWarpIdx ] & ( cudaBlockSize - 1 ) ]; -} -#endif - -/*#ifdef HAVE_CUDA -template< typename Real, - typename Device, - typename Index, - int StripSize > -template< typename InVector, - typename OutVector > -__device__ -void BiEllpackSymmetric< Real, Device, Index, StripSize >::spmvCuda( const InVector& inVector, - OutVector& outVector, - int globalIdx ) const -{ - // Loop unrolling test - const IndexType strip = globalIdx >> this->logWarpSize; - const IndexType warpStart = strip << this->logWarpSize; - const IndexType inWarpIdx = globalIdx & ( this->warpSize - 1 ); - - if( warpStart >= this->getRows() ) - return; - - const IndexType cudaBlockSize = 256; - - volatile Real* temp = getSharedMemory< Real >(); - __shared__ Real results[ cudaBlockSize ]; - results[ threadIdx.x ] = 0.0; - IndexType elementPtr = ( this->groupPointers[ strip * ( this->logWarpSize + 1 ) ] << this->logWarpSize ) + inWarpIdx; - - //Loop Unroll #1 - IndexType group = 0; - IndexType groupLength = this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group + 1 ] - - this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group ]; - - if( groupLength > 0 ) - { - for( IndexType i = 0; i < groupLength; i++ ) - { - if( this->columnIndexes[ elementPtr ] < this->getColumns() ) - results[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ]; - elementPtr += this->warpSize; - } - } - - group++; - temp[ threadIdx.x ] = 0.0; - groupLength = this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group + 1 ] - - this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group ]; - - if( groupLength > 0 ) - { - for( IndexType i = 0; i < groupLength; i++ ) - { - if( this->columnIndexes[ elementPtr ] < this->getColumns() ) - temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ]; - elementPtr += this->warpSize; - } - //Loop Unroll #2 - if( inWarpIdx < 16 ) - temp[ threadIdx.x ] += temp[ threadIdx.x + 16 ]; - if( inWarpIdx < 16 ) - results[ threadIdx.x ] += temp[ threadIdx.x ]; - } - - - //group == 2; - group++; - temp[ threadIdx.x ] = 0.0; - groupLength = this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group + 1 ] - - this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group ]; - if( groupLength > 0 ) - { - for( IndexType i = 0; i < groupLength; i++ ) - { - if( this->columnIndexes[ elementPtr ] < this->getColumns() ) - temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ]; - elementPtr += this->warpSize; - } - //Loop Unroll #3 - if( inWarpIdx < 16 ) - temp[ threadIdx.x ] += temp[ threadIdx.x + 16 ]; - if( inWarpIdx < 8 ) - temp[ threadIdx.x ] += temp[ threadIdx.x + 8 ]; - if( inWarpIdx < 8 ) - results[ threadIdx.x ] += temp[ threadIdx.x ]; - } - - //group == 3; - group++; - temp[ threadIdx.x ] = 0.0; - groupLength = this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group + 1 ] - - this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group ]; - if( groupLength > 0 ) - { - for( IndexType i = 0; i < groupLength; i++ ) - { - if( this->columnIndexes[ elementPtr ] < this->getColumns() ) - temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ]; - elementPtr += this->warpSize; - } - //Loop Unroll #4 - if( inWarpIdx < 16 ) - temp[ threadIdx.x ] += temp[ threadIdx.x + 16 ]; - if( inWarpIdx < 8 ) - temp[ threadIdx.x ] += temp[ threadIdx.x + 8 ]; - if( inWarpIdx < 4 ) - temp[ threadIdx.x ] += temp[ threadIdx.x + 4 ]; - if( inWarpIdx < 4 ) - results[ threadIdx.x ] += temp[ threadIdx.x ]; - } - - //group == 4; - group++; - temp[ threadIdx.x ] = 0.0; - groupLength = this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group + 1 ] - - this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group ]; - if( groupLength > 0 ) - { - for( IndexType i = 0; i < groupLength; i++ ) - { - if( this->columnIndexes[ elementPtr ] < this->getColumns() ) - temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ]; - elementPtr += this->warpSize; - } - //Loop Unroll #5 - if( inWarpIdx < 16 ) - temp[ threadIdx.x ] += temp[ threadIdx.x + 16 ]; - if( inWarpIdx < 8 ) - temp[ threadIdx.x ] += temp[ threadIdx.x + 8 ]; - if( inWarpIdx < 4 ) - temp[ threadIdx.x ] += temp[ threadIdx.x + 4 ]; - if( inWarpIdx < 2 ) - temp[ threadIdx.x ] += temp[ threadIdx.x + 2 ]; - if( inWarpIdx < 2 ) - results[ threadIdx.x ] += temp[ threadIdx.x ]; - } - - //group == 5 - group++; - temp[ threadIdx.x ] = 0.0; - groupLength = this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group + 1 ] - - this->groupPointers[ strip * ( this->logWarpSize + 1 ) + group ]; - if( groupLength > 0 ) - { - for( IndexType i = 0; i < groupLength; i++ ) - { - if( this->columnIndexes[ elementPtr ] < this->getColumns() ) - temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ]; - elementPtr += this->warpSize; - } - //Loop Unroll #6 - if( inWarpIdx < 16 ) - temp[ threadIdx.x ] += temp[ threadIdx.x + 16 ]; - if( inWarpIdx < 8 ) - temp[ threadIdx.x ] += temp[ threadIdx.x + 8 ]; - if( inWarpIdx < 4 ) - temp[ threadIdx.x ] += temp[ threadIdx.x + 4 ]; - if( inWarpIdx < 2 ) - temp[ threadIdx.x ] += temp[ threadIdx.x + 2 ]; - if( inWarpIdx < 1 ) - temp[ threadIdx.x ] += temp[ threadIdx.x + 1 ]; - if( inWarpIdx < 1 ) - results[ threadIdx.x ] += temp[ threadIdx.x ]; - } - - if( warpStart + inWarpIdx >= this->getRows() ) - return; - outVector[ warpStart + inWarpIdx ] = results[ this->rowPermArray[ warpStart + inWarpIdx ] & ( cudaBlockSize - 1 ) ]; -} -#endif*/ - -#ifdef HAVE_CUDA -template< typename Real, - typename Index, - int StripSize, - typename InVector, - typename OutVector > -__global__ -void BiEllpackSymmetricVectorProductCuda( const BiEllpackSymmetric< Real, Devices::Cuda, Index, StripSize >* matrix, - const InVector* inVector, - OutVector* outVector, - int gridIdx, - const int warpSize ) -{ - Index globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; - matrix->spmvCuda( *inVector, *outVector, globalIdx ); -} -#endif - -#ifdef HAVE_CUDA -template< typename Real, - typename Device, - typename Index, - int StripSize > -__device__ -void BiEllpackSymmetric< Real, Device, Index, StripSize >::performRowBubbleSortCudaKernel( const typename BiEllpackSymmetric< Real, Device, Index, StripSize >::CompressedRowLengthsVector& rowLengths, - const IndexType strip ) -{ - IndexType begin = strip * this->warpSize; - IndexType end = ( strip + 1 ) * this->warpSize - 1; - if( this->getRows() - 1 < end ) - end = this->getRows() - 1; - bool sorted = false; - IndexType permIndex1, permIndex2, offset = 0; - while( !sorted ) - { - sorted = true; - for( IndexType j = begin + offset; j < end - offset; j++ ) - { - for( IndexType k = begin; k < end + 1; k++) - { - if( this->rowPermArray[ k ] == j ) - permIndex1 = k; - if( this->rowPermArray[ k ] == j + 1 ) - permIndex2 = k; - } - if( rowLengths[ permIndex1 ] < rowLengths[ permIndex2 ] ) - { - IndexType temp = this->rowPermArray[ permIndex1 ]; - this->rowPermArray[ permIndex1 ] = this->rowPermArray[ permIndex2 ]; - this->rowPermArray[ permIndex2 ] = temp; - sorted = false; - } - } - for( IndexType j = end - 1 - offset; j > begin + offset; j-- ) - { - for( IndexType k = begin; k < end + 1; k++ ) - { - if( this->rowPermArray[ k ] == j ) - permIndex1 = k; - if( this->rowPermArray[ k ] == j - 1) - permIndex2 = k; - } - if( rowLengths[ permIndex2 ] < rowLengths[ permIndex1 ] ) - { - IndexType temp = this->rowPermArray[ permIndex1 ]; - this->rowPermArray[ permIndex1 ] = this->rowPermArray[ permIndex2 ]; - this->rowPermArray[ permIndex2 ] = temp; - sorted = false; - } - } - offset++; - } -} -#endif - -#ifdef HAVE_CUDA -template< typename Real, - typename Device, - typename Index, - int StripSize > -__device__ -void BiEllpackSymmetric< Real, Device, Index, StripSize >::computeColumnSizesCudaKernel( const typename BiEllpackSymmetric< Real, Device, Index, StripSize >::CompressedRowLengthsVector& rowLengths, - const IndexType numberOfStrips, - const IndexType strip ) -{ - if( strip >= numberOfStrips ) - return; - IndexType i = 0; - IndexType rowBegin = strip * this->warpSize; - IndexType groupBegin = strip * ( this->logWarpSize + 1 ); - IndexType emptyGroups = 0; - if( strip == numberOfStrips - 1 ) - { - IndexType lastRows = this->getRows() - rowBegin; - while( !( lastRows > this->power( 2, this->logWarpSize - 1 - emptyGroups ) ) ) - emptyGroups++; - for( IndexType group = groupBegin; group < groupBegin + emptyGroups; group++ ) - this->groupPointers[ group ] = 0; - } - i += emptyGroups; - for( IndexType group = groupBegin + emptyGroups; group < groupBegin + this->logWarpSize; group++ ) - { - IndexType row = this->power( 2, 4 - i ); - IndexType permRow = 0; - while( this->rowPermArray[ permRow + rowBegin ] != row + rowBegin && permRow < this->warpSize ) - permRow++; - IndexType temp = rowLengths[ permRow + rowBegin ]; - for( IndexType prevGroups = groupBegin; prevGroups < group; prevGroups++ ) - temp -= this->power( 2, prevGroups - groupBegin ) * this->groupPointers[ prevGroups ]; - temp = ceil( ( float ) temp / this->power( 2, i ) ); - this->groupPointers[ group ] = temp; - i++; - } - IndexType permRow = rowBegin; - while( this->rowPermArray[ permRow ] != rowBegin && permRow < this->warpSize + rowBegin ) - permRow++; - IndexType temp = rowLengths[ permRow ]; - for( IndexType prevGroups = groupBegin; prevGroups < groupBegin + this->logWarpSize; prevGroups++ ) - temp -= this->power( 2, prevGroups - groupBegin ) * this->groupPointers[ prevGroups ]; - temp = ceil( ( float ) temp / this->power( 2, this->logWarpSize ) ); - this->groupPointers[ groupBegin + this->logWarpSize ] = temp; -} -#endif - -#ifdef HAVE_CUDA -template< typename Real, - typename Index, - int StripSize > -__global__ -void performRowBubbleSortCuda( BiEllpackSymmetric< Real, Devices::Cuda, Index, StripSize >* matrix, - const typename BiEllpackSymmetric< Real, Devices::Cuda, Index, StripSize >::RowLengthsVector* rowLengths, - int gridIdx ) -{ - const Index stripIdx = gridIdx * Cuda::getMaxGridSize() * blockDim.x + blockIdx.x * blockDim.x + threadIdx.x; - matrix->performRowBubbleSortCudaKernel( *rowLengths, stripIdx ); -} -#endif - -#ifdef HAVE_CUDA -template< typename Real, - typename Index, - int StripSize > -__global__ -void computeColumnSizesCuda( BiEllpackSymmetric< Real, Devices::Cuda, Index, StripSize >* matrix, - const typename BiEllpackSymmetric< Real, Devices::Cuda, Index, StripSize >::RowLengthsVector* rowLengths, - const Index numberOfStrips, - int gridIdx ) -{ - const Index stripIdx = gridIdx * Cuda::getMaxGridSize() * blockDim.x + blockIdx.x * blockDim.x + threadIdx.x; - matrix->computeColumnSizesCudaKernel( *rowLengths, numberOfStrips, stripIdx ); -} -#endif - -template<> -class BiEllpackSymmetricDeviceDependentCode< Devices::Cuda > -{ -public: - - typedef Devices::Cuda Device; - - template< typename Real, - typename Index, - int StripSize > - static void verifyRowLengths( const BiEllpackSymmetric< Real, Device, Index, StripSize >& matrix, - const typename BiEllpackSymmetric< Real, Device, Index, StripSize >::RowLengthsVector& rowLengths ) - { - bool ok = true; - std::cout << "inside method" < rowLength ) - ok = false; - } - if( ok ) - std::cout << "row lengths OK" < - static void verifyRowPerm( const BiEllpackSymmetric< Real, Device, Index, StripSize >& matrix, - const typename BiEllpackSymmetric< Real, Device, Index, StripSize >::RowLengthsVector& rowLengths ) - { - bool ok = true; - Index numberOfStrips = matrix.virtualRows / matrix.warpSize; - for( Index strip = 0; strip < numberOfStrips; strip++ ) - { - Index begin = strip * matrix.warpSize; - Index end = ( strip + 1 ) * matrix.warpSize; - if( matrix.getRows() < end ) - end = matrix.getRows(); - for( Index i = begin; i < end - 1; i++ ) - { - Index permIndex1, permIndex2; - bool first = false; - bool second = false; - for( Index j = begin; j < end; j++ ) - { - if( matrix.rowPermArray.getElement( j ) == i ) - { - permIndex1 = j; - first = true; - } - if( matrix.rowPermArray.getElement( j ) == i + 1 ) - { - permIndex2 = j; - second = true; - } - } - if( !first || !second ) - std::cout << "nenasel jsem spravne indexy" <= rowLengths.getElement( permIndex2 ) ) - continue; - else - ok = false; - } - } - if( ok ) - std::cout << "perm OK" < - static void performRowBubbleSort( BiEllpackSymmetric< Real, Device, Index, StripSize >& matrix, - const typename BiEllpackSymmetric< Real, Device, Index, StripSize >::RowLengthsVector& rowLengths ) - { -#ifdef HAVE_CUDA - Index numberOfStrips = matrix.virtualRows / StripSize; - typedef BiEllpackSymmetric< Real, Devices::Cuda, Index, StripSize > Matrix; - typedef typename Matrix::RowLengthsVector CompressedRowLengthsVector; - Matrix* kernel_this = Cuda::passToDevice( matrix ); - CompressedRowLengthsVector* kernel_rowLengths = Cuda::passToDevice( rowLengths ); - dim3 cudaBlockSize( 256 ), cudaGridSize( Cuda::getMaxGridSize() ); - const Index cudaBlocks = roundUpDivision( numberOfStrips, cudaBlockSize.x ); - const Index cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() ); - for( int gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) - { - if( gridIdx == cudaGrids - 1 ) - cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize(); - performRowBubbleSortCuda< Real, Index, StripSize > - <<< cudaGridSize, cudaBlockSize >>> - ( kernel_this, - kernel_rowLengths, - gridIdx ); - } - Cuda::freeFromDevice( kernel_this ); - Cuda::freeFromDevice( kernel_rowLengths ); - TNL_CHECK_CUDA_DEVICE; -#endif - } - - template< typename Real, - typename Index, - int StripSize > - static void computeColumnSizes( BiEllpackSymmetric< Real, Device, Index, StripSize >& matrix, - const typename BiEllpackSymmetric< Real, Device, Index, StripSize >::RowLengthsVector& rowLengths ) - { -#ifdef HAVE_CUDA - const Index numberOfStrips = matrix.virtualRows / StripSize; - typedef BiEllpackSymmetric< Real, Devices::Cuda, Index, StripSize > Matrix; - typedef typename Matrix::RowLengthsVector CompressedRowLengthsVector; - Matrix* kernel_this = Cuda::passToDevice( matrix ); - CompressedRowLengthsVector* kernel_rowLengths = Cuda::passToDevice( rowLengths ); - dim3 cudaBlockSize( 256 ), cudaGridSize( Cuda::getMaxGridSize() ); - const Index cudaBlocks = roundUpDivision( numberOfStrips, cudaBlockSize.x ); - const Index cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() ); - for( int gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) - { - if( gridIdx == cudaGrids - 1 ) - cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize(); - computeColumnSizesCuda< Real, Index, StripSize > - <<< cudaGridSize, cudaBlockSize >>> - ( kernel_this, - kernel_rowLengths, - numberOfStrips, - gridIdx ); - } - Cuda::freeFromDevice( kernel_this ); - Cuda::freeFromDevice( kernel_rowLengths ); - TNL_CHECK_CUDA_DEVICE; -#endif - } - - - template< typename Real, - typename Index, - int StripSize, - typename InVector, - typename OutVector > - static void vectorProduct( const BiEllpackSymmetric< Real, Device, Index, StripSize >& matrix, - const InVector& inVector, - OutVector& outVector ) - { -#ifdef HAVE_CUDA - typedef BiEllpackSymmetric< Real, Devices::Cuda, Index > Matrix; - typedef typename Matrix::IndexType IndexType; - Matrix* kernel_this = Cuda::passToDevice( matrix ); - InVector* kernel_inVector = Cuda::passToDevice( inVector ); - OutVector* kernel_outVector = Cuda::passToDevice( outVector ); - dim3 cudaBlockSize( 256 ), cudaGridSize( Cuda::getMaxGridSize() ); - const IndexType cudaBlocks = roundUpDivision( matrix.getRows(), cudaBlockSize.x ); - const IndexType cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() ); - for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) - { - if( gridIdx == cudaGrids - 1 ) - cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize(); - const int sharedMemory = cudaBlockSize.x * sizeof( Real ); - BiEllpackSymmetricVectorProductCuda< Real, Index, StripSize, InVector, OutVector > - <<< cudaGridSize, cudaBlockSize, sharedMemory >>> - ( kernel_this, - kernel_inVector, - kernel_outVector, - gridIdx, - matrix.warpSize ); - } - Cuda::freeFromDevice( kernel_this ); - Cuda::freeFromDevice( kernel_inVector ); - Cuda::freeFromDevice( kernel_outVector ); - TNL_CHECK_CUDA_DEVICE; -#endif - } - -}; - -} //namespace Legacy -} // namespace Matrices -} // namespace TNL diff --git a/src/TNL/Matrices/Legacy/CSR.h b/src/TNL/Matrices/Legacy/CSR.h index 46e616d165d8cb891d3c1e307388f478e50801a7..d7a9092cfc3c63fea6a8d5f6867da572db160acd 100644 --- a/src/TNL/Matrices/Legacy/CSR.h +++ b/src/TNL/Matrices/Legacy/CSR.h @@ -10,7 +10,7 @@ #pragma once -#include +#include #include #include @@ -20,6 +20,43 @@ namespace TNL { namespace Matrices { namespace Legacy { +enum class Type { + /* LONG = 0!!! Non zero value rewrites index[1] */ + LONG = 0, + STREAM = 1, + VECTOR = 2 +}; + +template +union Block { + Block(Index row, Type type = Type::VECTOR, Index index = 0) noexcept { + this->index[0] = row; + this->index[1] = index; + this->byte[sizeof(Index) == 4 ? 7 : 15] = (uint8_t)type; + } + + Block(Index row, Type type, Index nextRow, Index maxID, Index minID) noexcept { + this->index[0] = row; + this->index[1] = 0; + this->twobytes[sizeof(Index) == 4 ? 2 : 4] = maxID - minID; + + if (type == Type::STREAM) + this->twobytes[sizeof(Index) == 4 ? 3 : 5] = nextRow - row; + + if (type == Type::STREAM) + this->byte[sizeof(Index) == 4 ? 7 : 15] |= 0b1000000; + else if (type == Type::VECTOR) + this->byte[sizeof(Index) == 4 ? 7 : 15] |= 0b10000000; + } + + Block() = default; + + Index index[2]; // index[0] is row pointer, index[1] is index in warp + uint8_t byte[sizeof(Index) == 4 ? 8 : 16]; // byte[7/15] is type specificator + uint16_t twobytes[sizeof(Index) == 4 ? 4 : 8]; //twobytes[2/4] is maxID - minID + //twobytes[3/5] is nextRow - row +}; + #ifdef HAVE_UMFPACK template< typename Matrix, typename Preconditioner > class UmfpackWrapper; @@ -31,7 +68,9 @@ class CusparseCSR; template< typename Device > class CSRDeviceDependentCode; -enum CSRKernel { CSRScalar, CSRVector, CSRHybrid, CSRLight, CSRAdaptive, CSRStream }; +enum CSRKernel { CSRScalar, CSRVector, CSRHybrid, // Hybrid is not implemented + CSRLight, CSRLight2, CSRLight3, CSRLight4, CSRLight5, CSRLight6, + CSRAdaptive, CSRMultiVector, CSRLightWithoutAtomic }; template< typename Real, typename Device = Devices::Host, typename Index = int, CSRKernel KernelType = CSRScalar > class CSR : public Sparse< Real, Device, Index > @@ -65,6 +104,34 @@ public: constexpr CSRKernel getSpMVKernelType() { return KernelType; }; //enum SPMVCudaKernel { scalar, vector, hybrid }; + + Containers::Vector< Block, Device, Index > blocks; + + /* Configuration of CSR SpMV kernels ----------------------------------------- */ + + /* Block sizes */ + static constexpr Index THREADS_ADAPTIVE = sizeof(Index) == 8 ? 128 : 256; + static constexpr Index THREADS_SCALAR = 128; + static constexpr Index THREADS_VECTOR = 128; + static constexpr Index THREADS_LIGHT = 128; + + /* Max length of row to process one warp */ + static constexpr Index MAX_ELEMENTS_PER_WARP = 1024; + + /* How many shared memory use per block in CSR Adaptive kernel */ + static constexpr Index SHARED_PER_BLOCK = 24576; + + /* Number of elements in shared memory */ + static constexpr Index SHARED = SHARED_PER_BLOCK/sizeof(Real); + + /* Number of warps in block for CSR Adaptive */ + static constexpr Index WARPS = THREADS_ADAPTIVE / 32; + + /* Number of elements in shared memory per one warp */ + static constexpr Index SHARED_PER_WARP = SHARED / WARPS; + /* -------------------------------------------------------------------------- */ + + using Sparse< Real, Device, Index >::getAllocatedElementsCount; CSR(); @@ -217,42 +284,8 @@ public: __cuda_callable__ IndexType getHybridModeSplit() const; -#ifdef HAVE_CUDA - - template< typename InVector, - typename OutVector, - int warpSize > - __device__ - void spmvCudaVectorized( const InVector& inVector, - OutVector& outVector, - const IndexType gridIdx ) const; - - template< typename InVector, - typename OutVector, - int warpSize > - __device__ - void vectorProductCuda( const InVector& inVector, - OutVector& outVector, - int gridIdx, int *blocks, size_t size ) const; - - template< typename InVector, - typename OutVector, - int warpSize > - __device__ - void spmvCudaLightSpmv( const InVector& inVector, - OutVector& outVector, - int gridIdx) const; - - template< typename InVector, - typename OutVector, - int warpSize > - __device__ - void spmvCSRAdaptive( const InVector& inVector, - OutVector& outVector, - int gridIdx, - int *blocks, - size_t blocks_size) const; -#endif + /* Analyze rowPointers, columnIndecies and values to create block for CSR Adaptive */ + void setBlocks(); // The following getters allow us to interface TNL with external C-like // libraries such as UMFPACK or SuperLU, which need the raw data. diff --git a/src/TNL/Matrices/Legacy/CSR_impl.h b/src/TNL/Matrices/Legacy/CSR_impl.h index 6990d4072b5f8e62d0452bbc5785000cd84207da..e03e4db6d67ecda5fea8b9e366ffbc4a7b5507fe 100644 --- a/src/TNL/Matrices/Legacy/CSR_impl.h +++ b/src/TNL/Matrices/Legacy/CSR_impl.h @@ -13,13 +13,18 @@ #include #include #include +#include #include -#include +#include +#include // for blocks in CSR Adaptive #ifdef HAVE_CUSPARSE +#include #include #endif +constexpr size_t MAX_X_DIM = 2147483647; + namespace TNL { namespace Matrices { namespace Legacy { @@ -104,6 +109,83 @@ void CSR< Real, Device, Index, KernelType >::setCompressedRowLengths( ConstCompr this->values.setSize( this->rowPointers.getElement( this->rows ) ); this->columnIndexes.setSize( this->rowPointers.getElement( this->rows ) ); this->columnIndexes.setValue( this->columns ); + + if (KernelType == CSRAdaptive && this->blocks.empty()) + this->setBlocks(); +} + +/* Find limit of block */ +template< typename Real, + typename Index, + typename Device, + CSRKernel KernelType> +Index findLimit(const Index start, + const CSR< Real, Device, Index, KernelType >& matrix, + const Index size, + Type &type, + Index &sum) { + sum = 0; + for (Index current = start; current < size - 1; ++current) { + Index elements = matrix.getRowPointers().getElement(current + 1) - + matrix.getRowPointers().getElement(current); + sum += elements; + if (sum > matrix.SHARED_PER_WARP) { + if (current - start > 0) { // extra row + type = Type::STREAM; + return current; + } else { // one long row + if (sum <= 2 * matrix.MAX_ELEMENTS_PER_WARP) + type = Type::VECTOR; + else + type = Type::LONG; + return current + 1; + } + } + } + + type = Type::STREAM; + return size - 1; // return last row pointer +} + +template< typename Real, + typename Device, + typename Index, + CSRKernel KernelType > +void CSR< Real, Device, Index, KernelType >::setBlocks() +{ + const Index rows = this->getRowPointers().getSize(); + Index sum, start = 0, nextStart = 0; + + /* Fill blocks */ + std::vector> inBlock; + inBlock.reserve(rows); // reserve space to avoid reallocation + + while (nextStart != rows - 1) { + Type type; + nextStart = findLimit( + start, *this, rows, type, sum + ); + if (type == Type::LONG) { + Index parts = roundUpDivision(sum, this->SHARED_PER_WARP); + for (Index index = 0; index < parts; ++index) { + inBlock.emplace_back(start, Type::LONG, index); + } + } else { + inBlock.emplace_back(start, type, + nextStart, + this->rowPointers.getElement(nextStart), + this->rowPointers.getElement(start) + ); + } + + start = nextStart; + } + inBlock.emplace_back(nextStart); + + /* Copy values */ + this->blocks.setSize(inBlock.size()); + for (size_t i = 0; i < inBlock.size(); ++i) + this->blocks.setElement(i, inBlock[i]); } template< typename Real, @@ -583,6 +665,7 @@ CSR< Real, Device, Index, KernelType >::operator=( const CSR& matrix ) this->values = matrix.values; this->columnIndexes = matrix.columnIndexes; this->rowPointers = matrix.rowPointers; + this->blocks = matrix.blocks; return *this; } @@ -599,6 +682,7 @@ CSR< Real, Device, Index, KernelType >::operator=( const CSR< Real2, Device2, In this->values = matrix.values; this->columnIndexes = matrix.columnIndexes; this->rowPointers = matrix.rowPointers; + this->blocks = matrix.blocks; return *this; } @@ -718,294 +802,974 @@ Index CSR< Real, Device, Index, KernelType >::getHybridModeSplit() const #ifdef HAVE_CUDA template< typename Real, - typename Device, typename Index, - CSRKernel KernelType > - template< typename InVector, - typename OutVector, - int warpSize > -__device__ -void CSR< Real, Device, Index, KernelType >::spmvCudaLightSpmv( const InVector& inVector, - OutVector& outVector, - int gridIdx) const -{ - const IndexType index = blockIdx.x * blockDim.x + threadIdx.x; - const IndexType elemPerGroup = 4; - const IndexType laneID = index % 32; - const IndexType groupID = laneID / elemPerGroup; - const IndexType inGroupID = laneID % elemPerGroup; - - IndexType row, minID, column, maxID, idxMtx; - __shared__ unsigned rowCnt; + int warpSize, + int WARPS, + int SHARED_PER_WARP, + int MAX_ELEM_PER_WARP > +__global__ +void SpMVCSRAdaptive( const Real *inVector, + Real *outVector, + const Index* rowPointers, + const Index* columnIndexes, + const Real* values, + const Block *blocks, + Index blocksSize, + Index gridID) { + __shared__ Real shared[WARPS][SHARED_PER_WARP]; + const Index index = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x; + const Index blockIdx = index / warpSize; + if (blockIdx >= blocksSize) + return; - if (index == 0) rowCnt = 0; // Init shared variable - __syncthreads(); + Real result = 0.0; + const Index laneID = threadIdx.x & 31; // & is cheaper than % + Block block = blocks[blockIdx]; + const Index minID = rowPointers[block.index[0]/* minRow */]; + Index i, to, maxID; + if (block.byte[sizeof(Index) == 4 ? 7 : 15] & 0b1000000) { + /////////////////////////////////////* CSR STREAM *////////////// + const Index warpID = threadIdx.x / 32; + maxID = minID + /* maxID - minID */block.twobytes[sizeof(Index) == 4 ? 2 : 4]; - while (true) { + /* Stream data to shared memory */ + for (i = laneID + minID; i < maxID; i += warpSize) + shared[warpID][i - minID] = values[i] * inVector[columnIndexes[i]]; - /* Get row number */ - if (inGroupID == 0) row = atomicAdd(&rowCnt, 1); + const Index maxRow = block.index[0]/* minRow */ + + /* maxRow - minRow */(block.twobytes[sizeof(Index) == 4 ? 3 : 5] & 0x3FFF); + /* Calculate result */ + for (i = block.index[0]/* minRow */ + laneID; i < maxRow; i += warpSize) { + to = rowPointers[i + 1] - minID; // end of preprocessed data + result = 0; + /* Scalar reduction */ + for (Index sharedID = rowPointers[i] - minID; sharedID < to; ++sharedID) + result += shared[warpID][sharedID]; + + outVector[i] = result; // Write result + } + } else if (block.byte[sizeof(Index) == 4 ? 7 : 15] & 0b10000000) { + /////////////////////////////////////* CSR VECTOR *////////////// + maxID = minID + /* maxID - minID */block.twobytes[sizeof(Index) == 4 ? 2 : 4]; - /* Propagate row number in group */ - row = __shfl_sync((unsigned)(warpSize - 1), row, groupID * elemPerGroup); + for (i = minID + laneID; i < maxID; i += warpSize) + result += values[i] * inVector[columnIndexes[i]]; - if (row >= this->rowPointers.getSize() - 1) - return; + /* Parallel reduction */ + result += __shfl_down_sync(0xFFFFFFFF, result, 16); + result += __shfl_down_sync(0xFFFFFFFF, result, 8); + result += __shfl_down_sync(0xFFFFFFFF, result, 4); + result += __shfl_down_sync(0xFFFFFFFF, result, 2); + result += __shfl_down_sync(0xFFFFFFFF, result, 1); + if (laneID == 0) outVector[block.index[0]/* minRow */] = result; // Write result + } else { + /////////////////////////////////////* CSR VECTOR L *///////////// + /* Number of elements processed by previous warps */ + const Index offset = block.index[1]/* warpInRow */ * MAX_ELEM_PER_WARP; + to = minID + (block.index[1]/* warpInRow */ + 1) * MAX_ELEM_PER_WARP; + maxID = rowPointers[block.index[0]/* minRow */ + 1]; + if (to > maxID) to = maxID; + for (i = minID + offset + laneID; i < to; i += warpSize) + result += values[i] * inVector[columnIndexes[i]]; - minID = this->rowPointers[row]; - maxID = this->rowPointers[row + 1]; + /* Parallel reduction */ + result += __shfl_down_sync(0xFFFFFFFF, result, 16); + result += __shfl_down_sync(0xFFFFFFFF, result, 8); + result += __shfl_down_sync(0xFFFFFFFF, result, 4); + result += __shfl_down_sync(0xFFFFFFFF, result, 2); + result += __shfl_down_sync(0xFFFFFFFF, result, 1); + if (laneID == 0) atomicAdd(&outVector[block.index[0]/* minRow */], result); + } +} - Real result = 0.0; +template< typename Real, + typename Index> +__global__ +void SpMVCSRScalar( const Real *inVector, + Real* outVector, + const Index* rowPointers, + const Index* columnIndexes, + const Real* values, + const Index rows, + const Index gridID) { + const Index row = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x; + if (row >= rows) + return; - idxMtx = minID + inGroupID; - while (idxMtx < maxID) { - column = this->columnIndexes[idxMtx]; - if (column >= this->getColumns()) - break; + Real result = 0.0; + const Index endID = rowPointers[row + 1]; - result += this->values[idxMtx] * inVector[column]; - idxMtx += elemPerGroup; - } + for (Index i = rowPointers[row]; i < endID; ++i) + result += values[i] * inVector[columnIndexes[i]]; - /* Parallel reduction */ - for (int i = elemPerGroup/2; i > 0; i /= 2) - result += __shfl_down_sync((unsigned)(warpSize - 1), result, i); - /* Write result */ - if (inGroupID == 0) { - outVector[row] = result; - } - } + outVector[row] = result; } -/* template< typename Real, - typename Device, +template< typename Real, typename Index, - typename InVector, int warpSize > __global__ -void spmvCSRVectorHelper( const InVector& inVector, - Real *out, - size_t from, - size_t to, - size_t perWarp) +void SpMVCSRMultiVector( const Real *inVector, + Real* outVector, + const Index* rowPointers, + const Index* columnIndexes, + const Real* values, + const Index rows, + const Index warps, // warps per row + const Index gridID) { - const size_t index = blockIdx.x * blockDim.x + threadIdx.x; - const size_t warpID = index / warpSize; - const size_t laneID = index % warpSize; - const size_t minID = from + warpID * perWarp; - size_t maxID = from + (warpID + 1) * perWarp; - if (minID >= to) return; - if (maxID >= to ) maxID = to; - + const Index warpID = + ((gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x) / warpSize; + const Index rowID = warpID / warps; + if (rowID >= rows) + return; + + const Index laneID = threadIdx.x & 31; // & is cheaper than % + const Index offset = warps * warpSize; + Real result = 0.0; - for (IndexType i = minID + laneID; i < maxID; i += warpSize) { - const IndexType column = this->columnIndexes[i]; - if (column >= this->getColumns()) - continue; - result += this->values[i] * inVector[column]; + Index endID = rowPointers[rowID + 1]; + /* Calculate result */ + for (Index i = rowPointers[rowID] + (warpID % warps) * warpSize + laneID; + i < endID; i += offset) { + result += values[i] * inVector[columnIndexes[i]]; } - atomicAdd(out, result); -} */ + + /* Reduction */ + result += __shfl_down_sync(0xFFFFFFFF, result, 16); + result += __shfl_down_sync(0xFFFFFFFF, result, 8); + result += __shfl_down_sync(0xFFFFFFFF, result, 4); + result += __shfl_down_sync(0xFFFFFFFF, result, 2); + result += __shfl_down_sync(0xFFFFFFFF, result, 1); + /* Write result */ + if (laneID == 0) atomicAdd(&outVector[rowID], result); +} template< typename Real, - typename Device, typename Index, - CSRKernel KernelType > - template< typename InVector, - typename OutVector, - int warpSize > -__device__ -void CSR< Real, Device, Index, KernelType >::spmvCSRAdaptive( const InVector& inVector, - OutVector& outVector, - int gridIdx, - int *blocks, - size_t blocks_size) const + int warpSize > +__global__ +void SpMVCSRVector( const Real *inVector, + Real* outVector, + const Index* rowPointers, + const Index* columnIndexes, + const Real* values, + const Index rows, + const Index gridID) { - /* Configuration ---------------------------------------------------*/ - constexpr size_t SHARED = 49152/sizeof(float); - constexpr size_t SHARED_PER_WARP = SHARED / warpSize; - constexpr size_t MAX_PER_WARP = 65536; - //constexpr size_t ELEMENTS_PER_WARP = 1024; - //constexpr size_t THREADS_PER_BLOCK = 1024; - //constexpr size_t WARPS_PER_BLOCK = THREADS_PER_BLOCK / warpSize; - //-------------------------------------------------------------------- - const IndexType index = blockIdx.x * blockDim.x + threadIdx.x; - const IndexType laneID = index % warpSize; - IndexType blockIdx = index / warpSize; - __shared__ float shared_res[SHARED]; - Real result = 0.0; - if (blockIdx >= blocks_size - 1) + const Index warpID = ((gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x) / warpSize; + if (warpID >= rows) return; - const IndexType minRow = blocks[blockIdx]; - const IndexType maxRow = blocks[blockIdx + 1]; - const IndexType minID = this->rowPointers[minRow]; - const IndexType maxID = this->rowPointers[maxRow]; - const IndexType elements = maxID - minID; - /* rows per block more than 1 */ - if ((maxRow - minRow) > 1) { - /////////////////////////////////////* CSR STREAM *////////////// - /* Copy and calculate elements from global to shared memory, coalesced */ - const IndexType offset = threadIdx.x / warpSize * SHARED_PER_WARP; - for (IndexType i = laneID; i < elements; i += warpSize) { - const IndexType elementIdx = i + minID; - const IndexType column = this->columnIndexes[elementIdx]; - if (column >= this->getColumns()) - continue; - shared_res[i + offset] = this->values[elementIdx] * inVector[column]; - } - const IndexType row = minRow + laneID; - if (row >= maxRow) - return; - /* Calculate result */ - const IndexType to = this->rowPointers[row + 1] - minID; - for (IndexType i = this->rowPointers[row] - minID; i < to; ++i) { - result += shared_res[i + offset]; + Real result = 0.0; + const Index laneID = threadIdx.x & 31; // & is cheaper than % + Index endID = rowPointers[warpID + 1]; + + /* Calculate result */ + for (Index i = rowPointers[warpID] + laneID; i < endID; i += warpSize) + result += values[i] * inVector[columnIndexes[i]]; + + /* Reduction */ + result += __shfl_down_sync(0xFFFFFFFF, result, 16); + result += __shfl_down_sync(0xFFFFFFFF, result, 8); + result += __shfl_down_sync(0xFFFFFFFF, result, 4); + result += __shfl_down_sync(0xFFFFFFFF, result, 2); + result += __shfl_down_sync(0xFFFFFFFF, result, 1); + /* Write result */ + if (laneID == 0) outVector[warpID] = result; +} + +template< typename Real, + typename Index, + int groupSize, + int MAX_NUM_VECTORS_PER_BLOCK > +__global__ +void SpMVCSRLight( const Real *inVector, + Real* outVector, + const Index* rowPointers, + const Index* columnIndexes, + const Real* values, + const Index rows, + unsigned *rowCnt) { + Real sum; + Index row, i, rowStart, rowEnd; + const Index laneId = threadIdx.x % groupSize; /*lane index in the vector*/ + const Index vectorId = threadIdx.x / groupSize; /*vector index in the thread block*/ + const Index warpLaneId = threadIdx.x & 31; /*lane index in the warp*/ + const Index warpVectorId = warpLaneId / groupSize; /*vector index in the warp*/ + + __shared__ volatile Index space[MAX_NUM_VECTORS_PER_BLOCK][2]; + + /*get the row index*/ + if (warpLaneId == 0) { + row = atomicAdd(rowCnt, 32 / groupSize); + } + /*broadcast the value to other threads in the same warp and compute the row index of each vector*/ + row = __shfl_sync(0xFFFFFFFF, row, 0) + warpVectorId; + + /*check the row range*/ + while (row < rows) { + + /*use two threads to fetch the row offset*/ + if (laneId < 2) + space[vectorId][laneId] = rowPointers[row + laneId]; + + rowStart = space[vectorId][0]; + rowEnd = space[vectorId][1]; + + /*there are non-zero elements in the current row*/ + sum = 0; + /*compute dot product*/ + if (groupSize == 32) { + + /*ensure aligned memory access*/ + i = rowStart - (rowStart & (groupSize - 1)) + laneId; + + /*process the unaligned part*/ + if (i >= rowStart && i < rowEnd) + sum += values[i] * inVector[columnIndexes[i]]; + + /*process the aligned part*/ + for (i += groupSize; i < rowEnd; i += groupSize) + sum += values[i] * inVector[columnIndexes[i]]; + } else { + /*regardless of the global memory access alignment*/ + for (i = rowStart + laneId; i < rowEnd; i += groupSize) + sum += values[i] * inVector[columnIndexes[i]]; } - outVector[row] = result; // Write result - } else if (elements <= MAX_PER_WARP) { - /////////////////////////////////////* CSR VECTOR *////////////// - for (IndexType i = minID + laneID; i < maxID; i += warpSize) { - IndexType column = this->columnIndexes[i]; - if (column >= this->getColumns()) - break; + /*intra-vector reduction*/ + for (i = groupSize >> 1; i > 0; i >>= 1) + sum += __shfl_down_sync(0xFFFFFFFF, sum, i); + + /*save the results and get a new row*/ + if (laneId == 0) + outVector[row] = sum; + + /*get a new row index*/ + if(warpLaneId == 0) + row = atomicAdd(rowCnt, 32 / groupSize); + + /*broadcast the row index to the other threads in the same warp and compute the row index of each vetor*/ + row = __shfl_sync(0xFFFFFFFF, row, 0) + warpVectorId; + + }/*while*/ +} - result += this->values[i] * inVector[column]; +/* Original CSR Light without shared memory */ +template< typename Real, + typename Index, + int groupSize > +__global__ +void SpMVCSRLight2( const Real *inVector, + Real* outVector, + const Index* rowPointers, + const Index* columnIndexes, + const Real* values, + const Index rows, + unsigned *rowCnt) { + Real sum; + Index i, rowStart, rowEnd, row; + const Index laneId = threadIdx.x % groupSize; /*lane index in the vector*/ + const Index warpLaneId = threadIdx.x & 31; /*lane index in the warp*/ + const Index warpVectorId = warpLaneId / groupSize; /*vector index in the warp*/ + + /*get the row index*/ + if (warpLaneId == 0) + row = atomicAdd(rowCnt, 32 / groupSize); + + /*broadcast the value to other threads in the same warp and compute the row index of each vector*/ + row = __shfl_sync(0xFFFFFFFF, row, 0) + warpVectorId; + + /*check the row range*/ + while (row < rows) { + + rowStart = rowPointers[row]; + rowEnd = rowPointers[row + 1]; + + /*there are non-zero elements in the current row*/ + sum = 0; + /*compute dot product*/ + if (groupSize == 32) { + + /*ensure aligned memory access*/ + i = rowStart - (rowStart & (groupSize - 1)) + laneId; + + /*process the unaligned part*/ + if (i >= rowStart && i < rowEnd) + sum += values[i] * inVector[columnIndexes[i]]; + + /*process the aligned part*/ + for (i += groupSize; i < rowEnd; i += groupSize) + sum += values[i] * inVector[columnIndexes[i]]; + } else { + /*regardless of the global memory access alignment*/ + for (i = rowStart + laneId; i < rowEnd; i += groupSize) + sum += values[i] * inVector[columnIndexes[i]]; } - /* Reduction */ - result += __shfl_down_sync((unsigned)(warpSize - 1), result, 16); - result += __shfl_down_sync((unsigned)(warpSize - 1), result, 8); - result += __shfl_down_sync((unsigned)(warpSize - 1), result, 4); - result += __shfl_down_sync((unsigned)(warpSize - 1), result, 2); - result += __shfl_down_sync((unsigned)(warpSize - 1), result, 1); - if (laneID == 0) outVector[minRow] = result; // Write result - } else { - /////////////////////////////////////* CSR VECTOR LONG *////////////// - //const size_t warps = (elements - ELEMENTS_PER_WARP) / ELEMENTS_PER_WARP + 1; - //const size_t blocks = warps <= WARPS_PER_BLOCK ? 1 : warps / WARPS_PER_BLOCK + 1; - //const size_t threads_per_block = blocks == 1 ? warps * warpSize : WARPS_PER_BLOCK * warpSize; - // spmvCSRVectorHelper <<>>( - // inVector, - // &outVector[minRow], - // (size_t)(minID + ELEMENTS_PER_WARP), - // (size_t)maxID, - // (size_t)ELEMENTS_PER_WARP - // ); - } + /*intra-vector reduction*/ + for (i = groupSize >> 1; i > 0; i >>= 1) + sum += __shfl_down_sync(0xFFFFFFFF, sum, i); + + /*save the results and get a new row*/ + if (laneId == 0) + outVector[row] = sum; + + /*get a new row index*/ + if(warpLaneId == 0) + row = atomicAdd(rowCnt, 32 / groupSize); + + /*broadcast the row index to the other threads in the same warp and compute the row index of each vetor*/ + row = __shfl_sync(0xFFFFFFFF, row, 0) + warpVectorId; + + }/*while*/ } +/* Original CSR Light without shared memory and allign memory access */ +template< typename Real, + typename Index, + int groupSize > +__global__ +void SpMVCSRLight3( const Real *inVector, + Real* outVector, + const Index* rowPointers, + const Index* columnIndexes, + const Real* values, + const Index rows, + unsigned *rowCnt) { + Real sum; + Index i, rowEnd, row; + const Index laneId = threadIdx.x % groupSize; /*lane index in the vector*/ + const Index warpLaneId = threadIdx.x & 31; /*lane index in the warp*/ + const Index warpVectorId = warpLaneId / groupSize; /*vector index in the warp*/ + + /*get the row index*/ + if (warpLaneId == 0) + row = atomicAdd(rowCnt, 32 / groupSize); + + /*broadcast the value to other threads in the same warp and compute the row index of each vector*/ + row = __shfl_sync(0xFFFFFFFF, row, 0) + warpVectorId; + + /*check the row range*/ + while (row < rows) { + sum = 0; + + /*compute dot product*/ + rowEnd = rowPointers[row + 1]; + for (i = rowPointers[row] + laneId; i < rowEnd; i += groupSize) + sum += values[i] * inVector[columnIndexes[i]]; + /*intra-vector reduction*/ + for (i = groupSize >> 1; i > 0; i >>= 1) + sum += __shfl_down_sync(0xFFFFFFFF, sum, i); + + /*save the results and get a new row*/ + if (laneId == 0) + outVector[row] = sum; + + /*get a new row index*/ + if(warpLaneId == 0) + row = atomicAdd(rowCnt, 32 / groupSize); + + /*broadcast the row index to the other threads in the same warp and compute the row index of each vetor*/ + row = __shfl_sync(0xFFFFFFFF, row, 0) + warpVectorId; + + }/*while*/ +} + +/* Original CSR Light without shared memory, allign memory access and atomic instructions */ template< typename Real, - typename Device, typename Index, - CSRKernel KernelType > - template< typename InVector, - typename OutVector, - int warpSize > -__device__ -void CSR< Real, Device, Index, KernelType >::spmvCudaVectorized( const InVector& inVector, - OutVector& outVector, - const IndexType gridIdx ) const -{ - IndexType globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; - const IndexType warpStart = warpSize * ( globalIdx / warpSize ); - const IndexType warpEnd = min( warpStart + warpSize, this->getRows() ); - const IndexType inWarpIdx = globalIdx % warpSize; + int groupSize > +__global__ +void SpMVCSRLight4( const Real *inVector, + Real* outVector, + const Index* rowPointers, + const Index* columnIndexes, + const Real* values, + const Index rows, + const Index gridID) { + const Index row = ((gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x) / groupSize; + if (row >= rows) + return; - volatile Real* aux = Cuda::getSharedMemory< Real >(); - for( IndexType row = warpStart; row < warpEnd; row++ ) - { - aux[ threadIdx.x ] = 0.0; + Real sum = 0; + Index i; + const Index laneId = threadIdx.x & (groupSize - 1); /*lane index in the group*/ - IndexType elementPtr = this->rowPointers[ row ] + inWarpIdx; - const IndexType rowEnd = this->rowPointers[ row + 1 ]; - IndexType column; - while( elementPtr < rowEnd && - ( column = this->columnIndexes[ elementPtr ] ) < this->getColumns() ) - { - aux[ threadIdx.x ] += inVector[ column ] * this->values[ elementPtr ]; - elementPtr += warpSize; + /*compute dot product*/ + const Index rowEnd = rowPointers[row + 1]; + for (i = rowPointers[row] + laneId; i < rowEnd; i += groupSize) + sum += values[i] * inVector[columnIndexes[i]]; + + /*intra-vector reduction*/ + for (i = groupSize >> 1; i > 0; i >>= 1) + sum += __shfl_down_sync(0xFFFFFFFF, sum, i); + + /*save the results and get a new row*/ + if (laneId == 0) outVector[row] = sum; +} + +template< typename Real, + typename Index> +__global__ +void SpMVCSRLightWithoutAtomic2( const Real *inVector, + Real* outVector, + const Index* rowPointers, + const Index* columnIndexes, + const Real* values, + const Index rows, + const Index gridID) { + const Index row = + ((gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x) / 2; + if (row >= rows) + return; + + const Index inGroupID = threadIdx.x & 1; // & is cheaper than % + const Index maxID = rowPointers[row + 1]; + + Real result = 0.0; + for (Index i = rowPointers[row] + inGroupID; i < maxID; i += 2) + result += values[i] * inVector[columnIndexes[i]]; + + /* Parallel reduction */ + result += __shfl_down_sync(0xFFFFFFFF, result, 1); + + /* Write result */ + if (inGroupID == 0) outVector[row] = result; +} + +template< typename Real, + typename Index> +__global__ +void SpMVCSRLightWithoutAtomic4( const Real *inVector, + Real* outVector, + const Index* rowPointers, + const Index* columnIndexes, + const Real* values, + const Index rows, + const Index gridID) { + const Index row = + ((gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x) / 4; + if (row >= rows) + return; + + const Index inGroupID = threadIdx.x & 3; // & is cheaper than % + const Index maxID = rowPointers[row + 1]; + + Real result = 0.0; + for (Index i = rowPointers[row] + inGroupID; i < maxID; i += 4) + result += values[i] * inVector[columnIndexes[i]]; + + /* Parallel reduction */ + result += __shfl_down_sync(0xFFFFFFFF, result, 2); + result += __shfl_down_sync(0xFFFFFFFF, result, 1); + + /* Write result */ + if (inGroupID == 0) outVector[row] = result; +} + +template< typename Real, + typename Index> +__global__ +void SpMVCSRLightWithoutAtomic8( const Real *inVector, + Real* outVector, + const Index* rowPointers, + const Index* columnIndexes, + const Real* values, + const Index rows, + const Index gridID) { + const Index row = + ((gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x) / 8; + if (row >= rows) + return; + + Index i; + const Index inGroupID = threadIdx.x & 7; // & is cheaper than % + const Index maxID = rowPointers[row + 1]; + + Real result = 0.0; + for (i = rowPointers[row] + inGroupID; i < maxID; i += 8) + result += values[i] * inVector[columnIndexes[i]]; + + /* Parallel reduction */ + result += __shfl_down_sync(0xFFFFFFFF, result, 4); + result += __shfl_down_sync(0xFFFFFFFF, result, 2); + result += __shfl_down_sync(0xFFFFFFFF, result, 1); + + /* Write result */ + if (inGroupID == 0) outVector[row] = result; +} + +template< typename Real, + typename Index> +__global__ +void SpMVCSRLightWithoutAtomic16( const Real *inVector, + Real* outVector, + const Index* rowPointers, + const Index* columnIndexes, + const Real* values, + const Index rows, + const Index gridID) { + const Index row = + ((gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x) / 16; + if (row >= rows) + return; + + + Index i; + const Index inGroupID = threadIdx.x & 15; // & is cheaper than % + const Index maxID = rowPointers[row + 1]; + + Real result = 0.0; + for (i = rowPointers[row] + inGroupID; i < maxID; i += 16) + result += values[i] * inVector[columnIndexes[i]]; + + /* Parallel reduction */ + result += __shfl_down_sync(0xFFFFFFFF, result, 8); + result += __shfl_down_sync(0xFFFFFFFF, result, 4); + result += __shfl_down_sync(0xFFFFFFFF, result, 2); + result += __shfl_down_sync(0xFFFFFFFF, result, 1); + + /* Write result */ + if (inGroupID == 0) outVector[row] = result; +} + +template< typename Real, + typename Index, + typename Device, + CSRKernel KernelType> +void SpMVCSRScalarPrepare( const Real *inVector, + Real* outVector, + const CSR< Real, Device, Index, KernelType >& matrix) { + const Index threads = matrix.THREADS_SCALAR; // block size + size_t neededThreads = matrix.getRowPointers().getSize() - 1; + Index blocks; + /* Execute kernels on device */ + for (Index grid = 0; neededThreads != 0; ++grid) { + if (MAX_X_DIM * threads >= neededThreads) { + blocks = roundUpDivision(neededThreads, threads); + neededThreads = 0; + } else { + blocks = MAX_X_DIM; + neededThreads -= MAX_X_DIM * threads; } - if( warpSize == 32 ) - if( inWarpIdx < 16 ) aux[ threadIdx.x ] += aux[ threadIdx.x + 16 ]; - if( warpSize >= 16 ) - if( inWarpIdx < 8 ) aux[ threadIdx.x ] += aux[ threadIdx.x + 8 ]; - if( warpSize >= 8 ) - if( inWarpIdx < 4 ) aux[ threadIdx.x ] += aux[ threadIdx.x + 4 ]; - if( warpSize >= 4 ) - if( inWarpIdx < 2 ) aux[ threadIdx.x ] += aux[ threadIdx.x + 2 ]; - if( warpSize >= 2 ) - if( inWarpIdx < 1 ) aux[ threadIdx.x ] += aux[ threadIdx.x + 1 ]; - if( inWarpIdx == 0 ) - outVector[ row ] = aux[ threadIdx.x ]; + + SpMVCSRScalar<<>>( + inVector, + outVector, + matrix.getRowPointers().getData(), + matrix.getColumnIndexes().getData(), + matrix.getValues().getData(), + matrix.getRowPointers().getSize() - 1, + grid + ); } } template< typename Real, + typename Index, typename Device, + CSRKernel KernelType, + int warpSize > +void SpMVCSRVectorPrepare( const Real *inVector, + Real* outVector, + const CSR< Real, Device, Index, KernelType >& matrix) { + const Index threads = matrix.THREADS_VECTOR; // block size + size_t neededThreads = matrix.getRowPointers().getSize() * warpSize; + Index blocks; + /* Execute kernels on device */ + for (Index grid = 0; neededThreads != 0; ++grid) { + if (MAX_X_DIM * threads >= neededThreads) { + blocks = roundUpDivision(neededThreads, threads); + neededThreads = 0; + } else { + blocks = MAX_X_DIM; + neededThreads -= MAX_X_DIM * threads; + } + + SpMVCSRVector<<>>( + inVector, + outVector, + matrix.getRowPointers().getData(), + matrix.getColumnIndexes().getData(), + matrix.getValues().getData(), + matrix.getRowPointers().getSize() - 1, + grid + ); + } +} + +template< typename Real, typename Index, - CSRKernel KernelType > - template< typename InVector, - typename OutVector, - int warpSize > -__device__ -void CSR< Real, Device, Index, KernelType >::vectorProductCuda( const InVector& inVector, - OutVector& outVector, - int gridIdx, - int *blocks, size_t size ) const -{ - switch( KernelType ) - { - case CSRScalar: - // TODO: - break; - case CSRVector: - spmvCudaVectorized< InVector, OutVector, warpSize >( inVector, outVector, gridIdx ); - break; - case CSRLight: - spmvCudaLightSpmv< InVector, OutVector, warpSize >( inVector, outVector, gridIdx ); - break; - case CSRAdaptive: - spmvCSRAdaptive< InVector, OutVector, warpSize >( inVector, outVector, gridIdx, blocks, size ); - break; - case CSRStream: - // TODO: - break; + typename Device, + CSRKernel KernelType, + int warpSize > +void SpMVCSRLightPrepare( const Real *inVector, + Real* outVector, + const CSR< Real, Device, Index, KernelType >& matrix) { + const Index threads = 1024; // max block size + const Index rows = matrix.getRowPointers().getSize() - 1; + /* Copy rowCnt to GPU */ + unsigned rowCnt = 0; + unsigned *kernelRowCnt = nullptr; + cudaMalloc((void **)&kernelRowCnt, sizeof(*kernelRowCnt)); + cudaMemcpy(kernelRowCnt, &rowCnt, sizeof(*kernelRowCnt), cudaMemcpyHostToDevice); + /* Get info about GPU */ + cudaDeviceProp properties; + cudaGetDeviceProperties( &properties, Cuda::DeviceInfo::getActiveDevice() ); + const Index blocks = + properties.multiProcessorCount * properties.maxThreadsPerMultiProcessor / threads; + + const Index nnz = roundUpDivision(matrix.getValues().getSize(), rows); // non zeroes per row + if (KernelType == CSRLight) { //----------------------------------------- + if (nnz <= 2) + SpMVCSRLight<<>>( + inVector, outVector, matrix.getRowPointers().getData(), + matrix.getColumnIndexes().getData(), matrix.getValues().getData(), + rows, kernelRowCnt + ); + else if (nnz <= 4) + SpMVCSRLight<<>>( + inVector, outVector, matrix.getRowPointers().getData(), + matrix.getColumnIndexes().getData(), matrix.getValues().getData(), + rows, kernelRowCnt + ); + else if (nnz <= 64) + SpMVCSRLight<<>>( + inVector, outVector, matrix.getRowPointers().getData(), + matrix.getColumnIndexes().getData(), matrix.getValues().getData(), + rows, kernelRowCnt + ); + else + SpMVCSRLight<<>>( + inVector, outVector, matrix.getRowPointers().getData(), + matrix.getColumnIndexes().getData(), matrix.getValues().getData(), + rows, kernelRowCnt + ); + } else if(KernelType == CSRLight2) { //----------------------------------------- + if (nnz <= 2) + SpMVCSRLight2<<>>( + inVector, outVector, matrix.getRowPointers().getData(), + matrix.getColumnIndexes().getData(), matrix.getValues().getData(), + rows, kernelRowCnt + ); + else if (nnz <= 4) + SpMVCSRLight2<<>>( + inVector, outVector, matrix.getRowPointers().getData(), + matrix.getColumnIndexes().getData(), matrix.getValues().getData(), + rows, kernelRowCnt + ); + else if (nnz <= 64) + SpMVCSRLight2<<>>( + inVector, outVector, matrix.getRowPointers().getData(), + matrix.getColumnIndexes().getData(), matrix.getValues().getData(), + rows, kernelRowCnt + ); + else + SpMVCSRLight2<<>>( + inVector, outVector, matrix.getRowPointers().getData(), + matrix.getColumnIndexes().getData(), matrix.getValues().getData(), + rows, kernelRowCnt + ); + } else if(KernelType == CSRLight3) { //----------------------------------------- + if (nnz <= 2) + SpMVCSRLight3<<>>( + inVector, outVector, matrix.getRowPointers().getData(), + matrix.getColumnIndexes().getData(), matrix.getValues().getData(), + rows, kernelRowCnt + ); + else if (nnz <= 4) + SpMVCSRLight3<<>>( + inVector, outVector, matrix.getRowPointers().getData(), + matrix.getColumnIndexes().getData(), matrix.getValues().getData(), + rows, kernelRowCnt + ); + else if (nnz <= 64) + SpMVCSRLight3<<>>( + inVector, outVector, matrix.getRowPointers().getData(), + matrix.getColumnIndexes().getData(), matrix.getValues().getData(), + rows, kernelRowCnt + ); + else + SpMVCSRLight3<<>>( + inVector, outVector, matrix.getRowPointers().getData(), + matrix.getColumnIndexes().getData(), matrix.getValues().getData(), + rows, kernelRowCnt + ); + } else if(KernelType == CSRLight6) { //----------------------------------------- + if (nnz <= 2) + SpMVCSRLight3<<>>( + inVector, outVector, matrix.getRowPointers().getData(), + matrix.getColumnIndexes().getData(), matrix.getValues().getData(), + rows, kernelRowCnt + ); + else if (nnz <= 4) + SpMVCSRLight3<<>>( + inVector, outVector, matrix.getRowPointers().getData(), + matrix.getColumnIndexes().getData(), matrix.getValues().getData(), + rows, kernelRowCnt + ); + else if (nnz <= 8) + SpMVCSRLight3<<>>( + inVector, outVector, matrix.getRowPointers().getData(), + matrix.getColumnIndexes().getData(), matrix.getValues().getData(), + rows, kernelRowCnt + ); + else if (nnz <= 16) + SpMVCSRLight3<<>>( + inVector, outVector, matrix.getRowPointers().getData(), + matrix.getColumnIndexes().getData(), matrix.getValues().getData(), + rows, kernelRowCnt + ); + else + SpMVCSRLight3<<>>( + inVector, outVector, matrix.getRowPointers().getData(), + matrix.getColumnIndexes().getData(), matrix.getValues().getData(), + rows, kernelRowCnt + ); } + cudaFree(kernelRowCnt); +} - /*IndexType globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; - const IndexType warpStart = warpSize * ( globalIdx / warpSize ); - const IndexType warpEnd = min( warpStart + warpSize, this->getRows() ); - const IndexType inWarpIdx = globalIdx % warpSize; +template< typename Real, + typename Index, + typename Device, + CSRKernel KernelType, + int warpSize> +void SpMVCSRLightWithoutAtomicPrepare( const Real *inVector, + Real* outVector, + const CSR< Real, Device, Index, KernelType >& matrix) { + const Index rows = matrix.getRowPointers().getSize() - 1; + const Index threads = matrix.THREADS_LIGHT; // block size + size_t neededThreads = rows * warpSize; + Index blocks, groupSize; + + const Index nnz = roundUpDivision(matrix.getValues().getSize(), rows); // non zeroes per row + if (nnz <= 2) + groupSize = 2; + else if (nnz <= 4) + groupSize = 4; + else if (nnz <= 8) + groupSize = 8; + else if (nnz <= 16) + groupSize = 16; + else if (nnz <= 2 * matrix.MAX_ELEMENTS_PER_WARP) + groupSize = 32; // CSR Vector + else + groupSize = roundUpDivision(nnz, matrix.MAX_ELEMENTS_PER_WARP) * 32; // CSR MultiVector - if( this->getCudaKernelType() == vector ) - + if (KernelType == CSRLightWithoutAtomic) + neededThreads = groupSize * rows; + else + neededThreads = rows * (groupSize > 32 ? 32 : groupSize); + + /* Execute kernels on device */ + for (Index grid = 0; neededThreads != 0; ++grid) { + if (MAX_X_DIM * threads >= neededThreads) { + blocks = roundUpDivision(neededThreads, threads); + neededThreads = 0; + } else { + blocks = MAX_X_DIM; + neededThreads -= MAX_X_DIM * threads; + } - ///// - // Hybrid mode - // - const Index firstRow = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x; - const IndexType lastRow = min( this->getRows(), firstRow + blockDim. x ); - const IndexType nonzerosPerRow = ( this->rowPointers[ lastRow ] - this->rowPointers[ firstRow ] ) / - ( lastRow - firstRow ); + if (KernelType == CSRLightWithoutAtomic) { //----------------------------------------- + if (groupSize == 2) { + SpMVCSRLightWithoutAtomic2<<>>( + inVector, outVector, matrix.getRowPointers().getData(), + matrix.getColumnIndexes().getData(), matrix.getValues().getData(), + rows, grid + ); + } else if (groupSize == 4) { + SpMVCSRLightWithoutAtomic4<<>>( + inVector, outVector, matrix.getRowPointers().getData(), + matrix.getColumnIndexes().getData(), matrix.getValues().getData(), + rows, grid + ); + } else if (groupSize == 8) { + SpMVCSRLightWithoutAtomic8<<>>( + inVector, outVector, matrix.getRowPointers().getData(), + matrix.getColumnIndexes().getData(), matrix.getValues().getData(), + rows, grid + ); + } else if (groupSize == 16) { + SpMVCSRLightWithoutAtomic16<<>>( + inVector, outVector, matrix.getRowPointers().getData(), + matrix.getColumnIndexes().getData(), matrix.getValues().getData(), + rows, grid + ); + } else if (groupSize == 32) { // CSR SpMV Light with groupsize = 32 is CSR Vector + SpMVCSRVector<<>>( + inVector, outVector, matrix.getRowPointers().getData(), + matrix.getColumnIndexes().getData(), matrix.getValues().getData(), + rows, grid + ); + } else { // Execute CSR MultiVector + SpMVCSRMultiVector<<>>( + inVector, outVector, matrix.getRowPointers().getData(), + matrix.getColumnIndexes().getData(), matrix.getValues().getData(), + rows, groupSize / 32, grid + ); + } + } else if (KernelType == CSRLight5) { //----------------------------------------- + if (groupSize == 2) { + SpMVCSRLightWithoutAtomic2<<>>( + inVector, outVector, matrix.getRowPointers().getData(), + matrix.getColumnIndexes().getData(), matrix.getValues().getData(), + rows, grid + ); + } else if (groupSize == 4) { + SpMVCSRLightWithoutAtomic4<<>>( + inVector, outVector, matrix.getRowPointers().getData(), + matrix.getColumnIndexes().getData(), matrix.getValues().getData(), + rows, grid + ); + } else if (groupSize == 8) { + SpMVCSRLightWithoutAtomic8<<>>( + inVector, outVector, matrix.getRowPointers().getData(), + matrix.getColumnIndexes().getData(), matrix.getValues().getData(), + rows, grid + ); + } else if (groupSize == 16) { + SpMVCSRLightWithoutAtomic16<<>>( + inVector, outVector, matrix.getRowPointers().getData(), + matrix.getColumnIndexes().getData(), matrix.getValues().getData(), + rows, grid + ); + } else { // CSR SpMV Light with groupsize = 32 is CSR Vector + SpMVCSRVector<<>>( + inVector, outVector, matrix.getRowPointers().getData(), + matrix.getColumnIndexes().getData(), matrix.getValues().getData(), + rows, grid + ); + } + } else if (KernelType == CSRLight4) { //----------------------------------------- + if (groupSize == 2) { + SpMVCSRLight4<<>>( + inVector, outVector, matrix.getRowPointers().getData(), + matrix.getColumnIndexes().getData(), matrix.getValues().getData(), + rows, grid + ); + } else if (groupSize == 4) { + SpMVCSRLight4<<>>( + inVector, outVector, matrix.getRowPointers().getData(), + matrix.getColumnIndexes().getData(), matrix.getValues().getData(), + rows, grid + ); + } else if (groupSize == 8) { + SpMVCSRLight4<<>>( + inVector, outVector, matrix.getRowPointers().getData(), + matrix.getColumnIndexes().getData(), matrix.getValues().getData(), + rows, grid + ); + } else if (groupSize == 16) { + SpMVCSRLight4<<>>( + inVector, outVector, matrix.getRowPointers().getData(), + matrix.getColumnIndexes().getData(), matrix.getValues().getData(), + rows, grid + ); + } else { // CSR SpMV Light with groupsize = 32 is CSR Vector + SpMVCSRVector<<>>( + inVector, outVector, matrix.getRowPointers().getData(), + matrix.getColumnIndexes().getData(), matrix.getValues().getData(), + rows, grid + ); + } //----------------------------------------- + } + } +} - if( nonzerosPerRow < this->getHybridModeSplit() ) - { - ///// - // Use the scalar mode - // - if( globalIdx < this->getRows() ) - outVector[ globalIdx ] = this->rowVectorProduct( globalIdx, inVector ); +template< typename Real, + typename Index, + typename Device, + CSRKernel KernelType, + int warpSize> +void SpMVCSRMultiVectorPrepare( const Real *inVector, + Real* outVector, + const CSR< Real, Device, Index, KernelType >& matrix) { + const Index rows = matrix.getRowPointers().getSize() - 1; + const Index threads = matrix.THREADS_VECTOR; // block size + Index blocks; + + const Index nnz = roundUpDivision(matrix.getValues().getSize(), rows); // non zeroes per row + const Index neededWarps = roundUpDivision(nnz, matrix.MAX_ELEMENTS_PER_WARP); // warps per row + size_t neededThreads = warpSize * neededWarps * rows; + /* Execute kernels on device */ + for (Index grid = 0; neededThreads != 0; ++grid) { + if (MAX_X_DIM * threads >= neededThreads) { + blocks = roundUpDivision(neededThreads, threads); + neededThreads = 0; + } else { + blocks = MAX_X_DIM; + neededThreads -= MAX_X_DIM * threads; + } + + if (neededWarps == 1) { // one warp per row -> execute CSR Vector + SpMVCSRVector<<>>( + inVector, + outVector, + matrix.getRowPointers().getData(), + matrix.getColumnIndexes().getData(), + matrix.getValues().getData(), + rows, + grid + ); + } else { + SpMVCSRMultiVector<<>>( + inVector, + outVector, + matrix.getRowPointers().getData(), + matrix.getColumnIndexes().getData(), + matrix.getValues().getData(), + rows, + neededWarps, + grid + ); + } + } +} + +template< typename Real, + typename Index, + typename Device, + CSRKernel KernelType, + int warpSize> +void SpMVCSRAdaptivePrepare( const Real *inVector, + Real* outVector, + const CSR< Real, Device, Index, KernelType >& matrix) { + Index blocks; + const Index threads = matrix.THREADS_ADAPTIVE; + + /* Fill blocks */ + size_t neededThreads = matrix.blocks.getSize() * warpSize; // one warp per block + /* Execute kernels on device */ + for (Index grid = 0; neededThreads != 0; ++grid) { + if (MAX_X_DIM * threads >= neededThreads) { + blocks = roundUpDivision(neededThreads, threads); + neededThreads = 0; + } else { + blocks = MAX_X_DIM; + neededThreads -= MAX_X_DIM * threads; + } + + SpMVCSRAdaptive< Real, Index, warpSize, + matrix.WARPS, + matrix.SHARED_PER_WARP, + matrix.MAX_ELEMENTS_PER_WARP > + <<>>( + inVector, + outVector, + matrix.getRowPointers().getData(), + matrix.getColumnIndexes().getData(), + matrix.getValues().getData(), + matrix.blocks.getData(), + matrix.blocks.getSize() - 1, // last block shouldn't be used + grid + ); } - else - { - //// - // Use the vector mode - // - spmvCudaVectorized< InVector, OutVector, warpSize >( inVector, outVector, warpStart, warpEnd, inWarpIdx ); - }*/ } + #endif template<> @@ -1037,121 +1801,6 @@ class CSRDeviceDependentCode< Devices::Host > }; -#ifdef HAVE_CUDA -template< typename Real, - typename Index, - CSRKernel KernelType, - typename InVector, - typename OutVector, - int warpSize > -__global__ void CSRVectorProductCudaKernel( const CSR< Real, Devices::Cuda, Index, KernelType >* matrix, - const InVector* inVector, - OutVector* outVector, - int gridIdx, - int *blocks, size_t size) -{ - typedef CSR< Real, Devices::Cuda, Index > Matrix; - static_assert( std::is_same< typename Matrix::DeviceType, Devices::Cuda >::value, "" ); - const typename Matrix::IndexType rowIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; - if( KernelType == CSRScalar ) - { - if( rowIdx < matrix->getRows() ) - ( *outVector )[ rowIdx ] = matrix->rowVectorProduct( rowIdx, *inVector ); - } - else - { - matrix->template vectorProductCuda< InVector, OutVector, warpSize > - ( *inVector, *outVector, gridIdx, blocks, size ); - } -} -#endif - -template< typename Real, - typename Index, - CSRKernel KernelType, - typename InVector, - typename OutVector > -void CSRVectorProductCuda( const CSR< Real, Devices::Cuda, Index, KernelType >& matrix, - const InVector& inVector, - OutVector& outVector, - int *blocks, - size_t size ) -{ -#ifdef HAVE_CUDA - typedef CSR< Real, Devices::Cuda, Index, KernelType > Matrix; - typedef typename Matrix::IndexType IndexType; - Matrix* kernel_this = Cuda::passToDevice( matrix ); - InVector* kernel_inVector = Cuda::passToDevice( inVector ); - OutVector* kernel_outVector = Cuda::passToDevice( outVector ); - int *kernelBlocks; - cudaMalloc((void **)&kernelBlocks, sizeof(int) * size); - cudaMemcpy(kernelBlocks, blocks, size * sizeof(int), cudaMemcpyHostToDevice); - - TNL_CHECK_CUDA_DEVICE; - dim3 cudaBlockSize( 256 ); - //dim3 cudaGridSize( Cuda::getMaxGridSize() ); - const IndexType cudaBlocks = roundUpDivision( matrix.getRows(), cudaBlockSize.x ); - const IndexType cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() ); - for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) - { - //if( gridIdx == cudaGrids - 1 ) - // cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize(); - //const int sharedMemory = cudaBlockSize.x * sizeof( Real ); - //const int threads = cudaBlockSize.x; - if( matrix.getCudaWarpSize() == 32 ) { - // printf("BL %d BLSIZE %d\n", (int)cudaBlocks, (int)threads); - CSRVectorProductCudaKernel< Real, Index, KernelType, InVector, OutVector, 32 > - <<< 2, 1024 >>> - ( kernel_this, - kernel_inVector, - kernel_outVector, - gridIdx, kernelBlocks, size ); - } - // if( matrix.getCudaWarpSize() == 16 ) - // CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 16 > - // <<< cudaGridSize, cudaBlockSize, sharedMemory >>> - // ( kernel_this, - // kernel_inVector, - // kernel_outVector, - // gridIdx, kernelBlocks, size ); - // if( matrix.getCudaWarpSize() == 8 ) - // CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 8 > - // <<< cudaGridSize, cudaBlockSize, sharedMemory >>> - // ( kernel_this, - // kernel_inVector, - // kernel_outVector, - // gridIdx, kernelBlocks, size ); - // if( matrix.getCudaWarpSize() == 4 ) - // CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 4 > - // <<< cudaGridSize, cudaBlockSize, sharedMemory >>> - // ( kernel_this, - // kernel_inVector, - // kernel_outVector, - // gridIdx, kernelBlocks, size ); - // if( matrix.getCudaWarpSize() == 2 ) - // CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 2 > - // <<< cudaGridSize, cudaBlockSize, sharedMemory >>> - // ( kernel_this, - // kernel_inVector, - // kernel_outVector, - // gridIdx, kernelBlocks, size ); - // if( matrix.getCudaWarpSize() == 1 ) - // CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 1 > - // <<< cudaGridSize, cudaBlockSize, sharedMemory >>> - // ( kernel_this, - // kernel_inVector, - // kernel_outVector, - // gridIdx, kernelBlocks, size ); - - } - TNL_CHECK_CUDA_DEVICE; - Cuda::freeFromDevice( kernel_this ); - Cuda::freeFromDevice( kernel_inVector ); - Cuda::freeFromDevice( kernel_outVector ); - TNL_CHECK_CUDA_DEVICE; -#endif -} - #ifdef HAVE_CUSPARSE template<> @@ -1260,6 +1909,7 @@ class CSRDeviceDependentCode< Devices::Cuda > const InVector& inVector, OutVector& outVector ) { +#ifdef HAVE_CUDA #ifdef HAVE_CUSPARSE tnlCusparseCSRWrapper< Real, Index >::vectorProduct( matrix.getRows(), matrix.getColumns(), @@ -1270,39 +1920,47 @@ class CSRDeviceDependentCode< Devices::Cuda > inVector.getData(), outVector.getData() ); #else - constexpr int SHARED = 49152/sizeof(float); - constexpr int SHARED_PER_WARP = SHARED / 32; - std::vector inBlock; - inBlock.push_back(0); - size_t sum = 0; - Index i; - int prev_i = 0; - for (i = 1; i < matrix.getRowPointers().getSize() - 1; ++i) { - size_t elements = matrix.getRowPointers().getElement(i) - - matrix.getRowPointers().getElement(i - 1); - sum += elements; - if (sum > SHARED_PER_WARP) { - if (i - prev_i == 1) { - inBlock.push_back(i); - } else { - inBlock.push_back(i - 1); - --i; - } - sum = 0; - prev_i = i; - continue; - } - if (i - prev_i == 32) { - inBlock.push_back(i); - prev_i = i; - sum = 0; - } + switch(KernelType) + { + case CSRScalar: + SpMVCSRScalarPrepare( + inVector.getData(), outVector.getData(), matrix + ); + break; + case CSRVector: + SpMVCSRVectorPrepare( + inVector.getData(), outVector.getData(), matrix + ); + break; + case CSRLight: + case CSRLight2: + case CSRLight3: + case CSRLight6: + SpMVCSRLightPrepare( + inVector.getData(), outVector.getData(), matrix + ); + break; + case CSRAdaptive: + SpMVCSRAdaptivePrepare( + inVector.getData(), outVector.getData(), matrix + ); + break; + case CSRMultiVector: + SpMVCSRMultiVectorPrepare( + inVector.getData(), outVector.getData(), matrix + ); + break; + case CSRLight4: + case CSRLight5: + case CSRLightWithoutAtomic: + SpMVCSRLightWithoutAtomicPrepare( + inVector.getData(), outVector.getData(), matrix + ); + break; } - inBlock.push_back(matrix.getRowPointers().getSize() - 1); - CSRVectorProductCuda( matrix, inVector, outVector, inBlock.data(), inBlock.size() ); +#endif /* HAVE_CUDA */ #endif } - }; } //namespace Legacy diff --git a/src/TNL/Matrices/Legacy/EllpackSymmetric.h b/src/TNL/Matrices/Legacy/EllpackSymmetric.h deleted file mode 100644 index af3c2e4a81a5d966dd644483add94db471069f6d..0000000000000000000000000000000000000000 --- a/src/TNL/Matrices/Legacy/EllpackSymmetric.h +++ /dev/null @@ -1,190 +0,0 @@ -/*************************************************************************** - EllpackSymmetric.h - description - ------------------- - begin : Aug 30, 2018 - copyright : (C) 2018 by Tomas Oberhuber - email : tomas.oberhuber@fjfi.cvut.cz - ***************************************************************************/ - -/* See Copyright Notice in tnl/Copyright */ - -#pragma once - -#include -#include - -namespace TNL { -namespace Matrices { - namespace Legacy { - -template< typename Device > -class EllpackSymmetricDeviceDependentCode; - -template< typename Real, typename Device = Devices::Host, typename Index = int > -class EllpackSymmetric : public Sparse< Real, Device, Index > -{ - public: - - typedef Real RealType; - typedef Device DeviceType; - typedef Index IndexType; - typedef typename Sparse< RealType, DeviceType, IndexType >::CompressedRowLengthsVector CompressedRowLengthsVector; - typedef typename Sparse< RealType, DeviceType, IndexType >::ConstCompressedRowLengthsVectorView ConstCompressedRowLengthsVectorView; - typedef typename Sparse< RealType, DeviceType, IndexType >::ValuesVector ValuesVector; - typedef typename Sparse< RealType, DeviceType, IndexType >::ColumnIndexesVector ColumnIndexesVector; - - template< typename _Real = Real, - typename _Device = Device, - typename _Index = Index > - using Self = EllpackSymmetric< _Real, _Device, _Index >; - - EllpackSymmetric(); - - void setDimensions( const IndexType rows, - const IndexType columns ); - - void setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths ); - - bool setConstantRowLengths( const IndexType& rowLengths ); - - IndexType getRowLength( const IndexType row ) const; - - template< typename Real2, typename Device2, typename Index2 > - bool setLike( const EllpackSymmetric< Real2, Device2, Index2 >& matrix ); - - void reset(); - - template< typename Real2, typename Device2, typename Index2 > - bool operator == ( const EllpackSymmetric< Real2, Device2, Index2 >& matrix ) const; - - template< typename Real2, typename Device2, typename Index2 > - bool operator != ( const EllpackSymmetric< Real2, Device2, Index2 >& matrix ) const; - - /*template< typename Matrix > - bool copyFrom( const Matrix& matrix, - const CompressedRowLengthsVector& rowLengths );*/ - - __cuda_callable__ - bool setElementFast( const IndexType row, - const IndexType column, - const RealType& value ); - - bool setElement( const IndexType row, - const IndexType column, - const RealType& value ); - - __cuda_callable__ - bool addElementFast( const IndexType row, - const IndexType column, - const RealType& value, - const RealType& thisElementMultiplicator = 1.0 ); - - bool addElement( const IndexType row, - const IndexType column, - const RealType& value, - const RealType& thisElementMultiplicator = 1.0 ); - - - __cuda_callable__ - bool setRowFast( const IndexType row, - const IndexType* columnIndexes, - const RealType* values, - const IndexType elements ); - - bool setRow( const IndexType row, - const IndexType* columnIndexes, - const RealType* values, - const IndexType elements ); - - - __cuda_callable__ - bool addRowFast( const IndexType row, - const IndexType* columns, - const RealType* values, - const IndexType numberOfElements, - const RealType& thisElementMultiplicator = 1.0 ); - - bool addRow( const IndexType row, - const IndexType* columns, - const RealType* values, - const IndexType numberOfElements, - const RealType& thisElementMultiplicator = 1.0 ); - - __cuda_callable__ - RealType getElementFast( const IndexType row, - const IndexType column ) const; - - RealType getElement( const IndexType row, - const IndexType column ) const; - - __cuda_callable__ - void getRowFast( const IndexType row, - IndexType* columns, - RealType* values ) const; - - void getRow( const IndexType row, - IndexType* columns, - RealType* values ) const; - - template< typename Vector > - __cuda_callable__ - typename Vector::RealType rowVectorProduct( const IndexType row, - const Vector& vector ) const; - - template< typename InVector, - typename OutVector > - void vectorProduct( const InVector& inVector, - OutVector& outVector ) const; - - template< typename InVector, - typename OutVector > - void vectorProductHost( const InVector& inVector, - OutVector& outVector ) const; - - template< typename Real2, typename Index2 > - void addMatrix( const EllpackSymmetric< Real2, Device, Index2 >& matrix, - const RealType& matrixMultiplicator = 1.0, - const RealType& thisMatrixMultiplicator = 1.0 ); - - template< typename Real2, typename Index2 > - void getTransposition( const EllpackSymmetric< Real2, Device, Index2 >& matrix, - const RealType& matrixMultiplicator = 1.0 ); - - template< typename Vector > - bool performSORIteration( const Vector& b, - const IndexType row, - Vector& x, - const RealType& omega = 1.0 ) const; - - void save( File& file ) const; - - void load( File& file ); - - void save( const String& fileName ) const; - - void load( const String& fileName ); - - void print( std::ostream& str ) const; - - template< typename InVector, - typename OutVector > - __cuda_callable__ - void spmvCuda( const InVector& inVector, - OutVector& outVector, - int rowIdx ) const; - - protected: - - void allocateElements(); - - IndexType rowLengths, alignedRows; - - typedef EllpackSymmetricDeviceDependentCode< DeviceType > DeviceDependentCode; - friend class EllpackSymmetricDeviceDependentCode< DeviceType >; -}; - -} //namespace Legacy -} // namespace Matrices -} // namespace TNL - -#include diff --git a/src/TNL/Matrices/Legacy/EllpackSymmetricGraph.h b/src/TNL/Matrices/Legacy/EllpackSymmetricGraph.h deleted file mode 100644 index dd42b7f26a93ae088b3198c7975949f415ae021e..0000000000000000000000000000000000000000 --- a/src/TNL/Matrices/Legacy/EllpackSymmetricGraph.h +++ /dev/null @@ -1,212 +0,0 @@ -/*************************************************************************** - EllpackSymmetricGraph.h - description - ------------------- - begin : Aug 30, 2018 - copyright : (C) 2018 by Tomas Oberhuber - email : tomas.oberhuber@fjfi.cvut.cz - ***************************************************************************/ - -/* See Copyright Notice in tnl/Copyright */ - -#pragma once - -#include -#include - -namespace TNL { -namespace Matrices { - namespace Legacy { - -template< typename Device > -class EllpackSymmetricGraphDeviceDependentCode; - -template< typename Real, typename Device = Devices::Host, typename Index = int > -class EllpackSymmetricGraph : public Sparse< Real, Device, Index > -{ - public: - - typedef Real RealType; - typedef Device DeviceType; - typedef Index IndexType; - typedef typename Sparse< RealType, DeviceType, IndexType >::CompressedRowLengthsVector CompressedRowLengthsVector; - typedef typename Sparse< RealType, DeviceType, IndexType >::ConstCompressedRowLengthsVectorView ConstCompressedRowLengthsVectorView; - typedef typename Sparse< RealType, DeviceType, IndexType >::ValuesVector ValuesVector; - typedef typename Sparse< RealType, DeviceType, IndexType >::ColumnIndexesVector ColumnIndexesVector; - - template< typename _Real = Real, - typename _Device = Device, - typename _Index = Index > - using Self = EllpackSymmetricGraph< _Real, _Device, _Index >; - - EllpackSymmetricGraph(); - - void setDimensions( const IndexType rows, - const IndexType columns ); - - void setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths ); - - bool setConstantRowLengths( const IndexType& rowLengths ); - - IndexType getRowLength( const IndexType row ) const; - - template< typename Real2, typename Device2, typename Index2 > - bool setLike( const EllpackSymmetricGraph< Real2, Device2, Index2 >& matrix ); - - void reset(); - - //template< typename Real2, typename Device2, typename Index2 > - //bool operator == ( const EllpackSymmetricGraph< Real2, Device2, Index2 >& matrix ) const; - - //template< typename Real2, typename Device2, typename Index2 > - //bool operator != ( const EllpackSymmetricGraph< Real2, Device2, Index2 >& matrix ) const; - - /*template< typename Matrix > - bool copyFrom( const Matrix& matrix, - const CompressedRowLengthsVector& rowLengths );*/ - - __cuda_callable__ - bool setElementFast( const IndexType row, - const IndexType column, - const RealType& value ); - - bool setElement( const IndexType row, - const IndexType column, - const RealType& value ); - - __cuda_callable__ - bool addElementFast( const IndexType row, - const IndexType column, - const RealType& value, - const RealType& thisElementMultiplicator = 1.0 ); - - bool addElement( const IndexType row, - const IndexType column, - const RealType& value, - const RealType& thisElementMultiplicator = 1.0 ); - - - __cuda_callable__ - bool setRowFast( const IndexType row, - const IndexType* columnIndexes, - const RealType* values, - const IndexType elements ); - - bool setRow( const IndexType row, - const IndexType* columnIndexes, - const RealType* values, - const IndexType elements ); - - - __cuda_callable__ - bool addRowFast( const IndexType row, - const IndexType* columns, - const RealType* values, - const IndexType numberOfElements, - const RealType& thisElementMultiplicator = 1.0 ); - - bool addRow( const IndexType row, - const IndexType* columns, - const RealType* values, - const IndexType numberOfElements, - const RealType& thisElementMultiplicator = 1.0 ); - - __cuda_callable__ - RealType getElementFast( const IndexType row, - const IndexType column ) const; - - RealType getElement( const IndexType row, - const IndexType column ) const; - - __cuda_callable__ - void getRowFast( const IndexType row, - IndexType* columns, - RealType* values ) const; - - void getRow( const IndexType row, - IndexType* columns, - RealType* values ) const; - - template< typename Vector > - __cuda_callable__ - typename Vector::RealType rowVectorProduct( const IndexType row, - const Vector& vector ) const; - - template< typename InVector, - typename OutVector > - void vectorProduct( const InVector& inVector, - OutVector& outVector ) const; - - template< typename InVector, - typename OutVector > - void vectorProductHost( const InVector& inVector, - OutVector& outVector ) const; - -#ifdef HAVE_CUDA - template< typename InVector, - typename OutVector > - __cuda_callable__ - void spmvCuda( const InVector& inVector, - OutVector& outVector, - const int globalIdx, - const int color ) const; -#endif - - void computePermutationArray(); - - bool rearrangeMatrix( bool verbose ); - - void save( File& file ) const; - - void load( File& file ); - - void save( const String& fileName ) const; - - void load( const String& fileName ); - - void print( std::ostream& str ) const; - - bool help( bool verbose = false ); - - void verifyPermutationArray(); - - __cuda_callable__ - Index getRowLengthsInt() const; - - __cuda_callable__ - Index getAlignedRows() const; - - __cuda_callable__ - Index getRowsOfColor( IndexType color ) const; - - void copyFromHostToCuda( EllpackSymmetricGraph< Real, Devices::Host, Index >& matrix ); - - __cuda_callable__ - Containers::Vector< Index, Device, Index >& getPermutationArray(); - - __cuda_callable__ - Containers::Vector< Index, Device, Index >& getInversePermutation(); - - __cuda_callable__ - Containers::Vector< Index, Device, Index >& getColorPointers(); - - protected: - - void allocateElements(); - - IndexType rowLengths, alignedRows; - - typedef EllpackSymmetricGraphDeviceDependentCode< DeviceType > DeviceDependentCode; - friend class EllpackSymmetricGraphDeviceDependentCode< DeviceType >; - - Containers::Vector< Index, Device, Index > permutationArray; - Containers::Vector< Index, Device, Index > inversePermutationArray; - Containers::Vector< Index, Device, Index > colorPointers; - bool rearranged; -}; - -} //namespace Legacy -} // namespace Matrices -} // namespace TNL - - -#include diff --git a/src/TNL/Matrices/Legacy/EllpackSymmetricGraph_impl.h b/src/TNL/Matrices/Legacy/EllpackSymmetricGraph_impl.h deleted file mode 100644 index 6f5419196dc6839e6831bb2fe5579ae1bc87823a..0000000000000000000000000000000000000000 --- a/src/TNL/Matrices/Legacy/EllpackSymmetricGraph_impl.h +++ /dev/null @@ -1,1044 +0,0 @@ -/*************************************************************************** - EllpackSymmetricGraph_impl.h - description - ------------------- - begin : Aug 30, 2018 - copyright : (C) 2018 by Tomas Oberhuber - email : tomas.oberhuber@fjfi.cvut.cz - ***************************************************************************/ - -/* See Copyright Notice in tnl/Copyright */ - -#pragma once - -#include -#include -#include - -namespace TNL { -namespace Matrices { - namespace Legacy { - -template< typename Real, - typename Device, - typename Index > -EllpackSymmetricGraph< Real, Device, Index > :: EllpackSymmetricGraph() -: rowLengths( 0 ), alignedRows( 0 ), rearranged( false ) -{ -}; - -template< typename Real, - typename Device, - typename Index > -__cuda_callable__ -Index EllpackSymmetricGraph< Real, Device, Index >::getRowLengthsInt() const -{ - return this->rowLengths; -} - -template< typename Real, - typename Device, - typename Index > -Index EllpackSymmetricGraph< Real, Device, Index >::getAlignedRows() const -{ - return this->alignedRows; -} - -template< typename Real, - typename Device, - typename Index > -String EllpackSymmetricGraph< Real, Device, Index > :: getType() -{ - return String( "Matrices::EllpackSymmetricGraph< ") + - String( TNL::getType< Real >() ) + - String( ", " ) + - String( Device::getDeviceType() ) + - String( ", " ) + - String( TNL::getType< Index >() ) + - String( " >" ); -} - -template< typename Real, - typename Device, - typename Index > -String EllpackSymmetricGraph< Real, Device, Index >::getTypeVirtual() const -{ - return this->getType(); -} - -template< typename Real, - typename Device, - typename Index > -void EllpackSymmetricGraph< Real, Device, Index >::setDimensions( const IndexType rows, - const IndexType columns ) -{ - TNL_ASSERT( rows > 0 && columns > 0, - std::cerr << "rows = " << rows - << " columns = " << columns << std::endl ); - - this->rows = rows; - this->columns = columns; - - if( std::is_same< DeviceType, Devices::Cuda >::value ) - { - this->alignedRows = roundToMultiple( columns, Devices::Cuda::getWarpSize() ); - - if( this->rows - this->alignedRows > 0 ) - { - IndexType missingRows = this->rows - this->alignedRows; - missingRows = roundToMultiple( missingRows, Devices::Cuda::getWarpSize() ); - this->alignedRows += missingRows; - -// this->alignedRows += roundToMultiple( this->rows - this->alignedRows, Devices::Cuda::getWarpSize() ); - } - } - else this->alignedRows = rows; - - if( this->rowLengths != 0 ) - allocateElements(); -} - -template< typename Real, - typename Device, - typename Index > -void EllpackSymmetricGraph< Real, Device, Index >::setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths ) -{ - TNL_ASSERT( this->getRows() > 0, ); - TNL_ASSERT( this->getColumns() > 0, ); - //TNL_ASSERT( this->rowLengths > 0, - // std::cerr << "this->rowLengths = " << this->rowLengths ); - this->rowLengths = this->maxRowLength = max( rowLengths ); - this->permutationArray.setSize( this->getRows() ); - for( IndexType i = 0; i < this->getRows(); i++ ) - this->permutationArray.setElement( i, i ); - allocateElements(); -} - -template< typename Real, - typename Device, - typename Index > -__cuda_callable__ -Index EllpackSymmetricGraph< Real, Device, Index >::getRowsOfColor( IndexType color ) const -{ - return this->colorPointers[ color + 1 ] - this->colorPointers[ color ]; -} - -/* -template< typename Real, - typename Device, - typename Index > -#ifdef HAVE_CUDA -__device__ __host__ -#endif -void EllpackSymmetricGraph< Real, Device, Index >::computeColorsVector( Containers::Vector< Index, Device, Index >& colorsVector ) -{ - this->numberOfColors = 0; - - for( IndexType i = this->getRows() - 1; i >= 0; i-- ) - { - // init color array - Containers::Vector< Index, Device, Index > usedColors; - usedColors.setSize( this->numberOfColors ); - for( IndexType j = 0; j < this->numberOfColors; j++ ) - usedColors.setElement( j, 0 ); - - // find all colors used in given row - - // optimization: - // load the whole row in sparse format - // traverse it while don't hit the padding index or end of the row - // for each nonzero element write -> usedColors.setElement( colorsVector.getElement( column ), 1 ) - IndexType* columns = new IndexType[ this->getRowLength( i ) ]; - RealType* values = new RealType[ this->getRowLength( i ) ]; - this->getRow( i, columns, values ); - for( IndexType j = 0; j < this->getRowLength( i ); j++ ) - { - // we are only interested in symmetric part of the matrix - if( columns[ j ] < i + 1 ) - continue; - - // if we hit padding index, there is no reason to continue iterations - if( columns[ j ] == this->getPaddingIndex() ) - break; - - usedColors.setElement( colorsVector.getElement( columns[ j ] ), 1 ); - } - delete [] columns; - delete [] values; - - - //for( IndexType j = i + 1; j < this->getColumns(); j++ ) - // if( this->getElement( i, j ) != 0.0 ) - // usedColors.setElement( colorsVector.getElement( j ), 1 ); - - // find unused color - bool found = false; - for( IndexType j = 0; j < this->numberOfColors; j++ ) - if( usedColors.getElement( j ) == 0 ) - { - colorsVector.setElement( i, j ); - found = true; - break; - } - if( !found ) - { - colorsVector.setElement( i, this->numberOfColors ); - this->numberOfColors++; - } - } -} -*/ - -template< typename Real, - typename Device, - typename Index > -void EllpackSymmetricGraph< Real, Device, Index >::computePermutationArray() -{ - // init vector of colors and permutation array - Containers::Vector< Index, Device, Index > colorsVector; - colorsVector.setSize( this->getRows() ); - for( IndexType i = 0; i < this->getRows(); i++ ) - { - colorsVector.setElement( i, 0 ); - } - - // compute colors for each row - Matrix< Real, Device, Index >::computeColorsVector( colorsVector ); - - // init color pointers - this->colorPointers.setSize( this->getNumberOfColors() + 1 ); - - // compute permutation - IndexType position = 0; - for( IndexType color = 0; color < this->getNumberOfColors(); color++ ) - { - this->colorPointers.setElement( color, position ); - for (IndexType i = 0; i < this->getRows(); i++) - if ( colorsVector.getElement( i ) == color) - { - IndexType row1 = this->permutationArray.getElement( i ); - IndexType row2 = this->permutationArray.getElement( position ); - IndexType tmp = this->permutationArray.getElement( row1 ); - this->permutationArray.setElement( row1, this->permutationArray.getElement( row2 ) ); - this->permutationArray.setElement( row2, tmp ); - - tmp = colorsVector.getElement( position ); - colorsVector.setElement( position, colorsVector.getElement( i ) ); - colorsVector.setElement( i, tmp ); - position++; - } - } - - this->colorPointers.setElement( this->getNumberOfColors(), this->getRows() ); - - // destroy colors vector - colorsVector.reset(); - - this->inversePermutationArray.setSize( this->getRows() ); - for( IndexType row = 0; row < this->getRows(); row++ ) - this->inversePermutationArray.setElement( this->permutationArray.getElement( row ), row ); -} - -template< typename Real, - typename Device, - typename Index > -void EllpackSymmetricGraph< Real, Device, Index >::verifyPermutationArray() -{ - for( IndexType i = 0; i < this->getRows(); i++ ) - if( this->permutationArray.getElement( i ) >= this->getRows() ) - { - std::cerr << "There is wrong data in permutationArray position " << i << std::endl; - break; - } -} - -template< typename Real, - typename Device, - typename Index > -bool EllpackSymmetricGraph< Real, Device, Index >::rearrangeMatrix( bool verbose ) -{ - // first we need to know permutation - this->computePermutationArray(); - if( verbose ) - this->verifyPermutationArray(); - - // then we need to create new matrix - Containers::Vector< Real, Device, Index > valuesVector; - Containers::Vector< Index, Device, Index > columnsVector; - valuesVector.setSize( this->values.getSize() ); - columnsVector.setSize( this->columnIndexes.getSize() ); - valuesVector.setValue( 0.0 ); - columnsVector.setValue( this->getPaddingIndex() ); - - for( IndexType row = 0; row < this->getRows(); row++ ) - { - typedef EllpackSymmetricGraphDeviceDependentCode< DeviceType > DDCType; - IndexType elementPtrOrig = DDCType::getRowBegin( *this, row ); - IndexType elementPtrNew = DDCType::getRowBegin( *this, this->permutationArray.getElement( row ) ); - IndexType rowEnd = DDCType::getRowEnd( *this, row ); - IndexType step = DDCType::getElementStep( *this ); - - for( IndexType i = 0; i < this->rowLengths; i++ ) - { - if( this->columnIndexes.getElement( elementPtrOrig ) <= row ) - { - valuesVector.setElement(elementPtrNew, this->values.getElement(elementPtrOrig)); - columnsVector.setElement(elementPtrNew, this->columnIndexes.getElement(elementPtrOrig)); - elementPtrNew += step; - } - elementPtrOrig += step; - } - } - - // reset original matrix - this->values.reset(); - this->columnIndexes.reset(); - - // deep copy new matrix - this->values.setSize( valuesVector.getSize() ); - this->columnIndexes.setSize( columnsVector.getSize() ); - this->values = valuesVector; - this->columnIndexes = columnsVector; - - // clear memory - valuesVector.reset(); - columnsVector.reset(); - - this->rearranged = true; - return true; -} - -template< typename Real, - typename Device, - typename Index > -__cuda_callable__ -Containers::Vector< Index, Device, Index >& -EllpackSymmetricGraph< Real, Device, Index >::getPermutationArray() -{ - return this->permutationArray; -} - -template< typename Real, - typename Device, - typename Index > -__cuda_callable__ -Containers::Vector< Index, Device, Index >& -EllpackSymmetricGraph< Real, Device, Index >::getInversePermutation() -{ - return this->inversePermutationArray; -} - -template< typename Real, - typename Device, - typename Index > -__cuda_callable__ -Containers::Vector< Index, Device, Index >& -EllpackSymmetricGraph< Real, Device, Index >::getColorPointers() -{ - return this->colorPointers; -} - -template< typename Real, - typename Device, - typename Index > -void EllpackSymmetricGraph< Real, Device, Index >::copyFromHostToCuda( EllpackSymmetricGraph< Real, Devices::Host, Index >& matrix ) -{ - // TODO: fix - //Sparse< Real, Device, Index >::copyFromHostToCuda( matrix ); - - this->rearranged = true; - this->rowLengths = matrix.getRowLengthsInt(); - this->alignedRows = matrix.getAlignedRows(); - Containers::Vector< Index, Devices::Host, Index >& colorPointers = matrix.getColorPointers(); - this->colorPointers.setSize( colorPointers.getSize() ); - for( IndexType i = 0; i < colorPointers.getSize(); i++ ) - this->colorPointers.setElement( i, colorPointers[ i ] ); - - Containers::Vector< Index,Devices::Host, Index >& permutationArray = matrix.getPermutationArray(); - this->permutationArray.setSize( permutationArray.getSize() ); - for( IndexType i = 0; i < permutationArray.getSize(); i++ ) - this->permutationArray.setElement( i, permutationArray[ i ] ); - - Containers::Vector< Index, Devices::Host, Index >& inversePermutation = matrix.getInversePermutation(); - this->inversePermutationArray.setSize( inversePermutation.getSize() ); - for( IndexType i = 0; i < inversePermutation.getSize(); i++ ) - this->inversePermutationArray.setElement( i, inversePermutation[ i ] ); - - for( IndexType i = 0; i < this->getRows(); i++ ) - for( IndexType j = 0; j <= i; j++ ) - if( matrix.getElement( i, j ) != 0.0 ) - this->setElementFast( i, j, matrix.getElement( i, j ) ); - - colorPointers.reset(); - permutationArray.reset(); -} - -template< typename Real, - typename Device, - typename Index > -bool EllpackSymmetricGraph< Real, Device, Index >::setConstantRowLengths( const IndexType& rowLengths ) -{ - TNL_ASSERT( rowLengths > 0, std::cerr << " rowLengths = " << rowLengths ); - this->rowLengths = rowLengths; - if( this->rows > 0 ) - allocateElements(); - return true; -} - -template< typename Real, - typename Device, - typename Index > -Index EllpackSymmetricGraph< Real, Device, Index >::getRowLength( const IndexType row ) const -{ - return this->rowLengths; -} - -template< typename Real, - typename Device, - typename Index > - template< typename Real2, - typename Device2, - typename Index2 > -bool EllpackSymmetricGraph< Real, Device, Index >::setLike( const EllpackSymmetricGraph< Real2, Device2, Index2 >& matrix ) -{ - if( ! Sparse< Real, Device, Index >::setLike( matrix ) || - ! this->permutationArray.setLike( matrix.permutationArray ) || - ! this->colorPointers.setLike( matrix.colorPointers ) ) - return false; - this->rowLengths = matrix.rowLengths; - this->numberOfColors = matrix.getNumberOfColors(); - return true; -} - -template< typename Real, - typename Device, - typename Index > -void EllpackSymmetricGraph< Real, Device, Index > :: reset() -{ - Sparse< Real, Device, Index >::reset(); - this->permutationArray.reset(); - this->colorPointers.reset(); - this->rowLengths = 0; -} - -/*template< typename Real, - typename Device, - typename Index > - template< typename Matrix > -bool EllpackSymmetricGraph< Real, Device, Index >::copyFrom( const Matrix& matrix, - const CompressedRowLengthsVector& rowLengths ) -{ - return tnlMatrix< RealType, DeviceType, IndexType >::copyFrom( matrix, rowLengths ); -}*/ - -template< typename Real, - typename Device, - typename Index > -__cuda_callable__ -bool EllpackSymmetricGraph< Real, Device, Index > :: setElementFast( const IndexType row, - const IndexType column, - const Real& value ) -{ - return this->addElementFast( row, column, value, 0.0 ); -} - -template< typename Real, - typename Device, - typename Index > -bool EllpackSymmetricGraph< Real, Device, Index > :: setElement( const IndexType row, - const IndexType column, - const Real& value ) -{ - return this->addElement( row, column, value, 0.0 ); -} - - -template< typename Real, - typename Device, - typename Index > -__cuda_callable__ -bool EllpackSymmetricGraph< Real, Device, Index > :: addElementFast( const IndexType row, - const IndexType column, - const RealType& value, - const RealType& thisElementMultiplicator ) -{ - typedef EllpackSymmetricGraphDeviceDependentCode< DeviceType > DDCType; - IndexType i = DDCType::getRowBegin( *this, this->permutationArray[ row ] ); - const IndexType rowEnd = DDCType::getRowEnd( *this, this->permutationArray[ row ] ); - const IndexType step = DDCType::getElementStep( *this ); - - while( i < rowEnd && - this->columnIndexes[ i ] < column && - this->columnIndexes[ i ] != this->getPaddingIndex() ) i += step; - if( i == rowEnd ) - return false; - if( this->columnIndexes[ i ] == column ) - { - this->values[ i ] = thisElementMultiplicator * this->values[ i ] + value; - return true; - } - else - if( this->columnIndexes[ i ] == this->getPaddingIndex() ) // artificial zero - { - this->columnIndexes[ i ] = column; - this->values[ i ] = value; - } - else - { - Index j = rowEnd - step; - while( j > i ) - { - this->columnIndexes[ j ] = this->columnIndexes[ j - step ]; - this->values[ j ] = this->values[ j - step ]; - j -= step; - } - this->columnIndexes[ i ] = column; - this->values[ i ] = value; - } - return true; -} - -template< typename Real, - typename Device, - typename Index > -bool EllpackSymmetricGraph< Real, Device, Index > :: addElement( const IndexType row, - const IndexType column, - const RealType& value, - const RealType& thisElementMultiplicator ) -{ - typedef EllpackSymmetricGraphDeviceDependentCode< DeviceType > DDCType; - IndexType i = DDCType::getRowBegin( *this, this->permutationArray[ row ] ); - const IndexType rowEnd = DDCType::getRowEnd( *this, this->permutationArray[ row ] ); - const IndexType step = DDCType::getElementStep( *this ); - - while( i < rowEnd && - this->columnIndexes.getElement( i ) < column && - this->columnIndexes.getElement( i ) != this->getPaddingIndex() ) i += step; - if( i == rowEnd ) - return false; - if( this->columnIndexes.getElement( i ) == column ) - { - this->values.setElement( i, thisElementMultiplicator * this->values.getElement( i ) + value ); - return true; - } - else - if( this->columnIndexes.getElement( i ) == this->getPaddingIndex() ) - { - this->columnIndexes.setElement( i, column ); - this->values.setElement( i, value ); - } - else - { - IndexType j = rowEnd - step; - while( j > i ) - { - this->columnIndexes.setElement( j, this->columnIndexes.getElement( j - step ) ); - this->values.setElement( j, this->values.getElement( j - step ) ); - j -= step; - } - this->columnIndexes.setElement( i, column ); - this->values.setElement( i, value ); - } - return true; -} - -template< typename Real, - typename Device, - typename Index > -__cuda_callable__ -bool EllpackSymmetricGraph< Real, Device, Index > :: setRowFast( const IndexType row, - const IndexType* columnIndexes, - const RealType* values, - const IndexType elements ) -{ - typedef EllpackSymmetricGraphDeviceDependentCode< DeviceType > DDCType; - IndexType elementPointer = DDCType::getRowBegin( *this, this->permutationArray[ row ] ); - const IndexType rowEnd = DDCType::getRowEnd( *this, this->permutationArray[ row ] ); - const IndexType step = DDCType::getElementStep( *this ); - - if( elements > this->rowLengths ) - return false; - for( Index i = 0; i < elements; i++ ) - { - const IndexType column = columnIndexes[ i ]; - if( column < 0 || column >= this->getColumns() ) - return false; - this->columnIndexes[ elementPointer ] = column; - this->values[ elementPointer ] = values[ i ]; - elementPointer += step; - } - for( Index i = elements; i < this->rowLengths; i++ ) - { - this->columnIndexes[ elementPointer ] = this->getPaddingIndex(); - elementPointer += step; - } - return true; -} - -template< typename Real, - typename Device, - typename Index > -bool EllpackSymmetricGraph< Real, Device, Index > :: setRow( const IndexType row, - const IndexType* columnIndexes, - const RealType* values, - const IndexType elements ) -{ - typedef EllpackSymmetricGraphDeviceDependentCode< DeviceType > DDCType; - IndexType elementPointer = DDCType::getRowBegin( *this, this->permutationArray.getElement( row ) ); - const IndexType rowEnd = DDCType::getRowEnd( *this, this->permutationArray.getElement( row ) ); - const IndexType step = DDCType::getElementStep( *this ); - - if( elements > this->rowLengths ) - return false; - - for( IndexType i = 0; i < elements; i++ ) - { - const IndexType column = columnIndexes[ i ]; - if( column < 0 || column >= this->getColumns() ) - return false; - this->columnIndexes.setElement( elementPointer, column ); - this->values.setElement( elementPointer, values[ i ] ); - elementPointer += step; - } - for( IndexType i = elements; i < this->rowLengths; i++ ) - { - this->columnIndexes.setElement( elementPointer, this->getPaddingIndex() ); - elementPointer += step; - } - return true; -} - -template< typename Real, - typename Device, - typename Index > -__cuda_callable__ -bool EllpackSymmetricGraph< Real, Device, Index > :: addRowFast( const IndexType row, - const IndexType* columns, - const RealType* values, - const IndexType numberOfElements, - const RealType& thisElementMultiplicator ) -{ - // TODO: implement - return false; -} - -template< typename Real, - typename Device, - typename Index > -bool EllpackSymmetricGraph< Real, Device, Index > :: addRow( const IndexType row, - const IndexType* columns, - const RealType* values, - const IndexType numberOfElements, - const RealType& thisElementMultiplicator ) -{ - return this->addRowFast( row, columns, values, numberOfElements ); -} - - -template< typename Real, - typename Device, - typename Index > -__cuda_callable__ -Real EllpackSymmetricGraph< Real, Device, Index >::getElementFast( const IndexType row, - const IndexType column ) const -{ - if( row < column ) - return this->getElementFast( column, row ); - - typedef EllpackSymmetricGraphDeviceDependentCode< DeviceType > DDCType; - IndexType elementPtr = DDCType::getRowBegin( *this, this->permutationArray.getElement( row ) ); - const IndexType rowEnd = DDCType::getRowEnd( *this, this->permutationArray.getElement( row ) ); - const IndexType step = DDCType::getElementStep( *this ); - - while( elementPtr < rowEnd && - this->columnIndexes.getElement( elementPtr ) < column && - this->columnIndexes.getElement( elementPtr ) != this->getPaddingIndex() ) elementPtr += step; - if( elementPtr < rowEnd && this->columnIndexes.getElement( elementPtr ) == column ) - return this->values.getElement( elementPtr ); - return 0.0; -} - -template< typename Real, - typename Device, - typename Index > -Real EllpackSymmetricGraph< Real, Device, Index >::getElement( const IndexType row, - const IndexType column ) const -{ - if( row < column ) - return this->getElement( column, row ); - - typedef EllpackSymmetricGraphDeviceDependentCode< DeviceType > DDCType; - IndexType elementPtr = DDCType::getRowBegin( *this, this->permutationArray.getElement( row ) ); - const IndexType rowEnd = DDCType::getRowEnd( *this, this->permutationArray.getElement( row ) ); - const IndexType step = DDCType::getElementStep( *this ); - - while( elementPtr < rowEnd && - this->columnIndexes.getElement( elementPtr ) < column && - this->columnIndexes.getElement( elementPtr ) != this->getPaddingIndex() ) - { - elementPtr += step; - } - if( elementPtr < rowEnd && this->columnIndexes.getElement( elementPtr ) == column ) - return this->values.getElement( elementPtr ); - return 0.0; -} - - -template< typename Real, - typename Device, - typename Index > -__cuda_callable__ -void EllpackSymmetricGraph< Real, Device, Index >::getRowFast( const IndexType row, - IndexType* columns, - RealType* values ) const -{ - typedef EllpackSymmetricGraphDeviceDependentCode< DeviceType > DDCType; - IndexType elementPtr = DDCType::getRowBegin( *this, this->permutationArray[ row ] ); - const IndexType rowEnd = DDCType::getRowEnd( *this, this->permutationArray[ row ] ); - const IndexType step = DDCType::getElementStep( *this ); - - for( IndexType i = 0; i < this->rowLengths; i++ ) - { - columns[ i ] = this->columnIndexes[ elementPtr ]; - values[ i ] = this->values[ elementPtr ]; - elementPtr += step; - } -} - -template< typename Real, - typename Device, - typename Index > -void EllpackSymmetricGraph< Real, Device, Index >::getRow( const IndexType row, - IndexType* columns, - RealType* values ) const -{ - typedef EllpackSymmetricGraphDeviceDependentCode< DeviceType > DDCType; - IndexType elementPtr = DDCType::getRowBegin( *this, this->permutationArray[ row ] ); - const IndexType rowEnd = DDCType::getRowEnd( *this, this->permutationArray[ row ] ); - const IndexType step = DDCType::getElementStep( *this ); - - for( IndexType i = 0; i < this->rowLengths; i++ ) - { - columns[ i ] = this->columnIndexes.getElement( elementPtr ); - values[ i ] = this->values.getElement( elementPtr ); - elementPtr += step; - } -} - -template< typename Real, - typename Device, - typename Index > - template< typename Vector > -__cuda_callable__ -typename Vector::RealType EllpackSymmetricGraph< Real, Device, Index >::rowVectorProduct( const IndexType row, - const Vector& vector ) const -{ - IndexType i = DeviceDependentCode::getRowBegin( *this, row ); - const IndexType rowEnd = DeviceDependentCode::getRowEnd( *this, row ); - const IndexType step = DeviceDependentCode::getElementStep( *this ); - - Real result = 0.0; - while( i < rowEnd && this->columnIndexes[ i ] != this->getPaddingIndex() ) - { - const Index column = this->columnIndexes[ i ]; - result += this->values[ i ] * vector[ column ]; - i += step; - } - return result; -} - -template< typename Real, - typename Device, - typename Index > - template< typename InVector, - typename OutVector > -void EllpackSymmetricGraph< Real, Device, Index >::vectorProduct( const InVector& inVector, - OutVector& outVector ) const -{ - DeviceDependentCode::vectorProduct( *this, inVector, outVector ); -} - -template< typename Real, - typename Device, - typename Index > -void EllpackSymmetricGraph< Real, Device, Index >::save( File& file ) const -{ - Sparse< Real, Device, Index >::save( file); - file.save( &this->rowLengths ); -} - -template< typename Real, - typename Device, - typename Index > -void EllpackSymmetricGraph< Real, Device, Index >::load( File& file ) -{ - Sparse< Real, Device, Index >::load( file); - file.load( &this->rowLengths ); -} - -template< typename Real, - typename Device, - typename Index > -void EllpackSymmetricGraph< Real, Device, Index >::save( const String& fileName ) const -{ - Object::save( fileName ); -} - -template< typename Real, - typename Device, - typename Index > -void EllpackSymmetricGraph< Real, Device, Index >::load( const String& fileName ) -{ - Object::load( fileName ); -} - -template< typename Real, - typename Device, - typename Index > -bool EllpackSymmetricGraph< Real, Device, Index >::help( bool verbose ) -{ - if( !this->rearranged ) - return this->rearrangeMatrix( verbose ); - return true; -} - -template< typename Real, - typename Device, - typename Index > -void EllpackSymmetricGraph< Real, Device, Index >::print( std::ostream& str ) const -{ - for( IndexType row = 0; row < this->getRows(); row++ ) - { - str <<"Row: " << row << " -> "; - IndexType i( row * this->rowLengths ); - const IndexType rowEnd( i + this->rowLengths ); - while( i < rowEnd && - this->columnIndexes.getElement( i ) < this->columns && - this->columnIndexes.getElement( i ) != this->getPaddingIndex() ) - { - const Index column = this->columnIndexes.getElement( i ); - str << " Col:" << column << "->" << this->values.getElement( i ) << "\t"; - i++; - } - str << std::endl; - } -} - -template< typename Real, - typename Device, - typename Index > -void EllpackSymmetricGraph< Real, Device, Index >::allocateElements() -{ - IndexType numberOfMatrixElements = this->alignedRows * this->rowLengths; - - TNL_ASSERT_TRUE( this->alignedRows != 0 && numberOfMatrixElements / this->alignedRows == this->rowLengths, - "Ellpack cannot store this matrix. The number of matrix elements has overflown the value that IndexType is capable of storing" ); - - Sparse< Real, Device, Index >::allocateMatrixElements( this->alignedRows * this->rowLengths ); -} - -template< typename Real, - typename Device, - typename Index > -template< typename InVector, - typename OutVector > -void EllpackSymmetricGraph< Real, Device, Index >::vectorProductHost( const InVector& inVector, - OutVector& outVector ) const -{ - for( IndexType color = 0; color < this->getNumberOfColors(); color++ ) - { - // IndexType colorBegin = this->colorPointers[ color ]; - IndexType offset = this->colorPointers[ color ]; - IndexType colorEnd = this->colorPointers[ color + 1 ]; - for( IndexType j = 0; j < this->getRowsOfColor( color ); j++ ) - { - IndexType row = offset + j; - if( row >= colorEnd ) - break; - IndexType i = DeviceDependentCode::getRowBegin( *this, row ); - const IndexType rowEnd = DeviceDependentCode::getRowEnd( *this, row ); - const IndexType step = DeviceDependentCode::getElementStep( *this ); - const IndexType rowMapping = this->inversePermutationArray[ row ]; - - while( i < rowEnd && this->columnIndexes[ i ] != this->getPaddingIndex() ) - { - const IndexType column = this->columnIndexes[ i ]; - outVector[ rowMapping ] += this->values[ i ] * inVector[ column ]; - if( rowMapping != column ) - outVector[ column ] += this->values[ i ] * inVector[ rowMapping ]; - i += step; - } - } - } -} - -template<> -class EllpackSymmetricGraphDeviceDependentCode< Devices::Host > -{ - public: - - typedef Devices::Host Device; - - template< typename Real, - typename Index > - static Index getRowBegin( const EllpackSymmetricGraph< Real, Device, Index >& matrix, - const Index row ) - { - return row * matrix.rowLengths; - } - - template< typename Real, - typename Index > - static Index getRowEnd( const EllpackSymmetricGraph< Real, Device, Index >& matrix, - const Index row ) - { - return ( row + 1 ) * matrix.rowLengths; - } - - template< typename Real, - typename Index > - static Index getElementStep( const EllpackSymmetricGraph< Real, Device, Index >& matrix ) - { - return 1; - } - - template< typename Real, - typename Index, - typename InVector, - typename OutVector > - static void vectorProduct( const EllpackSymmetricGraph< Real, Device, Index >& matrix, - const InVector& inVector, - OutVector& outVector ) - { - matrix.vectorProductHost( inVector, outVector ); - } -}; - -#ifdef HAVE_CUDA -template< typename Real, - typename Device, - typename Index > -template< typename InVector, - typename OutVector > -__cuda_callable__ -void EllpackSymmetricGraph< Real, Device, Index >::spmvCuda( const InVector& inVector, - OutVector& outVector, - const int globalIdx, - const int color ) const -{ - IndexType offset = this->colorPointers[ color ]; - const IndexType colorEnd = this->colorPointers[ color + 1 ]; - IndexType row = offset + globalIdx; - if( row >= colorEnd ) - return; - - IndexType i = DeviceDependentCode::getRowBegin( *this, row ); - const IndexType rowEnd = DeviceDependentCode::getRowEnd( *this, row ); - const IndexType step = DeviceDependentCode::getElementStep( *this ); - const IndexType rowMapping = this->inversePermutationArray[ row ]; - - while( i < rowEnd && this->columnIndexes[ i ] != this->getPaddingIndex() ) - { - const IndexType column = this->columnIndexes[ i ]; - outVector[ rowMapping ] += this->values[ i ] * inVector[ column ]; - if( rowMapping != column ) - outVector[ column ] += this->values[ i ] * inVector[ rowMapping ]; - i += step; - } -} -#endif - -#ifdef HAVE_CUDA -template< typename Real, - typename Index, - typename InVector, - typename OutVector > -__global__ -void EllpackSymmetricGraphVectorProductCuda( const EllpackSymmetricGraph< Real, Devices::Cuda, Index >* matrix, - const InVector* inVector, - OutVector* outVector, - const int gridIdx, - const int color ) -{ - int globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; - matrix->spmvCuda( *inVector, *outVector, globalIdx, color ); -} -#endif - -template<> -class EllpackSymmetricGraphDeviceDependentCode< Devices::Cuda > -{ - public: - - typedef Devices::Cuda Device; - - template< typename Real, - typename Index > - __cuda_callable__ - static Index getRowBegin( const EllpackSymmetricGraph< Real, Device, Index >& matrix, - const Index row ) - { - return row; - } - - template< typename Real, - typename Index > - __cuda_callable__ - static Index getRowEnd( const EllpackSymmetricGraph< Real, Device, Index >& matrix, - const Index row ) - { - return row + getElementStep( matrix ) * matrix.rowLengths; - } - - template< typename Real, - typename Index > - __cuda_callable__ - static Index getElementStep( const EllpackSymmetricGraph< Real, Device, Index >& matrix ) - { - return matrix.alignedRows; - } - - template< typename Real, - typename Index, - typename InVector, - typename OutVector > - static void vectorProduct( const EllpackSymmetricGraph< Real, Device, Index >& matrix, - const InVector& inVector, - OutVector& outVector ) - { -#ifdef HAVE_CUDA - typedef EllpackSymmetricGraph< Real, Devices::Cuda, Index > Matrix; - typedef typename Matrix::IndexType IndexType; - Matrix* kernel_this = Cuda::passToDevice( matrix ); - InVector* kernel_inVector = Cuda::passToDevice( inVector ); - OutVector* kernel_outVector = Cuda::passToDevice( outVector ); - dim3 cudaBlockSize( 256 ), cudaGridSize( Cuda::getMaxGridSize() ); - for( IndexType color = 0; color < matrix.getNumberOfColors(); color++ ) - { - IndexType rows = matrix.getRowsOfColor( color ); - const IndexType cudaBlocks = roundUpDivision( rows, cudaBlockSize.x ); - const IndexType cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() ); - for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) - { - if( gridIdx == cudaGrids - 1 ) - cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize(); - EllpackSymmetricGraphVectorProductCuda< Real, Index, InVector, OutVector > - <<< cudaGridSize, cudaBlockSize >>> - ( kernel_this, - kernel_inVector, - kernel_outVector, - gridIdx, - color ); - } - } - - Cuda::freeFromDevice( kernel_this ); - Cuda::freeFromDevice( kernel_inVector ); - Cuda::freeFromDevice( kernel_outVector ); - TNL_CHECK_CUDA_DEVICE; -#endif - } -}; - -} //namespace Legacy -} // namespace Matrices -} // namespace TNL diff --git a/src/TNL/Matrices/Legacy/EllpackSymmetric_impl.h b/src/TNL/Matrices/Legacy/EllpackSymmetric_impl.h deleted file mode 100644 index 8bf42b79da148b47b30af6cc446332565489c780..0000000000000000000000000000000000000000 --- a/src/TNL/Matrices/Legacy/EllpackSymmetric_impl.h +++ /dev/null @@ -1,833 +0,0 @@ -/*************************************************************************** - EllpackSymmetric_impl.h - description - ------------------- - begin : Aug 30, 2018 - copyright : (C) 2018 by Tomas Oberhuber - email : tomas.oberhuber@fjfi.cvut.cz - ***************************************************************************/ - -/* See Copyright Notice in tnl/Copyright */ - -#pragma once - -#include -#include -#include -#include - -namespace TNL { -namespace Matrices { - namespace Legacy { - -template< typename Real, - typename Device, - typename Index > -EllpackSymmetric< Real, Device, Index > :: EllpackSymmetric() -: rowLengths( 0 ), alignedRows( 0 ) -{ -}; - -template< typename Real, - typename Device, - typename Index > -String EllpackSymmetric< Real, Device, Index > :: getType() -{ - return String( "Matrices::EllpackSymmetric< ") + - String( TNL::getType< Real >() ) + - String( ", " ) + - String( Device::getDeviceType() ) + - String( ", " ) + - String( TNL::getType< Index >() ) + - String( " >" ); -} - -template< typename Real, - typename Device, - typename Index > -String EllpackSymmetric< Real, Device, Index >::getTypeVirtual() const -{ - return this->getType(); -} - -template< typename Real, - typename Device, - typename Index > -void EllpackSymmetric< Real, Device, Index >::setDimensions( const IndexType rows, - const IndexType columns ) -{ - TNL_ASSERT( rows > 0 && columns > 0, - std::cerr << "rows = " << rows - << " columns = " << columns <rows = rows; - this->columns = columns; - - if( std::is_same< DeviceType, Devices::Cuda >::value ) - { - this->alignedRows = roundToMultiple( columns, Devices::Cuda::getWarpSize() ); - - if( this->rows - this->alignedRows > 0 ) - { - IndexType missingRows = this->rows - this->alignedRows; - missingRows = roundToMultiple( missingRows, Devices::Cuda::getWarpSize() ); - this->alignedRows += missingRows; - -// this->alignedRows += roundToMultiple( this->rows - this->alignedRows, Devices::Cuda::getWarpSize() ); - } - } - else this->alignedRows = rows; - - if( this->rowLengths != 0 ) - allocateElements(); -} - -template< typename Real, - typename Device, - typename Index > -void EllpackSymmetric< Real, Device, Index >::setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths ) -{ - TNL_ASSERT( this->getRows() > 0, ); - TNL_ASSERT( this->getColumns() > 0, ); - //TNL_ASSERT( this->rowLengths > 0, - // std::cerr << "this->rowLengths = " << this->rowLengths ); - this->rowLengths = this->maxRowLength = max( rowLengths ); - allocateElements(); -} - -template< typename Real, - typename Device, - typename Index > -bool EllpackSymmetric< Real, Device, Index >::setConstantRowLengths( const IndexType& rowLengths ) -{ - TNL_ASSERT( rowLengths > 0, - std::cerr << " rowLengths = " << rowLengths ); - this->rowLengths = rowLengths; - if( this->rows > 0 ) - allocateElements(); - return true; -} - -template< typename Real, - typename Device, - typename Index > -Index EllpackSymmetric< Real, Device, Index >::getRowLength( const IndexType row ) const -{ - return this->rowLengths; -} - -template< typename Real, - typename Device, - typename Index > - template< typename Real2, - typename Device2, - typename Index2 > -bool EllpackSymmetric< Real, Device, Index >::setLike( const EllpackSymmetric< Real2, Device2, Index2 >& matrix ) -{ - if( ! Sparse< Real, Device, Index >::setLike( matrix ) ) - return false; - this->rowLengths = matrix.rowLengths; - return true; -} - -template< typename Real, - typename Device, - typename Index > -void EllpackSymmetric< Real, Device, Index > :: reset() -{ - Sparse< Real, Device, Index >::reset(); - this->rowLengths = 0; -} - -template< typename Real, - typename Device, - typename Index > - template< typename Real2, - typename Device2, - typename Index2 > -bool EllpackSymmetric< Real, Device, Index >::operator == ( const EllpackSymmetric< Real2, Device2, Index2 >& matrix ) const -{ - TNL_ASSERT( this->getRows() == matrix.getRows() && - this->getColumns() == matrix.getColumns(), - std::cerr << "this->getRows() = " << this->getRows() - << " matrix.getRows() = " << matrix.getRows() - << " this->getColumns() = " << this->getColumns() - << " matrix.getColumns() = " << matrix.getColumns() - << " this->getName() = " << this->getName() - << " matrix.getName() = " << matrix.getName() ); - // TODO: implement this - throw Exceptions::NotImplementedError( "EllpackSymmetric::operator== is not implemented." ); -} - -template< typename Real, - typename Device, - typename Index > - template< typename Real2, - typename Device2, - typename Index2 > -bool EllpackSymmetric< Real, Device, Index >::operator != ( const EllpackSymmetric< Real2, Device2, Index2 >& matrix ) const -{ - return ! ( ( *this ) == matrix ); -} - -/*template< typename Real, - typename Device, - typename Index > - template< typename Matrix > -bool EllpackSymmetric< Real, Device, Index >::copyFrom( const Matrix& matrix, - const CompressedRowLengthsVector& rowLengths ) -{ - return tnlMatrix< RealType, DeviceType, IndexType >::copyFrom( matrix, rowLengths ); -}*/ - -template< typename Real, - typename Device, - typename Index > -__cuda_callable__ -bool EllpackSymmetric< Real, Device, Index > :: setElementFast( const IndexType row, - const IndexType column, - const Real& value ) -{ - return this->addElementFast( row, column, value, 0.0 ); -} - -template< typename Real, - typename Device, - typename Index > -bool EllpackSymmetric< Real, Device, Index > :: setElement( const IndexType row, - const IndexType column, - const Real& value ) -{ - return this->addElement( row, column, value, 0.0 ); -} - - -template< typename Real, - typename Device, - typename Index > -__cuda_callable__ -bool EllpackSymmetric< Real, Device, Index > :: addElementFast( const IndexType row, - const IndexType column, - const RealType& value, - const RealType& thisElementMultiplicator ) -{ - // TODO: return this back when CUDA kernels supportstd::cerr - /*TNL_ASSERT( row >= 0 && row < this->rows && - column >= 0 && column <= this->rows, - std::cerr << " row = " << row - << " column = " << column - << " this->rows = " << this->rows - << " this->columns = " << this-> columns );*/ - typedef EllpackSymmetricDeviceDependentCode< DeviceType > DDCType; - IndexType i = DDCType::getRowBegin( *this, row ); - const IndexType rowEnd = DDCType::getRowEnd( *this, row ); - const IndexType step = DDCType::getElementStep( *this ); - - while( i < rowEnd && - this->columnIndexes[ i ] < column && - this->columnIndexes[ i ] != this->getPaddingIndex() ) i += step; - if( i == rowEnd ) - return false; - if( this->columnIndexes[ i ] == column ) - { - this->values[ i ] = thisElementMultiplicator * this->values[ i ] + value; - return true; - } - else - if( this->columnIndexes[ i ] == this->getPaddingIndex() ) // artificial zero - { - this->columnIndexes[ i ] = column; - this->values[ i ] = value; - } - else - { - Index j = rowEnd - step; - while( j > i ) - { - this->columnIndexes[ j ] = this->columnIndexes[ j - step ]; - this->values[ j ] = this->values[ j - step ]; - j -= step; - } - this->columnIndexes[ i ] = column; - this->values[ i ] = value; - } - return true; -} - -template< typename Real, - typename Device, - typename Index > -bool EllpackSymmetric< Real, Device, Index > :: addElement( const IndexType row, - const IndexType column, - const RealType& value, - const RealType& thisElementMultiplicator ) -{ - typedef EllpackSymmetricDeviceDependentCode< DeviceType > DDCType; - IndexType i = DDCType::getRowBegin( *this, row ); - const IndexType rowEnd = DDCType::getRowEnd( *this, row ); - const IndexType step = DDCType::getElementStep( *this ); - - while( i < rowEnd && - this->columnIndexes.getElement( i ) < column && - this->columnIndexes.getElement( i ) != this->getPaddingIndex() ) i += step; - if( i == rowEnd ) - return false; - if( this->columnIndexes.getElement( i ) == column ) - { - this->values.setElement( i, thisElementMultiplicator * this->values.getElement( i ) + value ); - return true; - } - else - if( this->columnIndexes.getElement( i ) == this->getPaddingIndex() ) - { - this->columnIndexes.setElement( i, column ); - this->values.setElement( i, value ); - } - else - { - IndexType j = rowEnd - step; - while( j > i ) - { - this->columnIndexes.setElement( j, this->columnIndexes.getElement( j - step ) ); - this->values.setElement( j, this->values.getElement( j - step ) ); - j -= step; - } - this->columnIndexes.setElement( i, column ); - this->values.setElement( i, value ); - } - return true; -} - -template< typename Real, - typename Device, - typename Index > -__cuda_callable__ -bool EllpackSymmetric< Real, Device, Index > :: setRowFast( const IndexType row, - const IndexType* columnIndexes, - const RealType* values, - const IndexType elements ) -{ - typedef EllpackSymmetricDeviceDependentCode< DeviceType > DDCType; - IndexType elementPointer = DDCType::getRowBegin( *this, row ); - const IndexType rowEnd = DDCType::getRowEnd( *this, row ); - const IndexType step = DDCType::getElementStep( *this ); - - if( elements > this->rowLengths ) - return false; - for( Index i = 0; i < elements; i++ ) - { - const IndexType column = columnIndexes[ i ]; - if( column < 0 || column >= this->getColumns() ) - return false; - this->columnIndexes[ elementPointer ] = column; - this->values[ elementPointer ] = values[ i ]; - elementPointer += step; - } - for( Index i = elements; i < this->rowLengths; i++ ) - { - this->columnIndexes[ elementPointer ] = this->getPaddingIndex(); - elementPointer += step; - } - return true; -} - -template< typename Real, - typename Device, - typename Index > -bool EllpackSymmetric< Real, Device, Index > :: setRow( const IndexType row, - const IndexType* columnIndexes, - const RealType* values, - const IndexType elements ) -{ - typedef EllpackSymmetricDeviceDependentCode< DeviceType > DDCType; - IndexType elementPointer = DDCType::getRowBegin( *this, row ); - const IndexType rowEnd = DDCType::getRowEnd( *this, row ); - const IndexType step = DDCType::getElementStep( *this ); - - if( elements > this->rowLengths ) - return false; - - for( IndexType i = 0; i < elements; i++ ) - { - const IndexType column = columnIndexes[ i ]; - if( column < 0 || column >= this->getColumns() ) - return false; - this->columnIndexes.setElement( elementPointer, column ); - this->values.setElement( elementPointer, values[ i ] ); - elementPointer += step; - } - for( IndexType i = elements; i < this->rowLengths; i++ ) - { - this->columnIndexes.setElement( elementPointer, this->getPaddingIndex() ); - elementPointer += step; - } - return true; -} - -template< typename Real, - typename Device, - typename Index > -__cuda_callable__ -bool EllpackSymmetric< Real, Device, Index > :: addRowFast( const IndexType row, - const IndexType* columns, - const RealType* values, - const IndexType numberOfElements, - const RealType& thisElementMultiplicator ) -{ - // TODO: implement - return false; -} - -template< typename Real, - typename Device, - typename Index > -bool EllpackSymmetric< Real, Device, Index > :: addRow( const IndexType row, - const IndexType* columns, - const RealType* values, - const IndexType numberOfElements, - const RealType& thisElementMultiplicator ) -{ - return this->addRowFast( row, columns, values, numberOfElements ); -} - - -template< typename Real, - typename Device, - typename Index > -__cuda_callable__ -Real EllpackSymmetric< Real, Device, Index >::getElementFast( const IndexType row, - const IndexType column ) const -{ - if( row < column ) - return this->getElementFast( column, row ); - - typedef EllpackSymmetricDeviceDependentCode< DeviceType > DDCType; - IndexType elementPtr = DDCType::getRowBegin( *this, row ); - const IndexType rowEnd = DDCType::getRowEnd( *this, row ); - const IndexType step = DDCType::getElementStep( *this ); - - while( elementPtr < rowEnd && - this->columnIndexes[ elementPtr ] < column && - this->columnIndexes[ elementPtr ] != this->getPaddingIndex() ) elementPtr += step; - if( elementPtr < rowEnd && this->columnIndexes[ elementPtr ] == column ) - return this->values[ elementPtr ]; - return 0.0; -} - -template< typename Real, - typename Device, - typename Index > -Real EllpackSymmetric< Real, Device, Index >::getElement( const IndexType row, - const IndexType column ) const -{ - if( row < column ) - return this->getElement( column, row ); - - typedef EllpackSymmetricDeviceDependentCode< DeviceType > DDCType; - IndexType elementPtr = DDCType::getRowBegin( *this, row ); - const IndexType rowEnd = DDCType::getRowEnd( *this, row ); - const IndexType step = DDCType::getElementStep( *this ); - - while( elementPtr < rowEnd && - this->columnIndexes.getElement( elementPtr ) < column && - this->columnIndexes.getElement( elementPtr ) != this->getPaddingIndex() ) elementPtr += step; - if( elementPtr < rowEnd && this->columnIndexes.getElement( elementPtr ) == column ) - return this->values.getElement( elementPtr ); - return 0.0; -} - - -template< typename Real, - typename Device, - typename Index > -__cuda_callable__ -void EllpackSymmetric< Real, Device, Index >::getRowFast( const IndexType row, - IndexType* columns, - RealType* values ) const -{ - typedef EllpackSymmetricDeviceDependentCode< DeviceType > DDCType; - IndexType elementPtr = DDCType::getRowBegin( *this, row ); - const IndexType rowEnd = DDCType::getRowEnd( *this, row ); - const IndexType step = DDCType::getElementStep( *this ); - - for( IndexType i = 0; i < this->rowLengths; i++ ) - { - columns[ i ] = this->columnIndexes[ elementPtr ]; - values[ i ] = this->values[ elementPtr ]; - elementPtr += step; - } -} - -template< typename Real, - typename Device, - typename Index > -void EllpackSymmetric< Real, Device, Index >::getRow( const IndexType row, - IndexType* columns, - RealType* values ) const -{ - typedef EllpackSymmetricDeviceDependentCode< DeviceType > DDCType; - IndexType elementPtr = DDCType::getRowBegin( *this, row ); - const IndexType rowEnd = DDCType::getRowEnd( *this, row ); - const IndexType step = DDCType::getElementStep( *this ); - - for( IndexType i = 0; i < this->rowLengths; i++ ) - { - columns[ i ] = this->columnIndexes.getElement( elementPtr ); - values[ i ] = this->values.getElement( elementPtr ); - elementPtr += step; - } -} - - - -template< typename Real, - typename Device, - typename Index > - template< typename InVector, - typename OutVector > -void EllpackSymmetric< Real, Device, Index >::vectorProduct( const InVector& inVector, - OutVector& outVector ) const -{ - DeviceDependentCode::vectorProduct( *this, inVector, outVector ); -} - -template< typename Real, - typename Device, - typename Index > - template< typename Real2, - typename Index2 > -void EllpackSymmetric< Real, Device, Index > :: addMatrix( const EllpackSymmetric< Real2, Device, Index2 >& matrix, - const RealType& matrixMultiplicator, - const RealType& thisMatrixMultiplicator ) -{ - throw Exceptions::NotImplementedError( "EllpackSymmetric::addMatrix is not implemented." ); - // TODO: implement -} - -template< typename Real, - typename Device, - typename Index > - template< typename Real2, - typename Index2 > -void EllpackSymmetric< Real, Device, Index >::getTransposition( const EllpackSymmetric< Real2, Device, Index2 >& matrix, - const RealType& matrixMultiplicator ) -{ - throw Exceptions::NotImplementedError( "EllpackSymmetric::getTransposition is not implemented." ); - // TODO: implement -} - -template< typename Real, - typename Device, - typename Index > - template< typename Vector > -bool EllpackSymmetric< Real, Device, Index > :: performSORIteration( const Vector& b, - const IndexType row, - Vector& x, - const RealType& omega ) const -{ - TNL_ASSERT( row >=0 && row < this->getRows(), - std::cerr << "row = " << row - << " this->getRows() = " << this->getRows() - << " this->getName() = " << this->getName() <rowLengths ); - const IndexType rowEnd( i + this->rowLengths ); - IndexType column; - while( i < rowEnd && ( column = this->columnIndexes[ i ] ) < this->columns ) - { - if( column == row ) - diagonalValue = this->values.getElement( i ); - else - sum += this->values.getElement( row * this->diagonalsShift.getSize() + i ) * x. getElement( column ); - i++; - } - if( diagonalValue == ( Real ) 0.0 ) - { - std::cerr << "There is zero on the diagonal in " << row << "-th row of thge matrix " << this->getName() << ". I cannot perform SOR iteration." < -void EllpackSymmetric< Real, Device, Index >::save( File& file ) const -{ - Sparse< Real, Device, Index >::save( file); - file.save( &this->rowLengths ); -} - -template< typename Real, - typename Device, - typename Index > -void EllpackSymmetric< Real, Device, Index >::load( File& file ) -{ - Sparse< Real, Device, Index >::load( file); - file.load( &this->rowLengths ); -} - -template< typename Real, - typename Device, - typename Index > -void EllpackSymmetric< Real, Device, Index >::save( const String& fileName ) const -{ - Object::save( fileName ); -} - -template< typename Real, - typename Device, - typename Index > -void EllpackSymmetric< Real, Device, Index >::load( const String& fileName ) -{ - Object::load( fileName ); -} - -template< typename Real, - typename Device, - typename Index > -void EllpackSymmetric< Real, Device, Index >::print( std::ostream& str ) const -{ - for( IndexType row = 0; row < this->getRows(); row++ ) - { - str <<"Row: " << row << " -> "; - IndexType i( row * this->rowLengths ); - const IndexType rowEnd( i + this->rowLengths ); - while( i < rowEnd && - this->columnIndexes.getElement( i ) < this->columns && - this->columnIndexes.getElement( i ) != this->getPaddingIndex() ) - { - const Index column = this->columnIndexes.getElement( i ); - str << " Col:" << column << "->" << this->values.getElement( i ) << "\t"; - i++; - } - str < -void EllpackSymmetric< Real, Device, Index >::allocateElements() -{ - IndexType numberOfMatrixElements = this->alignedRows * this->rowLengths; - - TNL_ASSERT_TRUE( this->alignedRows != 0 && numberOfMatrixElements / this->alignedRows == this->rowLengths, - "Ellpack cannot store this matrix. The number of matrix elements has overflown the value that IndexType is capable of storing" ); - - Sparse< Real, Device, Index >::allocateMatrixElements( this->alignedRows * this->rowLengths ); -} - -template<> -class EllpackSymmetricDeviceDependentCode< Devices::Host > -{ - public: - - typedef Devices::Host Device; - - template< typename Real, - typename Index > - static Index getRowBegin( const EllpackSymmetric< Real, Device, Index >& matrix, - const Index row ) - { - return row * matrix.rowLengths; - } - - template< typename Real, - typename Index > - static Index getRowEnd( const EllpackSymmetric< Real, Device, Index >& matrix, - const Index row ) - { - //return row * matrix.rowLengths + row + 1; - return min(row * matrix.rowLengths + row + 1, ( row + 1 ) * matrix.rowLengths ); - } - - template< typename Real, - typename Index > - static Index getElementStep( const EllpackSymmetric< Real, Device, Index >& matrix ) - { - return 1; - } - - template< typename Real, - typename Index, - typename InVector, - typename OutVector > - static void vectorProduct( const EllpackSymmetric< Real, Device, Index >& matrix, - const InVector& inVector, - OutVector& outVector ) - { - matrix.vectorProductHost( inVector, outVector ); - } - -}; - -template< typename Real, - typename Device, - typename Index > -template< typename InVector, - typename OutVector > -void EllpackSymmetric< Real, Device, Index >::vectorProductHost( const InVector& inVector, - OutVector& outVector ) const -{ - for( Index row = 0; row < this->getRows(); row++ ) - { - IndexType i = DeviceDependentCode::getRowBegin( *this, row ); - const IndexType rowEnd = DeviceDependentCode::getRowEnd( *this, row ); - const IndexType step = DeviceDependentCode::getElementStep( *this ); - - while( i < rowEnd && this->columnIndexes[ i ] != this->getPaddingIndex() ) - { - const IndexType column = this->columnIndexes[ i ]; - outVector[ row ] += this->values[ i ] * inVector[ column ]; - if( row != column ) - outVector[ column ] += this->values[ i ] * inVector[ row ]; - i += step; - } - } -}; - -template< typename Real, - typename Device, - typename Index > -template< typename Vector > -__cuda_callable__ -typename Vector::RealType EllpackSymmetric< Real, Device, Index >::rowVectorProduct( const IndexType row, - const Vector& vector ) const -{ - IndexType i = DeviceDependentCode::getRowBegin( *this, row ); - const IndexType rowEnd = DeviceDependentCode::getRowEnd( *this, row ); - const IndexType step = DeviceDependentCode::getElementStep( *this ); - - Real result = 0.0; - while( i < rowEnd && this->columnIndexes[ i ] != this->getPaddingIndex() ) - { - const Index column = this->columnIndexes[ i ]; - result += this->values[ i ] * vector[ column ]; - i += step; - } - return result; -} - -#ifdef HAVE_CUDA -template< typename Real, - typename Device, - typename Index > -template< typename InVector, - typename OutVector > -__cuda_callable__ -void EllpackSymmetric< Real, Device, Index >::spmvCuda( const InVector& inVector, - OutVector& outVector, - int rowId ) const -{ - IndexType i = DeviceDependentCode::getRowBegin( *this, rowId ); - const IndexType rowEnd = DeviceDependentCode::getRowEnd( *this, rowId ); - const IndexType step = DeviceDependentCode::getElementStep( *this ); - - while( i < rowEnd && this->columnIndexes[ i ] != this->getPaddingIndex() ) - { - const IndexType column = this->columnIndexes[ i ]; - outVector[ rowId ] += this->values[ i ] * inVector[ column ]; - if( rowId != column ) - outVector[ column ] += this->values[ i ] * inVector[ rowId ]; - i += step; - } -}; -#endif - -#ifdef HAVE_CUDA -template< typename Real, - typename Index, - typename InVector, - typename OutVector > -__global__ -void EllpackSymmetricVectorProductCuda( const EllpackSymmetric< Real, Devices::Cuda, Index >* matrix, - const InVector* inVector, - OutVector* outVector, - const int gridIdx ) -{ - int globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; - if( globalIdx >= matrix->getRows() ) - return; - matrix->spmvCuda( *inVector, *outVector, globalIdx ); -}; -#endif - -template<> -class EllpackSymmetricDeviceDependentCode< Devices::Cuda > -{ - public: - - typedef Devices::Cuda Device; - - template< typename Real, - typename Index > - __cuda_callable__ - static Index getRowBegin( const EllpackSymmetric< Real, Device, Index >& matrix, - const Index row ) - { - return row; - } - - template< typename Real, - typename Index > - __cuda_callable__ - static Index getRowEnd( const EllpackSymmetric< Real, Device, Index >& matrix, - const Index row ) - { - // TODO: fix this: return row + getElementStep( matrix ) * matrix.rowLengths; - return min( row + getElementStep( matrix ) * matrix.rowLengths, row + ( row + 1 ) * getElementStep( matrix ) ); - } - - template< typename Real, - typename Index > - __cuda_callable__ - static Index getElementStep( const EllpackSymmetric< Real, Device, Index >& matrix ) - { - return matrix.alignedRows; - } - - template< typename Real, - typename Index, - typename InVector, - typename OutVector > - static void vectorProduct( const EllpackSymmetric< Real, Device, Index >& matrix, - const InVector& inVector, - OutVector& outVector ) - { -#ifdef HAVE_CUDA - typedef EllpackSymmetric< Real, Devices::Cuda, Index > Matrix; - typedef typename Matrix::IndexType IndexType; - Matrix* kernel_this = Cuda::passToDevice( matrix ); - InVector* kernel_inVector = Cuda::passToDevice( inVector ); - OutVector* kernel_outVector = Cuda::passToDevice( outVector ); - dim3 cudaBlockSize( 256 ), cudaGridSize( Cuda::getMaxGridSize() ); - const IndexType cudaBlocks = roundUpDivision( matrix.getRows(), cudaBlockSize.x ); - const IndexType cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() ); - for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) - { - if( gridIdx == cudaGrids - 1 ) - cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize(); - const int sharedMemory = cudaBlockSize.x * sizeof( Real ); - EllpackSymmetricVectorProductCuda< Real, Index, InVector, OutVector > - <<< cudaGridSize, cudaBlockSize, sharedMemory >>> - ( kernel_this, - kernel_inVector, - kernel_outVector, - gridIdx ); - } - Cuda::freeFromDevice( kernel_this ); - Cuda::freeFromDevice( kernel_inVector ); - Cuda::freeFromDevice( kernel_outVector ); - TNL_CHECK_CUDA_DEVICE; -#endif - } -}; - -} //namespace Legacy -} // namespace Matrices -} // namespace TNL diff --git a/src/TNL/Matrices/Legacy/SlicedEllpackSymmetric.h b/src/TNL/Matrices/Legacy/SlicedEllpackSymmetric.h deleted file mode 100644 index 99ac3562e94bc30510c70198da4997871f145ff1..0000000000000000000000000000000000000000 --- a/src/TNL/Matrices/Legacy/SlicedEllpackSymmetric.h +++ /dev/null @@ -1,210 +0,0 @@ -/*************************************************************************** - SlocedEllpackSymmetric.h - description - ------------------- - begin : Aug 30, 2018 - copyright : (C) 2018 by Tomas Oberhuber - email : tomas.oberhuber@fjfi.cvut.cz - ***************************************************************************/ - -/* See Copyright Notice in tnl/Copyright */ - -#pragma once - -#include -#include - -namespace TNL { -namespace Matrices { - namespace Legacy { - -template< typename Device > -class SlicedEllpackSymmetricDeviceDependentCode; - -template< typename Real = double, - typename Device = Devices::Host, - typename Index = int, - int SliceSize = 32 > -class SlicedEllpackSymmetric; - -#ifdef HAVE_CUDA -template< typename Real, - typename Index, - int SliceSize > -__global__ void SlicedEllpackSymmetric_computeMaximalRowLengthInSlices_CudaKernel( SlicedEllpack< Real, Devices::Cuda, Index, SliceSize >* matrix, - typename SlicedEllpackSymmetric< Real, Devices::Cuda, Index, SliceSize >::ConstCompressedRowLengthsVectorView rowLengths, - int gridIdx ); -#endif - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -class SlicedEllpackSymmetric : public Sparse< Real, Device, Index > -{ - public: - - typedef Real RealType; - typedef Device DeviceType; - typedef Index IndexType; - typedef typename Sparse< RealType, DeviceType, IndexType >::CompressedRowLengthsVector CompressedRowLengthsVector; - typedef typename Sparse< RealType, DeviceType, IndexType >::ConstCompressedRowLengthsVectorView ConstCompressedRowLengthsVectorView; - typedef typename Sparse< RealType, DeviceType, IndexType >::ValuesVector ValuesVector; - typedef typename Sparse< RealType, DeviceType, IndexType >::ColumnIndexesVector ColumnIndexesVector; - - template< typename _Real = Real, - typename _Device = Device, - typename _Index = Index, - int _SliceSize = SliceSize > - using Self = SlicedEllpackSymmetric< _Real, _Device, _Index, _SliceSize >; - - SlicedEllpackSymmetric(); - - void setDimensions( const IndexType rows, - const IndexType columns ); - - void setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths ); - - IndexType getRowLength( const IndexType row ) const; - - template< typename Real2, typename Device2, typename Index2 > - bool setLike( const SlicedEllpackSymmetric< Real2, Device2, Index2, SliceSize >& matrix ); - - void reset(); - - template< typename Real2, typename Device2, typename Index2 > - bool operator == ( const SlicedEllpackSymmetric< Real2, Device2, Index2 >& matrix ) const; - - template< typename Real2, typename Device2, typename Index2 > - bool operator != ( const SlicedEllpackSymmetric< Real2, Device2, Index2 >& matrix ) const; - - __cuda_callable__ - bool setElementFast( const IndexType row, - const IndexType column, - const RealType& value ); - - bool setElement( const IndexType row, - const IndexType column, - const RealType& value ); - - __cuda_callable__ - bool addElementFast( const IndexType row, - const IndexType column, - const RealType& value, - const RealType& thisElementMultiplicator = 1.0 ); - - bool addElement( const IndexType row, - const IndexType column, - const RealType& value, - const RealType& thisElementMultiplicator = 1.0 ); - - __cuda_callable__ - bool setRowFast( const IndexType row, - const IndexType* columnIndexes, - const RealType* values, - const IndexType elements ); - - bool setRow( const IndexType row, - const IndexType* columnIndexes, - const RealType* values, - const IndexType elements ); - - __cuda_callable__ - bool addRowFast( const IndexType row, - const IndexType* columns, - const RealType* values, - const IndexType numberOfElements, - const RealType& thisElementMultiplicator = 1.0 ); - - bool addRow( const IndexType row, - const IndexType* columns, - const RealType* values, - const IndexType numberOfElements, - const RealType& thisElementMultiplicator = 1.0 ); - - __cuda_callable__ - RealType getElementFast( const IndexType row, - const IndexType column ) const; - - RealType getElement( const IndexType row, - const IndexType column ) const; - - - __cuda_callable__ - void getRowFast( const IndexType row, - IndexType* columns, - RealType* values ) const; - - void getRow( const IndexType row, - IndexType* columns, - RealType* values ) const; - - template< typename InVector, - typename OutVector > - __cuda_callable__ - void rowVectorProduct( const IndexType row, - const InVector& inVector, - OutVector& outVector ) const; - - template< typename InVector, - typename OutVector > - void vectorProduct( const InVector& inVector, - OutVector& outVector ) const; - - template< typename InVector, - typename OutVector > - __cuda_callable__ - void spmvCuda( const InVector& inVector, - OutVector& outVector, - int globalIdx ) const; - - template< typename Real2, typename Index2 > - void addMatrix( const SlicedEllpackSymmetric< Real2, Device, Index2 >& matrix, - const RealType& matrixMultiplicator = 1.0, - const RealType& thisMatrixMultiplicator = 1.0 ); - - template< typename Real2, typename Index2 > - void getTransposition( const SlicedEllpackSymmetric< Real2, Device, Index2 >& matrix, - const RealType& matrixMultiplicator = 1.0 ); - - template< typename Vector > - bool performSORIteration( const Vector& b, - const IndexType row, - Vector& x, - const RealType& omega = 1.0 ) const; - - void save( File& file ) const; - - void load( File& file ); - - void save( const String& fileName ) const; - - void load( const String& fileName ); - - void print( std::ostream& str ) const; - - protected: - - Containers::Vector< Index, Device, Index > slicePointers, sliceRowLengths; - - typedef SlicedEllpackSymmetricDeviceDependentCode< DeviceType > DeviceDependentCode; - friend class SlicedEllpackSymmetricDeviceDependentCode< DeviceType >; -#ifdef HAVE_CUDA - /*friend __global__ void SlicedEllpackSymmetric_computeMaximalRowLengthInSlices_CudaKernel< Real, Index, SliceSize >( SlicedEllpackMatrix< Real, Devices::Cuda, Index, SliceSize >* matrix, - const typename SlicedEllpackSymmetric< Real, Devices::Cuda, Index, SliceSize >::RowLengthsVector* rowLengths, - int gridIdx ); - */ - // TODO: The friend declaration above does not work because of __global__ storage specifier. Therefore we declare the following method as public. Fix this, when possible. - - public: - __device__ void computeMaximalRowLengthInSlicesCuda( ConstCompressedRowLengthsVectorView rowLengths, - const IndexType sliceIdx ); - -#endif - -}; - -} //namespace Legacy -} // namespace Matrices -} // namespace TNL - -#include diff --git a/src/TNL/Matrices/Legacy/SlicedEllpackSymmetricGraph.h b/src/TNL/Matrices/Legacy/SlicedEllpackSymmetricGraph.h deleted file mode 100644 index b7ee87235d3d56091d28f4ed14689867f605a55c..0000000000000000000000000000000000000000 --- a/src/TNL/Matrices/Legacy/SlicedEllpackSymmetricGraph.h +++ /dev/null @@ -1,242 +0,0 @@ -/*************************************************************************** - SlicedEllpackSymmetricGraph.h - description - ------------------- - begin : Aug 30, 2018 - copyright : (C) 2018 by Tomas Oberhuber - email : tomas.oberhuber@fjfi.cvut.cz - ***************************************************************************/ - -/* See Copyright Notice in tnl/Copyright */ - -#pragma once - -#include -#include - -namespace TNL { -namespace Matrices { - namespace Legacy { - -template< typename Device > -class SlicedEllpackSymmetricGraphDeviceDependentCode; - -template< typename Real = double, - typename Device = Devices::Host, - typename Index = int, - int SliceSize = 32 > -class SlicedEllpackSymmetricGraph; - -#ifdef HAVE_CUDA -template< typename Real, - typename Index, - int SliceSize > -__global__ void SlicedEllpackSymmetricGraph_computeMaximalRowLengthInSlices_CudaKernel( SlicedEllpackSymmetricGraph< Real, Devices::Cuda, Index, SliceSize >* matrix, - typename SlicedEllpackSymmetricGraph< Real, Devices::Cuda, Index, SliceSize >::ConstCompressedRowLengthsVectorView rowLengths, - int gridIdx ); -#endif - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -class SlicedEllpackSymmetricGraph : public Sparse< Real, Device, Index > -{ - public: - - typedef Real RealType; - typedef Device DeviceType; - typedef Index IndexType; - typedef typename Sparse< RealType, DeviceType, IndexType >::CompressedRowLengthsVector CompressedRowLengthsVector; - typedef typename Sparse< RealType, DeviceType, IndexType >::ConstCompressedRowLengthsVectorView ConstCompressedRowLengthsVectorView; - typedef typename Sparse< RealType, DeviceType, IndexType >::ValuesVector ValuesVector; - typedef typename Sparse< RealType, DeviceType, IndexType >::ColumnIndexesVector ColumnIndexesVector; - - template< typename _Real = Real, - typename _Device = Device, - typename _Index = Index, - int _SliceSize = SliceSize > - using Self = SlicedEllpackSymmetricGraph< _Real, _Device, _Index, _SliceSize >; - - SlicedEllpackSymmetricGraph(); - - void setDimensions( const IndexType rows, - const IndexType columns ); - - void setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths ); - - IndexType getRowLength( const IndexType row ) const; - - template< typename Real2, typename Device2, typename Index2 > - bool setLike( const SlicedEllpackSymmetricGraph< Real2, Device2, Index2, SliceSize >& matrix ); - - void reset(); - - template< typename Real2, typename Device2, typename Index2 > - bool operator == ( const SlicedEllpackSymmetricGraph< Real2, Device2, Index2 >& matrix ) const; - - template< typename Real2, typename Device2, typename Index2 > - bool operator != ( const SlicedEllpackSymmetricGraph< Real2, Device2, Index2 >& matrix ) const; - - template< typename InVector, - typename OutVector > - void vectorProductHost( const InVector& inVector, OutVector& outVector ) const; - - __cuda_callable__ - bool setElementFast( const IndexType row, - const IndexType column, - const RealType& value ); - - bool setElement( const IndexType row, - const IndexType column, - const RealType& value ); - - __cuda_callable__ - bool addElementFast( const IndexType row, - const IndexType column, - const RealType& value, - const RealType& thisElementMultiplicator = 1.0 ); - - bool addElement( const IndexType row, - const IndexType column, - const RealType& value, - const RealType& thisElementMultiplicator = 1.0 ); - - __cuda_callable__ - bool setRowFast( const IndexType row, - const IndexType* columnIndexes, - const RealType* values, - const IndexType elements ); - - bool setRow( const IndexType row, - const IndexType* columnIndexes, - const RealType* values, - const IndexType elements ); - - __cuda_callable__ - bool addRowFast( const IndexType row, - const IndexType* columns, - const RealType* values, - const IndexType numberOfElements, - const RealType& thisElementMultiplicator = 1.0 ); - - bool addRow( const IndexType row, - const IndexType* columns, - const RealType* values, - const IndexType numberOfElements, - const RealType& thisElementMultiplicator = 1.0 ); - - __cuda_callable__ - RealType getElementFast( const IndexType row, - const IndexType column ) const; - - RealType getElement( const IndexType row, - const IndexType column ) const; - - __cuda_callable__ - void getRowFast( const IndexType row, - IndexType* columns, - RealType* values ) const; - - void getRow( const IndexType row, - IndexType* columns, - RealType* values ) const; - - template< typename Vector > - __cuda_callable__ - typename Vector::RealType rowVectorProduct( const IndexType row, - const Vector& vector ) const; - - template< typename InVector, - typename OutVector > - void vectorProduct( const InVector& inVector, - OutVector& outVector ) const; - - template< typename Real2, typename Index2 > - void addMatrix( const SlicedEllpackSymmetricGraph< Real2, Device, Index2 >& matrix, - const RealType& matrixMultiplicator = 1.0, - const RealType& thisMatrixMultiplicator = 1.0 ); - - template< typename Real2, typename Index2 > - void getTransposition( const SlicedEllpackSymmetricGraph< Real2, Device, Index2 >& matrix, - const RealType& matrixMultiplicator = 1.0 ); - - template< typename Vector > - bool performSORIteration( const Vector& b, - const IndexType row, - Vector& x, - const RealType& omega = 1.0 ) const; - - Index getRealRowLength( const Index row ); - - Containers::Vector< Index, Device, Index > getRealRowLengths(); - - void save( File& file ) const; - - void load( File& file ); - - void save( const String& fileName ) const; - - void load( const String& fileName ); - - void print( std::ostream& str ) const; - - bool help( bool verbose = false ); - -#ifdef HAVE_CUDA - template< typename InVector, - typename OutVector > - __device__ - void spmvCuda( const InVector& inVector, - OutVector& outVector, - const int globalIdx, - const int color ) const; -#endif - - void copyFromHostToCuda( SlicedEllpackSymmetricGraph< Real, Devices::Host, Index, SliceSize >& matrix ); - - bool rearrangeMatrix( bool verbose = false ); - - void computePermutationArray(); - - Containers::Vector< Index, Device, Index > getSlicePointers(); - - Containers::Vector< Index, Device, Index > getSliceRowLengths(); - - Containers::Vector< Index, Device, Index > getPermutationArray(); - - Containers::Vector< Index, Device, Index > getInversePermutationArray(); - - Containers::Vector< Index, Device, Index > getColorPointers(); - - protected: - - Containers::Vector< Index, Device, Index > slicePointers, sliceRowLengths; - - typedef SlicedEllpackSymmetricGraphDeviceDependentCode< DeviceType > DeviceDependentCode; - friend class SlicedEllpackSymmetricGraphDeviceDependentCode< DeviceType >; - - Containers::Vector< Index, Device, Index > permutationArray; - Containers::Vector< Index, Device, Index > inversePermutationArray; - Containers::Vector< Index, Device, Index > colorPointers; - bool rearranged; -#ifdef HAVE_CUDA - /*friend __global__ void SlicedEllpackSymmetricGraph_computeMaximalRowLengthInSlices_CudaKernel< Real, Index, SliceSize >( SlicedEllpackMatrix< Real, Devices::Cuda, Index, SliceSize >* matrix, - const typename SlicedEllpackSymmetricGraph< Real, Devices::Cuda, Index, SliceSize >::RowLengthsVector* rowLengths, - int gridIdx ); - */ - // TODO: The friend declaration above does not work because of __global__ storage specifier. Therefore we declare the following method as public. Fix this, when possible. - - public: - __device__ void computeMaximalRowLengthInSlicesCuda( ConstCompressedRowLengthsVectorView rowLengths, - const IndexType sliceIdx ); - -#endif - -}; - -} //namespace Legacy -} // namespace Matrices -} // namespace TNL - -#include - diff --git a/src/TNL/Matrices/Legacy/SlicedEllpackSymmetricGraph_impl.h b/src/TNL/Matrices/Legacy/SlicedEllpackSymmetricGraph_impl.h deleted file mode 100644 index 5ab2f77c1216c98675de1200b9883672f1c0c146..0000000000000000000000000000000000000000 --- a/src/TNL/Matrices/Legacy/SlicedEllpackSymmetricGraph_impl.h +++ /dev/null @@ -1,1316 +0,0 @@ -/*************************************************************************** - SlicedEllpackSymmetricGraph_impl.h - description - ------------------- - begin : Aug 30, 2018 - copyright : (C) 2018 by Tomas Oberhuber - email : tomas.oberhuber@fjfi.cvut.cz - ***************************************************************************/ - -/* See Copyright Notice in tnl/Copyright */ - -#pragma once - -#include -#include -#include -#include - -namespace TNL { -namespace Matrices { - namespace Legacy { - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::SlicedEllpackSymmetricGraph() -: rearranged( false ) -{ -}; - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -String SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::getType() -{ - return String( "Matrices::SlicedEllpackSymmetricGraph< ") + - String( TNL::getType< Real >() ) + - String( ", " ) + - String( Device::getDeviceType() ) + - String( ", " ) + - String( TNL::getType< Index >() ) + - String( " >" ); -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -String SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::getTypeVirtual() const -{ - return this->getType(); -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -void SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::setDimensions( const IndexType rows, - const IndexType columns ) -{ - TNL_ASSERT( rows > 0 && columns > 0, - std::cerr << "rows = " << rows - << " columns = " << columns <::setDimensions( rows, columns ); -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -void SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths ) -{ - TNL_ASSERT( this->getRows() > 0, ); - TNL_ASSERT( this->getColumns() > 0, ); - const IndexType slices = roundUpDivision( this->rows, SliceSize ); - this->sliceRowLengths.setSize( slices ); - this->slicePointers.setSize( slices + 1 ); - - this->permutationArray.setSize( this->getRows() ); - for( IndexType i = 0; i < this->getRows(); i++ ) - this->permutationArray.setElement( i, i ); - - Containers::Vector< Index, Device, Index > sliceRowLengths, slicePointers; - sliceRowLengths.setSize( slices ); - slicePointers.setSize( slices + 1 ); - // TODO: fix this - //DeviceDependentCode::computeMaximalRowLengthInSlices( *this, rowLengths, sliceRowLengths, slicePointers ); - this->sliceRowLengths = sliceRowLengths; - this->slicePointers = slicePointers; - - this->maxRowLength = rowLengths.max(); - - this->slicePointers.computeExclusivePrefixSum(); - this->allocateMatrixElements( this->slicePointers.getElement( slices ) ); -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -Index SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::getRowLength( const IndexType row ) const -{ - const IndexType slice = row / SliceSize; - return this->sliceRowLengths[ slice ]; -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > - template< typename Real2, - typename Device2, - typename Index2 > -bool SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::setLike( const SlicedEllpackSymmetricGraph< Real2, Device2, Index2, SliceSize >& matrix ) -{ - if( !Sparse< Real, Device, Index >::setLike( matrix ) || - ! this->slicePointers.setLike( matrix.slicePointers ) || - ! this->sliceRowLengths.setLike( matrix.sliceRowLengths ) ) - return false; - return true; -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -void SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::reset() -{ - Sparse< Real, Device, Index >::reset(); - this->slicePointers.reset(); - this->sliceRowLengths.reset(); -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > - template< typename Real2, - typename Device2, - typename Index2 > -bool SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::operator == ( const SlicedEllpackSymmetricGraph< Real2, Device2, Index2 >& matrix ) const -{ - TNL_ASSERT( this->getRows() == matrix.getRows() && - this->getColumns() == matrix.getColumns(), - std::cerr << "this->getRows() = " << this->getRows() - << " matrix.getRows() = " << matrix.getRows() - << " this->getColumns() = " << this->getColumns() - << " matrix.getColumns() = " << matrix.getColumns() - << " this->getName() = " << this->getName() - << " matrix.getName() = " << matrix.getName() ); - // TODO: implement this - throw Exceptions::NotImplementedError( "SlicedEllpackSymmetricGraph::operator== is not implemented." ); -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > - template< typename Real2, - typename Device2, - typename Index2 > -bool SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::operator != ( const SlicedEllpackSymmetricGraph< Real2, Device2, Index2 >& matrix ) const -{ - return ! ( ( *this ) == matrix ); -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -__cuda_callable__ -bool SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::setElementFast( const IndexType row, - const IndexType column, - const Real& value ) -{ - return this->addElementFast( row, column, value, 0.0 ); -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -bool SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::setElement( const IndexType row, - const IndexType column, - const Real& value ) -{ - return this->addElement( row, column, value, 0.0 ); -} - - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -__cuda_callable__ -bool SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::addElementFast( const IndexType row, - const IndexType column, - const RealType& value, - const RealType& thisElementMultiplicator ) -{ - TNL_ASSERT( row >= 0 && row < this->rows && - column >= 0 && column <= this->rows, - std::cerr << " row = " << row - << " column = " << column - << " this->rows = " << this->rows - << " this->columns = " << this-> columns ); - - Index elementPtr, rowEnd, step; - DeviceDependentCode::initRowTraverseFast( *this, row, elementPtr, rowEnd, step ); - - IndexType col; - while( elementPtr < rowEnd && - ( col = this->columnIndexes.getElement( elementPtr ) ) < column && - col != this->getPaddingIndex() ) elementPtr += step; - if( elementPtr == rowEnd ) - return false; - if( col == column ) - { - this->values.setElement( elementPtr, thisElementMultiplicator * this->values.getElement( elementPtr ) + value ); - return true; - } - if( col == this->getPaddingIndex() ) - { - this->columnIndexes.setElement( elementPtr, column ); - this->values.setElement( elementPtr, value ); - return true; - } - IndexType j = rowEnd - step; - while( j > elementPtr ) - { - this->columnIndexes.setElement( j, this->columnIndexes.getElement( j - step ) ); - this->values.setElement( j, this->values.getElement( j - step ) ); - j -= step; - } - this->columnIndexes.setElement( elementPtr, column ); - this->values.setElement( elementPtr, value ); - return true; -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -bool SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::addElement( const IndexType row, - const IndexType column, - const RealType& value, - const RealType& thisElementMultiplicator ) -{ - TNL_ASSERT( row >= 0 && row < this->rows && - column >= 0 && column <= this->rows, - std::cerr << " row = " << row - << " column = " << column - << " this->rows = " << this->rows - << " this->columns = " << this-> columns ); - - Index elementPtr, rowEnd, step; - DeviceDependentCode::initRowTraverse( *this, row, elementPtr, rowEnd, step ); - - IndexType col; - while( elementPtr < rowEnd && - ( col = this->columnIndexes.getElement( elementPtr ) ) < column && - col != this->getPaddingIndex() ) elementPtr += step; - if( elementPtr == rowEnd ) - return false; - if( col == column ) - { - this->values.setElement( elementPtr, thisElementMultiplicator * this->values.getElement( elementPtr ) + value ); - return true; - } - if( col == this->getPaddingIndex() ) - { - this->columnIndexes.setElement( elementPtr, column ); - this->values.setElement( elementPtr, value ); - return true; - } - IndexType j = rowEnd - step; - while( j > elementPtr ) - { - this->columnIndexes.setElement( j, this->columnIndexes.getElement( j - step ) ); - this->values.setElement( j, this->values.getElement( j - step ) ); - j -= step; - } - this->columnIndexes.setElement( elementPtr, column ); - this->values.setElement( elementPtr, value ); - return true; -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -__cuda_callable__ -bool SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize > :: setRowFast( const IndexType row, - const IndexType* columnIndexes, - const RealType* values, - const IndexType elements ) -{ - const IndexType sliceIdx = this->permutationArray.getElement( row ) / SliceSize; - const IndexType rowLength = this->sliceRowLengths[ sliceIdx ]; - if( elements > rowLength ) - return false; - - Index elementPointer, rowEnd, step; - DeviceDependentCode::initRowTraverseFast( *this, this->permutationArray.getElement( row ), elementPointer, rowEnd, step ); - - for( IndexType i = 0; i < elements; i++ ) - { - const IndexType column = columnIndexes[ i ]; - if( column < 0 || column >= this->getColumns() ) - return false; - this->columnIndexes[ elementPointer ] = columnIndexes[ i ]; - this->values[ elementPointer ] = values[ i ]; - elementPointer += step; - } - for( IndexType i = elements; i < rowLength; i++ ) - { - this->columnIndexes[ elementPointer ] = this->getPaddingIndex(); - elementPointer += step; - } - return true; -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -bool SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize > :: setRow( const IndexType row, - const IndexType* columnIndexes, - const RealType* values, - const IndexType elements ) -{ - const IndexType sliceIdx = this->permutationArray.getElement( row ) / SliceSize; - const IndexType rowLength = this->sliceRowLengths.getElement( sliceIdx ); - if( elements > rowLength ) - return false; - - Index elementPointer, rowEnd, step; - DeviceDependentCode::initRowTraverse( *this, this->permutationArray.getElement( row ), elementPointer, rowEnd, step ); - - for( IndexType i = 0; i < elements; i++ ) - { - const IndexType column = columnIndexes[ i ]; - if( column < 0 || column >= this->getColumns() ) - return false; - this->columnIndexes.setElement( elementPointer, column ); - this->values.setElement( elementPointer, values[ i ] ); - elementPointer += step; - } - for( IndexType i = elements; i < rowLength; i++ ) - { - this->columnIndexes.setElement( elementPointer, this->getPaddingIndex() ); - elementPointer += step; - } - return true; -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -__cuda_callable__ -bool SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize > :: addRowFast( const IndexType row, - const IndexType* columns, - const RealType* values, - const IndexType numberOfElements, - const RealType& thisElementMultiplicator ) -{ - // TODO: implement - return false; -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -bool SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize > :: addRow( const IndexType row, - const IndexType* columns, - const RealType* values, - const IndexType numberOfElements, - const RealType& thisElementMultiplicator ) -{ - // TODO: implement - return false; -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -__cuda_callable__ -Real SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::getElementFast( const IndexType row, - const IndexType column ) const -{ - Index elementPtr, rowEnd, step; - DeviceDependentCode::initRowTraverseFast( *this, row, elementPtr, rowEnd, step ); - - IndexType col; - while( elementPtr < rowEnd && - ( col = this->columnIndexes[ elementPtr ] ) < column && - col != this->getPaddingIndex() ) - elementPtr += step; - if( elementPtr < rowEnd && col == column ) - return this->values[ elementPtr ]; - return 0.0; -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -Real SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::getElement( const IndexType row, - const IndexType column ) const -{ - if( row < column ) - return this->getElement( column, row ); - - Index elementPtr, rowEnd, step; - DeviceDependentCode::initRowTraverse( *this, row, elementPtr, rowEnd, step ); - - IndexType col; - while( elementPtr < rowEnd && - ( col = this->columnIndexes.getElement( elementPtr ) ) < column && - col != this->getPaddingIndex() ) - elementPtr += step; - if( elementPtr < rowEnd && col == column ) - return this->values.getElement( elementPtr ); - return 0.0; -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -__cuda_callable__ -void SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::getRowFast( const IndexType row, - IndexType* columns, - RealType* values ) const -{ - Index elementPtr, rowEnd, step, i( 0 ); - DeviceDependentCode::initRowTraverseFast( *this, row, elementPtr, rowEnd, step ); - - while( elementPtr < rowEnd ) - { - columns[ i ] = this->columnIndexes[ elementPtr ]; - values[ i ] = this->values[ elementPtr ]; - elementPtr += step; - i++; - } -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -void SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::getRow( const IndexType row, - IndexType* columns, - RealType* values ) const -{ - Index elementPtr, rowEnd, step, i( 0 ); - DeviceDependentCode::initRowTraverse( *this, row, elementPtr, rowEnd, step ); - - while( elementPtr < rowEnd ) - { - columns[ i ] = this->columnIndexes.getElement( elementPtr ); - values[ i ] = this->values.getElement( elementPtr ); - elementPtr += step; - i++; - } -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > - template< typename Vector > -__cuda_callable__ -typename Vector::RealType SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::rowVectorProduct( const IndexType row, - const Vector& vector ) const -{ - Real result = 0.0; - Index elementPtr, rowEnd, step; - DeviceDependentCode::initRowTraverseFast( *this, row, elementPtr, rowEnd, step ); - - IndexType column; - while( elementPtr < rowEnd && - ( column = this->columnIndexes[ elementPtr ] ) < this->columns && - column != this->getPaddingIndex() ) - { - result += this->values[ elementPtr ] * vector[ column ]; - elementPtr += step; - } - return result; -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > - template< typename InVector, - typename OutVector > -void SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::vectorProduct( const InVector& inVector, - OutVector& outVector ) const -{ - DeviceDependentCode::vectorProduct( *this, inVector, outVector ); -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > - template< typename Real2, - typename Index2 > -void SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::addMatrix( const SlicedEllpackSymmetricGraph< Real2, Device, Index2 >& matrix, - const RealType& matrixMultiplicator, - const RealType& thisMatrixMultiplicator ) -{ - throw Exceptions::NotImplementedError( "SlicedEllpackSymmetricGraph::addMatrix is not implemented." ); - // TODO: implement -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > - template< typename Real2, - typename Index2 > -void SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::getTransposition( const SlicedEllpackSymmetricGraph< Real2, Device, Index2 >& matrix, - const RealType& matrixMultiplicator ) -{ - throw Exceptions::NotImplementedError( "SlicedEllpackSymmetricGraph::getTransposition is not implemented." ); - // TODO: implement -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > - template< typename Vector > -bool SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::performSORIteration( const Vector& b, - const IndexType row, - Vector& x, - const RealType& omega ) const -{ - TNL_ASSERT( row >=0 && row < this->getRows(), - std::cerr << "row = " << row - << " this->getRows() = " << this->getRows() - << " this->getName() = " << this->getName() <permutationArray.getElement( row ) / SliceSize; - const IndexType rowLength = this->sliceRowLengths[ sliceIdx ]; - IndexType elementPtr = this->slicePointers[ sliceIdx ] + - rowLength * ( this->permutationArray.getElement( row ) - sliceIdx * SliceSize ); - const IndexType rowEnd( elementPtr + rowLength ); - IndexType column; - while( elementPtr < rowEnd && ( column = this->columnIndexes[ elementPtr ] ) < this->columns ) - { - if( column == this->permutationArray.getElement( row ) ) - diagonalValue = this->values.getElement( elementPtr ); - else - sum += this->values.getElement( this->permutationArray.getElement( row ) * this->diagonalsShift.getSize() + elementPtr ) * x. getElement( column ); - elementPtr++; - } - if( diagonalValue == ( Real ) 0.0 ) - { - std::cerr << "There is zero on the diagonal in " << this->permutationArray.getElement( row ) << "-th row of thge matrix " << this->getName() << ". I cannot perform SOR iteration." <permutationArray.getElement( row ), x[ this->permutationArray.getElement( row ) ] + omega / diagonalValue * ( b[ this->permutationArray.getElement( row ) ] - sum ) ); - return true; -} - - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -void SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::save( File& file ) const -{ - Sparse< Real, Device, Index >::save( file ); - file << this->slicePointers << this->sliceRowLengths; -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -void SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::load( File& file ) -{ - Sparse< Real, Device, Index >::load( file ); - file >> this->slicePointers >> this->sliceRowLengths; -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -void SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::save( const String& fileName ) const -{ - Object::save( fileName ); -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -void SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::load( const String& fileName ) -{ - Object::load( fileName ); -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -void SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::print( std::ostream& str ) const -{ - for( IndexType row = 0; row < this->getRows(); row++ ) - { - str <<"Row: " << row << " -> "; - const IndexType sliceIdx = this->permutationArray.getElement( row ) / SliceSize; - const IndexType rowLength = this->sliceRowLengths.getElement( sliceIdx ); - IndexType elementPtr = this->slicePointers.getElement( sliceIdx ) + - rowLength * ( this->permutationArray.getElement( row ) - sliceIdx * SliceSize ); - const IndexType rowEnd( elementPtr + rowLength ); - while( elementPtr < rowEnd && - this->columnIndexes.getElement( elementPtr ) < this->columns && - this->columnIndexes.getElement( elementPtr ) != this->getPaddingIndex() ) - { - const Index column = this->columnIndexes.getElement( elementPtr ); - str << " Col:" << column << "->" << this->values.getElement( elementPtr ) << "\t"; - elementPtr++; - } - str < -void SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::computePermutationArray() -{ - Containers::Vector< Index, Device, Index > colorsVector; - colorsVector.setSize( this->getRows() ); - for( IndexType i = 0; i < this->getRows(); i++ ) - { - colorsVector.setElement( i, 0 ); - } - - // compute colors for each row - Matrix< Real, Device, Index >::computeColorsVector( colorsVector ); - - // init color pointers - this->colorPointers.setSize( this->getNumberOfColors() + 1 ); - - // compute permutation - IndexType position = 0; - for( IndexType color = 0; color < this->getNumberOfColors(); color++ ) - { - this->colorPointers.setElement( color, position ); - for (IndexType i = 0; i < this->getRows(); i++) - if ( colorsVector.getElement( i ) == color) - { - IndexType row1 = this->permutationArray.getElement( i ); - IndexType row2 = this->permutationArray.getElement( position ); - IndexType tmp = this->permutationArray.getElement( row1 ); - this->permutationArray.setElement( row1, this->permutationArray.getElement( row2 ) ); - this->permutationArray.setElement( row2, tmp ); - - tmp = colorsVector.getElement( position ); - colorsVector.setElement( position, colorsVector.getElement( i ) ); - colorsVector.setElement( i, tmp ); - position++; - } - } - - this->colorPointers.setElement( this->getNumberOfColors(), this->getRows() ); - - this->inversePermutationArray.setSize( this->getRows() ); - for( IndexType i = 0; i < this->getRows(); i++ ) - this->inversePermutationArray.setElement( this->permutationArray.getElement( i ), i ); - - // destroy colors vector - colorsVector.reset(); -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -Index SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::getRealRowLength( const Index row ) -{ - const Index sliceIdx = row / SliceSize; - const Index slicePointer = this->slicePointers.getElement( sliceIdx ); - const Index rowLength = this->sliceRowLengths.getElement( sliceIdx ); - - Index rowBegin = slicePointer + rowLength * ( row - sliceIdx * SliceSize ); - Index rowEnd = rowBegin + rowLength; - Index length = 0; - for( Index i = rowBegin; i < rowEnd; i++ ) - if( this->columnIndexes.getElement( i ) != this->getPaddingIndex() ) - length++; - else - break; - - return length; -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -Containers::Vector< Index, Device, Index > SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::getRealRowLengths() -{ - Containers::Vector< Index, Device, Index > rowLengths; - rowLengths.setSize( this->getRows() ); - for( IndexType row = 0; row < this->getRows(); row++ ) - rowLengths.setElement( row, this->getRealRowLength( row ) ); - - return rowLengths; -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -bool SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::rearrangeMatrix( bool verbose ) -{ - this->computePermutationArray(); - - // now based on new permutation array we need to recompute row lengths in slices - const IndexType slices = roundUpDivision( this->rows, SliceSize ); - Containers::Vector< Index, Device, Index > sliceRowLengths, slicePointers, rowLengths; - sliceRowLengths.setSize( slices ); - slicePointers.setSize( slices + 1 ); - rowLengths.setSize( this->getRows() ); - rowLengths = this->getRealRowLengths(); - // TODO: fix this - //DeviceDependentCode::computeMaximalRowLengthInSlices( *this, rowLengths, sliceRowLengths, slicePointers ); - - slicePointers.computeExclusivePrefixSum(); - - // this->testRowLengths( rowLengths, sliceRowLengths ); - - // return this->allocateMatrixElements( this->slicePointers.getElement( slices ) ); - Containers::Vector< Real, Device, Index > valuesVector; - Containers::Vector< Index, Device, Index > columnsVector; - valuesVector.setSize( slicePointers.getElement( slices ) ); - columnsVector.setSize( slicePointers.getElement( slices ) ); - columnsVector.setValue( this->getPaddingIndex() ); - valuesVector.setValue( 0.0 ); - - for( IndexType slice = 0; slice < slices; slice++ ) - { - IndexType step = 1; - IndexType slicePointerOrig = this->slicePointers.getElement( slice ); - IndexType rowLengthOrig = this->sliceRowLengths.getElement( slice ); - for( IndexType row = slice * SliceSize; row < (slice + 1) * SliceSize && row < this->getRows(); row++ ) - { - IndexType rowBegin = slicePointerOrig + rowLengthOrig * ( row - slice * SliceSize ); - IndexType elementPointer = rowBegin; - - IndexType sliceNew = this->permutationArray.getElement( row ) / SliceSize; - IndexType slicePointerNew = slicePointers.getElement( sliceNew ); - IndexType rowLengthNew = sliceRowLengths.getElement( sliceNew ); - IndexType elementPointerNew = slicePointerNew + rowLengthNew * ( this->permutationArray.getElement( row ) - sliceNew * SliceSize ); - - for( IndexType i = 0; i < rowLengthOrig; i++ ) - { - if( this->columnIndexes.getElement( elementPointer ) != this->getPaddingIndex() ) - { - valuesVector.setElement(elementPointerNew, this->values.getElement(elementPointer)); - columnsVector.setElement(elementPointerNew, this->columnIndexes.getElement(elementPointer)); - elementPointer += step; - } - elementPointerNew += step; - } - } - } - - // reset original matrix - this->values.reset(); - this->columnIndexes.reset(); - this->slicePointers.reset(); - this->sliceRowLengths.reset(); - - this->slicePointers.setSize( slicePointers.getSize() ); - this->sliceRowLengths.setSize( sliceRowLengths.getSize() ); - - this->sliceRowLengths = sliceRowLengths; - this->slicePointers = slicePointers; - - // deep copy new matrix - this->values.setSize( valuesVector.getSize() ); - this->columnIndexes.setSize( columnsVector.getSize() ); - this->values = valuesVector; - this->columnIndexes = columnsVector; - - // clear memory - valuesVector.reset(); - columnsVector.reset(); - slicePointers.reset(); - sliceRowLengths.reset(); - - this->rearranged = true; - return true; -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -bool SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::help( bool verbose ) -{ - if( !this->rearranged ) - this->rearrangeMatrix( verbose ); - return true; -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -Containers::Vector< Index, Device, Index > SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::getSlicePointers() -{ - return this->slicePointers; -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -Containers::Vector< Index, Device, Index > SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::getSliceRowLengths() -{ - return this->sliceRowLengths; -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -Containers::Vector< Index, Device, Index > SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::getPermutationArray() -{ - return this->permutationArray; -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -Containers::Vector< Index, Device, Index > SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::getInversePermutationArray() -{ - return this->inversePermutationArray; -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -Containers::Vector< Index, Device, Index > SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::getColorPointers() -{ - return this->colorPointers; -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -void SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::copyFromHostToCuda( SlicedEllpackSymmetricGraph& matrix ) -{ - Sparse< Real, Device, Index >::copyFromHostToCuda( matrix ); - - this->rearranged = true; - - Containers::Vector< Index, Device, Index > colorPointers = matrix.getColorPointers(); - this->colorPointers.setSize( colorPointers.getSize() ); - for( IndexType i = 0; i < colorPointers.getSize(); i++ ) - this->colorPointers.setElement( i, colorPointers[ i ] ); - - Containers::Vector< Index, Device, Index > slicePointers = matrix.getSlicePointers(); - this->slicePointers.setSize( slicePointers.getSize() ); - for( IndexType i = 0; i < slicePointers.getSize(); i++ ) - this->slicePointers.setElement( i, slicePointers[ i ] ); - - Containers::Vector< Index, Device, Index > sliceRowLengths = matrix.getSliceRowLengths(); - this->sliceRowLengths.setSize( sliceRowLengths.getSize() ); - for( IndexType i = 0; i < sliceRowLengths.getSize(); i++ ) - this->sliceRowLengths.setElement( i, sliceRowLengths[ i ] ); - - Containers::Vector< Index, Device, Index > permutationArray = matrix.getPermutationArray(); - this->permutationArray.setSize( permutationArray.getSize() ); - for( IndexType i = 0; i < permutationArray.getSize(); i++ ) - this->permutationArray.setElement( i, permutationArray[ i ] ); - - Containers::Vector< Index, Device, Index > inversePermutation = matrix.getInversePermutationArray(); - this->inversePermutationArray.setSize( inversePermutation.getize() ); - for( IndexType i = 0; i < inversePermutation.getSize(); i++ ) - this->inversePermutationArray.setElement( i, inversePermutation[ i ] ); - - for( IndexType i = 0; i < this->getRows(); i++ ) - for( IndexType j = 0; j <= i; j++ ) - { - if( matrix.getElement( i, j ) != 0.0 ) - this->setElementFast( i, j, matrix.getElement( i, j ) ); - } - - colorPointers.reset(); - slicePointers.reset(); - sliceRowLengths.reset(); - permutationArray.reset(); - inversePermutation.reset(); -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -template< typename InVector, - typename OutVector > -void SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::vectorProductHost( const InVector& inVector, - OutVector& outVector ) const -{ - // simulated cuda SPMV on CPU - for( IndexType i = 0; i < this->getNumberOfColors(); i++ ) - { - IndexType offset = this->colorPointers[ i ]; - IndexType stop = this->colorPointers[ i + 1 ]; - IndexType inSliceIdx = offset % SliceSize; - IndexType sliceOffset = offset - inSliceIdx; - IndexType length = this->colorPointers[ i + 1 ] - this->colorPointers[ i ] + inSliceIdx; - IndexType cudaBlockSize = 256; - IndexType blocks = roundUpDivision( length, cudaBlockSize ); - for( IndexType blockIdx = 0; blockIdx < blocks; blockIdx++ ) - { - for( IndexType warpIdx = 0; warpIdx < 8; warpIdx++ ) - { - IndexType warpSize = 32; - for (IndexType threadIdx = 0; threadIdx < warpSize; threadIdx++) { - IndexType row = blockIdx * cudaBlockSize + warpIdx * warpSize + threadIdx + sliceOffset; - if (row >= stop || row < offset) - continue; - IndexType sliceIdx = row / SliceSize; - IndexType sliceLength = this->sliceRowLengths[sliceIdx]; - IndexType begin = this->slicePointers[sliceIdx] + sliceLength * threadIdx; - IndexType rowMapping = this->inversePermutationArray.getElement(row); - for (IndexType elementPtr = begin; elementPtr < begin + sliceLength; elementPtr++) { - IndexType column = this->columnIndexes[elementPtr]; - if (column == this->getPaddingIndex()) - break; - outVector[rowMapping] += inVector[column] * this->values[elementPtr]; - if (rowMapping != column) - { - outVector[column] += inVector[rowMapping] * this->values[elementPtr]; - } - } - } - } - } - } -} - -#ifdef HAVE_CUDA -template< typename Real, - typename Device, - typename Index, - int SliceSize > -__device__ void SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::computeMaximalRowLengthInSlicesCuda( ConstCompressedRowLengthsVectorView rowLengths, - const IndexType sliceIdx ) -{ - Index rowIdx = sliceIdx * SliceSize; - Index rowInSliceIdx( 0 ); - Index maxRowLength( 0 ); - if( rowIdx >= this->getRows() ) - return; - while( rowInSliceIdx < SliceSize && rowIdx < this->getRows() ) - { - maxRowLength = Max( maxRowLength, rowLengths[ rowIdx ] ); - rowIdx++; - rowInSliceIdx++; - } - this->sliceRowLengths[ sliceIdx ] = maxRowLength; - this->slicePointers[ sliceIdx ] = maxRowLength * SliceSize; - if( threadIdx.x == 0 ) - this->slicePointers[ this->slicePointers.getSize() - 1 ] = 0; - -} -#endif - -template<> -class SlicedEllpackSymmetricGraphDeviceDependentCode< Devices::Host > -{ - public: - - typedef Devices::Host Device; - - template< typename Real, - typename Index, - int SliceSize > - static void initRowTraverse( const SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >& matrix, - const Index row, - Index& rowBegin, - Index& rowEnd, - Index& step ) - { - const Index sliceIdx = matrix.permutationArray.getElement( row ) / SliceSize; - const Index slicePointer = matrix.slicePointers.getElement( sliceIdx ); - const Index rowLength = matrix.sliceRowLengths.getElement( sliceIdx ); - - rowBegin = slicePointer + rowLength * ( matrix.permutationArray.getElement( row ) - sliceIdx * SliceSize ); - rowEnd = rowBegin + rowLength; - step = 1; - } - - template< typename Real, - typename Index, - int SliceSize > - static void initRowTraverseFast( const SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >& matrix, - const Index row, - Index& rowBegin, - Index& rowEnd, - Index& step ) - { - const Index sliceIdx = matrix.permutationArray.getElement( row ) / SliceSize; - const Index slicePointer = matrix.slicePointers[ sliceIdx ]; - const Index rowLength = matrix.sliceRowLengths[ sliceIdx ]; - - rowBegin = slicePointer + rowLength * ( matrix.permutationArray.getElement( row ) - sliceIdx * SliceSize ); - rowEnd = rowBegin + rowLength; - step = 1; - } - - - template< typename Real, - typename Index, - int SliceSize > - static void computeMaximalRowLengthInSlices( SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >& matrix, - typename SlicedEllpackSymmetricGraph< Real, Device, Index >::ConstCompressedRowLengthsVectorView rowLengths, - Containers::Vector< Index, Device, Index >& sliceRowLengths, - Containers::Vector< Index, Device, Index >& slicePointers ) - { - /*Index row( 0 ), slice( 0 ), sliceRowLength( 0 ); - while( row < matrix.getRows() ) - { - sliceRowLength = Max( rowLengths.getElement( matrix.permutationArray.getElement( row++ ) ), sliceRowLength ); - if( row % SliceSize == 0 ) - { - sliceRowLengths.setElement( slice, sliceRowLength ); - slicePointers.setElement( slice++, sliceRowLength * SliceSize ); - sliceRowLength = 0; - } - } - if( row % SliceSize != 0 ) - { - sliceRowLengths.setElement( slice, sliceRowLength ); - slicePointers.setElement( slice++, sliceRowLength * SliceSize ); - } - slicePointers.setElement( slicePointers.getSize() - 1, 0 );*/ - - Index sliceRowLength( 0 ); - Index numberOSlices = roundUpDivision( matrix.getRows(), SliceSize ); - Containers::Vector< Index, Device, Index > rowMapToSlice; - rowMapToSlice.setSize( SliceSize ); - for( Index slice = 0; slice < numberOSlices; slice++ ) - { - rowMapToSlice.setValue( -1 ); - Index elementPtr = 0; - for( Index row = 0; row < matrix.getRows() && elementPtr < SliceSize; row++ ) - { - if( matrix.permutationArray.getElement( row ) >= slice * SliceSize && - matrix.permutationArray.getElement( row ) < ( slice + 1 ) * SliceSize ) - { - rowMapToSlice.setElement( elementPtr, row ); - elementPtr++; - } - } - - // TODO: pridej sem nejaky logger! - - Index i = 0; - for( ; i < SliceSize; i++ ) - // sliceRowLength = Max( rowLengths.getElement( matrix.permutationArray.getElement( rowMapToSlice.getElement( row ) ) ), sliceRowLength ); - { - if( rowMapToSlice.getElement( i ) < 0 ) - break; - sliceRowLength = Max( rowLengths.getElement( rowMapToSlice.getElement( i ) ), sliceRowLength ); - } - if( i % SliceSize == 0 || rowMapToSlice.getElement( i ) < 0 ) - { - sliceRowLengths.setElement( slice, sliceRowLength ); - slicePointers.setElement( slice, sliceRowLength * SliceSize ); - sliceRowLength = 0; - } - } - slicePointers.setElement( slicePointers.getSize() - 1, 0 ); - } - - template< typename Real, - typename Index, - typename InVector, - typename OutVector, - int SliceSize > - static void vectorProduct( const SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >& matrix, - const InVector& inVector, - OutVector& outVector ) - { - matrix.vectorProductHost( inVector, outVector ); - } - -}; - -#ifdef HAVE_CUDA -template< typename Real, - typename Index, - int SliceSize > -__global__ void SlicedEllpackSymmetricGraph_computeMaximalRowLengthInSlices_CudaKernel( SlicedEllpack< Real, Devices::Cuda, Index, SliceSize >* matrix, - typename SlicedEllpackSymmetricGraph< Real, Devices::Cuda, Index, SliceSize >::ConstCompressedRowLengthsVector rowLengths, - int gridIdx ) -{ - const Index sliceIdx = gridIdx * Cuda::getMaxGridSize() * blockDim.x + blockIdx.x * blockDim.x + threadIdx.x; - matrix->computeMaximalRowLengthInSlicesCuda( rowLengths, sliceIdx ); -} -#endif - -#ifdef HAVE_CUDA -template< typename Real, - typename Device, - typename Index, - int SliceSize > -template< typename InVector, - typename OutVector > -__device__ -void SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >::spmvCuda( const InVector& inVector, - OutVector& outVector, - const int globalIdx, - const int color ) const -{ - /*const IndexType offset = this->colorPointers[ i ]; - const IndexType stop = this->colorPointers[ i + 1 ]; - if( globalIdx >= stop || globalIdx < offset ) - return;*/ - - IndexType inSliceIdx = threadIdx.x % SliceSize; - const IndexType sliceIdx = globalIdx / SliceSize; - const IndexType sliceLength = this->sliceRowLengths[ sliceIdx ]; - const IndexType begin = this->slicePointers[ sliceIdx ] + inSliceIdx * sliceLength; - const IndexType rowMapping = this->inversePermutationArray[ globalIdx ]; - for( IndexType elementPtr = begin; elementPtr < begin + sliceLength; elementPtr++ ) - { - IndexType column = this->columnIndexes[ elementPtr ]; - if( column == this->getPaddingIndex() ) - break; - - outVector[ rowMapping ] += inVector[ column ] * this->values[ elementPtr ]; - if( rowMapping != column ) - { - outVector[ column ] += inVector[ rowMapping ] * this->values[ elementPtr ]; - } - } -} -#endif - -#ifdef HAVE_CUDA -template< typename Real, - typename Index, - int SliceSize, - typename InVector, - typename OutVector > -__global__ -void SlicedEllpackSymmetricGraphVectorProductCuda( const SlicedEllpackSymmetricGraph< Real, Devices::Cuda, Index, SliceSize >& matrix, - const InVector* inVector, - OutVector* outVector, - const int gridIdx, - const int color, - const int sliceOffset ) -{ - int globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x + sliceOffset; - matrix->smvCuda( *inVector, *outVector, globalIdx, color ); -} -#endif - -template<> -class SlicedEllpackSymmetricGraphDeviceDependentCode< Devices::Cuda > -{ - public: - - typedef Devices::Cuda Device; - - template< typename Real, - typename Index, - int SliceSize > - static void initRowTraverse( const SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >& matrix, - const Index row, - Index& rowBegin, - Index& rowEnd, - Index& step ) - { - const Index sliceIdx = matrix.permutationArray.getElement( row ) / SliceSize; - const Index slicePointer = matrix.slicePointers.getElement( sliceIdx ); - const Index rowLength = matrix.sliceRowLengths.getElement( sliceIdx ); - - rowBegin = slicePointer + matrix.permutationArray.getElement( row ) - sliceIdx * SliceSize; - rowEnd = rowBegin + rowLength * SliceSize; - step = SliceSize; - } - - template< typename Real, - typename Index, - int SliceSize > - __cuda_callable__ - static void initRowTraverseFast( const SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >& matrix, - const Index row, - Index& rowBegin, - Index& rowEnd, - Index& step ) - { - const Index sliceIdx = matrix.permutationArray.getElement( row ) / SliceSize; - const Index slicePointer = matrix.slicePointers[ sliceIdx ]; - const Index rowLength = matrix.sliceRowLengths[ sliceIdx ]; - - rowBegin = slicePointer + matrix.permutationArray.getElement( row ) - sliceIdx * SliceSize; - rowEnd = rowBegin + rowLength * SliceSize; - step = SliceSize; - - } - - template< typename Real, - typename Index, - int SliceSize > - static void computeMaximalRowLengthInSlices( SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >& matrix, - typename SlicedEllpackSymmetricGraph< Real, Device, Index >::ConstCompressedRowLengthsVectorView rowLengths, - Containers::Vector< Index, Device, Index >& sliceRowLengths, - Containers::Vector< Index, Device, Index >& slicePointers ) - { -#ifdef HAVE_CUDA - typedef SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize > Matrix; - typedef typename Matrix::RowLengthsVector CompressedRowLengthsVector; - Matrix* kernel_matrix = Cuda::passToDevice( matrix ); - const Index numberOfSlices = roundUpDivision( matrix.getRows(), SliceSize ); - dim3 cudaBlockSize( 256 ), cudaGridSize( Cuda::getMaxGridSize() ); - const Index cudaBlocks = roundUpDivision( numberOfSlices, cudaBlockSize.x ); - const Index cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() ); - for( int gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) - { - if( gridIdx == cudaGrids - 1 ) - cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize(); - SlicedEllpackSymmetricGraph_computeMaximalRowLengthInSlices_CudaKernel< Real, Index, SliceSize ><<< cudaGridSize, cudaBlockSize >>> - ( kernel_matrix, - rowLengths, - gridIdx ); - } - Cuda::freeFromDevice( kernel_matrix ); - TNL_CHECK_CUDA_DEVICE; -#endif - } - - template< typename Real, - typename Index, - typename InVector, - typename OutVector, - int SliceSize > - static void vectorProduct( const SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize >& matrix, - const InVector& inVector, - OutVector& outVector ) - { - // TODO: tohle -#ifdef HAVE_CUDA - typedef SlicedEllpackSymmetricGraph< Real, Devices::Cuda, Index, SliceSize > Matrix; - typedef typename Matrix::IndexType IndexType; - Matrix* kernel_this = Cuda::passToDevice( matrix ); - InVector* kernel_inVector = Cuda::passToDevice( inVector ); - OutVector* kernel_outVector = Cuda::passToDevice( outVector ); - dim3 cudaBlockSize( 256 ), cudaGridSize( Cuda::getMaxGridSize() ); - for( IndexType color = 0; color < matrix.getNumberOfColors(); color++ ) - { - IndexType offset = matrix.colorPointers.getElement( color ); //can be computed in kernel - // IndexType rowStop = matrix.colorPointers.getElement( color + 1 ); can be computed in kernel - IndexType inSliceOffset = offset % SliceSize; - // TODO: inSliceIdx is undefined - //IndexType rows = matrix.colorPointers.getElement( color + 1 ) - matrix.colorPointers.getElement( color ) + inSliceIdx; - // TODO: rows id undefined - /*const IndexType cudaBlocks = roundUpDivision( rows, cudaBlockSize.x ); - const IndexType cudaGrids = rondUpDivision( cudaBlocks, Cuda::getMaxGridSize ); - for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) - { - if( gridIdx == cudaGrids - 1 ) - cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize(); - // TODO: this cannot be used here and i is undefined - //IndexType offset = this->colorPointers[ i ]; - IndexType inSliceIdx = offset % SliceSize; - IndexType sliceOffset = offset - inSliceIdx; - SlicedEllpackSymmetricGraphVectorProductCuda< Real, Index, InVector, OutVector > - <<< cudaGridSize, cudaBlockSize >>> - ( kernel_this, - kernel_inVector, - kernel_outVector, - gridIdx, - color, - sliceOffset ); - }*/ - } - Cuda::freeFromDevice( kernel_this ); - Cuda::freeFromDevice( kernel_inVector ); - Cuda::freeFromDevice( kernel_outVector ); - TNL_CHECK_CUDA_DEVICE; -#endif - } - -}; - -} //namespace Legacy -} // namespace Matrices -} // namespace TNL diff --git a/src/TNL/Matrices/Legacy/SlicedEllpackSymmetric_impl.h b/src/TNL/Matrices/Legacy/SlicedEllpackSymmetric_impl.h deleted file mode 100644 index 46475ac2007c0b89217eda5f93bfc47c38c45213..0000000000000000000000000000000000000000 --- a/src/TNL/Matrices/Legacy/SlicedEllpackSymmetric_impl.h +++ /dev/null @@ -1,930 +0,0 @@ -/*************************************************************************** - SlocedEllpackSymmetric_impl.h - description - ------------------- - begin : Aug 30, 2018 - copyright : (C) 2018 by Tomas Oberhuber - email : tomas.oberhuber@fjfi.cvut.cz - ***************************************************************************/ - -/* See Copyright Notice in tnl/Copyright */ - -#pragma once - -#include -#include -#include -#include - -namespace TNL { -namespace Matrices { - namespace Legacy { - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::SlicedEllpackSymmetric() -{ -}; - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -String SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::getType() -{ - return String( "Matrices::SlicedEllpackSymmetric< ") + - String( TNL::getType< Real >() ) + - String( ", " ) + - String( Device :: getDeviceType() ) + - String( ", " ) + - String( TNL::getType< Index >() ) + - String( " >" ); -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -String SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::getTypeVirtual() const -{ - return this->getType(); -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -void SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::setDimensions( const IndexType rows, - const IndexType columns ) -{ - TNL_ASSERT( rows > 0 && columns > 0, - std::cerr << "rows = " << rows - << " columns = " << columns <::setDimensions( rows, columns ); -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -void SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::setCompressedRowLengths( ConstCompressedRowLengthsVectorView rowLengths ) -{ - TNL_ASSERT( this->getRows() > 0, ); - TNL_ASSERT( this->getColumns() > 0, ); - const IndexType slices = roundUpDivision( this->rows, SliceSize ); - this->sliceRowLengths.setSize( slices ); - this->slicePointers.setSize( slices + 1 ); - - // TODO: Uncomment the next line and fix the compilation - //DeviceDependentCode::computeMaximalRowLengthInSlices( *this, rowLengths ); - - throw std::runtime_error("code fix required"); - - this->maxRowLength = max( rowLengths ); - - this->slicePointers.template scan< Algorithms::ScanType::Exclusive >(); - this->allocateMatrixElements( this->slicePointers.getElement( slices ) ); -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -Index SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::getRowLength( const IndexType row ) const -{ - const IndexType slice = roundUpDivision( row, SliceSize ); - return this->sliceRowLengths[ slice ]; -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > - template< typename Real2, - typename Device2, - typename Index2 > -bool SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::setLike( const SlicedEllpackSymmetric< Real2, Device2, Index2, SliceSize >& matrix ) -{ - if( !Sparse< Real, Device, Index >::setLike( matrix ) || - ! this->slicePointers.setLike( matrix.slicePointers ) || - ! this->sliceRowLengths.setLike( matrix.sliceRowLengths ) ) - return false; - return true; -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -void SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::reset() -{ - Sparse< Real, Device, Index >::reset(); - this->slicePointers.reset(); - this->sliceRowLengths.reset(); -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > - template< typename Real2, - typename Device2, - typename Index2 > -bool SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::operator == ( const SlicedEllpackSymmetric< Real2, Device2, Index2 >& matrix ) const -{ - TNL_ASSERT( this->getRows() == matrix.getRows() && - this->getColumns() == matrix.getColumns(), - std::cerr << "this->getRows() = " << this->getRows() - << " matrix.getRows() = " << matrix.getRows() - << " this->getColumns() = " << this->getColumns() - << " matrix.getColumns() = " << matrix.getColumns() - << " this->getName() = " << this->getName() - << " matrix.getName() = " << matrix.getName() ); - // TODO: implement this - throw Exceptions::NotImplementedError( "SlicedEllpackSymmetric::operator== is not implemented." ); -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > - template< typename Real2, - typename Device2, - typename Index2 > -bool SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::operator != ( const SlicedEllpackSymmetric< Real2, Device2, Index2 >& matrix ) const -{ - return ! ( ( *this ) == matrix ); -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -__cuda_callable__ -bool SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::setElementFast( const IndexType row, - const IndexType column, - const Real& value ) -{ - return this->addElementFast( row, column, value, 0.0 ); -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -bool SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::setElement( const IndexType row, - const IndexType column, - const Real& value ) -{ - return this->addElement( row, column, value, 0.0 ); -} - - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -__cuda_callable__ -bool SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::addElementFast( const IndexType row, - const IndexType column, - const RealType& value, - const RealType& thisElementMultiplicator ) -{ - TNL_ASSERT( row >= 0 && row < this->rows && - column >= 0 && column <= this->rows, - std::cerr << " row = " << row - << " column = " << column - << " this->rows = " << this->rows - << " this->columns = " << this-> columns ); - - Index elementPtr, rowEnd, step; - DeviceDependentCode::initRowTraverseFast( *this, row, elementPtr, rowEnd, step ); - - IndexType col; - while( elementPtr < rowEnd && - ( col = this->columnIndexes[ elementPtr ] ) < column && - col != this->getPaddingIndex() ) elementPtr += step; - if( elementPtr == rowEnd ) - return false; - if( col == column ) - { - this->values[ elementPtr ] = thisElementMultiplicator * this->values[ elementPtr ] + value; - return true; - } - if( col == this->getPaddingIndex() ) - { - this->columnIndexes[ elementPtr ] = column; - this->values[ elementPtr ] = value; - return true; - } - IndexType j = rowEnd - step; - while( j > elementPtr ) - { - this->columnIndexes[ j ] = this->columnIndexes[ j - step ]; - this->values[ j ] = this->values[ j - step ]; - j -= step; - } - this->columnIndexes[ elementPtr ] = column; - this->values[ elementPtr ] = value; - return true; -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -bool SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::addElement( const IndexType row, - const IndexType column, - const RealType& value, - const RealType& thisElementMultiplicator ) -{ - TNL_ASSERT( row >= 0 && row < this->rows && - column >= 0 && column <= this->rows, - std::cerr << " row = " << row - << " column = " << column - << " this->rows = " << this->rows - << " this->columns = " << this-> columns ); - - Index elementPtr, rowEnd, step; - DeviceDependentCode::initRowTraverse( *this, row, elementPtr, rowEnd, step ); - - IndexType col; - while( elementPtr < rowEnd && - ( col = this->columnIndexes.getElement( elementPtr ) ) < column && - col != this->getPaddingIndex() ) elementPtr += step; - if( elementPtr == rowEnd ) - return false; - if( col == column ) - { - this->values.setElement( elementPtr, thisElementMultiplicator * this->values.getElement( elementPtr ) + value ); - return true; - } - if( col == this->getPaddingIndex() ) - { - this->columnIndexes.setElement( elementPtr, column ); - this->values.setElement( elementPtr, value ); - return true; - } - IndexType j = rowEnd - step; - while( j > elementPtr ) - { - this->columnIndexes.setElement( j, this->columnIndexes.getElement( j - step ) ); - this->values.setElement( j, this->values.getElement( j - step ) ); - j -= step; - } - this->columnIndexes.setElement( elementPtr, column ); - this->values.setElement( elementPtr, value ); - return true; -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -__cuda_callable__ -bool SlicedEllpackSymmetric< Real, Device, Index, SliceSize > :: setRowFast( const IndexType row, - const IndexType* columnIndexes, - const RealType* values, - const IndexType elements ) -{ - const IndexType sliceIdx = row / SliceSize; - const IndexType rowLength = this->sliceRowLengths[ sliceIdx ]; - if( elements > rowLength ) - return false; - - Index elementPointer, rowEnd, step; - DeviceDependentCode::initRowTraverseFast( *this, row, elementPointer, rowEnd, step ); - - for( IndexType i = 0; i < elements; i++ ) - { - const IndexType column = columnIndexes[ i ]; - if( column < 0 || column >= this->getColumns() ) - return false; - this->columnIndexes[ elementPointer ] = columnIndexes[ i ]; - this->values[ elementPointer ] = values[ i ]; - elementPointer += step; - } - for( IndexType i = elements; i < rowLength; i++ ) - { - this->columnIndexes[ elementPointer ] = this->getPaddingIndex(); - elementPointer += step; - } - return true; -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -bool SlicedEllpackSymmetric< Real, Device, Index, SliceSize > :: setRow( const IndexType row, - const IndexType* columnIndexes, - const RealType* values, - const IndexType elements ) -{ - const IndexType sliceIdx = row / SliceSize; - const IndexType rowLength = this->sliceRowLengths.getElement( sliceIdx ); - if( elements > rowLength ) - return false; - - Index elementPointer, rowEnd, step; - DeviceDependentCode::initRowTraverse( *this, row, elementPointer, rowEnd, step ); - - for( IndexType i = 0; i < elements; i++ ) - { - const IndexType column = columnIndexes[ i ]; - if( column < 0 || column >= this->getColumns() ) - return false; - this->columnIndexes.setElement( elementPointer, column ); - this->values.setElement( elementPointer, values[ i ] ); - elementPointer += step; - } - for( IndexType i = elements; i < rowLength; i++ ) - { - this->columnIndexes.setElement( elementPointer, this->getPaddingIndex() ); - elementPointer += step; - } - return true; -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -__cuda_callable__ -bool SlicedEllpackSymmetric< Real, Device, Index, SliceSize > :: addRowFast( const IndexType row, - const IndexType* columns, - const RealType* values, - const IndexType numberOfElements, - const RealType& thisElementMultiplicator ) -{ - // TODO: implement - return false; -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -bool SlicedEllpackSymmetric< Real, Device, Index, SliceSize > :: addRow( const IndexType row, - const IndexType* columns, - const RealType* values, - const IndexType numberOfElements, - const RealType& thisElementMultiplicator ) -{ - // TODO: implement - return false; -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -__cuda_callable__ -Real SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::getElementFast( const IndexType row, - const IndexType column ) const -{ - if( row < column ) - return this->getElementFast( column, row ); - - Index elementPtr, rowEnd, step; - DeviceDependentCode::initRowTraverseFast( *this, row, elementPtr, rowEnd, step ); - - IndexType col; - while( elementPtr < rowEnd && - ( col = this->columnIndexes[ elementPtr ] ) < column && - col != this->getPaddingIndex() ) - elementPtr += step; - if( elementPtr < rowEnd && col == column ) - return this->values[ elementPtr ]; - return 0.0; -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -Real SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::getElement( const IndexType row, - const IndexType column ) const -{ - if( row < column ) - return this->getElement( column, row ); - - Index elementPtr, rowEnd, step; - DeviceDependentCode::initRowTraverse( *this, row, elementPtr, rowEnd, step ); - - IndexType col; - while( elementPtr < rowEnd && - ( col = this->columnIndexes.getElement( elementPtr ) ) < column && - col != this->getPaddingIndex() ) - elementPtr += step; - if( elementPtr < rowEnd && col == column ) - return this->values.getElement( elementPtr ); - return 0.0; -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -__cuda_callable__ -void SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::getRowFast( const IndexType row, - IndexType* columns, - RealType* values ) const -{ - Index elementPtr, rowEnd, step, i( 0 ); - DeviceDependentCode::initRowTraverseFast( *this, row, elementPtr, rowEnd, step ); - - while( elementPtr < rowEnd ) - { - columns[ i ] = this->columnIndexes[ elementPtr ]; - values[ i ] = this->values[ elementPtr ]; - elementPtr += step; - i++; - } -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -void SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::getRow( const IndexType row, - IndexType* columns, - RealType* values ) const -{ - Index elementPtr, rowEnd, step, i( 0 ); - DeviceDependentCode::initRowTraverse( *this, row, elementPtr, rowEnd, step ); - - while( elementPtr < rowEnd ) - { - columns[ i ] = this->columnIndexes.getElement( elementPtr ); - values[ i ] = this->values.getElement( elementPtr ); - elementPtr += step; - i++; - } -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -template< typename InVector, - typename OutVector > -__cuda_callable__ -void SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::rowVectorProduct( const IndexType row, - const InVector& inVector, - OutVector& outVector ) const -{ - Real result = 0.0; - Index elementPtr, rowEnd, step; - DeviceDependentCode::initRowTraverseFast( *this, row, elementPtr, rowEnd, step ); - - IndexType column; - while( elementPtr < rowEnd && - ( column = this->columnIndexes[ elementPtr ] ) < this->columns && - column != this->getPaddingIndex() ) - { - result += this->values[ elementPtr ] * inVector[ column ]; - if( row != column ) - outVector[ column ] += this->values[ elementPtr ] * inVector[ row ]; - elementPtr += step; - } - outVector[ row ] += result; -} - -#ifdef HAVE_CUDA -template< typename Real, - typename Device, - typename Index, - int SliceSize > -template< typename InVector, - typename OutVector > -__device__ -void SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::spmvCuda( const InVector& inVector, - OutVector& outVector, - int rowIdx ) const -{ - if( rowIdx >= this->getRows() ) - return; - - Real result = 0.0; - Index elementPtr, rowEnd, step; - DeviceDependentCode::initRowTraverseFast( *this, rowIdx, elementPtr, rowEnd, step ); - IndexType column; - while( elementPtr < rowEnd && - ( column = this->columnIndexes[ elementPtr ] ) < this->columns && - column != this->getPaddingIndex() ) - { - result += this->values[ elementPtr ] * inVector[ column ]; - if( rowIdx != column ) - outVector[ column ] += this->values[ elementPtr ] * inVector[ rowIdx ]; - elementPtr += step; - } - outVector[ rowIdx ] += result; -} -#endif - -#ifdef HAVE_CUDA -template< typename Real, - typename Index, - int SliceSize, - typename InVector, - typename OutVector > -__global__ -void SlicedEllpackSymmetricVectorProductCudaKernel( -const SlicedEllpackSymmetric< Real, Devices::Cuda, Index, SliceSize >* matrix, - const InVector* inVector, - OutVector* outVector, - int gridIdx ) -{ - int rowIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; - matrix->spmvCuda( *inVector, *outVector, rowIdx ); -} -#endif - -template< typename Real, - typename Device, - typename Index, - int SliceSize > - template< typename InVector, - typename OutVector > -void SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::vectorProduct( const InVector& inVector, - OutVector& outVector ) const -{ - DeviceDependentCode::vectorProduct( *this, inVector, outVector ); -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > - template< typename Real2, - typename Index2 > -void SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::addMatrix( const SlicedEllpackSymmetric< Real2, Device, Index2 >& matrix, - const RealType& matrixMultiplicator, - const RealType& thisMatrixMultiplicator ) -{ - throw Exceptions::NotImplementedError( "SlicedEllpackSymmetric::addMatrix is not implemented." ); - // TODO: implement -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > - template< typename Real2, - typename Index2 > -void SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::getTransposition( const SlicedEllpackSymmetric< Real2, Device, Index2 >& matrix, - const RealType& matrixMultiplicator ) -{ - throw Exceptions::NotImplementedError( "SlicedEllpackSymmetric::getTransposition is not implemented." ); - // TODO: implement -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > - template< typename Vector > -bool SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::performSORIteration( const Vector& b, - const IndexType row, - Vector& x, - const RealType& omega ) const -{ - TNL_ASSERT( row >=0 && row < this->getRows(), - std::cerr << "row = " << row - << " this->getRows() = " << this->getRows() - << " this->getName() = " << this->getName() <sliceRowLengths[ sliceIdx ]; - IndexType elementPtr = this->slicePointers[ sliceIdx ] + - rowLength * ( row - sliceIdx * SliceSize ); - const IndexType rowEnd( elementPtr + rowLength ); - IndexType column; - while( elementPtr < rowEnd && ( column = this->columnIndexes[ elementPtr ] ) < this->columns ) - { - if( column == row ) - diagonalValue = this->values.getElement( elementPtr ); - else - sum += this->values.getElement( row * this->diagonalsShift.getSize() + elementPtr ) * x. getElement( column ); - elementPtr++; - } - if( diagonalValue == ( Real ) 0.0 ) - { - std::cerr << "There is zero on the diagonal in " << row << "-th row of thge matrix " << this->getName() << ". I cannot perform SOR iteration." < -void SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::save( File& file ) const -{ - Sparse< Real, Device, Index >::save( file ); - file << this->slicePointers << this->sliceRowLengths; -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -void SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::load( File& file ) -{ - Sparse< Real, Device, Index >::load( file ); - file >> this->slicePointers >> this->sliceRowLengths; -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -void SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::save( const String& fileName ) const -{ - Object::save( fileName ); -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -void SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::load( const String& fileName ) -{ - Object::load( fileName ); -} - -template< typename Real, - typename Device, - typename Index, - int SliceSize > -void SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::print( std::ostream& str ) const -{ - for( IndexType row = 0; row < this->getRows(); row++ ) - { - str <<"Row: " << row << " -> "; - const IndexType sliceIdx = row / SliceSize; - const IndexType rowLength = this->sliceRowLengths.getElement( sliceIdx ); - IndexType elementPtr = this->slicePointers.getElement( sliceIdx ) + - rowLength * ( row - sliceIdx * SliceSize ); - const IndexType rowEnd( elementPtr + rowLength ); - while( elementPtr < rowEnd && - this->columnIndexes.getElement( elementPtr ) < this->columns && - this->columnIndexes.getElement( elementPtr ) != this->getPaddingIndex() ) - { - const Index column = this->columnIndexes.getElement( elementPtr ); - str << " Col:" << column << "->" << this->values.getElement( elementPtr ) << "\t"; - elementPtr++; - } - str < -__device__ void SlicedEllpackSymmetric< Real, Device, Index, SliceSize >::computeMaximalRowLengthInSlicesCuda( ConstCompressedRowLengthsVectorView rowLengths, - const IndexType sliceIdx ) -{ - Index rowIdx = sliceIdx * SliceSize; - Index rowInSliceIdx( 0 ); - Index maxRowLength( 0 ); - if( rowIdx >= this->getRows() ) - return; - while( rowInSliceIdx < SliceSize && rowIdx < this->getRows() ) - { - maxRowLength = Max( maxRowLength, rowLengths[ rowIdx ] ); - rowIdx++; - rowInSliceIdx++; - } - this->sliceRowLengths[ sliceIdx ] = maxRowLength; - this->slicePointers[ sliceIdx ] = maxRowLength * SliceSize; - if( threadIdx.x == 0 ) - this->slicePointers[ this->slicePointers.getSize() - 1 ] = 0; - -} -#endif - -template<> -class SlicedEllpackSymmetricDeviceDependentCode< Devices::Host > -{ - public: - - typedef Devices::Host Device; - - template< typename Real, - typename Index, - int SliceSize > - static void initRowTraverse( const SlicedEllpackSymmetric< Real, Device, Index, SliceSize >& matrix, - const Index row, - Index& rowBegin, - Index& rowEnd, - Index& step ) - { - const Index sliceIdx = row / SliceSize; - const Index slicePointer = matrix.slicePointers.getElement( sliceIdx ); - const Index rowLength = matrix.sliceRowLengths.getElement( sliceIdx ); - - rowBegin = slicePointer + rowLength * ( row - sliceIdx * SliceSize ); - rowEnd = rowBegin + rowLength; - step = 1; - } - - template< typename Real, - typename Index, - int SliceSize > - __cuda_callable__ - static void initRowTraverseFast( const SlicedEllpackSymmetric< Real, Device, Index, SliceSize >& matrix, - const Index row, - Index& rowBegin, - Index& rowEnd, - Index& step ) - { - const Index sliceIdx = row / SliceSize; - const Index slicePointer = matrix.slicePointers[ sliceIdx ]; - const Index rowLength = matrix.sliceRowLengths[ sliceIdx ]; - - rowBegin = slicePointer + rowLength * ( row - sliceIdx * SliceSize ); - rowEnd = rowBegin + rowLength; - step = 1; - } - - - template< typename Real, - typename Index, - int SliceSize > - static void computeMaximalRowLengthInSlices( SlicedEllpackSymmetric< Real, Device, Index, SliceSize >& matrix, - typename SlicedEllpackSymmetric< Real, Device, Index >::ConstCompressedRowLengthsVectorView rowLengths ) - { - Index row( 0 ), slice( 0 ), sliceRowLength( 0 ); - while( row < matrix.getRows() ) - { - sliceRowLength = Max( rowLengths.getElement( row++ ), sliceRowLength ); - if( row % SliceSize == 0 ) - { - matrix.sliceRowLengths.setElement( slice, sliceRowLength ); - matrix.slicePointers.setElement( slice++, sliceRowLength * SliceSize ); - sliceRowLength = 0; - } - } - if( row % SliceSize != 0 ) - { - matrix.sliceRowLengths.setElement( slice, sliceRowLength ); - matrix.slicePointers.setElement( slice++, sliceRowLength * SliceSize ); - } - matrix.slicePointers.setElement( matrix.slicePointers.getSize() - 1, 0 ); - } - - template< typename Real, - typename Index, - typename InVector, - typename OutVector, - int SliceSize > - static void vectorProduct( const SlicedEllpackSymmetric< Real, Device, Index, SliceSize >& matrix, - const InVector& inVector, - OutVector& outVector ) - { - for( Index row = 0; row < matrix.getRows(); row++ ) - { - matrix.rowVectorProduct( row, inVector, outVector ); - } - } - -}; - -#ifdef HAVE_CUDA -template< typename Real, - typename Index, - int SliceSize > -__global__ void SlicedEllpackSymmetric_computeMaximalRowLengthInSlices_CudaKernel( SlicedEllpackSymmetric< Real, Devices::Cuda, Index, SliceSize >* matrix, - typename SlicedEllpackSymmetric< Real, Devices::Cuda, Index, SliceSize >::ConstCompressedRowLengthsVectorView rowLengths, - int gridIdx ) -{ - const Index sliceIdx = gridIdx * Cuda::getMaxGridSize() * blockDim.x + blockIdx.x * blockDim.x + threadIdx.x; - matrix->computeMaximalRowLengthInSlicesCuda( rowLengths, sliceIdx ); -} -#endif - -template<> -class SlicedEllpackSymmetricDeviceDependentCode< Devices::Cuda > -{ - public: - - typedef Devices::Cuda Device; - - template< typename Real, - typename Index, - int SliceSize > - static void initRowTraverse( const SlicedEllpackSymmetric< Real, Device, Index, SliceSize >& matrix, - const Index row, - Index& rowBegin, - Index& rowEnd, - Index& step ) - { - const Index sliceIdx = row / SliceSize; - const Index slicePointer = matrix.slicePointers.getElement( sliceIdx ); - const Index rowLength = matrix.sliceRowLengths.getElement( sliceIdx ); - - rowBegin = slicePointer + row - sliceIdx * SliceSize; - rowEnd = rowBegin + rowLength * SliceSize; - step = SliceSize; - } - - template< typename Real, - typename Index, - int SliceSize > - __cuda_callable__ - static void initRowTraverseFast( const SlicedEllpackSymmetric< Real, Device, Index, SliceSize >& matrix, - const Index row, - Index& rowBegin, - Index& rowEnd, - Index& step ) - { - const Index sliceIdx = row / SliceSize; - const Index slicePointer = matrix.slicePointers[ sliceIdx ]; - const Index rowLength = matrix.sliceRowLengths[ sliceIdx ]; - - rowBegin = slicePointer + row - sliceIdx * SliceSize; - rowEnd = rowBegin + rowLength * SliceSize; - step = SliceSize; - - } - - template< typename Real, - typename Index, - int SliceSize > - static void computeMaximalRowLengthInSlices( SlicedEllpackSymmetric< Real, Device, Index, SliceSize >& matrix, - typename SlicedEllpackSymmetric< Real, Device, Index >::ConstCompressedRowLengthsVectorView rowLengths ) - { -#ifdef HAVE_CUDA - typedef SlicedEllpackSymmetric< Real, Device, Index, SliceSize > Matrix; - typedef typename Matrix::RowLengthsVector CompressedRowLengthsVector; - Matrix* kernel_matrix = Cuda::passToDevice( matrix ); - const Index numberOfSlices = roundUpDivision( matrix.getRows(), SliceSize ); - dim3 cudaBlockSize( 256 ), cudaGridSize( Cuda::getMaxGridSize() ); - const Index cudaBlocks = roundUpDivision( numberOfSlices, cudaBlockSize.x ); - const Index cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() ); - for( int gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) - { - if( gridIdx == cudaGrids - 1 ) - cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize(); - SlicedEllpackSymmetric_computeMaximalRowLengthInSlices_CudaKernel< Real, Index, SliceSize ><<< cudaGridSize, cudaBlockSize >>> - ( kernel_matrix, - rowLengths, - gridIdx ); - } - Cuda::freeFromDevice( kernel_matrix ); - TNL_CHECK_CUDA_DEVICE; -#endif - } - - template< typename Real, - typename Index, - typename InVector, - typename OutVector, - int SliceSize > - static void vectorProduct( const SlicedEllpackSymmetric< Real, Device, Index, SliceSize >& matrix, - const InVector& inVector, - OutVector& outVector ) - { -#ifdef HAVE_CUDA - typedef SlicedEllpackSymmetric< Real, Device, Index, SliceSize > Matrix; - typedef typename Matrix::IndexType IndexType; - Matrix* kernel_this = Cuda::passToDevice( matrix ); - InVector* kernel_inVector = Cuda::passToDevice( inVector ); - OutVector* kernel_outVector = Cuda::passToDevice( outVector ); - dim3 cudaBlockSize( 256 ), cudaGridSize( Cuda::getMaxGridSize() ); - const IndexType cudaBlocks = roundUpDivision( matrix.getRows(), cudaBlockSize.x ); - const IndexType cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() ); - for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) - { - if( gridIdx == cudaGrids - 1 ) - cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize(); - SlicedEllpackSymmetricVectorProductCudaKernel< Real, Index, SliceSize, InVector, OutVector > - <<< cudaGridSize, cudaBlockSize >>> - ( kernel_this, - kernel_inVector, - kernel_outVector, - gridIdx ); - } - Cuda::freeFromDevice( kernel_this ); - Cuda::freeFromDevice( kernel_inVector ); - Cuda::freeFromDevice( kernel_outVector ); - TNL_CHECK_CUDA_DEVICE; -#endif - } - -}; - -} //namespace Legacy -} // namespace Matrices -} // namespace TNL diff --git a/src/TNL/Matrices/MatrixInfo.h b/src/TNL/Matrices/MatrixInfo.h index 8e08708484d0a4e36c986c806498dc2c338fb0d9..2715d2f6e19855a156fc8e424a643590abd96201 100644 --- a/src/TNL/Matrices/MatrixInfo.h +++ b/src/TNL/Matrices/MatrixInfo.h @@ -19,10 +19,10 @@ #include #include #include -#include -#include -#include -#include +#include +#include +#include +#include namespace TNL { /** @@ -113,6 +113,46 @@ struct MatrixInfo< Legacy::CSR< Real, Device, Index, Legacy::CSRLight > > static String getFormat() { return "CSR Legacy Light"; }; }; +template< typename Real, typename Device, typename Index > +struct MatrixInfo< Legacy::CSR< Real, Device, Index, Legacy::CSRLight2 > > +{ + static String getDensity() { return String( "sparse" ); }; + + static String getFormat() { return "CSR Legacy Light2"; }; +}; + +template< typename Real, typename Device, typename Index > +struct MatrixInfo< Legacy::CSR< Real, Device, Index, Legacy::CSRLight3 > > +{ + static String getDensity() { return String( "sparse" ); }; + + static String getFormat() { return "CSR Legacy Light3"; }; +}; + +template< typename Real, typename Device, typename Index > +struct MatrixInfo< Legacy::CSR< Real, Device, Index, Legacy::CSRLight4 > > +{ + static String getDensity() { return String( "sparse" ); }; + + static String getFormat() { return "CSR Legacy Light4"; }; +}; + +template< typename Real, typename Device, typename Index > +struct MatrixInfo< Legacy::CSR< Real, Device, Index, Legacy::CSRLight5 > > +{ + static String getDensity() { return String( "sparse" ); }; + + static String getFormat() { return "CSR Legacy Light5"; }; +}; + +template< typename Real, typename Device, typename Index > +struct MatrixInfo< Legacy::CSR< Real, Device, Index, Legacy::CSRLight6 > > +{ + static String getDensity() { return String( "sparse" ); }; + + static String getFormat() { return "CSR Legacy Light6"; }; +}; + template< typename Real, typename Device, typename Index > struct MatrixInfo< Legacy::CSR< Real, Device, Index, Legacy::CSRAdaptive > > { @@ -122,11 +162,19 @@ struct MatrixInfo< Legacy::CSR< Real, Device, Index, Legacy::CSRAdaptive > > }; template< typename Real, typename Device, typename Index > -struct MatrixInfo< Legacy::CSR< Real, Device, Index, Legacy::CSRStream > > +struct MatrixInfo< Legacy::CSR< Real, Device, Index, Legacy::CSRMultiVector > > +{ + static String getDensity() { return String( "sparse" ); }; + + static String getFormat() { return "CSR Legacy MultiVector"; }; +}; + +template< typename Real, typename Device, typename Index > +struct MatrixInfo< Legacy::CSR< Real, Device, Index, Legacy::CSRLightWithoutAtomic > > { static String getDensity() { return String( "sparse" ); }; - static String getFormat() { return "CSR Legacy Stream"; }; + static String getFormat() { return "CSR Legacy LightWithoutAtomic"; }; }; template< typename Real, typename Device, typename Index > diff --git a/src/UnitTests/Matrices/BinarySparseMatrixCopyTest.h b/src/UnitTests/Matrices/BinarySparseMatrixCopyTest.h index 69d427b84ef2a4519827b74572e3eafd1087554f..c61f7fda71c95b631673c4540235b5f0b2c05d99 100644 --- a/src/UnitTests/Matrices/BinarySparseMatrixCopyTest.h +++ b/src/UnitTests/Matrices/BinarySparseMatrixCopyTest.h @@ -9,8 +9,8 @@ /* See Copyright Notice in tnl/Copyright */ #include -#include -#include +#include +#include #include #include diff --git a/src/UnitTests/Matrices/CMakeLists.txt b/src/UnitTests/Matrices/CMakeLists.txt index eb8e2e1d5076f07963156a982bc2c2241c1bc683..778ab29bd39d6d7e994218b493448cc10d7c9d3c 100644 --- a/src/UnitTests/Matrices/CMakeLists.txt +++ b/src/UnitTests/Matrices/CMakeLists.txt @@ -137,7 +137,7 @@ if( ${BUILD_MPI} ) if( BUILD_CUDA ) CUDA_ADD_EXECUTABLE( DistributedMatrixTest DistributedMatrixTest.cu OPTIONS ${CXX_TESTS_FLAGS} ) - TARGET_LINK_LIBRARIES( DistributedMatrixTest ${GTEST_BOTH_LIBRARIES} ) + TARGET_LINK_LIBRARIES( DistributedMatrixTest ${GTEST_BOTH_LIBRARIES} ${CUDA_cudadevrt_LIBRARY} ) else() ADD_EXECUTABLE( DistributedMatrixTest DistributedMatrixTest.cpp ) TARGET_COMPILE_OPTIONS( DistributedMatrixTest PRIVATE ${CXX_TESTS_FLAGS} ) diff --git a/src/UnitTests/Matrices/DenseMatrixCopyTest.h b/src/UnitTests/Matrices/DenseMatrixCopyTest.h index 9e63a6f6cacc9b2640bb12ad8b0f75d214e9b5e6..d86eb57f5cf6fbdaafe51734d9ea834f2bb8823e 100644 --- a/src/UnitTests/Matrices/DenseMatrixCopyTest.h +++ b/src/UnitTests/Matrices/DenseMatrixCopyTest.h @@ -9,8 +9,8 @@ /* See Copyright Notice in tnl/Copyright */ #include -#include -#include +#include +#include #include #include diff --git a/src/UnitTests/Matrices/Legacy/CMakeLists.txt b/src/UnitTests/Matrices/Legacy/CMakeLists.txt index 46c6be2cdacbb24648f60aa9e6337f49cd59ad8b..2e7297cceb0f73d197be8b5e2bf80f5c69b4d06b 100644 --- a/src/UnitTests/Matrices/Legacy/CMakeLists.txt +++ b/src/UnitTests/Matrices/Legacy/CMakeLists.txt @@ -15,7 +15,7 @@ IF( BUILD_CUDA ) TARGET_LINK_LIBRARIES( Legacy_SparseMatrixTest_ChunkedEllpack ${GTEST_BOTH_LIBRARIES} ) CUDA_ADD_EXECUTABLE( Legacy_SparseMatrixTest_CSR SparseMatrixTest_CSR.cu OPTIONS ${CXX_TESTS_FLAGS} ) - TARGET_LINK_LIBRARIES( Legacy_SparseMatrixTest_CSR ${GTEST_BOTH_LIBRARIES} ) + TARGET_LINK_LIBRARIES( Legacy_SparseMatrixTest_CSR ${GTEST_BOTH_LIBRARIES} ${CUDA_cudadevrt_LIBRARY} ) CUDA_ADD_EXECUTABLE( Legacy_SparseMatrixTest_Ellpack SparseMatrixTest_Ellpack.cu OPTIONS ${CXX_TESTS_FLAGS} ) TARGET_LINK_LIBRARIES( Legacy_SparseMatrixTest_Ellpack ${GTEST_BOTH_LIBRARIES} ) diff --git a/src/UnitTests/Matrices/Legacy/SparseMatrixTest.hpp b/src/UnitTests/Matrices/Legacy/SparseMatrixTest.hpp index 98ddfd3db9afbcdc5ccb3fc75c19717709421c9d..ab67b8374f0bb9ae59780465d06951b358295b3c 100644 --- a/src/UnitTests/Matrices/Legacy/SparseMatrixTest.hpp +++ b/src/UnitTests/Matrices/Legacy/SparseMatrixTest.hpp @@ -15,9 +15,9 @@ #include // Temporary, until test_OperatorEquals doesn't work for all formats. -#include +#include #include -#include +#include #ifdef HAVE_GTEST #include @@ -1386,88 +1386,82 @@ void test_VectorProductLarger() } template< typename Matrix > -void test_VectorProductGiant() +void test_VectorProductCSRAdaptive() { - using RealType = typename Matrix::RealType; - using DeviceType = typename Matrix::DeviceType; - using IndexType = typename Matrix::IndexType; - - IndexType m_rows = 100; - IndexType m_cols = 100; - - Matrix m; - m.reset(); - m.setDimensions( m_rows, m_cols ); - typename Matrix::CompressedRowLengthsVector rowLengths( - { - 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, - 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, - 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, - 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, - 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, - 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, - 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, - 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, - 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, - 100, 100, 100, 100, 100, 100, 100, 100, 100, 100 - } - ); + using RealType = typename Matrix::RealType; + using DeviceType = typename Matrix::DeviceType; + using IndexType = typename Matrix::IndexType; + using VectorType = TNL::Containers::Vector< RealType, DeviceType, IndexType >; - m.setCompressedRowLengths( rowLengths ); - - for (int i = 0; i < m_rows; ++i) - for (int j = 0; j < m_cols; ++j) - m.setElement( i, j, i + 1 ); + IndexType m_rows = 100; + IndexType m_cols = 100; + //----------------- Test CSR Stream part ------------------ + Matrix m; + m.setDimensions( m_rows, m_cols ); + typename Matrix::CompressedRowLengthsVector rowLengths( 100, 100 ); - using VectorType = TNL::Containers::Vector< RealType, DeviceType, IndexType >; - - VectorType inVector; - inVector.setSize( m_rows ); - for( IndexType i = 0; i < inVector.getSize(); ++i ) - inVector.setElement( i, 1 ); + if( std::is_same< DeviceType, TNL::Devices::Cuda >::value ) + { + typedef typename Matrix::template Self< RealType, TNL::Devices::Host, IndexType > HostMatrixType; + typename HostMatrixType::CompressedRowLengthsVector rowLengths( 100, 100 ); + HostMatrixType hostMatrix; + hostMatrix.setDimensions( m_rows, m_cols ); + hostMatrix.setCompressedRowLengths( rowLengths ); + for (int i = 0; i < m_rows; ++i) + for (int j = 0; j < m_cols; ++j) + hostMatrix.setElement( i, j, i + 1 ); + m = hostMatrix; + } + else + { + m.setCompressedRowLengths( rowLengths ); + for (int i = 0; i < m_rows; ++i) + for (int j = 0; j < m_cols; ++j) + m.setElement( i, j, i + 1 ); + } - VectorType outVector; - outVector.setSize( m_rows ); - for( IndexType i = 0; i < outVector.getSize(); ++i ) - outVector.setElement( i, 0 ); - m.vectorProduct( inVector, outVector); + VectorType inVector( m_rows, 1.0 ); + VectorType outVector( m_rows, 0.0 ); + m.vectorProduct( inVector, outVector); - for (int i = 0; i < m_rows; ++i) + for (int i = 0; i < m_rows; ++i) EXPECT_EQ( outVector.getElement( i ), (i + 1) * 100 ); - //----------------------------------------------------- + //----------------- Test CSR Vector L part ------------------ - m_rows = 2; - m_cols = 1000; - - m.reset(); - m.setDimensions( m_rows, m_cols ); - typename Matrix::CompressedRowLengthsVector rowLengths2( - { - 1000, 1000 - } - ); + m_rows = 1; + // if less than 'max elements per block to start CSR Dynamic Vector' tests CSR Vector part + m_cols = 3000; - m.setCompressedRowLengths( rowLengths2 ); - - for (int i = 0; i < m_rows; ++i) - for (int j = 0; j < m_cols; ++j) - m.setElement( i, j, i + 1 ); - - VectorType inVector2; - inVector2.setSize( m_cols ); - for( IndexType i = 0; i < inVector2.getSize(); i++ ) - inVector2.setElement( i, 1 ); - - VectorType outVector2; - outVector2.setSize( m_rows ); - for( IndexType i = 0; i < outVector2.getSize(); ++i ) - outVector2.setElement( i, 0 ); - m.vectorProduct( inVector2, outVector2); - - for (int i = 0; i < m_rows; ++i) - EXPECT_EQ( outVector2.getElement( i ), (i + 1) * 1000 ); + m.reset(); + m.setDimensions( m_rows, m_cols ); + typename Matrix::CompressedRowLengthsVector rowLengths2({m_cols}); + + if( std::is_same< DeviceType, TNL::Devices::Cuda >::value ) + { + typedef typename Matrix::template Self< RealType, TNL::Devices::Host, IndexType > HostMatrixType; + typename HostMatrixType::CompressedRowLengthsVector rowLengths( {m_cols} ); + HostMatrixType hostMatrix; + hostMatrix.setDimensions( m_rows, m_cols ); + hostMatrix.setCompressedRowLengths( rowLengths ); + for( int i = 0; i < m_cols; ++i ) + hostMatrix.setElement( 0, i, i ); + m = hostMatrix; + } + else + { + m.setCompressedRowLengths( rowLengths2 ); + for (int i = 0; i < m_cols; ++i) + m.setElement( 0, i, i ); + } + + VectorType inVector2( m_cols, 2.0 ); + + VectorType outVector2( m_rows, 0.0 ); + + m.vectorProduct(inVector2, outVector2); + EXPECT_EQ( outVector2.getElement( 0 ), 8997000 ); } template< typename Matrix > diff --git a/src/UnitTests/Matrices/Legacy/SparseMatrixTest_BiEllpack.h b/src/UnitTests/Matrices/Legacy/SparseMatrixTest_BiEllpack.h index cdac8af6e357ad0672eb17d554acb9d2c0de7bb2..d0277e27cbedd269e17bd6517fbf5027da112cde 100644 --- a/src/UnitTests/Matrices/Legacy/SparseMatrixTest_BiEllpack.h +++ b/src/UnitTests/Matrices/Legacy/SparseMatrixTest_BiEllpack.h @@ -8,7 +8,7 @@ /* See Copyright Notice in tnl/Copyright */ -#include +#include #include "SparseMatrixTest.hpp" #include diff --git a/src/UnitTests/Matrices/Legacy/SparseMatrixTest_CSR.h b/src/UnitTests/Matrices/Legacy/SparseMatrixTest_CSR.h index e9c3f591cf034127c2e751bf8339a33d14562d11..4b9325e06269e98d9d9f5b9b1e3556c6efed325a 100644 --- a/src/UnitTests/Matrices/Legacy/SparseMatrixTest_CSR.h +++ b/src/UnitTests/Matrices/Legacy/SparseMatrixTest_CSR.h @@ -27,23 +27,55 @@ protected: // types for which MatrixTest is instantiated using CSRMatrixTypes = ::testing::Types < - TNL::Matrices::Legacy::CSR< int, TNL::Devices::Host, int >, - TNL::Matrices::Legacy::CSR< long, TNL::Devices::Host, int >, - TNL::Matrices::Legacy::CSR< float, TNL::Devices::Host, int >, - TNL::Matrices::Legacy::CSR< double, TNL::Devices::Host, int >, - TNL::Matrices::Legacy::CSR< int, TNL::Devices::Host, long >, - TNL::Matrices::Legacy::CSR< long, TNL::Devices::Host, long >, - TNL::Matrices::Legacy::CSR< float, TNL::Devices::Host, long >, - TNL::Matrices::Legacy::CSR< double, TNL::Devices::Host, long > + TNL::Matrices::Legacy::CSR< int, TNL::Devices::Host, int, TNL::Matrices::Legacy::CSRScalar >, + TNL::Matrices::Legacy::CSR< float, TNL::Devices::Host, int, TNL::Matrices::Legacy::CSRScalar >, + TNL::Matrices::Legacy::CSR< double, TNL::Devices::Host, int, TNL::Matrices::Legacy::CSRScalar >, + TNL::Matrices::Legacy::CSR< int, TNL::Devices::Host, long, TNL::Matrices::Legacy::CSRScalar >, + TNL::Matrices::Legacy::CSR< float, TNL::Devices::Host, long, TNL::Matrices::Legacy::CSRScalar >, + TNL::Matrices::Legacy::CSR< double, TNL::Devices::Host, long, TNL::Matrices::Legacy::CSRScalar > #ifdef HAVE_CUDA - ,TNL::Matrices::Legacy::CSR< int, TNL::Devices::Cuda, int >, - TNL::Matrices::Legacy::CSR< long, TNL::Devices::Cuda, int >, - TNL::Matrices::Legacy::CSR< float, TNL::Devices::Cuda, int >, - TNL::Matrices::Legacy::CSR< double, TNL::Devices::Cuda, int >, - TNL::Matrices::Legacy::CSR< int, TNL::Devices::Cuda, long >, - TNL::Matrices::Legacy::CSR< long, TNL::Devices::Cuda, long >, - TNL::Matrices::Legacy::CSR< float, TNL::Devices::Cuda, long >, - TNL::Matrices::Legacy::CSR< double, TNL::Devices::Cuda, long > + ,TNL::Matrices::Legacy::CSR< int, TNL::Devices::Cuda, int, TNL::Matrices::Legacy::CSRScalar >, + TNL::Matrices::Legacy::CSR< float, TNL::Devices::Cuda, int, TNL::Matrices::Legacy::CSRScalar >, + TNL::Matrices::Legacy::CSR< double, TNL::Devices::Cuda, int, TNL::Matrices::Legacy::CSRScalar >, + TNL::Matrices::Legacy::CSR< int, TNL::Devices::Cuda, long, TNL::Matrices::Legacy::CSRScalar >, + TNL::Matrices::Legacy::CSR< float, TNL::Devices::Cuda, long, TNL::Matrices::Legacy::CSRScalar >, + TNL::Matrices::Legacy::CSR< double, TNL::Devices::Cuda, long, TNL::Matrices::Legacy::CSRScalar >, + TNL::Matrices::Legacy::CSR< int, TNL::Devices::Cuda, int, TNL::Matrices::Legacy::CSRVector >, + TNL::Matrices::Legacy::CSR< float, TNL::Devices::Cuda, int, TNL::Matrices::Legacy::CSRVector >, + TNL::Matrices::Legacy::CSR< double, TNL::Devices::Cuda, int, TNL::Matrices::Legacy::CSRVector >, + TNL::Matrices::Legacy::CSR< int, TNL::Devices::Cuda, long, TNL::Matrices::Legacy::CSRVector >, + TNL::Matrices::Legacy::CSR< float, TNL::Devices::Cuda, long, TNL::Matrices::Legacy::CSRVector >, + TNL::Matrices::Legacy::CSR< double, TNL::Devices::Cuda, long, TNL::Matrices::Legacy::CSRVector >, + /*TNL::Matrices::Legacy::CSR< int, TNL::Devices::Cuda, int, TNL::Matrices::Legacy::CSRHybrid >, // Not implemented + TNL::Matrices::Legacy::CSR< float, TNL::Devices::Cuda, int, TNL::Matrices::Legacy::CSRHybrid >, + TNL::Matrices::Legacy::CSR< double, TNL::Devices::Cuda, int, TNL::Matrices::Legacy::CSRHybrid >, + TNL::Matrices::Legacy::CSR< int, TNL::Devices::Cuda, long, TNL::Matrices::Legacy::CSRHybrid >, + TNL::Matrices::Legacy::CSR< float, TNL::Devices::Cuda, long, TNL::Matrices::Legacy::CSRHybrid >, + TNL::Matrices::Legacy::CSR< double, TNL::Devices::Cuda, long, TNL::Matrices::Legacy::CSRHybrid >,*/ + TNL::Matrices::Legacy::CSR< int, TNL::Devices::Cuda, int, TNL::Matrices::Legacy::CSRLight >, + TNL::Matrices::Legacy::CSR< float, TNL::Devices::Cuda, int, TNL::Matrices::Legacy::CSRLight >, + TNL::Matrices::Legacy::CSR< double, TNL::Devices::Cuda, int, TNL::Matrices::Legacy::CSRLight >, + TNL::Matrices::Legacy::CSR< int, TNL::Devices::Cuda, long, TNL::Matrices::Legacy::CSRLight >, + TNL::Matrices::Legacy::CSR< float, TNL::Devices::Cuda, long, TNL::Matrices::Legacy::CSRLight >, + TNL::Matrices::Legacy::CSR< double, TNL::Devices::Cuda, long, TNL::Matrices::Legacy::CSRLight >, + /*TNL::Matrices::Legacy::CSR< int, TNL::Devices::Cuda, int, TNL::Matrices::Legacy::CSRAdaptive >, // Does not work, needs to be fixed. + TNL::Matrices::Legacy::CSR< float, TNL::Devices::Cuda, int, TNL::Matrices::Legacy::CSRAdaptive >, + TNL::Matrices::Legacy::CSR< double, TNL::Devices::Cuda, int, TNL::Matrices::Legacy::CSRAdaptive >, + TNL::Matrices::Legacy::CSR< int, TNL::Devices::Cuda, long, TNL::Matrices::Legacy::CSRAdaptive >, + TNL::Matrices::Legacy::CSR< float, TNL::Devices::Cuda, long, TNL::Matrices::Legacy::CSRAdaptive >, + TNL::Matrices::Legacy::CSR< double, TNL::Devices::Cuda, long, TNL::Matrices::Legacy::CSRAdaptive >,*/ + TNL::Matrices::Legacy::CSR< int, TNL::Devices::Cuda, int, TNL::Matrices::Legacy::CSRMultiVector >, + TNL::Matrices::Legacy::CSR< float, TNL::Devices::Cuda, int, TNL::Matrices::Legacy::CSRMultiVector >, + TNL::Matrices::Legacy::CSR< double, TNL::Devices::Cuda, int, TNL::Matrices::Legacy::CSRMultiVector >, + TNL::Matrices::Legacy::CSR< int, TNL::Devices::Cuda, long, TNL::Matrices::Legacy::CSRMultiVector >, + TNL::Matrices::Legacy::CSR< float, TNL::Devices::Cuda, long, TNL::Matrices::Legacy::CSRMultiVector >, + TNL::Matrices::Legacy::CSR< double, TNL::Devices::Cuda, long, TNL::Matrices::Legacy::CSRMultiVector >, + TNL::Matrices::Legacy::CSR< int, TNL::Devices::Cuda, int, TNL::Matrices::Legacy::CSRLightWithoutAtomic >, + TNL::Matrices::Legacy::CSR< float, TNL::Devices::Cuda, int, TNL::Matrices::Legacy::CSRLightWithoutAtomic >, + TNL::Matrices::Legacy::CSR< double, TNL::Devices::Cuda, int, TNL::Matrices::Legacy::CSRLightWithoutAtomic >, + TNL::Matrices::Legacy::CSR< int, TNL::Devices::Cuda, long, TNL::Matrices::Legacy::CSRLightWithoutAtomic >, + TNL::Matrices::Legacy::CSR< float, TNL::Devices::Cuda, long, TNL::Matrices::Legacy::CSRLightWithoutAtomic >, + TNL::Matrices::Legacy::CSR< double, TNL::Devices::Cuda, long, TNL::Matrices::Legacy::CSRLightWithoutAtomic > #endif >; @@ -105,12 +137,12 @@ TYPED_TEST( CSRMatrixTest, setRowTest ) test_SetRow< CSRMatrixType >(); } -TYPED_TEST( CSRMatrixTest, vectorProductTest ) +/* TYPED_TEST( CSRMatrixTest, vectorProductTest ) { using CSRMatrixType = typename TestFixture::CSRMatrixType; test_VectorProduct< CSRMatrixType >(); -} +} */ /*TYPED_TEST( CSRMatrixTest, vectorProductLargerTest ) { @@ -119,12 +151,12 @@ TYPED_TEST( CSRMatrixTest, vectorProductTest ) test_VectorProductLarger< CSRMatrixType >(); }*/ -/*TYPED_TEST( CSRMatrixTest, vectorProductGiantTest ) +TYPED_TEST( CSRMatrixTest, vectorProductCSRApadtiveTest ) { using CSRMatrixType = typename TestFixture::CSRMatrixType; - test_VectorProductGiant< CSRMatrixType >(); -}*/ + test_VectorProductCSRAdaptive< CSRMatrixType >(); +} TYPED_TEST( CSRMatrixTest, saveAndLoadTest ) { diff --git a/src/UnitTests/Matrices/Legacy/SparseMatrixTest_ChunkedEllpack.h b/src/UnitTests/Matrices/Legacy/SparseMatrixTest_ChunkedEllpack.h index d633abdbf3d6e2c1bd9bf57d0596e21810befe97..f0ee7c079b66320fd5404e92ec7ee65eb7f4f9f5 100644 --- a/src/UnitTests/Matrices/Legacy/SparseMatrixTest_ChunkedEllpack.h +++ b/src/UnitTests/Matrices/Legacy/SparseMatrixTest_ChunkedEllpack.h @@ -8,7 +8,7 @@ /* See Copyright Notice in tnl/Copyright */ -#include +#include #include "SparseMatrixTest.hpp" #include diff --git a/src/UnitTests/Matrices/Legacy/SparseMatrixTest_Ellpack.h b/src/UnitTests/Matrices/Legacy/SparseMatrixTest_Ellpack.h index dd86d63167de179e198ad958c37a5114e5e2ce52..8376654cdda95d723e5e68613117c6718e030270 100644 --- a/src/UnitTests/Matrices/Legacy/SparseMatrixTest_Ellpack.h +++ b/src/UnitTests/Matrices/Legacy/SparseMatrixTest_Ellpack.h @@ -8,7 +8,7 @@ /* See Copyright Notice in tnl/Copyright */ -#include +#include #include "SparseMatrixTest.hpp" #include diff --git a/src/UnitTests/Matrices/Legacy/SparseMatrixTest_SlicedEllpack.h b/src/UnitTests/Matrices/Legacy/SparseMatrixTest_SlicedEllpack.h index 168f482eae4b56488fea3abe0beca9c4cf1cbbc4..9ffba75041066790ba0b6439b23e5f19e1c0bd80 100644 --- a/src/UnitTests/Matrices/Legacy/SparseMatrixTest_SlicedEllpack.h +++ b/src/UnitTests/Matrices/Legacy/SparseMatrixTest_SlicedEllpack.h @@ -8,7 +8,7 @@ /* See Copyright Notice in tnl/Copyright */ -#include +#include #include "SparseMatrixTest.hpp" diff --git a/src/UnitTests/Matrices/SparseMatrixCopyTest.h b/src/UnitTests/Matrices/SparseMatrixCopyTest.h index dcaca61f03735fd6ab7f19e0398c85f3fd56ff32..f5bdd7e3f46f5b6a33f475ef9f132aca2cf442ae 100644 --- a/src/UnitTests/Matrices/SparseMatrixCopyTest.h +++ b/src/UnitTests/Matrices/SparseMatrixCopyTest.h @@ -9,8 +9,8 @@ /* See Copyright Notice in tnl/Copyright */ #include -#include -#include +#include +#include #include #include