diff --git a/Documentation/Tutorials/Matrices/CMakeLists.txt b/Documentation/Tutorials/Matrices/CMakeLists.txt index 0d672aa0b04f51eeda133f5b785568632dfe8b0e..94e57ec13df05c0368087e158f0d2bf199c84e27 100644 --- a/Documentation/Tutorials/Matrices/CMakeLists.txt +++ b/Documentation/Tutorials/Matrices/CMakeLists.txt @@ -104,9 +104,9 @@ ELSE() #### # THe following examples/benchmarks run for very long time - ADD_EXECUTABLE( DenseMatrixSetup_Benchmark DenseMatrixSetup_Benchmark_cuda.cpp ) - ADD_EXECUTABLE( SparseMatrixSetup_Benchmark SparseMatrixSetup_Benchmark_cuda.cpp ) - ADD_EXECUTABLE( MultidiagonalMatrixSetup_Benchmark MultidiagonalMatrixSetup_Benchmark_cuda.cpp ) + ADD_EXECUTABLE( DenseMatrixSetup_Benchmark DenseMatrixSetup_Benchmark.cpp ) + ADD_EXECUTABLE( SparseMatrixSetup_Benchmark SparseMatrixSetup_Benchmark.cpp ) + ADD_EXECUTABLE( MultidiagonalMatrixSetup_Benchmark MultidiagonalMatrixSetup_Benchmark.cpp ) ENDIF() IF( BUILD_CUDA ) diff --git a/Documentation/Tutorials/Matrices/SparseMatrixSetup_Benchmark.cpp b/Documentation/Tutorials/Matrices/SparseMatrixSetup_Benchmark.cpp index c53a8f5b4a9edc06b499448dcec71e91b4428529..a36e17e7b979aa859a0a319283ad06310c5f6eb8 100644 --- a/Documentation/Tutorials/Matrices/SparseMatrixSetup_Benchmark.cpp +++ b/Documentation/Tutorials/Matrices/SparseMatrixSetup_Benchmark.cpp @@ -69,7 +69,7 @@ template< typename Matrix > void setElement_on_host_and_transfer( const int gridSize, Matrix& matrix ) { using RealType = typename Matrix::RealType; - using HostMatrix = typename Matrix::Self< RealType, TNL::Devices::Host >; + using HostMatrix = typename Matrix::template Self< RealType, TNL::Devices::Host >; const int matrixSize = gridSize * gridSize; TNL::Containers::Vector< int, typename HostMatrix::DeviceType, int > rowCapacities( matrixSize, 5 ); diff --git a/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h b/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h index 3acfb2438c33539594cb3de6aa8f4cc429d21b06..3f64bf33d59eaf271a3ca4b84c59de828a1982c6 100644 --- a/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h +++ b/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h @@ -479,7 +479,7 @@ struct LinearSolversBenchmark DeviceType, IndexType, TNL::Matrices::GeneralMatrix, - Algorithms::Segments::CSR + Algorithms::Segments::CSRDefault >; SharedPointer< CSR > matrixCopy; Matrices::copySparseMatrix( *matrixCopy, *matrixPointer ); diff --git a/src/Benchmarks/SpMV/spmv-legacy.h b/src/Benchmarks/SpMV/spmv-legacy.h index ec0fd001860959efa0492e3a4c8497948ab5c010..fed37410cf4e004deec7b5a7a8ac6cb2b04ee1d7 100644 --- a/src/Benchmarks/SpMV/spmv-legacy.h +++ b/src/Benchmarks/SpMV/spmv-legacy.h @@ -49,7 +49,16 @@ using SlicedEllpackAlias = Matrices::Legacy::SlicedEllpack< Real, Device, Index // Segments based sparse matrix aliases template< typename Real, typename Device, typename Index > -using SparseMatrix_CSR = Matrices::SparseMatrix< Real, Device, Index, Matrices::GeneralMatrix, Algorithms::Segments::CSR >; +using SparseMatrix_CSR_Scalar = Matrices::SparseMatrix< Real, Device, Index, Matrices::GeneralMatrix, Algorithms::Segments::CSRScalar >; + +template< typename Real, typename Device, typename Index > +using SparseMatrix_CSR_Vector = Matrices::SparseMatrix< Real, Device, Index, Matrices::GeneralMatrix, Algorithms::Segments::CSRVector >; + +template< typename Real, typename Device, typename Index > +using SparseMatrix_CSR_Hybrid = Matrices::SparseMatrix< Real, Device, Index, Matrices::GeneralMatrix, Algorithms::Segments::CSRHybrid >; + +template< typename Real, typename Device, typename Index > +using SparseMatrix_CSR_Adaptive = Matrices::SparseMatrix< Real, Device, Index, Matrices::GeneralMatrix, Algorithms::Segments::CSRAdaptive >; template< typename Device, typename Index, typename IndexAllocator > using EllpackSegments = Algorithms::Segments::Ellpack< Device, Index, IndexAllocator >; @@ -309,26 +318,29 @@ benchmarkSpmvSynthetic( Benchmark& benchmark, benchmark.time< Devices::Cuda >( resetCusparseVectors, "GPU", spmvCusparse, cusparseBenchmarkResults ); #endif - benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Scalar >( benchmark, hostOutVector, inputFileName, verboseMR ); - benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Vector >( benchmark, hostOutVector, inputFileName, verboseMR ); - benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Light >( benchmark, hostOutVector, inputFileName, verboseMR ); - benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Light2 >( benchmark, hostOutVector, inputFileName, verboseMR ); - benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Light3 >( benchmark, hostOutVector, inputFileName, verboseMR ); - benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Light4 >( benchmark, hostOutVector, inputFileName, verboseMR ); - benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Light5 >( benchmark, hostOutVector, inputFileName, verboseMR ); - benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Light6 >( benchmark, hostOutVector, inputFileName, verboseMR ); - benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Adaptive >( benchmark, hostOutVector, inputFileName, verboseMR ); - benchmarkSpMV< Real, SparseMatrixLegacy_CSR_MultiVector>( benchmark, hostOutVector, inputFileName, verboseMR ); - benchmarkSpMV< Real, SparseMatrixLegacy_CSR_LightWithoutAtomic>( benchmark, hostOutVector, inputFileName, verboseMR ); - benchmarkSpMV< Real, SparseMatrix_CSR >( benchmark, hostOutVector, inputFileName, verboseMR ); - benchmarkSpMV< Real, Matrices::Legacy::Ellpack >( benchmark, hostOutVector, inputFileName, verboseMR ); - benchmarkSpMV< Real, SparseMatrix_Ellpack >( benchmark, hostOutVector, inputFileName, verboseMR ); - benchmarkSpMV< Real, SlicedEllpackAlias >( benchmark, hostOutVector, inputFileName, verboseMR ); - benchmarkSpMV< Real, SparseMatrix_SlicedEllpack >( benchmark, hostOutVector, inputFileName, verboseMR ); - benchmarkSpMV< Real, Matrices::Legacy::ChunkedEllpack >( benchmark, hostOutVector, inputFileName, verboseMR ); - benchmarkSpMV< Real, SparseMatrix_ChunkedEllpack >( benchmark, hostOutVector, inputFileName, verboseMR ); - benchmarkSpMV< Real, Matrices::Legacy::BiEllpack >( benchmark, hostOutVector, inputFileName, verboseMR ); - benchmarkSpMV< Real, SparseMatrix_BiEllpack >( benchmark, hostOutVector, inputFileName, verboseMR ); + benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Scalar >( benchmark, hostOutVector, inputFileName, verboseMR ); + benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Vector >( benchmark, hostOutVector, inputFileName, verboseMR ); + benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Light >( benchmark, hostOutVector, inputFileName, verboseMR ); + benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Light2 >( benchmark, hostOutVector, inputFileName, verboseMR ); + benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Light3 >( benchmark, hostOutVector, inputFileName, verboseMR ); + benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Light4 >( benchmark, hostOutVector, inputFileName, verboseMR ); + benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Light5 >( benchmark, hostOutVector, inputFileName, verboseMR ); + benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Light6 >( benchmark, hostOutVector, inputFileName, verboseMR ); + benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Adaptive >( benchmark, hostOutVector, inputFileName, verboseMR ); + benchmarkSpMV< Real, SparseMatrixLegacy_CSR_MultiVector >( benchmark, hostOutVector, inputFileName, verboseMR ); + benchmarkSpMV< Real, SparseMatrixLegacy_CSR_LightWithoutAtomic >( benchmark, hostOutVector, inputFileName, verboseMR ); + benchmarkSpMV< Real, SparseMatrix_CSR_Scalar >( benchmark, hostOutVector, inputFileName, verboseMR ); + benchmarkSpMV< Real, SparseMatrix_CSR_Vector >( benchmark, hostOutVector, inputFileName, verboseMR ); + benchmarkSpMV< Real, SparseMatrix_CSR_Hybrid >( benchmark, hostOutVector, inputFileName, verboseMR ); + benchmarkSpMV< Real, SparseMatrix_CSR_Adaptive >( benchmark, hostOutVector, inputFileName, verboseMR ); + benchmarkSpMV< Real, Matrices::Legacy::Ellpack >( benchmark, hostOutVector, inputFileName, verboseMR ); + benchmarkSpMV< Real, SparseMatrix_Ellpack >( benchmark, hostOutVector, inputFileName, verboseMR ); + benchmarkSpMV< Real, SlicedEllpackAlias >( benchmark, hostOutVector, inputFileName, verboseMR ); + benchmarkSpMV< Real, SparseMatrix_SlicedEllpack >( benchmark, hostOutVector, inputFileName, verboseMR ); + benchmarkSpMV< Real, Matrices::Legacy::ChunkedEllpack >( benchmark, hostOutVector, inputFileName, verboseMR ); + benchmarkSpMV< Real, SparseMatrix_ChunkedEllpack >( benchmark, hostOutVector, inputFileName, verboseMR ); + benchmarkSpMV< Real, Matrices::Legacy::BiEllpack >( benchmark, hostOutVector, inputFileName, verboseMR ); + benchmarkSpMV< Real, SparseMatrix_BiEllpack >( benchmark, hostOutVector, inputFileName, verboseMR ); /* AdEllpack is broken benchmarkSpMV< Real, Matrices::AdEllpack >( benchmark, hostOutVector, inputFileName, verboseMR ); */ diff --git a/src/Benchmarks/SpMV/tnl-benchmark-spmv.h b/src/Benchmarks/SpMV/tnl-benchmark-spmv.h index 82e1f12cde656caf38f45bafa09f8dd38028f126..7897073d96152552a1150c5b94ed0c60ec45d987 100644 --- a/src/Benchmarks/SpMV/tnl-benchmark-spmv.h +++ b/src/Benchmarks/SpMV/tnl-benchmark-spmv.h @@ -63,7 +63,6 @@ std::string getCurrDateTime() timeinfo = localtime( &rawtime ); strftime( buffer, sizeof( buffer ), "%d-%m-%Y--%H:%M:%S", timeinfo ); std::string curr_date_time( buffer ); - return curr_date_time; } @@ -133,8 +132,7 @@ main( int argc, char* argv[] ) // prepare global metadata Benchmark::MetadataMap metadata = getHardwareMetadata(); - - + // Initiate setup of benchmarks if( precision == "all" || precision == "float" ) runSpMVBenchmarks< float >( benchmark, metadata, inputFileName, verboseMR ); diff --git a/src/TNL/Algorithms/Segments/CSR.h b/src/TNL/Algorithms/Segments/CSR.h index 9d2b84b618f835e1578b5441fbfe9cbbdddd8033..3a04e80fd098a3990bebb396337396929c4d2cd0 100644 --- a/src/TNL/Algorithms/Segments/CSR.h +++ b/src/TNL/Algorithms/Segments/CSR.h @@ -22,6 +22,7 @@ namespace TNL { template< typename Device, typename Index, + typename Kernel = CSRKernelScalar< Index, Device >, typename IndexAllocator = typename Allocators::Default< Device >::template Allocator< Index > > class CSR { @@ -29,12 +30,13 @@ class CSR using DeviceType = Device; using IndexType = std::remove_const_t< Index >; + using KernelType = Kernel; using OffsetsHolder = Containers::Vector< Index, DeviceType, IndexType, IndexAllocator >; using SegmentsSizes = OffsetsHolder; template< typename Device_, typename Index_ > - using ViewTemplate = CSRView< Device_, Index_ >; - using ViewType = CSRView< Device, Index >; - using ConstViewType = CSRView< Device, std::add_const_t< IndexType > >; + using ViewTemplate = CSRView< Device_, Index_, KernelType >; + using ViewType = CSRView< Device, Index, KernelType >; + using ConstViewType = CSRView< Device, std::add_const_t< IndexType >, KernelType >; using SegmentViewType = SegmentView< IndexType, RowMajorOrder >; CSR(); @@ -114,8 +116,8 @@ class CSR CSR& operator=( const CSR& rhsSegments ) = default; - template< typename Device_, typename Index_, typename IndexAllocator_ > - CSR& operator=( const CSR< Device_, Index_, IndexAllocator_ >& source ); + template< typename Device_, typename Index_, typename Kernel_, typename IndexAllocator_ > + CSR& operator=( const CSR< Device_, Index_, Kernel_, IndexAllocator_ >& source ); void save( File& file ) const; @@ -124,7 +126,36 @@ class CSR protected: OffsetsHolder offsets; + + KernelType kernel; }; + +template< typename Device, + typename Index, + typename IndexAllocator = typename Allocators::Default< Device >::template Allocator< Index > > +using CSRScalar = CSR< Device, Index, CSRKernelScalar< Index, Device >, IndexAllocator >; + +template< typename Device, + typename Index, + typename IndexAllocator = typename Allocators::Default< Device >::template Allocator< Index > > +using CSRVector = CSR< Device, Index, CSRKernelVector< Index, Device >, IndexAllocator >; + +template< typename Device, + typename Index, + typename IndexAllocator = typename Allocators::Default< Device >::template Allocator< Index > > +using CSRHybrid = CSR< Device, Index, CSRKernelHybrid< Index, Device >, IndexAllocator >; + +template< typename Device, + typename Index, + typename IndexAllocator = typename Allocators::Default< Device >::template Allocator< Index > > +using CSRAdaptive = CSR< Device, Index, CSRKernelAdaptive< Index, Device >, IndexAllocator >; + +template< typename Device, + typename Index, + typename IndexAllocator = typename Allocators::Default< Device >::template Allocator< Index > > +using CSRDefault = CSRScalar< Device, Index, IndexAllocator >; + + } // namespace Segments } // namespace Algorithms } // namespace TNL diff --git a/src/TNL/Algorithms/Segments/CSR.hpp b/src/TNL/Algorithms/Segments/CSR.hpp index a6b915db343ba840e5396796531f4af36ae67071..d6a177f3be5206301b388575346a95bb04d76393 100644 --- a/src/TNL/Algorithms/Segments/CSR.hpp +++ b/src/TNL/Algorithms/Segments/CSR.hpp @@ -22,16 +22,18 @@ namespace TNL { template< typename Device, typename Index, + typename Kernel, typename IndexAllocator > -CSR< Device, Index, IndexAllocator >:: +CSR< Device, Index, Kernel, IndexAllocator >:: CSR() { } template< typename Device, typename Index, + typename Kernel, typename IndexAllocator > -CSR< Device, Index, IndexAllocator >:: +CSR< Device, Index, Kernel, IndexAllocator >:: CSR( const SegmentsSizes& segmentsSizes ) { this->setSegmentsSizes( segmentsSizes ); @@ -39,36 +41,42 @@ CSR( const SegmentsSizes& segmentsSizes ) template< typename Device, typename Index, + typename Kernel, typename IndexAllocator > -CSR< Device, Index, IndexAllocator >:: -CSR( const CSR& csr ) : offsets( csr.offsets ) +CSR< Device, Index, Kernel, IndexAllocator >:: +CSR( const CSR& csr ) : offsets( csr.offsets ), kernel( csr.kernel ) { } template< typename Device, typename Index, + typename Kernel, typename IndexAllocator > -CSR< Device, Index, IndexAllocator >:: -CSR( const CSR&& csr ) : offsets( std::move( csr.offsets ) ) +CSR< Device, Index, Kernel, IndexAllocator >:: +CSR( const CSR&& csr ) : offsets( std::move( csr.offsets ) ), kernel( std::move( csr.kernel ) ) { } template< typename Device, typename Index, + typename Kernel, typename IndexAllocator > String -CSR< Device, Index, IndexAllocator >:: +CSR< Device, Index, Kernel, IndexAllocator >:: getSerializationType() { - return "CSR< [any_device], " + TNL::getSerializationType< IndexType >() + " >"; + return "CSR< [any_device], " + + TNL::getSerializationType< IndexType >() + + TNL::getSerializationType< KernelType >() + " >"; } template< typename Device, typename Index, + typename Kernel, typename IndexAllocator > String -CSR< Device, Index, IndexAllocator >:: +CSR< Device, Index, Kernel, IndexAllocator >:: getSegmentsType() { return ViewType::getSegmentsType(); @@ -76,51 +84,58 @@ getSegmentsType() template< typename Device, typename Index, + typename Kernel, typename IndexAllocator > template< typename SizesHolder > void -CSR< Device, Index, IndexAllocator >:: +CSR< Device, Index, Kernel, IndexAllocator >:: setSegmentsSizes( const SizesHolder& sizes ) { details::CSR< Device, Index >::setSegmentsSizes( sizes, this->offsets ); + this->kernel.init( this->offsets ); } template< typename Device, typename Index, + typename Kernel, typename IndexAllocator > void -CSR< Device, Index, IndexAllocator >:: +CSR< Device, Index, Kernel, IndexAllocator >:: reset() { this->offsets.setSize( 1 ); this->offsets = 0; + this->kernel.reset(); } template< typename Device, typename Index, + typename Kernel, typename IndexAllocator > -typename CSR< Device, Index, IndexAllocator >::ViewType -CSR< Device, Index, IndexAllocator >:: +typename CSR< Device, Index, Kernel, IndexAllocator >::ViewType +CSR< Device, Index, Kernel, IndexAllocator >:: getView() { - return ViewType( this->offsets.getView() ); + return ViewType( this->offsets.getView(), this->kernel.getView() ); } template< typename Device, typename Index, + typename Kernel, typename IndexAllocator > auto -CSR< Device, Index, IndexAllocator >:: +CSR< Device, Index, Kernel, IndexAllocator >:: getConstView() const -> const ConstViewType { - return ConstViewType( this->offsets.getConstView() ); + return ConstViewType( this->offsets.getConstView(), this->kernel.getConstView() ); } template< typename Device, typename Index, + typename Kernel, typename IndexAllocator > -__cuda_callable__ auto CSR< Device, Index, IndexAllocator >:: +__cuda_callable__ auto CSR< Device, Index, Kernel, IndexAllocator >:: getSegmentsCount() const -> IndexType { return this->offsets.getSize() - 1; @@ -128,8 +143,9 @@ getSegmentsCount() const -> IndexType template< typename Device, typename Index, + typename Kernel, typename IndexAllocator > -__cuda_callable__ auto CSR< Device, Index, IndexAllocator >:: +__cuda_callable__ auto CSR< Device, Index, Kernel, IndexAllocator >:: getSegmentSize( const IndexType segmentIdx ) const -> IndexType { return details::CSR< Device, Index >::getSegmentSize( this->offsets, segmentIdx ); @@ -137,8 +153,9 @@ getSegmentSize( const IndexType segmentIdx ) const -> IndexType template< typename Device, typename Index, + typename Kernel, typename IndexAllocator > -__cuda_callable__ auto CSR< Device, Index, IndexAllocator >:: +__cuda_callable__ auto CSR< Device, Index, Kernel, IndexAllocator >:: getSize() const -> IndexType { return this->getStorageSize(); @@ -146,8 +163,9 @@ getSize() const -> IndexType template< typename Device, typename Index, + typename Kernel, typename IndexAllocator > -__cuda_callable__ auto CSR< Device, Index, IndexAllocator >:: +__cuda_callable__ auto CSR< Device, Index, Kernel, IndexAllocator >:: getStorageSize() const -> IndexType { return details::CSR< Device, Index >::getStorageSize( this->offsets ); @@ -155,8 +173,9 @@ getStorageSize() const -> IndexType template< typename Device, typename Index, + typename Kernel, typename IndexAllocator > -__cuda_callable__ auto CSR< Device, Index, IndexAllocator >:: +__cuda_callable__ auto CSR< Device, Index, Kernel, IndexAllocator >:: getGlobalIndex( const Index segmentIdx, const Index localIdx ) const -> IndexType { if( ! std::is_same< DeviceType, Devices::Host >::value ) @@ -172,10 +191,11 @@ getGlobalIndex( const Index segmentIdx, const Index localIdx ) const -> IndexTyp template< typename Device, typename Index, + typename Kernel, typename IndexAllocator > __cuda_callable__ auto -CSR< Device, Index, IndexAllocator >:: +CSR< Device, Index, Kernel, IndexAllocator >:: getSegmentView( const IndexType segmentIdx ) const -> SegmentViewType { return SegmentViewType( offsets[ segmentIdx ], offsets[ segmentIdx + 1 ] - offsets[ segmentIdx ] ); @@ -183,10 +203,11 @@ getSegmentView( const IndexType segmentIdx ) const -> SegmentViewType template< typename Device, typename Index, + typename Kernel, typename IndexAllocator > template< typename Function, typename... Args > void -CSR< Device, Index, IndexAllocator >:: +CSR< Device, Index, Kernel, IndexAllocator >:: forSegments( IndexType first, IndexType last, Function& f, Args... args ) const { this->getConstView().forSegments( first, last, f, args... ); @@ -194,10 +215,11 @@ forSegments( IndexType first, IndexType last, Function& f, Args... args ) const template< typename Device, typename Index, + typename Kernel, typename IndexAllocator> template< typename Function, typename... Args > void -CSR< Device, Index, IndexAllocator >:: +CSR< Device, Index, Kernel, IndexAllocator >:: forAll( Function& f, Args... args ) const { this->forSegments( 0, this->getSegmentsCount(), f, args... ); @@ -205,10 +227,11 @@ forAll( Function& f, Args... args ) const template< typename Device, typename Index, + typename Kernel, typename IndexAllocator > template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args > void -CSR< Device, Index, IndexAllocator >:: +CSR< Device, Index, Kernel, IndexAllocator >:: segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const { this->getConstView().segmentsReduction( first, last, fetch, reduction, keeper, zero, args... ); @@ -216,10 +239,11 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reductio template< typename Device, typename Index, + typename Kernel, typename IndexAllocator > template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args > void -CSR< Device, Index, IndexAllocator >:: +CSR< Device, Index, Kernel, IndexAllocator >:: allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const { this->segmentsReduction( 0, this->getSegmentsCount(), fetch, reduction, keeper, zero, args... ); @@ -227,21 +251,24 @@ allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, co template< typename Device, typename Index, + typename Kernel, typename IndexAllocator > - template< typename Device_, typename Index_, typename IndexAllocator_ > -CSR< Device, Index, IndexAllocator >& -CSR< Device, Index, IndexAllocator >:: -operator=( const CSR< Device_, Index_, IndexAllocator_ >& source ) + template< typename Device_, typename Index_, typename Kernel_, typename IndexAllocator_ > +CSR< Device, Index, Kernel, IndexAllocator >& +CSR< Device, Index, Kernel, IndexAllocator >:: +operator=( const CSR< Device_, Index_, Kernel_, IndexAllocator_ >& source ) { this->offsets = source.offsets; + this->kernel = kernel; return *this; } template< typename Device, typename Index, + typename Kernel, typename IndexAllocator > void -CSR< Device, Index, IndexAllocator >:: +CSR< Device, Index, Kernel, IndexAllocator >:: save( File& file ) const { file << this->offsets; @@ -249,12 +276,14 @@ save( File& file ) const template< typename Device, typename Index, + typename Kernel, typename IndexAllocator > -void -CSR< Device, Index, IndexAllocator >:: +void +CSR< Device, Index, Kernel, IndexAllocator >:: load( File& file ) { file >> this->offsets; + this->kernel.init( this->offsets ); } } // namespace Segments diff --git a/src/TNL/Algorithms/Segments/CSRKernelAdaptive.h b/src/TNL/Algorithms/Segments/CSRKernelAdaptive.h new file mode 100644 index 0000000000000000000000000000000000000000..feed58a58792c0ba01c4c78040758fd21b9f5eed --- /dev/null +++ b/src/TNL/Algorithms/Segments/CSRKernelAdaptive.h @@ -0,0 +1,487 @@ +/*************************************************************************** + CSRKernels.h - description + ------------------- + begin : Jan 20, 2021 -> Joe Biden inauguration + copyright : (C) 2021 by Tomas Oberhuber + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +namespace TNL { + namespace Algorithms { + namespace Segments { + +#ifdef HAVE_CUDA + +template< int CudaBlockSize, + int warpSize, + int WARPS, + int SHARED_PER_WARP, + int MAX_ELEM_PER_WARP, + typename BlocksView, + typename Offsets, + typename Index, + typename Fetch, + typename Reduction, + typename ResultKeeper, + typename Real, + typename... Args > +__global__ void +segmentsReductionCSRAdaptiveKernel( BlocksView blocks, + int gridIdx, + Offsets offsets, + Index first, + Index last, + Fetch fetch, + Reduction reduce, + ResultKeeper keep, + Real zero, + Args... args ) +{ + __shared__ Real streamShared[ WARPS ][ SHARED_PER_WARP ]; + __shared__ Real multivectorShared[ CudaBlockSize / warpSize ]; + constexpr size_t MAX_X_DIM = 2147483647; + const Index index = (gridIdx * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x; + const Index blockIdx = index / warpSize; + if( blockIdx >= blocks.getSize() - 1 ) + return; + + if( threadIdx.x < CudaBlockSize / warpSize ) + multivectorShared[ threadIdx.x ] = zero; + Real result = zero; + bool compute( true ); + const Index laneIdx = threadIdx.x & 31; // & is cheaper than % + const details::CSRAdaptiveKernelBlockDescriptor< Index > block = blocks[ blockIdx ]; + const Index& firstSegmentIdx = block.getFirstSegment(); + const Index begin = offsets[ firstSegmentIdx ]; + + const auto blockType = block.getType(); + if( blockType == details::Type::STREAM ) // Stream kernel - many short segments per warp + { + const Index warpIdx = threadIdx.x / 32; + const Index end = begin + block.getSize(); + + // Stream data to shared memory + for( Index globalIdx = laneIdx + begin; globalIdx < end; globalIdx += warpSize ) + { + streamShared[ warpIdx ][ globalIdx - begin ] = //fetch( globalIdx, compute ); + details::FetchLambdaAdapter< Index, Fetch >::call( fetch, -1, -1, globalIdx, compute ); + // TODO:: fix this by template specialization so that we can assume fetch lambda + // with short parameters + } + + const Index lastSegmentIdx = firstSegmentIdx + block.getSegmentsInBlock(); + + for( Index i = firstSegmentIdx + laneIdx; i < lastSegmentIdx; i += warpSize ) + { + const Index sharedEnd = offsets[ i + 1 ] - begin; // end of preprocessed data + result = zero; + // Scalar reduction + for( Index sharedIdx = offsets[ i ] - begin; sharedIdx < sharedEnd; sharedIdx++ ) + result = reduce( result, streamShared[ warpIdx ][ sharedIdx ] ); + keep( i, result ); + } + } + else if( blockType == details::Type::VECTOR ) // Vector kernel - one segment per warp + { + const Index end = begin + block.getSize(); + const Index segmentIdx = block.getFirstSegment(); + + for( Index globalIdx = begin + laneIdx; globalIdx < end; globalIdx += warpSize ) + result = reduce( result, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, -1, globalIdx, compute ) ); // fix local idx + + // Parallel reduction + result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 16 ) ); + result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 8 ) ); + result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 4 ) ); + result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 2 ) ); + result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 1 ) ); + if( laneIdx == 0 ) + keep( segmentIdx, result ); + } + else // blockType == Type::LONG - several warps per segment + { + // Number of elements processed by previous warps + //const Index offset = //block.index[1] * MAX_ELEM_PER_WARP; + /// block.getWarpIdx() * MAX_ELEM_PER_WARP; + //Index to = begin + (block.getWarpIdx() + 1) * MAX_ELEM_PER_WARP; + const Index segmentIdx = block.getFirstSegment();//block.index[0]; + //minID = offsets[block.index[0] ]; + const Index end = offsets[segmentIdx + 1]; + const int tid = threadIdx.x; + //const int inBlockWarpIdx = block.getWarpIdx(); + + //if( to > end ) + // to = end; + TNL_ASSERT_GT( block.getWarpsCount(), 0, "" ); + result = zero; + //printf( "LONG tid %d warpIdx %d: LONG \n", tid, block.getWarpIdx() ); + for( Index globalIdx = begin + laneIdx + TNL::Cuda::getWarpSize() * block.getWarpIdx(); + globalIdx < end; + globalIdx += TNL::Cuda::getWarpSize() * block.getWarpsCount() ) + { + result = reduce( result, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, -1, globalIdx, compute ) ); + //if( laneIdx == 0 ) + // printf( "LONG warpIdx: %d gid: %d begin: %d end: %d -> %d \n", ( int ) block.getWarpIdx(), globalIdx, begin, end, + // details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, 0, globalIdx, compute ) ); + //result += values[i] * inVector[columnIndexes[i]]; + } + //printf( "tid %d -> %d \n", tid, result ); + + result += __shfl_down_sync(0xFFFFFFFF, result, 16); + result += __shfl_down_sync(0xFFFFFFFF, result, 8); + result += __shfl_down_sync(0xFFFFFFFF, result, 4); + result += __shfl_down_sync(0xFFFFFFFF, result, 2); + result += __shfl_down_sync(0xFFFFFFFF, result, 1); + + //if( laneIdx == 0 ) + // printf( "WARP RESULT: tid %d -> %d \n", tid, result ); + + const Index warpID = threadIdx.x / 32; + if( laneIdx == 0 ) + multivectorShared[ warpID ] = result; + + __syncthreads(); + // Reduction in multivectorShared + if( block.getWarpIdx() == 0 && laneIdx < 16 ) + { + constexpr int totalWarps = CudaBlockSize / warpSize; + if( totalWarps >= 32 ) + { + multivectorShared[ laneIdx ] = reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx + 16 ] ); + __syncwarp(); + } + if( totalWarps >= 16 ) + { + multivectorShared[ laneIdx ] = reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx + 8 ] ); + __syncwarp(); + } + if( totalWarps >= 8 ) + { + multivectorShared[ laneIdx ] = reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx + 4 ] ); + __syncwarp(); + } + if( totalWarps >= 4 ) + { + multivectorShared[ laneIdx ] = reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx + 2 ] ); + __syncwarp(); + } + if( totalWarps >= 2 ) + { + multivectorShared[ laneIdx ] = reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx + 1 ] ); + __syncwarp(); + } + if( laneIdx == 0 ) + { + //printf( "Long: segmentIdx %d -> %d \n", segmentIdx, multivectorShared[ 0 ] ); + keep( segmentIdx, multivectorShared[ 0 ] ); + } + } + } +} +#endif + + +template< typename Index, + typename Device > +struct CSRKernelAdaptiveView +{ + using IndexType = Index; + using DeviceType = Device; + using ViewType = CSRKernelAdaptiveView< Index, Device >; + using ConstViewType = CSRKernelAdaptiveView< Index, Device >; + using BlocksType = TNL::Containers::Vector< details::CSRAdaptiveKernelBlockDescriptor< Index >, Device, Index >; + using BlocksView = typename BlocksType::ViewType; + + CSRKernelAdaptiveView() = default; + + CSRKernelAdaptiveView( BlocksType& blocks ) + { + this->blocks.bind( blocks ); + }; + + void setBlocks( BlocksType& blocks ) + { + this->blocks.bind( blocks ); + } + + ViewType getView() { return *this; }; + + ConstViewType getConstView() const { return *this; }; + + template< typename OffsetsView, + typename Fetch, + typename Reduction, + typename ResultKeeper, + typename Real, + typename... Args > + void segmentsReduction( const OffsetsView& offsets, + Index first, + Index last, + Fetch& fetch, + const Reduction& reduction, + ResultKeeper& keeper, + const Real& zero, + Args... args ) const + { +#ifdef HAVE_CUDA + if( details::CheckFetchLambda< Index, Fetch >::hasAllParameters() ) + { + TNL::Algorithms::Segments::CSRKernelScalar< Index, Device >:: + segmentsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... ); + return; + } + + static constexpr Index THREADS_ADAPTIVE = sizeof(Index) == 8 ? 128 : 256; + //static constexpr Index THREADS_SCALAR = 128; + //static constexpr Index THREADS_VECTOR = 128; + //static constexpr Index THREADS_LIGHT = 128; + + /* Max length of row to process one warp for CSR Light, MultiVector */ + //static constexpr Index MAX_ELEMENTS_PER_WARP = 384; + + /* Max length of row to process one warp for CSR Adaptive */ + static constexpr Index MAX_ELEMENTS_PER_WARP_ADAPT = 512; + + /* How many shared memory use per block in CSR Adaptive kernel */ + static constexpr Index SHARED_PER_BLOCK = 24576; + + /* Number of elements in shared memory */ + static constexpr Index SHARED = SHARED_PER_BLOCK/sizeof(Real); + + /* Number of warps in block for CSR Adaptive */ + static constexpr Index WARPS = THREADS_ADAPTIVE / 32; + + /* Number of elements in shared memory per one warp */ + static constexpr Index SHARED_PER_WARP = SHARED / WARPS; + + constexpr int warpSize = 32; + + Index blocksCount; + + const Index threads = THREADS_ADAPTIVE; + constexpr size_t MAX_X_DIM = 2147483647; + + /* Fill blocks */ + size_t neededThreads = this->blocks.getSize() * warpSize; // one warp per block + /* Execute kernels on device */ + for (Index gridIdx = 0; neededThreads != 0; gridIdx++ ) + { + if (MAX_X_DIM * threads >= neededThreads) + { + blocksCount = roundUpDivision(neededThreads, threads); + neededThreads = 0; + } + else + { + blocksCount = MAX_X_DIM; + neededThreads -= MAX_X_DIM * threads; + } + + segmentsReductionCSRAdaptiveKernel< + THREADS_ADAPTIVE, + warpSize, + WARPS, + SHARED_PER_WARP, + MAX_ELEMENTS_PER_WARP_ADAPT, + BlocksView, + OffsetsView, + Index, Fetch, Reduction, ResultKeeper, Real, Args... > + <<>>( + this->blocks, + gridIdx, + offsets, + first, + last, + fetch, + reduction, + keeper, + zero, + args... ); + } +#endif + } + + CSRKernelAdaptiveView& operator=( const CSRKernelAdaptiveView< Index, Device >& kernelView ) + { + this->blocks.bind( kernelView.blocks ); + return *this; + } + + void printBlocks() const + { + for( Index i = 0; i < this->blocks.getSize(); i++ ) + { + auto block = blocks.getElement( i ); + std::cout << "Block " << i << " : " << block << std::endl; + } + + } + + protected: + BlocksView blocks; +}; + +template< typename Index, + typename Device > +struct CSRKernelAdaptive +{ + using IndexType = Index; + using DeviceType = Device; + using ViewType = CSRKernelAdaptiveView< Index, Device >; + using ConstViewType = CSRKernelAdaptiveView< Index, Device >; + using BlocksType = typename ViewType::BlocksType; + using BlocksView = typename BlocksType::ViewType; + + + static constexpr Index THREADS_ADAPTIVE = sizeof(Index) == 8 ? 128 : 256; + + /* How many shared memory use per block in CSR Adaptive kernel */ + static constexpr Index SHARED_PER_BLOCK = 20000; //24576; TODO: + + /* Number of elements in shared memory */ + static constexpr Index SHARED = SHARED_PER_BLOCK/sizeof(double); + + /* Number of warps in block for CSR Adaptive */ + static constexpr Index WARPS = THREADS_ADAPTIVE / 32; + + /* Number of elements in shared memory per one warp */ + static constexpr Index SHARED_PER_WARP = SHARED / WARPS; + + /* Max length of row to process one warp for CSR Light, MultiVector */ + static constexpr Index MAX_ELEMENTS_PER_WARP = 384; + + /* Max length of row to process one warp for CSR Adaptive */ + static constexpr Index MAX_ELEMENTS_PER_WARP_ADAPT = 512; + + template< typename Offsets > + Index findLimit( const Index start, + const Offsets& offsets, + const Index size, + details::Type &type, + Index &sum ) + { + sum = 0; + for (Index current = start; current < size - 1; current++ ) + { + Index elements = offsets.getElement(current + 1) - + offsets.getElement(current); + sum += elements; + if( sum > SHARED_PER_WARP ) + { + if( current - start > 0 ) // extra row + { + type = details::Type::STREAM; + return current; + } + else + { // one long row + if( sum <= 2 * MAX_ELEMENTS_PER_WARP_ADAPT ) + type = details::Type::VECTOR; + else + type = details::Type::LONG; + //type = Type::LONG; // + return current + 1; + } + } + } + type = details::Type::STREAM; + return size - 1; // return last row pointer + } + + template< typename Offsets > + void init( const Offsets& offsets ) + { + const Index rows = offsets.getSize(); + Index sum, start( 0 ), nextStart( 0 ); + + // Fill blocks + std::vector< details::CSRAdaptiveKernelBlockDescriptor< Index > > inBlock; + inBlock.reserve( rows ); + + while( nextStart != rows - 1 ) + { + details::Type type; + nextStart = findLimit( start, offsets, rows, type, sum ); + + if( type == details::Type::LONG ) + { + const Index blocksCount = inBlock.size(); + const Index warpsPerCudaBlock = THREADS_ADAPTIVE / TNL::Cuda::getWarpSize(); + Index warpsLeft = roundUpDivision( blocksCount, warpsPerCudaBlock ) * warpsPerCudaBlock - blocksCount; + if( warpsLeft == 0 ) + warpsLeft = warpsPerCudaBlock; + //Index parts = roundUpDivision(sum, this->SHARED_PER_WARP); + inBlock.emplace_back( start, details::Type::LONG, 0, warpsLeft ); + for( Index index = 1; index < warpsLeft; index++ ) + { + inBlock.emplace_back( start, details::Type::LONG, index, warpsLeft ); + } + } + else + { + inBlock.emplace_back(start, type, + nextStart, + offsets.getElement(nextStart), + offsets.getElement(start) ); + } + start = nextStart; + } + inBlock.emplace_back(nextStart); + + // Copy values + this->blocks.setSize(inBlock.size()); + for (size_t i = 0; i < inBlock.size(); ++i) + this->blocks.setElement(i, inBlock[i]); + + this->view.setBlocks( blocks ); + }; + + void reset() + { + this->blocks.reset(); + this->view.setBlocks( blocks ); + } + + ViewType getView() { return this->view; }; + + ConstViewType getConstView() const { return this->view; }; + + template< typename OffsetsView, + typename Fetch, + typename Reduction, + typename ResultKeeper, + typename Real, + typename... Args > + void segmentsReduction( const OffsetsView& offsets, + Index first, + Index last, + Fetch& fetch, + const Reduction& reduction, + ResultKeeper& keeper, + const Real& zero, + Args... args ) const + { + view.segmentsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... ); + } + + protected: + BlocksType blocks; + + ViewType view; +}; + + } // namespace Segments + } // namespace Algorithms +} // namespace TNL diff --git a/src/TNL/Algorithms/Segments/CSRKernelHybrid.h b/src/TNL/Algorithms/Segments/CSRKernelHybrid.h new file mode 100644 index 0000000000000000000000000000000000000000..92a4a54ee692f620a8b9983a5b637c25cfd1a09d --- /dev/null +++ b/src/TNL/Algorithms/Segments/CSRKernelHybrid.h @@ -0,0 +1,65 @@ +/*************************************************************************** + CSRKernelHybrid.h - description + ------------------- + begin : Jan 23, 2021 -> Joe Biden inauguration + copyright : (C) 2021 by Tomas Oberhuber + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +#pragma once + +#include +#include +#include +#include +#include + +namespace TNL { + namespace Algorithms { + namespace Segments { + +template< typename Index, + typename Device > +struct CSRKernelHybrid +{ + using IndexType = Index; + using DeviceType = Device; + using ViewType = CSRKernelHybrid< Index, Device >; + using ConstViewType = CSRKernelHybrid< Index, Device >; + + template< typename Offsets > + void init( const Offsets& offsets ); + + void reset(); + + ViewType getView(); + + ConstViewType getConstView() const; + + + template< typename OffsetsView, + typename Fetch, + typename Reduction, + typename ResultKeeper, + typename Real, + typename... Args > + void segmentsReduction( const OffsetsView& offsets, + Index first, + Index last, + Fetch& fetch, + const Reduction& reduction, + ResultKeeper& keeper, + const Real& zero, + Args... args ) const; + + protected: + int threadsPerSegment; +}; + + } // namespace Segments + } // namespace Algorithms +} // namespace TNL + +#include diff --git a/src/TNL/Algorithms/Segments/CSRKernelHybrid.hpp b/src/TNL/Algorithms/Segments/CSRKernelHybrid.hpp new file mode 100644 index 0000000000000000000000000000000000000000..06d2d2868d04f0b8562a991b34fa91539cb43d03 --- /dev/null +++ b/src/TNL/Algorithms/Segments/CSRKernelHybrid.hpp @@ -0,0 +1,195 @@ +/*************************************************************************** + CSRKernelHybrid.hpp - description + ------------------- + begin : Jan 23, 2021 -> Joe Biden inauguration + copyright : (C) 2021 by Tomas Oberhuber + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace TNL { + namespace Algorithms { + namespace Segments { + +#ifdef HAVE_CUDA +template< int ThreadsPerSegment, + typename Offsets, + typename Index, + typename Fetch, + typename Reduction, + typename ResultKeeper, + typename Real, + typename... Args > +__global__ +void segmentsReductionCSRHybridKernel( + int gridIdx, + const Offsets offsets, + Index first, + Index last, + Fetch fetch, + const Reduction reduce, + ResultKeeper keep, + const Real zero, + Args... args ) +{ + /*** + * We map one warp to each segment + */ + const Index segmentIdx = TNL::Cuda::getGlobalThreadIdx( gridIdx ) / ThreadsPerSegment + first; + if( segmentIdx >= last ) + return; + + const int laneIdx = threadIdx.x & ( ThreadsPerSegment - 1 ); // & is cheaper than % + Index endIdx = offsets[ segmentIdx + 1] ; + + Index localIdx( laneIdx ); + Real aux = zero; + bool compute( true ); + for( Index globalIdx = offsets[ segmentIdx ] + localIdx; globalIdx < endIdx; globalIdx += ThreadsPerSegment ) + { + aux = reduce( aux, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, localIdx, globalIdx, compute ) ); + localIdx += TNL::Cuda::getWarpSize(); + } + + /**** + * Reduction in each segment. + */ + if( ThreadsPerSegment == 32 ) + aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux, 16 ) ); + if( ThreadsPerSegment >= 16 ) + aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux, 8 ) ); + if( ThreadsPerSegment >= 8 ) + aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux, 4 ) ); + if( ThreadsPerSegment >= 4 ) + aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux, 2 ) ); + if( ThreadsPerSegment >= 2 ) + aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux, 1 ) ); + + if( laneIdx == 0 ) + keep( segmentIdx, aux ); +} +#endif + + + +template< typename Index, + typename Device > + template< typename Offsets > +void +CSRKernelHybrid< Index, Device >:: +init( const Offsets& offsets ) +{ + const Index segmentsCount = offsets.getSize() - 1; + const Index elementsInSegment = std::ceil( ( double ) offsets.getElement( segmentsCount ) / ( double ) segmentsCount ); + this->threadsPerSegment = TNL::min( std::pow( 2, std::ceil( std::log2( elementsInSegment ) ) ), TNL::Cuda::getWarpSize() ); + TNL_ASSERT_GE( threadsPerSegment, 0, "" ); + TNL_ASSERT_LE( threadsPerSegment, 32, "" ); +} + +template< typename Index, + typename Device > +void +CSRKernelHybrid< Index, Device >:: +reset() +{ + this->threadsPerSegment = 0; +} + +template< typename Index, + typename Device > +auto +CSRKernelHybrid< Index, Device >:: +getView() -> ViewType +{ + return *this; +} + +template< typename Index, + typename Device > +auto +CSRKernelHybrid< Index, Device >:: +getConstView() const -> ConstViewType +{ + return *this; +}; + + +template< typename Index, + typename Device > + template< typename OffsetsView, + typename Fetch, + typename Reduction, + typename ResultKeeper, + typename Real, + typename... Args > +void +CSRKernelHybrid< Index, Device >:: +segmentsReduction( const OffsetsView& offsets, + Index first, + Index last, + Fetch& fetch, + const Reduction& reduction, + ResultKeeper& keeper, + const Real& zero, + Args... args ) const +{ + TNL_ASSERT_GE( this->threadsPerSegment, 0, "" ); + TNL_ASSERT_LE( this->threadsPerSegment, 32, "" ); + +#ifdef HAVE_CUDA + const size_t threadsCount = this->threadsPerSegment * ( last - first ); + dim3 blocksCount, gridsCount, blockSize( 256 ); + TNL::Cuda::setupThreads( blockSize, blocksCount, gridsCount, threadsCount ); + //std::cerr << " this->threadsPerSegment = " << this->threadsPerSegment << " offsets = " << offsets << std::endl; + for( unsigned int gridIdx = 0; gridIdx < gridsCount.x; gridIdx ++ ) + { + dim3 gridSize; + TNL::Cuda::setupGrid( blocksCount, gridsCount, gridIdx, gridSize ); + switch( this->threadsPerSegment ) + { + case 0: // this means zero/empty matrix + break; + case 1: + segmentsReductionCSRHybridKernel< 1, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>( + gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... ); + break; + case 2: + segmentsReductionCSRHybridKernel< 2, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>( + gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... ); + break; + case 4: + segmentsReductionCSRHybridKernel< 4, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>( + gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... ); + break; + case 8: + segmentsReductionCSRHybridKernel< 8, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>( + gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... ); + break; + case 16: + segmentsReductionCSRHybridKernel< 16, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>( + gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... ); + break; + case 32: + segmentsReductionCSRHybridKernel< 32, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>( + gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... ); + break; + default: + throw std::runtime_error( std::string( "Wrong value of threadsPerSegment: " ) + std::to_string( this->threadsPerSegment ) ); + } + } +#endif +} + + } // namespace Segments + } // namespace Algorithms +} // namespace TNL diff --git a/src/TNL/Algorithms/Segments/CSRKernelScalar.h b/src/TNL/Algorithms/Segments/CSRKernelScalar.h new file mode 100644 index 0000000000000000000000000000000000000000..4a716c890d07b9fd797235540ee89dd85de86196 --- /dev/null +++ b/src/TNL/Algorithms/Segments/CSRKernelScalar.h @@ -0,0 +1,61 @@ +/*************************************************************************** + CSRKernelScalar.h - description + ------------------- + begin : Jan 23, 2021 -> Joe Biden inauguration + copyright : (C) 2021 by Tomas Oberhuber + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +#pragma once + +#include +#include +#include +#include +#include + +namespace TNL { + namespace Algorithms { + namespace Segments { + +template< typename Index, + typename Device > +struct CSRKernelScalar +{ + using IndexType = Index; + using DeviceType = Device; + using ViewType = CSRKernelScalar< Index, Device >; + using ConstViewType = CSRKernelScalar< Index, Device >; + + template< typename Offsets > + void init( const Offsets& offsets ); + + void reset(); + + ViewType getView(); + + ConstViewType getConstView() const; + + template< typename OffsetsView, + typename Fetch, + typename Reduction, + typename ResultKeeper, + typename Real, + typename... Args > + static void segmentsReduction( const OffsetsView& offsets, + Index first, + Index last, + Fetch& fetch, + const Reduction& reduction, + ResultKeeper& keeper, + const Real& zero, + Args... args ); +}; + + } // namespace Segments + } // namespace Algorithms +} // namespace TNL + +#include \ No newline at end of file diff --git a/src/TNL/Algorithms/Segments/CSRKernelScalar.hpp b/src/TNL/Algorithms/Segments/CSRKernelScalar.hpp new file mode 100644 index 0000000000000000000000000000000000000000..7dd0f5cd7b87cc213585ad955e91e4b363ee3a9c --- /dev/null +++ b/src/TNL/Algorithms/Segments/CSRKernelScalar.hpp @@ -0,0 +1,92 @@ +/*************************************************************************** + CSRKernelScalar.h - description + ------------------- + begin : Jan 23, 2021 -> Joe Biden inauguration + copyright : (C) 2021 by Tomas Oberhuber + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace TNL { + namespace Algorithms { + namespace Segments { + +template< typename Index, + typename Device > + template< typename Offsets > +void +CSRKernelScalar< Index, Device >:: +init( const Offsets& offsets ) +{ +} + +template< typename Index, + typename Device > +void +CSRKernelScalar< Index, Device >:: +reset() +{ +} + +template< typename Index, + typename Device > +auto +CSRKernelScalar< Index, Device >:: +getView() -> ViewType +{ + return *this; +} + +template< typename Index, + typename Device > +auto +CSRKernelScalar< Index, Device >:: +getConstView() const -> ConstViewType +{ + return *this; +}; + +template< typename Index, + typename Device > + template< typename OffsetsView, + typename Fetch, + typename Reduction, + typename ResultKeeper, + typename Real, + typename... Args > +void +CSRKernelScalar< Index, Device >:: +segmentsReduction( const OffsetsView& offsets, + Index first, + Index last, + Fetch& fetch, + const Reduction& reduction, + ResultKeeper& keeper, + const Real& zero, + Args... args ) +{ + auto l = [=] __cuda_callable__ ( const IndexType segmentIdx, Args... args ) mutable { + const IndexType begin = offsets[ segmentIdx ]; + const IndexType end = offsets[ segmentIdx + 1 ]; + Real aux( zero ); + IndexType localIdx( 0 ); + bool compute( true ); + for( IndexType globalIdx = begin; globalIdx < end && compute; globalIdx++ ) + aux = reduction( aux, details::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, globalIdx, compute ) ); + keeper( segmentIdx, aux ); + }; + Algorithms::ParallelFor< Device >::exec( first, last, l, args... ); +} + } // namespace Segments + } // namespace Algorithms +} // namespace TNL diff --git a/src/TNL/Algorithms/Segments/CSRKernelVector.h b/src/TNL/Algorithms/Segments/CSRKernelVector.h new file mode 100644 index 0000000000000000000000000000000000000000..7a6ccf7ff71cea21829121450dd2e09fbbc68b0f --- /dev/null +++ b/src/TNL/Algorithms/Segments/CSRKernelVector.h @@ -0,0 +1,62 @@ +/*************************************************************************** + CSRKernelVector.h - description + ------------------- + begin : Jan 23, 2021 -> Joe Biden inauguration + copyright : (C) 2021 by Tomas Oberhuber + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +#pragma once + +#include +#include +#include +#include +#include + +namespace TNL { + namespace Algorithms { + namespace Segments { + +template< typename Index, + typename Device > +struct CSRKernelVector +{ + using IndexType = Index; + using DeviceType = Device; + using ViewType = CSRKernelVector< Index, Device >; + using ConstViewType = CSRKernelVector< Index, Device >; + + template< typename Offsets > + void init( const Offsets& offsets ); + + void reset(); + + ViewType getView(); + + ConstViewType getConstView() const; + + + template< typename OffsetsView, + typename Fetch, + typename Reduction, + typename ResultKeeper, + typename Real, + typename... Args > + static void segmentsReduction( const OffsetsView& offsets, + Index first, + Index last, + Fetch& fetch, + const Reduction& reduction, + ResultKeeper& keeper, + const Real& zero, + Args... args ); +}; + + } // namespace Segments + } // namespace Algorithms +} // namespace TNL + +#include diff --git a/src/TNL/Algorithms/Segments/CSRKernelVector.hpp b/src/TNL/Algorithms/Segments/CSRKernelVector.hpp new file mode 100644 index 0000000000000000000000000000000000000000..d6f5bb7ec321b25ef89a4654197d86c985123706 --- /dev/null +++ b/src/TNL/Algorithms/Segments/CSRKernelVector.hpp @@ -0,0 +1,152 @@ +/*************************************************************************** + CSRKernelVector.hpp - description + ------------------- + begin : Jan 23, 2021 -> Joe Biden inauguration + copyright : (C) 2021 by Tomas Oberhuber + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace TNL { + namespace Algorithms { + namespace Segments { + +#ifdef HAVE_CUDA +template< typename Offsets, + typename Index, + typename Fetch, + typename Reduction, + typename ResultKeeper, + typename Real, + typename... Args > +__global__ +void segmentsReductionCSRKernelVector( + int gridIdx, + const Offsets offsets, + Index first, + Index last, + Fetch fetch, + const Reduction reduce, + ResultKeeper keep, + const Real zero, + Args... args ) +{ + /*** + * We map one warp to each segment + */ + const Index segmentIdx = TNL::Cuda::getGlobalThreadIdx( gridIdx ) / TNL::Cuda::getWarpSize() + first; + if( segmentIdx >= last ) + return; + + const int laneIdx = threadIdx.x & ( TNL::Cuda::getWarpSize() - 1 ); // & is cheaper than % + TNL_ASSERT_LT( segmentIdx + 1, offsets.getSize(), "" ); + Index endIdx = offsets[ segmentIdx + 1 ]; + + Index localIdx( laneIdx ); + Real aux = zero; + bool compute( true ); + for( Index globalIdx = offsets[ segmentIdx ] + localIdx; globalIdx < endIdx; globalIdx += TNL::Cuda::getWarpSize() ) + { + TNL_ASSERT_LT( globalIdx, endIdx, "" ); + aux = reduce( aux, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, localIdx, globalIdx, compute ) ); + localIdx += TNL::Cuda::getWarpSize(); + } + + /**** + * Reduction in each warp which means in each segment. + */ + aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux, 16 ) ); + aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux, 8 ) ); + aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux, 4 ) ); + aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux, 2 ) ); + aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux, 1 ) ); + + if( laneIdx == 0 ) + keep( segmentIdx, aux ); +} +#endif + +template< typename Index, + typename Device > + template< typename Offsets > +void +CSRKernelVector< Index, Device >:: +init( const Offsets& offsets ) +{ +} + +template< typename Index, + typename Device > +void +CSRKernelVector< Index, Device >:: +reset() +{ +} + +template< typename Index, + typename Device > +auto +CSRKernelVector< Index, Device >:: +getView() -> ViewType +{ + return *this; +} + +template< typename Index, + typename Device > +auto +CSRKernelVector< Index, Device >:: +getConstView() const -> ConstViewType +{ + return *this; +}; + + +template< typename Index, + typename Device > + template< typename OffsetsView, + typename Fetch, + typename Reduction, + typename ResultKeeper, + typename Real, + typename... Args > +void +CSRKernelVector< Index, Device >:: +segmentsReduction( const OffsetsView& offsets, + Index first, + Index last, + Fetch& fetch, + const Reduction& reduction, + ResultKeeper& keeper, + const Real& zero, + Args... args ) +{ +#ifdef HAVE_CUDA + const Index warpsCount = last - first; + const size_t threadsCount = warpsCount * TNL::Cuda::getWarpSize(); + dim3 blocksCount, gridsCount, blockSize( 256 ); + TNL::Cuda::setupThreads( blockSize, blocksCount, gridsCount, threadsCount ); + dim3 gridIdx; + for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x ++ ) + { + dim3 gridSize; + TNL::Cuda::setupGrid( blocksCount, gridsCount, gridIdx, gridSize ); + segmentsReductionCSRKernelVector< OffsetsView, IndexType, Fetch, Reduction, ResultKeeper, Real, Args... > + <<< gridSize, blockSize >>>( + gridIdx.x, offsets, first, last, fetch, reduction, keeper, zero, args... ); + }; +#endif +} + } // namespace Segments + } // namespace Algorithms +} // namespace TNL diff --git a/src/TNL/Algorithms/Segments/CSRView.h b/src/TNL/Algorithms/Segments/CSRView.h index 610864f5e13921e6d81b775ebf7b233ea8b6b3e9..4576d9fdb5efbb527fb8f35d77b29b168b6a0978 100644 --- a/src/TNL/Algorithms/Segments/CSRView.h +++ b/src/TNL/Algorithms/Segments/CSRView.h @@ -14,35 +14,42 @@ #include #include +#include +#include +#include +#include namespace TNL { namespace Algorithms { namespace Segments { template< typename Device, - typename Index > + typename Index, + typename Kernel = CSRKernelScalar< Index, Device > > class CSRView { public: using DeviceType = Device; using IndexType = std::remove_const_t< Index >; + using KernelType = Kernel; using OffsetsView = typename Containers::VectorView< Index, DeviceType, IndexType >; - using ConstOffsetsView = typename Containers::Vector< Index, DeviceType,IndexType >::ConstViewType; + using ConstOffsetsView = typename Containers::Vector< Index, DeviceType, IndexType >::ConstViewType; + using KernelView = typename Kernel::ViewType; using ViewType = CSRView; template< typename Device_, typename Index_ > - using ViewTemplate = CSRView< Device_, Index_ >; - using ConstViewType = CSRView< Device, std::add_const_t< Index > >; + using ViewTemplate = CSRView< Device_, Index_, Kernel >; + using ConstViewType = CSRView< Device, std::add_const_t< Index >, Kernel >; using SegmentViewType = SegmentView< IndexType, RowMajorOrder >; __cuda_callable__ CSRView(); __cuda_callable__ - CSRView( const OffsetsView& offsets ); + CSRView( const OffsetsView& offsets, const KernelView& kernel ); __cuda_callable__ - CSRView( const OffsetsView&& offsets ); + CSRView( const OffsetsView&& offsets, const KernelView&& kernel ); __cuda_callable__ CSRView( const CSRView& csr_view ); @@ -121,7 +128,30 @@ class CSRView protected: OffsetsView offsets; + + KernelView kernel; }; + +template< typename Device, + typename Index > +using CSRViewScalar = CSRView< Device, Index, CSRKernelScalar< Index, Device > >; + +template< typename Device, + typename Index > +using CSRViewVector = CSRView< Device, Index, CSRKernelVector< Index, Device > >; + +template< typename Device, + typename Index > +using CSRViewHybrid = CSRView< Device, Index, CSRKernelHybrid< Index, Device > >; + +template< typename Device, + typename Index > +using CSRViewAdaptive = CSRView< Device, Index, CSRKernelAdaptive< Index, Device > >; + +template< typename Device, + typename Index > +using CSRViewDefault = CSRViewScalar< Device, Index >; + } // namespace Segments } // namespace Algorithms } // namespace TNL diff --git a/src/TNL/Algorithms/Segments/CSRView.hpp b/src/TNL/Algorithms/Segments/CSRView.hpp index 5537a1233ce47638e37d91aa0ea656f6d7de8f62..045b6bc5a2fe2c13a59cfcdcf7bc6d7234f76f17 100644 --- a/src/TNL/Algorithms/Segments/CSRView.hpp +++ b/src/TNL/Algorithms/Segments/CSRView.hpp @@ -22,122 +22,140 @@ namespace TNL { template< typename Device, - typename Index > + typename Index, + typename Kernel > __cuda_callable__ -CSRView< Device, Index >:: +CSRView< Device, Index, Kernel >:: CSRView() { } template< typename Device, - typename Index > + typename Index, + typename Kernel > __cuda_callable__ -CSRView< Device, Index >:: -CSRView( const OffsetsView& offsets_view ) - : offsets( offsets_view ) +CSRView< Device, Index, Kernel >:: +CSRView( const OffsetsView& offsets_view, + const KernelView& kernel_view ) + : offsets( offsets_view ), kernel( kernel_view ) { } template< typename Device, - typename Index > + typename Index, + typename Kernel > __cuda_callable__ -CSRView< Device, Index >:: -CSRView( const OffsetsView&& offsets_view ) - : offsets( offsets_view ) +CSRView< Device, Index, Kernel >:: +CSRView( const OffsetsView&& offsets_view, + const KernelView&& kernel_view ) + : offsets( std::move( offsets_view ) ), kernel( std::move( kernel_view ) ) { } template< typename Device, - typename Index > + typename Index, + typename Kernel > __cuda_callable__ -CSRView< Device, Index >:: +CSRView< Device, Index, Kernel >:: CSRView( const CSRView& csr_view ) - : offsets( csr_view.offsets ) + : offsets( csr_view.offsets ), kernel( csr_view.kernel ) { } template< typename Device, - typename Index > + typename Index, + typename Kernel > __cuda_callable__ -CSRView< Device, Index >:: +CSRView< Device, Index, Kernel >:: CSRView( const CSRView&& csr_view ) - : offsets( std::move( csr_view.offsets ) ) + : offsets( std::move( csr_view.offsets ) ), kernel( std::move( csr_view.kernel ) ) { } template< typename Device, - typename Index > + typename Index, + typename Kernel > String -CSRView< Device, Index >:: +CSRView< Device, Index, Kernel >:: getSerializationType() { - return "CSR< [any_device], " + TNL::getSerializationType< IndexType >() + " >"; + return "CSR< [any_device], " + + TNL::getSerializationType< IndexType >() + + TNL::getSerializationType< KernelType >() + " >"; } template< typename Device, - typename Index > + typename Index, + typename Kernel > String -CSRView< Device, Index >:: +CSRView< Device, Index, Kernel >:: getSegmentsType() { return "CSR"; } template< typename Device, - typename Index > + typename Index, + typename Kernel > __cuda_callable__ -typename CSRView< Device, Index >::ViewType -CSRView< Device, Index >:: +typename CSRView< Device, Index, Kernel >::ViewType +CSRView< Device, Index, Kernel >:: getView() { - return ViewType( this->offsets ); + return ViewType( this->offsets, this->kernel ); } template< typename Device, - typename Index > + typename Index, + typename Kernel > __cuda_callable__ auto -CSRView< Device, Index >:: +CSRView< Device, Index, Kernel >:: getConstView() const -> const ConstViewType { - return ConstViewType( this->offsets.getConstView() ); + return ConstViewType( this->offsets.getConstView(), this->kernel.getConstView() ); } template< typename Device, - typename Index > -__cuda_callable__ auto CSRView< Device, Index >:: + typename Index, + typename Kernel > +__cuda_callable__ auto CSRView< Device, Index, Kernel >:: getSegmentsCount() const -> IndexType { return this->offsets.getSize() - 1; } template< typename Device, - typename Index > -__cuda_callable__ auto CSRView< Device, Index >:: + typename Index, + typename Kernel > +__cuda_callable__ auto CSRView< Device, Index, Kernel >:: getSegmentSize( const IndexType segmentIdx ) const -> IndexType { return details::CSR< Device, Index >::getSegmentSize( this->offsets, segmentIdx ); } template< typename Device, - typename Index > -__cuda_callable__ auto CSRView< Device, Index >:: + typename Index, + typename Kernel > +__cuda_callable__ auto CSRView< Device, Index, Kernel >:: getSize() const -> IndexType { return this->getStorageSize(); } template< typename Device, - typename Index > -__cuda_callable__ auto CSRView< Device, Index >:: + typename Index, + typename Kernel > +__cuda_callable__ auto CSRView< Device, Index, Kernel >:: getStorageSize() const -> IndexType { return details::CSR< Device, Index >::getStorageSize( this->offsets ); } template< typename Device, - typename Index > -__cuda_callable__ auto CSRView< Device, Index >:: + typename Index, + typename Kernel > +__cuda_callable__ auto CSRView< Device, Index, Kernel >:: getGlobalIndex( const Index segmentIdx, const Index localIdx ) const -> IndexType { if( ! std::is_same< DeviceType, Devices::Host >::value ) @@ -152,20 +170,22 @@ getGlobalIndex( const Index segmentIdx, const Index localIdx ) const -> IndexTyp } template< typename Device, - typename Index > + typename Index, + typename Kernel > __cuda_callable__ auto -CSRView< Device, Index >:: +CSRView< Device, Index, Kernel >:: getSegmentView( const IndexType segmentIdx ) const -> SegmentViewType { return SegmentViewType( offsets[ segmentIdx ], offsets[ segmentIdx + 1 ] - offsets[ segmentIdx ], 1 ); } template< typename Device, - typename Index > + typename Index, + typename Kernel > template< typename Function, typename... Args > void -CSRView< Device, Index >:: +CSRView< Device, Index, Kernel >:: forSegments( IndexType first, IndexType last, Function& f, Args... args ) const { const auto offsetsView = this->offsets; @@ -181,73 +201,72 @@ forSegments( IndexType first, IndexType last, Function& f, Args... args ) const } template< typename Device, - typename Index > + typename Index, + typename Kernel > template< typename Function, typename... Args > void -CSRView< Device, Index >:: +CSRView< Device, Index, Kernel >:: forAll( Function& f, Args... args ) const { this->forSegments( 0, this->getSegmentsCount(), f, args... ); } template< typename Device, - typename Index > + typename Index, + typename Kernel > template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args > void -CSRView< Device, Index >:: +CSRView< Device, Index, Kernel >:: segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const { - using RealType = typename details::FetchLambdaAdapter< Index, Fetch >::ReturnType; - const auto offsetsView = this->offsets.getConstView(); - auto l = [=] __cuda_callable__ ( const IndexType segmentIdx, Args... args ) mutable { - const IndexType begin = offsetsView[ segmentIdx ]; - const IndexType end = offsetsView[ segmentIdx + 1 ]; - RealType aux( zero ); - IndexType localIdx( 0 ); - bool compute( true ); - for( IndexType globalIdx = begin; globalIdx < end && compute; globalIdx++ ) - aux = reduction( aux, details::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, globalIdx, compute ) ); - keeper( segmentIdx, aux ); - }; - Algorithms::ParallelFor< Device >::exec( first, last, l, args... ); + if( std::is_same< DeviceType, TNL::Devices::Host >::value ) + TNL::Algorithms::Segments::CSRKernelScalar< IndexType, DeviceType >::segmentsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... ); + else + kernel.segmentsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... ); } template< typename Device, - typename Index > + typename Index, + typename Kernel > template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args > void -CSRView< Device, Index >:: +CSRView< Device, Index, Kernel >:: allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const { this->segmentsReduction( 0, this->getSegmentsCount(), fetch, reduction, keeper, zero, args... ); } template< typename Device, - typename Index > -CSRView< Device, Index >& -CSRView< Device, Index >:: + typename Index, + typename Kernel > +CSRView< Device, Index, Kernel >& +CSRView< Device, Index, Kernel >:: operator=( const CSRView& view ) { this->offsets.bind( view.offsets ); + this->kernel = view.kernel; return *this; } template< typename Device, - typename Index > + typename Index, + typename Kernel > void -CSRView< Device, Index >:: +CSRView< Device, Index, Kernel >:: save( File& file ) const { file << this->offsets; } template< typename Device, - typename Index > + typename Index, + typename Kernel > void -CSRView< Device, Index >:: +CSRView< Device, Index, Kernel >:: load( File& file ) { file >> this->offsets; + this->kernel.init( this->offsets ); } } // namespace Segments diff --git a/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h b/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h new file mode 100644 index 0000000000000000000000000000000000000000..96f1899b268596bc57ba395cec1556ab5fbdfff5 --- /dev/null +++ b/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h @@ -0,0 +1,239 @@ +/*************************************************************************** + CSRAdaptiveKernelBlockDescriptor.h - description + ------------------- + begin : Jan 25, 2021 -> Joe Biden inauguration + copyright : (C) 2021 by Tomas Oberhuber + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +#pragma once + +namespace TNL { + namespace Algorithms { + namespace Segments { + namespace details { + +enum class Type { + /* LONG = 0!!! Non zero value rewrites index[1] */ + LONG = 0, + STREAM = 1, + VECTOR = 2 +}; + +#ifdef CSR_ADAPTIVE_UNION +template< typename Index > +union CSRAdaptiveKernelBlockDescriptor +{ + CSRAdaptiveKernelBlockDescriptor(Index row, Type type = Type::VECTOR, Index index = 0) noexcept + { + this->index[0] = row; + this->index[1] = index; + this->byte[sizeof(Index) == 4 ? 7 : 15] = (uint8_t)type; + } + + CSRAdaptiveKernelBlockDescriptor(Index row, Type type, Index nextRow, Index maxID, Index minID) noexcept + { + this->index[0] = row; + this->index[1] = 0; + this->twobytes[sizeof(Index) == 4 ? 2 : 4] = maxID - minID; + + if (type == Type::STREAM) + this->twobytes[sizeof(Index) == 4 ? 3 : 5] = nextRow - row; + + if (type == Type::STREAM) + this->byte[sizeof(Index) == 4 ? 7 : 15] |= 0b1000000; + else if (type == Type::VECTOR) + this->byte[sizeof(Index) == 4 ? 7 : 15] |= 0b10000000; + } + + CSRAdaptiveKernelBlockDescriptor() = default; + + __cuda_callable__ Type getType() const + { + if( byte[ sizeof( Index ) == 4 ? 7 : 15 ] & 0b1000000 ) + return Type::STREAM; + if( byte[ sizeof( Index ) == 4 ? 7 : 15 ] & 0b10000000 ) + return Type::VECTOR; + return Type::LONG; + } + + __cuda_callable__ const Index& getFirstSegment() const + { + return index[ 0 ]; + } + + /*** + * \brief Returns number of elements covered by the block. + */ + __cuda_callable__ const Index getSize() const + { + return twobytes[ sizeof(Index) == 4 ? 2 : 4 ]; + } + + /*** + * \brief Returns number of segments covered by the block. + */ + __cuda_callable__ const Index getSegmentsInBlock() const + { + return ( twobytes[ sizeof( Index ) == 4 ? 3 : 5 ] & 0x3FFF ); + } + + void print( std::ostream& str ) const + { + Type type = this->getType(); + str << "Type: "; + switch( type ) + { + case Type::STREAM: + str << " Stream "; + break; + case Type::VECTOR: + str << " Vector "; + break; + case Type::LONG: + str << " Long "; + break; + } + str << " first segment: " << getFirstSegment(); + str << " block end: " << getSize(); + str << " index in warp: " << index[ 1 ]; + } + Index index[2]; // index[0] is row pointer, index[1] is index in warp + uint8_t byte[sizeof(Index) == 4 ? 8 : 16]; // byte[7/15] is type specificator + uint16_t twobytes[sizeof(Index) == 4 ? 4 : 8]; //twobytes[2/4] is maxID - minID + //twobytes[3/5] is nextRow - row +}; +#else + +template< typename Index > +struct CSRAdaptiveKernelBlockDescriptor +{ + CSRAdaptiveKernelBlockDescriptor( Index firstSegmentIdx, + Type type = Type::VECTOR, + uint8_t warpIdx = 0, + uint8_t warpsCount = 0 ) noexcept + { + this->firstSegmentIdx = firstSegmentIdx; + this->type = ( uint8_t ) type; + this->warpIdx = warpIdx; + this->warpsCount = warpsCount; + /*this->index[0] = row; + this->index[1] = index; + this->byte[sizeof(Index) == 4 ? 7 : 15] = (uint8_t)type;*/ + } + + CSRAdaptiveKernelBlockDescriptor( Index firstSegmentIdx, + Type type, + Index lastSegmentIdx, + Index end, + Index begin ) noexcept + { + this->firstSegmentIdx = firstSegmentIdx; + this->warpIdx = 0; + this->blockSize = end - begin; + this->segmentsInBlock = lastSegmentIdx - firstSegmentIdx; + this->type = ( uint8_t ) type; + + /*this->index[0] = row; + this->index[1] = 0; + this->twobytes[sizeof(Index) == 4 ? 2 : 4] = maxID - minID; + + if (type == Type::STREAM) + this->twobytes[sizeof(Index) == 4 ? 3 : 5] = nextRow - row; + + if (type == Type::STREAM) + this->byte[sizeof(Index) == 4 ? 7 : 15] |= 0b1000000; + else if (type == Type::VECTOR) + this->byte[sizeof(Index) == 4 ? 7 : 15] |= 0b10000000;*/ + } + + CSRAdaptiveKernelBlockDescriptor() = default; + + __cuda_callable__ Type getType() const + { + return ( Type ) this->type; + /*if( byte[ sizeof( Index ) == 4 ? 7 : 15 ] & 0b1000000 ) + return Type::STREAM; + if( byte[ sizeof( Index ) == 4 ? 7 : 15 ] & 0b10000000 ) + return Type::VECTOR; + return Type::LONG;*/ + } + + __cuda_callable__ const Index& getFirstSegment() const + { + return this->firstSegmentIdx; + //return index[ 0 ]; + } + + /*** + * \brief Returns number of elements covered by the block. + */ + __cuda_callable__ const Index getSize() const + { + return this->blockSize; + //return twobytes[ sizeof(Index) == 4 ? 2 : 4 ]; + } + + /*** + * \brief Returns number of segments covered by the block. + */ + __cuda_callable__ const Index getSegmentsInBlock() const + { + return this->segmentsInBlock; + //return ( twobytes[ sizeof( Index ) == 4 ? 3 : 5 ] & 0x3FFF ); + } + + __cuda_callable__ uint8_t getWarpIdx() const + { + return this->warpIdx; + } + + __cuda_callable__ uint8_t getWarpsCount() const + { + return this->warpsCount; + } + + void print( std::ostream& str ) const + { + str << "Type: "; + switch( this->getType() ) + { + case Type::STREAM: + str << " Stream "; + break; + case Type::VECTOR: + str << " Vector "; + break; + case Type::LONG: + str << " Long "; + break; + } + str << " first segment: " << this->getFirstSegment(); + str << " block end: " << this->getSize(); + str << " index in warp: " << this->getWarpIdx(); + } + + uint8_t type; + Index firstSegmentIdx, blockSize, segmentsInBlock; + uint8_t warpIdx, warpsCount; + + //Index index[2]; // index[0] is row pointer, index[1] is index in warp + //uint8_t byte[sizeof(Index) == 4 ? 8 : 16]; // byte[7/15] is type specificator + //uint16_t twobytes[sizeof(Index) == 4 ? 4 : 8]; //twobytes[2/4] is maxID - minID + //twobytes[3/5] is nextRow - row +}; + +#endif + +template< typename Index > +std::ostream& operator<< ( std::ostream& str, const CSRAdaptiveKernelBlockDescriptor< Index >& block ) +{ + block.print( str ); + return str; +} + } // namespace details + } // namespace Segments + } // namespace Algorithms +} // namespace TNL diff --git a/src/TNL/Matrices/Legacy/CSR.h b/src/TNL/Matrices/Legacy/CSR.h index 7570eac8be54c31bf61f364abe2c5a02413b4234..42f68b1277f994197c561ef7a4d000b0e600878e 100644 --- a/src/TNL/Matrices/Legacy/CSR.h +++ b/src/TNL/Matrices/Legacy/CSR.h @@ -115,8 +115,11 @@ public: static constexpr Index THREADS_VECTOR = 128; static constexpr Index THREADS_LIGHT = 128; - /* Max length of row to process one warp */ - static constexpr Index MAX_ELEMENTS_PER_WARP = 1024; + /* Max length of row to process one warp for CSR Light, MultiVector */ + static constexpr Index MAX_ELEMENTS_PER_WARP = 384; + + /* Max length of row to process one warp for CSR Adaptive */ + static constexpr Index MAX_ELEMENTS_PER_WARP_ADAPT = 512; /* How many shared memory use per block in CSR Adaptive kernel */ static constexpr Index SHARED_PER_BLOCK = 24576; diff --git a/src/TNL/Matrices/Legacy/CSR_impl.h b/src/TNL/Matrices/Legacy/CSR_impl.h index 580b63456c2071cb4b27a2c83e5d4e5737b91cd2..7a610c8257cb4450035fa0c46a928b0f84b377f5 100644 --- a/src/TNL/Matrices/Legacy/CSR_impl.h +++ b/src/TNL/Matrices/Legacy/CSR_impl.h @@ -143,7 +143,7 @@ Index findLimit(const Index start, type = Type::STREAM; return current; } else { // one long row - if (sum <= 2 * matrix.MAX_ELEMENTS_PER_WARP) + if (sum <= 2 * matrix.MAX_ELEMENTS_PER_WARP_ADAPT) type = Type::VECTOR; else type = Type::LONG; @@ -1764,8 +1764,8 @@ void SpMVCSRAdaptivePrepare( const Real *inVector, SpMVCSRAdaptive< Real, Index, warpSize, matrix.WARPS, - matrix.SHARED_PER_WARP, - matrix.MAX_ELEMENTS_PER_WARP > + matrix.SHARED_PER_WARP, + matrix.MAX_ELEMENTS_PER_WARP_ADAPT > <<>>( inVector, outVector, diff --git a/src/TNL/Matrices/SparseMatrix.h b/src/TNL/Matrices/SparseMatrix.h index 6d068f370f3be3a2d8b6eea20386054d6c984776..581d79c983d2a0961cfe7576c07e2d2dc9d5e5f9 100644 --- a/src/TNL/Matrices/SparseMatrix.h +++ b/src/TNL/Matrices/SparseMatrix.h @@ -45,7 +45,7 @@ template< typename Real = double, typename Device = Devices::Host, typename Index = int, typename MatrixType = GeneralMatrix, - template< typename Device_, typename Index_, typename IndexAllocator_ > class Segments = Algorithms::Segments::CSR, + template< typename Device_, typename Index_, typename IndexAllocator_ > class Segments = Algorithms::Segments::CSRDefault, typename ComputeReal = typename ChooseSparseMatrixComputeReal< Real, Index >::type, typename RealAllocator = typename Allocators::Default< Device >::template Allocator< Real >, typename IndexAllocator = typename Allocators::Default< Device >::template Allocator< Index > > @@ -209,13 +209,8 @@ class SparseMatrix : public Matrix< Real, Device, Index, RealAllocator > const IndexAllocatorType& indexAllocator = IndexAllocatorType() ); /** -<<<<<<< HEAD * \brief Constructor with matrix rows capacities and number of columns. * -======= - * \brief Constructor with matrix rows capacities given as an initializer list and a number of columns. - * ->>>>>>> Added SparseMatrix constructor with row capacities vector. * The number of matrix rows is given by the size of \e rowCapacities list. * * \tparam ListIndex is the initializer list values type. @@ -238,9 +233,9 @@ class SparseMatrix : public Matrix< Real, Device, Index, RealAllocator > /** * \brief Constructor with matrix rows capacities given as a vector and number of columns. - * + * * The number of matrix rows is given by the size of \e rowCapacities vector. - * + * * \tparam RowCapacitiesVector is the row capacities vector type. Usually it is some of * \ref TNL::Containers::Array, \ref TNL::Containers::ArrayView, \ref TNL::Containers::Vector or * \ref TNL::Containers::VectorView. @@ -249,7 +244,7 @@ class SparseMatrix : public Matrix< Real, Device, Index, RealAllocator > * \param columns is the number of matrix columns. * \param realAllocator is used for allocation of matrix elements values. * \param indexAllocator is used for allocation of matrix elements column indexes. - * + * * \par Example * \include Matrices/SparseMatrix/SparseMatrixExample_Constructor_rowCapacities_vector.cpp * \par Output diff --git a/src/TNL/Matrices/SparseMatrixView.h b/src/TNL/Matrices/SparseMatrixView.h index f91e471e8a2158de04f46946513e149ac530f338..9b69c2e91c12a7419dbe9764ccae3ceb567602e2 100644 --- a/src/TNL/Matrices/SparseMatrixView.h +++ b/src/TNL/Matrices/SparseMatrixView.h @@ -36,10 +36,10 @@ struct ChooseSparseMatrixComputeReal< bool, Index > * * It serves as an accessor to \ref SparseMatrix for example when passing the * matrix to lambda functions. SparseMatrix view can be also created in CUDA kernels. - * - * \tparam Real is a type of matrix elements. If \e Real equals \e bool the matrix is treated + * + * \tparam Real is a type of matrix elements. If \e Real equals \e bool the matrix is treated * as binary and so the matrix elements values are not stored in the memory since we need - * to remember only coordinates of non-zero elements( which equal one). + * to remember only coordinates of non-zero elements( which equal one). * \tparam Device is a device where the matrix is allocated. * \tparam Index is a type for indexing of the matrix elements. * \tparam MatrixType specifies a symmetry of matrix. See \ref MatrixType. Symmetric @@ -50,13 +50,13 @@ struct ChooseSparseMatrixComputeReal< bool, Index > * \ref Ellpack, \ref SlicedEllpack, \ref ChunkedEllpack or \ref BiEllpack. * \tparam ComputeReal is the same as \e Real mostly but for binary matrices it is set to \e Index type. This can be changed * bu the user, of course. - * + * */ template< typename Real, typename Device = Devices::Host, typename Index = int, typename MatrixType = GeneralMatrix, - template< typename Device_, typename Index_ > class SegmentsView = Algorithms::Segments::CSRView, + template< typename Device_, typename Index_ > class SegmentsView = Algorithms::Segments::CSRViewDefault, typename ComputeReal = typename ChooseSparseMatrixComputeReal< Real, Index >::type > class SparseMatrixView : public MatrixView< Real, Device, Index > { @@ -79,14 +79,14 @@ class SparseMatrixView : public MatrixView< Real, Device, Index > /** * \brief Test of symmetric matrix type. - * + * * \return \e true if the matrix is stored as symmetric and \e false otherwise. */ static constexpr bool isSymmetric() { return MatrixType::isSymmetric(); }; /** * \brief Test of binary matrix type. - * + * * \return \e true if the matrix is stored as binary and \e false otherwise. */ static constexpr bool isBinary() { return std::is_same< Real, bool >::value; }; @@ -120,7 +120,7 @@ class SparseMatrixView : public MatrixView< Real, Device, Index > using SegmentsViewType = SegmentsView< Device, Index >; /** - * \brief Type of related matrix view. + * \brief Type of related matrix view. */ using ViewType = SparseMatrixView< std::remove_const_t< Real >, Device, Index, MatrixType, SegmentsViewTemplate >; @@ -158,7 +158,7 @@ class SparseMatrixView : public MatrixView< Real, Device, Index > /** * \brief Constructor with all necessary data and views. - * + * * \param rows is a number of matrix rows. * \param columns is a number of matrix columns. * \param values is a vector view with matrix elements values. @@ -174,7 +174,7 @@ class SparseMatrixView : public MatrixView< Real, Device, Index > /** * \brief Copy constructor. - * + * * \param matrix is an input sparse matrix view. */ __cuda_callable__ @@ -182,7 +182,7 @@ class SparseMatrixView : public MatrixView< Real, Device, Index > /** * \brief Move constructor. - * + * * \param matrix is an input sparse matrix view. */ __cuda_callable__ @@ -190,7 +190,7 @@ class SparseMatrixView : public MatrixView< Real, Device, Index > /** * \brief Returns a modifiable view of the sparse matrix. - * + * * \return sparse matrix view. */ __cuda_callable__ @@ -198,7 +198,7 @@ class SparseMatrixView : public MatrixView< Real, Device, Index > /** * \brief Returns a non-modifiable view of the sparse matrix. - * + * * \return sparse matrix view. */ __cuda_callable__ @@ -206,11 +206,11 @@ class SparseMatrixView : public MatrixView< Real, Device, Index > /** * \brief Returns string with serialization type. - * + * * The string has a form `Matrices::SparseMatrix< RealType, [any_device], IndexType, General/Symmetric, Format, [any_allocator] >`. - * + * * \return \ref String with the serialization type. - * + * * \par Example * \include Matrices/SparseMatrix/SparseMatrixViewExample_getSerializationType.cpp * \par Output @@ -220,11 +220,11 @@ class SparseMatrixView : public MatrixView< Real, Device, Index > /** * \brief Returns string with serialization type. - * + * * See \ref SparseMatrix::getSerializationType. - * + * * \return \e String with the serialization type. - * + * * \par Example * \include Matrices/SparseMatrix/SparseMatrixExample_getSerializationType.cpp * \par Output @@ -234,10 +234,10 @@ class SparseMatrixView : public MatrixView< Real, Device, Index > /** * \brief Computes number of non-zeros in each row. - * + * * \param rowLengths is a vector into which the number of non-zeros in each row * will be stored. - * + * * \par Example * \include Matrices/SparseMatrix/SparseMatrixViewExample_getCompressedRowLengths.cpp * \par Output @@ -248,7 +248,7 @@ class SparseMatrixView : public MatrixView< Real, Device, Index > /** * \brief Returns capacity of given matrix row. - * + * * \param row index of matrix row. * \return number of matrix elements allocated for the row. */ @@ -257,26 +257,26 @@ class SparseMatrixView : public MatrixView< Real, Device, Index > /** * \brief Returns number of non-zero matrix elements. - * + * * This method really counts the non-zero matrix elements and so * it returns zero for matrix having all allocated elements set to zero. - * + * * \return number of non-zero matrix elements. */ IndexType getNonzeroElementsCount() const; /** * \brief Constant getter of simple structure for accessing given matrix row. - * + * * \param rowIdx is matrix row index. - * + * * \return RowView for accessing given matrix row. * * \par Example * \include Matrices/SparseMatrix/SparseMatrixViewExample_getConstRow.cpp * \par Output * \include SparseMatrixViewExample_getConstRow.out - * + * * See \ref SparseMatrixRowView. */ __cuda_callable__ @@ -284,16 +284,16 @@ class SparseMatrixView : public MatrixView< Real, Device, Index > /** * \brief Non-constant getter of simple structure for accessing given matrix row. - * + * * \param rowIdx is matrix row index. - * + * * \return RowView for accessing given matrix row. - * + * * \par Example * \include Matrices/SparseMatrix/SparseMatrixViewExample_getRow.cpp * \par Output * \include SparseMatrixViewExample_getRow.out - * + * * See \ref SparseMatrixRowView. */ __cuda_callable__ @@ -301,7 +301,7 @@ class SparseMatrixView : public MatrixView< Real, Device, Index > /** * \brief Sets element at given \e row and \e column to given \e value. - * + * * This method can be called from the host system (CPU) no matter * where the matrix is allocated. If the matrix is allocated on GPU this method * can be called even from device kernels. If the matrix is allocated in GPU device @@ -309,11 +309,11 @@ class SparseMatrixView : public MatrixView< Real, Device, Index > * performance is very low. For higher performance see. \ref SparseMatrix::getRow * or \ref SparseMatrix::forRows and \ref SparseMatrix::forAllRows. * The call may fail if the matrix row capacity is exhausted. - * + * * \param row is row index of the element. * \param column is columns index of the element. * \param value is the value the element will be set to. - * + * * \par Example * \include Matrices/SparseMatrix/SparseMatrixViewExample_setElement.cpp * \par Output diff --git a/src/TNL/Matrices/SparseMatrixView.hpp b/src/TNL/Matrices/SparseMatrixView.hpp index b031e846d5094955d037b8ed55f21967d524b571..26217620bbbc7ad3226829937be257ba7935d7ff 100644 --- a/src/TNL/Matrices/SparseMatrixView.hpp +++ b/src/TNL/Matrices/SparseMatrixView.hpp @@ -383,8 +383,8 @@ vectorProduct( const InVector& inVector, static_assert( ! MatrixType::isSymmetric() || ! std::is_same< Device, Devices::Cuda >::value || - ( std::is_same< OutVectorReal, float >::value || - std::is_same< OutVectorReal, double >::value || + ( std::is_same< OutVectorReal, float >::value || + std::is_same< OutVectorReal, double >::value || std::is_same< OutVectorReal, int >::value || std::is_same< OutVectorReal, long long int >::value ), "Given Real type is not supported by atomic operations on GPU which are necessary for symmetric operations." ); @@ -484,6 +484,7 @@ rowsReduction( IndexType begin, IndexType end, Fetch& fetch, const Reduce& reduc const auto values_view = this->values.getConstView(); const IndexType paddingIndex_ = this->getPaddingIndex(); auto fetch_ = [=] __cuda_callable__ ( IndexType rowIdx, IndexType localIdx, IndexType globalIdx, bool& compute ) mutable -> decltype( fetch( IndexType(), IndexType(), RealType() ) ) { + TNL_ASSERT_LT( globalIdx, columns_view.getSize(), "" ); IndexType columnIdx = columns_view[ globalIdx ]; if( columnIdx != paddingIndex_ ) { diff --git a/src/TNL/Solvers/Linear/Preconditioners/ILU0.h b/src/TNL/Solvers/Linear/Preconditioners/ILU0.h index a4eb9e8aae26786412fe8945a9ccf2795f6293fa..8791b95e2500cc415e311935348aa791bcc8fd93 100644 --- a/src/TNL/Solvers/Linear/Preconditioners/ILU0.h +++ b/src/TNL/Solvers/Linear/Preconditioners/ILU0.h @@ -77,7 +77,7 @@ public: protected: // The factors L and U are stored separately and the rows of U are reversed. - Matrices::SparseMatrix< RealType, DeviceType, IndexType, Matrices::GeneralMatrix, Algorithms::Segments::CSR > L, U; + Matrices::SparseMatrix< RealType, DeviceType, IndexType, Matrices::GeneralMatrix, Algorithms::Segments::CSRDefault > L, U; // Specialized methods to distinguish between normal and distributed matrices // in the implementation. diff --git a/src/TNL/Solvers/Linear/Preconditioners/ILUT.h b/src/TNL/Solvers/Linear/Preconditioners/ILUT.h index 344daf1a0103a0a93ca576358b2da787d7578f8b..82ab88e862603464ebfb5f8b799a490f9d158e86 100644 --- a/src/TNL/Solvers/Linear/Preconditioners/ILUT.h +++ b/src/TNL/Solvers/Linear/Preconditioners/ILUT.h @@ -66,7 +66,7 @@ protected: Real tau = 1e-4; // The factors L and U are stored separately and the rows of U are reversed. - Matrices::SparseMatrix< RealType, DeviceType, IndexType, Matrices::GeneralMatrix, Algorithms::Segments::CSR > L, U; + Matrices::SparseMatrix< RealType, DeviceType, IndexType, Matrices::GeneralMatrix, Algorithms::Segments::CSRDefault > L, U; // Specialized methods to distinguish between normal and distributed matrices // in the implementation. diff --git a/src/UnitTests/Matrices/BinarySparseMatrixCopyTest.h b/src/UnitTests/Matrices/BinarySparseMatrixCopyTest.h index 8a6e0abddda2cfcb9acd70c6b8ba350cfeb3d28e..609a6afd74cd3e37ab1856529815b79ffd3ab9cf 100644 --- a/src/UnitTests/Matrices/BinarySparseMatrixCopyTest.h +++ b/src/UnitTests/Matrices/BinarySparseMatrixCopyTest.h @@ -27,8 +27,8 @@ using EllpackSegments = TNL::Algorithms::Segments::Ellpack< Device, Index, Index template< typename Device, typename Index, typename IndexAllocator > using SlicedEllpackSegments = TNL::Algorithms::Segments::SlicedEllpack< Device, Index, IndexAllocator >; -using CSR_host = TNL::Matrices::SparseMatrix< bool, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSR >; -using CSR_cuda = TNL::Matrices::SparseMatrix< bool, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSR >; +using CSR_host = TNL::Matrices::SparseMatrix< bool, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRDefault >; +using CSR_cuda = TNL::Matrices::SparseMatrix< bool, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRDefault >; using E_host = TNL::Matrices::SparseMatrix< bool, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, EllpackSegments >; using E_cuda = TNL::Matrices::SparseMatrix< bool, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, EllpackSegments >; using SE_host = TNL::Matrices::SparseMatrix< bool, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, SlicedEllpackSegments >; diff --git a/src/UnitTests/Matrices/BinarySparseMatrixTest_CSR.h b/src/UnitTests/Matrices/BinarySparseMatrixTest_CSR.h index 8f7dad73c719fd5121650a4e6170149a69075036..5a4e98915cbec11a8107194b5b3ed33ab26e4e8c 100644 --- a/src/UnitTests/Matrices/BinarySparseMatrixTest_CSR.h +++ b/src/UnitTests/Matrices/BinarySparseMatrixTest_CSR.h @@ -29,11 +29,11 @@ protected: // types for which MatrixTest is instantiated using CSRMatrixTypes = ::testing::Types < - TNL::Matrices::SparseMatrix< bool, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSR, int >, - TNL::Matrices::SparseMatrix< bool, TNL::Devices::Host, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSR, int > + TNL::Matrices::SparseMatrix< bool, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRDefault, int >, + TNL::Matrices::SparseMatrix< bool, TNL::Devices::Host, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRDefault, int > #ifdef HAVE_CUDA - ,TNL::Matrices::SparseMatrix< bool, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSR, int >, - TNL::Matrices::SparseMatrix< bool, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSR, int > + ,TNL::Matrices::SparseMatrix< bool, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRDefault, int >, + TNL::Matrices::SparseMatrix< bool, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRDefault, int > #endif >; diff --git a/src/UnitTests/Matrices/CMakeLists.txt b/src/UnitTests/Matrices/CMakeLists.txt index b713c8f0ca76d534b8abb903097b77b8bc8bd22b..a65411fc065b299fdc86d2e2550f07456257c75e 100644 --- a/src/UnitTests/Matrices/CMakeLists.txt +++ b/src/UnitTests/Matrices/CMakeLists.txt @@ -6,7 +6,10 @@ set( COMMON_TESTS TridiagonalMatrixTest MultidiagonalMatrixTest - SparseMatrixTest_CSR + SparseMatrixTest_CSRScalar + SparseMatrixTest_CSRVector + SparseMatrixTest_CSRHybrid + SparseMatrixTest_CSRAdaptive SparseMatrixTest_Ellpack SparseMatrixTest_SlicedEllpack SparseMatrixTest_ChunkedEllpack diff --git a/src/UnitTests/Matrices/DenseMatrixCopyTest.h b/src/UnitTests/Matrices/DenseMatrixCopyTest.h index d86eb57f5cf6fbdaafe51734d9ea834f2bb8823e..dfdcc3b83556183e6e935b21545dbd5b2c8c3347 100644 --- a/src/UnitTests/Matrices/DenseMatrixCopyTest.h +++ b/src/UnitTests/Matrices/DenseMatrixCopyTest.h @@ -27,8 +27,8 @@ using EllpackSegments = TNL::Algorithms::Segments::Ellpack< Device, Index, Index template< typename Device, typename Index, typename IndexAllocator > using SlicedEllpackSegments = TNL::Algorithms::Segments::SlicedEllpack< Device, Index, IndexAllocator >; -using CSR_host = TNL::Matrices::SparseMatrix< int, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSR >; -using CSR_cuda = TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSR >; +using CSR_host = TNL::Matrices::SparseMatrix< int, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRDefault >; +using CSR_cuda = TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRDefault >; using E_host = TNL::Matrices::SparseMatrix< int, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, EllpackSegments >; using E_cuda = TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, EllpackSegments >; using SE_host = TNL::Matrices::SparseMatrix< int, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, SlicedEllpackSegments >; diff --git a/src/UnitTests/Matrices/SparseMatrixCopyTest.h b/src/UnitTests/Matrices/SparseMatrixCopyTest.h index c9f68b5885849209b0e5d1848a16c313b3a32fbd..826b7af6b6bcb2cf76b7f0a6f81492341bd51fbe 100644 --- a/src/UnitTests/Matrices/SparseMatrixCopyTest.h +++ b/src/UnitTests/Matrices/SparseMatrixCopyTest.h @@ -27,8 +27,8 @@ using EllpackSegments = TNL::Algorithms::Segments::Ellpack< Device, Index, Index template< typename Device, typename Index, typename IndexAllocator > using SlicedEllpackSegments = TNL::Algorithms::Segments::SlicedEllpack< Device, Index, IndexAllocator >; -using CSR_host = TNL::Matrices::SparseMatrix< int, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSR >; -using CSR_cuda = TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSR >; +using CSR_host = TNL::Matrices::SparseMatrix< int, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRDefault >; +using CSR_cuda = TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRDefault >; using E_host = TNL::Matrices::SparseMatrix< int, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, EllpackSegments >; using E_cuda = TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, EllpackSegments >; using SE_host = TNL::Matrices::SparseMatrix< int, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, SlicedEllpackSegments >; diff --git a/src/UnitTests/Matrices/SparseMatrixTest.hpp b/src/UnitTests/Matrices/SparseMatrixTest.hpp index b5885afbe83a75ee51eb078c75d1b65e02eaeabc..00794032e6e859462c720e7695f2e311616a0ce9 100644 --- a/src/UnitTests/Matrices/SparseMatrixTest.hpp +++ b/src/UnitTests/Matrices/SparseMatrixTest.hpp @@ -1070,7 +1070,6 @@ void test_VectorProduct() outVector_1.setElement( j, 0 ); m_1.vectorProduct( inVector_1, outVector_1 ); - EXPECT_EQ( outVector_1.getElement( 0 ), 2 ); EXPECT_EQ( outVector_1.getElement( 1 ), 10 ); EXPECT_EQ( outVector_1.getElement( 2 ), 8 ); @@ -1310,7 +1309,7 @@ void test_VectorProduct() ///// // Large test - const IndexType size( 35 ); + const IndexType size( 1051 ); //for( int size = 1; size < 1000; size++ ) { //std::cerr << " size = " << size << std::endl; @@ -1338,26 +1337,28 @@ void test_VectorProduct() EXPECT_EQ( out.getElement( i ), i + 1 ); // Test with large triangular matrix - Matrix m2( size, size ); - rowCapacities.evaluate( [] __cuda_callable__ ( IndexType i ) { return i + 1; } ); + const int rows( size ), columns( size ); + Matrix m2( rows, columns ); + rowCapacities.setSize( rows ); + rowCapacities.evaluate( [=] __cuda_callable__ ( IndexType i ) { return i + 1; } ); m2.setRowCapacities( rowCapacities ); auto f2 = [=] __cuda_callable__ ( IndexType row, IndexType localIdx, IndexType& column, RealType& value, bool& compute ) { if( localIdx <= row ) { - value = row -localIdx + 1; + value = localIdx + 1; column = localIdx; } }; m2.forAllRows( f2 ); // check that the matrix was initialized - TNL::Containers::Vector< IndexType, DeviceType, IndexType > rowLengths( size ); + TNL::Containers::Vector< IndexType, DeviceType, IndexType > rowLengths( rows ); m2.getCompressedRowLengths( rowLengths ); EXPECT_EQ( rowLengths, rowCapacities ); + out.setSize( rows ); out = 0.0; m2.vectorProduct( in, out ); - //std::cerr << out << std::endl; - for( IndexType i = 0; i < size; i++ ) + for( IndexType i = 0; i < rows; i++ ) EXPECT_EQ( out.getElement( i ), ( i + 1 ) * ( i + 2 ) / 2 ); } } diff --git a/src/UnitTests/Matrices/SparseMatrixTest_CSRAdaptive.cpp b/src/UnitTests/Matrices/SparseMatrixTest_CSRAdaptive.cpp new file mode 100644 index 0000000000000000000000000000000000000000..41306c6da762c35be285912d097541e45e2f641b --- /dev/null +++ b/src/UnitTests/Matrices/SparseMatrixTest_CSRAdaptive.cpp @@ -0,0 +1,11 @@ +/*************************************************************************** + SparseMatrixTest_CSRAdaptive.cpp - description + ------------------- + begin : Jan 23, 2021 + copyright : (C) 2021 by Tomas Oberhuber et al. + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +#include "SparseMatrixTest_CSRAdaptive.h" diff --git a/src/UnitTests/Matrices/SparseMatrixTest_CSRAdaptive.cu b/src/UnitTests/Matrices/SparseMatrixTest_CSRAdaptive.cu new file mode 100644 index 0000000000000000000000000000000000000000..50a4333330398f537c95a2b1f17fd3c1c73ba655 --- /dev/null +++ b/src/UnitTests/Matrices/SparseMatrixTest_CSRAdaptive.cu @@ -0,0 +1,11 @@ +/*************************************************************************** + SparseMatrixTest_CSRAdaptive.cu - description + ------------------- + begin : Jan 23, 2021 + copyright : (C) 2021 by Tomas Oberhuber et al. + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +#include "SparseMatrixTest_CSRAdaptive.h" diff --git a/src/UnitTests/Matrices/SparseMatrixTest_CSRAdaptive.h b/src/UnitTests/Matrices/SparseMatrixTest_CSRAdaptive.h new file mode 100644 index 0000000000000000000000000000000000000000..2756868220b220e1bb1e78cea02b9903128ff413 --- /dev/null +++ b/src/UnitTests/Matrices/SparseMatrixTest_CSRAdaptive.h @@ -0,0 +1,46 @@ +/*************************************************************************** + SparseMatrixTest_CSRAdaptive.h - description + ------------------- + begin : Jan 23, 2021 + copyright : (C) 2021 by Tomas Oberhuber et al. + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +#include +#include +#include + +#ifdef HAVE_GTEST +#include + +const char* saveAndLoadFileName = "test_SparseMatrixTest_CSRAdaptive_segments"; + +// types for which MatrixTest is instantiated +using MatrixTypes = ::testing::Types +< + TNL::Matrices::SparseMatrix< int, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >, + TNL::Matrices::SparseMatrix< long, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >, + TNL::Matrices::SparseMatrix< float, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >, + TNL::Matrices::SparseMatrix< double, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >, + TNL::Matrices::SparseMatrix< int, TNL::Devices::Host, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >, + TNL::Matrices::SparseMatrix< long, TNL::Devices::Host, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >, + TNL::Matrices::SparseMatrix< float, TNL::Devices::Host, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >, + TNL::Matrices::SparseMatrix< double, TNL::Devices::Host, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive > +#ifdef HAVE_CUDA + ,TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >, + TNL::Matrices::SparseMatrix< long, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >, + TNL::Matrices::SparseMatrix< float, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >, + TNL::Matrices::SparseMatrix< double, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >, + TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >, + TNL::Matrices::SparseMatrix< long, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >, + TNL::Matrices::SparseMatrix< float, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >, + TNL::Matrices::SparseMatrix< double, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive > +#endif +>; + +#endif + +#include "SparseMatrixTest.h" +#include "../main.h" diff --git a/src/UnitTests/Matrices/SparseMatrixTest_CSRHybrid.cpp b/src/UnitTests/Matrices/SparseMatrixTest_CSRHybrid.cpp new file mode 100644 index 0000000000000000000000000000000000000000..214ed2ca7c6990dd03932cfc862f82dc1633f865 --- /dev/null +++ b/src/UnitTests/Matrices/SparseMatrixTest_CSRHybrid.cpp @@ -0,0 +1,11 @@ +/*************************************************************************** + SparseMatrixTest_CSRHybrid.cpp - description + ------------------- + begin : Jan 23, 2021 + copyright : (C) 2021 by Tomas Oberhuber et al. + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +#include "SparseMatrixTest_CSRHybrid.h" diff --git a/src/UnitTests/Matrices/SparseMatrixTest_CSRHybrid.cu b/src/UnitTests/Matrices/SparseMatrixTest_CSRHybrid.cu new file mode 100644 index 0000000000000000000000000000000000000000..c0a0918d782b7c86dd603927f7886cb5019f8cb2 --- /dev/null +++ b/src/UnitTests/Matrices/SparseMatrixTest_CSRHybrid.cu @@ -0,0 +1,11 @@ +/*************************************************************************** + SparseMatrixTest_CSRHybrid.cu - description + ------------------- + begin : Jan 23, 2021 + copyright : (C) 2021 by Tomas Oberhuber et al. + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +#include "SparseMatrixTest_CSRHybrid.h" diff --git a/src/UnitTests/Matrices/SparseMatrixTest_CSRHybrid.h b/src/UnitTests/Matrices/SparseMatrixTest_CSRHybrid.h new file mode 100644 index 0000000000000000000000000000000000000000..24ba77fa0e87ab13081622ed7cbd52180f035eca --- /dev/null +++ b/src/UnitTests/Matrices/SparseMatrixTest_CSRHybrid.h @@ -0,0 +1,46 @@ +/*************************************************************************** + SparseMatrixTest_CSRHybrid.h - description + ------------------- + begin : Jan 23, 2021 + copyright : (C) 2021 by Tomas Oberhuber et al. + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +#include +#include +#include + +#ifdef HAVE_GTEST +#include + +const char* saveAndLoadFileName = "test_SparseMatrixTest_CSRHybrid_segments"; + +// types for which MatrixTest is instantiated +using MatrixTypes = ::testing::Types +< + TNL::Matrices::SparseMatrix< int, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >, + TNL::Matrices::SparseMatrix< long, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >, + TNL::Matrices::SparseMatrix< float, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >, + TNL::Matrices::SparseMatrix< double, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >, + TNL::Matrices::SparseMatrix< int, TNL::Devices::Host, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >, + TNL::Matrices::SparseMatrix< long, TNL::Devices::Host, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >, + TNL::Matrices::SparseMatrix< float, TNL::Devices::Host, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >, + TNL::Matrices::SparseMatrix< double, TNL::Devices::Host, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid > +#ifdef HAVE_CUDA + ,TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >, + TNL::Matrices::SparseMatrix< long, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >, + TNL::Matrices::SparseMatrix< float, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >, + TNL::Matrices::SparseMatrix< double, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >, + TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >, + TNL::Matrices::SparseMatrix< long, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >, + TNL::Matrices::SparseMatrix< float, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >, + TNL::Matrices::SparseMatrix< double, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid > +#endif +>; + +#endif + +#include "SparseMatrixTest.h" +#include "../main.h" diff --git a/src/UnitTests/Matrices/SparseMatrixTest_CSR.cu b/src/UnitTests/Matrices/SparseMatrixTest_CSRScalar.cpp similarity index 78% rename from src/UnitTests/Matrices/SparseMatrixTest_CSR.cu rename to src/UnitTests/Matrices/SparseMatrixTest_CSRScalar.cpp index 91f0de81a928a6f5676b2d839a35496dfdae61c3..0f73d79aae249be2af3328f6a9dc459f5a6cbf3b 100644 --- a/src/UnitTests/Matrices/SparseMatrixTest_CSR.cu +++ b/src/UnitTests/Matrices/SparseMatrixTest_CSRScalar.cpp @@ -1,5 +1,5 @@ /*************************************************************************** - SparseMatrixTest_CSR.cu - description + SparseMatrixTest_CSRScalar.cpp - description ------------------- begin : Dec 3, 2019 copyright : (C) 2019 by Tomas Oberhuber et al. @@ -8,4 +8,4 @@ /* See Copyright Notice in tnl/Copyright */ -#include "SparseMatrixTest_CSR.h" +#include "SparseMatrixTest_CSRScalar.h" diff --git a/src/UnitTests/Matrices/SparseMatrixTest_CSR.cpp b/src/UnitTests/Matrices/SparseMatrixTest_CSRScalar.cu similarity index 78% rename from src/UnitTests/Matrices/SparseMatrixTest_CSR.cpp rename to src/UnitTests/Matrices/SparseMatrixTest_CSRScalar.cu index 5830658abd3135064cc46c8a7c534252a0421935..ff22ae692560e45b5fa0d2380a6a71267fba80a9 100644 --- a/src/UnitTests/Matrices/SparseMatrixTest_CSR.cpp +++ b/src/UnitTests/Matrices/SparseMatrixTest_CSRScalar.cu @@ -1,5 +1,5 @@ /*************************************************************************** - SparseMatrixTest_CSR.cpp - description + SparseMatrixTest_CSRScalar.cu - description ------------------- begin : Dec 3, 2019 copyright : (C) 2019 by Tomas Oberhuber et al. @@ -8,4 +8,4 @@ /* See Copyright Notice in tnl/Copyright */ -#include "SparseMatrixTest_CSR.h" +#include "SparseMatrixTest_CSRScalar.h" diff --git a/src/UnitTests/Matrices/SparseMatrixTest_CSR.h b/src/UnitTests/Matrices/SparseMatrixTest_CSRScalar.h similarity index 89% rename from src/UnitTests/Matrices/SparseMatrixTest_CSR.h rename to src/UnitTests/Matrices/SparseMatrixTest_CSRScalar.h index e090f5f62e920b2e307d551ccc2301e36027b6fb..0902ee81a6400a62131227a7f616a2da7b9d78e9 100644 --- a/src/UnitTests/Matrices/SparseMatrixTest_CSR.h +++ b/src/UnitTests/Matrices/SparseMatrixTest_CSRScalar.h @@ -1,5 +1,5 @@ /*************************************************************************** - SparseMatrixTest_CSR.h - description + SparseMatrixTest_CSRScalar.h - description ------------------- begin : Dec 2, 2019 copyright : (C) 2019 by Tomas Oberhuber et al. @@ -15,28 +15,28 @@ #ifdef HAVE_GTEST #include -const char* saveAndLoadFileName = "test_SparseMatrixTest_CSR_segments"; +const char* saveAndLoadFileName = "test_SparseMatrixTest_CSRScalar_segments"; // types for which MatrixTest is instantiated using MatrixTypes = ::testing::Types < - TNL::Matrices::SparseMatrix< int, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSR >, - TNL::Matrices::SparseMatrix< long, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSR >, - TNL::Matrices::SparseMatrix< float, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSR >, - TNL::Matrices::SparseMatrix< double, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSR >, - TNL::Matrices::SparseMatrix< int, TNL::Devices::Host, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSR >, - TNL::Matrices::SparseMatrix< long, TNL::Devices::Host, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSR >, - TNL::Matrices::SparseMatrix< float, TNL::Devices::Host, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSR >, - TNL::Matrices::SparseMatrix< double, TNL::Devices::Host, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSR > + TNL::Matrices::SparseMatrix< int, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRScalar >, + TNL::Matrices::SparseMatrix< long, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRScalar >, + TNL::Matrices::SparseMatrix< float, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRScalar >, + TNL::Matrices::SparseMatrix< double, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRScalar >, + TNL::Matrices::SparseMatrix< int, TNL::Devices::Host, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRScalar >, + TNL::Matrices::SparseMatrix< long, TNL::Devices::Host, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRScalar >, + TNL::Matrices::SparseMatrix< float, TNL::Devices::Host, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRScalar >, + TNL::Matrices::SparseMatrix< double, TNL::Devices::Host, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRScalar > #ifdef HAVE_CUDA - ,TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSR >, - TNL::Matrices::SparseMatrix< long, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSR >, - TNL::Matrices::SparseMatrix< float, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSR >, - TNL::Matrices::SparseMatrix< double, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSR >, - TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSR >, - TNL::Matrices::SparseMatrix< long, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSR >, - TNL::Matrices::SparseMatrix< float, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSR >, - TNL::Matrices::SparseMatrix< double, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSR > + ,TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRScalar >, + TNL::Matrices::SparseMatrix< long, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRScalar >, + TNL::Matrices::SparseMatrix< float, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRScalar >, + TNL::Matrices::SparseMatrix< double, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRScalar >, + TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRScalar >, + TNL::Matrices::SparseMatrix< long, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRScalar >, + TNL::Matrices::SparseMatrix< float, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRScalar >, + TNL::Matrices::SparseMatrix< double, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRScalar > #endif >; diff --git a/src/UnitTests/Matrices/SparseMatrixTest_CSRVector.cpp b/src/UnitTests/Matrices/SparseMatrixTest_CSRVector.cpp new file mode 100644 index 0000000000000000000000000000000000000000..c60c5e1f7d39dacb628eec317f8fb8b31366b574 --- /dev/null +++ b/src/UnitTests/Matrices/SparseMatrixTest_CSRVector.cpp @@ -0,0 +1,11 @@ +/*************************************************************************** + SparseMatrixTest_CSRVector.cpp - description + ------------------- + begin : Jan 22, 2021 + copyright : (C) 2021 by Tomas Oberhuber et al. + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +#include "SparseMatrixTest_CSRVector.h" diff --git a/src/UnitTests/Matrices/SparseMatrixTest_CSRVector.cu b/src/UnitTests/Matrices/SparseMatrixTest_CSRVector.cu new file mode 100644 index 0000000000000000000000000000000000000000..5c78647a1395ac87466c74d7f703cd57b0336e68 --- /dev/null +++ b/src/UnitTests/Matrices/SparseMatrixTest_CSRVector.cu @@ -0,0 +1,11 @@ +/*************************************************************************** + SparseMatrixTest_CSRVector.cu - description + ------------------- + begin : Jan 22, 2021 + copyright : (C) 2021 by Tomas Oberhuber et al. + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +#include "SparseMatrixTest_CSRVector.h" diff --git a/src/UnitTests/Matrices/SparseMatrixTest_CSRVector.h b/src/UnitTests/Matrices/SparseMatrixTest_CSRVector.h new file mode 100644 index 0000000000000000000000000000000000000000..8d50fc686c3f828321f1fe40b97dd8ee2d1c3205 --- /dev/null +++ b/src/UnitTests/Matrices/SparseMatrixTest_CSRVector.h @@ -0,0 +1,46 @@ +/*************************************************************************** + SparseMatrixTest_CSRVector.h - description + ------------------- + begin : Jan 22, 2021 + copyright : (C) 2021 by Tomas Oberhuber et al. + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +#include +#include +#include + +#ifdef HAVE_GTEST +#include + +const char* saveAndLoadFileName = "test_SparseMatrixTest_CSRVector_segments"; + +// types for which MatrixTest is instantiated +using MatrixTypes = ::testing::Types +< + TNL::Matrices::SparseMatrix< int, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRVector >, + TNL::Matrices::SparseMatrix< long, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRVector >, + TNL::Matrices::SparseMatrix< float, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRVector >, + TNL::Matrices::SparseMatrix< double, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRVector >, + TNL::Matrices::SparseMatrix< int, TNL::Devices::Host, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRVector >, + TNL::Matrices::SparseMatrix< long, TNL::Devices::Host, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRVector >, + TNL::Matrices::SparseMatrix< float, TNL::Devices::Host, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRVector >, + TNL::Matrices::SparseMatrix< double, TNL::Devices::Host, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRVector > +#ifdef HAVE_CUDA + ,TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRVector >, + TNL::Matrices::SparseMatrix< long, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRVector >, + TNL::Matrices::SparseMatrix< float, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRVector >, + TNL::Matrices::SparseMatrix< double, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRVector >, + TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRVector >, + TNL::Matrices::SparseMatrix< long, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRVector >, + TNL::Matrices::SparseMatrix< float, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRVector >, + TNL::Matrices::SparseMatrix< double, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRVector > +#endif +>; + +#endif + +#include "SparseMatrixTest.h" +#include "../main.h" diff --git a/src/UnitTests/Matrices/SymmetricSparseMatrixTest_CSR.h b/src/UnitTests/Matrices/SymmetricSparseMatrixTest_CSR.h index 439fab7dfec75dcde32bc880abfe114b5a0257fd..5feb97e11cfa36adddf320cf278eb9860c36ff71 100644 --- a/src/UnitTests/Matrices/SymmetricSparseMatrixTest_CSR.h +++ b/src/UnitTests/Matrices/SymmetricSparseMatrixTest_CSR.h @@ -24,31 +24,31 @@ // types for which MatrixTest is instantiated using MatrixTypes = ::testing::Types < - TNL::Matrices::SparseMatrix< int, TNL::Devices::Host, short, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSR >, - TNL::Matrices::SparseMatrix< long, TNL::Devices::Host, short, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSR >, - TNL::Matrices::SparseMatrix< float, TNL::Devices::Host, short, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSR >, - TNL::Matrices::SparseMatrix< double, TNL::Devices::Host, short, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSR >, - TNL::Matrices::SparseMatrix< int, TNL::Devices::Host, int, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSR >, - TNL::Matrices::SparseMatrix< long, TNL::Devices::Host, int, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSR >, - TNL::Matrices::SparseMatrix< float, TNL::Devices::Host, int, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSR >, - TNL::Matrices::SparseMatrix< double, TNL::Devices::Host, int, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSR >, - TNL::Matrices::SparseMatrix< int, TNL::Devices::Host, long, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSR >, - TNL::Matrices::SparseMatrix< long, TNL::Devices::Host, long, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSR >, - TNL::Matrices::SparseMatrix< float, TNL::Devices::Host, long, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSR >, - TNL::Matrices::SparseMatrix< double, TNL::Devices::Host, long, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSR > + TNL::Matrices::SparseMatrix< int, TNL::Devices::Host, short, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSRDefault >, + TNL::Matrices::SparseMatrix< long, TNL::Devices::Host, short, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSRDefault >, + TNL::Matrices::SparseMatrix< float, TNL::Devices::Host, short, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSRDefault >, + TNL::Matrices::SparseMatrix< double, TNL::Devices::Host, short, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSRDefault >, + TNL::Matrices::SparseMatrix< int, TNL::Devices::Host, int, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSRDefault >, + TNL::Matrices::SparseMatrix< long, TNL::Devices::Host, int, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSRDefault >, + TNL::Matrices::SparseMatrix< float, TNL::Devices::Host, int, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSRDefault >, + TNL::Matrices::SparseMatrix< double, TNL::Devices::Host, int, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSRDefault >, + TNL::Matrices::SparseMatrix< int, TNL::Devices::Host, long, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSRDefault >, + TNL::Matrices::SparseMatrix< long, TNL::Devices::Host, long, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSRDefault >, + TNL::Matrices::SparseMatrix< float, TNL::Devices::Host, long, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSRDefault >, + TNL::Matrices::SparseMatrix< double, TNL::Devices::Host, long, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSRDefault > #ifdef HAVE_CUDA // Commented types are not supported by atomic operations on GPU. - ,//TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, short, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSR >, - //TNL::Matrices::SparseMatrix< long, TNL::Devices::Cuda, short, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSR >, - //TNL::Matrices::SparseMatrix< float, TNL::Devices::Cuda, short, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSR >, - //TNL::Matrices::SparseMatrix< double, TNL::Devices::Cuda, short, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSR >, - TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, int, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSR >, - //TNL::Matrices::SparseMatrix< long, TNL::Devices::Cuda, int, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSR >, - TNL::Matrices::SparseMatrix< float, TNL::Devices::Cuda, int, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSR >, - TNL::Matrices::SparseMatrix< double, TNL::Devices::Cuda, int, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSR >, - //TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, long, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSR >, - //TNL::Matrices::SparseMatrix< long, TNL::Devices::Cuda, long, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSR >, - //TNL::Matrices::SparseMatrix< float, TNL::Devices::Cuda, long, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSR >, - //TNL::Matrices::SparseMatrix< double, TNL::Devices::Cuda, long, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSR > + ,//TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, short, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSRDefault >, + //TNL::Matrices::SparseMatrix< long, TNL::Devices::Cuda, short, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSRDefault >, + //TNL::Matrices::SparseMatrix< float, TNL::Devices::Cuda, short, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSRDefault >, + //TNL::Matrices::SparseMatrix< double, TNL::Devices::Cuda, short, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSRDefault >, + TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, int, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSRDefault >, + //TNL::Matrices::SparseMatrix< long, TNL::Devices::Cuda, int, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSRDefault >, + TNL::Matrices::SparseMatrix< float, TNL::Devices::Cuda, int, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSRDefault >, + TNL::Matrices::SparseMatrix< double, TNL::Devices::Cuda, int, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSRDefault >, + //TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, long, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSRDefault >, + //TNL::Matrices::SparseMatrix< long, TNL::Devices::Cuda, long, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSRDefault >, + //TNL::Matrices::SparseMatrix< float, TNL::Devices::Cuda, long, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSRDefault >, + //TNL::Matrices::SparseMatrix< double, TNL::Devices::Cuda, long, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSRDefault > #endif // HAVE_CUDA >;