From 5f8e4c109b4b478b5335176527de8bcc538c10ea Mon Sep 17 00:00:00 2001 From: Illia Kolesnik Date: Thu, 27 Aug 2020 22:36:33 +0200 Subject: [PATCH 01/27] Changed number of elements per warp for CSR Adaptive, CSR Multivector and Improved CSR Light --- src/TNL/Matrices/Legacy/CSR.h | 7 +++++-- src/TNL/Matrices/Legacy/CSR_impl.h | 6 +++--- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/src/TNL/Matrices/Legacy/CSR.h b/src/TNL/Matrices/Legacy/CSR.h index 7570eac8b..42f68b127 100644 --- a/src/TNL/Matrices/Legacy/CSR.h +++ b/src/TNL/Matrices/Legacy/CSR.h @@ -115,8 +115,11 @@ public: static constexpr Index THREADS_VECTOR = 128; static constexpr Index THREADS_LIGHT = 128; - /* Max length of row to process one warp */ - static constexpr Index MAX_ELEMENTS_PER_WARP = 1024; + /* Max length of row to process one warp for CSR Light, MultiVector */ + static constexpr Index MAX_ELEMENTS_PER_WARP = 384; + + /* Max length of row to process one warp for CSR Adaptive */ + static constexpr Index MAX_ELEMENTS_PER_WARP_ADAPT = 512; /* How many shared memory use per block in CSR Adaptive kernel */ static constexpr Index SHARED_PER_BLOCK = 24576; diff --git a/src/TNL/Matrices/Legacy/CSR_impl.h b/src/TNL/Matrices/Legacy/CSR_impl.h index 580b63456..7a610c825 100644 --- a/src/TNL/Matrices/Legacy/CSR_impl.h +++ b/src/TNL/Matrices/Legacy/CSR_impl.h @@ -143,7 +143,7 @@ Index findLimit(const Index start, type = Type::STREAM; return current; } else { // one long row - if (sum <= 2 * matrix.MAX_ELEMENTS_PER_WARP) + if (sum <= 2 * matrix.MAX_ELEMENTS_PER_WARP_ADAPT) type = Type::VECTOR; else type = Type::LONG; @@ -1764,8 +1764,8 @@ void SpMVCSRAdaptivePrepare( const Real *inVector, SpMVCSRAdaptive< Real, Index, warpSize, matrix.WARPS, - matrix.SHARED_PER_WARP, - matrix.MAX_ELEMENTS_PER_WARP > + matrix.SHARED_PER_WARP, + matrix.MAX_ELEMENTS_PER_WARP_ADAPT > <<>>( inVector, outVector, -- GitLab From 4160e723102b32f6d4a9db8a4eb93bf8cc33a403 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Mon, 18 Jan 2021 20:40:50 +0100 Subject: [PATCH 02/27] Small fix in SparseMatrix documentation. --- src/TNL/Matrices/SparseMatrix.h | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/src/TNL/Matrices/SparseMatrix.h b/src/TNL/Matrices/SparseMatrix.h index 6d068f370..0cfd585fe 100644 --- a/src/TNL/Matrices/SparseMatrix.h +++ b/src/TNL/Matrices/SparseMatrix.h @@ -209,13 +209,8 @@ class SparseMatrix : public Matrix< Real, Device, Index, RealAllocator > const IndexAllocatorType& indexAllocator = IndexAllocatorType() ); /** -<<<<<<< HEAD * \brief Constructor with matrix rows capacities and number of columns. * -======= - * \brief Constructor with matrix rows capacities given as an initializer list and a number of columns. - * ->>>>>>> Added SparseMatrix constructor with row capacities vector. * The number of matrix rows is given by the size of \e rowCapacities list. * * \tparam ListIndex is the initializer list values type. @@ -238,9 +233,9 @@ class SparseMatrix : public Matrix< Real, Device, Index, RealAllocator > /** * \brief Constructor with matrix rows capacities given as a vector and number of columns. - * + * * The number of matrix rows is given by the size of \e rowCapacities vector. - * + * * \tparam RowCapacitiesVector is the row capacities vector type. Usually it is some of * \ref TNL::Containers::Array, \ref TNL::Containers::ArrayView, \ref TNL::Containers::Vector or * \ref TNL::Containers::VectorView. @@ -249,7 +244,7 @@ class SparseMatrix : public Matrix< Real, Device, Index, RealAllocator > * \param columns is the number of matrix columns. * \param realAllocator is used for allocation of matrix elements values. * \param indexAllocator is used for allocation of matrix elements column indexes. 
- * + * * \par Example * \include Matrices/SparseMatrix/SparseMatrixExample_Constructor_rowCapacities_vector.cpp * \par Output -- GitLab From af358d68a1fd9d2b9ea76ab14b1ce4095dc0a895 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Wed, 20 Jan 2021 15:30:19 +0100 Subject: [PATCH 03/27] Adding kernel type parameter to CSR segments. --- src/TNL/Algorithms/Segments/CSR.h | 6 +- src/TNL/Algorithms/Segments/CSR.hpp | 79 +++++++++----- src/TNL/Algorithms/Segments/CSRView.h | 8 +- src/TNL/Algorithms/Segments/CSRView.hpp | 137 ++++++++++++++---------- 4 files changed, 142 insertions(+), 88 deletions(-) diff --git a/src/TNL/Algorithms/Segments/CSR.h b/src/TNL/Algorithms/Segments/CSR.h index 9d2b84b61..042123f91 100644 --- a/src/TNL/Algorithms/Segments/CSR.h +++ b/src/TNL/Algorithms/Segments/CSR.h @@ -22,6 +22,7 @@ namespace TNL { template< typename Device, typename Index, + CSRKernelTypes KernelType_ = CSRScalar, typename IndexAllocator = typename Allocators::Default< Device >::template Allocator< Index > > class CSR { @@ -36,6 +37,7 @@ class CSR using ViewType = CSRView< Device, Index >; using ConstViewType = CSRView< Device, std::add_const_t< IndexType > >; using SegmentViewType = SegmentView< IndexType, RowMajorOrder >; + CSRKernelTypes KernelType = KernelType_; CSR(); @@ -114,8 +116,8 @@ class CSR CSR& operator=( const CSR& rhsSegments ) = default; - template< typename Device_, typename Index_, typename IndexAllocator_ > - CSR& operator=( const CSR< Device_, Index_, IndexAllocator_ >& source ); + template< typename Device_, typename Index_, CSRKernelTypes KernelType__, typename IndexAllocator_ > + CSR& operator=( const CSR< Device_, Index_, KernelType__, IndexAllocator_ >& source ); void save( File& file ) const; diff --git a/src/TNL/Algorithms/Segments/CSR.hpp b/src/TNL/Algorithms/Segments/CSR.hpp index a6b915db3..48e82de41 100644 --- a/src/TNL/Algorithms/Segments/CSR.hpp +++ b/src/TNL/Algorithms/Segments/CSR.hpp @@ -22,16 +22,18 @@ 
namespace TNL { template< typename Device, typename Index, + CSRKernelTypes KernelType_, typename IndexAllocator > -CSR< Device, Index, IndexAllocator >:: +CSR< Device, Index, KernelType_, IndexAllocator >:: CSR() { } template< typename Device, typename Index, + CSRKernelTypes KernelType_, typename IndexAllocator > -CSR< Device, Index, IndexAllocator >:: +CSR< Device, Index, KernelType_, IndexAllocator >:: CSR( const SegmentsSizes& segmentsSizes ) { this->setSegmentsSizes( segmentsSizes ); @@ -39,16 +41,18 @@ CSR( const SegmentsSizes& segmentsSizes ) template< typename Device, typename Index, + CSRKernelTypes KernelType_, typename IndexAllocator > -CSR< Device, Index, IndexAllocator >:: +CSR< Device, Index, KernelType_, IndexAllocator >:: CSR( const CSR& csr ) : offsets( csr.offsets ) { } template< typename Device, typename Index, + CSRKernelTypes KernelType_, typename IndexAllocator > -CSR< Device, Index, IndexAllocator >:: +CSR< Device, Index, KernelType_, IndexAllocator >:: CSR( const CSR&& csr ) : offsets( std::move( csr.offsets ) ) { @@ -56,9 +60,10 @@ CSR( const CSR&& csr ) : offsets( std::move( csr.offsets ) ) template< typename Device, typename Index, + CSRKernelTypes KernelType_, typename IndexAllocator > String -CSR< Device, Index, IndexAllocator >:: +CSR< Device, Index, KernelType_, IndexAllocator >:: getSerializationType() { return "CSR< [any_device], " + TNL::getSerializationType< IndexType >() + " >"; @@ -66,9 +71,10 @@ getSerializationType() template< typename Device, typename Index, + CSRKernelTypes KernelType_, typename IndexAllocator > String -CSR< Device, Index, IndexAllocator >:: +CSR< Device, Index, KernelType_, IndexAllocator >:: getSegmentsType() { return ViewType::getSegmentsType(); @@ -76,10 +82,11 @@ getSegmentsType() template< typename Device, typename Index, + CSRKernelTypes KernelType_, typename IndexAllocator > template< typename SizesHolder > void -CSR< Device, Index, IndexAllocator >:: +CSR< Device, Index, KernelType_, IndexAllocator 
>:: setSegmentsSizes( const SizesHolder& sizes ) { details::CSR< Device, Index >::setSegmentsSizes( sizes, this->offsets ); @@ -87,9 +94,10 @@ setSegmentsSizes( const SizesHolder& sizes ) template< typename Device, typename Index, + CSRKernelTypes KernelType_, typename IndexAllocator > void -CSR< Device, Index, IndexAllocator >:: +CSR< Device, Index, KernelType_, IndexAllocator >:: reset() { this->offsets.setSize( 1 ); @@ -99,9 +107,10 @@ reset() template< typename Device, typename Index, + CSRKernelTypes KernelType_, typename IndexAllocator > -typename CSR< Device, Index, IndexAllocator >::ViewType -CSR< Device, Index, IndexAllocator >:: +typename CSR< Device, Index, KernelType_, IndexAllocator >::ViewType +CSR< Device, Index, KernelType_, IndexAllocator >:: getView() { return ViewType( this->offsets.getView() ); @@ -109,9 +118,10 @@ getView() template< typename Device, typename Index, + CSRKernelTypes KernelType_, typename IndexAllocator > auto -CSR< Device, Index, IndexAllocator >:: +CSR< Device, Index, KernelType_, IndexAllocator >:: getConstView() const -> const ConstViewType { return ConstViewType( this->offsets.getConstView() ); @@ -119,8 +129,9 @@ getConstView() const -> const ConstViewType template< typename Device, typename Index, + CSRKernelTypes KernelType_, typename IndexAllocator > -__cuda_callable__ auto CSR< Device, Index, IndexAllocator >:: +__cuda_callable__ auto CSR< Device, Index, KernelType_, IndexAllocator >:: getSegmentsCount() const -> IndexType { return this->offsets.getSize() - 1; @@ -128,8 +139,9 @@ getSegmentsCount() const -> IndexType template< typename Device, typename Index, + CSRKernelTypes KernelType_, typename IndexAllocator > -__cuda_callable__ auto CSR< Device, Index, IndexAllocator >:: +__cuda_callable__ auto CSR< Device, Index, KernelType_, IndexAllocator >:: getSegmentSize( const IndexType segmentIdx ) const -> IndexType { return details::CSR< Device, Index >::getSegmentSize( this->offsets, segmentIdx ); @@ -137,8 +149,9 @@ 
getSegmentSize( const IndexType segmentIdx ) const -> IndexType template< typename Device, typename Index, + CSRKernelTypes KernelType_, typename IndexAllocator > -__cuda_callable__ auto CSR< Device, Index, IndexAllocator >:: +__cuda_callable__ auto CSR< Device, Index, KernelType_, IndexAllocator >:: getSize() const -> IndexType { return this->getStorageSize(); @@ -146,8 +159,9 @@ getSize() const -> IndexType template< typename Device, typename Index, + CSRKernelTypes KernelType_, typename IndexAllocator > -__cuda_callable__ auto CSR< Device, Index, IndexAllocator >:: +__cuda_callable__ auto CSR< Device, Index, KernelType_, IndexAllocator >:: getStorageSize() const -> IndexType { return details::CSR< Device, Index >::getStorageSize( this->offsets ); @@ -155,8 +169,9 @@ getStorageSize() const -> IndexType template< typename Device, typename Index, + CSRKernelTypes KernelType_, typename IndexAllocator > -__cuda_callable__ auto CSR< Device, Index, IndexAllocator >:: +__cuda_callable__ auto CSR< Device, Index, KernelType_, IndexAllocator >:: getGlobalIndex( const Index segmentIdx, const Index localIdx ) const -> IndexType { if( ! std::is_same< DeviceType, Devices::Host >::value ) @@ -172,10 +187,11 @@ getGlobalIndex( const Index segmentIdx, const Index localIdx ) const -> IndexTyp template< typename Device, typename Index, + CSRKernelTypes KernelType_, typename IndexAllocator > __cuda_callable__ auto -CSR< Device, Index, IndexAllocator >:: +CSR< Device, Index, KernelType_, IndexAllocator >:: getSegmentView( const IndexType segmentIdx ) const -> SegmentViewType { return SegmentViewType( offsets[ segmentIdx ], offsets[ segmentIdx + 1 ] - offsets[ segmentIdx ] ); @@ -183,10 +199,11 @@ getSegmentView( const IndexType segmentIdx ) const -> SegmentViewType template< typename Device, typename Index, + CSRKernelTypes KernelType_, typename IndexAllocator > template< typename Function, typename... 
Args > void -CSR< Device, Index, IndexAllocator >:: +CSR< Device, Index, KernelType_, IndexAllocator >:: forSegments( IndexType first, IndexType last, Function& f, Args... args ) const { this->getConstView().forSegments( first, last, f, args... ); @@ -194,10 +211,11 @@ forSegments( IndexType first, IndexType last, Function& f, Args... args ) const template< typename Device, typename Index, + CSRKernelTypes KernelType_, typename IndexAllocator> template< typename Function, typename... Args > void -CSR< Device, Index, IndexAllocator >:: +CSR< Device, Index, KernelType_, IndexAllocator >:: forAll( Function& f, Args... args ) const { this->forSegments( 0, this->getSegmentsCount(), f, args... ); @@ -205,10 +223,11 @@ forAll( Function& f, Args... args ) const template< typename Device, typename Index, + CSRKernelTypes KernelType_, typename IndexAllocator > template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args > void -CSR< Device, Index, IndexAllocator >:: +CSR< Device, Index, KernelType_, IndexAllocator >:: segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const { this->getConstView().segmentsReduction( first, last, fetch, reduction, keeper, zero, args... ); @@ -216,10 +235,11 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reductio template< typename Device, typename Index, + CSRKernelTypes KernelType_, typename IndexAllocator > template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args > void -CSR< Device, Index, IndexAllocator >:: +CSR< Device, Index, KernelType_, IndexAllocator >:: allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const { this->segmentsReduction( 0, this->getSegmentsCount(), fetch, reduction, keeper, zero, args... 
); @@ -227,11 +247,12 @@ allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, co template< typename Device, typename Index, + CSRKernelTypes KernelType_, typename IndexAllocator > - template< typename Device_, typename Index_, typename IndexAllocator_ > -CSR< Device, Index, IndexAllocator >& -CSR< Device, Index, IndexAllocator >:: -operator=( const CSR< Device_, Index_, IndexAllocator_ >& source ) + template< typename Device_, typename Index_, CSRKernelTypes KernelType__, typename IndexAllocator_ > +CSR< Device, Index, KernelType_, IndexAllocator >& +CSR< Device, Index, KernelType_, IndexAllocator >:: +operator=( const CSR< Device_, Index_, KernelType__, IndexAllocator_ >& source ) { this->offsets = source.offsets; return *this; @@ -239,9 +260,10 @@ operator=( const CSR< Device_, Index_, IndexAllocator_ >& source ) template< typename Device, typename Index, + CSRKernelTypes KernelType_, typename IndexAllocator > void -CSR< Device, Index, IndexAllocator >:: +CSR< Device, Index, KernelType_, IndexAllocator >:: save( File& file ) const { file << this->offsets; @@ -249,9 +271,10 @@ save( File& file ) const template< typename Device, typename Index, + CSRKernelTypes KernelType_, typename IndexAllocator > -void -CSR< Device, Index, IndexAllocator >:: +void +CSR< Device, Index, KernelType_, IndexAllocator >:: load( File& file ) { file >> this->offsets; diff --git a/src/TNL/Algorithms/Segments/CSRView.h b/src/TNL/Algorithms/Segments/CSRView.h index 610864f5e..91c408055 100644 --- a/src/TNL/Algorithms/Segments/CSRView.h +++ b/src/TNL/Algorithms/Segments/CSRView.h @@ -19,8 +19,11 @@ namespace TNL { namespace Algorithms { namespace Segments { +enum CSRKernelTypes { CSRScalarKernel, CSRVectorKernel, CSRLightKernel }; + template< typename Device, - typename Index > + typename Index, + CSRKernelTypes KernelType_ = CSRScalar > class CSRView { public: @@ -28,12 +31,13 @@ class CSRView using DeviceType = Device; using IndexType = std::remove_const_t< Index 
>; using OffsetsView = typename Containers::VectorView< Index, DeviceType, IndexType >; - using ConstOffsetsView = typename Containers::Vector< Index, DeviceType,IndexType >::ConstViewType; + using ConstOffsetsView = typename Containers::Vector< Index, DeviceType, IndexType >::ConstViewType; using ViewType = CSRView; template< typename Device_, typename Index_ > using ViewTemplate = CSRView< Device_, Index_ >; using ConstViewType = CSRView< Device, std::add_const_t< Index > >; using SegmentViewType = SegmentView< IndexType, RowMajorOrder >; + CSRKernelTypes KernelType = KernelType_; __cuda_callable__ CSRView(); diff --git a/src/TNL/Algorithms/Segments/CSRView.hpp b/src/TNL/Algorithms/Segments/CSRView.hpp index 5537a1233..4b0397852 100644 --- a/src/TNL/Algorithms/Segments/CSRView.hpp +++ b/src/TNL/Algorithms/Segments/CSRView.hpp @@ -22,122 +22,136 @@ namespace TNL { template< typename Device, - typename Index > + typename Index, + CSRKernelTypes KernelType_ > __cuda_callable__ -CSRView< Device, Index >:: +CSRView< Device, Index, KernelType_ >:: CSRView() { } template< typename Device, - typename Index > + typename Index, + CSRKernelTypes KernelType_ > __cuda_callable__ -CSRView< Device, Index >:: +CSRView< Device, Index, KernelType_ >:: CSRView( const OffsetsView& offsets_view ) : offsets( offsets_view ) { } template< typename Device, - typename Index > + typename Index, + CSRKernelTypes KernelType_ > __cuda_callable__ -CSRView< Device, Index >:: +CSRView< Device, Index, KernelType_ >:: CSRView( const OffsetsView&& offsets_view ) : offsets( offsets_view ) { } template< typename Device, - typename Index > + typename Index, + CSRKernelTypes KernelType_ > __cuda_callable__ -CSRView< Device, Index >:: +CSRView< Device, Index, KernelType_ >:: CSRView( const CSRView& csr_view ) : offsets( csr_view.offsets ) { } template< typename Device, - typename Index > + typename Index, + CSRKernelTypes KernelType_ > __cuda_callable__ -CSRView< Device, Index >:: +CSRView< Device, 
Index, KernelType_ >:: CSRView( const CSRView&& csr_view ) : offsets( std::move( csr_view.offsets ) ) { } template< typename Device, - typename Index > + typename Index, + CSRKernelTypes KernelType_ > String -CSRView< Device, Index >:: +CSRView< Device, Index, KernelType_ >:: getSerializationType() { return "CSR< [any_device], " + TNL::getSerializationType< IndexType >() + " >"; } template< typename Device, - typename Index > + typename Index, + CSRKernelTypes KernelType_ > String -CSRView< Device, Index >:: +CSRView< Device, Index, KernelType_ >:: getSegmentsType() { return "CSR"; } template< typename Device, - typename Index > + typename Index, + CSRKernelTypes KernelType_ > __cuda_callable__ -typename CSRView< Device, Index >::ViewType -CSRView< Device, Index >:: +typename CSRView< Device, Index, KernelType_ >::ViewType +CSRView< Device, Index, KernelType_ >:: getView() { return ViewType( this->offsets ); } template< typename Device, - typename Index > + typename Index, + CSRKernelTypes KernelType_ > __cuda_callable__ auto -CSRView< Device, Index >:: +CSRView< Device, Index, KernelType_ >:: getConstView() const -> const ConstViewType { return ConstViewType( this->offsets.getConstView() ); } template< typename Device, - typename Index > -__cuda_callable__ auto CSRView< Device, Index >:: + typename Index, + CSRKernelTypes KernelType_ > +__cuda_callable__ auto CSRView< Device, Index, KernelType_ >:: getSegmentsCount() const -> IndexType { return this->offsets.getSize() - 1; } template< typename Device, - typename Index > -__cuda_callable__ auto CSRView< Device, Index >:: + typename Index, + CSRKernelTypes KernelType_ > +__cuda_callable__ auto CSRView< Device, Index, KernelType_ >:: getSegmentSize( const IndexType segmentIdx ) const -> IndexType { return details::CSR< Device, Index >::getSegmentSize( this->offsets, segmentIdx ); } template< typename Device, - typename Index > -__cuda_callable__ auto CSRView< Device, Index >:: + typename Index, + CSRKernelTypes 
KernelType_ > +__cuda_callable__ auto CSRView< Device, Index, KernelType_ >:: getSize() const -> IndexType { return this->getStorageSize(); } template< typename Device, - typename Index > -__cuda_callable__ auto CSRView< Device, Index >:: + typename Index, + CSRKernelTypes KernelType_ > +__cuda_callable__ auto CSRView< Device, Index, KernelType_ >:: getStorageSize() const -> IndexType { return details::CSR< Device, Index >::getStorageSize( this->offsets ); } template< typename Device, - typename Index > -__cuda_callable__ auto CSRView< Device, Index >:: + typename Index, + CSRKernelTypes KernelType_ > +__cuda_callable__ auto CSRView< Device, Index, KernelType_ >:: getGlobalIndex( const Index segmentIdx, const Index localIdx ) const -> IndexType { if( ! std::is_same< DeviceType, Devices::Host >::value ) @@ -152,20 +166,22 @@ getGlobalIndex( const Index segmentIdx, const Index localIdx ) const -> IndexTyp } template< typename Device, - typename Index > + typename Index, + CSRKernelTypes KernelType_ > __cuda_callable__ auto -CSRView< Device, Index >:: +CSRView< Device, Index, KernelType_ >:: getSegmentView( const IndexType segmentIdx ) const -> SegmentViewType { return SegmentViewType( offsets[ segmentIdx ], offsets[ segmentIdx + 1 ] - offsets[ segmentIdx ], 1 ); } template< typename Device, - typename Index > + typename Index, + CSRKernelTypes KernelType_ > template< typename Function, typename... Args > void -CSRView< Device, Index >:: +CSRView< Device, Index, KernelType_ >:: forSegments( IndexType first, IndexType last, Function& f, Args... args ) const { const auto offsetsView = this->offsets; @@ -181,51 +197,58 @@ forSegments( IndexType first, IndexType last, Function& f, Args... args ) const } template< typename Device, - typename Index > + typename Index, + CSRKernelTypes KernelType_ > template< typename Function, typename... Args > void -CSRView< Device, Index >:: +CSRView< Device, Index, KernelType_ >:: forAll( Function& f, Args... 
args ) const { this->forSegments( 0, this->getSegmentsCount(), f, args... ); } template< typename Device, - typename Index > + typename Index, + CSRKernelTypes KernelType_ > template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args > void -CSRView< Device, Index >:: +CSRView< Device, Index, KernelType_ >:: segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const { using RealType = typename details::FetchLambdaAdapter< Index, Fetch >::ReturnType; const auto offsetsView = this->offsets.getConstView(); - auto l = [=] __cuda_callable__ ( const IndexType segmentIdx, Args... args ) mutable { - const IndexType begin = offsetsView[ segmentIdx ]; - const IndexType end = offsetsView[ segmentIdx + 1 ]; - RealType aux( zero ); - IndexType localIdx( 0 ); - bool compute( true ); - for( IndexType globalIdx = begin; globalIdx < end && compute; globalIdx++ ) - aux = reduction( aux, details::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, globalIdx, compute ) ); - keeper( segmentIdx, aux ); - }; + if( KernelType == CSRScalar ) + { + auto l = [=] __cuda_callable__ ( const IndexType segmentIdx, Args... args ) mutable { + const IndexType begin = offsetsView[ segmentIdx ]; + const IndexType end = offsetsView[ segmentIdx + 1 ]; + RealType aux( zero ); + IndexType localIdx( 0 ); + bool compute( true ); + for( IndexType globalIdx = begin; globalIdx < end && compute; globalIdx++ ) + aux = reduction( aux, details::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, globalIdx, compute ) ); + keeper( segmentIdx, aux ); + }; Algorithms::ParallelFor< Device >::exec( first, last, l, args... ); + } } template< typename Device, - typename Index > + typename Index, + CSRKernelTypes KernelType_ > template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... 
Args > void -CSRView< Device, Index >:: +CSRView< Device, Index, KernelType_ >:: allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const { this->segmentsReduction( 0, this->getSegmentsCount(), fetch, reduction, keeper, zero, args... ); } template< typename Device, - typename Index > -CSRView< Device, Index >& -CSRView< Device, Index >:: + typename Index, + CSRKernelTypes KernelType_ > +CSRView< Device, Index, KernelType_ >& +CSRView< Device, Index, KernelType_ >:: operator=( const CSRView& view ) { this->offsets.bind( view.offsets ); @@ -233,18 +256,20 @@ operator=( const CSRView& view ) } template< typename Device, - typename Index > + typename Index, + CSRKernelTypes KernelType_ > void -CSRView< Device, Index >:: +CSRView< Device, Index, KernelType_ >:: save( File& file ) const { file << this->offsets; } template< typename Device, - typename Index > + typename Index, + CSRKernelTypes KernelType_ > void -CSRView< Device, Index >:: +CSRView< Device, Index, KernelType_ >:: load( File& file ) { file >> this->offsets; -- GitLab From 9a79a96b953aa9ef9b20c27665ff0be40e86f3e8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Wed, 20 Jan 2021 15:57:10 +0100 Subject: [PATCH 04/27] Added aliases on CSR segments with different kernel types. 
--- src/TNL/Algorithms/Segments/CSR.h | 22 ++++++++++++++++++++++ src/TNL/Algorithms/Segments/CSRView.h | 17 +++++++++++++++++ src/TNL/Matrices/SparseMatrix.h | 2 +- src/TNL/Matrices/SparseMatrixView.h | 10 +++++----- 4 files changed, 45 insertions(+), 6 deletions(-) diff --git a/src/TNL/Algorithms/Segments/CSR.h b/src/TNL/Algorithms/Segments/CSR.h index 042123f91..a9269d72a 100644 --- a/src/TNL/Algorithms/Segments/CSR.h +++ b/src/TNL/Algorithms/Segments/CSR.h @@ -127,6 +127,28 @@ class CSR OffsetsHolder offsets; }; + +template< typename Device, + typename Index, + typename IndexAllocator = typename Allocators::Default< Device >::template Allocator< Index > > +using CSRScalar = CSR< Device, Index, CSRScalarKernel, IndexAllocator >; + +template< typename Device, + typename Index, + typename IndexAllocator = typename Allocators::Default< Device >::template Allocator< Index > > +using CSRVector = CSR< Device, Index, CSRVectorKernel, IndexAllocator >; + +template< typename Device, + typename Index, + typename IndexAllocator = typename Allocators::Default< Device >::template Allocator< Index > > +using CSRLight = CSR< Device, Index, CSRLightKernel, IndexAllocator >; + +template< typename Device, + typename Index, + typename IndexAllocator = typename Allocators::Default< Device >::template Allocator< Index > > +using CSRDefault = CSRScalar< Device, Index, IndexAllocator >; + + } // namespace Segments } // namespace Algorithms } // namespace TNL diff --git a/src/TNL/Algorithms/Segments/CSRView.h b/src/TNL/Algorithms/Segments/CSRView.h index 91c408055..928d08ff9 100644 --- a/src/TNL/Algorithms/Segments/CSRView.h +++ b/src/TNL/Algorithms/Segments/CSRView.h @@ -126,6 +126,23 @@ class CSRView OffsetsView offsets; }; + +template< typename Device, + typename Index > +using CSRViewScalar = CSRView< Device, Index, CSRScalarKernel >; + +template< typename Device, + typename Index > +using CSRViewVector = CSRView< Device, Index, CSRVectorKernel >; + +template< typename Device, + 
typename Index > +using CSRViewLight = CSRView< Device, Index, CSRLightKernel >; + +template< typename Device, + typename Index > +using CSRViewDefault = CSRViewScalar< Device, Index >; + } // namespace Segments } // namespace Algorithms } // namespace TNL diff --git a/src/TNL/Matrices/SparseMatrix.h b/src/TNL/Matrices/SparseMatrix.h index 0cfd585fe..581d79c98 100644 --- a/src/TNL/Matrices/SparseMatrix.h +++ b/src/TNL/Matrices/SparseMatrix.h @@ -45,7 +45,7 @@ template< typename Real = double, typename Device = Devices::Host, typename Index = int, typename MatrixType = GeneralMatrix, - template< typename Device_, typename Index_, typename IndexAllocator_ > class Segments = Algorithms::Segments::CSR, + template< typename Device_, typename Index_, typename IndexAllocator_ > class Segments = Algorithms::Segments::CSRDefault, typename ComputeReal = typename ChooseSparseMatrixComputeReal< Real, Index >::type, typename RealAllocator = typename Allocators::Default< Device >::template Allocator< Real >, typename IndexAllocator = typename Allocators::Default< Device >::template Allocator< Index > > diff --git a/src/TNL/Matrices/SparseMatrixView.h b/src/TNL/Matrices/SparseMatrixView.h index f91e471e8..a753332a9 100644 --- a/src/TNL/Matrices/SparseMatrixView.h +++ b/src/TNL/Matrices/SparseMatrixView.h @@ -36,10 +36,10 @@ struct ChooseSparseMatrixComputeReal< bool, Index > * * It serves as an accessor to \ref SparseMatrix for example when passing the * matrix to lambda functions. SparseMatrix view can be also created in CUDA kernels. - * - * \tparam Real is a type of matrix elements. If \e Real equals \e bool the matrix is treated + * + * \tparam Real is a type of matrix elements. If \e Real equals \e bool the matrix is treated * as binary and so the matrix elements values are not stored in the memory since we need - * to remember only coordinates of non-zero elements( which equal one). + * to remember only coordinates of non-zero elements( which equal one). 
* \tparam Device is a device where the matrix is allocated. * \tparam Index is a type for indexing of the matrix elements. * \tparam MatrixType specifies a symmetry of matrix. See \ref MatrixType. Symmetric @@ -50,13 +50,13 @@ struct ChooseSparseMatrixComputeReal< bool, Index > * \ref Ellpack, \ref SlicedEllpack, \ref ChunkedEllpack or \ref BiEllpack. * \tparam ComputeReal is the same as \e Real mostly but for binary matrices it is set to \e Index type. This can be changed * bu the user, of course. - * + * */ template< typename Real, typename Device = Devices::Host, typename Index = int, typename MatrixType = GeneralMatrix, - template< typename Device_, typename Index_ > class SegmentsView = Algorithms::Segments::CSRView, + template< typename Device_, typename Index_ > class SegmentsView = Algorithms::Segments::CSRViewDefault, typename ComputeReal = typename ChooseSparseMatrixComputeReal< Real, Index >::type > class SparseMatrixView : public MatrixView< Real, Device, Index > { -- GitLab From d564ca7b1967a2c2c12674eb50374a391ac9ff0b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Wed, 20 Jan 2021 17:21:43 +0100 Subject: [PATCH 05/27] Fixing the code for new CSR segments types. 
--- .../tnl-benchmark-linear-solvers.h | 2 +- src/Benchmarks/SpMV/spmv-legacy.h | 2 +- src/TNL/Algorithms/Segments/CSR.h | 2 +- src/TNL/Algorithms/Segments/CSRView.h | 2 +- src/TNL/Algorithms/Segments/CSRView.hpp | 2 +- src/TNL/Solvers/Linear/Preconditioners/ILU0.h | 2 +- src/TNL/Solvers/Linear/Preconditioners/ILUT.h | 2 +- .../Matrices/BinarySparseMatrixCopyTest.h | 4 +- .../Matrices/BinarySparseMatrixTest_CSR.h | 8 ++-- src/UnitTests/Matrices/DenseMatrixCopyTest.h | 4 +- src/UnitTests/Matrices/SparseMatrixCopyTest.h | 4 +- src/UnitTests/Matrices/SparseMatrixTest_CSR.h | 32 ++++++------- .../Matrices/SymmetricSparseMatrixTest_CSR.h | 48 +++++++++---------- 13 files changed, 57 insertions(+), 57 deletions(-) diff --git a/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h b/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h index 3acfb2438..3f64bf33d 100644 --- a/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h +++ b/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h @@ -479,7 +479,7 @@ struct LinearSolversBenchmark DeviceType, IndexType, TNL::Matrices::GeneralMatrix, - Algorithms::Segments::CSR + Algorithms::Segments::CSRDefault >; SharedPointer< CSR > matrixCopy; Matrices::copySparseMatrix( *matrixCopy, *matrixPointer ); diff --git a/src/Benchmarks/SpMV/spmv-legacy.h b/src/Benchmarks/SpMV/spmv-legacy.h index ec0fd0018..3416ad3ef 100644 --- a/src/Benchmarks/SpMV/spmv-legacy.h +++ b/src/Benchmarks/SpMV/spmv-legacy.h @@ -49,7 +49,7 @@ using SlicedEllpackAlias = Matrices::Legacy::SlicedEllpack< Real, Device, Index // Segments based sparse matrix aliases template< typename Real, typename Device, typename Index > -using SparseMatrix_CSR = Matrices::SparseMatrix< Real, Device, Index, Matrices::GeneralMatrix, Algorithms::Segments::CSR >; +using SparseMatrix_CSR = Matrices::SparseMatrix< Real, Device, Index, Matrices::GeneralMatrix, Algorithms::Segments::CSRDefault >; template< typename Device, typename Index, typename IndexAllocator > 
using EllpackSegments = Algorithms::Segments::Ellpack< Device, Index, IndexAllocator >; diff --git a/src/TNL/Algorithms/Segments/CSR.h b/src/TNL/Algorithms/Segments/CSR.h index a9269d72a..ef958a252 100644 --- a/src/TNL/Algorithms/Segments/CSR.h +++ b/src/TNL/Algorithms/Segments/CSR.h @@ -22,7 +22,7 @@ namespace TNL { template< typename Device, typename Index, - CSRKernelTypes KernelType_ = CSRScalar, + CSRKernelTypes KernelType_ = CSRScalarKernel, typename IndexAllocator = typename Allocators::Default< Device >::template Allocator< Index > > class CSR { diff --git a/src/TNL/Algorithms/Segments/CSRView.h b/src/TNL/Algorithms/Segments/CSRView.h index 928d08ff9..b30863b8f 100644 --- a/src/TNL/Algorithms/Segments/CSRView.h +++ b/src/TNL/Algorithms/Segments/CSRView.h @@ -23,7 +23,7 @@ enum CSRKernelTypes { CSRScalarKernel, CSRVectorKernel, CSRLightKernel }; template< typename Device, typename Index, - CSRKernelTypes KernelType_ = CSRScalar > + CSRKernelTypes KernelType_ = CSRScalarKernel > class CSRView { public: diff --git a/src/TNL/Algorithms/Segments/CSRView.hpp b/src/TNL/Algorithms/Segments/CSRView.hpp index 4b0397852..7077d0f03 100644 --- a/src/TNL/Algorithms/Segments/CSRView.hpp +++ b/src/TNL/Algorithms/Segments/CSRView.hpp @@ -217,7 +217,7 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reductio { using RealType = typename details::FetchLambdaAdapter< Index, Fetch >::ReturnType; const auto offsetsView = this->offsets.getConstView(); - if( KernelType == CSRScalar ) + if( KernelType == CSRScalarKernel ) { auto l = [=] __cuda_callable__ ( const IndexType segmentIdx, Args... 
args ) mutable { const IndexType begin = offsetsView[ segmentIdx ]; diff --git a/src/TNL/Solvers/Linear/Preconditioners/ILU0.h b/src/TNL/Solvers/Linear/Preconditioners/ILU0.h index a4eb9e8aa..8791b95e2 100644 --- a/src/TNL/Solvers/Linear/Preconditioners/ILU0.h +++ b/src/TNL/Solvers/Linear/Preconditioners/ILU0.h @@ -77,7 +77,7 @@ public: protected: // The factors L and U are stored separately and the rows of U are reversed. - Matrices::SparseMatrix< RealType, DeviceType, IndexType, Matrices::GeneralMatrix, Algorithms::Segments::CSR > L, U; + Matrices::SparseMatrix< RealType, DeviceType, IndexType, Matrices::GeneralMatrix, Algorithms::Segments::CSRDefault > L, U; // Specialized methods to distinguish between normal and distributed matrices // in the implementation. diff --git a/src/TNL/Solvers/Linear/Preconditioners/ILUT.h b/src/TNL/Solvers/Linear/Preconditioners/ILUT.h index 344daf1a0..82ab88e86 100644 --- a/src/TNL/Solvers/Linear/Preconditioners/ILUT.h +++ b/src/TNL/Solvers/Linear/Preconditioners/ILUT.h @@ -66,7 +66,7 @@ protected: Real tau = 1e-4; // The factors L and U are stored separately and the rows of U are reversed. - Matrices::SparseMatrix< RealType, DeviceType, IndexType, Matrices::GeneralMatrix, Algorithms::Segments::CSR > L, U; + Matrices::SparseMatrix< RealType, DeviceType, IndexType, Matrices::GeneralMatrix, Algorithms::Segments::CSRDefault > L, U; // Specialized methods to distinguish between normal and distributed matrices // in the implementation. 
diff --git a/src/UnitTests/Matrices/BinarySparseMatrixCopyTest.h b/src/UnitTests/Matrices/BinarySparseMatrixCopyTest.h index 8a6e0abdd..609a6afd7 100644 --- a/src/UnitTests/Matrices/BinarySparseMatrixCopyTest.h +++ b/src/UnitTests/Matrices/BinarySparseMatrixCopyTest.h @@ -27,8 +27,8 @@ using EllpackSegments = TNL::Algorithms::Segments::Ellpack< Device, Index, Index template< typename Device, typename Index, typename IndexAllocator > using SlicedEllpackSegments = TNL::Algorithms::Segments::SlicedEllpack< Device, Index, IndexAllocator >; -using CSR_host = TNL::Matrices::SparseMatrix< bool, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSR >; -using CSR_cuda = TNL::Matrices::SparseMatrix< bool, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSR >; +using CSR_host = TNL::Matrices::SparseMatrix< bool, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRDefault >; +using CSR_cuda = TNL::Matrices::SparseMatrix< bool, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRDefault >; using E_host = TNL::Matrices::SparseMatrix< bool, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, EllpackSegments >; using E_cuda = TNL::Matrices::SparseMatrix< bool, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, EllpackSegments >; using SE_host = TNL::Matrices::SparseMatrix< bool, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, SlicedEllpackSegments >; diff --git a/src/UnitTests/Matrices/BinarySparseMatrixTest_CSR.h b/src/UnitTests/Matrices/BinarySparseMatrixTest_CSR.h index 8f7dad73c..5a4e98915 100644 --- a/src/UnitTests/Matrices/BinarySparseMatrixTest_CSR.h +++ b/src/UnitTests/Matrices/BinarySparseMatrixTest_CSR.h @@ -29,11 +29,11 @@ protected: // types for which MatrixTest is instantiated using CSRMatrixTypes = ::testing::Types < - TNL::Matrices::SparseMatrix< bool, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, 
TNL::Algorithms::Segments::CSR, int >, - TNL::Matrices::SparseMatrix< bool, TNL::Devices::Host, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSR, int > + TNL::Matrices::SparseMatrix< bool, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRDefault, int >, + TNL::Matrices::SparseMatrix< bool, TNL::Devices::Host, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRDefault, int > #ifdef HAVE_CUDA - ,TNL::Matrices::SparseMatrix< bool, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSR, int >, - TNL::Matrices::SparseMatrix< bool, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSR, int > + ,TNL::Matrices::SparseMatrix< bool, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRDefault, int >, + TNL::Matrices::SparseMatrix< bool, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRDefault, int > #endif >; diff --git a/src/UnitTests/Matrices/DenseMatrixCopyTest.h b/src/UnitTests/Matrices/DenseMatrixCopyTest.h index d86eb57f5..dfdcc3b83 100644 --- a/src/UnitTests/Matrices/DenseMatrixCopyTest.h +++ b/src/UnitTests/Matrices/DenseMatrixCopyTest.h @@ -27,8 +27,8 @@ using EllpackSegments = TNL::Algorithms::Segments::Ellpack< Device, Index, Index template< typename Device, typename Index, typename IndexAllocator > using SlicedEllpackSegments = TNL::Algorithms::Segments::SlicedEllpack< Device, Index, IndexAllocator >; -using CSR_host = TNL::Matrices::SparseMatrix< int, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSR >; -using CSR_cuda = TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSR >; +using CSR_host = TNL::Matrices::SparseMatrix< int, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRDefault >; +using CSR_cuda = 
TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRDefault >; using E_host = TNL::Matrices::SparseMatrix< int, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, EllpackSegments >; using E_cuda = TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, EllpackSegments >; using SE_host = TNL::Matrices::SparseMatrix< int, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, SlicedEllpackSegments >; diff --git a/src/UnitTests/Matrices/SparseMatrixCopyTest.h b/src/UnitTests/Matrices/SparseMatrixCopyTest.h index c9f68b588..826b7af6b 100644 --- a/src/UnitTests/Matrices/SparseMatrixCopyTest.h +++ b/src/UnitTests/Matrices/SparseMatrixCopyTest.h @@ -27,8 +27,8 @@ using EllpackSegments = TNL::Algorithms::Segments::Ellpack< Device, Index, Index template< typename Device, typename Index, typename IndexAllocator > using SlicedEllpackSegments = TNL::Algorithms::Segments::SlicedEllpack< Device, Index, IndexAllocator >; -using CSR_host = TNL::Matrices::SparseMatrix< int, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSR >; -using CSR_cuda = TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSR >; +using CSR_host = TNL::Matrices::SparseMatrix< int, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRDefault >; +using CSR_cuda = TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRDefault >; using E_host = TNL::Matrices::SparseMatrix< int, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, EllpackSegments >; using E_cuda = TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, EllpackSegments >; using SE_host = TNL::Matrices::SparseMatrix< int, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, SlicedEllpackSegments >; diff --git 
a/src/UnitTests/Matrices/SparseMatrixTest_CSR.h b/src/UnitTests/Matrices/SparseMatrixTest_CSR.h index e090f5f62..639876875 100644 --- a/src/UnitTests/Matrices/SparseMatrixTest_CSR.h +++ b/src/UnitTests/Matrices/SparseMatrixTest_CSR.h @@ -20,23 +20,23 @@ const char* saveAndLoadFileName = "test_SparseMatrixTest_CSR_segments"; // types for which MatrixTest is instantiated using MatrixTypes = ::testing::Types < - TNL::Matrices::SparseMatrix< int, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSR >, - TNL::Matrices::SparseMatrix< long, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSR >, - TNL::Matrices::SparseMatrix< float, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSR >, - TNL::Matrices::SparseMatrix< double, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSR >, - TNL::Matrices::SparseMatrix< int, TNL::Devices::Host, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSR >, - TNL::Matrices::SparseMatrix< long, TNL::Devices::Host, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSR >, - TNL::Matrices::SparseMatrix< float, TNL::Devices::Host, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSR >, - TNL::Matrices::SparseMatrix< double, TNL::Devices::Host, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSR > + TNL::Matrices::SparseMatrix< int, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRDefault >, + TNL::Matrices::SparseMatrix< long, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRDefault >, + TNL::Matrices::SparseMatrix< float, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRDefault >, + TNL::Matrices::SparseMatrix< double, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRDefault >, + TNL::Matrices::SparseMatrix< int, 
TNL::Devices::Host, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRDefault >, + TNL::Matrices::SparseMatrix< long, TNL::Devices::Host, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRDefault >, + TNL::Matrices::SparseMatrix< float, TNL::Devices::Host, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRDefault >, + TNL::Matrices::SparseMatrix< double, TNL::Devices::Host, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRDefault > #ifdef HAVE_CUDA - ,TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSR >, - TNL::Matrices::SparseMatrix< long, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSR >, - TNL::Matrices::SparseMatrix< float, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSR >, - TNL::Matrices::SparseMatrix< double, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSR >, - TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSR >, - TNL::Matrices::SparseMatrix< long, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSR >, - TNL::Matrices::SparseMatrix< float, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSR >, - TNL::Matrices::SparseMatrix< double, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSR > + ,TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRDefault >, + TNL::Matrices::SparseMatrix< long, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRDefault >, + TNL::Matrices::SparseMatrix< float, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRDefault >, + TNL::Matrices::SparseMatrix< double, TNL::Devices::Cuda, int, 
TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRDefault >, + TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRDefault >, + TNL::Matrices::SparseMatrix< long, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRDefault >, + TNL::Matrices::SparseMatrix< float, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRDefault >, + TNL::Matrices::SparseMatrix< double, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRDefault > #endif >; diff --git a/src/UnitTests/Matrices/SymmetricSparseMatrixTest_CSR.h b/src/UnitTests/Matrices/SymmetricSparseMatrixTest_CSR.h index 439fab7df..5feb97e11 100644 --- a/src/UnitTests/Matrices/SymmetricSparseMatrixTest_CSR.h +++ b/src/UnitTests/Matrices/SymmetricSparseMatrixTest_CSR.h @@ -24,31 +24,31 @@ // types for which MatrixTest is instantiated using MatrixTypes = ::testing::Types < - TNL::Matrices::SparseMatrix< int, TNL::Devices::Host, short, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSR >, - TNL::Matrices::SparseMatrix< long, TNL::Devices::Host, short, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSR >, - TNL::Matrices::SparseMatrix< float, TNL::Devices::Host, short, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSR >, - TNL::Matrices::SparseMatrix< double, TNL::Devices::Host, short, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSR >, - TNL::Matrices::SparseMatrix< int, TNL::Devices::Host, int, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSR >, - TNL::Matrices::SparseMatrix< long, TNL::Devices::Host, int, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSR >, - TNL::Matrices::SparseMatrix< float, TNL::Devices::Host, int, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSR >, - TNL::Matrices::SparseMatrix< double, TNL::Devices::Host, int, 
TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSR >, - TNL::Matrices::SparseMatrix< int, TNL::Devices::Host, long, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSR >, - TNL::Matrices::SparseMatrix< long, TNL::Devices::Host, long, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSR >, - TNL::Matrices::SparseMatrix< float, TNL::Devices::Host, long, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSR >, - TNL::Matrices::SparseMatrix< double, TNL::Devices::Host, long, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSR > + TNL::Matrices::SparseMatrix< int, TNL::Devices::Host, short, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSRDefault >, + TNL::Matrices::SparseMatrix< long, TNL::Devices::Host, short, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSRDefault >, + TNL::Matrices::SparseMatrix< float, TNL::Devices::Host, short, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSRDefault >, + TNL::Matrices::SparseMatrix< double, TNL::Devices::Host, short, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSRDefault >, + TNL::Matrices::SparseMatrix< int, TNL::Devices::Host, int, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSRDefault >, + TNL::Matrices::SparseMatrix< long, TNL::Devices::Host, int, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSRDefault >, + TNL::Matrices::SparseMatrix< float, TNL::Devices::Host, int, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSRDefault >, + TNL::Matrices::SparseMatrix< double, TNL::Devices::Host, int, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSRDefault >, + TNL::Matrices::SparseMatrix< int, TNL::Devices::Host, long, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSRDefault >, + TNL::Matrices::SparseMatrix< long, TNL::Devices::Host, long, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSRDefault >, + TNL::Matrices::SparseMatrix< float, 
TNL::Devices::Host, long, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSRDefault >, + TNL::Matrices::SparseMatrix< double, TNL::Devices::Host, long, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSRDefault > #ifdef HAVE_CUDA // Commented types are not supported by atomic operations on GPU. - ,//TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, short, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSR >, - //TNL::Matrices::SparseMatrix< long, TNL::Devices::Cuda, short, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSR >, - //TNL::Matrices::SparseMatrix< float, TNL::Devices::Cuda, short, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSR >, - //TNL::Matrices::SparseMatrix< double, TNL::Devices::Cuda, short, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSR >, - TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, int, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSR >, - //TNL::Matrices::SparseMatrix< long, TNL::Devices::Cuda, int, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSR >, - TNL::Matrices::SparseMatrix< float, TNL::Devices::Cuda, int, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSR >, - TNL::Matrices::SparseMatrix< double, TNL::Devices::Cuda, int, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSR >, - //TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, long, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSR >, - //TNL::Matrices::SparseMatrix< long, TNL::Devices::Cuda, long, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSR >, - //TNL::Matrices::SparseMatrix< float, TNL::Devices::Cuda, long, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSR >, - //TNL::Matrices::SparseMatrix< double, TNL::Devices::Cuda, long, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSR > + ,//TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, short, TNL::Matrices::SymmetricMatrix, 
TNL::Algorithms::Segments::CSRDefault >, + //TNL::Matrices::SparseMatrix< long, TNL::Devices::Cuda, short, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSRDefault >, + //TNL::Matrices::SparseMatrix< float, TNL::Devices::Cuda, short, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSRDefault >, + //TNL::Matrices::SparseMatrix< double, TNL::Devices::Cuda, short, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSRDefault >, + TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, int, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSRDefault >, + //TNL::Matrices::SparseMatrix< long, TNL::Devices::Cuda, int, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSRDefault >, + TNL::Matrices::SparseMatrix< float, TNL::Devices::Cuda, int, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSRDefault >, + TNL::Matrices::SparseMatrix< double, TNL::Devices::Cuda, int, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSRDefault >, + //TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, long, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSRDefault >, + //TNL::Matrices::SparseMatrix< long, TNL::Devices::Cuda, long, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSRDefault >, + //TNL::Matrices::SparseMatrix< float, TNL::Devices::Cuda, long, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSRDefault >, + //TNL::Matrices::SparseMatrix< double, TNL::Devices::Cuda, long, TNL::Matrices::SymmetricMatrix, TNL::Algorithms::Segments::CSRDefault > #endif // HAVE_CUDA >; -- GitLab From a42d2a3fd0c359c140489bf407b2f0a7671b17c4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Fri, 22 Jan 2021 11:46:00 +0100 Subject: [PATCH 06/27] Renaming SparseMatrix_CSR unit test to SparseMatrix_CSRScalar. 
--- src/UnitTests/Matrices/CMakeLists.txt | 2 +- ..._CSR.cu => SparseMatrixTest_CSRScalar.cpp} | 4 +-- ..._CSR.cpp => SparseMatrixTest_CSRScalar.cu} | 4 +-- ...est_CSR.h => SparseMatrixTest_CSRScalar.h} | 34 +++++++++---------- 4 files changed, 22 insertions(+), 22 deletions(-) rename src/UnitTests/Matrices/{SparseMatrixTest_CSR.cu => SparseMatrixTest_CSRScalar.cpp} (78%) rename src/UnitTests/Matrices/{SparseMatrixTest_CSR.cpp => SparseMatrixTest_CSRScalar.cu} (78%) rename src/UnitTests/Matrices/{SparseMatrixTest_CSR.h => SparseMatrixTest_CSRScalar.h} (91%) diff --git a/src/UnitTests/Matrices/CMakeLists.txt b/src/UnitTests/Matrices/CMakeLists.txt index b713c8f0c..e5660090c 100644 --- a/src/UnitTests/Matrices/CMakeLists.txt +++ b/src/UnitTests/Matrices/CMakeLists.txt @@ -6,7 +6,7 @@ set( COMMON_TESTS TridiagonalMatrixTest MultidiagonalMatrixTest - SparseMatrixTest_CSR + SparseMatrixTest_CSRScalar SparseMatrixTest_Ellpack SparseMatrixTest_SlicedEllpack SparseMatrixTest_ChunkedEllpack diff --git a/src/UnitTests/Matrices/SparseMatrixTest_CSR.cu b/src/UnitTests/Matrices/SparseMatrixTest_CSRScalar.cpp similarity index 78% rename from src/UnitTests/Matrices/SparseMatrixTest_CSR.cu rename to src/UnitTests/Matrices/SparseMatrixTest_CSRScalar.cpp index 91f0de81a..0f73d79aa 100644 --- a/src/UnitTests/Matrices/SparseMatrixTest_CSR.cu +++ b/src/UnitTests/Matrices/SparseMatrixTest_CSRScalar.cpp @@ -1,5 +1,5 @@ /*************************************************************************** - SparseMatrixTest_CSR.cu - description + SparseMatrixTest_CSRScalar.cpp - description ------------------- begin : Dec 3, 2019 copyright : (C) 2019 by Tomas Oberhuber et al. 
@@ -8,4 +8,4 @@ /* See Copyright Notice in tnl/Copyright */ -#include "SparseMatrixTest_CSR.h" +#include "SparseMatrixTest_CSRScalar.h" diff --git a/src/UnitTests/Matrices/SparseMatrixTest_CSR.cpp b/src/UnitTests/Matrices/SparseMatrixTest_CSRScalar.cu similarity index 78% rename from src/UnitTests/Matrices/SparseMatrixTest_CSR.cpp rename to src/UnitTests/Matrices/SparseMatrixTest_CSRScalar.cu index 5830658ab..ff22ae692 100644 --- a/src/UnitTests/Matrices/SparseMatrixTest_CSR.cpp +++ b/src/UnitTests/Matrices/SparseMatrixTest_CSRScalar.cu @@ -1,5 +1,5 @@ /*************************************************************************** - SparseMatrixTest_CSR.cpp - description + SparseMatrixTest_CSRScalar.cu - description ------------------- begin : Dec 3, 2019 copyright : (C) 2019 by Tomas Oberhuber et al. @@ -8,4 +8,4 @@ /* See Copyright Notice in tnl/Copyright */ -#include "SparseMatrixTest_CSR.h" +#include "SparseMatrixTest_CSRScalar.h" diff --git a/src/UnitTests/Matrices/SparseMatrixTest_CSR.h b/src/UnitTests/Matrices/SparseMatrixTest_CSRScalar.h similarity index 91% rename from src/UnitTests/Matrices/SparseMatrixTest_CSR.h rename to src/UnitTests/Matrices/SparseMatrixTest_CSRScalar.h index 639876875..3a1cb02c3 100644 --- a/src/UnitTests/Matrices/SparseMatrixTest_CSR.h +++ b/src/UnitTests/Matrices/SparseMatrixTest_CSRScalar.h @@ -1,5 +1,5 @@ /*************************************************************************** - SparseMatrixTest_CSR.h - description + SparseMatrixTest_CSRScalar.h - description ------------------- begin : Dec 2, 2019 copyright : (C) 2019 by Tomas Oberhuber et al. 
@@ -20,23 +20,23 @@ const char* saveAndLoadFileName = "test_SparseMatrixTest_CSR_segments"; // types for which MatrixTest is instantiated using MatrixTypes = ::testing::Types < - TNL::Matrices::SparseMatrix< int, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRDefault >, - TNL::Matrices::SparseMatrix< long, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRDefault >, - TNL::Matrices::SparseMatrix< float, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRDefault >, - TNL::Matrices::SparseMatrix< double, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRDefault >, - TNL::Matrices::SparseMatrix< int, TNL::Devices::Host, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRDefault >, - TNL::Matrices::SparseMatrix< long, TNL::Devices::Host, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRDefault >, - TNL::Matrices::SparseMatrix< float, TNL::Devices::Host, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRDefault >, - TNL::Matrices::SparseMatrix< double, TNL::Devices::Host, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRDefault > + TNL::Matrices::SparseMatrix< int, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRScalar >, + TNL::Matrices::SparseMatrix< long, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRScalar >, + TNL::Matrices::SparseMatrix< float, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRScalar >, + TNL::Matrices::SparseMatrix< double, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRScalar >, + TNL::Matrices::SparseMatrix< int, TNL::Devices::Host, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRScalar >, + TNL::Matrices::SparseMatrix< long, TNL::Devices::Host, long, 
TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRScalar >, + TNL::Matrices::SparseMatrix< float, TNL::Devices::Host, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRScalar >, + TNL::Matrices::SparseMatrix< double, TNL::Devices::Host, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRScalar > #ifdef HAVE_CUDA - ,TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRDefault >, - TNL::Matrices::SparseMatrix< long, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRDefault >, - TNL::Matrices::SparseMatrix< float, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRDefault >, - TNL::Matrices::SparseMatrix< double, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRDefault >, - TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRDefault >, - TNL::Matrices::SparseMatrix< long, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRDefault >, - TNL::Matrices::SparseMatrix< float, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRDefault >, - TNL::Matrices::SparseMatrix< double, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRDefault > + ,TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRScalar >, + TNL::Matrices::SparseMatrix< long, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRScalar >, + TNL::Matrices::SparseMatrix< float, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRScalar >, + TNL::Matrices::SparseMatrix< double, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRScalar >, + TNL::Matrices::SparseMatrix< int, 
TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRScalar >, + TNL::Matrices::SparseMatrix< long, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRScalar >, + TNL::Matrices::SparseMatrix< float, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRScalar >, + TNL::Matrices::SparseMatrix< double, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRScalar > #endif >; -- GitLab From a45e791093d5995b3844a0d54902a279fdd75b5d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Fri, 22 Jan 2021 16:20:16 +0100 Subject: [PATCH 07/27] Fixing matrices tutorial benchmark and CMakeLists. --- Documentation/Tutorials/Matrices/CMakeLists.txt | 6 +++--- .../Tutorials/Matrices/SparseMatrixSetup_Benchmark.cpp | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Documentation/Tutorials/Matrices/CMakeLists.txt b/Documentation/Tutorials/Matrices/CMakeLists.txt index 0d672aa0b..94e57ec13 100644 --- a/Documentation/Tutorials/Matrices/CMakeLists.txt +++ b/Documentation/Tutorials/Matrices/CMakeLists.txt @@ -104,9 +104,9 @@ ELSE() #### # THe following examples/benchmarks run for very long time - ADD_EXECUTABLE( DenseMatrixSetup_Benchmark DenseMatrixSetup_Benchmark_cuda.cpp ) - ADD_EXECUTABLE( SparseMatrixSetup_Benchmark SparseMatrixSetup_Benchmark_cuda.cpp ) - ADD_EXECUTABLE( MultidiagonalMatrixSetup_Benchmark MultidiagonalMatrixSetup_Benchmark_cuda.cpp ) + ADD_EXECUTABLE( DenseMatrixSetup_Benchmark DenseMatrixSetup_Benchmark.cpp ) + ADD_EXECUTABLE( SparseMatrixSetup_Benchmark SparseMatrixSetup_Benchmark.cpp ) + ADD_EXECUTABLE( MultidiagonalMatrixSetup_Benchmark MultidiagonalMatrixSetup_Benchmark.cpp ) ENDIF() IF( BUILD_CUDA ) diff --git a/Documentation/Tutorials/Matrices/SparseMatrixSetup_Benchmark.cpp b/Documentation/Tutorials/Matrices/SparseMatrixSetup_Benchmark.cpp index c53a8f5b4..a36e17e7b 100644 --- 
a/Documentation/Tutorials/Matrices/SparseMatrixSetup_Benchmark.cpp +++ b/Documentation/Tutorials/Matrices/SparseMatrixSetup_Benchmark.cpp @@ -69,7 +69,7 @@ template< typename Matrix > void setElement_on_host_and_transfer( const int gridSize, Matrix& matrix ) { using RealType = typename Matrix::RealType; - using HostMatrix = typename Matrix::Self< RealType, TNL::Devices::Host >; + using HostMatrix = typename Matrix::template Self< RealType, TNL::Devices::Host >; const int matrixSize = gridSize * gridSize; TNL::Containers::Vector< int, typename HostMatrix::DeviceType, int > rowCapacities( matrixSize, 5 ); -- GitLab From 183f565c02e0a443a776a94e6337b73ceff30b6c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Fri, 22 Jan 2021 16:20:46 +0100 Subject: [PATCH 08/27] Adding CSR Light kernel. --- src/TNL/Algorithms/Segments/CSR.h | 6 +++--- src/TNL/Algorithms/Segments/CSRView.hpp | 12 ++++++++++-- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/src/TNL/Algorithms/Segments/CSR.h b/src/TNL/Algorithms/Segments/CSR.h index ef958a252..3eaad6eb9 100644 --- a/src/TNL/Algorithms/Segments/CSR.h +++ b/src/TNL/Algorithms/Segments/CSR.h @@ -33,9 +33,9 @@ class CSR using OffsetsHolder = Containers::Vector< Index, DeviceType, IndexType, IndexAllocator >; using SegmentsSizes = OffsetsHolder; template< typename Device_, typename Index_ > - using ViewTemplate = CSRView< Device_, Index_ >; - using ViewType = CSRView< Device, Index >; - using ConstViewType = CSRView< Device, std::add_const_t< IndexType > >; + using ViewTemplate = CSRView< Device_, Index_, KernelType_ >; + using ViewType = CSRView< Device, Index, KernelType_ >; + using ConstViewType = CSRView< Device, std::add_const_t< IndexType >, KernelType_ >; using SegmentViewType = SegmentView< IndexType, RowMajorOrder >; CSRKernelTypes KernelType = KernelType_; diff --git a/src/TNL/Algorithms/Segments/CSRView.hpp b/src/TNL/Algorithms/Segments/CSRView.hpp index 7077d0f03..a49b1bfc9 100644 --- 
a/src/TNL/Algorithms/Segments/CSRView.hpp +++ b/src/TNL/Algorithms/Segments/CSRView.hpp @@ -14,6 +14,7 @@ #include #include #include +#include #include namespace TNL { @@ -217,7 +218,7 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reductio { using RealType = typename details::FetchLambdaAdapter< Index, Fetch >::ReturnType; const auto offsetsView = this->offsets.getConstView(); - if( KernelType == CSRScalarKernel ) + if( KernelType == CSRScalarKernel || std::is_same< DeviceType, TNL::Devices::Host >::value ) { auto l = [=] __cuda_callable__ ( const IndexType segmentIdx, Args... args ) mutable { const IndexType begin = offsetsView[ segmentIdx ]; @@ -229,7 +230,14 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reductio aux = reduction( aux, details::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, globalIdx, compute ) ); keeper( segmentIdx, aux ); }; - Algorithms::ParallelFor< Device >::exec( first, last, l, args... ); + Algorithms::ParallelFor< Device >::exec( first, last, l, args... ); + } + if( KernelType == CSRVectorKernel ) + details::RowsReductionVectorKernelCaller( offsetsView, first, last, fetch, reduction, keeper, zero, args... ); + if( KernelType == CSRLightKernel ) + { + const IndexType elementsInSegment = ceil( this->getSize() / this->getSegmentsCount() ); + details::RowsReductionLightKernelCaller( elementsInSegment, offsetsView, first, last, fetch, reduction, keeper, zero, args... ); } } -- GitLab From 93dd8e209f69c984295495bfe3eb0360da25fb8e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Fri, 22 Jan 2021 16:21:05 +0100 Subject: [PATCH 09/27] Adding CSR Light kernel. 
--- .../Algorithms/Segments/details/CSRKernels.h | 280 ++++++++++++++++++ 1 file changed, 280 insertions(+) create mode 100644 src/TNL/Algorithms/Segments/details/CSRKernels.h diff --git a/src/TNL/Algorithms/Segments/details/CSRKernels.h b/src/TNL/Algorithms/Segments/details/CSRKernels.h new file mode 100644 index 000000000..0fc237483 --- /dev/null +++ b/src/TNL/Algorithms/Segments/details/CSRKernels.h @@ -0,0 +1,280 @@ +/*************************************************************************** + CSRKernels.h - description + ------------------- + begin : Jan 20, 2021 -> Joe Biden inauguration + copyright : (C) 2021 by Tomas Oberhuber + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +#pragma once + +#include +#include +#include + +namespace TNL { + namespace Algorithms { + namespace Segments { + namespace details { + + +#ifdef HAVE_CUDA +template< typename Device, + typename Index, + typename Fetch, + typename Reduction, + typename ResultKeeper, + typename Real, + typename... Args > +__global__ +void RowsReductionVectorKernel( + int gridIdx, + const TNL::Containers::VectorView< Index, TNL::Devices::Cuda, Index > offsets, + Index first, + Index last, + Fetch& fetch, + const Reduction& reduction, + ResultKeeper& keeper, + const Real& zero, + Args... 
args ) +{ + /*** + * We map one warp to each segment + */ + const Index segmentIdx = TNL::Cuda::getGlobalThreadIdx( gridIdx ) / TNL::Cuda::getWarpSize() + first; + if( segmentIdx >= last ) + return; + + const int laneIdx = threadIdx.x & 31; // & is cheaper than % + Index endIdx = offsets[ segmentIdx + 1] ; + + Index localIdx( laneIdx ); + Real aux = zero; + for( Index globalIdx = offsets[ segmentIdx ] + localIdx; i < endIdx; i += TNL::Cuda::getWarpSize() ) + { + aux = reduce( aux, details::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx, globalIdx, compute ) ); + localIdx += TNL::Cuda::getWarpSize(); + } + + /**** + * Reduction in each warp which means in each segment. + */ + aux += __shfl_down_sync(0xFFFFFFFF, aux, 16); + aux += __shfl_down_sync(0xFFFFFFFF, aux, 8); + aux += __shfl_down_sync(0xFFFFFFFF, aux, 4); + aux += __shfl_down_sync(0xFFFFFFFF, aux, 2); + aux += __shfl_down_sync(0xFFFFFFFF, aux, 1); + + if( laneIdx == 0 ) + keeper( segmentIdx, aux ) + + + + /*const Index warpID = ((gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x) / warpSize; + if (warpID >= rows) + return; + + Real result = 0.0; + const Index laneID = threadIdx.x & 31; // & is cheaper than % + Index endID = rowPointers[warpID + 1]; + + // Calculate result + for (Index i = rowPointers[warpID] + laneID; i < endID; i += warpSize) + result += values[i] * inVector[columnIndexes[i]]; + + // Reduction + result += __shfl_down_sync(0xFFFFFFFF, result, 16); + result += __shfl_down_sync(0xFFFFFFFF, result, 8); + result += __shfl_down_sync(0xFFFFFFFF, result, 4); + result += __shfl_down_sync(0xFFFFFFFF, result, 2); + result += __shfl_down_sync(0xFFFFFFFF, result, 1); + // Write result + if (laneID == 0) outVector[warpID] = result;*/ +} +#endif + +template< typename OffsetsView, + typename Index, + typename Fetch, + typename Reduction, + typename ResultKeeper, + typename Real, + typename... 
Args > +void +RowsReductionVectorKernelCaller( + const OffsetsView& offsets, + Index first, + Index last, + Fetch& fetch, + const Reduction& reduction, + ResultKeeper& keeper, + const Real& zero, + Args... args ) +{ +#ifdef HAVE_CUDA + const Index warpsCount = last - first; + const size_t threadsCount = warpsCount * TNL::Cuda::getWarpSize(); + dim3 blocksCount, gridsCount, blockSize( 256 ); + TNL::Cuda::setupThreads( blockSize, blocksCount, gridsCount, threadsCount ); + for( int gridIdx = 0; gridIdx < gridsCount.x; gridIdx ++ ) + { + dim3 gridSize; + setupGrid( blocksCount, gridsCount, gridIdx, gridSize ); + SpMVCSRVector< Index, Fetch, Redcution, ResultKeeper, Real, Args ><<< gridSize, blockSize >>>( + gridIdx.x, offsets, first, last, fetch, reduction, keeper, zero, args ); + }; + +#endif + +/*const Index threads = matrix.THREADS_VECTOR; // block size + size_t neededThreads = matrix.getRowPointers().getSize() * warpSize; + Index blocks; + // Execute kernels on device + for (Index grid = 0; neededThreads != 0; ++grid) { + if (MAX_X_DIM * threads >= neededThreads) { + blocks = roundUpDivision(neededThreads, threads); + neededThreads = 0; + } else { + blocks = MAX_X_DIM; + neededThreads -= MAX_X_DIM * threads; + } + + SpMVCSRVector<<>>( + inVector, + outVector, + matrix.getRowPointers().getData(), + matrix.getColumnIndexes().getData(), + matrix.getValues().getData(), + matrix.getRowPointers().getSize() - 1, + grid + ); + }*/ +} + +#ifdef HAVE_CUDA +template< int ThreadsPerSegment, + typename Device, + typename Index, + typename Fetch, + typename Reduction, + typename ResultKeeper, + typename Real, + typename... Args > +__global__ +void RowsReductionLightKernel( + int gridIdx, + const TNL::Containers::VectorView< Index, TNL::Devices::Cuda, Index > offsets, + Index first, + Index last, + Fetch& fetch, + const Reduction& reduction, + ResultKeeper& keeper, + const Real& zero, + Args... 
args ) +{ + /*** + * We map one warp to each segment + */ + const Index segmentIdx = TNL::Cuda::getGlobalThreadIdx( gridIdx ) / TNL::Cuda::getWarpSize() + first; + if( segmentIdx >= last ) + return; + + const int laneIdx = threadIdx.x & ( ThreadsPerSegment - 1 ); // & is cheaper than % + Index endIdx = offsets[ segmentIdx + 1] ; + + Index localIdx( laneIdx ); + Real aux = zero; + for( Index globalIdx = offsets[ segmentIdx ] + localIdx; i < endIdx; i += ThreadsPerSegment ) + { + aux = reduce( aux, details::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx, globalIdx, compute ) ); + localIdx += TNL::Cuda::getWarpSize(); + } + + /**** + * Reduction in each segment. + */ + if( ThreadsPerSegment == 32 ) + aux += __shfl_down_sync(0xFFFFFFFF, aux, 16); + if( ThreadsPerSegment >= 16 ) + aux += __shfl_down_sync(0xFFFFFFFF, aux, 8); + if( ThreadsPerSegment >= 8 ) + aux += __shfl_down_sync(0xFFFFFFFF, aux, 4); + if( ThreadsPerSegment >= 4 ) + aux += __shfl_down_sync(0xFFFFFFFF, aux, 2); + if( ThreadsPerSegment >= 2 ) + aux += __shfl_down_sync(0xFFFFFFFF, aux, 1); + + if( laneIdx == 0 ) + keeper( segmentIdx, aux ) +} +#endif + + +template< typename OffsetsView, + typename Index, + typename Fetch, + typename Reduction, + typename ResultKeeper, + typename Real, + typename... Args > +void +RowsReductionLightKernelCaller( + const Index elementsInSegment, + const OffsetsView& offsets, + Index first, + Index last, + Fetch& fetch, + const Reduction& reduction, + ResultKeeper& keeper, + const Real& zero, + Args... 
args ) +{ +#ifdef HAVE_CUDA + const int threadsPerSegment = TNL::min( std::pow( 2, std::floor( std::log2( elementInSegment ) ) ), TNL::Cuda::getWarpSize() ); + TNL::ASSERT_GE( threadsPerSegment, 0 ); + TNL::ASSERT_LE( threadsPerSegment, 32 ); + const size_t threadsCount = threadsPerSegment * ( last - first ); + dim3 blocksCount, gridsCount, blockSize( 256 ); + TNL::Cuda::setupThreads( blockSize, blocksCount, gridsCount, threadsCount ); + for( int gridIdx = 0; gridIdx < gridsCount.x; gridIdx ++ ) + { + dim3 gridSize; + setupGrid( blocksCount, gridsCount, gridIdx, gridSize ); + switch( threadsPerSegment ) + { + case 1: + SpMVCSRLight< 1, Index, Fetch, Redcution, ResultKeeper, Real, Args ><<< gridSize, blockSize >>>( + gridIdx.x, offsets, first, last, fetch, reduction, keeper, zero, args ); + break; + case 2: + SpMVCSRLight< 2, Index, Fetch, Redcution, ResultKeeper, Real, Args ><<< gridSize, blockSize >>>( + gridIdx.x, offsets, first, last, fetch, reduction, keeper, zero, args ); + break; + case 4: + SpMVCSRLight< 4, Index, Fetch, Redcution, ResultKeeper, Real, Args ><<< gridSize, blockSize >>>( + gridIdx.x, offsets, first, last, fetch, reduction, keeper, zero, args ); + break; + case 8: + SpMVCSRLight< 8, Index, Fetch, Redcution, ResultKeeper, Real, Args ><<< gridSize, blockSize >>>( + gridIdx.x, offsets, first, last, fetch, reduction, keeper, zero, args ); + break; + case 16: + SpMVCSRLight< 16, Index, Fetch, Redcution, ResultKeeper, Real, Args ><<< gridSize, blockSize >>>( + gridIdx.x, offsets, first, last, fetch, reduction, keeper, zero, args ); + break; + case 32: + SpMVCSRLight< 32, Index, Fetch, Redcution, ResultKeeper, Real, Args ><<< gridSize, blockSize >>>( + gridIdx.x, offsets, first, last, fetch, reduction, keeper, zero, args ); + break; + default: + throw std::runtime_error( "Wrong value of threadsPerSegment." 
); + }; +#endif +} + + } // namespace details + } // namespace Segments + } // namespace Algorithms +} // namespace TNL -- GitLab From c1b8c44fd60cf3933d88cee0e94ac3c17273ea92 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Fri, 22 Jan 2021 16:21:40 +0100 Subject: [PATCH 10/27] Added unit tests for CSR Vector sparse matrix. --- src/UnitTests/Matrices/CMakeLists.txt | 1 + .../Matrices/SparseMatrixTest_CSRVector.cpp | 11 +++++ .../Matrices/SparseMatrixTest_CSRVector.cu | 11 +++++ .../Matrices/SparseMatrixTest_CSRVector.h | 46 +++++++++++++++++++ 4 files changed, 69 insertions(+) create mode 100644 src/UnitTests/Matrices/SparseMatrixTest_CSRVector.cpp create mode 100644 src/UnitTests/Matrices/SparseMatrixTest_CSRVector.cu create mode 100644 src/UnitTests/Matrices/SparseMatrixTest_CSRVector.h diff --git a/src/UnitTests/Matrices/CMakeLists.txt b/src/UnitTests/Matrices/CMakeLists.txt index e5660090c..7fc16968e 100644 --- a/src/UnitTests/Matrices/CMakeLists.txt +++ b/src/UnitTests/Matrices/CMakeLists.txt @@ -7,6 +7,7 @@ set( COMMON_TESTS MultidiagonalMatrixTest SparseMatrixTest_CSRScalar + SparseMatrixTest_CSRVector SparseMatrixTest_Ellpack SparseMatrixTest_SlicedEllpack SparseMatrixTest_ChunkedEllpack diff --git a/src/UnitTests/Matrices/SparseMatrixTest_CSRVector.cpp b/src/UnitTests/Matrices/SparseMatrixTest_CSRVector.cpp new file mode 100644 index 000000000..1f6bf5111 --- /dev/null +++ b/src/UnitTests/Matrices/SparseMatrixTest_CSRVector.cpp @@ -0,0 +1,11 @@ +/*************************************************************************** + SparseMatrixTest_CSRVector.cpp - description + ------------------- + begin : Jan 22, 2021 + copyright : (C) 2021 by Tomas Oberhuber et al. 
+ email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +#include "SparseMatrixTest_CSRScalar.h" diff --git a/src/UnitTests/Matrices/SparseMatrixTest_CSRVector.cu b/src/UnitTests/Matrices/SparseMatrixTest_CSRVector.cu new file mode 100644 index 000000000..11d7afc9c --- /dev/null +++ b/src/UnitTests/Matrices/SparseMatrixTest_CSRVector.cu @@ -0,0 +1,11 @@ +/*************************************************************************** + SparseMatrixTest_CSRVector.cu - description + ------------------- + begin : Jan 22, 2021 + copyright : (C) 2021 by Tomas Oberhuber et al. + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +#include "SparseMatrixTest_CSRScalar.h" diff --git a/src/UnitTests/Matrices/SparseMatrixTest_CSRVector.h b/src/UnitTests/Matrices/SparseMatrixTest_CSRVector.h new file mode 100644 index 000000000..7b2e4e7fc --- /dev/null +++ b/src/UnitTests/Matrices/SparseMatrixTest_CSRVector.h @@ -0,0 +1,46 @@ +/*************************************************************************** + SparseMatrixTest_CSRVector.h - description + ------------------- + begin : Jan 22, 2021 + copyright : (C) 2021 by Tomas Oberhuber et al. 
+ email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +#include +#include +#include + +#ifdef HAVE_GTEST +#include + +const char* saveAndLoadFileName = "test_SparseMatrixTest_CSR_segments"; + +// types for which MatrixTest is instantiated +using MatrixTypes = ::testing::Types +< + TNL::Matrices::SparseMatrix< int, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRVector >, + TNL::Matrices::SparseMatrix< long, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRVector >, + TNL::Matrices::SparseMatrix< float, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRVector >, + TNL::Matrices::SparseMatrix< double, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRVector >, + TNL::Matrices::SparseMatrix< int, TNL::Devices::Host, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRVector >, + TNL::Matrices::SparseMatrix< long, TNL::Devices::Host, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRVector >, + TNL::Matrices::SparseMatrix< float, TNL::Devices::Host, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRVector >, + TNL::Matrices::SparseMatrix< double, TNL::Devices::Host, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRVector > +#ifdef HAVE_CUDA + ,TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRVector >, + TNL::Matrices::SparseMatrix< long, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRVector >, + TNL::Matrices::SparseMatrix< float, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRVector >, + TNL::Matrices::SparseMatrix< double, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, 
TNL::Algorithms::Segments::CSRVector >, + TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRVector >, + TNL::Matrices::SparseMatrix< long, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRVector >, + TNL::Matrices::SparseMatrix< float, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRVector >, + TNL::Matrices::SparseMatrix< double, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRVector > +#endif +>; + +#endif + +#include "SparseMatrixTest.h" +#include "../main.h" -- GitLab From 4cda08c805d67e2b2f4b624c97716b8d99a6ba63 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Fri, 22 Jan 2021 23:09:10 +0100 Subject: [PATCH 11/27] Added Scalar, Vector and Light CSR kernels. --- src/TNL/Algorithms/Segments/CSR.h | 22 +- src/TNL/Algorithms/Segments/CSR.hpp | 105 ++--- src/TNL/Algorithms/Segments/CSRKernels.h | 427 ++++++++++++++++++ src/TNL/Algorithms/Segments/CSRView.h | 24 +- src/TNL/Algorithms/Segments/CSRView.hpp | 118 ++--- .../Algorithms/Segments/details/CSRKernels.h | 280 ------------ .../Matrices/SparseMatrixTest_CSRScalar.h | 2 +- .../Matrices/SparseMatrixTest_CSRVector.h | 2 +- 8 files changed, 568 insertions(+), 412 deletions(-) create mode 100644 src/TNL/Algorithms/Segments/CSRKernels.h delete mode 100644 src/TNL/Algorithms/Segments/details/CSRKernels.h diff --git a/src/TNL/Algorithms/Segments/CSR.h b/src/TNL/Algorithms/Segments/CSR.h index 3eaad6eb9..e2b793b84 100644 --- a/src/TNL/Algorithms/Segments/CSR.h +++ b/src/TNL/Algorithms/Segments/CSR.h @@ -22,7 +22,7 @@ namespace TNL { template< typename Device, typename Index, - CSRKernelTypes KernelType_ = CSRScalarKernel, + typename Kernel = CSRScalarKernel< Index, Device >, typename IndexAllocator = typename Allocators::Default< Device >::template Allocator< Index > > class CSR { @@ -30,14 +30,14 @@ class CSR 
using DeviceType = Device; using IndexType = std::remove_const_t< Index >; + using KernelType = Kernel; using OffsetsHolder = Containers::Vector< Index, DeviceType, IndexType, IndexAllocator >; using SegmentsSizes = OffsetsHolder; template< typename Device_, typename Index_ > - using ViewTemplate = CSRView< Device_, Index_, KernelType_ >; - using ViewType = CSRView< Device, Index, KernelType_ >; - using ConstViewType = CSRView< Device, std::add_const_t< IndexType >, KernelType_ >; + using ViewTemplate = CSRView< Device_, Index_, KernelType >; + using ViewType = CSRView< Device, Index, KernelType >; + using ConstViewType = CSRView< Device, std::add_const_t< IndexType >, KernelType >; using SegmentViewType = SegmentView< IndexType, RowMajorOrder >; - CSRKernelTypes KernelType = KernelType_; CSR(); @@ -116,8 +116,8 @@ class CSR CSR& operator=( const CSR& rhsSegments ) = default; - template< typename Device_, typename Index_, CSRKernelTypes KernelType__, typename IndexAllocator_ > - CSR& operator=( const CSR< Device_, Index_, KernelType__, IndexAllocator_ >& source ); + template< typename Device_, typename Index_, typename Kernel_, typename IndexAllocator_ > + CSR& operator=( const CSR< Device_, Index_, Kernel_, IndexAllocator_ >& source ); void save( File& file ) const; @@ -126,22 +126,24 @@ class CSR protected: OffsetsHolder offsets; + + KernelType kernel; }; template< typename Device, typename Index, typename IndexAllocator = typename Allocators::Default< Device >::template Allocator< Index > > -using CSRScalar = CSR< Device, Index, CSRScalarKernel, IndexAllocator >; +using CSRScalar = CSR< Device, Index, CSRScalarKernel< Index, Device >, IndexAllocator >; template< typename Device, typename Index, typename IndexAllocator = typename Allocators::Default< Device >::template Allocator< Index > > -using CSRVector = CSR< Device, Index, CSRVectorKernel, IndexAllocator >; +using CSRVector = CSR< Device, Index, CSRVectorKernel< Index, Device >, IndexAllocator >; template< 
typename Device, typename Index, typename IndexAllocator = typename Allocators::Default< Device >::template Allocator< Index > > -using CSRLight = CSR< Device, Index, CSRLightKernel, IndexAllocator >; +using CSRLight = CSR< Device, Index, CSRLightKernel< Index, Device >, IndexAllocator >; template< typename Device, typename Index, diff --git a/src/TNL/Algorithms/Segments/CSR.hpp b/src/TNL/Algorithms/Segments/CSR.hpp index 48e82de41..9845b0208 100644 --- a/src/TNL/Algorithms/Segments/CSR.hpp +++ b/src/TNL/Algorithms/Segments/CSR.hpp @@ -22,18 +22,18 @@ namespace TNL { template< typename Device, typename Index, - CSRKernelTypes KernelType_, + typename Kernel, typename IndexAllocator > -CSR< Device, Index, KernelType_, IndexAllocator >:: +CSR< Device, Index, Kernel, IndexAllocator >:: CSR() { } template< typename Device, typename Index, - CSRKernelTypes KernelType_, + typename Kernel, typename IndexAllocator > -CSR< Device, Index, KernelType_, IndexAllocator >:: +CSR< Device, Index, Kernel, IndexAllocator >:: CSR( const SegmentsSizes& segmentsSizes ) { this->setSegmentsSizes( segmentsSizes ); @@ -41,18 +41,18 @@ CSR( const SegmentsSizes& segmentsSizes ) template< typename Device, typename Index, - CSRKernelTypes KernelType_, + typename Kernel, typename IndexAllocator > -CSR< Device, Index, KernelType_, IndexAllocator >:: +CSR< Device, Index, Kernel, IndexAllocator >:: CSR( const CSR& csr ) : offsets( csr.offsets ) { } template< typename Device, typename Index, - CSRKernelTypes KernelType_, + typename Kernel, typename IndexAllocator > -CSR< Device, Index, KernelType_, IndexAllocator >:: +CSR< Device, Index, Kernel, IndexAllocator >:: CSR( const CSR&& csr ) : offsets( std::move( csr.offsets ) ) { @@ -60,10 +60,10 @@ CSR( const CSR&& csr ) : offsets( std::move( csr.offsets ) ) template< typename Device, typename Index, - CSRKernelTypes KernelType_, + typename Kernel, typename IndexAllocator > String -CSR< Device, Index, KernelType_, IndexAllocator >:: +CSR< Device, 
Index, Kernel, IndexAllocator >:: getSerializationType() { return "CSR< [any_device], " + TNL::getSerializationType< IndexType >() + " >"; @@ -71,10 +71,10 @@ getSerializationType() template< typename Device, typename Index, - CSRKernelTypes KernelType_, + typename Kernel, typename IndexAllocator > String -CSR< Device, Index, KernelType_, IndexAllocator >:: +CSR< Device, Index, Kernel, IndexAllocator >:: getSegmentsType() { return ViewType::getSegmentsType(); @@ -82,22 +82,23 @@ getSegmentsType() template< typename Device, typename Index, - CSRKernelTypes KernelType_, + typename Kernel, typename IndexAllocator > template< typename SizesHolder > void -CSR< Device, Index, KernelType_, IndexAllocator >:: +CSR< Device, Index, Kernel, IndexAllocator >:: setSegmentsSizes( const SizesHolder& sizes ) { details::CSR< Device, Index >::setSegmentsSizes( sizes, this->offsets ); + this->kernel.init( this->offsets ); } template< typename Device, typename Index, - CSRKernelTypes KernelType_, + typename Kernel, typename IndexAllocator > void -CSR< Device, Index, KernelType_, IndexAllocator >:: +CSR< Device, Index, Kernel, IndexAllocator >:: reset() { this->offsets.setSize( 1 ); @@ -107,31 +108,31 @@ reset() template< typename Device, typename Index, - CSRKernelTypes KernelType_, + typename Kernel, typename IndexAllocator > -typename CSR< Device, Index, KernelType_, IndexAllocator >::ViewType -CSR< Device, Index, KernelType_, IndexAllocator >:: +typename CSR< Device, Index, Kernel, IndexAllocator >::ViewType +CSR< Device, Index, Kernel, IndexAllocator >:: getView() { - return ViewType( this->offsets.getView() ); + return ViewType( this->offsets.getView(), this->kernel.getView() ); } template< typename Device, typename Index, - CSRKernelTypes KernelType_, + typename Kernel, typename IndexAllocator > auto -CSR< Device, Index, KernelType_, IndexAllocator >:: +CSR< Device, Index, Kernel, IndexAllocator >:: getConstView() const -> const ConstViewType { - return ConstViewType( 
this->offsets.getConstView() ); + return ConstViewType( this->offsets.getConstView(), this->kernel.getConstView() ); } template< typename Device, typename Index, - CSRKernelTypes KernelType_, + typename Kernel, typename IndexAllocator > -__cuda_callable__ auto CSR< Device, Index, KernelType_, IndexAllocator >:: +__cuda_callable__ auto CSR< Device, Index, Kernel, IndexAllocator >:: getSegmentsCount() const -> IndexType { return this->offsets.getSize() - 1; @@ -139,9 +140,9 @@ getSegmentsCount() const -> IndexType template< typename Device, typename Index, - CSRKernelTypes KernelType_, + typename Kernel, typename IndexAllocator > -__cuda_callable__ auto CSR< Device, Index, KernelType_, IndexAllocator >:: +__cuda_callable__ auto CSR< Device, Index, Kernel, IndexAllocator >:: getSegmentSize( const IndexType segmentIdx ) const -> IndexType { return details::CSR< Device, Index >::getSegmentSize( this->offsets, segmentIdx ); @@ -149,9 +150,9 @@ getSegmentSize( const IndexType segmentIdx ) const -> IndexType template< typename Device, typename Index, - CSRKernelTypes KernelType_, + typename Kernel, typename IndexAllocator > -__cuda_callable__ auto CSR< Device, Index, KernelType_, IndexAllocator >:: +__cuda_callable__ auto CSR< Device, Index, Kernel, IndexAllocator >:: getSize() const -> IndexType { return this->getStorageSize(); @@ -159,9 +160,9 @@ getSize() const -> IndexType template< typename Device, typename Index, - CSRKernelTypes KernelType_, + typename Kernel, typename IndexAllocator > -__cuda_callable__ auto CSR< Device, Index, KernelType_, IndexAllocator >:: +__cuda_callable__ auto CSR< Device, Index, Kernel, IndexAllocator >:: getStorageSize() const -> IndexType { return details::CSR< Device, Index >::getStorageSize( this->offsets ); @@ -169,9 +170,9 @@ getStorageSize() const -> IndexType template< typename Device, typename Index, - CSRKernelTypes KernelType_, + typename Kernel, typename IndexAllocator > -__cuda_callable__ auto CSR< Device, Index, KernelType_, 
IndexAllocator >:: +__cuda_callable__ auto CSR< Device, Index, Kernel, IndexAllocator >:: getGlobalIndex( const Index segmentIdx, const Index localIdx ) const -> IndexType { if( ! std::is_same< DeviceType, Devices::Host >::value ) @@ -187,11 +188,11 @@ getGlobalIndex( const Index segmentIdx, const Index localIdx ) const -> IndexTyp template< typename Device, typename Index, - CSRKernelTypes KernelType_, + typename Kernel, typename IndexAllocator > __cuda_callable__ auto -CSR< Device, Index, KernelType_, IndexAllocator >:: +CSR< Device, Index, Kernel, IndexAllocator >:: getSegmentView( const IndexType segmentIdx ) const -> SegmentViewType { return SegmentViewType( offsets[ segmentIdx ], offsets[ segmentIdx + 1 ] - offsets[ segmentIdx ] ); @@ -199,11 +200,11 @@ getSegmentView( const IndexType segmentIdx ) const -> SegmentViewType template< typename Device, typename Index, - CSRKernelTypes KernelType_, + typename Kernel, typename IndexAllocator > template< typename Function, typename... Args > void -CSR< Device, Index, KernelType_, IndexAllocator >:: +CSR< Device, Index, Kernel, IndexAllocator >:: forSegments( IndexType first, IndexType last, Function& f, Args... args ) const { this->getConstView().forSegments( first, last, f, args... ); @@ -211,11 +212,11 @@ forSegments( IndexType first, IndexType last, Function& f, Args... args ) const template< typename Device, typename Index, - CSRKernelTypes KernelType_, + typename Kernel, typename IndexAllocator> template< typename Function, typename... Args > void -CSR< Device, Index, KernelType_, IndexAllocator >:: +CSR< Device, Index, Kernel, IndexAllocator >:: forAll( Function& f, Args... args ) const { this->forSegments( 0, this->getSegmentsCount(), f, args... ); @@ -223,11 +224,11 @@ forAll( Function& f, Args... 
args ) const template< typename Device, typename Index, - CSRKernelTypes KernelType_, + typename Kernel, typename IndexAllocator > template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args > void -CSR< Device, Index, KernelType_, IndexAllocator >:: +CSR< Device, Index, Kernel, IndexAllocator >:: segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const { this->getConstView().segmentsReduction( first, last, fetch, reduction, keeper, zero, args... ); @@ -235,11 +236,11 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reductio template< typename Device, typename Index, - CSRKernelTypes KernelType_, + typename Kernel, typename IndexAllocator > template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args > void -CSR< Device, Index, KernelType_, IndexAllocator >:: +CSR< Device, Index, Kernel, IndexAllocator >:: allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const { this->segmentsReduction( 0, this->getSegmentsCount(), fetch, reduction, keeper, zero, args... 
); @@ -247,12 +248,12 @@ allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, co template< typename Device, typename Index, - CSRKernelTypes KernelType_, + typename Kernel, typename IndexAllocator > - template< typename Device_, typename Index_, CSRKernelTypes KernelType__, typename IndexAllocator_ > -CSR< Device, Index, KernelType_, IndexAllocator >& -CSR< Device, Index, KernelType_, IndexAllocator >:: -operator=( const CSR< Device_, Index_, KernelType__, IndexAllocator_ >& source ) + template< typename Device_, typename Index_, typename Kernel_, typename IndexAllocator_ > +CSR< Device, Index, Kernel, IndexAllocator >& +CSR< Device, Index, Kernel, IndexAllocator >:: +operator=( const CSR< Device_, Index_, Kernel_, IndexAllocator_ >& source ) { this->offsets = source.offsets; return *this; @@ -260,10 +261,10 @@ operator=( const CSR< Device_, Index_, KernelType__, IndexAllocator_ >& source ) template< typename Device, typename Index, - CSRKernelTypes KernelType_, + typename Kernel, typename IndexAllocator > void -CSR< Device, Index, KernelType_, IndexAllocator >:: +CSR< Device, Index, Kernel, IndexAllocator >:: save( File& file ) const { file << this->offsets; @@ -271,10 +272,10 @@ save( File& file ) const template< typename Device, typename Index, - CSRKernelTypes KernelType_, + typename Kernel, typename IndexAllocator > void -CSR< Device, Index, KernelType_, IndexAllocator >:: +CSR< Device, Index, Kernel, IndexAllocator >:: load( File& file ) { file >> this->offsets; diff --git a/src/TNL/Algorithms/Segments/CSRKernels.h b/src/TNL/Algorithms/Segments/CSRKernels.h new file mode 100644 index 000000000..7d9b6f1d2 --- /dev/null +++ b/src/TNL/Algorithms/Segments/CSRKernels.h @@ -0,0 +1,427 @@ +/*************************************************************************** + CSRKernels.h - description + ------------------- + begin : Jan 20, 2021 -> Joe Biden inauguration + copyright : (C) 2021 by Tomas Oberhuber + email : 
tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +#pragma once + +#include +#include +#include +#include +#include + +namespace TNL { + namespace Algorithms { + namespace Segments { + +template< typename Index, + typename Device > +struct CSRScalarKernel +{ + using IndexType = Index; + using DeviceType = Device; + using ViewType = CSRScalarKernel< Index, Device >; + using ConstViewType = CSRScalarKernel< Index, Device >; + + template< typename Offsets > + void init( const Offsets& offsets ) {}; + + ViewType getView() { return *this; }; + + ConstViewType getConstView() const { return *this; }; + + template< typename OffsetsView, + typename Fetch, + typename Reduction, + typename ResultKeeper, + typename Real, + typename... Args > + static void rowsReduction( const OffsetsView& offsets, + Index first, + Index last, + Fetch& fetch, + const Reduction& reduction, + ResultKeeper& keeper, + const Real& zero, + Args... args ) + { + auto l = [=] __cuda_callable__ ( const IndexType segmentIdx, Args... args ) mutable { + const IndexType begin = offsets[ segmentIdx ]; + const IndexType end = offsets[ segmentIdx + 1 ]; + Real aux( zero ); + IndexType localIdx( 0 ); + bool compute( true ); + for( IndexType globalIdx = begin; globalIdx < end && compute; globalIdx++ ) + aux = reduction( aux, details::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, globalIdx, compute ) ); + keeper( segmentIdx, aux ); + }; + Algorithms::ParallelFor< Device >::exec( first, last, l, args... ); + } +}; + +#ifdef HAVE_CUDA +template< typename Device, + typename Index, + typename Fetch, + typename Reduction, + typename ResultKeeper, + typename Real, + typename... 
Args > +__global__ +void RowsReductionCSRVectorKernel( + int gridIdx, + const TNL::Containers::VectorView< Index, TNL::Devices::Cuda, Index > offsets, + Index first, + Index last, + Fetch& fetch, + const Reduction& reduction, + ResultKeeper& keeper, + const Real& zero, + Args... args ) +{ + /*** + * We map one warp to each segment + */ + const Index segmentIdx = TNL::Cuda::getGlobalThreadIdx( gridIdx ) / TNL::Cuda::getWarpSize() + first; + if( segmentIdx >= last ) + return; + + const int laneIdx = threadIdx.x & 31; // & is cheaper than % + Index endIdx = offsets[ segmentIdx + 1] ; + + Index localIdx( laneIdx ); + Real aux = zero; + bool compute( true ); + for( Index globalIdx = offsets[ segmentIdx ] + localIdx; globalIdx < endIdx; globalIdx += TNL::Cuda::getWarpSize() ) + { + aux = reduce( aux, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, localIdx, globalIdx, compute ) ); + localIdx += TNL::Cuda::getWarpSize(); + } + + /**** + * Reduction in each warp which means in each segment. + */ + aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux, 16 ) ); + aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux, 8 ) ); + aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux, 4 ) ); + aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux, 2 ) ); + aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux, 1 ) ); + + if( laneIdx == 0 ) + keeper( segmentIdx, aux ); +} +#endif + +template< typename Index, + typename Device > +struct CSRVectorKernel +{ + using IndexType = Index; + using DeviceType = Device; + using ViewType = CSRVectorKernel< Index, Device >; + using ConstViewType = CSRVectorKernel< Index, Device >; + + template< typename Offsets > + void init( const Offsets& offsets ) {}; + + ViewType getView() { return *this; }; + + ConstViewType getConstView() const { return *this; }; + + + template< typename OffsetsView, + typename Fetch, + typename Reduction, + typename ResultKeeper, + typename Real, + typename... 
Args > + static void rowsReduction( const OffsetsView& offsets, + Index first, + Index last, + Fetch& fetch, + const Reduction& reduction, + ResultKeeper& keeper, + const Real& zero, + Args... args ) + { + abort(); +#ifdef HAVE_CUDA + const Index warpsCount = last - first; + const size_t threadsCount = warpsCount * TNL::Cuda::getWarpSize(); + dim3 blocksCount, gridsCount, blockSize( 256 ); + TNL::Cuda::setupThreads( blockSize, blocksCount, gridsCount, threadsCount ); + dim3 gridIdx; + for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x ++ ) + { + dim3 gridSize; + TNL::Cuda::setupGrid( blocksCount, gridsCount, gridIdx, gridSize ); + RowsReductionCSRVectorKernel< Index, Fetch, Reduction, ResultKeeper, Real, Args... > + <<< gridSize, blockSize >>>( + gridIdx.x, offsets, first, last, fetch, reduction, keeper, zero, args... ); + }; +#endif + } +}; + + +#ifdef HAVE_CUDA +template< int ThreadsPerSegment, + typename Device, + typename Index, + typename Fetch, + typename Reduction, + typename ResultKeeper, + typename Real, + typename... Args > +__global__ +void RowsReductionCSRLightKernel( + int gridIdx, + const TNL::Containers::VectorView< Index, TNL::Devices::Cuda, Index > offsets, + Index first, + Index last, + Fetch& fetch, + const Reduction& reduction, + ResultKeeper& keeper, + const Real& zero, + Args... 
args ) +{ + /*** + * We map one warp to each segment + */ + const Index segmentIdx = TNL::Cuda::getGlobalThreadIdx( gridIdx ) / TNL::Cuda::getWarpSize() + first; + if( segmentIdx >= last ) + return; + + const int laneIdx = threadIdx.x & ( ThreadsPerSegment - 1 ); // & is cheaper than % + Index endIdx = offsets[ segmentIdx + 1] ; + + Index localIdx( laneIdx ); + Real aux = zero; + bool compute( true ); + for( Index globalIdx = offsets[ segmentIdx ] + localIdx; globalIdx < endIdx; globalIdx += ThreadsPerSegment ) + { + aux = reduce( aux, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, localIdx, globalIdx, compute ) ); + localIdx += TNL::Cuda::getWarpSize(); + } + + /**** + * Reduction in each segment. + */ + if( ThreadsPerSegment == 32 ) + aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux, 16 ) ); + if( ThreadsPerSegment >= 16 ) + aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux, 8 ) ); + if( ThreadsPerSegment >= 8 ) + aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux, 4 ) ); + if( ThreadsPerSegment >= 4 ) + aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux, 2 ) ); + if( ThreadsPerSegment >= 2 ) + aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux, 1 ) ); + + if( laneIdx == 0 ) + keeper( segmentIdx, aux ); +} +#endif + +template< typename Index, + typename Device > +struct CSRLightKernel +{ + using IndexType = Index; + using DeviceType = Device; + using ViewType = CSRLightKernel< Index, Device >; + using ConstViewType = CSRLightKernel< Index, Device >; + + template< typename Offsets > + void init( const Offsets& offsets ) + { + const Index segmentsCount = offsets.getSize() - 1; + const Index elementsInSegment = offsets.getElement( segmentsCount ) / segmentsCount; + this->threadsPerSegment = TNL::min( std::pow( 2, std::floor( std::log2( elementsInSegment ) ) ), TNL::Cuda::getWarpSize() ); + TNL_ASSERT_GE( threadsPerSegment, 0, "" ); + TNL_ASSERT_LE( threadsPerSegment, 32, "" ); + }; + + ViewType getView() { return *this; }; + + 
ConstViewType getConstView() const { return *this; }; + + template< typename OffsetsView, + typename Fetch, + typename Reduction, + typename ResultKeeper, + typename Real, + typename... Args > + void rowsReduction( const OffsetsView& offsets, + Index first, + Index last, + Fetch& fetch, + const Reduction& reduction, + ResultKeeper& keeper, + const Real& zero, + Args... args ) const + { +#ifdef HAVE_CUDA + const size_t threadsCount = this->threadsPerSegment * ( last - first ); + dim3 blocksCount, gridsCount, blockSize( 256 ); + TNL::Cuda::setupThreads( blockSize, blocksCount, gridsCount, threadsCount ); + for( int gridIdx = 0; gridIdx < gridsCount.x; gridIdx ++ ) + { + dim3 gridSize; + TNL::Cuda::setupGrid( blocksCount, gridsCount, gridIdx, gridSize ); + switch( this->threadsPerSegment ) + { + case 1: + RowsReductionCSRLightKernel< 1, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>( + gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... ); + break; + case 2: + RowsReductionCSRLightKernel< 2, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>( + gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... ); + break; + case 4: + RowsReductionCSRLightKernel< 4, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>( + gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... ); + break; + case 8: + RowsReductionCSRLightKernel< 8, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>( + gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... ); + break; + case 16: + RowsReductionCSRLightKernel< 16, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>( + gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... ); + break; + case 32: + RowsReductionCSRLightKernel< 32, Index, Fetch, Reduction, ResultKeeper, Real, Args... 
><<< gridSize, blockSize >>>( + gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... ); + break; + default: + throw std::runtime_error( "Wrong value of threadsPerSegment." ); + } + } +#endif + } + + protected: + int threadsPerSegment; +}; + + +template< typename Index, + typename Device > +struct CSRAdaptiveKernelView +{ + using IndexType = Index; + using DeviceType = Device; + using ViewType = CSRAdaptiveKernelView< Index, Device >; + using ConstViewType = CSRAdaptiveKernelView< Index, Device >; + + ViewType getView() { return *this; }; + + ConstViewType getConstView() const { return *this; }; + + template< typename OffsetsView, + typename Fetch, + typename Reduction, + typename ResultKeeper, + typename Real, + typename... Args > + void rowsReduction( const OffsetsView& offsets, + Index first, + Index last, + Fetch& fetch, + const Reduction& reduction, + ResultKeeper& keeper, + const Real& zero, + Args... args ) const + { + } +}; + +template< typename Index, + typename Device > +struct CSRAdaptiveKernel +{ + using IndexType = Index; + using DeviceType = Device; + using ViewType = CSRAdaptiveKernel< Index, Device >; + using ConstViewType = CSRAdaptiveKernel< Index, Device >; + + template< typename Offsets > + void init( const Offsets& offsets ) + { + /*const Index rows = offsets.getSize(); + Index sum, start = 0, nextStart = 0; + + // Fill blocks + std::vector> inBlock; + inBlock.reserve(rows); + + while (nextStart != rows - 1) + { + Type type; + nextStart = findLimit( + start, *this, rows, type, sum ); + + if (type == Type::LONG) + { + Index parts = roundUpDivision(sum, this->SHARED_PER_WARP); + for (Index index = 0; index < parts; ++index) + { + inBlock.emplace_back(start, Type::LONG, index); + } + } + else + { + inBlock.emplace_back(start, type, + nextStart, + this->rowPointers.getElement(nextStart), + this->rowPointers.getElement(start) ); + } + start = nextStart; + } + inBlock.emplace_back(nextStart); + + // Copy values + 
this->blocks.setSize(inBlock.size()); + for (size_t i = 0; i < inBlock.size(); ++i) + this->blocks.setElement(i, inBlock[i]); + */ + }; + + ViewType getView() { return view; }; + + ConstViewType getConstView() const { return ConstViewType(); }; + + template< typename OffsetsView, + typename Fetch, + typename Reduction, + typename ResultKeeper, + typename Real, + typename... Args > + void rowsReduction( const OffsetsView& offsets, + Index first, + Index last, + Fetch& fetch, + const Reduction& reduction, + ResultKeeper& keeper, + const Real& zero, + Args... args ) const + { + view.rowsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... ); + } + + ViewType view; +}; + + + + } // namespace Segments + } // namespace Algorithms +} // namespace TNL diff --git a/src/TNL/Algorithms/Segments/CSRView.h b/src/TNL/Algorithms/Segments/CSRView.h index b30863b8f..541b7c957 100644 --- a/src/TNL/Algorithms/Segments/CSRView.h +++ b/src/TNL/Algorithms/Segments/CSRView.h @@ -14,39 +14,39 @@ #include #include +#include namespace TNL { namespace Algorithms { namespace Segments { -enum CSRKernelTypes { CSRScalarKernel, CSRVectorKernel, CSRLightKernel }; - template< typename Device, typename Index, - CSRKernelTypes KernelType_ = CSRScalarKernel > + typename Kernel = CSRScalarKernel< Index, Device > > class CSRView { public: using DeviceType = Device; using IndexType = std::remove_const_t< Index >; + using KernelType = Kernel; using OffsetsView = typename Containers::VectorView< Index, DeviceType, IndexType >; using ConstOffsetsView = typename Containers::Vector< Index, DeviceType, IndexType >::ConstViewType; + using KernelView = typename Kernel::ViewType; using ViewType = CSRView; template< typename Device_, typename Index_ > - using ViewTemplate = CSRView< Device_, Index_ >; - using ConstViewType = CSRView< Device, std::add_const_t< Index > >; + using ViewTemplate = CSRView< Device_, Index_, Kernel >; + using ConstViewType = CSRView< Device, std::add_const_t< Index 
>, Kernel >; using SegmentViewType = SegmentView< IndexType, RowMajorOrder >; - CSRKernelTypes KernelType = KernelType_; __cuda_callable__ CSRView(); __cuda_callable__ - CSRView( const OffsetsView& offsets ); + CSRView( const OffsetsView& offsets, const KernelView& kernel ); __cuda_callable__ - CSRView( const OffsetsView&& offsets ); + CSRView( const OffsetsView&& offsets, const KernelView&& kernel ); __cuda_callable__ CSRView( const CSRView& csr_view ); @@ -125,19 +125,21 @@ class CSRView protected: OffsetsView offsets; + + KernelView kernel; }; template< typename Device, typename Index > -using CSRViewScalar = CSRView< Device, Index, CSRScalarKernel >; +using CSRViewScalar = CSRView< Device, Index, CSRScalarKernel< Index, Device > >; template< typename Device, typename Index > -using CSRViewVector = CSRView< Device, Index, CSRVectorKernel >; +using CSRViewVector = CSRView< Device, Index, CSRVectorKernel< Index, Device > >; template< typename Device, typename Index > -using CSRViewLight = CSRView< Device, Index, CSRLightKernel >; +using CSRViewLight = CSRView< Device, Index, CSRLightKernel< Index, Device > >; template< typename Device, typename Index > diff --git a/src/TNL/Algorithms/Segments/CSRView.hpp b/src/TNL/Algorithms/Segments/CSRView.hpp index a49b1bfc9..43018a03f 100644 --- a/src/TNL/Algorithms/Segments/CSRView.hpp +++ b/src/TNL/Algorithms/Segments/CSRView.hpp @@ -14,7 +14,6 @@ #include #include #include -#include #include namespace TNL { @@ -24,68 +23,72 @@ namespace TNL { template< typename Device, typename Index, - CSRKernelTypes KernelType_ > + typename Kernel > __cuda_callable__ -CSRView< Device, Index, KernelType_ >:: +CSRView< Device, Index, Kernel >:: CSRView() { } template< typename Device, typename Index, - CSRKernelTypes KernelType_ > + typename Kernel > __cuda_callable__ -CSRView< Device, Index, KernelType_ >:: -CSRView( const OffsetsView& offsets_view ) - : offsets( offsets_view ) +CSRView< Device, Index, Kernel >:: +CSRView( const 
OffsetsView& offsets_view, + const KernelView& kernel_view ) + : offsets( offsets_view ), kernel( kernel_view ) { } template< typename Device, typename Index, - CSRKernelTypes KernelType_ > + typename Kernel > __cuda_callable__ -CSRView< Device, Index, KernelType_ >:: -CSRView( const OffsetsView&& offsets_view ) - : offsets( offsets_view ) +CSRView< Device, Index, Kernel >:: +CSRView( const OffsetsView&& offsets_view, + const KernelView&& kernel_view ) + : offsets( std::move( offsets_view ) ), kernel( std::move( kernel_view ) ) { } template< typename Device, typename Index, - CSRKernelTypes KernelType_ > + typename Kernel > __cuda_callable__ -CSRView< Device, Index, KernelType_ >:: +CSRView< Device, Index, Kernel >:: CSRView( const CSRView& csr_view ) - : offsets( csr_view.offsets ) + : offsets( csr_view.offsets ), kernel( csr_view.kernel ) { } template< typename Device, typename Index, - CSRKernelTypes KernelType_ > + typename Kernel > __cuda_callable__ -CSRView< Device, Index, KernelType_ >:: +CSRView< Device, Index, Kernel >:: CSRView( const CSRView&& csr_view ) - : offsets( std::move( csr_view.offsets ) ) + : offsets( std::move( csr_view.offsets ) ), kernel( std::move( csr_view.kernel ) ) { } template< typename Device, typename Index, - CSRKernelTypes KernelType_ > + typename Kernel > String -CSRView< Device, Index, KernelType_ >:: +CSRView< Device, Index, Kernel >:: getSerializationType() { - return "CSR< [any_device], " + TNL::getSerializationType< IndexType >() + " >"; + return "CSR< [any_device], " + + TNL::getSerializationType< IndexType >() + + TNL::getSerializationType< KernelType >() + " >"; } template< typename Device, typename Index, - CSRKernelTypes KernelType_ > + typename Kernel > String -CSRView< Device, Index, KernelType_ >:: +CSRView< Device, Index, Kernel >:: getSegmentsType() { return "CSR"; @@ -93,10 +96,10 @@ getSegmentsType() template< typename Device, typename Index, - CSRKernelTypes KernelType_ > + typename Kernel > __cuda_callable__ 
-typename CSRView< Device, Index, KernelType_ >::ViewType -CSRView< Device, Index, KernelType_ >:: +typename CSRView< Device, Index, Kernel >::ViewType +CSRView< Device, Index, Kernel >:: getView() { return ViewType( this->offsets ); @@ -104,19 +107,19 @@ getView() template< typename Device, typename Index, - CSRKernelTypes KernelType_ > + typename Kernel > __cuda_callable__ auto -CSRView< Device, Index, KernelType_ >:: +CSRView< Device, Index, Kernel >:: getConstView() const -> const ConstViewType { - return ConstViewType( this->offsets.getConstView() ); + return ConstViewType( this->offsets.getConstView(), this->kernel.getConstView() ); } template< typename Device, typename Index, - CSRKernelTypes KernelType_ > -__cuda_callable__ auto CSRView< Device, Index, KernelType_ >:: + typename Kernel > +__cuda_callable__ auto CSRView< Device, Index, Kernel >:: getSegmentsCount() const -> IndexType { return this->offsets.getSize() - 1; @@ -124,8 +127,8 @@ getSegmentsCount() const -> IndexType template< typename Device, typename Index, - CSRKernelTypes KernelType_ > -__cuda_callable__ auto CSRView< Device, Index, KernelType_ >:: + typename Kernel > +__cuda_callable__ auto CSRView< Device, Index, Kernel >:: getSegmentSize( const IndexType segmentIdx ) const -> IndexType { return details::CSR< Device, Index >::getSegmentSize( this->offsets, segmentIdx ); @@ -133,8 +136,8 @@ getSegmentSize( const IndexType segmentIdx ) const -> IndexType template< typename Device, typename Index, - CSRKernelTypes KernelType_ > -__cuda_callable__ auto CSRView< Device, Index, KernelType_ >:: + typename Kernel > +__cuda_callable__ auto CSRView< Device, Index, Kernel >:: getSize() const -> IndexType { return this->getStorageSize(); @@ -142,8 +145,8 @@ getSize() const -> IndexType template< typename Device, typename Index, - CSRKernelTypes KernelType_ > -__cuda_callable__ auto CSRView< Device, Index, KernelType_ >:: + typename Kernel > +__cuda_callable__ auto CSRView< Device, Index, Kernel >:: 
getStorageSize() const -> IndexType { return details::CSR< Device, Index >::getStorageSize( this->offsets ); @@ -151,8 +154,8 @@ getStorageSize() const -> IndexType template< typename Device, typename Index, - CSRKernelTypes KernelType_ > -__cuda_callable__ auto CSRView< Device, Index, KernelType_ >:: + typename Kernel > +__cuda_callable__ auto CSRView< Device, Index, Kernel >:: getGlobalIndex( const Index segmentIdx, const Index localIdx ) const -> IndexType { if( ! std::is_same< DeviceType, Devices::Host >::value ) @@ -168,10 +171,10 @@ getGlobalIndex( const Index segmentIdx, const Index localIdx ) const -> IndexTyp template< typename Device, typename Index, - CSRKernelTypes KernelType_ > + typename Kernel > __cuda_callable__ auto -CSRView< Device, Index, KernelType_ >:: +CSRView< Device, Index, Kernel >:: getSegmentView( const IndexType segmentIdx ) const -> SegmentViewType { return SegmentViewType( offsets[ segmentIdx ], offsets[ segmentIdx + 1 ] - offsets[ segmentIdx ], 1 ); @@ -179,10 +182,10 @@ getSegmentView( const IndexType segmentIdx ) const -> SegmentViewType template< typename Device, typename Index, - CSRKernelTypes KernelType_ > + typename Kernel > template< typename Function, typename... Args > void -CSRView< Device, Index, KernelType_ >:: +CSRView< Device, Index, Kernel >:: forSegments( IndexType first, IndexType last, Function& f, Args... args ) const { const auto offsetsView = this->offsets; @@ -199,10 +202,10 @@ forSegments( IndexType first, IndexType last, Function& f, Args... args ) const template< typename Device, typename Index, - CSRKernelTypes KernelType_ > + typename Kernel > template< typename Function, typename... Args > void -CSRView< Device, Index, KernelType_ >:: +CSRView< Device, Index, Kernel >:: forAll( Function& f, Args... args ) const { this->forSegments( 0, this->getSegmentsCount(), f, args... ); @@ -210,13 +213,14 @@ forAll( Function& f, Args... 
args ) const template< typename Device, typename Index, - CSRKernelTypes KernelType_ > + typename Kernel > template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args > void -CSRView< Device, Index, KernelType_ >:: +CSRView< Device, Index, Kernel >:: segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const { - using RealType = typename details::FetchLambdaAdapter< Index, Fetch >::ReturnType; + kernel.rowsReduction( this->offsets.getConstView(), first, last, fetch, reduction, keeper, zero, args... ); + /*using RealType = typename details::FetchLambdaAdapter< Index, Fetch >::ReturnType; const auto offsetsView = this->offsets.getConstView(); if( KernelType == CSRScalarKernel || std::is_same< DeviceType, TNL::Devices::Host >::value ) { @@ -238,15 +242,15 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reductio { const IndexType elementsInSegment = ceil( this->getSize() / this->getSegmentsCount() ); details::RowsReductionLightKernelCaller( elementsInSegment, offsetsView, first, last, fetch, reduction, keeper, zero, args... ); - } + }*/ } template< typename Device, typename Index, - CSRKernelTypes KernelType_ > + typename Kernel > template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args > void -CSRView< Device, Index, KernelType_ >:: +CSRView< Device, Index, Kernel >:: allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const { this->segmentsReduction( 0, this->getSegmentsCount(), fetch, reduction, keeper, zero, args... 
); @@ -254,9 +258,9 @@ allReduction( Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, co template< typename Device, typename Index, - CSRKernelTypes KernelType_ > -CSRView< Device, Index, KernelType_ >& -CSRView< Device, Index, KernelType_ >:: + typename Kernel > +CSRView< Device, Index, Kernel >& +CSRView< Device, Index, Kernel >:: operator=( const CSRView& view ) { this->offsets.bind( view.offsets ); @@ -265,9 +269,9 @@ operator=( const CSRView& view ) template< typename Device, typename Index, - CSRKernelTypes KernelType_ > + typename Kernel > void -CSRView< Device, Index, KernelType_ >:: +CSRView< Device, Index, Kernel >:: save( File& file ) const { file << this->offsets; @@ -275,9 +279,9 @@ save( File& file ) const template< typename Device, typename Index, - CSRKernelTypes KernelType_ > + typename Kernel > void -CSRView< Device, Index, KernelType_ >:: +CSRView< Device, Index, Kernel >:: load( File& file ) { file >> this->offsets; diff --git a/src/TNL/Algorithms/Segments/details/CSRKernels.h b/src/TNL/Algorithms/Segments/details/CSRKernels.h deleted file mode 100644 index 0fc237483..000000000 --- a/src/TNL/Algorithms/Segments/details/CSRKernels.h +++ /dev/null @@ -1,280 +0,0 @@ -/*************************************************************************** - CSRKernels.h - description - ------------------- - begin : Jan 20, 2021 -> Joe Biden inauguration - copyright : (C) 2021 by Tomas Oberhuber - email : tomas.oberhuber@fjfi.cvut.cz - ***************************************************************************/ - -/* See Copyright Notice in tnl/Copyright */ - -#pragma once - -#include -#include -#include - -namespace TNL { - namespace Algorithms { - namespace Segments { - namespace details { - - -#ifdef HAVE_CUDA -template< typename Device, - typename Index, - typename Fetch, - typename Reduction, - typename ResultKeeper, - typename Real, - typename... 
Args > -__global__ -void RowsReductionVectorKernel( - int gridIdx, - const TNL::Containers::VectorView< Index, TNL::Devices::Cuda, Index > offsets, - Index first, - Index last, - Fetch& fetch, - const Reduction& reduction, - ResultKeeper& keeper, - const Real& zero, - Args... args ) -{ - /*** - * We map one warp to each segment - */ - const Index segmentIdx = TNL::Cuda::getGlobalThreadIdx( gridIdx ) / TNL::Cuda::getWarpSize() + first; - if( segmentIdx >= last ) - return; - - const int laneIdx = threadIdx.x & 31; // & is cheaper than % - Index endIdx = offsets[ segmentIdx + 1] ; - - Index localIdx( laneIdx ); - Real aux = zero; - for( Index globalIdx = offsets[ segmentIdx ] + localIdx; i < endIdx; i += TNL::Cuda::getWarpSize() ) - { - aux = reduce( aux, details::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx, globalIdx, compute ) ); - localIdx += TNL::Cuda::getWarpSize(); - } - - /**** - * Reduction in each warp which means in each segment. - */ - aux += __shfl_down_sync(0xFFFFFFFF, aux, 16); - aux += __shfl_down_sync(0xFFFFFFFF, aux, 8); - aux += __shfl_down_sync(0xFFFFFFFF, aux, 4); - aux += __shfl_down_sync(0xFFFFFFFF, aux, 2); - aux += __shfl_down_sync(0xFFFFFFFF, aux, 1); - - if( laneIdx == 0 ) - keeper( segmentIdx, aux ) - - - - /*const Index warpID = ((gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x) / warpSize; - if (warpID >= rows) - return; - - Real result = 0.0; - const Index laneID = threadIdx.x & 31; // & is cheaper than % - Index endID = rowPointers[warpID + 1]; - - // Calculate result - for (Index i = rowPointers[warpID] + laneID; i < endID; i += warpSize) - result += values[i] * inVector[columnIndexes[i]]; - - // Reduction - result += __shfl_down_sync(0xFFFFFFFF, result, 16); - result += __shfl_down_sync(0xFFFFFFFF, result, 8); - result += __shfl_down_sync(0xFFFFFFFF, result, 4); - result += __shfl_down_sync(0xFFFFFFFF, result, 2); - result += __shfl_down_sync(0xFFFFFFFF, result, 1); - // Write result - if 
(laneID == 0) outVector[warpID] = result;*/ -} -#endif - -template< typename OffsetsView, - typename Index, - typename Fetch, - typename Reduction, - typename ResultKeeper, - typename Real, - typename... Args > -void -RowsReductionVectorKernelCaller( - const OffsetsView& offsets, - Index first, - Index last, - Fetch& fetch, - const Reduction& reduction, - ResultKeeper& keeper, - const Real& zero, - Args... args ) -{ -#ifdef HAVE_CUDA - const Index warpsCount = last - first; - const size_t threadsCount = warpsCount * TNL::Cuda::getWarpSize(); - dim3 blocksCount, gridsCount, blockSize( 256 ); - TNL::Cuda::setupThreads( blockSize, blocksCount, gridsCount, threadsCount ); - for( int gridIdx = 0; gridIdx < gridsCount.x; gridIdx ++ ) - { - dim3 gridSize; - setupGrid( blocksCount, gridsCount, gridIdx, gridSize ); - SpMVCSRVector< Index, Fetch, Redcution, ResultKeeper, Real, Args ><<< gridSize, blockSize >>>( - gridIdx.x, offsets, first, last, fetch, reduction, keeper, zero, args ); - }; - -#endif - -/*const Index threads = matrix.THREADS_VECTOR; // block size - size_t neededThreads = matrix.getRowPointers().getSize() * warpSize; - Index blocks; - // Execute kernels on device - for (Index grid = 0; neededThreads != 0; ++grid) { - if (MAX_X_DIM * threads >= neededThreads) { - blocks = roundUpDivision(neededThreads, threads); - neededThreads = 0; - } else { - blocks = MAX_X_DIM; - neededThreads -= MAX_X_DIM * threads; - } - - SpMVCSRVector<<>>( - inVector, - outVector, - matrix.getRowPointers().getData(), - matrix.getColumnIndexes().getData(), - matrix.getValues().getData(), - matrix.getRowPointers().getSize() - 1, - grid - ); - }*/ -} - -#ifdef HAVE_CUDA -template< int ThreadsPerSegment, - typename Device, - typename Index, - typename Fetch, - typename Reduction, - typename ResultKeeper, - typename Real, - typename... 
Args > -__global__ -void RowsReductionLightKernel( - int gridIdx, - const TNL::Containers::VectorView< Index, TNL::Devices::Cuda, Index > offsets, - Index first, - Index last, - Fetch& fetch, - const Reduction& reduction, - ResultKeeper& keeper, - const Real& zero, - Args... args ) -{ - /*** - * We map one warp to each segment - */ - const Index segmentIdx = TNL::Cuda::getGlobalThreadIdx( gridIdx ) / TNL::Cuda::getWarpSize() + first; - if( segmentIdx >= last ) - return; - - const int laneIdx = threadIdx.x & ( ThreadsPerSegment - 1 ); // & is cheaper than % - Index endIdx = offsets[ segmentIdx + 1] ; - - Index localIdx( laneIdx ); - Real aux = zero; - for( Index globalIdx = offsets[ segmentIdx ] + localIdx; i < endIdx; i += ThreadsPerSegment ) - { - aux = reduce( aux, details::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx, globalIdx, compute ) ); - localIdx += TNL::Cuda::getWarpSize(); - } - - /**** - * Reduction in each segment. - */ - if( ThreadsPerSegment == 32 ) - aux += __shfl_down_sync(0xFFFFFFFF, aux, 16); - if( ThreadsPerSegment >= 16 ) - aux += __shfl_down_sync(0xFFFFFFFF, aux, 8); - if( ThreadsPerSegment >= 8 ) - aux += __shfl_down_sync(0xFFFFFFFF, aux, 4); - if( ThreadsPerSegment >= 4 ) - aux += __shfl_down_sync(0xFFFFFFFF, aux, 2); - if( ThreadsPerSegment >= 2 ) - aux += __shfl_down_sync(0xFFFFFFFF, aux, 1); - - if( laneIdx == 0 ) - keeper( segmentIdx, aux ) -} -#endif - - -template< typename OffsetsView, - typename Index, - typename Fetch, - typename Reduction, - typename ResultKeeper, - typename Real, - typename... Args > -void -RowsReductionLightKernelCaller( - const Index elementsInSegment, - const OffsetsView& offsets, - Index first, - Index last, - Fetch& fetch, - const Reduction& reduction, - ResultKeeper& keeper, - const Real& zero, - Args... 
args ) -{ -#ifdef HAVE_CUDA - const int threadsPerSegment = TNL::min( std::pow( 2, std::floor( std::log2( elementInSegment ) ) ), TNL::Cuda::getWarpSize() ); - TNL::ASSERT_GE( threadsPerSegment, 0 ); - TNL::ASSERT_LE( threadsPerSegment, 32 ); - const size_t threadsCount = threadsPerSegment * ( last - first ); - dim3 blocksCount, gridsCount, blockSize( 256 ); - TNL::Cuda::setupThreads( blockSize, blocksCount, gridsCount, threadsCount ); - for( int gridIdx = 0; gridIdx < gridsCount.x; gridIdx ++ ) - { - dim3 gridSize; - setupGrid( blocksCount, gridsCount, gridIdx, gridSize ); - switch( threadsPerSegment ) - { - case 1: - SpMVCSRLight< 1, Index, Fetch, Redcution, ResultKeeper, Real, Args ><<< gridSize, blockSize >>>( - gridIdx.x, offsets, first, last, fetch, reduction, keeper, zero, args ); - break; - case 2: - SpMVCSRLight< 2, Index, Fetch, Redcution, ResultKeeper, Real, Args ><<< gridSize, blockSize >>>( - gridIdx.x, offsets, first, last, fetch, reduction, keeper, zero, args ); - break; - case 4: - SpMVCSRLight< 4, Index, Fetch, Redcution, ResultKeeper, Real, Args ><<< gridSize, blockSize >>>( - gridIdx.x, offsets, first, last, fetch, reduction, keeper, zero, args ); - break; - case 8: - SpMVCSRLight< 8, Index, Fetch, Redcution, ResultKeeper, Real, Args ><<< gridSize, blockSize >>>( - gridIdx.x, offsets, first, last, fetch, reduction, keeper, zero, args ); - break; - case 16: - SpMVCSRLight< 16, Index, Fetch, Redcution, ResultKeeper, Real, Args ><<< gridSize, blockSize >>>( - gridIdx.x, offsets, first, last, fetch, reduction, keeper, zero, args ); - break; - case 32: - SpMVCSRLight< 32, Index, Fetch, Redcution, ResultKeeper, Real, Args ><<< gridSize, blockSize >>>( - gridIdx.x, offsets, first, last, fetch, reduction, keeper, zero, args ); - break; - default: - throw std::runtime_error( "Wrong value of threadsPerSegment." 
); - }; -#endif -} - - } // namespace details - } // namespace Segments - } // namespace Algorithms -} // namespace TNL diff --git a/src/UnitTests/Matrices/SparseMatrixTest_CSRScalar.h b/src/UnitTests/Matrices/SparseMatrixTest_CSRScalar.h index 3a1cb02c3..0902ee81a 100644 --- a/src/UnitTests/Matrices/SparseMatrixTest_CSRScalar.h +++ b/src/UnitTests/Matrices/SparseMatrixTest_CSRScalar.h @@ -15,7 +15,7 @@ #ifdef HAVE_GTEST #include -const char* saveAndLoadFileName = "test_SparseMatrixTest_CSR_segments"; +const char* saveAndLoadFileName = "test_SparseMatrixTest_CSRScalar_segments"; // types for which MatrixTest is instantiated using MatrixTypes = ::testing::Types diff --git a/src/UnitTests/Matrices/SparseMatrixTest_CSRVector.h b/src/UnitTests/Matrices/SparseMatrixTest_CSRVector.h index 7b2e4e7fc..8d50fc686 100644 --- a/src/UnitTests/Matrices/SparseMatrixTest_CSRVector.h +++ b/src/UnitTests/Matrices/SparseMatrixTest_CSRVector.h @@ -15,7 +15,7 @@ #ifdef HAVE_GTEST #include -const char* saveAndLoadFileName = "test_SparseMatrixTest_CSR_segments"; +const char* saveAndLoadFileName = "test_SparseMatrixTest_CSRVector_segments"; // types for which MatrixTest is instantiated using MatrixTypes = ::testing::Types -- GitLab From 888308eaea78047011419e1f0bb0aef4bc2d524d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Sat, 23 Jan 2021 15:55:35 +0100 Subject: [PATCH 12/27] Fixed CSR Vector kernel. 
--- src/TNL/Algorithms/Segments/CSR.hpp | 9 ++- src/TNL/Algorithms/Segments/CSRKernels.h | 64 ++++++++++--------- src/TNL/Algorithms/Segments/CSRView.hpp | 30 ++------- src/TNL/Matrices/SparseMatrixView.h | 58 ++++++++--------- src/TNL/Matrices/SparseMatrixView.hpp | 1 + src/UnitTests/Matrices/SparseMatrixTest.hpp | 1 + .../Matrices/SparseMatrixTest_CSRVector.cpp | 2 +- .../Matrices/SparseMatrixTest_CSRVector.cu | 2 +- 8 files changed, 77 insertions(+), 90 deletions(-) diff --git a/src/TNL/Algorithms/Segments/CSR.hpp b/src/TNL/Algorithms/Segments/CSR.hpp index 9845b0208..e9240e71e 100644 --- a/src/TNL/Algorithms/Segments/CSR.hpp +++ b/src/TNL/Algorithms/Segments/CSR.hpp @@ -44,7 +44,7 @@ template< typename Device, typename Kernel, typename IndexAllocator > CSR< Device, Index, Kernel, IndexAllocator >:: -CSR( const CSR& csr ) : offsets( csr.offsets ) +CSR( const CSR& csr ) : offsets( csr.offsets ), kernel( csr.kernel ) { } @@ -53,7 +53,7 @@ template< typename Device, typename Kernel, typename IndexAllocator > CSR< Device, Index, Kernel, IndexAllocator >:: -CSR( const CSR&& csr ) : offsets( std::move( csr.offsets ) ) +CSR( const CSR&& csr ) : offsets( std::move( csr.offsets ) ), kernel( std::move( csr.kernel ) ) { } @@ -66,7 +66,9 @@ String CSR< Device, Index, Kernel, IndexAllocator >:: getSerializationType() { - return "CSR< [any_device], " + TNL::getSerializationType< IndexType >() + " >"; + return "CSR< [any_device], " + + TNL::getSerializationType< IndexType >() + + TNL::getSerializationType< KernelType >() + " >"; } template< typename Device, @@ -256,6 +258,7 @@ CSR< Device, Index, Kernel, IndexAllocator >:: operator=( const CSR< Device_, Index_, Kernel_, IndexAllocator_ >& source ) { this->offsets = source.offsets; + this->kernel = kernel; return *this; } diff --git a/src/TNL/Algorithms/Segments/CSRKernels.h b/src/TNL/Algorithms/Segments/CSRKernels.h index 7d9b6f1d2..6883610dd 100644 --- a/src/TNL/Algorithms/Segments/CSRKernels.h +++ 
b/src/TNL/Algorithms/Segments/CSRKernels.h @@ -42,7 +42,7 @@ struct CSRScalarKernel typename ResultKeeper, typename Real, typename... Args > - static void rowsReduction( const OffsetsView& offsets, + static void segmentsReduction( const OffsetsView& offsets, Index first, Index last, Fetch& fetch, @@ -66,7 +66,7 @@ struct CSRScalarKernel }; #ifdef HAVE_CUDA -template< typename Device, +template< typename Offsets, typename Index, typename Fetch, typename Reduction, @@ -74,15 +74,15 @@ template< typename Device, typename Real, typename... Args > __global__ -void RowsReductionCSRVectorKernel( +void segmentsReductionCSRVectorKernel( int gridIdx, - const TNL::Containers::VectorView< Index, TNL::Devices::Cuda, Index > offsets, + const Offsets offsets, Index first, Index last, - Fetch& fetch, - const Reduction& reduction, - ResultKeeper& keeper, - const Real& zero, + Fetch fetch, + const Reduction reduce, + ResultKeeper keep, + const Real zero, Args... args ) { /*** @@ -92,16 +92,19 @@ void RowsReductionCSRVectorKernel( if( segmentIdx >= last ) return; - const int laneIdx = threadIdx.x & 31; // & is cheaper than % - Index endIdx = offsets[ segmentIdx + 1] ; + const int laneIdx = threadIdx.x & ( TNL::Cuda::getWarpSize() - 1 ); // & is cheaper than % + TNL_ASSERT_LT( segmentIdx + 1, offsets.getSize(), "" ); + Index endIdx = offsets[ segmentIdx + 1 ]; Index localIdx( laneIdx ); Real aux = zero; bool compute( true ); for( Index globalIdx = offsets[ segmentIdx ] + localIdx; globalIdx < endIdx; globalIdx += TNL::Cuda::getWarpSize() ) { - aux = reduce( aux, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, localIdx, globalIdx, compute ) ); - localIdx += TNL::Cuda::getWarpSize(); + //printf( "globalIdx = %d endIdx = %d \n", globalIdx, endIdx ); + TNL_ASSERT_LT( globalIdx, endIdx, "" ); + aux = reduce( aux, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, localIdx, globalIdx, compute ) ); + localIdx += TNL::Cuda::getWarpSize(); } /**** @@ 
-114,7 +117,7 @@ void RowsReductionCSRVectorKernel( aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux, 1 ) ); if( laneIdx == 0 ) - keeper( segmentIdx, aux ); + keep( segmentIdx, aux ); } #endif @@ -141,7 +144,7 @@ struct CSRVectorKernel typename ResultKeeper, typename Real, typename... Args > - static void rowsReduction( const OffsetsView& offsets, + static void segmentsReduction( const OffsetsView& offsets, Index first, Index last, Fetch& fetch, @@ -150,7 +153,6 @@ struct CSRVectorKernel const Real& zero, Args... args ) { - abort(); #ifdef HAVE_CUDA const Index warpsCount = last - first; const size_t threadsCount = warpsCount * TNL::Cuda::getWarpSize(); @@ -161,7 +163,7 @@ struct CSRVectorKernel { dim3 gridSize; TNL::Cuda::setupGrid( blocksCount, gridsCount, gridIdx, gridSize ); - RowsReductionCSRVectorKernel< Index, Fetch, Reduction, ResultKeeper, Real, Args... > + segmentsReductionCSRVectorKernel< OffsetsView, IndexType, Fetch, Reduction, ResultKeeper, Real, Args... > <<< gridSize, blockSize >>>( gridIdx.x, offsets, first, last, fetch, reduction, keeper, zero, args... ); }; @@ -180,15 +182,15 @@ template< int ThreadsPerSegment, typename Real, typename... Args > __global__ -void RowsReductionCSRLightKernel( +void segmentsReductionCSRLightKernel( int gridIdx, const TNL::Containers::VectorView< Index, TNL::Devices::Cuda, Index > offsets, Index first, Index last, - Fetch& fetch, - const Reduction& reduction, - ResultKeeper& keeper, - const Real& zero, + Fetch fetch, + const Reduction reduction, + ResultKeeper keeper, + const Real zero, Args... args ) { /*** @@ -258,7 +260,7 @@ struct CSRLightKernel typename ResultKeeper, typename Real, typename... 
Args > - void rowsReduction( const OffsetsView& offsets, + void segmentsReduction( const OffsetsView& offsets, Index first, Index last, Fetch& fetch, @@ -278,27 +280,27 @@ struct CSRLightKernel switch( this->threadsPerSegment ) { case 1: - RowsReductionCSRLightKernel< 1, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>( + segmentsReductionCSRLightKernel< 1, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>( gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... ); break; case 2: - RowsReductionCSRLightKernel< 2, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>( + segmentsReductionCSRLightKernel< 2, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>( gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... ); break; case 4: - RowsReductionCSRLightKernel< 4, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>( + segmentsReductionCSRLightKernel< 4, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>( gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... ); break; case 8: - RowsReductionCSRLightKernel< 8, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>( + segmentsReductionCSRLightKernel< 8, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>( gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... ); break; case 16: - RowsReductionCSRLightKernel< 16, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>( + segmentsReductionCSRLightKernel< 16, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>( gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... ); break; case 32: - RowsReductionCSRLightKernel< 32, Index, Fetch, Reduction, ResultKeeper, Real, Args... 
><<< gridSize, blockSize >>>( + segmentsReductionCSRLightKernel< 32, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>( gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... ); break; default: @@ -332,7 +334,7 @@ struct CSRAdaptiveKernelView typename ResultKeeper, typename Real, typename... Args > - void rowsReduction( const OffsetsView& offsets, + void segmentsReduction( const OffsetsView& offsets, Index first, Index last, Fetch& fetch, @@ -405,7 +407,7 @@ struct CSRAdaptiveKernel typename ResultKeeper, typename Real, typename... Args > - void rowsReduction( const OffsetsView& offsets, + void segmentsReduction( const OffsetsView& offsets, Index first, Index last, Fetch& fetch, @@ -414,7 +416,7 @@ struct CSRAdaptiveKernel const Real& zero, Args... args ) const { - view.rowsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... ); + view.segmentsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... ); } ViewType view; diff --git a/src/TNL/Algorithms/Segments/CSRView.hpp b/src/TNL/Algorithms/Segments/CSRView.hpp index 43018a03f..34cdd68ee 100644 --- a/src/TNL/Algorithms/Segments/CSRView.hpp +++ b/src/TNL/Algorithms/Segments/CSRView.hpp @@ -102,7 +102,7 @@ typename CSRView< Device, Index, Kernel >::ViewType CSRView< Device, Index, Kernel >:: getView() { - return ViewType( this->offsets ); + return ViewType( this->offsets, this->kernel ); } template< typename Device, @@ -219,30 +219,10 @@ void CSRView< Device, Index, Kernel >:: segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const { - kernel.rowsReduction( this->offsets.getConstView(), first, last, fetch, reduction, keeper, zero, args... 
); - /*using RealType = typename details::FetchLambdaAdapter< Index, Fetch >::ReturnType; - const auto offsetsView = this->offsets.getConstView(); - if( KernelType == CSRScalarKernel || std::is_same< DeviceType, TNL::Devices::Host >::value ) - { - auto l = [=] __cuda_callable__ ( const IndexType segmentIdx, Args... args ) mutable { - const IndexType begin = offsetsView[ segmentIdx ]; - const IndexType end = offsetsView[ segmentIdx + 1 ]; - RealType aux( zero ); - IndexType localIdx( 0 ); - bool compute( true ); - for( IndexType globalIdx = begin; globalIdx < end && compute; globalIdx++ ) - aux = reduction( aux, details::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, globalIdx, compute ) ); - keeper( segmentIdx, aux ); - }; - Algorithms::ParallelFor< Device >::exec( first, last, l, args... ); - } - if( KernelType == CSRVectorKernel ) - details::RowsReductionVectorKernelCaller( offsetsView, first, last, fetch, reduction, keeper, zero, args... ); - if( KernelType == CSRLightKernel ) - { - const IndexType elementsInSegment = ceil( this->getSize() / this->getSegmentsCount() ); - details::RowsReductionLightKernelCaller( elementsInSegment, offsetsView, first, last, fetch, reduction, keeper, zero, args... ); - }*/ + if( std::is_same< DeviceType, TNL::Devices::Host >::value ) + TNL::Algorithms::Segments::CSRScalarKernel< IndexType, DeviceType >::segmentsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... ); + else + kernel.segmentsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... ); } template< typename Device, diff --git a/src/TNL/Matrices/SparseMatrixView.h b/src/TNL/Matrices/SparseMatrixView.h index a753332a9..9b69c2e91 100644 --- a/src/TNL/Matrices/SparseMatrixView.h +++ b/src/TNL/Matrices/SparseMatrixView.h @@ -79,14 +79,14 @@ class SparseMatrixView : public MatrixView< Real, Device, Index > /** * \brief Test of symmetric matrix type. 
- * + * * \return \e true if the matrix is stored as symmetric and \e false otherwise. */ static constexpr bool isSymmetric() { return MatrixType::isSymmetric(); }; /** * \brief Test of binary matrix type. - * + * * \return \e true if the matrix is stored as binary and \e false otherwise. */ static constexpr bool isBinary() { return std::is_same< Real, bool >::value; }; @@ -120,7 +120,7 @@ class SparseMatrixView : public MatrixView< Real, Device, Index > using SegmentsViewType = SegmentsView< Device, Index >; /** - * \brief Type of related matrix view. + * \brief Type of related matrix view. */ using ViewType = SparseMatrixView< std::remove_const_t< Real >, Device, Index, MatrixType, SegmentsViewTemplate >; @@ -158,7 +158,7 @@ class SparseMatrixView : public MatrixView< Real, Device, Index > /** * \brief Constructor with all necessary data and views. - * + * * \param rows is a number of matrix rows. * \param columns is a number of matrix columns. * \param values is a vector view with matrix elements values. @@ -174,7 +174,7 @@ class SparseMatrixView : public MatrixView< Real, Device, Index > /** * \brief Copy constructor. - * + * * \param matrix is an input sparse matrix view. */ __cuda_callable__ @@ -182,7 +182,7 @@ class SparseMatrixView : public MatrixView< Real, Device, Index > /** * \brief Move constructor. - * + * * \param matrix is an input sparse matrix view. */ __cuda_callable__ @@ -190,7 +190,7 @@ class SparseMatrixView : public MatrixView< Real, Device, Index > /** * \brief Returns a modifiable view of the sparse matrix. - * + * * \return sparse matrix view. */ __cuda_callable__ @@ -198,7 +198,7 @@ class SparseMatrixView : public MatrixView< Real, Device, Index > /** * \brief Returns a non-modifiable view of the sparse matrix. - * + * * \return sparse matrix view. */ __cuda_callable__ @@ -206,11 +206,11 @@ class SparseMatrixView : public MatrixView< Real, Device, Index > /** * \brief Returns string with serialization type. 
- * + * * The string has a form `Matrices::SparseMatrix< RealType, [any_device], IndexType, General/Symmetric, Format, [any_allocator] >`. - * + * * \return \ref String with the serialization type. - * + * * \par Example * \include Matrices/SparseMatrix/SparseMatrixViewExample_getSerializationType.cpp * \par Output @@ -220,11 +220,11 @@ class SparseMatrixView : public MatrixView< Real, Device, Index > /** * \brief Returns string with serialization type. - * + * * See \ref SparseMatrix::getSerializationType. - * + * * \return \e String with the serialization type. - * + * * \par Example * \include Matrices/SparseMatrix/SparseMatrixExample_getSerializationType.cpp * \par Output @@ -234,10 +234,10 @@ class SparseMatrixView : public MatrixView< Real, Device, Index > /** * \brief Computes number of non-zeros in each row. - * + * * \param rowLengths is a vector into which the number of non-zeros in each row * will be stored. - * + * * \par Example * \include Matrices/SparseMatrix/SparseMatrixViewExample_getCompressedRowLengths.cpp * \par Output @@ -248,7 +248,7 @@ class SparseMatrixView : public MatrixView< Real, Device, Index > /** * \brief Returns capacity of given matrix row. - * + * * \param row index of matrix row. * \return number of matrix elements allocated for the row. */ @@ -257,26 +257,26 @@ class SparseMatrixView : public MatrixView< Real, Device, Index > /** * \brief Returns number of non-zero matrix elements. - * + * * This method really counts the non-zero matrix elements and so * it returns zero for matrix having all allocated elements set to zero. - * + * * \return number of non-zero matrix elements. */ IndexType getNonzeroElementsCount() const; /** * \brief Constant getter of simple structure for accessing given matrix row. - * + * * \param rowIdx is matrix row index. - * + * * \return RowView for accessing given matrix row. 
* * \par Example * \include Matrices/SparseMatrix/SparseMatrixViewExample_getConstRow.cpp * \par Output * \include SparseMatrixViewExample_getConstRow.out - * + * * See \ref SparseMatrixRowView. */ __cuda_callable__ @@ -284,16 +284,16 @@ class SparseMatrixView : public MatrixView< Real, Device, Index > /** * \brief Non-constant getter of simple structure for accessing given matrix row. - * + * * \param rowIdx is matrix row index. - * + * * \return RowView for accessing given matrix row. - * + * * \par Example * \include Matrices/SparseMatrix/SparseMatrixViewExample_getRow.cpp * \par Output * \include SparseMatrixViewExample_getRow.out - * + * * See \ref SparseMatrixRowView. */ __cuda_callable__ @@ -301,7 +301,7 @@ class SparseMatrixView : public MatrixView< Real, Device, Index > /** * \brief Sets element at given \e row and \e column to given \e value. - * + * * This method can be called from the host system (CPU) no matter * where the matrix is allocated. If the matrix is allocated on GPU this method * can be called even from device kernels. If the matrix is allocated in GPU device @@ -309,11 +309,11 @@ class SparseMatrixView : public MatrixView< Real, Device, Index > * performance is very low. For higher performance see. \ref SparseMatrix::getRow * or \ref SparseMatrix::forRows and \ref SparseMatrix::forAllRows. * The call may fail if the matrix row capacity is exhausted. - * + * * \param row is row index of the element. * \param column is columns index of the element. * \param value is the value the element will be set to. 
- * + * * \par Example * \include Matrices/SparseMatrix/SparseMatrixViewExample_setElement.cpp * \par Output diff --git a/src/TNL/Matrices/SparseMatrixView.hpp b/src/TNL/Matrices/SparseMatrixView.hpp index b031e846d..3be30da64 100644 --- a/src/TNL/Matrices/SparseMatrixView.hpp +++ b/src/TNL/Matrices/SparseMatrixView.hpp @@ -484,6 +484,7 @@ rowsReduction( IndexType begin, IndexType end, Fetch& fetch, const Reduce& reduc const auto values_view = this->values.getConstView(); const IndexType paddingIndex_ = this->getPaddingIndex(); auto fetch_ = [=] __cuda_callable__ ( IndexType rowIdx, IndexType localIdx, IndexType globalIdx, bool& compute ) mutable -> decltype( fetch( IndexType(), IndexType(), RealType() ) ) { + TNL_ASSERT_LT( globalIdx, columns_view.getSize(), "" ); IndexType columnIdx = columns_view[ globalIdx ]; if( columnIdx != paddingIndex_ ) { diff --git a/src/UnitTests/Matrices/SparseMatrixTest.hpp b/src/UnitTests/Matrices/SparseMatrixTest.hpp index b5885afbe..46c4d977b 100644 --- a/src/UnitTests/Matrices/SparseMatrixTest.hpp +++ b/src/UnitTests/Matrices/SparseMatrixTest.hpp @@ -92,6 +92,7 @@ void test_Constructors() EXPECT_EQ( mm.getRow( 4 ).getValue( 0 ), 1 ); // 4th row } + std::cerr << "Values size = " << m2.getValues().getSize() << std::endl; m2.getCompressedRowLengths( v1 ); EXPECT_EQ( v1, v2 ); diff --git a/src/UnitTests/Matrices/SparseMatrixTest_CSRVector.cpp b/src/UnitTests/Matrices/SparseMatrixTest_CSRVector.cpp index 1f6bf5111..c60c5e1f7 100644 --- a/src/UnitTests/Matrices/SparseMatrixTest_CSRVector.cpp +++ b/src/UnitTests/Matrices/SparseMatrixTest_CSRVector.cpp @@ -8,4 +8,4 @@ /* See Copyright Notice in tnl/Copyright */ -#include "SparseMatrixTest_CSRScalar.h" +#include "SparseMatrixTest_CSRVector.h" diff --git a/src/UnitTests/Matrices/SparseMatrixTest_CSRVector.cu b/src/UnitTests/Matrices/SparseMatrixTest_CSRVector.cu index 11d7afc9c..5c78647a1 100644 --- a/src/UnitTests/Matrices/SparseMatrixTest_CSRVector.cu +++ 
b/src/UnitTests/Matrices/SparseMatrixTest_CSRVector.cu @@ -8,4 +8,4 @@ /* See Copyright Notice in tnl/Copyright */ -#include "SparseMatrixTest_CSRScalar.h" +#include "SparseMatrixTest_CSRVector.h" -- GitLab From 22f48b6d090c75de06da7f98872cc6591d4ee4eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Sat, 23 Jan 2021 17:28:01 +0100 Subject: [PATCH 13/27] Fixed Light CSR kernel. --- src/TNL/Algorithms/Segments/CSR.hpp | 2 + src/TNL/Algorithms/Segments/CSRKernels.h | 45 +++++++++++------- src/TNL/Algorithms/Segments/CSRView.hpp | 2 + src/UnitTests/Matrices/CMakeLists.txt | 1 + src/UnitTests/Matrices/SparseMatrixTest.hpp | 1 - .../Matrices/SparseMatrixTest_CSRLight.cpp | 11 +++++ .../Matrices/SparseMatrixTest_CSRLight.cu | 11 +++++ .../Matrices/SparseMatrixTest_CSRLight.h | 46 +++++++++++++++++++ 8 files changed, 101 insertions(+), 18 deletions(-) create mode 100644 src/UnitTests/Matrices/SparseMatrixTest_CSRLight.cpp create mode 100644 src/UnitTests/Matrices/SparseMatrixTest_CSRLight.cu create mode 100644 src/UnitTests/Matrices/SparseMatrixTest_CSRLight.h diff --git a/src/TNL/Algorithms/Segments/CSR.hpp b/src/TNL/Algorithms/Segments/CSR.hpp index e9240e71e..d6a177f3b 100644 --- a/src/TNL/Algorithms/Segments/CSR.hpp +++ b/src/TNL/Algorithms/Segments/CSR.hpp @@ -105,6 +105,7 @@ reset() { this->offsets.setSize( 1 ); this->offsets = 0; + this->kernel.reset(); } @@ -282,6 +283,7 @@ CSR< Device, Index, Kernel, IndexAllocator >:: load( File& file ) { file >> this->offsets; + this->kernel.init( this->offsets ); } } // namespace Segments diff --git a/src/TNL/Algorithms/Segments/CSRKernels.h b/src/TNL/Algorithms/Segments/CSRKernels.h index 6883610dd..705f65a51 100644 --- a/src/TNL/Algorithms/Segments/CSRKernels.h +++ b/src/TNL/Algorithms/Segments/CSRKernels.h @@ -32,6 +32,8 @@ struct CSRScalarKernel template< typename Offsets > void init( const Offsets& offsets ) {}; + void reset(){}; + ViewType getView() { return *this; }; ConstViewType 
getConstView() const { return *this; }; @@ -101,7 +103,6 @@ void segmentsReductionCSRVectorKernel( bool compute( true ); for( Index globalIdx = offsets[ segmentIdx ] + localIdx; globalIdx < endIdx; globalIdx += TNL::Cuda::getWarpSize() ) { - //printf( "globalIdx = %d endIdx = %d \n", globalIdx, endIdx ); TNL_ASSERT_LT( globalIdx, endIdx, "" ); aux = reduce( aux, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, localIdx, globalIdx, compute ) ); localIdx += TNL::Cuda::getWarpSize(); @@ -133,6 +134,8 @@ struct CSRVectorKernel template< typename Offsets > void init( const Offsets& offsets ) {}; + void reset(){}; + ViewType getView() { return *this; }; ConstViewType getConstView() const { return *this; }; @@ -174,7 +177,7 @@ struct CSRVectorKernel #ifdef HAVE_CUDA template< int ThreadsPerSegment, - typename Device, + typename Offsets, typename Index, typename Fetch, typename Reduction, @@ -184,19 +187,19 @@ template< int ThreadsPerSegment, __global__ void segmentsReductionCSRLightKernel( int gridIdx, - const TNL::Containers::VectorView< Index, TNL::Devices::Cuda, Index > offsets, + const Offsets offsets, Index first, Index last, Fetch fetch, - const Reduction reduction, - ResultKeeper keeper, + const Reduction reduce, + ResultKeeper keep, const Real zero, Args... 
args ) { /*** * We map one warp to each segment */ - const Index segmentIdx = TNL::Cuda::getGlobalThreadIdx( gridIdx ) / TNL::Cuda::getWarpSize() + first; + const Index segmentIdx = TNL::Cuda::getGlobalThreadIdx( gridIdx ) / ThreadsPerSegment + first; if( segmentIdx >= last ) return; @@ -227,7 +230,7 @@ void segmentsReductionCSRLightKernel( aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux, 1 ) ); if( laneIdx == 0 ) - keeper( segmentIdx, aux ); + keep( segmentIdx, aux ); } #endif @@ -244,12 +247,14 @@ struct CSRLightKernel void init( const Offsets& offsets ) { const Index segmentsCount = offsets.getSize() - 1; - const Index elementsInSegment = offsets.getElement( segmentsCount ) / segmentsCount; - this->threadsPerSegment = TNL::min( std::pow( 2, std::floor( std::log2( elementsInSegment ) ) ), TNL::Cuda::getWarpSize() ); + const Index elementsInSegment = std::ceil( ( double ) offsets.getElement( segmentsCount ) / ( double ) segmentsCount ); + this->threadsPerSegment = TNL::min( std::pow( 2, std::ceil( std::log2( elementsInSegment ) ) ), TNL::Cuda::getWarpSize() ); TNL_ASSERT_GE( threadsPerSegment, 0, "" ); TNL_ASSERT_LE( threadsPerSegment, 32, "" ); }; + void reset() { this->threadsPerSegment = 0; } + ViewType getView() { return *this; }; ConstViewType getConstView() const { return *this; }; @@ -269,42 +274,48 @@ struct CSRLightKernel const Real& zero, Args... 
args ) const { + TNL_ASSERT_GE( threadsPerSegment, 0, "" ); + TNL_ASSERT_LE( threadsPerSegment, 32, "" ); + #ifdef HAVE_CUDA const size_t threadsCount = this->threadsPerSegment * ( last - first ); dim3 blocksCount, gridsCount, blockSize( 256 ); TNL::Cuda::setupThreads( blockSize, blocksCount, gridsCount, threadsCount ); - for( int gridIdx = 0; gridIdx < gridsCount.x; gridIdx ++ ) + //std::cerr << " this->threadsPerSegment = " << this->threadsPerSegment << " offsets = " << offsets << std::endl; + for( unsigned int gridIdx = 0; gridIdx < gridsCount.x; gridIdx ++ ) { dim3 gridSize; TNL::Cuda::setupGrid( blocksCount, gridsCount, gridIdx, gridSize ); switch( this->threadsPerSegment ) { + case 0: // this means zero/empty matrix + break; case 1: - segmentsReductionCSRLightKernel< 1, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>( + segmentsReductionCSRLightKernel< 1, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>( gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... ); break; case 2: - segmentsReductionCSRLightKernel< 2, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>( + segmentsReductionCSRLightKernel< 2, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>( gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... ); break; case 4: - segmentsReductionCSRLightKernel< 4, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>( + segmentsReductionCSRLightKernel< 4, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>( gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... ); break; case 8: - segmentsReductionCSRLightKernel< 8, Index, Fetch, Reduction, ResultKeeper, Real, Args... 
><<< gridSize, blockSize >>>( + segmentsReductionCSRLightKernel< 8, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>( gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... ); break; case 16: - segmentsReductionCSRLightKernel< 16, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>( + segmentsReductionCSRLightKernel< 16, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>( gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... ); break; case 32: - segmentsReductionCSRLightKernel< 32, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>( + segmentsReductionCSRLightKernel< 32, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>( gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... ); break; default: - throw std::runtime_error( "Wrong value of threadsPerSegment." 
); + throw std::runtime_error( std::string( "Wrong value of threadsPerSegment: " ) + std::to_string( this->threadsPerSegment ) ); } } #endif diff --git a/src/TNL/Algorithms/Segments/CSRView.hpp b/src/TNL/Algorithms/Segments/CSRView.hpp index 34cdd68ee..d72c45b1d 100644 --- a/src/TNL/Algorithms/Segments/CSRView.hpp +++ b/src/TNL/Algorithms/Segments/CSRView.hpp @@ -244,6 +244,7 @@ CSRView< Device, Index, Kernel >:: operator=( const CSRView& view ) { this->offsets.bind( view.offsets ); + this->kernel = view.kernel; return *this; } @@ -265,6 +266,7 @@ CSRView< Device, Index, Kernel >:: load( File& file ) { file >> this->offsets; + this->kernel.init( this->offsets ); } } // namespace Segments diff --git a/src/UnitTests/Matrices/CMakeLists.txt b/src/UnitTests/Matrices/CMakeLists.txt index 7fc16968e..37021b230 100644 --- a/src/UnitTests/Matrices/CMakeLists.txt +++ b/src/UnitTests/Matrices/CMakeLists.txt @@ -8,6 +8,7 @@ set( COMMON_TESTS SparseMatrixTest_CSRScalar SparseMatrixTest_CSRVector + SparseMatrixTest_CSRLight SparseMatrixTest_Ellpack SparseMatrixTest_SlicedEllpack SparseMatrixTest_ChunkedEllpack diff --git a/src/UnitTests/Matrices/SparseMatrixTest.hpp b/src/UnitTests/Matrices/SparseMatrixTest.hpp index 46c4d977b..b5885afbe 100644 --- a/src/UnitTests/Matrices/SparseMatrixTest.hpp +++ b/src/UnitTests/Matrices/SparseMatrixTest.hpp @@ -92,7 +92,6 @@ void test_Constructors() EXPECT_EQ( mm.getRow( 4 ).getValue( 0 ), 1 ); // 4th row } - std::cerr << "Values size = " << m2.getValues().getSize() << std::endl; m2.getCompressedRowLengths( v1 ); EXPECT_EQ( v1, v2 ); diff --git a/src/UnitTests/Matrices/SparseMatrixTest_CSRLight.cpp b/src/UnitTests/Matrices/SparseMatrixTest_CSRLight.cpp new file mode 100644 index 000000000..70d767b37 --- /dev/null +++ b/src/UnitTests/Matrices/SparseMatrixTest_CSRLight.cpp @@ -0,0 +1,11 @@ +/*************************************************************************** + SparseMatrixTest_CSRLight.cpp - description + ------------------- + begin : 
Jan 23, 2021 + copyright : (C) 2021 by Tomas Oberhuber et al. + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +#include "SparseMatrixTest_CSRLight.h" diff --git a/src/UnitTests/Matrices/SparseMatrixTest_CSRLight.cu b/src/UnitTests/Matrices/SparseMatrixTest_CSRLight.cu new file mode 100644 index 000000000..bf2c8061e --- /dev/null +++ b/src/UnitTests/Matrices/SparseMatrixTest_CSRLight.cu @@ -0,0 +1,11 @@ +/*************************************************************************** + SparseMatrixTest_CSRLight.cu - description + ------------------- + begin : Jan 23, 2021 + copyright : (C) 2021 by Tomas Oberhuber et al. + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +#include "SparseMatrixTest_CSRLight.h" diff --git a/src/UnitTests/Matrices/SparseMatrixTest_CSRLight.h b/src/UnitTests/Matrices/SparseMatrixTest_CSRLight.h new file mode 100644 index 000000000..6349c1711 --- /dev/null +++ b/src/UnitTests/Matrices/SparseMatrixTest_CSRLight.h @@ -0,0 +1,46 @@ +/*************************************************************************** + SparseMatrixTest_CSRLight.h - description + ------------------- + begin : Jan 23, 2021 + copyright : (C) 2021 by Tomas Oberhuber et al. 
+ email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +#include +#include +#include + +#ifdef HAVE_GTEST +#include + +const char* saveAndLoadFileName = "test_SparseMatrixTest_CSRLight_segments"; + +// types for which MatrixTest is instantiated +using MatrixTypes = ::testing::Types +< + TNL::Matrices::SparseMatrix< int, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >, + TNL::Matrices::SparseMatrix< long, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >, + TNL::Matrices::SparseMatrix< float, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >, + TNL::Matrices::SparseMatrix< double, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >, + TNL::Matrices::SparseMatrix< int, TNL::Devices::Host, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >, + TNL::Matrices::SparseMatrix< long, TNL::Devices::Host, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >, + TNL::Matrices::SparseMatrix< float, TNL::Devices::Host, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >, + TNL::Matrices::SparseMatrix< double, TNL::Devices::Host, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight > +#ifdef HAVE_CUDA + ,TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >, + TNL::Matrices::SparseMatrix< long, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >, + TNL::Matrices::SparseMatrix< float, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >, + TNL::Matrices::SparseMatrix< double, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >, + 
TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >, + TNL::Matrices::SparseMatrix< long, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >, + TNL::Matrices::SparseMatrix< float, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >, + TNL::Matrices::SparseMatrix< double, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight > +#endif +>; + +#endif + +#include "SparseMatrixTest.h" +#include "../main.h" -- GitLab From 69c12aeac229167361a9a4ff74f13fb41d585d7e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Sat, 23 Jan 2021 17:47:59 +0100 Subject: [PATCH 14/27] Renaming CSRScalarKernel to CSRKernelScalar and making separate source files. --- src/TNL/Algorithms/Segments/CSR.h | 4 +- src/TNL/Algorithms/Segments/CSRKernelScalar.h | 61 ++++++++++++ .../Algorithms/Segments/CSRKernelScalar.hpp | 92 +++++++++++++++++++ src/TNL/Algorithms/Segments/CSRKernels.h | 47 ---------- src/TNL/Algorithms/Segments/CSRView.h | 5 +- src/TNL/Algorithms/Segments/CSRView.hpp | 2 +- 6 files changed, 159 insertions(+), 52 deletions(-) create mode 100644 src/TNL/Algorithms/Segments/CSRKernelScalar.h create mode 100644 src/TNL/Algorithms/Segments/CSRKernelScalar.hpp diff --git a/src/TNL/Algorithms/Segments/CSR.h b/src/TNL/Algorithms/Segments/CSR.h index e2b793b84..af1794e43 100644 --- a/src/TNL/Algorithms/Segments/CSR.h +++ b/src/TNL/Algorithms/Segments/CSR.h @@ -22,7 +22,7 @@ namespace TNL { template< typename Device, typename Index, - typename Kernel = CSRScalarKernel< Index, Device >, + typename Kernel = CSRKernelScalar< Index, Device >, typename IndexAllocator = typename Allocators::Default< Device >::template Allocator< Index > > class CSR { @@ -133,7 +133,7 @@ class CSR template< typename Device, typename Index, typename IndexAllocator = typename
Allocators::Default< Device >::template Allocator< Index > > -using CSRScalar = CSR< Device, Index, CSRScalarKernel< Index, Device >, IndexAllocator >; +using CSRScalar = CSR< Device, Index, CSRKernelScalar< Index, Device >, IndexAllocator >; template< typename Device, typename Index, diff --git a/src/TNL/Algorithms/Segments/CSRKernelScalar.h b/src/TNL/Algorithms/Segments/CSRKernelScalar.h new file mode 100644 index 000000000..4a716c890 --- /dev/null +++ b/src/TNL/Algorithms/Segments/CSRKernelScalar.h @@ -0,0 +1,61 @@ +/*************************************************************************** + CSRKernelScalar.h - description + ------------------- + begin : Jan 23, 2021 -> Joe Biden inauguration + copyright : (C) 2021 by Tomas Oberhuber + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +#pragma once + +#include +#include +#include +#include +#include + +namespace TNL { + namespace Algorithms { + namespace Segments { + +template< typename Index, + typename Device > +struct CSRKernelScalar +{ + using IndexType = Index; + using DeviceType = Device; + using ViewType = CSRKernelScalar< Index, Device >; + using ConstViewType = CSRKernelScalar< Index, Device >; + + template< typename Offsets > + void init( const Offsets& offsets ); + + void reset(); + + ViewType getView(); + + ConstViewType getConstView() const; + + template< typename OffsetsView, + typename Fetch, + typename Reduction, + typename ResultKeeper, + typename Real, + typename... Args > + static void segmentsReduction( const OffsetsView& offsets, + Index first, + Index last, + Fetch& fetch, + const Reduction& reduction, + ResultKeeper& keeper, + const Real& zero, + Args... 
args ); +}; + + } // namespace Segments + } // namespace Algorithms +} // namespace TNL + +#include \ No newline at end of file diff --git a/src/TNL/Algorithms/Segments/CSRKernelScalar.hpp b/src/TNL/Algorithms/Segments/CSRKernelScalar.hpp new file mode 100644 index 000000000..7dd0f5cd7 --- /dev/null +++ b/src/TNL/Algorithms/Segments/CSRKernelScalar.hpp @@ -0,0 +1,92 @@ +/*************************************************************************** + CSRKernelScalar.h - description + ------------------- + begin : Jan 23, 2021 -> Joe Biden inauguration + copyright : (C) 2021 by Tomas Oberhuber + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace TNL { + namespace Algorithms { + namespace Segments { + +template< typename Index, + typename Device > + template< typename Offsets > +void +CSRKernelScalar< Index, Device >:: +init( const Offsets& offsets ) +{ +} + +template< typename Index, + typename Device > +void +CSRKernelScalar< Index, Device >:: +reset() +{ +} + +template< typename Index, + typename Device > +auto +CSRKernelScalar< Index, Device >:: +getView() -> ViewType +{ + return *this; +} + +template< typename Index, + typename Device > +auto +CSRKernelScalar< Index, Device >:: +getConstView() const -> ConstViewType +{ + return *this; +}; + +template< typename Index, + typename Device > + template< typename OffsetsView, + typename Fetch, + typename Reduction, + typename ResultKeeper, + typename Real, + typename... Args > +void +CSRKernelScalar< Index, Device >:: +segmentsReduction( const OffsetsView& offsets, + Index first, + Index last, + Fetch& fetch, + const Reduction& reduction, + ResultKeeper& keeper, + const Real& zero, + Args... args ) +{ + auto l = [=] __cuda_callable__ ( const IndexType segmentIdx, Args... 
args ) mutable { + const IndexType begin = offsets[ segmentIdx ]; + const IndexType end = offsets[ segmentIdx + 1 ]; + Real aux( zero ); + IndexType localIdx( 0 ); + bool compute( true ); + for( IndexType globalIdx = begin; globalIdx < end && compute; globalIdx++ ) + aux = reduction( aux, details::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, globalIdx, compute ) ); + keeper( segmentIdx, aux ); + }; + Algorithms::ParallelFor< Device >::exec( first, last, l, args... ); +} + } // namespace Segments + } // namespace Algorithms +} // namespace TNL diff --git a/src/TNL/Algorithms/Segments/CSRKernels.h b/src/TNL/Algorithms/Segments/CSRKernels.h index 705f65a51..eadee986d 100644 --- a/src/TNL/Algorithms/Segments/CSRKernels.h +++ b/src/TNL/Algorithms/Segments/CSRKernels.h @@ -20,53 +20,6 @@ namespace TNL { namespace Algorithms { namespace Segments { -template< typename Index, - typename Device > -struct CSRScalarKernel -{ - using IndexType = Index; - using DeviceType = Device; - using ViewType = CSRScalarKernel< Index, Device >; - using ConstViewType = CSRScalarKernel< Index, Device >; - - template< typename Offsets > - void init( const Offsets& offsets ) {}; - - void reset(){}; - - ViewType getView() { return *this; }; - - ConstViewType getConstView() const { return *this; }; - - template< typename OffsetsView, - typename Fetch, - typename Reduction, - typename ResultKeeper, - typename Real, - typename... Args > - static void segmentsReduction( const OffsetsView& offsets, - Index first, - Index last, - Fetch& fetch, - const Reduction& reduction, - ResultKeeper& keeper, - const Real& zero, - Args... args ) - { - auto l = [=] __cuda_callable__ ( const IndexType segmentIdx, Args... 
args ) mutable { - const IndexType begin = offsets[ segmentIdx ]; - const IndexType end = offsets[ segmentIdx + 1 ]; - Real aux( zero ); - IndexType localIdx( 0 ); - bool compute( true ); - for( IndexType globalIdx = begin; globalIdx < end && compute; globalIdx++ ) - aux = reduction( aux, details::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, globalIdx, compute ) ); - keeper( segmentIdx, aux ); - }; - Algorithms::ParallelFor< Device >::exec( first, last, l, args... ); - } -}; - #ifdef HAVE_CUDA template< typename Offsets, typename Index, diff --git a/src/TNL/Algorithms/Segments/CSRView.h b/src/TNL/Algorithms/Segments/CSRView.h index 541b7c957..d0dd35acb 100644 --- a/src/TNL/Algorithms/Segments/CSRView.h +++ b/src/TNL/Algorithms/Segments/CSRView.h @@ -14,6 +14,7 @@ #include #include +#include #include namespace TNL { @@ -22,7 +23,7 @@ namespace TNL { template< typename Device, typename Index, - typename Kernel = CSRScalarKernel< Index, Device > > + typename Kernel = CSRKernelScalar< Index, Device > > class CSRView { public: @@ -131,7 +132,7 @@ class CSRView template< typename Device, typename Index > -using CSRViewScalar = CSRView< Device, Index, CSRScalarKernel< Index, Device > >; +using CSRViewScalar = CSRView< Device, Index, CSRKernelScalar< Index, Device > >; template< typename Device, typename Index > diff --git a/src/TNL/Algorithms/Segments/CSRView.hpp b/src/TNL/Algorithms/Segments/CSRView.hpp index d72c45b1d..045b6bc5a 100644 --- a/src/TNL/Algorithms/Segments/CSRView.hpp +++ b/src/TNL/Algorithms/Segments/CSRView.hpp @@ -220,7 +220,7 @@ CSRView< Device, Index, Kernel >:: segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... 
args ) const { if( std::is_same< DeviceType, TNL::Devices::Host >::value ) - TNL::Algorithms::Segments::CSRScalarKernel< IndexType, DeviceType >::segmentsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... ); + TNL::Algorithms::Segments::CSRKernelScalar< IndexType, DeviceType >::segmentsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... ); else kernel.segmentsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... ); } -- GitLab From 3e9f89a0b6258a3f209e066c6db8d7c331b2b2b3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Sat, 23 Jan 2021 18:03:23 +0100 Subject: [PATCH 15/27] Renaming CSRVectorKernel to CSRKernelVector and making separate source files. --- src/TNL/Algorithms/Segments/CSR.h | 2 +- src/TNL/Algorithms/Segments/CSRKernelVector.h | 62 +++++++ .../Algorithms/Segments/CSRKernelVector.hpp | 152 ++++++++++++++++++ src/TNL/Algorithms/Segments/CSRKernels.h | 106 ------------ src/TNL/Algorithms/Segments/CSRView.h | 3 +- 5 files changed, 217 insertions(+), 108 deletions(-) create mode 100644 src/TNL/Algorithms/Segments/CSRKernelVector.h create mode 100644 src/TNL/Algorithms/Segments/CSRKernelVector.hpp diff --git a/src/TNL/Algorithms/Segments/CSR.h b/src/TNL/Algorithms/Segments/CSR.h index af1794e43..16f2f37ed 100644 --- a/src/TNL/Algorithms/Segments/CSR.h +++ b/src/TNL/Algorithms/Segments/CSR.h @@ -138,7 +138,7 @@ using CSRScalar = CSR< Device, Index, CSRKernelScalar< Index, Device >, IndexAll template< typename Device, typename Index, typename IndexAllocator = typename Allocators::Default< Device >::template Allocator< Index > > -using CSRVector = CSR< Device, Index, CSRVectorKernel< Index, Device >, IndexAllocator >; +using CSRVector = CSR< Device, Index, CSRKernelVector< Index, Device >, IndexAllocator >; template< typename Device, typename Index, diff --git a/src/TNL/Algorithms/Segments/CSRKernelVector.h b/src/TNL/Algorithms/Segments/CSRKernelVector.h new file mode
100644 index 000000000..7a6ccf7ff --- /dev/null +++ b/src/TNL/Algorithms/Segments/CSRKernelVector.h @@ -0,0 +1,62 @@ +/*************************************************************************** + CSRKernelVector.h - description + ------------------- + begin : Jan 23, 2021 -> Joe Biden inauguration + copyright : (C) 2021 by Tomas Oberhuber + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +#pragma once + +#include +#include +#include +#include +#include + +namespace TNL { + namespace Algorithms { + namespace Segments { + +template< typename Index, + typename Device > +struct CSRKernelVector +{ + using IndexType = Index; + using DeviceType = Device; + using ViewType = CSRKernelVector< Index, Device >; + using ConstViewType = CSRKernelVector< Index, Device >; + + template< typename Offsets > + void init( const Offsets& offsets ); + + void reset(); + + ViewType getView(); + + ConstViewType getConstView() const; + + + template< typename OffsetsView, + typename Fetch, + typename Reduction, + typename ResultKeeper, + typename Real, + typename... Args > + static void segmentsReduction( const OffsetsView& offsets, + Index first, + Index last, + Fetch& fetch, + const Reduction& reduction, + ResultKeeper& keeper, + const Real& zero, + Args... 
args ); +}; + + } // namespace Segments + } // namespace Algorithms +} // namespace TNL + +#include diff --git a/src/TNL/Algorithms/Segments/CSRKernelVector.hpp b/src/TNL/Algorithms/Segments/CSRKernelVector.hpp new file mode 100644 index 000000000..d6f5bb7ec --- /dev/null +++ b/src/TNL/Algorithms/Segments/CSRKernelVector.hpp @@ -0,0 +1,152 @@ +/*************************************************************************** + CSRKernelVector.hpp - description + ------------------- + begin : Jan 23, 2021 -> Joe Biden inauguration + copyright : (C) 2021 by Tomas Oberhuber + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace TNL { + namespace Algorithms { + namespace Segments { + +#ifdef HAVE_CUDA +template< typename Offsets, + typename Index, + typename Fetch, + typename Reduction, + typename ResultKeeper, + typename Real, + typename... Args > +__global__ +void segmentsReductionCSRKernelVector( + int gridIdx, + const Offsets offsets, + Index first, + Index last, + Fetch fetch, + const Reduction reduce, + ResultKeeper keep, + const Real zero, + Args... 
args ) +{ + /*** + * We map one warp to each segment + */ + const Index segmentIdx = TNL::Cuda::getGlobalThreadIdx( gridIdx ) / TNL::Cuda::getWarpSize() + first; + if( segmentIdx >= last ) + return; + + const int laneIdx = threadIdx.x & ( TNL::Cuda::getWarpSize() - 1 ); // & is cheaper than % + TNL_ASSERT_LT( segmentIdx + 1, offsets.getSize(), "" ); + Index endIdx = offsets[ segmentIdx + 1 ]; + + Index localIdx( laneIdx ); + Real aux = zero; + bool compute( true ); + for( Index globalIdx = offsets[ segmentIdx ] + localIdx; globalIdx < endIdx; globalIdx += TNL::Cuda::getWarpSize() ) + { + TNL_ASSERT_LT( globalIdx, endIdx, "" ); + aux = reduce( aux, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, localIdx, globalIdx, compute ) ); + localIdx += TNL::Cuda::getWarpSize(); + } + + /**** + * Reduction in each warp which means in each segment. + */ + aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux, 16 ) ); + aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux, 8 ) ); + aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux, 4 ) ); + aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux, 2 ) ); + aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux, 1 ) ); + + if( laneIdx == 0 ) + keep( segmentIdx, aux ); +} +#endif + +template< typename Index, + typename Device > + template< typename Offsets > +void +CSRKernelVector< Index, Device >:: +init( const Offsets& offsets ) +{ +} + +template< typename Index, + typename Device > +void +CSRKernelVector< Index, Device >:: +reset() +{ +} + +template< typename Index, + typename Device > +auto +CSRKernelVector< Index, Device >:: +getView() -> ViewType +{ + return *this; +} + +template< typename Index, + typename Device > +auto +CSRKernelVector< Index, Device >:: +getConstView() const -> ConstViewType +{ + return *this; +}; + + +template< typename Index, + typename Device > + template< typename OffsetsView, + typename Fetch, + typename Reduction, + typename ResultKeeper, + typename Real, + typename... 
Args > +void +CSRKernelVector< Index, Device >:: +segmentsReduction( const OffsetsView& offsets, + Index first, + Index last, + Fetch& fetch, + const Reduction& reduction, + ResultKeeper& keeper, + const Real& zero, + Args... args ) +{ +#ifdef HAVE_CUDA + const Index warpsCount = last - first; + const size_t threadsCount = warpsCount * TNL::Cuda::getWarpSize(); + dim3 blocksCount, gridsCount, blockSize( 256 ); + TNL::Cuda::setupThreads( blockSize, blocksCount, gridsCount, threadsCount ); + dim3 gridIdx; + for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x ++ ) + { + dim3 gridSize; + TNL::Cuda::setupGrid( blocksCount, gridsCount, gridIdx, gridSize ); + segmentsReductionCSRKernelVector< OffsetsView, IndexType, Fetch, Reduction, ResultKeeper, Real, Args... > + <<< gridSize, blockSize >>>( + gridIdx.x, offsets, first, last, fetch, reduction, keeper, zero, args... ); + }; +#endif +} + } // namespace Segments + } // namespace Algorithms +} // namespace TNL diff --git a/src/TNL/Algorithms/Segments/CSRKernels.h b/src/TNL/Algorithms/Segments/CSRKernels.h index eadee986d..9504aec64 100644 --- a/src/TNL/Algorithms/Segments/CSRKernels.h +++ b/src/TNL/Algorithms/Segments/CSRKernels.h @@ -20,112 +20,6 @@ namespace TNL { namespace Algorithms { namespace Segments { -#ifdef HAVE_CUDA -template< typename Offsets, - typename Index, - typename Fetch, - typename Reduction, - typename ResultKeeper, - typename Real, - typename... Args > -__global__ -void segmentsReductionCSRVectorKernel( - int gridIdx, - const Offsets offsets, - Index first, - Index last, - Fetch fetch, - const Reduction reduce, - ResultKeeper keep, - const Real zero, - Args... 
args ) -{ - /*** - * We map one warp to each segment - */ - const Index segmentIdx = TNL::Cuda::getGlobalThreadIdx( gridIdx ) / TNL::Cuda::getWarpSize() + first; - if( segmentIdx >= last ) - return; - - const int laneIdx = threadIdx.x & ( TNL::Cuda::getWarpSize() - 1 ); // & is cheaper than % - TNL_ASSERT_LT( segmentIdx + 1, offsets.getSize(), "" ); - Index endIdx = offsets[ segmentIdx + 1 ]; - - Index localIdx( laneIdx ); - Real aux = zero; - bool compute( true ); - for( Index globalIdx = offsets[ segmentIdx ] + localIdx; globalIdx < endIdx; globalIdx += TNL::Cuda::getWarpSize() ) - { - TNL_ASSERT_LT( globalIdx, endIdx, "" ); - aux = reduce( aux, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, localIdx, globalIdx, compute ) ); - localIdx += TNL::Cuda::getWarpSize(); - } - - /**** - * Reduction in each warp which means in each segment. - */ - aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux, 16 ) ); - aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux, 8 ) ); - aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux, 4 ) ); - aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux, 2 ) ); - aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux, 1 ) ); - - if( laneIdx == 0 ) - keep( segmentIdx, aux ); -} -#endif - -template< typename Index, - typename Device > -struct CSRVectorKernel -{ - using IndexType = Index; - using DeviceType = Device; - using ViewType = CSRVectorKernel< Index, Device >; - using ConstViewType = CSRVectorKernel< Index, Device >; - - template< typename Offsets > - void init( const Offsets& offsets ) {}; - - void reset(){}; - - ViewType getView() { return *this; }; - - ConstViewType getConstView() const { return *this; }; - - - template< typename OffsetsView, - typename Fetch, - typename Reduction, - typename ResultKeeper, - typename Real, - typename... 
Args > - static void segmentsReduction( const OffsetsView& offsets, - Index first, - Index last, - Fetch& fetch, - const Reduction& reduction, - ResultKeeper& keeper, - const Real& zero, - Args... args ) - { -#ifdef HAVE_CUDA - const Index warpsCount = last - first; - const size_t threadsCount = warpsCount * TNL::Cuda::getWarpSize(); - dim3 blocksCount, gridsCount, blockSize( 256 ); - TNL::Cuda::setupThreads( blockSize, blocksCount, gridsCount, threadsCount ); - dim3 gridIdx; - for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x ++ ) - { - dim3 gridSize; - TNL::Cuda::setupGrid( blocksCount, gridsCount, gridIdx, gridSize ); - segmentsReductionCSRVectorKernel< OffsetsView, IndexType, Fetch, Reduction, ResultKeeper, Real, Args... > - <<< gridSize, blockSize >>>( - gridIdx.x, offsets, first, last, fetch, reduction, keeper, zero, args... ); - }; -#endif - } -}; #ifdef HAVE_CUDA diff --git a/src/TNL/Algorithms/Segments/CSRView.h b/src/TNL/Algorithms/Segments/CSRView.h index d0dd35acb..ec47aaf4f 100644 --- a/src/TNL/Algorithms/Segments/CSRView.h +++ b/src/TNL/Algorithms/Segments/CSRView.h @@ -15,6 +15,7 @@ #include #include #include +#include #include namespace TNL { @@ -136,7 +137,7 @@ using CSRViewScalar = CSRView< Device, Index, CSRKernelScalar< Index, Device > > template< typename Device, typename Index > -using CSRViewVector = CSRView< Device, Index, CSRVectorKernel< Index, Device > >; +using CSRViewVector = CSRView< Device, Index, CSRKernelVector< Index, Device > >; template< typename Device, typename Index > -- GitLab From 601617187dd3a5cbc1b7670f46af0e93f197f21c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Sat, 23 Jan 2021 18:47:19 +0100 Subject: [PATCH 16/27] Renaming CSRLightKernel to CSRKernelHybrid and making separate source files.
--- src/TNL/Algorithms/Segments/CSR.h | 2 +- src/TNL/Algorithms/Segments/CSRKernelHybrid.h | 65 ++++++ .../Algorithms/Segments/CSRKernelHybrid.hpp | 195 ++++++++++++++++++ src/TNL/Algorithms/Segments/CSRKernels.h | 152 -------------- src/TNL/Algorithms/Segments/CSRView.h | 3 +- src/UnitTests/Matrices/CMakeLists.txt | 2 +- ...ight.cu => SparseMatrixTest_CSRHybrid.cpp} | 4 +- ...ight.cpp => SparseMatrixTest_CSRHybrid.cu} | 4 +- ...SRLight.h => SparseMatrixTest_CSRHybrid.h} | 36 ++-- 9 files changed, 286 insertions(+), 177 deletions(-) create mode 100644 src/TNL/Algorithms/Segments/CSRKernelHybrid.h create mode 100644 src/TNL/Algorithms/Segments/CSRKernelHybrid.hpp rename src/UnitTests/Matrices/{SparseMatrixTest_CSRLight.cu => SparseMatrixTest_CSRHybrid.cpp} (78%) rename src/UnitTests/Matrices/{SparseMatrixTest_CSRLight.cpp => SparseMatrixTest_CSRHybrid.cu} (79%) rename src/UnitTests/Matrices/{SparseMatrixTest_CSRLight.h => SparseMatrixTest_CSRHybrid.h} (89%) diff --git a/src/TNL/Algorithms/Segments/CSR.h b/src/TNL/Algorithms/Segments/CSR.h index 16f2f37ed..ead8d2b5d 100644 --- a/src/TNL/Algorithms/Segments/CSR.h +++ b/src/TNL/Algorithms/Segments/CSR.h @@ -143,7 +143,7 @@ using CSRVector = CSR< Device, Index, CSRKernelVector< Index, Device >, IndexAll template< typename Device, typename Index, typename IndexAllocator = typename Allocators::Default< Device >::template Allocator< Index > > -using CSRLight = CSR< Device, Index, CSRLightKernel< Index, Device >, IndexAllocator >; +using CSRHybrid = CSR< Device, Index, CSRKernelHybrid< Index, Device >, IndexAllocator >; template< typename Device, typename Index, diff --git a/src/TNL/Algorithms/Segments/CSRKernelHybrid.h b/src/TNL/Algorithms/Segments/CSRKernelHybrid.h new file mode 100644 index 000000000..92a4a54ee --- /dev/null +++ b/src/TNL/Algorithms/Segments/CSRKernelHybrid.h @@ -0,0 +1,65 @@ +/*************************************************************************** + CSRKernelHybrid.h - description + 
------------------- + begin : Jan 23, 2021 -> Joe Biden inauguration + copyright : (C) 2021 by Tomas Oberhuber + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +#pragma once + +#include +#include +#include +#include +#include + +namespace TNL { + namespace Algorithms { + namespace Segments { + +template< typename Index, + typename Device > +struct CSRKernelHybrid +{ + using IndexType = Index; + using DeviceType = Device; + using ViewType = CSRKernelHybrid< Index, Device >; + using ConstViewType = CSRKernelHybrid< Index, Device >; + + template< typename Offsets > + void init( const Offsets& offsets ); + + void reset(); + + ViewType getView(); + + ConstViewType getConstView() const; + + + template< typename OffsetsView, + typename Fetch, + typename Reduction, + typename ResultKeeper, + typename Real, + typename... Args > + void segmentsReduction( const OffsetsView& offsets, + Index first, + Index last, + Fetch& fetch, + const Reduction& reduction, + ResultKeeper& keeper, + const Real& zero, + Args... 
args ) const; + + protected: + int threadsPerSegment; +}; + + } // namespace Segments + } // namespace Algorithms +} // namespace TNL + +#include diff --git a/src/TNL/Algorithms/Segments/CSRKernelHybrid.hpp b/src/TNL/Algorithms/Segments/CSRKernelHybrid.hpp new file mode 100644 index 000000000..06d2d2868 --- /dev/null +++ b/src/TNL/Algorithms/Segments/CSRKernelHybrid.hpp @@ -0,0 +1,195 @@ +/*************************************************************************** + CSRKernelHybrid.hpp - description + ------------------- + begin : Jan 23, 2021 -> Joe Biden inauguration + copyright : (C) 2021 by Tomas Oberhuber + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace TNL { + namespace Algorithms { + namespace Segments { + +#ifdef HAVE_CUDA +template< int ThreadsPerSegment, + typename Offsets, + typename Index, + typename Fetch, + typename Reduction, + typename ResultKeeper, + typename Real, + typename... Args > +__global__ +void segmentsReductionCSRHybridKernel( + int gridIdx, + const Offsets offsets, + Index first, + Index last, + Fetch fetch, + const Reduction reduce, + ResultKeeper keep, + const Real zero, + Args... 
args ) +{ + /*** + * We map ThreadsPerSegment consecutive threads (a power of two, at most one warp) to each segment + */ + const Index segmentIdx = TNL::Cuda::getGlobalThreadIdx( gridIdx ) / ThreadsPerSegment + first; + if( segmentIdx >= last ) + return; + + const int laneIdx = threadIdx.x & ( ThreadsPerSegment - 1 ); // & is cheaper than % + Index endIdx = offsets[ segmentIdx + 1] ; + + Index localIdx( laneIdx ); + Real aux = zero; + bool compute( true ); + for( Index globalIdx = offsets[ segmentIdx ] + localIdx; globalIdx < endIdx; globalIdx += ThreadsPerSegment ) + { + aux = reduce( aux, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, localIdx, globalIdx, compute ) ); + localIdx += ThreadsPerSegment; // must advance by the same stride as globalIdx so that localIdx stays the in-segment index + } + + /**** + * Reduction in each segment. + */ + if( ThreadsPerSegment == 32 ) + aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux, 16 ) ); + if( ThreadsPerSegment >= 16 ) + aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux, 8 ) ); + if( ThreadsPerSegment >= 8 ) + aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux, 4 ) ); + if( ThreadsPerSegment >= 4 ) + aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux, 2 ) ); + if( ThreadsPerSegment >= 2 ) + aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux, 1 ) ); + + if( laneIdx == 0 ) + keep( segmentIdx, aux ); +} +#endif + + + +template< typename Index, + typename Device > + template< typename Offsets > +void +CSRKernelHybrid< Index, Device >:: +init( const Offsets& offsets ) +{ + const Index segmentsCount = offsets.getSize() - 1; + const Index elementsInSegment = std::ceil( ( double ) offsets.getElement( segmentsCount ) / ( double ) segmentsCount ); + this->threadsPerSegment = TNL::min( std::pow( 2, std::ceil( std::log2( elementsInSegment ) ) ), TNL::Cuda::getWarpSize() ); + TNL_ASSERT_GE( threadsPerSegment, 0, "" ); + TNL_ASSERT_LE( threadsPerSegment, 32, "" ); +} + +template< typename Index, + typename Device > +void +CSRKernelHybrid< Index, Device >:: +reset() +{ + this->threadsPerSegment = 0; +} + +template< typename Index, + typename
Device > +auto +CSRKernelHybrid< Index, Device >:: +getView() -> ViewType +{ + return *this; +} + +template< typename Index, + typename Device > +auto +CSRKernelHybrid< Index, Device >:: +getConstView() const -> ConstViewType +{ + return *this; +}; + + +template< typename Index, + typename Device > + template< typename OffsetsView, + typename Fetch, + typename Reduction, + typename ResultKeeper, + typename Real, + typename... Args > +void +CSRKernelHybrid< Index, Device >:: +segmentsReduction( const OffsetsView& offsets, + Index first, + Index last, + Fetch& fetch, + const Reduction& reduction, + ResultKeeper& keeper, + const Real& zero, + Args... args ) const +{ + TNL_ASSERT_GE( this->threadsPerSegment, 0, "" ); + TNL_ASSERT_LE( this->threadsPerSegment, 32, "" ); + +#ifdef HAVE_CUDA + const size_t threadsCount = this->threadsPerSegment * ( last - first ); + dim3 blocksCount, gridsCount, blockSize( 256 ); + TNL::Cuda::setupThreads( blockSize, blocksCount, gridsCount, threadsCount ); + //std::cerr << " this->threadsPerSegment = " << this->threadsPerSegment << " offsets = " << offsets << std::endl; + for( unsigned int gridIdx = 0; gridIdx < gridsCount.x; gridIdx ++ ) + { + dim3 gridSize; + TNL::Cuda::setupGrid( blocksCount, gridsCount, gridIdx, gridSize ); + switch( this->threadsPerSegment ) + { + case 0: // this means zero/empty matrix + break; + case 1: + segmentsReductionCSRHybridKernel< 1, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>( + gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... ); + break; + case 2: + segmentsReductionCSRHybridKernel< 2, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>( + gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... ); + break; + case 4: + segmentsReductionCSRHybridKernel< 4, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real, Args... 
><<< gridSize, blockSize >>>( + gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... ); + break; + case 8: + segmentsReductionCSRHybridKernel< 8, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>( + gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... ); + break; + case 16: + segmentsReductionCSRHybridKernel< 16, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>( + gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... ); + break; + case 32: + segmentsReductionCSRHybridKernel< 32, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>( + gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... ); + break; + default: + throw std::runtime_error( std::string( "Wrong value of threadsPerSegment: " ) + std::to_string( this->threadsPerSegment ) ); + } + } +#endif +} + + } // namespace Segments + } // namespace Algorithms +} // namespace TNL diff --git a/src/TNL/Algorithms/Segments/CSRKernels.h b/src/TNL/Algorithms/Segments/CSRKernels.h index 9504aec64..2eca74549 100644 --- a/src/TNL/Algorithms/Segments/CSRKernels.h +++ b/src/TNL/Algorithms/Segments/CSRKernels.h @@ -21,158 +21,6 @@ namespace TNL { namespace Segments { - -#ifdef HAVE_CUDA -template< int ThreadsPerSegment, - typename Offsets, - typename Index, - typename Fetch, - typename Reduction, - typename ResultKeeper, - typename Real, - typename... Args > -__global__ -void segmentsReductionCSRLightKernel( - int gridIdx, - const Offsets offsets, - Index first, - Index last, - Fetch fetch, - const Reduction reduce, - ResultKeeper keep, - const Real zero, - Args... 
args ) -{ - /*** - * We map one warp to each segment - */ - const Index segmentIdx = TNL::Cuda::getGlobalThreadIdx( gridIdx ) / ThreadsPerSegment + first; - if( segmentIdx >= last ) - return; - - const int laneIdx = threadIdx.x & ( ThreadsPerSegment - 1 ); // & is cheaper than % - Index endIdx = offsets[ segmentIdx + 1] ; - - Index localIdx( laneIdx ); - Real aux = zero; - bool compute( true ); - for( Index globalIdx = offsets[ segmentIdx ] + localIdx; globalIdx < endIdx; globalIdx += ThreadsPerSegment ) - { - aux = reduce( aux, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, localIdx, globalIdx, compute ) ); - localIdx += TNL::Cuda::getWarpSize(); - } - - /**** - * Reduction in each segment. - */ - if( ThreadsPerSegment == 32 ) - aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux, 16 ) ); - if( ThreadsPerSegment >= 16 ) - aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux, 8 ) ); - if( ThreadsPerSegment >= 8 ) - aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux, 4 ) ); - if( ThreadsPerSegment >= 4 ) - aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux, 2 ) ); - if( ThreadsPerSegment >= 2 ) - aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux, 1 ) ); - - if( laneIdx == 0 ) - keep( segmentIdx, aux ); -} -#endif - -template< typename Index, - typename Device > -struct CSRLightKernel -{ - using IndexType = Index; - using DeviceType = Device; - using ViewType = CSRLightKernel< Index, Device >; - using ConstViewType = CSRLightKernel< Index, Device >; - - template< typename Offsets > - void init( const Offsets& offsets ) - { - const Index segmentsCount = offsets.getSize() - 1; - const Index elementsInSegment = std::ceil( ( double ) offsets.getElement( segmentsCount ) / ( double ) segmentsCount ); - this->threadsPerSegment = TNL::min( std::pow( 2, std::ceil( std::log2( elementsInSegment ) ) ), TNL::Cuda::getWarpSize() ); - TNL_ASSERT_GE( threadsPerSegment, 0, "" ); - TNL_ASSERT_LE( threadsPerSegment, 32, "" ); - }; - - void reset() { 
this->threadsPerSegment = 0; } - - ViewType getView() { return *this; }; - - ConstViewType getConstView() const { return *this; }; - - template< typename OffsetsView, - typename Fetch, - typename Reduction, - typename ResultKeeper, - typename Real, - typename... Args > - void segmentsReduction( const OffsetsView& offsets, - Index first, - Index last, - Fetch& fetch, - const Reduction& reduction, - ResultKeeper& keeper, - const Real& zero, - Args... args ) const - { - TNL_ASSERT_GE( threadsPerSegment, 0, "" ); - TNL_ASSERT_LE( threadsPerSegment, 32, "" ); - -#ifdef HAVE_CUDA - const size_t threadsCount = this->threadsPerSegment * ( last - first ); - dim3 blocksCount, gridsCount, blockSize( 256 ); - TNL::Cuda::setupThreads( blockSize, blocksCount, gridsCount, threadsCount ); - //std::cerr << " this->threadsPerSegment = " << this->threadsPerSegment << " offsets = " << offsets << std::endl; - for( unsigned int gridIdx = 0; gridIdx < gridsCount.x; gridIdx ++ ) - { - dim3 gridSize; - TNL::Cuda::setupGrid( blocksCount, gridsCount, gridIdx, gridSize ); - switch( this->threadsPerSegment ) - { - case 0: // this means zero/empty matrix - break; - case 1: - segmentsReductionCSRLightKernel< 1, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>( - gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... ); - break; - case 2: - segmentsReductionCSRLightKernel< 2, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>( - gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... ); - break; - case 4: - segmentsReductionCSRLightKernel< 4, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>( - gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... ); - break; - case 8: - segmentsReductionCSRLightKernel< 8, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real, Args... 
><<< gridSize, blockSize >>>( - gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... ); - break; - case 16: - segmentsReductionCSRLightKernel< 16, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>( - gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... ); - break; - case 32: - segmentsReductionCSRLightKernel< 32, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>( - gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... ); - break; - default: - throw std::runtime_error( std::string( "Wrong value of threadsPerSegment: " ) + std::to_string( this->threadsPerSegment ) ); - } - } -#endif - } - - protected: - int threadsPerSegment; -}; - - template< typename Index, typename Device > struct CSRAdaptiveKernelView diff --git a/src/TNL/Algorithms/Segments/CSRView.h b/src/TNL/Algorithms/Segments/CSRView.h index ec47aaf4f..1f8c49f6f 100644 --- a/src/TNL/Algorithms/Segments/CSRView.h +++ b/src/TNL/Algorithms/Segments/CSRView.h @@ -16,6 +16,7 @@ #include #include #include +#include #include namespace TNL { @@ -141,7 +142,7 @@ using CSRViewVector = CSRView< Device, Index, CSRKernelVector< Index, Device > > template< typename Device, typename Index > -using CSRViewLight = CSRView< Device, Index, CSRLightKernel< Index, Device > >; +using CSRViewHybrid = CSRView< Device, Index, CSRKernelHybrid< Index, Device > >; template< typename Device, typename Index > diff --git a/src/UnitTests/Matrices/CMakeLists.txt b/src/UnitTests/Matrices/CMakeLists.txt index 37021b230..2b3617467 100644 --- a/src/UnitTests/Matrices/CMakeLists.txt +++ b/src/UnitTests/Matrices/CMakeLists.txt @@ -8,7 +8,7 @@ set( COMMON_TESTS SparseMatrixTest_CSRScalar SparseMatrixTest_CSRVector - SparseMatrixTest_CSRLight + SparseMatrixTest_CSRHybrid SparseMatrixTest_Ellpack SparseMatrixTest_SlicedEllpack SparseMatrixTest_ChunkedEllpack diff --git 
a/src/UnitTests/Matrices/SparseMatrixTest_CSRLight.cu b/src/UnitTests/Matrices/SparseMatrixTest_CSRHybrid.cpp similarity index 78% rename from src/UnitTests/Matrices/SparseMatrixTest_CSRLight.cu rename to src/UnitTests/Matrices/SparseMatrixTest_CSRHybrid.cpp index bf2c8061e..214ed2ca7 100644 --- a/src/UnitTests/Matrices/SparseMatrixTest_CSRLight.cu +++ b/src/UnitTests/Matrices/SparseMatrixTest_CSRHybrid.cpp @@ -1,5 +1,5 @@ /*************************************************************************** - SparseMatrixTest_CSRLight.cu - description + SparseMatrixTest_CSRHybrid.cpp - description ------------------- begin : Jan 23, 2021 copyright : (C) 2021 by Tomas Oberhuber et al. @@ -8,4 +8,4 @@ /* See Copyright Notice in tnl/Copyright */ -#include "SparseMatrixTest_CSRLight.h" +#include "SparseMatrixTest_CSRHybrid.h" diff --git a/src/UnitTests/Matrices/SparseMatrixTest_CSRLight.cpp b/src/UnitTests/Matrices/SparseMatrixTest_CSRHybrid.cu similarity index 79% rename from src/UnitTests/Matrices/SparseMatrixTest_CSRLight.cpp rename to src/UnitTests/Matrices/SparseMatrixTest_CSRHybrid.cu index 70d767b37..c0a0918d7 100644 --- a/src/UnitTests/Matrices/SparseMatrixTest_CSRLight.cpp +++ b/src/UnitTests/Matrices/SparseMatrixTest_CSRHybrid.cu @@ -1,5 +1,5 @@ /*************************************************************************** - SparseMatrixTest_CSRLight.cpp - description + SparseMatrixTest_CSRHybrid.cu - description ------------------- begin : Jan 23, 2021 copyright : (C) 2021 by Tomas Oberhuber et al. 
@@ -8,4 +8,4 @@ /* See Copyright Notice in tnl/Copyright */ -#include "SparseMatrixTest_CSRLight.h" +#include "SparseMatrixTest_CSRHybrid.h" diff --git a/src/UnitTests/Matrices/SparseMatrixTest_CSRLight.h b/src/UnitTests/Matrices/SparseMatrixTest_CSRHybrid.h similarity index 89% rename from src/UnitTests/Matrices/SparseMatrixTest_CSRLight.h rename to src/UnitTests/Matrices/SparseMatrixTest_CSRHybrid.h index 6349c1711..24ba77fa0 100644 --- a/src/UnitTests/Matrices/SparseMatrixTest_CSRLight.h +++ b/src/UnitTests/Matrices/SparseMatrixTest_CSRHybrid.h @@ -1,5 +1,5 @@ /*************************************************************************** - SparseMatrixTest_CSRLight.h - description + SparseMatrixTest_CSRHybrid.h - description ------------------- begin : Jan 23, 2021 copyright : (C) 2021 by Tomas Oberhuber et al. @@ -15,28 +15,28 @@ #ifdef HAVE_GTEST #include -const char* saveAndLoadFileName = "test_SparseMatrixTest_CSRLight_segments"; +const char* saveAndLoadFileName = "test_SparseMatrixTest_CSRHybrid_segments"; // types for which MatrixTest is instantiated using MatrixTypes = ::testing::Types < - TNL::Matrices::SparseMatrix< int, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >, - TNL::Matrices::SparseMatrix< long, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >, - TNL::Matrices::SparseMatrix< float, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >, - TNL::Matrices::SparseMatrix< double, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >, - TNL::Matrices::SparseMatrix< int, TNL::Devices::Host, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >, - TNL::Matrices::SparseMatrix< long, TNL::Devices::Host, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >, - TNL::Matrices::SparseMatrix< float, TNL::Devices::Host, long, TNL::Matrices::GeneralMatrix, 
TNL::Algorithms::Segments::CSRLight >, - TNL::Matrices::SparseMatrix< double, TNL::Devices::Host, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight > + TNL::Matrices::SparseMatrix< int, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >, + TNL::Matrices::SparseMatrix< long, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >, + TNL::Matrices::SparseMatrix< float, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >, + TNL::Matrices::SparseMatrix< double, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >, + TNL::Matrices::SparseMatrix< int, TNL::Devices::Host, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >, + TNL::Matrices::SparseMatrix< long, TNL::Devices::Host, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >, + TNL::Matrices::SparseMatrix< float, TNL::Devices::Host, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >, + TNL::Matrices::SparseMatrix< double, TNL::Devices::Host, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid > #ifdef HAVE_CUDA - ,TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >, - TNL::Matrices::SparseMatrix< long, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >, - TNL::Matrices::SparseMatrix< float, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >, - TNL::Matrices::SparseMatrix< double, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >, - TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >, - TNL::Matrices::SparseMatrix< long, TNL::Devices::Cuda, long, 
TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >, - TNL::Matrices::SparseMatrix< float, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight >, - TNL::Matrices::SparseMatrix< double, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRLight > + ,TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >, + TNL::Matrices::SparseMatrix< long, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >, + TNL::Matrices::SparseMatrix< float, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >, + TNL::Matrices::SparseMatrix< double, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >, + TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >, + TNL::Matrices::SparseMatrix< long, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >, + TNL::Matrices::SparseMatrix< float, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >, + TNL::Matrices::SparseMatrix< double, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid > #endif >; -- GitLab From 536a6526238c03977958e06d1d4c82e1511dec3f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Sat, 23 Jan 2021 21:02:04 +0100 Subject: [PATCH 17/27] Adding Adaptive CSR kernel. 
--- src/TNL/Algorithms/Segments/CSR.h | 5 + .../Algorithms/Segments/CSRKernelAdaptive.h | 329 ++++++++++++++++++ src/TNL/Algorithms/Segments/CSRKernels.h | 135 ------- src/TNL/Algorithms/Segments/CSRView.h | 6 +- src/UnitTests/Matrices/CMakeLists.txt | 1 + .../Matrices/SparseMatrixTest_CSRAdaptive.cpp | 11 + .../Matrices/SparseMatrixTest_CSRAdaptive.cu | 11 + .../Matrices/SparseMatrixTest_CSRAdaptive.h | 46 +++ 8 files changed, 408 insertions(+), 136 deletions(-) create mode 100644 src/TNL/Algorithms/Segments/CSRKernelAdaptive.h delete mode 100644 src/TNL/Algorithms/Segments/CSRKernels.h create mode 100644 src/UnitTests/Matrices/SparseMatrixTest_CSRAdaptive.cpp create mode 100644 src/UnitTests/Matrices/SparseMatrixTest_CSRAdaptive.cu create mode 100644 src/UnitTests/Matrices/SparseMatrixTest_CSRAdaptive.h diff --git a/src/TNL/Algorithms/Segments/CSR.h b/src/TNL/Algorithms/Segments/CSR.h index ead8d2b5d..3a04e80fd 100644 --- a/src/TNL/Algorithms/Segments/CSR.h +++ b/src/TNL/Algorithms/Segments/CSR.h @@ -145,6 +145,11 @@ template< typename Device, typename IndexAllocator = typename Allocators::Default< Device >::template Allocator< Index > > using CSRHybrid = CSR< Device, Index, CSRKernelHybrid< Index, Device >, IndexAllocator >; +template< typename Device, + typename Index, + typename IndexAllocator = typename Allocators::Default< Device >::template Allocator< Index > > +using CSRAdaptive = CSR< Device, Index, CSRKernelAdaptive< Index, Device >, IndexAllocator >; + template< typename Device, typename Index, typename IndexAllocator = typename Allocators::Default< Device >::template Allocator< Index > > diff --git a/src/TNL/Algorithms/Segments/CSRKernelAdaptive.h b/src/TNL/Algorithms/Segments/CSRKernelAdaptive.h new file mode 100644 index 000000000..df43906e1 --- /dev/null +++ b/src/TNL/Algorithms/Segments/CSRKernelAdaptive.h @@ -0,0 +1,329 @@ +/*************************************************************************** + CSRKernels.h - description + 
------------------- + begin : Jan 20, 2021 -> Joe Biden inauguration + copyright : (C) 2021 by Tomas Oberhuber + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +#pragma once + +#include +#include +#include +#include +#include + +namespace TNL { + namespace Algorithms { + namespace Segments { + +enum class Type { + /* LONG = 0!!! Non zero value rewrites index[1] */ + LONG = 0, + STREAM = 1, + VECTOR = 2 +}; + +template +union Block { + Block(Index row, Type type = Type::VECTOR, Index index = 0) noexcept { + this->index[0] = row; + this->index[1] = index; + this->byte[sizeof(Index) == 4 ? 7 : 15] = (uint8_t)type; + } + + Block(Index row, Type type, Index nextRow, Index maxID, Index minID) noexcept { + this->index[0] = row; + this->index[1] = 0; + this->twobytes[sizeof(Index) == 4 ? 2 : 4] = maxID - minID; + + if (type == Type::STREAM) + this->twobytes[sizeof(Index) == 4 ? 3 : 5] = nextRow - row; + + if (type == Type::STREAM) + this->byte[sizeof(Index) == 4 ? 7 : 15] |= 0b1000000; + else if (type == Type::VECTOR) + this->byte[sizeof(Index) == 4 ? 7 : 15] |= 0b10000000; + } + + Block() = default; + + Index index[2]; // index[0] is row pointer, index[1] is index in warp + uint8_t byte[sizeof(Index) == 4 ? 8 : 16]; // byte[7/15] is type specificator + uint16_t twobytes[sizeof(Index) == 4 ? 
4 : 8]; //twobytes[2/4] is maxID - minID + //twobytes[3/5] is nextRow - row +}; + +#ifdef HAVE_CUDA + +template< typename Real, + typename Index, + int warpSize, + int WARPS, + int SHARED_PER_WARP, + int MAX_ELEM_PER_WARP > +__global__ +void SpMVCSRAdaptive( const Real *inVector, + Real *outVector, + const Index* rowPointers, + const Index* columnIndexes, + const Real* values, + const Block *blocks, + Index blocksSize, + Index gridID) { + __shared__ Real shared[WARPS][SHARED_PER_WARP]; + const Index index = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x; + const Index blockIdx = index / warpSize; + if (blockIdx >= blocksSize) + return; + + Real result = 0.0; + const Index laneID = threadIdx.x & 31; // & is cheaper than % + Block block = blocks[blockIdx]; + const Index minID = rowPointers[block.index[0]/* minRow */]; + Index i, to, maxID; + if (block.byte[sizeof(Index) == 4 ? 7 : 15] & 0b1000000) { + /////////////////////////////////////* CSR STREAM *////////////// + const Index warpID = threadIdx.x / 32; + maxID = minID + /* maxID - minID */block.twobytes[sizeof(Index) == 4 ? 2 : 4]; + + /* Stream data to shared memory */ + for (i = laneID + minID; i < maxID; i += warpSize) + shared[warpID][i - minID] = values[i] * inVector[columnIndexes[i]]; + + const Index maxRow = block.index[0]/* minRow */ + + /* maxRow - minRow */(block.twobytes[sizeof(Index) == 4 ? 3 : 5] & 0x3FFF); + /* Calculate result */ + for (i = block.index[0]/* minRow */ + laneID; i < maxRow; i += warpSize) { + to = rowPointers[i + 1] - minID; // end of preprocessed data + result = 0; + /* Scalar reduction */ + for (Index sharedID = rowPointers[i] - minID; sharedID < to; ++sharedID) + result += shared[warpID][sharedID]; + + outVector[i] = result; // Write result + } + } else if (block.byte[sizeof(Index) == 4 ? 7 : 15] & 0b10000000) { + /////////////////////////////////////* CSR VECTOR *////////////// + maxID = minID + /* maxID - minID */block.twobytes[sizeof(Index) == 4 ? 
2 : 4]; + + for (i = minID + laneID; i < maxID; i += warpSize) + result += values[i] * inVector[columnIndexes[i]]; + + /* Parallel reduction */ + result += __shfl_down_sync(0xFFFFFFFF, result, 16); + result += __shfl_down_sync(0xFFFFFFFF, result, 8); + result += __shfl_down_sync(0xFFFFFFFF, result, 4); + result += __shfl_down_sync(0xFFFFFFFF, result, 2); + result += __shfl_down_sync(0xFFFFFFFF, result, 1); + if (laneID == 0) outVector[block.index[0]/* minRow */] = result; // Write result + } else { + /////////////////////////////////////* CSR VECTOR L *///////////// + /* Number of elements processed by previous warps */ + const Index offset = block.index[1]/* warpInRow */ * MAX_ELEM_PER_WARP; + to = minID + (block.index[1]/* warpInRow */ + 1) * MAX_ELEM_PER_WARP; + maxID = rowPointers[block.index[0]/* minRow */ + 1]; + if (to > maxID) to = maxID; + for (i = minID + offset + laneID; i < to; i += warpSize) + result += values[i] * inVector[columnIndexes[i]]; + + /* Parallel reduction */ + result += __shfl_down_sync(0xFFFFFFFF, result, 16); + result += __shfl_down_sync(0xFFFFFFFF, result, 8); + result += __shfl_down_sync(0xFFFFFFFF, result, 4); + result += __shfl_down_sync(0xFFFFFFFF, result, 2); + result += __shfl_down_sync(0xFFFFFFFF, result, 1); + if (laneID == 0) atomicAdd(&outVector[block.index[0]/* minRow */], result); + } +} +#endif + + +template< typename Index, + typename Device > +struct CSRKernelAdaptiveView +{ + using IndexType = Index; + using DeviceType = Device; + using ViewType = CSRKernelAdaptiveView< Index, Device >; + using ConstViewType = CSRKernelAdaptiveView< Index, Device >; + + ViewType getView() { return *this; }; + + ConstViewType getConstView() const { return *this; }; + + template< typename OffsetsView, + typename Fetch, + typename Reduction, + typename ResultKeeper, + typename Real, + typename... 
Args > + void segmentsReduction( const OffsetsView& offsets, + Index first, + Index last, + Fetch& fetch, + const Reduction& reduction, + ResultKeeper& keeper, + const Real& zero, + Args... args ) const + { + + Index blocks; + const Index threads = matrix.THREADS_ADAPTIVE; + + /* Fill blocks */ + size_t neededThreads = matrix.blocks.getSize() * warpSize; // one warp per block + /* Execute kernels on device */ + for (Index grid = 0; neededThreads != 0; ++grid) { + if (MAX_X_DIM * threads >= neededThreads) { + blocks = roundUpDivision(neededThreads, threads); + neededThreads = 0; + } else { + blocks = MAX_X_DIM; + neededThreads -= MAX_X_DIM * threads; + } + + SpMVCSRAdaptive< Real, Index, warpSize, + matrix.WARPS, + matrix.SHARED_PER_WARP, + matrix.MAX_ELEMENTS_PER_WARP_ADAPT > + <<>>( + inVector, + outVector, + matrix.getRowPointers().getData(), + matrix.getColumnIndexes().getData(), + matrix.getValues().getData(), + matrix.blocks.getData(), + matrix.blocks.getSize() - 1, // last block shouldn't be used + grid + ); + } + } +}; + +template< typename Index, + typename Device > +struct CSRKernelAdaptive +{ + using IndexType = Index; + using DeviceType = Device; + using ViewType = CSRKernelAdaptiveView< Index, Device >; + using ConstViewType = CSRKernelAdaptiveView< Index, Device >; + + static constexpr Index THREADS_ADAPTIVE = sizeof(Index) == 8 ? 
128 : 256; + + /* How many shared memory use per block in CSR Adaptive kernel */ + static constexpr Index SHARED_PER_BLOCK = 24576; + + /* Number of elements in shared memory */ + static constexpr Index SHARED = SHARED_PER_BLOCK/sizeof(double); + + /* Number of warps in block for CSR Adaptive */ + static constexpr Index WARPS = THREADS_ADAPTIVE / 32; + + /* Number of elements in shared memory per one warp */ + static constexpr Index SHARED_PER_WARP = SHARED / WARPS; + + template< typename Offsets > + Index findLimit(const Index start, + const Offsets& offsets, + const Index size, + Type &type, + Index &sum) { + sum = 0; + for (Index current = start; current < size - 1; ++current) { + Index elements = offsets.getElement(current + 1) - + offsets.getElement(current); + sum += elements; + if (sum > matrix.SHARED_PER_WARP) { + if (current - start > 0) { // extra row + type = Type::STREAM; + return current; + } else { // one long row + if (sum <= 2 * matrix.MAX_ELEMENTS_PER_WARP_ADAPT) + type = Type::VECTOR; + else + type = Type::LONG; + return current + 1; + } + } + } + + type = Type::STREAM; + return size - 1; // return last row pointer + } + + template< typename Offsets > + void init( const Offsets& offsets ) + { + const Index rows = offsets.getSize(); + Index sum, start = 0, nextStart = 0; + + // Fill blocks + std::vector> inBlock; + inBlock.reserve(rows); + + while (nextStart != rows - 1) + { + Type type; + nextStart = findLimit( + start, *this, rows, type, sum ); + + if (type == Type::LONG) + { + Index parts = roundUpDivision(sum, this->SHARED_PER_WARP); + for (Index index = 0; index < parts; ++index) + { + inBlock.emplace_back(start, Type::LONG, index); + } + } + else + { + inBlock.emplace_back(start, type, + nextStart, + this->rowPointers.getElement(nextStart), + this->rowPointers.getElement(start) ); + } + start = nextStart; + } + inBlock.emplace_back(nextStart); + + // Copy values + this->blocks.setSize(inBlock.size()); + for (size_t i = 0; i < inBlock.size(); 
++i) + this->blocks.setElement(i, inBlock[i]); + }; + + ViewType getView() { return view; }; + + ConstViewType getConstView() const { return ConstViewType(); }; + + template< typename OffsetsView, + typename Fetch, + typename Reduction, + typename ResultKeeper, + typename Real, + typename... Args > + void segmentsReduction( const OffsetsView& offsets, + Index first, + Index last, + Fetch& fetch, + const Reduction& reduction, + ResultKeeper& keeper, + const Real& zero, + Args... args ) const + { + view.segmentsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... ); + } + + ViewType view; +}; + + + + } // namespace Segments + } // namespace Algorithms +} // namespace TNL diff --git a/src/TNL/Algorithms/Segments/CSRKernels.h b/src/TNL/Algorithms/Segments/CSRKernels.h deleted file mode 100644 index 2eca74549..000000000 --- a/src/TNL/Algorithms/Segments/CSRKernels.h +++ /dev/null @@ -1,135 +0,0 @@ -/*************************************************************************** - CSRKernels.h - description - ------------------- - begin : Jan 20, 2021 -> Joe Biden inauguration - copyright : (C) 2021 by Tomas Oberhuber - email : tomas.oberhuber@fjfi.cvut.cz - ***************************************************************************/ - -/* See Copyright Notice in tnl/Copyright */ - -#pragma once - -#include -#include -#include -#include -#include - -namespace TNL { - namespace Algorithms { - namespace Segments { - - -template< typename Index, - typename Device > -struct CSRAdaptiveKernelView -{ - using IndexType = Index; - using DeviceType = Device; - using ViewType = CSRAdaptiveKernelView< Index, Device >; - using ConstViewType = CSRAdaptiveKernelView< Index, Device >; - - ViewType getView() { return *this; }; - - ConstViewType getConstView() const { return *this; }; - - template< typename OffsetsView, - typename Fetch, - typename Reduction, - typename ResultKeeper, - typename Real, - typename... 
Args > - void segmentsReduction( const OffsetsView& offsets, - Index first, - Index last, - Fetch& fetch, - const Reduction& reduction, - ResultKeeper& keeper, - const Real& zero, - Args... args ) const - { - } -}; - -template< typename Index, - typename Device > -struct CSRAdaptiveKernel -{ - using IndexType = Index; - using DeviceType = Device; - using ViewType = CSRAdaptiveKernel< Index, Device >; - using ConstViewType = CSRAdaptiveKernel< Index, Device >; - - template< typename Offsets > - void init( const Offsets& offsets ) - { - /*const Index rows = offsets.getSize(); - Index sum, start = 0, nextStart = 0; - - // Fill blocks - std::vector> inBlock; - inBlock.reserve(rows); - - while (nextStart != rows - 1) - { - Type type; - nextStart = findLimit( - start, *this, rows, type, sum ); - - if (type == Type::LONG) - { - Index parts = roundUpDivision(sum, this->SHARED_PER_WARP); - for (Index index = 0; index < parts; ++index) - { - inBlock.emplace_back(start, Type::LONG, index); - } - } - else - { - inBlock.emplace_back(start, type, - nextStart, - this->rowPointers.getElement(nextStart), - this->rowPointers.getElement(start) ); - } - start = nextStart; - } - inBlock.emplace_back(nextStart); - - // Copy values - this->blocks.setSize(inBlock.size()); - for (size_t i = 0; i < inBlock.size(); ++i) - this->blocks.setElement(i, inBlock[i]); - */ - }; - - ViewType getView() { return view; }; - - ConstViewType getConstView() const { return ConstViewType(); }; - - template< typename OffsetsView, - typename Fetch, - typename Reduction, - typename ResultKeeper, - typename Real, - typename... Args > - void segmentsReduction( const OffsetsView& offsets, - Index first, - Index last, - Fetch& fetch, - const Reduction& reduction, - ResultKeeper& keeper, - const Real& zero, - Args... args ) const - { - view.segmentsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... 
); - } - - ViewType view; -}; - - - - } // namespace Segments - } // namespace Algorithms -} // namespace TNL diff --git a/src/TNL/Algorithms/Segments/CSRView.h b/src/TNL/Algorithms/Segments/CSRView.h index 1f8c49f6f..4576d9fdb 100644 --- a/src/TNL/Algorithms/Segments/CSRView.h +++ b/src/TNL/Algorithms/Segments/CSRView.h @@ -17,7 +17,7 @@ #include #include #include -#include +#include namespace TNL { namespace Algorithms { @@ -144,6 +144,10 @@ template< typename Device, typename Index > using CSRViewHybrid = CSRView< Device, Index, CSRKernelHybrid< Index, Device > >; +template< typename Device, + typename Index > +using CSRViewAdaptive = CSRView< Device, Index, CSRKernelAdaptive< Index, Device > >; + template< typename Device, typename Index > using CSRViewDefault = CSRViewScalar< Device, Index >; diff --git a/src/UnitTests/Matrices/CMakeLists.txt b/src/UnitTests/Matrices/CMakeLists.txt index 2b3617467..a65411fc0 100644 --- a/src/UnitTests/Matrices/CMakeLists.txt +++ b/src/UnitTests/Matrices/CMakeLists.txt @@ -9,6 +9,7 @@ set( COMMON_TESTS SparseMatrixTest_CSRScalar SparseMatrixTest_CSRVector SparseMatrixTest_CSRHybrid + SparseMatrixTest_CSRAdaptive SparseMatrixTest_Ellpack SparseMatrixTest_SlicedEllpack SparseMatrixTest_ChunkedEllpack diff --git a/src/UnitTests/Matrices/SparseMatrixTest_CSRAdaptive.cpp b/src/UnitTests/Matrices/SparseMatrixTest_CSRAdaptive.cpp new file mode 100644 index 000000000..214ed2ca7 --- /dev/null +++ b/src/UnitTests/Matrices/SparseMatrixTest_CSRAdaptive.cpp @@ -0,0 +1,11 @@ +/*************************************************************************** + SparseMatrixTest_CSRHybrid.cpp - description + ------------------- + begin : Jan 23, 2021 + copyright : (C) 2021 by Tomas Oberhuber et al. 
+ email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +#include "SparseMatrixTest_CSRHybrid.h" diff --git a/src/UnitTests/Matrices/SparseMatrixTest_CSRAdaptive.cu b/src/UnitTests/Matrices/SparseMatrixTest_CSRAdaptive.cu new file mode 100644 index 000000000..c0a0918d7 --- /dev/null +++ b/src/UnitTests/Matrices/SparseMatrixTest_CSRAdaptive.cu @@ -0,0 +1,11 @@ +/*************************************************************************** + SparseMatrixTest_CSRHybrid.cu - description + ------------------- + begin : Jan 23, 2021 + copyright : (C) 2021 by Tomas Oberhuber et al. + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +#include "SparseMatrixTest_CSRHybrid.h" diff --git a/src/UnitTests/Matrices/SparseMatrixTest_CSRAdaptive.h b/src/UnitTests/Matrices/SparseMatrixTest_CSRAdaptive.h new file mode 100644 index 000000000..24ba77fa0 --- /dev/null +++ b/src/UnitTests/Matrices/SparseMatrixTest_CSRAdaptive.h @@ -0,0 +1,46 @@ +/*************************************************************************** + SparseMatrixTest_CSRHybrid.h - description + ------------------- + begin : Jan 23, 2021 + copyright : (C) 2021 by Tomas Oberhuber et al. 
+ email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +#include +#include +#include + +#ifdef HAVE_GTEST +#include + +const char* saveAndLoadFileName = "test_SparseMatrixTest_CSRHybrid_segments"; + +// types for which MatrixTest is instantiated +using MatrixTypes = ::testing::Types +< + TNL::Matrices::SparseMatrix< int, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >, + TNL::Matrices::SparseMatrix< long, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >, + TNL::Matrices::SparseMatrix< float, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >, + TNL::Matrices::SparseMatrix< double, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >, + TNL::Matrices::SparseMatrix< int, TNL::Devices::Host, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >, + TNL::Matrices::SparseMatrix< long, TNL::Devices::Host, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >, + TNL::Matrices::SparseMatrix< float, TNL::Devices::Host, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >, + TNL::Matrices::SparseMatrix< double, TNL::Devices::Host, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid > +#ifdef HAVE_CUDA + ,TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >, + TNL::Matrices::SparseMatrix< long, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >, + TNL::Matrices::SparseMatrix< float, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >, + TNL::Matrices::SparseMatrix< double, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, 
TNL::Algorithms::Segments::CSRHybrid >, + TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >, + TNL::Matrices::SparseMatrix< long, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >, + TNL::Matrices::SparseMatrix< float, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >, + TNL::Matrices::SparseMatrix< double, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid > +#endif +>; + +#endif + +#include "SparseMatrixTest.h" +#include "../main.h" -- GitLab From 316819ca5dbcf0b588458cb0e8d1474827a8e6e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Sun, 24 Jan 2021 21:25:16 +0100 Subject: [PATCH 18/27] Debuging CSR Adaptive kernel. --- .../Algorithms/Segments/CSRKernelAdaptive.h | 446 +++++++++++++----- .../Matrices/SparseMatrixTest_CSRAdaptive.cpp | 4 +- .../Matrices/SparseMatrixTest_CSRAdaptive.cu | 4 +- .../Matrices/SparseMatrixTest_CSRAdaptive.h | 36 +- 4 files changed, 340 insertions(+), 150 deletions(-) diff --git a/src/TNL/Algorithms/Segments/CSRKernelAdaptive.h b/src/TNL/Algorithms/Segments/CSRKernelAdaptive.h index df43906e1..9e247fa6d 100644 --- a/src/TNL/Algorithms/Segments/CSRKernelAdaptive.h +++ b/src/TNL/Algorithms/Segments/CSRKernelAdaptive.h @@ -15,6 +15,7 @@ #include #include #include +#include namespace TNL { namespace Algorithms { @@ -27,15 +28,18 @@ enum class Type { VECTOR = 2 }; -template -union Block { - Block(Index row, Type type = Type::VECTOR, Index index = 0) noexcept { +template< typename Index > +union Block +{ + Block(Index row, Type type = Type::VECTOR, Index index = 0) noexcept + { this->index[0] = row; this->index[1] = index; this->byte[sizeof(Index) == 4 ? 
7 : 15] = (uint8_t)type; } - Block(Index row, Type type, Index nextRow, Index maxID, Index minID) noexcept { + Block(Index row, Type type, Index nextRow, Index maxID, Index minID) noexcept + { this->index[0] = row; this->index[1] = 0; this->twobytes[sizeof(Index) == 4 ? 2 : 4] = maxID - minID; @@ -51,93 +55,177 @@ union Block { Block() = default; + Type getType() const + { + if( byte[ sizeof( Index ) == 4 ? 7 : 15 ] & 0b1000000 ) + return Type::STREAM; + if( byte[ sizeof( Index ) == 4 ? 7 : 15 ] & 0b10000000 ) + return Type::VECTOR; + return Type::LONG; + } + + Index getFirstRow() const + { + return index[ 0 ]; + } + + Index getRowsInBlock() const + { + return twobytes[ sizeof(Index) == 4 ? 2 : 4 ]; + } + + void print( std::ostream& str ) const + { + Type type = this->getType(); + str << "Type: "; + switch( type ) + { + case Type::STREAM: + str << " Stream "; + break; + case Type::VECTOR: + str << " Vector "; + break; + case Type::LONG: + str << " Long "; + break; + } + str << " first row: " << getFirstRow(); + str << " rows per block: " << getRowsInBlock(); + str << " index in warp: " << index[ 1 ]; + } Index index[2]; // index[0] is row pointer, index[1] is index in warp uint8_t byte[sizeof(Index) == 4 ? 8 : 16]; // byte[7/15] is type specificator uint16_t twobytes[sizeof(Index) == 4 ? 
4 : 8]; //twobytes[2/4] is maxID - minID //twobytes[3/5] is nextRow - row }; +template< typename Index > +std::ostream& operator<< ( std::ostream& str, const Block< Index >& block ) +{ + block.print( str ); + return str; +} + #ifdef HAVE_CUDA -template< typename Real, - typename Index, - int warpSize, +template< int warpSize, int WARPS, int SHARED_PER_WARP, - int MAX_ELEM_PER_WARP > -__global__ -void SpMVCSRAdaptive( const Real *inVector, - Real *outVector, - const Index* rowPointers, - const Index* columnIndexes, - const Real* values, - const Block *blocks, - Index blocksSize, - Index gridID) { + int MAX_ELEM_PER_WARP, + typename Offsets, + typename Index, + typename Fetch, + typename Reduction, + typename ResultKeeper, + typename Real, + typename... Args > +__global__ void +segmentsReductionCSRAdaptiveKernel( const Block< Index > *blocks, + Index blocksSize, + int gridIdx, + Offsets offsets, + Index first, + Index last, + Fetch fetch, + Reduction reduce, + ResultKeeper keep, + Real zero, + Args... args ) +{ __shared__ Real shared[WARPS][SHARED_PER_WARP]; - const Index index = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x; + constexpr size_t MAX_X_DIM = 2147483647; + const Index index = (gridIdx * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x; const Index blockIdx = index / warpSize; if (blockIdx >= blocksSize) return; - Real result = 0.0; + Real result = zero; + bool compute( true ); const Index laneID = threadIdx.x & 31; // & is cheaper than % Block block = blocks[blockIdx]; - const Index minID = rowPointers[block.index[0]/* minRow */]; + const Index minID = offsets[block.index[0]/* minRow */]; Index i, to, maxID; - if (block.byte[sizeof(Index) == 4 ? 7 : 15] & 0b1000000) { - /////////////////////////////////////* CSR STREAM *////////////// + + if (block.byte[sizeof(Index) == 4 ? 
7 : 15] & 0b1000000) + { + /**** + * CSR Stream: Copy first all data into shared memory + */ + const Index warpID = threadIdx.x / 32; maxID = minID + /* maxID - minID */block.twobytes[sizeof(Index) == 4 ? 2 : 4]; /* Stream data to shared memory */ - for (i = laneID + minID; i < maxID; i += warpSize) - shared[warpID][i - minID] = values[i] * inVector[columnIndexes[i]]; + for( Index globalIdx = laneID + minID; globalIdx < maxID; globalIdx += warpSize ) + { + shared[warpID][i - minID] = //fetch( globalIdx, compute ); + details::FetchLambdaAdapter< Index, Fetch >::call( fetch, -1, -1, globalIdx, compute ); + printf( "Stream: Fetch at %d -> %f \n", globalIdx, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, -1, -1, globalIdx, compute ) ); + // TODO:: fix this + //values[i] * inVector[columnIndexes[i]]; + } const Index maxRow = block.index[0]/* minRow */ + /* maxRow - minRow */(block.twobytes[sizeof(Index) == 4 ? 3 : 5] & 0x3FFF); /* Calculate result */ - for (i = block.index[0]/* minRow */ + laneID; i < maxRow; i += warpSize) { - to = rowPointers[i + 1] - minID; // end of preprocessed data - result = 0; + for (i = block.index[0]/* minRow */ + laneID; i < maxRow; i += warpSize) + { + to = offsets[i + 1] - minID; // end of preprocessed data + result = zero; /* Scalar reduction */ - for (Index sharedID = rowPointers[i] - minID; sharedID < to; ++sharedID) - result += shared[warpID][sharedID]; + for( Index sharedID = offsets[ i ] - minID; sharedID < to; ++sharedID) + result = reduce( result, shared[warpID][sharedID] ); - outVector[i] = result; // Write result + printf( "Stream: threadIdx = %d result for segment %d is %f \n", threadIdx, i, result ); + keep( i, result ); + //outVector[i] = result; // Write result } - } else if (block.byte[sizeof(Index) == 4 ? 7 : 15] & 0b10000000) { + } + else //if (block.byte[sizeof(Index) == 4 ? 
7 : 15] & 0b10000000) + { + printf( "Vector: threadIdx = %d \n", threadIdx ); /////////////////////////////////////* CSR VECTOR *////////////// maxID = minID + /* maxID - minID */block.twobytes[sizeof(Index) == 4 ? 2 : 4]; + const Index segmentIdx = block.index[0]; - for (i = minID + laneID; i < maxID; i += warpSize) - result += values[i] * inVector[columnIndexes[i]]; + for( Index globalIdx = minID + laneID; globalIdx < maxID; globalIdx += warpSize ) + result = reduce( result, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, -1, globalIdx, compute ) ); // fix local idx + //values[i] * inVector[columnIndexes[i]]; /* Parallel reduction */ - result += __shfl_down_sync(0xFFFFFFFF, result, 16); - result += __shfl_down_sync(0xFFFFFFFF, result, 8); - result += __shfl_down_sync(0xFFFFFFFF, result, 4); - result += __shfl_down_sync(0xFFFFFFFF, result, 2); - result += __shfl_down_sync(0xFFFFFFFF, result, 1); - if (laneID == 0) outVector[block.index[0]/* minRow */] = result; // Write result - } else { - /////////////////////////////////////* CSR VECTOR L *///////////// - /* Number of elements processed by previous warps */ - const Index offset = block.index[1]/* warpInRow */ * MAX_ELEM_PER_WARP; - to = minID + (block.index[1]/* warpInRow */ + 1) * MAX_ELEM_PER_WARP; - maxID = rowPointers[block.index[0]/* minRow */ + 1]; - if (to > maxID) to = maxID; - for (i = minID + offset + laneID; i < to; i += warpSize) - result += values[i] * inVector[columnIndexes[i]]; + result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 16 ) ); + result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 8 ) ); + result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 4 ) ); + result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 2 ) ); + result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 1 ) ); + if( laneID == 0 ) + { + printf( "Vector: threadIdx = %d result for segment %d is %f \n", threadIdx, i, result ); + keep( segmentIdx, result ); + 
//outVector[block.index[0]/* minRow */] = result; // Write result + } + }/* + else + { + ///////////////////////////////////// CSR VECTOR L ///////////// + // Number of elements processed by previous warps + const Index offset = block.index[1] * MAX_ELEM_PER_WARP; + to = minID + (block.index[1] + 1) * MAX_ELEM_PER_WARP; + maxID = offsets[block.index[0] + 1]; + if( to > maxID ) + to = maxID; + for( Index globalIdx = minID + offset + laneID; globalIdx < to; globalIdx += warpSize ) + result = reduce( result, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, localIdx, globalIdx, compute ) ); + //result += values[i] * inVector[columnIndexes[i]]; - /* Parallel reduction */ result += __shfl_down_sync(0xFFFFFFFF, result, 16); result += __shfl_down_sync(0xFFFFFFFF, result, 8); result += __shfl_down_sync(0xFFFFFFFF, result, 4); result += __shfl_down_sync(0xFFFFFFFF, result, 2); result += __shfl_down_sync(0xFFFFFFFF, result, 1); - if (laneID == 0) atomicAdd(&outVector[block.index[0]/* minRow */], result); - } + if (laneID == 0) atomicAdd(&outVector[block.index[0] ], result); + }*/ } #endif @@ -146,22 +234,36 @@ template< typename Index, typename Device > struct CSRKernelAdaptiveView { - using IndexType = Index; - using DeviceType = Device; - using ViewType = CSRKernelAdaptiveView< Index, Device >; - using ConstViewType = CSRKernelAdaptiveView< Index, Device >; + using IndexType = Index; + using DeviceType = Device; + using ViewType = CSRKernelAdaptiveView< Index, Device >; + using ConstViewType = CSRKernelAdaptiveView< Index, Device >; + using BlocksType = TNL::Containers::Vector< Block< Index >, Device, Index >; + using BlocksView = typename BlocksType::ViewType; + + CSRKernelAdaptiveView() = default; + + CSRKernelAdaptiveView( BlocksType& blocks ) + { + this->blocks.bind( blocks ); + }; + + void setBlocks( BlocksType& blocks ) + { + this->blocks.bind( blocks ); + } - ViewType getView() { return *this; }; + ViewType getView() { return *this; }; - 
ConstViewType getConstView() const { return *this; }; + ConstViewType getConstView() const { return *this; }; - template< typename OffsetsView, - typename Fetch, - typename Reduction, - typename ResultKeeper, - typename Real, - typename... Args > - void segmentsReduction( const OffsetsView& offsets, + template< typename OffsetsView, + typename Fetch, + typename Reduction, + typename ResultKeeper, + typename Real, + typename... Args > + void segmentsReduction( const OffsetsView& offsets, Index first, Index last, Fetch& fetch, @@ -169,39 +271,103 @@ struct CSRKernelAdaptiveView ResultKeeper& keeper, const Real& zero, Args... args ) const - { + { +#ifdef HAVE_CUDA + if( details::CheckFetchLambda< Index, Fetch >::hasAllParameters() ) + { + TNL::Algorithms::Segments::CSRKernelScalar< Index, Device >:: + segmentsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... ); + return; + } + + this->printBlocks(); + static constexpr Index THREADS_ADAPTIVE = sizeof(Index) == 8 ? 128 : 256; + //static constexpr Index THREADS_SCALAR = 128; + static constexpr Index THREADS_VECTOR = 128; + static constexpr Index THREADS_LIGHT = 128; + + /* Max length of row to process one warp for CSR Light, MultiVector */ + static constexpr Index MAX_ELEMENTS_PER_WARP = 384; + + /* Max length of row to process one warp for CSR Adaptive */ + static constexpr Index MAX_ELEMENTS_PER_WARP_ADAPT = 512; + + /* How many shared memory use per block in CSR Adaptive kernel */ + static constexpr Index SHARED_PER_BLOCK = 24576; + + /* Number of elements in shared memory */ + static constexpr Index SHARED = SHARED_PER_BLOCK/sizeof(Real); + + /* Number of warps in block for CSR Adaptive */ + static constexpr Index WARPS = THREADS_ADAPTIVE / 32; + + /* Number of elements in shared memory per one warp */ + static constexpr Index SHARED_PER_WARP = SHARED / WARPS; + + constexpr int warpSize = 32; + + Index blocksCount; + + const Index threads = THREADS_ADAPTIVE; + constexpr size_t MAX_X_DIM = 
2147483647; + + /* Fill blocks */ + size_t neededThreads = blocks.getSize() * warpSize; // one warp per block + /* Execute kernels on device */ + for (Index gridIdx = 0; neededThreads != 0; gridIdx++ ) + { + if (MAX_X_DIM * threads >= neededThreads) + { + blocksCount = roundUpDivision(neededThreads, threads); + neededThreads = 0; + } + else + { + blocksCount = MAX_X_DIM; + neededThreads -= MAX_X_DIM * threads; + } + + segmentsReductionCSRAdaptiveKernel< + warpSize, + WARPS, + SHARED_PER_WARP, + MAX_ELEMENTS_PER_WARP_ADAPT, + OffsetsView, + Index, Fetch, Reduction, ResultKeeper, Real, Args... > + <<>>( + blocks.getData(), + blocks.getSize() - 1, // last block shouldn't be used + gridIdx, + offsets, + first, + last, + fetch, + reduction, + keeper, + zero, + args... ); + } +#endif + } + + CSRKernelAdaptiveView& operator=( const CSRKernelAdaptiveView< Index, Device >& kernelView ) + { + this->blocks.bind( kernelView.blocks ); + return *this; + } - Index blocks; - const Index threads = matrix.THREADS_ADAPTIVE; - - /* Fill blocks */ - size_t neededThreads = matrix.blocks.getSize() * warpSize; // one warp per block - /* Execute kernels on device */ - for (Index grid = 0; neededThreads != 0; ++grid) { - if (MAX_X_DIM * threads >= neededThreads) { - blocks = roundUpDivision(neededThreads, threads); - neededThreads = 0; - } else { - blocks = MAX_X_DIM; - neededThreads -= MAX_X_DIM * threads; + void printBlocks() const + { + for( Index i = 0; i < this->blocks.getSize(); i++ ) + { + auto block = blocks.getElement( i ); + std::cout << "Block " << i << " : " << block << std::endl; } - SpMVCSRAdaptive< Real, Index, warpSize, - matrix.WARPS, - matrix.SHARED_PER_WARP, - matrix.MAX_ELEMENTS_PER_WARP_ADAPT > - <<>>( - inVector, - outVector, - matrix.getRowPointers().getData(), - matrix.getColumnIndexes().getData(), - matrix.getValues().getData(), - matrix.blocks.getData(), - matrix.blocks.getSize() - 1, // last block shouldn't be used - grid - ); } - } + + protected: + BlocksView 
blocks; }; template< typename Index, @@ -212,6 +378,9 @@ struct CSRKernelAdaptive using DeviceType = Device; using ViewType = CSRKernelAdaptiveView< Index, Device >; using ConstViewType = CSRKernelAdaptiveView< Index, Device >; + using BlocksType = typename ViewType::BlocksType; + using BlocksView = typename BlocksType::ViewType; + static constexpr Index THREADS_ADAPTIVE = sizeof(Index) == 8 ? 128 : 256; @@ -227,33 +396,44 @@ struct CSRKernelAdaptive /* Number of elements in shared memory per one warp */ static constexpr Index SHARED_PER_WARP = SHARED / WARPS; - template< typename Offsets > - Index findLimit(const Index start, + /* Max length of row to process one warp for CSR Light, MultiVector */ + static constexpr Index MAX_ELEMENTS_PER_WARP = 384; + + /* Max length of row to process one warp for CSR Adaptive */ + static constexpr Index MAX_ELEMENTS_PER_WARP_ADAPT = 512; + + template< typename Offsets > + Index findLimit(const Index start, const Offsets& offsets, const Index size, Type &type, - Index &sum) { - sum = 0; - for (Index current = start; current < size - 1; ++current) { - Index elements = offsets.getElement(current + 1) - - offsets.getElement(current); - sum += elements; - if (sum > matrix.SHARED_PER_WARP) { - if (current - start > 0) { // extra row - type = Type::STREAM; - return current; - } else { // one long row - if (sum <= 2 * matrix.MAX_ELEMENTS_PER_WARP_ADAPT) - type = Type::VECTOR; - else - type = Type::LONG; - return current + 1; + Index &sum) + { + sum = 0; + for (Index current = start; current < size - 1; ++current) + { + Index elements = offsets.getElement(current + 1) - + offsets.getElement(current); + sum += elements; + if (sum >SHARED_PER_WARP) + { + if (current - start > 0) + { // extra row + type = Type::STREAM; + return current; } - } - } - - type = Type::STREAM; - return size - 1; // return last row pointer + else + { // one long row + if (sum <= 2 * MAX_ELEMENTS_PER_WARP_ADAPT) + type = Type::VECTOR; + else + type = Type::LONG; + 
return current + 1; + } + } + } + type = Type::STREAM; + return size - 1; // return last row pointer } template< typename Offsets > @@ -269,8 +449,7 @@ struct CSRKernelAdaptive while (nextStart != rows - 1) { Type type; - nextStart = findLimit( - start, *this, rows, type, sum ); + nextStart = findLimit( start, offsets, rows, type, sum ); if (type == Type::LONG) { @@ -284,8 +463,8 @@ struct CSRKernelAdaptive { inBlock.emplace_back(start, type, nextStart, - this->rowPointers.getElement(nextStart), - this->rowPointers.getElement(start) ); + offsets.getElement(nextStart), + offsets.getElement(start) ); } start = nextStart; } @@ -295,19 +474,27 @@ struct CSRKernelAdaptive this->blocks.setSize(inBlock.size()); for (size_t i = 0; i < inBlock.size(); ++i) this->blocks.setElement(i, inBlock[i]); + + this->view.setBlocks( blocks ); }; - ViewType getView() { return view; }; + void reset() + { + this->blocks.reset(); + this->view.setBlocks( blocks ); + } + + ViewType getView() { return this->view; }; - ConstViewType getConstView() const { return ConstViewType(); }; + ConstViewType getConstView() const { return this->view; }; - template< typename OffsetsView, + template< typename OffsetsView, typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args > - void segmentsReduction( const OffsetsView& offsets, + void segmentsReduction( const OffsetsView& offsets, Index first, Index last, Fetch& fetch, @@ -315,11 +502,14 @@ struct CSRKernelAdaptive ResultKeeper& keeper, const Real& zero, Args... args ) const - { - view.segmentsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... ); - } + { + view.segmentsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... 
); + } + + protected: + BlocksType blocks; - ViewType view; + ViewType view; }; diff --git a/src/UnitTests/Matrices/SparseMatrixTest_CSRAdaptive.cpp b/src/UnitTests/Matrices/SparseMatrixTest_CSRAdaptive.cpp index 214ed2ca7..41306c6da 100644 --- a/src/UnitTests/Matrices/SparseMatrixTest_CSRAdaptive.cpp +++ b/src/UnitTests/Matrices/SparseMatrixTest_CSRAdaptive.cpp @@ -1,5 +1,5 @@ /*************************************************************************** - SparseMatrixTest_CSRHybrid.cpp - description + SparseMatrixTest_CSRAdaptive.cpp - description ------------------- begin : Jan 23, 2021 copyright : (C) 2021 by Tomas Oberhuber et al. @@ -8,4 +8,4 @@ /* See Copyright Notice in tnl/Copyright */ -#include "SparseMatrixTest_CSRHybrid.h" +#include "SparseMatrixTest_CSRAdaptive.h" diff --git a/src/UnitTests/Matrices/SparseMatrixTest_CSRAdaptive.cu b/src/UnitTests/Matrices/SparseMatrixTest_CSRAdaptive.cu index c0a0918d7..50a433333 100644 --- a/src/UnitTests/Matrices/SparseMatrixTest_CSRAdaptive.cu +++ b/src/UnitTests/Matrices/SparseMatrixTest_CSRAdaptive.cu @@ -1,5 +1,5 @@ /*************************************************************************** - SparseMatrixTest_CSRHybrid.cu - description + SparseMatrixTest_CSRAdaptive.cu - description ------------------- begin : Jan 23, 2021 copyright : (C) 2021 by Tomas Oberhuber et al. 
@@ -8,4 +8,4 @@ /* See Copyright Notice in tnl/Copyright */ -#include "SparseMatrixTest_CSRHybrid.h" +#include "SparseMatrixTest_CSRAdaptive.h" diff --git a/src/UnitTests/Matrices/SparseMatrixTest_CSRAdaptive.h b/src/UnitTests/Matrices/SparseMatrixTest_CSRAdaptive.h index 24ba77fa0..e67ea5c85 100644 --- a/src/UnitTests/Matrices/SparseMatrixTest_CSRAdaptive.h +++ b/src/UnitTests/Matrices/SparseMatrixTest_CSRAdaptive.h @@ -1,5 +1,5 @@ /*************************************************************************** - SparseMatrixTest_CSRHybrid.h - description + SparseMatrixTest_CSRAdaptive.h - description ------------------- begin : Jan 23, 2021 copyright : (C) 2021 by Tomas Oberhuber et al. @@ -15,28 +15,28 @@ #ifdef HAVE_GTEST #include -const char* saveAndLoadFileName = "test_SparseMatrixTest_CSRHybrid_segments"; +const char* saveAndLoadFileName = "test_SparseMatrixTest_CSRAdaptive_segments"; // types for which MatrixTest is instantiated using MatrixTypes = ::testing::Types < - TNL::Matrices::SparseMatrix< int, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >, - TNL::Matrices::SparseMatrix< long, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >, - TNL::Matrices::SparseMatrix< float, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >, - TNL::Matrices::SparseMatrix< double, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >, - TNL::Matrices::SparseMatrix< int, TNL::Devices::Host, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >, - TNL::Matrices::SparseMatrix< long, TNL::Devices::Host, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >, - TNL::Matrices::SparseMatrix< float, TNL::Devices::Host, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >, - TNL::Matrices::SparseMatrix< double, TNL::Devices::Host, long, 
TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid > + TNL::Matrices::SparseMatrix< int, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >, + TNL::Matrices::SparseMatrix< long, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >, + TNL::Matrices::SparseMatrix< float, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >, + TNL::Matrices::SparseMatrix< double, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >, + TNL::Matrices::SparseMatrix< int, TNL::Devices::Host, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >, + TNL::Matrices::SparseMatrix< long, TNL::Devices::Host, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >, + TNL::Matrices::SparseMatrix< float, TNL::Devices::Host, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >, + TNL::Matrices::SparseMatrix< double, TNL::Devices::Host, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive > #ifdef HAVE_CUDA - ,TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >, - TNL::Matrices::SparseMatrix< long, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >, - TNL::Matrices::SparseMatrix< float, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >, - TNL::Matrices::SparseMatrix< double, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >, - TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >, - TNL::Matrices::SparseMatrix< long, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >, - TNL::Matrices::SparseMatrix< float, 
TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >, - TNL::Matrices::SparseMatrix< double, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid > + ,TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >, + //TNL::Matrices::SparseMatrix< long, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >, + //TNL::Matrices::SparseMatrix< float, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >, + //TNL::Matrices::SparseMatrix< double, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >, + //TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >, + //TNL::Matrices::SparseMatrix< long, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >, + //TNL::Matrices::SparseMatrix< float, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >, + TNL::Matrices::SparseMatrix< double, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive > #endif >; -- GitLab From 0bdbf8bbb3f9f42d82708b06d5ddf67521142ba2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Mon, 25 Jan 2021 12:21:28 +0100 Subject: [PATCH 19/27] Adaptive CSR Stream kernel is working. 
--- .../Algorithms/Segments/CSRKernelAdaptive.h | 26 +++++++++---------- src/TNL/Matrices/SparseMatrixView.hpp | 4 +-- 2 files changed, 14 insertions(+), 16 deletions(-) diff --git a/src/TNL/Algorithms/Segments/CSRKernelAdaptive.h b/src/TNL/Algorithms/Segments/CSRKernelAdaptive.h index 9e247fa6d..65dc595d2 100644 --- a/src/TNL/Algorithms/Segments/CSRKernelAdaptive.h +++ b/src/TNL/Algorithms/Segments/CSRKernelAdaptive.h @@ -145,41 +145,39 @@ segmentsReductionCSRAdaptiveKernel( const Block< Index > *blocks, const Index laneID = threadIdx.x & 31; // & is cheaper than % Block block = blocks[blockIdx]; const Index minID = offsets[block.index[0]/* minRow */]; - Index i, to, maxID; + Index to, maxID; if (block.byte[sizeof(Index) == 4 ? 7 : 15] & 0b1000000) { - /**** - * CSR Stream: Copy first all data into shared memory - */ - const Index warpID = threadIdx.x / 32; maxID = minID + /* maxID - minID */block.twobytes[sizeof(Index) == 4 ? 2 : 4]; /* Stream data to shared memory */ for( Index globalIdx = laneID + minID; globalIdx < maxID; globalIdx += warpSize ) { - shared[warpID][i - minID] = //fetch( globalIdx, compute ); + shared[warpID][globalIdx - minID] = //fetch( globalIdx, compute ); details::FetchLambdaAdapter< Index, Fetch >::call( fetch, -1, -1, globalIdx, compute ); - printf( "Stream: Fetch at %d -> %f \n", globalIdx, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, -1, -1, globalIdx, compute ) ); + //printf( "Stream: Fetch at %d -> %d \n", globalIdx, shared[warpID][globalIdx - minID] ); + //details::FetchLambdaAdapter< Index, Fetch >::call( fetch, -1, -1, globalIdx, compute ) ); // TODO:: fix this - //values[i] * inVector[columnIndexes[i]]; } const Index maxRow = block.index[0]/* minRow */ + /* maxRow - minRow */(block.twobytes[sizeof(Index) == 4 ? 
3 : 5] & 0x3FFF); - /* Calculate result */ - for (i = block.index[0]/* minRow */ + laneID; i < maxRow; i += warpSize) + /// Calculate result + for( Index i = block.index[0]/* minRow */ + laneID; i < maxRow; i += warpSize ) { to = offsets[i + 1] - minID; // end of preprocessed data result = zero; - /* Scalar reduction */ + // Scalar reduction for( Index sharedID = offsets[ i ] - minID; sharedID < to; ++sharedID) + { result = reduce( result, shared[warpID][sharedID] ); + //printf( " threadIdx %d is adding %d in segment %d -> %d\n", threadIdx.x, shared[warpID][sharedID], i, result ); + } - printf( "Stream: threadIdx = %d result for segment %d is %f \n", threadIdx, i, result ); + //printf( "Stream: threadIdx = %d result for segment %d is %d \n", threadIdx.x, i, result ); keep( i, result ); - //outVector[i] = result; // Write result } } else //if (block.byte[sizeof(Index) == 4 ? 7 : 15] & 0b10000000) @@ -201,7 +199,7 @@ segmentsReductionCSRAdaptiveKernel( const Block< Index > *blocks, result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 1 ) ); if( laneID == 0 ) { - printf( "Vector: threadIdx = %d result for segment %d is %f \n", threadIdx, i, result ); + printf( "Vector: threadIdx = %d result for segment %d is %d \n", threadIdx, segmentIdx, result ); keep( segmentIdx, result ); //outVector[block.index[0]/* minRow */] = result; // Write result } diff --git a/src/TNL/Matrices/SparseMatrixView.hpp b/src/TNL/Matrices/SparseMatrixView.hpp index 3be30da64..26217620b 100644 --- a/src/TNL/Matrices/SparseMatrixView.hpp +++ b/src/TNL/Matrices/SparseMatrixView.hpp @@ -383,8 +383,8 @@ vectorProduct( const InVector& inVector, static_assert( ! MatrixType::isSymmetric() || ! 
std::is_same< Device, Devices::Cuda >::value || - ( std::is_same< OutVectorReal, float >::value || - std::is_same< OutVectorReal, double >::value || + ( std::is_same< OutVectorReal, float >::value || + std::is_same< OutVectorReal, double >::value || std::is_same< OutVectorReal, int >::value || std::is_same< OutVectorReal, long long int >::value ), "Given Real type is not supported by atomic operations on GPU which are necessary for symmetric operations." ); -- GitLab From d7e0e1758c29525040ef31a8d2c9c57de933613d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Mon, 25 Jan 2021 17:35:19 +0100 Subject: [PATCH 20/27] Refactoring CSR adaptive kernel. --- .../Algorithms/Segments/CSRKernelAdaptive.h | 201 ++++++++++++------ 1 file changed, 131 insertions(+), 70 deletions(-) diff --git a/src/TNL/Algorithms/Segments/CSRKernelAdaptive.h b/src/TNL/Algorithms/Segments/CSRKernelAdaptive.h index 65dc595d2..980307606 100644 --- a/src/TNL/Algorithms/Segments/CSRKernelAdaptive.h +++ b/src/TNL/Algorithms/Segments/CSRKernelAdaptive.h @@ -28,6 +28,11 @@ enum class Type { VECTOR = 2 }; +/*template< typename Index > +struct LongBlockDescription +{ + uint8_t type; +}*/ template< typename Index > union Block { @@ -55,7 +60,7 @@ union Block Block() = default; - Type getType() const + __cuda_callable__ Type getType() const { if( byte[ sizeof( Index ) == 4 ? 7 : 15 ] & 0b1000000 ) return Type::STREAM; @@ -64,16 +69,27 @@ union Block return Type::LONG; } - Index getFirstRow() const + __cuda_callable__ const Index& getFirstSegment() const { return index[ 0 ]; } - Index getRowsInBlock() const + /*** + * \brief Returns number of elements covered by the block. + */ + __cuda_callable__ const Index getSize() const { return twobytes[ sizeof(Index) == 4 ? 2 : 4 ]; } + /*** + * \brief Returns number of segments covered by the block. + */ + __cuda_callable__ const Index getSegmentsInBlock() const + { + return ( twobytes[ sizeof( Index ) == 4 ? 
3 : 5 ] & 0x3FFF ); + } + void print( std::ostream& str ) const { Type type = this->getType(); @@ -90,8 +106,8 @@ union Block str << " Long "; break; } - str << " first row: " << getFirstRow(); - str << " rows per block: " << getRowsInBlock(); + str << " first segment: " << getFirstSegment(); + str << " block end: " << getSize(); str << " index in warp: " << index[ 1 ]; } Index index[2]; // index[0] is row pointer, index[1] is index in warp @@ -109,10 +125,12 @@ std::ostream& operator<< ( std::ostream& str, const Block< Index >& block ) #ifdef HAVE_CUDA -template< int warpSize, +template< int CudaBlockSize, + int warpSize, int WARPS, int SHARED_PER_WARP, int MAX_ELEM_PER_WARP, + typename BlocksView, typename Offsets, typename Index, typename Fetch, @@ -121,8 +139,7 @@ template< int warpSize, typename Real, typename... Args > __global__ void -segmentsReductionCSRAdaptiveKernel( const Block< Index > *blocks, - Index blocksSize, +segmentsReductionCSRAdaptiveKernel( BlocksView blocks, int gridIdx, Offsets offsets, Index first, @@ -133,46 +150,51 @@ segmentsReductionCSRAdaptiveKernel( const Block< Index > *blocks, Real zero, Args... 
args ) { - __shared__ Real shared[WARPS][SHARED_PER_WARP]; + __shared__ Real streamShared[WARPS][SHARED_PER_WARP]; + __shared__ Real multivectorShared[ CudaBlockSize / warpSize ]; constexpr size_t MAX_X_DIM = 2147483647; const Index index = (gridIdx * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x; const Index blockIdx = index / warpSize; - if (blockIdx >= blocksSize) + if( blockIdx >= blocks.getSize() - 1 ) return; + if( threadIdx.x < CudaBlockSize / warpSize ) + multivectorShared[ threadIdx.x ] = zero; Real result = zero; bool compute( true ); const Index laneID = threadIdx.x & 31; // & is cheaper than % - Block block = blocks[blockIdx]; - const Index minID = offsets[block.index[0]/* minRow */]; - Index to, maxID; + const Block< Index > block = blocks[ blockIdx ]; + const Index& firstSegmentIdx = block.getFirstSegment(); + const Index begin = offsets[ firstSegmentIdx ]; + //Index to, maxID; - if (block.byte[sizeof(Index) == 4 ? 7 : 15] & 0b1000000) + const auto blockType = block.getType(); + if( blockType == Type::STREAM ) { const Index warpID = threadIdx.x / 32; - maxID = minID + /* maxID - minID */block.twobytes[sizeof(Index) == 4 ? 
2 : 4]; + const Index end = begin + block.getSize(); - /* Stream data to shared memory */ - for( Index globalIdx = laneID + minID; globalIdx < maxID; globalIdx += warpSize ) + // Stream data to shared memory + for( Index globalIdx = laneID + begin; globalIdx < end; globalIdx += warpSize ) { - shared[warpID][globalIdx - minID] = //fetch( globalIdx, compute ); + streamShared[warpID][globalIdx - begin ] = //fetch( globalIdx, compute ); details::FetchLambdaAdapter< Index, Fetch >::call( fetch, -1, -1, globalIdx, compute ); - //printf( "Stream: Fetch at %d -> %d \n", globalIdx, shared[warpID][globalIdx - minID] ); - //details::FetchLambdaAdapter< Index, Fetch >::call( fetch, -1, -1, globalIdx, compute ) ); - // TODO:: fix this + // TODO:: fix this by template specialization so that we can assume fetch lambda + // with short parameters } - const Index maxRow = block.index[0]/* minRow */ + - /* maxRow - minRow */(block.twobytes[sizeof(Index) == 4 ? 3 : 5] & 0x3FFF); + const Index maxRow = firstSegmentIdx + block.getSegmentsInBlock(); + /* minRow */ //+ + /* maxRow - minRow *///(block.twobytes[sizeof(Index) == 4 ? 3 : 5] & 0x3FFF); /// Calculate result for( Index i = block.index[0]/* minRow */ + laneID; i < maxRow; i += warpSize ) { - to = offsets[i + 1] - minID; // end of preprocessed data + const Index to = offsets[i + 1] - begin; // end of preprocessed data result = zero; // Scalar reduction - for( Index sharedID = offsets[ i ] - minID; sharedID < to; ++sharedID) + for( Index sharedID = offsets[ i ] - begin; sharedID < to; ++sharedID) { - result = reduce( result, shared[warpID][sharedID] ); + result = reduce( result, streamShared[warpID][sharedID] ); //printf( " threadIdx %d is adding %d in segment %d -> %d\n", threadIdx.x, shared[warpID][sharedID], i, result ); } @@ -180,16 +202,15 @@ segmentsReductionCSRAdaptiveKernel( const Block< Index > *blocks, keep( i, result ); } } - else //if (block.byte[sizeof(Index) == 4 ? 
7 : 15] & 0b10000000) + else if( blockType == Type::VECTOR ) { - printf( "Vector: threadIdx = %d \n", threadIdx ); + //printf( "Vector: threadIdx = %d \n", threadIdx ); /////////////////////////////////////* CSR VECTOR *////////////// - maxID = minID + /* maxID - minID */block.twobytes[sizeof(Index) == 4 ? 2 : 4]; + const Index end = begin + block.getSize(); //block.twobytes[sizeof(Index) == 4 ? 2 : 4]; const Index segmentIdx = block.index[0]; - for( Index globalIdx = minID + laneID; globalIdx < maxID; globalIdx += warpSize ) + for( Index globalIdx = begin + laneID; globalIdx < end; globalIdx += warpSize ) result = reduce( result, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, -1, globalIdx, compute ) ); // fix local idx - //values[i] * inVector[columnIndexes[i]]; /* Parallel reduction */ result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 16 ) ); @@ -199,31 +220,65 @@ segmentsReductionCSRAdaptiveKernel( const Block< Index > *blocks, result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 1 ) ); if( laneID == 0 ) { - printf( "Vector: threadIdx = %d result for segment %d is %d \n", threadIdx, segmentIdx, result ); + //printf( "Vector: threadIdx = %d result for segment %d is %f \n", threadIdx, segmentIdx, result ); keep( segmentIdx, result ); //outVector[block.index[0]/* minRow */] = result; // Write result } - }/* - else + } + else // blockType == Type::LONG { ///////////////////////////////////// CSR VECTOR L ///////////// // Number of elements processed by previous warps const Index offset = block.index[1] * MAX_ELEM_PER_WARP; - to = minID + (block.index[1] + 1) * MAX_ELEM_PER_WARP; - maxID = offsets[block.index[0] + 1]; - if( to > maxID ) - to = maxID; - for( Index globalIdx = minID + offset + laneID; globalIdx < to; globalIdx += warpSize ) - result = reduce( result, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, localIdx, globalIdx, compute ) ); + Index to = begin + (block.index[1] + 1) * 
MAX_ELEM_PER_WARP; + const Index segmentIdx = block.index[0]; + //minID = offsets[block.index[0] ]; + const Index end = offsets[block.index[0] + 1]; + const int tid = threadIdx.x; + + if( to > end ) + to = end; + result = zero; + //printf( "tid %d : start = %d \n", tid, minID + laneID ); + for( Index globalIdx = begin + laneID + offset; globalIdx < to; globalIdx += warpSize ) + { + result = reduce( result, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, -1, globalIdx, compute ) ); + //printf( "tid %d -> %d \n", tid, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, -1, globalIdx, compute ) ); //result += values[i] * inVector[columnIndexes[i]]; + } + result += __shfl_down_sync(0xFFFFFFFF, result, 16); result += __shfl_down_sync(0xFFFFFFFF, result, 8); result += __shfl_down_sync(0xFFFFFFFF, result, 4); result += __shfl_down_sync(0xFFFFFFFF, result, 2); result += __shfl_down_sync(0xFFFFFFFF, result, 1); - if (laneID == 0) atomicAdd(&outVector[block.index[0] ], result); - }*/ + const Index warpID = threadIdx.x / 32; + if( laneID == 0 ) + multivectorShared[ warpID ] = result; + __syncthreads(); + // Reduction in multivectorShared + if( tid < 16 ) + { + multivectorShared[ tid ] = reduce( multivectorShared[ tid ], multivectorShared[ tid + 16 ] ); + __syncwarp(); + multivectorShared[ tid ] = reduce( multivectorShared[ tid ], multivectorShared[ tid + 8 ] ); + __syncwarp(); + multivectorShared[ tid ] = reduce( multivectorShared[ tid ], multivectorShared[ tid + 4 ] ); + __syncwarp(); + multivectorShared[ tid ] = reduce( multivectorShared[ tid ], multivectorShared[ tid + 2 ] ); + __syncwarp(); + multivectorShared[ tid ] = reduce( multivectorShared[ tid ], multivectorShared[ tid + 1 ] ); + __syncwarp(); + if( tid == 0 ) + { + printf( "Long: segmentIdx %d -> %d \n", segmentIdx, multivectorShared[ 0 ] ); + keep( segmentIdx, multivectorShared[ 0 ] ); + } + } + + //if (laneID == 0) atomicAdd(&outVector[block.index[0] ], result); + } } 
#endif @@ -278,10 +333,10 @@ struct CSRKernelAdaptiveView return; } - this->printBlocks(); + //this->printBlocks(); static constexpr Index THREADS_ADAPTIVE = sizeof(Index) == 8 ? 128 : 256; //static constexpr Index THREADS_SCALAR = 128; - static constexpr Index THREADS_VECTOR = 128; + //static constexpr Index THREADS_VECTOR = 128; static constexpr Index THREADS_LIGHT = 128; /* Max length of row to process one warp for CSR Light, MultiVector */ @@ -310,7 +365,7 @@ struct CSRKernelAdaptiveView constexpr size_t MAX_X_DIM = 2147483647; /* Fill blocks */ - size_t neededThreads = blocks.getSize() * warpSize; // one warp per block + size_t neededThreads = this->blocks.getSize() * warpSize; // one warp per block /* Execute kernels on device */ for (Index gridIdx = 0; neededThreads != 0; gridIdx++ ) { @@ -326,15 +381,16 @@ struct CSRKernelAdaptiveView } segmentsReductionCSRAdaptiveKernel< + THREADS_ADAPTIVE, warpSize, WARPS, SHARED_PER_WARP, MAX_ELEMENTS_PER_WARP_ADAPT, + BlocksView, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real, Args... 
> <<>>( - blocks.getData(), - blocks.getSize() - 1, // last block shouldn't be used + this->blocks, gridIdx, offsets, first, @@ -401,31 +457,32 @@ struct CSRKernelAdaptive static constexpr Index MAX_ELEMENTS_PER_WARP_ADAPT = 512; template< typename Offsets > - Index findLimit(const Index start, - const Offsets& offsets, - const Index size, - Type &type, - Index &sum) + Index findLimit( const Index start, + const Offsets& offsets, + const Index size, + Type &type, + Index &sum ) { sum = 0; - for (Index current = start; current < size - 1; ++current) + for (Index current = start; current < size - 1; current++ ) { Index elements = offsets.getElement(current + 1) - offsets.getElement(current); sum += elements; - if (sum >SHARED_PER_WARP) + if( sum > SHARED_PER_WARP ) { - if (current - start > 0) - { // extra row + if( current - start > 0 ) // extra row + { type = Type::STREAM; return current; } else { // one long row - if (sum <= 2 * MAX_ELEMENTS_PER_WARP_ADAPT) - type = Type::VECTOR; + if( sum <= 2 * MAX_ELEMENTS_PER_WARP_ADAPT ) + type = Type::VECTOR; else - type = Type::LONG; + type = Type::VECTOR; // TODO: Put LONG back + //type = Type::LONG; // return current + 1; } } @@ -438,28 +495,32 @@ struct CSRKernelAdaptive void init( const Offsets& offsets ) { const Index rows = offsets.getSize(); - Index sum, start = 0, nextStart = 0; + Index sum, start( 0 ), nextStart( 0 ); // Fill blocks - std::vector> inBlock; - inBlock.reserve(rows); + std::vector< Block< Index > > inBlock; + inBlock.reserve( rows ); - while (nextStart != rows - 1) + while( nextStart != rows - 1 ) { Type type; nextStart = findLimit( start, offsets, rows, type, sum ); - if (type == Type::LONG) + if( type == Type::LONG ) { - Index parts = roundUpDivision(sum, this->SHARED_PER_WARP); - for (Index index = 0; index < parts; ++index) - { - inBlock.emplace_back(start, Type::LONG, index); - } + inBlock.emplace_back( start, Type::LONG, 0 ); + const Index blocksCount = inBlock.size(); + const Index 
warpsPerCudaBlock = THREADS_ADAPTIVE / TNL::Cuda::getWarpSize(); + const Index warpsLeft = roundUpDivision( blocksCount, warpsPerCudaBlock ) * warpsPerCudaBlock - blocksCount; + //Index parts = roundUpDivision(sum, this->SHARED_PER_WARP); + /*for( Index index = 1; index < warpsLeft; index++ ) + { + inBlock.emplace_back(start, Type::LONG, index); + }*/ } else { - inBlock.emplace_back(start, type, + inBlock.emplace_back(start, type, nextStart, offsets.getElement(nextStart), offsets.getElement(start) ); -- GitLab From 737153e2d2b7390b92846303ddb4a53671e68ac1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Mon, 25 Jan 2021 18:07:51 +0100 Subject: [PATCH 21/27] Refactoring CSR adaptive kernel. --- .../Algorithms/Segments/CSRKernelAdaptive.h | 61 +++++++------------ 1 file changed, 22 insertions(+), 39 deletions(-) diff --git a/src/TNL/Algorithms/Segments/CSRKernelAdaptive.h b/src/TNL/Algorithms/Segments/CSRKernelAdaptive.h index 980307606..5f81828f7 100644 --- a/src/TNL/Algorithms/Segments/CSRKernelAdaptive.h +++ b/src/TNL/Algorithms/Segments/CSRKernelAdaptive.h @@ -150,7 +150,7 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks, Real zero, Args... 
args ) { - __shared__ Real streamShared[WARPS][SHARED_PER_WARP]; + __shared__ Real streamShared[ WARPS ][ SHARED_PER_WARP ]; __shared__ Real multivectorShared[ CudaBlockSize / warpSize ]; constexpr size_t MAX_X_DIM = 2147483647; const Index index = (gridIdx * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x; @@ -162,72 +162,57 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks, multivectorShared[ threadIdx.x ] = zero; Real result = zero; bool compute( true ); - const Index laneID = threadIdx.x & 31; // & is cheaper than % + const Index laneIdx = threadIdx.x & 31; // & is cheaper than % const Block< Index > block = blocks[ blockIdx ]; const Index& firstSegmentIdx = block.getFirstSegment(); const Index begin = offsets[ firstSegmentIdx ]; - //Index to, maxID; const auto blockType = block.getType(); - if( blockType == Type::STREAM ) + if( blockType == Type::STREAM ) // Stream kernel - many short segments per warp { - const Index warpID = threadIdx.x / 32; + const Index warpIdx = threadIdx.x / 32; const Index end = begin + block.getSize(); // Stream data to shared memory - for( Index globalIdx = laneID + begin; globalIdx < end; globalIdx += warpSize ) + for( Index globalIdx = laneIdx + begin; globalIdx < end; globalIdx += warpSize ) { - streamShared[warpID][globalIdx - begin ] = //fetch( globalIdx, compute ); + streamShared[ warpIdx ][ globalIdx - begin ] = //fetch( globalIdx, compute ); details::FetchLambdaAdapter< Index, Fetch >::call( fetch, -1, -1, globalIdx, compute ); // TODO:: fix this by template specialization so that we can assume fetch lambda // with short parameters } - const Index maxRow = firstSegmentIdx + block.getSegmentsInBlock(); - /* minRow */ //+ - /* maxRow - minRow *///(block.twobytes[sizeof(Index) == 4 ? 
3 : 5] & 0x3FFF); - /// Calculate result - for( Index i = block.index[0]/* minRow */ + laneID; i < maxRow; i += warpSize ) + const Index lastSegmentIdx = firstSegmentIdx + block.getSegmentsInBlock(); + + for( Index i = firstSegmentIdx + laneIdx; i < lastSegmentIdx; i += warpSize ) { - const Index to = offsets[i + 1] - begin; // end of preprocessed data + const Index sharedEnd = offsets[ i + 1 ] - begin; // end of preprocessed data result = zero; // Scalar reduction - for( Index sharedID = offsets[ i ] - begin; sharedID < to; ++sharedID) - { - result = reduce( result, streamShared[warpID][sharedID] ); - //printf( " threadIdx %d is adding %d in segment %d -> %d\n", threadIdx.x, shared[warpID][sharedID], i, result ); - } - - //printf( "Stream: threadIdx = %d result for segment %d is %d \n", threadIdx.x, i, result ); + for( Index sharedIdx = offsets[ i ] - begin; sharedIdx < sharedEnd; sharedIdx++ ) + result = reduce( result, streamShared[ warpIdx ][ sharedIdx ] ); keep( i, result ); } } - else if( blockType == Type::VECTOR ) + else if( blockType == Type::VECTOR ) // Vector kernel - one segment per warp { - //printf( "Vector: threadIdx = %d \n", threadIdx ); - /////////////////////////////////////* CSR VECTOR *////////////// - const Index end = begin + block.getSize(); //block.twobytes[sizeof(Index) == 4 ? 
2 : 4]; - const Index segmentIdx = block.index[0]; + const Index end = begin + block.getSize(); + const Index segmentIdx = block.getFirstSegment(); - for( Index globalIdx = begin + laneID; globalIdx < end; globalIdx += warpSize ) + for( Index globalIdx = begin + laneIdx; globalIdx < end; globalIdx += warpSize ) result = reduce( result, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, -1, globalIdx, compute ) ); // fix local idx - /* Parallel reduction */ + // Parallel reduction result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 16 ) ); result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 8 ) ); result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 4 ) ); result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 2 ) ); result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 1 ) ); - if( laneID == 0 ) - { - //printf( "Vector: threadIdx = %d result for segment %d is %f \n", threadIdx, segmentIdx, result ); + if( laneIdx == 0 ) keep( segmentIdx, result ); - //outVector[block.index[0]/* minRow */] = result; // Write result - } } - else // blockType == Type::LONG + else // blockType == Type::LONG - several warps per segment { - ///////////////////////////////////// CSR VECTOR L ///////////// // Number of elements processed by previous warps const Index offset = block.index[1] * MAX_ELEM_PER_WARP; Index to = begin + (block.index[1] + 1) * MAX_ELEM_PER_WARP; @@ -235,12 +220,12 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks, //minID = offsets[block.index[0] ]; const Index end = offsets[block.index[0] + 1]; const int tid = threadIdx.x; - + if( to > end ) to = end; result = zero; //printf( "tid %d : start = %d \n", tid, minID + laneID ); - for( Index globalIdx = begin + laneID + offset; globalIdx < to; globalIdx += warpSize ) + for( Index globalIdx = begin + laneIdx + offset; globalIdx < to; globalIdx += warpSize ) { result = reduce( result, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, 
segmentIdx, -1, globalIdx, compute ) ); //printf( "tid %d -> %d \n", tid, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, -1, globalIdx, compute ) ); @@ -254,7 +239,7 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks, result += __shfl_down_sync(0xFFFFFFFF, result, 2); result += __shfl_down_sync(0xFFFFFFFF, result, 1); const Index warpID = threadIdx.x / 32; - if( laneID == 0 ) + if( laneIdx == 0 ) multivectorShared[ warpID ] = result; __syncthreads(); // Reduction in multivectorShared @@ -276,8 +261,6 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks, keep( segmentIdx, multivectorShared[ 0 ] ); } } - - //if (laneID == 0) atomicAdd(&outVector[block.index[0] ], result); } } #endif -- GitLab From dcd87dec99d14ebfaa3a7c8933913ed1b0530a18 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Mon, 25 Jan 2021 20:31:57 +0100 Subject: [PATCH 22/27] Refactoring Adaptive CSR kernel. --- .../Algorithms/Segments/CSRKernelAdaptive.h | 135 +++--------------- .../CSRAdaptiveKernelBlockDescriptor.h | 118 +++++++++++++++ 2 files changed, 134 insertions(+), 119 deletions(-) create mode 100644 src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h diff --git a/src/TNL/Algorithms/Segments/CSRKernelAdaptive.h b/src/TNL/Algorithms/Segments/CSRKernelAdaptive.h index 5f81828f7..bfd8f55f7 100644 --- a/src/TNL/Algorithms/Segments/CSRKernelAdaptive.h +++ b/src/TNL/Algorithms/Segments/CSRKernelAdaptive.h @@ -16,113 +16,12 @@ #include #include #include +#include namespace TNL { namespace Algorithms { namespace Segments { -enum class Type { - /* LONG = 0!!! Non zero value rewrites index[1] */ - LONG = 0, - STREAM = 1, - VECTOR = 2 -}; - -/*template< typename Index > -struct LongBlockDescription -{ - uint8_t type; -}*/ -template< typename Index > -union Block -{ - Block(Index row, Type type = Type::VECTOR, Index index = 0) noexcept - { - this->index[0] = row; - this->index[1] = index; - this->byte[sizeof(Index) == 4 ? 
7 : 15] = (uint8_t)type; - } - - Block(Index row, Type type, Index nextRow, Index maxID, Index minID) noexcept - { - this->index[0] = row; - this->index[1] = 0; - this->twobytes[sizeof(Index) == 4 ? 2 : 4] = maxID - minID; - - if (type == Type::STREAM) - this->twobytes[sizeof(Index) == 4 ? 3 : 5] = nextRow - row; - - if (type == Type::STREAM) - this->byte[sizeof(Index) == 4 ? 7 : 15] |= 0b1000000; - else if (type == Type::VECTOR) - this->byte[sizeof(Index) == 4 ? 7 : 15] |= 0b10000000; - } - - Block() = default; - - __cuda_callable__ Type getType() const - { - if( byte[ sizeof( Index ) == 4 ? 7 : 15 ] & 0b1000000 ) - return Type::STREAM; - if( byte[ sizeof( Index ) == 4 ? 7 : 15 ] & 0b10000000 ) - return Type::VECTOR; - return Type::LONG; - } - - __cuda_callable__ const Index& getFirstSegment() const - { - return index[ 0 ]; - } - - /*** - * \brief Returns number of elements covered by the block. - */ - __cuda_callable__ const Index getSize() const - { - return twobytes[ sizeof(Index) == 4 ? 2 : 4 ]; - } - - /*** - * \brief Returns number of segments covered by the block. - */ - __cuda_callable__ const Index getSegmentsInBlock() const - { - return ( twobytes[ sizeof( Index ) == 4 ? 3 : 5 ] & 0x3FFF ); - } - - void print( std::ostream& str ) const - { - Type type = this->getType(); - str << "Type: "; - switch( type ) - { - case Type::STREAM: - str << " Stream "; - break; - case Type::VECTOR: - str << " Vector "; - break; - case Type::LONG: - str << " Long "; - break; - } - str << " first segment: " << getFirstSegment(); - str << " block end: " << getSize(); - str << " index in warp: " << index[ 1 ]; - } - Index index[2]; // index[0] is row pointer, index[1] is index in warp - uint8_t byte[sizeof(Index) == 4 ? 8 : 16]; // byte[7/15] is type specificator - uint16_t twobytes[sizeof(Index) == 4 ? 
4 : 8]; //twobytes[2/4] is maxID - minID - //twobytes[3/5] is nextRow - row -}; - -template< typename Index > -std::ostream& operator<< ( std::ostream& str, const Block< Index >& block ) -{ - block.print( str ); - return str; -} - #ifdef HAVE_CUDA template< int CudaBlockSize, @@ -163,12 +62,12 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks, Real result = zero; bool compute( true ); const Index laneIdx = threadIdx.x & 31; // & is cheaper than % - const Block< Index > block = blocks[ blockIdx ]; + const details::CSRAdaptiveKernelBlockDescriptor< Index > block = blocks[ blockIdx ]; const Index& firstSegmentIdx = block.getFirstSegment(); const Index begin = offsets[ firstSegmentIdx ]; const auto blockType = block.getType(); - if( blockType == Type::STREAM ) // Stream kernel - many short segments per warp + if( blockType == details::Type::STREAM ) // Stream kernel - many short segments per warp { const Index warpIdx = threadIdx.x / 32; const Index end = begin + block.getSize(); @@ -194,7 +93,7 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks, keep( i, result ); } } - else if( blockType == Type::VECTOR ) // Vector kernel - one segment per warp + else if( blockType == details::Type::VECTOR ) // Vector kernel - one segment per warp { const Index end = begin + block.getSize(); const Index segmentIdx = block.getFirstSegment(); @@ -274,7 +173,7 @@ struct CSRKernelAdaptiveView using DeviceType = Device; using ViewType = CSRKernelAdaptiveView< Index, Device >; using ConstViewType = CSRKernelAdaptiveView< Index, Device >; - using BlocksType = TNL::Containers::Vector< Block< Index >, Device, Index >; + using BlocksType = TNL::Containers::Vector< details::CSRAdaptiveKernelBlockDescriptor< Index >, Device, Index >; using BlocksView = typename BlocksType::ViewType; CSRKernelAdaptiveView() = default; @@ -320,10 +219,10 @@ struct CSRKernelAdaptiveView static constexpr Index THREADS_ADAPTIVE = sizeof(Index) == 8 ? 
128 : 256; //static constexpr Index THREADS_SCALAR = 128; //static constexpr Index THREADS_VECTOR = 128; - static constexpr Index THREADS_LIGHT = 128; + //static constexpr Index THREADS_LIGHT = 128; /* Max length of row to process one warp for CSR Light, MultiVector */ - static constexpr Index MAX_ELEMENTS_PER_WARP = 384; + //static constexpr Index MAX_ELEMENTS_PER_WARP = 384; /* Max length of row to process one warp for CSR Adaptive */ static constexpr Index MAX_ELEMENTS_PER_WARP_ADAPT = 512; @@ -443,7 +342,7 @@ struct CSRKernelAdaptive Index findLimit( const Index start, const Offsets& offsets, const Index size, - Type &type, + details::Type &type, Index &sum ) { sum = 0; @@ -456,21 +355,21 @@ struct CSRKernelAdaptive { if( current - start > 0 ) // extra row { - type = Type::STREAM; + type = details::Type::STREAM; return current; } else { // one long row if( sum <= 2 * MAX_ELEMENTS_PER_WARP_ADAPT ) - type = Type::VECTOR; + type = details::Type::VECTOR; else - type = Type::VECTOR; // TODO: Put LONG back + type = details::Type::VECTOR; // TODO: Put LONG back //type = Type::LONG; // return current + 1; } } } - type = Type::STREAM; + type = details::Type::STREAM; return size - 1; // return last row pointer } @@ -481,17 +380,17 @@ struct CSRKernelAdaptive Index sum, start( 0 ), nextStart( 0 ); // Fill blocks - std::vector< Block< Index > > inBlock; + std::vector< details::CSRAdaptiveKernelBlockDescriptor< Index > > inBlock; inBlock.reserve( rows ); while( nextStart != rows - 1 ) { - Type type; + details::Type type; nextStart = findLimit( start, offsets, rows, type, sum ); - if( type == Type::LONG ) + if( type == details::Type::LONG ) { - inBlock.emplace_back( start, Type::LONG, 0 ); + inBlock.emplace_back( start, details::Type::LONG, 0 ); const Index blocksCount = inBlock.size(); const Index warpsPerCudaBlock = THREADS_ADAPTIVE / TNL::Cuda::getWarpSize(); const Index warpsLeft = roundUpDivision( blocksCount, warpsPerCudaBlock ) * warpsPerCudaBlock - blocksCount; @@ 
-554,8 +453,6 @@ struct CSRKernelAdaptive ViewType view; }; - - } // namespace Segments } // namespace Algorithms } // namespace TNL diff --git a/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h b/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h new file mode 100644 index 000000000..255d77fbd --- /dev/null +++ b/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h @@ -0,0 +1,118 @@ +/*************************************************************************** + CSRAdaptiveKernelBlockDescriptor.h - description + ------------------- + begin : Jan 25, 2021 -> Joe Biden inauguration + copyright : (C) 2021 by Tomas Oberhuber + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +#pragma once + +namespace TNL { + namespace Algorithms { + namespace Segments { + namespace details { + +enum class Type { + /* LONG = 0!!! Non zero value rewrites index[1] */ + LONG = 0, + STREAM = 1, + VECTOR = 2 +}; + + +template< typename Index > +union CSRAdaptiveKernelBlockDescriptor +{ + CSRAdaptiveKernelBlockDescriptor(Index row, Type type = Type::VECTOR, Index index = 0) noexcept + { + this->index[0] = row; + this->index[1] = index; + this->byte[sizeof(Index) == 4 ? 7 : 15] = (uint8_t)type; + } + + CSRAdaptiveKernelBlockDescriptor(Index row, Type type, Index nextRow, Index maxID, Index minID) noexcept + { + this->index[0] = row; + this->index[1] = 0; + this->twobytes[sizeof(Index) == 4 ? 2 : 4] = maxID - minID; + + if (type == Type::STREAM) + this->twobytes[sizeof(Index) == 4 ? 3 : 5] = nextRow - row; + + if (type == Type::STREAM) + this->byte[sizeof(Index) == 4 ? 7 : 15] |= 0b1000000; + else if (type == Type::VECTOR) + this->byte[sizeof(Index) == 4 ? 7 : 15] |= 0b10000000; + } + + CSRAdaptiveKernelBlockDescriptor() = default; + + __cuda_callable__ Type getType() const + { + if( byte[ sizeof( Index ) == 4 ? 
7 : 15 ] & 0b1000000 ) + return Type::STREAM; + if( byte[ sizeof( Index ) == 4 ? 7 : 15 ] & 0b10000000 ) + return Type::VECTOR; + return Type::LONG; + } + + __cuda_callable__ const Index& getFirstSegment() const + { + return index[ 0 ]; + } + + /*** + * \brief Returns number of elements covered by the block. + */ + __cuda_callable__ const Index getSize() const + { + return twobytes[ sizeof(Index) == 4 ? 2 : 4 ]; + } + + /*** + * \brief Returns number of segments covered by the block. + */ + __cuda_callable__ const Index getSegmentsInBlock() const + { + return ( twobytes[ sizeof( Index ) == 4 ? 3 : 5 ] & 0x3FFF ); + } + + void print( std::ostream& str ) const + { + Type type = this->getType(); + str << "Type: "; + switch( type ) + { + case Type::STREAM: + str << " Stream "; + break; + case Type::VECTOR: + str << " Vector "; + break; + case Type::LONG: + str << " Long "; + break; + } + str << " first segment: " << getFirstSegment(); + str << " block end: " << getSize(); + str << " index in warp: " << index[ 1 ]; + } + Index index[2]; // index[0] is row pointer, index[1] is index in warp + uint8_t byte[sizeof(Index) == 4 ? 8 : 16]; // byte[7/15] is type specificator + uint16_t twobytes[sizeof(Index) == 4 ? 4 : 8]; //twobytes[2/4] is maxID - minID + //twobytes[3/5] is nextRow - row +}; + +template< typename Index > +std::ostream& operator<< ( std::ostream& str, const CSRAdaptiveKernelBlockDescriptor< Index >& block ) +{ + block.print( str ); + return str; +} + } // namespace details + } // namespace Segments + } // namespace Algorithms +} // namespace TNL \ No newline at end of file -- GitLab From 856bac74cbfab55e8c0c736ec4a0c165a992c3ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Mon, 25 Jan 2021 21:15:16 +0100 Subject: [PATCH 23/27] Refactoring Adaptive CSR kernel. 
--- .../CSRAdaptiveKernelBlockDescriptor.h | 86 +++++++++++++++++++ 1 file changed, 86 insertions(+) diff --git a/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h b/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h index 255d77fbd..20bf91dbb 100644 --- a/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h +++ b/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h @@ -22,6 +22,90 @@ enum class Type { VECTOR = 2 }; +#ifdef CSR_ADAPTIVE_UNION +template< typename Index > +union CSRAdaptiveKernelBlockDescriptor +{ + CSRAdaptiveKernelBlockDescriptor(Index row, Type type = Type::VECTOR, Index index = 0) noexcept + { + this->index[0] = row; + this->index[1] = index; + this->byte[sizeof(Index) == 4 ? 7 : 15] = (uint8_t)type; + } + + CSRAdaptiveKernelBlockDescriptor(Index row, Type type, Index nextRow, Index maxID, Index minID) noexcept + { + this->index[0] = row; + this->index[1] = 0; + this->twobytes[sizeof(Index) == 4 ? 2 : 4] = maxID - minID; + + if (type == Type::STREAM) + this->twobytes[sizeof(Index) == 4 ? 3 : 5] = nextRow - row; + + if (type == Type::STREAM) + this->byte[sizeof(Index) == 4 ? 7 : 15] |= 0b1000000; + else if (type == Type::VECTOR) + this->byte[sizeof(Index) == 4 ? 7 : 15] |= 0b10000000; + } + + CSRAdaptiveKernelBlockDescriptor() = default; + + __cuda_callable__ Type getType() const + { + if( byte[ sizeof( Index ) == 4 ? 7 : 15 ] & 0b1000000 ) + return Type::STREAM; + if( byte[ sizeof( Index ) == 4 ? 7 : 15 ] & 0b10000000 ) + return Type::VECTOR; + return Type::LONG; + } + + __cuda_callable__ const Index& getFirstSegment() const + { + return index[ 0 ]; + } + + /*** + * \brief Returns number of elements covered by the block. + */ + __cuda_callable__ const Index getSize() const + { + return twobytes[ sizeof(Index) == 4 ? 2 : 4 ]; + } + + /*** + * \brief Returns number of segments covered by the block. 
+ */ + __cuda_callable__ const Index getSegmentsInBlock() const + { + return ( twobytes[ sizeof( Index ) == 4 ? 3 : 5 ] & 0x3FFF ); + } + + void print( std::ostream& str ) const + { + Type type = this->getType(); + str << "Type: "; + switch( type ) + { + case Type::STREAM: + str << " Stream "; + break; + case Type::VECTOR: + str << " Vector "; + break; + case Type::LONG: + str << " Long "; + break; + } + str << " first segment: " << getFirstSegment(); + str << " block end: " << getSize(); + str << " index in warp: " << index[ 1 ]; + } + Index index[2]; // index[0] is row pointer, index[1] is index in warp + uint8_t byte[sizeof(Index) == 4 ? 8 : 16]; // byte[7/15] is type specificator + uint16_t twobytes[sizeof(Index) == 4 ? 4 : 8]; //twobytes[2/4] is maxID - minID + //twobytes[3/5] is nextRow - row +}; +#else template< typename Index > union CSRAdaptiveKernelBlockDescriptor @@ -106,6 +190,8 @@ union CSRAdaptiveKernelBlockDescriptor //twobytes[3/5] is nextRow - row }; +#endif + template< typename Index > std::ostream& operator<< ( std::ostream& str, const CSRAdaptiveKernelBlockDescriptor< Index >& block ) { -- GitLab From cadfb88acd2bf8a250adcace199cb9e70d8eb247 Mon Sep 17 00:00:00 2001 From: Tomas Oberhuber Date: Tue, 26 Jan 2021 14:09:54 +0100 Subject: [PATCH 24/27] Added new CSR adaptive kernel block descriptor. 
--- .../Algorithms/Segments/CSRKernelAdaptive.h | 19 ++--- .../CSRAdaptiveKernelBlockDescriptor.h | 72 +++++++++++++------ 2 files changed, 61 insertions(+), 30 deletions(-) diff --git a/src/TNL/Algorithms/Segments/CSRKernelAdaptive.h b/src/TNL/Algorithms/Segments/CSRKernelAdaptive.h index bfd8f55f7..b56129a05 100644 --- a/src/TNL/Algorithms/Segments/CSRKernelAdaptive.h +++ b/src/TNL/Algorithms/Segments/CSRKernelAdaptive.h @@ -113,11 +113,12 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks, else // blockType == Type::LONG - several warps per segment { // Number of elements processed by previous warps - const Index offset = block.index[1] * MAX_ELEM_PER_WARP; - Index to = begin + (block.index[1] + 1) * MAX_ELEM_PER_WARP; - const Index segmentIdx = block.index[0]; + const Index offset = //block.index[1] * MAX_ELEM_PER_WARP; + block.getWarpIdx() * MAX_ELEM_PER_WARP; + Index to = begin + (block.getWarpIdx() + 1) * MAX_ELEM_PER_WARP; + const Index segmentIdx = block.getFirstSegment();//block.index[0]; //minID = offsets[block.index[0] ]; - const Index end = offsets[block.index[0] + 1]; + const Index end = offsets[segmentIdx + 1]; const int tid = threadIdx.x; if( to > end ) @@ -215,7 +216,7 @@ struct CSRKernelAdaptiveView return; } - //this->printBlocks(); + this->printBlocks(); static constexpr Index THREADS_ADAPTIVE = sizeof(Index) == 8 ? 
128 : 256; //static constexpr Index THREADS_SCALAR = 128; //static constexpr Index THREADS_VECTOR = 128; @@ -390,15 +391,15 @@ struct CSRKernelAdaptive if( type == details::Type::LONG ) { - inBlock.emplace_back( start, details::Type::LONG, 0 ); const Index blocksCount = inBlock.size(); const Index warpsPerCudaBlock = THREADS_ADAPTIVE / TNL::Cuda::getWarpSize(); const Index warpsLeft = roundUpDivision( blocksCount, warpsPerCudaBlock ) * warpsPerCudaBlock - blocksCount; //Index parts = roundUpDivision(sum, this->SHARED_PER_WARP); - /*for( Index index = 1; index < warpsLeft; index++ ) + inBlock.emplace_back( start, details::Type::LONG, 0, warpsLeft ); + for( Index index = 1; index < warpsLeft; index++ ) { - inBlock.emplace_back(start, Type::LONG, index); - }*/ + inBlock.emplace_back( start, details::Type::LONG, index, warpsLeft ); + } } else { diff --git a/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h b/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h index 20bf91dbb..40bc8e6f9 100644 --- a/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h +++ b/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h @@ -108,18 +108,35 @@ union CSRAdaptiveKernelBlockDescriptor #else template< typename Index > -union CSRAdaptiveKernelBlockDescriptor +struct CSRAdaptiveKernelBlockDescriptor { - CSRAdaptiveKernelBlockDescriptor(Index row, Type type = Type::VECTOR, Index index = 0) noexcept + CSRAdaptiveKernelBlockDescriptor( Index firstSegmentIdx, + Type type = Type::VECTOR, + uint8_t warpIdx = 0, + uint8_t warpsCount = 0 ) noexcept { - this->index[0] = row; + this->firstSegmentIdx = firstSegmentIdx; + this->type = ( uint8_t ) type; + this->warpIdx = warpIdx; + this->warpsCount = warpsCount; + /*this->index[0] = row; this->index[1] = index; - this->byte[sizeof(Index) == 4 ? 7 : 15] = (uint8_t)type; + this->byte[sizeof(Index) == 4 ? 
7 : 15] = (uint8_t)type;*/ } - CSRAdaptiveKernelBlockDescriptor(Index row, Type type, Index nextRow, Index maxID, Index minID) noexcept + CSRAdaptiveKernelBlockDescriptor( Index firstSegmentIdx, + Type type, + Index lastSegmentIdx, + Index end, + Index begin ) noexcept { - this->index[0] = row; + this->firstSegmentIdx = firstSegmentIdx; + this->warpIdx = 0; + this->blockSize = end - begin; + this->segmentsInBlock = lastSegmentIdx - firstSegmentIdx; + this->type = ( uint8_t ) type; + + /*this->index[0] = row; this->index[1] = 0; this->twobytes[sizeof(Index) == 4 ? 2 : 4] = maxID - minID; @@ -129,23 +146,25 @@ union CSRAdaptiveKernelBlockDescriptor if (type == Type::STREAM) this->byte[sizeof(Index) == 4 ? 7 : 15] |= 0b1000000; else if (type == Type::VECTOR) - this->byte[sizeof(Index) == 4 ? 7 : 15] |= 0b10000000; + this->byte[sizeof(Index) == 4 ? 7 : 15] |= 0b10000000;*/ } CSRAdaptiveKernelBlockDescriptor() = default; __cuda_callable__ Type getType() const { - if( byte[ sizeof( Index ) == 4 ? 7 : 15 ] & 0b1000000 ) + return ( Type ) this->type; + /*if( byte[ sizeof( Index ) == 4 ? 7 : 15 ] & 0b1000000 ) return Type::STREAM; if( byte[ sizeof( Index ) == 4 ? 7 : 15 ] & 0b10000000 ) return Type::VECTOR; - return Type::LONG; + return Type::LONG;*/ } __cuda_callable__ const Index& getFirstSegment() const { - return index[ 0 ]; + return this->firstSegmentIdx; + //return index[ 0 ]; } /*** @@ -153,7 +172,8 @@ union CSRAdaptiveKernelBlockDescriptor */ __cuda_callable__ const Index getSize() const { - return twobytes[ sizeof(Index) == 4 ? 2 : 4 ]; + return this->blockSize; + //return twobytes[ sizeof(Index) == 4 ? 2 : 4 ]; } /*** @@ -161,14 +181,19 @@ union CSRAdaptiveKernelBlockDescriptor */ __cuda_callable__ const Index getSegmentsInBlock() const { - return ( twobytes[ sizeof( Index ) == 4 ? 3 : 5 ] & 0x3FFF ); + return this->segmentsInBlock; + //return ( twobytes[ sizeof( Index ) == 4 ? 
3 : 5 ] & 0x3FFF ); + } + + __cuda_callable__ const uint8_t getWarpIdx() const + { + return this->warpIdx; } void print( std::ostream& str ) const { - Type type = this->getType(); str << "Type: "; - switch( type ) + switch( this->getType() ) { case Type::STREAM: str << " Stream "; @@ -180,13 +205,18 @@ union CSRAdaptiveKernelBlockDescriptor str << " Long "; break; } - str << " first segment: " << getFirstSegment(); - str << " block end: " << getSize(); - str << " index in warp: " << index[ 1 ]; + str << " first segment: " << this->getFirstSegment(); + str << " block end: " << this->getSize(); + str << " index in warp: " << this->getWarpIdx(); } - Index index[2]; // index[0] is row pointer, index[1] is index in warp - uint8_t byte[sizeof(Index) == 4 ? 8 : 16]; // byte[7/15] is type specificator - uint16_t twobytes[sizeof(Index) == 4 ? 4 : 8]; //twobytes[2/4] is maxID - minID + + uint8_t type; + Index firstSegmentIdx, blockSize, segmentsInBlock; + uint8_t warpIdx, warpsCount; + + //Index index[2]; // index[0] is row pointer, index[1] is index in warp + //uint8_t byte[sizeof(Index) == 4 ? 8 : 16]; // byte[7/15] is type specificator + //uint16_t twobytes[sizeof(Index) == 4 ? 4 : 8]; //twobytes[2/4] is maxID - minID //twobytes[3/5] is nextRow - row }; @@ -201,4 +231,4 @@ std::ostream& operator<< ( std::ostream& str, const CSRAdaptiveKernelBlockDescri } // namespace details } // namespace Segments } // namespace Algorithms -} // namespace TNL \ No newline at end of file +} // namespace TNL -- GitLab From ac783cf2b94c97f03a715bf3dbfe6e22ad355ed8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Wed, 27 Jan 2021 14:14:50 +0100 Subject: [PATCH 25/27] Adaptiver CSR kernel seems to be working well. 
--- .../Algorithms/Segments/CSRKernelAdaptive.h | 80 +++++++++++++------ .../CSRAdaptiveKernelBlockDescriptor.h | 5 ++ src/UnitTests/Matrices/SparseMatrixTest.hpp | 17 ++-- .../Matrices/SparseMatrixTest_CSRAdaptive.h | 12 +-- 4 files changed, 74 insertions(+), 40 deletions(-) diff --git a/src/TNL/Algorithms/Segments/CSRKernelAdaptive.h b/src/TNL/Algorithms/Segments/CSRKernelAdaptive.h index b56129a05..feed58a58 100644 --- a/src/TNL/Algorithms/Segments/CSRKernelAdaptive.h +++ b/src/TNL/Algorithms/Segments/CSRKernelAdaptive.h @@ -113,51 +113,78 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks, else // blockType == Type::LONG - several warps per segment { // Number of elements processed by previous warps - const Index offset = //block.index[1] * MAX_ELEM_PER_WARP; - block.getWarpIdx() * MAX_ELEM_PER_WARP; - Index to = begin + (block.getWarpIdx() + 1) * MAX_ELEM_PER_WARP; + //const Index offset = //block.index[1] * MAX_ELEM_PER_WARP; + /// block.getWarpIdx() * MAX_ELEM_PER_WARP; + //Index to = begin + (block.getWarpIdx() + 1) * MAX_ELEM_PER_WARP; const Index segmentIdx = block.getFirstSegment();//block.index[0]; //minID = offsets[block.index[0] ]; const Index end = offsets[segmentIdx + 1]; const int tid = threadIdx.x; + //const int inBlockWarpIdx = block.getWarpIdx(); - if( to > end ) - to = end; + //if( to > end ) + // to = end; + TNL_ASSERT_GT( block.getWarpsCount(), 0, "" ); result = zero; - //printf( "tid %d : start = %d \n", tid, minID + laneID ); - for( Index globalIdx = begin + laneIdx + offset; globalIdx < to; globalIdx += warpSize ) + //printf( "LONG tid %d warpIdx %d: LONG \n", tid, block.getWarpIdx() ); + for( Index globalIdx = begin + laneIdx + TNL::Cuda::getWarpSize() * block.getWarpIdx(); + globalIdx < end; + globalIdx += TNL::Cuda::getWarpSize() * block.getWarpsCount() ) { result = reduce( result, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, -1, globalIdx, compute ) ); - //printf( "tid %d -> %d \n", tid, 
details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, -1, globalIdx, compute ) ); + //if( laneIdx == 0 ) + // printf( "LONG warpIdx: %d gid: %d begin: %d end: %d -> %d \n", ( int ) block.getWarpIdx(), globalIdx, begin, end, + // details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, 0, globalIdx, compute ) ); //result += values[i] * inVector[columnIndexes[i]]; } - + //printf( "tid %d -> %d \n", tid, result ); result += __shfl_down_sync(0xFFFFFFFF, result, 16); result += __shfl_down_sync(0xFFFFFFFF, result, 8); result += __shfl_down_sync(0xFFFFFFFF, result, 4); result += __shfl_down_sync(0xFFFFFFFF, result, 2); result += __shfl_down_sync(0xFFFFFFFF, result, 1); + + //if( laneIdx == 0 ) + // printf( "WARP RESULT: tid %d -> %d \n", tid, result ); + const Index warpID = threadIdx.x / 32; if( laneIdx == 0 ) multivectorShared[ warpID ] = result; + __syncthreads(); // Reduction in multivectorShared - if( tid < 16 ) + if( block.getWarpIdx() == 0 && laneIdx < 16 ) { - multivectorShared[ tid ] = reduce( multivectorShared[ tid ], multivectorShared[ tid + 16 ] ); - __syncwarp(); - multivectorShared[ tid ] = reduce( multivectorShared[ tid ], multivectorShared[ tid + 8 ] ); - __syncwarp(); - multivectorShared[ tid ] = reduce( multivectorShared[ tid ], multivectorShared[ tid + 4 ] ); - __syncwarp(); - multivectorShared[ tid ] = reduce( multivectorShared[ tid ], multivectorShared[ tid + 2 ] ); - __syncwarp(); - multivectorShared[ tid ] = reduce( multivectorShared[ tid ], multivectorShared[ tid + 1 ] ); - __syncwarp(); - if( tid == 0 ) + constexpr int totalWarps = CudaBlockSize / warpSize; + if( totalWarps >= 32 ) + { + multivectorShared[ laneIdx ] = reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx + 16 ] ); + __syncwarp(); + } + if( totalWarps >= 16 ) + { + multivectorShared[ laneIdx ] = reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx + 8 ] ); + __syncwarp(); + } + if( totalWarps >= 8 ) + { + multivectorShared[ 
laneIdx ] = reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx + 4 ] ); + __syncwarp(); + } + if( totalWarps >= 4 ) + { + multivectorShared[ laneIdx ] = reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx + 2 ] ); + __syncwarp(); + } + if( totalWarps >= 2 ) + { + multivectorShared[ laneIdx ] = reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx + 1 ] ); + __syncwarp(); + } + if( laneIdx == 0 ) { - printf( "Long: segmentIdx %d -> %d \n", segmentIdx, multivectorShared[ 0 ] ); + //printf( "Long: segmentIdx %d -> %d \n", segmentIdx, multivectorShared[ 0 ] ); keep( segmentIdx, multivectorShared[ 0 ] ); } } @@ -216,7 +243,6 @@ struct CSRKernelAdaptiveView return; } - this->printBlocks(); static constexpr Index THREADS_ADAPTIVE = sizeof(Index) == 8 ? 128 : 256; //static constexpr Index THREADS_SCALAR = 128; //static constexpr Index THREADS_VECTOR = 128; @@ -322,7 +348,7 @@ struct CSRKernelAdaptive static constexpr Index THREADS_ADAPTIVE = sizeof(Index) == 8 ? 
128 : 256; /* How many shared memory use per block in CSR Adaptive kernel */ - static constexpr Index SHARED_PER_BLOCK = 24576; + static constexpr Index SHARED_PER_BLOCK = 20000; //24576; TODO: /* Number of elements in shared memory */ static constexpr Index SHARED = SHARED_PER_BLOCK/sizeof(double); @@ -364,7 +390,7 @@ struct CSRKernelAdaptive if( sum <= 2 * MAX_ELEMENTS_PER_WARP_ADAPT ) type = details::Type::VECTOR; else - type = details::Type::VECTOR; // TODO: Put LONG back + type = details::Type::LONG; //type = Type::LONG; // return current + 1; } @@ -393,7 +419,9 @@ struct CSRKernelAdaptive { const Index blocksCount = inBlock.size(); const Index warpsPerCudaBlock = THREADS_ADAPTIVE / TNL::Cuda::getWarpSize(); - const Index warpsLeft = roundUpDivision( blocksCount, warpsPerCudaBlock ) * warpsPerCudaBlock - blocksCount; + Index warpsLeft = roundUpDivision( blocksCount, warpsPerCudaBlock ) * warpsPerCudaBlock - blocksCount; + if( warpsLeft == 0 ) + warpsLeft = warpsPerCudaBlock; //Index parts = roundUpDivision(sum, this->SHARED_PER_WARP); inBlock.emplace_back( start, details::Type::LONG, 0, warpsLeft ); for( Index index = 1; index < warpsLeft; index++ ) diff --git a/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h b/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h index 40bc8e6f9..90f8a7bfc 100644 --- a/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h +++ b/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h @@ -190,6 +190,11 @@ struct CSRAdaptiveKernelBlockDescriptor return this->warpIdx; } + __cuda_callable__ uint8_t getWarpsCount() const + { + return this->warpsCount; + } + void print( std::ostream& str ) const { str << "Type: "; diff --git a/src/UnitTests/Matrices/SparseMatrixTest.hpp b/src/UnitTests/Matrices/SparseMatrixTest.hpp index b5885afbe..00794032e 100644 --- a/src/UnitTests/Matrices/SparseMatrixTest.hpp +++ b/src/UnitTests/Matrices/SparseMatrixTest.hpp @@ -1070,7 
+1070,6 @@ void test_VectorProduct() outVector_1.setElement( j, 0 ); m_1.vectorProduct( inVector_1, outVector_1 ); - EXPECT_EQ( outVector_1.getElement( 0 ), 2 ); EXPECT_EQ( outVector_1.getElement( 1 ), 10 ); EXPECT_EQ( outVector_1.getElement( 2 ), 8 ); @@ -1310,7 +1309,7 @@ void test_VectorProduct() ///// // Large test - const IndexType size( 35 ); + const IndexType size( 1051 ); //for( int size = 1; size < 1000; size++ ) { //std::cerr << " size = " << size << std::endl; @@ -1338,26 +1337,28 @@ void test_VectorProduct() EXPECT_EQ( out.getElement( i ), i + 1 ); // Test with large triangular matrix - Matrix m2( size, size ); - rowCapacities.evaluate( [] __cuda_callable__ ( IndexType i ) { return i + 1; } ); + const int rows( size ), columns( size ); + Matrix m2( rows, columns ); + rowCapacities.setSize( rows ); + rowCapacities.evaluate( [=] __cuda_callable__ ( IndexType i ) { return i + 1; } ); m2.setRowCapacities( rowCapacities ); auto f2 = [=] __cuda_callable__ ( IndexType row, IndexType localIdx, IndexType& column, RealType& value, bool& compute ) { if( localIdx <= row ) { - value = row -localIdx + 1; + value = localIdx + 1; column = localIdx; } }; m2.forAllRows( f2 ); // check that the matrix was initialized - TNL::Containers::Vector< IndexType, DeviceType, IndexType > rowLengths( size ); + TNL::Containers::Vector< IndexType, DeviceType, IndexType > rowLengths( rows ); m2.getCompressedRowLengths( rowLengths ); EXPECT_EQ( rowLengths, rowCapacities ); + out.setSize( rows ); out = 0.0; m2.vectorProduct( in, out ); - //std::cerr << out << std::endl; - for( IndexType i = 0; i < size; i++ ) + for( IndexType i = 0; i < rows; i++ ) EXPECT_EQ( out.getElement( i ), ( i + 1 ) * ( i + 2 ) / 2 ); } } diff --git a/src/UnitTests/Matrices/SparseMatrixTest_CSRAdaptive.h b/src/UnitTests/Matrices/SparseMatrixTest_CSRAdaptive.h index e67ea5c85..275686822 100644 --- a/src/UnitTests/Matrices/SparseMatrixTest_CSRAdaptive.h +++ b/src/UnitTests/Matrices/SparseMatrixTest_CSRAdaptive.h @@ 
-30,12 +30,12 @@ using MatrixTypes = ::testing::Types TNL::Matrices::SparseMatrix< double, TNL::Devices::Host, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive > #ifdef HAVE_CUDA ,TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >, - //TNL::Matrices::SparseMatrix< long, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >, - //TNL::Matrices::SparseMatrix< float, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >, - //TNL::Matrices::SparseMatrix< double, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >, - //TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >, - //TNL::Matrices::SparseMatrix< long, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >, - //TNL::Matrices::SparseMatrix< float, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >, + TNL::Matrices::SparseMatrix< long, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >, + TNL::Matrices::SparseMatrix< float, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >, + TNL::Matrices::SparseMatrix< double, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >, + TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >, + TNL::Matrices::SparseMatrix< long, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >, + TNL::Matrices::SparseMatrix< float, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >, TNL::Matrices::SparseMatrix< 
double, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive > #endif >; -- GitLab From 3c9dbc5fcb1ba55678462035e802ac61c72721a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Wed, 27 Jan 2021 14:54:43 +0100 Subject: [PATCH 26/27] Added new CSR segments kernels to SpMV benchmark. --- src/Benchmarks/SpMV/spmv-legacy.h | 54 +++++++++++-------- .../CSRAdaptiveKernelBlockDescriptor.h | 2 +- 2 files changed, 34 insertions(+), 22 deletions(-) diff --git a/src/Benchmarks/SpMV/spmv-legacy.h b/src/Benchmarks/SpMV/spmv-legacy.h index 3416ad3ef..fed37410c 100644 --- a/src/Benchmarks/SpMV/spmv-legacy.h +++ b/src/Benchmarks/SpMV/spmv-legacy.h @@ -49,7 +49,16 @@ using SlicedEllpackAlias = Matrices::Legacy::SlicedEllpack< Real, Device, Index // Segments based sparse matrix aliases template< typename Real, typename Device, typename Index > -using SparseMatrix_CSR = Matrices::SparseMatrix< Real, Device, Index, Matrices::GeneralMatrix, Algorithms::Segments::CSRDefault >; +using SparseMatrix_CSR_Scalar = Matrices::SparseMatrix< Real, Device, Index, Matrices::GeneralMatrix, Algorithms::Segments::CSRScalar >; + +template< typename Real, typename Device, typename Index > +using SparseMatrix_CSR_Vector = Matrices::SparseMatrix< Real, Device, Index, Matrices::GeneralMatrix, Algorithms::Segments::CSRVector >; + +template< typename Real, typename Device, typename Index > +using SparseMatrix_CSR_Hybrid = Matrices::SparseMatrix< Real, Device, Index, Matrices::GeneralMatrix, Algorithms::Segments::CSRHybrid >; + +template< typename Real, typename Device, typename Index > +using SparseMatrix_CSR_Adaptive = Matrices::SparseMatrix< Real, Device, Index, Matrices::GeneralMatrix, Algorithms::Segments::CSRAdaptive >; template< typename Device, typename Index, typename IndexAllocator > using EllpackSegments = Algorithms::Segments::Ellpack< Device, Index, IndexAllocator >; @@ -309,26 +318,29 @@ benchmarkSpmvSynthetic( Benchmark& 
benchmark, benchmark.time< Devices::Cuda >( resetCusparseVectors, "GPU", spmvCusparse, cusparseBenchmarkResults ); #endif - benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Scalar >( benchmark, hostOutVector, inputFileName, verboseMR ); - benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Vector >( benchmark, hostOutVector, inputFileName, verboseMR ); - benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Light >( benchmark, hostOutVector, inputFileName, verboseMR ); - benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Light2 >( benchmark, hostOutVector, inputFileName, verboseMR ); - benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Light3 >( benchmark, hostOutVector, inputFileName, verboseMR ); - benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Light4 >( benchmark, hostOutVector, inputFileName, verboseMR ); - benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Light5 >( benchmark, hostOutVector, inputFileName, verboseMR ); - benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Light6 >( benchmark, hostOutVector, inputFileName, verboseMR ); - benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Adaptive >( benchmark, hostOutVector, inputFileName, verboseMR ); - benchmarkSpMV< Real, SparseMatrixLegacy_CSR_MultiVector>( benchmark, hostOutVector, inputFileName, verboseMR ); - benchmarkSpMV< Real, SparseMatrixLegacy_CSR_LightWithoutAtomic>( benchmark, hostOutVector, inputFileName, verboseMR ); - benchmarkSpMV< Real, SparseMatrix_CSR >( benchmark, hostOutVector, inputFileName, verboseMR ); - benchmarkSpMV< Real, Matrices::Legacy::Ellpack >( benchmark, hostOutVector, inputFileName, verboseMR ); - benchmarkSpMV< Real, SparseMatrix_Ellpack >( benchmark, hostOutVector, inputFileName, verboseMR ); - benchmarkSpMV< Real, SlicedEllpackAlias >( benchmark, hostOutVector, inputFileName, verboseMR ); - benchmarkSpMV< Real, SparseMatrix_SlicedEllpack >( benchmark, hostOutVector, inputFileName, verboseMR ); - benchmarkSpMV< Real, Matrices::Legacy::ChunkedEllpack >( benchmark, hostOutVector, inputFileName, verboseMR ); - benchmarkSpMV< Real, 
SparseMatrix_ChunkedEllpack >( benchmark, hostOutVector, inputFileName, verboseMR ); - benchmarkSpMV< Real, Matrices::Legacy::BiEllpack >( benchmark, hostOutVector, inputFileName, verboseMR ); - benchmarkSpMV< Real, SparseMatrix_BiEllpack >( benchmark, hostOutVector, inputFileName, verboseMR ); + benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Scalar >( benchmark, hostOutVector, inputFileName, verboseMR ); + benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Vector >( benchmark, hostOutVector, inputFileName, verboseMR ); + benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Light >( benchmark, hostOutVector, inputFileName, verboseMR ); + benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Light2 >( benchmark, hostOutVector, inputFileName, verboseMR ); + benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Light3 >( benchmark, hostOutVector, inputFileName, verboseMR ); + benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Light4 >( benchmark, hostOutVector, inputFileName, verboseMR ); + benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Light5 >( benchmark, hostOutVector, inputFileName, verboseMR ); + benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Light6 >( benchmark, hostOutVector, inputFileName, verboseMR ); + benchmarkSpMV< Real, SparseMatrixLegacy_CSR_Adaptive >( benchmark, hostOutVector, inputFileName, verboseMR ); + benchmarkSpMV< Real, SparseMatrixLegacy_CSR_MultiVector >( benchmark, hostOutVector, inputFileName, verboseMR ); + benchmarkSpMV< Real, SparseMatrixLegacy_CSR_LightWithoutAtomic >( benchmark, hostOutVector, inputFileName, verboseMR ); + benchmarkSpMV< Real, SparseMatrix_CSR_Scalar >( benchmark, hostOutVector, inputFileName, verboseMR ); + benchmarkSpMV< Real, SparseMatrix_CSR_Vector >( benchmark, hostOutVector, inputFileName, verboseMR ); + benchmarkSpMV< Real, SparseMatrix_CSR_Hybrid >( benchmark, hostOutVector, inputFileName, verboseMR ); + benchmarkSpMV< Real, SparseMatrix_CSR_Adaptive >( benchmark, hostOutVector, inputFileName, verboseMR ); + benchmarkSpMV< Real, Matrices::Legacy::Ellpack >( 
benchmark, hostOutVector, inputFileName, verboseMR ); + benchmarkSpMV< Real, SparseMatrix_Ellpack >( benchmark, hostOutVector, inputFileName, verboseMR ); + benchmarkSpMV< Real, SlicedEllpackAlias >( benchmark, hostOutVector, inputFileName, verboseMR ); + benchmarkSpMV< Real, SparseMatrix_SlicedEllpack >( benchmark, hostOutVector, inputFileName, verboseMR ); + benchmarkSpMV< Real, Matrices::Legacy::ChunkedEllpack >( benchmark, hostOutVector, inputFileName, verboseMR ); + benchmarkSpMV< Real, SparseMatrix_ChunkedEllpack >( benchmark, hostOutVector, inputFileName, verboseMR ); + benchmarkSpMV< Real, Matrices::Legacy::BiEllpack >( benchmark, hostOutVector, inputFileName, verboseMR ); + benchmarkSpMV< Real, SparseMatrix_BiEllpack >( benchmark, hostOutVector, inputFileName, verboseMR ); /* AdEllpack is broken benchmarkSpMV< Real, Matrices::AdEllpack >( benchmark, hostOutVector, inputFileName, verboseMR ); */ diff --git a/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h b/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h index 90f8a7bfc..96f1899b2 100644 --- a/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h +++ b/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h @@ -185,7 +185,7 @@ struct CSRAdaptiveKernelBlockDescriptor //return ( twobytes[ sizeof( Index ) == 4 ? 3 : 5 ] & 0x3FFF ); } - __cuda_callable__ const uint8_t getWarpIdx() const + __cuda_callable__ uint8_t getWarpIdx() const { return this->warpIdx; } -- GitLab From 702ab3284556a4255cfa97ac3801f8d037491e30 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Wed, 27 Jan 2021 21:54:17 +0100 Subject: [PATCH 27/27] Reformatting tnl-benchmark-spmv source code.
--- src/Benchmarks/SpMV/tnl-benchmark-spmv.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/Benchmarks/SpMV/tnl-benchmark-spmv.h b/src/Benchmarks/SpMV/tnl-benchmark-spmv.h index 82e1f12cd..7897073d9 100644 --- a/src/Benchmarks/SpMV/tnl-benchmark-spmv.h +++ b/src/Benchmarks/SpMV/tnl-benchmark-spmv.h @@ -63,7 +63,6 @@ std::string getCurrDateTime() timeinfo = localtime( &rawtime ); strftime( buffer, sizeof( buffer ), "%d-%m-%Y--%H:%M:%S", timeinfo ); std::string curr_date_time( buffer ); - return curr_date_time; } @@ -133,8 +132,7 @@ main( int argc, char* argv[] ) // prepare global metadata Benchmark::MetadataMap metadata = getHardwareMetadata(); - - + // Initiate setup of benchmarks if( precision == "all" || precision == "float" ) runSpMVBenchmarks< float >( benchmark, metadata, inputFileName, verboseMR ); -- GitLab