diff --git a/src/TNL/Algorithms/Segments/CSRAdaptiveKernel.h b/src/TNL/Algorithms/Segments/CSRAdaptiveKernel.h index 46d323f0235fe48926b87294f3efa0cc42b91e6d..5ade54d02eedd3e2a5c1b2778bc8e3a215058c2c 100644 --- a/src/TNL/Algorithms/Segments/CSRAdaptiveKernel.h +++ b/src/TNL/Algorithms/Segments/CSRAdaptiveKernel.h @@ -63,13 +63,15 @@ struct CSRAdaptiveKernel using BlocksType = typename ViewType::BlocksType; using BlocksView = typename BlocksType::ViewType; + static constexpr int MaxValueSizeLog() { return ViewType::MaxValueSizeLog; }; + static TNL::String getKernelType(); - static constexpr Index THREADS_ADAPTIVE = details::CSRAdaptiveKernelParameters< Index >::CudaBlockSize(); //sizeof(Index) == 8 ? 128 : 256; + static constexpr Index THREADS_ADAPTIVE = details::CSRAdaptiveKernelParameters< sizeof( Index ) >::CudaBlockSize(); //sizeof(Index) == 8 ? 128 : 256; // How many shared memory use per block in CSR Adaptive kernel - static constexpr Index SHARED_PER_BLOCK = details::CSRAdaptiveKernelParameters< Index >::StreamedSharedMemory(); //20000; //24576; TODO: + static constexpr Index SHARED_PER_BLOCK = details::CSRAdaptiveKernelParameters< sizeof( Index ) >::StreamedSharedMemory(); //20000; //24576; TODO: // Number of elements in shared memory static constexpr Index SHARED = SHARED_PER_BLOCK/sizeof(double); @@ -84,14 +86,7 @@ struct CSRAdaptiveKernel static constexpr Index MAX_ELEMENTS_PER_WARP = 384; // Max length of row to process one warp for CSR Adaptive - static constexpr Index MAX_ELEMENTS_PER_WARP_ADAPT = details::CSRAdaptiveKernelParameters< Index >::MaxAdaptiveElementsPerWarp(); - - template< typename Offsets > - Index findLimit( const Index start, - const Offsets& offsets, - const Index size, - details::Type &type, - Index &sum ); + static constexpr Index MAX_ELEMENTS_PER_WARP_ADAPT = details::CSRAdaptiveKernelParameters< sizeof( Index ) >::MaxAdaptiveElementsPerWarp(); template< typename Offsets > void init( const Offsets& offsets ); @@ -118,7 +113,21 @@ struct CSRAdaptiveKernel Args... args ) const; protected: - BlocksType blocks; + template< int SizeOfValue, typename Offsets > + Index findLimit( const Index start, + const Offsets& offsets, + const Index size, + details::Type &type, + Index &sum ); + + template< int SizeOfValue, + typename Offsets > + void initValueSize( const Offsets& offsets ); + + /** + * \brief blocksArray[ i ] stores blocks for sizeof( Value ) == 2^i. + */ + BlocksType blocksArray[ MaxValueSizeLog() ]; ViewType view; }; diff --git a/src/TNL/Algorithms/Segments/CSRAdaptiveKernel.hpp b/src/TNL/Algorithms/Segments/CSRAdaptiveKernel.hpp index 4c53a83ca39102c8de7242850c50c0174206dd4f..ff2db147be16312e79260fbd2ed4531570e5816f 100644 --- a/src/TNL/Algorithms/Segments/CSRAdaptiveKernel.hpp +++ b/src/TNL/Algorithms/Segments/CSRAdaptiveKernel.hpp @@ -31,10 +31,81 @@ getKernelType() return ViewType::getKernelType(); }; - template< typename Index, typename Device > template< typename Offsets > +void +CSRAdaptiveKernel< Index, Device >:: +init( const Offsets& offsets ) +{ + this->template initValueSize< 1 >( offsets ); + this->template initValueSize< 2 >( offsets ); + this->template initValueSize< 4 >( offsets ); + this->template initValueSize< 8 >( offsets ); + this->template initValueSize< 16 >( offsets ); + this->template initValueSize< 32 >( offsets ); + for( int i = 0; i < MaxValueSizeLog(); i++ ) + this->view.setBlocks( blocksArray[ i ], i ); +} + + +template< typename Index, + typename Device > +void +CSRAdaptiveKernel< Index, Device >:: +reset() +{ + for( int i = 0; i < MaxValueSizeLog(); i++ ) + { + this->blocksArray[ i ].reset(); + this->view.setBlocks( this->blocksArray[ i ], i ); + } +} + +template< typename Index, + typename Device > +auto +CSRAdaptiveKernel< Index, Device >:: +getView() -> ViewType +{ + return this->view; +} + +template< typename Index, + typename Device > +auto +CSRAdaptiveKernel< Index, Device >:: +getConstView() const -> ConstViewType +{ + return this->view; +}; + +template< typename Index, + typename Device > + template< typename OffsetsView, + typename Fetch, + typename Reduction, + typename ResultKeeper, + typename Real, + typename... Args > +void +CSRAdaptiveKernel< Index, Device >:: +segmentsReduction( const OffsetsView& offsets, + Index first, + Index last, + Fetch& fetch, + const Reduction& reduction, + ResultKeeper& keeper, + const Real& zero, + Args... args ) const +{ + view.segmentsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... ); +} + +template< typename Index, + typename Device > + template< int SizeOfValue, + typename Offsets > Index CSRAdaptiveKernel< Index, Device >:: findLimit( const Index start, @@ -48,7 +119,7 @@ findLimit( const Index start, { Index elements = offsets[ current + 1 ] - offsets[ current ]; sum += elements; - if( sum > SHARED_PER_WARP ) + if( sum > details::CSRAdaptiveKernelParameters< SizeOfValue >::StreamedSharedElementsPerWarp() ) { if( current - start > 0 ) // extra row { @@ -57,7 +128,7 @@ findLimit( const Index start, } else { // one long row - if( sum <= 2 * MAX_ELEMENTS_PER_WARP_ADAPT ) + if( sum <= 2 * details::CSRAdaptiveKernelParameters< SizeOfValue >::MaxAdaptiveElementsPerWarp() ) //MAX_ELEMENTS_PER_WARP_ADAPT ) type = details::Type::VECTOR; else type = details::Type::LONG; @@ -67,14 +138,15 @@ findLimit( const Index start, } type = details::Type::STREAM; return size - 1; // return last row pointer - } +} template< typename Index, typename Device > - template< typename Offsets > + template< int SizeOfValue, + typename Offsets > void CSRAdaptiveKernel< Index, Device >:: -init( const Offsets& offsets ) +initValueSize( const Offsets& offsets ) { using HostOffsetsType = TNL::Containers::Vector< typename Offsets::IndexType, TNL::Devices::Host, typename Offsets::IndexType >; HostOffsetsType hostOffsets( offsets ); @@ -88,7 +160,7 @@ init( const Offsets& offsets ) while( nextStart != rows - 1 ) { details::Type type; - nextStart = findLimit( start, hostOffsets, rows, type, sum ); + nextStart = findLimit< SizeOfValue >( start, hostOffsets, rows, type, sum ); if( type == details::Type::LONG ) { @@ -110,58 +182,10 @@ init( const Offsets& offsets ) start = nextStart; } inBlocks.emplace_back(nextStart); - this->blocks = inBlocks; - this->view.setBlocks( blocks ); -} - -template< typename Index, - typename Device > -void -CSRAdaptiveKernel< Index, Device >:: -reset() -{ - this->blocks.reset(); - this->view.setBlocks( blocks ); -} - -template< typename Index, - typename Device > -auto -CSRAdaptiveKernel< Index, Device >:: -getView() -> ViewType -{ - return this->view; -} - -template< typename Index, - typename Device > -auto -CSRAdaptiveKernel< Index, Device >:: -getConstView() const -> ConstViewType -{ - return this->view; -}; - -template< typename Index, - typename Device > - template< typename OffsetsView, - typename Fetch, - typename Reduction, - typename ResultKeeper, - typename Real, - typename... Args > -void -CSRAdaptiveKernel< Index, Device >:: -segmentsReduction( const OffsetsView& offsets, - Index first, - Index last, - Fetch& fetch, - const Reduction& reduction, - ResultKeeper& keeper, - const Real& zero, - Args... args ) const -{ - view.segmentsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... ); + //std::cerr << "Setting blocks to " << std::log2( SizeOfValue ) << std::endl; + TNL_ASSERT_LT( std::log2( SizeOfValue ), MaxValueSizeLog(), "" ); + TNL_ASSERT_GE( std::log2( SizeOfValue ), 0, "" ); + this->blocksArray[ (int ) std::log2( SizeOfValue ) ] = inBlocks; } } // namespace Segments diff --git a/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.h b/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.h index 431b72f4e37b98cf5a00fc64fdb6ff0e7d727e9c..ea008fdc7c6efc3ae75b76142b4f318dce823791 100644 --- a/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.h +++ b/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.h @@ -28,11 +28,13 @@ struct CSRAdaptiveKernelView using BlocksType = TNL::Containers::Vector< details::CSRAdaptiveKernelBlockDescriptor< Index >, Device, Index >; using BlocksView = typename BlocksType::ViewType; + static constexpr int MaxValueSizeLog = 6; + CSRAdaptiveKernelView() = default; CSRAdaptiveKernelView( BlocksType& blocks ); - void setBlocks( BlocksType& blocks ); + void setBlocks( BlocksType& blocks, const int idx ); ViewType getView(); @@ -57,14 +59,14 @@ struct CSRAdaptiveKernelView CSRAdaptiveKernelView& operator=( const CSRAdaptiveKernelView< Index, Device >& kernelView ); - void printBlocks() const; + void printBlocks( int idx ) const; protected: - BlocksView blocks; + BlocksView blocksArray[ MaxValueSizeLog ]; }; } // namespace Segments } // namespace Algorithms } // namespace TNL -#include <TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp> \ No newline at end of file +#include <TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp> diff --git a/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp b/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp index 743f0902f5694aba54647e017f712ceb5c689fae..d4a369f252ffd61be1ade8589a3070bccfc032f7 100644 --- a/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp +++ b/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp @@ -50,10 +50,10 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks, Real zero, Args... args ) { - static constexpr int CudaBlockSize = details::CSRAdaptiveKernelParameters< Real >::CudaBlockSize(); - constexpr int WarpSize = Cuda::getWarpSize(); - constexpr int WarpsCount = details::CSRAdaptiveKernelParameters< Real >::WarpsCount(); - constexpr size_t StreamedSharedElementsPerWarp = details::CSRAdaptiveKernelParameters< Real >::StreamedSharedElementsPerWarp(); + static constexpr int CudaBlockSize = details::CSRAdaptiveKernelParameters< sizeof( Real ) >::CudaBlockSize(); + //constexpr int WarpSize = Cuda::getWarpSize(); + //constexpr int WarpsCount = details::CSRAdaptiveKernelParameters< sizeof( Real ) >::WarpsCount(); + //constexpr size_t StreamedSharedElementsPerWarp = details::CSRAdaptiveKernelParameters< sizeof( Real ) >::StreamedSharedElementsPerWarp(); __shared__ Real streamShared[ WARPS ][ SHARED_PER_WARP ]; @@ -199,21 +199,21 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks, } #endif -template< typename Index, +/*template< typename Index, typename Device > CSRAdaptiveKernelView< Index, Device >:: CSRAdaptiveKernelView( BlocksType& blocks ) { this->blocks.bind( blocks ); -} +}*/ template< typename Index, typename Device > void CSRAdaptiveKernelView< Index, Device >:: -setBlocks( BlocksType& blocks ) +setBlocks( BlocksType& blocks, const int idx ) { - this->blocks.bind( blocks ); + this->blocksArray[ idx ].bind( blocks ); } template< typename Index, @@ -263,23 +263,25 @@ segmentsReduction( const OffsetsView& offsets, Args... args ) const { #ifdef HAVE_CUDA - if( details::CheckFetchLambda< Index, Fetch >::hasAllParameters() ) + int valueSizeLog = std::ceil( log2f( ( double ) sizeof( Real ) ) ); + + if( details::CheckFetchLambda< Index, Fetch >::hasAllParameters() || valueSizeLog > MaxValueSizeLog ) { TNL::Algorithms::Segments::CSRScalarKernel< Index, Device >:: segmentsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... ); return; } - static constexpr Index THREADS_ADAPTIVE = details::CSRAdaptiveKernelParameters< Real >::CudaBlockSize(); //sizeof(Index) == 8 ? 128 : 256; + static constexpr Index THREADS_ADAPTIVE = details::CSRAdaptiveKernelParameters< sizeof( Real ) >::CudaBlockSize(); //sizeof(Index) == 8 ? 128 : 256; /* Max length of row to process one warp for CSR Light, MultiVector */ //static constexpr Index MAX_ELEMENTS_PER_WARP = 384; /* Max length of row to process one warp for CSR Adaptive */ - static constexpr Index MAX_ELEMENTS_PER_WARP_ADAPT = details::CSRAdaptiveKernelParameters< Real >::MaxAdaptiveElementsPerWarp(); + //static constexpr Index MAX_ELEMENTS_PER_WARP_ADAPT = details::CSRAdaptiveKernelParameters< sizeof( Real ) >::MaxAdaptiveElementsPerWarp(); /* How many shared memory use per block in CSR Adaptive kernel */ - static constexpr Index SHARED_PER_BLOCK = details::CSRAdaptiveKernelParameters< Real >::StreamedSharedMemory(); + static constexpr Index SHARED_PER_BLOCK = details::CSRAdaptiveKernelParameters< sizeof( Real ) >::StreamedSharedMemory(); /* Number of elements in shared memory */ static constexpr Index SHARED = SHARED_PER_BLOCK/sizeof(Real); @@ -298,7 +300,7 @@ segmentsReduction( const OffsetsView& offsets, constexpr size_t MAX_X_DIM = 2147483647; /* Fill blocks */ - size_t neededThreads = this->blocks.getSize() * warpSize; // one warp per block + size_t neededThreads = this->blocksArray[ valueSizeLog ].getSize() * warpSize; // one warp per block /* Execute kernels on device */ for (Index gridIdx = 0; neededThreads != 0; gridIdx++ ) { @@ -317,12 +319,12 @@ segmentsReduction( const OffsetsView& offsets, warpSize, WARPS, SHARED_PER_WARP, - details::CSRAdaptiveKernelParameters< Real >::MaxAdaptiveElementsPerWarp(), + details::CSRAdaptiveKernelParameters< sizeof( Real ) >::MaxAdaptiveElementsPerWarp(), BlocksView, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real, Args... > <<<blocksCount, threads>>>( - this->blocks, + this->blocksArray[ valueSizeLog ], gridIdx, offsets, first, @@ -342,7 +344,8 @@ CSRAdaptiveKernelView< Index, Device >& CSRAdaptiveKernelView< Index, Device >:: operator=( const CSRAdaptiveKernelView< Index, Device >& kernelView ) { - this->blocks.bind( kernelView.blocks ); + for( int i = 0; i < MaxValueSizeLog; i++ ) + this->blocksArray[ i ].bind( kernelView.blocksArray[ i ] ); return *this; } @@ -350,8 +353,9 @@ template< typename Index, typename Device > void CSRAdaptiveKernelView< Index, Device >:: -printBlocks() const +printBlocks( int idx ) const { + auto& blocks = this->blocksArray[ idx ]; for( Index i = 0; i < this->blocks.getSize(); i++ ) { auto block = blocks.getElement( i ); diff --git a/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelParameters.h b/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelParameters.h index f9dedbaf0bbafb5f346102a0de5abdd8a80ab05a..2546580f8f3492460460748f07a28a17ce9dbcdf 100644 --- a/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelParameters.h +++ b/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelParameters.h @@ -15,7 +15,7 @@ namespace TNL { namespace Segments { namespace details { -template< typename Value, +template< int SizeOfValue, int StreamedSharedMemory_ = 24576 > struct CSRAdaptiveKernelParameters { @@ -37,7 +37,7 @@ struct CSRAdaptiveKernelParameters /** * \brief Number of elements fitting into streamed shared memory. */ - static constexpr size_t StreamedSharedElementsCount() { return StreamedSharedMemory() / sizeof( Value ); }; + static constexpr size_t StreamedSharedElementsCount() { return StreamedSharedMemory() / SizeOfValue; }; /** * \brief Computes number of warps in one CUDA block.