Loading src/TNL/Algorithms/Segments/CSRAdaptiveKernel.h +20 −11 Original line number Diff line number Diff line Loading @@ -63,13 +63,15 @@ struct CSRAdaptiveKernel using BlocksType = typename ViewType::BlocksType; using BlocksView = typename BlocksType::ViewType; static constexpr int MaxValueSizeLog() { return ViewType::MaxValueSizeLog; }; static TNL::String getKernelType(); static constexpr Index THREADS_ADAPTIVE = details::CSRAdaptiveKernelParameters< Index >::CudaBlockSize(); //sizeof(Index) == 8 ? 128 : 256; static constexpr Index THREADS_ADAPTIVE = details::CSRAdaptiveKernelParameters< sizeof( Index ) >::CudaBlockSize(); //sizeof(Index) == 8 ? 128 : 256; // How many shared memory use per block in CSR Adaptive kernel static constexpr Index SHARED_PER_BLOCK = details::CSRAdaptiveKernelParameters< Index >::StreamedSharedMemory(); //20000; //24576; TODO: static constexpr Index SHARED_PER_BLOCK = details::CSRAdaptiveKernelParameters< sizeof( Index ) >::StreamedSharedMemory(); //20000; //24576; TODO: // Number of elements in shared memory static constexpr Index SHARED = SHARED_PER_BLOCK/sizeof(double); Loading @@ -84,14 +86,7 @@ struct CSRAdaptiveKernel static constexpr Index MAX_ELEMENTS_PER_WARP = 384; // Max length of row to process one warp for CSR Adaptive static constexpr Index MAX_ELEMENTS_PER_WARP_ADAPT = details::CSRAdaptiveKernelParameters< Index >::MaxAdaptiveElementsPerWarp(); template< typename Offsets > Index findLimit( const Index start, const Offsets& offsets, const Index size, details::Type &type, Index &sum ); static constexpr Index MAX_ELEMENTS_PER_WARP_ADAPT = details::CSRAdaptiveKernelParameters< sizeof( Index ) >::MaxAdaptiveElementsPerWarp(); template< typename Offsets > void init( const Offsets& offsets ); Loading @@ -118,7 +113,21 @@ struct CSRAdaptiveKernel Args... args ) const; protected: BlocksType blocks; template< int SizeOfValue, typename Offsets > Index findLimit( const Index start, const Offsets& offsets, const Index size, details::Type &type, Index &sum ); template< int SizeOfValue, typename Offsets > void initValueSize( const Offsets& offsets ); /** * \brief blocksArray[ i ] stores blocks for sizeof( Value ) == 2^i. */ BlocksType blocksArray[ MaxValueSizeLog() ]; ViewType view; }; Loading src/TNL/Algorithms/Segments/CSRAdaptiveKernel.hpp +83 −59 Original line number Diff line number Diff line Loading @@ -31,10 +31,81 @@ getKernelType() return ViewType::getKernelType(); }; template< typename Index, typename Device > template< typename Offsets > void CSRAdaptiveKernel< Index, Device >:: init( const Offsets& offsets ) { this->template initValueSize< 1 >( offsets ); this->template initValueSize< 2 >( offsets ); this->template initValueSize< 4 >( offsets ); this->template initValueSize< 8 >( offsets ); this->template initValueSize< 16 >( offsets ); this->template initValueSize< 32 >( offsets ); for( int i = 0; i < MaxValueSizeLog(); i++ ) this->view.setBlocks( blocksArray[ i ], i ); } template< typename Index, typename Device > void CSRAdaptiveKernel< Index, Device >:: reset() { for( int i = 0; i < MaxValueSizeLog(); i++ ) { this->blocksArray[ i ].reset(); this->view.setBlocks( this->blocksArray[ i ], i ); } } template< typename Index, typename Device > auto CSRAdaptiveKernel< Index, Device >:: getView() -> ViewType { return this->view; } template< typename Index, typename Device > auto CSRAdaptiveKernel< Index, Device >:: getConstView() const -> ConstViewType { return this->view; }; template< typename Index, typename Device > template< typename OffsetsView, typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args > void CSRAdaptiveKernel< Index, Device >:: segmentsReduction( const OffsetsView& offsets, Index first, Index last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const { view.segmentsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... ); } template< typename Index, typename Device > template< int SizeOfValue, typename Offsets > Index CSRAdaptiveKernel< Index, Device >:: findLimit( const Index start, Loading @@ -48,7 +119,7 @@ findLimit( const Index start, { Index elements = offsets[ current + 1 ] - offsets[ current ]; sum += elements; if( sum > SHARED_PER_WARP ) if( sum > details::CSRAdaptiveKernelParameters< SizeOfValue >::StreamedSharedElementsPerWarp() ) { if( current - start > 0 ) // extra row { Loading @@ -57,7 +128,7 @@ findLimit( const Index start, } else { // one long row if( sum <= 2 * MAX_ELEMENTS_PER_WARP_ADAPT ) if( sum <= 2 * details::CSRAdaptiveKernelParameters< SizeOfValue >::MaxAdaptiveElementsPerWarp() ) //MAX_ELEMENTS_PER_WARP_ADAPT ) type = details::Type::VECTOR; else type = details::Type::LONG; Loading @@ -71,10 +142,11 @@ findLimit( const Index start, template< typename Index, typename Device > template< typename Offsets > template< int SizeOfValue, typename Offsets > void CSRAdaptiveKernel< Index, Device >:: init( const Offsets& offsets ) initValueSize( const Offsets& offsets ) { using HostOffsetsType = TNL::Containers::Vector< typename Offsets::IndexType, TNL::Devices::Host, typename Offsets::IndexType >; HostOffsetsType hostOffsets( offsets ); Loading @@ -88,7 +160,7 @@ init( const Offsets& offsets ) while( nextStart != rows - 1 ) { details::Type type; nextStart = findLimit( start, hostOffsets, rows, type, sum ); nextStart = findLimit< SizeOfValue >( start, hostOffsets, rows, type, sum ); if( type == details::Type::LONG ) { Loading @@ -110,58 +182,10 @@ init( const Offsets& offsets ) start = nextStart; } inBlocks.emplace_back(nextStart); this->blocks = inBlocks; this->view.setBlocks( blocks ); } template< typename Index, typename Device > void CSRAdaptiveKernel< Index, Device >:: reset() { this->blocks.reset(); this->view.setBlocks( blocks ); } template< typename Index, typename Device > auto CSRAdaptiveKernel< Index, Device >:: getView() -> ViewType { return this->view; } template< typename Index, typename Device > auto CSRAdaptiveKernel< Index, Device >:: getConstView() const -> ConstViewType { return this->view; }; template< typename Index, typename Device > template< typename OffsetsView, typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args > void CSRAdaptiveKernel< Index, Device >:: segmentsReduction( const OffsetsView& offsets, Index first, Index last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const { view.segmentsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... ); //std::cerr << "Setting blocks to " << std::log2( SizeOfValue ) << std::endl; TNL_ASSERT_LT( std::log2( SizeOfValue ), MaxValueSizeLog(), "" ); TNL_ASSERT_GE( std::log2( SizeOfValue ), 0, "" ); this->blocksArray[ (int ) std::log2( SizeOfValue ) ] = inBlocks; } } // namespace Segments Loading src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.h +6 −4 Original line number Diff line number Diff line Loading @@ -28,11 +28,13 @@ struct CSRAdaptiveKernelView using BlocksType = TNL::Containers::Vector< details::CSRAdaptiveKernelBlockDescriptor< Index >, Device, Index >; using BlocksView = typename BlocksType::ViewType; static constexpr int MaxValueSizeLog = 6; CSRAdaptiveKernelView() = default; CSRAdaptiveKernelView( BlocksType& blocks ); void setBlocks( BlocksType& blocks ); void setBlocks( BlocksType& blocks, const int idx ); ViewType getView(); Loading @@ -57,10 +59,10 @@ struct CSRAdaptiveKernelView CSRAdaptiveKernelView& operator=( const CSRAdaptiveKernelView< Index, Device >& kernelView ); void printBlocks() const; void printBlocks( int idx ) const; protected: BlocksView blocks; BlocksView blocksArray[ MaxValueSizeLog ]; }; } // namespace Segments Loading src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp +21 −17 Original line number Diff line number Diff line Loading @@ -50,10 +50,10 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks, Real zero, Args... args ) { static constexpr int CudaBlockSize = details::CSRAdaptiveKernelParameters< Real >::CudaBlockSize(); constexpr int WarpSize = Cuda::getWarpSize(); constexpr int WarpsCount = details::CSRAdaptiveKernelParameters< Real >::WarpsCount(); constexpr size_t StreamedSharedElementsPerWarp = details::CSRAdaptiveKernelParameters< Real >::StreamedSharedElementsPerWarp(); static constexpr int CudaBlockSize = details::CSRAdaptiveKernelParameters< sizeof( Real ) >::CudaBlockSize(); //constexpr int WarpSize = Cuda::getWarpSize(); //constexpr int WarpsCount = details::CSRAdaptiveKernelParameters< sizeof( Real ) >::WarpsCount(); //constexpr size_t StreamedSharedElementsPerWarp = details::CSRAdaptiveKernelParameters< sizeof( Real ) >::StreamedSharedElementsPerWarp(); __shared__ Real streamShared[ WARPS ][ SHARED_PER_WARP ]; Loading Loading @@ -199,21 +199,21 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks, } #endif template< typename Index, /*template< typename Index, typename Device > CSRAdaptiveKernelView< Index, Device >:: CSRAdaptiveKernelView( BlocksType& blocks ) { this->blocks.bind( blocks ); } }*/ template< typename Index, typename Device > void CSRAdaptiveKernelView< Index, Device >:: setBlocks( BlocksType& blocks ) setBlocks( BlocksType& blocks, const int idx ) { this->blocks.bind( blocks ); this->blocksArray[ idx ].bind( blocks ); } template< typename Index, Loading Loading @@ -263,23 +263,25 @@ segmentsReduction( const OffsetsView& offsets, Args... args ) const { #ifdef HAVE_CUDA if( details::CheckFetchLambda< Index, Fetch >::hasAllParameters() ) int valueSizeLog = std::ceil( log2f( ( double ) sizeof( Real ) ) ); if( details::CheckFetchLambda< Index, Fetch >::hasAllParameters() || valueSizeLog > MaxValueSizeLog ) { TNL::Algorithms::Segments::CSRScalarKernel< Index, Device >:: segmentsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... ); return; } static constexpr Index THREADS_ADAPTIVE = details::CSRAdaptiveKernelParameters< Real >::CudaBlockSize(); //sizeof(Index) == 8 ? 128 : 256; static constexpr Index THREADS_ADAPTIVE = details::CSRAdaptiveKernelParameters< sizeof( Real ) >::CudaBlockSize(); //sizeof(Index) == 8 ? 128 : 256; /* Max length of row to process one warp for CSR Light, MultiVector */ //static constexpr Index MAX_ELEMENTS_PER_WARP = 384; /* Max length of row to process one warp for CSR Adaptive */ static constexpr Index MAX_ELEMENTS_PER_WARP_ADAPT = details::CSRAdaptiveKernelParameters< Real >::MaxAdaptiveElementsPerWarp(); //static constexpr Index MAX_ELEMENTS_PER_WARP_ADAPT = details::CSRAdaptiveKernelParameters< sizeof( Real ) >::MaxAdaptiveElementsPerWarp(); /* How many shared memory use per block in CSR Adaptive kernel */ static constexpr Index SHARED_PER_BLOCK = details::CSRAdaptiveKernelParameters< Real >::StreamedSharedMemory(); static constexpr Index SHARED_PER_BLOCK = details::CSRAdaptiveKernelParameters< sizeof( Real ) >::StreamedSharedMemory(); /* Number of elements in shared memory */ static constexpr Index SHARED = SHARED_PER_BLOCK/sizeof(Real); Loading @@ -298,7 +300,7 @@ segmentsReduction( const OffsetsView& offsets, constexpr size_t MAX_X_DIM = 2147483647; /* Fill blocks */ size_t neededThreads = this->blocks.getSize() * warpSize; // one warp per block size_t neededThreads = this->blocksArray[ valueSizeLog ].getSize() * warpSize; // one warp per block /* Execute kernels on device */ for (Index gridIdx = 0; neededThreads != 0; gridIdx++ ) { Loading @@ -317,12 +319,12 @@ segmentsReduction( const OffsetsView& offsets, warpSize, WARPS, SHARED_PER_WARP, details::CSRAdaptiveKernelParameters< Real >::MaxAdaptiveElementsPerWarp(), details::CSRAdaptiveKernelParameters< sizeof( Real ) >::MaxAdaptiveElementsPerWarp(), BlocksView, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real, Args... > <<<blocksCount, threads>>>( this->blocks, this->blocksArray[ valueSizeLog ], gridIdx, offsets, first, Loading @@ -342,7 +344,8 @@ CSRAdaptiveKernelView< Index, Device >& CSRAdaptiveKernelView< Index, Device >:: operator=( const CSRAdaptiveKernelView< Index, Device >& kernelView ) { this->blocks.bind( kernelView.blocks ); for( int i = 0; i < MaxValueSizeLog; i++ ) this->blocksArray[ i ].bind( kernelView.blocksArray[ i ] ); return *this; } Loading @@ -350,8 +353,9 @@ template< typename Index, typename Device > void CSRAdaptiveKernelView< Index, Device >:: printBlocks() const printBlocks( int idx ) const { auto& blocks = this->blocksArray[ idx ]; for( Index i = 0; i < this->blocks.getSize(); i++ ) { auto block = blocks.getElement( i ); Loading src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelParameters.h +2 −2 Original line number Diff line number Diff line Loading @@ -15,7 +15,7 @@ namespace TNL { namespace Segments { namespace details { template< typename Value, template< int SizeOfValue, int StreamedSharedMemory_ = 24576 > struct CSRAdaptiveKernelParameters { Loading @@ -37,7 +37,7 @@ struct CSRAdaptiveKernelParameters /** * \brief Number of elements fitting into streamed shared memory. */ static constexpr size_t StreamedSharedElementsCount() { return StreamedSharedMemory() / sizeof( Value ); }; static constexpr size_t StreamedSharedElementsCount() { return StreamedSharedMemory() / SizeOfValue; }; /** * \brief Computes number of warps in one CUDA block. Loading Loading
src/TNL/Algorithms/Segments/CSRAdaptiveKernel.h +20 −11 Original line number Diff line number Diff line Loading @@ -63,13 +63,15 @@ struct CSRAdaptiveKernel using BlocksType = typename ViewType::BlocksType; using BlocksView = typename BlocksType::ViewType; static constexpr int MaxValueSizeLog() { return ViewType::MaxValueSizeLog; }; static TNL::String getKernelType(); static constexpr Index THREADS_ADAPTIVE = details::CSRAdaptiveKernelParameters< Index >::CudaBlockSize(); //sizeof(Index) == 8 ? 128 : 256; static constexpr Index THREADS_ADAPTIVE = details::CSRAdaptiveKernelParameters< sizeof( Index ) >::CudaBlockSize(); //sizeof(Index) == 8 ? 128 : 256; // How many shared memory use per block in CSR Adaptive kernel static constexpr Index SHARED_PER_BLOCK = details::CSRAdaptiveKernelParameters< Index >::StreamedSharedMemory(); //20000; //24576; TODO: static constexpr Index SHARED_PER_BLOCK = details::CSRAdaptiveKernelParameters< sizeof( Index ) >::StreamedSharedMemory(); //20000; //24576; TODO: // Number of elements in shared memory static constexpr Index SHARED = SHARED_PER_BLOCK/sizeof(double); Loading @@ -84,14 +86,7 @@ struct CSRAdaptiveKernel static constexpr Index MAX_ELEMENTS_PER_WARP = 384; // Max length of row to process one warp for CSR Adaptive static constexpr Index MAX_ELEMENTS_PER_WARP_ADAPT = details::CSRAdaptiveKernelParameters< Index >::MaxAdaptiveElementsPerWarp(); template< typename Offsets > Index findLimit( const Index start, const Offsets& offsets, const Index size, details::Type &type, Index &sum ); static constexpr Index MAX_ELEMENTS_PER_WARP_ADAPT = details::CSRAdaptiveKernelParameters< sizeof( Index ) >::MaxAdaptiveElementsPerWarp(); template< typename Offsets > void init( const Offsets& offsets ); Loading @@ -118,7 +113,21 @@ struct CSRAdaptiveKernel Args... args ) const; protected: BlocksType blocks; template< int SizeOfValue, typename Offsets > Index findLimit( const Index start, const Offsets& offsets, const Index size, details::Type &type, Index &sum ); template< int SizeOfValue, typename Offsets > void initValueSize( const Offsets& offsets ); /** * \brief blocksArray[ i ] stores blocks for sizeof( Value ) == 2^i. */ BlocksType blocksArray[ MaxValueSizeLog() ]; ViewType view; }; Loading
src/TNL/Algorithms/Segments/CSRAdaptiveKernel.hpp +83 −59 Original line number Diff line number Diff line Loading @@ -31,10 +31,81 @@ getKernelType() return ViewType::getKernelType(); }; template< typename Index, typename Device > template< typename Offsets > void CSRAdaptiveKernel< Index, Device >:: init( const Offsets& offsets ) { this->template initValueSize< 1 >( offsets ); this->template initValueSize< 2 >( offsets ); this->template initValueSize< 4 >( offsets ); this->template initValueSize< 8 >( offsets ); this->template initValueSize< 16 >( offsets ); this->template initValueSize< 32 >( offsets ); for( int i = 0; i < MaxValueSizeLog(); i++ ) this->view.setBlocks( blocksArray[ i ], i ); } template< typename Index, typename Device > void CSRAdaptiveKernel< Index, Device >:: reset() { for( int i = 0; i < MaxValueSizeLog(); i++ ) { this->blocksArray[ i ].reset(); this->view.setBlocks( this->blocksArray[ i ], i ); } } template< typename Index, typename Device > auto CSRAdaptiveKernel< Index, Device >:: getView() -> ViewType { return this->view; } template< typename Index, typename Device > auto CSRAdaptiveKernel< Index, Device >:: getConstView() const -> ConstViewType { return this->view; }; template< typename Index, typename Device > template< typename OffsetsView, typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args > void CSRAdaptiveKernel< Index, Device >:: segmentsReduction( const OffsetsView& offsets, Index first, Index last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const { view.segmentsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... ); } template< typename Index, typename Device > template< int SizeOfValue, typename Offsets > Index CSRAdaptiveKernel< Index, Device >:: findLimit( const Index start, Loading @@ -48,7 +119,7 @@ findLimit( const Index start, { Index elements = offsets[ current + 1 ] - offsets[ current ]; sum += elements; if( sum > SHARED_PER_WARP ) if( sum > details::CSRAdaptiveKernelParameters< SizeOfValue >::StreamedSharedElementsPerWarp() ) { if( current - start > 0 ) // extra row { Loading @@ -57,7 +128,7 @@ findLimit( const Index start, } else { // one long row if( sum <= 2 * MAX_ELEMENTS_PER_WARP_ADAPT ) if( sum <= 2 * details::CSRAdaptiveKernelParameters< SizeOfValue >::MaxAdaptiveElementsPerWarp() ) //MAX_ELEMENTS_PER_WARP_ADAPT ) type = details::Type::VECTOR; else type = details::Type::LONG; Loading @@ -71,10 +142,11 @@ findLimit( const Index start, template< typename Index, typename Device > template< typename Offsets > template< int SizeOfValue, typename Offsets > void CSRAdaptiveKernel< Index, Device >:: init( const Offsets& offsets ) initValueSize( const Offsets& offsets ) { using HostOffsetsType = TNL::Containers::Vector< typename Offsets::IndexType, TNL::Devices::Host, typename Offsets::IndexType >; HostOffsetsType hostOffsets( offsets ); Loading @@ -88,7 +160,7 @@ init( const Offsets& offsets ) while( nextStart != rows - 1 ) { details::Type type; nextStart = findLimit( start, hostOffsets, rows, type, sum ); nextStart = findLimit< SizeOfValue >( start, hostOffsets, rows, type, sum ); if( type == details::Type::LONG ) { Loading @@ -110,58 +182,10 @@ init( const Offsets& offsets ) start = nextStart; } inBlocks.emplace_back(nextStart); this->blocks = inBlocks; this->view.setBlocks( blocks ); } template< typename Index, typename Device > void CSRAdaptiveKernel< Index, Device >:: reset() { this->blocks.reset(); this->view.setBlocks( blocks ); } template< typename Index, typename Device > auto CSRAdaptiveKernel< Index, Device >:: getView() -> ViewType { return this->view; } template< typename Index, typename Device > auto CSRAdaptiveKernel< Index, Device >:: getConstView() const -> ConstViewType { return this->view; }; template< typename Index, typename Device > template< typename OffsetsView, typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args > void CSRAdaptiveKernel< Index, Device >:: segmentsReduction( const OffsetsView& offsets, Index first, Index last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const { view.segmentsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... ); //std::cerr << "Setting blocks to " << std::log2( SizeOfValue ) << std::endl; TNL_ASSERT_LT( std::log2( SizeOfValue ), MaxValueSizeLog(), "" ); TNL_ASSERT_GE( std::log2( SizeOfValue ), 0, "" ); this->blocksArray[ (int ) std::log2( SizeOfValue ) ] = inBlocks; } } // namespace Segments Loading
src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.h +6 −4 Original line number Diff line number Diff line Loading @@ -28,11 +28,13 @@ struct CSRAdaptiveKernelView using BlocksType = TNL::Containers::Vector< details::CSRAdaptiveKernelBlockDescriptor< Index >, Device, Index >; using BlocksView = typename BlocksType::ViewType; static constexpr int MaxValueSizeLog = 6; CSRAdaptiveKernelView() = default; CSRAdaptiveKernelView( BlocksType& blocks ); void setBlocks( BlocksType& blocks ); void setBlocks( BlocksType& blocks, const int idx ); ViewType getView(); Loading @@ -57,10 +59,10 @@ struct CSRAdaptiveKernelView CSRAdaptiveKernelView& operator=( const CSRAdaptiveKernelView< Index, Device >& kernelView ); void printBlocks() const; void printBlocks( int idx ) const; protected: BlocksView blocks; BlocksView blocksArray[ MaxValueSizeLog ]; }; } // namespace Segments Loading
src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp +21 −17 Original line number Diff line number Diff line Loading @@ -50,10 +50,10 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks, Real zero, Args... args ) { static constexpr int CudaBlockSize = details::CSRAdaptiveKernelParameters< Real >::CudaBlockSize(); constexpr int WarpSize = Cuda::getWarpSize(); constexpr int WarpsCount = details::CSRAdaptiveKernelParameters< Real >::WarpsCount(); constexpr size_t StreamedSharedElementsPerWarp = details::CSRAdaptiveKernelParameters< Real >::StreamedSharedElementsPerWarp(); static constexpr int CudaBlockSize = details::CSRAdaptiveKernelParameters< sizeof( Real ) >::CudaBlockSize(); //constexpr int WarpSize = Cuda::getWarpSize(); //constexpr int WarpsCount = details::CSRAdaptiveKernelParameters< sizeof( Real ) >::WarpsCount(); //constexpr size_t StreamedSharedElementsPerWarp = details::CSRAdaptiveKernelParameters< sizeof( Real ) >::StreamedSharedElementsPerWarp(); __shared__ Real streamShared[ WARPS ][ SHARED_PER_WARP ]; Loading Loading @@ -199,21 +199,21 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks, } #endif template< typename Index, /*template< typename Index, typename Device > CSRAdaptiveKernelView< Index, Device >:: CSRAdaptiveKernelView( BlocksType& blocks ) { this->blocks.bind( blocks ); } }*/ template< typename Index, typename Device > void CSRAdaptiveKernelView< Index, Device >:: setBlocks( BlocksType& blocks ) setBlocks( BlocksType& blocks, const int idx ) { this->blocks.bind( blocks ); this->blocksArray[ idx ].bind( blocks ); } template< typename Index, Loading Loading @@ -263,23 +263,25 @@ segmentsReduction( const OffsetsView& offsets, Args... args ) const { #ifdef HAVE_CUDA if( details::CheckFetchLambda< Index, Fetch >::hasAllParameters() ) int valueSizeLog = std::ceil( log2f( ( double ) sizeof( Real ) ) ); if( details::CheckFetchLambda< Index, Fetch >::hasAllParameters() || valueSizeLog > MaxValueSizeLog ) { TNL::Algorithms::Segments::CSRScalarKernel< Index, Device >:: segmentsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... ); return; } static constexpr Index THREADS_ADAPTIVE = details::CSRAdaptiveKernelParameters< Real >::CudaBlockSize(); //sizeof(Index) == 8 ? 128 : 256; static constexpr Index THREADS_ADAPTIVE = details::CSRAdaptiveKernelParameters< sizeof( Real ) >::CudaBlockSize(); //sizeof(Index) == 8 ? 128 : 256; /* Max length of row to process one warp for CSR Light, MultiVector */ //static constexpr Index MAX_ELEMENTS_PER_WARP = 384; /* Max length of row to process one warp for CSR Adaptive */ static constexpr Index MAX_ELEMENTS_PER_WARP_ADAPT = details::CSRAdaptiveKernelParameters< Real >::MaxAdaptiveElementsPerWarp(); //static constexpr Index MAX_ELEMENTS_PER_WARP_ADAPT = details::CSRAdaptiveKernelParameters< sizeof( Real ) >::MaxAdaptiveElementsPerWarp(); /* How many shared memory use per block in CSR Adaptive kernel */ static constexpr Index SHARED_PER_BLOCK = details::CSRAdaptiveKernelParameters< Real >::StreamedSharedMemory(); static constexpr Index SHARED_PER_BLOCK = details::CSRAdaptiveKernelParameters< sizeof( Real ) >::StreamedSharedMemory(); /* Number of elements in shared memory */ static constexpr Index SHARED = SHARED_PER_BLOCK/sizeof(Real); Loading @@ -298,7 +300,7 @@ segmentsReduction( const OffsetsView& offsets, constexpr size_t MAX_X_DIM = 2147483647; /* Fill blocks */ size_t neededThreads = this->blocks.getSize() * warpSize; // one warp per block size_t neededThreads = this->blocksArray[ valueSizeLog ].getSize() * warpSize; // one warp per block /* Execute kernels on device */ for (Index gridIdx = 0; neededThreads != 0; gridIdx++ ) { Loading @@ -317,12 +319,12 @@ segmentsReduction( const OffsetsView& offsets, warpSize, WARPS, SHARED_PER_WARP, details::CSRAdaptiveKernelParameters< Real >::MaxAdaptiveElementsPerWarp(), details::CSRAdaptiveKernelParameters< sizeof( Real ) >::MaxAdaptiveElementsPerWarp(), BlocksView, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real, Args... > <<<blocksCount, threads>>>( this->blocks, this->blocksArray[ valueSizeLog ], gridIdx, offsets, first, Loading @@ -342,7 +344,8 @@ CSRAdaptiveKernelView< Index, Device >& CSRAdaptiveKernelView< Index, Device >:: operator=( const CSRAdaptiveKernelView< Index, Device >& kernelView ) { this->blocks.bind( kernelView.blocks ); for( int i = 0; i < MaxValueSizeLog; i++ ) this->blocksArray[ i ].bind( kernelView.blocksArray[ i ] ); return *this; } Loading @@ -350,8 +353,9 @@ template< typename Index, typename Device > void CSRAdaptiveKernelView< Index, Device >:: printBlocks() const printBlocks( int idx ) const { auto& blocks = this->blocksArray[ idx ]; for( Index i = 0; i < this->blocks.getSize(); i++ ) { auto block = blocks.getElement( i ); Loading
src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelParameters.h +2 −2 Original line number Diff line number Diff line Loading @@ -15,7 +15,7 @@ namespace TNL { namespace Segments { namespace details { template< typename Value, template< int SizeOfValue, int StreamedSharedMemory_ = 24576 > struct CSRAdaptiveKernelParameters { Loading @@ -37,7 +37,7 @@ struct CSRAdaptiveKernelParameters /** * \brief Number of elements fitting into streamed shared memory. */ static constexpr size_t StreamedSharedElementsCount() { return StreamedSharedMemory() / sizeof( Value ); }; static constexpr size_t StreamedSharedElementsCount() { return StreamedSharedMemory() / SizeOfValue; }; /** * \brief Computes number of warps in one CUDA block. Loading