From fc12a48d99ad40c8d5b7f31122b7a353dee3eabf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Fri, 29 Jan 2021 15:24:47 +0100 Subject: [PATCH] And now the real optimization of the Adaptive CSR kernel initiation :). --- .../Algorithms/Segments/CSRKernelAdaptive.h | 95 +++++++++---------- 1 file changed, 45 insertions(+), 50 deletions(-) diff --git a/src/TNL/Algorithms/Segments/CSRKernelAdaptive.h b/src/TNL/Algorithms/Segments/CSRKernelAdaptive.h index 0b71c2b91..84f1cc437 100644 --- a/src/TNL/Algorithms/Segments/CSRKernelAdaptive.h +++ b/src/TNL/Algorithms/Segments/CSRKernelAdaptive.h @@ -119,7 +119,7 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks, const Index segmentIdx = block.getFirstSegment();//block.index[0]; //minID = offsets[block.index[0] ]; const Index end = offsets[segmentIdx + 1]; - const int tid = threadIdx.x; + //const int tid = threadIdx.x; //const int inBlockWarpIdx = block.getWarpIdx(); //if( to > end ) @@ -342,12 +342,12 @@ template< typename Index, typename Device > struct CSRKernelAdaptive { - using IndexType = Index; - using DeviceType = Device; - using ViewType = CSRKernelAdaptiveView< Index, Device >; - using ConstViewType = CSRKernelAdaptiveView< Index, Device >; - using BlocksType = typename ViewType::BlocksType; - using BlocksView = typename BlocksType::ViewType; + using IndexType = Index; + using DeviceType = Device; + using ViewType = CSRKernelAdaptiveView< Index, Device >; + using ConstViewType = CSRKernelAdaptiveView< Index, Device >; + using BlocksType = typename ViewType::BlocksType; + using BlocksView = typename BlocksType::ViewType; static TNL::String getKernelType() { @@ -382,11 +382,9 @@ struct CSRKernelAdaptive Index &sum ) { sum = 0; - TNL::Containers::Vector< typename Offsets::IndexType, TNL::Devices::Host, typename Offsets::IndexType > - hostOffsets( offsets ); for (Index current = start; current < size - 1; current++ ) { - Index elements = hostOffsets[ current + 1 ] - hostOffsets[ current ]; + Index elements = offsets[ current + 1 ] - offsets[ current ]; sum += elements; if( sum > SHARED_PER_WARP ) { @@ -401,7 +399,6 @@ struct CSRKernelAdaptive type = details::Type::VECTOR; else type = details::Type::LONG; - //type = Type::LONG; // return current + 1; } } @@ -410,48 +407,46 @@ struct CSRKernelAdaptive return size - 1; // return last row pointer } - template< typename Offsets > - void init( const Offsets& offsets ) - { - const Index rows = offsets.getSize(); - Index sum, start( 0 ), nextStart( 0 ); + template< typename Offsets > + void init( const Offsets& offsets ) + { + using HostOffsetsType = TNL::Containers::Vector< typename Offsets::IndexType, TNL::Devices::Host, typename Offsets::IndexType >; + HostOffsetsType hostOffsets( offsets ); + const Index rows = offsets.getSize(); + Index sum, start( 0 ), nextStart( 0 ); - // Fill blocks - std::vector< details::CSRAdaptiveKernelBlockDescriptor< Index > > inBlocks; - inBlocks.reserve( rows ); + // Fill blocks + std::vector< details::CSRAdaptiveKernelBlockDescriptor< Index > > inBlocks; + inBlocks.reserve( rows ); - while( nextStart != rows - 1 ) - { - details::Type type; - nextStart = findLimit( start, offsets, rows, type, sum ); + while( nextStart != rows - 1 ) + { + details::Type type; + nextStart = findLimit( start, hostOffsets, rows, type, sum ); - if( type == details::Type::LONG ) - { - const Index blocksCount = inBlocks.size(); - const Index warpsPerCudaBlock = THREADS_ADAPTIVE / TNL::Cuda::getWarpSize(); - Index warpsLeft = roundUpDivision( blocksCount, warpsPerCudaBlock ) * warpsPerCudaBlock - blocksCount; - if( warpsLeft == 0 ) - warpsLeft = warpsPerCudaBlock; - //Index parts = roundUpDivision(sum, this->SHARED_PER_WARP); - inBlocks.emplace_back( start, details::Type::LONG, 0, warpsLeft ); - for( Index index = 1; index < warpsLeft; index++ ) - { - inBlocks.emplace_back( start, details::Type::LONG, index, warpsLeft ); - } - } - else - { - inBlocks.emplace_back(start, type, - nextStart, - offsets.getElement(nextStart), - offsets.getElement(start) ); - } - start = nextStart; - } - inBlocks.emplace_back(nextStart); - this->blocks = inBlocks; - this->view.setBlocks( blocks ); - }; + if( type == details::Type::LONG ) + { + const Index blocksCount = inBlocks.size(); + const Index warpsPerCudaBlock = THREADS_ADAPTIVE / TNL::Cuda::getWarpSize(); + Index warpsLeft = roundUpDivision( blocksCount, warpsPerCudaBlock ) * warpsPerCudaBlock - blocksCount; + if( warpsLeft == 0 ) + warpsLeft = warpsPerCudaBlock; + for( Index index = 0; index < warpsLeft; index++ ) + inBlocks.emplace_back( start, details::Type::LONG, index, warpsLeft ); + } + else + { + inBlocks.emplace_back(start, type, + nextStart, + offsets.getElement(nextStart), + offsets.getElement(start) ); + } + start = nextStart; + } + inBlocks.emplace_back(nextStart); + this->blocks = inBlocks; + this->view.setBlocks( blocks ); + } void reset() { -- GitLab