diff --git a/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp b/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp index 40700c50f25ed5c50539bf0d61dc7a6330212cfc..a9f921c73cea8c52b473837be21b6802ebc64f1a 100644 --- a/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp +++ b/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp @@ -53,16 +53,16 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks, constexpr size_t StreamedSharedElementsPerWarp = details::CSRAdaptiveKernelParameters< sizeof( Real ) >::StreamedSharedElementsPerWarp(); __shared__ Real streamShared[ WarpsCount ][ StreamedSharedElementsPerWarp ]; - //__shared__ Real multivectorShared[ CudaBlockSize / WarpSize ]; - //__shared__ BlockType sharedBlocks[ WarpsCount ]; + __shared__ Real multivectorShared[ CudaBlockSize / WarpSize ]; + __shared__ BlockType sharedBlocks[ WarpsCount ]; const Index index = ( ( gridIdx * TNL::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x ) + threadIdx.x; const Index blockIdx = index / WarpSize; if( blockIdx >= blocks.getSize() - 1 ) return; - //if( threadIdx.x < CudaBlockSize / WarpSize ) - // multivectorShared[ threadIdx.x ] = zero; + if( threadIdx.x < CudaBlockSize / WarpSize ) + multivectorShared[ threadIdx.x ] = zero; Real result = zero; bool compute( true ); const Index laneIdx = threadIdx.x & 31; // & is cheaper than % @@ -71,7 +71,8 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks, __syncthreads(); const auto& block = sharedBlocks[ warpIdx ];*/ const BlockType block = blocks[ blockIdx ]; - const Index begin = offsets[ block.getFirstSegment() ]; + const Index firstSegmentIdx = block.getFirstSegment(); + const Index begin = offsets[ firstSegmentIdx ]; if( block.getType() == details::Type::STREAM ) // Stream kernel - many short segments per warp { @@ -80,12 +81,10 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks, // Stream data to shared memory for( Index globalIdx = laneIdx + begin; globalIdx < end; globalIdx += WarpSize ) - { streamShared[ warpIdx ][ globalIdx - begin ] = fetch( globalIdx, compute ); - } - //const Index lastSegmentIdx = firstSegmentIdx + block.getSegmentsInBlock(); + const Index lastSegmentIdx = firstSegmentIdx + block.getSegmentsInBlock(); - /*for( Index i = firstSegmentIdx + laneIdx; i < lastSegmentIdx; i += WarpSize ) + for( Index i = firstSegmentIdx + laneIdx; i < lastSegmentIdx; i += WarpSize ) { const Index sharedEnd = offsets[ i + 1 ] - begin; // end of preprocessed data result = zero; @@ -93,9 +92,9 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks, for( Index sharedIdx = offsets[ i ] - begin; sharedIdx < sharedEnd; sharedIdx++ ) result = reduce( result, streamShared[ warpIdx ][ sharedIdx ] ); keep( i, result ); - }*/ + } } - /*else if( block.getType() == details::Type::VECTOR ) // Vector kernel - one segment per warp + else if( block.getType() == details::Type::VECTOR ) // Vector kernel - one segment per warp { const Index end = begin + block.getSize(); const Index segmentIdx = block.getFirstSegment(); @@ -172,7 +171,7 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks, keep( segmentIdx, multivectorShared[ 0 ] ); } } - }*/ + } } #endif diff --git a/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h b/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h index 96f1899b268596bc57ba395cec1556ab5fbdfff5..d2be8966453c9d1253720925cfea44545bfbbb96 100644 --- a/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h +++ b/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h @@ -22,11 +22,13 @@ enum class Type { VECTOR = 2 }; +//#define CSR_ADAPTIVE_UNION + #ifdef CSR_ADAPTIVE_UNION template< typename Index > union CSRAdaptiveKernelBlockDescriptor { - CSRAdaptiveKernelBlockDescriptor(Index row, Type type = Type::VECTOR, Index index = 0) noexcept + CSRAdaptiveKernelBlockDescriptor(Index row, Type type = Type::VECTOR, Index index = 0, uint8_t warpsCount = 0) noexcept { this->index[0] = row; this->index[1] = index; @@ -80,6 +82,16 @@ union CSRAdaptiveKernelBlockDescriptor return ( twobytes[ sizeof( Index ) == 4 ? 3 : 5 ] & 0x3FFF ); } + __cuda_callable__ uint8_t getWarpIdx() const + { + return index[ 1 ]; + } + + __cuda_callable__ uint8_t getWarpsCount() const + { + return 1; + } + void print( std::ostream& str ) const { Type type = this->getType(); diff --git a/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelParameters.h b/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelParameters.h index 3fa0855cb66fca91440896bd0ab4c38046044948..0f00fbd808772b14bb11e85951566730c766e310 100644 --- a/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelParameters.h +++ b/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelParameters.h @@ -15,7 +15,7 @@ namespace TNL { namespace Segments { namespace details { -static constexpr int CSRAdaptiveKernelParametersCudaBlockSizes[] = { 256, 256, 256, 128, 128, 128 }; +static constexpr int CSRAdaptiveKernelParametersCudaBlockSizes[] = { 256, 256, 256, 256, 256, 256 }; template< int SizeOfValue = 1, int StreamedSharedMemory_ = 24576 > @@ -25,7 +25,10 @@ struct CSRAdaptiveKernelParameters static constexpr int getSizeValueLogConstexpr( const int i ); + static constexpr int getSizeOfValue() { return SizeOfValue; }; + static constexpr int SizeOfValueLog = getSizeValueLogConstexpr( SizeOfValue ); + static_assert( SizeOfValueLog < MaxValueSizeLog, "Parameter SizeOfValue is too large." ); /**