Loading src/TNL/Algorithms/Segments/CSRKernelAdaptive.h +10 −9 Original line number Diff line number Diff line Loading @@ -113,11 +113,12 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks, else // blockType == Type::LONG - several warps per segment { // Number of elements processed by previous warps const Index offset = block.index[1] * MAX_ELEM_PER_WARP; Index to = begin + (block.index[1] + 1) * MAX_ELEM_PER_WARP; const Index segmentIdx = block.index[0]; const Index offset = //block.index[1] * MAX_ELEM_PER_WARP; block.getWarpIdx() * MAX_ELEM_PER_WARP; Index to = begin + (block.getWarpIdx() + 1) * MAX_ELEM_PER_WARP; const Index segmentIdx = block.getFirstSegment();//block.index[0]; //minID = offsets[block.index[0] ]; const Index end = offsets[block.index[0] + 1]; const Index end = offsets[segmentIdx + 1]; const int tid = threadIdx.x; if( to > end ) Loading Loading @@ -215,7 +216,7 @@ struct CSRKernelAdaptiveView return; } //this->printBlocks(); this->printBlocks(); static constexpr Index THREADS_ADAPTIVE = sizeof(Index) == 8 ? 128 : 256; //static constexpr Index THREADS_SCALAR = 128; //static constexpr Index THREADS_VECTOR = 128; Loading Loading @@ -390,15 +391,15 @@ struct CSRKernelAdaptive if( type == details::Type::LONG ) { inBlock.emplace_back( start, details::Type::LONG, 0 ); const Index blocksCount = inBlock.size(); const Index warpsPerCudaBlock = THREADS_ADAPTIVE / TNL::Cuda::getWarpSize(); const Index warpsLeft = roundUpDivision( blocksCount, warpsPerCudaBlock ) * warpsPerCudaBlock - blocksCount; //Index parts = roundUpDivision(sum, this->SHARED_PER_WARP); /*for( Index index = 1; index < warpsLeft; index++ ) inBlock.emplace_back( start, details::Type::LONG, 0, warpsLeft ); for( Index index = 1; index < warpsLeft; index++ ) { inBlock.emplace_back(start, Type::LONG, index); }*/ inBlock.emplace_back( start, details::Type::LONG, index, warpsLeft ); } } else { Loading src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h +51 −21 Original line number Diff line number Diff line Loading @@ -108,18 +108,35 @@ union CSRAdaptiveKernelBlockDescriptor #else template< typename Index > union CSRAdaptiveKernelBlockDescriptor { CSRAdaptiveKernelBlockDescriptor(Index row, Type type = Type::VECTOR, Index index = 0) noexcept { this->index[0] = row; struct CSRAdaptiveKernelBlockDescriptor { CSRAdaptiveKernelBlockDescriptor( Index firstSegmentIdx, Type type = Type::VECTOR, uint8_t warpIdx = 0, uint8_t warpsCount = 0 ) noexcept { this->firstSegmentIdx = firstSegmentIdx; this->type = ( uint8_t ) type; this->warpIdx = warpIdx; this->warpsCount = warpsCount; /*this->index[0] = row; this->index[1] = index; this->byte[sizeof(Index) == 4 ? 7 : 15] = (uint8_t)type; this->byte[sizeof(Index) == 4 ? 7 : 15] = (uint8_t)type;*/ } CSRAdaptiveKernelBlockDescriptor(Index row, Type type, Index nextRow, Index maxID, Index minID) noexcept CSRAdaptiveKernelBlockDescriptor( Index firstSegmentIdx, Type type, Index lastSegmentIdx, Index end, Index begin ) noexcept { this->index[0] = row; this->firstSegmentIdx = firstSegmentIdx; this->warpIdx = 0; this->blockSize = end - begin; this->segmentsInBlock = lastSegmentIdx - firstSegmentIdx; this->type = ( uint8_t ) type; /*this->index[0] = row; this->index[1] = 0; this->twobytes[sizeof(Index) == 4 ? 2 : 4] = maxID - minID; Loading @@ -129,23 +146,25 @@ union CSRAdaptiveKernelBlockDescriptor if (type == Type::STREAM) this->byte[sizeof(Index) == 4 ? 7 : 15] |= 0b1000000; else if (type == Type::VECTOR) this->byte[sizeof(Index) == 4 ? 7 : 15] |= 0b10000000; this->byte[sizeof(Index) == 4 ? 7 : 15] |= 0b10000000;*/ } CSRAdaptiveKernelBlockDescriptor() = default; __cuda_callable__ Type getType() const { if( byte[ sizeof( Index ) == 4 ? 7 : 15 ] & 0b1000000 ) return ( Type ) this->type; /*if( byte[ sizeof( Index ) == 4 ? 7 : 15 ] & 0b1000000 ) return Type::STREAM; if( byte[ sizeof( Index ) == 4 ? 7 : 15 ] & 0b10000000 ) return Type::VECTOR; return Type::LONG; return Type::LONG;*/ } __cuda_callable__ const Index& getFirstSegment() const { return index[ 0 ]; return this->firstSegmentIdx; //return index[ 0 ]; } /*** Loading @@ -153,7 +172,8 @@ union CSRAdaptiveKernelBlockDescriptor */ __cuda_callable__ const Index getSize() const { return twobytes[ sizeof(Index) == 4 ? 2 : 4 ]; return this->blockSize; //return twobytes[ sizeof(Index) == 4 ? 2 : 4 ]; } /*** Loading @@ -161,14 +181,19 @@ union CSRAdaptiveKernelBlockDescriptor */ __cuda_callable__ const Index getSegmentsInBlock() const { return ( twobytes[ sizeof( Index ) == 4 ? 3 : 5 ] & 0x3FFF ); return this->segmentsInBlock; //return ( twobytes[ sizeof( Index ) == 4 ? 3 : 5 ] & 0x3FFF ); } __cuda_callable__ const uint8_t getWarpIdx() const { return this->warpIdx; } void print( std::ostream& str ) const { Type type = this->getType(); str << "Type: "; switch( type ) switch( this->getType() ) { case Type::STREAM: str << " Stream "; Loading @@ -180,13 +205,18 @@ union CSRAdaptiveKernelBlockDescriptor str << " Long "; break; } str << " first segment: " << getFirstSegment(); str << " block end: " << getSize(); str << " index in warp: " << index[ 1 ]; str << " first segment: " << this->getFirstSegment(); str << " block end: " << this->getSize(); str << " index in warp: " << this->getWarpIdx(); } Index index[2]; // index[0] is row pointer, index[1] is index in warp uint8_t byte[sizeof(Index) == 4 ? 8 : 16]; // byte[7/15] is type specificator uint16_t twobytes[sizeof(Index) == 4 ? 4 : 8]; //twobytes[2/4] is maxID - minID uint8_t type; Index firstSegmentIdx, blockSize, segmentsInBlock; uint8_t warpIdx, warpsCount; //Index index[2]; // index[0] is row pointer, index[1] is index in warp //uint8_t byte[sizeof(Index) == 4 ? 8 : 16]; // byte[7/15] is type specificator //uint16_t twobytes[sizeof(Index) == 4 ? 4 : 8]; //twobytes[2/4] is maxID - minID //twobytes[3/5] is nextRow - row }; Loading Loading
src/TNL/Algorithms/Segments/CSRKernelAdaptive.h +10 −9 Original line number Diff line number Diff line Loading @@ -113,11 +113,12 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks, else // blockType == Type::LONG - several warps per segment { // Number of elements processed by previous warps const Index offset = block.index[1] * MAX_ELEM_PER_WARP; Index to = begin + (block.index[1] + 1) * MAX_ELEM_PER_WARP; const Index segmentIdx = block.index[0]; const Index offset = //block.index[1] * MAX_ELEM_PER_WARP; block.getWarpIdx() * MAX_ELEM_PER_WARP; Index to = begin + (block.getWarpIdx() + 1) * MAX_ELEM_PER_WARP; const Index segmentIdx = block.getFirstSegment();//block.index[0]; //minID = offsets[block.index[0] ]; const Index end = offsets[block.index[0] + 1]; const Index end = offsets[segmentIdx + 1]; const int tid = threadIdx.x; if( to > end ) Loading Loading @@ -215,7 +216,7 @@ struct CSRKernelAdaptiveView return; } //this->printBlocks(); this->printBlocks(); static constexpr Index THREADS_ADAPTIVE = sizeof(Index) == 8 ? 128 : 256; //static constexpr Index THREADS_SCALAR = 128; //static constexpr Index THREADS_VECTOR = 128; Loading Loading @@ -390,15 +391,15 @@ struct CSRKernelAdaptive if( type == details::Type::LONG ) { inBlock.emplace_back( start, details::Type::LONG, 0 ); const Index blocksCount = inBlock.size(); const Index warpsPerCudaBlock = THREADS_ADAPTIVE / TNL::Cuda::getWarpSize(); const Index warpsLeft = roundUpDivision( blocksCount, warpsPerCudaBlock ) * warpsPerCudaBlock - blocksCount; //Index parts = roundUpDivision(sum, this->SHARED_PER_WARP); /*for( Index index = 1; index < warpsLeft; index++ ) inBlock.emplace_back( start, details::Type::LONG, 0, warpsLeft ); for( Index index = 1; index < warpsLeft; index++ ) { inBlock.emplace_back(start, Type::LONG, index); }*/ inBlock.emplace_back( start, details::Type::LONG, index, warpsLeft ); } } else { Loading
src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h +51 −21 Original line number Diff line number Diff line Loading @@ -108,18 +108,35 @@ union CSRAdaptiveKernelBlockDescriptor #else template< typename Index > union CSRAdaptiveKernelBlockDescriptor { CSRAdaptiveKernelBlockDescriptor(Index row, Type type = Type::VECTOR, Index index = 0) noexcept { this->index[0] = row; struct CSRAdaptiveKernelBlockDescriptor { CSRAdaptiveKernelBlockDescriptor( Index firstSegmentIdx, Type type = Type::VECTOR, uint8_t warpIdx = 0, uint8_t warpsCount = 0 ) noexcept { this->firstSegmentIdx = firstSegmentIdx; this->type = ( uint8_t ) type; this->warpIdx = warpIdx; this->warpsCount = warpsCount; /*this->index[0] = row; this->index[1] = index; this->byte[sizeof(Index) == 4 ? 7 : 15] = (uint8_t)type; this->byte[sizeof(Index) == 4 ? 7 : 15] = (uint8_t)type;*/ } CSRAdaptiveKernelBlockDescriptor(Index row, Type type, Index nextRow, Index maxID, Index minID) noexcept CSRAdaptiveKernelBlockDescriptor( Index firstSegmentIdx, Type type, Index lastSegmentIdx, Index end, Index begin ) noexcept { this->index[0] = row; this->firstSegmentIdx = firstSegmentIdx; this->warpIdx = 0; this->blockSize = end - begin; this->segmentsInBlock = lastSegmentIdx - firstSegmentIdx; this->type = ( uint8_t ) type; /*this->index[0] = row; this->index[1] = 0; this->twobytes[sizeof(Index) == 4 ? 2 : 4] = maxID - minID; Loading @@ -129,23 +146,25 @@ union CSRAdaptiveKernelBlockDescriptor if (type == Type::STREAM) this->byte[sizeof(Index) == 4 ? 7 : 15] |= 0b1000000; else if (type == Type::VECTOR) this->byte[sizeof(Index) == 4 ? 7 : 15] |= 0b10000000; this->byte[sizeof(Index) == 4 ? 7 : 15] |= 0b10000000;*/ } CSRAdaptiveKernelBlockDescriptor() = default; __cuda_callable__ Type getType() const { if( byte[ sizeof( Index ) == 4 ? 7 : 15 ] & 0b1000000 ) return ( Type ) this->type; /*if( byte[ sizeof( Index ) == 4 ? 7 : 15 ] & 0b1000000 ) return Type::STREAM; if( byte[ sizeof( Index ) == 4 ? 7 : 15 ] & 0b10000000 ) return Type::VECTOR; return Type::LONG; return Type::LONG;*/ } __cuda_callable__ const Index& getFirstSegment() const { return index[ 0 ]; return this->firstSegmentIdx; //return index[ 0 ]; } /*** Loading @@ -153,7 +172,8 @@ union CSRAdaptiveKernelBlockDescriptor */ __cuda_callable__ const Index getSize() const { return twobytes[ sizeof(Index) == 4 ? 2 : 4 ]; return this->blockSize; //return twobytes[ sizeof(Index) == 4 ? 2 : 4 ]; } /*** Loading @@ -161,14 +181,19 @@ union CSRAdaptiveKernelBlockDescriptor */ __cuda_callable__ const Index getSegmentsInBlock() const { return ( twobytes[ sizeof( Index ) == 4 ? 3 : 5 ] & 0x3FFF ); return this->segmentsInBlock; //return ( twobytes[ sizeof( Index ) == 4 ? 3 : 5 ] & 0x3FFF ); } __cuda_callable__ const uint8_t getWarpIdx() const { return this->warpIdx; } void print( std::ostream& str ) const { Type type = this->getType(); str << "Type: "; switch( type ) switch( this->getType() ) { case Type::STREAM: str << " Stream "; Loading @@ -180,13 +205,18 @@ union CSRAdaptiveKernelBlockDescriptor str << " Long "; break; } str << " first segment: " << getFirstSegment(); str << " block end: " << getSize(); str << " index in warp: " << index[ 1 ]; str << " first segment: " << this->getFirstSegment(); str << " block end: " << this->getSize(); str << " index in warp: " << this->getWarpIdx(); } Index index[2]; // index[0] is row pointer, index[1] is index in warp uint8_t byte[sizeof(Index) == 4 ? 8 : 16]; // byte[7/15] is type specificator uint16_t twobytes[sizeof(Index) == 4 ? 4 : 8]; //twobytes[2/4] is maxID - minID uint8_t type; Index firstSegmentIdx, blockSize, segmentsInBlock; uint8_t warpIdx, warpsCount; //Index index[2]; // index[0] is row pointer, index[1] is index in warp //uint8_t byte[sizeof(Index) == 4 ? 8 : 16]; // byte[7/15] is type specificator //uint16_t twobytes[sizeof(Index) == 4 ? 4 : 8]; //twobytes[2/4] is maxID - minID //twobytes[3/5] is nextRow - row }; Loading