Loading src/TNL/Algorithms/Segments/CSRKernelAdaptive.h +16 −119 Original line number Diff line number Diff line Loading @@ -16,113 +16,12 @@ #include <TNL/Algorithms/ParallelFor.h> #include <TNL/Algorithms/Segments/details/LambdaAdapter.h> #include <TNL/Algorithms/Segments/CSRKernelScalar.h> #include <TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h> namespace TNL { namespace Algorithms { namespace Segments { enum class Type { /* LONG = 0!!! Non zero value rewrites index[1] */ LONG = 0, STREAM = 1, VECTOR = 2 }; /*template< typename Index > struct LongBlockDescription { uint8_t type; }*/ template< typename Index > union Block { Block(Index row, Type type = Type::VECTOR, Index index = 0) noexcept { this->index[0] = row; this->index[1] = index; this->byte[sizeof(Index) == 4 ? 7 : 15] = (uint8_t)type; } Block(Index row, Type type, Index nextRow, Index maxID, Index minID) noexcept { this->index[0] = row; this->index[1] = 0; this->twobytes[sizeof(Index) == 4 ? 2 : 4] = maxID - minID; if (type == Type::STREAM) this->twobytes[sizeof(Index) == 4 ? 3 : 5] = nextRow - row; if (type == Type::STREAM) this->byte[sizeof(Index) == 4 ? 7 : 15] |= 0b1000000; else if (type == Type::VECTOR) this->byte[sizeof(Index) == 4 ? 7 : 15] |= 0b10000000; } Block() = default; __cuda_callable__ Type getType() const { if( byte[ sizeof( Index ) == 4 ? 7 : 15 ] & 0b1000000 ) return Type::STREAM; if( byte[ sizeof( Index ) == 4 ? 7 : 15 ] & 0b10000000 ) return Type::VECTOR; return Type::LONG; } __cuda_callable__ const Index& getFirstSegment() const { return index[ 0 ]; } /*** * \brief Returns number of elements covered by the block. */ __cuda_callable__ const Index getSize() const { return twobytes[ sizeof(Index) == 4 ? 2 : 4 ]; } /*** * \brief Returns number of segments covered by the block. */ __cuda_callable__ const Index getSegmentsInBlock() const { return ( twobytes[ sizeof( Index ) == 4 ? 3 : 5 ] & 0x3FFF ); } void print( std::ostream& str ) const { Type type = this->getType(); str << "Type: "; switch( type ) { case Type::STREAM: str << " Stream "; break; case Type::VECTOR: str << " Vector "; break; case Type::LONG: str << " Long "; break; } str << " first segment: " << getFirstSegment(); str << " block end: " << getSize(); str << " index in warp: " << index[ 1 ]; } Index index[2]; // index[0] is row pointer, index[1] is index in warp uint8_t byte[sizeof(Index) == 4 ? 8 : 16]; // byte[7/15] is type specificator uint16_t twobytes[sizeof(Index) == 4 ? 4 : 8]; //twobytes[2/4] is maxID - minID //twobytes[3/5] is nextRow - row }; template< typename Index > std::ostream& operator<< ( std::ostream& str, const Block< Index >& block ) { block.print( str ); return str; } #ifdef HAVE_CUDA template< int CudaBlockSize, Loading Loading @@ -163,12 +62,12 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks, Real result = zero; bool compute( true ); const Index laneIdx = threadIdx.x & 31; // & is cheaper than % const Block< Index > block = blocks[ blockIdx ]; const details::CSRAdaptiveKernelBlockDescriptor< Index > block = blocks[ blockIdx ]; const Index& firstSegmentIdx = block.getFirstSegment(); const Index begin = offsets[ firstSegmentIdx ]; const auto blockType = block.getType(); if( blockType == Type::STREAM ) // Stream kernel - many short segments per warp if( blockType == details::Type::STREAM ) // Stream kernel - many short segments per warp { const Index warpIdx = threadIdx.x / 32; const Index end = begin + block.getSize(); Loading @@ -194,7 +93,7 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks, keep( i, result ); } } else if( blockType == Type::VECTOR ) // Vector kernel - one segment per warp else if( blockType == details::Type::VECTOR ) // Vector kernel - one segment per warp { const Index end = begin + block.getSize(); const Index segmentIdx = block.getFirstSegment(); Loading Loading @@ -274,7 +173,7 @@ struct CSRKernelAdaptiveView using DeviceType = Device; using ViewType = CSRKernelAdaptiveView< Index, Device >; using ConstViewType = CSRKernelAdaptiveView< Index, Device >; using BlocksType = TNL::Containers::Vector< Block< Index >, Device, Index >; using BlocksType = TNL::Containers::Vector< details::CSRAdaptiveKernelBlockDescriptor< Index >, Device, Index >; using BlocksView = typename BlocksType::ViewType; CSRKernelAdaptiveView() = default; Loading Loading @@ -320,10 +219,10 @@ struct CSRKernelAdaptiveView static constexpr Index THREADS_ADAPTIVE = sizeof(Index) == 8 ? 128 : 256; //static constexpr Index THREADS_SCALAR = 128; //static constexpr Index THREADS_VECTOR = 128; static constexpr Index THREADS_LIGHT = 128; //static constexpr Index THREADS_LIGHT = 128; /* Max length of row to process one warp for CSR Light, MultiVector */ static constexpr Index MAX_ELEMENTS_PER_WARP = 384; //static constexpr Index MAX_ELEMENTS_PER_WARP = 384; /* Max length of row to process one warp for CSR Adaptive */ static constexpr Index MAX_ELEMENTS_PER_WARP_ADAPT = 512; Loading Loading @@ -443,7 +342,7 @@ struct CSRKernelAdaptive Index findLimit( const Index start, const Offsets& offsets, const Index size, Type &type, details::Type &type, Index &sum ) { sum = 0; Loading @@ -456,21 +355,21 @@ struct CSRKernelAdaptive { if( current - start > 0 ) // extra row { type = Type::STREAM; type = details::Type::STREAM; return current; } else { // one long row if( sum <= 2 * MAX_ELEMENTS_PER_WARP_ADAPT ) type = Type::VECTOR; type = details::Type::VECTOR; else type = Type::VECTOR; // TODO: Put LONG back type = details::Type::VECTOR; // TODO: Put LONG back //type = Type::LONG; // return current + 1; } } } type = Type::STREAM; type = details::Type::STREAM; return size - 1; // return last row pointer } Loading @@ -481,17 +380,17 @@ struct CSRKernelAdaptive Index sum, start( 0 ), nextStart( 0 ); // Fill blocks std::vector< Block< Index > > inBlock; std::vector< details::CSRAdaptiveKernelBlockDescriptor< Index > > inBlock; inBlock.reserve( rows ); while( nextStart != rows - 1 ) { Type type; details::Type type; nextStart = findLimit( start, offsets, rows, type, sum ); if( type == Type::LONG ) if( type == details::Type::LONG ) { inBlock.emplace_back( start, Type::LONG, 0 ); inBlock.emplace_back( start, details::Type::LONG, 0 ); const Index blocksCount = inBlock.size(); const Index warpsPerCudaBlock = THREADS_ADAPTIVE / TNL::Cuda::getWarpSize(); const Index warpsLeft = roundUpDivision( blocksCount, warpsPerCudaBlock ) * warpsPerCudaBlock - blocksCount; Loading Loading @@ -554,8 +453,6 @@ struct CSRKernelAdaptive ViewType view; }; } // namespace Segments } // namespace Algorithms } // namespace TNL src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h 0 → 100644 +118 −0 Original line number Diff line number Diff line /*************************************************************************** CSRAdaptiveKernelBlockDescriptor.h - description ------------------- begin : Jan 25, 2021 -> Joe Biden inauguration copyright : (C) 2021 by Tomas Oberhuber email : tomas.oberhuber@fjfi.cvut.cz ***************************************************************************/ /* See Copyright Notice in tnl/Copyright */ #pragma once namespace TNL { namespace Algorithms { namespace Segments { namespace details { enum class Type { /* LONG = 0!!! Non zero value rewrites index[1] */ LONG = 0, STREAM = 1, VECTOR = 2 }; template< typename Index > union CSRAdaptiveKernelBlockDescriptor { CSRAdaptiveKernelBlockDescriptor(Index row, Type type = Type::VECTOR, Index index = 0) noexcept { this->index[0] = row; this->index[1] = index; this->byte[sizeof(Index) == 4 ? 7 : 15] = (uint8_t)type; } CSRAdaptiveKernelBlockDescriptor(Index row, Type type, Index nextRow, Index maxID, Index minID) noexcept { this->index[0] = row; this->index[1] = 0; this->twobytes[sizeof(Index) == 4 ? 2 : 4] = maxID - minID; if (type == Type::STREAM) this->twobytes[sizeof(Index) == 4 ? 3 : 5] = nextRow - row; if (type == Type::STREAM) this->byte[sizeof(Index) == 4 ? 7 : 15] |= 0b1000000; else if (type == Type::VECTOR) this->byte[sizeof(Index) == 4 ? 7 : 15] |= 0b10000000; } CSRAdaptiveKernelBlockDescriptor() = default; __cuda_callable__ Type getType() const { if( byte[ sizeof( Index ) == 4 ? 7 : 15 ] & 0b1000000 ) return Type::STREAM; if( byte[ sizeof( Index ) == 4 ? 7 : 15 ] & 0b10000000 ) return Type::VECTOR; return Type::LONG; } __cuda_callable__ const Index& getFirstSegment() const { return index[ 0 ]; } /*** * \brief Returns number of elements covered by the block. */ __cuda_callable__ const Index getSize() const { return twobytes[ sizeof(Index) == 4 ? 2 : 4 ]; } /*** * \brief Returns number of segments covered by the block. */ __cuda_callable__ const Index getSegmentsInBlock() const { return ( twobytes[ sizeof( Index ) == 4 ? 3 : 5 ] & 0x3FFF ); } void print( std::ostream& str ) const { Type type = this->getType(); str << "Type: "; switch( type ) { case Type::STREAM: str << " Stream "; break; case Type::VECTOR: str << " Vector "; break; case Type::LONG: str << " Long "; break; } str << " first segment: " << getFirstSegment(); str << " block end: " << getSize(); str << " index in warp: " << index[ 1 ]; } Index index[2]; // index[0] is row pointer, index[1] is index in warp uint8_t byte[sizeof(Index) == 4 ? 8 : 16]; // byte[7/15] is type specificator uint16_t twobytes[sizeof(Index) == 4 ? 4 : 8]; //twobytes[2/4] is maxID - minID //twobytes[3/5] is nextRow - row }; template< typename Index > std::ostream& operator<< ( std::ostream& str, const CSRAdaptiveKernelBlockDescriptor< Index >& block ) { block.print( str ); return str; } } // namespace details } // namespace Segments } // namespace Algorithms } // namespace TNL No newline at end of file Loading
src/TNL/Algorithms/Segments/CSRKernelAdaptive.h +16 −119 Original line number Diff line number Diff line Loading @@ -16,113 +16,12 @@ #include <TNL/Algorithms/ParallelFor.h> #include <TNL/Algorithms/Segments/details/LambdaAdapter.h> #include <TNL/Algorithms/Segments/CSRKernelScalar.h> #include <TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h> namespace TNL { namespace Algorithms { namespace Segments { enum class Type { /* LONG = 0!!! Non zero value rewrites index[1] */ LONG = 0, STREAM = 1, VECTOR = 2 }; /*template< typename Index > struct LongBlockDescription { uint8_t type; }*/ template< typename Index > union Block { Block(Index row, Type type = Type::VECTOR, Index index = 0) noexcept { this->index[0] = row; this->index[1] = index; this->byte[sizeof(Index) == 4 ? 7 : 15] = (uint8_t)type; } Block(Index row, Type type, Index nextRow, Index maxID, Index minID) noexcept { this->index[0] = row; this->index[1] = 0; this->twobytes[sizeof(Index) == 4 ? 2 : 4] = maxID - minID; if (type == Type::STREAM) this->twobytes[sizeof(Index) == 4 ? 3 : 5] = nextRow - row; if (type == Type::STREAM) this->byte[sizeof(Index) == 4 ? 7 : 15] |= 0b1000000; else if (type == Type::VECTOR) this->byte[sizeof(Index) == 4 ? 7 : 15] |= 0b10000000; } Block() = default; __cuda_callable__ Type getType() const { if( byte[ sizeof( Index ) == 4 ? 7 : 15 ] & 0b1000000 ) return Type::STREAM; if( byte[ sizeof( Index ) == 4 ? 7 : 15 ] & 0b10000000 ) return Type::VECTOR; return Type::LONG; } __cuda_callable__ const Index& getFirstSegment() const { return index[ 0 ]; } /*** * \brief Returns number of elements covered by the block. */ __cuda_callable__ const Index getSize() const { return twobytes[ sizeof(Index) == 4 ? 2 : 4 ]; } /*** * \brief Returns number of segments covered by the block. */ __cuda_callable__ const Index getSegmentsInBlock() const { return ( twobytes[ sizeof( Index ) == 4 ? 3 : 5 ] & 0x3FFF ); } void print( std::ostream& str ) const { Type type = this->getType(); str << "Type: "; switch( type ) { case Type::STREAM: str << " Stream "; break; case Type::VECTOR: str << " Vector "; break; case Type::LONG: str << " Long "; break; } str << " first segment: " << getFirstSegment(); str << " block end: " << getSize(); str << " index in warp: " << index[ 1 ]; } Index index[2]; // index[0] is row pointer, index[1] is index in warp uint8_t byte[sizeof(Index) == 4 ? 8 : 16]; // byte[7/15] is type specificator uint16_t twobytes[sizeof(Index) == 4 ? 4 : 8]; //twobytes[2/4] is maxID - minID //twobytes[3/5] is nextRow - row }; template< typename Index > std::ostream& operator<< ( std::ostream& str, const Block< Index >& block ) { block.print( str ); return str; } #ifdef HAVE_CUDA template< int CudaBlockSize, Loading Loading @@ -163,12 +62,12 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks, Real result = zero; bool compute( true ); const Index laneIdx = threadIdx.x & 31; // & is cheaper than % const Block< Index > block = blocks[ blockIdx ]; const details::CSRAdaptiveKernelBlockDescriptor< Index > block = blocks[ blockIdx ]; const Index& firstSegmentIdx = block.getFirstSegment(); const Index begin = offsets[ firstSegmentIdx ]; const auto blockType = block.getType(); if( blockType == Type::STREAM ) // Stream kernel - many short segments per warp if( blockType == details::Type::STREAM ) // Stream kernel - many short segments per warp { const Index warpIdx = threadIdx.x / 32; const Index end = begin + block.getSize(); Loading @@ -194,7 +93,7 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks, keep( i, result ); } } else if( blockType == Type::VECTOR ) // Vector kernel - one segment per warp else if( blockType == details::Type::VECTOR ) // Vector kernel - one segment per warp { const Index end = begin + block.getSize(); const Index segmentIdx = block.getFirstSegment(); Loading Loading @@ -274,7 +173,7 @@ struct CSRKernelAdaptiveView using DeviceType = Device; using ViewType = CSRKernelAdaptiveView< Index, Device >; using ConstViewType = CSRKernelAdaptiveView< Index, Device >; using BlocksType = TNL::Containers::Vector< Block< Index >, Device, Index >; using BlocksType = TNL::Containers::Vector< details::CSRAdaptiveKernelBlockDescriptor< Index >, Device, Index >; using BlocksView = typename BlocksType::ViewType; CSRKernelAdaptiveView() = default; Loading Loading @@ -320,10 +219,10 @@ struct CSRKernelAdaptiveView static constexpr Index THREADS_ADAPTIVE = sizeof(Index) == 8 ? 128 : 256; //static constexpr Index THREADS_SCALAR = 128; //static constexpr Index THREADS_VECTOR = 128; static constexpr Index THREADS_LIGHT = 128; //static constexpr Index THREADS_LIGHT = 128; /* Max length of row to process one warp for CSR Light, MultiVector */ static constexpr Index MAX_ELEMENTS_PER_WARP = 384; //static constexpr Index MAX_ELEMENTS_PER_WARP = 384; /* Max length of row to process one warp for CSR Adaptive */ static constexpr Index MAX_ELEMENTS_PER_WARP_ADAPT = 512; Loading Loading @@ -443,7 +342,7 @@ struct CSRKernelAdaptive Index findLimit( const Index start, const Offsets& offsets, const Index size, Type &type, details::Type &type, Index &sum ) { sum = 0; Loading @@ -456,21 +355,21 @@ struct CSRKernelAdaptive { if( current - start > 0 ) // extra row { type = Type::STREAM; type = details::Type::STREAM; return current; } else { // one long row if( sum <= 2 * MAX_ELEMENTS_PER_WARP_ADAPT ) type = Type::VECTOR; type = details::Type::VECTOR; else type = Type::VECTOR; // TODO: Put LONG back type = details::Type::VECTOR; // TODO: Put LONG back //type = Type::LONG; // return current + 1; } } } type = Type::STREAM; type = details::Type::STREAM; return size - 1; // return last row pointer } Loading @@ -481,17 +380,17 @@ struct CSRKernelAdaptive Index sum, start( 0 ), nextStart( 0 ); // Fill blocks std::vector< Block< Index > > inBlock; std::vector< details::CSRAdaptiveKernelBlockDescriptor< Index > > inBlock; inBlock.reserve( rows ); while( nextStart != rows - 1 ) { Type type; details::Type type; nextStart = findLimit( start, offsets, rows, type, sum ); if( type == Type::LONG ) if( type == details::Type::LONG ) { inBlock.emplace_back( start, Type::LONG, 0 ); inBlock.emplace_back( start, details::Type::LONG, 0 ); const Index blocksCount = inBlock.size(); const Index warpsPerCudaBlock = THREADS_ADAPTIVE / TNL::Cuda::getWarpSize(); const Index warpsLeft = roundUpDivision( blocksCount, warpsPerCudaBlock ) * warpsPerCudaBlock - blocksCount; Loading Loading @@ -554,8 +453,6 @@ struct CSRKernelAdaptive ViewType view; }; } // namespace Segments } // namespace Algorithms } // namespace TNL
src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h 0 → 100644 +118 −0 Original line number Diff line number Diff line /*************************************************************************** CSRAdaptiveKernelBlockDescriptor.h - description ------------------- begin : Jan 25, 2021 -> Joe Biden inauguration copyright : (C) 2021 by Tomas Oberhuber email : tomas.oberhuber@fjfi.cvut.cz ***************************************************************************/ /* See Copyright Notice in tnl/Copyright */ #pragma once namespace TNL { namespace Algorithms { namespace Segments { namespace details { enum class Type { /* LONG = 0!!! Non zero value rewrites index[1] */ LONG = 0, STREAM = 1, VECTOR = 2 }; template< typename Index > union CSRAdaptiveKernelBlockDescriptor { CSRAdaptiveKernelBlockDescriptor(Index row, Type type = Type::VECTOR, Index index = 0) noexcept { this->index[0] = row; this->index[1] = index; this->byte[sizeof(Index) == 4 ? 7 : 15] = (uint8_t)type; } CSRAdaptiveKernelBlockDescriptor(Index row, Type type, Index nextRow, Index maxID, Index minID) noexcept { this->index[0] = row; this->index[1] = 0; this->twobytes[sizeof(Index) == 4 ? 2 : 4] = maxID - minID; if (type == Type::STREAM) this->twobytes[sizeof(Index) == 4 ? 3 : 5] = nextRow - row; if (type == Type::STREAM) this->byte[sizeof(Index) == 4 ? 7 : 15] |= 0b1000000; else if (type == Type::VECTOR) this->byte[sizeof(Index) == 4 ? 7 : 15] |= 0b10000000; } CSRAdaptiveKernelBlockDescriptor() = default; __cuda_callable__ Type getType() const { if( byte[ sizeof( Index ) == 4 ? 7 : 15 ] & 0b1000000 ) return Type::STREAM; if( byte[ sizeof( Index ) == 4 ? 7 : 15 ] & 0b10000000 ) return Type::VECTOR; return Type::LONG; } __cuda_callable__ const Index& getFirstSegment() const { return index[ 0 ]; } /*** * \brief Returns number of elements covered by the block. */ __cuda_callable__ const Index getSize() const { return twobytes[ sizeof(Index) == 4 ? 2 : 4 ]; } /*** * \brief Returns number of segments covered by the block. */ __cuda_callable__ const Index getSegmentsInBlock() const { return ( twobytes[ sizeof( Index ) == 4 ? 3 : 5 ] & 0x3FFF ); } void print( std::ostream& str ) const { Type type = this->getType(); str << "Type: "; switch( type ) { case Type::STREAM: str << " Stream "; break; case Type::VECTOR: str << " Vector "; break; case Type::LONG: str << " Long "; break; } str << " first segment: " << getFirstSegment(); str << " block end: " << getSize(); str << " index in warp: " << index[ 1 ]; } Index index[2]; // index[0] is row pointer, index[1] is index in warp uint8_t byte[sizeof(Index) == 4 ? 8 : 16]; // byte[7/15] is type specificator uint16_t twobytes[sizeof(Index) == 4 ? 4 : 8]; //twobytes[2/4] is maxID - minID //twobytes[3/5] is nextRow - row }; template< typename Index > std::ostream& operator<< ( std::ostream& str, const CSRAdaptiveKernelBlockDescriptor< Index >& block ) { block.print( str ); return str; } } // namespace details } // namespace Segments } // namespace Algorithms } // namespace TNL No newline at end of file