Commit c3bf6918 authored by Tomáš Oberhuber's avatar Tomáš Oberhuber
Browse files

Implementing BiEllpack segments.

parent 36d81639
Loading
Loading
Loading
Loading
+33 −2
Original line number Diff line number Diff line
@@ -75,7 +75,7 @@ class BiEllpack
      IndexType getStorageSize() const;

      __cuda_callable__
      IndexType getGlobalIndex( const Index segmentIdx, const Index localIdx ) const;
      IndexType getGlobalIndex( const IndexType segmentIdx, const IndexType localIdx ) const;

      __cuda_callable__
      SegmentViewType getSegmentView( const IndexType segmentIdx ) const;
@@ -117,7 +117,23 @@ class BiEllpack

      static constexpr int getWarpSize() { return WarpSize; };

      static constexpr int getLogWarpSize() { return std::log( WarpSize ); };
      static constexpr int getLogWarpSize() { return std::log2( WarpSize ); };

      template< typename SizesHolder = OffsetsHolder >
      void performRowBubbleSort( const SizesHolder& segmentsSize );

      template< typename SizesHolder = OffsetsHolder >
      void computeColumnSizes( const SizesHolder& segmentsSizes );

      template< typename SizesHolder = OffsetsHolder >
      void verifyRowPerm( const SizesHolder& segmentsSizes );

      template< typename SizesHolder = OffsetsHolder >
      void verifyRowLengths( const SizesHolder& segmentsSizes );

      IndexType getStripLength( const IndexType stripIdx ) const;

      IndexType getGroupLength( const IndexType strip, const IndexType group ) const;

      IndexType size = 0, storageSize = 0;

@@ -127,6 +143,21 @@ class BiEllpack

      OffsetsHolder groupPointers;



      // TODO: Replace later
      __cuda_callable__ Index power( const IndexType number, const IndexType exponent ) const
      {
          if( exponent >= 0 )
          {
              IndexType result = 1;
              for( IndexType i = 0; i < exponent; i++ )
                  result *= number;
              return result;
          }
          return 0;
      };

      template< typename Device_, typename Index_, typename IndexAllocator_, bool RowMajorOrder_, int WarpSize_ >
      friend class BiEllpack;
};
+242 −10
Original line number Diff line number Diff line
@@ -10,6 +10,7 @@

#pragma once

#include <math.h>
#include <TNL/Containers/Vector.h>
#include <TNL/Algorithms/ParallelFor.h>
#include <TNL/Containers/Segments/BiEllpack.h>
@@ -113,20 +114,240 @@ template< typename Device,
          bool RowMajorOrder,
          int WarpSize >
   template< typename SizesHolder >
void
BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >::
setSegmentsSizes( const SizesHolder& segmentsSizes )
void BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >::
performRowBubbleSort( const SizesHolder& segmentsSizes )
{
   this->rowPermArray.evaluate( [] __cuda_callable__ ( const IndexType i ) -> IndexType { return i; } );

   if( std::is_same< DeviceType, Devices::Host >::value )
   {
      IndexType strips = this->virtualRows / getWarpSize();
      for( IndexType i = 0; i < strips; i++ )
      {
         IndexType begin = i * getWarpSize();
         IndexType end = ( i + 1 ) * getWarpSize() - 1;
         if(this->getSize() - 1 < end)
            end = this->getSize() - 1;
         bool sorted = false;
         IndexType permIndex1, permIndex2, offset = 0;
         while( !sorted )
         {
            sorted = true;
            for( IndexType j = begin + offset; j < end - offset; j++ )
            {
               for( IndexType k = begin; k < end + 1; k++ )
               {
                  if( this->rowPermArray.getElement( k ) == j )
                     permIndex1 = k;
                  if( this->rowPermArray.getElement( k ) == j + 1 )
                     permIndex2 = k;
               }
               if( segmentsSizes.getElement( permIndex1 ) < segmentsSizes.getElement( permIndex2 ) )
               {
                  IndexType temp = this->rowPermArray.getElement( permIndex1 );
                  this->rowPermArray.setElement( permIndex1, this->rowPermArray.getElement( permIndex2 ) );
                  this->rowPermArray.setElement( permIndex2, temp );
                  sorted = false;
               }
            }
            for( IndexType j = end - 1 - offset; j > begin + offset; j-- )
            {
               for( IndexType k = begin; k < end + 1; k++ )
               {
                  if( this->rowPermArray.getElement( k ) == j )
                     permIndex1 = k;
                  if( this->rowPermArray.getElement( k ) == j - 1 )
                     permIndex2 = k;
               }
               if( segmentsSizes.getElement( permIndex2 ) < segmentsSizes.getElement( permIndex1 ) )
               {
                  IndexType temp = this->rowPermArray.getElement( permIndex1 );
                  this->rowPermArray.setElement( permIndex1, this->rowPermArray.getElement( permIndex2 ) );
                  this->rowPermArray.setElement( permIndex2, temp );
                  sorted = false;
               }
            }
            offset++;
         }
      }
   }
}

template< typename Device,
          typename Index,
          typename IndexAllocator,
          bool RowMajorOrder,
          int WarpSize >
   template< typename SizesHolder >
void BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >::
computeColumnSizes( const SizesHolder& segmentsSizes )
{
   IndexType numberOfStrips = this->virtualRows / getWarpSize();
   auto groupPointersView = this->groupPointers.getView();
   auto segmentsPermutationView = this->rowPermArray.getView();
   auto segmentsSizesView = segmentsSizes.getConstView();
   const IndexType size = this->getSize();
   Algorithms::ParallelFor< DeviceType >::exec(
      ( IndexType ) 0,
      this->virtualRows / getWarpSize(),
      [=] __cuda_callable__ ( const IndexType strip ) mutable {

         IndexType firstSegment = strip * getWarpSize();
         IndexType groupBegin = strip * ( getLogWarpSize() + 1 );
         IndexType emptyGroups = 0;

         ////
         // The last strip can be shorter
         if( strip == numberOfStrips - 1 )
         {
            IndexType segmentsCount = size - firstSegment;
            while( !( segmentsCount > TNL::pow( getLogWarpSize() - 1 - emptyGroups, 2 ) ) )
               emptyGroups++;
            for( IndexType group = groupBegin; group < groupBegin + emptyGroups; group++ )
               groupPointersView[ group ] = 0;
         }

         IndexType allocatedColumns = 0;
         for( IndexType groupIdx = emptyGroups; groupIdx < getLogWarpSize(); groupIdx++ )
         {
            IndexType segmentIdx = TNL::pow( getLogWarpSize() - 1 - groupIdx, 2 );
            IndexType permSegm = 0;
            while( segmentsPermutationView[ permSegm + firstSegment ] != segmentIdx + firstSegment )
               permSegm++;
            const IndexType groupWidth = segmentsSizesView[ permSegm + firstSegment ] - allocatedColumns;
            const IndexType groupHeight = TNL::pow( getLogWarpSize() - groupIdx, 2 );
            const IndexType groupSize = groupWidth * groupHeight;
            allocatedColumns = segmentsSizes[ permSegm + firstSegment ];
            groupPointersView[ groupIdx + groupBegin ] = groupSize;
         }
      } );
}

template< typename Device,
          typename Index,
          typename IndexAllocator,
          bool RowMajorOrder,
          int WarpSize >
   template< typename SizesHolder >
void BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >::
verifyRowPerm( const SizesHolder& segmentsSizes )
{
   bool ok = true;
   IndexType numberOfStrips = this->virtualRows / getWarpSize();
   for( IndexType strip = 0; strip < numberOfStrips; strip++ )
   {
      IndexType begin = strip * getWarpSize();
      IndexType end = ( strip + 1 ) * getWarpSize();
      if( this->getSize() < end )
         end = this->getSize();
      for( IndexType i = begin; i < end - 1; i++ )
      {
         IndexType permIndex1, permIndex2;
         bool first = false;
         bool second = false;
         for( IndexType j = begin; j < end; j++ )
         {
            if( this->rowPermArray.getElement( j ) == i )
            {
               permIndex1 = j;
               first = true;
            }
            if( this->rowPermArray.getElement( j ) == i + 1 )
            {
               permIndex2 = j;
               second = true;
            }
         }
         if( !first || !second )
            std::cout << "Wrong permutation!" << std::endl;
         if( segmentsSizes.getElement( permIndex1 ) >= segmentsSizes.getElement( permIndex2 ) )
            continue;
         else
            ok = false;
      }
   }
   if( !ok )
      throw( std::logic_error( "Segments permutaion verification failed." ) );
}

template< typename Device,
          typename Index,
          typename IndexAllocator,
          bool RowMajorOrder,
          int WarpSize >
   template< typename SizesHolder >
void BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >::
verifyRowLengths( const SizesHolder& segmentsSizes )
{
   bool ok = true;
   for( IndexType segmentIdx = 0; segmentIdx < this->getSize(); segmentIdx++ )
   {
      BiEllpack< Devices::Host, Index, typename Allocators::Default< Devices::Host >::template Allocator< Index >, RowMajorOrder > hostSegments;
      const IndexType strip = segmentIdx / getWarpSize();
      const IndexType stripLength = this->getStripLength( strip );
      const IndexType groupBegin = ( getLogWarpSize() + 1 ) * strip;
      const IndexType rowStripPerm = this->rowPermArray.getElement( segmentIdx ) - strip * getWarpSize();
      const IndexType begin = this->groupPointers.getElement( groupBegin ) * getWarpSize() + rowStripPerm * stripLength;
      IndexType elementPtr = begin;
      IndexType rowLength = 0;
      const IndexType groupsCount = details::BiEllpack< Index, Device, RowMajorOrder, WarpSize >::getActiveGroupsCount( this->rowPermArray.getConstView(), segmentIdx );
      for( IndexType group = 0; group < groupsCount; group++ )
      {
         for( IndexType i = 0; i < this->getGroupLength( strip, group ); i++ )
         {
            IndexType biElementPtr = elementPtr;
            for( IndexType j = 0; j < this->power( 2, group ); j++ )
            {
               rowLength++;
               biElementPtr += this->power( 2, getLogWarpSize() - group ) * stripLength;
            }
            elementPtr++;
         }
      }
      if( segmentsSizes.getElement( segmentIdx ) > rowLength )
         ok = false;
   }
   if( ! ok )
      throw( std::logic_error( "Segments capacities verification failed." ) );
}

template< typename Device,
          typename Index,
          typename IndexAllocator,
          bool RowMajorOrder,
          int WarpSize >
   template< typename SizesHolder >
void
BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >::
setSegmentsSizes( const SizesHolder& segmentsSizes )
{
   //if( std::is_same< DeviceType, Devices::Host >::value )
   // {
      this->size = segmentsSizes.getSize();
      if( this->size % WarpSize != 0 )
         this->virtualRows = this->size + getWarpSize() - ( this->size % getWarpSize() );
      else
         this->virtualRows = this->size;
      IndexType strips = this->virtualRows / getWarpSize();
      this->rowPermArray.setSize( this->size );
      this->groupPointers.setSize( strips * ( getLogWarpSize() + 1 ) + 1 );
      this->groupPointers = 0;

      this->performRowBubbleSort( segmentsSizes );
      this->computeColumnSizes( segmentsSizes );

      this->groupPointers.template scan< Algorithms::ScanType::Exclusive >();

      this->verifyRowPerm( segmentsSizes );
      this->verifyRowLengths( segmentsSizes );
      this->storageSize =  getWarpSize() * this->groupPointers.getElement( strips * ( getLogWarpSize() + 1 ) );
   /*}
   else
   {
      BiEllpack< Devices::Host, Index, typename Allocators::Default< Devices::Host >::template Allocator< IndexType >, RowMajorOrder > hostSegments;
      Containers::Vector< IndexType, Devices::Host, IndexType > hostSegmentsSizes( segmentsSizes );
      hostSegments.setSegmentsSizes( hostSegmentsSizes );
      *this = hostSegments;
   }
   }*/
}

template< typename Device,
@@ -182,7 +403,7 @@ template< typename Device,
          bool RowMajorOrder,
          int WarpSize >
__cuda_callable__ auto BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >::
getGlobalIndex( const Index segmentIdx, const Index localIdx ) const -> IndexType
getGlobalIndex( const IndexType segmentIdx, const IndexType localIdx ) const -> IndexType
{
      return details::BiEllpack< IndexType, DeviceType, RowMajorOrder >::getGlobalIndex(
         rowPermArray.getConstView(),
@@ -308,11 +529,22 @@ template< typename Device,
          typename IndexAllocator,
          bool RowMajorOrder,
          int WarpSize >
void
BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >::
printStructure( std::ostream& str )
auto BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >::
getStripLength( const IndexType stripIdx ) const -> IndexType
{
   return details::BiEllpack< Index, Device, RowMajorOrder, WarpSize >::getStripLength( this->groupPointers.getConstView(), stripIdx );
}

template< typename Device,
          typename Index,
          typename IndexAllocator,
          bool RowMajorOrder,
          int WarpSize >
auto BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >::
getGroupLength( const IndexType strip, const IndexType group ) const -> IndexType
{
   this->getView().printStructure( str );
   return this->groupPointers.getElement( strip * ( getLogWarpSize() + 1 ) + group + 1 )
           - this->groupPointers.getElement( strip * ( getLogWarpSize() + 1 ) + group );
}

      } // namespace Segments
+39 −51
Original line number Diff line number Diff line
@@ -10,64 +10,42 @@

#pragma once

#include <TNL/Containers/StaticVector.h>

namespace TNL {
   namespace Containers {
      namespace Segments {

template< typename Index,
          bool RowMajorOrder = false >
class BiEllpackSegmentView;

template< typename Index >
class BiEllpackSegmentView< Index, false >
          bool RowMajorOrder = false,
          int WarpSize = 32 >
class BiEllpackSegmentView
{
   public:
      
      using IndexType = Index;

      __cuda_callable__
      BiEllpackSegmentView( const IndexType offset,
                                 const IndexType size,
                                 const IndexType chunkSize,      // this is only for compatibility with the following specialization
                                 const IndexType chunksInSlice ) // this one as well - both can be replaced when we could use constexprif in C++17
      : segmentOffset( offset ), segmentSize( size ){};

      __cuda_callable__
      BiEllpackSegmentView( const BiEllpackSegmentView& view )
      : segmentOffset( view.segmentOffset ), segmentSize( view.segmentSize ){};

      __cuda_callable__
      IndexType getSize() const
      {
         return this->segmentSize;
      };

      __cuda_callable__
      IndexType getGlobalIndex( const IndexType localIndex ) const
      {
         TNL_ASSERT_LT( localIndex, segmentSize, "Local index exceeds segment bounds." );
         return segmentOffset + localIndex;
      };
      static constexpr int getWarpSize() { return WarpSize; };

      protected:
      static constexpr int getLogWarpSize() { return std::log2( WarpSize ); };

         IndexType segmentOffset, segmentSize;
};

template< typename Index >
class BiEllpackSegmentView< Index, true >
{
   public:
      static constexpr int getGroupsCount() { return getLogWarpSize() + 1; };

      using IndexType = Index;
      using GroupsWidthType = Containers::StaticVector< getGroupsCount(), IndexType >;


      /**
       * \brief Constructor.
       * 
       * \param offset is offset of the first group of the strip the segment belongs to.
       * \param size is the segment size
       * \param inStripIdx is index of the segment within its strip.
       * \param groupsWidth is a static vector containing widths of the strip groups
       */
      __cuda_callable__
      BiEllpackSegmentView( const IndexType offset,
                                 const IndexType size,
                                 const IndexType chunkSize,
                                 const IndexType chunksInSlice )
      : segmentOffset( offset ), segmentSize( size ),
        chunkSize( chunkSize ), chunksInSlice( chunksInSlice ){};
                            const IndexType inStripIdx,
                            const GroupsWidthType& groupsWidth )
      : groupOffset( offset ), segmentSize( TNL::sum( groupsWidth ) ), inStripIdx( inStripIdx ), groupsWidth( groupsWidth ){};

      __cuda_callable__
      IndexType getSize() const
@@ -76,17 +54,27 @@ class BiEllpackSegmentView< Index, true >
      };

      __cuda_callable__
      IndexType getGlobalIndex( const IndexType localIdx ) const
      IndexType getGlobalIndex( IndexType localIdx ) const
      {
         IndexType i( 0 ), offset( groupOffset ), groupHeight( getWarpSize() );
         while( localIdx > groupsWidth[ i ] )
         {
         TNL_ASSERT_LT( localIdx, segmentSize, "Local index exceeds segment bounds." );
         const IndexType chunkIdx = localIdx / chunkSize;
         const IndexType inChunkOffset = localIdx % chunkSize;
         return segmentOffset + inChunkOffset * chunksInSlice + chunkIdx;
            localIdx -= groupsWidth[ i ];
            offset += groupsWidth[ i++ ] * groupHeight;
            groupHeight /= 2;
         }
         TNL_ASSERT_LE( i, TNL::log2( getWarpSize() - inStripIdx + 1 ), "Local index exceeds segment bounds." );
         if( RowMajorOrder )
            return offset + inStripIdx * groupsWidth[ i ] + localIdx;
         else
            return offset + inStripIdx + localIdx * groupHeight;
      };

      protected:

         IndexType segmentOffset, segmentSize, chunkSize, chunksInSlice;
         IndexType groupOffset, inStripIdx, segmentSize;

         GroupsWidthType groupsWidth;
};

      } //namespace Segments
+1 −1
Original line number Diff line number Diff line
@@ -134,7 +134,7 @@ class BiEllpackView

      static constexpr int getWarpSize() { return WarpSize; };

      static constexpr int getLogWarpSize() { return std::log( WarpSize ); };
      static constexpr int getLogWarpSize() { return std::log2( WarpSize ); };

      IndexType size = 0, storageSize = 0;

+3 −2
Original line number Diff line number Diff line
@@ -285,6 +285,7 @@ BiEllpackView< Device, Index, RowMajorOrder, WarpSize >::
segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
{
   using RealType = typename details::FetchLambdaAdapter< Index, Fetch >::ReturnType;
   
}

template< typename Device,
@@ -310,8 +311,8 @@ operator=( const BiEllpackView& source )
   this->size = source.size;
   this->storageSize = source.storageSize;
   this->virtualRows = source.virtualRows;
   this->rowPermArray = source.rowPermArray;
   this->groupPointers = source.groupPointers;
   this->rowPermArray.bind( source.rowPermArray );
   this->groupPointers.bind( source.groupPointers );
   return *this;
}

Loading