Commit 7fd8fa6b authored by Tomáš Oberhuber
Browse files

Refactoring of ChunkedEllpack segments.

parent 3ce63c9f
Loading
Loading
Loading
Loading
+1 −6
Original line number Diff line number Diff line
@@ -36,7 +36,7 @@ class ChunkedEllpack
      using ViewTemplate = ChunkedEllpackView< Device_, Index_, RowMajorOrder >;
      using ConstViewType = ChunkedEllpackView< Device, std::add_const_t< Index >, RowMajorOrder >;
      using SegmentViewType = SegmentView< IndexType, RowMajorOrder >;
      using ChunkedEllpackSliceInfoType = ChunkedEllpackSliceInfo< IndexType >;
      using ChunkedEllpackSliceInfoType = details::ChunkedEllpackSliceInfo< IndexType >;
      //TODO: using ChunkedEllpackSliceInfoAllocator = typename IndexAllocatorType::retype< ChunkedEllpackSliceInfoType >;
      using ChunkedEllpackSliceInfoAllocator = typename Allocators::Default< Device >::template Allocator< ChunkedEllpackSliceInfoType >;
      using ChunkedEllpackSliceInfoContainer = Containers::Array< ChunkedEllpackSliceInfoType, DeviceType, IndexType, ChunkedEllpackSliceInfoAllocator >;
@@ -66,7 +66,6 @@ class ChunkedEllpack
      __cuda_callable__
      IndexType getSegmentsCount() const;

      __cuda_callable__
      IndexType getSegmentSize( const IndexType segmentIdx ) const;

      /**
@@ -75,16 +74,12 @@ class ChunkedEllpack
      __cuda_callable__
      IndexType getSize() const;


      __cuda_callable__
      IndexType getStorageSize() const;

      __cuda_callable__
      IndexType getGlobalIndex( const Index segmentIdx, const Index localIdx ) const;

      __cuda_callable__
      void getSegmentAndLocalIndex( const Index globalIdx, Index& segmentIdx, Index& localIdx ) const;

      __cuda_callable__
      SegmentViewType getSegmentView( const IndexType segmentIdx ) const;

+14 −121
Original line number Diff line number Diff line
@@ -289,21 +289,15 @@ template< typename Device,
          typename Index,
          typename IndexAllocator,
          bool RowMajorOrder >
__cuda_callable__
Index
ChunkedEllpack< Device, Index, IndexAllocator, RowMajorOrder >::
getSegmentSize( const IndexType segmentIdx ) const
{
   const IndexType& sliceIndex = rowToSliceMapping[ segmentIdx ];
   TNL_ASSERT_LE( sliceIndex, this->getSegmentsCount(), "" );
   IndexType firstChunkOfSegment( 0 );
   if( segmentIdx != slices[ sliceIndex ].firstRow )
      firstChunkOfSegment = rowToChunkMapping[ segmentIdx - 1 ];

   const IndexType lastChunkOfSegment = rowToChunkMapping[ segmentIdx ];
   const IndexType segmentChunksCount = lastChunkOfSegment - firstChunkOfSegment;
   const IndexType chunkSize = slices[ sliceIndex ].chunkSize;
   return chunkSize * segmentChunksCount;
   return details::ChunkedEllpack< IndexType, DeviceType, RowMajorOrder >::getSegmentSize(
      rowToSliceMapping.getView(),
      slices.getView(),
      rowToChunkMapping.getView(),
      segmentIdx );
}

template< typename Device,
@@ -339,37 +333,13 @@ Index
ChunkedEllpack< Device, Index, IndexAllocator, RowMajorOrder >::
getGlobalIndex( const Index segmentIdx, const Index localIdx ) const
{
   const IndexType& sliceIndex = rowToSliceMapping[ segmentIdx ];
   TNL_ASSERT_LE( sliceIndex, this->rows, "" );
   IndexType firstChunkOfSegment( 0 );
   if( segmentIdx != slices[ sliceIndex ].firstRow )
      firstChunkOfSegment = rowToChunkMapping[ segmentIdx - 1 ];
   
   const IndexType lastChunkOfSegment = rowToChunkMapping[ segmentIdx ];
   const IndexType segmentChunksCount = lastChunkOfSegment - firstChunkOfSegment;
   const IndexType sliceOffset = slices[ sliceIndex ].pointer;
   const IndexType chunkSize = slices[ sliceIndex ].chunkSize;
   TNL_ASSERT_LE( localIdx, segmentChunksCount * chunkSize, "" );

   if( RowMajorOrder )
      return sliceOffset + firstChunkOfSegment * chunkSize + localIdx;
   else
   {
      const IndexType inChunkOffset = localIdx % chunkSize;
      const IndexType chunkIdx = localIdx / chunkSize;
      return sliceOffset + inChunkOffset * segmentChunksCount + chunkIdx;
   }
}

template< typename Device,
          typename Index,
          typename IndexAllocator,
          bool RowMajorOrder >
__cuda_callable__
void
ChunkedEllpack< Device, Index, IndexAllocator, RowMajorOrder >::
getSegmentAndLocalIndex( const Index globalIdx, Index& segmentIdx, Index& localIdx ) const
{
      return details::ChunkedEllpack< IndexType, DeviceType, RowMajorOrder >::getGlobalIndex(
         rowToSliceMapping,
         slices,
         rowToChunkMapping,
         chunksInSlice,
         segmentIdx,
         localIdx );
}

template< typename Device,
@@ -381,16 +351,6 @@ auto
ChunkedEllpack< Device, Index, IndexAllocator, RowMajorOrder >::
getSegmentView( const IndexType segmentIdx ) const -> SegmentViewType
{
   /*const IndexType sliceIdx = segmentIdx / SliceSize;
   const IndexType segmentInSliceIdx = segmentIdx % SliceSize;
   const IndexType& sliceOffset = this->sliceOffsets[ sliceIdx ];
   const IndexType& segmentSize = this->sliceSegmentSizes[ sliceIdx ];

   if( RowMajorOrder )
      return SegmentViewType( sliceOffset + segmentInSliceIdx * segmentSize, segmentSize, 1 );
   else
      return SegmentViewType( sliceOffset + segmentInSliceIdx, segmentSize, SliceSize );
      */
}

template< typename Device,
@@ -402,38 +362,7 @@ void
ChunkedEllpack< Device, Index, IndexAllocator, RowMajorOrder >::
forSegments( IndexType first, IndexType last, Function& f, Args... args ) const
{
/*   const auto sliceSegmentSizes_view = this->sliceSegmentSizes.getConstView();
   const auto sliceOffsets_view = this->sliceOffsets.getConstView();
   if( RowMajorOrder )
   {
      auto l = [=] __cuda_callable__ ( const IndexType segmentIdx, Args... args ) mutable {
         const IndexType sliceIdx = segmentIdx / SliceSize;
         const IndexType segmentInSliceIdx = segmentIdx % SliceSize;
         const IndexType segmentSize = sliceSegmentSizes_view[ sliceIdx ];
         const IndexType begin = sliceOffsets_view[ sliceIdx ] + segmentInSliceIdx * segmentSize;
         const IndexType end = begin + segmentSize;
         IndexType localIdx( 0 );
         for( IndexType globalIdx = begin; globalIdx < end; globalIdx++  )
            if( ! f( segmentIdx, localIdx++, globalIdx, args... ) )
               break;
      };
      Algorithms::ParallelFor< Device >::exec( first, last, l, args... );
   }
   else
   {
      auto l = [=] __cuda_callable__ ( const IndexType segmentIdx, Args... args ) mutable {
         const IndexType sliceIdx = segmentIdx / SliceSize;
         const IndexType segmentInSliceIdx = segmentIdx % SliceSize;
         const IndexType segmentSize = sliceSegmentSizes_view[ sliceIdx ];
         const IndexType begin = sliceOffsets_view[ sliceIdx ] + segmentInSliceIdx;
         const IndexType end = sliceOffsets_view[ sliceIdx + 1 ];
         IndexType localIdx( 0 );
         for( IndexType globalIdx = begin; globalIdx < end; globalIdx += SliceSize )
            if( ! f( segmentIdx, localIdx++, globalIdx, args... ) )
               break;
      };
      Algorithms::ParallelFor< Device >::exec( first, last, l, args... );
   }*/
   this->getView().forSegments( first, last, f, args... );
}

template< typename Device,
@@ -457,43 +386,7 @@ void
ChunkedEllpack< Device, Index, IndexAllocator, RowMajorOrder >::
segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
{
/*   using RealType = decltype( fetch( IndexType(), IndexType(), IndexType(), std::declval< bool& >(), args... ) );
   const auto sliceSegmentSizes_view = this->sliceSegmentSizes.getConstView();
   const auto sliceOffsets_view = this->sliceOffsets.getConstView();
   if( RowMajorOrder )
   {
      auto l = [=] __cuda_callable__ ( const IndexType segmentIdx, Args... args ) mutable {
         const IndexType sliceIdx = segmentIdx / SliceSize;
         const IndexType segmentInSliceIdx = segmentIdx % SliceSize;
         const IndexType segmentSize = sliceSegmentSizes_view[ sliceIdx ];
         const IndexType begin = sliceOffsets_view[ sliceIdx ] + segmentInSliceIdx * segmentSize;
         const IndexType end = begin + segmentSize;
         RealType aux( zero );
         bool compute( true );
         IndexType localIdx( 0 );
         for( IndexType globalIdx = begin; globalIdx< end; globalIdx++  )
            reduction( aux, fetch( segmentIdx, localIdx++, globalIdx, compute, args... ) );
         keeper( segmentIdx, aux );
      };
      Algorithms::ParallelFor< Device >::exec( first, last, l, args... );
   }
   else
   {
      auto l = [=] __cuda_callable__ ( const IndexType segmentIdx, Args... args ) mutable {
         const IndexType sliceIdx = segmentIdx / SliceSize;
         const IndexType segmentInSliceIdx = segmentIdx % SliceSize;
         const IndexType segmentSize = sliceSegmentSizes_view[ sliceIdx ];
         const IndexType begin = sliceOffsets_view[ sliceIdx ] + segmentInSliceIdx;
         const IndexType end = sliceOffsets_view[ sliceIdx + 1 ];
         RealType aux( zero );
         bool compute( true );
         IndexType localIdx( 0 );
         for( IndexType globalIdx = begin; globalIdx < end; globalIdx += SliceSize  )
            reduction( aux, fetch( segmentIdx, localIdx++, globalIdx, compute, args... ) );
         keeper( segmentIdx, aux );
      };
      Algorithms::ParallelFor< Device >::exec( first, last, l, args... );
   }*/
   this->getView().segmentsReduction( first, last, fetch, reduction, keeper, zero, args... );
}

template< typename Device,
+2 −35
Original line number Diff line number Diff line
@@ -14,42 +14,12 @@

#include <TNL/Containers/Vector.h>
#include <TNL/Containers/Segments/ChunkedEllpackSegmentView.h>
#include <TNL/Containers/Segments/details/ChunkedEllpack.h>

namespace TNL {
   namespace Containers {
      namespace Segments {

/***
 * In the ChunkedEllpack format, the segments are split into slices. This is done
 * in ChunkedEllpack::resolveSliceSizes. All segment elements in each slice
 * are split into chunks. All chunks in one slice have the same size, but the
 * chunk size can differ from slice to slice.
 */
template< typename Index >
struct ChunkedEllpackSliceInfo
{
   /**
    * The size of the slice, i.e. the number of matrix rows covered by
    * the slice.
    */
   Index size;

   /**
    * The chunk size, i.e. the maximal number of non-zero elements that can be
    * stored in one chunk.
    */
   Index chunkSize;

   /**
    * Index of the first segment covered by this slice.
    */
   Index firstSegment;

   /**
    * Position of the first element of this slice.
    */
   Index pointer;
};

template< typename Device,
          typename Index,
@@ -67,7 +37,7 @@ class ChunkedEllpackView
      using ViewTemplate = ChunkedEllpackView< Device_, Index_ >;
      using ConstViewType = ChunkedEllpackView< Device, std::add_const_t< Index > >;
      using SegmentViewType = ChunkedEllpackSegmentView< IndexType >;
      using ChunkedEllpackSliceInfoType = ChunkedEllpackSliceInfo< IndexType >;
      using ChunkedEllpackSliceInfoType = details::ChunkedEllpackSliceInfo< IndexType >;
      using ChunkedEllpackSliceInfoAllocator = typename Allocators::Default< Device >::template Allocator< ChunkedEllpackSliceInfoType >;
      using ChunkedEllpackSliceInfoContainer = Containers::Array< ChunkedEllpackSliceInfoType, DeviceType, IndexType, ChunkedEllpackSliceInfoAllocator >;
      using ChunkedEllpackSliceInfoContainerView = typename ChunkedEllpackSliceInfoContainer::ViewType;
@@ -140,9 +110,6 @@ class ChunkedEllpackView
      __cuda_callable__
      IndexType getGlobalIndex( const Index segmentIdx, const Index localIdx ) const;

      __cuda_callable__
      void getSegmentAndLocalIndex( const Index globalIdx, Index& segmentIdx, Index& localIdx ) const;

      __cuda_callable__
      SegmentViewType getSegmentView( const IndexType segmentIdx ) const;

+73 −64
Original line number Diff line number Diff line
@@ -179,16 +179,28 @@ Index
ChunkedEllpackView< Device, Index, RowMajorOrder >::
getSegmentSize( const IndexType segmentIdx ) const
{
   const IndexType& sliceIndex = rowToSliceMapping[ segmentIdx ];
   TNL_ASSERT_LE( sliceIndex, this->getSegmentsCount(), "" );
   IndexType firstChunkOfSegment( 0 );
   if( segmentIdx != slices[ sliceIndex ].firstSegment )
      firstChunkOfSegment = rowToChunkMapping[ segmentIdx - 1 ];

   const IndexType lastChunkOfSegment = rowToChunkMapping[ segmentIdx ];
   const IndexType segmentChunksCount = lastChunkOfSegment - firstChunkOfSegment;
   const IndexType chunkSize = slices[ sliceIndex ].chunkSize;
   return chunkSize * segmentChunksCount;
   if( std::is_same< DeviceType, Devices::Host >::value )
      return details::ChunkedEllpack< IndexType, DeviceType, RowMajorOrder >::getSegmentSizeDirect(
         rowToSliceMapping,
         slices,
         rowToChunkMapping,
         segmentIdx );
   if( std::is_same< DeviceType, Devices::Cuda >::value )
   {
#ifdef __CUDA_ARCH__
      return details::ChunkedEllpack< IndexType, DeviceType, RowMajorOrder >::getSegmentSizeDirect(
         rowToSliceMapping,
         slices,
         rowToChunkMapping,
         segmentIdx );
#else
      return details::ChunkedEllpack< IndexType, DeviceType, RowMajorOrder >::getSegmentSize(
         rowToSliceMapping,
         slices,
         rowToChunkMapping,
         segmentIdx );
#endif
   }
}

template< typename Device,
@@ -221,38 +233,36 @@ Index
ChunkedEllpackView< Device, Index, RowMajorOrder >::
getGlobalIndex( const Index segmentIdx, const Index localIdx ) const
{
   const IndexType& sliceIndex = rowToSliceMapping[ segmentIdx ];
   TNL_ASSERT_LE( sliceIndex, this->size, "" );
   IndexType firstChunkOfSegment( 0 );
   if( segmentIdx != slices[ sliceIndex ].firstSegment )
      firstChunkOfSegment = rowToChunkMapping[ segmentIdx - 1 ];
   
   const IndexType lastChunkOfSegment = rowToChunkMapping[ segmentIdx ];
   const IndexType segmentChunksCount = lastChunkOfSegment - firstChunkOfSegment;
   const IndexType sliceOffset = slices[ sliceIndex ].pointer;
   const IndexType chunkSize = slices[ sliceIndex ].chunkSize;
   TNL_ASSERT_LE( localIdx, segmentChunksCount * chunkSize, "" );

   if( RowMajorOrder )
      return sliceOffset + firstChunkOfSegment * chunkSize + localIdx;
   else
   {
      const IndexType inChunkOffset = localIdx % chunkSize;
      const IndexType chunkIdx = localIdx / chunkSize;
      return sliceOffset + inChunkOffset * chunksInSlice + firstChunkOfSegment + chunkIdx;
   if( std::is_same< DeviceType, Devices::Host >::value )
      return details::ChunkedEllpack< IndexType, DeviceType, RowMajorOrder >::getGlobalIndexDirect(
         rowToSliceMapping,
         slices,
         rowToChunkMapping,
         chunksInSlice,
         segmentIdx,
         localIdx );
   if( std::is_same< DeviceType, Devices::Cuda >::value )
   {
#ifdef __CUDA_ARCH__
      return details::ChunkedEllpack< IndexType, DeviceType, RowMajorOrder >::getGlobalIndexDirect(
         rowToSliceMapping,
         slices,
         rowToChunkMapping,
         chunksInSlice,
         segmentIdx,
         localIdx );
#else
      return details::ChunkedEllpack< IndexType, DeviceType, RowMajorOrder >::getGlobalIndex(
         rowToSliceMapping,
         slices,
         rowToChunkMapping,
         chunksInSlice,
         segmentIdx,
         localIdx );
#endif
   }
}

/**
 * Empty body: nothing is computed from globalIdx and neither segmentIdx nor
 * localIdx is written, so both reference arguments keep the values the caller
 * passed in.
 */
template< typename Device,
          typename Index,
          bool RowMajorOrder >
__cuda_callable__
void
ChunkedEllpackView< Device, Index, RowMajorOrder >::
getSegmentAndLocalIndex( const Index globalIdx, Index& segmentIdx, Index& localIdx ) const
{
   // No mapping from globalIdx to ( segmentIdx, localIdx ) is implemented here.
}

template< typename Device,
          typename Index,
          bool RowMajorOrder >
@@ -261,32 +271,31 @@ auto
ChunkedEllpackView< Device, Index, RowMajorOrder >::
getSegmentView( const IndexType segmentIdx ) const -> SegmentViewType
{
   const IndexType& sliceIndex = rowToSliceMapping[ segmentIdx ];
   TNL_ASSERT_LE( sliceIndex, this->size, "" );
   IndexType firstChunkOfSegment( 0 );
   if( segmentIdx != slices[ sliceIndex ].firstSegment )
      firstChunkOfSegment = rowToChunkMapping[ segmentIdx - 1 ];

   const IndexType lastChunkOfSegment = rowToChunkMapping[ segmentIdx ];
   const IndexType segmentChunksCount = lastChunkOfSegment - firstChunkOfSegment;
   const IndexType sliceOffset = slices[ sliceIndex ].pointer;
   const IndexType chunkSize = slices[ sliceIndex ].chunkSize;
   const IndexType segmentSize = segmentChunksCount * chunkSize;

   if( RowMajorOrder )
      return SegmentViewType( sliceOffset + firstChunkOfSegment * chunkSize,
                              segmentSize,
                              chunkSize,
                              chunksInSlice );
   else // TODO FIX !!!!!!!!!!!!!!
      return SegmentViewType( sliceOffset + firstChunkOfSegment,
                              segmentSize,
                              chunkSize,
                              chunksInSlice );
   
   
   
   
   if( std::is_same< DeviceType, Devices::Host >::value )
      return details::ChunkedEllpack< IndexType, DeviceType, RowMajorOrder >::getSegmentViewDirect(
         rowToSliceMapping,
         slices,
         rowToChunkMapping,
         chunksInSlice,
         segmentIdx );
   if( std::is_same< DeviceType, Devices::Cuda >::value )
   {
#ifdef __CUDA_ARCH__
      return details::ChunkedEllpack< IndexType, DeviceType, RowMajorOrder >::getSegmentViewDirect(
         rowToSliceMapping,
         slices,
         rowToChunkMapping,
         chunksInSlice,
         segmentIdx );
#else
      return details::ChunkedEllpack< IndexType, DeviceType, RowMajorOrder >::getSegmentView(
         rowToSliceMapping,
         slices,
         rowToChunkMapping,
         chunksInSlice,
         segmentIdx );
#endif
   }
}

template< typename Device,
+230 −0

File added.

Preview size limit exceeded, changes collapsed.