Loading src/TNL/Containers/Segments/ChunkedEllpack.h +1 −6 Original line number Diff line number Diff line Loading @@ -36,7 +36,7 @@ class ChunkedEllpack using ViewTemplate = ChunkedEllpackView< Device_, Index_, RowMajorOrder >; using ConstViewType = ChunkedEllpackView< Device, std::add_const_t< Index >, RowMajorOrder >; using SegmentViewType = SegmentView< IndexType, RowMajorOrder >; using ChunkedEllpackSliceInfoType = ChunkedEllpackSliceInfo< IndexType >; using ChunkedEllpackSliceInfoType = details::ChunkedEllpackSliceInfo< IndexType >; //TODO: using ChunkedEllpackSliceInfoAllocator = typename IndexAllocatorType::retype< ChunkedEllpackSliceInfoType >; using ChunkedEllpackSliceInfoAllocator = typename Allocators::Default< Device >::template Allocator< ChunkedEllpackSliceInfoType >; using ChunkedEllpackSliceInfoContainer = Containers::Array< ChunkedEllpackSliceInfoType, DeviceType, IndexType, ChunkedEllpackSliceInfoAllocator >; Loading Loading @@ -66,7 +66,6 @@ class ChunkedEllpack __cuda_callable__ IndexType getSegmentsCount() const; __cuda_callable__ IndexType getSegmentSize( const IndexType segmentIdx ) const; /** Loading @@ -75,16 +74,12 @@ class ChunkedEllpack __cuda_callable__ IndexType getSize() const; __cuda_callable__ IndexType getStorageSize() const; __cuda_callable__ IndexType getGlobalIndex( const Index segmentIdx, const Index localIdx ) const; __cuda_callable__ void getSegmentAndLocalIndex( const Index globalIdx, Index& segmentIdx, Index& localIdx ) const; __cuda_callable__ SegmentViewType getSegmentView( const IndexType segmentIdx ) const; Loading src/TNL/Containers/Segments/ChunkedEllpack.hpp +14 −121 Original line number Diff line number Diff line Loading @@ -289,21 +289,15 @@ template< typename Device, typename Index, typename IndexAllocator, bool RowMajorOrder > __cuda_callable__ Index ChunkedEllpack< Device, Index, IndexAllocator, RowMajorOrder >:: getSegmentSize( const IndexType segmentIdx ) const { const IndexType& sliceIndex = rowToSliceMapping[ segmentIdx ]; TNL_ASSERT_LE( sliceIndex, this->getSegmentsCount(), "" ); IndexType firstChunkOfSegment( 0 ); if( segmentIdx != slices[ sliceIndex ].firstRow ) firstChunkOfSegment = rowToChunkMapping[ segmentIdx - 1 ]; const IndexType lastChunkOfSegment = rowToChunkMapping[ segmentIdx ]; const IndexType segmentChunksCount = lastChunkOfSegment - firstChunkOfSegment; const IndexType chunkSize = slices[ sliceIndex ].chunkSize; return chunkSize * segmentChunksCount; return details::ChunkedEllpack< IndexType, DeviceType, RowMajorOrder >::getSegmentSize( rowToSliceMapping.getView(), slices.getView(), rowToChunkMapping.getView(), segmentIdx ); } template< typename Device, Loading Loading @@ -339,37 +333,13 @@ Index ChunkedEllpack< Device, Index, IndexAllocator, RowMajorOrder >:: getGlobalIndex( const Index segmentIdx, const Index localIdx ) const { const IndexType& sliceIndex = rowToSliceMapping[ segmentIdx ]; TNL_ASSERT_LE( sliceIndex, this->rows, "" ); IndexType firstChunkOfSegment( 0 ); if( segmentIdx != slices[ sliceIndex ].firstRow ) firstChunkOfSegment = rowToChunkMapping[ segmentIdx - 1 ]; const IndexType lastChunkOfSegment = rowToChunkMapping[ segmentIdx ]; const IndexType segmentChunksCount = lastChunkOfSegment - firstChunkOfSegment; const IndexType sliceOffset = slices[ sliceIndex ].pointer; const IndexType chunkSize = slices[ sliceIndex ].chunkSize; TNL_ASSERT_LE( localIdx, segmentChunksCount * chunkSize, "" ); if( RowMajorOrder ) return sliceOffset + firstChunkOfSegment * chunkSize + localIdx; else { const IndexType inChunkOffset = localIdx % chunkSize; const IndexType chunkIdx = localIdx / chunkSize; return sliceOffset + inChunkOffset * segmentChunksCount + chunkIdx; } } template< typename Device, typename Index, typename IndexAllocator, bool RowMajorOrder > __cuda_callable__ void ChunkedEllpack< Device, Index, IndexAllocator, RowMajorOrder >:: getSegmentAndLocalIndex( const Index globalIdx, Index& segmentIdx, Index& localIdx ) const { return details::ChunkedEllpack< IndexType, DeviceType, RowMajorOrder >::getGlobalIndex( rowToSliceMapping, slices, rowToChunkMapping, chunksInSlice, segmentIdx, localIdx ); } template< typename Device, Loading @@ -381,16 +351,6 @@ auto ChunkedEllpack< Device, Index, IndexAllocator, RowMajorOrder >:: getSegmentView( const IndexType segmentIdx ) const -> SegmentViewType { /*const IndexType sliceIdx = segmentIdx / SliceSize; const IndexType segmentInSliceIdx = segmentIdx % SliceSize; const IndexType& sliceOffset = this->sliceOffsets[ sliceIdx ]; const IndexType& segmentSize = this->sliceSegmentSizes[ sliceIdx ]; if( RowMajorOrder ) return SegmentViewType( sliceOffset + segmentInSliceIdx * segmentSize, segmentSize, 1 ); else return SegmentViewType( sliceOffset + segmentInSliceIdx, segmentSize, SliceSize ); */ } template< typename Device, Loading @@ -402,38 +362,7 @@ void ChunkedEllpack< Device, Index, IndexAllocator, RowMajorOrder >:: forSegments( IndexType first, IndexType last, Function& f, Args... args ) const { /* const auto sliceSegmentSizes_view = this->sliceSegmentSizes.getConstView(); const auto sliceOffsets_view = this->sliceOffsets.getConstView(); if( RowMajorOrder ) { auto l = [=] __cuda_callable__ ( const IndexType segmentIdx, Args... args ) mutable { const IndexType sliceIdx = segmentIdx / SliceSize; const IndexType segmentInSliceIdx = segmentIdx % SliceSize; const IndexType segmentSize = sliceSegmentSizes_view[ sliceIdx ]; const IndexType begin = sliceOffsets_view[ sliceIdx ] + segmentInSliceIdx * segmentSize; const IndexType end = begin + segmentSize; IndexType localIdx( 0 ); for( IndexType globalIdx = begin; globalIdx < end; globalIdx++ ) if( ! f( segmentIdx, localIdx++, globalIdx, args... ) ) break; }; Algorithms::ParallelFor< Device >::exec( first, last, l, args... ); } else { auto l = [=] __cuda_callable__ ( const IndexType segmentIdx, Args... args ) mutable { const IndexType sliceIdx = segmentIdx / SliceSize; const IndexType segmentInSliceIdx = segmentIdx % SliceSize; const IndexType segmentSize = sliceSegmentSizes_view[ sliceIdx ]; const IndexType begin = sliceOffsets_view[ sliceIdx ] + segmentInSliceIdx; const IndexType end = sliceOffsets_view[ sliceIdx + 1 ]; IndexType localIdx( 0 ); for( IndexType globalIdx = begin; globalIdx < end; globalIdx += SliceSize ) if( ! f( segmentIdx, localIdx++, globalIdx, args... ) ) break; }; Algorithms::ParallelFor< Device >::exec( first, last, l, args... ); }*/ this->getView().forSegments( first, last, f, args... ); } template< typename Device, Loading @@ -457,43 +386,7 @@ void ChunkedEllpack< Device, Index, IndexAllocator, RowMajorOrder >:: segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const { /* using RealType = decltype( fetch( IndexType(), IndexType(), IndexType(), std::declval< bool& >(), args... ) ); const auto sliceSegmentSizes_view = this->sliceSegmentSizes.getConstView(); const auto sliceOffsets_view = this->sliceOffsets.getConstView(); if( RowMajorOrder ) { auto l = [=] __cuda_callable__ ( const IndexType segmentIdx, Args... args ) mutable { const IndexType sliceIdx = segmentIdx / SliceSize; const IndexType segmentInSliceIdx = segmentIdx % SliceSize; const IndexType segmentSize = sliceSegmentSizes_view[ sliceIdx ]; const IndexType begin = sliceOffsets_view[ sliceIdx ] + segmentInSliceIdx * segmentSize; const IndexType end = begin + segmentSize; RealType aux( zero ); bool compute( true ); IndexType localIdx( 0 ); for( IndexType globalIdx = begin; globalIdx< end; globalIdx++ ) reduction( aux, fetch( segmentIdx, localIdx++, globalIdx, compute, args... ) ); keeper( segmentIdx, aux ); }; Algorithms::ParallelFor< Device >::exec( first, last, l, args... ); } else { auto l = [=] __cuda_callable__ ( const IndexType segmentIdx, Args... args ) mutable { const IndexType sliceIdx = segmentIdx / SliceSize; const IndexType segmentInSliceIdx = segmentIdx % SliceSize; const IndexType segmentSize = sliceSegmentSizes_view[ sliceIdx ]; const IndexType begin = sliceOffsets_view[ sliceIdx ] + segmentInSliceIdx; const IndexType end = sliceOffsets_view[ sliceIdx + 1 ]; RealType aux( zero ); bool compute( true ); IndexType localIdx( 0 ); for( IndexType globalIdx = begin; globalIdx < end; globalIdx += SliceSize ) reduction( aux, fetch( segmentIdx, localIdx++, globalIdx, compute, args... ) ); keeper( segmentIdx, aux ); }; Algorithms::ParallelFor< Device >::exec( first, last, l, args... ); }*/ this->getView().segmentsReduction( first, last, fetch, reduction, keeper, zero, args... ); } template< typename Device, Loading src/TNL/Containers/Segments/ChunkedEllpackView.h +2 −35 Original line number Diff line number Diff line Loading @@ -14,42 +14,12 @@ #include <TNL/Containers/Vector.h> #include <TNL/Containers/Segments/ChunkedEllpackSegmentView.h> #include <TNL/Containers/Segments/details/ChunkedEllpack.h> namespace TNL { namespace Containers { namespace Segments { /*** * In the ChunkedEllpack, the segments are split into slices. This is done * in ChunkedEllpack::resolveSliceSizes. All segments elements in each slice * are split into chunks. All chunks in one slice have the same size, but the size * of chunks can be different in each slice. */ template< typename Index > struct ChunkedEllpackSliceInfo { /** * The size of the slice, it means the number of the matrix rows covered by * the slice. */ Index size; /** * The chunk size, i.e. maximal number of non-zero elements that can be stored * in the chunk. */ Index chunkSize; /** * Index of the first segment covered be this slice. */ Index firstSegment; /** * Position of the first element of this slice. */ Index pointer; }; template< typename Device, typename Index, Loading @@ -67,7 +37,7 @@ class ChunkedEllpackView using ViewTemplate = ChunkedEllpackView< Device_, Index_ >; using ConstViewType = ChunkedEllpackView< Device, std::add_const_t< Index > >; using SegmentViewType = ChunkedEllpackSegmentView< IndexType >; using ChunkedEllpackSliceInfoType = ChunkedEllpackSliceInfo< IndexType >; using ChunkedEllpackSliceInfoType = details::ChunkedEllpackSliceInfo< IndexType >; using ChunkedEllpackSliceInfoAllocator = typename Allocators::Default< Device >::template Allocator< ChunkedEllpackSliceInfoType >; using ChunkedEllpackSliceInfoContainer = Containers::Array< ChunkedEllpackSliceInfoType, DeviceType, IndexType, ChunkedEllpackSliceInfoAllocator >; using ChunkedEllpackSliceInfoContainerView = typename ChunkedEllpackSliceInfoContainer::ViewType; Loading Loading @@ -140,9 +110,6 @@ class ChunkedEllpackView __cuda_callable__ IndexType getGlobalIndex( const Index segmentIdx, const Index localIdx ) const; __cuda_callable__ void getSegmentAndLocalIndex( const Index globalIdx, Index& segmentIdx, Index& localIdx ) const; __cuda_callable__ SegmentViewType getSegmentView( const IndexType segmentIdx ) const; Loading src/TNL/Containers/Segments/ChunkedEllpackView.hpp +73 −64 Original line number Diff line number Diff line Loading @@ -179,16 +179,28 @@ Index ChunkedEllpackView< Device, Index, RowMajorOrder >:: getSegmentSize( const IndexType segmentIdx ) const { const IndexType& sliceIndex = rowToSliceMapping[ segmentIdx ]; TNL_ASSERT_LE( sliceIndex, this->getSegmentsCount(), "" ); IndexType firstChunkOfSegment( 0 ); if( segmentIdx != slices[ sliceIndex ].firstSegment ) firstChunkOfSegment = rowToChunkMapping[ segmentIdx - 1 ]; const IndexType lastChunkOfSegment = rowToChunkMapping[ segmentIdx ]; const IndexType segmentChunksCount = lastChunkOfSegment - firstChunkOfSegment; const IndexType chunkSize = slices[ sliceIndex ].chunkSize; return chunkSize * segmentChunksCount; if( std::is_same< DeviceType, Devices::Host >::value ) return details::ChunkedEllpack< IndexType, DeviceType, RowMajorOrder >::getSegmentSizeDirect( rowToSliceMapping, slices, rowToChunkMapping, segmentIdx ); if( std::is_same< DeviceType, Devices::Cuda >::value ) { #ifdef __CUDA_ARCH__ return details::ChunkedEllpack< IndexType, DeviceType, RowMajorOrder >::getSegmentSizeDirect( rowToSliceMapping, slices, rowToChunkMapping, segmentIdx ); #else return details::ChunkedEllpack< IndexType, DeviceType, RowMajorOrder >::getSegmentSize( rowToSliceMapping, slices, rowToChunkMapping, segmentIdx ); #endif } } template< typename Device, Loading Loading @@ -221,38 +233,36 @@ Index ChunkedEllpackView< Device, Index, RowMajorOrder >:: getGlobalIndex( const Index segmentIdx, const Index localIdx ) const { const IndexType& sliceIndex = rowToSliceMapping[ segmentIdx ]; TNL_ASSERT_LE( sliceIndex, this->size, "" ); IndexType firstChunkOfSegment( 0 ); if( segmentIdx != slices[ sliceIndex ].firstSegment ) firstChunkOfSegment = rowToChunkMapping[ segmentIdx - 1 ]; const IndexType lastChunkOfSegment = rowToChunkMapping[ segmentIdx ]; const IndexType segmentChunksCount = lastChunkOfSegment - firstChunkOfSegment; const IndexType sliceOffset = slices[ sliceIndex ].pointer; const IndexType chunkSize = slices[ sliceIndex ].chunkSize; TNL_ASSERT_LE( localIdx, segmentChunksCount * chunkSize, "" ); if( RowMajorOrder ) return sliceOffset + firstChunkOfSegment * chunkSize + localIdx; else { const IndexType inChunkOffset = localIdx % chunkSize; const IndexType chunkIdx = localIdx / chunkSize; return sliceOffset + inChunkOffset * chunksInSlice + firstChunkOfSegment + chunkIdx; if( std::is_same< DeviceType, Devices::Host >::value ) return details::ChunkedEllpack< IndexType, DeviceType, RowMajorOrder >::getGlobalIndexDirect( rowToSliceMapping, slices, rowToChunkMapping, chunksInSlice, segmentIdx, localIdx ); if( std::is_same< DeviceType, Devices::Cuda >::value ) { #ifdef __CUDA_ARCH__ return details::ChunkedEllpack< IndexType, DeviceType, RowMajorOrder >::getGlobalIndexDirect( rowToSliceMapping, slices, rowToChunkMapping, chunksInSlice, segmentIdx, localIdx ); #else return details::ChunkedEllpack< IndexType, DeviceType, RowMajorOrder >::getGlobalIndex( rowToSliceMapping, slices, rowToChunkMapping, chunksInSlice, segmentIdx, localIdx ); #endif } } template< typename Device, typename Index, bool RowMajorOrder > __cuda_callable__ void ChunkedEllpackView< Device, Index, RowMajorOrder >:: getSegmentAndLocalIndex( const Index globalIdx, Index& segmentIdx, Index& localIdx ) const { } template< typename Device, typename Index, bool RowMajorOrder > Loading @@ -261,32 +271,31 @@ auto ChunkedEllpackView< Device, Index, RowMajorOrder >:: getSegmentView( const IndexType segmentIdx ) const -> SegmentViewType { const IndexType& sliceIndex = rowToSliceMapping[ segmentIdx ]; TNL_ASSERT_LE( sliceIndex, this->size, "" ); IndexType firstChunkOfSegment( 0 ); if( segmentIdx != slices[ sliceIndex ].firstSegment ) firstChunkOfSegment = rowToChunkMapping[ segmentIdx - 1 ]; const IndexType lastChunkOfSegment = rowToChunkMapping[ segmentIdx ]; const IndexType segmentChunksCount = lastChunkOfSegment - firstChunkOfSegment; const IndexType sliceOffset = slices[ sliceIndex ].pointer; const IndexType chunkSize = slices[ sliceIndex ].chunkSize; const IndexType segmentSize = segmentChunksCount * chunkSize; if( RowMajorOrder ) return SegmentViewType( sliceOffset + firstChunkOfSegment * chunkSize, segmentSize, chunkSize, chunksInSlice ); else // TODO FIX !!!!!!!!!!!!!! return SegmentViewType( sliceOffset + firstChunkOfSegment, segmentSize, chunkSize, chunksInSlice ); if( std::is_same< DeviceType, Devices::Host >::value ) return details::ChunkedEllpack< IndexType, DeviceType, RowMajorOrder >::getSegmentViewDirect( rowToSliceMapping, slices, rowToChunkMapping, chunksInSlice, segmentIdx ); if( std::is_same< DeviceType, Devices::Cuda >::value ) { #ifdef __CUDA_ARCH__ return details::ChunkedEllpack< IndexType, DeviceType, RowMajorOrder >::getSegmentViewDirect( rowToSliceMapping, slices, rowToChunkMapping, chunksInSlice, segmentIdx ); #else return details::ChunkedEllpack< IndexType, DeviceType, RowMajorOrder >::getSegmentView( rowToSliceMapping, slices, rowToChunkMapping, chunksInSlice, segmentIdx ); #endif } } template< typename Device, Loading src/TNL/Containers/Segments/details/ChunkedEllpack.h 0 → 100644 +230 −0 File added.Preview size limit exceeded, changes collapsed. Show changes Loading
src/TNL/Containers/Segments/ChunkedEllpack.h +1 −6 Original line number Diff line number Diff line Loading @@ -36,7 +36,7 @@ class ChunkedEllpack using ViewTemplate = ChunkedEllpackView< Device_, Index_, RowMajorOrder >; using ConstViewType = ChunkedEllpackView< Device, std::add_const_t< Index >, RowMajorOrder >; using SegmentViewType = SegmentView< IndexType, RowMajorOrder >; using ChunkedEllpackSliceInfoType = ChunkedEllpackSliceInfo< IndexType >; using ChunkedEllpackSliceInfoType = details::ChunkedEllpackSliceInfo< IndexType >; //TODO: using ChunkedEllpackSliceInfoAllocator = typename IndexAllocatorType::retype< ChunkedEllpackSliceInfoType >; using ChunkedEllpackSliceInfoAllocator = typename Allocators::Default< Device >::template Allocator< ChunkedEllpackSliceInfoType >; using ChunkedEllpackSliceInfoContainer = Containers::Array< ChunkedEllpackSliceInfoType, DeviceType, IndexType, ChunkedEllpackSliceInfoAllocator >; Loading Loading @@ -66,7 +66,6 @@ class ChunkedEllpack __cuda_callable__ IndexType getSegmentsCount() const; __cuda_callable__ IndexType getSegmentSize( const IndexType segmentIdx ) const; /** Loading @@ -75,16 +74,12 @@ class ChunkedEllpack __cuda_callable__ IndexType getSize() const; __cuda_callable__ IndexType getStorageSize() const; __cuda_callable__ IndexType getGlobalIndex( const Index segmentIdx, const Index localIdx ) const; __cuda_callable__ void getSegmentAndLocalIndex( const Index globalIdx, Index& segmentIdx, Index& localIdx ) const; __cuda_callable__ SegmentViewType getSegmentView( const IndexType segmentIdx ) const; Loading
src/TNL/Containers/Segments/ChunkedEllpack.hpp +14 −121 Original line number Diff line number Diff line Loading @@ -289,21 +289,15 @@ template< typename Device, typename Index, typename IndexAllocator, bool RowMajorOrder > __cuda_callable__ Index ChunkedEllpack< Device, Index, IndexAllocator, RowMajorOrder >:: getSegmentSize( const IndexType segmentIdx ) const { const IndexType& sliceIndex = rowToSliceMapping[ segmentIdx ]; TNL_ASSERT_LE( sliceIndex, this->getSegmentsCount(), "" ); IndexType firstChunkOfSegment( 0 ); if( segmentIdx != slices[ sliceIndex ].firstRow ) firstChunkOfSegment = rowToChunkMapping[ segmentIdx - 1 ]; const IndexType lastChunkOfSegment = rowToChunkMapping[ segmentIdx ]; const IndexType segmentChunksCount = lastChunkOfSegment - firstChunkOfSegment; const IndexType chunkSize = slices[ sliceIndex ].chunkSize; return chunkSize * segmentChunksCount; return details::ChunkedEllpack< IndexType, DeviceType, RowMajorOrder >::getSegmentSize( rowToSliceMapping.getView(), slices.getView(), rowToChunkMapping.getView(), segmentIdx ); } template< typename Device, Loading Loading @@ -339,37 +333,13 @@ Index ChunkedEllpack< Device, Index, IndexAllocator, RowMajorOrder >:: getGlobalIndex( const Index segmentIdx, const Index localIdx ) const { const IndexType& sliceIndex = rowToSliceMapping[ segmentIdx ]; TNL_ASSERT_LE( sliceIndex, this->rows, "" ); IndexType firstChunkOfSegment( 0 ); if( segmentIdx != slices[ sliceIndex ].firstRow ) firstChunkOfSegment = rowToChunkMapping[ segmentIdx - 1 ]; const IndexType lastChunkOfSegment = rowToChunkMapping[ segmentIdx ]; const IndexType segmentChunksCount = lastChunkOfSegment - firstChunkOfSegment; const IndexType sliceOffset = slices[ sliceIndex ].pointer; const IndexType chunkSize = slices[ sliceIndex ].chunkSize; TNL_ASSERT_LE( localIdx, segmentChunksCount * chunkSize, "" ); if( RowMajorOrder ) return sliceOffset + firstChunkOfSegment * chunkSize + localIdx; else { const IndexType inChunkOffset = localIdx % chunkSize; const IndexType chunkIdx = localIdx / chunkSize; return sliceOffset + inChunkOffset * segmentChunksCount + chunkIdx; } } template< typename Device, typename Index, typename IndexAllocator, bool RowMajorOrder > __cuda_callable__ void ChunkedEllpack< Device, Index, IndexAllocator, RowMajorOrder >:: getSegmentAndLocalIndex( const Index globalIdx, Index& segmentIdx, Index& localIdx ) const { return details::ChunkedEllpack< IndexType, DeviceType, RowMajorOrder >::getGlobalIndex( rowToSliceMapping, slices, rowToChunkMapping, chunksInSlice, segmentIdx, localIdx ); } template< typename Device, Loading @@ -381,16 +351,6 @@ auto ChunkedEllpack< Device, Index, IndexAllocator, RowMajorOrder >:: getSegmentView( const IndexType segmentIdx ) const -> SegmentViewType { /*const IndexType sliceIdx = segmentIdx / SliceSize; const IndexType segmentInSliceIdx = segmentIdx % SliceSize; const IndexType& sliceOffset = this->sliceOffsets[ sliceIdx ]; const IndexType& segmentSize = this->sliceSegmentSizes[ sliceIdx ]; if( RowMajorOrder ) return SegmentViewType( sliceOffset + segmentInSliceIdx * segmentSize, segmentSize, 1 ); else return SegmentViewType( sliceOffset + segmentInSliceIdx, segmentSize, SliceSize ); */ } template< typename Device, Loading @@ -402,38 +362,7 @@ void ChunkedEllpack< Device, Index, IndexAllocator, RowMajorOrder >:: forSegments( IndexType first, IndexType last, Function& f, Args... args ) const { /* const auto sliceSegmentSizes_view = this->sliceSegmentSizes.getConstView(); const auto sliceOffsets_view = this->sliceOffsets.getConstView(); if( RowMajorOrder ) { auto l = [=] __cuda_callable__ ( const IndexType segmentIdx, Args... args ) mutable { const IndexType sliceIdx = segmentIdx / SliceSize; const IndexType segmentInSliceIdx = segmentIdx % SliceSize; const IndexType segmentSize = sliceSegmentSizes_view[ sliceIdx ]; const IndexType begin = sliceOffsets_view[ sliceIdx ] + segmentInSliceIdx * segmentSize; const IndexType end = begin + segmentSize; IndexType localIdx( 0 ); for( IndexType globalIdx = begin; globalIdx < end; globalIdx++ ) if( ! f( segmentIdx, localIdx++, globalIdx, args... ) ) break; }; Algorithms::ParallelFor< Device >::exec( first, last, l, args... ); } else { auto l = [=] __cuda_callable__ ( const IndexType segmentIdx, Args... args ) mutable { const IndexType sliceIdx = segmentIdx / SliceSize; const IndexType segmentInSliceIdx = segmentIdx % SliceSize; const IndexType segmentSize = sliceSegmentSizes_view[ sliceIdx ]; const IndexType begin = sliceOffsets_view[ sliceIdx ] + segmentInSliceIdx; const IndexType end = sliceOffsets_view[ sliceIdx + 1 ]; IndexType localIdx( 0 ); for( IndexType globalIdx = begin; globalIdx < end; globalIdx += SliceSize ) if( ! f( segmentIdx, localIdx++, globalIdx, args... ) ) break; }; Algorithms::ParallelFor< Device >::exec( first, last, l, args... ); }*/ this->getView().forSegments( first, last, f, args... ); } template< typename Device, Loading @@ -457,43 +386,7 @@ void ChunkedEllpack< Device, Index, IndexAllocator, RowMajorOrder >:: segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const { /* using RealType = decltype( fetch( IndexType(), IndexType(), IndexType(), std::declval< bool& >(), args... ) ); const auto sliceSegmentSizes_view = this->sliceSegmentSizes.getConstView(); const auto sliceOffsets_view = this->sliceOffsets.getConstView(); if( RowMajorOrder ) { auto l = [=] __cuda_callable__ ( const IndexType segmentIdx, Args... args ) mutable { const IndexType sliceIdx = segmentIdx / SliceSize; const IndexType segmentInSliceIdx = segmentIdx % SliceSize; const IndexType segmentSize = sliceSegmentSizes_view[ sliceIdx ]; const IndexType begin = sliceOffsets_view[ sliceIdx ] + segmentInSliceIdx * segmentSize; const IndexType end = begin + segmentSize; RealType aux( zero ); bool compute( true ); IndexType localIdx( 0 ); for( IndexType globalIdx = begin; globalIdx< end; globalIdx++ ) reduction( aux, fetch( segmentIdx, localIdx++, globalIdx, compute, args... ) ); keeper( segmentIdx, aux ); }; Algorithms::ParallelFor< Device >::exec( first, last, l, args... ); } else { auto l = [=] __cuda_callable__ ( const IndexType segmentIdx, Args... args ) mutable { const IndexType sliceIdx = segmentIdx / SliceSize; const IndexType segmentInSliceIdx = segmentIdx % SliceSize; const IndexType segmentSize = sliceSegmentSizes_view[ sliceIdx ]; const IndexType begin = sliceOffsets_view[ sliceIdx ] + segmentInSliceIdx; const IndexType end = sliceOffsets_view[ sliceIdx + 1 ]; RealType aux( zero ); bool compute( true ); IndexType localIdx( 0 ); for( IndexType globalIdx = begin; globalIdx < end; globalIdx += SliceSize ) reduction( aux, fetch( segmentIdx, localIdx++, globalIdx, compute, args... ) ); keeper( segmentIdx, aux ); }; Algorithms::ParallelFor< Device >::exec( first, last, l, args... ); }*/ this->getView().segmentsReduction( first, last, fetch, reduction, keeper, zero, args... ); } template< typename Device, Loading
src/TNL/Containers/Segments/ChunkedEllpackView.h +2 −35 Original line number Diff line number Diff line Loading @@ -14,42 +14,12 @@ #include <TNL/Containers/Vector.h> #include <TNL/Containers/Segments/ChunkedEllpackSegmentView.h> #include <TNL/Containers/Segments/details/ChunkedEllpack.h> namespace TNL { namespace Containers { namespace Segments { /*** * In the ChunkedEllpack, the segments are split into slices. This is done * in ChunkedEllpack::resolveSliceSizes. All segments elements in each slice * are split into chunks. All chunks in one slice have the same size, but the size * of chunks can be different in each slice. */ template< typename Index > struct ChunkedEllpackSliceInfo { /** * The size of the slice, it means the number of the matrix rows covered by * the slice. */ Index size; /** * The chunk size, i.e. maximal number of non-zero elements that can be stored * in the chunk. */ Index chunkSize; /** * Index of the first segment covered be this slice. */ Index firstSegment; /** * Position of the first element of this slice. */ Index pointer; }; template< typename Device, typename Index, Loading @@ -67,7 +37,7 @@ class ChunkedEllpackView using ViewTemplate = ChunkedEllpackView< Device_, Index_ >; using ConstViewType = ChunkedEllpackView< Device, std::add_const_t< Index > >; using SegmentViewType = ChunkedEllpackSegmentView< IndexType >; using ChunkedEllpackSliceInfoType = ChunkedEllpackSliceInfo< IndexType >; using ChunkedEllpackSliceInfoType = details::ChunkedEllpackSliceInfo< IndexType >; using ChunkedEllpackSliceInfoAllocator = typename Allocators::Default< Device >::template Allocator< ChunkedEllpackSliceInfoType >; using ChunkedEllpackSliceInfoContainer = Containers::Array< ChunkedEllpackSliceInfoType, DeviceType, IndexType, ChunkedEllpackSliceInfoAllocator >; using ChunkedEllpackSliceInfoContainerView = typename ChunkedEllpackSliceInfoContainer::ViewType; Loading Loading @@ -140,9 +110,6 @@ class ChunkedEllpackView __cuda_callable__ IndexType getGlobalIndex( const Index segmentIdx, const Index localIdx ) const; __cuda_callable__ void getSegmentAndLocalIndex( const Index globalIdx, Index& segmentIdx, Index& localIdx ) const; __cuda_callable__ SegmentViewType getSegmentView( const IndexType segmentIdx ) const; Loading
src/TNL/Containers/Segments/ChunkedEllpackView.hpp +73 −64 Original line number Diff line number Diff line Loading @@ -179,16 +179,28 @@ Index ChunkedEllpackView< Device, Index, RowMajorOrder >:: getSegmentSize( const IndexType segmentIdx ) const { const IndexType& sliceIndex = rowToSliceMapping[ segmentIdx ]; TNL_ASSERT_LE( sliceIndex, this->getSegmentsCount(), "" ); IndexType firstChunkOfSegment( 0 ); if( segmentIdx != slices[ sliceIndex ].firstSegment ) firstChunkOfSegment = rowToChunkMapping[ segmentIdx - 1 ]; const IndexType lastChunkOfSegment = rowToChunkMapping[ segmentIdx ]; const IndexType segmentChunksCount = lastChunkOfSegment - firstChunkOfSegment; const IndexType chunkSize = slices[ sliceIndex ].chunkSize; return chunkSize * segmentChunksCount; if( std::is_same< DeviceType, Devices::Host >::value ) return details::ChunkedEllpack< IndexType, DeviceType, RowMajorOrder >::getSegmentSizeDirect( rowToSliceMapping, slices, rowToChunkMapping, segmentIdx ); if( std::is_same< DeviceType, Devices::Cuda >::value ) { #ifdef __CUDA_ARCH__ return details::ChunkedEllpack< IndexType, DeviceType, RowMajorOrder >::getSegmentSizeDirect( rowToSliceMapping, slices, rowToChunkMapping, segmentIdx ); #else return details::ChunkedEllpack< IndexType, DeviceType, RowMajorOrder >::getSegmentSize( rowToSliceMapping, slices, rowToChunkMapping, segmentIdx ); #endif } } template< typename Device, Loading Loading @@ -221,38 +233,36 @@ Index ChunkedEllpackView< Device, Index, RowMajorOrder >:: getGlobalIndex( const Index segmentIdx, const Index localIdx ) const { const IndexType& sliceIndex = rowToSliceMapping[ segmentIdx ]; TNL_ASSERT_LE( sliceIndex, this->size, "" ); IndexType firstChunkOfSegment( 0 ); if( segmentIdx != slices[ sliceIndex ].firstSegment ) firstChunkOfSegment = rowToChunkMapping[ segmentIdx - 1 ]; const IndexType lastChunkOfSegment = rowToChunkMapping[ segmentIdx ]; const IndexType segmentChunksCount = lastChunkOfSegment - firstChunkOfSegment; const IndexType sliceOffset = slices[ sliceIndex ].pointer; const IndexType chunkSize = slices[ sliceIndex ].chunkSize; TNL_ASSERT_LE( localIdx, segmentChunksCount * chunkSize, "" ); if( RowMajorOrder ) return sliceOffset + firstChunkOfSegment * chunkSize + localIdx; else { const IndexType inChunkOffset = localIdx % chunkSize; const IndexType chunkIdx = localIdx / chunkSize; return sliceOffset + inChunkOffset * chunksInSlice + firstChunkOfSegment + chunkIdx; if( std::is_same< DeviceType, Devices::Host >::value ) return details::ChunkedEllpack< IndexType, DeviceType, RowMajorOrder >::getGlobalIndexDirect( rowToSliceMapping, slices, rowToChunkMapping, chunksInSlice, segmentIdx, localIdx ); if( std::is_same< DeviceType, Devices::Cuda >::value ) { #ifdef __CUDA_ARCH__ return details::ChunkedEllpack< IndexType, DeviceType, RowMajorOrder >::getGlobalIndexDirect( rowToSliceMapping, slices, rowToChunkMapping, chunksInSlice, segmentIdx, localIdx ); #else return details::ChunkedEllpack< IndexType, DeviceType, RowMajorOrder >::getGlobalIndex( rowToSliceMapping, slices, rowToChunkMapping, chunksInSlice, segmentIdx, localIdx ); #endif } } template< typename Device, typename Index, bool RowMajorOrder > __cuda_callable__ void ChunkedEllpackView< Device, Index, RowMajorOrder >:: getSegmentAndLocalIndex( const Index globalIdx, Index& segmentIdx, Index& localIdx ) const { } template< typename Device, typename Index, bool RowMajorOrder > Loading @@ -261,32 +271,31 @@ auto ChunkedEllpackView< Device, Index, RowMajorOrder >:: getSegmentView( const IndexType segmentIdx ) const -> SegmentViewType { const IndexType& sliceIndex = rowToSliceMapping[ segmentIdx ]; TNL_ASSERT_LE( sliceIndex, this->size, "" ); IndexType firstChunkOfSegment( 0 ); if( segmentIdx != slices[ sliceIndex ].firstSegment ) firstChunkOfSegment = rowToChunkMapping[ segmentIdx - 1 ]; const IndexType lastChunkOfSegment = rowToChunkMapping[ segmentIdx ]; const IndexType segmentChunksCount = lastChunkOfSegment - firstChunkOfSegment; const IndexType sliceOffset = slices[ sliceIndex ].pointer; const IndexType chunkSize = slices[ sliceIndex ].chunkSize; const IndexType segmentSize = segmentChunksCount * chunkSize; if( RowMajorOrder ) return SegmentViewType( sliceOffset + firstChunkOfSegment * chunkSize, segmentSize, chunkSize, chunksInSlice ); else // TODO FIX !!!!!!!!!!!!!! return SegmentViewType( sliceOffset + firstChunkOfSegment, segmentSize, chunkSize, chunksInSlice ); if( std::is_same< DeviceType, Devices::Host >::value ) return details::ChunkedEllpack< IndexType, DeviceType, RowMajorOrder >::getSegmentViewDirect( rowToSliceMapping, slices, rowToChunkMapping, chunksInSlice, segmentIdx ); if( std::is_same< DeviceType, Devices::Cuda >::value ) { #ifdef __CUDA_ARCH__ return details::ChunkedEllpack< IndexType, DeviceType, RowMajorOrder >::getSegmentViewDirect( rowToSliceMapping, slices, rowToChunkMapping, chunksInSlice, segmentIdx ); #else return details::ChunkedEllpack< IndexType, DeviceType, RowMajorOrder >::getSegmentView( rowToSliceMapping, slices, rowToChunkMapping, chunksInSlice, segmentIdx ); #endif } } template< typename Device, Loading
src/TNL/Containers/Segments/details/ChunkedEllpack.h 0 → 100644 +230 −0 File added.Preview size limit exceeded, changes collapsed. Show changes