Loading src/TNL/Containers/Segments/ChunkedEllpack.h +2 −0 Original line number Diff line number Diff line Loading @@ -141,6 +141,8 @@ class ChunkedEllpack */ OffsetsHolder rowToChunkMapping; OffsetsHolder chunksToSegmentsMapping; /** * Keeps index of the first segment index. */ Loading src/TNL/Containers/Segments/ChunkedEllpack.hpp +46 −14 Original line number Diff line number Diff line Loading @@ -42,6 +42,7 @@ ChunkedEllpack( const ChunkedEllpack& chunkedEllpack ) desiredChunkSize( chunkedEllpack.desiredChunkSize ), rowToChunkMapping( chunkedEllpack.rowToChunkMapping ), rowToSliceMapping( chunkedEllpack.rowTopSliceMapping ), chunksToSegmentsMapping( chunkedEllpack. chunksToSegmentsMapping ), rowPointers( chunkedEllpack.rowPointers ), slices( chunkedEllpack.slices ), numberOfSlices( chunkedEllpack.numberOfSlices ) Loading @@ -60,6 +61,7 @@ ChunkedEllpack( const ChunkedEllpack&& chunkedEllpack ) desiredChunkSize( chunkedEllpack.desiredChunkSize ), rowToChunkMapping( chunkedEllpack.rowToChunkMapping ), rowToSliceMapping( chunkedEllpack.rowTopSliceMapping ), chunksToSegmentsMapping( chunkedEllpack. chunksToSegmentsMapping ), rowPointers( chunkedEllpack.rowPointers ), slices( chunkedEllpack.slices ), numberOfSlices( chunkedEllpack.numberOfSlices ) Loading Loading @@ -99,6 +101,7 @@ getView() return ViewType( size, storageSize, chunksInSlice, desiredChunkSize, rowToChunkMapping.getView(), rowToSliceMapping.getView(), chunksToSegmentsMapping.getView(), rowPointers.getView(), slices.getView(), numberOfSlices ); Loading @@ -115,6 +118,7 @@ getConstView() const return ConstViewType( size, storageSize, chunksInSlice, desiredChunkSize, rowToChunkMapping.getConstView(), rowToSliceMapping.getConstView(), chunksToSegmentsMapping.getConstView(), rowPointers.getConstView(), slices.getConstView(), numberOfSlices ); Loading Loading @@ -258,6 +262,8 @@ template< typename Device, void ChunkedEllpack< Device, Index, IndexAllocator, RowMajorOrder >:: setSegmentsSizes( const SizesHolder& segmentsSizes ) { if( std::is_same< DeviceType, Devices::Host >::value ) { this->size = segmentsSizes.getSize(); this->slices.setSize( this->size ); Loading @@ -271,6 +277,29 @@ setSegmentsSizes( const SizesHolder& segmentsSizes ) for( IndexType sliceIndex = 0; sliceIndex < numberOfSlices; sliceIndex++ ) this->setSlice( segmentsSizes, sliceIndex, storageSize ); this->rowPointers.scan(); IndexType chunksCount = this->numberOfSlices * this->chunksInSlice; this->chunksToSegmentsMapping.setSize( chunksCount ); IndexType chunkIdx( 0 ); for( IndexType segmentIdx = 0; segmentIdx < this->size; segmentIdx++ ) { const IndexType& sliceIdx = rowToSliceMapping[ segmentIdx ]; IndexType firstChunkOfSegment( 0 ); if( segmentIdx != slices[ sliceIdx ].firstSegment ) firstChunkOfSegment = rowToChunkMapping[ segmentIdx - 1 ]; const IndexType lastChunkOfSegment = rowToChunkMapping[ segmentIdx ]; const IndexType segmentChunksCount = lastChunkOfSegment - firstChunkOfSegment; for( IndexType i = 0; i < segmentChunksCount; i++ ) this->chunksToSegmentsMapping[ chunkIdx++ ] = segmentIdx; } } else { ChunkedEllpack< Devices::Host, Index, typename Allocators::Default< Devices::Host >::template Allocator< Index >, RowMajorOrder > hostSegments; Containers::Vector< IndexType, Devices::Host, IndexType > hostSegmentsSizes( segmentsSizes ); hostSegments.setSegmentsSizes( hostSegmentsSizes ); *this = hostSegments; } } template< typename Device, Loading Loading @@ -362,7 +391,7 @@ void ChunkedEllpack< Device, Index, IndexAllocator, RowMajorOrder >:: forSegments( IndexType first, IndexType last, Function& f, Args... args ) const { this->getView().forSegments( first, last, f, args... ); this->getConstView().forSegments( first, last, f, args... ); } template< typename Device, Loading @@ -386,7 +415,7 @@ void ChunkedEllpack< Device, Index, IndexAllocator, RowMajorOrder >:: segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const { this->getView().segmentsReduction( first, last, fetch, reduction, keeper, zero, args... ); this->getConstView().segmentsReduction( first, last, fetch, reduction, keeper, zero, args... ); } template< typename Device, Loading Loading @@ -417,6 +446,7 @@ operator=( const ChunkedEllpack< Device_, Index_, IndexAllocator_, RowMajorOrder this->rowToChunkMapping = source.rowToChunkMapping; this->rowToSliceMapping = source.rowToSliceMapping; this->rowPointers = source.rowPointers; this->chunksToSegmentMapping = source.chunksToSegmentsMapping; this->slices = source.slices; this->numberOfSlices = source.numberOfSlices; return *this; Loading @@ -437,6 +467,7 @@ save( File& file ) const file << this->rowToChunkMapping << this->rowToSliceMapping << this->rowPointers << this->chunksToSegmentsMapping << this->slices; file.save( this->numberOfSlices ); } Loading @@ -455,6 +486,7 @@ load( File& file ) file.load( &this->desiredChunkSize ); file >> this->rowToChunkMapping >> this->rowToSliceMapping >> this->chunksToSegmentsMapping >> this->rowPointers >> this->slices; file.load( &this->numberOfSlices ); Loading src/TNL/Containers/Segments/ChunkedEllpackView.h +18 −9 Original line number Diff line number Diff line Loading @@ -52,6 +52,7 @@ class ChunkedEllpackView const IndexType desiredChunkSize, const OffsetsView& rowToChunkMapping, const OffsetsView& rowToSliceMapping, const OffsetsView& chunksToSegmentsMapping, const OffsetsView& rowPointers, const ChunkedEllpackSliceInfoContainerView& slices, const IndexType numberOfSlices ); Loading @@ -63,6 +64,7 @@ class ChunkedEllpackView const IndexType desiredChunkSize, const OffsetsView&& rowToChunkMapping, const OffsetsView&& rowToSliceMapping, const OffsetsView&& chunksToSegmentsMapping, const OffsetsView&& rowPointers, const ChunkedEllpackSliceInfoContainerView&& slices, const IndexType numberOfSlices ); Loading Loading @@ -145,15 +147,20 @@ class ChunkedEllpackView protected: /*IndexType size; IndexType chunksInSlice, desiredChunkSize; Containers::VectorView< Index, Device, Index > rowToChunkMapping, rowToSliceMapping, rowPointers; Containers::ArrayView< ChunkedEllpackSliceInfoType, Device, Index > slices; IndexType numberOfSlices;*/ template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args > //__device__ void segmentsReductionKernel( IndexType gridIdx, IndexType first, IndexType last, Fetch fetch, Reduction reduction, ResultKeeper keeper, Real zero, Args... args ) const; IndexType size = 0, storageSize = 0; Loading @@ -170,6 +177,8 @@ class ChunkedEllpackView */ OffsetsView rowToChunkMapping; OffsetsView chunksToSegmentsMapping; /** * Keeps index of the first segment index. */ Loading src/TNL/Containers/Segments/ChunkedEllpackView.hpp +144 −11 Original line number Diff line number Diff line Loading @@ -19,6 +19,29 @@ namespace TNL { namespace Containers { namespace Segments { #ifdef HAVE_CUDA template< typename View, typename Index, typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args > __global__ void ChunkedEllpackSegmentsReductionKernel( View chunkedEllpack, Index gridIdx, Index first, Index last, Fetch fetch, Reduction reduction, ResultKeeper keeper, Real zero, Args... args ) { view.segmentsReductionKernel( gridIdx, first, last, fetch, reduction, keeper, zero, args... ); } #endif template< typename Device, typename Index, Loading @@ -31,6 +54,7 @@ ChunkedEllpackView( const IndexType size, const IndexType desiredChunkSize, const OffsetsView& rowToChunkMapping, const OffsetsView& rowToSliceMapping, const OffsetsView& chunksToSegmentsMapping, const OffsetsView& rowPointers, const ChunkedEllpackSliceInfoContainerView& slices, const IndexType numberOfSlices ) Loading @@ -40,6 +64,7 @@ ChunkedEllpackView( const IndexType size, desiredChunkSize( desiredChunkSize ), rowToChunkMapping( rowToChunkMapping ), rowToSliceMapping( rowToSliceMapping ), chunksToSegmentsMapping( chunksToSegmentsMapping ), rowPointers( rowPointers ), slices( slices ), numberOfSlices( numberOfSlices ) Loading @@ -57,6 +82,7 @@ ChunkedEllpackView( const IndexType size, const IndexType desiredChunkSize, const OffsetsView&& rowToChunkMapping, const OffsetsView&& rowToSliceMapping, const OffsetsView&& chunksToSegmentsMapping, const OffsetsView&& rowPointers, const ChunkedEllpackSliceInfoContainerView&& slices, const IndexType numberOfSlices ) Loading @@ -66,6 +92,7 @@ ChunkedEllpackView( const IndexType size, desiredChunkSize( desiredChunkSize ), rowToChunkMapping( rowToChunkMapping ), rowToSliceMapping( rowToSliceMapping ), chunksToSegmentsMapping( chunksToSegmentsMapping ), rowPointers( rowPointers ), slices( slices ), numberOfSlices( numberOfSlices ) Loading @@ -84,6 +111,7 @@ ChunkedEllpackView( const ChunkedEllpackView& chunked_ellpack_view ) desiredChunkSize( chunked_ellpack_view.desiredChunkSize ), rowToChunkMapping( chunked_ellpack_view.rowToChunkMapping ), rowToSliceMapping( chunked_ellpack_view.rowToSliceMapping ), chunksToSegmentsMapping( chunked_ellpack_view.chunksToSegmentsMapping ), rowPointers( chunked_ellpack_view.rowPointers ), slices( chunked_ellpack_view.slices ), numberOfSlices( chunked_ellpack_view.numberOfSlices ) Loading @@ -102,6 +130,7 @@ ChunkedEllpackView( const ChunkedEllpackView&& chunked_ellpack_view ) desiredChunkSize( chunked_ellpack_view.desiredChunkSize ), rowToChunkMapping( std::move( chunked_ellpack_view.rowToChunkMapping ) ), rowToSliceMapping( std::move( chunked_ellpack_view.rowToSliceMapping ) ), chunksToSegmentsMapping( std::move( chunked_ellpack_view.chunksToSegmentsMapping ) ), rowPointers( std::move( chunked_ellpack_view.rowPointers ) ), slices( std::move( chunked_ellpack_view.slices ) ), numberOfSlices( chunked_ellpack_view.numberOfSlices ) Loading Loading @@ -139,6 +168,7 @@ getView() return ViewType( size, chunksInSlice, desiredChunkSize, rowToChunkMapping.getView(), rowToSliceMapping.getView(), chunksToSegmentsMapping.getView(), rowPointers.getView(), slices.getView(), numberOfSlices ); Loading @@ -155,6 +185,7 @@ getConstView() const return ConstViewType( size, chunksInSlice, desiredChunkSize, rowToChunkMapping.getConstView(), rowToSliceMapping.getConstView(), chunksToSegmentsMapping.getConstView(), rowPointers.getConstView(), slices.getConstView(), numberOfSlices ); Loading Loading @@ -344,17 +375,6 @@ forSegments( IndexType first, IndexType last, Function& f, Args... args ) const } } } /*const auto offsetsView = this->offsets; auto l = [=] __cuda_callable__ ( const IndexType segmentIdx, Args... args ) mutable { const IndexType begin = offsetsView[ segmentIdx ]; const IndexType end = offsetsView[ segmentIdx + 1 ]; IndexType localIdx( 0 ); bool compute( true ); for( IndexType globalIdx = begin; globalIdx < end && compute; globalIdx++ ) f( segmentIdx, localIdx++, globalIdx, compute, args... ); }; Algorithms::ParallelFor< Device >::exec( first, last, l, args... );*/ } template< typename Device, Loading @@ -379,6 +399,9 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& red using RealType = decltype( fetch( IndexType(), IndexType(), IndexType(), std::declval< bool& >(), args... ) ); if( std::is_same< DeviceType, Devices::Host >::value ) { segmentsReductionKernel( 0, first, last, fetch, reduction, keeper, zero, args... ); return; for( IndexType segmentIdx = first; segmentIdx < last; segmentIdx++ ) { const IndexType& sliceIndex = rowToSliceMapping[ segmentIdx ]; Loading Loading @@ -417,6 +440,23 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& red keeper( segmentIdx, aux ); } } if( std::is_same< DeviceType, Devices::Cuda >::value ) { /*const IndexType chunksCount = this->numberOfSlices * this->chunksInSlice; const IndexType cudaBlocks = this->numberOfSlices; const IndexType cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() ); dim3 cudaBlockSize( this->chunksInSlice ), cudaGridSize; const IndexType sharedMemory = cudaBlockSize.x * sizeof( RealType ); for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) { if( gridIdx == cudaGrids - 1 ) cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize(); details::ChunkedEllpackSegmentsReductionKernel< ViewType, IndexType, Fetch, Reduction, ResultKeeper, Real, Args... > <<< cudaGridSize, cudaBlockSize, sharedMemory >>> ( *this, gridIdx, first, last, fetch, reduction, keeper, zero, args... ); }*/ } } template< typename Device, Loading @@ -442,6 +482,7 @@ operator=( const ChunkedEllpackView& view ) this->chunksInSlice = view.chunksInSlice; this->desiredChunkSize = view.desiredChunkSize; this->rowToChunkMapping.bind( view.rowToChunkMapping ); this->chunksToSegmentsMapping.bind( view.chunksToSegmentsMapping ); this->rowToSliceMapping.bind( view.rowToSliceMapping ); this->rowPointers.bind( view.rowPointers ); this->slices.bind( view.slices ); Loading @@ -461,6 +502,7 @@ save( File& file ) const file.save( &this->chunksInSlice ); file.save( &this->desiredChunkSize ); file << this->rowToChunkMapping << this->chunksToSegmentsMapping << this->rowToSliceMapping << this->rowPointers << this->slices; Loading @@ -479,6 +521,7 @@ load( File& file ) file.load( &this->chunksInSlice ); file.load( &this->desiredChunkSize ); file >> this->rowToChunkMapping >> this->chunksToSegmentsMapping >> this->rowToSliceMapping >> this->rowPointers >> this->slices; Loading Loading @@ -507,6 +550,96 @@ printStructure( std::ostream& str ) const << " chunk = " << this->rowToChunkMapping.getElement( i ) << std::endl; } template< typename Device, typename Index, bool RowMajorOrder > template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args > //__cuda_callable__ void ChunkedEllpackView< Device, Index, RowMajorOrder >:: segmentsReductionKernel( IndexType gridIdx, IndexType first, IndexType last, Fetch fetch, Reduction reduction, ResultKeeper keeper, Real zero, Args... args ) const { using RealType = decltype( fetch( IndexType(), IndexType(), IndexType(), std::declval< bool& >(), args... ) ); const IndexType firstSlice = rowToChunkMapping[ first ] / chunksInSlice; const IndexType lastSlice = rowToChunkMapping[ last - 1 ] / chunksInSlice; for( IndexType sliceIdx = firstSlice; sliceIdx < lastSlice; sliceIdx++ ) { //const IndexType sliceIdx = gridIdx * Cuda::getMaxGridSize() + blockIdx.x; //if( sliceIdx >= lastSlice ) // return; RealType chunksResults[ 256 ]; //Cuda::getSharedMemory< RealType >(); for( IndexType threadIdx = 0; threadIdx < 256; threadIdx++ ) { details::ChunkedEllpackSliceInfo< IndexType > sliceInfo; if( threadIdx == 0 ) sliceInfo = this->slices[ sliceIdx ]; chunksResults[ threadIdx ] = zero; //__syncthreads(); const IndexType sliceOffset = sliceInfo.pointer; const IndexType chunkSize = sliceInfo.chunkSize; const IndexType chunkIdx = sliceIdx * chunksInSlice + threadIdx; const IndexType segmentIdx = this->chunksToSegmentsMapping[ chunkIdx ]; IndexType firstChunkOfSegment( 0 ); if( segmentIdx != sliceInfo.firstSegment ) firstChunkOfSegment = rowToChunkMapping[ segmentIdx - 1 ]; IndexType localIdx = ( threadIdx - firstChunkOfSegment ) * chunkSize; bool compute( true ); if( RowMajorOrder ) { IndexType begin = sliceOffset + threadIdx * chunkSize; // threadIdx.x = chunkIdx within the slice IndexType end = begin + chunkSize; for( IndexType j = begin; j < end && compute; j++ ) reduction( chunksResults[ threadIdx ], fetch( segmentIdx, localIdx++, j, compute, args...) ); } else { const IndexType begin = sliceOffset + threadIdx; // threadIdx.x = chunkIdx within the slice const IndexType end = begin + chunksInSlice * chunkSize; for( IndexType j = begin; j < end && compute; j += chunksInSlice ) reduction( chunksResults[ threadIdx ], fetch( segmentIdx, localIdx++, j, compute, args...) ); } //__syncthreads(); } // TODO: finish details::ChunkedEllpackSliceInfo< IndexType > sliceInfo; for( IndexType threadIdx = 0; threadIdx < 256; threadIdx++ ) { if( threadIdx == 0 ) sliceInfo = this->slices[ sliceIdx ]; if( threadIdx < sliceInfo.size ) { const IndexType row = sliceInfo.firstSegment + threadIdx; IndexType chunkIndex( 0 ); if( threadIdx != 0 ) chunkIndex = this->rowToChunkMapping[ row - 1 ]; const IndexType lastChunk = this->rowToChunkMapping[ row ]; RealType result( zero ); while( chunkIndex < lastChunk ) reduction( result, chunksResults[ chunkIndex++ ] ); keeper( row, result ); } } // threadIdx } // sliceIdx } } // namespace Segments } // namespace Conatiners } // namespace TNL src/TNL/Containers/Segments/details/ChunkedEllpack.h +1 −2 Original line number Diff line number Diff line Loading @@ -29,7 +29,7 @@ template< typename Index > struct ChunkedEllpackSliceInfo { /** * The size of the slice, it means the number of the matrix rows covered by * The size of the slice, it means the number of the segments covered by * the slice. */ Index size; Loading Loading @@ -223,7 +223,6 @@ class ChunkedEllpack chunksInSlice ); } }; } //namespace details } //namespace Segments } //namespace Containers Loading Loading
src/TNL/Containers/Segments/ChunkedEllpack.h +2 −0 Original line number Diff line number Diff line Loading @@ -141,6 +141,8 @@ class ChunkedEllpack */ OffsetsHolder rowToChunkMapping; OffsetsHolder chunksToSegmentsMapping; /** * Keeps index of the first segment index. */ Loading
src/TNL/Containers/Segments/ChunkedEllpack.hpp +46 −14 Original line number Diff line number Diff line Loading @@ -42,6 +42,7 @@ ChunkedEllpack( const ChunkedEllpack& chunkedEllpack ) desiredChunkSize( chunkedEllpack.desiredChunkSize ), rowToChunkMapping( chunkedEllpack.rowToChunkMapping ), rowToSliceMapping( chunkedEllpack.rowTopSliceMapping ), chunksToSegmentsMapping( chunkedEllpack. chunksToSegmentsMapping ), rowPointers( chunkedEllpack.rowPointers ), slices( chunkedEllpack.slices ), numberOfSlices( chunkedEllpack.numberOfSlices ) Loading @@ -60,6 +61,7 @@ ChunkedEllpack( const ChunkedEllpack&& chunkedEllpack ) desiredChunkSize( chunkedEllpack.desiredChunkSize ), rowToChunkMapping( chunkedEllpack.rowToChunkMapping ), rowToSliceMapping( chunkedEllpack.rowTopSliceMapping ), chunksToSegmentsMapping( chunkedEllpack. chunksToSegmentsMapping ), rowPointers( chunkedEllpack.rowPointers ), slices( chunkedEllpack.slices ), numberOfSlices( chunkedEllpack.numberOfSlices ) Loading Loading @@ -99,6 +101,7 @@ getView() return ViewType( size, storageSize, chunksInSlice, desiredChunkSize, rowToChunkMapping.getView(), rowToSliceMapping.getView(), chunksToSegmentsMapping.getView(), rowPointers.getView(), slices.getView(), numberOfSlices ); Loading @@ -115,6 +118,7 @@ getConstView() const return ConstViewType( size, storageSize, chunksInSlice, desiredChunkSize, rowToChunkMapping.getConstView(), rowToSliceMapping.getConstView(), chunksToSegmentsMapping.getConstView(), rowPointers.getConstView(), slices.getConstView(), numberOfSlices ); Loading Loading @@ -258,6 +262,8 @@ template< typename Device, void ChunkedEllpack< Device, Index, IndexAllocator, RowMajorOrder >:: setSegmentsSizes( const SizesHolder& segmentsSizes ) { if( std::is_same< DeviceType, Devices::Host >::value ) { this->size = segmentsSizes.getSize(); this->slices.setSize( this->size ); Loading @@ -271,6 +277,29 @@ setSegmentsSizes( const SizesHolder& segmentsSizes ) for( IndexType sliceIndex = 0; sliceIndex < numberOfSlices; sliceIndex++ ) this->setSlice( segmentsSizes, sliceIndex, storageSize ); this->rowPointers.scan(); IndexType chunksCount = this->numberOfSlices * this->chunksInSlice; this->chunksToSegmentsMapping.setSize( chunksCount ); IndexType chunkIdx( 0 ); for( IndexType segmentIdx = 0; segmentIdx < this->size; segmentIdx++ ) { const IndexType& sliceIdx = rowToSliceMapping[ segmentIdx ]; IndexType firstChunkOfSegment( 0 ); if( segmentIdx != slices[ sliceIdx ].firstSegment ) firstChunkOfSegment = rowToChunkMapping[ segmentIdx - 1 ]; const IndexType lastChunkOfSegment = rowToChunkMapping[ segmentIdx ]; const IndexType segmentChunksCount = lastChunkOfSegment - firstChunkOfSegment; for( IndexType i = 0; i < segmentChunksCount; i++ ) this->chunksToSegmentsMapping[ chunkIdx++ ] = segmentIdx; } } else { ChunkedEllpack< Devices::Host, Index, typename Allocators::Default< Devices::Host >::template Allocator< Index >, RowMajorOrder > hostSegments; Containers::Vector< IndexType, Devices::Host, IndexType > hostSegmentsSizes( segmentsSizes ); hostSegments.setSegmentsSizes( hostSegmentsSizes ); *this = hostSegments; } } template< typename Device, Loading Loading @@ -362,7 +391,7 @@ void ChunkedEllpack< Device, Index, IndexAllocator, RowMajorOrder >:: forSegments( IndexType first, IndexType last, Function& f, Args... args ) const { this->getView().forSegments( first, last, f, args... ); this->getConstView().forSegments( first, last, f, args... ); } template< typename Device, Loading @@ -386,7 +415,7 @@ void ChunkedEllpack< Device, Index, IndexAllocator, RowMajorOrder >:: segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const { this->getView().segmentsReduction( first, last, fetch, reduction, keeper, zero, args... ); this->getConstView().segmentsReduction( first, last, fetch, reduction, keeper, zero, args... ); } template< typename Device, Loading Loading @@ -417,6 +446,7 @@ operator=( const ChunkedEllpack< Device_, Index_, IndexAllocator_, RowMajorOrder this->rowToChunkMapping = source.rowToChunkMapping; this->rowToSliceMapping = source.rowToSliceMapping; this->rowPointers = source.rowPointers; this->chunksToSegmentMapping = source.chunksToSegmentsMapping; this->slices = source.slices; this->numberOfSlices = source.numberOfSlices; return *this; Loading @@ -437,6 +467,7 @@ save( File& file ) const file << this->rowToChunkMapping << this->rowToSliceMapping << this->rowPointers << this->chunksToSegmentsMapping << this->slices; file.save( this->numberOfSlices ); } Loading @@ -455,6 +486,7 @@ load( File& file ) file.load( &this->desiredChunkSize ); file >> this->rowToChunkMapping >> this->rowToSliceMapping >> this->chunksToSegmentsMapping >> this->rowPointers >> this->slices; file.load( &this->numberOfSlices ); Loading
src/TNL/Containers/Segments/ChunkedEllpackView.h +18 −9 Original line number Diff line number Diff line Loading @@ -52,6 +52,7 @@ class ChunkedEllpackView const IndexType desiredChunkSize, const OffsetsView& rowToChunkMapping, const OffsetsView& rowToSliceMapping, const OffsetsView& chunksToSegmentsMapping, const OffsetsView& rowPointers, const ChunkedEllpackSliceInfoContainerView& slices, const IndexType numberOfSlices ); Loading @@ -63,6 +64,7 @@ class ChunkedEllpackView const IndexType desiredChunkSize, const OffsetsView&& rowToChunkMapping, const OffsetsView&& rowToSliceMapping, const OffsetsView&& chunksToSegmentsMapping, const OffsetsView&& rowPointers, const ChunkedEllpackSliceInfoContainerView&& slices, const IndexType numberOfSlices ); Loading Loading @@ -145,15 +147,20 @@ class ChunkedEllpackView protected: /*IndexType size; IndexType chunksInSlice, desiredChunkSize; Containers::VectorView< Index, Device, Index > rowToChunkMapping, rowToSliceMapping, rowPointers; Containers::ArrayView< ChunkedEllpackSliceInfoType, Device, Index > slices; IndexType numberOfSlices;*/ template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args > //__device__ void segmentsReductionKernel( IndexType gridIdx, IndexType first, IndexType last, Fetch fetch, Reduction reduction, ResultKeeper keeper, Real zero, Args... args ) const; IndexType size = 0, storageSize = 0; Loading @@ -170,6 +177,8 @@ class ChunkedEllpackView */ OffsetsView rowToChunkMapping; OffsetsView chunksToSegmentsMapping; /** * Keeps index of the first segment index. */ Loading
src/TNL/Containers/Segments/ChunkedEllpackView.hpp +144 −11 Original line number Diff line number Diff line Loading @@ -19,6 +19,29 @@ namespace TNL { namespace Containers { namespace Segments { #ifdef HAVE_CUDA template< typename View, typename Index, typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args > __global__ void ChunkedEllpackSegmentsReductionKernel( View chunkedEllpack, Index gridIdx, Index first, Index last, Fetch fetch, Reduction reduction, ResultKeeper keeper, Real zero, Args... args ) { view.segmentsReductionKernel( gridIdx, first, last, fetch, reduction, keeper, zero, args... ); } #endif template< typename Device, typename Index, Loading @@ -31,6 +54,7 @@ ChunkedEllpackView( const IndexType size, const IndexType desiredChunkSize, const OffsetsView& rowToChunkMapping, const OffsetsView& rowToSliceMapping, const OffsetsView& chunksToSegmentsMapping, const OffsetsView& rowPointers, const ChunkedEllpackSliceInfoContainerView& slices, const IndexType numberOfSlices ) Loading @@ -40,6 +64,7 @@ ChunkedEllpackView( const IndexType size, desiredChunkSize( desiredChunkSize ), rowToChunkMapping( rowToChunkMapping ), rowToSliceMapping( rowToSliceMapping ), chunksToSegmentsMapping( chunksToSegmentsMapping ), rowPointers( rowPointers ), slices( slices ), numberOfSlices( numberOfSlices ) Loading @@ -57,6 +82,7 @@ ChunkedEllpackView( const IndexType size, const IndexType desiredChunkSize, const OffsetsView&& rowToChunkMapping, const OffsetsView&& rowToSliceMapping, const OffsetsView&& chunksToSegmentsMapping, const OffsetsView&& rowPointers, const ChunkedEllpackSliceInfoContainerView&& slices, const IndexType numberOfSlices ) Loading @@ -66,6 +92,7 @@ ChunkedEllpackView( const IndexType size, desiredChunkSize( desiredChunkSize ), rowToChunkMapping( rowToChunkMapping ), rowToSliceMapping( rowToSliceMapping ), chunksToSegmentsMapping( chunksToSegmentsMapping ), rowPointers( rowPointers ), slices( slices ), numberOfSlices( numberOfSlices ) Loading @@ -84,6 +111,7 @@ ChunkedEllpackView( const ChunkedEllpackView& chunked_ellpack_view ) desiredChunkSize( chunked_ellpack_view.desiredChunkSize ), rowToChunkMapping( chunked_ellpack_view.rowToChunkMapping ), rowToSliceMapping( chunked_ellpack_view.rowToSliceMapping ), chunksToSegmentsMapping( chunked_ellpack_view.chunksToSegmentsMapping ), rowPointers( chunked_ellpack_view.rowPointers ), slices( chunked_ellpack_view.slices ), numberOfSlices( chunked_ellpack_view.numberOfSlices ) Loading @@ -102,6 +130,7 @@ ChunkedEllpackView( const ChunkedEllpackView&& chunked_ellpack_view ) desiredChunkSize( chunked_ellpack_view.desiredChunkSize ), rowToChunkMapping( std::move( chunked_ellpack_view.rowToChunkMapping ) ), rowToSliceMapping( std::move( chunked_ellpack_view.rowToSliceMapping ) ), chunksToSegmentsMapping( std::move( chunked_ellpack_view.chunksToSegmentsMapping ) ), rowPointers( std::move( chunked_ellpack_view.rowPointers ) ), slices( std::move( chunked_ellpack_view.slices ) ), numberOfSlices( chunked_ellpack_view.numberOfSlices ) Loading Loading @@ -139,6 +168,7 @@ getView() return ViewType( size, chunksInSlice, desiredChunkSize, rowToChunkMapping.getView(), rowToSliceMapping.getView(), chunksToSegmentsMapping.getView(), rowPointers.getView(), slices.getView(), numberOfSlices ); Loading @@ -155,6 +185,7 @@ getConstView() const return ConstViewType( size, chunksInSlice, desiredChunkSize, rowToChunkMapping.getConstView(), rowToSliceMapping.getConstView(), chunksToSegmentsMapping.getConstView(), rowPointers.getConstView(), slices.getConstView(), numberOfSlices ); Loading Loading @@ -344,17 +375,6 @@ forSegments( IndexType first, IndexType last, Function& f, Args... args ) const } } } /*const auto offsetsView = this->offsets; auto l = [=] __cuda_callable__ ( const IndexType segmentIdx, Args... args ) mutable { const IndexType begin = offsetsView[ segmentIdx ]; const IndexType end = offsetsView[ segmentIdx + 1 ]; IndexType localIdx( 0 ); bool compute( true ); for( IndexType globalIdx = begin; globalIdx < end && compute; globalIdx++ ) f( segmentIdx, localIdx++, globalIdx, compute, args... ); }; Algorithms::ParallelFor< Device >::exec( first, last, l, args... );*/ } template< typename Device, Loading @@ -379,6 +399,9 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& red using RealType = decltype( fetch( IndexType(), IndexType(), IndexType(), std::declval< bool& >(), args... ) ); if( std::is_same< DeviceType, Devices::Host >::value ) { segmentsReductionKernel( 0, first, last, fetch, reduction, keeper, zero, args... ); return; for( IndexType segmentIdx = first; segmentIdx < last; segmentIdx++ ) { const IndexType& sliceIndex = rowToSliceMapping[ segmentIdx ]; Loading Loading @@ -417,6 +440,23 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& red keeper( segmentIdx, aux ); } } if( std::is_same< DeviceType, Devices::Cuda >::value ) { /*const IndexType chunksCount = this->numberOfSlices * this->chunksInSlice; const IndexType cudaBlocks = this->numberOfSlices; const IndexType cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() ); dim3 cudaBlockSize( this->chunksInSlice ), cudaGridSize; const IndexType sharedMemory = cudaBlockSize.x * sizeof( RealType ); for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) { if( gridIdx == cudaGrids - 1 ) cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize(); details::ChunkedEllpackSegmentsReductionKernel< ViewType, IndexType, Fetch, Reduction, ResultKeeper, Real, Args... > <<< cudaGridSize, cudaBlockSize, sharedMemory >>> ( *this, gridIdx, first, last, fetch, reduction, keeper, zero, args... ); }*/ } } template< typename Device, Loading @@ -442,6 +482,7 @@ operator=( const ChunkedEllpackView& view ) this->chunksInSlice = view.chunksInSlice; this->desiredChunkSize = view.desiredChunkSize; this->rowToChunkMapping.bind( view.rowToChunkMapping ); this->chunksToSegmentsMapping.bind( view.chunksToSegmentsMapping ); this->rowToSliceMapping.bind( view.rowToSliceMapping ); this->rowPointers.bind( view.rowPointers ); this->slices.bind( view.slices ); Loading @@ -461,6 +502,7 @@ save( File& file ) const file.save( &this->chunksInSlice ); file.save( &this->desiredChunkSize ); file << this->rowToChunkMapping << this->chunksToSegmentsMapping << this->rowToSliceMapping << this->rowPointers << this->slices; Loading @@ -479,6 +521,7 @@ load( File& file ) file.load( &this->chunksInSlice ); file.load( &this->desiredChunkSize ); file >> this->rowToChunkMapping >> this->chunksToSegmentsMapping >> this->rowToSliceMapping >> this->rowPointers >> this->slices; Loading Loading @@ -507,6 +550,96 @@ printStructure( std::ostream& str ) const << " chunk = " << this->rowToChunkMapping.getElement( i ) << std::endl; } template< typename Device, typename Index, bool RowMajorOrder > template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args > //__cuda_callable__ void ChunkedEllpackView< Device, Index, RowMajorOrder >:: segmentsReductionKernel( IndexType gridIdx, IndexType first, IndexType last, Fetch fetch, Reduction reduction, ResultKeeper keeper, Real zero, Args... args ) const { using RealType = decltype( fetch( IndexType(), IndexType(), IndexType(), std::declval< bool& >(), args... ) ); const IndexType firstSlice = rowToChunkMapping[ first ] / chunksInSlice; const IndexType lastSlice = rowToChunkMapping[ last - 1 ] / chunksInSlice; for( IndexType sliceIdx = firstSlice; sliceIdx < lastSlice; sliceIdx++ ) { //const IndexType sliceIdx = gridIdx * Cuda::getMaxGridSize() + blockIdx.x; //if( sliceIdx >= lastSlice ) // return; RealType chunksResults[ 256 ]; //Cuda::getSharedMemory< RealType >(); for( IndexType threadIdx = 0; threadIdx < 256; threadIdx++ ) { details::ChunkedEllpackSliceInfo< IndexType > sliceInfo; if( threadIdx == 0 ) sliceInfo = this->slices[ sliceIdx ]; chunksResults[ threadIdx ] = zero; //__syncthreads(); const IndexType sliceOffset = sliceInfo.pointer; const IndexType chunkSize = sliceInfo.chunkSize; const IndexType chunkIdx = sliceIdx * chunksInSlice + threadIdx; const IndexType segmentIdx = this->chunksToSegmentsMapping[ chunkIdx ]; IndexType firstChunkOfSegment( 0 ); if( segmentIdx != sliceInfo.firstSegment ) firstChunkOfSegment = rowToChunkMapping[ segmentIdx - 1 ]; IndexType localIdx = ( threadIdx - firstChunkOfSegment ) * chunkSize; bool compute( true ); if( RowMajorOrder ) { IndexType begin = sliceOffset + threadIdx * chunkSize; // threadIdx.x = chunkIdx within the slice IndexType end = begin + chunkSize; for( IndexType j = begin; j < end && compute; j++ ) reduction( chunksResults[ threadIdx ], fetch( segmentIdx, localIdx++, j, compute, args...) ); } else { const IndexType begin = sliceOffset + threadIdx; // threadIdx.x = chunkIdx within the slice const IndexType end = begin + chunksInSlice * chunkSize; for( IndexType j = begin; j < end && compute; j += chunksInSlice ) reduction( chunksResults[ threadIdx ], fetch( segmentIdx, localIdx++, j, compute, args...) ); } //__syncthreads(); } // TODO: finish details::ChunkedEllpackSliceInfo< IndexType > sliceInfo; for( IndexType threadIdx = 0; threadIdx < 256; threadIdx++ ) { if( threadIdx == 0 ) sliceInfo = this->slices[ sliceIdx ]; if( threadIdx < sliceInfo.size ) { const IndexType row = sliceInfo.firstSegment + threadIdx; IndexType chunkIndex( 0 ); if( threadIdx != 0 ) chunkIndex = this->rowToChunkMapping[ row - 1 ]; const IndexType lastChunk = this->rowToChunkMapping[ row ]; RealType result( zero ); while( chunkIndex < lastChunk ) reduction( result, chunksResults[ chunkIndex++ ] ); keeper( row, result ); } } // threadIdx } // sliceIdx } } // namespace Segments } // namespace Conatiners } // namespace TNL
src/TNL/Containers/Segments/details/ChunkedEllpack.h +1 −2 Original line number Diff line number Diff line Loading @@ -29,7 +29,7 @@ template< typename Index > struct ChunkedEllpackSliceInfo { /** * The size of the slice, it means the number of the matrix rows covered by * The size of the slice, it means the number of the segments covered by * the slice. */ Index size; Loading Loading @@ -223,7 +223,6 @@ class ChunkedEllpack chunksInSlice ); } }; } //namespace details } //namespace Segments } //namespace Containers Loading