Loading src/TNL/Containers/Segments/CSR.hpp +4 −3 Original line number Diff line number Diff line Loading @@ -218,14 +218,15 @@ void CSR< Device, Index, IndexAllocator >:: segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const { using RealType = decltype( fetch( IndexType(), IndexType() ) ); using RealType = decltype( fetch( IndexType(), IndexType(), std::declval< bool& >(), args... ) ); const auto offsetsView = this->offsets.getConstView(); auto l = [=] __cuda_callable__ ( const IndexType i, Args... args ) mutable { const IndexType begin = offsetsView[ i ]; const IndexType end = offsetsView[ i + 1 ]; RealType aux( zero ); for( IndexType j = begin; j < end; j++ ) reduction( aux, fetch( i, j, args... ) ); bool compute( true ); for( IndexType j = begin; j < end && compute; j++ ) reduction( aux, fetch( i, j, compute, args... ) ); keeper( i, aux ); }; Algorithms::ParallelFor< Device >::exec( first, last, l, args... ); Loading src/TNL/Containers/Segments/CSRView.hpp +4 −3 Original line number Diff line number Diff line Loading @@ -204,14 +204,15 @@ void CSRView< Device, Index >:: segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const { using RealType = decltype( fetch( IndexType(), IndexType() ) ); using RealType = decltype( fetch( IndexType(), IndexType(), std::declval< bool& >(), args... ) ); const auto offsetsView = this->offsets.getConstView(); auto l = [=] __cuda_callable__ ( const IndexType i, Args... args ) mutable { const IndexType begin = offsetsView[ i ]; const IndexType end = offsetsView[ i + 1 ]; RealType aux( zero ); for( IndexType j = begin; j < end; j++ ) reduction( aux, fetch( i, j, args... ) ); bool compute( true ); for( IndexType j = begin; j < end && compute; j++ ) reduction( aux, fetch( i, j, compute, args... ) ); keeper( i, aux ); }; Algorithms::ParallelFor< Device >::exec( first, last, l, args... ); Loading src/TNL/Containers/Segments/Ellpack.hpp +7 −6 Original line number Diff line number Diff line Loading @@ -306,31 +306,32 @@ void Ellpack< Device, Index, IndexAllocator, RowMajorOrder, Alignment >:: segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const { using RealType = decltype( fetch( IndexType(), IndexType(), std::declval< bool& >(), args... ) ); if( RowMajorOrder ) { using RealType = decltype( fetch( IndexType(), IndexType() ) ); const IndexType segmentSize = this->segmentSize; auto l = [=] __cuda_callable__ ( const IndexType i, Args... args ) mutable { const IndexType begin = i * segmentSize; const IndexType end = begin + segmentSize; RealType aux( zero ); for( IndexType j = begin; j < end; j++ ) reduction( aux, fetch( i, j, args... ) ); bool compute( true ); for( IndexType j = begin; j < end && compute; j++ ) reduction( aux, fetch( i, j, compute, args... ) ); keeper( i, aux ); }; Algorithms::ParallelFor< Device >::exec( first, last, l, args... ); } else { using RealType = decltype( fetch( IndexType(), IndexType() ) ); const IndexType storageSize = this->getStorageSize(); const IndexType alignedSize = this->alignedSize; auto l = [=] __cuda_callable__ ( const IndexType i, Args... args ) mutable { const IndexType begin = i; const IndexType end = storageSize; RealType aux( zero ); for( IndexType j = begin; j < end; j += alignedSize ) reduction( aux, fetch( i, j, args... ) ); bool compute( true ); for( IndexType j = begin; j < end && compute; j += alignedSize ) reduction( aux, fetch( i, j, compute, args... ) ); keeper( i, aux ); }; Algorithms::ParallelFor< Device >::exec( first, last, l, args... ); Loading src/TNL/Containers/Segments/EllpackView.hpp +7 −6 Original line number Diff line number Diff line Loading @@ -245,31 +245,32 @@ void EllpackView< Device, Index, RowMajorOrder, Alignment >:: segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const { using RealType = decltype( fetch( IndexType(), IndexType(), std::declval< bool& >(), args... ) ); if( RowMajorOrder ) { using RealType = decltype( fetch( IndexType(), IndexType() ) ); const IndexType segmentSize = this->segmentSize; auto l = [=] __cuda_callable__ ( const IndexType i, Args... args ) mutable { const IndexType begin = i * segmentSize; const IndexType end = begin + segmentSize; RealType aux( zero ); for( IndexType j = begin; j < end; j++ ) reduction( aux, fetch( i, j, args... ) ); bool compute( true ); for( IndexType j = begin; j < end && compute; j++ ) reduction( aux, fetch( i, j, compute, args... ) ); keeper( i, aux ); }; Algorithms::ParallelFor< Device >::exec( first, last, l, args... ); } else { using RealType = decltype( fetch( IndexType(), IndexType() ) ); const IndexType storageSize = this->getStorageSize(); const IndexType alignedSize = this->alignedSize; auto l = [=] __cuda_callable__ ( const IndexType i, Args... args ) mutable { const IndexType begin = i; const IndexType end = storageSize; RealType aux( zero ); for( IndexType j = begin; j < end; j += alignedSize ) reduction( aux, fetch( i, j, args... ) ); bool compute( true ); for( IndexType j = begin; j < end && compute; j += alignedSize ) reduction( aux, fetch( i, j, compute, args... ) ); keeper( i, aux ); }; Algorithms::ParallelFor< Device >::exec( first, last, l, args... ); Loading src/TNL/Containers/Segments/SlicedEllpack.hpp +8 −6 Original line number Diff line number Diff line Loading @@ -127,7 +127,7 @@ setSegmentsSizes( const SizesHolder& sizes ) const auto sizes_view = sizes.getConstView(); auto slices_view = this->sliceOffsets.getView(); auto slice_segment_size_view = this->sliceSegmentSizes.getView(); auto fetch = [=] __cuda_callable__ ( IndexType segmentIdx, IndexType globalIdx ) -> IndexType { auto fetch = [=] __cuda_callable__ ( IndexType segmentIdx, IndexType globalIdx, bool& compute ) -> IndexType { if( globalIdx < _size ) return sizes_view[ globalIdx ]; return 0; Loading Loading @@ -341,7 +341,7 @@ void SlicedEllpack< Device, Index, IndexAllocator, RowMajorOrder, SliceSize >:: segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const { using RealType = decltype( fetch( IndexType(), IndexType() ) ); using RealType = decltype( fetch( IndexType(), IndexType(), std::declval< bool& >(), args... ) ); const auto sliceSegmentSizes_view = this->sliceSegmentSizes.getConstView(); const auto sliceOffsets_view = this->sliceOffsets.getConstView(); if( RowMajorOrder ) Loading @@ -353,8 +353,9 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& red const IndexType begin = sliceOffsets_view[ sliceIdx ] + segmentInSliceIdx * segmentSize; const IndexType end = begin + segmentSize; RealType aux( zero ); for( IndexType globalIdx = begin; globalIdx< end; globalIdx++ ) reduction( aux, fetch( segmentIdx, globalIdx, args... ) ); bool compute( true ); for( IndexType globalIdx = begin; globalIdx< end && compute; globalIdx++ ) reduction( aux, fetch( segmentIdx, globalIdx, compute, args... ) ); keeper( segmentIdx, aux ); }; Algorithms::ParallelFor< Device >::exec( first, last, l, args... ); Loading @@ -368,8 +369,9 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& red const IndexType begin = sliceOffsets_view[ sliceIdx ] + segmentInSliceIdx; const IndexType end = sliceOffsets_view[ sliceIdx + 1 ]; RealType aux( zero ); for( IndexType globalIdx = begin; globalIdx < end; globalIdx += SliceSize ) reduction( aux, fetch( segmentIdx, globalIdx, args... ) ); bool compute( true ); for( IndexType globalIdx = begin; globalIdx < end && compute; globalIdx += SliceSize ) reduction( aux, fetch( segmentIdx, globalIdx, compute, args... ) ); keeper( segmentIdx, aux ); }; Algorithms::ParallelFor< Device >::exec( first, last, l, args... ); Loading Loading
src/TNL/Containers/Segments/CSR.hpp +4 −3 Original line number Diff line number Diff line Loading @@ -218,14 +218,15 @@ void CSR< Device, Index, IndexAllocator >:: segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const { using RealType = decltype( fetch( IndexType(), IndexType() ) ); using RealType = decltype( fetch( IndexType(), IndexType(), std::declval< bool& >(), args... ) ); const auto offsetsView = this->offsets.getConstView(); auto l = [=] __cuda_callable__ ( const IndexType i, Args... args ) mutable { const IndexType begin = offsetsView[ i ]; const IndexType end = offsetsView[ i + 1 ]; RealType aux( zero ); for( IndexType j = begin; j < end; j++ ) reduction( aux, fetch( i, j, args... ) ); bool compute( true ); for( IndexType j = begin; j < end && compute; j++ ) reduction( aux, fetch( i, j, compute, args... ) ); keeper( i, aux ); }; Algorithms::ParallelFor< Device >::exec( first, last, l, args... ); Loading
src/TNL/Containers/Segments/CSRView.hpp +4 −3 Original line number Diff line number Diff line Loading @@ -204,14 +204,15 @@ void CSRView< Device, Index >:: segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const { using RealType = decltype( fetch( IndexType(), IndexType() ) ); using RealType = decltype( fetch( IndexType(), IndexType(), std::declval< bool& >(), args... ) ); const auto offsetsView = this->offsets.getConstView(); auto l = [=] __cuda_callable__ ( const IndexType i, Args... args ) mutable { const IndexType begin = offsetsView[ i ]; const IndexType end = offsetsView[ i + 1 ]; RealType aux( zero ); for( IndexType j = begin; j < end; j++ ) reduction( aux, fetch( i, j, args... ) ); bool compute( true ); for( IndexType j = begin; j < end && compute; j++ ) reduction( aux, fetch( i, j, compute, args... ) ); keeper( i, aux ); }; Algorithms::ParallelFor< Device >::exec( first, last, l, args... ); Loading
src/TNL/Containers/Segments/Ellpack.hpp +7 −6 Original line number Diff line number Diff line Loading @@ -306,31 +306,32 @@ void Ellpack< Device, Index, IndexAllocator, RowMajorOrder, Alignment >:: segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const { using RealType = decltype( fetch( IndexType(), IndexType(), std::declval< bool& >(), args... ) ); if( RowMajorOrder ) { using RealType = decltype( fetch( IndexType(), IndexType() ) ); const IndexType segmentSize = this->segmentSize; auto l = [=] __cuda_callable__ ( const IndexType i, Args... args ) mutable { const IndexType begin = i * segmentSize; const IndexType end = begin + segmentSize; RealType aux( zero ); for( IndexType j = begin; j < end; j++ ) reduction( aux, fetch( i, j, args... ) ); bool compute( true ); for( IndexType j = begin; j < end && compute; j++ ) reduction( aux, fetch( i, j, compute, args... ) ); keeper( i, aux ); }; Algorithms::ParallelFor< Device >::exec( first, last, l, args... ); } else { using RealType = decltype( fetch( IndexType(), IndexType() ) ); const IndexType storageSize = this->getStorageSize(); const IndexType alignedSize = this->alignedSize; auto l = [=] __cuda_callable__ ( const IndexType i, Args... args ) mutable { const IndexType begin = i; const IndexType end = storageSize; RealType aux( zero ); for( IndexType j = begin; j < end; j += alignedSize ) reduction( aux, fetch( i, j, args... ) ); bool compute( true ); for( IndexType j = begin; j < end && compute; j += alignedSize ) reduction( aux, fetch( i, j, compute, args... ) ); keeper( i, aux ); }; Algorithms::ParallelFor< Device >::exec( first, last, l, args... ); Loading
src/TNL/Containers/Segments/EllpackView.hpp +7 −6 Original line number Diff line number Diff line Loading @@ -245,31 +245,32 @@ void EllpackView< Device, Index, RowMajorOrder, Alignment >:: segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const { using RealType = decltype( fetch( IndexType(), IndexType(), std::declval< bool& >(), args... ) ); if( RowMajorOrder ) { using RealType = decltype( fetch( IndexType(), IndexType() ) ); const IndexType segmentSize = this->segmentSize; auto l = [=] __cuda_callable__ ( const IndexType i, Args... args ) mutable { const IndexType begin = i * segmentSize; const IndexType end = begin + segmentSize; RealType aux( zero ); for( IndexType j = begin; j < end; j++ ) reduction( aux, fetch( i, j, args... ) ); bool compute( true ); for( IndexType j = begin; j < end && compute; j++ ) reduction( aux, fetch( i, j, compute, args... ) ); keeper( i, aux ); }; Algorithms::ParallelFor< Device >::exec( first, last, l, args... ); } else { using RealType = decltype( fetch( IndexType(), IndexType() ) ); const IndexType storageSize = this->getStorageSize(); const IndexType alignedSize = this->alignedSize; auto l = [=] __cuda_callable__ ( const IndexType i, Args... args ) mutable { const IndexType begin = i; const IndexType end = storageSize; RealType aux( zero ); for( IndexType j = begin; j < end; j += alignedSize ) reduction( aux, fetch( i, j, args... ) ); bool compute( true ); for( IndexType j = begin; j < end && compute; j += alignedSize ) reduction( aux, fetch( i, j, compute, args... ) ); keeper( i, aux ); }; Algorithms::ParallelFor< Device >::exec( first, last, l, args... ); Loading
src/TNL/Containers/Segments/SlicedEllpack.hpp +8 −6 Original line number Diff line number Diff line Loading @@ -127,7 +127,7 @@ setSegmentsSizes( const SizesHolder& sizes ) const auto sizes_view = sizes.getConstView(); auto slices_view = this->sliceOffsets.getView(); auto slice_segment_size_view = this->sliceSegmentSizes.getView(); auto fetch = [=] __cuda_callable__ ( IndexType segmentIdx, IndexType globalIdx ) -> IndexType { auto fetch = [=] __cuda_callable__ ( IndexType segmentIdx, IndexType globalIdx, bool& compute ) -> IndexType { if( globalIdx < _size ) return sizes_view[ globalIdx ]; return 0; Loading Loading @@ -341,7 +341,7 @@ void SlicedEllpack< Device, Index, IndexAllocator, RowMajorOrder, SliceSize >:: segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const { using RealType = decltype( fetch( IndexType(), IndexType() ) ); using RealType = decltype( fetch( IndexType(), IndexType(), std::declval< bool& >(), args... ) ); const auto sliceSegmentSizes_view = this->sliceSegmentSizes.getConstView(); const auto sliceOffsets_view = this->sliceOffsets.getConstView(); if( RowMajorOrder ) Loading @@ -353,8 +353,9 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& red const IndexType begin = sliceOffsets_view[ sliceIdx ] + segmentInSliceIdx * segmentSize; const IndexType end = begin + segmentSize; RealType aux( zero ); for( IndexType globalIdx = begin; globalIdx< end; globalIdx++ ) reduction( aux, fetch( segmentIdx, globalIdx, args... ) ); bool compute( true ); for( IndexType globalIdx = begin; globalIdx< end && compute; globalIdx++ ) reduction( aux, fetch( segmentIdx, globalIdx, compute, args... ) ); keeper( segmentIdx, aux ); }; Algorithms::ParallelFor< Device >::exec( first, last, l, args... ); Loading @@ -368,8 +369,9 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& red const IndexType begin = sliceOffsets_view[ sliceIdx ] + segmentInSliceIdx; const IndexType end = sliceOffsets_view[ sliceIdx + 1 ]; RealType aux( zero ); for( IndexType globalIdx = begin; globalIdx < end; globalIdx += SliceSize ) reduction( aux, fetch( segmentIdx, globalIdx, args... ) ); bool compute( true ); for( IndexType globalIdx = begin; globalIdx < end && compute; globalIdx += SliceSize ) reduction( aux, fetch( segmentIdx, globalIdx, compute, args... ) ); keeper( segmentIdx, aux ); }; Algorithms::ParallelFor< Device >::exec( first, last, l, args... ); Loading