diff --git a/src/TNL/Containers/Segments/CSR.hpp b/src/TNL/Containers/Segments/CSR.hpp index 9ab2186c36f01fc8c4806659ab01ce84bb17ee63..83da548fc9eeefbea54c4d45883bd21cbe33f632 100644 --- a/src/TNL/Containers/Segments/CSR.hpp +++ b/src/TNL/Containers/Segments/CSR.hpp @@ -218,14 +218,15 @@ void CSR< Device, Index, IndexAllocator >:: segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const { - using RealType = decltype( fetch( IndexType(), IndexType() ) ); + using RealType = decltype( fetch( IndexType(), IndexType(), std::declval< bool& >(), args... ) ); const auto offsetsView = this->offsets.getConstView(); auto l = [=] __cuda_callable__ ( const IndexType i, Args... args ) mutable { const IndexType begin = offsetsView[ i ]; const IndexType end = offsetsView[ i + 1 ]; RealType aux( zero ); - for( IndexType j = begin; j < end; j++ ) - reduction( aux, fetch( i, j, args... ) ); + bool compute( true ); + for( IndexType j = begin; j < end && compute; j++ ) + reduction( aux, fetch( i, j, compute, args... ) ); keeper( i, aux ); }; Algorithms::ParallelFor< Device >::exec( first, last, l, args... ); diff --git a/src/TNL/Containers/Segments/CSRView.hpp b/src/TNL/Containers/Segments/CSRView.hpp index f4f59370d46a5cbcb8502ff5e30d181397c382ca..b4304ee321446ddec2074dba7bd0e8bf48f2c57b 100644 --- a/src/TNL/Containers/Segments/CSRView.hpp +++ b/src/TNL/Containers/Segments/CSRView.hpp @@ -204,14 +204,15 @@ void CSRView< Device, Index >:: segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const { - using RealType = decltype( fetch( IndexType(), IndexType() ) ); + using RealType = decltype( fetch( IndexType(), IndexType(), std::declval< bool& >(), args... ) ); const auto offsetsView = this->offsets.getConstView(); auto l = [=] __cuda_callable__ ( const IndexType i, Args... args ) mutable { const IndexType begin = offsetsView[ i ]; const IndexType end = offsetsView[ i + 1 ]; RealType aux( zero ); - for( IndexType j = begin; j < end; j++ ) - reduction( aux, fetch( i, j, args... ) ); + bool compute( true ); + for( IndexType j = begin; j < end && compute; j++ ) + reduction( aux, fetch( i, j, compute, args... ) ); keeper( i, aux ); }; Algorithms::ParallelFor< Device >::exec( first, last, l, args... ); diff --git a/src/TNL/Containers/Segments/Ellpack.hpp b/src/TNL/Containers/Segments/Ellpack.hpp index 9f7702a6f2e071cfd255a2c69ed20b6f9a9b2856..ebc2b360eb91a4100db7a1d6599b33536c74f4b6 100644 --- a/src/TNL/Containers/Segments/Ellpack.hpp +++ b/src/TNL/Containers/Segments/Ellpack.hpp @@ -306,31 +306,32 @@ void Ellpack< Device, Index, IndexAllocator, RowMajorOrder, Alignment >:: segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const { + using RealType = decltype( fetch( IndexType(), IndexType(), std::declval< bool& >(), args... ) ); if( RowMajorOrder ) { - using RealType = decltype( fetch( IndexType(), IndexType() ) ); const IndexType segmentSize = this->segmentSize; auto l = [=] __cuda_callable__ ( const IndexType i, Args... args ) mutable { const IndexType begin = i * segmentSize; const IndexType end = begin + segmentSize; RealType aux( zero ); - for( IndexType j = begin; j < end; j++ ) - reduction( aux, fetch( i, j, args... ) ); + bool compute( true ); + for( IndexType j = begin; j < end && compute; j++ ) + reduction( aux, fetch( i, j, compute, args... ) ); keeper( i, aux ); }; Algorithms::ParallelFor< Device >::exec( first, last, l, args... ); } else { - using RealType = decltype( fetch( IndexType(), IndexType() ) ); const IndexType storageSize = this->getStorageSize(); const IndexType alignedSize = this->alignedSize; auto l = [=] __cuda_callable__ ( const IndexType i, Args... args ) mutable { const IndexType begin = i; const IndexType end = storageSize; RealType aux( zero ); - for( IndexType j = begin; j < end; j += alignedSize ) - reduction( aux, fetch( i, j, args... ) ); + bool compute( true ); + for( IndexType j = begin; j < end && compute; j += alignedSize ) + reduction( aux, fetch( i, j, compute, args... ) ); keeper( i, aux ); }; Algorithms::ParallelFor< Device >::exec( first, last, l, args... ); diff --git a/src/TNL/Containers/Segments/EllpackView.hpp b/src/TNL/Containers/Segments/EllpackView.hpp index f5dba4f3d7fb65c5e144de97fde23f58a2893505..dc6bd485dd3faf5710268f590385037af4a3f521 100644 --- a/src/TNL/Containers/Segments/EllpackView.hpp +++ b/src/TNL/Containers/Segments/EllpackView.hpp @@ -245,31 +245,32 @@ void EllpackView< Device, Index, RowMajorOrder, Alignment >:: segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const { + using RealType = decltype( fetch( IndexType(), IndexType(), std::declval< bool& >(), args... ) ); if( RowMajorOrder ) { - using RealType = decltype( fetch( IndexType(), IndexType() ) ); const IndexType segmentSize = this->segmentSize; auto l = [=] __cuda_callable__ ( const IndexType i, Args... args ) mutable { const IndexType begin = i * segmentSize; const IndexType end = begin + segmentSize; RealType aux( zero ); - for( IndexType j = begin; j < end; j++ ) - reduction( aux, fetch( i, j, args... ) ); + bool compute( true ); + for( IndexType j = begin; j < end && compute; j++ ) + reduction( aux, fetch( i, j, compute, args... ) ); keeper( i, aux ); }; Algorithms::ParallelFor< Device >::exec( first, last, l, args... ); } else { - using RealType = decltype( fetch( IndexType(), IndexType() ) ); const IndexType storageSize = this->getStorageSize(); const IndexType alignedSize = this->alignedSize; auto l = [=] __cuda_callable__ ( const IndexType i, Args... args ) mutable { const IndexType begin = i; const IndexType end = storageSize; RealType aux( zero ); - for( IndexType j = begin; j < end; j += alignedSize ) - reduction( aux, fetch( i, j, args... ) ); + bool compute( true ); + for( IndexType j = begin; j < end && compute; j += alignedSize ) + reduction( aux, fetch( i, j, compute, args... ) ); keeper( i, aux ); }; Algorithms::ParallelFor< Device >::exec( first, last, l, args... ); diff --git a/src/TNL/Containers/Segments/SlicedEllpack.hpp b/src/TNL/Containers/Segments/SlicedEllpack.hpp index e2aec924d58ef22c5c896bc815506a3e468b68de..ecd32abb25cd08997d389c087bbb039507f4718e 100644 --- a/src/TNL/Containers/Segments/SlicedEllpack.hpp +++ b/src/TNL/Containers/Segments/SlicedEllpack.hpp @@ -127,7 +127,7 @@ setSegmentsSizes( const SizesHolder& sizes ) const auto sizes_view = sizes.getConstView(); auto slices_view = this->sliceOffsets.getView(); auto slice_segment_size_view = this->sliceSegmentSizes.getView(); - auto fetch = [=] __cuda_callable__ ( IndexType segmentIdx, IndexType globalIdx ) -> IndexType { + auto fetch = [=] __cuda_callable__ ( IndexType segmentIdx, IndexType globalIdx, bool& compute ) -> IndexType { if( globalIdx < _size ) return sizes_view[ globalIdx ]; return 0; @@ -341,7 +341,7 @@ void SlicedEllpack< Device, Index, IndexAllocator, RowMajorOrder, SliceSize >:: segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const { - using RealType = decltype( fetch( IndexType(), IndexType() ) ); + using RealType = decltype( fetch( IndexType(), IndexType(), std::declval< bool& >(), args... ) ); const auto sliceSegmentSizes_view = this->sliceSegmentSizes.getConstView(); const auto sliceOffsets_view = this->sliceOffsets.getConstView(); if( RowMajorOrder ) @@ -353,8 +353,9 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& red const IndexType begin = sliceOffsets_view[ sliceIdx ] + segmentInSliceIdx * segmentSize; const IndexType end = begin + segmentSize; RealType aux( zero ); - for( IndexType globalIdx = begin; globalIdx< end; globalIdx++ ) - reduction( aux, fetch( segmentIdx, globalIdx, args... ) ); + bool compute( true ); + for( IndexType globalIdx = begin; globalIdx< end && compute; globalIdx++ ) + reduction( aux, fetch( segmentIdx, globalIdx, compute, args... ) ); keeper( segmentIdx, aux ); }; Algorithms::ParallelFor< Device >::exec( first, last, l, args... ); @@ -368,8 +369,9 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& red const IndexType begin = sliceOffsets_view[ sliceIdx ] + segmentInSliceIdx; const IndexType end = sliceOffsets_view[ sliceIdx + 1 ]; RealType aux( zero ); - for( IndexType globalIdx = begin; globalIdx < end; globalIdx += SliceSize ) - reduction( aux, fetch( segmentIdx, globalIdx, args... ) ); + bool compute( true ); + for( IndexType globalIdx = begin; globalIdx < end && compute; globalIdx += SliceSize ) + reduction( aux, fetch( segmentIdx, globalIdx, compute, args... ) ); keeper( segmentIdx, aux ); }; Algorithms::ParallelFor< Device >::exec( first, last, l, args... ); diff --git a/src/TNL/Containers/Segments/SlicedEllpackView.hpp b/src/TNL/Containers/Segments/SlicedEllpackView.hpp index 139a09a15e515f1a5ffe83cbd4f3c5f2c8933490..41b49ed150459376067eb36e3a1b4b1616bd37c5 100644 --- a/src/TNL/Containers/Segments/SlicedEllpackView.hpp +++ b/src/TNL/Containers/Segments/SlicedEllpackView.hpp @@ -247,8 +247,9 @@ forSegments( IndexType first, IndexType last, Function& f, Args... args ) const const IndexType begin = sliceOffsets_view[ sliceIdx ] + segmentInSliceIdx * segmentSize; const IndexType end = begin + segmentSize; IndexType localIdx( 0 ); - for( IndexType globalIdx = begin; globalIdx < end; globalIdx++ ) - if( ! f( segmentIdx, localIdx++, globalIdx, args... ) ) + bool compute( true ); + for( IndexType globalIdx = begin; globalIdx < end && compute; globalIdx++ ) + if( ! f( segmentIdx, localIdx++, globalIdx, compute, args... ) ) break; }; Algorithms::ParallelFor< Device >::exec( first, last, l, args... ); @@ -262,8 +263,9 @@ forSegments( IndexType first, IndexType last, Function& f, Args... args ) const const IndexType begin = sliceOffsets_view[ sliceIdx ] + segmentInSliceIdx; const IndexType end = sliceOffsets_view[ sliceIdx + 1 ]; IndexType localIdx( 0 ); - for( IndexType globalIdx = begin; globalIdx < end; globalIdx += SliceSize ) - if( ! f( segmentIdx, localIdx++, globalIdx, args... ) ) + bool compute( true ); + for( IndexType globalIdx = begin; globalIdx < end && compute; globalIdx += SliceSize ) + if( ! f( segmentIdx, localIdx++, globalIdx, compute, args... ) ) break; }; Algorithms::ParallelFor< Device >::exec( first, last, l, args... ); @@ -291,7 +293,7 @@ void SlicedEllpackView< Device, Index, RowMajorOrder, SliceSize >:: segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const { - using RealType = decltype( fetch( IndexType(), IndexType() ) ); + using RealType = decltype( fetch( IndexType(), IndexType(), std::declval< bool& >(), args... ) ); const auto sliceSegmentSizes_view = this->sliceSegmentSizes.getConstView(); const auto sliceOffsets_view = this->sliceOffsets.getConstView(); if( RowMajorOrder ) @@ -303,8 +305,9 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& red const IndexType begin = sliceOffsets_view[ sliceIdx ] + segmentInSliceIdx * segmentSize; const IndexType end = begin + segmentSize; RealType aux( zero ); - for( IndexType globalIdx = begin; globalIdx< end; globalIdx++ ) - reduction( aux, fetch( segmentIdx, globalIdx, args... ) ); + bool compute( true ); + for( IndexType globalIdx = begin; globalIdx< end && compute; globalIdx++ ) + reduction( aux, fetch( segmentIdx, globalIdx, compute, args... ) ); keeper( segmentIdx, aux ); }; Algorithms::ParallelFor< Device >::exec( first, last, l, args... ); @@ -318,8 +321,9 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& red const IndexType begin = sliceOffsets_view[ sliceIdx ] + segmentInSliceIdx; const IndexType end = sliceOffsets_view[ sliceIdx + 1 ]; RealType aux( zero ); - for( IndexType globalIdx = begin; globalIdx < end; globalIdx += SliceSize ) - reduction( aux, fetch( segmentIdx, globalIdx, args... ) ); + bool compute( true ); + for( IndexType globalIdx = begin; globalIdx < end && compute; globalIdx += SliceSize ) + reduction( aux, fetch( segmentIdx, globalIdx, compute, args... ) ); keeper( segmentIdx, aux ); }; Algorithms::ParallelFor< Device >::exec( first, last, l, args... ); diff --git a/src/TNL/Matrices/SparseMatrix.hpp b/src/TNL/Matrices/SparseMatrix.hpp index c0dd3b9a3ecd9623177374f81b39bad96e3f7528..691157a9c467abb378017338110b986ecedcea77 100644 --- a/src/TNL/Matrices/SparseMatrix.hpp +++ b/src/TNL/Matrices/SparseMatrix.hpp @@ -628,9 +628,10 @@ vectorProduct( const InVector& inVector, const auto valuesView = this->values.getConstView(); const auto columnIndexesView = this->columnIndexes.getConstView(); const IndexType paddingIndex = this->getPaddingIndex(); - auto fetch = [=] __cuda_callable__ ( IndexType row, IndexType offset ) -> RealType { + auto fetch = [=] __cuda_callable__ ( IndexType row, IndexType offset, bool& compute ) -> RealType { const IndexType column = columnIndexesView[ offset ]; - if( column == paddingIndex ) + compute = ( column != paddingIndex ); + if( ! compute ) return 0.0; return valuesView[ offset ] * inVectorView[ column ]; }; @@ -658,7 +659,7 @@ rowsReduction( IndexType first, IndexType last, Fetch& fetch, Reduce& reduce, Ke const auto columns_view = this->columnIndexes.getConstView(); const auto values_view = this->values.getConstView(); const IndexType paddingIndex_ = this->getPaddingIndex(); - auto fetch_ = [=] __cuda_callable__ ( IndexType rowIdx, IndexType globalIdx ) mutable -> decltype( fetch( IndexType(), IndexType(), RealType() ) ) { + auto fetch_ = [=] __cuda_callable__ ( IndexType rowIdx, IndexType globalIdx, bool& compute ) mutable -> decltype( fetch( IndexType(), IndexType(), RealType() ) ) { IndexType columnIdx = columns_view[ globalIdx ]; if( columnIdx != paddingIndex_ ) return fetch( rowIdx, columnIdx, values_view[ globalIdx ] ); diff --git a/src/TNL/Matrices/SparseMatrixView.hpp b/src/TNL/Matrices/SparseMatrixView.hpp index 5ac494a9b6a593fed8f7c8580f6169bb4eb0c65a..ce0e7aa1818a743a6b11816ee2b2904e8c92cfb9 100644 --- a/src/TNL/Matrices/SparseMatrixView.hpp +++ b/src/TNL/Matrices/SparseMatrixView.hpp @@ -508,9 +508,10 @@ vectorProduct( const InVector& inVector, const auto valuesView = this->values.getConstView(); const auto columnIndexesView = this->columnIndexes.getConstView(); const IndexType paddingIndex = this->getPaddingIndex(); - auto fetch = [=] __cuda_callable__ ( IndexType row, IndexType offset ) -> RealType { + auto fetch = [=] __cuda_callable__ ( IndexType row, IndexType offset, bool& compute ) -> RealType { const IndexType column = columnIndexesView[ offset ]; - if( column == paddingIndex ) + compute = ( column != paddingIndex ); + if( ! compute ) return 0.0; return valuesView[ offset ] * inVectorView[ column ]; }; diff --git a/src/UnitTests/Containers/Segments/SegmentsTest.hpp b/src/UnitTests/Containers/Segments/SegmentsTest.hpp index 5e74f96b039b23102908c772f576b9592c1bb079..6189c2e9a4b3c3d81ef1be5466cd79b6d746f188 100644 --- a/src/UnitTests/Containers/Segments/SegmentsTest.hpp +++ b/src/UnitTests/Containers/Segments/SegmentsTest.hpp @@ -143,7 +143,7 @@ void test_AllReduction_MaximumInSegments() const auto v_view = v.getConstView(); auto result_view = result.getView(); - auto fetch = [=] __cuda_callable__ ( IndexType segmentIdx, IndexType globalIdx ) -> IndexType { + auto fetch = [=] __cuda_callable__ ( IndexType segmentIdx, IndexType globalIdx, bool& compute ) -> IndexType { return v_view[ globalIdx ]; }; auto reduce = [] __cuda_callable__ ( IndexType& a, const IndexType b ) {