From e24cafa2f33d088f4b955dddebf577bba5d7605a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com> Date: Fri, 12 Feb 2021 17:38:21 +0100 Subject: [PATCH] Added method havePadding to segments. --- src/TNL/Algorithms/Segments/BiEllpack.h | 2 + src/TNL/Algorithms/Segments/BiEllpackView.h | 2 + .../Segments/CSRAdaptiveKernelView.hpp | 181 ++++++++++++------ src/TNL/Algorithms/Segments/CSRView.h | 2 + src/TNL/Algorithms/Segments/ChunkedEllpack.h | 2 + .../Algorithms/Segments/ChunkedEllpackView.h | 2 + src/TNL/Algorithms/Segments/Ellpack.h | 2 + src/TNL/Algorithms/Segments/EllpackView.h | 2 + src/TNL/Algorithms/Segments/SlicedEllpack.h | 2 + .../Algorithms/Segments/SlicedEllpackView.h | 2 + src/TNL/Matrices/SparseMatrixView.hpp | 11 +- 11 files changed, 144 insertions(+), 66 deletions(-) diff --git a/src/TNL/Algorithms/Segments/BiEllpack.h b/src/TNL/Algorithms/Segments/BiEllpack.h index e7f01e6125..ffc2e53642 100644 --- a/src/TNL/Algorithms/Segments/BiEllpack.h +++ b/src/TNL/Algorithms/Segments/BiEllpack.h @@ -38,6 +38,8 @@ class BiEllpack using ConstViewType = BiEllpackView< Device, std::add_const_t< IndexType >, Organization >; using SegmentViewType = BiEllpackSegmentView< IndexType, Organization >; + static constexpr bool havePadding() { return true; }; + BiEllpack() = default; BiEllpack( const Containers::Vector< IndexType, DeviceType, IndexType >& sizes ); diff --git a/src/TNL/Algorithms/Segments/BiEllpackView.h b/src/TNL/Algorithms/Segments/BiEllpackView.h index 53278511ce..d7dc52054d 100644 --- a/src/TNL/Algorithms/Segments/BiEllpackView.h +++ b/src/TNL/Algorithms/Segments/BiEllpackView.h @@ -40,6 +40,8 @@ class BiEllpackView using ConstViewType = BiEllpackView< Device, std::add_const_t< Index > >; using SegmentViewType = BiEllpackSegmentView< IndexType, Organization >; + static constexpr bool havePadding() { return true; }; + __cuda_callable__ BiEllpackView() = default; diff --git a/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp b/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp index 35424d93c2..40700c50f2 100644 --- a/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp +++ b/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp @@ -53,46 +53,39 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks, constexpr size_t StreamedSharedElementsPerWarp = details::CSRAdaptiveKernelParameters< sizeof( Real ) >::StreamedSharedElementsPerWarp(); __shared__ Real streamShared[ WarpsCount ][ StreamedSharedElementsPerWarp ]; - __shared__ Real multivectorShared[ CudaBlockSize / WarpSize ]; - __shared__ BlockType sharedBlocks[ WarpsCount ]; + //__shared__ Real multivectorShared[ CudaBlockSize / WarpSize ]; + //__shared__ BlockType sharedBlocks[ WarpsCount ]; const Index index = ( ( gridIdx * TNL::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x ) + threadIdx.x; const Index blockIdx = index / WarpSize; if( blockIdx >= blocks.getSize() - 1 ) return; - if( threadIdx.x < CudaBlockSize / WarpSize ) - multivectorShared[ threadIdx.x ] = zero; + //if( threadIdx.x < CudaBlockSize / WarpSize ) + // multivectorShared[ threadIdx.x ] = zero; Real result = zero; bool compute( true ); const Index laneIdx = threadIdx.x & 31; // & is cheaper than % - const Index warpIdx = threadIdx.x / 32; /*if( laneIdx == 0 ) sharedBlocks[ warpIdx ] = blocks[ blockIdx ]; __syncthreads(); const auto& block = sharedBlocks[ warpIdx ];*/ const BlockType block = blocks[ blockIdx ]; - const Index& firstSegmentIdx = block.getFirstSegment(); - const Index begin = offsets[ 
firstSegmentIdx ]; + const Index begin = offsets[ block.getFirstSegment() ]; - const auto blockType = block.getType(); - if( blockType == details::Type::STREAM ) // Stream kernel - many short segments per warp + if( block.getType() == details::Type::STREAM ) // Stream kernel - many short segments per warp { - + const Index warpIdx = threadIdx.x / 32; const Index end = begin + block.getSize(); // Stream data to shared memory for( Index globalIdx = laneIdx + begin; globalIdx < end; globalIdx += WarpSize ) { - streamShared[ warpIdx ][ globalIdx - begin ] = //fetch( globalIdx, compute ); - details::FetchLambdaAdapter< Index, Fetch >::call( fetch, -1, -1, globalIdx, compute ); - // TODO:: fix this by template specialization so that we can assume fetch lambda - // with short parameters + streamShared[ warpIdx ][ globalIdx - begin ] = fetch( globalIdx, compute ); } + //const Index lastSegmentIdx = firstSegmentIdx + block.getSegmentsInBlock(); - const Index lastSegmentIdx = firstSegmentIdx + block.getSegmentsInBlock(); - - for( Index i = firstSegmentIdx + laneIdx; i < lastSegmentIdx; i += WarpSize ) + /*for( Index i = firstSegmentIdx + laneIdx; i < lastSegmentIdx; i += WarpSize ) { const Index sharedEnd = offsets[ i + 1 ] - begin; // end of preprocessed data result = zero; @@ -100,15 +93,15 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks, for( Index sharedIdx = offsets[ i ] - begin; sharedIdx < sharedEnd; sharedIdx++ ) result = reduce( result, streamShared[ warpIdx ][ sharedIdx ] ); keep( i, result ); - } + }*/ } - else if( blockType == details::Type::VECTOR ) // Vector kernel - one segment per warp + /*else if( block.getType() == details::Type::VECTOR ) // Vector kernel - one segment per warp { const Index end = begin + block.getSize(); const Index segmentIdx = block.getFirstSegment(); for( Index globalIdx = begin + laneIdx; globalIdx < end; globalIdx += WarpSize ) - result = reduce( result, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, -1, globalIdx, compute ) ); // fix local idx + result = reduce( result, fetch( globalIdx, compute ) ); // Parallel reduction result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 16 ) ); @@ -119,7 +112,7 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks, if( laneIdx == 0 ) keep( segmentIdx, result ); } - else // blockType == Type::LONG - several warps per segment + else // block.getType() == Type::LONG - several warps per segment { const Index segmentIdx = block.getFirstSegment();//block.index[0]; const Index end = offsets[segmentIdx + 1]; @@ -130,7 +123,7 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks, globalIdx < end; globalIdx += TNL::Cuda::getWarpSize() * block.getWarpsCount() ) { - result = reduce( result, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, -1, globalIdx, compute ) ); + result = reduce( result, fetch( globalIdx, compute ) ); } result += __shfl_down_sync(0xFFFFFFFF, result, 16); @@ -179,10 +172,111 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks, keep( segmentIdx, multivectorShared[ 0 ] ); } } - } + }*/ } #endif +template< typename Index, + typename Device, + typename Fetch, + typename Reduction, + typename ResultKeeper, + bool DispatchScalarCSR = + details::CheckFetchLambda< Index, Fetch >::hasAllParameters() || + std::is_same< Device, Devices::Host >::value > +struct CSRAdaptiveKernelSegmentsReductionDispatcher; + +template< typename Index, + typename Device, + typename Fetch, + typename Reduction, + typename ResultKeeper > +struct 
CSRAdaptiveKernelSegmentsReductionDispatcher< Index, Device, Fetch, Reduction, ResultKeeper, true > +{ + + template< typename BlocksView, + typename Offsets, + typename Real, + typename... Args > + static void reduce( const Offsets& offsets, + const BlocksView& blocks, + Index first, + Index last, + Fetch& fetch, + const Reduction& reduction, + ResultKeeper& keeper, + const Real& zero, + Args... args) + { + TNL::Algorithms::Segments::CSRScalarKernel< Index, Device >:: + segmentsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... ); + } +}; + +template< typename Index, + typename Device, + typename Fetch, + typename Reduction, + typename ResultKeeper > +struct CSRAdaptiveKernelSegmentsReductionDispatcher< Index, Device, Fetch, Reduction, ResultKeeper, false > +{ + template< typename BlocksView, + typename Offsets, + typename Real, + typename... Args > + static void reduce( const Offsets& offsets, + const BlocksView& blocks, + Index first, + Index last, + Fetch& fetch, + const Reduction& reduction, + ResultKeeper& keeper, + const Real& zero, + Args... args) + { +#ifdef HAVE_CUDA + + Index blocksCount; + + const Index threads = details::CSRAdaptiveKernelParameters< sizeof( Real ) >::CudaBlockSize(); + constexpr size_t maxGridSize = TNL::Cuda::getMaxGridSize(); + + // Fill blocks + size_t neededThreads = blocks.getSize() * TNL::Cuda::getWarpSize(); // one warp per block + // Execute kernels on device + for (Index gridIdx = 0; neededThreads != 0; gridIdx++ ) + { + if( maxGridSize * threads >= neededThreads ) + { + blocksCount = roundUpDivision( neededThreads, threads ); + neededThreads = 0; + } + else + { + blocksCount = maxGridSize; + neededThreads -= maxGridSize * threads; + } + + segmentsReductionCSRAdaptiveKernel< + BlocksView, + Offsets, + Index, Fetch, Reduction, ResultKeeper, Real, Args... > + <<<blocksCount, threads>>>( + blocks, + gridIdx, + offsets, + first, + last, + fetch, + reduction, + keeper, + zero, + args... ); + } +#endif + } +}; + template< typename Index, typename Device > void @@ -238,7 +332,6 @@ segmentsReduction( const OffsetsView& offsets, const Real& zero, Args... args ) const { -#ifdef HAVE_CUDA int valueSizeLog = getSizeValueLog( sizeof( Real ) ); if( details::CheckFetchLambda< Index, Fetch >::hasAllParameters() || valueSizeLog >= MaxValueSizeLog ) @@ -248,44 +341,8 @@ segmentsReduction( const OffsetsView& offsets, return; } - Index blocksCount; - - const Index threads = details::CSRAdaptiveKernelParameters< sizeof( Real ) >::CudaBlockSize(); - constexpr size_t maxGridSize = TNL::Cuda::getMaxGridSize(); - - // Fill blocks - size_t neededThreads = this->blocksArray[ valueSizeLog ].getSize() * TNL::Cuda::getWarpSize(); // one warp per block - // Execute kernels on device - for (Index gridIdx = 0; neededThreads != 0; gridIdx++ ) - { - if( maxGridSize * threads >= neededThreads ) - { - blocksCount = roundUpDivision( neededThreads, threads ); - neededThreads = 0; - } - else - { - blocksCount = maxGridSize; - neededThreads -= maxGridSize * threads; - } - - segmentsReductionCSRAdaptiveKernel< - BlocksView, - OffsetsView, - Index, Fetch, Reduction, ResultKeeper, Real, Args... > - <<<blocksCount, threads>>>( - this->blocksArray[ valueSizeLog ], - gridIdx, - offsets, - first, - last, - fetch, - reduction, - keeper, - zero, - args... ); - } -#endif + CSRAdaptiveKernelSegmentsReductionDispatcher< Index, Device, Fetch, Reduction, ResultKeeper >::template + reduce< BlocksView, OffsetsView, Real, Args... 
>( offsets, this->blocksArray[ valueSizeLog ], first, last, fetch, reduction, keeper, zero, args... ); } template< typename Index, diff --git a/src/TNL/Algorithms/Segments/CSRView.h b/src/TNL/Algorithms/Segments/CSRView.h index a97d784536..a8631a92b6 100644 --- a/src/TNL/Algorithms/Segments/CSRView.h +++ b/src/TNL/Algorithms/Segments/CSRView.h @@ -42,6 +42,8 @@ class CSRView using ConstViewType = CSRView< Device, std::add_const_t< Index >, Kernel >; using SegmentViewType = SegmentView< IndexType, RowMajorOrder >; + static constexpr bool havePadding() { return false; }; + __cuda_callable__ CSRView(); diff --git a/src/TNL/Algorithms/Segments/ChunkedEllpack.h b/src/TNL/Algorithms/Segments/ChunkedEllpack.h index 81f1fb7153..48628f7540 100644 --- a/src/TNL/Algorithms/Segments/ChunkedEllpack.h +++ b/src/TNL/Algorithms/Segments/ChunkedEllpack.h @@ -41,6 +41,8 @@ class ChunkedEllpack using ChunkedEllpackSliceInfoAllocator = typename Allocators::Default< Device >::template Allocator< ChunkedEllpackSliceInfoType >; using ChunkedEllpackSliceInfoContainer = Containers::Array< ChunkedEllpackSliceInfoType, DeviceType, IndexType, ChunkedEllpackSliceInfoAllocator >; + static constexpr bool havePadding() { return true; }; + ChunkedEllpack() = default; ChunkedEllpack( const Containers::Vector< IndexType, DeviceType, IndexType >& sizes ); diff --git a/src/TNL/Algorithms/Segments/ChunkedEllpackView.h b/src/TNL/Algorithms/Segments/ChunkedEllpackView.h index 8689167e5d..d5192e53eb 100644 --- a/src/TNL/Algorithms/Segments/ChunkedEllpackView.h +++ b/src/TNL/Algorithms/Segments/ChunkedEllpackView.h @@ -43,6 +43,8 @@ class ChunkedEllpackView using ChunkedEllpackSliceInfoContainer = Containers::Array< ChunkedEllpackSliceInfoType, DeviceType, IndexType, ChunkedEllpackSliceInfoAllocator >; using ChunkedEllpackSliceInfoContainerView = typename ChunkedEllpackSliceInfoContainer::ViewType; + static constexpr bool havePadding() { return true; }; + __cuda_callable__ ChunkedEllpackView() = default; diff --git a/src/TNL/Algorithms/Segments/Ellpack.h b/src/TNL/Algorithms/Segments/Ellpack.h index e5bcaf8e65..79ad4745d5 100644 --- a/src/TNL/Algorithms/Segments/Ellpack.h +++ b/src/TNL/Algorithms/Segments/Ellpack.h @@ -39,6 +39,8 @@ class Ellpack using ConstViewType = typename ViewType::ConstViewType; using SegmentViewType = SegmentView< IndexType, Organization >; + static constexpr bool havePadding() { return true; }; + Ellpack(); Ellpack( const SegmentsSizes& sizes ); diff --git a/src/TNL/Algorithms/Segments/EllpackView.h b/src/TNL/Algorithms/Segments/EllpackView.h index 981d71244b..bb1fcc1bf8 100644 --- a/src/TNL/Algorithms/Segments/EllpackView.h +++ b/src/TNL/Algorithms/Segments/EllpackView.h @@ -41,6 +41,8 @@ class EllpackView using ConstViewType = ViewType; using SegmentViewType = SegmentView< IndexType, Organization >; + static constexpr bool havePadding() { return true; }; + __cuda_callable__ EllpackView(); diff --git a/src/TNL/Algorithms/Segments/SlicedEllpack.h b/src/TNL/Algorithms/Segments/SlicedEllpack.h index 580af7897c..e7f3a20616 100644 --- a/src/TNL/Algorithms/Segments/SlicedEllpack.h +++ b/src/TNL/Algorithms/Segments/SlicedEllpack.h @@ -39,6 +39,8 @@ class SlicedEllpack using ConstViewType = SlicedEllpackView< Device, std::add_const_t< Index >, Organization, SliceSize >; using SegmentViewType = SegmentView< IndexType, Organization >; + static constexpr bool havePadding() { return true; }; + SlicedEllpack(); SlicedEllpack( const Containers::Vector< IndexType, DeviceType, IndexType >& sizes ); diff --git 
a/src/TNL/Algorithms/Segments/SlicedEllpackView.h b/src/TNL/Algorithms/Segments/SlicedEllpackView.h index 46fc80aef5..f3c257bd0f 100644 --- a/src/TNL/Algorithms/Segments/SlicedEllpackView.h +++ b/src/TNL/Algorithms/Segments/SlicedEllpackView.h @@ -39,6 +39,8 @@ class SlicedEllpackView using ConstViewType = ViewType; using SegmentViewType = SegmentView< IndexType, Organization >; + static constexpr bool havePadding() { return true; }; + __cuda_callable__ SlicedEllpackView(); diff --git a/src/TNL/Matrices/SparseMatrixView.hpp b/src/TNL/Matrices/SparseMatrixView.hpp index d4ea65b3d2..ae83dfa891 100644 --- a/src/TNL/Matrices/SparseMatrixView.hpp +++ b/src/TNL/Matrices/SparseMatrixView.hpp @@ -437,15 +437,18 @@ vectorProduct( const InVector& inVector, }; auto fetch = [=] __cuda_callable__ ( IndexType globalIdx, bool& compute ) mutable -> ComputeRealType { const IndexType column = columnIndexesView[ globalIdx ]; - compute = ( column != paddingIndex ); - if( ! compute ) - return 0.0; + if( SegmentsViewType::havePadding() ) + { + compute = ( column != paddingIndex ); + if( ! compute ) + return 0.0; + } if( isBinary() ) return inVectorView[ column ]; return valuesView[ globalIdx ] * inVectorView[ column ]; }; - auto keeper = [=] __cuda_callable__ ( IndexType row, const ComputeRealType& value ) mutable { + auto keeperGeneral = [=] __cuda_callable__ ( IndexType row, const ComputeRealType& value ) mutable { if( isSymmetric() ) { typename OutVector::RealType aux = matrixMultiplicator * value; -- GitLab
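Editor's note: the core idea of this patch is the compile-time havePadding() trait that lets SparseMatrixView skip the padding-index test for segment formats that never pad (CSR), while keeping it for Ellpack-style formats. Below is a minimal, self-contained sketch of that pattern. CSRLike, EllpackLike, and fetchValue are illustrative stand-ins, not the actual TNL types; only the trait itself mirrors the patch.

#include <iostream>
#include <vector>

struct CSRLike
{
   // CSR stores exactly the nonzeros, so no padding entries exist.
   static constexpr bool havePadding() { return false; }
};

struct EllpackLike
{
   // Ellpack pads every row to a fixed length, marked by a padding index.
   static constexpr bool havePadding() { return true; }
};

template< typename Segments >
double fetchValue( const std::vector< int >& columns,
                   const std::vector< double >& values,
                   const std::vector< double >& x,
                   int globalIdx,
                   int paddingIndex = -1 )
{
   // Because havePadding() is a compile-time constant, the compiler can
   // elide this test entirely for formats without padding, which is what
   // the patch exploits in the fetch lambda of vectorProduct.
   if( Segments::havePadding() && columns[ globalIdx ] == paddingIndex )
      return 0.0;
   return values[ globalIdx ] * x[ columns[ globalIdx ] ];
}

int main()
{
   std::vector< int > columns{ 0, 1, -1 };       // last entry is padding
   std::vector< double > values{ 2.0, 3.0, 0.0 };
   std::vector< double > x{ 1.0, 1.0 };

   // Ellpack-style fetch must skip the padded entry; a CSR-style fetch
   // would iterate over the first two (real) entries only.
   double sum = 0.0;
   for( int i = 0; i < 3; i++ )
      sum += fetchValue< EllpackLike >( columns, values, x, i );
   std::cout << sum << "\n";   // prints 5
}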
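The second technique in the patch is the dispatcher struct: CSRAdaptiveKernelSegmentsReductionDispatcher selects, via a boolean template parameter, between the scalar CSR fallback (when the fetch lambda takes the full parameter set, or on the host) and the adaptive CUDA kernel launch. Here is a minimal sketch of that compile-time dispatch via partial specialization, under the assumption that printing stands in for the kernel calls; Host, Cuda, and ReductionDispatcher are hypothetical stand-ins for TNL's Devices::Host, Devices::Cuda, and the dispatcher in the patch.

#include <iostream>
#include <type_traits>

struct Host {};
struct Cuda {};

// Primary template: the bool parameter picks the implementation, just as
// the patch combines CheckFetchLambda<...>::hasAllParameters() with a
// device check to choose the scalar fallback.
template< typename Device,
          bool UseScalarFallback = std::is_same< Device, Host >::value >
struct ReductionDispatcher;

template< typename Device >
struct ReductionDispatcher< Device, true >
{
   static void reduce()
   {
      // In the real code this branch forwards to CSRScalarKernel.
      std::cout << "scalar fallback kernel\n";
   }
};

template< typename Device >
struct ReductionDispatcher< Device, false >
{
   static void reduce()
   {
      // In the real code this branch computes the grid size and launches
      // segmentsReductionCSRAdaptiveKernel under #ifdef HAVE_CUDA.
      std::cout << "adaptive CUDA kernel\n";
   }
};

int main()
{
   ReductionDispatcher< Host >::reduce();   // scalar fallback kernel
   ReductionDispatcher< Cuda >::reduce();   // adaptive CUDA kernel
}

The benefit of routing through such a dispatcher, rather than an #ifdef inside segmentsReduction, is that the CUDA launch code is only instantiated for the template arguments that can actually reach it, which avoids instantiating the device kernel for fetch lambdas it cannot handle.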