diff --git a/src/TNL/Algorithms/Segments/BiEllpack.h b/src/TNL/Algorithms/Segments/BiEllpack.h index e7f01e6125fb2c7a66a5e0933caf92fe99be3ab5..ffc2e53642fae69b1b0cc53513b0de3e85aa47cd 100644 --- a/src/TNL/Algorithms/Segments/BiEllpack.h +++ b/src/TNL/Algorithms/Segments/BiEllpack.h @@ -38,6 +38,8 @@ class BiEllpack using ConstViewType = BiEllpackView< Device, std::add_const_t< IndexType >, Organization >; using SegmentViewType = BiEllpackSegmentView< IndexType, Organization >; + static constexpr bool havePadding() { return true; }; + BiEllpack() = default; BiEllpack( const Containers::Vector< IndexType, DeviceType, IndexType >& sizes ); diff --git a/src/TNL/Algorithms/Segments/BiEllpackView.h b/src/TNL/Algorithms/Segments/BiEllpackView.h index 53278511ceba54492e684862307a822e24c08461..d7dc52054d87d0abf5ae46ba594d9a71c0059e0a 100644 --- a/src/TNL/Algorithms/Segments/BiEllpackView.h +++ b/src/TNL/Algorithms/Segments/BiEllpackView.h @@ -40,6 +40,8 @@ class BiEllpackView using ConstViewType = BiEllpackView< Device, std::add_const_t< Index > >; using SegmentViewType = BiEllpackSegmentView< IndexType, Organization >; + static constexpr bool havePadding() { return true; }; + __cuda_callable__ BiEllpackView() = default; diff --git a/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp b/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp index 35424d93c22bc18b7525c13c1acf8c574c380aae..40700c50f25ed5c50539bf0d61dc7a6330212cfc 100644 --- a/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp +++ b/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp @@ -53,46 +53,39 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks, constexpr size_t StreamedSharedElementsPerWarp = details::CSRAdaptiveKernelParameters< sizeof( Real ) >::StreamedSharedElementsPerWarp(); __shared__ Real streamShared[ WarpsCount ][ StreamedSharedElementsPerWarp ]; - __shared__ Real multivectorShared[ CudaBlockSize / WarpSize ]; - __shared__ BlockType sharedBlocks[ WarpsCount ]; + //__shared__ Real multivectorShared[ CudaBlockSize / WarpSize ]; + //__shared__ BlockType sharedBlocks[ WarpsCount ]; const Index index = ( ( gridIdx * TNL::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x ) + threadIdx.x; const Index blockIdx = index / WarpSize; if( blockIdx >= blocks.getSize() - 1 ) return; - if( threadIdx.x < CudaBlockSize / WarpSize ) - multivectorShared[ threadIdx.x ] = zero; + //if( threadIdx.x < CudaBlockSize / WarpSize ) + // multivectorShared[ threadIdx.x ] = zero; Real result = zero; bool compute( true ); const Index laneIdx = threadIdx.x & 31; // & is cheaper than % - const Index warpIdx = threadIdx.x / 32; /*if( laneIdx == 0 ) sharedBlocks[ warpIdx ] = blocks[ blockIdx ]; __syncthreads(); const auto& block = sharedBlocks[ warpIdx ];*/ const BlockType block = blocks[ blockIdx ]; - const Index& firstSegmentIdx = block.getFirstSegment(); - const Index begin = offsets[ firstSegmentIdx ]; + const Index begin = offsets[ block.getFirstSegment() ]; - const auto blockType = block.getType(); - if( blockType == details::Type::STREAM ) // Stream kernel - many short segments per warp + if( block.getType() == details::Type::STREAM ) // Stream kernel - many short segments per warp { - + const Index warpIdx = threadIdx.x / 32; const Index end = begin + block.getSize(); // Stream data to shared memory for( Index globalIdx = laneIdx + begin; globalIdx < end; globalIdx += WarpSize ) { - streamShared[ warpIdx ][ globalIdx - begin ] = //fetch( globalIdx, compute ); - details::FetchLambdaAdapter< Index, Fetch >::call( fetch, -1, -1, 
globalIdx, compute ); - // TODO:: fix this by template specialization so that we can assume fetch lambda - // with short parameters + streamShared[ warpIdx ][ globalIdx - begin ] = fetch( globalIdx, compute ); } + //const Index lastSegmentIdx = firstSegmentIdx + block.getSegmentsInBlock(); - const Index lastSegmentIdx = firstSegmentIdx + block.getSegmentsInBlock(); - - for( Index i = firstSegmentIdx + laneIdx; i < lastSegmentIdx; i += WarpSize ) + /*for( Index i = firstSegmentIdx + laneIdx; i < lastSegmentIdx; i += WarpSize ) { const Index sharedEnd = offsets[ i + 1 ] - begin; // end of preprocessed data result = zero; @@ -100,15 +93,15 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks, for( Index sharedIdx = offsets[ i ] - begin; sharedIdx < sharedEnd; sharedIdx++ ) result = reduce( result, streamShared[ warpIdx ][ sharedIdx ] ); keep( i, result ); - } + }*/ } - else if( blockType == details::Type::VECTOR ) // Vector kernel - one segment per warp + /*else if( block.getType() == details::Type::VECTOR ) // Vector kernel - one segment per warp { const Index end = begin + block.getSize(); const Index segmentIdx = block.getFirstSegment(); for( Index globalIdx = begin + laneIdx; globalIdx < end; globalIdx += WarpSize ) - result = reduce( result, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, -1, globalIdx, compute ) ); // fix local idx + result = reduce( result, fetch( globalIdx, compute ) ); // Parallel reduction result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 16 ) ); @@ -119,7 +112,7 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks, if( laneIdx == 0 ) keep( segmentIdx, result ); } - else // blockType == Type::LONG - several warps per segment + else // block.getType() == Type::LONG - several warps per segment { const Index segmentIdx = block.getFirstSegment();//block.index[0]; const Index end = offsets[segmentIdx + 1]; @@ -130,7 +123,7 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks, globalIdx < end; globalIdx += TNL::Cuda::getWarpSize() * block.getWarpsCount() ) { - result = reduce( result, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, -1, globalIdx, compute ) ); + result = reduce( result, fetch( globalIdx, compute ) ); } result += __shfl_down_sync(0xFFFFFFFF, result, 16); @@ -179,10 +172,111 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks, keep( segmentIdx, multivectorShared[ 0 ] ); } } - } + }*/ } #endif +template< typename Index, + typename Device, + typename Fetch, + typename Reduction, + typename ResultKeeper, + bool DispatchScalarCSR = + details::CheckFetchLambda< Index, Fetch >::hasAllParameters() || + std::is_same< Device, Devices::Host >::value > +struct CSRAdaptiveKernelSegmentsReductionDispatcher; + +template< typename Index, + typename Device, + typename Fetch, + typename Reduction, + typename ResultKeeper > +struct CSRAdaptiveKernelSegmentsReductionDispatcher< Index, Device, Fetch, Reduction, ResultKeeper, true > +{ + + template< typename BlocksView, + typename Offsets, + typename Real, + typename... Args > + static void reduce( const Offsets& offsets, + const BlocksView& blocks, + Index first, + Index last, + Fetch& fetch, + const Reduction& reduction, + ResultKeeper& keeper, + const Real& zero, + Args... args) + { + TNL::Algorithms::Segments::CSRScalarKernel< Index, Device >:: + segmentsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... 
); + } +}; + +template< typename Index, + typename Device, + typename Fetch, + typename Reduction, + typename ResultKeeper > +struct CSRAdaptiveKernelSegmentsReductionDispatcher< Index, Device, Fetch, Reduction, ResultKeeper, false > +{ + template< typename BlocksView, + typename Offsets, + typename Real, + typename... Args > + static void reduce( const Offsets& offsets, + const BlocksView& blocks, + Index first, + Index last, + Fetch& fetch, + const Reduction& reduction, + ResultKeeper& keeper, + const Real& zero, + Args... args) + { +#ifdef HAVE_CUDA + + Index blocksCount; + + const Index threads = details::CSRAdaptiveKernelParameters< sizeof( Real ) >::CudaBlockSize(); + constexpr size_t maxGridSize = TNL::Cuda::getMaxGridSize(); + + // Fill blocks + size_t neededThreads = blocks.getSize() * TNL::Cuda::getWarpSize(); // one warp per block + // Execute kernels on device + for (Index gridIdx = 0; neededThreads != 0; gridIdx++ ) + { + if( maxGridSize * threads >= neededThreads ) + { + blocksCount = roundUpDivision( neededThreads, threads ); + neededThreads = 0; + } + else + { + blocksCount = maxGridSize; + neededThreads -= maxGridSize * threads; + } + + segmentsReductionCSRAdaptiveKernel< + BlocksView, + Offsets, + Index, Fetch, Reduction, ResultKeeper, Real, Args... > + <<<blocksCount, threads>>>( + blocks, + gridIdx, + offsets, + first, + last, + fetch, + reduction, + keeper, + zero, + args... ); + } +#endif + } +}; + template< typename Index, typename Device > void @@ -238,7 +332,6 @@ segmentsReduction( const OffsetsView& offsets, const Real& zero, Args... args ) const { -#ifdef HAVE_CUDA int valueSizeLog = getSizeValueLog( sizeof( Real ) ); if( details::CheckFetchLambda< Index, Fetch >::hasAllParameters() || valueSizeLog >= MaxValueSizeLog ) @@ -248,44 +341,8 @@ segmentsReduction( const OffsetsView& offsets, return; } - Index blocksCount; - - const Index threads = details::CSRAdaptiveKernelParameters< sizeof( Real ) >::CudaBlockSize(); - constexpr size_t maxGridSize = TNL::Cuda::getMaxGridSize(); - - // Fill blocks - size_t neededThreads = this->blocksArray[ valueSizeLog ].getSize() * TNL::Cuda::getWarpSize(); // one warp per block - // Execute kernels on device - for (Index gridIdx = 0; neededThreads != 0; gridIdx++ ) - { - if( maxGridSize * threads >= neededThreads ) - { - blocksCount = roundUpDivision( neededThreads, threads ); - neededThreads = 0; - } - else - { - blocksCount = maxGridSize; - neededThreads -= maxGridSize * threads; - } - - segmentsReductionCSRAdaptiveKernel< - BlocksView, - OffsetsView, - Index, Fetch, Reduction, ResultKeeper, Real, Args... > - <<<blocksCount, threads>>>( - this->blocksArray[ valueSizeLog ], - gridIdx, - offsets, - first, - last, - fetch, - reduction, - keeper, - zero, - args... ); - } -#endif + CSRAdaptiveKernelSegmentsReductionDispatcher< Index, Device, Fetch, Reduction, ResultKeeper >::template + reduce< BlocksView, OffsetsView, Real, Args... >( offsets, this->blocksArray[ valueSizeLog ], first, last, fetch, reduction, keeper, zero, args... 
); } template< typename Index, diff --git a/src/TNL/Algorithms/Segments/CSRView.h b/src/TNL/Algorithms/Segments/CSRView.h index a97d784536c1c2b0105412edaffb814ec36a1bc0..a8631a92b60525ae3da73ce695b2edee3f02d6ae 100644 --- a/src/TNL/Algorithms/Segments/CSRView.h +++ b/src/TNL/Algorithms/Segments/CSRView.h @@ -42,6 +42,8 @@ class CSRView using ConstViewType = CSRView< Device, std::add_const_t< Index >, Kernel >; using SegmentViewType = SegmentView< IndexType, RowMajorOrder >; + static constexpr bool havePadding() { return false; }; + __cuda_callable__ CSRView(); diff --git a/src/TNL/Algorithms/Segments/ChunkedEllpack.h b/src/TNL/Algorithms/Segments/ChunkedEllpack.h index 81f1fb715333b2bf1757f7463fb9a192dcc04d5c..48628f754073dacebd0a0a7129813d1cb3fa6eb8 100644 --- a/src/TNL/Algorithms/Segments/ChunkedEllpack.h +++ b/src/TNL/Algorithms/Segments/ChunkedEllpack.h @@ -41,6 +41,8 @@ class ChunkedEllpack using ChunkedEllpackSliceInfoAllocator = typename Allocators::Default< Device >::template Allocator< ChunkedEllpackSliceInfoType >; using ChunkedEllpackSliceInfoContainer = Containers::Array< ChunkedEllpackSliceInfoType, DeviceType, IndexType, ChunkedEllpackSliceInfoAllocator >; + static constexpr bool havePadding() { return true; }; + ChunkedEllpack() = default; ChunkedEllpack( const Containers::Vector< IndexType, DeviceType, IndexType >& sizes ); diff --git a/src/TNL/Algorithms/Segments/ChunkedEllpackView.h b/src/TNL/Algorithms/Segments/ChunkedEllpackView.h index 8689167e5d990a5f4be83feda8e32b7d69103b7f..d5192e53ebb80d7f529ee473c3eca3e0f8e3b18e 100644 --- a/src/TNL/Algorithms/Segments/ChunkedEllpackView.h +++ b/src/TNL/Algorithms/Segments/ChunkedEllpackView.h @@ -43,6 +43,8 @@ class ChunkedEllpackView using ChunkedEllpackSliceInfoContainer = Containers::Array< ChunkedEllpackSliceInfoType, DeviceType, IndexType, ChunkedEllpackSliceInfoAllocator >; using ChunkedEllpackSliceInfoContainerView = typename ChunkedEllpackSliceInfoContainer::ViewType; + static constexpr bool havePadding() { return true; }; + __cuda_callable__ ChunkedEllpackView() = default; diff --git a/src/TNL/Algorithms/Segments/Ellpack.h b/src/TNL/Algorithms/Segments/Ellpack.h index e5bcaf8e65aca8bb40b33dab6594aa5372f193af..79ad4745d52d14b887401f1da37a34f5d377ab75 100644 --- a/src/TNL/Algorithms/Segments/Ellpack.h +++ b/src/TNL/Algorithms/Segments/Ellpack.h @@ -39,6 +39,8 @@ class Ellpack using ConstViewType = typename ViewType::ConstViewType; using SegmentViewType = SegmentView< IndexType, Organization >; + static constexpr bool havePadding() { return true; }; + Ellpack(); Ellpack( const SegmentsSizes& sizes ); diff --git a/src/TNL/Algorithms/Segments/EllpackView.h b/src/TNL/Algorithms/Segments/EllpackView.h index 981d71244b8ed3dc751491089bfaaf162e124741..bb1fcc1bf85c5a23c3162c74559c73640fae72b1 100644 --- a/src/TNL/Algorithms/Segments/EllpackView.h +++ b/src/TNL/Algorithms/Segments/EllpackView.h @@ -41,6 +41,8 @@ class EllpackView using ConstViewType = ViewType; using SegmentViewType = SegmentView< IndexType, Organization >; + static constexpr bool havePadding() { return true; }; + __cuda_callable__ EllpackView(); diff --git a/src/TNL/Algorithms/Segments/SlicedEllpack.h b/src/TNL/Algorithms/Segments/SlicedEllpack.h index 580af7897c0f61f41a6c7857a781b9a869bf0996..e7f3a2061618d529f1f08818dede608b4d201dae 100644 --- a/src/TNL/Algorithms/Segments/SlicedEllpack.h +++ b/src/TNL/Algorithms/Segments/SlicedEllpack.h @@ -39,6 +39,8 @@ class SlicedEllpack using ConstViewType = SlicedEllpackView< Device, std::add_const_t< Index >, Organization, 
SliceSize >; using SegmentViewType = SegmentView< IndexType, Organization >; + static constexpr bool havePadding() { return true; }; + SlicedEllpack(); SlicedEllpack( const Containers::Vector< IndexType, DeviceType, IndexType >& sizes ); diff --git a/src/TNL/Algorithms/Segments/SlicedEllpackView.h b/src/TNL/Algorithms/Segments/SlicedEllpackView.h index 46fc80aef5cb0875269681206c7ede8fc2c4aafa..f3c257bd0f63bd1cf287ac94632e6d76f371811f 100644 --- a/src/TNL/Algorithms/Segments/SlicedEllpackView.h +++ b/src/TNL/Algorithms/Segments/SlicedEllpackView.h @@ -39,6 +39,8 @@ class SlicedEllpackView using ConstViewType = ViewType; using SegmentViewType = SegmentView< IndexType, Organization >; + static constexpr bool havePadding() { return true; }; + __cuda_callable__ SlicedEllpackView(); diff --git a/src/TNL/Matrices/SparseMatrixView.hpp b/src/TNL/Matrices/SparseMatrixView.hpp index d4ea65b3d254b91e00e399a5193f2a5070c87f97..ae83dfa891c75e553fdf5e38ad92ac8cde51cb76 100644 --- a/src/TNL/Matrices/SparseMatrixView.hpp +++ b/src/TNL/Matrices/SparseMatrixView.hpp @@ -437,15 +437,18 @@ vectorProduct( const InVector& inVector, }; auto fetch = [=] __cuda_callable__ ( IndexType globalIdx, bool& compute ) mutable -> ComputeRealType { const IndexType column = columnIndexesView[ globalIdx ]; - compute = ( column != paddingIndex ); - if( ! compute ) - return 0.0; + if( SegmentsViewType::havePadding() ) + { + compute = ( column != paddingIndex ); + if( ! compute ) + return 0.0; + } if( isBinary() ) return inVectorView[ column ]; return valuesView[ globalIdx ] * inVectorView[ column ]; }; - auto keeper = [=] __cuda_callable__ ( IndexType row, const ComputeRealType& value ) mutable { + auto keeperGeneral = [=] __cuda_callable__ ( IndexType row, const ComputeRealType& value ) mutable { if( isSymmetric() ) { typename OutVector::RealType aux = matrixMultiplicator * value;
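
The changes above hinge on two ideas: a compile-time havePadding() trait on every segments (view) type, so the fetch lambda in SparseMatrixView::vectorProduct only pays for the padding-index check when the format can actually contain padding entries (CSR does not, the Ellpack variants do), and a bool-parameterized dispatcher (CSRAdaptiveKernelSegmentsReductionDispatcher) that falls back to CSRScalarKernel whenever the fetch lambda takes the full parameter list or the device is the host, and otherwise launches the adaptive CUDA kernel. The following self-contained sketch only illustrates that structure outside of TNL; all names in it (MockSegments, ReductionDispatcher, reduceSegments) are hypothetical, and the CUDA launch is intentionally replaced by a plain loop.

#include <cstddef>
#include <iostream>
#include <vector>

// Hypothetical stand-in for a segments view: CSR-like formats store no padding,
// the Ellpack-like formats do, and the trait is queried at compile time.
template< bool Padding >
struct MockSegments
{
   static constexpr bool havePadding() { return Padding; }
};

// Hypothetical dispatcher selected on a compile-time condition, mirroring the
// structure of CSRAdaptiveKernelSegmentsReductionDispatcher: the 'true'
// specialization stands in for the scalar fallback, the 'false' one for the
// specialized (CUDA) path, which is replaced by a plain loop here.
template< bool UseFallback >
struct ReductionDispatcher;

template<>
struct ReductionDispatcher< true >
{
   template< typename Fetch >
   static double reduce( std::size_t size, Fetch&& fetch )
   {
      double sum = 0.0;
      for( std::size_t i = 0; i < size; i++ ) {
         bool compute = true;
         sum += fetch( i, compute );   // sequential fallback
      }
      return sum;
   }
};

template<>
struct ReductionDispatcher< false >
{
   template< typename Fetch >
   static double reduce( std::size_t size, Fetch&& fetch )
   {
      // A real implementation would launch a device kernel here.
      double sum = 0.0;
      for( std::size_t i = 0; i < size; i++ ) {
         bool compute = true;
         sum += fetch( i, compute );
      }
      return sum;
   }
};

// The fetch lambda checks for padding only when the format can contain it,
// analogous to the change in SparseMatrixView::vectorProduct above.
template< typename Segments >
double reduceSegments( const std::vector< int >& columns,
                       const std::vector< double >& values,
                       int paddingIndex )
{
   auto fetch = [ & ]( std::size_t globalIdx, bool& compute ) -> double {
      if( Segments::havePadding() ) {
         compute = ( columns[ globalIdx ] != paddingIndex );
         if( ! compute )
            return 0.0;
      }
      return values[ globalIdx ];
   };
   // For the illustration the dispatcher is keyed on the padding trait; the
   // real dispatcher keys on the fetch lambda signature and the device type.
   return ReductionDispatcher< Segments::havePadding() >::reduce( values.size(), fetch );
}

int main()
{
   std::vector< int > columns{ 0, 1, -1, 2 };      // -1 plays the role of paddingIndex
   std::vector< double > values{ 1.0, 2.0, 99.0, 3.0 };

   std::cout << reduceSegments< MockSegments< true > >( columns, values, -1 ) << std::endl;   // prints 6: padding entry skipped
   std::cout << reduceSegments< MockSegments< false > >( columns, values, -1 ) << std::endl;  // prints 105: no padding check
   return 0;
}

Because Segments::havePadding() is a constant expression, the padding branch of the fetch lambda can be removed by the compiler entirely for CSR, and the dispatcher specialization is chosen at compile time rather than through a runtime branch.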