Loading src/TNL/Algorithms/Segments/CSR.h +1 −1 Original line number Diff line number Diff line Loading @@ -138,7 +138,7 @@ using CSRScalar = CSR< Device, Index, CSRKernelScalar< Index, Device >, IndexAll template< typename Device, typename Index, typename IndexAllocator = typename Allocators::Default< Device >::template Allocator< Index > > using CSRVector = CSR< Device, Index, CSRVectorKernel< Index, Device >, IndexAllocator >; using CSRVector = CSR< Device, Index, CSRKernelVector< Index, Device >, IndexAllocator >; template< typename Device, typename Index, Loading src/TNL/Algorithms/Segments/CSRKernelVector.h 0 → 100644 +62 −0 Original line number Diff line number Diff line /*************************************************************************** CSRKernelVector.h - description ------------------- begin : Jan 23, 2021 -> Joe Biden inauguration copyright : (C) 2021 by Tomas Oberhuber email : tomas.oberhuber@fjfi.cvut.cz ***************************************************************************/ /* See Copyright Notice in tnl/Copyright */ #pragma once #include <TNL/Assert.h> #include <TNL/Cuda/LaunchHelpers.h> #include <TNL/Containers/VectorView.h> #include <TNL/Algorithms/ParallelFor.h> #include <TNL/Algorithms/Segments/details/LambdaAdapter.h> namespace TNL { namespace Algorithms { namespace Segments { template< typename Index, typename Device > struct CSRKernelVector { using IndexType = Index; using DeviceType = Device; using ViewType = CSRKernelVector< Index, Device >; using ConstViewType = CSRKernelVector< Index, Device >; template< typename Offsets > void init( const Offsets& offsets ); void reset(); ViewType getView(); ConstViewType getConstView() const; template< typename OffsetsView, typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args > static void segmentsReduction( const OffsetsView& offsets, Index first, Index last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ); }; } // namespace Segments } // namespace Algorithms } // namespace TNL #include <TNL/Algorithms/Segments/CSRKernelVector.hpp> src/TNL/Algorithms/Segments/CSRKernelVector.hpp 0 → 100644 +152 −0 Original line number Diff line number Diff line /*************************************************************************** CSRKernelVector.hpp - description ------------------- begin : Jan 23, 2021 -> Joe Biden inauguration copyright : (C) 2021 by Tomas Oberhuber email : tomas.oberhuber@fjfi.cvut.cz ***************************************************************************/ /* See Copyright Notice in tnl/Copyright */ #pragma once #include <TNL/Assert.h> #include <TNL/Cuda/LaunchHelpers.h> #include <TNL/Containers/VectorView.h> #include <TNL/Algorithms/ParallelFor.h> #include <TNL/Algorithms/Segments/details/LambdaAdapter.h> #include <TNL/Algorithms/Segments/CSRKernelVector.h> namespace TNL { namespace Algorithms { namespace Segments { #ifdef HAVE_CUDA template< typename Offsets, typename Index, typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args > __global__ void segmentsReductionCSRKernelVector( int gridIdx, const Offsets offsets, Index first, Index last, Fetch fetch, const Reduction reduce, ResultKeeper keep, const Real zero, Args... args ) { /*** * We map one warp to each segment */ const Index segmentIdx = TNL::Cuda::getGlobalThreadIdx( gridIdx ) / TNL::Cuda::getWarpSize() + first; if( segmentIdx >= last ) return; const int laneIdx = threadIdx.x & ( TNL::Cuda::getWarpSize() - 1 ); // & is cheaper than % TNL_ASSERT_LT( segmentIdx + 1, offsets.getSize(), "" ); Index endIdx = offsets[ segmentIdx + 1 ]; Index localIdx( laneIdx ); Real aux = zero; bool compute( true ); for( Index globalIdx = offsets[ segmentIdx ] + localIdx; globalIdx < endIdx; globalIdx += TNL::Cuda::getWarpSize() ) { TNL_ASSERT_LT( globalIdx, endIdx, "" ); aux = reduce( aux, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, localIdx, globalIdx, compute ) ); localIdx += TNL::Cuda::getWarpSize(); } /**** * Reduction in each warp which means in each segment. */ aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux, 16 ) ); aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux, 8 ) ); aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux, 4 ) ); aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux, 2 ) ); aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux, 1 ) ); if( laneIdx == 0 ) keep( segmentIdx, aux ); } #endif template< typename Index, typename Device > template< typename Offsets > void CSRKernelVector< Index, Device >:: init( const Offsets& offsets ) { } template< typename Index, typename Device > void CSRKernelVector< Index, Device >:: reset() { } template< typename Index, typename Device > auto CSRKernelVector< Index, Device >:: getView() -> ViewType { return *this; } template< typename Index, typename Device > auto CSRKernelVector< Index, Device >:: getConstView() const -> ConstViewType { return *this; }; template< typename Index, typename Device > template< typename OffsetsView, typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args > void CSRKernelVector< Index, Device >:: segmentsReduction( const OffsetsView& offsets, Index first, Index last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) { #ifdef HAVE_CUDA const Index warpsCount = last - first; const size_t threadsCount = warpsCount * TNL::Cuda::getWarpSize(); dim3 blocksCount, gridsCount, blockSize( 256 ); TNL::Cuda::setupThreads( blockSize, blocksCount, gridsCount, threadsCount ); dim3 gridIdx; for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x ++ ) { dim3 gridSize; TNL::Cuda::setupGrid( blocksCount, gridsCount, gridIdx, gridSize ); segmentsReductionCSRKernelVector< OffsetsView, IndexType, Fetch, Reduction, ResultKeeper, Real, Args... > <<< gridSize, blockSize >>>( gridIdx.x, offsets, first, last, fetch, reduction, keeper, zero, args... ); }; #endif } } // namespace Segments } // namespace Algorithms } // namespace TNL src/TNL/Algorithms/Segments/CSRKernels.h +0 −106 Original line number Diff line number Diff line Loading @@ -20,112 +20,6 @@ namespace TNL { namespace Algorithms { namespace Segments { #ifdef HAVE_CUDA template< typename Offsets, typename Index, typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args > __global__ void segmentsReductionCSRVectorKernel( int gridIdx, const Offsets offsets, Index first, Index last, Fetch fetch, const Reduction reduce, ResultKeeper keep, const Real zero, Args... args ) { /*** * We map one warp to each segment */ const Index segmentIdx = TNL::Cuda::getGlobalThreadIdx( gridIdx ) / TNL::Cuda::getWarpSize() + first; if( segmentIdx >= last ) return; const int laneIdx = threadIdx.x & ( TNL::Cuda::getWarpSize() - 1 ); // & is cheaper than % TNL_ASSERT_LT( segmentIdx + 1, offsets.getSize(), "" ); Index endIdx = offsets[ segmentIdx + 1 ]; Index localIdx( laneIdx ); Real aux = zero; bool compute( true ); for( Index globalIdx = offsets[ segmentIdx ] + localIdx; globalIdx < endIdx; globalIdx += TNL::Cuda::getWarpSize() ) { TNL_ASSERT_LT( globalIdx, endIdx, "" ); aux = reduce( aux, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, localIdx, globalIdx, compute ) ); localIdx += TNL::Cuda::getWarpSize(); } /**** * Reduction in each warp which means in each segment. */ aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux, 16 ) ); aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux, 8 ) ); aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux, 4 ) ); aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux, 2 ) ); aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux, 1 ) ); if( laneIdx == 0 ) keep( segmentIdx, aux ); } #endif template< typename Index, typename Device > struct CSRVectorKernel { using IndexType = Index; using DeviceType = Device; using ViewType = CSRVectorKernel< Index, Device >; using ConstViewType = CSRVectorKernel< Index, Device >; template< typename Offsets > void init( const Offsets& offsets ) {}; void reset(){}; ViewType getView() { return *this; }; ConstViewType getConstView() const { return *this; }; template< typename OffsetsView, typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args > static void segmentsReduction( const OffsetsView& offsets, Index first, Index last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) { #ifdef HAVE_CUDA const Index warpsCount = last - first; const size_t threadsCount = warpsCount * TNL::Cuda::getWarpSize(); dim3 blocksCount, gridsCount, blockSize( 256 ); TNL::Cuda::setupThreads( blockSize, blocksCount, gridsCount, threadsCount ); dim3 gridIdx; for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x ++ ) { dim3 gridSize; TNL::Cuda::setupGrid( blocksCount, gridsCount, gridIdx, gridSize ); segmentsReductionCSRVectorKernel< OffsetsView, IndexType, Fetch, Reduction, ResultKeeper, Real, Args... > <<< gridSize, blockSize >>>( gridIdx.x, offsets, first, last, fetch, reduction, keeper, zero, args... ); }; #endif } }; #ifdef HAVE_CUDA Loading src/TNL/Algorithms/Segments/CSRView.h +2 −1 Original line number Diff line number Diff line Loading @@ -15,6 +15,7 @@ #include <TNL/Containers/Vector.h> #include <TNL/Algorithms/Segments/SegmentView.h> #include <TNL/Algorithms/Segments/CSRKernelScalar.h> #include <TNL/Algorithms/Segments/CSRKernelVector.h> #include <TNL/Algorithms/Segments/CSRKernels.h> namespace TNL { Loading Loading @@ -136,7 +137,7 @@ using CSRViewScalar = CSRView< Device, Index, CSRKernelScalar< Index, Device > > template< typename Device, typename Index > using CSRViewVector = CSRView< Device, Index, CSRVectorKernel< Index, Device > >; using CSRViewVector = CSRView< Device, Index, CSRKernelVector< Index, Device > >; template< typename Device, typename Index > Loading Loading
src/TNL/Algorithms/Segments/CSR.h +1 −1 Original line number Diff line number Diff line Loading @@ -138,7 +138,7 @@ using CSRScalar = CSR< Device, Index, CSRKernelScalar< Index, Device >, IndexAll template< typename Device, typename Index, typename IndexAllocator = typename Allocators::Default< Device >::template Allocator< Index > > using CSRVector = CSR< Device, Index, CSRVectorKernel< Index, Device >, IndexAllocator >; using CSRVector = CSR< Device, Index, CSRKernelVector< Index, Device >, IndexAllocator >; template< typename Device, typename Index, Loading
src/TNL/Algorithms/Segments/CSRKernelVector.h 0 → 100644 +62 −0 Original line number Diff line number Diff line /*************************************************************************** CSRKernelVector.h - description ------------------- begin : Jan 23, 2021 -> Joe Biden inauguration copyright : (C) 2021 by Tomas Oberhuber email : tomas.oberhuber@fjfi.cvut.cz ***************************************************************************/ /* See Copyright Notice in tnl/Copyright */ #pragma once #include <TNL/Assert.h> #include <TNL/Cuda/LaunchHelpers.h> #include <TNL/Containers/VectorView.h> #include <TNL/Algorithms/ParallelFor.h> #include <TNL/Algorithms/Segments/details/LambdaAdapter.h> namespace TNL { namespace Algorithms { namespace Segments { template< typename Index, typename Device > struct CSRKernelVector { using IndexType = Index; using DeviceType = Device; using ViewType = CSRKernelVector< Index, Device >; using ConstViewType = CSRKernelVector< Index, Device >; template< typename Offsets > void init( const Offsets& offsets ); void reset(); ViewType getView(); ConstViewType getConstView() const; template< typename OffsetsView, typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args > static void segmentsReduction( const OffsetsView& offsets, Index first, Index last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ); }; } // namespace Segments } // namespace Algorithms } // namespace TNL #include <TNL/Algorithms/Segments/CSRKernelVector.hpp>
src/TNL/Algorithms/Segments/CSRKernelVector.hpp 0 → 100644 +152 −0 Original line number Diff line number Diff line /*************************************************************************** CSRKernelVector.hpp - description ------------------- begin : Jan 23, 2021 -> Joe Biden inauguration copyright : (C) 2021 by Tomas Oberhuber email : tomas.oberhuber@fjfi.cvut.cz ***************************************************************************/ /* See Copyright Notice in tnl/Copyright */ #pragma once #include <TNL/Assert.h> #include <TNL/Cuda/LaunchHelpers.h> #include <TNL/Containers/VectorView.h> #include <TNL/Algorithms/ParallelFor.h> #include <TNL/Algorithms/Segments/details/LambdaAdapter.h> #include <TNL/Algorithms/Segments/CSRKernelVector.h> namespace TNL { namespace Algorithms { namespace Segments { #ifdef HAVE_CUDA template< typename Offsets, typename Index, typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args > __global__ void segmentsReductionCSRKernelVector( int gridIdx, const Offsets offsets, Index first, Index last, Fetch fetch, const Reduction reduce, ResultKeeper keep, const Real zero, Args... args ) { /*** * We map one warp to each segment */ const Index segmentIdx = TNL::Cuda::getGlobalThreadIdx( gridIdx ) / TNL::Cuda::getWarpSize() + first; if( segmentIdx >= last ) return; const int laneIdx = threadIdx.x & ( TNL::Cuda::getWarpSize() - 1 ); // & is cheaper than % TNL_ASSERT_LT( segmentIdx + 1, offsets.getSize(), "" ); Index endIdx = offsets[ segmentIdx + 1 ]; Index localIdx( laneIdx ); Real aux = zero; bool compute( true ); for( Index globalIdx = offsets[ segmentIdx ] + localIdx; globalIdx < endIdx; globalIdx += TNL::Cuda::getWarpSize() ) { TNL_ASSERT_LT( globalIdx, endIdx, "" ); aux = reduce( aux, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, localIdx, globalIdx, compute ) ); localIdx += TNL::Cuda::getWarpSize(); } /**** * Reduction in each warp which means in each segment. */ aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux, 16 ) ); aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux, 8 ) ); aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux, 4 ) ); aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux, 2 ) ); aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux, 1 ) ); if( laneIdx == 0 ) keep( segmentIdx, aux ); } #endif template< typename Index, typename Device > template< typename Offsets > void CSRKernelVector< Index, Device >:: init( const Offsets& offsets ) { } template< typename Index, typename Device > void CSRKernelVector< Index, Device >:: reset() { } template< typename Index, typename Device > auto CSRKernelVector< Index, Device >:: getView() -> ViewType { return *this; } template< typename Index, typename Device > auto CSRKernelVector< Index, Device >:: getConstView() const -> ConstViewType { return *this; }; template< typename Index, typename Device > template< typename OffsetsView, typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args > void CSRKernelVector< Index, Device >:: segmentsReduction( const OffsetsView& offsets, Index first, Index last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) { #ifdef HAVE_CUDA const Index warpsCount = last - first; const size_t threadsCount = warpsCount * TNL::Cuda::getWarpSize(); dim3 blocksCount, gridsCount, blockSize( 256 ); TNL::Cuda::setupThreads( blockSize, blocksCount, gridsCount, threadsCount ); dim3 gridIdx; for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x ++ ) { dim3 gridSize; TNL::Cuda::setupGrid( blocksCount, gridsCount, gridIdx, gridSize ); segmentsReductionCSRKernelVector< OffsetsView, IndexType, Fetch, Reduction, ResultKeeper, Real, Args... > <<< gridSize, blockSize >>>( gridIdx.x, offsets, first, last, fetch, reduction, keeper, zero, args... ); }; #endif } } // namespace Segments } // namespace Algorithms } // namespace TNL
src/TNL/Algorithms/Segments/CSRKernels.h +0 −106 Original line number Diff line number Diff line Loading @@ -20,112 +20,6 @@ namespace TNL { namespace Algorithms { namespace Segments { #ifdef HAVE_CUDA template< typename Offsets, typename Index, typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args > __global__ void segmentsReductionCSRVectorKernel( int gridIdx, const Offsets offsets, Index first, Index last, Fetch fetch, const Reduction reduce, ResultKeeper keep, const Real zero, Args... args ) { /*** * We map one warp to each segment */ const Index segmentIdx = TNL::Cuda::getGlobalThreadIdx( gridIdx ) / TNL::Cuda::getWarpSize() + first; if( segmentIdx >= last ) return; const int laneIdx = threadIdx.x & ( TNL::Cuda::getWarpSize() - 1 ); // & is cheaper than % TNL_ASSERT_LT( segmentIdx + 1, offsets.getSize(), "" ); Index endIdx = offsets[ segmentIdx + 1 ]; Index localIdx( laneIdx ); Real aux = zero; bool compute( true ); for( Index globalIdx = offsets[ segmentIdx ] + localIdx; globalIdx < endIdx; globalIdx += TNL::Cuda::getWarpSize() ) { TNL_ASSERT_LT( globalIdx, endIdx, "" ); aux = reduce( aux, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, localIdx, globalIdx, compute ) ); localIdx += TNL::Cuda::getWarpSize(); } /**** * Reduction in each warp which means in each segment. */ aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux, 16 ) ); aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux, 8 ) ); aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux, 4 ) ); aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux, 2 ) ); aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux, 1 ) ); if( laneIdx == 0 ) keep( segmentIdx, aux ); } #endif template< typename Index, typename Device > struct CSRVectorKernel { using IndexType = Index; using DeviceType = Device; using ViewType = CSRVectorKernel< Index, Device >; using ConstViewType = CSRVectorKernel< Index, Device >; template< typename Offsets > void init( const Offsets& offsets ) {}; void reset(){}; ViewType getView() { return *this; }; ConstViewType getConstView() const { return *this; }; template< typename OffsetsView, typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args > static void segmentsReduction( const OffsetsView& offsets, Index first, Index last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) { #ifdef HAVE_CUDA const Index warpsCount = last - first; const size_t threadsCount = warpsCount * TNL::Cuda::getWarpSize(); dim3 blocksCount, gridsCount, blockSize( 256 ); TNL::Cuda::setupThreads( blockSize, blocksCount, gridsCount, threadsCount ); dim3 gridIdx; for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x ++ ) { dim3 gridSize; TNL::Cuda::setupGrid( blocksCount, gridsCount, gridIdx, gridSize ); segmentsReductionCSRVectorKernel< OffsetsView, IndexType, Fetch, Reduction, ResultKeeper, Real, Args... > <<< gridSize, blockSize >>>( gridIdx.x, offsets, first, last, fetch, reduction, keeper, zero, args... ); }; #endif } }; #ifdef HAVE_CUDA Loading
src/TNL/Algorithms/Segments/CSRView.h +2 −1 Original line number Diff line number Diff line Loading @@ -15,6 +15,7 @@ #include <TNL/Containers/Vector.h> #include <TNL/Algorithms/Segments/SegmentView.h> #include <TNL/Algorithms/Segments/CSRKernelScalar.h> #include <TNL/Algorithms/Segments/CSRKernelVector.h> #include <TNL/Algorithms/Segments/CSRKernels.h> namespace TNL { Loading Loading @@ -136,7 +137,7 @@ using CSRViewScalar = CSRView< Device, Index, CSRKernelScalar< Index, Device > > template< typename Device, typename Index > using CSRViewVector = CSRView< Device, Index, CSRVectorKernel< Index, Device > >; using CSRViewVector = CSRView< Device, Index, CSRKernelVector< Index, Device > >; template< typename Device, typename Index > Loading