Loading src/TNL/Algorithms/Segments/CSR.h +1 −1 Original line number Diff line number Diff line Loading @@ -143,7 +143,7 @@ using CSRVector = CSR< Device, Index, CSRKernelVector< Index, Device >, IndexAll template< typename Device, typename Index, typename IndexAllocator = typename Allocators::Default< Device >::template Allocator< Index > > using CSRLight = CSR< Device, Index, CSRLightKernel< Index, Device >, IndexAllocator >; using CSRHybrid = CSR< Device, Index, CSRKernelHybrid< Index, Device >, IndexAllocator >; template< typename Device, typename Index, Loading src/TNL/Algorithms/Segments/CSRKernelHybrid.h 0 → 100644 +65 −0 Original line number Diff line number Diff line /*************************************************************************** CSRKernelHybrid.h - description ------------------- begin : Jan 23, 2021 -> Joe Biden inauguration copyright : (C) 2021 by Tomas Oberhuber email : tomas.oberhuber@fjfi.cvut.cz ***************************************************************************/ /* See Copyright Notice in tnl/Copyright */ #pragma once #include <TNL/Assert.h> #include <TNL/Cuda/LaunchHelpers.h> #include <TNL/Containers/VectorView.h> #include <TNL/Algorithms/ParallelFor.h> #include <TNL/Algorithms/Segments/details/LambdaAdapter.h> namespace TNL { namespace Algorithms { namespace Segments { template< typename Index, typename Device > struct CSRKernelHybrid { using IndexType = Index; using DeviceType = Device; using ViewType = CSRKernelHybrid< Index, Device >; using ConstViewType = CSRKernelHybrid< Index, Device >; template< typename Offsets > void init( const Offsets& offsets ); void reset(); ViewType getView(); ConstViewType getConstView() const; template< typename OffsetsView, typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args > void segmentsReduction( const OffsetsView& offsets, Index first, Index last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const; protected: int threadsPerSegment; }; } // namespace Segments } // namespace Algorithms } // namespace TNL #include <TNL/Algorithms/Segments/CSRKernelHybrid.hpp> src/TNL/Algorithms/Segments/CSRKernelHybrid.hpp 0 → 100644 +195 −0 Original line number Diff line number Diff line /*************************************************************************** CSRKernelHybrid.hpp - description ------------------- begin : Jan 23, 2021 -> Joe Biden inauguration copyright : (C) 2021 by Tomas Oberhuber email : tomas.oberhuber@fjfi.cvut.cz ***************************************************************************/ /* See Copyright Notice in tnl/Copyright */ #pragma once #include <TNL/Assert.h> #include <TNL/Cuda/LaunchHelpers.h> #include <TNL/Containers/VectorView.h> #include <TNL/Algorithms/ParallelFor.h> #include <TNL/Algorithms/Segments/details/LambdaAdapter.h> #include <TNL/Algorithms/Segments/CSRKernelHybrid.h> namespace TNL { namespace Algorithms { namespace Segments { #ifdef HAVE_CUDA template< int ThreadsPerSegment, typename Offsets, typename Index, typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args > __global__ void segmentsReductionCSRHybridKernel( int gridIdx, const Offsets offsets, Index first, Index last, Fetch fetch, const Reduction reduce, ResultKeeper keep, const Real zero, Args... args ) { /*** * We map one warp to each segment */ const Index segmentIdx = TNL::Cuda::getGlobalThreadIdx( gridIdx ) / ThreadsPerSegment + first; if( segmentIdx >= last ) return; const int laneIdx = threadIdx.x & ( ThreadsPerSegment - 1 ); // & is cheaper than % Index endIdx = offsets[ segmentIdx + 1] ; Index localIdx( laneIdx ); Real aux = zero; bool compute( true ); for( Index globalIdx = offsets[ segmentIdx ] + localIdx; globalIdx < endIdx; globalIdx += ThreadsPerSegment ) { aux = reduce( aux, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, localIdx, globalIdx, compute ) ); localIdx += TNL::Cuda::getWarpSize(); } /**** * Reduction in each segment. */ if( ThreadsPerSegment == 32 ) aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux, 16 ) ); if( ThreadsPerSegment >= 16 ) aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux, 8 ) ); if( ThreadsPerSegment >= 8 ) aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux, 4 ) ); if( ThreadsPerSegment >= 4 ) aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux, 2 ) ); if( ThreadsPerSegment >= 2 ) aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux, 1 ) ); if( laneIdx == 0 ) keep( segmentIdx, aux ); } #endif template< typename Index, typename Device > template< typename Offsets > void CSRKernelHybrid< Index, Device >:: init( const Offsets& offsets ) { const Index segmentsCount = offsets.getSize() - 1; const Index elementsInSegment = std::ceil( ( double ) offsets.getElement( segmentsCount ) / ( double ) segmentsCount ); this->threadsPerSegment = TNL::min( std::pow( 2, std::ceil( std::log2( elementsInSegment ) ) ), TNL::Cuda::getWarpSize() ); TNL_ASSERT_GE( threadsPerSegment, 0, "" ); TNL_ASSERT_LE( threadsPerSegment, 32, "" ); } template< typename Index, typename Device > void CSRKernelHybrid< Index, Device >:: reset() { this->threadsPerSegment = 0; } template< typename Index, typename Device > auto CSRKernelHybrid< Index, Device >:: getView() -> ViewType { return *this; } template< typename Index, typename Device > auto CSRKernelHybrid< Index, Device >:: getConstView() const -> ConstViewType { return *this; }; template< typename Index, typename Device > template< typename OffsetsView, typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args > void CSRKernelHybrid< Index, Device >:: segmentsReduction( const OffsetsView& offsets, Index first, Index last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const { TNL_ASSERT_GE( this->threadsPerSegment, 0, "" ); TNL_ASSERT_LE( this->threadsPerSegment, 32, "" ); #ifdef HAVE_CUDA const size_t threadsCount = this->threadsPerSegment * ( last - first ); dim3 blocksCount, gridsCount, blockSize( 256 ); TNL::Cuda::setupThreads( blockSize, blocksCount, gridsCount, threadsCount ); //std::cerr << " this->threadsPerSegment = " << this->threadsPerSegment << " offsets = " << offsets << std::endl; for( unsigned int gridIdx = 0; gridIdx < gridsCount.x; gridIdx ++ ) { dim3 gridSize; TNL::Cuda::setupGrid( blocksCount, gridsCount, gridIdx, gridSize ); switch( this->threadsPerSegment ) { case 0: // this means zero/empty matrix break; case 1: segmentsReductionCSRHybridKernel< 1, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>( gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... ); break; case 2: segmentsReductionCSRHybridKernel< 2, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>( gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... ); break; case 4: segmentsReductionCSRHybridKernel< 4, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>( gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... ); break; case 8: segmentsReductionCSRHybridKernel< 8, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>( gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... ); break; case 16: segmentsReductionCSRHybridKernel< 16, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>( gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... ); break; case 32: segmentsReductionCSRHybridKernel< 32, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>( gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... ); break; default: throw std::runtime_error( std::string( "Wrong value of threadsPerSegment: " ) + std::to_string( this->threadsPerSegment ) ); } } #endif } } // namespace Segments } // namespace Algorithms } // namespace TNL src/TNL/Algorithms/Segments/CSRKernels.h +0 −152 Original line number Diff line number Diff line Loading @@ -21,158 +21,6 @@ namespace TNL { namespace Segments { #ifdef HAVE_CUDA template< int ThreadsPerSegment, typename Offsets, typename Index, typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args > __global__ void segmentsReductionCSRLightKernel( int gridIdx, const Offsets offsets, Index first, Index last, Fetch fetch, const Reduction reduce, ResultKeeper keep, const Real zero, Args... args ) { /*** * We map one warp to each segment */ const Index segmentIdx = TNL::Cuda::getGlobalThreadIdx( gridIdx ) / ThreadsPerSegment + first; if( segmentIdx >= last ) return; const int laneIdx = threadIdx.x & ( ThreadsPerSegment - 1 ); // & is cheaper than % Index endIdx = offsets[ segmentIdx + 1] ; Index localIdx( laneIdx ); Real aux = zero; bool compute( true ); for( Index globalIdx = offsets[ segmentIdx ] + localIdx; globalIdx < endIdx; globalIdx += ThreadsPerSegment ) { aux = reduce( aux, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, localIdx, globalIdx, compute ) ); localIdx += TNL::Cuda::getWarpSize(); } /**** * Reduction in each segment. */ if( ThreadsPerSegment == 32 ) aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux, 16 ) ); if( ThreadsPerSegment >= 16 ) aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux, 8 ) ); if( ThreadsPerSegment >= 8 ) aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux, 4 ) ); if( ThreadsPerSegment >= 4 ) aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux, 2 ) ); if( ThreadsPerSegment >= 2 ) aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux, 1 ) ); if( laneIdx == 0 ) keep( segmentIdx, aux ); } #endif template< typename Index, typename Device > struct CSRLightKernel { using IndexType = Index; using DeviceType = Device; using ViewType = CSRLightKernel< Index, Device >; using ConstViewType = CSRLightKernel< Index, Device >; template< typename Offsets > void init( const Offsets& offsets ) { const Index segmentsCount = offsets.getSize() - 1; const Index elementsInSegment = std::ceil( ( double ) offsets.getElement( segmentsCount ) / ( double ) segmentsCount ); this->threadsPerSegment = TNL::min( std::pow( 2, std::ceil( std::log2( elementsInSegment ) ) ), TNL::Cuda::getWarpSize() ); TNL_ASSERT_GE( threadsPerSegment, 0, "" ); TNL_ASSERT_LE( threadsPerSegment, 32, "" ); }; void reset() { this->threadsPerSegment = 0; } ViewType getView() { return *this; }; ConstViewType getConstView() const { return *this; }; template< typename OffsetsView, typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args > void segmentsReduction( const OffsetsView& offsets, Index first, Index last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const { TNL_ASSERT_GE( threadsPerSegment, 0, "" ); TNL_ASSERT_LE( threadsPerSegment, 32, "" ); #ifdef HAVE_CUDA const size_t threadsCount = this->threadsPerSegment * ( last - first ); dim3 blocksCount, gridsCount, blockSize( 256 ); TNL::Cuda::setupThreads( blockSize, blocksCount, gridsCount, threadsCount ); //std::cerr << " this->threadsPerSegment = " << this->threadsPerSegment << " offsets = " << offsets << std::endl; for( unsigned int gridIdx = 0; gridIdx < gridsCount.x; gridIdx ++ ) { dim3 gridSize; TNL::Cuda::setupGrid( blocksCount, gridsCount, gridIdx, gridSize ); switch( this->threadsPerSegment ) { case 0: // this means zero/empty matrix break; case 1: segmentsReductionCSRLightKernel< 1, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>( gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... ); break; case 2: segmentsReductionCSRLightKernel< 2, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>( gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... ); break; case 4: segmentsReductionCSRLightKernel< 4, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>( gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... ); break; case 8: segmentsReductionCSRLightKernel< 8, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>( gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... ); break; case 16: segmentsReductionCSRLightKernel< 16, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>( gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... ); break; case 32: segmentsReductionCSRLightKernel< 32, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>( gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... ); break; default: throw std::runtime_error( std::string( "Wrong value of threadsPerSegment: " ) + std::to_string( this->threadsPerSegment ) ); } } #endif } protected: int threadsPerSegment; }; template< typename Index, typename Device > struct CSRAdaptiveKernelView Loading src/TNL/Algorithms/Segments/CSRView.h +2 −1 Original line number Diff line number Diff line Loading @@ -16,6 +16,7 @@ #include <TNL/Algorithms/Segments/SegmentView.h> #include <TNL/Algorithms/Segments/CSRKernelScalar.h> #include <TNL/Algorithms/Segments/CSRKernelVector.h> #include <TNL/Algorithms/Segments/CSRKernelHybrid.h> #include <TNL/Algorithms/Segments/CSRKernels.h> namespace TNL { Loading Loading @@ -141,7 +142,7 @@ using CSRViewVector = CSRView< Device, Index, CSRKernelVector< Index, Device > > template< typename Device, typename Index > using CSRViewLight = CSRView< Device, Index, CSRLightKernel< Index, Device > >; using CSRViewHybrid = CSRView< Device, Index, CSRKernelHybrid< Index, Device > >; template< typename Device, typename Index > Loading Loading
src/TNL/Algorithms/Segments/CSR.h +1 −1 Original line number Diff line number Diff line Loading @@ -143,7 +143,7 @@ using CSRVector = CSR< Device, Index, CSRKernelVector< Index, Device >, IndexAll template< typename Device, typename Index, typename IndexAllocator = typename Allocators::Default< Device >::template Allocator< Index > > using CSRLight = CSR< Device, Index, CSRLightKernel< Index, Device >, IndexAllocator >; using CSRHybrid = CSR< Device, Index, CSRKernelHybrid< Index, Device >, IndexAllocator >; template< typename Device, typename Index, Loading
src/TNL/Algorithms/Segments/CSRKernelHybrid.h 0 → 100644 +65 −0 Original line number Diff line number Diff line /*************************************************************************** CSRKernelHybrid.h - description ------------------- begin : Jan 23, 2021 -> Joe Biden inauguration copyright : (C) 2021 by Tomas Oberhuber email : tomas.oberhuber@fjfi.cvut.cz ***************************************************************************/ /* See Copyright Notice in tnl/Copyright */ #pragma once #include <TNL/Assert.h> #include <TNL/Cuda/LaunchHelpers.h> #include <TNL/Containers/VectorView.h> #include <TNL/Algorithms/ParallelFor.h> #include <TNL/Algorithms/Segments/details/LambdaAdapter.h> namespace TNL { namespace Algorithms { namespace Segments { template< typename Index, typename Device > struct CSRKernelHybrid { using IndexType = Index; using DeviceType = Device; using ViewType = CSRKernelHybrid< Index, Device >; using ConstViewType = CSRKernelHybrid< Index, Device >; template< typename Offsets > void init( const Offsets& offsets ); void reset(); ViewType getView(); ConstViewType getConstView() const; template< typename OffsetsView, typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args > void segmentsReduction( const OffsetsView& offsets, Index first, Index last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const; protected: int threadsPerSegment; }; } // namespace Segments } // namespace Algorithms } // namespace TNL #include <TNL/Algorithms/Segments/CSRKernelHybrid.hpp>
src/TNL/Algorithms/Segments/CSRKernelHybrid.hpp 0 → 100644 +195 −0 Original line number Diff line number Diff line /*************************************************************************** CSRKernelHybrid.hpp - description ------------------- begin : Jan 23, 2021 -> Joe Biden inauguration copyright : (C) 2021 by Tomas Oberhuber email : tomas.oberhuber@fjfi.cvut.cz ***************************************************************************/ /* See Copyright Notice in tnl/Copyright */ #pragma once #include <TNL/Assert.h> #include <TNL/Cuda/LaunchHelpers.h> #include <TNL/Containers/VectorView.h> #include <TNL/Algorithms/ParallelFor.h> #include <TNL/Algorithms/Segments/details/LambdaAdapter.h> #include <TNL/Algorithms/Segments/CSRKernelHybrid.h> namespace TNL { namespace Algorithms { namespace Segments { #ifdef HAVE_CUDA template< int ThreadsPerSegment, typename Offsets, typename Index, typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args > __global__ void segmentsReductionCSRHybridKernel( int gridIdx, const Offsets offsets, Index first, Index last, Fetch fetch, const Reduction reduce, ResultKeeper keep, const Real zero, Args... args ) { /*** * We map one warp to each segment */ const Index segmentIdx = TNL::Cuda::getGlobalThreadIdx( gridIdx ) / ThreadsPerSegment + first; if( segmentIdx >= last ) return; const int laneIdx = threadIdx.x & ( ThreadsPerSegment - 1 ); // & is cheaper than % Index endIdx = offsets[ segmentIdx + 1] ; Index localIdx( laneIdx ); Real aux = zero; bool compute( true ); for( Index globalIdx = offsets[ segmentIdx ] + localIdx; globalIdx < endIdx; globalIdx += ThreadsPerSegment ) { aux = reduce( aux, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, localIdx, globalIdx, compute ) ); localIdx += TNL::Cuda::getWarpSize(); } /**** * Reduction in each segment. */ if( ThreadsPerSegment == 32 ) aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux, 16 ) ); if( ThreadsPerSegment >= 16 ) aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux, 8 ) ); if( ThreadsPerSegment >= 8 ) aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux, 4 ) ); if( ThreadsPerSegment >= 4 ) aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux, 2 ) ); if( ThreadsPerSegment >= 2 ) aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux, 1 ) ); if( laneIdx == 0 ) keep( segmentIdx, aux ); } #endif template< typename Index, typename Device > template< typename Offsets > void CSRKernelHybrid< Index, Device >:: init( const Offsets& offsets ) { const Index segmentsCount = offsets.getSize() - 1; const Index elementsInSegment = std::ceil( ( double ) offsets.getElement( segmentsCount ) / ( double ) segmentsCount ); this->threadsPerSegment = TNL::min( std::pow( 2, std::ceil( std::log2( elementsInSegment ) ) ), TNL::Cuda::getWarpSize() ); TNL_ASSERT_GE( threadsPerSegment, 0, "" ); TNL_ASSERT_LE( threadsPerSegment, 32, "" ); } template< typename Index, typename Device > void CSRKernelHybrid< Index, Device >:: reset() { this->threadsPerSegment = 0; } template< typename Index, typename Device > auto CSRKernelHybrid< Index, Device >:: getView() -> ViewType { return *this; } template< typename Index, typename Device > auto CSRKernelHybrid< Index, Device >:: getConstView() const -> ConstViewType { return *this; }; template< typename Index, typename Device > template< typename OffsetsView, typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args > void CSRKernelHybrid< Index, Device >:: segmentsReduction( const OffsetsView& offsets, Index first, Index last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const { TNL_ASSERT_GE( this->threadsPerSegment, 0, "" ); TNL_ASSERT_LE( this->threadsPerSegment, 32, "" ); #ifdef HAVE_CUDA const size_t threadsCount = this->threadsPerSegment * ( last - first ); dim3 blocksCount, gridsCount, blockSize( 256 ); TNL::Cuda::setupThreads( blockSize, blocksCount, gridsCount, threadsCount ); //std::cerr << " this->threadsPerSegment = " << this->threadsPerSegment << " offsets = " << offsets << std::endl; for( unsigned int gridIdx = 0; gridIdx < gridsCount.x; gridIdx ++ ) { dim3 gridSize; TNL::Cuda::setupGrid( blocksCount, gridsCount, gridIdx, gridSize ); switch( this->threadsPerSegment ) { case 0: // this means zero/empty matrix break; case 1: segmentsReductionCSRHybridKernel< 1, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>( gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... ); break; case 2: segmentsReductionCSRHybridKernel< 2, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>( gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... ); break; case 4: segmentsReductionCSRHybridKernel< 4, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>( gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... ); break; case 8: segmentsReductionCSRHybridKernel< 8, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>( gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... ); break; case 16: segmentsReductionCSRHybridKernel< 16, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>( gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... ); break; case 32: segmentsReductionCSRHybridKernel< 32, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>( gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... ); break; default: throw std::runtime_error( std::string( "Wrong value of threadsPerSegment: " ) + std::to_string( this->threadsPerSegment ) ); } } #endif } } // namespace Segments } // namespace Algorithms } // namespace TNL
src/TNL/Algorithms/Segments/CSRKernels.h +0 −152 Original line number Diff line number Diff line Loading @@ -21,158 +21,6 @@ namespace TNL { namespace Segments { #ifdef HAVE_CUDA template< int ThreadsPerSegment, typename Offsets, typename Index, typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args > __global__ void segmentsReductionCSRLightKernel( int gridIdx, const Offsets offsets, Index first, Index last, Fetch fetch, const Reduction reduce, ResultKeeper keep, const Real zero, Args... args ) { /*** * We map one warp to each segment */ const Index segmentIdx = TNL::Cuda::getGlobalThreadIdx( gridIdx ) / ThreadsPerSegment + first; if( segmentIdx >= last ) return; const int laneIdx = threadIdx.x & ( ThreadsPerSegment - 1 ); // & is cheaper than % Index endIdx = offsets[ segmentIdx + 1] ; Index localIdx( laneIdx ); Real aux = zero; bool compute( true ); for( Index globalIdx = offsets[ segmentIdx ] + localIdx; globalIdx < endIdx; globalIdx += ThreadsPerSegment ) { aux = reduce( aux, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, localIdx, globalIdx, compute ) ); localIdx += TNL::Cuda::getWarpSize(); } /**** * Reduction in each segment. */ if( ThreadsPerSegment == 32 ) aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux, 16 ) ); if( ThreadsPerSegment >= 16 ) aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux, 8 ) ); if( ThreadsPerSegment >= 8 ) aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux, 4 ) ); if( ThreadsPerSegment >= 4 ) aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux, 2 ) ); if( ThreadsPerSegment >= 2 ) aux = reduce( aux, __shfl_down_sync( 0xFFFFFFFF, aux, 1 ) ); if( laneIdx == 0 ) keep( segmentIdx, aux ); } #endif template< typename Index, typename Device > struct CSRLightKernel { using IndexType = Index; using DeviceType = Device; using ViewType = CSRLightKernel< Index, Device >; using ConstViewType = CSRLightKernel< Index, Device >; template< typename Offsets > void init( const Offsets& offsets ) { const Index segmentsCount = offsets.getSize() - 1; const Index elementsInSegment = std::ceil( ( double ) offsets.getElement( segmentsCount ) / ( double ) segmentsCount ); this->threadsPerSegment = TNL::min( std::pow( 2, std::ceil( std::log2( elementsInSegment ) ) ), TNL::Cuda::getWarpSize() ); TNL_ASSERT_GE( threadsPerSegment, 0, "" ); TNL_ASSERT_LE( threadsPerSegment, 32, "" ); }; void reset() { this->threadsPerSegment = 0; } ViewType getView() { return *this; }; ConstViewType getConstView() const { return *this; }; template< typename OffsetsView, typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args > void segmentsReduction( const OffsetsView& offsets, Index first, Index last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const { TNL_ASSERT_GE( threadsPerSegment, 0, "" ); TNL_ASSERT_LE( threadsPerSegment, 32, "" ); #ifdef HAVE_CUDA const size_t threadsCount = this->threadsPerSegment * ( last - first ); dim3 blocksCount, gridsCount, blockSize( 256 ); TNL::Cuda::setupThreads( blockSize, blocksCount, gridsCount, threadsCount ); //std::cerr << " this->threadsPerSegment = " << this->threadsPerSegment << " offsets = " << offsets << std::endl; for( unsigned int gridIdx = 0; gridIdx < gridsCount.x; gridIdx ++ ) { dim3 gridSize; TNL::Cuda::setupGrid( blocksCount, gridsCount, gridIdx, gridSize ); switch( this->threadsPerSegment ) { case 0: // this means zero/empty matrix break; case 1: segmentsReductionCSRLightKernel< 1, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>( gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... ); break; case 2: segmentsReductionCSRLightKernel< 2, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>( gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... ); break; case 4: segmentsReductionCSRLightKernel< 4, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>( gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... ); break; case 8: segmentsReductionCSRLightKernel< 8, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>( gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... ); break; case 16: segmentsReductionCSRLightKernel< 16, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>( gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... ); break; case 32: segmentsReductionCSRLightKernel< 32, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real, Args... ><<< gridSize, blockSize >>>( gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... ); break; default: throw std::runtime_error( std::string( "Wrong value of threadsPerSegment: " ) + std::to_string( this->threadsPerSegment ) ); } } #endif } protected: int threadsPerSegment; }; template< typename Index, typename Device > struct CSRAdaptiveKernelView Loading
src/TNL/Algorithms/Segments/CSRView.h +2 −1 Original line number Diff line number Diff line Loading @@ -16,6 +16,7 @@ #include <TNL/Algorithms/Segments/SegmentView.h> #include <TNL/Algorithms/Segments/CSRKernelScalar.h> #include <TNL/Algorithms/Segments/CSRKernelVector.h> #include <TNL/Algorithms/Segments/CSRKernelHybrid.h> #include <TNL/Algorithms/Segments/CSRKernels.h> namespace TNL { Loading Loading @@ -141,7 +142,7 @@ using CSRViewVector = CSRView< Device, Index, CSRKernelVector< Index, Device > > template< typename Device, typename Index > using CSRViewLight = CSRView< Device, Index, CSRLightKernel< Index, Device > >; using CSRViewHybrid = CSRView< Device, Index, CSRKernelHybrid< Index, Device > >; template< typename Device, typename Index > Loading