Loading src/TNL/Algorithms/Segments/CSRAdaptiveKernel.h +11 −133 Original line number Diff line number Diff line Loading @@ -23,35 +23,6 @@ namespace TNL { namespace Algorithms { namespace Segments { #ifdef HAVE_CUDA template< int CudaBlockSize, int warpSize, int WARPS, int SHARED_PER_WARP, int MAX_ELEM_PER_WARP, typename BlocksView, typename Offsets, typename Index, typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args > __global__ void segmentsReductionCSRAdaptiveKernel( BlocksView blocks, int gridIdx, Offsets offsets, Index first, Index last, Fetch fetch, Reduction reduce, ResultKeeper keep, Real zero, Args... args ); #endif template< typename Index, typename Device > struct CSRAdaptiveKernel Loading @@ -63,114 +34,24 @@ struct CSRAdaptiveKernel using BlocksType = typename ViewType::BlocksType; using BlocksView = typename BlocksType::ViewType; static TNL::String getKernelType() { return ViewType::getKernelType(); }; static constexpr Index THREADS_ADAPTIVE = sizeof(Index) == 8 ? 128 : 256; /* How many shared memory use per block in CSR Adaptive kernel */ static constexpr Index SHARED_PER_BLOCK = 20000; //24576; TODO: /* Number of elements in shared memory */ static constexpr Index SHARED = SHARED_PER_BLOCK/sizeof(double); /* Number of warps in block for CSR Adaptive */ static constexpr Index WARPS = THREADS_ADAPTIVE / 32; static TNL::String getKernelType(); /* Number of elements in shared memory per one warp */ static constexpr Index SHARED_PER_WARP = SHARED / WARPS; /* Max length of row to process one warp for CSR Light, MultiVector */ static constexpr Index MAX_ELEMENTS_PER_WARP = 384; /* Max length of row to process one warp for CSR Adaptive */ static constexpr Index MAX_ELEMENTS_PER_WARP_ADAPT = 512; template< typename Offsets > Index findLimit( const Index start, const Offsets& offsets, const Index size, details::Type &type, Index &sum ) { sum = 0; for (Index current = start; current < size - 1; current++ ) { Index elements = offsets[ current + 1 ] - offsets[ current ]; sum += elements; if( sum > SHARED_PER_WARP ) { if( current - start > 0 ) // extra row { type = details::Type::STREAM; return current; } else { // one long row if( sum <= 2 * MAX_ELEMENTS_PER_WARP_ADAPT ) type = details::Type::VECTOR; else type = details::Type::LONG; return current + 1; } } } type = details::Type::STREAM; return size - 1; // return last row pointer } Index &sum ); template< typename Offsets > void init( const Offsets& offsets ) { using HostOffsetsType = TNL::Containers::Vector< typename Offsets::IndexType, TNL::Devices::Host, typename Offsets::IndexType >; HostOffsetsType hostOffsets( offsets ); const Index rows = offsets.getSize(); Index sum, start( 0 ), nextStart( 0 ); void init( const Offsets& offsets ); // Fill blocks std::vector< details::CSRAdaptiveKernelBlockDescriptor< Index > > inBlocks; inBlocks.reserve( rows ); void reset(); while( nextStart != rows - 1 ) { details::Type type; nextStart = findLimit( start, hostOffsets, rows, type, sum ); ViewType getView(); if( type == details::Type::LONG ) { const Index blocksCount = inBlocks.size(); const Index warpsPerCudaBlock = THREADS_ADAPTIVE / TNL::Cuda::getWarpSize(); Index warpsLeft = roundUpDivision( blocksCount, warpsPerCudaBlock ) * warpsPerCudaBlock - blocksCount; if( warpsLeft == 0 ) warpsLeft = warpsPerCudaBlock; for( Index index = 0; index < warpsLeft; index++ ) inBlocks.emplace_back( start, details::Type::LONG, index, warpsLeft ); } else { inBlocks.emplace_back(start, type, nextStart, offsets.getElement(nextStart), offsets.getElement(start) ); } start = nextStart; } inBlocks.emplace_back(nextStart); this->blocks = inBlocks; this->view.setBlocks( blocks ); } void reset() { this->blocks.reset(); this->view.setBlocks( blocks ); } ViewType getView() { return this->view; }; ConstViewType getConstView() const { return this->view; }; ConstViewType getConstView() const; template< typename OffsetsView, typename Fetch, Loading @@ -185,10 +66,7 @@ struct CSRAdaptiveKernel const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const { view.segmentsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... ); } Args... args ) const; protected: BlocksType blocks; Loading src/TNL/Algorithms/Segments/CSRAdaptiveKernel.hpp +142 −1 Original line number Diff line number Diff line Loading @@ -15,6 +15,7 @@ #include <TNL/Containers/VectorView.h> #include <TNL/Algorithms/ParallelFor.h> #include <TNL/Algorithms/Segments/details/LambdaAdapter.h> #include <TNL/Algorithms/Segments/details/CSRAdaptiveKernelParameters.h> #include <TNL/Algorithms/Segments/CSRScalarKernel.h> #include <TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h> Loading @@ -22,6 +23,146 @@ namespace TNL { namespace Algorithms { namespace Segments { template< typename Index, typename Device > TNL::String CSRAdaptiveKernel< Index, Device >:: getKernelType() { return ViewType::getKernelType(); } template< typename Index, typename Device > template< typename Offsets > Index CSRAdaptiveKernel< Index, Device >:: findLimit( const Index start, const Offsets& offsets, const Index size, details::Type &type, Index &sum ) { sum = 0; for (Index current = start; current < size - 1; current++ ) { Index elements = offsets[ current + 1 ] - offsets[ current ]; sum += elements; if( sum > details::CSRAdaptiveKernelParameters< double >::StreamedSharedElementsPerWarp() ) { if( current - start > 0 ) // extra row { type = details::Type::STREAM; return current; } else { // one long row if( sum <= 2 * details::CSRAdaptiveKernelParameters< double >::MaxAdaptiveElementsPerWarp() ) type = details::Type::VECTOR; else type = details::Type::LONG; return current + 1; } } } type = details::Type::STREAM; return size - 1; // return last row pointer } template< typename Index, typename Device > template< typename Offsets > void CSRAdaptiveKernel< Index, Device >:: init( const Offsets& offsets ) { using HostOffsetsType = TNL::Containers::Vector< typename Offsets::IndexType, TNL::Devices::Host, typename Offsets::IndexType >; HostOffsetsType hostOffsets( offsets ); const Index rows = offsets.getSize(); Index sum, start( 0 ), nextStart( 0 ); // Fill blocks std::vector< details::CSRAdaptiveKernelBlockDescriptor< Index > > inBlocks; inBlocks.reserve( rows ); while( nextStart != rows - 1 ) { details::Type type; nextStart = findLimit( start, hostOffsets, rows, type, sum ); if( type == details::Type::LONG ) { const Index blocksCount = inBlocks.size(); const Index warpsPerCudaBlock = details::CSRAdaptiveKernelParameters< double >::CudaBlockSize() / Cuda::getWarpSize(); Index warpsLeft = roundUpDivision( blocksCount, warpsPerCudaBlock ) * warpsPerCudaBlock - blocksCount; if( warpsLeft == 0 ) warpsLeft = warpsPerCudaBlock; for( Index index = 0; index < warpsLeft; index++ ) inBlocks.emplace_back( start, details::Type::LONG, index, warpsLeft ); } else { inBlocks.emplace_back(start, type, nextStart, offsets.getElement(nextStart), offsets.getElement(start) ); } start = nextStart; } inBlocks.emplace_back(nextStart); this->blocks = inBlocks; this->view.setBlocks( blocks ); } template< typename Index, typename Device > void CSRAdaptiveKernel< Index, Device >:: reset() { this->blocks.reset(); this->view.setBlocks( blocks ); } template< typename Index, typename Device > auto CSRAdaptiveKernel< Index, Device >:: getView() -> ViewType { return this->view; } template< typename Index, typename Device > auto CSRAdaptiveKernel< Index, Device >:: getConstView() const -> ConstViewType { return this->view; }; template< typename Index, typename Device > template< typename OffsetsView, typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args > void CSRAdaptiveKernel< Index, Device >:: segmentsReduction( const OffsetsView& offsets, Index first, Index last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const { view.segmentsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... ); } } // namespace Segments } // namespace Algorithms Loading src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.h +1 −0 Original line number Diff line number Diff line Loading @@ -12,6 +12,7 @@ #include <TNL/Containers/Vector.h> #include <TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h> #include <TNL/Algorithms/Segments/details/CSRAdaptiveKernelParameters.h> namespace TNL { namespace Algorithms { Loading src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp +30 −77 Original line number Diff line number Diff line Loading @@ -25,12 +25,7 @@ namespace TNL { #ifdef HAVE_CUDA template< int CudaBlockSize, int warpSize, int WARPS, int SHARED_PER_WARP, int MAX_ELEM_PER_WARP, typename BlocksView, template< typename BlocksView, typename Offsets, typename Index, typename Fetch, Loading @@ -50,10 +45,18 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks, Real zero, Args... args ) { __shared__ Real streamShared[ WARPS ][ SHARED_PER_WARP ]; __shared__ Real multivectorShared[ CudaBlockSize / warpSize ]; constexpr size_t MAX_X_DIM = 2147483647; const Index index = (gridIdx * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x; constexpr size_t CudaBlockSize = details::CSRAdaptiveKernelParameters< Real >::CudaBlockSize(); constexpr int WarpSize = Cuda::getWarpSize(); constexpr int WarpsCount = details::CSRAdaptiveKernelParameters< Real >::WarpsCount(); constexpr size_t StreamedSharedElementsPerWarp = details::CSRAdaptiveKernelParameters< Real >::StreamedSharedElementsPerWarp(); ///// // Note, for some large large Value types the following allocation of the shared memory might fail. // We might add some check here. __shared__ Real streamShared[ WarpsCount ][ StreamedSharedElementsPerWarp ]; __shared__ Real multivectorShared[ CudaBlockSize / WarpSize ]; const Index index = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; const Index blockIdx = index / warpSize; if( blockIdx >= blocks.getSize() - 1 ) return; Loading Loading @@ -113,32 +116,17 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks, } else // blockType == Type::LONG - several warps per segment { // Number of elements processed by previous warps //const Index offset = //block.index[1] * MAX_ELEM_PER_WARP; /// block.getWarpIdx() * MAX_ELEM_PER_WARP; //Index to = begin + (block.getWarpIdx() + 1) * MAX_ELEM_PER_WARP; const Index segmentIdx = block.getFirstSegment();//block.index[0]; //minID = offsets[block.index[0] ]; const Index segmentIdx = block.getFirstSegment(); const Index end = offsets[ segmentIdx + 1 ]; //const int tid = threadIdx.x; //const int inBlockWarpIdx = block.getWarpIdx(); //if( to > end ) // to = end; TNL_ASSERT_GT( block.getWarpsCount(), 0, "" ); result = zero; //printf( "LONG tid %d warpIdx %d: LONG \n", tid, block.getWarpIdx() ); for( Index globalIdx = begin + laneIdx + TNL::Cuda::getWarpSize() * block.getWarpIdx(); globalIdx < end; globalIdx += TNL::Cuda::getWarpSize() * block.getWarpsCount() ) { result = reduce( result, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, -1, globalIdx, compute ) ); //if( laneIdx == 0 ) // printf( "LONG warpIdx: %d gid: %d begin: %d end: %d -> %d \n", ( int ) block.getWarpIdx(), globalIdx, begin, end, // details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, 0, globalIdx, compute ) ); //result += values[i] * inVector[columnIndexes[i]]; } //printf( "tid %d -> %d \n", tid, result ); result += __shfl_down_sync( 0xFFFFFFFF, result, 16 ); result += __shfl_down_sync( 0xFFFFFFFF, result, 8 ); Loading @@ -146,9 +134,6 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks, result += __shfl_down_sync( 0xFFFFFFFF, result, 2 ); result += __shfl_down_sync( 0xFFFFFFFF, result, 1 ); //if( laneIdx == 0 ) // printf( "WARP RESULT: tid %d -> %d \n", tid, result ); const Index warpID = threadIdx.x / 32; if( laneIdx == 0 ) multivectorShared[ warpID ] = result; Loading @@ -157,7 +142,7 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks, // Reduction in multivectorShared if( block.getWarpIdx() == 0 && laneIdx < 16 ) { constexpr int totalWarps = CudaBlockSize / warpSize; constexpr int totalWarps = CudaBlockSize / WarpSize; if( totalWarps >= 32 ) { multivectorShared[ laneIdx ] = reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx + 16 ] ); Loading @@ -184,13 +169,10 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks, __syncwarp(); } if( laneIdx == 0 ) { //printf( "Long: segmentIdx %d -> %d \n", segmentIdx, multivectorShared[ 0 ] ); keep( segmentIdx, multivectorShared[ 0 ] ); } } } } #endif template< typename Index, Loading Loading @@ -264,58 +246,29 @@ segmentsReduction( const OffsetsView& offsets, return; } static constexpr Index THREADS_ADAPTIVE = sizeof(Index) == 8 ? 128 : 256; //static constexpr Index THREADS_SCALAR = 128; //static constexpr Index THREADS_VECTOR = 128; //static constexpr Index THREADS_LIGHT = 128; /* Max length of row to process one warp for CSR Light, MultiVector */ //static constexpr Index MAX_ELEMENTS_PER_WARP = 384; /* Max length of row to process one warp for CSR Adaptive */ static constexpr Index MAX_ELEMENTS_PER_WARP_ADAPT = 512; /* How many shared memory use per block in CSR Adaptive kernel */ static constexpr Index SHARED_PER_BLOCK = 24576; /* Number of elements in shared memory */ static constexpr Index SHARED = SHARED_PER_BLOCK/sizeof(Real); /* Number of warps in block for CSR Adaptive */ static constexpr Index WARPS = THREADS_ADAPTIVE / 32; /* Number of elements in shared memory per one warp */ static constexpr Index SHARED_PER_WARP = SHARED / WARPS; constexpr int warpSize = 32; //constexpr int warpSize = 32; Index blocksCount; const Index threads = THREADS_ADAPTIVE; constexpr size_t MAX_X_DIM = 2147483647; const Index threads = details::CSRAdaptiveKernelParameters< Real >::CudaBlockSize(); /* Fill blocks */ size_t neededThreads = this->blocks.getSize() * warpSize; // one warp per block size_t neededThreads = this->blocks.getSize() * TNL::Cuda::getWarpSize(); // one warp per block /* Execute kernels on device */ for( Index gridIdx = 0; neededThreads != 0; gridIdx++ ) { if (MAX_X_DIM * threads >= neededThreads) if( Cuda::getMaxGridSize() * threads >= neededThreads ) { blocksCount = roundUpDivision( neededThreads, threads ); neededThreads = 0; } else { blocksCount = MAX_X_DIM; neededThreads -= MAX_X_DIM * threads; blocksCount = TNL::Cuda::getMaxGridSize(); neededThreads -= TNL::Cuda::getMaxGridSize() * threads; } segmentsReductionCSRAdaptiveKernel< THREADS_ADAPTIVE, warpSize, WARPS, SHARED_PER_WARP, MAX_ELEMENTS_PER_WARP_ADAPT, BlocksView, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real, Args... > Loading src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelParameters.h 0 → 100644 +70 −0 Original line number Diff line number Diff line /*************************************************************************** CSRAdaptiveKernelBlockDescriptor.h - description ------------------- begin : Jan 25, 2021 -> Joe Biden inauguration copyright : (C) 2021 by Tomas Oberhuber email : tomas.oberhuber@fjfi.cvut.cz ***************************************************************************/ /* See Copyright Notice in tnl/Copyright */ #pragma once namespace TNL { namespace Algorithms { namespace Segments { namespace details { template< typename Value > struct CSRAdaptiveKernelParameters { /** * \brief Computes number of CUDA threads per block depending on Value type. * * \return CUDA block size. */ static constexpr int CudaBlockSize() { return std::max( ( int ) ( 1024 / sizeof( Value ) ), ( int ) Cuda::getWarpSize() ); }; /** * \brief Returns amount of shared memory dedicated for stream CSR kernel. * * \return Stream shared memory. */ static constexpr size_t StreamedSharedMemory() { return 40960; }; /** * \brief Number of elements fitting into streamed shared memory. */ static constexpr size_t StreamedSharedElementsCount() { return StreamedSharedMemory() / sizeof( Value ); }; /** * \brief Computes number of warps in one CUDA block. */ static constexpr size_t WarpsCount() { return CudaBlockSize() / Cuda::getWarpSize(); }; /** * \brief Computes number of elements to be streamed into the shared memory. * * \return Number of elements to be streamed into the shared memory. */ static constexpr size_t StreamedSharedElementsPerWarp() { return StreamedSharedElementsCount() / WarpsCount(); }; /** * \brief Returns maximum number of elements per warp for vector and hybrid kernel. * * \return Maximum number of elements per warp for vector and hybrid kernel. */ static constexpr int MaxVectorElementsPerWarp() { return 384; }; /** * \brief Returns maximum number of elements per warp for adaptive kernel. * * \return Maximum number of elements per warp for adaptive kernel. */ static constexpr int MaxAdaptiveElementsPerWarp() { return 512; }; }; } // namespace details } // namespace Segments } // namespace Algorithms } // namespace TNL Loading
src/TNL/Algorithms/Segments/CSRAdaptiveKernel.h +11 −133 Original line number Diff line number Diff line Loading @@ -23,35 +23,6 @@ namespace TNL { namespace Algorithms { namespace Segments { #ifdef HAVE_CUDA template< int CudaBlockSize, int warpSize, int WARPS, int SHARED_PER_WARP, int MAX_ELEM_PER_WARP, typename BlocksView, typename Offsets, typename Index, typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args > __global__ void segmentsReductionCSRAdaptiveKernel( BlocksView blocks, int gridIdx, Offsets offsets, Index first, Index last, Fetch fetch, Reduction reduce, ResultKeeper keep, Real zero, Args... args ); #endif template< typename Index, typename Device > struct CSRAdaptiveKernel Loading @@ -63,114 +34,24 @@ struct CSRAdaptiveKernel using BlocksType = typename ViewType::BlocksType; using BlocksView = typename BlocksType::ViewType; static TNL::String getKernelType() { return ViewType::getKernelType(); }; static constexpr Index THREADS_ADAPTIVE = sizeof(Index) == 8 ? 128 : 256; /* How many shared memory use per block in CSR Adaptive kernel */ static constexpr Index SHARED_PER_BLOCK = 20000; //24576; TODO: /* Number of elements in shared memory */ static constexpr Index SHARED = SHARED_PER_BLOCK/sizeof(double); /* Number of warps in block for CSR Adaptive */ static constexpr Index WARPS = THREADS_ADAPTIVE / 32; static TNL::String getKernelType(); /* Number of elements in shared memory per one warp */ static constexpr Index SHARED_PER_WARP = SHARED / WARPS; /* Max length of row to process one warp for CSR Light, MultiVector */ static constexpr Index MAX_ELEMENTS_PER_WARP = 384; /* Max length of row to process one warp for CSR Adaptive */ static constexpr Index MAX_ELEMENTS_PER_WARP_ADAPT = 512; template< typename Offsets > Index findLimit( const Index start, const Offsets& offsets, const Index size, details::Type &type, Index &sum ) { sum = 0; for (Index current = start; current < size - 1; current++ ) { Index elements = offsets[ current + 1 ] - offsets[ current ]; sum += elements; if( sum > SHARED_PER_WARP ) { if( current - start > 0 ) // extra row { type = details::Type::STREAM; return current; } else { // one long row if( sum <= 2 * MAX_ELEMENTS_PER_WARP_ADAPT ) type = details::Type::VECTOR; else type = details::Type::LONG; return current + 1; } } } type = details::Type::STREAM; return size - 1; // return last row pointer } Index &sum ); template< typename Offsets > void init( const Offsets& offsets ) { using HostOffsetsType = TNL::Containers::Vector< typename Offsets::IndexType, TNL::Devices::Host, typename Offsets::IndexType >; HostOffsetsType hostOffsets( offsets ); const Index rows = offsets.getSize(); Index sum, start( 0 ), nextStart( 0 ); void init( const Offsets& offsets ); // Fill blocks std::vector< details::CSRAdaptiveKernelBlockDescriptor< Index > > inBlocks; inBlocks.reserve( rows ); void reset(); while( nextStart != rows - 1 ) { details::Type type; nextStart = findLimit( start, hostOffsets, rows, type, sum ); ViewType getView(); if( type == details::Type::LONG ) { const Index blocksCount = inBlocks.size(); const Index warpsPerCudaBlock = THREADS_ADAPTIVE / TNL::Cuda::getWarpSize(); Index warpsLeft = roundUpDivision( blocksCount, warpsPerCudaBlock ) * warpsPerCudaBlock - blocksCount; if( warpsLeft == 0 ) warpsLeft = warpsPerCudaBlock; for( Index index = 0; index < warpsLeft; index++ ) inBlocks.emplace_back( start, details::Type::LONG, index, warpsLeft ); } else { inBlocks.emplace_back(start, type, nextStart, offsets.getElement(nextStart), offsets.getElement(start) ); } start = nextStart; } inBlocks.emplace_back(nextStart); this->blocks = inBlocks; this->view.setBlocks( blocks ); } void reset() { this->blocks.reset(); this->view.setBlocks( blocks ); } ViewType getView() { return this->view; }; ConstViewType getConstView() const { return this->view; }; ConstViewType getConstView() const; template< typename OffsetsView, typename Fetch, Loading @@ -185,10 +66,7 @@ struct CSRAdaptiveKernel const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const { view.segmentsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... ); } Args... args ) const; protected: BlocksType blocks; Loading
src/TNL/Algorithms/Segments/CSRAdaptiveKernel.hpp +142 −1 Original line number Diff line number Diff line Loading @@ -15,6 +15,7 @@ #include <TNL/Containers/VectorView.h> #include <TNL/Algorithms/ParallelFor.h> #include <TNL/Algorithms/Segments/details/LambdaAdapter.h> #include <TNL/Algorithms/Segments/details/CSRAdaptiveKernelParameters.h> #include <TNL/Algorithms/Segments/CSRScalarKernel.h> #include <TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h> Loading @@ -22,6 +23,146 @@ namespace TNL { namespace Algorithms { namespace Segments { template< typename Index, typename Device > TNL::String CSRAdaptiveKernel< Index, Device >:: getKernelType() { return ViewType::getKernelType(); } template< typename Index, typename Device > template< typename Offsets > Index CSRAdaptiveKernel< Index, Device >:: findLimit( const Index start, const Offsets& offsets, const Index size, details::Type &type, Index &sum ) { sum = 0; for (Index current = start; current < size - 1; current++ ) { Index elements = offsets[ current + 1 ] - offsets[ current ]; sum += elements; if( sum > details::CSRAdaptiveKernelParameters< double >::StreamedSharedElementsPerWarp() ) { if( current - start > 0 ) // extra row { type = details::Type::STREAM; return current; } else { // one long row if( sum <= 2 * details::CSRAdaptiveKernelParameters< double >::MaxAdaptiveElementsPerWarp() ) type = details::Type::VECTOR; else type = details::Type::LONG; return current + 1; } } } type = details::Type::STREAM; return size - 1; // return last row pointer } template< typename Index, typename Device > template< typename Offsets > void CSRAdaptiveKernel< Index, Device >:: init( const Offsets& offsets ) { using HostOffsetsType = TNL::Containers::Vector< typename Offsets::IndexType, TNL::Devices::Host, typename Offsets::IndexType >; HostOffsetsType hostOffsets( offsets ); const Index rows = offsets.getSize(); Index sum, start( 0 ), nextStart( 0 ); // Fill blocks std::vector< details::CSRAdaptiveKernelBlockDescriptor< Index > > inBlocks; inBlocks.reserve( rows ); while( nextStart != rows - 1 ) { details::Type type; nextStart = findLimit( start, hostOffsets, rows, type, sum ); if( type == details::Type::LONG ) { const Index blocksCount = inBlocks.size(); const Index warpsPerCudaBlock = details::CSRAdaptiveKernelParameters< double >::CudaBlockSize() / Cuda::getWarpSize(); Index warpsLeft = roundUpDivision( blocksCount, warpsPerCudaBlock ) * warpsPerCudaBlock - blocksCount; if( warpsLeft == 0 ) warpsLeft = warpsPerCudaBlock; for( Index index = 0; index < warpsLeft; index++ ) inBlocks.emplace_back( start, details::Type::LONG, index, warpsLeft ); } else { inBlocks.emplace_back(start, type, nextStart, offsets.getElement(nextStart), offsets.getElement(start) ); } start = nextStart; } inBlocks.emplace_back(nextStart); this->blocks = inBlocks; this->view.setBlocks( blocks ); } template< typename Index, typename Device > void CSRAdaptiveKernel< Index, Device >:: reset() { this->blocks.reset(); this->view.setBlocks( blocks ); } template< typename Index, typename Device > auto CSRAdaptiveKernel< Index, Device >:: getView() -> ViewType { return this->view; } template< typename Index, typename Device > auto CSRAdaptiveKernel< Index, Device >:: getConstView() const -> ConstViewType { return this->view; }; template< typename Index, typename Device > template< typename OffsetsView, typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args > void CSRAdaptiveKernel< Index, Device >:: segmentsReduction( const OffsetsView& offsets, Index first, Index last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const { view.segmentsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... ); } } // namespace Segments } // namespace Algorithms Loading
src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.h +1 −0 Original line number Diff line number Diff line Loading @@ -12,6 +12,7 @@ #include <TNL/Containers/Vector.h> #include <TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h> #include <TNL/Algorithms/Segments/details/CSRAdaptiveKernelParameters.h> namespace TNL { namespace Algorithms { Loading
src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp +30 −77 Original line number Diff line number Diff line Loading @@ -25,12 +25,7 @@ namespace TNL { #ifdef HAVE_CUDA template< int CudaBlockSize, int warpSize, int WARPS, int SHARED_PER_WARP, int MAX_ELEM_PER_WARP, typename BlocksView, template< typename BlocksView, typename Offsets, typename Index, typename Fetch, Loading @@ -50,10 +45,18 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks, Real zero, Args... args ) { __shared__ Real streamShared[ WARPS ][ SHARED_PER_WARP ]; __shared__ Real multivectorShared[ CudaBlockSize / warpSize ]; constexpr size_t MAX_X_DIM = 2147483647; const Index index = (gridIdx * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x; constexpr size_t CudaBlockSize = details::CSRAdaptiveKernelParameters< Real >::CudaBlockSize(); constexpr int WarpSize = Cuda::getWarpSize(); constexpr int WarpsCount = details::CSRAdaptiveKernelParameters< Real >::WarpsCount(); constexpr size_t StreamedSharedElementsPerWarp = details::CSRAdaptiveKernelParameters< Real >::StreamedSharedElementsPerWarp(); ///// // Note, for some large large Value types the following allocation of the shared memory might fail. // We might add some check here. __shared__ Real streamShared[ WarpsCount ][ StreamedSharedElementsPerWarp ]; __shared__ Real multivectorShared[ CudaBlockSize / WarpSize ]; const Index index = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; const Index blockIdx = index / warpSize; if( blockIdx >= blocks.getSize() - 1 ) return; Loading Loading @@ -113,32 +116,17 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks, } else // blockType == Type::LONG - several warps per segment { // Number of elements processed by previous warps //const Index offset = //block.index[1] * MAX_ELEM_PER_WARP; /// block.getWarpIdx() * MAX_ELEM_PER_WARP; //Index to = begin + (block.getWarpIdx() + 1) * MAX_ELEM_PER_WARP; const Index segmentIdx = block.getFirstSegment();//block.index[0]; //minID = offsets[block.index[0] ]; const Index segmentIdx = block.getFirstSegment(); const Index end = offsets[ segmentIdx + 1 ]; //const int tid = threadIdx.x; //const int inBlockWarpIdx = block.getWarpIdx(); //if( to > end ) // to = end; TNL_ASSERT_GT( block.getWarpsCount(), 0, "" ); result = zero; //printf( "LONG tid %d warpIdx %d: LONG \n", tid, block.getWarpIdx() ); for( Index globalIdx = begin + laneIdx + TNL::Cuda::getWarpSize() * block.getWarpIdx(); globalIdx < end; globalIdx += TNL::Cuda::getWarpSize() * block.getWarpsCount() ) { result = reduce( result, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, -1, globalIdx, compute ) ); //if( laneIdx == 0 ) // printf( "LONG warpIdx: %d gid: %d begin: %d end: %d -> %d \n", ( int ) block.getWarpIdx(), globalIdx, begin, end, // details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, 0, globalIdx, compute ) ); //result += values[i] * inVector[columnIndexes[i]]; } //printf( "tid %d -> %d \n", tid, result ); result += __shfl_down_sync( 0xFFFFFFFF, result, 16 ); result += __shfl_down_sync( 0xFFFFFFFF, result, 8 ); Loading @@ -146,9 +134,6 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks, result += __shfl_down_sync( 0xFFFFFFFF, result, 2 ); result += __shfl_down_sync( 0xFFFFFFFF, result, 1 ); //if( laneIdx == 0 ) // printf( "WARP RESULT: tid %d -> %d \n", tid, result ); const Index warpID = threadIdx.x / 32; if( laneIdx == 0 ) multivectorShared[ warpID ] = result; Loading @@ -157,7 +142,7 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks, // Reduction in multivectorShared if( block.getWarpIdx() == 0 && laneIdx < 16 ) { constexpr int totalWarps = CudaBlockSize / warpSize; constexpr int totalWarps = CudaBlockSize / WarpSize; if( totalWarps >= 32 ) { multivectorShared[ laneIdx ] = reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx + 16 ] ); Loading @@ -184,13 +169,10 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks, __syncwarp(); } if( laneIdx == 0 ) { //printf( "Long: segmentIdx %d -> %d \n", segmentIdx, multivectorShared[ 0 ] ); keep( segmentIdx, multivectorShared[ 0 ] ); } } } } #endif template< typename Index, Loading Loading @@ -264,58 +246,29 @@ segmentsReduction( const OffsetsView& offsets, return; } static constexpr Index THREADS_ADAPTIVE = sizeof(Index) == 8 ? 128 : 256; //static constexpr Index THREADS_SCALAR = 128; //static constexpr Index THREADS_VECTOR = 128; //static constexpr Index THREADS_LIGHT = 128; /* Max length of row to process one warp for CSR Light, MultiVector */ //static constexpr Index MAX_ELEMENTS_PER_WARP = 384; /* Max length of row to process one warp for CSR Adaptive */ static constexpr Index MAX_ELEMENTS_PER_WARP_ADAPT = 512; /* How many shared memory use per block in CSR Adaptive kernel */ static constexpr Index SHARED_PER_BLOCK = 24576; /* Number of elements in shared memory */ static constexpr Index SHARED = SHARED_PER_BLOCK/sizeof(Real); /* Number of warps in block for CSR Adaptive */ static constexpr Index WARPS = THREADS_ADAPTIVE / 32; /* Number of elements in shared memory per one warp */ static constexpr Index SHARED_PER_WARP = SHARED / WARPS; constexpr int warpSize = 32; //constexpr int warpSize = 32; Index blocksCount; const Index threads = THREADS_ADAPTIVE; constexpr size_t MAX_X_DIM = 2147483647; const Index threads = details::CSRAdaptiveKernelParameters< Real >::CudaBlockSize(); /* Fill blocks */ size_t neededThreads = this->blocks.getSize() * warpSize; // one warp per block size_t neededThreads = this->blocks.getSize() * TNL::Cuda::getWarpSize(); // one warp per block /* Execute kernels on device */ for( Index gridIdx = 0; neededThreads != 0; gridIdx++ ) { if (MAX_X_DIM * threads >= neededThreads) if( Cuda::getMaxGridSize() * threads >= neededThreads ) { blocksCount = roundUpDivision( neededThreads, threads ); neededThreads = 0; } else { blocksCount = MAX_X_DIM; neededThreads -= MAX_X_DIM * threads; blocksCount = TNL::Cuda::getMaxGridSize(); neededThreads -= TNL::Cuda::getMaxGridSize() * threads; } segmentsReductionCSRAdaptiveKernel< THREADS_ADAPTIVE, warpSize, WARPS, SHARED_PER_WARP, MAX_ELEMENTS_PER_WARP_ADAPT, BlocksView, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real, Args... > Loading
src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelParameters.h 0 → 100644 +70 −0 Original line number Diff line number Diff line /*************************************************************************** CSRAdaptiveKernelBlockDescriptor.h - description ------------------- begin : Jan 25, 2021 -> Joe Biden inauguration copyright : (C) 2021 by Tomas Oberhuber email : tomas.oberhuber@fjfi.cvut.cz ***************************************************************************/ /* See Copyright Notice in tnl/Copyright */ #pragma once namespace TNL { namespace Algorithms { namespace Segments { namespace details { template< typename Value > struct CSRAdaptiveKernelParameters { /** * \brief Computes number of CUDA threads per block depending on Value type. * * \return CUDA block size. */ static constexpr int CudaBlockSize() { return std::max( ( int ) ( 1024 / sizeof( Value ) ), ( int ) Cuda::getWarpSize() ); }; /** * \brief Returns amount of shared memory dedicated for stream CSR kernel. * * \return Stream shared memory. */ static constexpr size_t StreamedSharedMemory() { return 40960; }; /** * \brief Number of elements fitting into streamed shared memory. */ static constexpr size_t StreamedSharedElementsCount() { return StreamedSharedMemory() / sizeof( Value ); }; /** * \brief Computes number of warps in one CUDA block. */ static constexpr size_t WarpsCount() { return CudaBlockSize() / Cuda::getWarpSize(); }; /** * \brief Computes number of elements to be streamed into the shared memory. * * \return Number of elements to be streamed into the shared memory. */ static constexpr size_t StreamedSharedElementsPerWarp() { return StreamedSharedElementsCount() / WarpsCount(); }; /** * \brief Returns maximum number of elements per warp for vector and hybrid kernel. * * \return Maximum number of elements per warp for vector and hybrid kernel. */ static constexpr int MaxVectorElementsPerWarp() { return 384; }; /** * \brief Returns maximum number of elements per warp for adaptive kernel. * * \return Maximum number of elements per warp for adaptive kernel. */ static constexpr int MaxAdaptiveElementsPerWarp() { return 512; }; }; } // namespace details } // namespace Segments } // namespace Algorithms } // namespace TNL