Loading src/TNL/Algorithms/Segments/CSRAdaptiveKernel.h +1 −144 Original line number Diff line number Diff line Loading @@ -16,6 +16,7 @@ #include <TNL/Algorithms/ParallelFor.h> #include <TNL/Algorithms/Segments/details/LambdaAdapter.h> #include <TNL/Algorithms/Segments/CSRScalarKernel.h> #include <TNL/Algorithms/Segments/CSRAdaptiveKernelView.h> #include <TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h> namespace TNL { Loading Loading @@ -50,150 +51,6 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks, Args... args ); #endif template< typename Index, typename Device > struct CSRAdaptiveKernelView { using IndexType = Index; using DeviceType = Device; using ViewType = CSRAdaptiveKernelView< Index, Device >; using ConstViewType = CSRAdaptiveKernelView< Index, Device >; using BlocksType = TNL::Containers::Vector< details::CSRAdaptiveKernelBlockDescriptor< Index >, Device, Index >; using BlocksView = typename BlocksType::ViewType; CSRAdaptiveKernelView() = default; CSRAdaptiveKernelView( BlocksType& blocks ) { this->blocks.bind( blocks ); }; void setBlocks( BlocksType& blocks ) { this->blocks.bind( blocks ); } ViewType getView() { return *this; }; ConstViewType getConstView() const { return *this; }; static TNL::String getKernelType() { return "Adaptive"; }; template< typename OffsetsView, typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args > void segmentsReduction( const OffsetsView& offsets, Index first, Index last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const { #ifdef HAVE_CUDA if( details::CheckFetchLambda< Index, Fetch >::hasAllParameters() ) { TNL::Algorithms::Segments::CSRScalarKernel< Index, Device >:: segmentsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... ); return; } static constexpr Index THREADS_ADAPTIVE = sizeof(Index) == 8 ? 128 : 256; //static constexpr Index THREADS_SCALAR = 128; //static constexpr Index THREADS_VECTOR = 128; //static constexpr Index THREADS_LIGHT = 128; /* Max length of row to process one warp for CSR Light, MultiVector */ //static constexpr Index MAX_ELEMENTS_PER_WARP = 384; /* Max length of row to process one warp for CSR Adaptive */ static constexpr Index MAX_ELEMENTS_PER_WARP_ADAPT = 512; /* How many shared memory use per block in CSR Adaptive kernel */ static constexpr Index SHARED_PER_BLOCK = 24576; /* Number of elements in shared memory */ static constexpr Index SHARED = SHARED_PER_BLOCK/sizeof(Real); /* Number of warps in block for CSR Adaptive */ static constexpr Index WARPS = THREADS_ADAPTIVE / 32; /* Number of elements in shared memory per one warp */ static constexpr Index SHARED_PER_WARP = SHARED / WARPS; constexpr int warpSize = 32; Index blocksCount; const Index threads = THREADS_ADAPTIVE; constexpr size_t MAX_X_DIM = 2147483647; /* Fill blocks */ size_t neededThreads = this->blocks.getSize() * warpSize; // one warp per block /* Execute kernels on device */ for (Index gridIdx = 0; neededThreads != 0; gridIdx++ ) { if (MAX_X_DIM * threads >= neededThreads) { blocksCount = roundUpDivision(neededThreads, threads); neededThreads = 0; } else { blocksCount = MAX_X_DIM; neededThreads -= MAX_X_DIM * threads; } segmentsReductionCSRAdaptiveKernel< THREADS_ADAPTIVE, warpSize, WARPS, SHARED_PER_WARP, MAX_ELEMENTS_PER_WARP_ADAPT, BlocksView, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real, Args... > <<<blocksCount, threads>>>( this->blocks, gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... ); } #endif } CSRAdaptiveKernelView& operator=( const CSRAdaptiveKernelView< Index, Device >& kernelView ) { this->blocks.bind( kernelView.blocks ); return *this; } void printBlocks() const { for( Index i = 0; i < this->blocks.getSize(); i++ ) { auto block = blocks.getElement( i ); std::cout << "Block " << i << " : " << block << std::endl; } } protected: BlocksView blocks; }; template< typename Index, typename Device > Loading src/TNL/Algorithms/Segments/CSRAdaptiveKernel.hpp +1 −170 Original line number Diff line number Diff line /*************************************************************************** CSRAdaptiveKernel.h - description CSRAdaptiveKernel.hpp - description ------------------- begin : Feb 7, 2021 -> Joe Biden inauguration copyright : (C) 2021 by Tomas Oberhuber Loading @@ -22,175 +22,6 @@ namespace TNL { namespace Algorithms { namespace Segments { #ifdef HAVE_CUDA template< int CudaBlockSize, int warpSize, int WARPS, int SHARED_PER_WARP, int MAX_ELEM_PER_WARP, typename BlocksView, typename Offsets, typename Index, typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args > __global__ void segmentsReductionCSRAdaptiveKernel( BlocksView blocks, int gridIdx, Offsets offsets, Index first, Index last, Fetch fetch, Reduction reduce, ResultKeeper keep, Real zero, Args... args ) { __shared__ Real streamShared[ WARPS ][ SHARED_PER_WARP ]; __shared__ Real multivectorShared[ CudaBlockSize / warpSize ]; constexpr size_t MAX_X_DIM = 2147483647; const Index index = (gridIdx * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x; const Index blockIdx = index / warpSize; if( blockIdx >= blocks.getSize() - 1 ) return; if( threadIdx.x < CudaBlockSize / warpSize ) multivectorShared[ threadIdx.x ] = zero; Real result = zero; bool compute( true ); const Index laneIdx = threadIdx.x & 31; // & is cheaper than % const details::CSRAdaptiveKernelBlockDescriptor< Index > block = blocks[ blockIdx ]; const Index& firstSegmentIdx = block.getFirstSegment(); const Index begin = offsets[ firstSegmentIdx ]; const auto blockType = block.getType(); if( blockType == details::Type::STREAM ) // Stream kernel - many short segments per warp { const Index warpIdx = threadIdx.x / 32; const Index end = begin + block.getSize(); // Stream data to shared memory for( Index globalIdx = laneIdx + begin; globalIdx < end; globalIdx += warpSize ) { streamShared[ warpIdx ][ globalIdx - begin ] = //fetch( globalIdx, compute ); details::FetchLambdaAdapter< Index, Fetch >::call( fetch, -1, -1, globalIdx, compute ); // TODO:: fix this by template specialization so that we can assume fetch lambda // with short parameters } const Index lastSegmentIdx = firstSegmentIdx + block.getSegmentsInBlock(); for( Index i = firstSegmentIdx + laneIdx; i < lastSegmentIdx; i += warpSize ) { const Index sharedEnd = offsets[ i + 1 ] - begin; // end of preprocessed data result = zero; // Scalar reduction for( Index sharedIdx = offsets[ i ] - begin; sharedIdx < sharedEnd; sharedIdx++ ) result = reduce( result, streamShared[ warpIdx ][ sharedIdx ] ); keep( i, result ); } } else if( blockType == details::Type::VECTOR ) // Vector kernel - one segment per warp { const Index end = begin + block.getSize(); const Index segmentIdx = block.getFirstSegment(); for( Index globalIdx = begin + laneIdx; globalIdx < end; globalIdx += warpSize ) result = reduce( result, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, -1, globalIdx, compute ) ); // fix local idx // Parallel reduction result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 16 ) ); result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 8 ) ); result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 4 ) ); result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 2 ) ); result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 1 ) ); if( laneIdx == 0 ) keep( segmentIdx, result ); } else // blockType == Type::LONG - several warps per segment { // Number of elements processed by previous warps //const Index offset = //block.index[1] * MAX_ELEM_PER_WARP; /// block.getWarpIdx() * MAX_ELEM_PER_WARP; //Index to = begin + (block.getWarpIdx() + 1) * MAX_ELEM_PER_WARP; const Index segmentIdx = block.getFirstSegment();//block.index[0]; //minID = offsets[block.index[0] ]; const Index end = offsets[segmentIdx + 1]; //const int tid = threadIdx.x; //const int inBlockWarpIdx = block.getWarpIdx(); //if( to > end ) // to = end; TNL_ASSERT_GT( block.getWarpsCount(), 0, "" ); result = zero; //printf( "LONG tid %d warpIdx %d: LONG \n", tid, block.getWarpIdx() ); for( Index globalIdx = begin + laneIdx + TNL::Cuda::getWarpSize() * block.getWarpIdx(); globalIdx < end; globalIdx += TNL::Cuda::getWarpSize() * block.getWarpsCount() ) { result = reduce( result, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, -1, globalIdx, compute ) ); //if( laneIdx == 0 ) // printf( "LONG warpIdx: %d gid: %d begin: %d end: %d -> %d \n", ( int ) block.getWarpIdx(), globalIdx, begin, end, // details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, 0, globalIdx, compute ) ); //result += values[i] * inVector[columnIndexes[i]]; } //printf( "tid %d -> %d \n", tid, result ); result += __shfl_down_sync(0xFFFFFFFF, result, 16); result += __shfl_down_sync(0xFFFFFFFF, result, 8); result += __shfl_down_sync(0xFFFFFFFF, result, 4); result += __shfl_down_sync(0xFFFFFFFF, result, 2); result += __shfl_down_sync(0xFFFFFFFF, result, 1); //if( laneIdx == 0 ) // printf( "WARP RESULT: tid %d -> %d \n", tid, result ); const Index warpID = threadIdx.x / 32; if( laneIdx == 0 ) multivectorShared[ warpID ] = result; __syncthreads(); // Reduction in multivectorShared if( block.getWarpIdx() == 0 && laneIdx < 16 ) { constexpr int totalWarps = CudaBlockSize / warpSize; if( totalWarps >= 32 ) { multivectorShared[ laneIdx ] = reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx + 16 ] ); __syncwarp(); } if( totalWarps >= 16 ) { multivectorShared[ laneIdx ] = reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx + 8 ] ); __syncwarp(); } if( totalWarps >= 8 ) { multivectorShared[ laneIdx ] = reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx + 4 ] ); __syncwarp(); } if( totalWarps >= 4 ) { multivectorShared[ laneIdx ] = reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx + 2 ] ); __syncwarp(); } if( totalWarps >= 2 ) { multivectorShared[ laneIdx ] = reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx + 1 ] ); __syncwarp(); } if( laneIdx == 0 ) { //printf( "Long: segmentIdx %d -> %d \n", segmentIdx, multivectorShared[ 0 ] ); keep( segmentIdx, multivectorShared[ 0 ] ); } } } } #endif } // namespace Segments } // namespace Algorithms Loading src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.h 0 → 100644 +70 −0 Original line number Diff line number Diff line /*************************************************************************** CSRAdaptiveKernelView.h - description ------------------- begin : Feb 7, 2021 -> Joe Biden inauguration copyright : (C) 2021 by Tomas Oberhuber email : tomas.oberhuber@fjfi.cvut.cz ***************************************************************************/ /* See Copyright Notice in tnl/Copyright */ #pragma once #include <TNL/Containers/Vector.h> #include <TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h> namespace TNL { namespace Algorithms { namespace Segments { template< typename Index, typename Device > struct CSRAdaptiveKernelView { using IndexType = Index; using DeviceType = Device; using ViewType = CSRAdaptiveKernelView< Index, Device >; using ConstViewType = CSRAdaptiveKernelView< Index, Device >; using BlocksType = TNL::Containers::Vector< details::CSRAdaptiveKernelBlockDescriptor< Index >, Device, Index >; using BlocksView = typename BlocksType::ViewType; CSRAdaptiveKernelView() = default; CSRAdaptiveKernelView( BlocksType& blocks ); void setBlocks( BlocksType& blocks ); ViewType getView(); ConstViewType getConstView() const; static TNL::String getKernelType(); template< typename OffsetsView, typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args > void segmentsReduction( const OffsetsView& offsets, Index first, Index last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const; CSRAdaptiveKernelView& operator=( const CSRAdaptiveKernelView< Index, Device >& kernelView ); void printBlocks() const; protected: BlocksView blocks; }; } // namespace Segments } // namespace Algorithms } // namespace TNL #include <TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp> No newline at end of file src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp 0 → 100644 +363 −0 File added.Preview size limit exceeded, changes collapsed. Show changes Loading
src/TNL/Algorithms/Segments/CSRAdaptiveKernel.h +1 −144 Original line number Diff line number Diff line Loading @@ -16,6 +16,7 @@ #include <TNL/Algorithms/ParallelFor.h> #include <TNL/Algorithms/Segments/details/LambdaAdapter.h> #include <TNL/Algorithms/Segments/CSRScalarKernel.h> #include <TNL/Algorithms/Segments/CSRAdaptiveKernelView.h> #include <TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h> namespace TNL { Loading Loading @@ -50,150 +51,6 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks, Args... args ); #endif template< typename Index, typename Device > struct CSRAdaptiveKernelView { using IndexType = Index; using DeviceType = Device; using ViewType = CSRAdaptiveKernelView< Index, Device >; using ConstViewType = CSRAdaptiveKernelView< Index, Device >; using BlocksType = TNL::Containers::Vector< details::CSRAdaptiveKernelBlockDescriptor< Index >, Device, Index >; using BlocksView = typename BlocksType::ViewType; CSRAdaptiveKernelView() = default; CSRAdaptiveKernelView( BlocksType& blocks ) { this->blocks.bind( blocks ); }; void setBlocks( BlocksType& blocks ) { this->blocks.bind( blocks ); } ViewType getView() { return *this; }; ConstViewType getConstView() const { return *this; }; static TNL::String getKernelType() { return "Adaptive"; }; template< typename OffsetsView, typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args > void segmentsReduction( const OffsetsView& offsets, Index first, Index last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const { #ifdef HAVE_CUDA if( details::CheckFetchLambda< Index, Fetch >::hasAllParameters() ) { TNL::Algorithms::Segments::CSRScalarKernel< Index, Device >:: segmentsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... ); return; } static constexpr Index THREADS_ADAPTIVE = sizeof(Index) == 8 ? 128 : 256; //static constexpr Index THREADS_SCALAR = 128; //static constexpr Index THREADS_VECTOR = 128; //static constexpr Index THREADS_LIGHT = 128; /* Max length of row to process one warp for CSR Light, MultiVector */ //static constexpr Index MAX_ELEMENTS_PER_WARP = 384; /* Max length of row to process one warp for CSR Adaptive */ static constexpr Index MAX_ELEMENTS_PER_WARP_ADAPT = 512; /* How many shared memory use per block in CSR Adaptive kernel */ static constexpr Index SHARED_PER_BLOCK = 24576; /* Number of elements in shared memory */ static constexpr Index SHARED = SHARED_PER_BLOCK/sizeof(Real); /* Number of warps in block for CSR Adaptive */ static constexpr Index WARPS = THREADS_ADAPTIVE / 32; /* Number of elements in shared memory per one warp */ static constexpr Index SHARED_PER_WARP = SHARED / WARPS; constexpr int warpSize = 32; Index blocksCount; const Index threads = THREADS_ADAPTIVE; constexpr size_t MAX_X_DIM = 2147483647; /* Fill blocks */ size_t neededThreads = this->blocks.getSize() * warpSize; // one warp per block /* Execute kernels on device */ for (Index gridIdx = 0; neededThreads != 0; gridIdx++ ) { if (MAX_X_DIM * threads >= neededThreads) { blocksCount = roundUpDivision(neededThreads, threads); neededThreads = 0; } else { blocksCount = MAX_X_DIM; neededThreads -= MAX_X_DIM * threads; } segmentsReductionCSRAdaptiveKernel< THREADS_ADAPTIVE, warpSize, WARPS, SHARED_PER_WARP, MAX_ELEMENTS_PER_WARP_ADAPT, BlocksView, OffsetsView, Index, Fetch, Reduction, ResultKeeper, Real, Args... > <<<blocksCount, threads>>>( this->blocks, gridIdx, offsets, first, last, fetch, reduction, keeper, zero, args... ); } #endif } CSRAdaptiveKernelView& operator=( const CSRAdaptiveKernelView< Index, Device >& kernelView ) { this->blocks.bind( kernelView.blocks ); return *this; } void printBlocks() const { for( Index i = 0; i < this->blocks.getSize(); i++ ) { auto block = blocks.getElement( i ); std::cout << "Block " << i << " : " << block << std::endl; } } protected: BlocksView blocks; }; template< typename Index, typename Device > Loading
src/TNL/Algorithms/Segments/CSRAdaptiveKernel.hpp +1 −170 Original line number Diff line number Diff line /*************************************************************************** CSRAdaptiveKernel.h - description CSRAdaptiveKernel.hpp - description ------------------- begin : Feb 7, 2021 -> Joe Biden inauguration copyright : (C) 2021 by Tomas Oberhuber Loading @@ -22,175 +22,6 @@ namespace TNL { namespace Algorithms { namespace Segments { #ifdef HAVE_CUDA template< int CudaBlockSize, int warpSize, int WARPS, int SHARED_PER_WARP, int MAX_ELEM_PER_WARP, typename BlocksView, typename Offsets, typename Index, typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args > __global__ void segmentsReductionCSRAdaptiveKernel( BlocksView blocks, int gridIdx, Offsets offsets, Index first, Index last, Fetch fetch, Reduction reduce, ResultKeeper keep, Real zero, Args... args ) { __shared__ Real streamShared[ WARPS ][ SHARED_PER_WARP ]; __shared__ Real multivectorShared[ CudaBlockSize / warpSize ]; constexpr size_t MAX_X_DIM = 2147483647; const Index index = (gridIdx * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x; const Index blockIdx = index / warpSize; if( blockIdx >= blocks.getSize() - 1 ) return; if( threadIdx.x < CudaBlockSize / warpSize ) multivectorShared[ threadIdx.x ] = zero; Real result = zero; bool compute( true ); const Index laneIdx = threadIdx.x & 31; // & is cheaper than % const details::CSRAdaptiveKernelBlockDescriptor< Index > block = blocks[ blockIdx ]; const Index& firstSegmentIdx = block.getFirstSegment(); const Index begin = offsets[ firstSegmentIdx ]; const auto blockType = block.getType(); if( blockType == details::Type::STREAM ) // Stream kernel - many short segments per warp { const Index warpIdx = threadIdx.x / 32; const Index end = begin + block.getSize(); // Stream data to shared memory for( Index globalIdx = laneIdx + begin; globalIdx < end; globalIdx += warpSize ) { streamShared[ warpIdx ][ globalIdx - begin ] = //fetch( globalIdx, compute ); details::FetchLambdaAdapter< Index, Fetch >::call( fetch, -1, -1, globalIdx, compute ); // TODO:: fix this by template specialization so that we can assume fetch lambda // with short parameters } const Index lastSegmentIdx = firstSegmentIdx + block.getSegmentsInBlock(); for( Index i = firstSegmentIdx + laneIdx; i < lastSegmentIdx; i += warpSize ) { const Index sharedEnd = offsets[ i + 1 ] - begin; // end of preprocessed data result = zero; // Scalar reduction for( Index sharedIdx = offsets[ i ] - begin; sharedIdx < sharedEnd; sharedIdx++ ) result = reduce( result, streamShared[ warpIdx ][ sharedIdx ] ); keep( i, result ); } } else if( blockType == details::Type::VECTOR ) // Vector kernel - one segment per warp { const Index end = begin + block.getSize(); const Index segmentIdx = block.getFirstSegment(); for( Index globalIdx = begin + laneIdx; globalIdx < end; globalIdx += warpSize ) result = reduce( result, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, -1, globalIdx, compute ) ); // fix local idx // Parallel reduction result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 16 ) ); result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 8 ) ); result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 4 ) ); result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 2 ) ); result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 1 ) ); if( laneIdx == 0 ) keep( segmentIdx, result ); } else // blockType == Type::LONG - several warps per segment { // Number of elements processed by previous warps //const Index offset = //block.index[1] * MAX_ELEM_PER_WARP; /// block.getWarpIdx() * MAX_ELEM_PER_WARP; //Index to = begin + (block.getWarpIdx() + 1) * MAX_ELEM_PER_WARP; const Index segmentIdx = block.getFirstSegment();//block.index[0]; //minID = offsets[block.index[0] ]; const Index end = offsets[segmentIdx + 1]; //const int tid = threadIdx.x; //const int inBlockWarpIdx = block.getWarpIdx(); //if( to > end ) // to = end; TNL_ASSERT_GT( block.getWarpsCount(), 0, "" ); result = zero; //printf( "LONG tid %d warpIdx %d: LONG \n", tid, block.getWarpIdx() ); for( Index globalIdx = begin + laneIdx + TNL::Cuda::getWarpSize() * block.getWarpIdx(); globalIdx < end; globalIdx += TNL::Cuda::getWarpSize() * block.getWarpsCount() ) { result = reduce( result, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, -1, globalIdx, compute ) ); //if( laneIdx == 0 ) // printf( "LONG warpIdx: %d gid: %d begin: %d end: %d -> %d \n", ( int ) block.getWarpIdx(), globalIdx, begin, end, // details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, 0, globalIdx, compute ) ); //result += values[i] * inVector[columnIndexes[i]]; } //printf( "tid %d -> %d \n", tid, result ); result += __shfl_down_sync(0xFFFFFFFF, result, 16); result += __shfl_down_sync(0xFFFFFFFF, result, 8); result += __shfl_down_sync(0xFFFFFFFF, result, 4); result += __shfl_down_sync(0xFFFFFFFF, result, 2); result += __shfl_down_sync(0xFFFFFFFF, result, 1); //if( laneIdx == 0 ) // printf( "WARP RESULT: tid %d -> %d \n", tid, result ); const Index warpID = threadIdx.x / 32; if( laneIdx == 0 ) multivectorShared[ warpID ] = result; __syncthreads(); // Reduction in multivectorShared if( block.getWarpIdx() == 0 && laneIdx < 16 ) { constexpr int totalWarps = CudaBlockSize / warpSize; if( totalWarps >= 32 ) { multivectorShared[ laneIdx ] = reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx + 16 ] ); __syncwarp(); } if( totalWarps >= 16 ) { multivectorShared[ laneIdx ] = reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx + 8 ] ); __syncwarp(); } if( totalWarps >= 8 ) { multivectorShared[ laneIdx ] = reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx + 4 ] ); __syncwarp(); } if( totalWarps >= 4 ) { multivectorShared[ laneIdx ] = reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx + 2 ] ); __syncwarp(); } if( totalWarps >= 2 ) { multivectorShared[ laneIdx ] = reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx + 1 ] ); __syncwarp(); } if( laneIdx == 0 ) { //printf( "Long: segmentIdx %d -> %d \n", segmentIdx, multivectorShared[ 0 ] ); keep( segmentIdx, multivectorShared[ 0 ] ); } } } } #endif } // namespace Segments } // namespace Algorithms Loading
src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.h 0 → 100644 +70 −0 Original line number Diff line number Diff line /*************************************************************************** CSRAdaptiveKernelView.h - description ------------------- begin : Feb 7, 2021 -> Joe Biden inauguration copyright : (C) 2021 by Tomas Oberhuber email : tomas.oberhuber@fjfi.cvut.cz ***************************************************************************/ /* See Copyright Notice in tnl/Copyright */ #pragma once #include <TNL/Containers/Vector.h> #include <TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h> namespace TNL { namespace Algorithms { namespace Segments { template< typename Index, typename Device > struct CSRAdaptiveKernelView { using IndexType = Index; using DeviceType = Device; using ViewType = CSRAdaptiveKernelView< Index, Device >; using ConstViewType = CSRAdaptiveKernelView< Index, Device >; using BlocksType = TNL::Containers::Vector< details::CSRAdaptiveKernelBlockDescriptor< Index >, Device, Index >; using BlocksView = typename BlocksType::ViewType; CSRAdaptiveKernelView() = default; CSRAdaptiveKernelView( BlocksType& blocks ); void setBlocks( BlocksType& blocks ); ViewType getView(); ConstViewType getConstView() const; static TNL::String getKernelType(); template< typename OffsetsView, typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args > void segmentsReduction( const OffsetsView& offsets, Index first, Index last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const; CSRAdaptiveKernelView& operator=( const CSRAdaptiveKernelView< Index, Device >& kernelView ); void printBlocks() const; protected: BlocksView blocks; }; } // namespace Segments } // namespace Algorithms } // namespace TNL #include <TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp> No newline at end of file
src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp 0 → 100644 +363 −0 File added.Preview size limit exceeded, changes collapsed. Show changes