diff --git a/src/TNL/Algorithms/Segments/CSRAdaptiveKernel.h b/src/TNL/Algorithms/Segments/CSRAdaptiveKernel.h index 0336b06e1f033e0487f3fbaed65053b8d1bb10b8..899e22ff9af27abb66cb957c6e7b68c5cb2f65ca 100644 --- a/src/TNL/Algorithms/Segments/CSRAdaptiveKernel.h +++ b/src/TNL/Algorithms/Segments/CSRAdaptiveKernel.h @@ -16,6 +16,7 @@ #include <TNL/Algorithms/ParallelFor.h> #include <TNL/Algorithms/Segments/details/LambdaAdapter.h> #include <TNL/Algorithms/Segments/CSRScalarKernel.h> +#include <TNL/Algorithms/Segments/CSRAdaptiveKernelView.h> #include <TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h> namespace TNL { @@ -50,150 +51,6 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks, Args... args ); #endif -template< typename Index, - typename Device > -struct CSRAdaptiveKernelView -{ - using IndexType = Index; - using DeviceType = Device; - using ViewType = CSRAdaptiveKernelView< Index, Device >; - using ConstViewType = CSRAdaptiveKernelView< Index, Device >; - using BlocksType = TNL::Containers::Vector< details::CSRAdaptiveKernelBlockDescriptor< Index >, Device, Index >; - using BlocksView = typename BlocksType::ViewType; - - CSRAdaptiveKernelView() = default; - - CSRAdaptiveKernelView( BlocksType& blocks ) - { - this->blocks.bind( blocks ); - }; - - void setBlocks( BlocksType& blocks ) - { - this->blocks.bind( blocks ); - } - - ViewType getView() { return *this; }; - - ConstViewType getConstView() const { return *this; }; - - static TNL::String getKernelType() - { - return "Adaptive"; - }; - - template< typename OffsetsView, - typename Fetch, - typename Reduction, - typename ResultKeeper, - typename Real, - typename... Args > - void segmentsReduction( const OffsetsView& offsets, - Index first, - Index last, - Fetch& fetch, - const Reduction& reduction, - ResultKeeper& keeper, - const Real& zero, - Args... args ) const - { -#ifdef HAVE_CUDA - if( details::CheckFetchLambda< Index, Fetch >::hasAllParameters() ) - { - TNL::Algorithms::Segments::CSRScalarKernel< Index, Device >:: - segmentsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... ); - return; - } - - static constexpr Index THREADS_ADAPTIVE = sizeof(Index) == 8 ? 
128 : 256; - //static constexpr Index THREADS_SCALAR = 128; - //static constexpr Index THREADS_VECTOR = 128; - //static constexpr Index THREADS_LIGHT = 128; - - /* Max length of row to process one warp for CSR Light, MultiVector */ - //static constexpr Index MAX_ELEMENTS_PER_WARP = 384; - - /* Max length of row to process one warp for CSR Adaptive */ - static constexpr Index MAX_ELEMENTS_PER_WARP_ADAPT = 512; - - /* How many shared memory use per block in CSR Adaptive kernel */ - static constexpr Index SHARED_PER_BLOCK = 24576; - - /* Number of elements in shared memory */ - static constexpr Index SHARED = SHARED_PER_BLOCK/sizeof(Real); - - /* Number of warps in block for CSR Adaptive */ - static constexpr Index WARPS = THREADS_ADAPTIVE / 32; - - /* Number of elements in shared memory per one warp */ - static constexpr Index SHARED_PER_WARP = SHARED / WARPS; - - constexpr int warpSize = 32; - - Index blocksCount; - - const Index threads = THREADS_ADAPTIVE; - constexpr size_t MAX_X_DIM = 2147483647; - - /* Fill blocks */ - size_t neededThreads = this->blocks.getSize() * warpSize; // one warp per block - /* Execute kernels on device */ - for (Index gridIdx = 0; neededThreads != 0; gridIdx++ ) - { - if (MAX_X_DIM * threads >= neededThreads) - { - blocksCount = roundUpDivision(neededThreads, threads); - neededThreads = 0; - } - else - { - blocksCount = MAX_X_DIM; - neededThreads -= MAX_X_DIM * threads; - } - - segmentsReductionCSRAdaptiveKernel< - THREADS_ADAPTIVE, - warpSize, - WARPS, - SHARED_PER_WARP, - MAX_ELEMENTS_PER_WARP_ADAPT, - BlocksView, - OffsetsView, - Index, Fetch, Reduction, ResultKeeper, Real, Args... > - <<<blocksCount, threads>>>( - this->blocks, - gridIdx, - offsets, - first, - last, - fetch, - reduction, - keeper, - zero, - args... ); - } -#endif - } - - CSRAdaptiveKernelView& operator=( const CSRAdaptiveKernelView< Index, Device >& kernelView ) - { - this->blocks.bind( kernelView.blocks ); - return *this; - } - - void printBlocks() const - { - for( Index i = 0; i < this->blocks.getSize(); i++ ) - { - auto block = blocks.getElement( i ); - std::cout << "Block " << i << " : " << block << std::endl; - } - - } - - protected: - BlocksView blocks; -}; template< typename Index, typename Device > diff --git a/src/TNL/Algorithms/Segments/CSRAdaptiveKernel.hpp b/src/TNL/Algorithms/Segments/CSRAdaptiveKernel.hpp index 1557628b8ed71bdac70a42d2483f004fb693d6fc..b795a52f57603615f0ef6c52cae072ce048940d6 100644 --- a/src/TNL/Algorithms/Segments/CSRAdaptiveKernel.hpp +++ b/src/TNL/Algorithms/Segments/CSRAdaptiveKernel.hpp @@ -1,5 +1,5 @@ /*************************************************************************** - CSRAdaptiveKernel.h - description + CSRAdaptiveKernel.hpp - description ------------------- begin : Feb 7, 2021 -> Joe Biden inauguration copyright : (C) 2021 by Tomas Oberhuber @@ -22,175 +22,6 @@ namespace TNL { namespace Algorithms { namespace Segments { -#ifdef HAVE_CUDA - -template< int CudaBlockSize, - int warpSize, - int WARPS, - int SHARED_PER_WARP, - int MAX_ELEM_PER_WARP, - typename BlocksView, - typename Offsets, - typename Index, - typename Fetch, - typename Reduction, - typename ResultKeeper, - typename Real, - typename... Args > -__global__ void -segmentsReductionCSRAdaptiveKernel( BlocksView blocks, - int gridIdx, - Offsets offsets, - Index first, - Index last, - Fetch fetch, - Reduction reduce, - ResultKeeper keep, - Real zero, - Args... 
args ) -{ - __shared__ Real streamShared[ WARPS ][ SHARED_PER_WARP ]; - __shared__ Real multivectorShared[ CudaBlockSize / warpSize ]; - constexpr size_t MAX_X_DIM = 2147483647; - const Index index = (gridIdx * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x; - const Index blockIdx = index / warpSize; - if( blockIdx >= blocks.getSize() - 1 ) - return; - - if( threadIdx.x < CudaBlockSize / warpSize ) - multivectorShared[ threadIdx.x ] = zero; - Real result = zero; - bool compute( true ); - const Index laneIdx = threadIdx.x & 31; // & is cheaper than % - const details::CSRAdaptiveKernelBlockDescriptor< Index > block = blocks[ blockIdx ]; - const Index& firstSegmentIdx = block.getFirstSegment(); - const Index begin = offsets[ firstSegmentIdx ]; - - const auto blockType = block.getType(); - if( blockType == details::Type::STREAM ) // Stream kernel - many short segments per warp - { - const Index warpIdx = threadIdx.x / 32; - const Index end = begin + block.getSize(); - - // Stream data to shared memory - for( Index globalIdx = laneIdx + begin; globalIdx < end; globalIdx += warpSize ) - { - streamShared[ warpIdx ][ globalIdx - begin ] = //fetch( globalIdx, compute ); - details::FetchLambdaAdapter< Index, Fetch >::call( fetch, -1, -1, globalIdx, compute ); - // TODO:: fix this by template specialization so that we can assume fetch lambda - // with short parameters - } - - const Index lastSegmentIdx = firstSegmentIdx + block.getSegmentsInBlock(); - - for( Index i = firstSegmentIdx + laneIdx; i < lastSegmentIdx; i += warpSize ) - { - const Index sharedEnd = offsets[ i + 1 ] - begin; // end of preprocessed data - result = zero; - // Scalar reduction - for( Index sharedIdx = offsets[ i ] - begin; sharedIdx < sharedEnd; sharedIdx++ ) - result = reduce( result, streamShared[ warpIdx ][ sharedIdx ] ); - keep( i, result ); - } - } - else if( blockType == details::Type::VECTOR ) // Vector kernel - one segment per warp - { - const Index end = begin + block.getSize(); - const Index segmentIdx = block.getFirstSegment(); - - for( Index globalIdx = begin + laneIdx; globalIdx < end; globalIdx += warpSize ) - result = reduce( result, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, -1, globalIdx, compute ) ); // fix local idx - - // Parallel reduction - result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 16 ) ); - result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 8 ) ); - result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 4 ) ); - result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 2 ) ); - result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 1 ) ); - if( laneIdx == 0 ) - keep( segmentIdx, result ); - } - else // blockType == Type::LONG - several warps per segment - { - // Number of elements processed by previous warps - //const Index offset = //block.index[1] * MAX_ELEM_PER_WARP; - /// block.getWarpIdx() * MAX_ELEM_PER_WARP; - //Index to = begin + (block.getWarpIdx() + 1) * MAX_ELEM_PER_WARP; - const Index segmentIdx = block.getFirstSegment();//block.index[0]; - //minID = offsets[block.index[0] ]; - const Index end = offsets[segmentIdx + 1]; - //const int tid = threadIdx.x; - //const int inBlockWarpIdx = block.getWarpIdx(); - - //if( to > end ) - // to = end; - TNL_ASSERT_GT( block.getWarpsCount(), 0, "" ); - result = zero; - //printf( "LONG tid %d warpIdx %d: LONG \n", tid, block.getWarpIdx() ); - for( Index globalIdx = begin + laneIdx + TNL::Cuda::getWarpSize() * block.getWarpIdx(); - globalIdx < end; - globalIdx += 
TNL::Cuda::getWarpSize() * block.getWarpsCount() ) - { - result = reduce( result, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, -1, globalIdx, compute ) ); - //if( laneIdx == 0 ) - // printf( "LONG warpIdx: %d gid: %d begin: %d end: %d -> %d \n", ( int ) block.getWarpIdx(), globalIdx, begin, end, - // details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, 0, globalIdx, compute ) ); - //result += values[i] * inVector[columnIndexes[i]]; - } - //printf( "tid %d -> %d \n", tid, result ); - - result += __shfl_down_sync(0xFFFFFFFF, result, 16); - result += __shfl_down_sync(0xFFFFFFFF, result, 8); - result += __shfl_down_sync(0xFFFFFFFF, result, 4); - result += __shfl_down_sync(0xFFFFFFFF, result, 2); - result += __shfl_down_sync(0xFFFFFFFF, result, 1); - - //if( laneIdx == 0 ) - // printf( "WARP RESULT: tid %d -> %d \n", tid, result ); - - const Index warpID = threadIdx.x / 32; - if( laneIdx == 0 ) - multivectorShared[ warpID ] = result; - - __syncthreads(); - // Reduction in multivectorShared - if( block.getWarpIdx() == 0 && laneIdx < 16 ) - { - constexpr int totalWarps = CudaBlockSize / warpSize; - if( totalWarps >= 32 ) - { - multivectorShared[ laneIdx ] = reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx + 16 ] ); - __syncwarp(); - } - if( totalWarps >= 16 ) - { - multivectorShared[ laneIdx ] = reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx + 8 ] ); - __syncwarp(); - } - if( totalWarps >= 8 ) - { - multivectorShared[ laneIdx ] = reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx + 4 ] ); - __syncwarp(); - } - if( totalWarps >= 4 ) - { - multivectorShared[ laneIdx ] = reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx + 2 ] ); - __syncwarp(); - } - if( totalWarps >= 2 ) - { - multivectorShared[ laneIdx ] = reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx + 1 ] ); - __syncwarp(); - } - if( laneIdx == 0 ) - { - //printf( "Long: segmentIdx %d -> %d \n", segmentIdx, multivectorShared[ 0 ] ); - keep( segmentIdx, multivectorShared[ 0 ] ); - } - } - } -} -#endif } // namespace Segments } // namespace Algorithms diff --git a/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.h b/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.h new file mode 100644 index 0000000000000000000000000000000000000000..431b72f4e37b98cf5a00fc64fdb6ff0e7d727e9c --- /dev/null +++ b/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.h @@ -0,0 +1,70 @@ +/*************************************************************************** + CSRAdaptiveKernelView.h - description + ------------------- + begin : Feb 7, 2021 -> Joe Biden inauguration + copyright : (C) 2021 by Tomas Oberhuber + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +#pragma once + +#include <TNL/Containers/Vector.h> +#include <TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h> + +namespace TNL { + namespace Algorithms { + namespace Segments { + +template< typename Index, + typename Device > +struct CSRAdaptiveKernelView +{ + using IndexType = Index; + using DeviceType = Device; + using ViewType = CSRAdaptiveKernelView< Index, Device >; + using ConstViewType = CSRAdaptiveKernelView< Index, Device >; + using BlocksType = TNL::Containers::Vector< details::CSRAdaptiveKernelBlockDescriptor< Index >, Device, Index >; + using BlocksView = typename BlocksType::ViewType; + + CSRAdaptiveKernelView() = default; + + 
CSRAdaptiveKernelView( BlocksType& blocks ); + + void setBlocks( BlocksType& blocks ); + + ViewType getView(); + + ConstViewType getConstView() const; + + static TNL::String getKernelType(); + + template< typename OffsetsView, + typename Fetch, + typename Reduction, + typename ResultKeeper, + typename Real, + typename... Args > + void segmentsReduction( const OffsetsView& offsets, + Index first, + Index last, + Fetch& fetch, + const Reduction& reduction, + ResultKeeper& keeper, + const Real& zero, + Args... args ) const; + + CSRAdaptiveKernelView& operator=( const CSRAdaptiveKernelView< Index, Device >& kernelView ); + + void printBlocks() const; + + protected: + BlocksView blocks; +}; + + } // namespace Segments + } // namespace Algorithms +} // namespace TNL + +#include <TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp> \ No newline at end of file diff --git a/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp b/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp new file mode 100644 index 0000000000000000000000000000000000000000..e9e1badbaba8a8ab317240b99bca54ad5fc3a591 --- /dev/null +++ b/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp @@ -0,0 +1,363 @@ +/*************************************************************************** + CSRAdaptiveKernelView.hpp - description + ------------------- + begin : Feb 7, 2021 -> Joe Biden inauguration + copyright : (C) 2021 by Tomas Oberhuber + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +#pragma once + +#include <TNL/Assert.h> +#include <TNL/Cuda/LaunchHelpers.h> +#include <TNL/Containers/VectorView.h> +#include <TNL/Algorithms/ParallelFor.h> +#include <TNL/Algorithms/Segments/details/LambdaAdapter.h> +#include <TNL/Algorithms/Segments/CSRScalarKernel.h> +#include <TNL/Algorithms/Segments/CSRAdaptiveKernelView.h> +#include <TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h> + +namespace TNL { + namespace Algorithms { + namespace Segments { + +#ifdef HAVE_CUDA + +template< int CudaBlockSize, + int warpSize, + int WARPS, + int SHARED_PER_WARP, + int MAX_ELEM_PER_WARP, + typename BlocksView, + typename Offsets, + typename Index, + typename Fetch, + typename Reduction, + typename ResultKeeper, + typename Real, + typename... Args > +__global__ void +segmentsReductionCSRAdaptiveKernel( BlocksView blocks, + int gridIdx, + Offsets offsets, + Index first, + Index last, + Fetch fetch, + Reduction reduce, + ResultKeeper keep, + Real zero, + Args... 
args )
+{
+   __shared__ Real streamShared[ WARPS ][ SHARED_PER_WARP ];
+   __shared__ Real multivectorShared[ CudaBlockSize / warpSize ];
+   constexpr size_t MAX_X_DIM = 2147483647;
+   const Index index = (gridIdx * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x;
+   const Index csrBlockIdx = index / warpSize; // renamed so it does not shadow CUDA's built-in blockIdx
+   if( csrBlockIdx >= blocks.getSize() - 1 )
+      return;
+
+   if( threadIdx.x < CudaBlockSize / warpSize )
+      multivectorShared[ threadIdx.x ] = zero;
+   Real result = zero;
+   bool compute( true );
+   const Index laneIdx = threadIdx.x & 31; // & is cheaper than %
+   const details::CSRAdaptiveKernelBlockDescriptor< Index > block = blocks[ csrBlockIdx ];
+   const Index& firstSegmentIdx = block.getFirstSegment();
+   const Index begin = offsets[ firstSegmentIdx ];
+
+   const auto blockType = block.getType();
+   if( blockType == details::Type::STREAM ) // Stream kernel - many short segments per warp
+   {
+      const Index warpIdx = threadIdx.x / 32;
+      const Index end = begin + block.getSize();
+
+      // Stream data to shared memory
+      for( Index globalIdx = laneIdx + begin; globalIdx < end; globalIdx += warpSize )
+      {
+         streamShared[ warpIdx ][ globalIdx - begin ] = //fetch( globalIdx, compute );
+            details::FetchLambdaAdapter< Index, Fetch >::call( fetch, -1, -1, globalIdx, compute );
+         // TODO: fix this by template specialization so that we can assume fetch lambda
+         // with short parameters
+      }
+
+      const Index lastSegmentIdx = firstSegmentIdx + block.getSegmentsInBlock();
+
+      for( Index i = firstSegmentIdx + laneIdx; i < lastSegmentIdx; i += warpSize )
+      {
+         const Index sharedEnd = offsets[ i + 1 ] - begin; // end of preprocessed data
+         result = zero;
+         // Scalar reduction
+         for( Index sharedIdx = offsets[ i ] - begin; sharedIdx < sharedEnd; sharedIdx++ )
+            result = reduce( result, streamShared[ warpIdx ][ sharedIdx ] );
+         keep( i, result );
+      }
+   }
+   else if( blockType == details::Type::VECTOR ) // Vector kernel - one segment per warp
+   {
+      const Index end = begin + block.getSize();
+      const Index segmentIdx = block.getFirstSegment();
+
+      for( Index globalIdx = begin + laneIdx; globalIdx < end; globalIdx += warpSize )
+         result = reduce( result, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, -1, globalIdx, compute ) ); // fix local idx
+
+      // Parallel reduction
+      result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 16 ) );
+      result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 8 ) );
+      result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 4 ) );
+      result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 2 ) );
+      result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 1 ) );
+      if( laneIdx == 0 )
+         keep( segmentIdx, result );
+   }
+   else // blockType == Type::LONG - several warps per segment
+   {
+      const Index segmentIdx = block.getFirstSegment();
+      const Index end = offsets[ segmentIdx + 1 ];
+
+      TNL_ASSERT_GT( block.getWarpsCount(), 0, "" );
+      result = zero;
+      for( Index globalIdx = begin + laneIdx + TNL::Cuda::getWarpSize() * block.getWarpIdx();
+           globalIdx < end;
+           globalIdx += TNL::Cuda::getWarpSize() * block.getWarpsCount() )
+      {
+         result = reduce( result, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, -1, globalIdx, compute ) );
+      }
+
+      // Warp-level reduction via the user-supplied reduction operation -
+      // a plain += would be correct only when the reduction is a sum
+      result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 16 ) );
+      result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 8 ) );
+      result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 4 ) );
+      result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 2 ) );
+      result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 1 ) );
+
+      const Index warpID = threadIdx.x / 32;
+      if( laneIdx == 0 )
+         multivectorShared[ warpID ] = result;
+
+      __syncthreads();
+      // Reduction of the per-warp results in multivectorShared. The whole
+      // first warp participates so that every lane reaches __syncwarp(),
+      // and the laneIdx < s guard keeps all accesses within the bounds of
+      // the multivectorShared array.
+      if( block.getWarpIdx() == 0 )
+      {
+         constexpr int totalWarps = CudaBlockSize / warpSize;
+         for( int s = totalWarps / 2; s > 0; s /= 2 )
+         {
+            if( laneIdx < s )
+               multivectorShared[ laneIdx ] = reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx + s ] );
+            __syncwarp();
+         }
+         if( laneIdx == 0 )
+            keep( segmentIdx, multivectorShared[ 0 ] );
+      }
+   }
+}
+#endif
+
+template< typename Index,
+          typename Device >
+CSRAdaptiveKernelView< Index, Device >::
+CSRAdaptiveKernelView( BlocksType& blocks )
+{
+   this->blocks.bind( blocks );
+}
+
+template< typename Index,
+          typename Device >
+void
+CSRAdaptiveKernelView< Index, Device >::
+setBlocks( BlocksType& blocks )
+{
+   this->blocks.bind( blocks );
+}
+
+template< typename Index,
+          typename Device >
+auto
+CSRAdaptiveKernelView< Index, Device >::
+getView() -> ViewType
+{
+   return *this;
+}
+
+template< typename Index,
+          typename Device >
+auto
+CSRAdaptiveKernelView< Index, Device >::
+getConstView() const -> ConstViewType
+{
+   return *this;
+}
+
+template< typename Index,
+          typename Device >
+TNL::String
+CSRAdaptiveKernelView< Index, Device >::
+getKernelType()
+{
+   return "Adaptive";
+}
+
+template< typename Index,
+          typename Device >
+   template< typename OffsetsView,
+             typename Fetch,
+             typename Reduction,
+             typename ResultKeeper,
+             typename Real,
+             typename... Args >
+void
+CSRAdaptiveKernelView< Index, Device >::
+segmentsReduction( const OffsetsView& offsets,
+                   Index first,
+                   Index last,
+                   Fetch& fetch,
+                   const Reduction& reduction,
+                   ResultKeeper& keeper,
+                   const Real& zero,
+                   Args... args ) const
+{
+#ifdef HAVE_CUDA
+   if( details::CheckFetchLambda< Index, Fetch >::hasAllParameters() )
+   {
+      // A fetch lambda taking all parameters needs segment-local indices,
+      // which the adaptive kernel cannot supply - fall back to the scalar kernel.
+      TNL::Algorithms::Segments::CSRScalarKernel< Index, Device >::
+         segmentsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... );
+      return;
+   }
+
+   static constexpr Index THREADS_ADAPTIVE = sizeof(Index) == 8 ? 128 : 256;
+
+   /* Max. number of elements of a segment processed by one warp in the CSR Adaptive kernel */
+   static constexpr Index MAX_ELEMENTS_PER_WARP_ADAPT = 512;
+
+   /* Amount of shared memory (in bytes) used per CUDA block by the CSR Adaptive kernel */
+   static constexpr Index SHARED_PER_BLOCK = 24576;
+
+   /* Number of elements of type Real that fit into the shared memory */
+   static constexpr Index SHARED = SHARED_PER_BLOCK / sizeof( Real );
+
+   /* Number of warps per CUDA block in the CSR Adaptive kernel */
+   static constexpr Index WARPS = THREADS_ADAPTIVE / 32;
+
+   /* Number of shared-memory elements available to each warp */
+   static constexpr Index SHARED_PER_WARP = SHARED / WARPS;
+
+   constexpr int warpSize = 32;
+
+   Index blocksCount;
+
+   const Index threads = THREADS_ADAPTIVE;
+   constexpr size_t MAX_X_DIM = 2147483647;
+
+   /* One warp is assigned to each block descriptor */
+   size_t neededThreads = this->blocks.getSize() * warpSize;
+   /* Launch the kernel, splitting the work into several grids when the
+      CUDA grid size limit in the x-dimension would be exceeded */
+   for( Index gridIdx = 0; neededThreads != 0; gridIdx++ )
+   {
+      if( MAX_X_DIM * threads >= neededThreads )
+      {
+         blocksCount = roundUpDivision( neededThreads, threads );
+         neededThreads = 0;
+      }
+      else
+      {
+         blocksCount = MAX_X_DIM;
+         neededThreads -= MAX_X_DIM * threads;
+      }
+
+      segmentsReductionCSRAdaptiveKernel<
+            THREADS_ADAPTIVE,
+            warpSize,
+            WARPS,
+            SHARED_PER_WARP,
+            MAX_ELEMENTS_PER_WARP_ADAPT,
+            BlocksView,
+            OffsetsView,
+            Index, Fetch, Reduction, ResultKeeper, Real, Args... >
+         <<<blocksCount, threads>>>(
+            this->blocks,
+            gridIdx,
+            offsets,
+            first,
+            last,
+            fetch,
+            reduction,
+            keeper,
+            zero,
+            args... );
+   }
+#endif
+}
+
+template< typename Index,
+          typename Device >
+CSRAdaptiveKernelView< Index, Device >&
+CSRAdaptiveKernelView< Index, Device >::
+operator=( const CSRAdaptiveKernelView< Index, Device >& kernelView )
+{
+   this->blocks.bind( kernelView.blocks );
+   return *this;
+}
+
+template< typename Index,
+          typename Device >
+void
+CSRAdaptiveKernelView< Index, Device >::
+printBlocks() const
+{
+   for( Index i = 0; i < this->blocks.getSize(); i++ )
+   {
+      auto block = blocks.getElement( i );
+      std::cout << "Block " << i << " : " << block << std::endl;
+   }
+}
+
+      } // namespace Segments
+   } // namespace Algorithms
+} // namespace TNL
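
For reference, the `fetch`/`reduce`/`keep` lambda contract expected by `segmentsReduction` is easiest to see in a small example. The sketch below is illustrative only and is not part of this patch: `values`, `rowSums`, `offsets`, `rowsCount` and `kernel` are hypothetical names standing for a CSR matrix's element array, an output vector, the row-offsets view, the number of rows, and an already initialized `CSRAdaptiveKernelView` whose block descriptors were built elsewhere. It computes per-row sums of the matrix elements.

```cpp
// Illustrative sketch, not part of this patch. Assumes device pointers
// 'values' and 'rowSums', a row-offsets view 'offsets', a row count
// 'rowsCount' and a ready CSRAdaptiveKernelView 'kernel'.
using IndexType = int;

// "Short" fetch lambda: it takes only the global element index, so
// CheckFetchLambda::hasAllParameters() is false and the adaptive path
// is taken instead of the CSRScalarKernel fallback.
auto fetch = [=] __cuda_callable__ ( IndexType globalIdx, bool& compute ) -> double
{
   return values[ globalIdx ];
};

// The reduction must be associative; it replaces the '+' of a plain sum.
auto reduce = [] __cuda_callable__ ( const double& a, const double& b ) -> double
{
   return a + b;
};

// Called once per segment (matrix row) with the fully reduced value.
auto keep = [=] __cuda_callable__ ( IndexType segmentIdx, const double& value ) mutable
{
   rowSums[ segmentIdx ] = value;
};

kernel.segmentsReduction( offsets, 0, rowsCount, fetch, reduce, keep, 0.0 );
```

A fetch lambda with the full `( segmentIdx, localIdx, globalIdx, compute )` parameter list is instead forwarded to `CSRScalarKernel`, because the STREAM branch copies raw elements to shared memory and cannot supply segment-local indices (see the TODO in the kernel above).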