From 46482276b60fb41514f4448aef2951718b5e1696 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Sun, 7 Feb 2021 20:48:05 +0100
Subject: [PATCH] Renaming CSRKernelAdaptive to CSRAdaptiveKernel.

---
 src/TNL/Algorithms/Segments/CSR.h             |   2 +-
 ...SRKernelAdaptive.h => CSRAdaptiveKernel.h} | 171 ++-------------
 .../Algorithms/Segments/CSRAdaptiveKernel.hpp | 197 ++++++++++++++++++
 src/TNL/Algorithms/Segments/CSRView.h         |   4 +-
 4 files changed, 215 insertions(+), 159 deletions(-)
 rename src/TNL/Algorithms/Segments/{CSRKernelAdaptive.h => CSRAdaptiveKernel.h} (59%)
 create mode 100644 src/TNL/Algorithms/Segments/CSRAdaptiveKernel.hpp

diff --git a/src/TNL/Algorithms/Segments/CSR.h b/src/TNL/Algorithms/Segments/CSR.h
index 188960b6d4..394d4dbade 100644
--- a/src/TNL/Algorithms/Segments/CSR.h
+++ b/src/TNL/Algorithms/Segments/CSR.h
@@ -155,7 +155,7 @@ using CSRHybrid = CSR< Device, Index, CSRHybridKernel< Index, Device >, IndexAll
 template< typename Device,
           typename Index,
           typename IndexAllocator = typename Allocators::Default< Device >::template Allocator< Index > >
-using CSRAdaptive = CSR< Device, Index, CSRKernelAdaptive< Index, Device >, IndexAllocator >;
+using CSRAdaptive = CSR< Device, Index, CSRAdaptiveKernel< Index, Device >, IndexAllocator >;

 template< typename Device,
           typename Index,
diff --git a/src/TNL/Algorithms/Segments/CSRKernelAdaptive.h b/src/TNL/Algorithms/Segments/CSRAdaptiveKernel.h
similarity index 59%
rename from src/TNL/Algorithms/Segments/CSRKernelAdaptive.h
rename to src/TNL/Algorithms/Segments/CSRAdaptiveKernel.h
index 84f1cc4376..0336b06e1f 100644
--- a/src/TNL/Algorithms/Segments/CSRKernelAdaptive.h
+++ b/src/TNL/Algorithms/Segments/CSRAdaptiveKernel.h
@@ -1,5 +1,5 @@
 /***************************************************************************
-                          CSRKernels.h - description
+                          CSRAdaptiveKernel.h - description
                             -------------------
    begin                : Jan 20, 2021 -> Joe Biden inauguration
    copyright            : (C) 2021 by Tomas Oberhuber
@@ -15,7 +15,7 @@
 #include <TNL/Containers/VectorView.h>
 #include <TNL/Algorithms/ParallelFor.h>
 #include <TNL/Algorithms/Segments/details/LambdaAdapter.h>
-#include <TNL/Algorithms/Segments/CSRKernelScalar.h>
+#include <TNL/Algorithms/Segments/CSRScalarKernel.h>
 #include <TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h>

 namespace TNL {
@@ -47,166 +47,23 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
                                     Reduction reduce,
                                     ResultKeeper keep,
                                     Real zero,
-                                    Args... args )
-{
-   __shared__ Real streamShared[ WARPS ][ SHARED_PER_WARP ];
-   __shared__ Real multivectorShared[ CudaBlockSize / warpSize ];
-   constexpr size_t MAX_X_DIM = 2147483647;
-   const Index index = (gridIdx * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x;
-   const Index blockIdx = index / warpSize;
-   if( blockIdx >= blocks.getSize() - 1 )
-      return;
-
-   if( threadIdx.x < CudaBlockSize / warpSize )
-      multivectorShared[ threadIdx.x ] = zero;
-   Real result = zero;
-   bool compute( true );
-   const Index laneIdx = threadIdx.x & 31; // & is cheaper than %
-   const details::CSRAdaptiveKernelBlockDescriptor< Index > block = blocks[ blockIdx ];
-   const Index& firstSegmentIdx = block.getFirstSegment();
-   const Index begin = offsets[ firstSegmentIdx ];
-
-   const auto blockType = block.getType();
-   if( blockType == details::Type::STREAM ) // Stream kernel - many short segments per warp
-   {
-      const Index warpIdx = threadIdx.x / 32;
-      const Index end = begin + block.getSize();
-
-      // Stream data to shared memory
-      for( Index globalIdx = laneIdx + begin; globalIdx < end; globalIdx += warpSize )
-      {
-         streamShared[ warpIdx ][ globalIdx - begin ] = //fetch( globalIdx, compute );
-            details::FetchLambdaAdapter< Index, Fetch >::call( fetch, -1, -1, globalIdx, compute );
-         // TODO:: fix this by template specialization so that we can assume fetch lambda
-         // with short parameters
-      }
-
-      const Index lastSegmentIdx = firstSegmentIdx + block.getSegmentsInBlock();
-
-      for( Index i = firstSegmentIdx + laneIdx; i < lastSegmentIdx; i += warpSize )
-      {
-         const Index sharedEnd = offsets[ i + 1 ] - begin; // end of preprocessed data
-         result = zero;
-         // Scalar reduction
-         for( Index sharedIdx = offsets[ i ] - begin; sharedIdx < sharedEnd; sharedIdx++ )
-            result = reduce( result, streamShared[ warpIdx ][ sharedIdx ] );
-         keep( i, result );
-      }
-   }
-   else if( blockType == details::Type::VECTOR ) // Vector kernel - one segment per warp
-   {
-      const Index end = begin + block.getSize();
-      const Index segmentIdx = block.getFirstSegment();
-
-      for( Index globalIdx = begin + laneIdx; globalIdx < end; globalIdx += warpSize )
-         result = reduce( result, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, -1, globalIdx, compute ) ); // fix local idx
-
-      // Parallel reduction
-      result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 16 ) );
-      result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 8 ) );
-      result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 4 ) );
-      result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 2 ) );
-      result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 1 ) );
-      if( laneIdx == 0 )
-         keep( segmentIdx, result );
-   }
-   else // blockType == Type::LONG - several warps per segment
-   {
-      // Number of elements processed by previous warps
-      //const Index offset = //block.index[1] * MAX_ELEM_PER_WARP;
-      ///   block.getWarpIdx() * MAX_ELEM_PER_WARP;
-      //Index to = begin + (block.getWarpIdx() + 1) * MAX_ELEM_PER_WARP;
-      const Index segmentIdx = block.getFirstSegment();//block.index[0];
-      //minID = offsets[block.index[0] ];
-      const Index end = offsets[segmentIdx + 1];
-      //const int tid = threadIdx.x;
-      //const int inBlockWarpIdx = block.getWarpIdx();
-
-      //if( to > end )
-      //   to = end;
-      TNL_ASSERT_GT( block.getWarpsCount(), 0, "" );
-      result = zero;
-      //printf( "LONG tid %d warpIdx %d: LONG \n", tid, block.getWarpIdx() );
-      for( Index globalIdx = begin + laneIdx + TNL::Cuda::getWarpSize() * block.getWarpIdx();
-           globalIdx < end;
-           globalIdx += TNL::Cuda::getWarpSize() * block.getWarpsCount() )
-      {
-         result = reduce( result, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, -1, globalIdx, compute ) );
-         //if( laneIdx == 0 )
-         //   printf( "LONG warpIdx: %d gid: %d begin: %d end: %d -> %d \n", ( int ) block.getWarpIdx(), globalIdx, begin, end,
-         //      details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, 0, globalIdx, compute ) );
-         //result += values[i] * inVector[columnIndexes[i]];
-      }
-      //printf( "tid %d -> %d \n", tid, result );
-
-      result += __shfl_down_sync(0xFFFFFFFF, result, 16);
-      result += __shfl_down_sync(0xFFFFFFFF, result, 8);
-      result += __shfl_down_sync(0xFFFFFFFF, result, 4);
-      result += __shfl_down_sync(0xFFFFFFFF, result, 2);
-      result += __shfl_down_sync(0xFFFFFFFF, result, 1);
-
-      //if( laneIdx == 0 )
-      //   printf( "WARP RESULT: tid %d -> %d \n", tid, result );
-
-      const Index warpID = threadIdx.x / 32;
-      if( laneIdx == 0 )
-         multivectorShared[ warpID ] = result;
-
-      __syncthreads();
-      // Reduction in multivectorShared
-      if( block.getWarpIdx() == 0 && laneIdx < 16 )
-      {
-         constexpr int totalWarps = CudaBlockSize / warpSize;
-         if( totalWarps >= 32 )
-         {
-            multivectorShared[ laneIdx ] = reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx + 16 ] );
-            __syncwarp();
-         }
-         if( totalWarps >= 16 )
-         {
-            multivectorShared[ laneIdx ] = reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx + 8 ] );
-            __syncwarp();
-         }
-         if( totalWarps >= 8 )
-         {
-            multivectorShared[ laneIdx ] = reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx + 4 ] );
-            __syncwarp();
-         }
-         if( totalWarps >= 4 )
-         {
-            multivectorShared[ laneIdx ] = reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx + 2 ] );
-            __syncwarp();
-         }
-         if( totalWarps >= 2 )
-         {
-            multivectorShared[ laneIdx ] = reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx + 1 ] );
-            __syncwarp();
-         }
-         if( laneIdx == 0 )
-         {
-            //printf( "Long: segmentIdx %d -> %d \n", segmentIdx, multivectorShared[ 0 ] );
-            keep( segmentIdx, multivectorShared[ 0 ] );
-         }
-      }
-   }
-}
+                                    Args... args );
 #endif

-
 template< typename Index,
           typename Device >
-struct CSRKernelAdaptiveView
+struct CSRAdaptiveKernelView
 {
    using IndexType = Index;
    using DeviceType = Device;
-   using ViewType = CSRKernelAdaptiveView< Index, Device >;
-   using ConstViewType = CSRKernelAdaptiveView< Index, Device >;
+   using ViewType = CSRAdaptiveKernelView< Index, Device >;
+   using ConstViewType = CSRAdaptiveKernelView< Index, Device >;
    using BlocksType = TNL::Containers::Vector< details::CSRAdaptiveKernelBlockDescriptor< Index >, Device, Index >;
    using BlocksView = typename BlocksType::ViewType;

-   CSRKernelAdaptiveView() = default;
+   CSRAdaptiveKernelView() = default;

-   CSRKernelAdaptiveView( BlocksType& blocks )
+   CSRAdaptiveKernelView( BlocksType& blocks )
    {
       this->blocks.bind( blocks );
    };
@@ -243,7 +100,7 @@ struct CSRKernelAdaptiveView
 #ifdef HAVE_CUDA
       if( details::CheckFetchLambda< Index, Fetch >::hasAllParameters() )
      {
-         TNL::Algorithms::Segments::CSRKernelScalar< Index, Device >::
+         TNL::Algorithms::Segments::CSRScalarKernel< Index, Device >::
            segmentsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... );
         return;
      }
@@ -318,7 +175,7 @@ struct CSRKernelAdaptiveView
 #endif
    }

-   CSRKernelAdaptiveView& operator=( const CSRKernelAdaptiveView< Index, Device >& kernelView )
+   CSRAdaptiveKernelView& operator=( const CSRAdaptiveKernelView< Index, Device >& kernelView )
    {
       this->blocks.bind( kernelView.blocks );
      return *this;
   }
@@ -340,12 +197,12 @@ struct CSRKernelAdaptiveView

 template< typename Index,
           typename Device >
-struct CSRKernelAdaptive
+struct CSRAdaptiveKernel
 {
    using IndexType = Index;
    using DeviceType = Device;
-   using ViewType = CSRKernelAdaptiveView< Index, Device >;
-   using ConstViewType = CSRKernelAdaptiveView< Index, Device >;
+   using ViewType = CSRAdaptiveKernelView< Index, Device >;
+   using ConstViewType = CSRAdaptiveKernelView< Index, Device >;
    using BlocksType = typename ViewType::BlocksType;
    using BlocksView = typename BlocksType::ViewType;

@@ -485,3 +342,5 @@ struct CSRKernelAdaptive
       } // namespace Segments
    } // namespace Algorithms
 } // namespace TNL
+
+#include <TNL/Algorithms/Segments/CSRAdaptiveKernel.hpp>
diff --git a/src/TNL/Algorithms/Segments/CSRAdaptiveKernel.hpp b/src/TNL/Algorithms/Segments/CSRAdaptiveKernel.hpp
new file mode 100644
index 0000000000..1557628b8e
--- /dev/null
+++ b/src/TNL/Algorithms/Segments/CSRAdaptiveKernel.hpp
@@ -0,0 +1,197 @@
+/***************************************************************************
+                          CSRAdaptiveKernel.hpp - description
+                             -------------------
+    begin                : Feb 7, 2021 -> Joe Biden inauguration
+    copyright            : (C) 2021 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <TNL/Assert.h>
+#include <TNL/Cuda/LaunchHelpers.h>
+#include <TNL/Containers/VectorView.h>
+#include <TNL/Algorithms/ParallelFor.h>
+#include <TNL/Algorithms/Segments/details/LambdaAdapter.h>
+#include <TNL/Algorithms/Segments/CSRScalarKernel.h>
+#include <TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h>
+
+namespace TNL {
+   namespace Algorithms {
+      namespace Segments {
+
+#ifdef HAVE_CUDA
+
+template< int CudaBlockSize,
+          int warpSize,
+          int WARPS,
+          int SHARED_PER_WARP,
+          int MAX_ELEM_PER_WARP,
+          typename BlocksView,
+          typename Offsets,
+          typename Index,
+          typename Fetch,
+          typename Reduction,
+          typename ResultKeeper,
+          typename Real,
+          typename... Args >
+__global__ void
+segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
+                                    int gridIdx,
+                                    Offsets offsets,
+                                    Index first,
+                                    Index last,
+                                    Fetch fetch,
+                                    Reduction reduce,
+                                    ResultKeeper keep,
+                                    Real zero,
+                                    Args... args )
+{
+   __shared__ Real streamShared[ WARPS ][ SHARED_PER_WARP ];
+   __shared__ Real multivectorShared[ CudaBlockSize / warpSize ];
+   constexpr size_t MAX_X_DIM = 2147483647;
+   const Index index = (gridIdx * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x;
+   const Index blockIdx = index / warpSize;
+   if( blockIdx >= blocks.getSize() - 1 )
+      return;
+
+   if( threadIdx.x < CudaBlockSize / warpSize )
+      multivectorShared[ threadIdx.x ] = zero;
+   Real result = zero;
+   bool compute( true );
+   const Index laneIdx = threadIdx.x & 31; // & is cheaper than %
+   const details::CSRAdaptiveKernelBlockDescriptor< Index > block = blocks[ blockIdx ];
+   const Index& firstSegmentIdx = block.getFirstSegment();
+   const Index begin = offsets[ firstSegmentIdx ];
+
+   const auto blockType = block.getType();
+   if( blockType == details::Type::STREAM ) // Stream kernel - many short segments per warp
+   {
+      const Index warpIdx = threadIdx.x / 32;
+      const Index end = begin + block.getSize();
+
+      // Stream data to shared memory
+      for( Index globalIdx = laneIdx + begin; globalIdx < end; globalIdx += warpSize )
+      {
+         streamShared[ warpIdx ][ globalIdx - begin ] = //fetch( globalIdx, compute );
+            details::FetchLambdaAdapter< Index, Fetch >::call( fetch, -1, -1, globalIdx, compute );
+         // TODO:: fix this by template specialization so that we can assume fetch lambda
+         // with short parameters
+      }
+
+      const Index lastSegmentIdx = firstSegmentIdx + block.getSegmentsInBlock();
+
+      for( Index i = firstSegmentIdx + laneIdx; i < lastSegmentIdx; i += warpSize )
+      {
+         const Index sharedEnd = offsets[ i + 1 ] - begin; // end of preprocessed data
+         result = zero;
+         // Scalar reduction
+         for( Index sharedIdx = offsets[ i ] - begin; sharedIdx < sharedEnd; sharedIdx++ )
+            result = reduce( result, streamShared[ warpIdx ][ sharedIdx ] );
+         keep( i, result );
+      }
+   }
+   else if( blockType == details::Type::VECTOR ) // Vector kernel - one segment per warp
+   {
+      const Index end = begin + block.getSize();
+      const Index segmentIdx = block.getFirstSegment();
+
+      for( Index globalIdx = begin + laneIdx; globalIdx < end; globalIdx += warpSize )
+         result = reduce( result, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, -1, globalIdx, compute ) ); // fix local idx
+
+      // Parallel reduction
+      result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 16 ) );
+      result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 8 ) );
+      result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 4 ) );
+      result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 2 ) );
+      result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 1 ) );
+      if( laneIdx == 0 )
+         keep( segmentIdx, result );
+   }
+   else // blockType == Type::LONG - several warps per segment
+   {
+      // Number of elements processed by previous warps
+      //const Index offset = //block.index[1] * MAX_ELEM_PER_WARP;
+      ///   block.getWarpIdx() * MAX_ELEM_PER_WARP;
+      //Index to = begin + (block.getWarpIdx() + 1) * MAX_ELEM_PER_WARP;
+      const Index segmentIdx = block.getFirstSegment();//block.index[0];
+      //minID = offsets[block.index[0] ];
+      const Index end = offsets[segmentIdx + 1];
+      //const int tid = threadIdx.x;
+      //const int inBlockWarpIdx = block.getWarpIdx();
+
+      //if( to > end )
+      //   to = end;
+      TNL_ASSERT_GT( block.getWarpsCount(), 0, "" );
+      result = zero;
+      //printf( "LONG tid %d warpIdx %d: LONG \n", tid, block.getWarpIdx() );
+      for( Index globalIdx = begin + laneIdx + TNL::Cuda::getWarpSize() * block.getWarpIdx();
+           globalIdx < end;
+           globalIdx += TNL::Cuda::getWarpSize() * block.getWarpsCount() )
+      {
+         result = reduce( result, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, -1, globalIdx, compute ) );
+         //if( laneIdx == 0 )
+         //   printf( "LONG warpIdx: %d gid: %d begin: %d end: %d -> %d \n", ( int ) block.getWarpIdx(), globalIdx, begin, end,
+         //      details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, 0, globalIdx, compute ) );
+         //result += values[i] * inVector[columnIndexes[i]];
+      }
+      //printf( "tid %d -> %d \n", tid, result );
+
+      result += __shfl_down_sync(0xFFFFFFFF, result, 16);
+      result += __shfl_down_sync(0xFFFFFFFF, result, 8);
+      result += __shfl_down_sync(0xFFFFFFFF, result, 4);
+      result += __shfl_down_sync(0xFFFFFFFF, result, 2);
+      result += __shfl_down_sync(0xFFFFFFFF, result, 1);
+
+      //if( laneIdx == 0 )
+      //   printf( "WARP RESULT: tid %d -> %d \n", tid, result );
+
+      const Index warpID = threadIdx.x / 32;
+      if( laneIdx == 0 )
+         multivectorShared[ warpID ] = result;
+
+      __syncthreads();
+      // Reduction in multivectorShared
+      if( block.getWarpIdx() == 0 && laneIdx < 16 )
+      {
+         constexpr int totalWarps = CudaBlockSize / warpSize;
+         if( totalWarps >= 32 )
+         {
+            multivectorShared[ laneIdx ] = reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx + 16 ] );
+            __syncwarp();
+         }
+         if( totalWarps >= 16 )
+         {
+            multivectorShared[ laneIdx ] = reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx + 8 ] );
+            __syncwarp();
+         }
+         if( totalWarps >= 8 )
+         {
+            multivectorShared[ laneIdx ] = reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx + 4 ] );
+            __syncwarp();
+         }
+         if( totalWarps >= 4 )
+         {
+            multivectorShared[ laneIdx ] = reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx + 2 ] );
+            __syncwarp();
+         }
+         if( totalWarps >= 2 )
+         {
+            multivectorShared[ laneIdx ] = reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx + 1 ] );
+            __syncwarp();
+         }
+         if( laneIdx == 0 )
+         {
+            //printf( "Long: segmentIdx %d -> %d \n", segmentIdx, multivectorShared[ 0 ] );
+            keep( segmentIdx, multivectorShared[ 0 ] );
+         }
+      }
+   }
+}
+#endif
+
+      } // namespace Segments
+   } // namespace Algorithms
+} // namespace TNL
\ No newline at end of file
diff --git a/src/TNL/Algorithms/Segments/CSRView.h b/src/TNL/Algorithms/Segments/CSRView.h
index 5d8ebeeb91..a97d784536 100644
--- a/src/TNL/Algorithms/Segments/CSRView.h
+++ b/src/TNL/Algorithms/Segments/CSRView.h
@@ -17,7 +17,7 @@
 #include <TNL/Algorithms/Segments/CSRScalarKernel.h>
 #include <TNL/Algorithms/Segments/CSRVectorKernel.h>
 #include <TNL/Algorithms/Segments/CSRHybridKernel.h>
-#include <TNL/Algorithms/Segments/CSRKernelAdaptive.h>
+#include <TNL/Algorithms/Segments/CSRAdaptiveKernel.h>

 namespace TNL {
    namespace Algorithms {
@@ -146,7 +146,7 @@ using CSRViewHybrid = CSRView< Device, Index, CSRHybridKernel< Index, Device > >

 template< typename Device,
           typename Index >
-using CSRViewAdaptive = CSRView< Device, Index, CSRKernelAdaptive< Index, Device > >;
+using CSRViewAdaptive = CSRView< Device, Index, CSRAdaptiveKernel< Index, Device > >;

 template< typename Device,
           typename Index >
-- 
GitLab
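
Usage sketch (illustrative only, not applied by this patch): after the rename, downstream code refers to
CSRAdaptiveKernel instead of CSRKernelAdaptive, either directly or through the CSRAdaptive and CSRViewAdaptive
aliases updated above. The device and index types below are assumptions chosen for the example; the type,
alias, and header names are taken from the hunks in this patch.

   #include <TNL/Algorithms/Segments/CSR.h>
   #include <TNL/Algorithms/Segments/CSRView.h>
   #include <TNL/Algorithms/Segments/CSRAdaptiveKernel.h>

   // Assumed example types; any device/index combination supported by TNL is used the same way.
   using Device = TNL::Devices::Cuda;
   using Index  = int;

   // The renamed kernel type itself (formerly CSRKernelAdaptive< Index, Device >).
   using AdaptiveKernel = TNL::Algorithms::Segments::CSRAdaptiveKernel< Index, Device >;

   // Convenience aliases touched by this patch, now backed by CSRAdaptiveKernel.
   using AdaptiveSegments     = TNL::Algorithms::Segments::CSRAdaptive< Device, Index >;
   using AdaptiveSegmentsView = TNL::Algorithms::Segments::CSRViewAdaptive< Device, Index >;

Only the spelling of the kernel changes; the segmentsReduction interface and the kernel's behaviour are left
untouched by this commit (the kernel body is moved verbatim from the header into the new CSRAdaptiveKernel.hpp).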