Loading src/TNL/Algorithms/Segments/CSR.h +1 −1 Original line number Diff line number Diff line Loading @@ -155,7 +155,7 @@ using CSRHybrid = CSR< Device, Index, CSRHybridKernel< Index, Device >, IndexAll template< typename Device, typename Index, typename IndexAllocator = typename Allocators::Default< Device >::template Allocator< Index > > using CSRAdaptive = CSR< Device, Index, CSRKernelAdaptive< Index, Device >, IndexAllocator >; using CSRAdaptive = CSR< Device, Index, CSRAdaptiveKernel< Index, Device >, IndexAllocator >; template< typename Device, typename Index, Loading src/TNL/Algorithms/Segments/CSRKernelAdaptive.h→src/TNL/Algorithms/Segments/CSRAdaptiveKernel.h +15 −156 Original line number Diff line number Diff line /*************************************************************************** CSRKernels.h - description CSRAdaptiveKernel.h - description ------------------- begin : Jan 20, 2021 -> Joe Biden inauguration copyright : (C) 2021 by Tomas Oberhuber Loading @@ -15,7 +15,7 @@ #include <TNL/Containers/VectorView.h> #include <TNL/Algorithms/ParallelFor.h> #include <TNL/Algorithms/Segments/details/LambdaAdapter.h> #include <TNL/Algorithms/Segments/CSRKernelScalar.h> #include <TNL/Algorithms/Segments/CSRScalarKernel.h> #include <TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h> namespace TNL { Loading Loading @@ -47,166 +47,23 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks, Reduction reduce, ResultKeeper keep, Real zero, Args... args ) { __shared__ Real streamShared[ WARPS ][ SHARED_PER_WARP ]; __shared__ Real multivectorShared[ CudaBlockSize / warpSize ]; constexpr size_t MAX_X_DIM = 2147483647; const Index index = (gridIdx * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x; const Index blockIdx = index / warpSize; if( blockIdx >= blocks.getSize() - 1 ) return; if( threadIdx.x < CudaBlockSize / warpSize ) multivectorShared[ threadIdx.x ] = zero; Real result = zero; bool compute( true ); const Index laneIdx = threadIdx.x & 31; // & is cheaper than % const details::CSRAdaptiveKernelBlockDescriptor< Index > block = blocks[ blockIdx ]; const Index& firstSegmentIdx = block.getFirstSegment(); const Index begin = offsets[ firstSegmentIdx ]; const auto blockType = block.getType(); if( blockType == details::Type::STREAM ) // Stream kernel - many short segments per warp { const Index warpIdx = threadIdx.x / 32; const Index end = begin + block.getSize(); // Stream data to shared memory for( Index globalIdx = laneIdx + begin; globalIdx < end; globalIdx += warpSize ) { streamShared[ warpIdx ][ globalIdx - begin ] = //fetch( globalIdx, compute ); details::FetchLambdaAdapter< Index, Fetch >::call( fetch, -1, -1, globalIdx, compute ); // TODO:: fix this by template specialization so that we can assume fetch lambda // with short parameters } const Index lastSegmentIdx = firstSegmentIdx + block.getSegmentsInBlock(); for( Index i = firstSegmentIdx + laneIdx; i < lastSegmentIdx; i += warpSize ) { const Index sharedEnd = offsets[ i + 1 ] - begin; // end of preprocessed data result = zero; // Scalar reduction for( Index sharedIdx = offsets[ i ] - begin; sharedIdx < sharedEnd; sharedIdx++ ) result = reduce( result, streamShared[ warpIdx ][ sharedIdx ] ); keep( i, result ); } } else if( blockType == details::Type::VECTOR ) // Vector kernel - one segment per warp { const Index end = begin + block.getSize(); const Index segmentIdx = block.getFirstSegment(); for( Index globalIdx = begin + laneIdx; globalIdx < end; globalIdx += warpSize ) result = reduce( result, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, -1, globalIdx, compute ) ); // fix local idx // Parallel reduction result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 16 ) ); result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 8 ) ); result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 4 ) ); result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 2 ) ); result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 1 ) ); if( laneIdx == 0 ) keep( segmentIdx, result ); } else // blockType == Type::LONG - several warps per segment { // Number of elements processed by previous warps //const Index offset = //block.index[1] * MAX_ELEM_PER_WARP; /// block.getWarpIdx() * MAX_ELEM_PER_WARP; //Index to = begin + (block.getWarpIdx() + 1) * MAX_ELEM_PER_WARP; const Index segmentIdx = block.getFirstSegment();//block.index[0]; //minID = offsets[block.index[0] ]; const Index end = offsets[segmentIdx + 1]; //const int tid = threadIdx.x; //const int inBlockWarpIdx = block.getWarpIdx(); //if( to > end ) // to = end; TNL_ASSERT_GT( block.getWarpsCount(), 0, "" ); result = zero; //printf( "LONG tid %d warpIdx %d: LONG \n", tid, block.getWarpIdx() ); for( Index globalIdx = begin + laneIdx + TNL::Cuda::getWarpSize() * block.getWarpIdx(); globalIdx < end; globalIdx += TNL::Cuda::getWarpSize() * block.getWarpsCount() ) { result = reduce( result, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, -1, globalIdx, compute ) ); //if( laneIdx == 0 ) // printf( "LONG warpIdx: %d gid: %d begin: %d end: %d -> %d \n", ( int ) block.getWarpIdx(), globalIdx, begin, end, // details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, 0, globalIdx, compute ) ); //result += values[i] * inVector[columnIndexes[i]]; } //printf( "tid %d -> %d \n", tid, result ); result += __shfl_down_sync(0xFFFFFFFF, result, 16); result += __shfl_down_sync(0xFFFFFFFF, result, 8); result += __shfl_down_sync(0xFFFFFFFF, result, 4); result += __shfl_down_sync(0xFFFFFFFF, result, 2); result += __shfl_down_sync(0xFFFFFFFF, result, 1); //if( laneIdx == 0 ) // printf( "WARP RESULT: tid %d -> %d \n", tid, result ); const Index warpID = threadIdx.x / 32; if( laneIdx == 0 ) multivectorShared[ warpID ] = result; __syncthreads(); // Reduction in multivectorShared if( block.getWarpIdx() == 0 && laneIdx < 16 ) { constexpr int totalWarps = CudaBlockSize / warpSize; if( totalWarps >= 32 ) { multivectorShared[ laneIdx ] = reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx + 16 ] ); __syncwarp(); } if( totalWarps >= 16 ) { multivectorShared[ laneIdx ] = reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx + 8 ] ); __syncwarp(); } if( totalWarps >= 8 ) { multivectorShared[ laneIdx ] = reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx + 4 ] ); __syncwarp(); } if( totalWarps >= 4 ) { multivectorShared[ laneIdx ] = reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx + 2 ] ); __syncwarp(); } if( totalWarps >= 2 ) { multivectorShared[ laneIdx ] = reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx + 1 ] ); __syncwarp(); } if( laneIdx == 0 ) { //printf( "Long: segmentIdx %d -> %d \n", segmentIdx, multivectorShared[ 0 ] ); keep( segmentIdx, multivectorShared[ 0 ] ); } } } } Args... args ); #endif template< typename Index, typename Device > struct CSRKernelAdaptiveView struct CSRAdaptiveKernelView { using IndexType = Index; using DeviceType = Device; using ViewType = CSRKernelAdaptiveView< Index, Device >; using ConstViewType = CSRKernelAdaptiveView< Index, Device >; using ViewType = CSRAdaptiveKernelView< Index, Device >; using ConstViewType = CSRAdaptiveKernelView< Index, Device >; using BlocksType = TNL::Containers::Vector< details::CSRAdaptiveKernelBlockDescriptor< Index >, Device, Index >; using BlocksView = typename BlocksType::ViewType; CSRKernelAdaptiveView() = default; CSRAdaptiveKernelView() = default; CSRKernelAdaptiveView( BlocksType& blocks ) CSRAdaptiveKernelView( BlocksType& blocks ) { this->blocks.bind( blocks ); }; Loading Loading @@ -243,7 +100,7 @@ struct CSRKernelAdaptiveView #ifdef HAVE_CUDA if( details::CheckFetchLambda< Index, Fetch >::hasAllParameters() ) { TNL::Algorithms::Segments::CSRKernelScalar< Index, Device >:: TNL::Algorithms::Segments::CSRScalarKernel< Index, Device >:: segmentsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... ); return; } Loading Loading @@ -318,7 +175,7 @@ struct CSRKernelAdaptiveView #endif } CSRKernelAdaptiveView& operator=( const CSRKernelAdaptiveView< Index, Device >& kernelView ) CSRAdaptiveKernelView& operator=( const CSRAdaptiveKernelView< Index, Device >& kernelView ) { this->blocks.bind( kernelView.blocks ); return *this; Loading @@ -340,12 +197,12 @@ struct CSRKernelAdaptiveView template< typename Index, typename Device > struct CSRKernelAdaptive struct CSRAdaptiveKernel { using IndexType = Index; using DeviceType = Device; using ViewType = CSRKernelAdaptiveView< Index, Device >; using ConstViewType = CSRKernelAdaptiveView< Index, Device >; using ViewType = CSRAdaptiveKernelView< Index, Device >; using ConstViewType = CSRAdaptiveKernelView< Index, Device >; using BlocksType = typename ViewType::BlocksType; using BlocksView = typename BlocksType::ViewType; Loading Loading @@ -485,3 +342,5 @@ struct CSRKernelAdaptive } // namespace Segments } // namespace Algorithms } // namespace TNL #include <TNL/Algorithms/Segments/CSRAdaptiveKernel.hpp> src/TNL/Algorithms/Segments/CSRAdaptiveKernel.hpp 0 → 100644 +197 −0 Original line number Diff line number Diff line /*************************************************************************** CSRAdaptiveKernel.h - description ------------------- begin : Feb 7, 2021 -> Joe Biden inauguration copyright : (C) 2021 by Tomas Oberhuber email : tomas.oberhuber@fjfi.cvut.cz ***************************************************************************/ /* See Copyright Notice in tnl/Copyright */ #pragma once #include <TNL/Assert.h> #include <TNL/Cuda/LaunchHelpers.h> #include <TNL/Containers/VectorView.h> #include <TNL/Algorithms/ParallelFor.h> #include <TNL/Algorithms/Segments/details/LambdaAdapter.h> #include <TNL/Algorithms/Segments/CSRScalarKernel.h> #include <TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h> namespace TNL { namespace Algorithms { namespace Segments { #ifdef HAVE_CUDA template< int CudaBlockSize, int warpSize, int WARPS, int SHARED_PER_WARP, int MAX_ELEM_PER_WARP, typename BlocksView, typename Offsets, typename Index, typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args > __global__ void segmentsReductionCSRAdaptiveKernel( BlocksView blocks, int gridIdx, Offsets offsets, Index first, Index last, Fetch fetch, Reduction reduce, ResultKeeper keep, Real zero, Args... args ) { __shared__ Real streamShared[ WARPS ][ SHARED_PER_WARP ]; __shared__ Real multivectorShared[ CudaBlockSize / warpSize ]; constexpr size_t MAX_X_DIM = 2147483647; const Index index = (gridIdx * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x; const Index blockIdx = index / warpSize; if( blockIdx >= blocks.getSize() - 1 ) return; if( threadIdx.x < CudaBlockSize / warpSize ) multivectorShared[ threadIdx.x ] = zero; Real result = zero; bool compute( true ); const Index laneIdx = threadIdx.x & 31; // & is cheaper than % const details::CSRAdaptiveKernelBlockDescriptor< Index > block = blocks[ blockIdx ]; const Index& firstSegmentIdx = block.getFirstSegment(); const Index begin = offsets[ firstSegmentIdx ]; const auto blockType = block.getType(); if( blockType == details::Type::STREAM ) // Stream kernel - many short segments per warp { const Index warpIdx = threadIdx.x / 32; const Index end = begin + block.getSize(); // Stream data to shared memory for( Index globalIdx = laneIdx + begin; globalIdx < end; globalIdx += warpSize ) { streamShared[ warpIdx ][ globalIdx - begin ] = //fetch( globalIdx, compute ); details::FetchLambdaAdapter< Index, Fetch >::call( fetch, -1, -1, globalIdx, compute ); // TODO:: fix this by template specialization so that we can assume fetch lambda // with short parameters } const Index lastSegmentIdx = firstSegmentIdx + block.getSegmentsInBlock(); for( Index i = firstSegmentIdx + laneIdx; i < lastSegmentIdx; i += warpSize ) { const Index sharedEnd = offsets[ i + 1 ] - begin; // end of preprocessed data result = zero; // Scalar reduction for( Index sharedIdx = offsets[ i ] - begin; sharedIdx < sharedEnd; sharedIdx++ ) result = reduce( result, streamShared[ warpIdx ][ sharedIdx ] ); keep( i, result ); } } else if( blockType == details::Type::VECTOR ) // Vector kernel - one segment per warp { const Index end = begin + block.getSize(); const Index segmentIdx = block.getFirstSegment(); for( Index globalIdx = begin + laneIdx; globalIdx < end; globalIdx += warpSize ) result = reduce( result, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, -1, globalIdx, compute ) ); // fix local idx // Parallel reduction result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 16 ) ); result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 8 ) ); result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 4 ) ); result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 2 ) ); result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 1 ) ); if( laneIdx == 0 ) keep( segmentIdx, result ); } else // blockType == Type::LONG - several warps per segment { // Number of elements processed by previous warps //const Index offset = //block.index[1] * MAX_ELEM_PER_WARP; /// block.getWarpIdx() * MAX_ELEM_PER_WARP; //Index to = begin + (block.getWarpIdx() + 1) * MAX_ELEM_PER_WARP; const Index segmentIdx = block.getFirstSegment();//block.index[0]; //minID = offsets[block.index[0] ]; const Index end = offsets[segmentIdx + 1]; //const int tid = threadIdx.x; //const int inBlockWarpIdx = block.getWarpIdx(); //if( to > end ) // to = end; TNL_ASSERT_GT( block.getWarpsCount(), 0, "" ); result = zero; //printf( "LONG tid %d warpIdx %d: LONG \n", tid, block.getWarpIdx() ); for( Index globalIdx = begin + laneIdx + TNL::Cuda::getWarpSize() * block.getWarpIdx(); globalIdx < end; globalIdx += TNL::Cuda::getWarpSize() * block.getWarpsCount() ) { result = reduce( result, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, -1, globalIdx, compute ) ); //if( laneIdx == 0 ) // printf( "LONG warpIdx: %d gid: %d begin: %d end: %d -> %d \n", ( int ) block.getWarpIdx(), globalIdx, begin, end, // details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, 0, globalIdx, compute ) ); //result += values[i] * inVector[columnIndexes[i]]; } //printf( "tid %d -> %d \n", tid, result ); result += __shfl_down_sync(0xFFFFFFFF, result, 16); result += __shfl_down_sync(0xFFFFFFFF, result, 8); result += __shfl_down_sync(0xFFFFFFFF, result, 4); result += __shfl_down_sync(0xFFFFFFFF, result, 2); result += __shfl_down_sync(0xFFFFFFFF, result, 1); //if( laneIdx == 0 ) // printf( "WARP RESULT: tid %d -> %d \n", tid, result ); const Index warpID = threadIdx.x / 32; if( laneIdx == 0 ) multivectorShared[ warpID ] = result; __syncthreads(); // Reduction in multivectorShared if( block.getWarpIdx() == 0 && laneIdx < 16 ) { constexpr int totalWarps = CudaBlockSize / warpSize; if( totalWarps >= 32 ) { multivectorShared[ laneIdx ] = reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx + 16 ] ); __syncwarp(); } if( totalWarps >= 16 ) { multivectorShared[ laneIdx ] = reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx + 8 ] ); __syncwarp(); } if( totalWarps >= 8 ) { multivectorShared[ laneIdx ] = reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx + 4 ] ); __syncwarp(); } if( totalWarps >= 4 ) { multivectorShared[ laneIdx ] = reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx + 2 ] ); __syncwarp(); } if( totalWarps >= 2 ) { multivectorShared[ laneIdx ] = reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx + 1 ] ); __syncwarp(); } if( laneIdx == 0 ) { //printf( "Long: segmentIdx %d -> %d \n", segmentIdx, multivectorShared[ 0 ] ); keep( segmentIdx, multivectorShared[ 0 ] ); } } } } #endif } // namespace Segments } // namespace Algorithms } // namespace TNL No newline at end of file src/TNL/Algorithms/Segments/CSRView.h +2 −2 Original line number Diff line number Diff line Loading @@ -17,7 +17,7 @@ #include <TNL/Algorithms/Segments/CSRScalarKernel.h> #include <TNL/Algorithms/Segments/CSRVectorKernel.h> #include <TNL/Algorithms/Segments/CSRHybridKernel.h> #include <TNL/Algorithms/Segments/CSRKernelAdaptive.h> #include <TNL/Algorithms/Segments/CSRAdaptiveKernel.h> namespace TNL { namespace Algorithms { Loading Loading @@ -146,7 +146,7 @@ using CSRViewHybrid = CSRView< Device, Index, CSRHybridKernel< Index, Device > > template< typename Device, typename Index > using CSRViewAdaptive = CSRView< Device, Index, CSRKernelAdaptive< Index, Device > >; using CSRViewAdaptive = CSRView< Device, Index, CSRAdaptiveKernel< Index, Device > >; template< typename Device, typename Index > Loading Loading
src/TNL/Algorithms/Segments/CSR.h +1 −1 Original line number Diff line number Diff line Loading @@ -155,7 +155,7 @@ using CSRHybrid = CSR< Device, Index, CSRHybridKernel< Index, Device >, IndexAll template< typename Device, typename Index, typename IndexAllocator = typename Allocators::Default< Device >::template Allocator< Index > > using CSRAdaptive = CSR< Device, Index, CSRKernelAdaptive< Index, Device >, IndexAllocator >; using CSRAdaptive = CSR< Device, Index, CSRAdaptiveKernel< Index, Device >, IndexAllocator >; template< typename Device, typename Index, Loading
src/TNL/Algorithms/Segments/CSRKernelAdaptive.h→src/TNL/Algorithms/Segments/CSRAdaptiveKernel.h +15 −156 Original line number Diff line number Diff line /*************************************************************************** CSRKernels.h - description CSRAdaptiveKernel.h - description ------------------- begin : Jan 20, 2021 -> Joe Biden inauguration copyright : (C) 2021 by Tomas Oberhuber Loading @@ -15,7 +15,7 @@ #include <TNL/Containers/VectorView.h> #include <TNL/Algorithms/ParallelFor.h> #include <TNL/Algorithms/Segments/details/LambdaAdapter.h> #include <TNL/Algorithms/Segments/CSRKernelScalar.h> #include <TNL/Algorithms/Segments/CSRScalarKernel.h> #include <TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h> namespace TNL { Loading Loading @@ -47,166 +47,23 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks, Reduction reduce, ResultKeeper keep, Real zero, Args... args ) { __shared__ Real streamShared[ WARPS ][ SHARED_PER_WARP ]; __shared__ Real multivectorShared[ CudaBlockSize / warpSize ]; constexpr size_t MAX_X_DIM = 2147483647; const Index index = (gridIdx * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x; const Index blockIdx = index / warpSize; if( blockIdx >= blocks.getSize() - 1 ) return; if( threadIdx.x < CudaBlockSize / warpSize ) multivectorShared[ threadIdx.x ] = zero; Real result = zero; bool compute( true ); const Index laneIdx = threadIdx.x & 31; // & is cheaper than % const details::CSRAdaptiveKernelBlockDescriptor< Index > block = blocks[ blockIdx ]; const Index& firstSegmentIdx = block.getFirstSegment(); const Index begin = offsets[ firstSegmentIdx ]; const auto blockType = block.getType(); if( blockType == details::Type::STREAM ) // Stream kernel - many short segments per warp { const Index warpIdx = threadIdx.x / 32; const Index end = begin + block.getSize(); // Stream data to shared memory for( Index globalIdx = laneIdx + begin; globalIdx < end; globalIdx += warpSize ) { streamShared[ warpIdx ][ globalIdx - begin ] = //fetch( globalIdx, compute ); details::FetchLambdaAdapter< Index, Fetch >::call( fetch, -1, -1, globalIdx, compute ); // TODO:: fix this by template specialization so that we can assume fetch lambda // with short parameters } const Index lastSegmentIdx = firstSegmentIdx + block.getSegmentsInBlock(); for( Index i = firstSegmentIdx + laneIdx; i < lastSegmentIdx; i += warpSize ) { const Index sharedEnd = offsets[ i + 1 ] - begin; // end of preprocessed data result = zero; // Scalar reduction for( Index sharedIdx = offsets[ i ] - begin; sharedIdx < sharedEnd; sharedIdx++ ) result = reduce( result, streamShared[ warpIdx ][ sharedIdx ] ); keep( i, result ); } } else if( blockType == details::Type::VECTOR ) // Vector kernel - one segment per warp { const Index end = begin + block.getSize(); const Index segmentIdx = block.getFirstSegment(); for( Index globalIdx = begin + laneIdx; globalIdx < end; globalIdx += warpSize ) result = reduce( result, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, -1, globalIdx, compute ) ); // fix local idx // Parallel reduction result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 16 ) ); result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 8 ) ); result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 4 ) ); result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 2 ) ); result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 1 ) ); if( laneIdx == 0 ) keep( segmentIdx, result ); } else // blockType == Type::LONG - several warps per segment { // Number of elements processed by previous warps //const Index offset = //block.index[1] * MAX_ELEM_PER_WARP; /// block.getWarpIdx() * MAX_ELEM_PER_WARP; //Index to = begin + (block.getWarpIdx() + 1) * MAX_ELEM_PER_WARP; const Index segmentIdx = block.getFirstSegment();//block.index[0]; //minID = offsets[block.index[0] ]; const Index end = offsets[segmentIdx + 1]; //const int tid = threadIdx.x; //const int inBlockWarpIdx = block.getWarpIdx(); //if( to > end ) // to = end; TNL_ASSERT_GT( block.getWarpsCount(), 0, "" ); result = zero; //printf( "LONG tid %d warpIdx %d: LONG \n", tid, block.getWarpIdx() ); for( Index globalIdx = begin + laneIdx + TNL::Cuda::getWarpSize() * block.getWarpIdx(); globalIdx < end; globalIdx += TNL::Cuda::getWarpSize() * block.getWarpsCount() ) { result = reduce( result, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, -1, globalIdx, compute ) ); //if( laneIdx == 0 ) // printf( "LONG warpIdx: %d gid: %d begin: %d end: %d -> %d \n", ( int ) block.getWarpIdx(), globalIdx, begin, end, // details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, 0, globalIdx, compute ) ); //result += values[i] * inVector[columnIndexes[i]]; } //printf( "tid %d -> %d \n", tid, result ); result += __shfl_down_sync(0xFFFFFFFF, result, 16); result += __shfl_down_sync(0xFFFFFFFF, result, 8); result += __shfl_down_sync(0xFFFFFFFF, result, 4); result += __shfl_down_sync(0xFFFFFFFF, result, 2); result += __shfl_down_sync(0xFFFFFFFF, result, 1); //if( laneIdx == 0 ) // printf( "WARP RESULT: tid %d -> %d \n", tid, result ); const Index warpID = threadIdx.x / 32; if( laneIdx == 0 ) multivectorShared[ warpID ] = result; __syncthreads(); // Reduction in multivectorShared if( block.getWarpIdx() == 0 && laneIdx < 16 ) { constexpr int totalWarps = CudaBlockSize / warpSize; if( totalWarps >= 32 ) { multivectorShared[ laneIdx ] = reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx + 16 ] ); __syncwarp(); } if( totalWarps >= 16 ) { multivectorShared[ laneIdx ] = reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx + 8 ] ); __syncwarp(); } if( totalWarps >= 8 ) { multivectorShared[ laneIdx ] = reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx + 4 ] ); __syncwarp(); } if( totalWarps >= 4 ) { multivectorShared[ laneIdx ] = reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx + 2 ] ); __syncwarp(); } if( totalWarps >= 2 ) { multivectorShared[ laneIdx ] = reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx + 1 ] ); __syncwarp(); } if( laneIdx == 0 ) { //printf( "Long: segmentIdx %d -> %d \n", segmentIdx, multivectorShared[ 0 ] ); keep( segmentIdx, multivectorShared[ 0 ] ); } } } } Args... args ); #endif template< typename Index, typename Device > struct CSRKernelAdaptiveView struct CSRAdaptiveKernelView { using IndexType = Index; using DeviceType = Device; using ViewType = CSRKernelAdaptiveView< Index, Device >; using ConstViewType = CSRKernelAdaptiveView< Index, Device >; using ViewType = CSRAdaptiveKernelView< Index, Device >; using ConstViewType = CSRAdaptiveKernelView< Index, Device >; using BlocksType = TNL::Containers::Vector< details::CSRAdaptiveKernelBlockDescriptor< Index >, Device, Index >; using BlocksView = typename BlocksType::ViewType; CSRKernelAdaptiveView() = default; CSRAdaptiveKernelView() = default; CSRKernelAdaptiveView( BlocksType& blocks ) CSRAdaptiveKernelView( BlocksType& blocks ) { this->blocks.bind( blocks ); }; Loading Loading @@ -243,7 +100,7 @@ struct CSRKernelAdaptiveView #ifdef HAVE_CUDA if( details::CheckFetchLambda< Index, Fetch >::hasAllParameters() ) { TNL::Algorithms::Segments::CSRKernelScalar< Index, Device >:: TNL::Algorithms::Segments::CSRScalarKernel< Index, Device >:: segmentsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... ); return; } Loading Loading @@ -318,7 +175,7 @@ struct CSRKernelAdaptiveView #endif } CSRKernelAdaptiveView& operator=( const CSRKernelAdaptiveView< Index, Device >& kernelView ) CSRAdaptiveKernelView& operator=( const CSRAdaptiveKernelView< Index, Device >& kernelView ) { this->blocks.bind( kernelView.blocks ); return *this; Loading @@ -340,12 +197,12 @@ struct CSRKernelAdaptiveView template< typename Index, typename Device > struct CSRKernelAdaptive struct CSRAdaptiveKernel { using IndexType = Index; using DeviceType = Device; using ViewType = CSRKernelAdaptiveView< Index, Device >; using ConstViewType = CSRKernelAdaptiveView< Index, Device >; using ViewType = CSRAdaptiveKernelView< Index, Device >; using ConstViewType = CSRAdaptiveKernelView< Index, Device >; using BlocksType = typename ViewType::BlocksType; using BlocksView = typename BlocksType::ViewType; Loading Loading @@ -485,3 +342,5 @@ struct CSRKernelAdaptive } // namespace Segments } // namespace Algorithms } // namespace TNL #include <TNL/Algorithms/Segments/CSRAdaptiveKernel.hpp>
src/TNL/Algorithms/Segments/CSRAdaptiveKernel.hpp 0 → 100644 +197 −0 Original line number Diff line number Diff line /*************************************************************************** CSRAdaptiveKernel.h - description ------------------- begin : Feb 7, 2021 -> Joe Biden inauguration copyright : (C) 2021 by Tomas Oberhuber email : tomas.oberhuber@fjfi.cvut.cz ***************************************************************************/ /* See Copyright Notice in tnl/Copyright */ #pragma once #include <TNL/Assert.h> #include <TNL/Cuda/LaunchHelpers.h> #include <TNL/Containers/VectorView.h> #include <TNL/Algorithms/ParallelFor.h> #include <TNL/Algorithms/Segments/details/LambdaAdapter.h> #include <TNL/Algorithms/Segments/CSRScalarKernel.h> #include <TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h> namespace TNL { namespace Algorithms { namespace Segments { #ifdef HAVE_CUDA template< int CudaBlockSize, int warpSize, int WARPS, int SHARED_PER_WARP, int MAX_ELEM_PER_WARP, typename BlocksView, typename Offsets, typename Index, typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args > __global__ void segmentsReductionCSRAdaptiveKernel( BlocksView blocks, int gridIdx, Offsets offsets, Index first, Index last, Fetch fetch, Reduction reduce, ResultKeeper keep, Real zero, Args... args ) { __shared__ Real streamShared[ WARPS ][ SHARED_PER_WARP ]; __shared__ Real multivectorShared[ CudaBlockSize / warpSize ]; constexpr size_t MAX_X_DIM = 2147483647; const Index index = (gridIdx * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x; const Index blockIdx = index / warpSize; if( blockIdx >= blocks.getSize() - 1 ) return; if( threadIdx.x < CudaBlockSize / warpSize ) multivectorShared[ threadIdx.x ] = zero; Real result = zero; bool compute( true ); const Index laneIdx = threadIdx.x & 31; // & is cheaper than % const details::CSRAdaptiveKernelBlockDescriptor< Index > block = blocks[ blockIdx ]; const Index& firstSegmentIdx = block.getFirstSegment(); const Index begin = offsets[ firstSegmentIdx ]; const auto blockType = block.getType(); if( blockType == details::Type::STREAM ) // Stream kernel - many short segments per warp { const Index warpIdx = threadIdx.x / 32; const Index end = begin + block.getSize(); // Stream data to shared memory for( Index globalIdx = laneIdx + begin; globalIdx < end; globalIdx += warpSize ) { streamShared[ warpIdx ][ globalIdx - begin ] = //fetch( globalIdx, compute ); details::FetchLambdaAdapter< Index, Fetch >::call( fetch, -1, -1, globalIdx, compute ); // TODO:: fix this by template specialization so that we can assume fetch lambda // with short parameters } const Index lastSegmentIdx = firstSegmentIdx + block.getSegmentsInBlock(); for( Index i = firstSegmentIdx + laneIdx; i < lastSegmentIdx; i += warpSize ) { const Index sharedEnd = offsets[ i + 1 ] - begin; // end of preprocessed data result = zero; // Scalar reduction for( Index sharedIdx = offsets[ i ] - begin; sharedIdx < sharedEnd; sharedIdx++ ) result = reduce( result, streamShared[ warpIdx ][ sharedIdx ] ); keep( i, result ); } } else if( blockType == details::Type::VECTOR ) // Vector kernel - one segment per warp { const Index end = begin + block.getSize(); const Index segmentIdx = block.getFirstSegment(); for( Index globalIdx = begin + laneIdx; globalIdx < end; globalIdx += warpSize ) result = reduce( result, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, -1, globalIdx, compute ) ); // fix local idx // Parallel reduction result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 16 ) ); result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 8 ) ); result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 4 ) ); result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 2 ) ); result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 1 ) ); if( laneIdx == 0 ) keep( segmentIdx, result ); } else // blockType == Type::LONG - several warps per segment { // Number of elements processed by previous warps //const Index offset = //block.index[1] * MAX_ELEM_PER_WARP; /// block.getWarpIdx() * MAX_ELEM_PER_WARP; //Index to = begin + (block.getWarpIdx() + 1) * MAX_ELEM_PER_WARP; const Index segmentIdx = block.getFirstSegment();//block.index[0]; //minID = offsets[block.index[0] ]; const Index end = offsets[segmentIdx + 1]; //const int tid = threadIdx.x; //const int inBlockWarpIdx = block.getWarpIdx(); //if( to > end ) // to = end; TNL_ASSERT_GT( block.getWarpsCount(), 0, "" ); result = zero; //printf( "LONG tid %d warpIdx %d: LONG \n", tid, block.getWarpIdx() ); for( Index globalIdx = begin + laneIdx + TNL::Cuda::getWarpSize() * block.getWarpIdx(); globalIdx < end; globalIdx += TNL::Cuda::getWarpSize() * block.getWarpsCount() ) { result = reduce( result, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, -1, globalIdx, compute ) ); //if( laneIdx == 0 ) // printf( "LONG warpIdx: %d gid: %d begin: %d end: %d -> %d \n", ( int ) block.getWarpIdx(), globalIdx, begin, end, // details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, 0, globalIdx, compute ) ); //result += values[i] * inVector[columnIndexes[i]]; } //printf( "tid %d -> %d \n", tid, result ); result += __shfl_down_sync(0xFFFFFFFF, result, 16); result += __shfl_down_sync(0xFFFFFFFF, result, 8); result += __shfl_down_sync(0xFFFFFFFF, result, 4); result += __shfl_down_sync(0xFFFFFFFF, result, 2); result += __shfl_down_sync(0xFFFFFFFF, result, 1); //if( laneIdx == 0 ) // printf( "WARP RESULT: tid %d -> %d \n", tid, result ); const Index warpID = threadIdx.x / 32; if( laneIdx == 0 ) multivectorShared[ warpID ] = result; __syncthreads(); // Reduction in multivectorShared if( block.getWarpIdx() == 0 && laneIdx < 16 ) { constexpr int totalWarps = CudaBlockSize / warpSize; if( totalWarps >= 32 ) { multivectorShared[ laneIdx ] = reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx + 16 ] ); __syncwarp(); } if( totalWarps >= 16 ) { multivectorShared[ laneIdx ] = reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx + 8 ] ); __syncwarp(); } if( totalWarps >= 8 ) { multivectorShared[ laneIdx ] = reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx + 4 ] ); __syncwarp(); } if( totalWarps >= 4 ) { multivectorShared[ laneIdx ] = reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx + 2 ] ); __syncwarp(); } if( totalWarps >= 2 ) { multivectorShared[ laneIdx ] = reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx + 1 ] ); __syncwarp(); } if( laneIdx == 0 ) { //printf( "Long: segmentIdx %d -> %d \n", segmentIdx, multivectorShared[ 0 ] ); keep( segmentIdx, multivectorShared[ 0 ] ); } } } } #endif } // namespace Segments } // namespace Algorithms } // namespace TNL No newline at end of file
src/TNL/Algorithms/Segments/CSRView.h +2 −2 Original line number Diff line number Diff line Loading @@ -17,7 +17,7 @@ #include <TNL/Algorithms/Segments/CSRScalarKernel.h> #include <TNL/Algorithms/Segments/CSRVectorKernel.h> #include <TNL/Algorithms/Segments/CSRHybridKernel.h> #include <TNL/Algorithms/Segments/CSRKernelAdaptive.h> #include <TNL/Algorithms/Segments/CSRAdaptiveKernel.h> namespace TNL { namespace Algorithms { Loading Loading @@ -146,7 +146,7 @@ using CSRViewHybrid = CSRView< Device, Index, CSRHybridKernel< Index, Device > > template< typename Device, typename Index > using CSRViewAdaptive = CSRView< Device, Index, CSRKernelAdaptive< Index, Device > >; using CSRViewAdaptive = CSRView< Device, Index, CSRAdaptiveKernel< Index, Device > >; template< typename Device, typename Index > Loading