Renaming CSRKernelAdaptive to CSRAdaptiveKernel. (46482276) · Commits · TNL / tnl-dev

src/TNL/Algorithms/Segments/CSR.h

+1 −1

Original line number	Diff line number	Diff line
		@@ -155,7 +155,7 @@ using CSRHybrid = CSR< Device, Index, CSRHybridKernel< Index, Device >, IndexAll
		template< typename Device,
		typename Index,
		typename IndexAllocator = typename Allocators::Default< Device >::template Allocator< Index > >
		using CSRAdaptive = CSR< Device, Index, CSRKernelAdaptive< Index, Device >, IndexAllocator >;
		using CSRAdaptive = CSR< Device, Index, CSRAdaptiveKernel< Index, Device >, IndexAllocator >;

		template< typename Device,
		typename Index,

src/TNL/Algorithms/Segments/CSRKernelAdaptive.h→src/TNL/Algorithms/Segments/CSRAdaptiveKernel.h

+15 −156

Original line number	Diff line number	Diff line
		/***************************************************************************
		CSRKernels.h - description
		CSRAdaptiveKernel.h - description
		-------------------
		begin : Jan 20, 2021 -> Joe Biden inauguration
		copyright : (C) 2021 by Tomas Oberhuber
		@@ -15,7 +15,7 @@
		#include <TNL/Containers/VectorView.h>
		#include <TNL/Algorithms/ParallelFor.h>
		#include <TNL/Algorithms/Segments/details/LambdaAdapter.h>
		#include <TNL/Algorithms/Segments/CSRKernelScalar.h>
		#include <TNL/Algorithms/Segments/CSRScalarKernel.h>
		#include <TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h>

		namespace TNL {
		@@ -47,166 +47,23 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
		Reduction reduce,
		ResultKeeper keep,
		Real zero,
		Args... args )
		{
		__shared__ Real streamShared[ WARPS ][ SHARED_PER_WARP ];
		__shared__ Real multivectorShared[ CudaBlockSize / warpSize ];
		constexpr size_t MAX_X_DIM = 2147483647;
		const Index index = (gridIdx * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x;
		const Index blockIdx = index / warpSize;
		if( blockIdx >= blocks.getSize() - 1 )
		return;

		if( threadIdx.x < CudaBlockSize / warpSize )
		multivectorShared[ threadIdx.x ] = zero;
		Real result = zero;
		bool compute( true );
		const Index laneIdx = threadIdx.x & 31; // & is cheaper than %
		const details::CSRAdaptiveKernelBlockDescriptor< Index > block = blocks[ blockIdx ];
		const Index& firstSegmentIdx = block.getFirstSegment();
		const Index begin = offsets[ firstSegmentIdx ];

		const auto blockType = block.getType();
		if( blockType == details::Type::STREAM ) // Stream kernel - many short segments per warp
		{
		const Index warpIdx = threadIdx.x / 32;
		const Index end = begin + block.getSize();

		// Stream data to shared memory
		for( Index globalIdx = laneIdx + begin; globalIdx < end; globalIdx += warpSize )
		{
		streamShared[ warpIdx ][ globalIdx - begin ] = //fetch( globalIdx, compute );
		details::FetchLambdaAdapter< Index, Fetch >::call( fetch, -1, -1, globalIdx, compute );
		// TODO:: fix this by template specialization so that we can assume fetch lambda
		// with short parameters
		}

		const Index lastSegmentIdx = firstSegmentIdx + block.getSegmentsInBlock();

		for( Index i = firstSegmentIdx + laneIdx; i < lastSegmentIdx; i += warpSize )
		{
		const Index sharedEnd = offsets[ i + 1 ] - begin; // end of preprocessed data
		result = zero;
		// Scalar reduction
		for( Index sharedIdx = offsets[ i ] - begin; sharedIdx < sharedEnd; sharedIdx++ )
		result = reduce( result, streamShared[ warpIdx ][ sharedIdx ] );
		keep( i, result );
		}
		}
		else if( blockType == details::Type::VECTOR ) // Vector kernel - one segment per warp
		{
		const Index end = begin + block.getSize();
		const Index segmentIdx = block.getFirstSegment();

		for( Index globalIdx = begin + laneIdx; globalIdx < end; globalIdx += warpSize )
		result = reduce( result, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, -1, globalIdx, compute ) ); // fix local idx

		// Parallel reduction
		result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 16 ) );
		result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 8 ) );
		result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 4 ) );
		result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 2 ) );
		result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 1 ) );
		if( laneIdx == 0 )
		keep( segmentIdx, result );
		}
		else // blockType == Type::LONG - several warps per segment
		{
		// Number of elements processed by previous warps
		//const Index offset = //block.index[1] * MAX_ELEM_PER_WARP;
		/// block.getWarpIdx() * MAX_ELEM_PER_WARP;
		//Index to = begin + (block.getWarpIdx() + 1) * MAX_ELEM_PER_WARP;
		const Index segmentIdx = block.getFirstSegment();//block.index[0];
		//minID = offsets[block.index[0] ];
		const Index end = offsets[segmentIdx + 1];
		//const int tid = threadIdx.x;
		//const int inBlockWarpIdx = block.getWarpIdx();

		//if( to > end )
		// to = end;
		TNL_ASSERT_GT( block.getWarpsCount(), 0, "" );
		result = zero;
		//printf( "LONG tid %d warpIdx %d: LONG \n", tid, block.getWarpIdx() );
		for( Index globalIdx = begin + laneIdx + TNL::Cuda::getWarpSize() * block.getWarpIdx();
		globalIdx < end;
		globalIdx += TNL::Cuda::getWarpSize() * block.getWarpsCount() )
		{
		result = reduce( result, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, -1, globalIdx, compute ) );
		//if( laneIdx == 0 )
		// printf( "LONG warpIdx: %d gid: %d begin: %d end: %d -> %d \n", ( int ) block.getWarpIdx(), globalIdx, begin, end,
		// details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, 0, globalIdx, compute ) );
		//result += values[i] * inVector[columnIndexes[i]];
		}
		//printf( "tid %d -> %d \n", tid, result );

		result += __shfl_down_sync(0xFFFFFFFF, result, 16);
		result += __shfl_down_sync(0xFFFFFFFF, result, 8);
		result += __shfl_down_sync(0xFFFFFFFF, result, 4);
		result += __shfl_down_sync(0xFFFFFFFF, result, 2);
		result += __shfl_down_sync(0xFFFFFFFF, result, 1);

		//if( laneIdx == 0 )
		// printf( "WARP RESULT: tid %d -> %d \n", tid, result );

		const Index warpID = threadIdx.x / 32;
		if( laneIdx == 0 )
		multivectorShared[ warpID ] = result;

		__syncthreads();
		// Reduction in multivectorShared
		if( block.getWarpIdx() == 0 && laneIdx < 16 )
		{
		constexpr int totalWarps = CudaBlockSize / warpSize;
		if( totalWarps >= 32 )
		{
		multivectorShared[ laneIdx ] = reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx + 16 ] );
		__syncwarp();
		}
		if( totalWarps >= 16 )
		{
		multivectorShared[ laneIdx ] = reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx + 8 ] );
		__syncwarp();
		}
		if( totalWarps >= 8 )
		{
		multivectorShared[ laneIdx ] = reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx + 4 ] );
		__syncwarp();
		}
		if( totalWarps >= 4 )
		{
		multivectorShared[ laneIdx ] = reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx + 2 ] );
		__syncwarp();
		}
		if( totalWarps >= 2 )
		{
		multivectorShared[ laneIdx ] = reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx + 1 ] );
		__syncwarp();
		}
		if( laneIdx == 0 )
		{
		//printf( "Long: segmentIdx %d -> %d \n", segmentIdx, multivectorShared[ 0 ] );
		keep( segmentIdx, multivectorShared[ 0 ] );
		}
		}
		}
		}
		Args... args );
		#endif


		template< typename Index,
		typename Device >
		struct CSRKernelAdaptiveView
		struct CSRAdaptiveKernelView
		{
		using IndexType = Index;
		using DeviceType = Device;
		using ViewType = CSRKernelAdaptiveView< Index, Device >;
		using ConstViewType = CSRKernelAdaptiveView< Index, Device >;
		using ViewType = CSRAdaptiveKernelView< Index, Device >;
		using ConstViewType = CSRAdaptiveKernelView< Index, Device >;
		using BlocksType = TNL::Containers::Vector< details::CSRAdaptiveKernelBlockDescriptor< Index >, Device, Index >;
		using BlocksView = typename BlocksType::ViewType;

		CSRKernelAdaptiveView() = default;
		CSRAdaptiveKernelView() = default;

		CSRKernelAdaptiveView( BlocksType& blocks )
		CSRAdaptiveKernelView( BlocksType& blocks )
		{
		this->blocks.bind( blocks );
		};
		@@ -243,7 +100,7 @@ struct CSRKernelAdaptiveView
		#ifdef HAVE_CUDA
		if( details::CheckFetchLambda< Index, Fetch >::hasAllParameters() )
		{
		TNL::Algorithms::Segments::CSRKernelScalar< Index, Device >::
		TNL::Algorithms::Segments::CSRScalarKernel< Index, Device >::
		segmentsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... );
		return;
		}
		@@ -318,7 +175,7 @@ struct CSRKernelAdaptiveView
		#endif
		}

		CSRKernelAdaptiveView& operator=( const CSRKernelAdaptiveView< Index, Device >& kernelView )
		CSRAdaptiveKernelView& operator=( const CSRAdaptiveKernelView< Index, Device >& kernelView )
		{
		this->blocks.bind( kernelView.blocks );
		return *this;
		@@ -340,12 +197,12 @@ struct CSRKernelAdaptiveView

		template< typename Index,
		typename Device >
		struct CSRKernelAdaptive
		struct CSRAdaptiveKernel
		{
		using IndexType = Index;
		using DeviceType = Device;
		using ViewType = CSRKernelAdaptiveView< Index, Device >;
		using ConstViewType = CSRKernelAdaptiveView< Index, Device >;
		using ViewType = CSRAdaptiveKernelView< Index, Device >;
		using ConstViewType = CSRAdaptiveKernelView< Index, Device >;
		using BlocksType = typename ViewType::BlocksType;
		using BlocksView = typename BlocksType::ViewType;

		@@ -485,3 +342,5 @@ struct CSRKernelAdaptive
		} // namespace Segments
		} // namespace Algorithms
		} // namespace TNL

		#include <TNL/Algorithms/Segments/CSRAdaptiveKernel.hpp>

src/TNL/Algorithms/Segments/CSRAdaptiveKernel.hpp

0 → 100644

+197 −0

Original line number	Diff line number	Diff line
		/***************************************************************************
		CSRAdaptiveKernel.h - description
		-------------------
		begin : Feb 7, 2021 -> Joe Biden inauguration
		copyright : (C) 2021 by Tomas Oberhuber
		email : tomas.oberhuber@fjfi.cvut.cz
		***************************************************************************/

		/* See Copyright Notice in tnl/Copyright */

		#pragma once

		#include <TNL/Assert.h>
		#include <TNL/Cuda/LaunchHelpers.h>
		#include <TNL/Containers/VectorView.h>
		#include <TNL/Algorithms/ParallelFor.h>
		#include <TNL/Algorithms/Segments/details/LambdaAdapter.h>
		#include <TNL/Algorithms/Segments/CSRScalarKernel.h>
		#include <TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h>

		namespace TNL {
		namespace Algorithms {
		namespace Segments {

		#ifdef HAVE_CUDA

		template< int CudaBlockSize,
		int warpSize,
		int WARPS,
		int SHARED_PER_WARP,
		int MAX_ELEM_PER_WARP,
		typename BlocksView,
		typename Offsets,
		typename Index,
		typename Fetch,
		typename Reduction,
		typename ResultKeeper,
		typename Real,
		typename... Args >
		__global__ void
		segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
		int gridIdx,
		Offsets offsets,
		Index first,
		Index last,
		Fetch fetch,
		Reduction reduce,
		ResultKeeper keep,
		Real zero,
		Args... args )
		{
		__shared__ Real streamShared[ WARPS ][ SHARED_PER_WARP ];
		__shared__ Real multivectorShared[ CudaBlockSize / warpSize ];
		constexpr size_t MAX_X_DIM = 2147483647;
		const Index index = (gridIdx * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x;
		const Index blockIdx = index / warpSize;
		if( blockIdx >= blocks.getSize() - 1 )
		return;

		if( threadIdx.x < CudaBlockSize / warpSize )
		multivectorShared[ threadIdx.x ] = zero;
		Real result = zero;
		bool compute( true );
		const Index laneIdx = threadIdx.x & 31; // & is cheaper than %
		const details::CSRAdaptiveKernelBlockDescriptor< Index > block = blocks[ blockIdx ];
		const Index& firstSegmentIdx = block.getFirstSegment();
		const Index begin = offsets[ firstSegmentIdx ];

		const auto blockType = block.getType();
		if( blockType == details::Type::STREAM ) // Stream kernel - many short segments per warp
		{
		const Index warpIdx = threadIdx.x / 32;
		const Index end = begin + block.getSize();

		// Stream data to shared memory
		for( Index globalIdx = laneIdx + begin; globalIdx < end; globalIdx += warpSize )
		{
		streamShared[ warpIdx ][ globalIdx - begin ] = //fetch( globalIdx, compute );
		details::FetchLambdaAdapter< Index, Fetch >::call( fetch, -1, -1, globalIdx, compute );
		// TODO:: fix this by template specialization so that we can assume fetch lambda
		// with short parameters
		}

		const Index lastSegmentIdx = firstSegmentIdx + block.getSegmentsInBlock();

		for( Index i = firstSegmentIdx + laneIdx; i < lastSegmentIdx; i += warpSize )
		{
		const Index sharedEnd = offsets[ i + 1 ] - begin; // end of preprocessed data
		result = zero;
		// Scalar reduction
		for( Index sharedIdx = offsets[ i ] - begin; sharedIdx < sharedEnd; sharedIdx++ )
		result = reduce( result, streamShared[ warpIdx ][ sharedIdx ] );
		keep( i, result );
		}
		}
		else if( blockType == details::Type::VECTOR ) // Vector kernel - one segment per warp
		{
		const Index end = begin + block.getSize();
		const Index segmentIdx = block.getFirstSegment();

		for( Index globalIdx = begin + laneIdx; globalIdx < end; globalIdx += warpSize )
		result = reduce( result, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, -1, globalIdx, compute ) ); // fix local idx

		// Parallel reduction
		result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 16 ) );
		result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 8 ) );
		result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 4 ) );
		result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 2 ) );
		result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 1 ) );
		if( laneIdx == 0 )
		keep( segmentIdx, result );
		}
		else // blockType == Type::LONG - several warps per segment
		{
		// Number of elements processed by previous warps
		//const Index offset = //block.index[1] * MAX_ELEM_PER_WARP;
		/// block.getWarpIdx() * MAX_ELEM_PER_WARP;
		//Index to = begin + (block.getWarpIdx() + 1) * MAX_ELEM_PER_WARP;
		const Index segmentIdx = block.getFirstSegment();//block.index[0];
		//minID = offsets[block.index[0] ];
		const Index end = offsets[segmentIdx + 1];
		//const int tid = threadIdx.x;
		//const int inBlockWarpIdx = block.getWarpIdx();

		//if( to > end )
		// to = end;
		TNL_ASSERT_GT( block.getWarpsCount(), 0, "" );
		result = zero;
		//printf( "LONG tid %d warpIdx %d: LONG \n", tid, block.getWarpIdx() );
		for( Index globalIdx = begin + laneIdx + TNL::Cuda::getWarpSize() * block.getWarpIdx();
		globalIdx < end;
		globalIdx += TNL::Cuda::getWarpSize() * block.getWarpsCount() )
		{
		result = reduce( result, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, -1, globalIdx, compute ) );
		//if( laneIdx == 0 )
		// printf( "LONG warpIdx: %d gid: %d begin: %d end: %d -> %d \n", ( int ) block.getWarpIdx(), globalIdx, begin, end,
		// details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, 0, globalIdx, compute ) );
		//result += values[i] * inVector[columnIndexes[i]];
		}
		//printf( "tid %d -> %d \n", tid, result );

		result += __shfl_down_sync(0xFFFFFFFF, result, 16);
		result += __shfl_down_sync(0xFFFFFFFF, result, 8);
		result += __shfl_down_sync(0xFFFFFFFF, result, 4);
		result += __shfl_down_sync(0xFFFFFFFF, result, 2);
		result += __shfl_down_sync(0xFFFFFFFF, result, 1);

		//if( laneIdx == 0 )
		// printf( "WARP RESULT: tid %d -> %d \n", tid, result );

		const Index warpID = threadIdx.x / 32;
		if( laneIdx == 0 )
		multivectorShared[ warpID ] = result;

		__syncthreads();
		// Reduction in multivectorShared
		if( block.getWarpIdx() == 0 && laneIdx < 16 )
		{
		constexpr int totalWarps = CudaBlockSize / warpSize;
		if( totalWarps >= 32 )
		{
		multivectorShared[ laneIdx ] = reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx + 16 ] );
		__syncwarp();
		}
		if( totalWarps >= 16 )
		{
		multivectorShared[ laneIdx ] = reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx + 8 ] );
		__syncwarp();
		}
		if( totalWarps >= 8 )
		{
		multivectorShared[ laneIdx ] = reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx + 4 ] );
		__syncwarp();
		}
		if( totalWarps >= 4 )
		{
		multivectorShared[ laneIdx ] = reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx + 2 ] );
		__syncwarp();
		}
		if( totalWarps >= 2 )
		{
		multivectorShared[ laneIdx ] = reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx + 1 ] );
		__syncwarp();
		}
		if( laneIdx == 0 )
		{
		//printf( "Long: segmentIdx %d -> %d \n", segmentIdx, multivectorShared[ 0 ] );
		keep( segmentIdx, multivectorShared[ 0 ] );
		}
		}
		}
		}
		#endif

		} // namespace Segments
		} // namespace Algorithms
		} // namespace TNL
		No newline at end of file

src/TNL/Algorithms/Segments/CSRView.h

+2 −2

Original line number	Diff line number	Diff line
		@@ -17,7 +17,7 @@
		#include <TNL/Algorithms/Segments/CSRScalarKernel.h>
		#include <TNL/Algorithms/Segments/CSRVectorKernel.h>
		#include <TNL/Algorithms/Segments/CSRHybridKernel.h>
		#include <TNL/Algorithms/Segments/CSRKernelAdaptive.h>
		#include <TNL/Algorithms/Segments/CSRAdaptiveKernel.h>

		namespace TNL {
		namespace Algorithms {
		@@ -146,7 +146,7 @@ using CSRViewHybrid = CSRView< Device, Index, CSRHybridKernel< Index, Device > >

		template< typename Device,
		typename Index >
		using CSRViewAdaptive = CSRView< Device, Index, CSRKernelAdaptive< Index, Device > >;
		using CSRViewAdaptive = CSRView< Device, Index, CSRAdaptiveKernel< Index, Device > >;

		template< typename Device,
		typename Index >