Adding CSR Light kernel. (93dd8e20) · Commits · TNL / tnl-dev

src/TNL/Algorithms/Segments/details/CSRKernels.h

0 → 100644

+280 −0

Original line number	Diff line number	Diff line
		/***************************************************************************
		CSRKernels.h - description
		-------------------
		begin : Jan 20, 2021 -> Joe Biden inauguration
		copyright : (C) 2021 by Tomas Oberhuber
		email : tomas.oberhuber@fjfi.cvut.cz
		***************************************************************************/

		/* See Copyright Notice in tnl/Copyright */

		#pragma once

		#include <TNL/Cuda/LaunchHelpers.h>
		#include <TNL/Containers/VectorView.h>
		#include <TNL/Algorithms/Segments/details/LambdaAdapter.h>

		namespace TNL {
		namespace Algorithms {
		namespace Segments {
		namespace details {


		#ifdef HAVE_CUDA
		template< typename Device,
		typename Index,
		typename Fetch,
		typename Reduction,
		typename ResultKeeper,
		typename Real,
		typename... Args >
		__global__
		void RowsReductionVectorKernel(
		int gridIdx,
		const TNL::Containers::VectorView< Index, TNL::Devices::Cuda, Index > offsets,
		Index first,
		Index last,
		Fetch& fetch,
		const Reduction& reduction,
		ResultKeeper& keeper,
		const Real& zero,
		Args... args )
		{
		/***
		* We map one warp to each segment
		*/
		const Index segmentIdx = TNL::Cuda::getGlobalThreadIdx( gridIdx ) / TNL::Cuda::getWarpSize() + first;
		if( segmentIdx >= last )
		return;

		const int laneIdx = threadIdx.x & 31; // & is cheaper than %
		Index endIdx = offsets[ segmentIdx + 1] ;

		Index localIdx( laneIdx );
		Real aux = zero;
		for( Index globalIdx = offsets[ segmentIdx ] + localIdx; i < endIdx; i += TNL::Cuda::getWarpSize() )
		{
		aux = reduce( aux, details::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx, globalIdx, compute ) );
		localIdx += TNL::Cuda::getWarpSize();
		}

		/****
		* Reduction in each warp which means in each segment.
		*/
		aux += __shfl_down_sync(0xFFFFFFFF, aux, 16);
		aux += __shfl_down_sync(0xFFFFFFFF, aux, 8);
		aux += __shfl_down_sync(0xFFFFFFFF, aux, 4);
		aux += __shfl_down_sync(0xFFFFFFFF, aux, 2);
		aux += __shfl_down_sync(0xFFFFFFFF, aux, 1);

		if( laneIdx == 0 )
		keeper( segmentIdx, aux )



		/const Index warpID = ((gridID MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x) / warpSize;
		if (warpID >= rows)
		return;

		Real result = 0.0;
		const Index laneID = threadIdx.x & 31; // & is cheaper than %
		Index endID = rowPointers[warpID + 1];

		// Calculate result
		for (Index i = rowPointers[warpID] + laneID; i < endID; i += warpSize)
		result += values[i] * inVector[columnIndexes[i]];

		// Reduction
		result += __shfl_down_sync(0xFFFFFFFF, result, 16);
		result += __shfl_down_sync(0xFFFFFFFF, result, 8);
		result += __shfl_down_sync(0xFFFFFFFF, result, 4);
		result += __shfl_down_sync(0xFFFFFFFF, result, 2);
		result += __shfl_down_sync(0xFFFFFFFF, result, 1);
		// Write result
		if (laneID == 0) outVector[warpID] = result;*/
		}
		#endif

		template< typename OffsetsView,
		typename Index,
		typename Fetch,
		typename Reduction,
		typename ResultKeeper,
		typename Real,
		typename... Args >
		void
		RowsReductionVectorKernelCaller(
		const OffsetsView& offsets,
		Index first,
		Index last,
		Fetch& fetch,
		const Reduction& reduction,
		ResultKeeper& keeper,
		const Real& zero,
		Args... args )
		{
		#ifdef HAVE_CUDA
		const Index warpsCount = last - first;
		const size_t threadsCount = warpsCount * TNL::Cuda::getWarpSize();
		dim3 blocksCount, gridsCount, blockSize( 256 );
		TNL::Cuda::setupThreads( blockSize, blocksCount, gridsCount, threadsCount );
		for( int gridIdx = 0; gridIdx < gridsCount.x; gridIdx ++ )
		{
		dim3 gridSize;
		setupGrid( blocksCount, gridsCount, gridIdx, gridSize );
		SpMVCSRVector< Index, Fetch, Redcution, ResultKeeper, Real, Args ><<< gridSize, blockSize >>>(
		gridIdx.x, offsets, first, last, fetch, reduction, keeper, zero, args );
		};

		#endif

		/*const Index threads = matrix.THREADS_VECTOR; // block size
		size_t neededThreads = matrix.getRowPointers().getSize() * warpSize;
		Index blocks;
		// Execute kernels on device
		for (Index grid = 0; neededThreads != 0; ++grid) {
		if (MAX_X_DIM * threads >= neededThreads) {
		blocks = roundUpDivision(neededThreads, threads);
		neededThreads = 0;
		} else {
		blocks = MAX_X_DIM;
		neededThreads -= MAX_X_DIM * threads;
		}

		SpMVCSRVector<Real, Index, warpSize><<<blocks, threads>>>(
		inVector,
		outVector,
		matrix.getRowPointers().getData(),
		matrix.getColumnIndexes().getData(),
		matrix.getValues().getData(),
		matrix.getRowPointers().getSize() - 1,
		grid
		);
		}*/
		}

		#ifdef HAVE_CUDA
		template< int ThreadsPerSegment,
		typename Device,
		typename Index,
		typename Fetch,
		typename Reduction,
		typename ResultKeeper,
		typename Real,
		typename... Args >
		__global__
		void RowsReductionLightKernel(
		int gridIdx,
		const TNL::Containers::VectorView< Index, TNL::Devices::Cuda, Index > offsets,
		Index first,
		Index last,
		Fetch& fetch,
		const Reduction& reduction,
		ResultKeeper& keeper,
		const Real& zero,
		Args... args )
		{
		/***
		* We map one warp to each segment
		*/
		const Index segmentIdx = TNL::Cuda::getGlobalThreadIdx( gridIdx ) / TNL::Cuda::getWarpSize() + first;
		if( segmentIdx >= last )
		return;

		const int laneIdx = threadIdx.x & ( ThreadsPerSegment - 1 ); // & is cheaper than %
		Index endIdx = offsets[ segmentIdx + 1] ;

		Index localIdx( laneIdx );
		Real aux = zero;
		for( Index globalIdx = offsets[ segmentIdx ] + localIdx; i < endIdx; i += ThreadsPerSegment )
		{
		aux = reduce( aux, details::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx, globalIdx, compute ) );
		localIdx += TNL::Cuda::getWarpSize();
		}

		/****
		* Reduction in each segment.
		*/
		if( ThreadsPerSegment == 32 )
		aux += __shfl_down_sync(0xFFFFFFFF, aux, 16);
		if( ThreadsPerSegment >= 16 )
		aux += __shfl_down_sync(0xFFFFFFFF, aux, 8);
		if( ThreadsPerSegment >= 8 )
		aux += __shfl_down_sync(0xFFFFFFFF, aux, 4);
		if( ThreadsPerSegment >= 4 )
		aux += __shfl_down_sync(0xFFFFFFFF, aux, 2);
		if( ThreadsPerSegment >= 2 )
		aux += __shfl_down_sync(0xFFFFFFFF, aux, 1);

		if( laneIdx == 0 )
		keeper( segmentIdx, aux )
		}
		#endif


		template< typename OffsetsView,
		typename Index,
		typename Fetch,
		typename Reduction,
		typename ResultKeeper,
		typename Real,
		typename... Args >
		void
		RowsReductionLightKernelCaller(
		const Index elementsInSegment,
		const OffsetsView& offsets,
		Index first,
		Index last,
		Fetch& fetch,
		const Reduction& reduction,
		ResultKeeper& keeper,
		const Real& zero,
		Args... args )
		{
		#ifdef HAVE_CUDA
		const int threadsPerSegment = TNL::min( std::pow( 2, std::floor( std::log2( elementInSegment ) ) ), TNL::Cuda::getWarpSize() );
		TNL::ASSERT_GE( threadsPerSegment, 0 );
		TNL::ASSERT_LE( threadsPerSegment, 32 );
		const size_t threadsCount = threadsPerSegment * ( last - first );
		dim3 blocksCount, gridsCount, blockSize( 256 );
		TNL::Cuda::setupThreads( blockSize, blocksCount, gridsCount, threadsCount );
		for( int gridIdx = 0; gridIdx < gridsCount.x; gridIdx ++ )
		{
		dim3 gridSize;
		setupGrid( blocksCount, gridsCount, gridIdx, gridSize );
		switch( threadsPerSegment )
		{
		case 1:
		SpMVCSRLight< 1, Index, Fetch, Redcution, ResultKeeper, Real, Args ><<< gridSize, blockSize >>>(
		gridIdx.x, offsets, first, last, fetch, reduction, keeper, zero, args );
		break;
		case 2:
		SpMVCSRLight< 2, Index, Fetch, Redcution, ResultKeeper, Real, Args ><<< gridSize, blockSize >>>(
		gridIdx.x, offsets, first, last, fetch, reduction, keeper, zero, args );
		break;
		case 4:
		SpMVCSRLight< 4, Index, Fetch, Redcution, ResultKeeper, Real, Args ><<< gridSize, blockSize >>>(
		gridIdx.x, offsets, first, last, fetch, reduction, keeper, zero, args );
		break;
		case 8:
		SpMVCSRLight< 8, Index, Fetch, Redcution, ResultKeeper, Real, Args ><<< gridSize, blockSize >>>(
		gridIdx.x, offsets, first, last, fetch, reduction, keeper, zero, args );
		break;
		case 16:
		SpMVCSRLight< 16, Index, Fetch, Redcution, ResultKeeper, Real, Args ><<< gridSize, blockSize >>>(
		gridIdx.x, offsets, first, last, fetch, reduction, keeper, zero, args );
		break;
		case 32:
		SpMVCSRLight< 32, Index, Fetch, Redcution, ResultKeeper, Real, Args ><<< gridSize, blockSize >>>(
		gridIdx.x, offsets, first, last, fetch, reduction, keeper, zero, args );
		break;
		default:
		throw std::runtime_error( "Wrong value of threadsPerSegment." );
		};
		#endif
		}

		} // namespace details
		} // namespace Segments
		} // namespace Algorithms
		} // namespace TNL