Implementing Light CSR kernel for segments. (45ad3fa7) · Commits · TNL / tnl-dev

src/TNL/Algorithms/Segments/CSR.h

+5 −1

Original line number	Diff line number	Diff line
		@@ -546,6 +546,11 @@ template< typename Device,
		typename IndexAllocator = typename Allocators::Default< Device >::template Allocator< Index > >
		using CSRHybrid = CSR< Device, Index, CSRHybridKernel< Index, Device >, IndexAllocator >;

		template< typename Device,
		typename Index,
		typename IndexAllocator = typename Allocators::Default< Device >::template Allocator< Index > >
		using CSRLight = CSR< Device, Index, CSRLightKernel< Index, Device >, IndexAllocator >;

		template< typename Device,
		typename Index,
		typename IndexAllocator = typename Allocators::Default< Device >::template Allocator< Index > >
		@@ -556,7 +561,6 @@ template< typename Device,
		typename IndexAllocator = typename Allocators::Default< Device >::template Allocator< Index > >
		using CSRDefault = CSRScalar< Device, Index, IndexAllocator >;


		} // namespace Segments
		} // namespace Algorithms
		} // namespace TNL

src/TNL/Algorithms/Segments/CSRView.h

+7 −1

Original line number	Diff line number	Diff line
		@@ -17,6 +17,7 @@
		#include <TNL/Algorithms/Segments/Kernels/CSRScalarKernel.h>
		#include <TNL/Algorithms/Segments/Kernels/CSRVectorKernel.h>
		#include <TNL/Algorithms/Segments/Kernels/CSRHybridKernel.h>
		#include <TNL/Algorithms/Segments/Kernels/CSRLightKernel.h>
		#include <TNL/Algorithms/Segments/Kernels/CSRAdaptiveKernel.h>
		#include <TNL/Algorithms/Segments/SegmentsPrinting.h>

		@@ -163,9 +164,14 @@ template< typename Device,
		typename Index >
		using CSRViewVector = CSRView< Device, Index, CSRVectorKernel< Index, Device > >;

		template< typename Device,
		typename Index,
		int ThreadsInBlock = 256 >
		using CSRViewHybrid = CSRView< Device, Index, CSRHybridKernel< Index, Device, ThreadsInBlock > >;

		template< typename Device,
		typename Index >
		using CSRViewHybrid = CSRView< Device, Index, CSRHybridKernel< Index, Device > >;
		using CSRViewLight = CSRView< Device, Index, CSRLightKernel< Index, Device > >;

		template< typename Device,
		typename Index >

src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.h

0 → 100644

+64 −0

Original line number	Diff line number	Diff line
		/***************************************************************************
		CSRLightKernel.h - description
		-------------------
		begin : Jun 9, 2021 -> Joe Biden inauguration
		copyright : (C) 2021 by Tomas Oberhuber
		email : tomas.oberhuber@fjfi.cvut.cz
		***************************************************************************/

		/* See Copyright Notice in tnl/Copyright */

		#pragma once

		#include <TNL/Assert.h>
		#include <TNL/Cuda/LaunchHelpers.h>
		#include <TNL/Containers/VectorView.h>
		#include <TNL/Algorithms/ParallelFor.h>
		#include <TNL/Algorithms/Segments/details/LambdaAdapter.h>

		namespace TNL {
		namespace Algorithms {
		namespace Segments {

		template< typename Index,
		typename Device >
		struct CSRLightKernel
		{
		using IndexType = Index;
		using DeviceType = Device;
		using ViewType = CSRLightKernel< Index, Device >;
		using ConstViewType = CSRLightKernel< Index, Device >;

		template< typename Offsets >
		void init( const Offsets& offsets );

		void reset();

		ViewType getView();

		ConstViewType getConstView() const;

		static TNL::String getKernelType();

		template< typename OffsetsView,
		typename Fetch,
		typename Reduction,
		typename ResultKeeper,
		typename Real >
		void reduceSegments( const OffsetsView& offsets,
		Index first,
		Index last,
		Fetch& fetch,
		const Reduction& reduction,
		ResultKeeper& keeper,
		const Real& zero ) const;

		protected:
		int threadsPerSegment = 0;
		};

		} // namespace Segments
		} // namespace Algorithms
		} // namespace TNL

		#include <TNL/Algorithms/Segments/Kernels/CSRLightKernel.hpp>

src/TNL/Algorithms/Segments/Kernels/CSRLightKernel.hpp

0 → 100644

+422 −0

Original line number	Diff line number	Diff line
		/***************************************************************************
		CSRLightKernel.hpp - description
		-------------------
		begin : Jun 9, 2021 -> Joe Biden inauguration
		copyright : (C) 2021 by Tomas Oberhuber
		email : tomas.oberhuber@fjfi.cvut.cz
		***************************************************************************/

		/* See Copyright Notice in tnl/Copyright */

		#pragma once

		#include <TNL/Assert.h>
		#include <TNL/Cuda/LaunchHelpers.h>
		#include <TNL/Containers/VectorView.h>
		#include <TNL/Algorithms/ParallelFor.h>
		#include <TNL/Algorithms/Segments/details/LambdaAdapter.h>
		#include <TNL/Algorithms/Segments/Kernels/CSRLightKernel.h>

		namespace TNL {
		namespace Algorithms {
		namespace Segments {

		#ifdef HAVE_CUDA
		template< typename Real,
		typename Index,
		typename OffsetsView,
		typename Fetch,
		typename Reduce,
		typename Keep >
		__global__
		void SpMVCSRLightWithoutAtomic2( OffsetsView offsets,
		const Index first,
		const Index last,
		Fetch fetch,
		Reduce reduce,
		Keep keep,
		const Real zero,
		const Index gridID)
		{
		const Index segmentIdx =
		first + ( ( gridID * TNL::Cuda::getMaxGridXSize() ) + (blockIdx.x * blockDim.x) + threadIdx.x ) / 2;
		if( segmentIdx >= last )
		return;

		const Index inGroupID = threadIdx.x & 1; // & is cheaper than %
		const Index maxID = offsets[ segmentIdx + 1];

		Real result = zero;
		for( Index i = offsets[segmentIdx] + inGroupID; i < maxID; i += 2)
		result = reduce( result, fetch( i, true ) );

		/* Parallel reduction */
		result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result, 1 ) );

		/* Write result */
		if( inGroupID == 0 )
		keep( segmentIdx, result );
		}

		template< typename Real,
		typename Index,
		typename OffsetsView,
		typename Fetch,
		typename Reduce,
		typename Keep >
		__global__
		void SpMVCSRLightWithoutAtomic4( OffsetsView offsets,
		const Index first,
		const Index last,
		Fetch fetch,
		Reduce reduce,
		Keep keep,
		const Real zero,
		const Index gridID )
		{
		const Index segmentIdx =
		first + ((gridID * TNL::Cuda::getMaxGridXSize() ) + (blockIdx.x * blockDim.x) + threadIdx.x) / 4;
		if (segmentIdx >= last)
		return;

		const Index inGroupID = threadIdx.x & 3; // & is cheaper than %
		const Index maxID = offsets[segmentIdx + 1];

		Real result = zero;
		for (Index i = offsets[segmentIdx] + inGroupID; i < maxID; i += 4)
		result = reduce( result, fetch( i, true ) );

		/* Parallel reduction */
		result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result, 2 ) );
		result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result, 1 ) );

		/* Write result */
		if( inGroupID == 0 )
		keep( segmentIdx, result );

		}

		template< typename Real,
		typename Index,
		typename OffsetsView,
		typename Fetch,
		typename Reduce,
		typename Keep >
		__global__
		void SpMVCSRLightWithoutAtomic8( OffsetsView offsets,
		const Index first,
		const Index last,
		Fetch fetch,
		Reduce reduce,
		Keep keep,
		const Real zero,
		const Index gridID)
		{
		const Index segmentIdx =
		first + ((gridID * TNL::Cuda::getMaxGridXSize() ) + (blockIdx.x * blockDim.x) + threadIdx.x) / 8;
		if (segmentIdx >= last)
		return;

		Index i;
		const Index inGroupID = threadIdx.x & 7; // & is cheaper than %
		const Index maxID = offsets[segmentIdx + 1];

		Real result = zero;
		for (i = offsets[segmentIdx] + inGroupID; i < maxID; i += 8)
		result = reduce( result, fetch( i, true ) );

		/* Parallel reduction */
		result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result, 4 ) );
		result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result, 2 ) );
		result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result, 1 ) );

		/* Write result */
		if( inGroupID == 0 )
		keep( segmentIdx, result );
		}

		template< typename Real,
		typename Index,
		typename OffsetsView,
		typename Fetch,
		typename Reduce,
		typename Keep >
		__global__
		void SpMVCSRLightWithoutAtomic16( OffsetsView offsets,
		const Index first,
		const Index last,
		Fetch fetch,
		Reduce reduce,
		Keep keep,
		const Real zero,
		const Index gridID )
		{
		const Index segmentIdx =
		first + ((gridID * TNL::Cuda::getMaxGridXSize() ) + (blockIdx.x * blockDim.x) + threadIdx.x ) / 16;
		if( segmentIdx >= last )
		return;


		Index i;
		const Index inGroupID = threadIdx.x & 15; // & is cheaper than %
		const Index maxID = offsets[segmentIdx + 1];

		Real result = zero;
		for( i = offsets[segmentIdx] + inGroupID; i < maxID; i += 16 )
		result = reduce( result, fetch( i, true ) );

		/* Parallel reduction */
		result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 8 ) );
		result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 4 ) );
		result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 2 ) );
		result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 1 ) );

		/* Write result */
		if( inGroupID == 0 )
		keep( segmentIdx, result );
		}

		template< typename Real,
		typename Index,
		typename OffsetsView,
		typename Fetch,
		typename Reduce,
		typename Keep >
		__global__
		void SpMVCSRVector( OffsetsView offsets,
		const Index first,
		const Index last,
		Fetch fetch,
		Reduce reduce,
		Keep keep,
		const Real zero,
		const Index gridID )
		{
		const int warpSize = 32;
		const Index warpID = first + ((gridID * TNL::Cuda::getMaxGridXSize() ) + (blockIdx.x * blockDim.x) + threadIdx.x) / warpSize;
		if (warpID >= last)
		return;

		Real result = zero;
		const Index laneID = threadIdx.x & 31; // & is cheaper than %
		Index endID = offsets[warpID + 1];

		/* Calculate result */
		for (Index i = offsets[warpID] + laneID; i < endID; i += warpSize)
		result = reduce( result, fetch( i, true ) );

		/* Reduction */
		result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result, 16 ) );
		result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result, 8 ) );
		result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result, 4 ) );
		result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result, 2 ) );
		result = reduce( result, __shfl_down_sync(0xFFFFFFFF, result, 1 ) );
		/* Write result */
		if( laneID == 0 )
		keep( warpID, result );
		}
		#endif
		template< typename Index,
		typename Device,
		typename Fetch,
		typename Reduce,
		typename Keep,
		bool DispatchScalarCSR =
		details::CheckFetchLambda< Index, Fetch >::hasAllParameters() \|\|
		std::is_same< Device, Devices::Host >::value >
		struct CSRLightKernelreduceSegmentsDispatcher;

		template< typename Index,
		typename Device,
		typename Fetch,
		typename Reduction,
		typename ResultKeeper >
		struct CSRLightKernelreduceSegmentsDispatcher< Index, Device, Fetch, Reduction, ResultKeeper, true >
		{

		template< typename Offsets,
		typename Real >
		static void reduce( const Offsets& offsets,
		Index first,
		Index last,
		Fetch& fetch,
		const Reduction& reduce,
		ResultKeeper& keep,
		const Real& zero,
		const Index threadsPerSegment )
		{
		TNL::Algorithms::Segments::CSRScalarKernel< Index, Device >::
		reduceSegments( offsets, first, last, fetch, reduce, keep, zero );
		}
		};

		template< typename Index,
		typename Device,
		typename Fetch,
		typename Reduce,
		typename Keep >
		struct CSRLightKernelreduceSegmentsDispatcher< Index, Device, Fetch, Reduce, Keep, false >
		{
		template< typename OffsetsView,
		typename Real >
		static void reduce( const OffsetsView& offsets,
		Index first,
		Index last,
		Fetch& fetch,
		const Reduce& reduce,
		Keep& keep,
		const Real& zero,
		const Index threadsPerSegment )
		{
		#ifdef HAVE_CUDA
		const int threads = 128;
		Index blocks, groupSize;
		//if (KernelType == CSRLightWithoutAtomic)
		int neededThreads = threadsPerSegment * ( last - first );
		//else
		// neededThreads = rows * (threadsPerSegment > 32 ? 32 : threadsPerSegment);

		/* Execute kernels on device */
		for (Index grid = 0; neededThreads != 0; ++grid)
		{
		if( TNL::Cuda::getMaxGridXSize() * threads >= neededThreads)
		{
		blocks = roundUpDivision(neededThreads, threads);
		neededThreads = 0;
		}
		else
		{
		blocks = TNL::Cuda::getMaxGridXSize();
		neededThreads -= TNL::Cuda::getMaxGridXSize() * threads;
		}

		if (threadsPerSegment == 2)
		SpMVCSRLightWithoutAtomic2<Real, Index, OffsetsView, Fetch, Reduce, Keep ><<<blocks, threads>>>(
		offsets, first, last, fetch, reduce, keep, zero, grid );
		else if (threadsPerSegment == 4)
		SpMVCSRLightWithoutAtomic4<Real, Index, OffsetsView, Fetch, Reduce, Keep ><<<blocks, threads>>>(
		offsets, first, last, fetch, reduce, keep, zero, grid );
		else if (threadsPerSegment == 8)
		SpMVCSRLightWithoutAtomic8<Real, Index, OffsetsView, Fetch, Reduce, Keep ><<<blocks, threads>>>(
		offsets, first, last, fetch, reduce, keep, zero, grid );
		else if (threadsPerSegment == 16)
		SpMVCSRLightWithoutAtomic16<Real, Index, OffsetsView, Fetch, Reduce, Keep ><<<blocks, threads>>>(
		offsets, first, last, fetch, reduce, keep, zero, grid );
		else // if (threadsPerSegment == 32)
		{ // CSR SpMV Light with threadsPerSegment = 32 is CSR Vector
		SpMVCSRVector<Real, Index, OffsetsView, Fetch, Reduce, Keep, warpSize ><<<blocks, threads>>>(
		offsets, first, last, fetch, reduce, keep, zero, grid );
		}
		/*else
		{ // Execute CSR MultiVector
		SpMVCSRMultiVector<Real, Index, warpSize><<<blocks, threads>>>(
		inVector, outVector, matrix.getoffsets().getData(),
		matrix.getColumnIndexes().getData(), matrix.getValues().getData(),
		rows, threadsPerSegment / 32, grid
		);
		}*/

		}
		#endif

		}
		};


		template< typename Index,
		typename Device >
		template< typename Offsets >
		void
		CSRLightKernel< Index, Device >::
		init( const Offsets& offsets )
		{
		//const Index elementsInSegment = std::ceil( ( double ) offsets.getElement( segmentsCount ) / ( double ) segmentsCount );
		//this->threadsPerSegment = TNL::min( std::pow( 2, std::ceil( std::log2( elementsInSegment ) ) ) ); //TNL::Cuda::getWarpSize() );

		const Index segmentsCount = offsets.getSize() - 1;
		//const Index threads = 128; // !!!!!!!!!!!!!!!!!!!!!! block size
		size_t neededThreads = segmentsCount * 32;//warpSize;
		Index blocks, threadsPerSegment;

		const Index elementsInSegment = roundUpDivision( offsets.getElement( segmentsCount ), segmentsCount ); // non zeroes per row
		if( elementsInSegment <= 2 )
		threadsPerSegment = 2;
		else if( elementsInSegment <= 4 )
		threadsPerSegment = 4;
		else if( elementsInSegment <= 8 )
		threadsPerSegment = 8;
		else if( elementsInSegment <= 16 )
		threadsPerSegment = 16;
		else //if (nnz <= 2 * matrix.MAX_ELEMENTS_PER_WARP)
		threadsPerSegment = 32; // CSR Vector
		//else
		// threadsPerSegment = roundUpDivision(nnz, matrix.MAX_ELEMENTS_PER_WARP) * 32; // CSR MultiVector

		TNL_ASSERT_GE( threadsPerSegment, 0, "" );
		TNL_ASSERT_LE( threadsPerSegment, 33, "" );

		}

		template< typename Index,
		typename Device >
		void
		CSRLightKernel< Index, Device >::
		reset()
		{
		this->threadsPerSegment = 0;
		}

		template< typename Index,
		typename Device >
		auto
		CSRLightKernel< Index, Device >::
		getView() -> ViewType
		{
		return *this;
		}

		template< typename Index,
		typename Device >
		TNL::String
		CSRLightKernel< Index, Device >::
		getKernelType()
		{
		return "Light";
		}

		template< typename Index,
		typename Device >
		auto
		CSRLightKernel< Index, Device >::
		getConstView() const -> ConstViewType
		{
		return *this;
		};


		template< typename Index,
		typename Device >
		template< typename OffsetsView,
		typename Fetch,
		typename Reduce,
		typename Keep,
		typename Real >
		void
		CSRLightKernel< Index, Device >::
		reduceSegments( const OffsetsView& offsets,
		Index first,
		Index last,
		Fetch& fetch,
		const Reduce& reduce,
		Keep& keep,
		const Real& zero ) const
		{
		TNL_ASSERT_GE( this->threadsPerSegment, 0, "" );
		TNL_ASSERT_LE( this->threadsPerSegment, 33, "" );
		CSRLightKernelreduceSegmentsDispatcher< Index, Device, Fetch, Reduce, Keep >::reduce(
		offsets, first, last, fetch, reduce, keep, zero, this->threadsPerSegment );
		}

		} // namespace Segments
		} // namespace Algorithms
		} // namespace TNL

src/UnitTests/Matrices/CMakeLists.txt

+2 −0

Original line number	Diff line number	Diff line
		@@ -8,6 +8,7 @@ set( COMMON_TESTS
		SparseMatrixTest_CSRScalar
		SparseMatrixTest_CSRVector
		SparseMatrixTest_CSRHybrid
		SparseMatrixTest_CSRLight
		SparseMatrixTest_CSRAdaptive
		SparseMatrixTest_Ellpack
		SparseMatrixTest_SlicedEllpack
		@@ -16,6 +17,7 @@ set( COMMON_TESTS
		SparseMatrixVectorProductTest_CSRScalar
		SparseMatrixVectorProductTest_CSRVector
		SparseMatrixVectorProductTest_CSRHybrid
		SparseMatrixVectorProductTest_CSRLight
		SparseMatrixVectorProductTest_CSRAdaptive
		SparseMatrixVectorProductTest_Ellpack
		SparseMatrixVectorProductTest_SlicedEllpack