Adding Adaptive CSR kernel. (536a6526) · Commits · TNL / tnl-dev

src/TNL/Algorithms/Segments/CSR.h

+5 −0

Original line number	Diff line number	Diff line
		@@ -145,6 +145,11 @@ template< typename Device,
		typename IndexAllocator = typename Allocators::Default< Device >::template Allocator< Index > >
		using CSRHybrid = CSR< Device, Index, CSRKernelHybrid< Index, Device >, IndexAllocator >;

		template< typename Device,
		typename Index,
		typename IndexAllocator = typename Allocators::Default< Device >::template Allocator< Index > >
		using CSRAdaptive = CSR< Device, Index, CSRKernelAdaptive< Index, Device >, IndexAllocator >;

		template< typename Device,
		typename Index,
		typename IndexAllocator = typename Allocators::Default< Device >::template Allocator< Index > >

src/TNL/Algorithms/Segments/CSRKernels.h→src/TNL/Algorithms/Segments/CSRKernelAdaptive.h

+329 −0

Original line number	Diff line number	Diff line
		@@ -20,15 +20,136 @@ namespace TNL {
		namespace Algorithms {
		namespace Segments {

		enum class Type {
		/* LONG = 0!!! Non zero value rewrites index[1] */
		LONG = 0,
		STREAM = 1,
		VECTOR = 2
		};

		template<typename Index>
		union Block {
		Block(Index row, Type type = Type::VECTOR, Index index = 0) noexcept {
		this->index[0] = row;
		this->index[1] = index;
		this->byte[sizeof(Index) == 4 ? 7 : 15] = (uint8_t)type;
		}

		Block(Index row, Type type, Index nextRow, Index maxID, Index minID) noexcept {
		this->index[0] = row;
		this->index[1] = 0;
		this->twobytes[sizeof(Index) == 4 ? 2 : 4] = maxID - minID;

		if (type == Type::STREAM)
		this->twobytes[sizeof(Index) == 4 ? 3 : 5] = nextRow - row;

		if (type == Type::STREAM)
		this->byte[sizeof(Index) == 4 ? 7 : 15] \|= 0b1000000;
		else if (type == Type::VECTOR)
		this->byte[sizeof(Index) == 4 ? 7 : 15] \|= 0b10000000;
		}

		Block() = default;

		Index index[2]; // index[0] is row pointer, index[1] is index in warp
		uint8_t byte[sizeof(Index) == 4 ? 8 : 16]; // byte[7/15] is type specificator
		uint16_t twobytes[sizeof(Index) == 4 ? 4 : 8]; //twobytes[2/4] is maxID - minID
		//twobytes[3/5] is nextRow - row
		};

		#ifdef HAVE_CUDA

		template< typename Real,
		typename Index,
		int warpSize,
		int WARPS,
		int SHARED_PER_WARP,
		int MAX_ELEM_PER_WARP >
		__global__
		void SpMVCSRAdaptive( const Real *inVector,
		Real *outVector,
		const Index* rowPointers,
		const Index* columnIndexes,
		const Real* values,
		const Block<Index> *blocks,
		Index blocksSize,
		Index gridID) {
		__shared__ Real shared[WARPS][SHARED_PER_WARP];
		const Index index = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x;
		const Index blockIdx = index / warpSize;
		if (blockIdx >= blocksSize)
		return;

		Real result = 0.0;
		const Index laneID = threadIdx.x & 31; // & is cheaper than %
		Block<Index> block = blocks[blockIdx];
		const Index minID = rowPointers[block.index[0]/* minRow */];
		Index i, to, maxID;
		if (block.byte[sizeof(Index) == 4 ? 7 : 15] & 0b1000000) {
		/////////////////////////////////////* CSR STREAM *//////////////
		const Index warpID = threadIdx.x / 32;
		maxID = minID + /* maxID - minID */block.twobytes[sizeof(Index) == 4 ? 2 : 4];

		/* Stream data to shared memory */
		for (i = laneID + minID; i < maxID; i += warpSize)
		shared[warpID][i - minID] = values[i] * inVector[columnIndexes[i]];

		const Index maxRow = block.index[0]/* minRow */ +
		/* maxRow - minRow */(block.twobytes[sizeof(Index) == 4 ? 3 : 5] & 0x3FFF);
		/* Calculate result */
		for (i = block.index[0]/* minRow */ + laneID; i < maxRow; i += warpSize) {
		to = rowPointers[i + 1] - minID; // end of preprocessed data
		result = 0;
		/* Scalar reduction */
		for (Index sharedID = rowPointers[i] - minID; sharedID < to; ++sharedID)
		result += shared[warpID][sharedID];

		outVector[i] = result; // Write result
		}
		} else if (block.byte[sizeof(Index) == 4 ? 7 : 15] & 0b10000000) {
		/////////////////////////////////////* CSR VECTOR *//////////////
		maxID = minID + /* maxID - minID */block.twobytes[sizeof(Index) == 4 ? 2 : 4];

		for (i = minID + laneID; i < maxID; i += warpSize)
		result += values[i] * inVector[columnIndexes[i]];

		/* Parallel reduction */
		result += __shfl_down_sync(0xFFFFFFFF, result, 16);
		result += __shfl_down_sync(0xFFFFFFFF, result, 8);
		result += __shfl_down_sync(0xFFFFFFFF, result, 4);
		result += __shfl_down_sync(0xFFFFFFFF, result, 2);
		result += __shfl_down_sync(0xFFFFFFFF, result, 1);
		if (laneID == 0) outVector[block.index[0]/* minRow */] = result; // Write result
		} else {
		/////////////////////////////////////* CSR VECTOR L */////////////
		/* Number of elements processed by previous warps */
		const Index offset = block.index[1]/* warpInRow / MAX_ELEM_PER_WARP;
		to = minID + (block.index[1]/* warpInRow / + 1) MAX_ELEM_PER_WARP;
		maxID = rowPointers[block.index[0]/* minRow */ + 1];
		if (to > maxID) to = maxID;
		for (i = minID + offset + laneID; i < to; i += warpSize)
		result += values[i] * inVector[columnIndexes[i]];

		/* Parallel reduction */
		result += __shfl_down_sync(0xFFFFFFFF, result, 16);
		result += __shfl_down_sync(0xFFFFFFFF, result, 8);
		result += __shfl_down_sync(0xFFFFFFFF, result, 4);
		result += __shfl_down_sync(0xFFFFFFFF, result, 2);
		result += __shfl_down_sync(0xFFFFFFFF, result, 1);
		if (laneID == 0) atomicAdd(&outVector[block.index[0]/* minRow */], result);
		}
		}
		#endif


		template< typename Index,
		typename Device >
		struct CSRAdaptiveKernelView
		struct CSRKernelAdaptiveView
		{
		using IndexType = Index;
		using DeviceType = Device;
		using ViewType = CSRAdaptiveKernelView< Index, Device >;
		using ConstViewType = CSRAdaptiveKernelView< Index, Device >;
		using ViewType = CSRKernelAdaptiveView< Index, Device >;
		using ConstViewType = CSRKernelAdaptiveView< Index, Device >;

		ViewType getView() { return *this; };

		@@ -49,22 +170,96 @@ struct CSRAdaptiveKernelView
		const Real& zero,
		Args... args ) const
		{

		Index blocks;
		const Index threads = matrix.THREADS_ADAPTIVE;

		/* Fill blocks */
		size_t neededThreads = matrix.blocks.getSize() * warpSize; // one warp per block
		/* Execute kernels on device */
		for (Index grid = 0; neededThreads != 0; ++grid) {
		if (MAX_X_DIM * threads >= neededThreads) {
		blocks = roundUpDivision(neededThreads, threads);
		neededThreads = 0;
		} else {
		blocks = MAX_X_DIM;
		neededThreads -= MAX_X_DIM * threads;
		}

		SpMVCSRAdaptive< Real, Index, warpSize,
		matrix.WARPS,
		matrix.SHARED_PER_WARP,
		matrix.MAX_ELEMENTS_PER_WARP_ADAPT >
		<<<blocks, threads>>>(
		inVector,
		outVector,
		matrix.getRowPointers().getData(),
		matrix.getColumnIndexes().getData(),
		matrix.getValues().getData(),
		matrix.blocks.getData(),
		matrix.blocks.getSize() - 1, // last block shouldn't be used
		grid
		);
		}
		}
		};

		template< typename Index,
		typename Device >
		struct CSRAdaptiveKernel
		struct CSRKernelAdaptive
		{
		using IndexType = Index;
		using DeviceType = Device;
		using ViewType = CSRAdaptiveKernel< Index, Device >;
		using ConstViewType = CSRAdaptiveKernel< Index, Device >;
		using ViewType = CSRKernelAdaptiveView< Index, Device >;
		using ConstViewType = CSRKernelAdaptiveView< Index, Device >;

		static constexpr Index THREADS_ADAPTIVE = sizeof(Index) == 8 ? 128 : 256;

		/* How many shared memory use per block in CSR Adaptive kernel */
		static constexpr Index SHARED_PER_BLOCK = 24576;

		/* Number of elements in shared memory */
		static constexpr Index SHARED = SHARED_PER_BLOCK/sizeof(double);

		/* Number of warps in block for CSR Adaptive */
		static constexpr Index WARPS = THREADS_ADAPTIVE / 32;

		/* Number of elements in shared memory per one warp */
		static constexpr Index SHARED_PER_WARP = SHARED / WARPS;

		template< typename Offsets >
		Index findLimit(const Index start,
		const Offsets& offsets,
		const Index size,
		Type &type,
		Index &sum) {
		sum = 0;
		for (Index current = start; current < size - 1; ++current) {
		Index elements = offsets.getElement(current + 1) -
		offsets.getElement(current);
		sum += elements;
		if (sum > matrix.SHARED_PER_WARP) {
		if (current - start > 0) { // extra row
		type = Type::STREAM;
		return current;
		} else { // one long row
		if (sum <= 2 * matrix.MAX_ELEMENTS_PER_WARP_ADAPT)
		type = Type::VECTOR;
		else
		type = Type::LONG;
		return current + 1;
		}
		}
		}

		type = Type::STREAM;
		return size - 1; // return last row pointer
		}

		template< typename Offsets >
		void init( const Offsets& offsets )
		{
		/*const Index rows = offsets.getSize();
		const Index rows = offsets.getSize();
		Index sum, start = 0, nextStart = 0;

		// Fill blocks
		@@ -100,7 +295,6 @@ struct CSRAdaptiveKernel
		this->blocks.setSize(inBlock.size());
		for (size_t i = 0; i < inBlock.size(); ++i)
		this->blocks.setElement(i, inBlock[i]);
		*/
		};

		ViewType getView() { return view; };

src/TNL/Algorithms/Segments/CSRView.h

+5 −1

Original line number	Diff line number	Diff line
		@@ -17,7 +17,7 @@
		#include <TNL/Algorithms/Segments/CSRKernelScalar.h>
		#include <TNL/Algorithms/Segments/CSRKernelVector.h>
		#include <TNL/Algorithms/Segments/CSRKernelHybrid.h>
		#include <TNL/Algorithms/Segments/CSRKernels.h>
		#include <TNL/Algorithms/Segments/CSRKernelAdaptive.h>

		namespace TNL {
		namespace Algorithms {
		@@ -144,6 +144,10 @@ template< typename Device,
		typename Index >
		using CSRViewHybrid = CSRView< Device, Index, CSRKernelHybrid< Index, Device > >;

		template< typename Device,
		typename Index >
		using CSRViewAdaptive = CSRView< Device, Index, CSRKernelAdaptive< Index, Device > >;

		template< typename Device,
		typename Index >
		using CSRViewDefault = CSRViewScalar< Device, Index >;

src/UnitTests/Matrices/CMakeLists.txt

+1 −0

Original line number	Diff line number	Diff line
		@@ -9,6 +9,7 @@ set( COMMON_TESTS
		SparseMatrixTest_CSRScalar
		SparseMatrixTest_CSRVector
		SparseMatrixTest_CSRHybrid
		SparseMatrixTest_CSRAdaptive
		SparseMatrixTest_Ellpack
		SparseMatrixTest_SlicedEllpack
		SparseMatrixTest_ChunkedEllpack

src/UnitTests/Matrices/SparseMatrixTest_CSRAdaptive.cpp

0 → 100644

+11 −0

Original line number	Diff line number	Diff line
		/***************************************************************************
		SparseMatrixTest_CSRHybrid.cpp - description
		-------------------
		begin : Jan 23, 2021
		copyright : (C) 2021 by Tomas Oberhuber et al.
		email : tomas.oberhuber@fjfi.cvut.cz
		***************************************************************************/

		/* See Copyright Notice in tnl/Copyright */

		#include "SparseMatrixTest_CSRHybrid.h"