Debuging CSR Adaptive kernel. (316819ca) · Commits · TNL / tnl-dev

src/TNL/Algorithms/Segments/CSRKernelAdaptive.h

+318 −128

Original line number	Diff line number	Diff line
		@@ -15,6 +15,7 @@
		#include <TNL/Containers/VectorView.h>
		#include <TNL/Algorithms/ParallelFor.h>
		#include <TNL/Algorithms/Segments/details/LambdaAdapter.h>
		#include <TNL/Algorithms/Segments/CSRKernelScalar.h>

		namespace TNL {
		namespace Algorithms {
		@@ -28,14 +29,17 @@ enum class Type {
		};

		template< typename Index >
		union Block {
		Block(Index row, Type type = Type::VECTOR, Index index = 0) noexcept {
		union Block
		{
		Block(Index row, Type type = Type::VECTOR, Index index = 0) noexcept
		{
		this->index[0] = row;
		this->index[1] = index;
		this->byte[sizeof(Index) == 4 ? 7 : 15] = (uint8_t)type;
		}

		Block(Index row, Type type, Index nextRow, Index maxID, Index minID) noexcept {
		Block(Index row, Type type, Index nextRow, Index maxID, Index minID) noexcept
		{
		this->index[0] = row;
		this->index[1] = 0;
		this->twobytes[sizeof(Index) == 4 ? 2 : 4] = maxID - minID;
		@@ -51,93 +55,177 @@ union Block {

		Block() = default;

		Type getType() const
		{
		if( byte[ sizeof( Index ) == 4 ? 7 : 15 ] & 0b1000000 )
		return Type::STREAM;
		if( byte[ sizeof( Index ) == 4 ? 7 : 15 ] & 0b10000000 )
		return Type::VECTOR;
		return Type::LONG;
		}

		Index getFirstRow() const
		{
		return index[ 0 ];
		}

		Index getRowsInBlock() const
		{
		return twobytes[ sizeof(Index) == 4 ? 2 : 4 ];
		}

		void print( std::ostream& str ) const
		{
		Type type = this->getType();
		str << "Type: ";
		switch( type )
		{
		case Type::STREAM:
		str << " Stream ";
		break;
		case Type::VECTOR:
		str << " Vector ";
		break;
		case Type::LONG:
		str << " Long ";
		break;
		}
		str << " first row: " << getFirstRow();
		str << " rows per block: " << getRowsInBlock();
		str << " index in warp: " << index[ 1 ];
		}
		Index index[2]; // index[0] is row pointer, index[1] is index in warp
		uint8_t byte[sizeof(Index) == 4 ? 8 : 16]; // byte[7/15] is type specificator
		uint16_t twobytes[sizeof(Index) == 4 ? 4 : 8]; //twobytes[2/4] is maxID - minID
		//twobytes[3/5] is nextRow - row
		};

		template< typename Index >
		std::ostream& operator<< ( std::ostream& str, const Block< Index >& block )
		{
		block.print( str );
		return str;
		}

		#ifdef HAVE_CUDA

		template< typename Real,
		typename Index,
		int warpSize,
		template< int warpSize,
		int WARPS,
		int SHARED_PER_WARP,
		int MAX_ELEM_PER_WARP >
		__global__
		void SpMVCSRAdaptive( const Real *inVector,
		Real *outVector,
		const Index* rowPointers,
		const Index* columnIndexes,
		const Real* values,
		const Block<Index> *blocks,
		int MAX_ELEM_PER_WARP,
		typename Offsets,
		typename Index,
		typename Fetch,
		typename Reduction,
		typename ResultKeeper,
		typename Real,
		typename... Args >
		__global__ void
		segmentsReductionCSRAdaptiveKernel( const Block< Index > *blocks,
		Index blocksSize,
		Index gridID) {
		int gridIdx,
		Offsets offsets,
		Index first,
		Index last,
		Fetch fetch,
		Reduction reduce,
		ResultKeeper keep,
		Real zero,
		Args... args )
		{
		__shared__ Real shared[WARPS][SHARED_PER_WARP];
		const Index index = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x;
		constexpr size_t MAX_X_DIM = 2147483647;
		const Index index = (gridIdx * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x;
		const Index blockIdx = index / warpSize;
		if (blockIdx >= blocksSize)
		return;

		Real result = 0.0;
		Real result = zero;
		bool compute( true );
		const Index laneID = threadIdx.x & 31; // & is cheaper than %
		Block<Index> block = blocks[blockIdx];
		const Index minID = rowPointers[block.index[0]/* minRow */];
		const Index minID = offsets[block.index[0]/* minRow */];
		Index i, to, maxID;
		if (block.byte[sizeof(Index) == 4 ? 7 : 15] & 0b1000000) {
		/////////////////////////////////////* CSR STREAM *//////////////

		if (block.byte[sizeof(Index) == 4 ? 7 : 15] & 0b1000000)
		{
		/****
		* CSR Stream: Copy first all data into shared memory
		*/

		const Index warpID = threadIdx.x / 32;
		maxID = minID + /* maxID - minID */block.twobytes[sizeof(Index) == 4 ? 2 : 4];

		/* Stream data to shared memory */
		for (i = laneID + minID; i < maxID; i += warpSize)
		shared[warpID][i - minID] = values[i] * inVector[columnIndexes[i]];
		for( Index globalIdx = laneID + minID; globalIdx < maxID; globalIdx += warpSize )
		{
		shared[warpID][i - minID] = //fetch( globalIdx, compute );
		details::FetchLambdaAdapter< Index, Fetch >::call( fetch, -1, -1, globalIdx, compute );
		printf( "Stream: Fetch at %d -> %f \n", globalIdx, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, -1, -1, globalIdx, compute ) );
		// TODO:: fix this
		//values[i] * inVector[columnIndexes[i]];
		}

		const Index maxRow = block.index[0]/* minRow */ +
		/* maxRow - minRow */(block.twobytes[sizeof(Index) == 4 ? 3 : 5] & 0x3FFF);
		/* Calculate result */
		for (i = block.index[0]/* minRow */ + laneID; i < maxRow; i += warpSize) {
		to = rowPointers[i + 1] - minID; // end of preprocessed data
		result = 0;
		for (i = block.index[0]/* minRow */ + laneID; i < maxRow; i += warpSize)
		{
		to = offsets[i + 1] - minID; // end of preprocessed data
		result = zero;
		/* Scalar reduction */
		for (Index sharedID = rowPointers[i] - minID; sharedID < to; ++sharedID)
		result += shared[warpID][sharedID];
		for( Index sharedID = offsets[ i ] - minID; sharedID < to; ++sharedID)
		result = reduce( result, shared[warpID][sharedID] );

		outVector[i] = result; // Write result
		printf( "Stream: threadIdx = %d result for segment %d is %f \n", threadIdx, i, result );
		keep( i, result );
		//outVector[i] = result; // Write result
		}
		} else if (block.byte[sizeof(Index) == 4 ? 7 : 15] & 0b10000000) {
		}
		else //if (block.byte[sizeof(Index) == 4 ? 7 : 15] & 0b10000000)
		{
		printf( "Vector: threadIdx = %d \n", threadIdx );
		/////////////////////////////////////* CSR VECTOR *//////////////
		maxID = minID + /* maxID - minID */block.twobytes[sizeof(Index) == 4 ? 2 : 4];
		const Index segmentIdx = block.index[0];

		for (i = minID + laneID; i < maxID; i += warpSize)
		result += values[i] * inVector[columnIndexes[i]];
		for( Index globalIdx = minID + laneID; globalIdx < maxID; globalIdx += warpSize )
		result = reduce( result, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, -1, globalIdx, compute ) ); // fix local idx
		//values[i] * inVector[columnIndexes[i]];

		/* Parallel reduction */
		result += __shfl_down_sync(0xFFFFFFFF, result, 16);
		result += __shfl_down_sync(0xFFFFFFFF, result, 8);
		result += __shfl_down_sync(0xFFFFFFFF, result, 4);
		result += __shfl_down_sync(0xFFFFFFFF, result, 2);
		result += __shfl_down_sync(0xFFFFFFFF, result, 1);
		if (laneID == 0) outVector[block.index[0]/* minRow */] = result; // Write result
		} else {
		/////////////////////////////////////* CSR VECTOR L */////////////
		/* Number of elements processed by previous warps */
		const Index offset = block.index[1]/* warpInRow / MAX_ELEM_PER_WARP;
		to = minID + (block.index[1]/* warpInRow / + 1) MAX_ELEM_PER_WARP;
		maxID = rowPointers[block.index[0]/* minRow */ + 1];
		if (to > maxID) to = maxID;
		for (i = minID + offset + laneID; i < to; i += warpSize)
		result += values[i] * inVector[columnIndexes[i]];
		result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 16 ) );
		result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 8 ) );
		result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 4 ) );
		result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 2 ) );
		result = reduce( result, __shfl_down_sync( 0xFFFFFFFF, result, 1 ) );
		if( laneID == 0 )
		{
		printf( "Vector: threadIdx = %d result for segment %d is %f \n", threadIdx, i, result );
		keep( segmentIdx, result );
		//outVector[block.index[0]/* minRow */] = result; // Write result
		}
		}/*
		else
		{
		///////////////////////////////////// CSR VECTOR L /////////////
		// Number of elements processed by previous warps
		const Index offset = block.index[1] * MAX_ELEM_PER_WARP;
		to = minID + (block.index[1] + 1) * MAX_ELEM_PER_WARP;
		maxID = offsets[block.index[0] + 1];
		if( to > maxID )
		to = maxID;
		for( Index globalIdx = minID + offset + laneID; globalIdx < to; globalIdx += warpSize )
		result = reduce( result, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, localIdx, globalIdx, compute ) );
		//result += values[i] * inVector[columnIndexes[i]];

		/* Parallel reduction */
		result += __shfl_down_sync(0xFFFFFFFF, result, 16);
		result += __shfl_down_sync(0xFFFFFFFF, result, 8);
		result += __shfl_down_sync(0xFFFFFFFF, result, 4);
		result += __shfl_down_sync(0xFFFFFFFF, result, 2);
		result += __shfl_down_sync(0xFFFFFFFF, result, 1);
		if (laneID == 0) atomicAdd(&outVector[block.index[0]/* minRow */], result);
		}
		if (laneID == 0) atomicAdd(&outVector[block.index[0] ], result);
		}*/
		}
		#endif

		@@ -150,6 +238,20 @@ struct CSRKernelAdaptiveView
		using DeviceType = Device;
		using ViewType = CSRKernelAdaptiveView< Index, Device >;
		using ConstViewType = CSRKernelAdaptiveView< Index, Device >;
		using BlocksType = TNL::Containers::Vector< Block< Index >, Device, Index >;
		using BlocksView = typename BlocksType::ViewType;

		CSRKernelAdaptiveView() = default;

		CSRKernelAdaptiveView( BlocksType& blocks )
		{
		this->blocks.bind( blocks );
		};

		void setBlocks( BlocksType& blocks )
		{
		this->blocks.bind( blocks );
		}

		ViewType getView() { return *this; };

		@@ -170,38 +272,102 @@ struct CSRKernelAdaptiveView
		const Real& zero,
		Args... args ) const
		{
		#ifdef HAVE_CUDA
		if( details::CheckFetchLambda< Index, Fetch >::hasAllParameters() )
		{
		TNL::Algorithms::Segments::CSRKernelScalar< Index, Device >::
		segmentsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... );
		return;
		}

		this->printBlocks();
		static constexpr Index THREADS_ADAPTIVE = sizeof(Index) == 8 ? 128 : 256;
		//static constexpr Index THREADS_SCALAR = 128;
		static constexpr Index THREADS_VECTOR = 128;
		static constexpr Index THREADS_LIGHT = 128;

		/* Max length of row to process one warp for CSR Light, MultiVector */
		static constexpr Index MAX_ELEMENTS_PER_WARP = 384;

		Index blocks;
		const Index threads = matrix.THREADS_ADAPTIVE;
		/* Max length of row to process one warp for CSR Adaptive */
		static constexpr Index MAX_ELEMENTS_PER_WARP_ADAPT = 512;

		/* How many shared memory use per block in CSR Adaptive kernel */
		static constexpr Index SHARED_PER_BLOCK = 24576;

		/* Number of elements in shared memory */
		static constexpr Index SHARED = SHARED_PER_BLOCK/sizeof(Real);

		/* Number of warps in block for CSR Adaptive */
		static constexpr Index WARPS = THREADS_ADAPTIVE / 32;

		/* Number of elements in shared memory per one warp */
		static constexpr Index SHARED_PER_WARP = SHARED / WARPS;

		constexpr int warpSize = 32;

		Index blocksCount;

		const Index threads = THREADS_ADAPTIVE;
		constexpr size_t MAX_X_DIM = 2147483647;

		/* Fill blocks */
		size_t neededThreads = matrix.blocks.getSize() * warpSize; // one warp per block
		size_t neededThreads = blocks.getSize() * warpSize; // one warp per block
		/* Execute kernels on device */
		for (Index grid = 0; neededThreads != 0; ++grid) {
		if (MAX_X_DIM * threads >= neededThreads) {
		blocks = roundUpDivision(neededThreads, threads);
		for (Index gridIdx = 0; neededThreads != 0; gridIdx++ )
		{
		if (MAX_X_DIM * threads >= neededThreads)
		{
		blocksCount = roundUpDivision(neededThreads, threads);
		neededThreads = 0;
		} else {
		blocks = MAX_X_DIM;
		}
		else
		{
		blocksCount = MAX_X_DIM;
		neededThreads -= MAX_X_DIM * threads;
		}

		SpMVCSRAdaptive< Real, Index, warpSize,
		matrix.WARPS,
		matrix.SHARED_PER_WARP,
		matrix.MAX_ELEMENTS_PER_WARP_ADAPT >
		<<<blocks, threads>>>(
		inVector,
		outVector,
		matrix.getRowPointers().getData(),
		matrix.getColumnIndexes().getData(),
		matrix.getValues().getData(),
		matrix.blocks.getData(),
		matrix.blocks.getSize() - 1, // last block shouldn't be used
		grid
		);
		segmentsReductionCSRAdaptiveKernel<
		warpSize,
		WARPS,
		SHARED_PER_WARP,
		MAX_ELEMENTS_PER_WARP_ADAPT,
		OffsetsView,
		Index, Fetch, Reduction, ResultKeeper, Real, Args... >
		<<<blocksCount, threads>>>(
		blocks.getData(),
		blocks.getSize() - 1, // last block shouldn't be used
		gridIdx,
		offsets,
		first,
		last,
		fetch,
		reduction,
		keeper,
		zero,
		args... );
		}
		#endif
		}

		CSRKernelAdaptiveView& operator=( const CSRKernelAdaptiveView< Index, Device >& kernelView )
		{
		this->blocks.bind( kernelView.blocks );
		return *this;
		}

		void printBlocks() const
		{
		for( Index i = 0; i < this->blocks.getSize(); i++ )
		{
		auto block = blocks.getElement( i );
		std::cout << "Block " << i << " : " << block << std::endl;
		}

		}

		protected:
		BlocksView blocks;
		};

		template< typename Index,
		@@ -212,6 +378,9 @@ struct CSRKernelAdaptive
		using DeviceType = Device;
		using ViewType = CSRKernelAdaptiveView< Index, Device >;
		using ConstViewType = CSRKernelAdaptiveView< Index, Device >;
		using BlocksType = typename ViewType::BlocksType;
		using BlocksView = typename BlocksType::ViewType;


		static constexpr Index THREADS_ADAPTIVE = sizeof(Index) == 8 ? 128 : 256;

		@@ -227,23 +396,35 @@ struct CSRKernelAdaptive
		/* Number of elements in shared memory per one warp */
		static constexpr Index SHARED_PER_WARP = SHARED / WARPS;

		/* Max length of row to process one warp for CSR Light, MultiVector */
		static constexpr Index MAX_ELEMENTS_PER_WARP = 384;

		/* Max length of row to process one warp for CSR Adaptive */
		static constexpr Index MAX_ELEMENTS_PER_WARP_ADAPT = 512;

		template< typename Offsets >
		Index findLimit(const Index start,
		const Offsets& offsets,
		const Index size,
		Type &type,
		Index &sum) {
		Index &sum)
		{
		sum = 0;
		for (Index current = start; current < size - 1; ++current) {
		for (Index current = start; current < size - 1; ++current)
		{
		Index elements = offsets.getElement(current + 1) -
		offsets.getElement(current);
		sum += elements;
		if (sum > matrix.SHARED_PER_WARP) {
		if (current - start > 0) { // extra row
		if (sum >SHARED_PER_WARP)
		{
		if (current - start > 0)
		{ // extra row
		type = Type::STREAM;
		return current;
		} else { // one long row
		if (sum <= 2 * matrix.MAX_ELEMENTS_PER_WARP_ADAPT)
		}
		else
		{ // one long row
		if (sum <= 2 * MAX_ELEMENTS_PER_WARP_ADAPT)
		type = Type::VECTOR;
		else
		type = Type::LONG;
		@@ -251,7 +432,6 @@ struct CSRKernelAdaptive
		}
		}
		}

		type = Type::STREAM;
		return size - 1; // return last row pointer
		}
		@@ -269,8 +449,7 @@ struct CSRKernelAdaptive
		while (nextStart != rows - 1)
		{
		Type type;
		nextStart = findLimit<Real, Index, Device, KernelType>(
		start, *this, rows, type, sum );
		nextStart = findLimit( start, offsets, rows, type, sum );

		if (type == Type::LONG)
		{
		@@ -284,8 +463,8 @@ struct CSRKernelAdaptive
		{
		inBlock.emplace_back(start, type,
		nextStart,
		this->rowPointers.getElement(nextStart),
		this->rowPointers.getElement(start) );
		offsets.getElement(nextStart),
		offsets.getElement(start) );
		}
		start = nextStart;
		}
		@@ -295,11 +474,19 @@ struct CSRKernelAdaptive
		this->blocks.setSize(inBlock.size());
		for (size_t i = 0; i < inBlock.size(); ++i)
		this->blocks.setElement(i, inBlock[i]);

		this->view.setBlocks( blocks );
		};

		ViewType getView() { return view; };
		void reset()
		{
		this->blocks.reset();
		this->view.setBlocks( blocks );
		}

		ViewType getView() { return this->view; };

		ConstViewType getConstView() const { return ConstViewType(); };
		ConstViewType getConstView() const { return this->view; };

		template< typename OffsetsView,
		typename Fetch,
		@@ -319,6 +506,9 @@ struct CSRKernelAdaptive
		view.segmentsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... );
		}

		protected:
		BlocksType blocks;

		ViewType view;
		};

src/UnitTests/Matrices/SparseMatrixTest_CSRAdaptive.cpp

+2 −2

Original line number	Diff line number	Diff line
		/***************************************************************************
		SparseMatrixTest_CSRHybrid.cpp - description
		SparseMatrixTest_CSRAdaptive.cpp - description
		-------------------
		begin : Jan 23, 2021
		copyright : (C) 2021 by Tomas Oberhuber et al.
		@@ -8,4 +8,4 @@

		/* See Copyright Notice in tnl/Copyright */

		#include "SparseMatrixTest_CSRHybrid.h"
		#include "SparseMatrixTest_CSRAdaptive.h"

src/UnitTests/Matrices/SparseMatrixTest_CSRAdaptive.cu

+2 −2

Original line number	Diff line number	Diff line
		/***************************************************************************
		SparseMatrixTest_CSRHybrid.cu - description
		SparseMatrixTest_CSRAdaptive.cu - description
		-------------------
		begin : Jan 23, 2021
		copyright : (C) 2021 by Tomas Oberhuber et al.
		@@ -8,4 +8,4 @@

		/* See Copyright Notice in tnl/Copyright */

		#include "SparseMatrixTest_CSRHybrid.h"
		#include "SparseMatrixTest_CSRAdaptive.h"

src/UnitTests/Matrices/SparseMatrixTest_CSRAdaptive.h

+18 −18

Original line number	Diff line number	Diff line
		/***************************************************************************
		SparseMatrixTest_CSRHybrid.h - description
		SparseMatrixTest_CSRAdaptive.h - description
		-------------------
		begin : Jan 23, 2021
		copyright : (C) 2021 by Tomas Oberhuber et al.
		@@ -15,28 +15,28 @@
		#ifdef HAVE_GTEST
		#include <gtest/gtest.h>

		const char* saveAndLoadFileName = "test_SparseMatrixTest_CSRHybrid_segments";
		const char* saveAndLoadFileName = "test_SparseMatrixTest_CSRAdaptive_segments";

		// types for which MatrixTest is instantiated
		using MatrixTypes = ::testing::Types
		<
		TNL::Matrices::SparseMatrix< int, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
		TNL::Matrices::SparseMatrix< long, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
		TNL::Matrices::SparseMatrix< float, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
		TNL::Matrices::SparseMatrix< double, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
		TNL::Matrices::SparseMatrix< int, TNL::Devices::Host, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
		TNL::Matrices::SparseMatrix< long, TNL::Devices::Host, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
		TNL::Matrices::SparseMatrix< float, TNL::Devices::Host, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
		TNL::Matrices::SparseMatrix< double, TNL::Devices::Host, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >
		TNL::Matrices::SparseMatrix< int, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >,
		TNL::Matrices::SparseMatrix< long, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >,
		TNL::Matrices::SparseMatrix< float, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >,
		TNL::Matrices::SparseMatrix< double, TNL::Devices::Host, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >,
		TNL::Matrices::SparseMatrix< int, TNL::Devices::Host, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >,
		TNL::Matrices::SparseMatrix< long, TNL::Devices::Host, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >,
		TNL::Matrices::SparseMatrix< float, TNL::Devices::Host, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >,
		TNL::Matrices::SparseMatrix< double, TNL::Devices::Host, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >
		#ifdef HAVE_CUDA
		,TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
		TNL::Matrices::SparseMatrix< long, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
		TNL::Matrices::SparseMatrix< float, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
		TNL::Matrices::SparseMatrix< double, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
		TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
		TNL::Matrices::SparseMatrix< long, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
		TNL::Matrices::SparseMatrix< float, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >,
		TNL::Matrices::SparseMatrix< double, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRHybrid >
		,TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >,
		//TNL::Matrices::SparseMatrix< long, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >,
		//TNL::Matrices::SparseMatrix< float, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >,
		//TNL::Matrices::SparseMatrix< double, TNL::Devices::Cuda, int, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >,
		//TNL::Matrices::SparseMatrix< int, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >,
		//TNL::Matrices::SparseMatrix< long, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >,
		//TNL::Matrices::SparseMatrix< float, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >,
		TNL::Matrices::SparseMatrix< double, TNL::Devices::Cuda, long, TNL::Matrices::GeneralMatrix, TNL::Algorithms::Segments::CSRAdaptive >
		#endif
		>;