Optimizations for all kernels (c8ee6a28) · Commits · TNL / tnl-dev

src/TNL/Matrices/Legacy/CSR_impl.h

+65 −107

Original line number	Diff line number	Diff line
		@@ -40,17 +40,6 @@ union Block {
		uint8_t byte[8]; // byte[7] is type specificator
		};

		// template<typename Index>
		// struct Block_old {
		// Block(Index row, Index index = 0) noexcept {
		// this->index = index;
		// this->row = row;
		// }

		// Index index;
		// Index row;
		// };

		/* Configuration */
		constexpr size_t MAX_X_DIM = 2147483647;
		constexpr int ELEMENTS_PER_WARP = 1024;
		@@ -860,11 +849,11 @@ void SpMVCSRAdaptive( const Real *inVector,
		result += values[i] * inVector[column];
		}
		/* Parallel reduction */
		result += __shfl_down_sync((unsigned)(warpSize - 1), result, 16);
		result += __shfl_down_sync((unsigned)(warpSize - 1), result, 8);
		result += __shfl_down_sync((unsigned)(warpSize - 1), result, 4);
		result += __shfl_down_sync((unsigned)(warpSize - 1), result, 2);
		result += __shfl_down_sync((unsigned)(warpSize - 1), result, 1);
		result += __shfl_down_sync(0xFFFFFFFF, result, 16);
		result += __shfl_down_sync(0xFFFFFFFF, result, 8);
		result += __shfl_down_sync(0xFFFFFFFF, result, 4);
		result += __shfl_down_sync(0xFFFFFFFF, result, 2);
		result += __shfl_down_sync(0xFFFFFFFF, result, 1);
		if (laneID == 0) outVector[block.index[0]/* minRow */] = result; // Write result
		} else {
		/////////////////////////////////////* CSR VECTOR L */////////////
		@@ -883,18 +872,17 @@ void SpMVCSRAdaptive( const Real *inVector,
		}

		/* Parallel reduction */
		result += __shfl_down_sync((unsigned)(warpSize - 1), result, 16);
		result += __shfl_down_sync((unsigned)(warpSize - 1), result, 8);
		result += __shfl_down_sync((unsigned)(warpSize - 1), result, 4);
		result += __shfl_down_sync((unsigned)(warpSize - 1), result, 2);
		result += __shfl_down_sync((unsigned)(warpSize - 1), result, 1);
		result += __shfl_down_sync(0xFFFFFFFF, result, 16);
		result += __shfl_down_sync(0xFFFFFFFF, result, 8);
		result += __shfl_down_sync(0xFFFFFFFF, result, 4);
		result += __shfl_down_sync(0xFFFFFFFF, result, 2);
		result += __shfl_down_sync(0xFFFFFFFF, result, 1);
		if (laneID == 0) atomicAdd(&outVector[block.index[0]/* minRow */], result);
		}
		}

		template< typename Real,
		typename Index,
		int warpSize >
		typename Index>
		__global__
		void SpMVCSRScalar( const Real *inVector,
		Real* outVector,
		@@ -908,11 +896,12 @@ void SpMVCSRScalar( const Real *inVector,
		if (index >= rows)
		return;

		Index column;
		Real result = 0.0;
		const Index endID = rowPointers[index + 1];

		for (Index i = rowPointers[index]; i < endID; ++i) {
		const Index column = columnIndexes[i];
		column = columnIndexes[i];
		if (column >= getColumns)
		break;

		@@ -933,7 +922,6 @@ void SpMVCSRMultiVector( const Real *inVector,
		const Real* values,
		const Index rows,
		const Index getColumns,
		const Index perWarp,
		const Index offset,
		const Index gridID)
		{
		@@ -945,11 +933,12 @@ void SpMVCSRMultiVector( const Real *inVector,
		const Index inRowID = index % offset;

		Real result = 0.0;
		Index column;
		Index endID = rowPointers[rowID + 1];

		/* Calculate result */
		for (Index i = rowPointers[rowID] + inRowID; i < endID; i += offset) {
		Index column = columnIndexes[i];
		column = columnIndexes[i];
		if (column >= getColumns)
		break;

		@@ -957,11 +946,11 @@ void SpMVCSRMultiVector( const Real *inVector,
		}

		/* Reduction */
		result += __shfl_down_sync((unsigned)(warpSize - 1), result, 16);
		result += __shfl_down_sync((unsigned)(warpSize - 1), result, 8);
		result += __shfl_down_sync((unsigned)(warpSize - 1), result, 4);
		result += __shfl_down_sync((unsigned)(warpSize - 1), result, 2);
		result += __shfl_down_sync((unsigned)(warpSize - 1), result, 1);
		result += __shfl_down_sync(0xFFFFFFFF, result, 16);
		result += __shfl_down_sync(0xFFFFFFFF, result, 8);
		result += __shfl_down_sync(0xFFFFFFFF, result, 4);
		result += __shfl_down_sync(0xFFFFFFFF, result, 2);
		result += __shfl_down_sync(0xFFFFFFFF, result, 1);
		/* Write result */
		if (index % warpSize == 0) atomicAdd(&outVector[rowID], result);
		}
		@@ -984,13 +973,14 @@ void SpMVCSRVector( const Real *inVector,
		if (warpID >= rows)
		return;

		const Index laneID = index % warpSize;
		Real result = 0.0;
		Index column;
		const Index laneID = index % warpSize;
		Index endID = rowPointers[warpID + 1];

		/* Calculate result */
		for (Index i = rowPointers[warpID] + laneID; i < endID; i += warpSize) {
		Index column = columnIndexes[i];
		column = columnIndexes[i];
		if (column >= getColumns)
		break;

		@@ -998,18 +988,17 @@ void SpMVCSRVector( const Real *inVector,
		}

		/* Reduction */
		result += __shfl_down_sync((unsigned)(warpSize - 1), result, 16);
		result += __shfl_down_sync((unsigned)(warpSize - 1), result, 8);
		result += __shfl_down_sync((unsigned)(warpSize - 1), result, 4);
		result += __shfl_down_sync((unsigned)(warpSize - 1), result, 2);
		result += __shfl_down_sync((unsigned)(warpSize - 1), result, 1);
		result += __shfl_down_sync(0xFFFFFFFF, result, 16);
		result += __shfl_down_sync(0xFFFFFFFF, result, 8);
		result += __shfl_down_sync(0xFFFFFFFF, result, 4);
		result += __shfl_down_sync(0xFFFFFFFF, result, 2);
		result += __shfl_down_sync(0xFFFFFFFF, result, 1);
		/* Write result */
		if (laneID == 0) outVector[warpID] = result;
		}

		template< typename Real,
		typename Index,
		int warpSize >
		typename Index >
		__global__
		void SpMVCSRLight( const Real *inVector,
		Real* outVector,
		@@ -1019,13 +1008,11 @@ void SpMVCSRLight( const Real *inVector,
		const Index rows,
		const Index getColumns,
		const Index groupSize,
		const Index gridID,
		unsigned *rowCnt) {
		const Index index = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x;
		const Index laneID = index % warpSize;
		const Index groupID = laneID / groupSize;
		const Index inGroupID = laneID % groupSize;
		Index row, minID, maxID, i;
		const Index groupID = threadIdx.x / groupSize;
		const Index inGroupID = threadIdx.x % groupSize;
		Index row, maxID, i;
		Real result;

		while (true) {

		@@ -1033,15 +1020,14 @@ void SpMVCSRLight( const Real *inVector,
		if (inGroupID == 0) row = atomicAdd(rowCnt, 1);

		/* Propagate row number in group */
		row = __shfl_sync((unsigned)(warpSize - 1), row, groupID * groupSize);
		row = __shfl_sync(0xFFFFFFFF, row, groupID * groupSize);
		if (row >= rows)
		return;

		minID = rowPointers[row];
		maxID = rowPointers[row + 1];

		Real result = 0.0;
		for (i = minID + inGroupID; i < maxID; i += groupSize) {
		result = 0.0;
		for (i = rowPointers[row] + inGroupID; i < maxID; i += groupSize) {
		const Index column = columnIndexes[i];
		if (column >= getColumns)
		break;
		@@ -1050,8 +1036,8 @@ void SpMVCSRLight( const Real *inVector,
		}

		/* Parallel reduction */
		for (i = groupSize / 2; i > 0; i /= 2)
		result += __shfl_down_sync((unsigned)(warpSize - 1), result, i);
		for (i = groupSize >> 1; i > 0; i >>= 1)
		result += __shfl_down_sync(0xFFFFFFFF, result, i);
		/* Write result */
		if (inGroupID == 0)
		outVector[row] = result;
		@@ -1059,8 +1045,7 @@ void SpMVCSRLight( const Real *inVector,
		}

		template< typename Real,
		typename Index,
		int warpSize >
		typename Index>
		__global__
		void SpMVCSRLightWithoutAtomic( const Real *inVector,
		Real* outVector,
		@@ -1073,18 +1058,17 @@ void SpMVCSRLightWithoutAtomic( const Real *inVector,
		const Index gridID) {
		const Index index = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x;
		const Index row = index / groupSize;
		Index i;
		Index i, column;

		if (row >= rows)
		return;

		const Index inGroupID = index % groupSize;
		const Index minID = rowPointers[row];
		const Index maxID = rowPointers[row + 1];

		Real result = 0.0;
		for (i = minID + inGroupID; i < maxID; i += groupSize) {
		const Index column = columnIndexes[i];
		for (i = rowPointers[row] + inGroupID; i < maxID; i += groupSize) {
		column = columnIndexes[i];
		if (column >= getColumns)
		break;

		@@ -1092,8 +1076,8 @@ void SpMVCSRLightWithoutAtomic( const Real *inVector,
		}

		/* Parallel reduction */
		for (i = groupSize / 2; i > 0; i /= 2)
		result += __shfl_down_sync((unsigned)(warpSize - 1), result, i);
		for (i = groupSize >> 1; i > 0; i >>= 1)
		result += __shfl_down_sync(0xFFFFFFFF, result, i);

		/* Write result */
		if (inGroupID == 0) outVector[row] = result;
		@@ -1122,7 +1106,7 @@ void SpMVCSRScalarPrepare( const Real *inVector,
		neededThreads -= MAX_X_DIM * threads;
		}

		SpMVCSRScalar<Real, Index, warpSize><<<blocks, threads>>>(
		SpMVCSRScalar<Real, Index><<<blocks, threads>>>(
		inVector,
		outVector,
		rowPointers,
		@@ -1182,51 +1166,29 @@ void SpMVCSRLightPrepare( const Real *inVector,
		const Index valuesSize,
		const Index rows,
		const Index getColumns) {
		const Index threads = 1024; // block size
		const Index threads = 1024; // max block size
		Index blocks, groupSize;
		/* Copy rowCnt to GPU */
		unsigned rowCnt = 0;
		unsigned *kernelRowCnt;
		unsigned *kernelRowCnt = nullptr;
		cudaMalloc((void *)&kernelRowCnt, sizeof(kernelRowCnt));
		cudaMemcpy(kernelRowCnt, &rowCnt, sizeof(*kernelRowCnt), cudaMemcpyHostToDevice);

		cudaDeviceProp properties;
		cudaGetDeviceProperties( &properties, Cuda::DeviceInfo::getActiveDevice() );
		blocks = properties.multiProcessorCount * properties.maxThreadsPerMultiProcessor / threads;

		const Index nnz = roundUpDivision(valuesSize, rows); // non zeroes per row
		if (nnz <= 2)
		groupSize = 2;
		else if (nnz <= 4)
		groupSize = 4;
		else if (nnz <= 8)
		else if (nnz <= 64)
		groupSize = 8;
		else if (nnz <= 16)
		groupSize = 16;
		else
		groupSize = 32;

		size_t neededThreads = groupSize * rows;
		/* Execute kernels on device */
		for (Index grid = 0; neededThreads != 0; ++grid) {
		if (MAX_X_DIM * threads >= neededThreads) {
		blocks = roundUpDivision(neededThreads, threads);
		neededThreads = 0;
		} else {
		blocks = MAX_X_DIM;
		neededThreads -= MAX_X_DIM * threads;
		}

		if (groupSize == 32) { // CSR SpMV Light with groupsize = 32 is CSR Vector
		SpMVCSRVector<Real, Index, warpSize><<<blocks, threads>>>(
		inVector,
		outVector,
		rowPointers,
		columnIndexes,
		values,
		rows,
		getColumns,
		grid
		);
		} else {
		SpMVCSRLight<Real, Index, warpSize><<<blocks, threads>>>(
		SpMVCSRLight<Real, Index><<<blocks, threads>>>(
		inVector,
		outVector,
		rowPointers,
		@@ -1235,11 +1197,8 @@ void SpMVCSRLightPrepare( const Real *inVector,
		rows,
		getColumns,
		groupSize,
		grid,
		kernelRowCnt
		);
		}
		}

		cudaFree(kernelRowCnt);
		}
		@@ -1294,7 +1253,7 @@ void SpMVCSRLightWithoutAtomicPrepare( const Real *inVector,
		grid
		);
		} else {
		SpMVCSRLightWithoutAtomic<Real, Index, warpSize><<<blocks, threads>>>(
		SpMVCSRLightWithoutAtomic<Real, Index><<<blocks, threads>>>(
		inVector,
		outVector,
		rowPointers,
		@@ -1359,7 +1318,6 @@ void SpMVCSRMultiVectorPrepare( const Real *inVector,
		values,
		rows,
		getColumns,
		ELEMENTS_PER_WARP,
		offset,
		grid
		);
		@@ -1427,7 +1385,7 @@ void SpMVCSRAdaptivePrepare( const Real *inVector,

		/* Fill blocks */
		std::vector<Block> inBlock;
		inBlock.reserve(rows); // resere space to avoid reallocation
		inBlock.reserve(rows); // reserve space to avoid reallocation

		while (nextStart != rows - 1) {
		Type type;