Fixed block sizes for CSR Light, other improvements (4196f915) · Commits · TNL / tnl-dev

src/TNL/Matrices/Legacy/CSR.h

+6 −6

Original line number	Diff line number	Diff line
		@@ -110,16 +110,16 @@ public:
		/* Configuration of CSR SpMV kernels ----------------------------------------- */

		/* Block sizes */
		static constexpr Index THREADS_ADAPTIVE = 1024;
		static constexpr Index THREADS_SCALAR = 1024;
		static constexpr Index THREADS_VECTOR = 1024;
		static constexpr Index THREADS_LIGHT = 1024;
		static constexpr Index THREADS_ADAPTIVE = sizeof(Index) == 8 ? 128 : 256;
		static constexpr Index THREADS_SCALAR = 128;
		static constexpr Index THREADS_VECTOR = 128;
		static constexpr Index THREADS_LIGHT = 128;

		/* Maximum length of a row that is processed by a single warp */
		static constexpr Index MAX_ELEMENTS_PER_WARP = 1024;
		static constexpr Index MAX_ELEMENTS_PER_WARP = 512;

		/* Amount of shared memory (in bytes) used per block in the CSR Adaptive kernel */
		static constexpr Index SHARED_PER_BLOCK = 49152;
		static constexpr Index SHARED_PER_BLOCK = 24576;

		/* Number of elements in shared memory */
		static constexpr Index SHARED = SHARED_PER_BLOCK/sizeof(Real);

+4 −4

Original line number	Diff line number	Diff line
		@@ -134,7 +134,7 @@ Index findLimit(const Index start,
		type = Type::STREAM;
		return current;
		} else { // one long row
		if (sum <= matrix.MAX_ELEMENTS_PER_WARP)
		if (sum <= 2 * matrix.MAX_ELEMENTS_PER_WARP)
		type = Type::VECTOR;
		else
		type = Type::LONG;
		@@ -1407,7 +1407,7 @@ template< typename Real,
		void SpMVCSRLightPrepare( const Real *inVector,
		Real* outVector,
		const CSR< Real, Device, Index, KernelType >& matrix) {
		const Index threads = matrix.THREADS_LIGHT; // max block size
		const Index threads = 1024; // max block size
		const Index rows = matrix.getRowPointers().getSize() - 1;
		/* Copy rowCnt to GPU */
		unsigned rowCnt = 0;
		@@ -1554,7 +1554,7 @@ void SpMVCSRLightWithoutAtomicPrepare( const Real *inVector,
		groupSize = 8;
		else if (nnz <= 16)
		groupSize = 16;
		else if (nnz <= matrix.MAX_ELEMENTS_PER_WARP)
		else if (nnz <= 2 * matrix.MAX_ELEMENTS_PER_WARP)
		groupSize = 32; // CSR Vector
		else
		groupSize = roundUpDivision(nnz, matrix.MAX_ELEMENTS_PER_WARP) * 32; // CSR MultiVector