Loading src/TNL/Matrices/Legacy/CSR.h +6 −6 Original line number Diff line number Diff line Loading @@ -110,16 +110,16 @@ public: /* Configuration of CSR SpMV kernels ----------------------------------------- */ /* Block sizes */ static constexpr Index THREADS_ADAPTIVE = 1024; static constexpr Index THREADS_SCALAR = 1024; static constexpr Index THREADS_VECTOR = 1024; static constexpr Index THREADS_LIGHT = 1024; static constexpr Index THREADS_ADAPTIVE = sizeof(Index) == 8 ? 128 : 256; static constexpr Index THREADS_SCALAR = 128; static constexpr Index THREADS_VECTOR = 128; static constexpr Index THREADS_LIGHT = 128; /* Max length of row to process one warp */ static constexpr Index MAX_ELEMENTS_PER_WARP = 1024; static constexpr Index MAX_ELEMENTS_PER_WARP = 512; /* How many shared memory use per block in CSR Adaptive kernel */ static constexpr Index SHARED_PER_BLOCK = 49152; static constexpr Index SHARED_PER_BLOCK = 24576; /* Number of elements in shared memory */ static constexpr Index SHARED = SHARED_PER_BLOCK/sizeof(Real); Loading src/TNL/Matrices/Legacy/CSR_impl.h +4 −4 Original line number Diff line number Diff line Loading @@ -134,7 +134,7 @@ Index findLimit(const Index start, type = Type::STREAM; return current; } else { // one long row if (sum <= matrix.MAX_ELEMENTS_PER_WARP) if (sum <= 2 * matrix.MAX_ELEMENTS_PER_WARP) type = Type::VECTOR; else type = Type::LONG; Loading Loading @@ -1407,7 +1407,7 @@ template< typename Real, void SpMVCSRLightPrepare( const Real *inVector, Real* outVector, const CSR< Real, Device, Index, KernelType >& matrix) { const Index threads = matrix.THREADS_LIGHT; // max block size const Index threads = 1024; // max block size const Index rows = matrix.getRowPointers().getSize() - 1; /* Copy rowCnt to GPU */ unsigned rowCnt = 0; Loading Loading @@ -1554,7 +1554,7 @@ void SpMVCSRLightWithoutAtomicPrepare( const Real *inVector, groupSize = 8; else if (nnz <= 16) groupSize = 16; else if (nnz <= matrix.MAX_ELEMENTS_PER_WARP) else if (nnz <= 2 * matrix.MAX_ELEMENTS_PER_WARP) groupSize = 32; // CSR Vector else groupSize = roundUpDivision(nnz, matrix.MAX_ELEMENTS_PER_WARP) * 32; // CSR MultiVector Loading Loading
src/TNL/Matrices/Legacy/CSR.h +6 −6 Original line number Diff line number Diff line Loading @@ -110,16 +110,16 @@ public: /* Configuration of CSR SpMV kernels ----------------------------------------- */ /* Block sizes */ static constexpr Index THREADS_ADAPTIVE = 1024; static constexpr Index THREADS_SCALAR = 1024; static constexpr Index THREADS_VECTOR = 1024; static constexpr Index THREADS_LIGHT = 1024; static constexpr Index THREADS_ADAPTIVE = sizeof(Index) == 8 ? 128 : 256; static constexpr Index THREADS_SCALAR = 128; static constexpr Index THREADS_VECTOR = 128; static constexpr Index THREADS_LIGHT = 128; /* Max length of row to process one warp */ static constexpr Index MAX_ELEMENTS_PER_WARP = 1024; static constexpr Index MAX_ELEMENTS_PER_WARP = 512; /* How many shared memory use per block in CSR Adaptive kernel */ static constexpr Index SHARED_PER_BLOCK = 49152; static constexpr Index SHARED_PER_BLOCK = 24576; /* Number of elements in shared memory */ static constexpr Index SHARED = SHARED_PER_BLOCK/sizeof(Real); Loading
src/TNL/Matrices/Legacy/CSR_impl.h +4 −4 Original line number Diff line number Diff line Loading @@ -134,7 +134,7 @@ Index findLimit(const Index start, type = Type::STREAM; return current; } else { // one long row if (sum <= matrix.MAX_ELEMENTS_PER_WARP) if (sum <= 2 * matrix.MAX_ELEMENTS_PER_WARP) type = Type::VECTOR; else type = Type::LONG; Loading Loading @@ -1407,7 +1407,7 @@ template< typename Real, void SpMVCSRLightPrepare( const Real *inVector, Real* outVector, const CSR< Real, Device, Index, KernelType >& matrix) { const Index threads = matrix.THREADS_LIGHT; // max block size const Index threads = 1024; // max block size const Index rows = matrix.getRowPointers().getSize() - 1; /* Copy rowCnt to GPU */ unsigned rowCnt = 0; Loading Loading @@ -1554,7 +1554,7 @@ void SpMVCSRLightWithoutAtomicPrepare( const Real *inVector, groupSize = 8; else if (nnz <= 16) groupSize = 16; else if (nnz <= matrix.MAX_ELEMENTS_PER_WARP) else if (nnz <= 2 * matrix.MAX_ELEMENTS_PER_WARP) groupSize = 32; // CSR Vector else groupSize = roundUpDivision(nnz, matrix.MAX_ELEMENTS_PER_WARP) * 32; // CSR MultiVector Loading