Loading src/TNL/Matrices/Legacy/CSR.h +9 −9 Original line number Diff line number Diff line Loading @@ -107,13 +107,10 @@ public: Containers::Vector< Block<Index>, Device, Index > blocks; /* Configuration of SpMV kernels ------------------------------------------- */ /* Configuration of CSR SpMV kernels ----------------------------------------- */ /* Block sizes */ // Execute 1024 threads per block for float, (12 elements per thread) for 48KB cache // 512 threads per block for double (12 elements per thread) static constexpr Index THREADS_ADAPTIVE = sizeof(Real) == 4 ? 1024 : 512; static constexpr Index THREADS_ADAPTIVE = 1024; static constexpr Index THREADS_SCALAR = 1024; static constexpr Index THREADS_VECTOR = 1024; static constexpr Index THREADS_LIGHT = 1024; Loading @@ -127,8 +124,11 @@ public: /* Number of elements in shared memory */ static constexpr Index SHARED = SHARED_PER_BLOCK/sizeof(Real); /* Number of warps in block for CSR Adaptive */ static constexpr Index WARPS = THREADS_ADAPTIVE / 32; /* Number of elements in shared memory per one warp */ static constexpr Index SHARED_PER_WARP = SHARED / (THREADS_ADAPTIVE / 32); static constexpr Index SHARED_PER_WARP = SHARED / WARPS; /* -------------------------------------------------------------------------- */ Loading src/TNL/Matrices/Legacy/CSR_impl.h +17 −17 Original line number Diff line number Diff line Loading @@ -130,7 +130,7 @@ Index findLimit(const Index start, matrix.getRowPointers().getElement(current); sum += elements; if (sum > matrix.SHARED_PER_WARP) { if (current - start > 1) { // extra row if (current - start > 0) { // extra row type = Type::STREAM; return current; } else { // one long row Loading Loading @@ -804,7 +804,7 @@ Index CSR< Real, Device, Index, KernelType >::getHybridModeSplit() const template< typename Real, typename Index, int warpSize, int SHARED, int WARPS, int SHARED_PER_WARP, int MAX_ELEM_PER_WARP > __global__ Loading @@ -816,7 +816,7 @@ void SpMVCSRAdaptive( const Real *inVector, const Block<Index> *blocks, Index blocksSize, Index gridID) { __shared__ Real shared[SHARED]; __shared__ Real shared[WARPS][SHARED_PER_WARP]; const Index index = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x; const Index blockIdx = index / warpSize; if (blockIdx >= blocksSize) Loading @@ -826,25 +826,25 @@ void SpMVCSRAdaptive( const Real *inVector, const Index laneID = threadIdx.x & 31; // & is cheaper than % Block<Index> block = blocks[blockIdx]; const Index minID = rowPointers[block.index[0]/* minRow */]; Index i, to, offset, maxID; Index i, to, maxID; if (block.byte[sizeof(Index) == 4 ? 7 : 15] & 0b1000000) { /////////////////////////////////////* CSR STREAM *////////////// const Index maxRow = block.index[0]/* minRow */ + /* maxRow - minRow */(block.twobytes[sizeof(Index) == 4 ? 3 : 5] & 0x3FFF); const Index warpID = threadIdx.x / 32; maxID = minID + /* maxID - minID */block.twobytes[sizeof(Index) == 4 ? 2 : 4]; /* offset between shared and global addresses */ offset = minID - (threadIdx.x / warpSize * SHARED_PER_WARP); /* Copy and calculate elements from global to shared memory, coalesced */ /* Stream data to shared memory */ for (i = laneID + minID; i < maxID; i += warpSize) shared[i - offset] = values[i] * inVector[columnIndexes[i]]; shared[warpID][i - minID] = values[i] * inVector[columnIndexes[i]]; const Index maxRow = block.index[0]/* minRow */ + /* maxRow - minRow */(block.twobytes[sizeof(Index) == 4 ? 3 : 5] & 0x3FFF); /* Calculate result */ for (i = block.index[0]/* minRow */ + laneID; i < maxRow; i += warpSize) { to = rowPointers[i + 1] - offset; // end of preprocessed data to = rowPointers[i + 1] - minID; // end of preprocessed data result = 0; /* Scalar reduction */ for (Index sharedID = rowPointers[i] - offset; sharedID < to; ++sharedID) result += shared[sharedID]; for (Index sharedID = rowPointers[i] - minID; sharedID < to; ++sharedID) result += shared[warpID][sharedID]; outVector[i] = result; // Write result } Loading @@ -864,10 +864,10 @@ void SpMVCSRAdaptive( const Real *inVector, if (laneID == 0) outVector[block.index[0]/* minRow */] = result; // Write result } else { /////////////////////////////////////* CSR VECTOR L *///////////// maxID = rowPointers[block.index[0]/* minRow */ + 1]; offset = block.index[1]/* warpInRow */ * MAX_ELEM_PER_WARP; /* Number of elements processed by previous warps */ const Index offset = block.index[1]/* warpInRow */ * MAX_ELEM_PER_WARP; to = minID + (block.index[1]/* warpInRow */ + 1) * MAX_ELEM_PER_WARP; maxID = rowPointers[block.index[0]/* minRow */ + 1]; if (to > maxID) to = maxID; for (i = minID + offset + laneID; i < to; i += warpSize) result += values[i] * inVector[columnIndexes[i]]; Loading Loading @@ -1754,7 +1754,7 @@ void SpMVCSRAdaptivePrepare( const Real *inVector, } SpMVCSRAdaptive< Real, Index, warpSize, matrix.SHARED, matrix.WARPS, matrix.SHARED_PER_WARP, matrix.MAX_ELEMENTS_PER_WARP > <<<blocks, threads>>>( Loading Loading
src/TNL/Matrices/Legacy/CSR.h +9 −9 Original line number Diff line number Diff line Loading @@ -107,13 +107,10 @@ public: Containers::Vector< Block<Index>, Device, Index > blocks; /* Configuration of SpMV kernels ------------------------------------------- */ /* Configuration of CSR SpMV kernels ----------------------------------------- */ /* Block sizes */ // Execute 1024 threads per block for float, (12 elements per thread) for 48KB cache // 512 threads per block for double (12 elements per thread) static constexpr Index THREADS_ADAPTIVE = sizeof(Real) == 4 ? 1024 : 512; static constexpr Index THREADS_ADAPTIVE = 1024; static constexpr Index THREADS_SCALAR = 1024; static constexpr Index THREADS_VECTOR = 1024; static constexpr Index THREADS_LIGHT = 1024; Loading @@ -127,8 +124,11 @@ public: /* Number of elements in shared memory */ static constexpr Index SHARED = SHARED_PER_BLOCK/sizeof(Real); /* Number of warps in block for CSR Adaptive */ static constexpr Index WARPS = THREADS_ADAPTIVE / 32; /* Number of elements in shared memory per one warp */ static constexpr Index SHARED_PER_WARP = SHARED / (THREADS_ADAPTIVE / 32); static constexpr Index SHARED_PER_WARP = SHARED / WARPS; /* -------------------------------------------------------------------------- */ Loading
src/TNL/Matrices/Legacy/CSR_impl.h +17 −17 Original line number Diff line number Diff line Loading @@ -130,7 +130,7 @@ Index findLimit(const Index start, matrix.getRowPointers().getElement(current); sum += elements; if (sum > matrix.SHARED_PER_WARP) { if (current - start > 1) { // extra row if (current - start > 0) { // extra row type = Type::STREAM; return current; } else { // one long row Loading Loading @@ -804,7 +804,7 @@ Index CSR< Real, Device, Index, KernelType >::getHybridModeSplit() const template< typename Real, typename Index, int warpSize, int SHARED, int WARPS, int SHARED_PER_WARP, int MAX_ELEM_PER_WARP > __global__ Loading @@ -816,7 +816,7 @@ void SpMVCSRAdaptive( const Real *inVector, const Block<Index> *blocks, Index blocksSize, Index gridID) { __shared__ Real shared[SHARED]; __shared__ Real shared[WARPS][SHARED_PER_WARP]; const Index index = (gridID * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x; const Index blockIdx = index / warpSize; if (blockIdx >= blocksSize) Loading @@ -826,25 +826,25 @@ void SpMVCSRAdaptive( const Real *inVector, const Index laneID = threadIdx.x & 31; // & is cheaper than % Block<Index> block = blocks[blockIdx]; const Index minID = rowPointers[block.index[0]/* minRow */]; Index i, to, offset, maxID; Index i, to, maxID; if (block.byte[sizeof(Index) == 4 ? 7 : 15] & 0b1000000) { /////////////////////////////////////* CSR STREAM *////////////// const Index maxRow = block.index[0]/* minRow */ + /* maxRow - minRow */(block.twobytes[sizeof(Index) == 4 ? 3 : 5] & 0x3FFF); const Index warpID = threadIdx.x / 32; maxID = minID + /* maxID - minID */block.twobytes[sizeof(Index) == 4 ? 2 : 4]; /* offset between shared and global addresses */ offset = minID - (threadIdx.x / warpSize * SHARED_PER_WARP); /* Copy and calculate elements from global to shared memory, coalesced */ /* Stream data to shared memory */ for (i = laneID + minID; i < maxID; i += warpSize) shared[i - offset] = values[i] * inVector[columnIndexes[i]]; shared[warpID][i - minID] = values[i] * inVector[columnIndexes[i]]; const Index maxRow = block.index[0]/* minRow */ + /* maxRow - minRow */(block.twobytes[sizeof(Index) == 4 ? 3 : 5] & 0x3FFF); /* Calculate result */ for (i = block.index[0]/* minRow */ + laneID; i < maxRow; i += warpSize) { to = rowPointers[i + 1] - offset; // end of preprocessed data to = rowPointers[i + 1] - minID; // end of preprocessed data result = 0; /* Scalar reduction */ for (Index sharedID = rowPointers[i] - offset; sharedID < to; ++sharedID) result += shared[sharedID]; for (Index sharedID = rowPointers[i] - minID; sharedID < to; ++sharedID) result += shared[warpID][sharedID]; outVector[i] = result; // Write result } Loading @@ -864,10 +864,10 @@ void SpMVCSRAdaptive( const Real *inVector, if (laneID == 0) outVector[block.index[0]/* minRow */] = result; // Write result } else { /////////////////////////////////////* CSR VECTOR L *///////////// maxID = rowPointers[block.index[0]/* minRow */ + 1]; offset = block.index[1]/* warpInRow */ * MAX_ELEM_PER_WARP; /* Number of elements processed by previous warps */ const Index offset = block.index[1]/* warpInRow */ * MAX_ELEM_PER_WARP; to = minID + (block.index[1]/* warpInRow */ + 1) * MAX_ELEM_PER_WARP; maxID = rowPointers[block.index[0]/* minRow */ + 1]; if (to > maxID) to = maxID; for (i = minID + offset + laneID; i < to; i += warpSize) result += values[i] * inVector[columnIndexes[i]]; Loading Loading @@ -1754,7 +1754,7 @@ void SpMVCSRAdaptivePrepare( const Real *inVector, } SpMVCSRAdaptive< Real, Index, warpSize, matrix.SHARED, matrix.WARPS, matrix.SHARED_PER_WARP, matrix.MAX_ELEMENTS_PER_WARP > <<<blocks, threads>>>( Loading