Loading src/TNL/Matrices/Legacy/CSR.h +2 −1 Original line number Diff line number Diff line Loading @@ -29,7 +29,7 @@ enum class Type { }; union Block { void set(uint32_t row, Type type = Type::VECTOR, uint32_t index = 0) noexcept { Block(uint32_t row, Type type = Type::VECTOR, uint32_t index = 0) noexcept { this->index[0] = row; this->index[1] = index; this->byte[7] = (uint8_t)type; Loading Loading @@ -87,6 +87,7 @@ public: Containers::Vector< Block, Device, Index > blocks; Index maxElementsPerWarp = 1024; using Sparse< Real, Device, Index >::getAllocatedElementsCount; Loading src/TNL/Matrices/Legacy/CSR_impl.h +82 −145 Original line number Diff line number Diff line Loading @@ -16,7 +16,7 @@ #include <TNL/Algorithms/AtomicOperations.h> #include <TNL/Exceptions/NotImplementedError.h> #include <TNL/Atomic.h> #include <vector> #include <vector> // for blocks in CSR Adaptive #ifdef HAVE_CUSPARSE #include <cuda.h> Loading Loading @@ -113,7 +113,7 @@ void CSR< Real, Device, Index, KernelType >::setCompressedRowLengths( ConstCompr this->columnIndexes.setSize( this->rowPointers.getElement( this->rows ) ); this->columnIndexes.setValue( this->columns ); // if (KernelType == CSRAdaptive) if (KernelType == CSRAdaptive) this->setBlocks(); } Loading @@ -121,11 +121,11 @@ void CSR< Real, Device, Index, KernelType >::setCompressedRowLengths( ConstCompr template< typename Real, typename Index, typename Device, CSRKernel KernelType, int maxElemPerWarp> CSRKernel KernelType> Index findLimit(const Index start, const Index max, const CSR< Real, Device, Index, KernelType >& matrix, const Index size, const Index maxElemPerWarp, Type &type, Index &sum) { sum = 0; Loading Loading @@ -158,33 +158,34 @@ template< typename Real, void CSR< Real, Device, Index, KernelType >::setBlocks() { const Index rows = this->getRowPointers().getSize(); Block *tmpBlocks = new Block[rows]; Index nextStart = 0, start = 0, cnt = 0, sum = 0; Index sum, start = 0, nextStart = 0; /* Fill blocks */ std::vector<Block> inBlock; inBlock.reserve(rows); // reserve space to avoid reallocation while (nextStart != rows - 1) { Type type; nextStart = findLimit<Real, Index, Device, KernelType, 384>( start, this->maxElementsPerWarp, *this, rows, type, sum nextStart = findLimit( start, 384, *this, rows, this->maxElementsPerWarp, type, sum ); if (type == Type::LONG) { uint32_t parts = roundUpDivision(sum, this->maxElementsPerWarp); uint32_t parts = roundUpDivision(sum, 384); for (uint32_t index = 0; index < parts; ++index) { tmpBlocks[cnt++].set(start, Type::LONG, index); inBlock.emplace_back(start, Type::LONG, index); } } else { tmpBlocks[cnt++].set(start, type); inBlock.emplace_back(start, type); } start = nextStart; } tmpBlocks[cnt++].set(nextStart); /* Copy to TNL Vector */ this->blocks.setSize(cnt); for (Index i = 0; i < cnt; ++i) this->blocks.setElement(i, tmpBlocks[i]); inBlock.emplace_back(nextStart); delete [] tmpBlocks; /* Copy values */ this->blocks.setSize(inBlock.size()); for (size_t i = 0; i < inBlock.size(); ++i) this->blocks.setElement(i, inBlock[i]); } template< typename Real, Loading Loading @@ -1495,40 +1496,6 @@ void SpMVCSRMultiVectorPrepare( const Real *inVector, } } /* Find limit of block */ // template< typename Real, // typename Index, // typename Device, // CSRKernel KernelType, // int maxElemPerWarp> // Index findLimit(const Index start, const Index max, // const CSR< Real, Device, Index, KernelType >& matrix, // const Index size, // Type &type, // Index &sum) { // sum = 0; // for (Index current = start; current < size - 1; ++current) { // Index elements = matrix.getRowPointers().getElement(current + 1) - // matrix.getRowPointers().getElement(current); // sum += elements; // if (sum > max) { // if (current - start > 1) { // extra row // type = STREAM; // return current; // } else { // one long row // if (sum <= maxElemPerWarp) // type = VECTOR; // else // type = LONG; // return current + 1; // } // } // } // type = STREAM; // return size - 1; // return last row pointer // } template< typename Real, typename Index, typename Device, Loading @@ -1551,38 +1518,10 @@ void SpMVCSRAdaptivePrepare( const Real *inVector, constexpr Index WARPS_PER_BLOCK = THREADS_PER_BLOCK / 32; constexpr Index SHARED_PER_WARP = 49152/sizeof(Real) / WARPS_PER_BLOCK; //-------------------------------------------------------------------- // Index blocks, sum, start = 0, nextStart = 0; Index blocks; const Index threads = THREADS_PER_BLOCK; /* Fill blocks */ // std::vector<Block> inBlock; // inBlock.reserve(rows); // reserve space to avoid reallocation // while (nextStart != rows - 1) { // Type type; // nextStart = findLimit<Real, Index, Device, KernelType, maxElemPerWarp>( // start, SHARED_PER_WARP, matrix, rows, type, sum // ); // if (type == LONG) { // uint32_t parts = roundUpDivision(sum, maxElemPerWarp); // for (uint32_t index = 0; index < parts; ++index) { // inBlock.emplace_back(start, LONG, index); // } // } else { // inBlock.emplace_back(start, type); // } // start = nextStart; // } // inBlock.emplace_back(nextStart); // /* blocks to GPU */ // Block *blocksAdaptive = nullptr; // cudaMalloc((void **)&blocksAdaptive, sizeof(*blocksAdaptive) * inBlock.size()); // cudaMemcpy(blocksAdaptive, inBlock.data(), inBlock.size() * sizeof(*blocksAdaptive), cudaMemcpyHostToDevice); // size_t neededThreads = inBlock.size() * 32; // one warp per block size_t neededThreads = matrix.blocks.getSize() * 32; // one warp per block /* Execute kernels on device */ for (Index grid = 0; neededThreads != 0; ++grid) { Loading @@ -1606,8 +1545,6 @@ void SpMVCSRAdaptivePrepare( const Real *inVector, grid ); } // cudaFree(blocksAdaptive); } #endif Loading Loading @@ -1760,43 +1697,43 @@ class CSRDeviceDependentCode< Devices::Cuda > inVector.getData(), outVector.getData() ); #else // switch(KernelType) // { // case CSRScalar: // SpMVCSRScalarPrepare<Real, Index>( // inVector.getData(), // outVector.getData(), // matrix.getRowPointers().getData(), // matrix.getColumnIndexes().getData(), // matrix.getValues().getData(), // matrix.getRowPointers().getSize() - 1, // matrix.getColumns() // ); // break; // case CSRVector: // SpMVCSRVectorPrepare<Real, Index, 32>( // inVector.getData(), // outVector.getData(), // matrix.getRowPointers().getData(), // matrix.getColumnIndexes().getData(), // matrix.getValues().getData(), // matrix.getRowPointers().getSize() - 1, // matrix.getColumns() // ); // break; // case CSRLight: // SpMVCSRLightPrepare<Real, Index>( // inVector.getData(), // outVector.getData(), // matrix.getRowPointers().getData(), // matrix.getColumnIndexes().getData(), // matrix.getValues().getData(), // matrix.getValues().getSize(), // matrix.getRowPointers().getSize() - 1, // matrix.getColumns() // ); // break; // case CSRAdaptive: switch(KernelType) { case CSRScalar: SpMVCSRScalarPrepare<Real, Index>( inVector.getData(), outVector.getData(), matrix.getRowPointers().getData(), matrix.getColumnIndexes().getData(), matrix.getValues().getData(), matrix.getRowPointers().getSize() - 1, matrix.getColumns() ); break; case CSRVector: SpMVCSRVectorPrepare<Real, Index, 32>( inVector.getData(), outVector.getData(), matrix.getRowPointers().getData(), matrix.getColumnIndexes().getData(), matrix.getValues().getData(), matrix.getRowPointers().getSize() - 1, matrix.getColumns() ); break; case CSRLight: SpMVCSRLightPrepare<Real, Index>( inVector.getData(), outVector.getData(), matrix.getRowPointers().getData(), matrix.getColumnIndexes().getData(), matrix.getValues().getData(), matrix.getValues().getSize(), matrix.getRowPointers().getSize() - 1, matrix.getColumns() ); break; case CSRAdaptive: SpMVCSRAdaptivePrepare<Real, Index, Device, KernelType, 32, 1024>( inVector.getData(), outVector.getData(), Loading @@ -1808,32 +1745,32 @@ class CSRDeviceDependentCode< Devices::Cuda > matrix.getRowPointers().getSize(), // don't add -1 ! matrix.getColumns() ); // break; // case CSRMultiVector: // SpMVCSRMultiVectorPrepare<Real, Index, 32, 1024>( // inVector.getData(), // outVector.getData(), // matrix.getRowPointers().getData(), // matrix.getColumnIndexes().getData(), // matrix.getValues().getData(), // matrix.getValues().getSize(), // matrix.getRowPointers().getSize() - 1, // matrix.getColumns() // ); // break; // case CSRLightWithoutAtomic: // SpMVCSRLightWithoutAtomicPrepare<Real, Index, 32, 1024>( // inVector.getData(), // outVector.getData(), // matrix.getRowPointers().getData(), // matrix.getColumnIndexes().getData(), // matrix.getValues().getData(), // matrix.getValues().getSize(), // matrix.getRowPointers().getSize() - 1, // matrix.getColumns() // ); // break; // } break; case CSRMultiVector: SpMVCSRMultiVectorPrepare<Real, Index, 32, 1024>( inVector.getData(), outVector.getData(), matrix.getRowPointers().getData(), matrix.getColumnIndexes().getData(), matrix.getValues().getData(), matrix.getValues().getSize(), matrix.getRowPointers().getSize() - 1, matrix.getColumns() ); break; case CSRLightWithoutAtomic: SpMVCSRLightWithoutAtomicPrepare<Real, Index, 32, 1024>( inVector.getData(), outVector.getData(), matrix.getRowPointers().getData(), matrix.getColumnIndexes().getData(), matrix.getValues().getData(), matrix.getValues().getSize(), matrix.getRowPointers().getSize() - 1, matrix.getColumns() ); break; } #endif /* HAVE_CUDA */ #endif } Loading Loading
src/TNL/Matrices/Legacy/CSR.h +2 −1 Original line number Diff line number Diff line Loading @@ -29,7 +29,7 @@ enum class Type { }; union Block { void set(uint32_t row, Type type = Type::VECTOR, uint32_t index = 0) noexcept { Block(uint32_t row, Type type = Type::VECTOR, uint32_t index = 0) noexcept { this->index[0] = row; this->index[1] = index; this->byte[7] = (uint8_t)type; Loading Loading @@ -87,6 +87,7 @@ public: Containers::Vector< Block, Device, Index > blocks; Index maxElementsPerWarp = 1024; using Sparse< Real, Device, Index >::getAllocatedElementsCount; Loading
src/TNL/Matrices/Legacy/CSR_impl.h +82 −145 Original line number Diff line number Diff line Loading @@ -16,7 +16,7 @@ #include <TNL/Algorithms/AtomicOperations.h> #include <TNL/Exceptions/NotImplementedError.h> #include <TNL/Atomic.h> #include <vector> #include <vector> // for blocks in CSR Adaptive #ifdef HAVE_CUSPARSE #include <cuda.h> Loading Loading @@ -113,7 +113,7 @@ void CSR< Real, Device, Index, KernelType >::setCompressedRowLengths( ConstCompr this->columnIndexes.setSize( this->rowPointers.getElement( this->rows ) ); this->columnIndexes.setValue( this->columns ); // if (KernelType == CSRAdaptive) if (KernelType == CSRAdaptive) this->setBlocks(); } Loading @@ -121,11 +121,11 @@ void CSR< Real, Device, Index, KernelType >::setCompressedRowLengths( ConstCompr template< typename Real, typename Index, typename Device, CSRKernel KernelType, int maxElemPerWarp> CSRKernel KernelType> Index findLimit(const Index start, const Index max, const CSR< Real, Device, Index, KernelType >& matrix, const Index size, const Index maxElemPerWarp, Type &type, Index &sum) { sum = 0; Loading Loading @@ -158,33 +158,34 @@ template< typename Real, void CSR< Real, Device, Index, KernelType >::setBlocks() { const Index rows = this->getRowPointers().getSize(); Block *tmpBlocks = new Block[rows]; Index nextStart = 0, start = 0, cnt = 0, sum = 0; Index sum, start = 0, nextStart = 0; /* Fill blocks */ std::vector<Block> inBlock; inBlock.reserve(rows); // reserve space to avoid reallocation while (nextStart != rows - 1) { Type type; nextStart = findLimit<Real, Index, Device, KernelType, 384>( start, this->maxElementsPerWarp, *this, rows, type, sum nextStart = findLimit( start, 384, *this, rows, this->maxElementsPerWarp, type, sum ); if (type == Type::LONG) { uint32_t parts = roundUpDivision(sum, this->maxElementsPerWarp); uint32_t parts = roundUpDivision(sum, 384); for (uint32_t index = 0; index < parts; ++index) { tmpBlocks[cnt++].set(start, Type::LONG, index); inBlock.emplace_back(start, Type::LONG, index); } } else { tmpBlocks[cnt++].set(start, type); inBlock.emplace_back(start, type); } start = nextStart; } tmpBlocks[cnt++].set(nextStart); /* Copy to TNL Vector */ this->blocks.setSize(cnt); for (Index i = 0; i < cnt; ++i) this->blocks.setElement(i, tmpBlocks[i]); inBlock.emplace_back(nextStart); delete [] tmpBlocks; /* Copy values */ this->blocks.setSize(inBlock.size()); for (size_t i = 0; i < inBlock.size(); ++i) this->blocks.setElement(i, inBlock[i]); } template< typename Real, Loading Loading @@ -1495,40 +1496,6 @@ void SpMVCSRMultiVectorPrepare( const Real *inVector, } } /* Find limit of block */ // template< typename Real, // typename Index, // typename Device, // CSRKernel KernelType, // int maxElemPerWarp> // Index findLimit(const Index start, const Index max, // const CSR< Real, Device, Index, KernelType >& matrix, // const Index size, // Type &type, // Index &sum) { // sum = 0; // for (Index current = start; current < size - 1; ++current) { // Index elements = matrix.getRowPointers().getElement(current + 1) - // matrix.getRowPointers().getElement(current); // sum += elements; // if (sum > max) { // if (current - start > 1) { // extra row // type = STREAM; // return current; // } else { // one long row // if (sum <= maxElemPerWarp) // type = VECTOR; // else // type = LONG; // return current + 1; // } // } // } // type = STREAM; // return size - 1; // return last row pointer // } template< typename Real, typename Index, typename Device, Loading @@ -1551,38 +1518,10 @@ void SpMVCSRAdaptivePrepare( const Real *inVector, constexpr Index WARPS_PER_BLOCK = THREADS_PER_BLOCK / 32; constexpr Index SHARED_PER_WARP = 49152/sizeof(Real) / WARPS_PER_BLOCK; //-------------------------------------------------------------------- // Index blocks, sum, start = 0, nextStart = 0; Index blocks; const Index threads = THREADS_PER_BLOCK; /* Fill blocks */ // std::vector<Block> inBlock; // inBlock.reserve(rows); // reserve space to avoid reallocation // while (nextStart != rows - 1) { // Type type; // nextStart = findLimit<Real, Index, Device, KernelType, maxElemPerWarp>( // start, SHARED_PER_WARP, matrix, rows, type, sum // ); // if (type == LONG) { // uint32_t parts = roundUpDivision(sum, maxElemPerWarp); // for (uint32_t index = 0; index < parts; ++index) { // inBlock.emplace_back(start, LONG, index); // } // } else { // inBlock.emplace_back(start, type); // } // start = nextStart; // } // inBlock.emplace_back(nextStart); // /* blocks to GPU */ // Block *blocksAdaptive = nullptr; // cudaMalloc((void **)&blocksAdaptive, sizeof(*blocksAdaptive) * inBlock.size()); // cudaMemcpy(blocksAdaptive, inBlock.data(), inBlock.size() * sizeof(*blocksAdaptive), cudaMemcpyHostToDevice); // size_t neededThreads = inBlock.size() * 32; // one warp per block size_t neededThreads = matrix.blocks.getSize() * 32; // one warp per block /* Execute kernels on device */ for (Index grid = 0; neededThreads != 0; ++grid) { Loading @@ -1606,8 +1545,6 @@ void SpMVCSRAdaptivePrepare( const Real *inVector, grid ); } // cudaFree(blocksAdaptive); } #endif Loading Loading @@ -1760,43 +1697,43 @@ class CSRDeviceDependentCode< Devices::Cuda > inVector.getData(), outVector.getData() ); #else // switch(KernelType) // { // case CSRScalar: // SpMVCSRScalarPrepare<Real, Index>( // inVector.getData(), // outVector.getData(), // matrix.getRowPointers().getData(), // matrix.getColumnIndexes().getData(), // matrix.getValues().getData(), // matrix.getRowPointers().getSize() - 1, // matrix.getColumns() // ); // break; // case CSRVector: // SpMVCSRVectorPrepare<Real, Index, 32>( // inVector.getData(), // outVector.getData(), // matrix.getRowPointers().getData(), // matrix.getColumnIndexes().getData(), // matrix.getValues().getData(), // matrix.getRowPointers().getSize() - 1, // matrix.getColumns() // ); // break; // case CSRLight: // SpMVCSRLightPrepare<Real, Index>( // inVector.getData(), // outVector.getData(), // matrix.getRowPointers().getData(), // matrix.getColumnIndexes().getData(), // matrix.getValues().getData(), // matrix.getValues().getSize(), // matrix.getRowPointers().getSize() - 1, // matrix.getColumns() // ); // break; // case CSRAdaptive: switch(KernelType) { case CSRScalar: SpMVCSRScalarPrepare<Real, Index>( inVector.getData(), outVector.getData(), matrix.getRowPointers().getData(), matrix.getColumnIndexes().getData(), matrix.getValues().getData(), matrix.getRowPointers().getSize() - 1, matrix.getColumns() ); break; case CSRVector: SpMVCSRVectorPrepare<Real, Index, 32>( inVector.getData(), outVector.getData(), matrix.getRowPointers().getData(), matrix.getColumnIndexes().getData(), matrix.getValues().getData(), matrix.getRowPointers().getSize() - 1, matrix.getColumns() ); break; case CSRLight: SpMVCSRLightPrepare<Real, Index>( inVector.getData(), outVector.getData(), matrix.getRowPointers().getData(), matrix.getColumnIndexes().getData(), matrix.getValues().getData(), matrix.getValues().getSize(), matrix.getRowPointers().getSize() - 1, matrix.getColumns() ); break; case CSRAdaptive: SpMVCSRAdaptivePrepare<Real, Index, Device, KernelType, 32, 1024>( inVector.getData(), outVector.getData(), Loading @@ -1808,32 +1745,32 @@ class CSRDeviceDependentCode< Devices::Cuda > matrix.getRowPointers().getSize(), // don't add -1 ! matrix.getColumns() ); // break; // case CSRMultiVector: // SpMVCSRMultiVectorPrepare<Real, Index, 32, 1024>( // inVector.getData(), // outVector.getData(), // matrix.getRowPointers().getData(), // matrix.getColumnIndexes().getData(), // matrix.getValues().getData(), // matrix.getValues().getSize(), // matrix.getRowPointers().getSize() - 1, // matrix.getColumns() // ); // break; // case CSRLightWithoutAtomic: // SpMVCSRLightWithoutAtomicPrepare<Real, Index, 32, 1024>( // inVector.getData(), // outVector.getData(), // matrix.getRowPointers().getData(), // matrix.getColumnIndexes().getData(), // matrix.getValues().getData(), // matrix.getValues().getSize(), // matrix.getRowPointers().getSize() - 1, // matrix.getColumns() // ); // break; // } break; case CSRMultiVector: SpMVCSRMultiVectorPrepare<Real, Index, 32, 1024>( inVector.getData(), outVector.getData(), matrix.getRowPointers().getData(), matrix.getColumnIndexes().getData(), matrix.getValues().getData(), matrix.getValues().getSize(), matrix.getRowPointers().getSize() - 1, matrix.getColumns() ); break; case CSRLightWithoutAtomic: SpMVCSRLightWithoutAtomicPrepare<Real, Index, 32, 1024>( inVector.getData(), outVector.getData(), matrix.getRowPointers().getData(), matrix.getColumnIndexes().getData(), matrix.getValues().getData(), matrix.getValues().getSize(), matrix.getRowPointers().getSize() - 1, matrix.getColumns() ); break; } #endif /* HAVE_CUDA */ #endif } Loading