Loading src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables.py +10 −6 Original line number Diff line number Diff line Loading @@ -20,6 +20,7 @@ cpu_matrix_formats = [ 'CSR', gpu_matrix_formats = [ 'CSR Legacy Scalar', 'CSR Legacy Vector', 'CSR Legacy MultiVector', 'CSR Legacy Light', 'CSR Legacy Light2', 'CSR Legacy Light3', 'CSR Legacy Light4', 'CSR Legacy Light5', 'CSR Legacy Light6', 'CSR Legacy LightWithoutAtomic', 'CSR Legacy Adaptive', 'CSR< Scalar >', 'CSR< Vector >', 'CSR< Hybrid >', 'CSR< Adaptive >', 'Ellpack', 'Ellpack Legacy', 'SlicedEllpack', 'SlicedEllpack Legacy', 'ChunkedEllpack', 'ChunkedEllpack Legacy', Loading @@ -36,7 +37,10 @@ cpu_comparison_formats = { 'CSR' : 'CSR Legacy Scalar', """ GPU formats to be compared """ gpu_comparison_formats = { #'CSR' : 'CSR Legacy Scalar', gpu_comparison_formats = { 'CSR< Scalar >' : 'CSR Legacy Scalar', 'CSR< Vector >' : 'CSR Legacy Vector', 'CSR< Hybrid >' : 'CSR Legacy LightWithoutAtomic', 'CSR< Adaptive >' : 'CSR Legacy Adaptive', 'Ellpack' : 'Ellpack Legacy', 'SlicedEllpack' : 'SlicedEllpack Legacy', 'BiEllpack' : 'BiEllpack Legacy' Loading src/TNL/Algorithms/Segments/CSRKernelAdaptive.h +21 −16 Original line number Diff line number Diff line Loading @@ -220,6 +220,11 @@ struct CSRKernelAdaptiveView ConstViewType getConstView() const { return *this; }; static TNL::String getKernelType() { return "Adaptive"; }; template< typename OffsetsView, typename Fetch, typename Reduction, Loading Loading @@ -344,6 +349,10 @@ struct CSRKernelAdaptive using BlocksType = typename ViewType::BlocksType; using BlocksView = typename BlocksType::ViewType; static TNL::String getKernelType() { return ViewType::getKernelType(); }; static constexpr Index THREADS_ADAPTIVE = sizeof(Index) == 8 ? 128 : 256; Loading Loading @@ -373,10 +382,11 @@ struct CSRKernelAdaptive Index &sum ) { sum = 0; TNL::Containers::Vector< typename Offsets::IndexType, TNL::Devices::Host, typename Offsets::IndexType > hostOffsets( offsets ); for (Index current = start; current < size - 1; current++ ) { Index elements = offsets.getElement(current + 1) - offsets.getElement(current); Index elements = hostOffsets[ current + 1 ] - hostOffsets[ current ]; sum += elements; if( sum > SHARED_PER_WARP ) { Loading Loading @@ -407,8 +417,8 @@ struct CSRKernelAdaptive Index sum, start( 0 ), nextStart( 0 ); // Fill blocks std::vector< details::CSRAdaptiveKernelBlockDescriptor< Index > > inBlock; inBlock.reserve( rows ); std::vector< details::CSRAdaptiveKernelBlockDescriptor< Index > > inBlocks; inBlocks.reserve( rows ); while( nextStart != rows - 1 ) { Loading @@ -417,34 +427,29 @@ struct CSRKernelAdaptive if( type == details::Type::LONG ) { const Index blocksCount = inBlock.size(); const Index blocksCount = inBlocks.size(); const Index warpsPerCudaBlock = THREADS_ADAPTIVE / TNL::Cuda::getWarpSize(); Index warpsLeft = roundUpDivision( blocksCount, warpsPerCudaBlock ) * warpsPerCudaBlock - blocksCount; if( warpsLeft == 0 ) warpsLeft = warpsPerCudaBlock; //Index parts = roundUpDivision(sum, this->SHARED_PER_WARP); inBlock.emplace_back( start, details::Type::LONG, 0, warpsLeft ); inBlocks.emplace_back( start, details::Type::LONG, 0, warpsLeft ); for( Index index = 1; index < warpsLeft; index++ ) { inBlock.emplace_back( start, details::Type::LONG, index, warpsLeft ); inBlocks.emplace_back( start, details::Type::LONG, index, warpsLeft ); } } else { inBlock.emplace_back(start, type, inBlocks.emplace_back(start, type, nextStart, offsets.getElement(nextStart), offsets.getElement(start) ); } start = nextStart; } inBlock.emplace_back(nextStart); // Copy values this->blocks.setSize(inBlock.size()); for (size_t i = 0; i < inBlock.size(); ++i) this->blocks.setElement(i, inBlock[i]); inBlocks.emplace_back(nextStart); this->blocks = inBlocks; this->view.setBlocks( blocks ); }; Loading src/TNL/Algorithms/Segments/CSRKernelHybrid.h +1 −0 Original line number Diff line number Diff line Loading @@ -38,6 +38,7 @@ struct CSRKernelHybrid ConstViewType getConstView() const; static TNL::String getKernelType(); template< typename OffsetsView, typename Fetch, Loading src/TNL/Algorithms/Segments/CSRKernelHybrid.hpp +9 −0 Original line number Diff line number Diff line Loading @@ -114,6 +114,15 @@ getView() -> ViewType return *this; } template< typename Index, typename Device > TNL::String CSRKernelHybrid< Index, Device >:: getKernelType() { return "Hybrid"; } template< typename Index, typename Device > auto Loading src/TNL/Algorithms/Segments/CSRKernelScalar.h +2 −0 Original line number Diff line number Diff line Loading @@ -38,6 +38,8 @@ struct CSRKernelScalar ConstViewType getConstView() const; static TNL::String getKernelType(); template< typename OffsetsView, typename Fetch, typename Reduction, Loading Loading
src/Benchmarks/scripts/tnl-spmv-benchmark-make-tables.py +10 −6 Original line number Diff line number Diff line Loading @@ -20,6 +20,7 @@ cpu_matrix_formats = [ 'CSR', gpu_matrix_formats = [ 'CSR Legacy Scalar', 'CSR Legacy Vector', 'CSR Legacy MultiVector', 'CSR Legacy Light', 'CSR Legacy Light2', 'CSR Legacy Light3', 'CSR Legacy Light4', 'CSR Legacy Light5', 'CSR Legacy Light6', 'CSR Legacy LightWithoutAtomic', 'CSR Legacy Adaptive', 'CSR< Scalar >', 'CSR< Vector >', 'CSR< Hybrid >', 'CSR< Adaptive >', 'Ellpack', 'Ellpack Legacy', 'SlicedEllpack', 'SlicedEllpack Legacy', 'ChunkedEllpack', 'ChunkedEllpack Legacy', Loading @@ -36,7 +37,10 @@ cpu_comparison_formats = { 'CSR' : 'CSR Legacy Scalar', """ GPU formats to be compared """ gpu_comparison_formats = { #'CSR' : 'CSR Legacy Scalar', gpu_comparison_formats = { 'CSR< Scalar >' : 'CSR Legacy Scalar', 'CSR< Vector >' : 'CSR Legacy Vector', 'CSR< Hybrid >' : 'CSR Legacy LightWithoutAtomic', 'CSR< Adaptive >' : 'CSR Legacy Adaptive', 'Ellpack' : 'Ellpack Legacy', 'SlicedEllpack' : 'SlicedEllpack Legacy', 'BiEllpack' : 'BiEllpack Legacy' Loading
src/TNL/Algorithms/Segments/CSRKernelAdaptive.h +21 −16 Original line number Diff line number Diff line Loading @@ -220,6 +220,11 @@ struct CSRKernelAdaptiveView ConstViewType getConstView() const { return *this; }; static TNL::String getKernelType() { return "Adaptive"; }; template< typename OffsetsView, typename Fetch, typename Reduction, Loading Loading @@ -344,6 +349,10 @@ struct CSRKernelAdaptive using BlocksType = typename ViewType::BlocksType; using BlocksView = typename BlocksType::ViewType; static TNL::String getKernelType() { return ViewType::getKernelType(); }; static constexpr Index THREADS_ADAPTIVE = sizeof(Index) == 8 ? 128 : 256; Loading Loading @@ -373,10 +382,11 @@ struct CSRKernelAdaptive Index &sum ) { sum = 0; TNL::Containers::Vector< typename Offsets::IndexType, TNL::Devices::Host, typename Offsets::IndexType > hostOffsets( offsets ); for (Index current = start; current < size - 1; current++ ) { Index elements = offsets.getElement(current + 1) - offsets.getElement(current); Index elements = hostOffsets[ current + 1 ] - hostOffsets[ current ]; sum += elements; if( sum > SHARED_PER_WARP ) { Loading Loading @@ -407,8 +417,8 @@ struct CSRKernelAdaptive Index sum, start( 0 ), nextStart( 0 ); // Fill blocks std::vector< details::CSRAdaptiveKernelBlockDescriptor< Index > > inBlock; inBlock.reserve( rows ); std::vector< details::CSRAdaptiveKernelBlockDescriptor< Index > > inBlocks; inBlocks.reserve( rows ); while( nextStart != rows - 1 ) { Loading @@ -417,34 +427,29 @@ struct CSRKernelAdaptive if( type == details::Type::LONG ) { const Index blocksCount = inBlock.size(); const Index blocksCount = inBlocks.size(); const Index warpsPerCudaBlock = THREADS_ADAPTIVE / TNL::Cuda::getWarpSize(); Index warpsLeft = roundUpDivision( blocksCount, warpsPerCudaBlock ) * warpsPerCudaBlock - blocksCount; if( warpsLeft == 0 ) warpsLeft = warpsPerCudaBlock; //Index parts = roundUpDivision(sum, this->SHARED_PER_WARP); inBlock.emplace_back( start, details::Type::LONG, 0, warpsLeft ); inBlocks.emplace_back( start, details::Type::LONG, 0, warpsLeft ); for( Index index = 1; index < warpsLeft; index++ ) { inBlock.emplace_back( start, details::Type::LONG, index, warpsLeft ); inBlocks.emplace_back( start, details::Type::LONG, index, warpsLeft ); } } else { inBlock.emplace_back(start, type, inBlocks.emplace_back(start, type, nextStart, offsets.getElement(nextStart), offsets.getElement(start) ); } start = nextStart; } inBlock.emplace_back(nextStart); // Copy values this->blocks.setSize(inBlock.size()); for (size_t i = 0; i < inBlock.size(); ++i) this->blocks.setElement(i, inBlock[i]); inBlocks.emplace_back(nextStart); this->blocks = inBlocks; this->view.setBlocks( blocks ); }; Loading
src/TNL/Algorithms/Segments/CSRKernelHybrid.h +1 −0 Original line number Diff line number Diff line Loading @@ -38,6 +38,7 @@ struct CSRKernelHybrid ConstViewType getConstView() const; static TNL::String getKernelType(); template< typename OffsetsView, typename Fetch, Loading
src/TNL/Algorithms/Segments/CSRKernelHybrid.hpp +9 −0 Original line number Diff line number Diff line Loading @@ -114,6 +114,15 @@ getView() -> ViewType return *this; } template< typename Index, typename Device > TNL::String CSRKernelHybrid< Index, Device >:: getKernelType() { return "Hybrid"; } template< typename Index, typename Device > auto Loading
src/TNL/Algorithms/Segments/CSRKernelScalar.h +2 −0 Original line number Diff line number Diff line Loading @@ -38,6 +38,8 @@ struct CSRKernelScalar ConstViewType getConstView() const; static TNL::String getKernelType(); template< typename OffsetsView, typename Fetch, typename Reduction, Loading