Commit feeeb410 authored by kolusask's avatar kolusask
Browse files

Optimize PrefixSum algorithm

parent 7c20d4e7
Loading
Loading
Loading
Loading
+2 −17
Original line number Diff line number Diff line
@@ -13,26 +13,11 @@ template<typename Item, typename Key, typename Device>
HashGraphV1<Item, Key, Device>::HashGraphV1(const Array<Item, Device>& items) :
            m_content(items.getSize()),
            m_items(items.getSize()),
            // The offset array must be a power of two so fill_offset() can run
            // its Blelloch up/down sweeps without bounds special-casing.
            // Round up from getSize() + 1 (not getSize()): readers rely on a
            // "total count" slot at index getSize() (the pre-scan code sized
            // this array getSize() + 1 and wrote the total there), and with
            // plain getSize() the power-of-two size collapses to exactly
            // getSize() whenever it is already a power of two, losing that
            // slot.  NOTE(review): also avoids log2(0) when items is empty —
            // confirm empty input is a supported case.
            m_offset((1 << int(ceil(log2(items.getSize() + 1)))), 0),
            // Multiplicative hash (base 31, seed 10538) mapping keys into
            // [0, items.getSize()) bucket indices.
            m_hash(31, 10538, items.getSize()),
            m_view(std::make_shared<ViewType>(*this, items.getConstView(), 
                                              Array<int, Device>(items.getSize()).getView(),
                                              Array<int, Device>(items.getSize()).getView())) {}

template<typename Item, typename Key, typename Device>
int HashGraphV1<Item, Key, Device>::duplicates() const {
+22 −7
Original line number Diff line number Diff line
@@ -62,15 +62,30 @@ HashGraphV1View<Item, Key, Device>::~HashGraphV1View() {}
template<typename Item, typename Key, typename Device>
void HashGraphV1View<Item, Key, Device>::fill_offset(const ArrayView<int, Device>& counter) {
    // Computes an exclusive prefix sum of `counter` into m_offset using the
    // work-efficient Blelloch scan: an up-sweep (reduction tree) followed by
    // a down-sweep.  The index arithmetic below assumes m_offset.getSize()
    // is a power of two >= counter.getSize() (enforced by the constructor).
    auto offset = m_offset;

    // Phase 1: copy the bucket counters in, zero-padding the tail up to the
    // power-of-two scan size so the padding contributes nothing to the sums.
    auto fill = [offset, counter] __cuda_callable__ (int i) mutable {
        offset[i] = i >= counter.getSize() ? 0 : counter[i];
    };
    TNL::Algorithms::ParallelFor<Device>::exec(0, offset.getSize(), fill);

    // Phase 2 (up-sweep): at level d, the last element of every
    // 2^(d+1)-aligned block accumulates the sum of its two child blocks; the
    // pairs touched within one level are disjoint, so the full-range
    // ParallelFor with the modulo guard is race-free.
    auto up_sweep = [offset] __cuda_callable__ (int k, int d) mutable {
        if (k % (1 << (d + 1)) == 0)
            offset[k + (1 << (d + 1)) - 1] = offset[k + (1 << d) - 1]
                                           + offset[k + (1 << (d + 1)) - 1];
    };
    for (int d = 0; d < log2(offset.getSize() - 1); d++)
        TNL::Algorithms::ParallelFor<Device>::exec(0, offset.getSize(), up_sweep, d);

    // Phase 3 (down-sweep): clear the root (this is what makes the scan
    // exclusive), then walk the tree back down.  At level d the active
    // positions are the last index of each 2^d-aligned block; each swaps its
    // partial prefix into its left sibling and accumulates the sibling's sum.
    offset.setElement(offset.getSize() - 1, 0);
    auto down_sweep = [offset] __cuda_callable__ (int r, int d) mutable {
        if ((offset.getSize() - 1 - r) % (1 << d) == 0) {
            int l = r - (1 << (d - 1));  // left sibling at this level
            int t = offset[r];
            offset[r] += offset[l];
            offset[l] = t;               // parent's prefix moves to the left child
        }
    };
    for (int d = log2(offset.getSize()); d > 0; d--)
        TNL::Algorithms::ParallelFor<Device>::exec(0, offset.getSize(), down_sweep, d);
}

template<typename Item, typename Key, typename Device>