Commit feeeb410 authored by kolusask's avatar kolusask
Browse files

Optimize PrefixSum algorithm

parent 7c20d4e7
Loading
Loading
Loading
Loading
+2 −17
Original line number Diff line number Diff line
@@ -13,26 +13,11 @@ template<typename Item, typename Key, typename Device>
HashGraphV1<Item, Key, Device>::HashGraphV1(const Array<Item, Device>& items) :
            m_content(items.getSize()),
            m_items(items.getSize()),
            // The offset array must be a power of two so fill_offset() can run
            // its Blelloch up/down sweeps without bounds special-casing.
            // Round up from getSize() + 1 (not getSize()): readers rely on a
            // "total count" slot at index getSize() (the pre-scan code sized
            // this array getSize() + 1 and wrote the total there), and with
            // plain getSize() the power-of-two size collapses to exactly
            // getSize() whenever it is already a power of two, losing that
            // slot.  NOTE(review): also avoids log2(0) when items is empty —
            // confirm empty input is a supported case.
            m_offset((1 << int(ceil(log2(items.getSize() + 1)))), 0),
            // Multiplicative hash (base 31, seed 10538) mapping keys into
            // [0, items.getSize()) bucket indices.
            m_hash(31, 10538, items.getSize()),
            m_view(std::make_shared<ViewType>(*this, items.getConstView(), 
                                              Array<int, Device>(items.getSize()).getView(),
                                              Array<int, Device>(items.getSize()).getView())) {}

template<typename Item, typename Key, typename Device>
int HashGraphV1<Item, Key, Device>::duplicates() const {
+22 −7
Original line number Diff line number Diff line
@@ -62,15 +62,30 @@ HashGraphV1View<Item, Key, Device>::~HashGraphV1View() {}
template<typename Item, typename Key, typename Device>
void HashGraphV1View<Item, Key, Device>::fill_offset(const ArrayView<int, Device>& counter) {
    // Computes an exclusive prefix sum of `counter` into m_offset using the
    // work-efficient Blelloch scan: an up-sweep (reduction tree) followed by
    // a down-sweep.  The index arithmetic below assumes m_offset.getSize()
    // is a power of two >= counter.getSize() (enforced by the constructor).
    auto offset = m_offset;

    // Phase 1: copy the bucket counters in, zero-padding the tail up to the
    // power-of-two scan size so the padding contributes nothing to the sums.
    auto fill = [offset, counter] __cuda_callable__ (int i) mutable {
        offset[i] = i >= counter.getSize() ? 0 : counter[i];
    };
    TNL::Algorithms::ParallelFor<Device>::exec(0, offset.getSize(), fill);

    // Phase 2 (up-sweep): at level d, the last element of every
    // 2^(d+1)-aligned block accumulates the sum of its two child blocks; the
    // pairs touched within one level are disjoint, so the full-range
    // ParallelFor with the modulo guard is race-free.
    auto up_sweep = [offset] __cuda_callable__ (int k, int d) mutable {
        if (k % (1 << (d + 1)) == 0)
            offset[k + (1 << (d + 1)) - 1] = offset[k + (1 << d) - 1]
                                           + offset[k + (1 << (d + 1)) - 1];
    };
    for (int d = 0; d < log2(offset.getSize() - 1); d++)
        TNL::Algorithms::ParallelFor<Device>::exec(0, offset.getSize(), up_sweep, d);

    // Phase 3 (down-sweep): clear the root (this is what makes the scan
    // exclusive), then walk the tree back down.  At level d the active
    // positions are the last index of each 2^d-aligned block; each swaps its
    // partial prefix into its left sibling and accumulates the sibling's sum.
    offset.setElement(offset.getSize() - 1, 0);
    auto down_sweep = [offset] __cuda_callable__ (int r, int d) mutable {
        if ((offset.getSize() - 1 - r) % (1 << d) == 0) {
            int l = r - (1 << (d - 1));  // left sibling at this level
            int t = offset[r];
            offset[r] += offset[l];
            offset[l] = t;               // parent's prefix moves to the left child
        }
    };
    for (int d = log2(offset.getSize()); d > 0; d--)
        TNL::Algorithms::ParallelFor<Device>::exec(0, offset.getSize(), down_sweep, d);
}

template<typename Item, typename Key, typename Device>