Loading implementation/BPlusTree/benchmark/benchmark_cuda.cu +2 −3 Original line number Diff line number Diff line #include "BPlusTree/Default.hpp" #include "BLinkTree/Default.hpp" #include "BTreeContainer/Default.hpp" #include "TNL/Devices/Cuda.h" #include "benchmark.hpp" Loading @@ -12,8 +12,7 @@ int main(int argc, char **argv) { Benchmark::execute<Benchmark::Device::Cuda, KeyType>( "btree_cuda_v7", [](auto &timer, auto input) { size_t size = getContainerSize(input.size(), Order); BTreeContainer<BNode, BNodeLatch, BPlusTree, KeyType, ValueType, Order, TNL::Devices::Cuda> container(size); BTreeContainer<KeyType, ValueType, Order, TNL::Devices::Cuda, BLinkTree> container(size); Containers::Array<KeyType, TNL::Devices::Cuda> keys(input); Containers::Array<ValueType, TNL::Devices::Cuda> values(input); Loading implementation/BPlusTree/benchmark/benchmark_host.cpp +2 −3 Original line number Diff line number Diff line #include "BPlusTree/Default.hpp" #include "BLinkTree/Default.hpp" #include "BTreeContainer/Default.hpp" #include "TNL/Devices/Cuda.h" #include "benchmark.hpp" Loading @@ -12,8 +12,7 @@ int main(int argc, char **argv) { Benchmark::execute<Benchmark::Device::Host, KeyType>( "btree_host_v7", [](auto &timer, auto input) { size_t size = getContainerSize(input.size(), Order); BTreeContainer<BNode, BNodeLatch, BPlusTree, KeyType, ValueType, Order, TNL::Devices::Host> container(size); BTreeContainer<KeyType, ValueType, Order, TNL::Devices::Host, BLinkTree> container(size); Containers::Array<KeyType, TNL::Devices::Host> keys(input); Containers::Array<ValueType, TNL::Devices::Host> values(input); Loading implementation/BPlusTree/src/BLinkTree/BLinkNode.hpp 0 → 100644 +33 −0 Original line number Diff line number Diff line #pragma once #include <TNL/Assert.h> #include <TNL/Cuda/CudaCallable.h> #include <TNL/TypeTraits.h> #include <cstddef> #include <stdint.h> template <typename KeyType, typename ValueType, size_t Order> struct BLinkNode { BLinkNode *volatile mSibling; KeyType mHighKey; uint8_t mHighKeyFlag; uint8_t mLeaf; uint8_t mSize; int mWriteLock = false; KeyType mKeys[Order]; ValueType mValues[Order]; BLinkNode *volatile mChildren[Order]; __cuda_callable__ inline bool leaf() { return this->mLeaf; } __cuda_callable__ inline size_t size() { return this->mSize; } __cuda_callable__ inline size_t childSize() { if (this->mLeaf == false && this->mSize > 0) { return this->mSize + 1; } return this->mSize; } }; No newline at end of file implementation/BPlusTree/src/BLinkTree/BLinkTreeCuda.cu 0 → 100644 +431 −0 Original line number Diff line number Diff line #pragma once #include <TNL/Cuda/CudaCallable.h> #include <algorithm> #include <cstdio> #include <iostream> #include <sstream> #include <string> #include <cooperative_groups.h> #include "../Constants.hpp" #include "../Utils/BumpAllocator.hpp" #include "../Utils/Debugger.hpp" #include "../Utils/Print.hpp" #include "../BNodeLatch/Default.hpp" #include "./BNodeOperations/Default.hpp" #include "TNL/Assert.h" template <typename KeyType, typename ValueType, size_t Order, typename Device> class BLinkTree; template <typename KeyType, typename ValueType, size_t Order> class BLinkTree<KeyType, ValueType, Order, Devices::Cuda> { using Device = Devices::Cuda; using BNode = BLinkNode<KeyType, ValueType, Order>; using Latch = BNodeLatch<BNode, Device>; using Allocator = BumpAllocator<BNode, Device>; using _Operations = BLinkOperations<KeyType, ValueType, Order, Device>; template <typename Warp> __device__ static inline BNode *scan(BNode *node, KeyType key, Warp &warp) { if (node->mHighKeyFlag && key >= node->mHighKey) { return node->mSibling; } size_t size = node->size(); size_t childSize = node->childSize(); size_t targetIndex = upperBoundWarp(node->mKeys, size, key, warp); if (targetIndex >= childSize) { return nullptr; } BNode *result = node->mChildren[targetIndex]; return result; } template <typename Warp> __device__ static inline BNode *moveSide(BNode *cursor, KeyType key, Warp &warp) { BNode *tmp, *result = cursor; while (result != nullptr && result->mSibling != nullptr && (tmp = scan(result, key, warp)) == result->mSibling) { result = tmp; } return result; } template <typename Warp> __device__ static inline BNode *findLeaf(BNode *root, KeyType key, Warp &warp) { BNode *cursor = root; while (cursor != nullptr && cursor->leaf() == false) { cursor = scan(cursor, key, warp); } cursor = moveSide(cursor, key, warp); return cursor; } template <typename Warp> __device__ static BNode *splitNode(BNode *cursor, Allocator &alloc, size_t siblingStart, size_t cursorCount, Warp &warp) { BNode *sibling = nullptr; auto rank = warp.thread_rank(); if (rank == 0) { sibling = alloc.allocate(); } _Operations::init(warp, sibling, cursor); warp.sync(); sibling = warp.shfl(sibling, 0); size_t size = cursor->size(); size_t childSize = cursor->childSize(); bool isLeaf = sibling->leaf(); if (rank >= siblingStart) { if (rank < childSize) { sibling->mChildren[rank - siblingStart] = cursor->mChildren[rank]; } if (rank < size) { sibling->mKeys[rank - siblingStart] = cursor->mKeys[rank]; sibling->mValues[rank - siblingStart] = cursor->mValues[rank]; sibling->mSize = size - siblingStart; } } cursor->mSize = cursorCount; cursor->mSibling = sibling; // wait until all sibling have written // warp.sync(); cursor->mHighKey = sibling->mKeys[0]; cursor->mHighKeyFlag = true; if (warp.thread_rank() == 0) { DBG_SPLIT(cursor, siblingStart, cursorCount, sibling); } return sibling; } template <typename Warp> __device__ static inline void insertIntoFreeNode(BNode *curr, KeyType insertKey, ValueType value, BNode *insertNode, Warp &warp) { auto rank = warp.thread_rank(); size_t keyIdx = upperBoundWarp(curr->mKeys, curr->size(), insertKey, warp); bool isLeaf = curr->leaf(); size_t childSize = curr->childSize(); if (isLeaf) { if (rank == 0) { DBG_INSERT_LEAF(curr, insertKey, keyIdx); } _Operations::insertChild(curr, keyIdx, insertNode, childSize, warp); _Operations::insertKey(curr, keyIdx, insertKey, value, warp); } else { if (rank == 0) { DBG_INSERT_INTERNAL(curr, insertKey, insertNode, keyIdx, (keyIdx + 1)); } _Operations::insertChild(curr, keyIdx + 1, insertNode, childSize, warp); _Operations::insertKey(curr, keyIdx, insertKey, warp); } warp.sync(); } template <typename Warp> __device__ static inline BNode *increaseTreeHeight(BNode *leftNode, KeyType insertKey, BNode *rightNode, Allocator &alloc, Warp &warp) { BNode *rootNode = nullptr; auto rank = warp.thread_rank(); if (rank == 0) { rootNode = alloc.allocate(); } rootNode = warp.shfl(rootNode, 0); _Operations::init(warp, rootNode, false, nullptr, false); warp.sync(); if (rank == 0) { rootNode->mKeys[rank] = insertKey; rootNode->mChildren[rank] = leftNode; } if (rank == 1) { rootNode->mChildren[rank] = rightNode; } rootNode->mSize = 1; if (rank == 0) { DBG_INCREASE_HEIGHT(leftNode, rootNode, rightNode, insertKey); } return rootNode; } template <typename Warp> __device__ static inline bool attemptLatchWarp(BNode *node, Latch &latch, Warp &warp) { bool attempt = false; if (warp.thread_rank() == 0) { attempt = latch.attempt(node); } attempt = warp.any(attempt); return attempt; } template <typename Warp> __device__ static inline void releaseLatchWarp(BNode *node, Latch &latch, Warp &warp) { if (warp.thread_rank() == 0) { latch.release(node); } warp.sync(); } template <typename Warp> __device__ static inline bool verifyParent(BNode *curr, BNode *prev, Warp &warp) { auto rank = warp.thread_rank(); size_t size = prev->childSize(); bool found = false; if (rank < size) { found = prev->mChildren[rank] == curr; } return warp.any(found); } template <typename Warp> __device__ static inline size_t lowerBoundWarp(KeyType *haystack, size_t haystackSize, KeyType needle, Warp &warp) { auto rank = warp.thread_rank(); TNL_ASSERT_GE(warp.size(), Order, "Warp must be larger than order"); bool matches = false; if (rank < haystackSize) { matches = haystack[rank] >= needle; } auto ballot = warp.ballot(matches); size_t res = __ffs(ballot) - 1; if (res > haystackSize) return haystackSize; return res; } template <typename Warp> __device__ static inline size_t upperBoundWarp(KeyType *haystack, size_t haystackSize, KeyType needle, Warp &warp) { auto rank = warp.thread_rank(); TNL_ASSERT_GE(warp.size(), Order, "Warp must be larger than order"); bool matches = false; if (rank < haystackSize) { matches = haystack[rank] > needle; } auto ballot = warp.ballot(matches); size_t res = __ffs(ballot) - 1; if (res > haystackSize) return haystackSize; return res; } public: using Node = BNode; using Operations = _Operations; template <typename Warp> __device__ static inline BNode *init(Allocator &alloc, Warp &warp) { BNode *root = alloc.allocate(); _Operations::init(warp, root, true, nullptr, false); DBG_INIT(root, Order); return root; } template <typename Warp> __device__ static inline bool find(BNode *root, KeyType key, ValueType &result, Warp &warp) { auto rank = warp.thread_rank(); BNode *next = root; while (next) { bool isLeaf = next->leaf(); size_t size = next->size(); KeyType nextKey = next->mKeys[rank]; uint32_t ballot = warp.ballot(nextKey >= key); size_t targetIdx = __ffs(ballot) - 1; targetIdx = (targetIdx > size) ? size : targetIdx; KeyType targetKey = warp.shfl(nextKey, targetIdx); if (isLeaf) { if (targetIdx < size && targetKey == key) { result = next->mValues[targetIdx]; return true; } return false; } // assumption: the tree is fully constructed and insert / splits are done next = next->mChildren[targetIdx + (targetKey == key)]; } return false; } template <typename Warp> __device__ static bool insert(BNode *&root, KeyType key, ValueType value, Allocator &alloc, Latch &latch, Warp &warp) { BNode *prev = nullptr, *curr = nullptr; do { __threadfence(); BNode *tmpPrev = prev; if (curr == nullptr) { curr = root; } else { prev = curr; curr = scan(curr, key, warp); } if (curr != nullptr && curr->size() + !curr->leaf() >= Order) { // if node is full, we haven't propagated, restart the operation if (prev && curr != root && prev->mSibling == curr) { return false; } if (attemptLatchWarp(curr, latch, warp) == false) { return false; } size_t medianIdx = curr->size() / 2; size_t splitIdx = medianIdx + (!curr->leaf()); KeyType medianKey = curr->mKeys[medianIdx]; if (curr == root) { BNode *split = splitNode(curr, alloc, splitIdx, medianIdx, warp); __threadfence(); BNode *newRoot = increaseTreeHeight(curr, medianKey, split, alloc, warp); __threadfence(); releaseLatchWarp(curr, latch, warp); root = newRoot; curr = prev; prev = tmpPrev; } else { // latch the parent too if (attemptLatchWarp(prev, latch, warp) == false) { releaseLatchWarp(curr, latch, warp); return false; } if (prev && verifyParent(curr, prev, warp) == false) { releaseLatchWarp(curr, latch, warp); releaseLatchWarp(prev, latch, warp); return false; } // verify, if the parent has not subsequently went full (esp. when ) if (prev->size() + !prev->leaf() >= Order) { releaseLatchWarp(curr, latch, warp); releaseLatchWarp(prev, latch, warp); return false; } BNode *split = splitNode(curr, alloc, splitIdx, medianIdx, warp); __threadfence(); releaseLatchWarp(curr, latch, warp); insertIntoFreeNode(prev, medianKey, 0, split, warp); __threadfence(); releaseLatchWarp(prev, latch, warp); curr = prev; prev = tmpPrev; } } } while (curr != nullptr && curr->leaf() == false); if (curr == nullptr) { return false; } if (attemptLatchWarp(curr, latch, warp) == false) { return false; } BNode *tmp; while (curr->mSibling != nullptr && (tmp = scan(curr, key, warp)) == curr->mSibling) { bool attempt = attemptLatchWarp(tmp, latch, warp); releaseLatchWarp(curr, latch, warp); if (attempt == false) { return false; } curr = tmp; } // here we don't check if the node has subsequently become full if (curr->size() + !curr->leaf() >= Order) { releaseLatchWarp(curr, latch, warp); return false; } insertIntoFreeNode(curr, key, value, nullptr, warp); __threadfence(); releaseLatchWarp(curr, latch, warp); return true; } template <typename Warp> __device__ static inline bool remove(BNode *root, KeyType key, Latch &latch, Warp &warp) { BNode *cursor = findLeaf(root, key, warp); if (cursor == nullptr) { return true; } if (attemptLatchWarp(cursor, latch, warp) == false) { return false; } __threadfence(); // remove item from keys and remove the respective node size_t keyIdx = lowerBoundWarp(cursor->mKeys, cursor->size(), key, warp); if (keyIdx >= cursor->size() || cursor->mKeys[keyIdx] != key) { releaseLatchWarp(cursor, latch, warp); return true; } _Operations::removeChild(cursor, keyIdx, cursor->childSize(), warp); _Operations::removeKey(cursor, keyIdx, warp); warp.sync(); if (warp.thread_rank() == 0) { DBG_DELETE(cursor, keyIdx); } releaseLatchWarp(cursor, latch, warp); return true; } }; implementation/BPlusTree/src/BLinkTree/BLinkTreeHost.hpp 0 → 100644 +229 −0 Original line number Diff line number Diff line #pragma once #define BTREE_HOST 1 #include <TNL/Cuda/CudaCallable.h> #include <algorithm> #include <iostream> #include <sstream> #include <string> #include "../Constants.hpp" #include "../Utils/Algorithms.hpp" #include "../Utils/BumpAllocator.hpp" #include "../Utils/Print.hpp" #include "../BNodeLatch/Default.hpp" #include "./BLinkNode.hpp" #include "./BNodeOperations/Default.hpp" template <typename KeyType, typename ValueType, size_t Order, typename Device> class BLinkTree; template <typename KeyType, typename ValueType, size_t Order> class BLinkTree<KeyType, ValueType, Order, Devices::Host> { using Device = Devices::Host; using BNode = BLinkNode<KeyType, ValueType, Order>; using Latch = BNodeLatch<BNode, Device>; using Allocator = BumpAllocator<BNode, Device>; using _Operations = BLinkOperations<KeyType, ValueType, Order, Device>; __cuda_callable__ static inline BNode *scan(BNode *node, KeyType key) { if (node->mHighKeyFlag && key >= node->mHighKey) { return node->mSibling; } size_t size = node->size(); size_t childSize = node->childSize(); size_t targetIndex = upperBound(node->mKeys, size, key); if (targetIndex == node->childSize()) { return nullptr; } BNode *result = node->mChildren[targetIndex]; return result; } __cuda_callable__ static inline BNode *moveSide(BNode *cursor, KeyType key) { BNode *tmp, *result = cursor; while (result != nullptr && result->mSibling != nullptr && (tmp = scan(result, key)) == result->mSibling) { result = tmp; } return result; } __cuda_callable__ static inline BNode *findLeaf(BNode *root, KeyType key) { BNode *cursor = root; while (cursor != nullptr && cursor->leaf() == false) { cursor = scan(cursor, key); } cursor = moveSide(cursor, key); return cursor; } __cuda_callable__ static BNode *splitNode(BNode *cursor, Allocator &alloc, size_t siblingStart, size_t cursorCount) { BNode *sibling = alloc.allocate(); _Operations::init(sibling, cursor); for (size_t i = siblingStart; i < cursor->childSize(); ++i) { BNode *node = cursor->mChildren[i]; _Operations::appendChild(sibling, node, i - siblingStart); } for (size_t i = siblingStart; i < cursor->size(); ++i) { if (sibling->leaf()) { _Operations::appendKey(sibling, cursor->mKeys[i], cursor->mValues[i]); } else { _Operations::appendKey(sibling, cursor->mKeys[i]); } } cursor->mSize = cursorCount; cursor->mSibling = sibling; cursor->mHighKey = sibling->mKeys[0]; cursor->mHighKeyFlag = true; return sibling; } __cuda_callable__ static inline void insertIntoFreeNode(BNode *curr, KeyType insertKey, ValueType value, BNode *insertNode) { size_t keyIdx = upperBound(curr->mKeys, curr->size(), insertKey); if (curr->leaf()) { _Operations::insertChild(curr, keyIdx, insertNode, curr->childSize()); _Operations::insertKey(curr, keyIdx, insertKey, value); } else { _Operations::insertChild(curr, keyIdx + 1, insertNode, curr->childSize()); _Operations::insertKey(curr, keyIdx, insertKey); } } __cuda_callable__ static inline void increaseTreeHeight(BNode *curr, KeyType insertKey, BNode *insertNode, Allocator &alloc) { BNode *leftNode = alloc.allocate(); // copy the entire node to the element _Operations::init(leftNode, curr); for (size_t childKey = 0; childKey < curr->childSize(); ++childKey) { BNode *child = curr->mChildren[childKey]; _Operations::appendChild(leftNode, child, childKey); if (childKey < curr->size()) { if (leftNode->leaf()) { _Operations::appendKey(leftNode, curr->mKeys[childKey], curr->mValues[childKey]); } else { _Operations::appendKey(leftNode, curr->mKeys[childKey]); } } } // this might cause some issues in synchronization _Operations::init(curr, false, nullptr, false); _Operations::appendChild(curr, leftNode, 0); _Operations::appendKey(curr, insertKey); _Operations::appendChild(curr, insertNode, 1); } public: using Node = BNode; using Operations = _Operations; __cuda_callable__ static inline BNode *init(Allocator &alloc) { BNode *root = alloc.allocate(); _Operations::init(root, true, nullptr, false); return root; } __cuda_callable__ static inline bool find(BNode *root, KeyType key, ValueType &result) { BNode *leaf = findLeaf(root, key); if (leaf == nullptr) return false; size_t it = lowerBound(leaf->mKeys, leaf->size(), key); if (it < leaf->size() && leaf->mKeys[it] == key) { result = leaf->mValues[it]; return true; } return false; } __cuda_callable__ static bool insert(BNode *root, KeyType key, ValueType value, Allocator &alloc, Latch &latch) { BNode *prev = nullptr, *curr = nullptr; do { BNode *tmpPrev = prev; if (curr == nullptr) { curr = root; } else { prev = curr; curr = scan(curr, key); } if (curr != nullptr && curr->size() + !curr->leaf() >= Order) { // if node is full, we haven't propagated, restart the operation if (curr->mSibling == prev && curr != root) { return false; } size_t medianIdx = curr->size() / 2; size_t splitIdx = medianIdx + (!curr->leaf()); KeyType medianKey = curr->mKeys[medianIdx]; BNode *split = nullptr; split = splitNode(curr, alloc, splitIdx, medianIdx); if (curr == root) { increaseTreeHeight(curr, medianKey, split, alloc); } else { insertIntoFreeNode(prev, medianKey, 0, split); curr = prev; prev = tmpPrev; } } } while (curr != nullptr && curr->leaf() == false); if (curr == nullptr) { return false; } BNode *tmp; while (curr->mSibling != nullptr && (tmp = scan(curr, key)) == curr->mSibling) { curr = tmp; } insertIntoFreeNode(curr, key, value, nullptr); return true; } __cuda_callable__ static inline bool remove(BNode *root, KeyType key, Latch &latch) { BNode *cursor = findLeaf(root, key); if (cursor == nullptr) return true; if (!latch.attempt(cursor)) { return false; } // remove item from keys and remove the respective node size_t keyIdx = lowerBound(cursor->mKeys, cursor->size(), key); if (keyIdx == cursor->size() || cursor->mKeys[keyIdx] != key) { latch.release(cursor); return true; } _Operations::removeChild(cursor, keyIdx, cursor->childSize()); _Operations::removeKey(cursor, keyIdx); latch.release(cursor); return true; } }; No newline at end of file Loading
implementation/BPlusTree/benchmark/benchmark_cuda.cu +2 −3 Original line number Diff line number Diff line #include "BPlusTree/Default.hpp" #include "BLinkTree/Default.hpp" #include "BTreeContainer/Default.hpp" #include "TNL/Devices/Cuda.h" #include "benchmark.hpp" Loading @@ -12,8 +12,7 @@ int main(int argc, char **argv) { Benchmark::execute<Benchmark::Device::Cuda, KeyType>( "btree_cuda_v7", [](auto &timer, auto input) { size_t size = getContainerSize(input.size(), Order); BTreeContainer<BNode, BNodeLatch, BPlusTree, KeyType, ValueType, Order, TNL::Devices::Cuda> container(size); BTreeContainer<KeyType, ValueType, Order, TNL::Devices::Cuda, BLinkTree> container(size); Containers::Array<KeyType, TNL::Devices::Cuda> keys(input); Containers::Array<ValueType, TNL::Devices::Cuda> values(input); Loading
implementation/BPlusTree/benchmark/benchmark_host.cpp +2 −3 Original line number Diff line number Diff line #include "BPlusTree/Default.hpp" #include "BLinkTree/Default.hpp" #include "BTreeContainer/Default.hpp" #include "TNL/Devices/Cuda.h" #include "benchmark.hpp" Loading @@ -12,8 +12,7 @@ int main(int argc, char **argv) { Benchmark::execute<Benchmark::Device::Host, KeyType>( "btree_host_v7", [](auto &timer, auto input) { size_t size = getContainerSize(input.size(), Order); BTreeContainer<BNode, BNodeLatch, BPlusTree, KeyType, ValueType, Order, TNL::Devices::Host> container(size); BTreeContainer<KeyType, ValueType, Order, TNL::Devices::Host, BLinkTree> container(size); Containers::Array<KeyType, TNL::Devices::Host> keys(input); Containers::Array<ValueType, TNL::Devices::Host> values(input); Loading
implementation/BPlusTree/src/BLinkTree/BLinkNode.hpp 0 → 100644 +33 −0 Original line number Diff line number Diff line #pragma once #include <TNL/Assert.h> #include <TNL/Cuda/CudaCallable.h> #include <TNL/TypeTraits.h> #include <cstddef> #include <stdint.h> template <typename KeyType, typename ValueType, size_t Order> struct BLinkNode { BLinkNode *volatile mSibling; KeyType mHighKey; uint8_t mHighKeyFlag; uint8_t mLeaf; uint8_t mSize; int mWriteLock = false; KeyType mKeys[Order]; ValueType mValues[Order]; BLinkNode *volatile mChildren[Order]; __cuda_callable__ inline bool leaf() { return this->mLeaf; } __cuda_callable__ inline size_t size() { return this->mSize; } __cuda_callable__ inline size_t childSize() { if (this->mLeaf == false && this->mSize > 0) { return this->mSize + 1; } return this->mSize; } }; No newline at end of file
implementation/BPlusTree/src/BLinkTree/BLinkTreeCuda.cu 0 → 100644 +431 −0 Original line number Diff line number Diff line #pragma once #include <TNL/Cuda/CudaCallable.h> #include <algorithm> #include <cstdio> #include <iostream> #include <sstream> #include <string> #include <cooperative_groups.h> #include "../Constants.hpp" #include "../Utils/BumpAllocator.hpp" #include "../Utils/Debugger.hpp" #include "../Utils/Print.hpp" #include "../BNodeLatch/Default.hpp" #include "./BNodeOperations/Default.hpp" #include "TNL/Assert.h" template <typename KeyType, typename ValueType, size_t Order, typename Device> class BLinkTree; template <typename KeyType, typename ValueType, size_t Order> class BLinkTree<KeyType, ValueType, Order, Devices::Cuda> { using Device = Devices::Cuda; using BNode = BLinkNode<KeyType, ValueType, Order>; using Latch = BNodeLatch<BNode, Device>; using Allocator = BumpAllocator<BNode, Device>; using _Operations = BLinkOperations<KeyType, ValueType, Order, Device>; template <typename Warp> __device__ static inline BNode *scan(BNode *node, KeyType key, Warp &warp) { if (node->mHighKeyFlag && key >= node->mHighKey) { return node->mSibling; } size_t size = node->size(); size_t childSize = node->childSize(); size_t targetIndex = upperBoundWarp(node->mKeys, size, key, warp); if (targetIndex >= childSize) { return nullptr; } BNode *result = node->mChildren[targetIndex]; return result; } template <typename Warp> __device__ static inline BNode *moveSide(BNode *cursor, KeyType key, Warp &warp) { BNode *tmp, *result = cursor; while (result != nullptr && result->mSibling != nullptr && (tmp = scan(result, key, warp)) == result->mSibling) { result = tmp; } return result; } template <typename Warp> __device__ static inline BNode *findLeaf(BNode *root, KeyType key, Warp &warp) { BNode *cursor = root; while (cursor != nullptr && cursor->leaf() == false) { cursor = scan(cursor, key, warp); } cursor = moveSide(cursor, key, warp); return cursor; } template <typename Warp> __device__ static BNode *splitNode(BNode *cursor, Allocator &alloc, size_t siblingStart, size_t cursorCount, Warp &warp) { BNode *sibling = nullptr; auto rank = warp.thread_rank(); if (rank == 0) { sibling = alloc.allocate(); } _Operations::init(warp, sibling, cursor); warp.sync(); sibling = warp.shfl(sibling, 0); size_t size = cursor->size(); size_t childSize = cursor->childSize(); bool isLeaf = sibling->leaf(); if (rank >= siblingStart) { if (rank < childSize) { sibling->mChildren[rank - siblingStart] = cursor->mChildren[rank]; } if (rank < size) { sibling->mKeys[rank - siblingStart] = cursor->mKeys[rank]; sibling->mValues[rank - siblingStart] = cursor->mValues[rank]; sibling->mSize = size - siblingStart; } } cursor->mSize = cursorCount; cursor->mSibling = sibling; // wait until all sibling have written // warp.sync(); cursor->mHighKey = sibling->mKeys[0]; cursor->mHighKeyFlag = true; if (warp.thread_rank() == 0) { DBG_SPLIT(cursor, siblingStart, cursorCount, sibling); } return sibling; } template <typename Warp> __device__ static inline void insertIntoFreeNode(BNode *curr, KeyType insertKey, ValueType value, BNode *insertNode, Warp &warp) { auto rank = warp.thread_rank(); size_t keyIdx = upperBoundWarp(curr->mKeys, curr->size(), insertKey, warp); bool isLeaf = curr->leaf(); size_t childSize = curr->childSize(); if (isLeaf) { if (rank == 0) { DBG_INSERT_LEAF(curr, insertKey, keyIdx); } _Operations::insertChild(curr, keyIdx, insertNode, childSize, warp); _Operations::insertKey(curr, keyIdx, insertKey, value, warp); } else { if (rank == 0) { DBG_INSERT_INTERNAL(curr, insertKey, insertNode, keyIdx, (keyIdx + 1)); } _Operations::insertChild(curr, keyIdx + 1, insertNode, childSize, warp); _Operations::insertKey(curr, keyIdx, insertKey, warp); } warp.sync(); } template <typename Warp> __device__ static inline BNode *increaseTreeHeight(BNode *leftNode, KeyType insertKey, BNode *rightNode, Allocator &alloc, Warp &warp) { BNode *rootNode = nullptr; auto rank = warp.thread_rank(); if (rank == 0) { rootNode = alloc.allocate(); } rootNode = warp.shfl(rootNode, 0); _Operations::init(warp, rootNode, false, nullptr, false); warp.sync(); if (rank == 0) { rootNode->mKeys[rank] = insertKey; rootNode->mChildren[rank] = leftNode; } if (rank == 1) { rootNode->mChildren[rank] = rightNode; } rootNode->mSize = 1; if (rank == 0) { DBG_INCREASE_HEIGHT(leftNode, rootNode, rightNode, insertKey); } return rootNode; } template <typename Warp> __device__ static inline bool attemptLatchWarp(BNode *node, Latch &latch, Warp &warp) { bool attempt = false; if (warp.thread_rank() == 0) { attempt = latch.attempt(node); } attempt = warp.any(attempt); return attempt; } template <typename Warp> __device__ static inline void releaseLatchWarp(BNode *node, Latch &latch, Warp &warp) { if (warp.thread_rank() == 0) { latch.release(node); } warp.sync(); } template <typename Warp> __device__ static inline bool verifyParent(BNode *curr, BNode *prev, Warp &warp) { auto rank = warp.thread_rank(); size_t size = prev->childSize(); bool found = false; if (rank < size) { found = prev->mChildren[rank] == curr; } return warp.any(found); } template <typename Warp> __device__ static inline size_t lowerBoundWarp(KeyType *haystack, size_t haystackSize, KeyType needle, Warp &warp) { auto rank = warp.thread_rank(); TNL_ASSERT_GE(warp.size(), Order, "Warp must be larger than order"); bool matches = false; if (rank < haystackSize) { matches = haystack[rank] >= needle; } auto ballot = warp.ballot(matches); size_t res = __ffs(ballot) - 1; if (res > haystackSize) return haystackSize; return res; } template <typename Warp> __device__ static inline size_t upperBoundWarp(KeyType *haystack, size_t haystackSize, KeyType needle, Warp &warp) { auto rank = warp.thread_rank(); TNL_ASSERT_GE(warp.size(), Order, "Warp must be larger than order"); bool matches = false; if (rank < haystackSize) { matches = haystack[rank] > needle; } auto ballot = warp.ballot(matches); size_t res = __ffs(ballot) - 1; if (res > haystackSize) return haystackSize; return res; } public: using Node = BNode; using Operations = _Operations; template <typename Warp> __device__ static inline BNode *init(Allocator &alloc, Warp &warp) { BNode *root = alloc.allocate(); _Operations::init(warp, root, true, nullptr, false); DBG_INIT(root, Order); return root; } template <typename Warp> __device__ static inline bool find(BNode *root, KeyType key, ValueType &result, Warp &warp) { auto rank = warp.thread_rank(); BNode *next = root; while (next) { bool isLeaf = next->leaf(); size_t size = next->size(); KeyType nextKey = next->mKeys[rank]; uint32_t ballot = warp.ballot(nextKey >= key); size_t targetIdx = __ffs(ballot) - 1; targetIdx = (targetIdx > size) ? size : targetIdx; KeyType targetKey = warp.shfl(nextKey, targetIdx); if (isLeaf) { if (targetIdx < size && targetKey == key) { result = next->mValues[targetIdx]; return true; } return false; } // assumption: the tree is fully constructed and insert / splits are done next = next->mChildren[targetIdx + (targetKey == key)]; } return false; } template <typename Warp> __device__ static bool insert(BNode *&root, KeyType key, ValueType value, Allocator &alloc, Latch &latch, Warp &warp) { BNode *prev = nullptr, *curr = nullptr; do { __threadfence(); BNode *tmpPrev = prev; if (curr == nullptr) { curr = root; } else { prev = curr; curr = scan(curr, key, warp); } if (curr != nullptr && curr->size() + !curr->leaf() >= Order) { // if node is full, we haven't propagated, restart the operation if (prev && curr != root && prev->mSibling == curr) { return false; } if (attemptLatchWarp(curr, latch, warp) == false) { return false; } size_t medianIdx = curr->size() / 2; size_t splitIdx = medianIdx + (!curr->leaf()); KeyType medianKey = curr->mKeys[medianIdx]; if (curr == root) { BNode *split = splitNode(curr, alloc, splitIdx, medianIdx, warp); __threadfence(); BNode *newRoot = increaseTreeHeight(curr, medianKey, split, alloc, warp); __threadfence(); releaseLatchWarp(curr, latch, warp); root = newRoot; curr = prev; prev = tmpPrev; } else { // latch the parent too if (attemptLatchWarp(prev, latch, warp) == false) { releaseLatchWarp(curr, latch, warp); return false; } if (prev && verifyParent(curr, prev, warp) == false) { releaseLatchWarp(curr, latch, warp); releaseLatchWarp(prev, latch, warp); return false; } // verify, if the parent has not subsequently went full (esp. when ) if (prev->size() + !prev->leaf() >= Order) { releaseLatchWarp(curr, latch, warp); releaseLatchWarp(prev, latch, warp); return false; } BNode *split = splitNode(curr, alloc, splitIdx, medianIdx, warp); __threadfence(); releaseLatchWarp(curr, latch, warp); insertIntoFreeNode(prev, medianKey, 0, split, warp); __threadfence(); releaseLatchWarp(prev, latch, warp); curr = prev; prev = tmpPrev; } } } while (curr != nullptr && curr->leaf() == false); if (curr == nullptr) { return false; } if (attemptLatchWarp(curr, latch, warp) == false) { return false; } BNode *tmp; while (curr->mSibling != nullptr && (tmp = scan(curr, key, warp)) == curr->mSibling) { bool attempt = attemptLatchWarp(tmp, latch, warp); releaseLatchWarp(curr, latch, warp); if (attempt == false) { return false; } curr = tmp; } // here we don't check if the node has subsequently become full if (curr->size() + !curr->leaf() >= Order) { releaseLatchWarp(curr, latch, warp); return false; } insertIntoFreeNode(curr, key, value, nullptr, warp); __threadfence(); releaseLatchWarp(curr, latch, warp); return true; } template <typename Warp> __device__ static inline bool remove(BNode *root, KeyType key, Latch &latch, Warp &warp) { BNode *cursor = findLeaf(root, key, warp); if (cursor == nullptr) { return true; } if (attemptLatchWarp(cursor, latch, warp) == false) { return false; } __threadfence(); // remove item from keys and remove the respective node size_t keyIdx = lowerBoundWarp(cursor->mKeys, cursor->size(), key, warp); if (keyIdx >= cursor->size() || cursor->mKeys[keyIdx] != key) { releaseLatchWarp(cursor, latch, warp); return true; } _Operations::removeChild(cursor, keyIdx, cursor->childSize(), warp); _Operations::removeKey(cursor, keyIdx, warp); warp.sync(); if (warp.thread_rank() == 0) { DBG_DELETE(cursor, keyIdx); } releaseLatchWarp(cursor, latch, warp); return true; } };
implementation/BPlusTree/src/BLinkTree/BLinkTreeHost.hpp 0 → 100644 +229 −0 Original line number Diff line number Diff line #pragma once #define BTREE_HOST 1 #include <TNL/Cuda/CudaCallable.h> #include <algorithm> #include <iostream> #include <sstream> #include <string> #include "../Constants.hpp" #include "../Utils/Algorithms.hpp" #include "../Utils/BumpAllocator.hpp" #include "../Utils/Print.hpp" #include "../BNodeLatch/Default.hpp" #include "./BLinkNode.hpp" #include "./BNodeOperations/Default.hpp" template <typename KeyType, typename ValueType, size_t Order, typename Device> class BLinkTree; template <typename KeyType, typename ValueType, size_t Order> class BLinkTree<KeyType, ValueType, Order, Devices::Host> { using Device = Devices::Host; using BNode = BLinkNode<KeyType, ValueType, Order>; using Latch = BNodeLatch<BNode, Device>; using Allocator = BumpAllocator<BNode, Device>; using _Operations = BLinkOperations<KeyType, ValueType, Order, Device>; __cuda_callable__ static inline BNode *scan(BNode *node, KeyType key) { if (node->mHighKeyFlag && key >= node->mHighKey) { return node->mSibling; } size_t size = node->size(); size_t childSize = node->childSize(); size_t targetIndex = upperBound(node->mKeys, size, key); if (targetIndex == node->childSize()) { return nullptr; } BNode *result = node->mChildren[targetIndex]; return result; } __cuda_callable__ static inline BNode *moveSide(BNode *cursor, KeyType key) { BNode *tmp, *result = cursor; while (result != nullptr && result->mSibling != nullptr && (tmp = scan(result, key)) == result->mSibling) { result = tmp; } return result; } __cuda_callable__ static inline BNode *findLeaf(BNode *root, KeyType key) { BNode *cursor = root; while (cursor != nullptr && cursor->leaf() == false) { cursor = scan(cursor, key); } cursor = moveSide(cursor, key); return cursor; } __cuda_callable__ static BNode *splitNode(BNode *cursor, Allocator &alloc, size_t siblingStart, size_t cursorCount) { BNode *sibling = alloc.allocate(); _Operations::init(sibling, cursor); for (size_t i = siblingStart; i < cursor->childSize(); ++i) { BNode *node = cursor->mChildren[i]; _Operations::appendChild(sibling, node, i - siblingStart); } for (size_t i = siblingStart; i < cursor->size(); ++i) { if (sibling->leaf()) { _Operations::appendKey(sibling, cursor->mKeys[i], cursor->mValues[i]); } else { _Operations::appendKey(sibling, cursor->mKeys[i]); } } cursor->mSize = cursorCount; cursor->mSibling = sibling; cursor->mHighKey = sibling->mKeys[0]; cursor->mHighKeyFlag = true; return sibling; } __cuda_callable__ static inline void insertIntoFreeNode(BNode *curr, KeyType insertKey, ValueType value, BNode *insertNode) { size_t keyIdx = upperBound(curr->mKeys, curr->size(), insertKey); if (curr->leaf()) { _Operations::insertChild(curr, keyIdx, insertNode, curr->childSize()); _Operations::insertKey(curr, keyIdx, insertKey, value); } else { _Operations::insertChild(curr, keyIdx + 1, insertNode, curr->childSize()); _Operations::insertKey(curr, keyIdx, insertKey); } } __cuda_callable__ static inline void increaseTreeHeight(BNode *curr, KeyType insertKey, BNode *insertNode, Allocator &alloc) { BNode *leftNode = alloc.allocate(); // copy the entire node to the element _Operations::init(leftNode, curr); for (size_t childKey = 0; childKey < curr->childSize(); ++childKey) { BNode *child = curr->mChildren[childKey]; _Operations::appendChild(leftNode, child, childKey); if (childKey < curr->size()) { if (leftNode->leaf()) { _Operations::appendKey(leftNode, curr->mKeys[childKey], curr->mValues[childKey]); } else { _Operations::appendKey(leftNode, curr->mKeys[childKey]); } } } // this might cause some issues in synchronization _Operations::init(curr, false, nullptr, false); _Operations::appendChild(curr, leftNode, 0); _Operations::appendKey(curr, insertKey); _Operations::appendChild(curr, insertNode, 1); } public: using Node = BNode; using Operations = _Operations; __cuda_callable__ static inline BNode *init(Allocator &alloc) { BNode *root = alloc.allocate(); _Operations::init(root, true, nullptr, false); return root; } __cuda_callable__ static inline bool find(BNode *root, KeyType key, ValueType &result) { BNode *leaf = findLeaf(root, key); if (leaf == nullptr) return false; size_t it = lowerBound(leaf->mKeys, leaf->size(), key); if (it < leaf->size() && leaf->mKeys[it] == key) { result = leaf->mValues[it]; return true; } return false; } __cuda_callable__ static bool insert(BNode *root, KeyType key, ValueType value, Allocator &alloc, Latch &latch) { BNode *prev = nullptr, *curr = nullptr; do { BNode *tmpPrev = prev; if (curr == nullptr) { curr = root; } else { prev = curr; curr = scan(curr, key); } if (curr != nullptr && curr->size() + !curr->leaf() >= Order) { // if node is full, we haven't propagated, restart the operation if (curr->mSibling == prev && curr != root) { return false; } size_t medianIdx = curr->size() / 2; size_t splitIdx = medianIdx + (!curr->leaf()); KeyType medianKey = curr->mKeys[medianIdx]; BNode *split = nullptr; split = splitNode(curr, alloc, splitIdx, medianIdx); if (curr == root) { increaseTreeHeight(curr, medianKey, split, alloc); } else { insertIntoFreeNode(prev, medianKey, 0, split); curr = prev; prev = tmpPrev; } } } while (curr != nullptr && curr->leaf() == false); if (curr == nullptr) { return false; } BNode *tmp; while (curr->mSibling != nullptr && (tmp = scan(curr, key)) == curr->mSibling) { curr = tmp; } insertIntoFreeNode(curr, key, value, nullptr); return true; } __cuda_callable__ static inline bool remove(BNode *root, KeyType key, Latch &latch) { BNode *cursor = findLeaf(root, key); if (cursor == nullptr) return true; if (!latch.attempt(cursor)) { return false; } // remove item from keys and remove the respective node size_t keyIdx = lowerBound(cursor->mKeys, cursor->size(), key); if (keyIdx == cursor->size() || cursor->mKeys[keyIdx] != key) { latch.release(cursor); return true; } _Operations::removeChild(cursor, keyIdx, cursor->childSize()); _Operations::removeKey(cursor, keyIdx); latch.release(cursor); return true; } }; No newline at end of file