chore: clean up all tests (3df6db3e) · Commits · TNL / GPUBTree

implementation/BPlusTree/benchmark/benchmark_cuda.cu

+2 −3

Original line number	Diff line number	Diff line
		#include "BPlusTree/Default.hpp"
		#include "BLinkTree/Default.hpp"
		#include "BTreeContainer/Default.hpp"
		#include "TNL/Devices/Cuda.h"
		#include "benchmark.hpp"
		@@ -12,8 +12,7 @@ int main(int argc, char **argv) {
		Benchmark::execute<Benchmark::Device::Cuda, KeyType>(
		"btree_cuda_v7", [](auto &timer, auto input) {
		size_t size = getContainerSize(input.size(), Order);
		BTreeContainer<BNode, BNodeLatch, BPlusTree, KeyType, ValueType, Order, TNL::Devices::Cuda>
		container(size);
		BTreeContainer<KeyType, ValueType, Order, TNL::Devices::Cuda, BLinkTree> container(size);

		Containers::Array<KeyType, TNL::Devices::Cuda> keys(input);
		Containers::Array<ValueType, TNL::Devices::Cuda> values(input);

implementation/BPlusTree/benchmark/benchmark_host.cpp

+2 −3

Original line number	Diff line number	Diff line
		#include "BPlusTree/Default.hpp"
		#include "BLinkTree/Default.hpp"
		#include "BTreeContainer/Default.hpp"
		#include "TNL/Devices/Cuda.h"
		#include "benchmark.hpp"
		@@ -12,8 +12,7 @@ int main(int argc, char **argv) {
		Benchmark::execute<Benchmark::Device::Host, KeyType>(
		"btree_host_v7", [](auto &timer, auto input) {
		size_t size = getContainerSize(input.size(), Order);
		BTreeContainer<BNode, BNodeLatch, BPlusTree, KeyType, ValueType, Order, TNL::Devices::Host>
		container(size);
		BTreeContainer<KeyType, ValueType, Order, TNL::Devices::Host, BLinkTree> container(size);

		Containers::Array<KeyType, TNL::Devices::Host> keys(input);
		Containers::Array<ValueType, TNL::Devices::Host> values(input);

implementation/BPlusTree/src/BLinkTree/BLinkNode.hpp

0 → 100644

+33 −0

Original line number	Diff line number	Diff line
		#pragma once
		#include <TNL/Assert.h>
		#include <TNL/Cuda/CudaCallable.h>
		#include <TNL/TypeTraits.h>
		#include <cstddef>
		#include <stdint.h>

		template <typename KeyType, typename ValueType, size_t Order> struct BLinkNode {
		BLinkNode *volatile mSibling;

		KeyType mHighKey;
		uint8_t mHighKeyFlag;

		uint8_t mLeaf;
		uint8_t mSize;

		int mWriteLock = false;

		KeyType mKeys[Order];
		ValueType mValues[Order];
		BLinkNode *volatile mChildren[Order];

		__cuda_callable__ inline bool leaf() { return this->mLeaf; }

		__cuda_callable__ inline size_t size() { return this->mSize; }

		__cuda_callable__ inline size_t childSize() {
		if (this->mLeaf == false && this->mSize > 0) {
		return this->mSize + 1;
		}
		return this->mSize;
		}
		};
		No newline at end of file

implementation/BPlusTree/src/BLinkTree/BLinkTreeCuda.cu

0 → 100644

+431 −0

Original line number	Diff line number	Diff line
		#pragma once
		#include <TNL/Cuda/CudaCallable.h>

		#include <algorithm>
		#include <cstdio>
		#include <iostream>
		#include <sstream>
		#include <string>

		#include <cooperative_groups.h>

		#include "../Constants.hpp"
		#include "../Utils/BumpAllocator.hpp"
		#include "../Utils/Debugger.hpp"
		#include "../Utils/Print.hpp"

		#include "../BNodeLatch/Default.hpp"
		#include "./BNodeOperations/Default.hpp"
		#include "TNL/Assert.h"

		template <typename KeyType, typename ValueType, size_t Order, typename Device> class BLinkTree;

		template <typename KeyType, typename ValueType, size_t Order>
		class BLinkTree<KeyType, ValueType, Order, Devices::Cuda> {
		using Device = Devices::Cuda;
		using BNode = BLinkNode<KeyType, ValueType, Order>;
		using Latch = BNodeLatch<BNode, Device>;
		using Allocator = BumpAllocator<BNode, Device>;
		using _Operations = BLinkOperations<KeyType, ValueType, Order, Device>;

		template <typename Warp>
		__device__ static inline BNode scan(BNode node, KeyType key, Warp &warp) {

		if (node->mHighKeyFlag && key >= node->mHighKey) {
		return node->mSibling;
		}

		size_t size = node->size();
		size_t childSize = node->childSize();
		size_t targetIndex = upperBoundWarp(node->mKeys, size, key, warp);

		if (targetIndex >= childSize) {
		return nullptr;
		}

		BNode *result = node->mChildren[targetIndex];
		return result;
		}

		template <typename Warp>
		__device__ static inline BNode moveSide(BNode cursor, KeyType key, Warp &warp) {
		BNode tmp, result = cursor;
		while (result != nullptr && result->mSibling != nullptr &&
		(tmp = scan(result, key, warp)) == result->mSibling) {
		result = tmp;
		}

		return result;
		}

		template <typename Warp>
		__device__ static inline BNode findLeaf(BNode root, KeyType key, Warp &warp) {
		BNode *cursor = root;
		while (cursor != nullptr && cursor->leaf() == false) {
		cursor = scan(cursor, key, warp);
		}

		cursor = moveSide(cursor, key, warp);
		return cursor;
		}

		template <typename Warp>
		__device__ static BNode splitNode(BNode cursor, Allocator &alloc, size_t siblingStart,
		size_t cursorCount, Warp &warp) {

		BNode *sibling = nullptr;
		auto rank = warp.thread_rank();

		if (rank == 0) {
		sibling = alloc.allocate();
		}

		_Operations::init(warp, sibling, cursor);
		warp.sync();

		sibling = warp.shfl(sibling, 0);

		size_t size = cursor->size();
		size_t childSize = cursor->childSize();
		bool isLeaf = sibling->leaf();

		if (rank >= siblingStart) {
		if (rank < childSize) {
		sibling->mChildren[rank - siblingStart] = cursor->mChildren[rank];
		}

		if (rank < size) {
		sibling->mKeys[rank - siblingStart] = cursor->mKeys[rank];
		sibling->mValues[rank - siblingStart] = cursor->mValues[rank];
		sibling->mSize = size - siblingStart;
		}
		}

		cursor->mSize = cursorCount;
		cursor->mSibling = sibling;
		// wait until all sibling have written
		// warp.sync();
		cursor->mHighKey = sibling->mKeys[0];
		cursor->mHighKeyFlag = true;

		if (warp.thread_rank() == 0) {
		DBG_SPLIT(cursor, siblingStart, cursorCount, sibling);
		}

		return sibling;
		}

		template <typename Warp>
		__device__ static inline void insertIntoFreeNode(BNode *curr, KeyType insertKey, ValueType value,
		BNode *insertNode, Warp &warp) {
		auto rank = warp.thread_rank();
		size_t keyIdx = upperBoundWarp(curr->mKeys, curr->size(), insertKey, warp);

		bool isLeaf = curr->leaf();
		size_t childSize = curr->childSize();

		if (isLeaf) {
		if (rank == 0) {
		DBG_INSERT_LEAF(curr, insertKey, keyIdx);
		}
		_Operations::insertChild(curr, keyIdx, insertNode, childSize, warp);
		_Operations::insertKey(curr, keyIdx, insertKey, value, warp);
		} else {
		if (rank == 0) {
		DBG_INSERT_INTERNAL(curr, insertKey, insertNode, keyIdx, (keyIdx + 1));
		}
		_Operations::insertChild(curr, keyIdx + 1, insertNode, childSize, warp);
		_Operations::insertKey(curr, keyIdx, insertKey, warp);
		}

		warp.sync();
		}

		template <typename Warp>
		__device__ static inline BNode increaseTreeHeight(BNode leftNode, KeyType insertKey,
		BNode *rightNode, Allocator &alloc,
		Warp &warp) {

		BNode *rootNode = nullptr;
		auto rank = warp.thread_rank();
		if (rank == 0) {
		rootNode = alloc.allocate();
		}
		rootNode = warp.shfl(rootNode, 0);

		_Operations::init(warp, rootNode, false, nullptr, false);
		warp.sync();

		if (rank == 0) {
		rootNode->mKeys[rank] = insertKey;
		rootNode->mChildren[rank] = leftNode;
		}
		if (rank == 1) {
		rootNode->mChildren[rank] = rightNode;
		}
		rootNode->mSize = 1;

		if (rank == 0) {
		DBG_INCREASE_HEIGHT(leftNode, rootNode, rightNode, insertKey);
		}

		return rootNode;
		}

		template <typename Warp>
		__device__ static inline bool attemptLatchWarp(BNode *node, Latch &latch, Warp &warp) {
		bool attempt = false;

		if (warp.thread_rank() == 0) {
		attempt = latch.attempt(node);
		}

		attempt = warp.any(attempt);
		return attempt;
		}

		template <typename Warp>
		__device__ static inline void releaseLatchWarp(BNode *node, Latch &latch, Warp &warp) {
		if (warp.thread_rank() == 0) {
		latch.release(node);
		}

		warp.sync();
		}

		template <typename Warp>
		__device__ static inline bool verifyParent(BNode curr, BNode prev, Warp &warp) {

		auto rank = warp.thread_rank();
		size_t size = prev->childSize();

		bool found = false;
		if (rank < size) {
		found = prev->mChildren[rank] == curr;
		}

		return warp.any(found);
		}

		template <typename Warp>
		__device__ static inline size_t lowerBoundWarp(KeyType *haystack, size_t haystackSize,
		KeyType needle, Warp &warp) {
		auto rank = warp.thread_rank();
		TNL_ASSERT_GE(warp.size(), Order, "Warp must be larger than order");

		bool matches = false;
		if (rank < haystackSize) {
		matches = haystack[rank] >= needle;
		}

		auto ballot = warp.ballot(matches);
		size_t res = __ffs(ballot) - 1;
		if (res > haystackSize)
		return haystackSize;
		return res;
		}

		template <typename Warp>
		__device__ static inline size_t upperBoundWarp(KeyType *haystack, size_t haystackSize,
		KeyType needle, Warp &warp) {
		auto rank = warp.thread_rank();
		TNL_ASSERT_GE(warp.size(), Order, "Warp must be larger than order");

		bool matches = false;
		if (rank < haystackSize) {
		matches = haystack[rank] > needle;
		}

		auto ballot = warp.ballot(matches);
		size_t res = __ffs(ballot) - 1;
		if (res > haystackSize)
		return haystackSize;
		return res;
		}

		public:
		using Node = BNode;
		using Operations = _Operations;

		template <typename Warp> __device__ static inline BNode *init(Allocator &alloc, Warp &warp) {
		BNode *root = alloc.allocate();
		_Operations::init(warp, root, true, nullptr, false);

		DBG_INIT(root, Order);
		return root;
		}

		template <typename Warp>
		__device__ static inline bool find(BNode *root, KeyType key, ValueType &result, Warp &warp) {
		auto rank = warp.thread_rank();

		BNode *next = root;
		while (next) {
		bool isLeaf = next->leaf();
		size_t size = next->size();

		KeyType nextKey = next->mKeys[rank];
		uint32_t ballot = warp.ballot(nextKey >= key);
		size_t targetIdx = __ffs(ballot) - 1;

		targetIdx = (targetIdx > size) ? size : targetIdx;
		KeyType targetKey = warp.shfl(nextKey, targetIdx);

		if (isLeaf) {
		if (targetIdx < size && targetKey == key) {
		result = next->mValues[targetIdx];
		return true;
		}
		return false;
		}

		// assumption: the tree is fully constructed and insert / splits are done
		next = next->mChildren[targetIdx + (targetKey == key)];
		}

		return false;
		}

		template <typename Warp>
		__device__ static bool insert(BNode *&root, KeyType key, ValueType value, Allocator &alloc,
		Latch &latch, Warp &warp) {
		BNode prev = nullptr, curr = nullptr;

		do {
		__threadfence();

		BNode *tmpPrev = prev;
		if (curr == nullptr) {
		curr = root;
		} else {
		prev = curr;
		curr = scan(curr, key, warp);
		}

		if (curr != nullptr && curr->size() + !curr->leaf() >= Order) {
		// if node is full, we haven't propagated, restart the operation
		if (prev && curr != root && prev->mSibling == curr) {
		return false;
		}

		if (attemptLatchWarp(curr, latch, warp) == false) {
		return false;
		}

		size_t medianIdx = curr->size() / 2;
		size_t splitIdx = medianIdx + (!curr->leaf());
		KeyType medianKey = curr->mKeys[medianIdx];

		if (curr == root) {
		BNode *split = splitNode(curr, alloc, splitIdx, medianIdx, warp);
		__threadfence();

		BNode *newRoot = increaseTreeHeight(curr, medianKey, split, alloc, warp);

		__threadfence();
		releaseLatchWarp(curr, latch, warp);

		root = newRoot;
		curr = prev;
		prev = tmpPrev;
		} else {
		// latch the parent too
		if (attemptLatchWarp(prev, latch, warp) == false) {
		releaseLatchWarp(curr, latch, warp);
		return false;
		}

		if (prev && verifyParent(curr, prev, warp) == false) {
		releaseLatchWarp(curr, latch, warp);
		releaseLatchWarp(prev, latch, warp);
		return false;
		}

		// verify, if the parent has not subsequently went full (esp. when )
		if (prev->size() + !prev->leaf() >= Order) {
		releaseLatchWarp(curr, latch, warp);
		releaseLatchWarp(prev, latch, warp);
		return false;
		}

		BNode *split = splitNode(curr, alloc, splitIdx, medianIdx, warp);

		__threadfence();
		releaseLatchWarp(curr, latch, warp);

		insertIntoFreeNode(prev, medianKey, 0, split, warp);

		__threadfence();
		releaseLatchWarp(prev, latch, warp);

		curr = prev;
		prev = tmpPrev;
		}
		}
		} while (curr != nullptr && curr->leaf() == false);

		if (curr == nullptr) {
		return false;
		}

		if (attemptLatchWarp(curr, latch, warp) == false) {
		return false;
		}

		BNode *tmp;
		while (curr->mSibling != nullptr && (tmp = scan(curr, key, warp)) == curr->mSibling) {
		bool attempt = attemptLatchWarp(tmp, latch, warp);
		releaseLatchWarp(curr, latch, warp);

		if (attempt == false) {
		return false;
		}
		curr = tmp;
		}

		// here we don't check if the node has subsequently become full
		if (curr->size() + !curr->leaf() >= Order) {
		releaseLatchWarp(curr, latch, warp);
		return false;
		}

		insertIntoFreeNode(curr, key, value, nullptr, warp);

		__threadfence();
		releaseLatchWarp(curr, latch, warp);
		return true;
		}

		template <typename Warp>
		__device__ static inline bool remove(BNode *root, KeyType key, Latch &latch, Warp &warp) {
		BNode *cursor = findLeaf(root, key, warp);

		if (cursor == nullptr) {
		return true;
		}

		if (attemptLatchWarp(cursor, latch, warp) == false) {
		return false;
		}

		__threadfence();

		// remove item from keys and remove the respective node
		size_t keyIdx = lowerBoundWarp(cursor->mKeys, cursor->size(), key, warp);
		if (keyIdx >= cursor->size() \|\| cursor->mKeys[keyIdx] != key) {
		releaseLatchWarp(cursor, latch, warp);
		return true;
		}

		_Operations::removeChild(cursor, keyIdx, cursor->childSize(), warp);
		_Operations::removeKey(cursor, keyIdx, warp);
		warp.sync();

		if (warp.thread_rank() == 0) {
		DBG_DELETE(cursor, keyIdx);
		}

		releaseLatchWarp(cursor, latch, warp);
		return true;
		}
		};

implementation/BPlusTree/src/BLinkTree/BLinkTreeHost.hpp

0 → 100644

+229 −0

Original line number	Diff line number	Diff line
		#pragma once
		#define BTREE_HOST 1

		#include <TNL/Cuda/CudaCallable.h>

		#include <algorithm>
		#include <iostream>
		#include <sstream>
		#include <string>

		#include "../Constants.hpp"
		#include "../Utils/Algorithms.hpp"
		#include "../Utils/BumpAllocator.hpp"
		#include "../Utils/Print.hpp"

		#include "../BNodeLatch/Default.hpp"
		#include "./BLinkNode.hpp"
		#include "./BNodeOperations/Default.hpp"

		template <typename KeyType, typename ValueType, size_t Order, typename Device> class BLinkTree;

		template <typename KeyType, typename ValueType, size_t Order>
		class BLinkTree<KeyType, ValueType, Order, Devices::Host> {
		using Device = Devices::Host;
		using BNode = BLinkNode<KeyType, ValueType, Order>;
		using Latch = BNodeLatch<BNode, Device>;
		using Allocator = BumpAllocator<BNode, Device>;
		using _Operations = BLinkOperations<KeyType, ValueType, Order, Device>;

		__cuda_callable__ static inline BNode scan(BNode node, KeyType key) {
		if (node->mHighKeyFlag && key >= node->mHighKey) {
		return node->mSibling;
		}

		size_t size = node->size();
		size_t childSize = node->childSize();
		size_t targetIndex = upperBound(node->mKeys, size, key);

		if (targetIndex == node->childSize()) {
		return nullptr;
		}

		BNode *result = node->mChildren[targetIndex];
		return result;
		}

		__cuda_callable__ static inline BNode moveSide(BNode cursor, KeyType key) {
		BNode tmp, result = cursor;
		while (result != nullptr && result->mSibling != nullptr &&
		(tmp = scan(result, key)) == result->mSibling) {
		result = tmp;
		}

		return result;
		}

		__cuda_callable__ static inline BNode findLeaf(BNode root, KeyType key) {
		BNode *cursor = root;
		while (cursor != nullptr && cursor->leaf() == false) {
		cursor = scan(cursor, key);
		}

		cursor = moveSide(cursor, key);
		return cursor;
		}

		__cuda_callable__ static BNode splitNode(BNode cursor, Allocator &alloc, size_t siblingStart,
		size_t cursorCount) {

		BNode *sibling = alloc.allocate();
		_Operations::init(sibling, cursor);

		for (size_t i = siblingStart; i < cursor->childSize(); ++i) {
		BNode *node = cursor->mChildren[i];
		_Operations::appendChild(sibling, node, i - siblingStart);
		}

		for (size_t i = siblingStart; i < cursor->size(); ++i) {
		if (sibling->leaf()) {
		_Operations::appendKey(sibling, cursor->mKeys[i], cursor->mValues[i]);
		} else {
		_Operations::appendKey(sibling, cursor->mKeys[i]);
		}
		}

		cursor->mSize = cursorCount;
		cursor->mSibling = sibling;
		cursor->mHighKey = sibling->mKeys[0];
		cursor->mHighKeyFlag = true;

		return sibling;
		}

		__cuda_callable__ static inline void insertIntoFreeNode(BNode *curr, KeyType insertKey,
		ValueType value, BNode *insertNode) {
		size_t keyIdx = upperBound(curr->mKeys, curr->size(), insertKey);

		if (curr->leaf()) {
		_Operations::insertChild(curr, keyIdx, insertNode, curr->childSize());
		_Operations::insertKey(curr, keyIdx, insertKey, value);
		} else {
		_Operations::insertChild(curr, keyIdx + 1, insertNode, curr->childSize());
		_Operations::insertKey(curr, keyIdx, insertKey);
		}
		}

		__cuda_callable__ static inline void increaseTreeHeight(BNode *curr, KeyType insertKey,
		BNode *insertNode, Allocator &alloc) {

		BNode *leftNode = alloc.allocate();

		// copy the entire node to the element
		_Operations::init(leftNode, curr);
		for (size_t childKey = 0; childKey < curr->childSize(); ++childKey) {
		BNode *child = curr->mChildren[childKey];
		_Operations::appendChild(leftNode, child, childKey);

		if (childKey < curr->size()) {
		if (leftNode->leaf()) {
		_Operations::appendKey(leftNode, curr->mKeys[childKey], curr->mValues[childKey]);
		} else {
		_Operations::appendKey(leftNode, curr->mKeys[childKey]);
		}
		}
		}

		// this might cause some issues in synchronization
		_Operations::init(curr, false, nullptr, false);
		_Operations::appendChild(curr, leftNode, 0);
		_Operations::appendKey(curr, insertKey);
		_Operations::appendChild(curr, insertNode, 1);
		}

		public:
		using Node = BNode;
		using Operations = _Operations;

		__cuda_callable__ static inline BNode *init(Allocator &alloc) {
		BNode *root = alloc.allocate();
		_Operations::init(root, true, nullptr, false);
		return root;
		}

		__cuda_callable__ static inline bool find(BNode *root, KeyType key, ValueType &result) {
		BNode *leaf = findLeaf(root, key);
		if (leaf == nullptr)
		return false;

		size_t it = lowerBound(leaf->mKeys, leaf->size(), key);
		if (it < leaf->size() && leaf->mKeys[it] == key) {
		result = leaf->mValues[it];
		return true;
		}
		return false;
		}

		__cuda_callable__ static bool insert(BNode *root, KeyType key, ValueType value, Allocator &alloc,
		Latch &latch) {
		BNode prev = nullptr, curr = nullptr;
		do {
		BNode *tmpPrev = prev;
		if (curr == nullptr) {
		curr = root;
		} else {
		prev = curr;
		curr = scan(curr, key);
		}

		if (curr != nullptr && curr->size() + !curr->leaf() >= Order) {
		// if node is full, we haven't propagated, restart the operation
		if (curr->mSibling == prev && curr != root) {
		return false;
		}

		size_t medianIdx = curr->size() / 2;
		size_t splitIdx = medianIdx + (!curr->leaf());

		KeyType medianKey = curr->mKeys[medianIdx];
		BNode *split = nullptr;

		split = splitNode(curr, alloc, splitIdx, medianIdx);

		if (curr == root) {
		increaseTreeHeight(curr, medianKey, split, alloc);
		} else {
		insertIntoFreeNode(prev, medianKey, 0, split);
		curr = prev;
		prev = tmpPrev;
		}
		}
		} while (curr != nullptr && curr->leaf() == false);

		if (curr == nullptr) {
		return false;
		}

		BNode *tmp;
		while (curr->mSibling != nullptr && (tmp = scan(curr, key)) == curr->mSibling) {
		curr = tmp;
		}

		insertIntoFreeNode(curr, key, value, nullptr);
		return true;
		}

		__cuda_callable__ static inline bool remove(BNode *root, KeyType key, Latch &latch) {
		BNode *cursor = findLeaf(root, key);

		if (cursor == nullptr)
		return true;

		if (!latch.attempt(cursor)) {
		return false;
		}

		// remove item from keys and remove the respective node
		size_t keyIdx = lowerBound(cursor->mKeys, cursor->size(), key);
		if (keyIdx == cursor->size() \|\| cursor->mKeys[keyIdx] != key) {
		latch.release(cursor);
		return true;
		}

		_Operations::removeChild(cursor, keyIdx, cursor->childSize());
		_Operations::removeKey(cursor, keyIdx);

		latch.release(cursor);
		return true;
		}
		};
		No newline at end of file