From 859bf5a0081cc3cddf974e50ace4e88f28846f98 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Fri, 12 Feb 2021 09:51:02 +0100 Subject: [PATCH 001/258] Initial commit --- GPUSort/.gitignore | 1 + GPUSort/bitonicGPU/Makefile | 24 +++++ GPUSort/bitonicGPU/benchmark/Makefile | 25 +++++ GPUSort/bitonicGPU/benchmark/benchmark.cu | 122 ++++++++++++++++++++++ GPUSort/bitonicGPU/bitonicSort.h | 68 ++++++++++++ GPUSort/bitonicGPU/config.mk | 49 +++++++++ GPUSort/bitonicGPU/main.cu | 30 ++++++ GPUSort/bitonicGPU/unitTests/Makefile | 26 +++++ GPUSort/bitonicGPU/unitTests/unitTests.cu | 110 +++++++++++++++++++ 9 files changed, 455 insertions(+) create mode 100644 GPUSort/.gitignore create mode 100644 GPUSort/bitonicGPU/Makefile create mode 100644 GPUSort/bitonicGPU/benchmark/Makefile create mode 100644 GPUSort/bitonicGPU/benchmark/benchmark.cu create mode 100644 GPUSort/bitonicGPU/bitonicSort.h create mode 100644 GPUSort/bitonicGPU/config.mk create mode 100644 GPUSort/bitonicGPU/main.cu create mode 100644 GPUSort/bitonicGPU/unitTests/Makefile create mode 100644 GPUSort/bitonicGPU/unitTests/unitTests.cu diff --git a/GPUSort/.gitignore b/GPUSort/.gitignore new file mode 100644 index 000000000..722d5e71d --- /dev/null +++ b/GPUSort/.gitignore @@ -0,0 +1 @@ +.vscode diff --git a/GPUSort/bitonicGPU/Makefile b/GPUSort/bitonicGPU/Makefile new file mode 100644 index 000000000..6a7032a04 --- /dev/null +++ b/GPUSort/bitonicGPU/Makefile @@ -0,0 +1,24 @@ +include config.mk + +CUDA_SOURCES := $(wildcard *.cu) +CUDA_TARGETS := $(CUDA_SOURCES:%.cu=%) + +## targets definitions follow +.PHONY: all host cuda +all: cuda +cuda: $(CUDA_TARGETS) + +run: cuda + ./$(CUDA_TARGETS) + +.PHONY: clean +clean: + rm -f *.d *.o *.cuo $(CUDA_TARGETS) + +# use .cuo instead of .cu.o to avoid problems with the implicit rules: https://stackoverflow.com/q/62967939 +# (and use the host compiler for linking CUDA, nvcc does not understand that .cuo is an object file) +$(CUDA_TARGETS): % : %.cuo + $(CXX) 
$(CUDA_LDFLAGS) -o $@ $< $(CUDA_LDLIBS) + +$(CUDA_SOURCES:%.cu=%.cuo): %.cuo : %.cu + $(CUDA_CXX) $(CUDA_CPPFLAGS) $(CUDA_CXXFLAGS) -c -o $@ $< diff --git a/GPUSort/bitonicGPU/benchmark/Makefile b/GPUSort/bitonicGPU/benchmark/Makefile new file mode 100644 index 000000000..9f523a7de --- /dev/null +++ b/GPUSort/bitonicGPU/benchmark/Makefile @@ -0,0 +1,25 @@ +include ../config.mk + +CUDA_SOURCES := $(wildcard *.cu) +CUDA_TARGETS := $(CUDA_SOURCES:%.cu=%) + +## targets definitions follow +.PHONY: all host cuda +all: cuda +cuda: $(CUDA_TARGETS) + +run: cuda + ./$(CUDA_TARGETS) + + +.PHONY: clean +clean: + rm -f *.d *.o *.cuo $(CUDA_TARGETS) + +# use .cuo instead of .cu.o to avoid problems with the implicit rules: https://stackoverflow.com/q/62967939 +# (and use the host compiler for linking CUDA, nvcc does not understand that .cuo is an object file) +$(CUDA_TARGETS): % : %.cuo + $(CXX) $(CUDA_LDFLAGS) -o $@ $< $(CUDA_LDLIBS) + +$(CUDA_SOURCES:%.cu=%.cuo): %.cuo : %.cu + $(CUDA_CXX) $(CUDA_CPPFLAGS) $(CUDA_CXXFLAGS) -c -o $@ $< diff --git a/GPUSort/bitonicGPU/benchmark/benchmark.cu b/GPUSort/bitonicGPU/benchmark/benchmark.cu new file mode 100644 index 000000000..a77bca3d4 --- /dev/null +++ b/GPUSort/bitonicGPU/benchmark/benchmark.cu @@ -0,0 +1,122 @@ +#include +#include +#include +#include +#include + +#include + +#include "../bitonicSort.h" + +template +std::ostream& operator<< (std::ostream&out, std::vector &arr) +{ + for (auto x : arr) + std::cout << x << " "; + return out; +} + +struct TIMER +{ + std::string s; + std::chrono::steady_clock::time_point begin; + double result = 0; + bool stopped = false; + + TIMER(const std::string &name = "") + : s(name), begin(std::chrono::steady_clock::now()) {} + + double stop() + { + auto end = std::chrono::steady_clock::now(); + result = (std::chrono::duration_cast(end - begin).count() / 1000.); + stopped = true; + return result; + } + + void printTime() + { + if(!stopped) + stop(); + std::cout << ("Measured " + s + ": ") << 
result << " ms" << std::endl; + } + + ~TIMER() + { + if(!stopped) + { + stop(); + printTime(); + } + } +}; + + +void test1() +{ + int size = 1<<10; + TNL::Containers::Array cudaArr(size); + cudaArr.evaluate([=] __cuda_callable__ (int i) {return i;}); + auto view = cudaArr.getView(); + + { + TIMER t("sorted sequences"); + bitonicSort(view); + } +} + +void randomShuffles() +{ + int size = 1<<15; + std::vector orig(size); + std::iota(orig.begin(), orig.end(), 0); + std::vector results; + + for (int i = 0; i < 100; i++) + { + std::random_shuffle(orig.begin(), orig.end()); + + TNL::Containers::Array cudaArr(orig); + auto view = cudaArr.getView(); + std::vector tmp(orig.begin(), orig.end()); + + { + TIMER t("random permutation"); + + std::sort(tmp.begin(), tmp.end()); + //bitonicSort(view); + + results.push_back(t.stop()); + //t.printTime(); + } + + } + + std::cout << "average time: " << std::accumulate(results.begin(), results.end(), 0.)/results.size() << " ms" << std::endl; +} + +void allPermutations(std::vector orig) +{ + std::vector results; + while (std::next_permutation(orig.begin(), orig.end())) + { + TNL::Containers::Array cudaArr(orig); + auto view = cudaArr.getView(); + + { + TIMER t("random permutation"); + bitonicSort(view); + results.push_back(t.stop()); + //t.printTime(); + } + } + std::cout << "average time: " << std::accumulate(results.begin(), results.end(), 0.)/results.size() << " ms" << std::endl; +} + + +int main() +{ + randomShuffles(); + + return 0; +} \ No newline at end of file diff --git a/GPUSort/bitonicGPU/bitonicSort.h b/GPUSort/bitonicGPU/bitonicSort.h new file mode 100644 index 000000000..cfd859fde --- /dev/null +++ b/GPUSort/bitonicGPU/bitonicSort.h @@ -0,0 +1,68 @@ +#include +#include + +using namespace TNL; +using namespace TNL::Containers; + +typedef Devices::Cuda Device; + +//--------------------------------------------- + +__host__ __device__ int closestPow2(int x) +{ + if(x ==0) + return 0; + + int ret = 1; + while (ret < x) + ret <<= 
1; + + return ret; +} + +//--------------------------------------------- + +void bitonicSort(ArrayView arr, int begin, int end, bool sortAscending) +{ + int arrSize = end - begin; + int paddedSize = closestPow2(arrSize); + + auto CmpSwap = [=] __cuda_callable__ (int i, int monotonicSeqLen, int len, int partsInSeq) mutable { + + int part = i / (len / 2); + + int s = begin + part * len + (i % (len / 2)); + int e = s + len / 2; + if (e >= end) + return; + + //calculate the direction of swapping + int monotonicSeqIdx = part / partsInSeq; + bool ascending = (monotonicSeqIdx % 2) == 0 ? !sortAscending : sortAscending; + + //special case for parts with no "partner" + if ((monotonicSeqIdx + 1) * monotonicSeqLen >= end) + ascending = sortAscending; + + //templated size of block + + auto &a = arr[s]; + auto &b = arr[e]; + if ((ascending && a > b) || (!ascending && a < b)) + TNL::swap(a, b); + }; + + for (int monotonicSeqLen = 2; monotonicSeqLen <= paddedSize; monotonicSeqLen *= 2) + { + for (int len = monotonicSeqLen, partsInSeq = 1; len > 1; len /= 2, partsInSeq *= 2) + { + Algorithms::ParallelFor< Device>::exec(0, arrSize/2, CmpSwap, monotonicSeqLen, len, partsInSeq); + } + } +} + + +void bitonicSort(ArrayView arr, bool sortAscending = true) +{ + bitonicSort(arr, 0, arr.getSize(), sortAscending); +} \ No newline at end of file diff --git a/GPUSort/bitonicGPU/config.mk b/GPUSort/bitonicGPU/config.mk new file mode 100644 index 000000000..5b2f0d7f6 --- /dev/null +++ b/GPUSort/bitonicGPU/config.mk @@ -0,0 +1,49 @@ +# configure the include path(s) according to your TNL installation +TNL_INCLUDE_DIRS := -I ~/.local/include + +WITH_OPENMP := yes +WITH_DEBUG := no + +# If TNL is installed on your system, the CUDA architecture can be detected +# automatically by tnl-cuda-arch. This is done if CUDA_ARCH is set to "auto". +# Otherwise, CUDA_ARCH has to be set manually to the desired CUDA architecture +# number, e.g. 60, 61, etc. 
+CUDA_ARCH := auto + +# compilers +CXX := g++ +CUDA_CXX := nvcc + +# host compiler flags +CXXFLAGS := -std=c++14 $(TNL_INCLUDE_DIRS) +ifeq ($(WITH_DEBUG),yes) + CXXFLAGS += -O0 -g +else + CXXFLAGS += -O3 -DNDEBUG +endif + +# CUDA compiler flags +CUDA_CXXFLAGS := -std=c++14 --expt-relaxed-constexpr --expt-extended-lambda $(TNL_INCLUDE_DIRS) +CUDA_CXXFLAGS += -DHAVE_CUDA +ifeq ($(CUDA_ARCH),auto) + CUDA_CXXFLAGS += $(shell tnl-cuda-arch) +else + CUDA_CXXFLAGS += -gencode arch=compute_$(CUDA_ARCH),code=sm_$(CUDA_ARCH) +endif + +# determine path to the CUDA toolkit installation +# (autodetection is attempted, set it manually if it fails) +CUDA_PATH ?= $(abspath $(dir $(shell command -v nvcc))/..) +#$(info Detected CUDA_PATH: $(CUDA_PATH)) + +# flags for linking CUDA with the host compiler +CUDA_LDFLAGS := -L $(CUDA_PATH)/lib64 +CUDA_LDLIBS := -lcudart -ldl -lrt + +# enable OpenMP +ifeq ($(WITH_OPENMP),yes) + CXXFLAGS += -fopenmp -DHAVE_OPENMP + LDLIBS += -lgomp + CUDA_CXXFLAGS += -Xcompiler -fopenmp -DHAVE_OPENMP + CUDA_LDLIBS += -lgomp +endif diff --git a/GPUSort/bitonicGPU/main.cu b/GPUSort/bitonicGPU/main.cu new file mode 100644 index 000000000..71e614925 --- /dev/null +++ b/GPUSort/bitonicGPU/main.cu @@ -0,0 +1,30 @@ +#include +#include + +#include "bitonicSort.h" +//-------------------------------------------------- +std::ostream& operator<< (std::ostream&out, std::vector &arr) +{ + for (auto x : arr) + std::cout << x << " "; + return std::cout << std::endl; +} + +#define deb(x) std::cout << #x << " = " << x << std::endl; +//-------------------------------------------------- + +int main( int argc, char* argv[] ) +{ + TNL::Containers::Array Arr(argc - 1); + for(int i = 1; i < argc; i++) + Arr.setElement(i-1, std::atoi(argv[i])); + + auto view = Arr.getView(); + + std::cout << "unsorted: " << view << std::endl; + bitonicSort(view); + + std::cout << "sorted: " << view << std::endl; + + return 0; +} \ No newline at end of file diff --git 
a/GPUSort/bitonicGPU/unitTests/Makefile b/GPUSort/bitonicGPU/unitTests/Makefile new file mode 100644 index 000000000..4cf4ea6f0 --- /dev/null +++ b/GPUSort/bitonicGPU/unitTests/Makefile @@ -0,0 +1,26 @@ +include ../config.mk + +CUDA_SOURCES := $(wildcard *.cu) +CUDA_TARGETS := $(CUDA_SOURCES:%.cu=%) + +GTEST := -lgtest -pthread + +## targets definitions follow +.PHONY: all host cuda +all: cuda +cuda: $(CUDA_TARGETS) + +run: cuda + ./$(CUDA_TARGETS) + +.PHONY: clean +clean: + rm -f *.d *.o *.cuo $(CUDA_TARGETS) + +# use .cuo instead of .cu.o to avoid problems with the implicit rules: https://stackoverflow.com/q/62967939 +# (and use the host compiler for linking CUDA, nvcc does not understand that .cuo is an object file) +$(CUDA_TARGETS): % : %.cuo + $(CXX) $(CUDA_LDFLAGS) -o $@ $< $(CUDA_LDLIBS) $(GTEST) + +$(CUDA_SOURCES:%.cu=%.cuo): %.cuo : %.cu + $(CUDA_CXX) $(CUDA_CPPFLAGS) $(CUDA_CXXFLAGS) -c -o $@ $< diff --git a/GPUSort/bitonicGPU/unitTests/unitTests.cu b/GPUSort/bitonicGPU/unitTests/unitTests.cu new file mode 100644 index 000000000..6540a967b --- /dev/null +++ b/GPUSort/bitonicGPU/unitTests/unitTests.cu @@ -0,0 +1,110 @@ +#include "gtest/gtest.h" +#include +#include +#include + +#include + +#include "../bitonicSort.h" + +//---------------------------------------------------------------------------------- + +bool is_sorted(TNL::Containers::ArrayView arr) +{ + for(int i = 1; i < arr.getSize(); i++) + if(arr.getElement(i-1) > arr.getElement(i)) + return false; + + return true; +} + +//---------------------------------------------------------------------------------- + +TEST(sortPow2, allPermutationSize4) +{ + int size = 4; + std::vector orig(size); + std::iota(orig.begin(), orig.end(), 0); + + while (std::next_permutation(orig.begin(), orig.end())) + { + TNL::Containers::Array cudaArr(orig); + auto view = cudaArr.getView(); + + bitonicSort(view); + + ASSERT_TRUE(is_sorted(view)); + } +} + +TEST(sortPow2, somePermutationSize8) +{ + int size = 8; + const int 
stride = 5; + int i = 0; + + std::vector orig(size); + std::iota(orig.begin(), orig.end(), 0); + + while (std::next_permutation(orig.begin(), orig.end())) + { + if((i++)%stride != 0) + continue; + + TNL::Containers::Array cudaArr(orig); + auto view = cudaArr.getView(); + + bitonicSort(view); + + ASSERT_TRUE(is_sorted(view)); + } +} + + +TEST(selectedSize, size5) +{ + TNL::Containers::Array cudaArr{8, 1, 45, 9, -5}; + auto view = cudaArr.getView(); + ASSERT_EQ(5, view.getSize()); + bitonicSort(view); + ASSERT_TRUE(is_sorted(view)); +} + +TEST(selectedSize, size6) +{ + TNL::Containers::Array cudaArr{5, 9, 4, 3, 4, 0}; + auto view = cudaArr.getView(); + ASSERT_EQ(6, view.getSize()); + bitonicSort(view); + ASSERT_TRUE(is_sorted(view)); +} + + +TEST(selectedSize, size7) +{ + TNL::Containers::Array cudaArr{5, 8, 1, 6, 9, 7, 1}; + auto view = cudaArr.getView(); + ASSERT_EQ(7, view.getSize()); + bitonicSort(view); + ASSERT_TRUE(is_sorted(view)); +} + +TEST(selectedSize, size9) +{ + TNL::Containers::Array cudaArr{5, 8, 1, 6, 9, 7, 1, 6, 0}; + auto view = cudaArr.getView(); + ASSERT_EQ(9, view.getSize()); + bitonicSort(view); + ASSERT_TRUE(is_sorted(view)); +} + + + +//---------------------------------------------------------------------------------- + +int main(int argc, char ** argv) +{ + testing::InitGoogleTest(&argc, argv); + + return RUN_ALL_TESTS(); +} \ No newline at end of file -- GitLab From 454e4d53ffe77bffb133038d478e2416b3e33eb0 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Sun, 14 Feb 2021 19:44:12 +0100 Subject: [PATCH 002/258] refactor into function instead of parallel for --- GPUSort/bitonicGPU/bitonicSort.h | 53 ++++++++++++++++++-------------- 1 file changed, 30 insertions(+), 23 deletions(-) diff --git a/GPUSort/bitonicGPU/bitonicSort.h b/GPUSort/bitonicGPU/bitonicSort.h index cfd859fde..9b1e8d9a3 100644 --- a/GPUSort/bitonicGPU/bitonicSort.h +++ b/GPUSort/bitonicGPU/bitonicSort.h @@ -1,5 +1,4 @@ #include -#include using namespace TNL; using 
namespace TNL::Containers; @@ -22,41 +21,49 @@ __host__ __device__ int closestPow2(int x) //--------------------------------------------- -void bitonicSort(ArrayView arr, int begin, int end, bool sortAscending) +__global__ void bitonicMergeStep(ArrayView arr, + int begin, int end, bool sortAscending, + int monotonicSeqLen, int len, int partsInSeq) { - int arrSize = end - begin; - int paddedSize = closestPow2(arrSize); + int i = blockIdx.x * blockDim.x + threadIdx.x; - auto CmpSwap = [=] __cuda_callable__ (int i, int monotonicSeqLen, int len, int partsInSeq) mutable { + int part = i / (len / 2); - int part = i / (len / 2); + int s = begin + part * len + (i % (len / 2)); + int e = s + len / 2; + if (e >= end) + return; - int s = begin + part * len + (i % (len / 2)); - int e = s + len / 2; - if (e >= end) - return; + //calculate the direction of swapping + int monotonicSeqIdx = part / partsInSeq; + bool ascending = (monotonicSeqIdx % 2) == 0 ? !sortAscending : sortAscending; - //calculate the direction of swapping - int monotonicSeqIdx = part / partsInSeq; - bool ascending = (monotonicSeqIdx % 2) == 0 ? !sortAscending : sortAscending; + //special case for parts with no "partner" + if ((monotonicSeqIdx + 1) * monotonicSeqLen >= end) + ascending = sortAscending; + + auto &a = arr[s]; + auto &b = arr[e]; + if ((ascending && a > b) || (!ascending && a < b)) + TNL::swap(a, b); +} - //special case for parts with no "partner" - if ((monotonicSeqIdx + 1) * monotonicSeqLen >= end) - ascending = sortAscending; +//--------------------------------------------- - //templated size of block +void bitonicSort(ArrayView arr, int begin, int end, bool sortAscending) +{ + int arrSize = end - begin; + int paddedSize = closestPow2(arrSize); - auto &a = arr[s]; - auto &b = arr[e]; - if ((ascending && a > b) || (!ascending && a < b)) - TNL::swap(a, b); - }; + int threadPerBlock = 256; + int blocks = arrSize/threadPerBlock + (arrSize%threadPerBlock == 0? 
0 : 1); for (int monotonicSeqLen = 2; monotonicSeqLen <= paddedSize; monotonicSeqLen *= 2) { for (int len = monotonicSeqLen, partsInSeq = 1; len > 1; len /= 2, partsInSeq *= 2) { - Algorithms::ParallelFor< Device>::exec(0, arrSize/2, CmpSwap, monotonicSeqLen, len, partsInSeq); + bitonicMergeStep<<>>(arr, begin, end, sortAscending, + monotonicSeqLen, len, partsInSeq); } } } -- GitLab From 00b9911c16497ad3e713b0a2b4403dda99f1ee22 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Sun, 14 Feb 2021 23:15:07 +0100 Subject: [PATCH 003/258] copy into shared memory --- GPUSort/bitonicGPU/bitonicSort.h | 59 ++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) diff --git a/GPUSort/bitonicGPU/bitonicSort.h b/GPUSort/bitonicGPU/bitonicSort.h index 9b1e8d9a3..d3372faf2 100644 --- a/GPUSort/bitonicGPU/bitonicSort.h +++ b/GPUSort/bitonicGPU/bitonicSort.h @@ -48,6 +48,64 @@ __global__ void bitonicMergeStep(ArrayView arr, TNL::swap(a, b); } +__global__ void bitonicMergeSharedMemory(ArrayView arr, + int begin, int end, bool sortAscending, + int monotonicSeqLen, int len, int partsInSeq) +{ + extern __shared__ int sharedMem[]; + + int arrSize = end - begin; + int paddedSize = closestPow2(arrSize); + //------------------------------------------ + + //copy from globalMem into sharedMem + { + int i = blockIdx.x * blockDim.x + threadIdx.x; + int part = i / (len / 2); + + int s = begin + part * len + threadIdx.x; + int e = s + blockDim.x/2; + + sharedMem[threadIdx.x] = arr[s]; + sharedMem[threadIdx.x + blockDim.x/2] = e < end? arr[e] : -1; //any default value is ok + __syncthreads(); + } + + //------------------------------------------ + + //do bitonic sort + for (; len > 1; len /= 2, partsInSeq *= 2) + { + __syncthreads(); + + int i = threadIdx.x; + + int s = part * len + (i% (len / 2)); + int e = s + len / 2; + + + //calculate the direction of swapping + int monotonicSeqIdx = part / partsInSeq; + bool ascending = (monotonicSeqIdx % 2) == 0 ? 
!sortAscending : sortAscending; + + //special case for parts with no "partner" + if ((monotonicSeqIdx + 1) * monotonicSeqLen >= end) + ascending = sortAscending; + + //swap + /* + if ((ascending && a > b) || (!ascending && a < b)) + { + //TNL::swap(a, b); + } + */ + } + + //------------------------------------------ + //writeback to global memory + +} + //--------------------------------------------- void bitonicSort(ArrayView arr, int begin, int end, bool sortAscending) @@ -68,6 +126,7 @@ void bitonicSort(ArrayView arr, int begin, int end, bool sortAscend } } +//--------------------------------------------- void bitonicSort(ArrayView arr, bool sortAscending = true) { -- GitLab From 0e168e14591aee3e000c04e01d71b1f56c92539c Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Sun, 14 Feb 2021 23:24:07 +0100 Subject: [PATCH 004/258] calc direction of swap --- GPUSort/bitonicGPU/bitonicSort.h | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/GPUSort/bitonicGPU/bitonicSort.h b/GPUSort/bitonicGPU/bitonicSort.h index d3372faf2..d67cf0cfa 100644 --- a/GPUSort/bitonicGPU/bitonicSort.h +++ b/GPUSort/bitonicGPU/bitonicSort.h @@ -72,6 +72,16 @@ __global__ void bitonicMergeSharedMemory(ArrayView arr, } //------------------------------------------ + //calculate the direction of swapping + int i = blockIdx.x * blockDim.x + threadIdx.x; + int part = i / (len / 2); + int monotonicSeqIdx = part / partsInSeq; + bool ascending = (monotonicSeqIdx % 2) == 0 ? 
!sortAscending : sortAscending; + + //special case for parts with no "partner" + if ((monotonicSeqIdx + 1) * monotonicSeqLen >= end) + ascending = sortAscending; + //------------------------------------------ //do bitonic sort for (; len > 1; len /= 2, partsInSeq *= 2) @@ -80,18 +90,11 @@ __global__ void bitonicMergeSharedMemory(ArrayView arr, int i = threadIdx.x; + int part = i/(len/2); int s = part * len + (i% (len / 2)); int e = s + len / 2; - //calculate the direction of swapping - int monotonicSeqIdx = part / partsInSeq; - bool ascending = (monotonicSeqIdx % 2) == 0 ? !sortAscending : sortAscending; - - //special case for parts with no "partner" - if ((monotonicSeqIdx + 1) * monotonicSeqLen >= end) - ascending = sortAscending; - //swap /* if ((ascending && a > b) || (!ascending && a < b)) -- GitLab From 88c27a2fb6550d8eae7ddcbf5961f09a07e3ce71 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Sun, 14 Feb 2021 23:30:05 +0100 Subject: [PATCH 005/258] swapping in each stage --- GPUSort/bitonicGPU/bitonicSort.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/GPUSort/bitonicGPU/bitonicSort.h b/GPUSort/bitonicGPU/bitonicSort.h index d67cf0cfa..fb205d12e 100644 --- a/GPUSort/bitonicGPU/bitonicSort.h +++ b/GPUSort/bitonicGPU/bitonicSort.h @@ -94,14 +94,13 @@ __global__ void bitonicMergeSharedMemory(ArrayView arr, int s = part * len + (i% (len / 2)); int e = s + len / 2; - //swap - /* + int a = sharedMem[s], b = sharedMem[e]; if ((ascending && a > b) || (!ascending && a < b)) { - //TNL::swap(a, b); + sharedMem[s] = b; + sharedMem[e] = a; } - */ } //------------------------------------------ -- GitLab From a4b658b106c43a3ea66d71277462ac31d6fd8f3c Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Sun, 14 Feb 2021 23:31:03 +0100 Subject: [PATCH 006/258] writeback to memory --- GPUSort/bitonicGPU/bitonicSort.h | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/GPUSort/bitonicGPU/bitonicSort.h 
b/GPUSort/bitonicGPU/bitonicSort.h index fb205d12e..8400b18fc 100644 --- a/GPUSort/bitonicGPU/bitonicSort.h +++ b/GPUSort/bitonicGPU/bitonicSort.h @@ -105,7 +105,17 @@ __global__ void bitonicMergeSharedMemory(ArrayView arr, //------------------------------------------ //writeback to global memory - + { + int i = blockIdx.x * blockDim.x + threadIdx.x; + int part = i / (len / 2); + + int s = begin + part * len + threadIdx.x; + int e = s + blockDim.x/2; + + arr[s] = sharedMem[threadIdx.x]; + arr[e] = sharedMem[threadIdx.x + blockDim.x/2]; + __syncthreads(); + } } //--------------------------------------------- -- GitLab From 3cb6f97b6f0a70a6201d999711c1c779c752b4ad Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Sun, 14 Feb 2021 23:41:24 +0100 Subject: [PATCH 007/258] sharedMem indexing write --- GPUSort/bitonicGPU/bitonicSort.h | 21 ++++----------------- 1 file changed, 4 insertions(+), 17 deletions(-) diff --git a/GPUSort/bitonicGPU/bitonicSort.h b/GPUSort/bitonicGPU/bitonicSort.h index 8400b18fc..0679b1ff9 100644 --- a/GPUSort/bitonicGPU/bitonicSort.h +++ b/GPUSort/bitonicGPU/bitonicSort.h @@ -53,19 +53,12 @@ __global__ void bitonicMergeSharedMemory(ArrayView arr, int monotonicSeqLen, int len, int partsInSeq) { extern __shared__ int sharedMem[]; - - int arrSize = end - begin; - int paddedSize = closestPow2(arrSize); - //------------------------------------------ + + int s = begin + blockIdx.x * (2*blockDim.x); + int e = s + blockDim.x; //copy from globalMem into sharedMem { - int i = blockIdx.x * blockDim.x + threadIdx.x; - int part = i / (len / 2); - - int s = begin + part * len + threadIdx.x; - int e = s + blockDim.x/2; - sharedMem[threadIdx.x] = arr[s]; sharedMem[threadIdx.x + blockDim.x/2] = e < end? 
arr[e] : -1; //any default value is ok __syncthreads(); @@ -106,14 +99,8 @@ __global__ void bitonicMergeSharedMemory(ArrayView arr, //------------------------------------------ //writeback to global memory { - int i = blockIdx.x * blockDim.x + threadIdx.x; - int part = i / (len / 2); - - int s = begin + part * len + threadIdx.x; - int e = s + blockDim.x/2; - arr[s] = sharedMem[threadIdx.x]; - arr[e] = sharedMem[threadIdx.x + blockDim.x/2]; + arr[e] = sharedMem[threadIdx.x + blockDim.x]; __syncthreads(); } } -- GitLab From 5f97b6d148db34a3aa37d34957b07735c7514053 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Sun, 14 Feb 2021 23:43:30 +0100 Subject: [PATCH 008/258] sharedMem indexing write bug fix --- GPUSort/bitonicGPU/bitonicSort.h | 55 +++++++++++++++++--------------- 1 file changed, 29 insertions(+), 26 deletions(-) diff --git a/GPUSort/bitonicGPU/bitonicSort.h b/GPUSort/bitonicGPU/bitonicSort.h index 0679b1ff9..12495cced 100644 --- a/GPUSort/bitonicGPU/bitonicSort.h +++ b/GPUSort/bitonicGPU/bitonicSort.h @@ -60,39 +60,42 @@ __global__ void bitonicMergeSharedMemory(ArrayView arr, //copy from globalMem into sharedMem { sharedMem[threadIdx.x] = arr[s]; - sharedMem[threadIdx.x + blockDim.x/2] = e < end? arr[e] : -1; //any default value is ok + sharedMem[threadIdx.x + blockDim.x] = arr[e]; __syncthreads(); } //------------------------------------------ - //calculate the direction of swapping - int i = blockIdx.x * blockDim.x + threadIdx.x; - int part = i / (len / 2); - int monotonicSeqIdx = part / partsInSeq; - bool ascending = (monotonicSeqIdx % 2) == 0 ? 
!sortAscending : sortAscending; - - //special case for parts with no "partner" - if ((monotonicSeqIdx + 1) * monotonicSeqLen >= end) - ascending = sortAscending; - //------------------------------------------ - - //do bitonic sort - for (; len > 1; len /= 2, partsInSeq *= 2) - { - __syncthreads(); + //bitonic activity + { + //calculate the direction of swapping + int i = blockIdx.x * blockDim.x + threadIdx.x; + int part = i / (len / 2); + int monotonicSeqIdx = part / partsInSeq; + bool ascending = (monotonicSeqIdx % 2) == 0 ? !sortAscending : sortAscending; + + //special case for parts with no "partner" + if ((monotonicSeqIdx + 1) * monotonicSeqLen >= end) + ascending = sortAscending; + //------------------------------------------ + + //do bitonic sort + for (; len > 1; len /= 2, partsInSeq *= 2) + { + __syncthreads(); - int i = threadIdx.x; + int i = threadIdx.x; - int part = i/(len/2); - int s = part * len + (i% (len / 2)); - int e = s + len / 2; + int part = i/(len/2); + int s = part * len + (i% (len / 2)); + int e = s + len / 2; - //swap - int a = sharedMem[s], b = sharedMem[e]; - if ((ascending && a > b) || (!ascending && a < b)) - { - sharedMem[s] = b; - sharedMem[e] = a; + //swap + int a = sharedMem[s], b = sharedMem[e]; + if ((ascending && a > b) || (!ascending && a < b)) + { + sharedMem[s] = b; + sharedMem[e] = a; + } } } -- GitLab From 2654ee7bfb44dc7e3bac791953832241f4481429 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Mon, 15 Feb 2021 00:23:29 +0100 Subject: [PATCH 009/258] sorting pow2 arrays --- GPUSort/bitonicGPU/bitonicSort.h | 48 ++++++++++++++++++++------------ 1 file changed, 30 insertions(+), 18 deletions(-) diff --git a/GPUSort/bitonicGPU/bitonicSort.h b/GPUSort/bitonicGPU/bitonicSort.h index 12495cced..6f9777814 100644 --- a/GPUSort/bitonicGPU/bitonicSort.h +++ b/GPUSort/bitonicGPU/bitonicSort.h @@ -1,15 +1,18 @@ #include +#include using namespace TNL; using namespace TNL::Containers; typedef Devices::Cuda Device; +#define deb(x) 
std::cout << #x << " = " << x << std::endl; + //--------------------------------------------- __host__ __device__ int closestPow2(int x) { - if(x ==0) + if (x == 0) return 0; int ret = 1; @@ -22,8 +25,8 @@ __host__ __device__ int closestPow2(int x) //--------------------------------------------- __global__ void bitonicMergeStep(ArrayView arr, - int begin, int end, bool sortAscending, - int monotonicSeqLen, int len, int partsInSeq) + int begin, int end, bool sortAscending, + int monotonicSeqLen, int len, int partsInSeq) { int i = blockIdx.x * blockDim.x + threadIdx.x; @@ -49,13 +52,12 @@ __global__ void bitonicMergeStep(ArrayView arr, } __global__ void bitonicMergeSharedMemory(ArrayView arr, - int begin, int end, bool sortAscending, - int monotonicSeqLen, int len, int partsInSeq) + int begin, int end, bool sortAscending, + int monotonicSeqLen, int len, int partsInSeq) { extern __shared__ int sharedMem[]; - - int s = begin + blockIdx.x * (2*blockDim.x); + int s = begin + blockIdx.x * (2 * blockDim.x) + threadIdx.x; int e = s + blockDim.x; //copy from globalMem into sharedMem { @@ -63,16 +65,16 @@ __global__ void bitonicMergeSharedMemory(ArrayView arr, sharedMem[threadIdx.x + blockDim.x] = arr[e]; __syncthreads(); } - + //------------------------------------------ //bitonic activity - { + { //calculate the direction of swapping int i = blockIdx.x * blockDim.x + threadIdx.x; int part = i / (len / 2); int monotonicSeqIdx = part / partsInSeq; - bool ascending = (monotonicSeqIdx % 2) == 0 ? !sortAscending : sortAscending; + bool ascending = (monotonicSeqIdx % 2) == 0 ? 
!sortAscending : sortAscending; //special case for parts with no "partner" if ((monotonicSeqIdx + 1) * monotonicSeqLen >= end) ascending = sortAscending; @@ -83,10 +85,8 @@ __global__ void bitonicMergeSharedMemory(ArrayView arr, { __syncthreads(); - int i = threadIdx.x; - - int part = i/(len/2); - int s = part * len + (i% (len / 2)); + int part = threadIdx.x / (len / 2); + int s = part * len + (threadIdx.x % (len / 2)); int e = s + len / 2; //swap @@ -115,15 +115,27 @@ void bitonicSort(ArrayView arr, int begin, int end, bool sortAscend int arrSize = end - begin; int paddedSize = closestPow2(arrSize); - int threadPerBlock = 256; - int blocks = arrSize/threadPerBlock + (arrSize%threadPerBlock == 0? 0 : 1); + int threadsNeeded = arrSize / 2; + + const int maxThreadsPerBlock = 256; + int threadPerBlock = min(maxThreadsPerBlock, threadsNeeded); + int blocks = threadsNeeded / threadPerBlock + (threadsNeeded % threadPerBlock == 0 ? 0 : 1); + + const int sharedMemSize = threadPerBlock * 2 * sizeof(int); for (int monotonicSeqLen = 2; monotonicSeqLen <= paddedSize; monotonicSeqLen *= 2) { for (int len = monotonicSeqLen, partsInSeq = 1; len > 1; len /= 2, partsInSeq *= 2) { - bitonicMergeStep<<>>(arr, begin, end, sortAscending, - monotonicSeqLen, len, partsInSeq); + /* + bitonicMergeStep<<>>(arr, begin, end, sortAscending, + monotonicSeqLen, len, partsInSeq); + */ + bitonicMergeSharedMemory<<>>(arr, begin, end, sortAscending, + monotonicSeqLen, len, partsInSeq); + + cudaDeviceSynchronize(); + break; } } } -- GitLab From fffb2165fe2a4c3e9750972ffd3dfb4158682081 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Mon, 15 Feb 2021 14:41:01 +0100 Subject: [PATCH 010/258] sharing non pow2 memory --- GPUSort/bitonicGPU/bitonicSort.h | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/GPUSort/bitonicGPU/bitonicSort.h b/GPUSort/bitonicGPU/bitonicSort.h index 6f9777814..766068608 100644 --- a/GPUSort/bitonicGPU/bitonicSort.h +++ 
b/GPUSort/bitonicGPU/bitonicSort.h @@ -62,7 +62,9 @@ __global__ void bitonicMergeSharedMemory(ArrayView arr, //copy from globalMem into sharedMem { sharedMem[threadIdx.x] = arr[s]; - sharedMem[threadIdx.x + blockDim.x] = arr[e]; + if(e < end) + sharedMem[threadIdx.x + blockDim.x] = arr[e]; + __syncthreads(); } @@ -85,6 +87,15 @@ __global__ void bitonicMergeSharedMemory(ArrayView arr, { __syncthreads(); + { + int part = i / (len / 2); + + int arrCmpS = begin + part * len + (i % (len / 2)); + int arrCmpE = arrCmpS + len / 2; + if(arrCmpE >= end) + continue; + } + int part = threadIdx.x / (len / 2); int s = part * len + (threadIdx.x % (len / 2)); int e = s + len / 2; @@ -97,13 +108,17 @@ __global__ void bitonicMergeSharedMemory(ArrayView arr, sharedMem[e] = a; } } + + __syncthreads(); + } //------------------------------------------ //writeback to global memory { arr[s] = sharedMem[threadIdx.x]; - arr[e] = sharedMem[threadIdx.x + blockDim.x]; + if(e < end) + arr[e] = sharedMem[threadIdx.x + blockDim.x]; __syncthreads(); } } @@ -115,7 +130,7 @@ void bitonicSort(ArrayView arr, int begin, int end, bool sortAscend int arrSize = end - begin; int paddedSize = closestPow2(arrSize); - int threadsNeeded = arrSize / 2; + int threadsNeeded = arrSize / 2 + (arrSize %2 !=0); const int maxThreadsPerBlock = 256; int threadPerBlock = min(maxThreadsPerBlock, threadsNeeded); -- GitLab From 7411ea790d4f8e970da5c4692c20ead1e6cd1296 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Mon, 15 Feb 2021 15:27:55 +0100 Subject: [PATCH 011/258] call bitonicGlobal when doesnt fit into memory --- GPUSort/bitonicGPU/bitonicSort.h | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/GPUSort/bitonicGPU/bitonicSort.h b/GPUSort/bitonicGPU/bitonicSort.h index 766068608..f58ecae46 100644 --- a/GPUSort/bitonicGPU/bitonicSort.h +++ b/GPUSort/bitonicGPU/bitonicSort.h @@ -142,15 +142,20 @@ void bitonicSort(ArrayView arr, int begin, int end, bool sortAscend { for 
(int len = monotonicSeqLen, partsInSeq = 1; len > 1; len /= 2, partsInSeq *= 2) { - /* - bitonicMergeStep<<>>(arr, begin, end, sortAscending, - monotonicSeqLen, len, partsInSeq); - */ - bitonicMergeSharedMemory<<>>(arr, begin, end, sortAscending, - monotonicSeqLen, len, partsInSeq); - - cudaDeviceSynchronize(); - break; + if(monotonicSeqLen > sharedMemSize) + { + bitonicMergeStep<<>>(arr, begin, end, sortAscending, + monotonicSeqLen, len, partsInSeq); + cudaDeviceSynchronize(); + } + else + { + + bitonicMergeSharedMemory<<>>(arr, begin, end, sortAscending, + monotonicSeqLen, len, partsInSeq); + cudaDeviceSynchronize(); + break; + } } } } -- GitLab From d01bcabb4e49fbd7cbafa8fa28d19ae39eb9816e Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Mon, 15 Feb 2021 15:28:44 +0100 Subject: [PATCH 012/258] rename function --- GPUSort/bitonicGPU/bitonicSort.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/GPUSort/bitonicGPU/bitonicSort.h b/GPUSort/bitonicGPU/bitonicSort.h index f58ecae46..701ebbaa0 100644 --- a/GPUSort/bitonicGPU/bitonicSort.h +++ b/GPUSort/bitonicGPU/bitonicSort.h @@ -24,7 +24,7 @@ __host__ __device__ int closestPow2(int x) //--------------------------------------------- -__global__ void bitonicMergeStep(ArrayView arr, +__global__ void bitonicMergeGlobal(ArrayView arr, int begin, int end, bool sortAscending, int monotonicSeqLen, int len, int partsInSeq) { @@ -144,7 +144,7 @@ void bitonicSort(ArrayView arr, int begin, int end, bool sortAscend { if(monotonicSeqLen > sharedMemSize) { - bitonicMergeStep<<>>(arr, begin, end, sortAscending, + bitonicMergeGlobal<<>>(arr, begin, end, sortAscending, monotonicSeqLen, len, partsInSeq); cudaDeviceSynchronize(); } -- GitLab From d7a2159047de1b68cb9449be64a7aba79acd0cee Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Mon, 15 Feb 2021 16:00:24 +0100 Subject: [PATCH 013/258] sort beginning in shared mem --- GPUSort/bitonicGPU/bitonicSort.h | 84 +++++++++++++++++++++++++++++++- 1 file 
changed, 82 insertions(+), 2 deletions(-) diff --git a/GPUSort/bitonicGPU/bitonicSort.h b/GPUSort/bitonicGPU/bitonicSort.h index 701ebbaa0..a52da7426 100644 --- a/GPUSort/bitonicGPU/bitonicSort.h +++ b/GPUSort/bitonicGPU/bitonicSort.h @@ -123,6 +123,80 @@ __global__ void bitonicMergeSharedMemory(ArrayView arr, } } +//--------------------------------------------- +__global__ void bitoniSort1stStepSharedMemory(ArrayView arr, int begin, int end, bool sortAscending) +{ + extern __shared__ int sharedMem[]; + + int s = begin + blockIdx.x * (2 * blockDim.x) + threadIdx.x; + int e = s + blockDim.x; + //copy from globalMem into sharedMem + { + sharedMem[threadIdx.x] = arr[s]; + if(e < end) + sharedMem[threadIdx.x + blockDim.x] = arr[e]; + + __syncthreads(); + } + + //------------------------------------------ + //bitonic activity + { + int i = blockIdx.x * blockDim.x + threadIdx.x; + + for (int monotonicSeqLen = 2; monotonicSeqLen <= blockDim.x * 2 * sizeof(int); monotonicSeqLen *= 2) + { + //calculate the direction of swapping + int monotonicSeqIdx = i / (monotonicSeqLen/2); + bool ascending = (monotonicSeqIdx % 2) == 0 ? 
!sortAscending : sortAscending; + + //special case for parts with no "partner" + if ((monotonicSeqIdx + 1) * monotonicSeqLen >= end) + ascending = sortAscending; + + for (int len = monotonicSeqLen, partsInSeq = 1; len > 1; len /= 2, partsInSeq *= 2) + { + __syncthreads(); + int part = i / (len / 2); + + int arrCmpS = begin + part * len + (i % (len / 2)); + int arrCmpE = arrCmpS + len / 2; + if(arrCmpE >= end) + continue; + + + + part = threadIdx.x / (len / 2); + int s = part * len + (threadIdx.x % (len / 2)); + int e = s + len / 2; + + //swap + int a = sharedMem[s], b = sharedMem[e]; + if ((ascending && a > b) || (!ascending && a < b)) + { + sharedMem[s] = b; + sharedMem[e] = a; + } + } + } + + __syncthreads(); + + } + + + + //------------------------------------------ + //writeback to global memory + { + arr[s] = sharedMem[threadIdx.x]; + if(e < end) + arr[e] = sharedMem[threadIdx.x + blockDim.x]; + __syncthreads(); + } +} + + //--------------------------------------------- void bitonicSort(ArrayView arr, int begin, int end, bool sortAscending) @@ -132,13 +206,19 @@ void bitonicSort(ArrayView arr, int begin, int end, bool sortAscend int threadsNeeded = arrSize / 2 + (arrSize %2 !=0); - const int maxThreadsPerBlock = 256; + const int maxThreadsPerBlock = 512; int threadPerBlock = min(maxThreadsPerBlock, threadsNeeded); int blocks = threadsNeeded / threadPerBlock + (threadsNeeded % threadPerBlock == 0 ? 
0 : 1); const int sharedMemSize = threadPerBlock * 2 * sizeof(int); - for (int monotonicSeqLen = 2; monotonicSeqLen <= paddedSize; monotonicSeqLen *= 2) + //--------------------------------------------------------------------------------- + + + bitoniSort1stStepSharedMemory<<>>(arr, begin, end, sortAscending); + cudaDeviceSynchronize(); + + for (int monotonicSeqLen = 2*sharedMemSize; monotonicSeqLen <= paddedSize; monotonicSeqLen *= 2) { for (int len = monotonicSeqLen, partsInSeq = 1; len > 1; len /= 2, partsInSeq *= 2) { -- GitLab From e3322b472e9c0e5bcb97defbf1b09fa2327d7d3f Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Sun, 14 Feb 2021 20:01:23 +0100 Subject: [PATCH 014/258] check for trivial sizes --- GPUSort/bitonicGPU/bitonicSort.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/GPUSort/bitonicGPU/bitonicSort.h b/GPUSort/bitonicGPU/bitonicSort.h index a52da7426..448208396 100644 --- a/GPUSort/bitonicGPU/bitonicSort.h +++ b/GPUSort/bitonicGPU/bitonicSort.h @@ -245,4 +245,4 @@ void bitonicSort(ArrayView arr, int begin, int end, bool sortAscend void bitonicSort(ArrayView arr, bool sortAscending = true) { bitonicSort(arr, 0, arr.getSize(), sortAscending); -} \ No newline at end of file +} -- GitLab From 77ea34dfe9ce9b56a0d2dde5a6c817debf1bf8cf Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Mon, 15 Feb 2021 18:46:55 +0100 Subject: [PATCH 015/258] more robust randomshuffle benchmark --- GPUSort/bitonicGPU/benchmark/benchmark.cu | 43 +++++++++++++---------- 1 file changed, 24 insertions(+), 19 deletions(-) diff --git a/GPUSort/bitonicGPU/benchmark/benchmark.cu b/GPUSort/bitonicGPU/benchmark/benchmark.cu index a77bca3d4..6cb882b5e 100644 --- a/GPUSort/bitonicGPU/benchmark/benchmark.cu +++ b/GPUSort/bitonicGPU/benchmark/benchmark.cu @@ -67,32 +67,37 @@ void test1() void randomShuffles() { - int size = 1<<15; - std::vector orig(size); - std::iota(orig.begin(), orig.end(), 0); - std::vector results; - - for (int i = 0; i < 100; i++) + 
int iterations = 100; + std::cout << iterations << " random permutations" << std::endl; + for(int p = 13; p <= 19; ++p) { - std::random_shuffle(orig.begin(), orig.end()); - - TNL::Containers::Array cudaArr(orig); - auto view = cudaArr.getView(); - std::vector tmp(orig.begin(), orig.end()); + int size = 1< orig(size); + std::iota(orig.begin(), orig.end(), 0); + std::vector results; + for (int i = 0; i < iterations; i++) { - TIMER t("random permutation"); + std::random_shuffle(orig.begin(), orig.end()); + + TNL::Containers::Array cudaArr(orig); + auto view = cudaArr.getView(); + std::vector tmp(orig.begin(), orig.end()); + + { + TIMER t("random permutation"); + + //std::sort(tmp.begin(), tmp.end()); + bitonicSort(view); + + results.push_back(t.stop()); + //t.printTime(); + } - std::sort(tmp.begin(), tmp.end()); - //bitonicSort(view); - - results.push_back(t.stop()); - //t.printTime(); } + std::cout << "average time for arrSize = 2^" << p << ": " << std::accumulate(results.begin(), results.end(), 0.)/results.size() << " ms" << std::endl; } - - std::cout << "average time: " << std::accumulate(results.begin(), results.end(), 0.)/results.size() << " ms" << std::endl; } void allPermutations(std::vector orig) -- GitLab From e2066e3cbc7b46214be6050e5a4ddff04a129e74 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Mon, 15 Feb 2021 20:44:14 +0100 Subject: [PATCH 016/258] shared size in byte vs amount of elements fix --- GPUSort/bitonicGPU/bitonicSort.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/GPUSort/bitonicGPU/bitonicSort.h b/GPUSort/bitonicGPU/bitonicSort.h index 448208396..5ea6389c0 100644 --- a/GPUSort/bitonicGPU/bitonicSort.h +++ b/GPUSort/bitonicGPU/bitonicSort.h @@ -210,7 +210,8 @@ void bitonicSort(ArrayView arr, int begin, int end, bool sortAscend int threadPerBlock = min(maxThreadsPerBlock, threadsNeeded); int blocks = threadsNeeded / threadPerBlock + (threadsNeeded % threadPerBlock == 0 ? 
0 : 1); - const int sharedMemSize = threadPerBlock * 2 * sizeof(int); + const int sharedMemLen = threadPerBlock * 2; + const int sharedMemSize = sharedMemLen* sizeof(int); //--------------------------------------------------------------------------------- @@ -218,11 +219,11 @@ void bitonicSort(ArrayView arr, int begin, int end, bool sortAscend bitoniSort1stStepSharedMemory<<>>(arr, begin, end, sortAscending); cudaDeviceSynchronize(); - for (int monotonicSeqLen = 2*sharedMemSize; monotonicSeqLen <= paddedSize; monotonicSeqLen *= 2) + for (int monotonicSeqLen = 2*sharedMemLen; monotonicSeqLen <= paddedSize; monotonicSeqLen *= 2) { for (int len = monotonicSeqLen, partsInSeq = 1; len > 1; len /= 2, partsInSeq *= 2) { - if(monotonicSeqLen > sharedMemSize) + if(len > sharedMemLen) { bitonicMergeGlobal<<>>(arr, begin, end, sortAscending, monotonicSeqLen, len, partsInSeq); -- GitLab From 9ec6585e155f20a35423ac89092923b4cfcb4c6f Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Mon, 15 Feb 2021 20:48:40 +0100 Subject: [PATCH 017/258] fix size in byte vs amount of elements in sharedMem step --- GPUSort/bitonicGPU/bitonicSort.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/GPUSort/bitonicGPU/bitonicSort.h b/GPUSort/bitonicGPU/bitonicSort.h index 5ea6389c0..a47f78754 100644 --- a/GPUSort/bitonicGPU/bitonicSort.h +++ b/GPUSort/bitonicGPU/bitonicSort.h @@ -127,8 +127,9 @@ __global__ void bitonicMergeSharedMemory(ArrayView arr, __global__ void bitoniSort1stStepSharedMemory(ArrayView arr, int begin, int end, bool sortAscending) { extern __shared__ int sharedMem[]; + int sharedMemLen = 2*blockDim.x; - int s = begin + blockIdx.x * (2 * blockDim.x) + threadIdx.x; + int s = begin + blockIdx.x * sharedMemLen + threadIdx.x; int e = s + blockDim.x; //copy from globalMem into sharedMem { @@ -144,7 +145,7 @@ __global__ void bitoniSort1stStepSharedMemory(ArrayView arr, int be { int i = blockIdx.x * blockDim.x + threadIdx.x; - for (int monotonicSeqLen = 2; 
monotonicSeqLen <= blockDim.x * 2 * sizeof(int); monotonicSeqLen *= 2) + for (int monotonicSeqLen = 2; monotonicSeqLen <= sharedMemLen; monotonicSeqLen *= 2) { //calculate the direction of swapping int monotonicSeqIdx = i / (monotonicSeqLen/2); -- GitLab From adf71f08e27f0e4f27447765e5f42cb14c80de46 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Mon, 15 Feb 2021 22:22:08 +0100 Subject: [PATCH 018/258] big array tests and unify permutations --- GPUSort/bitonicGPU/unitTests/unitTests.cu | 103 +++++++++++++++------- 1 file changed, 69 insertions(+), 34 deletions(-) diff --git a/GPUSort/bitonicGPU/unitTests/unitTests.cu b/GPUSort/bitonicGPU/unitTests/unitTests.cu index 6540a967b..ee967f2a6 100644 --- a/GPUSort/bitonicGPU/unitTests/unitTests.cu +++ b/GPUSort/bitonicGPU/unitTests/unitTests.cu @@ -2,6 +2,7 @@ #include #include #include +#include #include @@ -11,8 +12,8 @@ bool is_sorted(TNL::Containers::ArrayView arr) { - for(int i = 1; i < arr.getSize(); i++) - if(arr.getElement(i-1) > arr.getElement(i)) + for (int i = 1; i < arr.getSize(); i++) + if (arr.getElement(i - 1) > arr.getElement(i)) return false; return true; @@ -20,27 +21,53 @@ bool is_sorted(TNL::Containers::ArrayView arr) //---------------------------------------------------------------------------------- -TEST(sortPow2, allPermutationSize4) +TEST(permutations, allPermutationSize_3_to_7) { - int size = 4; + for(int i = 3; i<=7; i++ ) + { + int size = i; + std::vector orig(size); + std::iota(orig.begin(), orig.end(), 0); + + while (std::next_permutation(orig.begin(), orig.end())) + { + TNL::Containers::Array cudaArr(orig); + auto view = cudaArr.getView(); + + bitonicSort(view); + + ASSERT_TRUE(is_sorted(view)) << "failed " << i << std::endl; + } + } +} + +TEST(permutations, somePermutationSize8) +{ + int size = 8; + const int stride = 23; + int i = 0; + std::vector orig(size); std::iota(orig.begin(), orig.end(), 0); while (std::next_permutation(orig.begin(), orig.end())) { + if ((i++) % stride != 0) 
+ continue; + TNL::Containers::Array cudaArr(orig); auto view = cudaArr.getView(); bitonicSort(view); - ASSERT_TRUE(is_sorted(view)); + ASSERT_TRUE(is_sorted(view)) << "result " << view << std::endl; } } -TEST(sortPow2, somePermutationSize8) +TEST(permutations, somePermutationSize9) { - int size = 8; - const int stride = 5; + int size = 9; + const int stride = 227; int i = 0; std::vector orig(size); @@ -48,7 +75,7 @@ TEST(sortPow2, somePermutationSize8) while (std::next_permutation(orig.begin(), orig.end())) { - if((i++)%stride != 0) + if ((i++) % stride != 0) continue; TNL::Containers::Array cudaArr(orig); @@ -56,53 +83,61 @@ TEST(sortPow2, somePermutationSize8) bitonicSort(view); - ASSERT_TRUE(is_sorted(view)); + ASSERT_TRUE(is_sorted(view)) << "result " << view << std::endl; } } - -TEST(selectedSize, size5) +TEST(selectedSize, size15) { - TNL::Containers::Array cudaArr{8, 1, 45, 9, -5}; + TNL::Containers::Array cudaArr{5, 9, 4, 8, 6, 1, 2, 3, 4, 8, 1, 6, 9, 4, 9}; auto view = cudaArr.getView(); - ASSERT_EQ(5, view.getSize()); + ASSERT_EQ(15, view.getSize()); bitonicSort(view); - ASSERT_TRUE(is_sorted(view)); + ASSERT_TRUE(is_sorted(view)) << "result " << view << std::endl; } -TEST(selectedSize, size6) +TEST(multiblock, 32768_decreasingNegative) { - TNL::Containers::Array cudaArr{5, 9, 4, 3, 4, 0}; + TNL::Containers::Array cudaArr(1 << 15); + for (int i = 0; i < cudaArr.getSize(); i++) + cudaArr.setElement(i, -i); + auto view = cudaArr.getView(); - ASSERT_EQ(6, view.getSize()); bitonicSort(view); - ASSERT_TRUE(is_sorted(view)); + ASSERT_TRUE(is_sorted(view)) << "result " << view << std::endl; } - -TEST(selectedSize, size7) +TEST(randomGenerated, smallArray_randomVal) { - TNL::Containers::Array cudaArr{5, 8, 1, 6, 9, 7, 1}; - auto view = cudaArr.getView(); - ASSERT_EQ(7, view.getSize()); - bitonicSort(view); - ASSERT_TRUE(is_sorted(view)); + for(int i = 0; i < 100; i++) + { + TNL::Containers::Array cudaArr(std::rand()%(1<<10)); + for (int j = 0; j < 
cudaArr.getSize(); j++) + cudaArr.setElement(j, std::rand()); + + auto view = cudaArr.getView(); + bitonicSort(view); + ASSERT_TRUE(is_sorted(view)); + } } -TEST(selectedSize, size9) +TEST(randomGenerated, bigArray_all0) { - TNL::Containers::Array cudaArr{5, 8, 1, 6, 9, 7, 1, 6, 0}; - auto view = cudaArr.getView(); - ASSERT_EQ(9, view.getSize()); - bitonicSort(view); - ASSERT_TRUE(is_sorted(view)); -} + for(int i = 0; i < 50; i++) + { + int size = (1<<20) + (std::rand()% (1<<19)); + TNL::Containers::Array cudaArr(size); + auto view = cudaArr.getView(); + bitonicSort(view); + ASSERT_TRUE(true); + } +} //---------------------------------------------------------------------------------- -int main(int argc, char ** argv) +int main(int argc, char **argv) { testing::InitGoogleTest(&argc, argv); -- GitLab From e8bd3b88912ee1f9137085df54a42fab537a6990 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Mon, 15 Feb 2021 23:10:16 +0100 Subject: [PATCH 019/258] correct local indexing in shared memory --- GPUSort/bitonicGPU/bitonicSort.h | 79 ++++++++++++++++---------------- 1 file changed, 39 insertions(+), 40 deletions(-) diff --git a/GPUSort/bitonicGPU/bitonicSort.h b/GPUSort/bitonicGPU/bitonicSort.h index a47f78754..c4c10fff1 100644 --- a/GPUSort/bitonicGPU/bitonicSort.h +++ b/GPUSort/bitonicGPU/bitonicSort.h @@ -56,18 +56,23 @@ __global__ void bitonicMergeSharedMemory(ArrayView arr, int monotonicSeqLen, int len, int partsInSeq) { extern __shared__ int sharedMem[]; + int sharedMemLen = 2*blockDim.x; + + int myBlockStart = begin + blockIdx.x * sharedMemLen; + int myBlockEnd = end < myBlockStart+sharedMemLen? 
end : myBlockStart+sharedMemLen; - int s = begin + blockIdx.x * (2 * blockDim.x) + threadIdx.x; - int e = s + blockDim.x; + int copy1 = myBlockStart + threadIdx.x; + int copy2 = copy1 + blockDim.x; //copy from globalMem into sharedMem { - sharedMem[threadIdx.x] = arr[s]; - if(e < end) - sharedMem[threadIdx.x + blockDim.x] = arr[e]; + if(copy1 < end) + sharedMem[threadIdx.x] = arr[copy1]; + if(copy2 < end) + sharedMem[threadIdx.x + blockDim.x] = arr[copy2]; __syncthreads(); } - + //------------------------------------------ //bitonic activity { @@ -83,22 +88,15 @@ __global__ void bitonicMergeSharedMemory(ArrayView arr, //------------------------------------------ //do bitonic sort - for (; len > 1; len /= 2, partsInSeq *= 2) + for (int len = monotonicSeqLen, partsInSeq = 1; len > 1; len /= 2, partsInSeq *= 2) { __syncthreads(); - { - int part = i / (len / 2); - - int arrCmpS = begin + part * len + (i % (len / 2)); - int arrCmpE = arrCmpS + len / 2; - if(arrCmpE >= end) - continue; - } - int part = threadIdx.x / (len / 2); int s = part * len + (threadIdx.x % (len / 2)); int e = s + len / 2; + if(e >= myBlockEnd - myBlockStart) + continue; //swap int a = sharedMem[s], b = sharedMem[e]; @@ -114,11 +112,13 @@ __global__ void bitonicMergeSharedMemory(ArrayView arr, } //------------------------------------------ + //writeback to global memory { - arr[s] = sharedMem[threadIdx.x]; - if(e < end) - arr[e] = sharedMem[threadIdx.x + blockDim.x]; + if(copy1 < end) + arr[copy1] = sharedMem[threadIdx.x]; + if(copy2 < end) + arr[copy2] = sharedMem[threadIdx.x + blockDim.x]; __syncthreads(); } } @@ -129,23 +129,29 @@ __global__ void bitoniSort1stStepSharedMemory(ArrayView arr, int be extern __shared__ int sharedMem[]; int sharedMemLen = 2*blockDim.x; - int s = begin + blockIdx.x * sharedMemLen + threadIdx.x; - int e = s + blockDim.x; + int myBlockStart = begin + blockIdx.x * sharedMemLen; + int myBlockEnd = end < myBlockStart+sharedMemLen? 
end : myBlockStart+sharedMemLen; + + int copy1 = myBlockStart + threadIdx.x; + int copy2 = copy1 + blockDim.x; //copy from globalMem into sharedMem { - sharedMem[threadIdx.x] = arr[s]; - if(e < end) - sharedMem[threadIdx.x + blockDim.x] = arr[e]; + if(copy1 < end) + sharedMem[threadIdx.x] = arr[copy1]; + + if(copy2 < end) + sharedMem[threadIdx.x + blockDim.x] = arr[copy2]; __syncthreads(); } - + //------------------------------------------ //bitonic activity { int i = blockIdx.x * blockDim.x + threadIdx.x; + int paddedSize = closestPow2(myBlockEnd - myBlockStart); - for (int monotonicSeqLen = 2; monotonicSeqLen <= sharedMemLen; monotonicSeqLen *= 2) + for (int monotonicSeqLen = 2; monotonicSeqLen <= paddedSize; monotonicSeqLen *= 2) { //calculate the direction of swapping int monotonicSeqIdx = i / (monotonicSeqLen/2); @@ -158,18 +164,12 @@ __global__ void bitoniSort1stStepSharedMemory(ArrayView arr, int be for (int len = monotonicSeqLen, partsInSeq = 1; len > 1; len /= 2, partsInSeq *= 2) { __syncthreads(); - int part = i / (len / 2); - - int arrCmpS = begin + part * len + (i % (len / 2)); - int arrCmpE = arrCmpS + len / 2; - if(arrCmpE >= end) - continue; - - - part = threadIdx.x / (len / 2); + int part = threadIdx.x / (len / 2); int s = part * len + (threadIdx.x % (len / 2)); int e = s + len / 2; + if(e >= myBlockEnd - myBlockStart) + continue; //swap int a = sharedMem[s], b = sharedMem[e]; @@ -185,14 +185,13 @@ __global__ void bitoniSort1stStepSharedMemory(ArrayView arr, int be } - - //------------------------------------------ //writeback to global memory { - arr[s] = sharedMem[threadIdx.x]; - if(e < end) - arr[e] = sharedMem[threadIdx.x + blockDim.x]; + if(copy1 < end) + arr[copy1] = sharedMem[threadIdx.x]; + if(copy2 < end) + arr[copy2] = sharedMem[threadIdx.x + blockDim.x]; __syncthreads(); } } -- GitLab From a6fc86691f2b1252d5fc0a3346c1fb9dd577db2e Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Tue, 16 Feb 2021 14:59:21 +0100 Subject: [PATCH 
020/258] remove unneeded CPU synchronization --- GPUSort/bitonicGPU/bitonicSort.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/GPUSort/bitonicGPU/bitonicSort.h b/GPUSort/bitonicGPU/bitonicSort.h index c4c10fff1..5f68e28ad 100644 --- a/GPUSort/bitonicGPU/bitonicSort.h +++ b/GPUSort/bitonicGPU/bitonicSort.h @@ -217,7 +217,6 @@ void bitonicSort(ArrayView arr, int begin, int end, bool sortAscend bitoniSort1stStepSharedMemory<<>>(arr, begin, end, sortAscending); - cudaDeviceSynchronize(); for (int monotonicSeqLen = 2*sharedMemLen; monotonicSeqLen <= paddedSize; monotonicSeqLen *= 2) { @@ -227,18 +226,17 @@ void bitonicSort(ArrayView arr, int begin, int end, bool sortAscend { bitonicMergeGlobal<<>>(arr, begin, end, sortAscending, monotonicSeqLen, len, partsInSeq); - cudaDeviceSynchronize(); } else { bitonicMergeSharedMemory<<>>(arr, begin, end, sortAscending, monotonicSeqLen, len, partsInSeq); - cudaDeviceSynchronize(); break; } } } + cudaDeviceSynchronize(); } //--------------------------------------------- -- GitLab From c338ee5034fcdffa6739797b3d55954969be8976 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Tue, 16 Feb 2021 15:01:11 +0100 Subject: [PATCH 021/258] remove unecessary sync after final write --- GPUSort/bitonicGPU/bitonicSort.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/GPUSort/bitonicGPU/bitonicSort.h b/GPUSort/bitonicGPU/bitonicSort.h index 5f68e28ad..0a588fdfc 100644 --- a/GPUSort/bitonicGPU/bitonicSort.h +++ b/GPUSort/bitonicGPU/bitonicSort.h @@ -119,7 +119,6 @@ __global__ void bitonicMergeSharedMemory(ArrayView arr, arr[copy1] = sharedMem[threadIdx.x]; if(copy2 < end) arr[copy2] = sharedMem[threadIdx.x + blockDim.x]; - __syncthreads(); } } @@ -192,7 +191,6 @@ __global__ void bitoniSort1stStepSharedMemory(ArrayView arr, int be arr[copy1] = sharedMem[threadIdx.x]; if(copy2 < end) arr[copy2] = sharedMem[threadIdx.x + blockDim.x]; - __syncthreads(); } } -- GitLab From 1c69bd590d13e9a34187b29dcaa1a6c68027eceb Mon Sep 
17 00:00:00 2001 From: Xuan Thang Nguyen Date: Tue, 16 Feb 2021 15:22:22 +0100 Subject: [PATCH 022/258] comments --- GPUSort/bitonicGPU/bitonicSort.h | 47 +++++++++++++++++++++----------- 1 file changed, 31 insertions(+), 16 deletions(-) diff --git a/GPUSort/bitonicGPU/bitonicSort.h b/GPUSort/bitonicGPU/bitonicSort.h index 0a588fdfc..94ab70173 100644 --- a/GPUSort/bitonicGPU/bitonicSort.h +++ b/GPUSort/bitonicGPU/bitonicSort.h @@ -23,34 +23,41 @@ __host__ __device__ int closestPow2(int x) } //--------------------------------------------- - +/** + * this kernel simulates 1 exchange + */ __global__ void bitonicMergeGlobal(ArrayView arr, int begin, int end, bool sortAscending, int monotonicSeqLen, int len, int partsInSeq) { int i = blockIdx.x * blockDim.x + threadIdx.x; - int part = i / (len / 2); + int part = i / (len / 2); //computes which sorting block this thread belongs to + //the index of 2 elements that should be compared and swapped int s = begin + part * len + (i % (len / 2)); int e = s + len / 2; - if (e >= end) + if (e >= end) //arr[e] is virtual padding and will not be exchanged with return; //calculate the direction of swapping int monotonicSeqIdx = part / partsInSeq; bool ascending = (monotonicSeqIdx % 2) == 0 ? 
!sortAscending : sortAscending; - - //special case for parts with no "partner" - if ((monotonicSeqIdx + 1) * monotonicSeqLen >= end) + if ((monotonicSeqIdx + 1) * monotonicSeqLen >= end) //special case for part with no "partner" to be merged with in next phase ascending = sortAscending; + //cmp and swap auto &a = arr[s]; auto &b = arr[e]; if ((ascending && a > b) || (!ascending && a < b)) TNL::swap(a, b); } +//--------------------------------------------- +/** + * kernel for merging if whole block fits into shared memory + * will merge all the way down til stride == 2 + * */ __global__ void bitonicMergeSharedMemory(ArrayView arr, int begin, int end, bool sortAscending, int monotonicSeqLen, int len, int partsInSeq) @@ -58,12 +65,13 @@ __global__ void bitonicMergeSharedMemory(ArrayView arr, extern __shared__ int sharedMem[]; int sharedMemLen = 2*blockDim.x; + //1st index and last index of subarray that this threadBlock should merge int myBlockStart = begin + blockIdx.x * sharedMemLen; int myBlockEnd = end < myBlockStart+sharedMemLen? 
end : myBlockStart+sharedMemLen; + //copy from globalMem into sharedMem int copy1 = myBlockStart + threadIdx.x; int copy2 = copy1 + blockDim.x; - //copy from globalMem into sharedMem { if(copy1 < end) sharedMem[threadIdx.x] = arr[copy1]; @@ -87,18 +95,19 @@ __global__ void bitonicMergeSharedMemory(ArrayView arr, ascending = sortAscending; //------------------------------------------ - //do bitonic sort + //do bitonic merge for (int len = monotonicSeqLen, partsInSeq = 1; len > 1; len /= 2, partsInSeq *= 2) { __syncthreads(); + //calculates which 2 indexes will be compared and swap int part = threadIdx.x / (len / 2); int s = part * len + (threadIdx.x % (len / 2)); int e = s + len / 2; - if(e >= myBlockEnd - myBlockStart) + if(e >= myBlockEnd - myBlockStart) //touching virtual padding continue; - //swap + //cmp and swap int a = sharedMem[s], b = sharedMem[e]; if ((ascending && a > b) || (!ascending && a < b)) { @@ -123,6 +132,13 @@ __global__ void bitonicMergeSharedMemory(ArrayView arr, } //--------------------------------------------- +/** + * very similar to bitonicMergeSharedMemory + * does bitonicMergeSharedMemory but afterwards increases monotoncSeqLen + * then trickles down again + * this continues until whole sharedMem is sorted + * */ + __global__ void bitoniSort1stStepSharedMemory(ArrayView arr, int begin, int end, bool sortAscending) { extern __shared__ int sharedMem[]; @@ -131,9 +147,9 @@ __global__ void bitoniSort1stStepSharedMemory(ArrayView arr, int be int myBlockStart = begin + blockIdx.x * sharedMemLen; int myBlockEnd = end < myBlockStart+sharedMemLen? 
end : myBlockStart+sharedMemLen; + //copy from globalMem into sharedMem int copy1 = myBlockStart + threadIdx.x; int copy2 = copy1 + blockDim.x; - //copy from globalMem into sharedMem { if(copy1 < end) sharedMem[threadIdx.x] = arr[copy1]; @@ -155,22 +171,21 @@ __global__ void bitoniSort1stStepSharedMemory(ArrayView arr, int be //calculate the direction of swapping int monotonicSeqIdx = i / (monotonicSeqLen/2); bool ascending = (monotonicSeqIdx % 2) == 0 ? !sortAscending : sortAscending; - - //special case for parts with no "partner" - if ((monotonicSeqIdx + 1) * monotonicSeqLen >= end) + if ((monotonicSeqIdx + 1) * monotonicSeqLen >= end) //special case for parts with no "partner" ascending = sortAscending; for (int len = monotonicSeqLen, partsInSeq = 1; len > 1; len /= 2, partsInSeq *= 2) { __syncthreads(); + //calculates which 2 indexes will be compared and swap int part = threadIdx.x / (len / 2); int s = part * len + (threadIdx.x % (len / 2)); int e = s + len / 2; - if(e >= myBlockEnd - myBlockStart) + if(e >= myBlockEnd - myBlockStart) //touching virtual padding continue; - //swap + //cmp and swap int a = sharedMem[s], b = sharedMem[e]; if ((ascending && a > b) || (!ascending && a < b)) { -- GitLab From 4c28b3322ecc5379f7b54c42e96ce63e838eedec Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Tue, 16 Feb 2021 16:38:36 +0100 Subject: [PATCH 023/258] template value type --- GPUSort/bitonicGPU/bitonicSort.h | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/GPUSort/bitonicGPU/bitonicSort.h b/GPUSort/bitonicGPU/bitonicSort.h index 94ab70173..18661e19f 100644 --- a/GPUSort/bitonicGPU/bitonicSort.h +++ b/GPUSort/bitonicGPU/bitonicSort.h @@ -26,7 +26,8 @@ __host__ __device__ int closestPow2(int x) /** * this kernel simulates 1 exchange */ -__global__ void bitonicMergeGlobal(ArrayView arr, +template +__global__ void bitonicMergeGlobal(ArrayView arr, int begin, int end, bool sortAscending, int monotonicSeqLen, int len, 
int partsInSeq) { @@ -58,11 +59,12 @@ __global__ void bitonicMergeGlobal(ArrayView arr, * kernel for merging if whole block fits into shared memory * will merge all the way down til stride == 2 * */ -__global__ void bitonicMergeSharedMemory(ArrayView arr, +template +__global__ void bitonicMergeSharedMemory(ArrayView arr, int begin, int end, bool sortAscending, int monotonicSeqLen, int len, int partsInSeq) { - extern __shared__ int sharedMem[]; + extern __shared__ Value sharedMem[]; int sharedMemLen = 2*blockDim.x; //1st index and last index of subarray that this threadBlock should merge @@ -108,7 +110,7 @@ __global__ void bitonicMergeSharedMemory(ArrayView arr, continue; //cmp and swap - int a = sharedMem[s], b = sharedMem[e]; + Value a = sharedMem[s], b = sharedMem[e]; if ((ascending && a > b) || (!ascending && a < b)) { sharedMem[s] = b; @@ -138,10 +140,10 @@ __global__ void bitonicMergeSharedMemory(ArrayView arr, * then trickles down again * this continues until whole sharedMem is sorted * */ - -__global__ void bitoniSort1stStepSharedMemory(ArrayView arr, int begin, int end, bool sortAscending) +template +__global__ void bitoniSort1stStepSharedMemory(ArrayView arr, int begin, int end, bool sortAscending) { - extern __shared__ int sharedMem[]; + extern __shared__ Value sharedMem[]; int sharedMemLen = 2*blockDim.x; int myBlockStart = begin + blockIdx.x * sharedMemLen; @@ -186,7 +188,7 @@ __global__ void bitoniSort1stStepSharedMemory(ArrayView arr, int be continue; //cmp and swap - int a = sharedMem[s], b = sharedMem[e]; + Value a = sharedMem[s], b = sharedMem[e]; if ((ascending && a > b) || (!ascending && a < b)) { sharedMem[s] = b; @@ -211,8 +213,8 @@ __global__ void bitoniSort1stStepSharedMemory(ArrayView arr, int be //--------------------------------------------- - -void bitonicSort(ArrayView arr, int begin, int end, bool sortAscending) +template +void bitonicSort(ArrayView arr, int begin, int end, bool sortAscending) { int arrSize = end - begin; int paddedSize 
= closestPow2(arrSize); @@ -224,7 +226,7 @@ void bitonicSort(ArrayView arr, int begin, int end, bool sortAscend int blocks = threadsNeeded / threadPerBlock + (threadsNeeded % threadPerBlock == 0 ? 0 : 1); const int sharedMemLen = threadPerBlock * 2; - const int sharedMemSize = sharedMemLen* sizeof(int); + const int sharedMemSize = sharedMemLen* sizeof(Value); //--------------------------------------------------------------------------------- @@ -253,8 +255,8 @@ void bitonicSort(ArrayView arr, int begin, int end, bool sortAscend } //--------------------------------------------- - -void bitonicSort(ArrayView arr, bool sortAscending = true) +template +void bitonicSort(ArrayView arr, bool sortAscending = true) { bitonicSort(arr, 0, arr.getSize(), sortAscending); } -- GitLab From 0b15566d6e27945851f606f7756101e0ab056d8e Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Tue, 16 Feb 2021 16:52:45 +0100 Subject: [PATCH 024/258] casting shared memory --- GPUSort/bitonicGPU/bitonicSort.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/GPUSort/bitonicGPU/bitonicSort.h b/GPUSort/bitonicGPU/bitonicSort.h index 18661e19f..06db632a5 100644 --- a/GPUSort/bitonicGPU/bitonicSort.h +++ b/GPUSort/bitonicGPU/bitonicSort.h @@ -64,7 +64,9 @@ __global__ void bitonicMergeSharedMemory(ArrayView arr, int begin, int end, bool sortAscending, int monotonicSeqLen, int len, int partsInSeq) { - extern __shared__ Value sharedMem[]; + extern __shared__ int externMem[]; + Value * sharedMem = (Value *)externMem; + int sharedMemLen = 2*blockDim.x; //1st index and last index of subarray that this threadBlock should merge @@ -143,7 +145,9 @@ __global__ void bitonicMergeSharedMemory(ArrayView arr, template __global__ void bitoniSort1stStepSharedMemory(ArrayView arr, int begin, int end, bool sortAscending) { - extern __shared__ Value sharedMem[]; + extern __shared__ int externMem[]; + + Value * sharedMem = (Value *)externMem; int sharedMemLen = 2*blockDim.x; int myBlockStart 
= begin + blockIdx.x * sharedMemLen; -- GitLab From f921fa440d5516541e433bce8a0de74f8e94bfcf Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Tue, 16 Feb 2021 17:05:28 +0100 Subject: [PATCH 025/258] testing sorting of structs --- GPUSort/bitonicGPU/unitTests/unitTests.cu | 46 ++++++++++++++++++++++- 1 file changed, 44 insertions(+), 2 deletions(-) diff --git a/GPUSort/bitonicGPU/unitTests/unitTests.cu b/GPUSort/bitonicGPU/unitTests/unitTests.cu index ee967f2a6..1c1cba0f7 100644 --- a/GPUSort/bitonicGPU/unitTests/unitTests.cu +++ b/GPUSort/bitonicGPU/unitTests/unitTests.cu @@ -9,8 +9,8 @@ #include "../bitonicSort.h" //---------------------------------------------------------------------------------- - -bool is_sorted(TNL::Containers::ArrayView arr) +template +bool is_sorted(TNL::Containers::ArrayView arr) { for (int i = 1; i < arr.getSize(); i++) if (arr.getElement(i - 1) > arr.getElement(i)) @@ -135,6 +135,48 @@ TEST(randomGenerated, bigArray_all0) } } +TEST(nonIntegerType, float_notPow2) +{ + TNL::Containers::Array cudaArr{5.0, 9.4, 4.6, 8.9, 6.2, 1.15184, 2.23}; + auto view = cudaArr.getView(); + bitonicSort(view); + ASSERT_TRUE(is_sorted(view)) << "result " << view << std::endl; +} + +TEST(nonIntegerType, double_notPow2) +{ + TNL::Containers::Array cudaArr{5.0, 9.4, 4.6, 8.9, 6.2, 1.15184, 2.23}; + auto view = cudaArr.getView(); + bitonicSort(view); + ASSERT_TRUE(is_sorted(view)) << "result " << view << std::endl; +} + +/* +struct TMPSTRUCT{ + uint8_t m_data[6]; + TMPSTRUCT(){m_data[0] = 0;} + TMPSTRUCT(int first){m_data[0] = first;}; + bool operator <(const TMPSTRUCT& other) const { return m_data[0] < other.m_data[0];} + + bool operator ==(const TMPSTRUCT& other) const {return !(*this < other) && !(other < *this); } + + bool operator >=(const TMPSTRUCT& other) const {return !(*this < other); } + bool operator >(const TMPSTRUCT& other) const {return !(*this <= other); } + bool operator <=(const TMPSTRUCT& other) const {return (*this < other) || (other == 
*this); } + + std::ostream& operator << (std::ostream & out) { return out << "{ " << m_data[0] << " }";} +}; + +TEST(nonIntegerType, struct) +{ + + TNL::Containers::Array cudaArr{TMPSTRUCT(5), TMPSTRUCT(6), TMPSTRUCT(9), TMPSTRUCT(1)}; + auto view = cudaArr.getView(); + bitonicSort(view); + ASSERT_TRUE(is_sorted(view)) << "result " << view << std::endl; +} +*/ + //---------------------------------------------------------------------------------- int main(int argc, char **argv) -- GitLab From 8b807785a7cc77337ce56740bfeebd7e040d04ed Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Tue, 16 Feb 2021 17:22:18 +0100 Subject: [PATCH 026/258] refactor cmp and swap --- GPUSort/bitonicGPU/bitonicSort.h | 28 ++++++++++------------------ 1 file changed, 10 insertions(+), 18 deletions(-) diff --git a/GPUSort/bitonicGPU/bitonicSort.h b/GPUSort/bitonicGPU/bitonicSort.h index 06db632a5..4ddbe5d5f 100644 --- a/GPUSort/bitonicGPU/bitonicSort.h +++ b/GPUSort/bitonicGPU/bitonicSort.h @@ -22,6 +22,13 @@ __host__ __device__ int closestPow2(int x) return ret; } +template +__host__ __device__ void cmpSwap(Value & a, Value &b, bool ascending) +{ + if ((ascending && a > b) || (!ascending && a < b)) + TNL::swap(a, b); +} + //--------------------------------------------- /** * this kernel simulates 1 exchange @@ -48,10 +55,7 @@ __global__ void bitonicMergeGlobal(ArrayView arr, ascending = sortAscending; //cmp and swap - auto &a = arr[s]; - auto &b = arr[e]; - if ((ascending && a > b) || (!ascending && a < b)) - TNL::swap(a, b); + cmpSwap(arr[s], arr[e], ascending); } //--------------------------------------------- @@ -111,13 +115,7 @@ __global__ void bitonicMergeSharedMemory(ArrayView arr, if(e >= myBlockEnd - myBlockStart) //touching virtual padding continue; - //cmp and swap - Value a = sharedMem[s], b = sharedMem[e]; - if ((ascending && a > b) || (!ascending && a < b)) - { - sharedMem[s] = b; - sharedMem[e] = a; - } + cmpSwap(sharedMem[s], sharedMem[e], ascending); } 
__syncthreads(); @@ -191,13 +189,7 @@ __global__ void bitoniSort1stStepSharedMemory(ArrayView arr, int if(e >= myBlockEnd - myBlockStart) //touching virtual padding continue; - //cmp and swap - Value a = sharedMem[s], b = sharedMem[e]; - if ((ascending && a > b) || (!ascending && a < b)) - { - sharedMem[s] = b; - sharedMem[e] = a; - } + cmpSwap(sharedMem[s], sharedMem[e], ascending); } } -- GitLab From eb9b045e738ed1d0a15813eb39e56fe5a26f4efa Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Tue, 16 Feb 2021 17:46:36 +0100 Subject: [PATCH 027/258] allow to pass cmp function --- GPUSort/bitonicGPU/bitonicSort.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/GPUSort/bitonicGPU/bitonicSort.h b/GPUSort/bitonicGPU/bitonicSort.h index 4ddbe5d5f..471077855 100644 --- a/GPUSort/bitonicGPU/bitonicSort.h +++ b/GPUSort/bitonicGPU/bitonicSort.h @@ -251,8 +251,14 @@ void bitonicSort(ArrayView arr, int begin, int end, bool sortAsce } //--------------------------------------------- +template +void bitonicSort(ArrayView arr, const Function & cmp) +{ + bitonicSort(arr, 0, arr.getSize(), true); +} + template -void bitonicSort(ArrayView arr, bool sortAscending = true) +void bitonicSort(ArrayView arr) { - bitonicSort(arr, 0, arr.getSize(), sortAscending); + bitonicSort(arr, [] __cuda_callable__ (const Value & a, const Value & b) {return a < b;}); } -- GitLab From a5adc83fd9872cd1c14289924dfb2df47bef6b10 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Tue, 16 Feb 2021 18:27:15 +0100 Subject: [PATCH 028/258] function comparator --- GPUSort/bitonicGPU/bitonicSort.h | 52 ++++++++++++++++---------------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/GPUSort/bitonicGPU/bitonicSort.h b/GPUSort/bitonicGPU/bitonicSort.h index 471077855..d8167a810 100644 --- a/GPUSort/bitonicGPU/bitonicSort.h +++ b/GPUSort/bitonicGPU/bitonicSort.h @@ -22,20 +22,20 @@ __host__ __device__ int closestPow2(int x) return ret; } -template -__host__ __device__ 
void cmpSwap(Value & a, Value &b, bool ascending) +template +__host__ __device__ void cmpSwap(Value & a, Value &b, bool ascending, const Function & Cmp) { - if ((ascending && a > b) || (!ascending && a < b)) + if( (ascending && Cmp(b, a)) + || (!ascending && Cmp(a, b)) ) TNL::swap(a, b); } - //--------------------------------------------- /** * this kernel simulates 1 exchange */ -template +template __global__ void bitonicMergeGlobal(ArrayView arr, - int begin, int end, bool sortAscending, + int begin, int end, const Function & Cmp, int monotonicSeqLen, int len, int partsInSeq) { int i = blockIdx.x * blockDim.x + threadIdx.x; @@ -50,12 +50,11 @@ __global__ void bitonicMergeGlobal(ArrayView arr, //calculate the direction of swapping int monotonicSeqIdx = part / partsInSeq; - bool ascending = (monotonicSeqIdx % 2) == 0 ? !sortAscending : sortAscending; + bool ascending = (monotonicSeqIdx % 2) != 0; if ((monotonicSeqIdx + 1) * monotonicSeqLen >= end) //special case for part with no "partner" to be merged with in next phase - ascending = sortAscending; + ascending = true; - //cmp and swap - cmpSwap(arr[s], arr[e], ascending); + cmpSwap(arr[s], arr[e], ascending, Cmp); } //--------------------------------------------- @@ -63,9 +62,9 @@ __global__ void bitonicMergeGlobal(ArrayView arr, * kernel for merging if whole block fits into shared memory * will merge all the way down til stride == 2 * */ -template +template __global__ void bitonicMergeSharedMemory(ArrayView arr, - int begin, int end, bool sortAscending, + int begin, int end, const Function & Cmp, int monotonicSeqLen, int len, int partsInSeq) { extern __shared__ int externMem[]; @@ -97,10 +96,10 @@ __global__ void bitonicMergeSharedMemory(ArrayView arr, int part = i / (len / 2); int monotonicSeqIdx = part / partsInSeq; - bool ascending = (monotonicSeqIdx % 2) == 0 ? 
!sortAscending : sortAscending; + bool ascending = (monotonicSeqIdx % 2) != 0; //special case for parts with no "partner" if ((monotonicSeqIdx + 1) * monotonicSeqLen >= end) - ascending = sortAscending; + ascending = true; //------------------------------------------ //do bitonic merge @@ -115,7 +114,7 @@ __global__ void bitonicMergeSharedMemory(ArrayView arr, if(e >= myBlockEnd - myBlockStart) //touching virtual padding continue; - cmpSwap(sharedMem[s], sharedMem[e], ascending); + cmpSwap(sharedMem[s], sharedMem[e], ascending, Cmp); } __syncthreads(); @@ -140,8 +139,8 @@ __global__ void bitonicMergeSharedMemory(ArrayView arr, * then trickles down again * this continues until whole sharedMem is sorted * */ -template -__global__ void bitoniSort1stStepSharedMemory(ArrayView arr, int begin, int end, bool sortAscending) +template +__global__ void bitoniSort1stStepSharedMemory(ArrayView arr, int begin, int end, const Function & Cmp) { extern __shared__ int externMem[]; @@ -174,9 +173,9 @@ __global__ void bitoniSort1stStepSharedMemory(ArrayView arr, int { //calculate the direction of swapping int monotonicSeqIdx = i / (monotonicSeqLen/2); - bool ascending = (monotonicSeqIdx % 2) == 0 ? 
!sortAscending : sortAscending; + bool ascending = (monotonicSeqIdx % 2) != 0; if ((monotonicSeqIdx + 1) * monotonicSeqLen >= end) //special case for parts with no "partner" - ascending = sortAscending; + ascending = true; for (int len = monotonicSeqLen, partsInSeq = 1; len > 1; len /= 2, partsInSeq *= 2) { @@ -189,7 +188,8 @@ __global__ void bitoniSort1stStepSharedMemory(ArrayView arr, int if(e >= myBlockEnd - myBlockStart) //touching virtual padding continue; - cmpSwap(sharedMem[s], sharedMem[e], ascending); + cmpSwap(sharedMem[s], sharedMem[e], ascending, Cmp); + } } @@ -209,8 +209,8 @@ __global__ void bitoniSort1stStepSharedMemory(ArrayView arr, int //--------------------------------------------- -template -void bitonicSort(ArrayView arr, int begin, int end, bool sortAscending) +template +void bitonicSort(ArrayView arr, int begin, int end, const Function& Cmp) { int arrSize = end - begin; int paddedSize = closestPow2(arrSize); @@ -227,7 +227,7 @@ void bitonicSort(ArrayView arr, int begin, int end, bool sortAsce //--------------------------------------------------------------------------------- - bitoniSort1stStepSharedMemory<<>>(arr, begin, end, sortAscending); + bitoniSort1stStepSharedMemory<<>>(arr, begin, end, Cmp); for (int monotonicSeqLen = 2*sharedMemLen; monotonicSeqLen <= paddedSize; monotonicSeqLen *= 2) { @@ -235,13 +235,13 @@ void bitonicSort(ArrayView arr, int begin, int end, bool sortAsce { if(len > sharedMemLen) { - bitonicMergeGlobal<<>>(arr, begin, end, sortAscending, + bitonicMergeGlobal<<>>(arr, begin, end, Cmp, monotonicSeqLen, len, partsInSeq); } else { - bitonicMergeSharedMemory<<>>(arr, begin, end, sortAscending, + bitonicMergeSharedMemory<<>>(arr, begin, end, Cmp, monotonicSeqLen, len, partsInSeq); break; } @@ -254,7 +254,7 @@ void bitonicSort(ArrayView arr, int begin, int end, bool sortAsce template void bitonicSort(ArrayView arr, const Function & cmp) { - bitonicSort(arr, 0, arr.getSize(), true); + bitonicSort(arr, 0, arr.getSize(), 
cmp); } template -- GitLab From 558d885d7266465dc2dd405359ccc67d720bcc01 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Tue, 16 Feb 2021 18:48:34 +0100 Subject: [PATCH 029/258] descending sort --- GPUSort/bitonicGPU/unitTests/unitTests.cu | 24 +++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/GPUSort/bitonicGPU/unitTests/unitTests.cu b/GPUSort/bitonicGPU/unitTests/unitTests.cu index 1c1cba0f7..af570f17f 100644 --- a/GPUSort/bitonicGPU/unitTests/unitTests.cu +++ b/GPUSort/bitonicGPU/unitTests/unitTests.cu @@ -177,6 +177,30 @@ TEST(nonIntegerType, struct) } */ +//error bypassing +//https://mmg-gitlab.fjfi.cvut.cz/gitlab/tnl/tnl-dev/blob/fbc34f6a97c13ec865ef7969b9704533222ed408/src/UnitTests/Containers/VectorTest-8.h +void descendingSort(ArrayView view) +{ + auto cmpDescending = [] __cuda_callable__ (int a, int b) {return a > b;}; + bitonicSort(view, cmpDescending); +} + +TEST(sortWithFunction, descending) +{ + TNL::Containers::Array cudaArr{6, 9, 4, 2, 3}; + auto view = cudaArr.getView(); + descendingSort(view); + + ASSERT_FALSE(is_sorted(view)) << "result " << view << std::endl; + + ASSERT_TRUE(view.getElement(0) == 9); + ASSERT_TRUE(view.getElement(1) == 6); + ASSERT_TRUE(view.getElement(2) == 4); + ASSERT_TRUE(view.getElement(3) == 3); + ASSERT_TRUE(view.getElement(4) == 2); +} + + //---------------------------------------------------------------------------------- int main(int argc, char **argv) -- GitLab From f8abd466d6785878e1b1213a9a91e404bfaf1746 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Wed, 17 Feb 2021 14:23:08 +0100 Subject: [PATCH 030/258] change slow %2 to faster &1 --- GPUSort/bitonicGPU/bitonicSort.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/GPUSort/bitonicGPU/bitonicSort.h b/GPUSort/bitonicGPU/bitonicSort.h index d8167a810..370d408a6 100644 --- a/GPUSort/bitonicGPU/bitonicSort.h +++ b/GPUSort/bitonicGPU/bitonicSort.h @@ -50,7 +50,7 @@ __global__ void bitonicMergeGlobal(ArrayView 
arr, //calculate the direction of swapping int monotonicSeqIdx = part / partsInSeq; - bool ascending = (monotonicSeqIdx % 2) != 0; + bool ascending = (monotonicSeqIdx & 1) != 0; if ((monotonicSeqIdx + 1) * monotonicSeqLen >= end) //special case for part with no "partner" to be merged with in next phase ascending = true; @@ -96,7 +96,7 @@ __global__ void bitonicMergeSharedMemory(ArrayView arr, int part = i / (len / 2); int monotonicSeqIdx = part / partsInSeq; - bool ascending = (monotonicSeqIdx % 2) != 0; + bool ascending = (monotonicSeqIdx & 1) != 0; //special case for parts with no "partner" if ((monotonicSeqIdx + 1) * monotonicSeqLen >= end) ascending = true; @@ -173,7 +173,7 @@ __global__ void bitoniSort1stStepSharedMemory(ArrayView arr, int { //calculate the direction of swapping int monotonicSeqIdx = i / (monotonicSeqLen/2); - bool ascending = (monotonicSeqIdx % 2) != 0; + bool ascending = (monotonicSeqIdx & 1) != 0; if ((monotonicSeqIdx + 1) * monotonicSeqLen >= end) //special case for parts with no "partner" ascending = true; -- GitLab From b9aee18500770c75bd8e72c6e682da17bdb528e0 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Wed, 17 Feb 2021 14:29:22 +0100 Subject: [PATCH 031/258] remove all modulo operations by faster bitwise & operation --- GPUSort/bitonicGPU/bitonicSort.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/GPUSort/bitonicGPU/bitonicSort.h b/GPUSort/bitonicGPU/bitonicSort.h index 370d408a6..376923d63 100644 --- a/GPUSort/bitonicGPU/bitonicSort.h +++ b/GPUSort/bitonicGPU/bitonicSort.h @@ -43,7 +43,7 @@ __global__ void bitonicMergeGlobal(ArrayView arr, int part = i / (len / 2); //computes which sorting block this thread belongs to //the index of 2 elements that should be compared and swapped - int s = begin + part * len + (i % (len / 2)); + int s = begin + part * len + (i & ((len / 2) - 1) ); int e = s + len / 2; if (e >= end) //arr[e] is virtual padding and will not be exchanged with return; @@ -109,7 +109,7 @@ 
__global__ void bitonicMergeSharedMemory(ArrayView arr, //calculates which 2 indexes will be compared and swap int part = threadIdx.x / (len / 2); - int s = part * len + (threadIdx.x % (len / 2)); + int s = part * len + (threadIdx.x & ((len /2) - 1)); int e = s + len / 2; if(e >= myBlockEnd - myBlockStart) //touching virtual padding continue; @@ -183,7 +183,7 @@ __global__ void bitoniSort1stStepSharedMemory(ArrayView arr, int //calculates which 2 indexes will be compared and swap int part = threadIdx.x / (len / 2); - int s = part * len + (threadIdx.x % (len / 2)); + int s = part * len + (threadIdx.x & ((len / 2) - 1)); int e = s + len / 2; if(e >= myBlockEnd - myBlockStart) //touching virtual padding continue; -- GitLab From 80416c3992425d783c6a8f16b05c0746b3ae8673 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Wed, 17 Feb 2021 14:34:22 +0100 Subject: [PATCH 032/258] fast compare to prevent divergent threads --- GPUSort/bitonicGPU/bitonicSort.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/GPUSort/bitonicGPU/bitonicSort.h b/GPUSort/bitonicGPU/bitonicSort.h index 376923d63..e25066573 100644 --- a/GPUSort/bitonicGPU/bitonicSort.h +++ b/GPUSort/bitonicGPU/bitonicSort.h @@ -25,8 +25,7 @@ __host__ __device__ int closestPow2(int x) template __host__ __device__ void cmpSwap(Value & a, Value &b, bool ascending, const Function & Cmp) { - if( (ascending && Cmp(b, a)) - || (!ascending && Cmp(a, b)) ) + if( (ascending == Cmp(b, a))) TNL::swap(a, b); } //--------------------------------------------- -- GitLab From 9740c44437f5b157299b49107023c5c264c3b9d6 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Wed, 17 Feb 2021 22:27:20 +0100 Subject: [PATCH 033/258] gitignore --- GPUSort/.gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/GPUSort/.gitignore b/GPUSort/.gitignore index 722d5e71d..58e73e001 100644 --- a/GPUSort/.gitignore +++ b/GPUSort/.gitignore @@ -1 +1,2 @@ .vscode +backup \ No newline at end of file -- GitLab From 
41fd9aabe294569b1638ccf252f85f7d78ec32cb Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Thu, 18 Feb 2021 00:03:34 +0100 Subject: [PATCH 034/258] interface for soring std::vector --- GPUSort/bitonicGPU/bitonicSort.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/GPUSort/bitonicGPU/bitonicSort.h b/GPUSort/bitonicGPU/bitonicSort.h index e25066573..8edad9c4e 100644 --- a/GPUSort/bitonicGPU/bitonicSort.h +++ b/GPUSort/bitonicGPU/bitonicSort.h @@ -250,6 +250,7 @@ void bitonicSort(ArrayView arr, int begin, int end, const Functio } //--------------------------------------------- + template void bitonicSort(ArrayView arr, const Function & cmp) { @@ -261,3 +262,21 @@ void bitonicSort(ArrayView arr) { bitonicSort(arr, [] __cuda_callable__ (const Value & a, const Value & b) {return a < b;}); } + +//--------------------------------------------- + +template +void bitonicSort(std::vector vec) +{ + TNL::Containers::Array Arr(vec); + bitonicSort(Arr.getView()); +} + +template +void bitonicSort(std::vector vec,const Function & cmp) +{ + TNL::Containers::Array Arr(vec); + bitonicSort(Arr.getView(), cmp); +} + +//--------------------------------------------- \ No newline at end of file -- GitLab From 7857241b21368635676288592bf03e7920a2ba89 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Thu, 18 Feb 2021 00:10:17 +0100 Subject: [PATCH 035/258] removing namespace pollution --- GPUSort/bitonicGPU/bitonicSort.h | 24 +++++++++--------------- 1 file changed, 9 insertions(+), 15 deletions(-) diff --git a/GPUSort/bitonicGPU/bitonicSort.h b/GPUSort/bitonicGPU/bitonicSort.h index 8edad9c4e..b31f7a271 100644 --- a/GPUSort/bitonicGPU/bitonicSort.h +++ b/GPUSort/bitonicGPU/bitonicSort.h @@ -1,13 +1,6 @@ #include #include -using namespace TNL; -using namespace TNL::Containers; - -typedef Devices::Cuda Device; - -#define deb(x) std::cout << #x << " = " << x << std::endl; - //--------------------------------------------- __host__ __device__ int closestPow2(int 
x) @@ -33,7 +26,7 @@ __host__ __device__ void cmpSwap(Value & a, Value &b, bool ascending, const Func * this kernel simulates 1 exchange */ template -__global__ void bitonicMergeGlobal(ArrayView arr, +__global__ void bitonicMergeGlobal(TNL::Containers::ArrayView arr, int begin, int end, const Function & Cmp, int monotonicSeqLen, int len, int partsInSeq) { @@ -62,7 +55,7 @@ __global__ void bitonicMergeGlobal(ArrayView arr, * will merge all the way down til stride == 2 * */ template -__global__ void bitonicMergeSharedMemory(ArrayView arr, +__global__ void bitonicMergeSharedMemory(TNL::Containers::ArrayView arr, int begin, int end, const Function & Cmp, int monotonicSeqLen, int len, int partsInSeq) { @@ -139,7 +132,8 @@ __global__ void bitonicMergeSharedMemory(ArrayView arr, * this continues until whole sharedMem is sorted * */ template -__global__ void bitoniSort1stStepSharedMemory(ArrayView arr, int begin, int end, const Function & Cmp) +__global__ void bitoniSort1stStepSharedMemory(TNL::Containers::ArrayView arr, + int begin, int end, const Function & Cmp) { extern __shared__ int externMem[]; @@ -209,7 +203,7 @@ __global__ void bitoniSort1stStepSharedMemory(ArrayView arr, int //--------------------------------------------- template -void bitonicSort(ArrayView arr, int begin, int end, const Function& Cmp) +void bitonicSort(TNL::Containers::ArrayView arr, int begin, int end, const Function& Cmp) { int arrSize = end - begin; int paddedSize = closestPow2(arrSize); @@ -252,13 +246,13 @@ void bitonicSort(ArrayView arr, int begin, int end, const Functio //--------------------------------------------- template -void bitonicSort(ArrayView arr, const Function & cmp) +void bitonicSort(TNL::Containers::ArrayView arr, const Function & cmp) { bitonicSort(arr, 0, arr.getSize(), cmp); } template -void bitonicSort(ArrayView arr) +void bitonicSort(TNL::Containers::ArrayView arr) { bitonicSort(arr, [] __cuda_callable__ (const Value & a, const Value & b) {return a < b;}); } @@ -268,14 
+262,14 @@ void bitonicSort(ArrayView arr) template void bitonicSort(std::vector vec) { - TNL::Containers::Array Arr(vec); + TNL::Containers::Array Arr(vec); bitonicSort(Arr.getView()); } template void bitonicSort(std::vector vec,const Function & cmp) { - TNL::Containers::Array Arr(vec); + TNL::Containers::Array Arr(vec); bitonicSort(Arr.getView(), cmp); } -- GitLab From dbe7c235c8d580db18098c1554ff5e05e37d65e2 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Thu, 18 Feb 2021 00:16:13 +0100 Subject: [PATCH 036/258] add copyback after sorting for sorting std::vector --- GPUSort/bitonicGPU/bitonicSort.h | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/GPUSort/bitonicGPU/bitonicSort.h b/GPUSort/bitonicGPU/bitonicSort.h index b31f7a271..140f7b715 100644 --- a/GPUSort/bitonicGPU/bitonicSort.h +++ b/GPUSort/bitonicGPU/bitonicSort.h @@ -1,5 +1,4 @@ #include -#include //--------------------------------------------- @@ -259,18 +258,21 @@ void bitonicSort(TNL::Containers::ArrayView arr) //--------------------------------------------- -template -void bitonicSort(std::vector vec) +template +void bitonicSort(std::vector & vec,const Function & cmp) { TNL::Containers::Array Arr(vec); - bitonicSort(Arr.getView()); + auto view = Arr.getView(); + bitonicSort(view, cmp); + + for(size_t i = 0; i < vec.size(); ++i) + vec[i] = view.getElement(i); } -template -void bitonicSort(std::vector vec,const Function & cmp) +template +void bitonicSort(std::vector & vec) { - TNL::Containers::Array Arr(vec); - bitonicSort(Arr.getView(), cmp); + bitonicSort(vec, [] __cuda_callable__ (const Value & a, const Value & b) {return a < b;}); } //--------------------------------------------- \ No newline at end of file -- GitLab From e37e4780a6834bdbb6551c40260fa5392a3b9fb4 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Thu, 18 Feb 2021 00:20:38 +0100 Subject: [PATCH 037/258] sort vector test and fix namespace --- GPUSort/bitonicGPU/main.cu | 22 
++++++++++---- GPUSort/bitonicGPU/unitTests/unitTests.cu | 36 +++++++++++++++-------- 2 files changed, 40 insertions(+), 18 deletions(-) diff --git a/GPUSort/bitonicGPU/main.cu b/GPUSort/bitonicGPU/main.cu index 71e614925..30903f97c 100644 --- a/GPUSort/bitonicGPU/main.cu +++ b/GPUSort/bitonicGPU/main.cu @@ -15,16 +15,26 @@ std::ostream& operator<< (std::ostream&out, std::vector &arr) int main( int argc, char* argv[] ) { - TNL::Containers::Array Arr(argc - 1); - for(int i = 1; i < argc; i++) - Arr.setElement(i-1, std::atoi(argv[i])); + if(argc <= 1) + { + std::cout << "missing argument: N=array size to be tested on" << std::endl; + return 1; + } + + std::vector a(std::atoi(argv[1])); + for(int i = 0; i < a.size(); i++) + a[i] = std::rand() % a.size(); + + + TNL::Containers::Array Arr(a); auto view = Arr.getView(); - std::cout << "unsorted: " << view << std::endl; - bitonicSort(view); + + //std::cout << "unsorted: " << view << std::endl; + bitonicSort(a); - std::cout << "sorted: " << view << std::endl; + //std::cout << "sorted: " << view << std::endl; return 0; } \ No newline at end of file diff --git a/GPUSort/bitonicGPU/unitTests/unitTests.cu b/GPUSort/bitonicGPU/unitTests/unitTests.cu index af570f17f..02506bbed 100644 --- a/GPUSort/bitonicGPU/unitTests/unitTests.cu +++ b/GPUSort/bitonicGPU/unitTests/unitTests.cu @@ -31,7 +31,7 @@ TEST(permutations, allPermutationSize_3_to_7) while (std::next_permutation(orig.begin(), orig.end())) { - TNL::Containers::Array cudaArr(orig); + TNL::Containers::Array cudaArr(orig); auto view = cudaArr.getView(); bitonicSort(view); @@ -55,7 +55,7 @@ TEST(permutations, somePermutationSize8) if ((i++) % stride != 0) continue; - TNL::Containers::Array cudaArr(orig); + TNL::Containers::Array cudaArr(orig); auto view = cudaArr.getView(); bitonicSort(view); @@ -78,7 +78,7 @@ TEST(permutations, somePermutationSize9) if ((i++) % stride != 0) continue; - TNL::Containers::Array cudaArr(orig); + TNL::Containers::Array cudaArr(orig); auto view = 
cudaArr.getView(); bitonicSort(view); @@ -89,7 +89,7 @@ TEST(permutations, somePermutationSize9) TEST(selectedSize, size15) { - TNL::Containers::Array cudaArr{5, 9, 4, 8, 6, 1, 2, 3, 4, 8, 1, 6, 9, 4, 9}; + TNL::Containers::Array cudaArr{5, 9, 4, 8, 6, 1, 2, 3, 4, 8, 1, 6, 9, 4, 9}; auto view = cudaArr.getView(); ASSERT_EQ(15, view.getSize()); bitonicSort(view); @@ -98,7 +98,7 @@ TEST(selectedSize, size15) TEST(multiblock, 32768_decreasingNegative) { - TNL::Containers::Array cudaArr(1 << 15); + TNL::Containers::Array cudaArr(1 << 15); for (int i = 0; i < cudaArr.getSize(); i++) cudaArr.setElement(i, -i); @@ -111,7 +111,7 @@ TEST(randomGenerated, smallArray_randomVal) { for(int i = 0; i < 100; i++) { - TNL::Containers::Array cudaArr(std::rand()%(1<<10)); + TNL::Containers::Array cudaArr(std::rand()%(1<<10)); for (int j = 0; j < cudaArr.getSize(); j++) cudaArr.setElement(j, std::rand()); @@ -127,7 +127,7 @@ TEST(randomGenerated, bigArray_all0) { int size = (1<<20) + (std::rand()% (1<<19)); - TNL::Containers::Array cudaArr(size); + TNL::Containers::Array cudaArr(size); auto view = cudaArr.getView(); bitonicSort(view); @@ -137,7 +137,7 @@ TEST(randomGenerated, bigArray_all0) TEST(nonIntegerType, float_notPow2) { - TNL::Containers::Array cudaArr{5.0, 9.4, 4.6, 8.9, 6.2, 1.15184, 2.23}; + TNL::Containers::Array cudaArr{5.0, 9.4, 4.6, 8.9, 6.2, 1.15184, 2.23}; auto view = cudaArr.getView(); bitonicSort(view); ASSERT_TRUE(is_sorted(view)) << "result " << view << std::endl; @@ -145,7 +145,7 @@ TEST(nonIntegerType, float_notPow2) TEST(nonIntegerType, double_notPow2) { - TNL::Containers::Array cudaArr{5.0, 9.4, 4.6, 8.9, 6.2, 1.15184, 2.23}; + TNL::Containers::Array cudaArr{5.0, 9.4, 4.6, 8.9, 6.2, 1.15184, 2.23}; auto view = cudaArr.getView(); bitonicSort(view); ASSERT_TRUE(is_sorted(view)) << "result " << view << std::endl; @@ -170,7 +170,7 @@ struct TMPSTRUCT{ TEST(nonIntegerType, struct) { - TNL::Containers::Array cudaArr{TMPSTRUCT(5), TMPSTRUCT(6), TMPSTRUCT(9), 
TMPSTRUCT(1)}; + TNL::Containers::Array cudaArr{TMPSTRUCT(5), TMPSTRUCT(6), TMPSTRUCT(9), TMPSTRUCT(1)}; auto view = cudaArr.getView(); bitonicSort(view); ASSERT_TRUE(is_sorted(view)) << "result " << view << std::endl; @@ -179,7 +179,7 @@ TEST(nonIntegerType, struct) //error bypassing //https://mmg-gitlab.fjfi.cvut.cz/gitlab/tnl/tnl-dev/blob/fbc34f6a97c13ec865ef7969b9704533222ed408/src/UnitTests/Containers/VectorTest-8.h -void descendingSort(ArrayView view) +void descendingSort(TNL::Containers::ArrayView view) { auto cmpDescending = [] __cuda_callable__ (int a, int b) {return a > b;}; bitonicSort(view, cmpDescending); @@ -187,7 +187,7 @@ void descendingSort(ArrayView view) TEST(sortWithFunction, descending) { - TNL::Containers::Array cudaArr{6, 9, 4, 2, 3}; + TNL::Containers::Array cudaArr{6, 9, 4, 2, 3}; auto view = cudaArr.getView(); descendingSort(view); @@ -200,6 +200,18 @@ TEST(sortWithFunction, descending) ASSERT_TRUE(view.getElement(4) == 2); } +TEST(sortstdVector, stdvector) +{ + std::vector arr(84561); + for(size_t i = 0; i < arr.size(); i++) + arr[i] = -i; + + bitonicSort(arr); + + ASSERT_TRUE(std::is_sorted(arr.begin(), arr.end())); +} + + //---------------------------------------------------------------------------------- -- GitLab From e5d9a1a10cf4337ea34420aa44842574036ed50e Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Thu, 18 Feb 2021 00:22:02 +0100 Subject: [PATCH 038/258] namespace fix for benchmark --- GPUSort/bitonicGPU/benchmark/benchmark.cu | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/GPUSort/bitonicGPU/benchmark/benchmark.cu b/GPUSort/bitonicGPU/benchmark/benchmark.cu index 6cb882b5e..21c5c2eb4 100644 --- a/GPUSort/bitonicGPU/benchmark/benchmark.cu +++ b/GPUSort/bitonicGPU/benchmark/benchmark.cu @@ -8,6 +8,11 @@ #include "../bitonicSort.h" +using namespace TNL; +using namespace TNL::Containers; + +typedef Devices::Cuda Device; + template std::ostream& operator<< (std::ostream&out, std::vector &arr) { -- GitLab From 
d3482f70b022350013bdc564c13c7565a937bf24 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Fri, 19 Feb 2021 15:05:08 +0100 Subject: [PATCH 039/258] remove unused part variable in loop --- GPUSort/bitonicGPU/bitonicSort.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/GPUSort/bitonicGPU/bitonicSort.h b/GPUSort/bitonicGPU/bitonicSort.h index 140f7b715..9b26eff2d 100644 --- a/GPUSort/bitonicGPU/bitonicSort.h +++ b/GPUSort/bitonicGPU/bitonicSort.h @@ -94,7 +94,7 @@ __global__ void bitonicMergeSharedMemory(TNL::Containers::ArrayView 1; len /= 2, partsInSeq *= 2) + for (int len = monotonicSeqLen; len > 1; len /= 2) { __syncthreads(); @@ -169,7 +169,7 @@ __global__ void bitoniSort1stStepSharedMemory(TNL::Containers::ArrayView= end) //special case for parts with no "partner" ascending = true; - for (int len = monotonicSeqLen, partsInSeq = 1; len > 1; len /= 2, partsInSeq *= 2) + for (int len = monotonicSeqLen; len > 1; len /= 2) { __syncthreads(); -- GitLab From d281d8b011178704476ed7d9a50a577b591845d7 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Sat, 20 Feb 2021 15:12:55 +0100 Subject: [PATCH 040/258] skip empty iteration cycles for merging in shared memory --- GPUSort/bitonicGPU/bitonicSort.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/GPUSort/bitonicGPU/bitonicSort.h b/GPUSort/bitonicGPU/bitonicSort.h index 9b26eff2d..b59ec7386 100644 --- a/GPUSort/bitonicGPU/bitonicSort.h +++ b/GPUSort/bitonicGPU/bitonicSort.h @@ -94,7 +94,7 @@ __global__ void bitonicMergeSharedMemory(TNL::Containers::ArrayView 1; len /= 2) + for (; len > 1; len /= 2) { __syncthreads(); -- GitLab From 3cbb94cf63ae4789a39c72a46333b7ee77d60999 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Sat, 20 Feb 2021 16:27:08 +0100 Subject: [PATCH 041/258] check if sorted in host --- GPUSort/bitonicGPU/unitTests/unitTests.cu | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/GPUSort/bitonicGPU/unitTests/unitTests.cu 
b/GPUSort/bitonicGPU/unitTests/unitTests.cu index 02506bbed..a551a6f62 100644 --- a/GPUSort/bitonicGPU/unitTests/unitTests.cu +++ b/GPUSort/bitonicGPU/unitTests/unitTests.cu @@ -5,15 +5,16 @@ #include #include - +#include #include "../bitonicSort.h" -//---------------------------------------------------------------------------------- template bool is_sorted(TNL::Containers::ArrayView arr) { - for (int i = 1; i < arr.getSize(); i++) - if (arr.getElement(i - 1) > arr.getElement(i)) + TNL::Containers::Array tmp(arr.getSize()); + TNL::Algorithms::MultiDeviceMemoryOperations::copy(tmp.getData(), arr.getData(), arr.getSize()); + for (int i = 1; i < tmp.getSize(); i++) + if (tmp[i - 1] > tmp[i]) return false; return true; -- GitLab From fee188c21b637d70cf245489310e8ba114927c84 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Sat, 20 Feb 2021 16:33:54 +0100 Subject: [PATCH 042/258] speed up copy back into array --- GPUSort/bitonicGPU/bitonicSort.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/GPUSort/bitonicGPU/bitonicSort.h b/GPUSort/bitonicGPU/bitonicSort.h index b59ec7386..d1cb2bc18 100644 --- a/GPUSort/bitonicGPU/bitonicSort.h +++ b/GPUSort/bitonicGPU/bitonicSort.h @@ -265,8 +265,9 @@ void bitonicSort(std::vector & vec,const Function & cmp) auto view = Arr.getView(); bitonicSort(view, cmp); - for(size_t i = 0; i < vec.size(); ++i) - vec[i] = view.getElement(i); + TNL::Algorithms::MultiDeviceMemoryOperations:: + copy(vec.data(), view.getData(), view.getSize()); + } template -- GitLab From deea2c03030e9ebc522eb07a79bff5aaab0944c6 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Sat, 20 Feb 2021 16:35:45 +0100 Subject: [PATCH 043/258] use std::sort to check for correctness --- GPUSort/bitonicGPU/unitTests/unitTests.cu | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/GPUSort/bitonicGPU/unitTests/unitTests.cu b/GPUSort/bitonicGPU/unitTests/unitTests.cu index a551a6f62..d1e92343a 100644 --- 
a/GPUSort/bitonicGPU/unitTests/unitTests.cu +++ b/GPUSort/bitonicGPU/unitTests/unitTests.cu @@ -11,13 +11,10 @@ template bool is_sorted(TNL::Containers::ArrayView arr) { - TNL::Containers::Array tmp(arr.getSize()); - TNL::Algorithms::MultiDeviceMemoryOperations::copy(tmp.getData(), arr.getData(), arr.getSize()); - for (int i = 1; i < tmp.getSize(); i++) - if (tmp[i - 1] > tmp[i]) - return false; + std::vector tmp(arr.getSize()); + TNL::Algorithms::MultiDeviceMemoryOperations::copy(tmp.data(), arr.getData(), arr.getSize()); - return true; + return std::is_sorted(tmp.begin(), tmp.end()); } //---------------------------------------------------------------------------------- -- GitLab From a5085ab10248ca8e1784962700dbed58e24a354b Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Sat, 20 Feb 2021 17:19:33 +0100 Subject: [PATCH 044/258] threadcount --- GPUSort/bitonicGPU/bitonicSort.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/GPUSort/bitonicGPU/bitonicSort.h b/GPUSort/bitonicGPU/bitonicSort.h index d1cb2bc18..917bb3327 100644 --- a/GPUSort/bitonicGPU/bitonicSort.h +++ b/GPUSort/bitonicGPU/bitonicSort.h @@ -210,7 +210,7 @@ void bitonicSort(TNL::Containers::ArrayView arr, int int threadsNeeded = arrSize / 2 + (arrSize %2 !=0); const int maxThreadsPerBlock = 512; - int threadPerBlock = min(maxThreadsPerBlock, threadsNeeded); + int threadPerBlock = maxThreadsPerBlock; int blocks = threadsNeeded / threadPerBlock + (threadsNeeded % threadPerBlock == 0 ? 0 : 1); const int sharedMemLen = threadPerBlock * 2; -- GitLab From 956429c2252a26e4a55c1e015bdfe8e8ce217445 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Sun, 21 Feb 2021 20:29:33 +0100 Subject: [PATCH 045/258] refactor to remove if(...) 
continue statement --- GPUSort/bitonicGPU/bitonicSort.h | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/GPUSort/bitonicGPU/bitonicSort.h b/GPUSort/bitonicGPU/bitonicSort.h index 917bb3327..9fa013cc6 100644 --- a/GPUSort/bitonicGPU/bitonicSort.h +++ b/GPUSort/bitonicGPU/bitonicSort.h @@ -102,10 +102,9 @@ __global__ void bitonicMergeSharedMemory(TNL::Containers::ArrayView= myBlockEnd - myBlockStart) //touching virtual padding - continue; - cmpSwap(sharedMem[s], sharedMem[e], ascending, Cmp); + if(e < myBlockEnd - myBlockStart) //touching virtual padding + cmpSwap(sharedMem[s], sharedMem[e], ascending, Cmp); } __syncthreads(); @@ -177,11 +176,9 @@ __global__ void bitoniSort1stStepSharedMemory(TNL::Containers::ArrayView= myBlockEnd - myBlockStart) //touching virtual padding - continue; - - cmpSwap(sharedMem[s], sharedMem[e], ascending, Cmp); + if(e < myBlockEnd - myBlockStart) //touching virtual padding + cmpSwap(sharedMem[s], sharedMem[e], ascending, Cmp); } } -- GitLab From a78e25edcf268809fa7048d91d9a016b97700610 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Sun, 21 Feb 2021 20:36:21 +0100 Subject: [PATCH 046/258] sync after write --- GPUSort/bitonicGPU/bitonicSort.h | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/GPUSort/bitonicGPU/bitonicSort.h b/GPUSort/bitonicGPU/bitonicSort.h index 9fa013cc6..e8f4f7aba 100644 --- a/GPUSort/bitonicGPU/bitonicSort.h +++ b/GPUSort/bitonicGPU/bitonicSort.h @@ -96,8 +96,6 @@ __global__ void bitonicMergeSharedMemory(TNL::Containers::ArrayView 1; len /= 2) { - __syncthreads(); - //calculates which 2 indexes will be compared and swap int part = threadIdx.x / (len / 2); int s = part * len + (threadIdx.x & ((len /2) - 1)); @@ -105,10 +103,8 @@ __global__ void bitonicMergeSharedMemory(TNL::Containers::ArrayView 1; len /= 2) { - __syncthreads(); - //calculates which 2 indexes will be compared and swap int part = threadIdx.x / (len / 2); int s = part * len + (threadIdx.x 
& ((len / 2) - 1)); @@ -179,11 +173,9 @@ __global__ void bitoniSort1stStepSharedMemory(TNL::Containers::ArrayView Date: Wed, 24 Feb 2021 21:17:35 +0100 Subject: [PATCH 047/258] update timer --- GPUSort/bitonicGPU/benchmark/benchmark.cu | 50 ++++++----------------- 1 file changed, 12 insertions(+), 38 deletions(-) diff --git a/GPUSort/bitonicGPU/benchmark/benchmark.cu b/GPUSort/bitonicGPU/benchmark/benchmark.cu index 21c5c2eb4..74b2bfaf7 100644 --- a/GPUSort/bitonicGPU/benchmark/benchmark.cu +++ b/GPUSort/bitonicGPU/benchmark/benchmark.cu @@ -3,6 +3,7 @@ #include #include #include +#include #include @@ -21,38 +22,20 @@ std::ostream& operator<< (std::ostream&out, std::vector &arr) return out; } + struct TIMER { - std::string s; - std::chrono::steady_clock::time_point begin; - double result = 0; - bool stopped = false; + std::function f; + std::chrono::high_resolution_clock::time_point begin; - TIMER(const std::string &name = "") - : s(name), begin(std::chrono::steady_clock::now()) {} + TIMER(std::function func/* = [](double res){std::cout << res << std::endl;}*/) + : f(func), begin(std::chrono::high_resolution_clock::now()) {} - double stop() - { - auto end = std::chrono::steady_clock::now(); - result = (std::chrono::duration_cast(end - begin).count() / 1000.); - stopped = true; - return result; - } - - void printTime() - { - if(!stopped) - stop(); - std::cout << ("Measured " + s + ": ") << result << " ms" << std::endl; - } - ~TIMER() { - if(!stopped) - { - stop(); - printTime(); - } + auto end = std::chrono::high_resolution_clock::now(); + double result = (std::chrono::duration_cast(end - begin).count() / 1000.); + f(result); } }; @@ -65,7 +48,7 @@ void test1() auto view = cudaArr.getView(); { - TIMER t("sorted sequences"); + TIMER t([](double res){std::cout << res << std::endl;}); bitonicSort(view); } } @@ -87,16 +70,9 @@ void randomShuffles() TNL::Containers::Array cudaArr(orig); auto view = cudaArr.getView(); - std::vector tmp(orig.begin(), orig.end()); - { - 
TIMER t("random permutation"); - - //std::sort(tmp.begin(), tmp.end()); + TIMER t([&](double res){results.push_back(res);}); bitonicSort(view); - - results.push_back(t.stop()); - //t.printTime(); } } @@ -114,10 +90,8 @@ void allPermutations(std::vector orig) auto view = cudaArr.getView(); { - TIMER t("random permutation"); + TIMER t([&](double res){results.push_back(res);}); bitonicSort(view); - results.push_back(t.stop()); - //t.printTime(); } } std::cout << "average time: " << std::accumulate(results.begin(), results.end(), 0.)/results.size() << " ms" << std::endl; -- GitLab From 9316a920743c7c375669f292a39c642815ddf00d Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Wed, 24 Feb 2021 21:20:54 +0100 Subject: [PATCH 048/258] fix permutation --- GPUSort/bitonicGPU/unitTests/unitTests.cu | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/GPUSort/bitonicGPU/unitTests/unitTests.cu b/GPUSort/bitonicGPU/unitTests/unitTests.cu index d1e92343a..76b4bec75 100644 --- a/GPUSort/bitonicGPU/unitTests/unitTests.cu +++ b/GPUSort/bitonicGPU/unitTests/unitTests.cu @@ -27,7 +27,7 @@ TEST(permutations, allPermutationSize_3_to_7) std::vector orig(size); std::iota(orig.begin(), orig.end(), 0); - while (std::next_permutation(orig.begin(), orig.end())) + do { TNL::Containers::Array cudaArr(orig); auto view = cudaArr.getView(); @@ -36,6 +36,7 @@ TEST(permutations, allPermutationSize_3_to_7) ASSERT_TRUE(is_sorted(view)) << "failed " << i << std::endl; } + while (std::next_permutation(orig.begin(), orig.end())); } } @@ -48,7 +49,7 @@ TEST(permutations, somePermutationSize8) std::vector orig(size); std::iota(orig.begin(), orig.end(), 0); - while (std::next_permutation(orig.begin(), orig.end())) + do { if ((i++) % stride != 0) continue; @@ -60,6 +61,7 @@ TEST(permutations, somePermutationSize8) ASSERT_TRUE(is_sorted(view)) << "result " << view << std::endl; } + while (std::next_permutation(orig.begin(), orig.end())); } TEST(permutations, somePermutationSize9) 
@@ -71,7 +73,7 @@ TEST(permutations, somePermutationSize9) std::vector orig(size); std::iota(orig.begin(), orig.end(), 0); - while (std::next_permutation(orig.begin(), orig.end())) + do { if ((i++) % stride != 0) continue; @@ -83,8 +85,11 @@ TEST(permutations, somePermutationSize9) ASSERT_TRUE(is_sorted(view)) << "result " << view << std::endl; } + while (std::next_permutation(orig.begin(), orig.end())); } +//----------------------------------------------------------------------- + TEST(selectedSize, size15) { TNL::Containers::Array cudaArr{5, 9, 4, 8, 6, 1, 2, 3, 4, 8, 1, 6, 9, 4, 9}; -- GitLab From 8c47ed2239f9e70abbe6168b21a7aa8c2ae802f3 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Thu, 25 Feb 2021 19:46:19 +0100 Subject: [PATCH 049/258] sorting struct --- GPUSort/bitonicGPU/unitTests/unitTests.cu | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/GPUSort/bitonicGPU/unitTests/unitTests.cu b/GPUSort/bitonicGPU/unitTests/unitTests.cu index 76b4bec75..83ffd9854 100644 --- a/GPUSort/bitonicGPU/unitTests/unitTests.cu +++ b/GPUSort/bitonicGPU/unitTests/unitTests.cu @@ -154,31 +154,22 @@ TEST(nonIntegerType, double_notPow2) ASSERT_TRUE(is_sorted(view)) << "result " << view << std::endl; } -/* + struct TMPSTRUCT{ uint8_t m_data[6]; TMPSTRUCT(){m_data[0] = 0;} TMPSTRUCT(int first){m_data[0] = first;}; bool operator <(const TMPSTRUCT& other) const { return m_data[0] < other.m_data[0];} - - bool operator ==(const TMPSTRUCT& other) const {return !(*this < other) && !(other < *this); } - - bool operator >=(const TMPSTRUCT& other) const {return !(*this < other); } - bool operator >(const TMPSTRUCT& other) const {return !(*this <= other); } - bool operator <=(const TMPSTRUCT& other) const {return (*this < other) || (other == *this); } - - std::ostream& operator << (std::ostream & out) { return out << "{ " << m_data[0] << " }";} }; TEST(nonIntegerType, struct) { - TNL::Containers::Array cudaArr{TMPSTRUCT(5), TMPSTRUCT(6), TMPSTRUCT(9), 
TMPSTRUCT(1)}; auto view = cudaArr.getView(); bitonicSort(view); - ASSERT_TRUE(is_sorted(view)) << "result " << view << std::endl; + ASSERT_TRUE(is_sorted(view)); } -*/ + //error bypassing //https://mmg-gitlab.fjfi.cvut.cz/gitlab/tnl/tnl-dev/blob/fbc34f6a97c13ec865ef7969b9704533222ed408/src/UnitTests/Containers/VectorTest-8.h -- GitLab From ed5806e6d30884eab29868e7e2c0c513bb3f94b1 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Thu, 25 Feb 2021 19:49:13 +0100 Subject: [PATCH 050/258] refactor out timer for benchmark --- GPUSort/bitonicGPU/benchmark/benchmark.cu | 18 ------------------ GPUSort/util/timer.h | 22 ++++++++++++++++++++++ 2 files changed, 22 insertions(+), 18 deletions(-) create mode 100644 GPUSort/util/timer.h diff --git a/GPUSort/bitonicGPU/benchmark/benchmark.cu b/GPUSort/bitonicGPU/benchmark/benchmark.cu index 74b2bfaf7..6631f873d 100644 --- a/GPUSort/bitonicGPU/benchmark/benchmark.cu +++ b/GPUSort/bitonicGPU/benchmark/benchmark.cu @@ -22,24 +22,6 @@ std::ostream& operator<< (std::ostream&out, std::vector &arr) return out; } - -struct TIMER -{ - std::function f; - std::chrono::high_resolution_clock::time_point begin; - - TIMER(std::function func/* = [](double res){std::cout << res << std::endl;}*/) - : f(func), begin(std::chrono::high_resolution_clock::now()) {} - - ~TIMER() - { - auto end = std::chrono::high_resolution_clock::now(); - double result = (std::chrono::duration_cast(end - begin).count() / 1000.); - f(result); - } -}; - - void test1() { int size = 1<<10; diff --git a/GPUSort/util/timer.h b/GPUSort/util/timer.h new file mode 100644 index 000000000..880eb03e7 --- /dev/null +++ b/GPUSort/util/timer.h @@ -0,0 +1,22 @@ +#pragma once + +#include +#include +#include +#include + +struct TIMER +{ + std::function f; + std::chrono::high_resolution_clock::time_point begin; + + TIMER(std::function func = [](double res){std::cout << res << std::endl;}) + : f(func), begin(std::chrono::high_resolution_clock::now()) {} + + ~TIMER() + { + auto end = 
std::chrono::high_resolution_clock::now(); + double result = (std::chrono::duration_cast(end - begin).count() / 1000.); + f(result); + } +}; \ No newline at end of file -- GitLab From 7afa1f2a59e281180c4ce845ffebb52743dcfe7d Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Thu, 25 Feb 2021 19:51:11 +0100 Subject: [PATCH 051/258] include --- GPUSort/bitonicGPU/benchmark/benchmark.cu | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/GPUSort/bitonicGPU/benchmark/benchmark.cu b/GPUSort/bitonicGPU/benchmark/benchmark.cu index 6631f873d..7da2c6cac 100644 --- a/GPUSort/bitonicGPU/benchmark/benchmark.cu +++ b/GPUSort/bitonicGPU/benchmark/benchmark.cu @@ -1,13 +1,13 @@ #include -#include #include #include #include -#include #include #include "../bitonicSort.h" +#include "../../util/timer.h" + using namespace TNL; using namespace TNL::Containers; @@ -30,7 +30,7 @@ void test1() auto view = cudaArr.getView(); { - TIMER t([](double res){std::cout << res << std::endl;}); + TIMER t; bitonicSort(view); } } @@ -53,7 +53,7 @@ void randomShuffles() TNL::Containers::Array cudaArr(orig); auto view = cudaArr.getView(); { - TIMER t([&](double res){results.push_back(res);}); + TIMER t; bitonicSort(view); } @@ -72,7 +72,7 @@ void allPermutations(std::vector orig) auto view = cudaArr.getView(); { - TIMER t([&](double res){results.push_back(res);}); + TIMER t; bitonicSort(view); } } -- GitLab From cdc2a927eb8a099634e103f31051a3c2f9ffa408 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Thu, 25 Feb 2021 20:07:06 +0100 Subject: [PATCH 052/258] refactor array creation --- GPUSort/bitonicGPU/unitTests/unitTests.cu | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/GPUSort/bitonicGPU/unitTests/unitTests.cu b/GPUSort/bitonicGPU/unitTests/unitTests.cu index 83ffd9854..c521f835f 100644 --- a/GPUSort/bitonicGPU/unitTests/unitTests.cu +++ b/GPUSort/bitonicGPU/unitTests/unitTests.cu @@ -101,11 +101,13 @@ TEST(selectedSize, 
size15) TEST(multiblock, 32768_decreasingNegative) { - TNL::Containers::Array cudaArr(1 << 15); - for (int i = 0; i < cudaArr.getSize(); i++) - cudaArr.setElement(i, -i); - + std::vector arr(1<<15); + for (size_t i = 0; i < arr.size(); i++) + arr[i] = -i; + + TNL::Containers::Array cudaArr(arr); auto view = cudaArr.getView(); + bitonicSort(view); ASSERT_TRUE(is_sorted(view)) << "result " << view << std::endl; } @@ -114,9 +116,11 @@ TEST(randomGenerated, smallArray_randomVal) { for(int i = 0; i < 100; i++) { - TNL::Containers::Array cudaArr(std::rand()%(1<<10)); - for (int j = 0; j < cudaArr.getSize(); j++) - cudaArr.setElement(j, std::rand()); + std::vector arr(std::rand()%(1<<10)); + for(auto & x : arr) + x = std::rand(); + + TNL::Containers::Array cudaArr(arr); auto view = cudaArr.getView(); bitonicSort(view); -- GitLab From 063487fdb64978748d6ea5560d2faced8eca7606 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Thu, 25 Feb 2021 20:09:24 +0100 Subject: [PATCH 053/258] refactor out is sorted function --- GPUSort/bitonicGPU/unitTests/unitTests.cu | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/GPUSort/bitonicGPU/unitTests/unitTests.cu b/GPUSort/bitonicGPU/unitTests/unitTests.cu index c521f835f..83ec1477b 100644 --- a/GPUSort/bitonicGPU/unitTests/unitTests.cu +++ b/GPUSort/bitonicGPU/unitTests/unitTests.cu @@ -7,15 +7,7 @@ #include #include #include "../bitonicSort.h" - -template -bool is_sorted(TNL::Containers::ArrayView arr) -{ - std::vector tmp(arr.getSize()); - TNL::Algorithms::MultiDeviceMemoryOperations::copy(tmp.data(), arr.getData(), arr.getSize()); - - return std::is_sorted(tmp.begin(), tmp.end()); -} +#include "../../util/algorithm.h" //---------------------------------------------------------------------------------- -- GitLab From 3aedd0b3918ef6adb446c986b94cacb4cdfd6ead Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Thu, 25 Feb 2021 20:10:05 +0100 Subject: [PATCH 054/258] refactor is sorted --- 
GPUSort/util/algorithm.h | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 GPUSort/util/algorithm.h diff --git a/GPUSort/util/algorithm.h b/GPUSort/util/algorithm.h new file mode 100644 index 000000000..dbb98b07a --- /dev/null +++ b/GPUSort/util/algorithm.h @@ -0,0 +1,11 @@ +#pragma once +#include + +template +bool is_sorted(TNL::Containers::ArrayView arr) +{ + std::vector tmp(arr.getSize()); + TNL::Algorithms::MultiDeviceMemoryOperations::copy(tmp.data(), arr.getData(), arr.getSize()); + + return std::is_sorted(tmp.begin(), tmp.end()); +} -- GitLab From 524a4e2da9b53bc9d543a3739def81d4a0f1b054 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Thu, 25 Feb 2021 20:12:18 +0100 Subject: [PATCH 055/258] add seed for deterministic testing --- GPUSort/bitonicGPU/unitTests/unitTests.cu | 2 ++ 1 file changed, 2 insertions(+) diff --git a/GPUSort/bitonicGPU/unitTests/unitTests.cu b/GPUSort/bitonicGPU/unitTests/unitTests.cu index 83ec1477b..a0ed71845 100644 --- a/GPUSort/bitonicGPU/unitTests/unitTests.cu +++ b/GPUSort/bitonicGPU/unitTests/unitTests.cu @@ -106,6 +106,7 @@ TEST(multiblock, 32768_decreasingNegative) TEST(randomGenerated, smallArray_randomVal) { + std::srand(2006); for(int i = 0; i < 100; i++) { std::vector arr(std::rand()%(1<<10)); @@ -122,6 +123,7 @@ TEST(randomGenerated, smallArray_randomVal) TEST(randomGenerated, bigArray_all0) { + std::srand(304); for(int i = 0; i < 50; i++) { int size = (1<<20) + (std::rand()% (1<<19)); -- GitLab From 06294aa85c6b4013f6e6291038004a6832133dcd Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Fri, 26 Feb 2021 15:29:45 +0100 Subject: [PATCH 056/258] make is_sorted as a parallel function --- GPUSort/bitonicGPU/unitTests/unitTests.cu | 3 ++- GPUSort/util/algorithm.h | 14 ++++++++++---- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/GPUSort/bitonicGPU/unitTests/unitTests.cu b/GPUSort/bitonicGPU/unitTests/unitTests.cu index a0ed71845..fde0a8d51 100644 --- 
a/GPUSort/bitonicGPU/unitTests/unitTests.cu +++ b/GPUSort/bitonicGPU/unitTests/unitTests.cu @@ -86,7 +86,7 @@ TEST(selectedSize, size15) { TNL::Containers::Array cudaArr{5, 9, 4, 8, 6, 1, 2, 3, 4, 8, 1, 6, 9, 4, 9}; auto view = cudaArr.getView(); - ASSERT_EQ(15, view.getSize()); + ASSERT_EQ(15, view.getSize()) << "size not 15" << std::endl; bitonicSort(view); ASSERT_TRUE(is_sorted(view)) << "result " << view << std::endl; } @@ -158,6 +158,7 @@ struct TMPSTRUCT{ TMPSTRUCT(){m_data[0] = 0;} TMPSTRUCT(int first){m_data[0] = first;}; bool operator <(const TMPSTRUCT& other) const { return m_data[0] < other.m_data[0];} + bool operator <=(const TMPSTRUCT& other) const { return m_data[0] <= other.m_data[0];} }; TEST(nonIntegerType, struct) diff --git a/GPUSort/util/algorithm.h b/GPUSort/util/algorithm.h index dbb98b07a..4e3f8ce48 100644 --- a/GPUSort/util/algorithm.h +++ b/GPUSort/util/algorithm.h @@ -1,11 +1,17 @@ #pragma once #include +#include + +template +bool is_sorted(TNL::Containers::ArrayView arr, const Function &Cmp) +{ + auto fetch = [=] __cuda_callable__(int i) { return Cmp(arr[i - 1], arr[i]); }; + auto reduction = [] __cuda_callable__(bool a, bool b) { return a && b; }; + return TNL::Algorithms::Reduction::reduce(1, arr.getSize(), reduction, fetch, true); +} template bool is_sorted(TNL::Containers::ArrayView arr) { - std::vector tmp(arr.getSize()); - TNL::Algorithms::MultiDeviceMemoryOperations::copy(tmp.data(), arr.getData(), arr.getSize()); - - return std::is_sorted(tmp.begin(), tmp.end()); + return is_sorted(arr, [] __cuda_callable__(const Value &a, const Value &b) { return a <= b; }); } -- GitLab From 27b525af399265d9541ef16d61a7cc0bf5d2bb12 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Fri, 26 Feb 2021 15:35:16 +0100 Subject: [PATCH 057/258] check all permutations between 1 and 8 --- GPUSort/bitonicGPU/unitTests/unitTests.cu | 28 ++--------------------- GPUSort/util/algorithm.h | 2 ++ 2 files changed, 4 insertions(+), 26 deletions(-) diff --git 
a/GPUSort/bitonicGPU/unitTests/unitTests.cu b/GPUSort/bitonicGPU/unitTests/unitTests.cu index fde0a8d51..6f4f5e7b7 100644 --- a/GPUSort/bitonicGPU/unitTests/unitTests.cu +++ b/GPUSort/bitonicGPU/unitTests/unitTests.cu @@ -11,9 +11,9 @@ //---------------------------------------------------------------------------------- -TEST(permutations, allPermutationSize_3_to_7) +TEST(permutations, allPermutationSize_1_to_8) { - for(int i = 3; i<=7; i++ ) + for(int i = 2; i<=8; i++ ) { int size = i; std::vector orig(size); @@ -32,30 +32,6 @@ TEST(permutations, allPermutationSize_3_to_7) } } -TEST(permutations, somePermutationSize8) -{ - int size = 8; - const int stride = 23; - int i = 0; - - std::vector orig(size); - std::iota(orig.begin(), orig.end(), 0); - - do - { - if ((i++) % stride != 0) - continue; - - TNL::Containers::Array cudaArr(orig); - auto view = cudaArr.getView(); - - bitonicSort(view); - - ASSERT_TRUE(is_sorted(view)) << "result " << view << std::endl; - } - while (std::next_permutation(orig.begin(), orig.end())); -} - TEST(permutations, somePermutationSize9) { int size = 9; diff --git a/GPUSort/util/algorithm.h b/GPUSort/util/algorithm.h index 4e3f8ce48..92f4ec264 100644 --- a/GPUSort/util/algorithm.h +++ b/GPUSort/util/algorithm.h @@ -5,6 +5,8 @@ template bool is_sorted(TNL::Containers::ArrayView arr, const Function &Cmp) { + if(arr.getSize() <= 1) return true; + auto fetch = [=] __cuda_callable__(int i) { return Cmp(arr[i - 1], arr[i]); }; auto reduction = [] __cuda_callable__(bool a, bool b) { return a && b; }; return TNL::Algorithms::Reduction::reduce(1, arr.getSize(), reduction, fetch, true); -- GitLab From aea5399ce6fea4bbbf2508aaea3cb434dfe26238 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Fri, 26 Feb 2021 15:50:14 +0100 Subject: [PATCH 058/258] rewrite benchmark --- GPUSort/bitonicGPU/benchmark/benchmark.cu | 110 ++++++++++++---------- 1 file changed, 59 insertions(+), 51 deletions(-) diff --git a/GPUSort/bitonicGPU/benchmark/benchmark.cu 
b/GPUSort/bitonicGPU/benchmark/benchmark.cu index 7da2c6cac..61b0246dd 100644 --- a/GPUSort/bitonicGPU/benchmark/benchmark.cu +++ b/GPUSort/bitonicGPU/benchmark/benchmark.cu @@ -2,6 +2,8 @@ #include #include #include +#include +#include #include @@ -14,75 +16,81 @@ using namespace TNL::Containers; typedef Devices::Cuda Device; -template -std::ostream& operator<< (std::ostream&out, std::vector &arr) -{ - for (auto x : arr) - std::cout << x << " "; - return out; -} - -void test1() +using namespace std; +int main() { - int size = 1<<10; - TNL::Containers::Array cudaArr(size); - cudaArr.evaluate([=] __cuda_callable__ (int i) {return i;}); - auto view = cudaArr.getView(); + for(int pow = 10; pow <= 20; pow++) { - TIMER t; - bitonicSort(view); - } -} + int size =(1<< pow); -void randomShuffles() -{ - int iterations = 100; - std::cout << iterations << " random permutations" << std::endl; - for(int p = 13; p <= 19; ++p) - { - int size = 1< orig(size); - std::iota(orig.begin(), orig.end(), 0); - std::vector results; + vector vec(size); + iota(vec.begin(), vec.end(), 0); + + Array arr; + vector resAcc; - for (int i = 0; i < iterations; i++) + //sorted sequence { - std::random_shuffle(orig.begin(), orig.end()); + arr = vec; + auto view = arr.getView(); - TNL::Containers::Array cudaArr(orig); - auto view = cudaArr.getView(); { - TIMER t; + TIMER t([&](double res){resAcc.push_back(res);}); bitonicSort(view); } - } - std::cout << "average time for arrSize = 2^" << p << ": " << std::accumulate(results.begin(), results.end(), 0.)/results.size() << " ms" << std::endl; - } -} + //almost sorted sequence + { + for(int i = 0; i < 3; i++) + { + int s = std::rand() % (size - 3); + std::swap(vec[s], vec[s + 1]); + } -void allPermutations(std::vector orig) -{ - std::vector results; - while (std::next_permutation(orig.begin(), orig.end())) - { - TNL::Containers::Array cudaArr(orig); - auto view = cudaArr.getView(); + arr = vec; + auto view = arr.getView(); + { + TIMER t([&](double 
res){resAcc.push_back(res);}); + bitonicSort(view); + } + } + + //decreasing sequence { - TIMER t; - bitonicSort(view); + for(size_t i = 0; i < size; i++) + vec[i] = -i; + + arr = vec; + auto view = arr.getView(); + + { + TIMER t([&](double res){resAcc.push_back(res);}); + bitonicSort(view); + } } - } - std::cout << "average time: " << std::accumulate(results.begin(), results.end(), 0.)/results.size() << " ms" << std::endl; -} + + //random sequence + { + std::random_shuffle(vec.begin(), vec.end()); + arr = vec; + auto view = arr.getView(); + + { + TIMER t([&](double res){resAcc.push_back(res);}); + bitonicSort(view); + } + } -int main() -{ - randomShuffles(); + + cout << "2^" << pow << " = "; + cout << fixed; + cout << setprecision(3); + cout << (accumulate(resAcc.begin(), resAcc.end(), 0.0) / resAcc.size()) << " ms" << endl; + } return 0; } \ No newline at end of file -- GitLab From dc234b4fcbe87d1e3f4a46054387436079addd06 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Fri, 26 Feb 2021 15:52:56 +0100 Subject: [PATCH 059/258] seed for benchmark and clean up includes --- GPUSort/bitonicGPU/benchmark/benchmark.cu | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/GPUSort/bitonicGPU/benchmark/benchmark.cu b/GPUSort/bitonicGPU/benchmark/benchmark.cu index 61b0246dd..b6c0f6204 100644 --- a/GPUSort/bitonicGPU/benchmark/benchmark.cu +++ b/GPUSort/bitonicGPU/benchmark/benchmark.cu @@ -1,8 +1,5 @@ -#include #include -#include #include -#include #include #include @@ -10,16 +7,15 @@ #include "../bitonicSort.h" #include "../../util/timer.h" - using namespace TNL; using namespace TNL::Containers; +using namespace std; typedef Devices::Cuda Device; -using namespace std; int main() { - + srand(2021); for(int pow = 10; pow <= 20; pow++) { int size =(1<< pow); @@ -45,7 +41,7 @@ int main() { for(int i = 0; i < 3; i++) { - int s = std::rand() % (size - 3); + int s = rand() % (size - 3); std::swap(vec[s], vec[s + 1]); } @@ -74,11 +70,11 @@ int 
main() //random sequence { - std::random_shuffle(vec.begin(), vec.end()); + random_shuffle(vec.begin(), vec.end()); arr = vec; auto view = arr.getView(); - + { TIMER t([&](double res){resAcc.push_back(res);}); bitonicSort(view); -- GitLab From 5a0c2ef588f1cce5fa429be317944e5e391493ca Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Fri, 26 Feb 2021 17:38:15 +0100 Subject: [PATCH 060/258] init quicksort folder --- GPUSort/quicksort/Makefile | 0 GPUSort/quicksort/config.mk | 0 GPUSort/quicksort/main.cu | 0 GPUSort/quicksort/quisort.cuh | 0 4 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 GPUSort/quicksort/Makefile create mode 100644 GPUSort/quicksort/config.mk create mode 100644 GPUSort/quicksort/main.cu create mode 100644 GPUSort/quicksort/quisort.cuh diff --git a/GPUSort/quicksort/Makefile b/GPUSort/quicksort/Makefile new file mode 100644 index 000000000..e69de29bb diff --git a/GPUSort/quicksort/config.mk b/GPUSort/quicksort/config.mk new file mode 100644 index 000000000..e69de29bb diff --git a/GPUSort/quicksort/main.cu b/GPUSort/quicksort/main.cu new file mode 100644 index 000000000..e69de29bb diff --git a/GPUSort/quicksort/quisort.cuh b/GPUSort/quicksort/quisort.cuh new file mode 100644 index 000000000..e69de29bb -- GitLab From 8eb409508ca50a07c6d66b0158e416abddd6b703 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Mon, 1 Mar 2021 21:53:49 +0100 Subject: [PATCH 061/258] fix file name typo --- GPUSort/quicksort/quicksort.cuh | 14 ++++++++++++++ GPUSort/quicksort/quisort.cuh | 0 2 files changed, 14 insertions(+) create mode 100644 GPUSort/quicksort/quicksort.cuh delete mode 100644 GPUSort/quicksort/quisort.cuh diff --git a/GPUSort/quicksort/quicksort.cuh b/GPUSort/quicksort/quicksort.cuh new file mode 100644 index 000000000..d3b0e9520 --- /dev/null +++ b/GPUSort/quicksort/quicksort.cuh @@ -0,0 +1,14 @@ +#pragma once + +#include + +__global__ void quicksortCuda(TNL::Containers::ArrayView arr, int begin, int end) +{ + + +} + +void 
quicksort(TNL::Containers::ArrayView arr) +{ + quicksortCuda<<<1, 1>>>(arr, 0, arr.getSize()); +} diff --git a/GPUSort/quicksort/quisort.cuh b/GPUSort/quicksort/quisort.cuh deleted file mode 100644 index e69de29bb..000000000 -- GitLab From b990cc5b8bf08eb9623b1ed30b6308f008385441 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Tue, 2 Mar 2021 16:08:23 +0100 Subject: [PATCH 062/258] compilation --- GPUSort/quicksort/Makefile | 24 ++++++++++++++++ GPUSort/quicksort/config.mk | 49 +++++++++++++++++++++++++++++++++ GPUSort/quicksort/main.cu | 21 ++++++++++++++ GPUSort/quicksort/quicksort.cuh | 27 +++++++++++++++++- 4 files changed, 120 insertions(+), 1 deletion(-) diff --git a/GPUSort/quicksort/Makefile b/GPUSort/quicksort/Makefile index e69de29bb..d301117ff 100644 --- a/GPUSort/quicksort/Makefile +++ b/GPUSort/quicksort/Makefile @@ -0,0 +1,24 @@ +include config.mk + +CUDA_SOURCES := $(wildcard *.cu) +CUDA_TARGETS := $(CUDA_SOURCES:%.cu=%) + +## targets definitions follow +.PHONY: all host cuda +all: cuda +cuda: $(CUDA_TARGETS) + +run: cuda + ./$(CUDA_TARGETS) + +.PHONY: clean +clean: + rm -f *.d *.o *.cuo $(CUDA_TARGETS) + +# use .cuo instead of .cu.o to avoid problems with the implicit rules: https://stackoverflow.com/q/62967939 +# (and use the host compiler for linking CUDA, nvcc does not understand that .cuo is an object file) +$(CUDA_TARGETS): % : %.o + $(CUDA_CXX) $(CUDA_LDFLAGS) -o $@ $< $(CUDA_LDLIBS) -lcudadevrt + +$(CUDA_SOURCES:%.cu=%.o): %.o : %.cu + $(CUDA_CXX) $(CUDA_CPPFLAGS) $(CUDA_CXXFLAGS) -gencode arch=compute_52,code=sm_52 -dc -c -o $@ $< \ No newline at end of file diff --git a/GPUSort/quicksort/config.mk b/GPUSort/quicksort/config.mk index e69de29bb..3715986f7 100644 --- a/GPUSort/quicksort/config.mk +++ b/GPUSort/quicksort/config.mk @@ -0,0 +1,49 @@ +# configure the include path(s) according to your TNL installation +TNL_INCLUDE_DIRS := -I ~/.local/include + +WITH_OPENMP := no +WITH_DEBUG := no + +# If TNL is installed on your system, the 
CUDA architecture can be detected +# automatically by tnl-cuda-arch. This is done if CUDA_ARCH is set to "auto". +# Otherwise, CUDA_ARCH has to be set manually to the desired CUDA architecture +# number, e.g. 60, 61, etc. +CUDA_ARCH := auto + +# compilers +CXX := g++ +CUDA_CXX := nvcc + +# host compiler flags +CXXFLAGS := -std=c++14 $(TNL_INCLUDE_DIRS) +ifeq ($(WITH_DEBUG),yes) + CXXFLAGS += -O0 -g +else + CXXFLAGS += -O3 -DNDEBUG +endif + +# CUDA compiler flags +CUDA_CXXFLAGS := -std=c++14 --expt-relaxed-constexpr --expt-extended-lambda $(TNL_INCLUDE_DIRS) +CUDA_CXXFLAGS += -DHAVE_CUDA +ifeq ($(CUDA_ARCH),auto) + CUDA_CXXFLAGS += $(shell tnl-cuda-arch) +else + CUDA_CXXFLAGS += -gencode arch=compute_$(CUDA_ARCH),code=sm_$(CUDA_ARCH) +endif + +# determine path to the CUDA toolkit installation +# (autodetection is attempted, set it manually if it fails) +CUDA_PATH ?= $(abspath $(dir $(shell command -v nvcc))/..) +#$(info Detected CUDA_PATH: $(CUDA_PATH)) + +# flags for linking CUDA with the host compiler +CUDA_LDFLAGS := -L $(CUDA_PATH)/lib64 +CUDA_LDLIBS := -lcudart -ldl -lrt + +# enable OpenMP +ifeq ($(WITH_OPENMP),yes) + CXXFLAGS += -fopenmp -DHAVE_OPENMP + LDLIBS += -lgomp + CUDA_CXXFLAGS += -Xcompiler -fopenmp -DHAVE_OPENMP + CUDA_LDLIBS += -lgomp +endif diff --git a/GPUSort/quicksort/main.cu b/GPUSort/quicksort/main.cu index e69de29bb..fa69fd747 100644 --- a/GPUSort/quicksort/main.cu +++ b/GPUSort/quicksort/main.cu @@ -0,0 +1,21 @@ +#include +#include "quicksort.cuh" +#include "../util/algorithm.h" + +#include +#include +using namespace std; + +int main() +{ + vector vec(19); + for(auto & x : vec) x = rand()%30; + + TNL::Containers::Array arr(vec); + auto view = arr.getView(); + cout << view << endl; + quicksort(view); + cout << view << endl; + + return 0; +} \ No newline at end of file diff --git a/GPUSort/quicksort/quicksort.cuh b/GPUSort/quicksort/quicksort.cuh index d3b0e9520..1b55bc0c8 100644 --- a/GPUSort/quicksort/quicksort.cuh +++ 
b/GPUSort/quicksort/quicksort.cuh @@ -4,7 +4,32 @@ __global__ void quicksortCuda(TNL::Containers::ArrayView arr, int begin, int end) { - + if(begin >= end) + return; + + int pivotIdx = end - 1; + int pivot = arr[pivotIdx]; + + int midPoint = begin; //[begin ; midPoint) contain elems smaller than pivot + + //partition the array except for last elem (the pivot itself) + for(int i = begin; i + 1< end; i++) + { + if(arr[i] < pivot) + { + TNL::swap(arr[i], arr[midPoint]); + midPoint++; //increase boundary + } + } + + //put pivot onto its correct position, now [begin, midpoint] is sorted + TNL::swap(arr[midPoint], arr[pivotIdx]); + + //sorts all elems before midPoint(which is pivot now) + quicksortCuda<<<1, 1>>>(arr, begin, midPoint); + + //sorts all elems after(bigger than) midPoint + quicksortCuda<<<1, 1>>>(arr, midPoint+1, end); } -- GitLab From 0686d9b43698729c1dc9d26dcc33765cd0ced36d Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Wed, 3 Mar 2021 18:40:25 +0100 Subject: [PATCH 063/258] rewrite --- GPUSort/quicksort/quicksort.cuh | 34 +++++++-------------------------- 1 file changed, 7 insertions(+), 27 deletions(-) diff --git a/GPUSort/quicksort/quicksort.cuh b/GPUSort/quicksort/quicksort.cuh index 1b55bc0c8..a359fe6fa 100644 --- a/GPUSort/quicksort/quicksort.cuh +++ b/GPUSort/quicksort/quicksort.cuh @@ -2,38 +2,18 @@ #include -__global__ void quicksortCuda(TNL::Containers::ArrayView arr, int begin, int end) +void quicksort(TNL::Containers::ArrayView arr, int begin, int end) { - if(begin >= end) - return; + if(begin >= end) return; - int pivotIdx = end - 1; - int pivot = arr[pivotIdx]; - - int midPoint = begin; //[begin ; midPoint) contain elems smaller than pivot - - //partition the array except for last elem (the pivot itself) - for(int i = begin; i + 1< end; i++) - { - if(arr[i] < pivot) - { - TNL::swap(arr[i], arr[midPoint]); - midPoint++; //increase boundary - } - } - - //put pivot onto its correct position, now [begin, midpoint] is sorted - 
TNL::swap(arr[midPoint], arr[pivotIdx]); - - //sorts all elems before midPoint(which is pivot now) - quicksortCuda<<<1, 1>>>(arr, begin, midPoint); - - //sorts all elems after(bigger than) midPoint - quicksortCuda<<<1, 1>>>(arr, midPoint+1, end); + int newPivotPos = partition(arr, begin, end, end-1); + quicksort(arr, begin, newPivotPos); + quicksort(arr, newPivotPos + 1, end); + cudaDeviceSynchronize(); } void quicksort(TNL::Containers::ArrayView arr) { - quicksortCuda<<<1, 1>>>(arr, 0, arr.getSize()); + quicksort(arr, 0, arr.getSize()); } -- GitLab From 8319727554b1899e37ab0f6b412dbb04ebe0adc5 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Wed, 3 Mar 2021 18:42:48 +0100 Subject: [PATCH 064/258] calculating amount of blocks needed --- GPUSort/quicksort/quicksort.cuh | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/GPUSort/quicksort/quicksort.cuh b/GPUSort/quicksort/quicksort.cuh index a359fe6fa..f8328fe3c 100644 --- a/GPUSort/quicksort/quicksort.cuh +++ b/GPUSort/quicksort/quicksort.cuh @@ -2,6 +2,34 @@ #include +int partition(TNL::Containers::ArrayView arr, int begin, int end, int pivotIdx) +{ + int size = end - begin; + const int threadsPerBlock = 512, maxBlocks = 1<<14; //16k + int elemPerBlock, blocks; + + int setsNeeded = size/threadsPerBlock + (size % threadsPerBlock != 0); + if(setsNeeded <= blocks) + { + blocks = setsNeeded; + elemPerBlock = threadsPerBlock; + } + else + { + int setsPerBlock = setsNeeded/blocks + 1; //+1 to spread out task of the last block + elemPerBlock *= setsPerBlock; + blocks = size / elemPerBlock + (size % elemPerBlock != 0); + } + + //------------------------------------ + + + +} + +//----------------------------------------------------------------------------------------- +//----------------------------------------------------------------------------------------- + void quicksort(TNL::Containers::ArrayView arr, int begin, int end) { if(begin >= end) return; -- GitLab From 
c546587cd2d233f4c46001f889b6fdf96fe20f20 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Wed, 3 Mar 2021 18:54:56 +0100 Subject: [PATCH 065/258] calc smaller and bigger in thread --- GPUSort/quicksort/quicksort.cuh | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/GPUSort/quicksort/quicksort.cuh b/GPUSort/quicksort/quicksort.cuh index f8328fe3c..3a32061c5 100644 --- a/GPUSort/quicksort/quicksort.cuh +++ b/GPUSort/quicksort/quicksort.cuh @@ -2,6 +2,24 @@ #include +__global__ void cudaPartition(TNL::Containers::ArrayView arr, + int begin, int end, int pivotIdx, int * newPivotPos, + int elemPerBlock) +{ + const int myBegin = begin + elemPerBlock*blockIdx.x; + const int myEnd = TNL::min(end - 1, myBegin + elemPerBlock); //important, pivot is at the end + + int pivot = arr[pivotIdx]; + + int smaller = 0; bigger = 0; + for(int i = myBegin + threadIdx.x; i < myEnd; i+= threadIdx.x) + { + int data = arr[i]; + if(data < pivot) smaller++; + else bigger++; + } +} + int partition(TNL::Containers::ArrayView arr, int begin, int end, int pivotIdx) { int size = end - begin; @@ -9,7 +27,7 @@ int partition(TNL::Containers::ArrayView arr, int begin int elemPerBlock, blocks; int setsNeeded = size/threadsPerBlock + (size % threadsPerBlock != 0); - if(setsNeeded <= blocks) + if(setsNeeded <= maxBlocks) { blocks = setsNeeded; elemPerBlock = threadsPerBlock; @@ -22,9 +40,10 @@ int partition(TNL::Containers::ArrayView arr, int begin } //------------------------------------ + TNL::Containers::Array newPivotPos; + cudaPartition<<>>(arr, begin, end, pivotIdx, newPivotPos.getData(), elemPerBlock); - - + return newPivotPos.getElement(0); } //----------------------------------------------------------------------------------------- -- GitLab From 44a0fbd589840c602feb30177a2e923b3ece4a46 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Wed, 3 Mar 2021 18:58:15 +0100 Subject: [PATCH 066/258] refactor counting smaller and bigger into function 
--- GPUSort/quicksort/quicksort.cuh | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/GPUSort/quicksort/quicksort.cuh b/GPUSort/quicksort/quicksort.cuh index 3a32061c5..6ffd8eebc 100644 --- a/GPUSort/quicksort/quicksort.cuh +++ b/GPUSort/quicksort/quicksort.cuh @@ -2,6 +2,17 @@ #include +__device__ void cmpElem(TNL::Containers::ArrayView arr, int myBegin, int myEnd, + int pivot, int &smaller, int&bigger) +{ + for(int i = myBegin + threadIdx.x; i < myEnd; i+= threadIdx.x) + { + int data = arr[i]; + if(data < pivot) smaller++; + else bigger++; + } +} + __global__ void cudaPartition(TNL::Containers::ArrayView arr, int begin, int end, int pivotIdx, int * newPivotPos, int elemPerBlock) @@ -10,14 +21,11 @@ __global__ void cudaPartition(TNL::Containers::ArrayView arr, int begin, int end, int pivotIdx) -- GitLab From ead45c26cd7971ffa6688a7e705d625fdad872b5 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Wed, 3 Mar 2021 19:53:13 +0100 Subject: [PATCH 067/258] reduction --- GPUSort/quicksort/quicksort.cuh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/GPUSort/quicksort/quicksort.cuh b/GPUSort/quicksort/quicksort.cuh index 6ffd8eebc..a1e5258e0 100644 --- a/GPUSort/quicksort/quicksort.cuh +++ b/GPUSort/quicksort/quicksort.cuh @@ -1,9 +1,10 @@ #pragma once #include +#include "reduction.cuh" __device__ void cmpElem(TNL::Containers::ArrayView arr, int myBegin, int myEnd, - int pivot, int &smaller, int&bigger) + int pivot, int &smaller, int&bigger) { for(int i = myBegin + threadIdx.x; i < myEnd; i+= threadIdx.x) { @@ -24,7 +25,8 @@ __global__ void cudaPartition(TNL::Containers::ArrayView Date: Wed, 3 Mar 2021 19:53:26 +0100 Subject: [PATCH 068/258] reduction file --- GPUSort/quicksort/reduction.cuh | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) create mode 100644 GPUSort/quicksort/reduction.cuh diff --git a/GPUSort/quicksort/reduction.cuh b/GPUSort/quicksort/reduction.cuh new file 
mode 100644 index 000000000..b78cb20d9 --- /dev/null +++ b/GPUSort/quicksort/reduction.cuh @@ -0,0 +1,33 @@ +#pragma once +/** + * https://developer.nvidia.com/blog/faster-parallel-reductions-kepler/ + * */ + +__device__ int warpReduceSum(int initVal) +{ + const unsigned int maskConstant = 0xffffffff; //not used + for (unsigned int mask = warpSize / 2; mask > 0; mask >>= 1) + initVal += __shfl_xor_sync(maskConstant, initVal, mask); + + return initVal; +} + +__device__ int blockReduceSum(int val) +{ + static __shared__ int shared[32]; + int lane = threadIdx.x & (warpSize - 1); + int wid = threadIdx.x / warpSize; + + val = warpReduceSum(val); + + if (lane == 0) + shared[wid] = val; + __syncthreads(); + + val = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0; + + if (wid == 0) + val = warpReduceSum(val); + + return val; +} \ No newline at end of file -- GitLab From d81051220b18d57d7a27ab26996099c996a72d3d Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Wed, 3 Mar 2021 21:43:29 +0100 Subject: [PATCH 069/258] partitioning --- GPUSort/quicksort/quicksort.cuh | 81 +++++++++++++++++++++++---------- GPUSort/quicksort/reduction.cuh | 6 ++- 2 files changed, 63 insertions(+), 24 deletions(-) diff --git a/GPUSort/quicksort/quicksort.cuh b/GPUSort/quicksort/quicksort.cuh index a1e5258e0..bf4f87dd3 100644 --- a/GPUSort/quicksort/quicksort.cuh +++ b/GPUSort/quicksort/quicksort.cuh @@ -3,55 +3,89 @@ #include #include "reduction.cuh" -__device__ void cmpElem(TNL::Containers::ArrayView arr, int myBegin, int myEnd, - int pivot, int &smaller, int&bigger) +using CudaArrayView = TNL::Containers::ArrayView; + +__device__ void cmpElem(CudaArrayView arr, int myBegin, int myEnd, int pivot, int &smaller, int &bigger) +{ + for (int i = myBegin + threadIdx.x; i < myEnd; i += threadIdx.x) + { + int data = arr[i]; + if (data < pivot) + smaller++; + else + bigger++; + } +} + +__device__ void copyData(CudaArrayView arr, int myBegin, int myEnd, int pivot, + CudaArrayView aux, int 
smallerStart, int biggerStart) { - for(int i = myBegin + threadIdx.x; i < myEnd; i+= threadIdx.x) + for (int i = myBegin + threadIdx.x; i < myEnd; i += blockDim.x) { int data = arr[i]; - if(data < pivot) smaller++; - else bigger++; + if (data < pivot) + aux[smallerStart++] = data; + else + aux[biggerStart++] = data; } } -__global__ void cudaPartition(TNL::Containers::ArrayView arr, - int begin, int end, int pivotIdx, int * newPivotPos, - int elemPerBlock) +__global__ void cudaPartition(CudaArrayView arr, int begin, int end, + CudaArrayView aux, int *auxBeginIdx, int *auxEndIdx, + int pivotIdx, int *newPivotPos, + int elemPerBlock) { - const int myBegin = begin + elemPerBlock*blockIdx.x; + static __shared__ int sharedMem[2]; + int *smallerStart = sharedMem, *biggerStart = smallerStart + 1; + + const int myBegin = begin + elemPerBlock * blockIdx.x; const int myEnd = TNL::min(end - 1, myBegin + elemPerBlock); //important, pivot is at the end int pivot = arr[pivotIdx]; int smaller = 0, bigger = 0; - cmpElem(arr, myBegin, myEnd, pivot, smaller, bigger); + cmpElem(arr, myBegin, myEnd, pivot, smaller, bigger); + + int smallerOffset = blockReduceSum(smaller); + int biggerOffset = blockReduceSum(bigger); - smaller = blockReduceSum(smaller); - bigger = blockReduceSum(bigger); + if (threadIdx.x == 0) + { + *smallerStart = atomicAdd(auxBeginIdx, smallerOffset); + *biggerStart = atomicAdd(auxEndIdx, -biggerOffset) - biggerOffset; + } + __syncthreads(); + int auxThreadSmallerBegin = atomicAdd(smallerStart, smaller); + int auxThreadBiggerBegin = atomicAdd(biggerStart, bigger); + copyData(arr, myBegin, myEnd, pivot, aux, auxThreadSmallerBegin, auxThreadBiggerBegin); } -int partition(TNL::Containers::ArrayView arr, int begin, int end, int pivotIdx) +int partition(CudaArrayView arr, int begin, int end, int pivotIdx) { int size = end - begin; - const int threadsPerBlock = 512, maxBlocks = 1<<14; //16k + const int threadsPerBlock = 512, maxBlocks = 1 << 14; //16k int elemPerBlock, 
blocks; - - int setsNeeded = size/threadsPerBlock + (size % threadsPerBlock != 0); - if(setsNeeded <= maxBlocks) + + int setsNeeded = size / threadsPerBlock + (size % threadsPerBlock != 0); + if (setsNeeded <= maxBlocks) { blocks = setsNeeded; elemPerBlock = threadsPerBlock; } else { - int setsPerBlock = setsNeeded/blocks + 1; //+1 to spread out task of the last block + int setsPerBlock = setsNeeded / blocks + 1; //+1 to spread out task of the last block elemPerBlock *= setsPerBlock; blocks = size / elemPerBlock + (size % elemPerBlock != 0); } //------------------------------------ + TNL::Containers::Array aux(end - begin), cudaAuxBegin({0}), cudaAuxEnd({end}); TNL::Containers::Array newPivotPos; - cudaPartition<<>>(arr, begin, end, pivotIdx, newPivotPos.getData(), elemPerBlock); + cudaPartition<<>>(arr, begin, end, + aux, cudaAuxBegin.getData(), cudaAuxEnd.getData(), + pivotIdx, newPivotPos.getData(), + elemPerBlock); return newPivotPos.getElement(0); } @@ -59,18 +93,19 @@ int partition(TNL::Containers::ArrayView arr, int begin //----------------------------------------------------------------------------------------- //----------------------------------------------------------------------------------------- -void quicksort(TNL::Containers::ArrayView arr, int begin, int end) +void quicksort(CudaArrayView arr, int begin, int end) { - if(begin >= end) return; + if (begin >= end) + return; - int newPivotPos = partition(arr, begin, end, end-1); + int newPivotPos = partition(arr, begin, end, end - 1); quicksort(arr, begin, newPivotPos); quicksort(arr, newPivotPos + 1, end); cudaDeviceSynchronize(); } -void quicksort(TNL::Containers::ArrayView arr) +void quicksort(CudaArrayView arr) { quicksort(arr, 0, arr.getSize()); } diff --git a/GPUSort/quicksort/reduction.cuh b/GPUSort/quicksort/reduction.cuh index b78cb20d9..3efb309b8 100644 --- a/GPUSort/quicksort/reduction.cuh +++ b/GPUSort/quicksort/reduction.cuh @@ -29,5 +29,9 @@ __device__ int blockReduceSum(int val) if (wid 
== 0) val = warpReduceSum(val); - return val; + if(threadIdx.x == 0) + shared[0] = val; + __syncthreads(); + + return shared[0]; } \ No newline at end of file -- GitLab From b1c921546130a3e2e806add4a3fdd341fdd87180 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Wed, 3 Mar 2021 22:28:38 +0100 Subject: [PATCH 070/258] copy in aux and write pivot --- GPUSort/quicksort/quicksort.cuh | 35 ++++++++++++++++++++++++--------- 1 file changed, 26 insertions(+), 9 deletions(-) diff --git a/GPUSort/quicksort/quicksort.cuh b/GPUSort/quicksort/quicksort.cuh index bf4f87dd3..283cc60be 100644 --- a/GPUSort/quicksort/quicksort.cuh +++ b/GPUSort/quicksort/quicksort.cuh @@ -3,11 +3,13 @@ #include #include "reduction.cuh" +#define deb(x) std::cout << #x << " = " << x << std::endl; + using CudaArrayView = TNL::Containers::ArrayView; __device__ void cmpElem(CudaArrayView arr, int myBegin, int myEnd, int pivot, int &smaller, int &bigger) { - for (int i = myBegin + threadIdx.x; i < myEnd; i += threadIdx.x) + for (int i = myBegin + threadIdx.x; i < myEnd; i += blockDim.x) { int data = arr[i]; if (data < pivot) @@ -58,6 +60,14 @@ __global__ void cudaPartition(CudaArrayView arr, int begin, int end, int auxThreadSmallerBegin = atomicAdd(smallerStart, smaller); int auxThreadBiggerBegin = atomicAdd(biggerStart, bigger); copyData(arr, myBegin, myEnd, pivot, aux, auxThreadSmallerBegin, auxThreadBiggerBegin); + __syncthreads(); + + //inserts pivot + if (threadIdx.x * blockIdx.x == 0) + { + aux[*auxEndIdx - 1] = pivot; + *newPivotPos = *auxEndIdx - 1; + } } int partition(CudaArrayView arr, int begin, int end, int pivotIdx) @@ -80,13 +90,21 @@ int partition(CudaArrayView arr, int begin, int end, int pivotIdx) } //------------------------------------ - TNL::Containers::Array aux(end - begin), cudaAuxBegin({0}), cudaAuxEnd({end}); - TNL::Containers::Array newPivotPos; - cudaPartition<<>>(arr, begin, end, - aux, cudaAuxBegin.getData(), cudaAuxEnd.getData(), - pivotIdx, newPivotPos.getData(), - 
elemPerBlock); - + TNL::Containers::Array aux(arr.getSize()); + TNL::Algorithms::MultiDeviceMemoryOperations:: + copy(aux.getData(), arr.getData(), arr.getSize()); + + TNL::Containers::Array cudaAuxBegin({begin}), cudaAuxEnd({end}), newPivotPos(1); + + //------------------------------------ + cudaPartition<<>>(arr, begin, end, + aux, cudaAuxBegin.getData(), cudaAuxEnd.getData(), + pivotIdx, newPivotPos.getData(), + elemPerBlock); + + //------------------------------------ + TNL::Algorithms::MultiDeviceMemoryOperations:: + copy(arr.getData(), aux.getData(), aux.getSize()); return newPivotPos.getElement(0); } @@ -97,7 +115,6 @@ void quicksort(CudaArrayView arr, int begin, int end) { if (begin >= end) return; - int newPivotPos = partition(arr, begin, end, end - 1); quicksort(arr, begin, newPivotPos); quicksort(arr, newPivotPos + 1, end); -- GitLab From d057cd20176e42025415bf448496a2cf0cd08238 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Wed, 3 Mar 2021 22:36:58 +0100 Subject: [PATCH 071/258] benchmark --- GPUSort/quicksort/benchmark/Makefile | 25 +++++++ GPUSort/quicksort/benchmark/benchmark.cu | 92 ++++++++++++++++++++++++ 2 files changed, 117 insertions(+) create mode 100644 GPUSort/quicksort/benchmark/Makefile create mode 100644 GPUSort/quicksort/benchmark/benchmark.cu diff --git a/GPUSort/quicksort/benchmark/Makefile b/GPUSort/quicksort/benchmark/Makefile new file mode 100644 index 000000000..9f523a7de --- /dev/null +++ b/GPUSort/quicksort/benchmark/Makefile @@ -0,0 +1,25 @@ +include ../config.mk + +CUDA_SOURCES := $(wildcard *.cu) +CUDA_TARGETS := $(CUDA_SOURCES:%.cu=%) + +## targets definitions follow +.PHONY: all host cuda +all: cuda +cuda: $(CUDA_TARGETS) + +run: cuda + ./$(CUDA_TARGETS) + + +.PHONY: clean +clean: + rm -f *.d *.o *.cuo $(CUDA_TARGETS) + +# use .cuo instead of .cu.o to avoid problems with the implicit rules: https://stackoverflow.com/q/62967939 +# (and use the host compiler for linking CUDA, nvcc does not understand that .cuo is an 
object file) +$(CUDA_TARGETS): % : %.cuo + $(CXX) $(CUDA_LDFLAGS) -o $@ $< $(CUDA_LDLIBS) + +$(CUDA_SOURCES:%.cu=%.cuo): %.cuo : %.cu + $(CUDA_CXX) $(CUDA_CPPFLAGS) $(CUDA_CXXFLAGS) -c -o $@ $< diff --git a/GPUSort/quicksort/benchmark/benchmark.cu b/GPUSort/quicksort/benchmark/benchmark.cu new file mode 100644 index 000000000..66c201d36 --- /dev/null +++ b/GPUSort/quicksort/benchmark/benchmark.cu @@ -0,0 +1,92 @@ +#include +#include +#include + +#include + +#include "../quicksort.cuh" +#include "../../util/timer.h" + +using namespace TNL; +using namespace TNL::Containers; +using namespace std; + +typedef Devices::Cuda Device; + +int main() +{ + srand(8151); + for(int pow = 5; pow <= 10; pow++) + { + int size =(1<< pow); + + vector vec(size); + iota(vec.begin(), vec.end(), 0); + + Array arr; + vector resAcc; + + //sorted sequence + { + arr = vec; + auto view = arr.getView(); + + { + TIMER t([&](double res){resAcc.push_back(res);}); + quicksort(view); + } + } + + //almost sorted sequence + { + for(int i = 0; i < 3; i++) + { + int s = rand() % (size - 3); + std::swap(vec[s], vec[s + 1]); + } + + arr = vec; + auto view = arr.getView(); + + { + TIMER t([&](double res){resAcc.push_back(res);}); + quicksort(view); + } + } + + //decreasing sequence + { + for(size_t i = 0; i < size; i++) + vec[i] = -i; + + arr = vec; + auto view = arr.getView(); + + { + TIMER t([&](double res){resAcc.push_back(res);}); + quicksort(view); + } + } + + //random sequence + { + random_shuffle(vec.begin(), vec.end()); + + arr = vec; + auto view = arr.getView(); + + { + TIMER t([&](double res){resAcc.push_back(res);}); + quicksort(view); + } + } + + + cout << "2^" << pow << " = "; + cout << fixed; + cout << setprecision(3); + cout << (accumulate(resAcc.begin(), resAcc.end(), 0.0) / resAcc.size()) << " ms" << endl; + } + + return 0; +} \ No newline at end of file -- GitLab From d6c3dbfbc298d6e19c9559c784758480135bc4a7 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Thu, 4 Mar 2021 01:03:22 
+0100 Subject: [PATCH 072/258] parallel block wide prefix sum --- GPUSort/quicksort/reduction.cuh | 34 +++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/GPUSort/quicksort/reduction.cuh b/GPUSort/quicksort/reduction.cuh index 3efb309b8..48d3ed32d 100644 --- a/GPUSort/quicksort/reduction.cuh +++ b/GPUSort/quicksort/reduction.cuh @@ -34,4 +34,38 @@ __device__ int blockReduceSum(int val) __syncthreads(); return shared[0]; +} + +__device__ int warpPrefixSum(int value) +{ + int laneId = threadIdx.x & 0x1f; + for (int i = 1; i*2 <= warpSize; i *= 2) + { + int n = __shfl_up_sync(0xffffffff, value, i); + if ((laneId & (warpSize - 1)) >= i) + value += n; + } + + return value; +} + +__device__ int blockPrefixSum(int value) +{ + static __shared__ int shared[32]; + int lane = threadIdx.x & (warpSize - 1); + int wid = threadIdx.x / warpSize; + + int tmp = warpPrefixSum(value); + + if (lane == warpSize-1) + shared[wid] = tmp; + __syncthreads(); + + int tmp2 = (threadIdx.x < blockDim.x / warpSize) ? 
shared[lane] : 0; + if (wid == 0) + shared[lane] = warpPrefixSum(tmp2) - shared[lane]; + __syncthreads(); + + tmp += shared[wid]; + return tmp; } \ No newline at end of file -- GitLab From 6df373b7731ace10d61ecf5362f37b96ff4e0652 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Thu, 4 Mar 2021 01:13:04 +0100 Subject: [PATCH 073/258] use prefixSum instead of reduction --- GPUSort/quicksort/quicksort.cuh | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/GPUSort/quicksort/quicksort.cuh b/GPUSort/quicksort/quicksort.cuh index 283cc60be..db734e1dc 100644 --- a/GPUSort/quicksort/quicksort.cuh +++ b/GPUSort/quicksort/quicksort.cuh @@ -47,19 +47,17 @@ __global__ void cudaPartition(CudaArrayView arr, int begin, int end, int smaller = 0, bigger = 0; cmpElem(arr, myBegin, myEnd, pivot, smaller, bigger); - int smallerOffset = blockReduceSum(smaller); - int biggerOffset = blockReduceSum(bigger); + int smallerOffset = blockPrefixSum(smaller); + int biggerOffset = blockPrefixSum(bigger); - if (threadIdx.x == 0) + if (threadIdx.x == blockDim.x - 1) { *smallerStart = atomicAdd(auxBeginIdx, smallerOffset); *biggerStart = atomicAdd(auxEndIdx, -biggerOffset) - biggerOffset; } __syncthreads(); - int auxThreadSmallerBegin = atomicAdd(smallerStart, smaller); - int auxThreadBiggerBegin = atomicAdd(biggerStart, bigger); - copyData(arr, myBegin, myEnd, pivot, aux, auxThreadSmallerBegin, auxThreadBiggerBegin); + copyData(arr, myBegin, myEnd, pivot, aux, (*smallerStart) + smallerOffset - smaller, (*biggerStart) + biggerOffset - bigger); __syncthreads(); //inserts pivot -- GitLab From a2f1e5b7c38a6dd7d444a9a42e15a6621d819049 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Fri, 5 Mar 2021 02:39:38 +0100 Subject: [PATCH 074/258] shared variables --- GPUSort/quicksort/quicksort.cuh | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/GPUSort/quicksort/quicksort.cuh b/GPUSort/quicksort/quicksort.cuh index db734e1dc..aa963009a 
100644 --- a/GPUSort/quicksort/quicksort.cuh +++ b/GPUSort/quicksort/quicksort.cuh @@ -37,13 +37,16 @@ __global__ void cudaPartition(CudaArrayView arr, int begin, int end, int pivotIdx, int *newPivotPos, int elemPerBlock) { - static __shared__ int sharedMem[2]; - int *smallerStart = sharedMem, *biggerStart = smallerStart + 1; + static __shared__ int smallerStart, biggerStart; + static __shared__ int pivot; const int myBegin = begin + elemPerBlock * blockIdx.x; const int myEnd = TNL::min(end - 1, myBegin + elemPerBlock); //important, pivot is at the end - int pivot = arr[pivotIdx]; + if(threadIdx.x == 0) + pivot = arr[pivotIdx]; + __syncthreads(); + int smaller = 0, bigger = 0; cmpElem(arr, myBegin, myEnd, pivot, smaller, bigger); @@ -52,12 +55,12 @@ __global__ void cudaPartition(CudaArrayView arr, int begin, int end, if (threadIdx.x == blockDim.x - 1) { - *smallerStart = atomicAdd(auxBeginIdx, smallerOffset); - *biggerStart = atomicAdd(auxEndIdx, -biggerOffset) - biggerOffset; + smallerStart = atomicAdd(auxBeginIdx, smallerOffset); + biggerStart = atomicAdd(auxEndIdx, -biggerOffset) - biggerOffset; } __syncthreads(); - copyData(arr, myBegin, myEnd, pivot, aux, (*smallerStart) + smallerOffset - smaller, (*biggerStart) + biggerOffset - bigger); + copyData(arr, myBegin, myEnd, pivot, aux, smallerStart + smallerOffset - smaller, biggerStart + biggerOffset - bigger); __syncthreads(); //inserts pivot -- GitLab From b01f1bdb377f923bd02ca340b29b844947112a67 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Fri, 5 Mar 2021 02:52:23 +0100 Subject: [PATCH 075/258] basic unit test --- GPUSort/quicksort/unitTests/Makefile | 26 ++++++++++ GPUSort/quicksort/unitTests/unitTests.cu | 60 ++++++++++++++++++++++++ 2 files changed, 86 insertions(+) create mode 100644 GPUSort/quicksort/unitTests/Makefile create mode 100644 GPUSort/quicksort/unitTests/unitTests.cu diff --git a/GPUSort/quicksort/unitTests/Makefile b/GPUSort/quicksort/unitTests/Makefile new file mode 100644 index 
000000000..4cf4ea6f0 --- /dev/null +++ b/GPUSort/quicksort/unitTests/Makefile @@ -0,0 +1,26 @@ +include ../config.mk + +CUDA_SOURCES := $(wildcard *.cu) +CUDA_TARGETS := $(CUDA_SOURCES:%.cu=%) + +GTEST := -lgtest -pthread + +## targets definitions follow +.PHONY: all host cuda +all: cuda +cuda: $(CUDA_TARGETS) + +run: cuda + ./$(CUDA_TARGETS) + +.PHONY: clean +clean: + rm -f *.d *.o *.cuo $(CUDA_TARGETS) + +# use .cuo instead of .cu.o to avoid problems with the implicit rules: https://stackoverflow.com/q/62967939 +# (and use the host compiler for linking CUDA, nvcc does not understand that .cuo is an object file) +$(CUDA_TARGETS): % : %.cuo + $(CXX) $(CUDA_LDFLAGS) -o $@ $< $(CUDA_LDLIBS) $(GTEST) + +$(CUDA_SOURCES:%.cu=%.cuo): %.cuo : %.cu + $(CUDA_CXX) $(CUDA_CPPFLAGS) $(CUDA_CXXFLAGS) -c -o $@ $< diff --git a/GPUSort/quicksort/unitTests/unitTests.cu b/GPUSort/quicksort/unitTests/unitTests.cu new file mode 100644 index 000000000..265548237 --- /dev/null +++ b/GPUSort/quicksort/unitTests/unitTests.cu @@ -0,0 +1,60 @@ +#include "gtest/gtest.h" +#include +#include +#include +#include + +#include +#include +#include "../quicksort.cuh" +#include "../../util/algorithm.h" + +//---------------------------------------------------------------------------------- + +TEST(selectedSize, size15) +{ + TNL::Containers::Array cudaArr{5, 9, 4, 8, 6, 1, 2, 3, 4, 8, 1, 6, 9, 4, 9}; + auto view = cudaArr.getView(); + ASSERT_EQ(15, view.getSize()) << "size not 15" << std::endl; + quicksort(view); + ASSERT_TRUE(is_sorted(view)) << "result " << view << std::endl; +} + +TEST(multiblock, 32768_decreasingNegative) +{ + std::vector arr(1<<15); + for (size_t i = 0; i < arr.size(); i++) + arr[i] = -i; + + TNL::Containers::Array cudaArr(arr); + auto view = cudaArr.getView(); + + quicksort(view); + ASSERT_TRUE(is_sorted(view)) << "result " << view << std::endl; +} + +TEST(randomGenerated, smallArray_randomVal) +{ + std::srand(2006); + for(int i = 0; i < 100; i++) + { + std::vector 
arr(std::rand()%(1<<10)); + for(auto & x : arr) + x = std::rand(); + + TNL::Containers::Array cudaArr(arr); + + auto view = cudaArr.getView(); + quicksort(view); + ASSERT_TRUE(is_sorted(view)); + } +} + +//---------------------------------------------------------------------------------- + +int main(int argc, char **argv) +{ + testing::InitGoogleTest(&argc, argv); + + return RUN_ALL_TESTS(); +} \ No newline at end of file -- GitLab From 1e2082f44bd966af4962613c14b7fab4b1465e89 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Fri, 5 Mar 2021 02:52:43 +0100 Subject: [PATCH 076/258] fix blockwide pivot write --- GPUSort/quicksort/quicksort.cuh | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/GPUSort/quicksort/quicksort.cuh b/GPUSort/quicksort/quicksort.cuh index aa963009a..1f085f0d8 100644 --- a/GPUSort/quicksort/quicksort.cuh +++ b/GPUSort/quicksort/quicksort.cuh @@ -34,8 +34,7 @@ __device__ void copyData(CudaArrayView arr, int myBegin, int myEnd, int pivot, __global__ void cudaPartition(CudaArrayView arr, int begin, int end, CudaArrayView aux, int *auxBeginIdx, int *auxEndIdx, - int pivotIdx, int *newPivotPos, - int elemPerBlock) + int pivotIdx, int elemPerBlock) { static __shared__ int smallerStart, biggerStart; static __shared__ int pivot; @@ -61,14 +60,6 @@ __global__ void cudaPartition(CudaArrayView arr, int begin, int end, __syncthreads(); copyData(arr, myBegin, myEnd, pivot, aux, smallerStart + smallerOffset - smaller, biggerStart + biggerOffset - bigger); - __syncthreads(); - - //inserts pivot - if (threadIdx.x * blockIdx.x == 0) - { - aux[*auxEndIdx - 1] = pivot; - *newPivotPos = *auxEndIdx - 1; - } } int partition(CudaArrayView arr, int begin, int end, int pivotIdx) @@ -95,18 +86,22 @@ int partition(CudaArrayView arr, int begin, int end, int pivotIdx) TNL::Algorithms::MultiDeviceMemoryOperations:: copy(aux.getData(), arr.getData(), arr.getSize()); - TNL::Containers::Array cudaAuxBegin({begin}), 
cudaAuxEnd({end}), newPivotPos(1); + TNL::Containers::Array cudaAuxBegin({begin}), cudaAuxEnd({end}); //------------------------------------ + + int pivot = arr.getElement(pivotIdx); cudaPartition<<>>(arr, begin, end, aux, cudaAuxBegin.getData(), cudaAuxEnd.getData(), - pivotIdx, newPivotPos.getData(), - elemPerBlock); - + pivotIdx, elemPerBlock); + cudaDeviceSynchronize(); + + pivotIdx = cudaAuxEnd.getElement(0) - 1; + aux.setElement(pivotIdx, pivot); //------------------------------------ TNL::Algorithms::MultiDeviceMemoryOperations:: copy(arr.getData(), aux.getData(), aux.getSize()); - return newPivotPos.getElement(0); + return pivotIdx; } //----------------------------------------------------------------------------------------- -- GitLab From 24f577488d26b4227a5cdd55b1e560fe191de477 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Fri, 5 Mar 2021 03:44:24 +0100 Subject: [PATCH 077/258] rename prefixSum to inclusivePrefixSum --- GPUSort/quicksort/quicksort.cuh | 4 ++-- GPUSort/quicksort/reduction.cuh | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/GPUSort/quicksort/quicksort.cuh b/GPUSort/quicksort/quicksort.cuh index 1f085f0d8..a6588f3ae 100644 --- a/GPUSort/quicksort/quicksort.cuh +++ b/GPUSort/quicksort/quicksort.cuh @@ -49,8 +49,8 @@ __global__ void cudaPartition(CudaArrayView arr, int begin, int end, int smaller = 0, bigger = 0; cmpElem(arr, myBegin, myEnd, pivot, smaller, bigger); - int smallerOffset = blockPrefixSum(smaller); - int biggerOffset = blockPrefixSum(bigger); + int smallerOffset = blockInclusivePrefixSum(smaller); + int biggerOffset = blockInclusivePrefixSum(bigger); if (threadIdx.x == blockDim.x - 1) { diff --git a/GPUSort/quicksort/reduction.cuh b/GPUSort/quicksort/reduction.cuh index 48d3ed32d..8bb5b2e40 100644 --- a/GPUSort/quicksort/reduction.cuh +++ b/GPUSort/quicksort/reduction.cuh @@ -36,7 +36,7 @@ __device__ int blockReduceSum(int val) return shared[0]; } -__device__ int warpPrefixSum(int value) 
+__device__ int warpInclusivePrefixSum(int value) { int laneId = threadIdx.x & 0x1f; for (int i = 1; i*2 <= warpSize; i *= 2) @@ -49,13 +49,13 @@ __device__ int warpPrefixSum(int value) return value; } -__device__ int blockPrefixSum(int value) +__device__ int blockInclusivePrefixSum(int value) { static __shared__ int shared[32]; int lane = threadIdx.x & (warpSize - 1); int wid = threadIdx.x / warpSize; - int tmp = warpPrefixSum(value); + int tmp = warpInclusivePrefixSum(value); if (lane == warpSize-1) shared[wid] = tmp; @@ -63,7 +63,7 @@ __device__ int blockPrefixSum(int value) int tmp2 = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0; if (wid == 0) - shared[lane] = warpPrefixSum(tmp2) - shared[lane]; + shared[lane] = warpInclusivePrefixSum(tmp2) - tmp2; __syncthreads(); tmp += shared[wid]; -- GitLab From 1443b8707fba018d266ab7894748fbb3cad97c31 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Fri, 5 Mar 2021 03:47:16 +0100 Subject: [PATCH 078/258] unroll for loop --- GPUSort/quicksort/reduction.cuh | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/GPUSort/quicksort/reduction.cuh b/GPUSort/quicksort/reduction.cuh index 8bb5b2e40..9c528c10b 100644 --- a/GPUSort/quicksort/reduction.cuh +++ b/GPUSort/quicksort/reduction.cuh @@ -49,6 +49,25 @@ __device__ int warpInclusivePrefixSum(int value) return value; } +/* +template +__device__ int warpInclusivePrefixSum(int value) +{ + if(it >= 2) + { + int i = it == 0? 
32 : 32/it; + int n = __shfl_up_sync(0xffffffff, value, i); + int laneId = threadIdx.x & 0x1f; + if ((laneId & (warpSize - 1)) >= i) + value += n; + return warpInclusivePrefixSum(value); + + } + + return value; +} +*/ + __device__ int blockInclusivePrefixSum(int value) { static __shared__ int shared[32]; -- GitLab From b7860af96143d78fa6caf7b8b8cab37508e37042 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Sat, 6 Mar 2021 02:40:50 +0100 Subject: [PATCH 079/258] template unrolling --- GPUSort/quicksort/reduction.cuh | 25 ++++++++----------------- 1 file changed, 8 insertions(+), 17 deletions(-) diff --git a/GPUSort/quicksort/reduction.cuh b/GPUSort/quicksort/reduction.cuh index 9c528c10b..234871c93 100644 --- a/GPUSort/quicksort/reduction.cuh +++ b/GPUSort/quicksort/reduction.cuh @@ -36,37 +36,28 @@ __device__ int blockReduceSum(int val) return shared[0]; } + +template __device__ int warpInclusivePrefixSum(int value) { - int laneId = threadIdx.x & 0x1f; - for (int i = 1; i*2 <= warpSize; i *= 2) + if(it*2 <= 32) { + int i = it; int n = __shfl_up_sync(0xffffffff, value, i); + int laneId = threadIdx.x & 0x1f; if ((laneId & (warpSize - 1)) >= i) value += n; + return warpInclusivePrefixSum= 32? 32 : it*2>(value); + } return value; } -/* -template __device__ int warpInclusivePrefixSum(int value) { - if(it >= 2) - { - int i = it == 0? 
32 : 32/it; - int n = __shfl_up_sync(0xffffffff, value, i); - int laneId = threadIdx.x & 0x1f; - if ((laneId & (warpSize - 1)) >= i) - value += n; - return warpInclusivePrefixSum(value); - - } - - return value; + return warpInclusivePrefixSum<1>(value); } -*/ __device__ int blockInclusivePrefixSum(int value) { -- GitLab From 62cf2f8d14c1006ed7391deb6bf9094005214cee Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Sat, 6 Mar 2021 03:55:27 +0100 Subject: [PATCH 080/258] change helper values structure and write pivot for last kernel --- GPUSort/quicksort/quicksort.cuh | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/GPUSort/quicksort/quicksort.cuh b/GPUSort/quicksort/quicksort.cuh index a6588f3ae..b5790b661 100644 --- a/GPUSort/quicksort/quicksort.cuh +++ b/GPUSort/quicksort/quicksort.cuh @@ -34,7 +34,7 @@ __device__ void copyData(CudaArrayView arr, int myBegin, int myEnd, int pivot, __global__ void cudaPartition(CudaArrayView arr, int begin, int end, CudaArrayView aux, int *auxBeginIdx, int *auxEndIdx, - int pivotIdx, int elemPerBlock) + int pivotIdx, int* newPivotIdx, int elemPerBlock, int * blockCount) { static __shared__ int smallerStart, biggerStart; static __shared__ int pivot; @@ -60,6 +60,15 @@ __global__ void cudaPartition(CudaArrayView arr, int begin, int end, __syncthreads(); copyData(arr, myBegin, myEnd, pivot, aux, smallerStart + smallerOffset - smaller, biggerStart + biggerOffset - bigger); + + if(threadIdx.x == 0) + { + if( atomicAdd(blockCount, -1) == 1) + { + *newPivotIdx = (*auxEndIdx) - 1; + aux[*newPivotIdx] = pivot; + } + } } int partition(CudaArrayView arr, int begin, int end, int pivotIdx) @@ -86,22 +95,20 @@ int partition(CudaArrayView arr, int begin, int end, int pivotIdx) TNL::Algorithms::MultiDeviceMemoryOperations:: copy(aux.getData(), arr.getData(), arr.getSize()); - TNL::Containers::Array cudaAuxBegin({begin}), cudaAuxEnd({end}); + TNL::Containers::Array helper({begin, end, 0, 
blocks}); //------------------------------------ - - int pivot = arr.getElement(pivotIdx); + cudaPartition<<>>(arr, begin, end, - aux, cudaAuxBegin.getData(), cudaAuxEnd.getData(), - pivotIdx, elemPerBlock); - cudaDeviceSynchronize(); - - pivotIdx = cudaAuxEnd.getElement(0) - 1; - aux.setElement(pivotIdx, pivot); + aux, helper.getData(), helper.getData() + 1, + pivotIdx, helper.getData() + 2, elemPerBlock, helper.getData() + 3); + //------------------------------------ + TNL::Algorithms::MultiDeviceMemoryOperations:: copy(arr.getData(), aux.getData(), aux.getSize()); - return pivotIdx; + + return helper.getElement(2); } //----------------------------------------------------------------------------------------- -- GitLab From 743873260a1125f3076f579441058f0786fac580 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Sat, 6 Mar 2021 23:56:46 +0100 Subject: [PATCH 081/258] quicksort by depth in class --- GPUSort/quicksort/quicksort.cuh | 116 ++++++++++++++++++++++++++++++++ GPUSort/quicksort/task.h | 18 +++++ 2 files changed, 134 insertions(+) create mode 100644 GPUSort/quicksort/task.h diff --git a/GPUSort/quicksort/quicksort.cuh b/GPUSort/quicksort/quicksort.cuh index b5790b661..80d1cce15 100644 --- a/GPUSort/quicksort/quicksort.cuh +++ b/GPUSort/quicksort/quicksort.cuh @@ -2,6 +2,8 @@ #include #include "reduction.cuh" +#include "task.h" +#include #define deb(x) std::cout << #x << " = " << x << std::endl; @@ -125,6 +127,120 @@ void quicksort(CudaArrayView arr, int begin, int end) cudaDeviceSynchronize(); } +//----------------------------------------------------------- +class QUICKSORT +{ + static const int threadsPerBlock = 512, maxBlocks = 1 << 14; //16k + const int maxTasks = 1<<20; + + CudaArrayView arr; + int begin, end; + int size; + TNL::Containers::Array aux; + + TNL::Containers::Array host_tasks; + TNL::Containers::Array cuda_tasks; + TNL::Containers::Array newTasks; + int tasksAmount; + + TNL::Containers::Array host_blockToTaskMapping; + 
TNL::Containers::Array cuda_blockToTaskMapping; + + //-------------------------------------------------------------------------------------- + + int getSetsNeeded() + { + auto view = host_tasks.getView(); + auto fetch = [=] __cuda_callable__ (int i) { + auto task = view.getElement(i); + int size = task.arrEnd - task.arrBegin; + return size / threadsPerBlock + (size % threadsPerBlock != 0); + }; + auto reduction = [] __cuda_callable__(int a, int b) {return a + b;}; + return TNL::Algorithms::Reduction::reduce(0, tasksAmount, reduction, fetch, 0); + } + + std::pair calcConfig() + { + int setsNeeded = getSetsNeeded(); + + if(setsNeeded <= maxBlocks) + return {setsNeeded, threadsPerBlock}; + + int setsPerBlock = setsNeeded / maxBlocks + 1; //+1 to spread out task of the last block + int elemPerBlock = setsPerBlock * threadsPerBlock; + int blocks = size / elemPerBlock + (size % elemPerBlock != 0); + return {blocks, elemPerBlock}; + } + + int initTasks(std::pair blocks_elemPerBlock) + { + int elemPerBlock = blocks_elemPerBlock.second; + auto host_tasksView = host_tasks.getView(); + int blockToTaskMapping_Cnt = 0; + + for(int i = 0; i < tasksAmount; ++i) + { + TASK & task = host_tasks[i]; + int size = task.arrEnd - task.arrBegin; + int blocksNeeded = size / elemPerBlock + (size % elemPerBlock != 0); + + task.firstBlock = blockToTaskMapping_Cnt; + task.blockCount = blocksNeeded; + + for(int set = 0; set < blocksNeeded; set++) + host_blockToTaskMapping[blockToTaskMapping_Cnt++] = i; + } + + TNL::Algorithms::MultiDeviceMemoryOperations:: + copy(cuda_tasks.getData(), host_tasks.getData(), host_tasks.getSize()); + + TNL::Algorithms::MultiDeviceMemoryOperations:: + copy(cuda_blockToTaskMapping.getData(), host_blockToTaskMapping.getData(), host_blockToTaskMapping.getSize()); + + if(blockToTaskMapping_Cnt != blocks_elemPerBlock.first) + { + std::cerr << "blockToTaskMapping_Cnt != blocks_elemPerBlock" << std::endl; + class INVALID_CONFIG{}; + throw INVALID_CONFIG(); + } + + return 
blockToTaskMapping_Cnt; + } + +public: + QUICKSORT(CudaArrayView arr, int begin, int end) + : arr(arr.getView()), begin(begin), end(end), + size(end - begin), aux(size), + host_tasks(maxTasks), cuda_tasks(maxTasks), newTasks(maxTasks), + host_blockToTaskMapping(maxBlocks), cuda_blockToTaskMapping(maxBlocks) + { + int pivotIdx = end - 1; + host_tasks[0] = TASK(begin, end, 0, size, pivotIdx); + tasksAmount = 1; + } + + void sort() + { + while(tasksAmount > 0) + { + std::pair blocks_elemPerBlock = calcConfig(); + int blocksCnt = initTasks(blocks_elemPerBlock); + /* + partition(arr, aux.getView(), + cuda_tasks.getView(), cuda_blockToTaskMapping.getView() + newTasks.getView()); + */ + processTasks(); + } + + //2nd phase to finish + } + +}; + +//----------------------------------------------------------- + void quicksort(CudaArrayView arr) { quicksort(arr, 0, arr.getSize()); diff --git a/GPUSort/quicksort/task.h b/GPUSort/quicksort/task.h new file mode 100644 index 000000000..38f5ed49a --- /dev/null +++ b/GPUSort/quicksort/task.h @@ -0,0 +1,18 @@ +#pragma once + +struct TASK +{ + int arrBegin, arrEnd;//start and end position of array to read from + int auxBeginIdx, auxEndIdx; //start and end position of still available memory to write into + int pivotPos; //input pivot pos and output + int firstBlock, blockCount; //shared counter of blocks working together(how many are still working) + + TASK(int srcBegin, int srcEnd, int destBegin, int destEnd, int pivotPos) + : arrBegin(srcBegin), arrEnd(srcEnd), + auxBeginIdx(destBegin), auxEndIdx(destEnd), + pivotPos(pivotPos), + firstBlock(-1), blockCount(-1) + {} + TASK() = default; + +}; \ No newline at end of file -- GitLab From 00a35e3639c7ffc86d249d3aa1895493477d70b1 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Sat, 6 Mar 2021 23:59:56 +0100 Subject: [PATCH 082/258] remove cpu function --- GPUSort/quicksort/quicksort.cuh | 58 ++------------------------------- 1 file changed, 3 insertions(+), 55 deletions(-) diff --git 
a/GPUSort/quicksort/quicksort.cuh b/GPUSort/quicksort/quicksort.cuh index 80d1cce15..9164d2ba5 100644 --- a/GPUSort/quicksort/quicksort.cuh +++ b/GPUSort/quicksort/quicksort.cuh @@ -73,60 +73,6 @@ __global__ void cudaPartition(CudaArrayView arr, int begin, int end, } } -int partition(CudaArrayView arr, int begin, int end, int pivotIdx) -{ - int size = end - begin; - const int threadsPerBlock = 512, maxBlocks = 1 << 14; //16k - int elemPerBlock, blocks; - - int setsNeeded = size / threadsPerBlock + (size % threadsPerBlock != 0); - if (setsNeeded <= maxBlocks) - { - blocks = setsNeeded; - elemPerBlock = threadsPerBlock; - } - else - { - int setsPerBlock = setsNeeded / blocks + 1; //+1 to spread out task of the last block - elemPerBlock *= setsPerBlock; - blocks = size / elemPerBlock + (size % elemPerBlock != 0); - } - - //------------------------------------ - TNL::Containers::Array aux(arr.getSize()); - TNL::Algorithms::MultiDeviceMemoryOperations:: - copy(aux.getData(), arr.getData(), arr.getSize()); - - TNL::Containers::Array helper({begin, end, 0, blocks}); - - //------------------------------------ - - cudaPartition<<>>(arr, begin, end, - aux, helper.getData(), helper.getData() + 1, - pivotIdx, helper.getData() + 2, elemPerBlock, helper.getData() + 3); - - //------------------------------------ - - TNL::Algorithms::MultiDeviceMemoryOperations:: - copy(arr.getData(), aux.getData(), aux.getSize()); - - return helper.getElement(2); -} - -//----------------------------------------------------------------------------------------- -//----------------------------------------------------------------------------------------- - -void quicksort(CudaArrayView arr, int begin, int end) -{ - if (begin >= end) - return; - int newPivotPos = partition(arr, begin, end, end - 1); - quicksort(arr, begin, newPivotPos); - quicksort(arr, newPivotPos + 1, end); - - cudaDeviceSynchronize(); -} - //----------------------------------------------------------- class QUICKSORT { @@ -226,12 
+172,14 @@ public: { std::pair blocks_elemPerBlock = calcConfig(); int blocksCnt = initTasks(blocks_elemPerBlock); + /* partition(arr, aux.getView(), cuda_tasks.getView(), cuda_blockToTaskMapping.getView() newTasks.getView()); */ - processTasks(); + + processTasks(); } //2nd phase to finish -- GitLab From 8ba705c35fb14bbb228e0043a780a0cee2729832 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Sun, 7 Mar 2021 02:10:10 +0100 Subject: [PATCH 083/258] write pivot --- GPUSort/quicksort/main.cu | 4 +- GPUSort/quicksort/quicksort.cuh | 121 ++++++++++++++++++++------------ GPUSort/quicksort/task.h | 2 + 3 files changed, 81 insertions(+), 46 deletions(-) diff --git a/GPUSort/quicksort/main.cu b/GPUSort/quicksort/main.cu index fa69fd747..6084d8665 100644 --- a/GPUSort/quicksort/main.cu +++ b/GPUSort/quicksort/main.cu @@ -4,12 +4,14 @@ #include #include +#include using namespace std; int main() { vector vec(19); - for(auto & x : vec) x = rand()%30; + iota(vec.begin(), vec.end(), 0); + random_shuffle(vec.begin(), vec.end()); TNL::Containers::Array arr(vec); auto view = arr.getView(); diff --git a/GPUSort/quicksort/quicksort.cuh b/GPUSort/quicksort/quicksort.cuh index 9164d2ba5..54e5bbde1 100644 --- a/GPUSort/quicksort/quicksort.cuh +++ b/GPUSort/quicksort/quicksort.cuh @@ -9,68 +9,111 @@ using CudaArrayView = TNL::Containers::ArrayView; -__device__ void cmpElem(CudaArrayView arr, int myBegin, int myEnd, int pivot, int &smaller, int &bigger) +__device__ void cmpElem(CudaArrayView arr, int myBegin, int myEnd, + int &smaller, int &bigger, + int pivot) { for (int i = myBegin + threadIdx.x; i < myEnd; i += blockDim.x) { int data = arr[i]; if (data < pivot) smaller++; - else + else if(data > pivot) bigger++; } } -__device__ void copyData(CudaArrayView arr, int myBegin, int myEnd, int pivot, - CudaArrayView aux, int smallerStart, int biggerStart) +__device__ void copyData(CudaArrayView arr, int myBegin, int myEnd, + CudaArrayView aux, int smallerStart, int biggerStart, + int 
pivot) { for (int i = myBegin + threadIdx.x; i < myEnd; i += blockDim.x) { int data = arr[i]; if (data < pivot) aux[smallerStart++] = data; - else + else if(data > pivot) aux[biggerStart++] = data; } } -__global__ void cudaPartition(CudaArrayView arr, int begin, int end, - CudaArrayView aux, int *auxBeginIdx, int *auxEndIdx, - int pivotIdx, int* newPivotIdx, int elemPerBlock, int * blockCount) +__global__ +void cudaPartition(CudaArrayView arr, CudaArrayView aux, int elemPerBlock, + TNL::Containers::ArrayView cuda_tasks, + TNL::Containers::ArrayView cuda_blockToTaskMapping, + TNL::Containers::ArrayView cuda_newTasks, + int * newTasksCnt) { static __shared__ int smallerStart, biggerStart; static __shared__ int pivot; - - const int myBegin = begin + elemPerBlock * blockIdx.x; - const int myEnd = TNL::min(end - 1, myBegin + elemPerBlock); //important, pivot is at the end + static __shared__ int myTaskIdx; + static __shared__ TASK myTask; + static __shared__ bool writePivot; if(threadIdx.x == 0) - pivot = arr[pivotIdx]; + { + myTaskIdx = cuda_blockToTaskMapping[blockIdx.x]; + myTask = cuda_tasks[myTaskIdx]; + pivot = arr[myTask.pivotPos]; + writePivot = false; + } __syncthreads(); + const int myBegin = myTask.arrBegin + elemPerBlock * (blockIdx.x - myTask.firstBlock); + const int myEnd = TNL::min(myTask.arrEnd, myBegin + elemPerBlock); + int smaller = 0, bigger = 0; cmpElem(arr, myBegin, myEnd, pivot, smaller, bigger); int smallerOffset = blockInclusivePrefixSum(smaller); int biggerOffset = blockInclusivePrefixSum(bigger); - if (threadIdx.x == blockDim.x - 1) + if (threadIdx.x == blockDim.x - 1) //last thread in block has sum of all values + { + smallerStart = atomicAdd(&(cuda_tasks[myTaskIdx].auxBeginIdx), smallerOffset); + biggerStart = atomicAdd(&(cuda_tasks[myTaskIdx].auxEndIdx), -biggerOffset) - biggerOffset; + } + __syncthreads(); + + int destSmaller = smallerStart + smallerOffset - smaller; + int destBigger = biggerStart + biggerOffset - bigger; + copyData(arr, 
myBegin, myEnd, aux, destSmaller, destBigger, pivot); + + if(threadIdx.x == 0 && atomicAdd(&(cuda_tasks[myTaskIdx].blockCount), -1) == 1) { - smallerStart = atomicAdd(auxBeginIdx, smallerOffset); - biggerStart = atomicAdd(auxEndIdx, -biggerOffset) - biggerOffset; + writePivot = true; + myTask = cuda_tasks[myTaskIdx]; } __syncthreads(); - copyData(arr, myBegin, myEnd, pivot, aux, smallerStart + smallerOffset - smaller, biggerStart + biggerOffset - bigger); + if(!writePivot) + return; + + for(int i = myTask.auxBeginIdx + threadIdx.x; i < myTask.auxEndIdx; i+= blockDim.x) + aux[i] = pivot; + //only works if aux array is as big as input array if(threadIdx.x == 0) { - if( atomicAdd(blockCount, -1) == 1) + if(myTask.auxBeginIdx - myTask.arrBegin > 1) { - *newPivotIdx = (*auxEndIdx) - 1; - aux[*newPivotIdx] = pivot; + int newTaskIdx = atomicAdd(newTasksCnt, 1); + cuda_newTasks[newTaskIdx] = TASK( + myTask.arrBegin, myTask.auxBeginIdx, + myTask.arrBegin, myTask.auxBeginIdx, + myTask.auxBeginIdx - 1); + } + + if(myTask.arrEnd - myTask.auxEndIdx > 1) + { + int newTaskIdx = atomicAdd(newTasksCnt, 1); + cuda_newTasks[newTaskIdx] = TASK( + myTask.auxEndIdx, myTask.arrEnd, + myTask.auxEndIdx, myTask.arrEnd, + myTask.arrEnd - 1); } } + } //----------------------------------------------------------- @@ -80,8 +123,6 @@ class QUICKSORT const int maxTasks = 1<<20; CudaArrayView arr; - int begin, end; - int size; TNL::Containers::Array aux; TNL::Containers::Array host_tasks; @@ -93,6 +134,7 @@ class QUICKSORT TNL::Containers::Array cuda_blockToTaskMapping; //-------------------------------------------------------------------------------------- +public: int getSetsNeeded() { @@ -106,22 +148,19 @@ class QUICKSORT return TNL::Algorithms::Reduction::reduce(0, tasksAmount, reduction, fetch, 0); } - std::pair calcConfig() + int getBlockSize() { int setsNeeded = getSetsNeeded(); if(setsNeeded <= maxBlocks) - return {setsNeeded, threadsPerBlock}; + return threadsPerBlock; int setsPerBlock = 
setsNeeded / maxBlocks + 1; //+1 to spread out task of the last block - int elemPerBlock = setsPerBlock * threadsPerBlock; - int blocks = size / elemPerBlock + (size % elemPerBlock != 0); - return {blocks, elemPerBlock}; + return setsPerBlock * threadsPerBlock; } - int initTasks(std::pair blocks_elemPerBlock) + int initTasks(int elemPerBlock) { - int elemPerBlock = blocks_elemPerBlock.second; auto host_tasksView = host_tasks.getView(); int blockToTaskMapping_Cnt = 0; @@ -144,25 +183,16 @@ class QUICKSORT TNL::Algorithms::MultiDeviceMemoryOperations:: copy(cuda_blockToTaskMapping.getData(), host_blockToTaskMapping.getData(), host_blockToTaskMapping.getSize()); - if(blockToTaskMapping_Cnt != blocks_elemPerBlock.first) - { - std::cerr << "blockToTaskMapping_Cnt != blocks_elemPerBlock" << std::endl; - class INVALID_CONFIG{}; - throw INVALID_CONFIG(); - } - return blockToTaskMapping_Cnt; } -public: - QUICKSORT(CudaArrayView arr, int begin, int end) - : arr(arr.getView()), begin(begin), end(end), - size(end - begin), aux(size), + QUICKSORT(CudaArrayView arr) + : arr(arr.getView()), aux(arr.getSize()), host_tasks(maxTasks), cuda_tasks(maxTasks), newTasks(maxTasks), host_blockToTaskMapping(maxBlocks), cuda_blockToTaskMapping(maxBlocks) { - int pivotIdx = end - 1; - host_tasks[0] = TASK(begin, end, 0, size, pivotIdx); + int pivotIdx = arr.getSize() - 1; + host_tasks[0] = TASK(0, arr.getSize(), 0, arr.getSize(), pivotIdx); tasksAmount = 1; } @@ -170,8 +200,8 @@ public: { while(tasksAmount > 0) { - std::pair blocks_elemPerBlock = calcConfig(); - int blocksCnt = initTasks(blocks_elemPerBlock); + int elemPerBlock = getBlockSize(); + int blocksCnt = initTasks(elemPerBlock); /* partition(arr, aux.getView(), @@ -179,7 +209,7 @@ public: newTasks.getView()); */ - processTasks(); + //processTasks(); } //2nd phase to finish @@ -191,5 +221,6 @@ public: void quicksort(CudaArrayView arr) { - quicksort(arr, 0, arr.getSize()); + //quicksort(arr, 0, arr.getSize()); + return; } diff --git 
a/GPUSort/quicksort/task.h b/GPUSort/quicksort/task.h index 38f5ed49a..33dcf605a 100644 --- a/GPUSort/quicksort/task.h +++ b/GPUSort/quicksort/task.h @@ -6,7 +6,9 @@ struct TASK int auxBeginIdx, auxEndIdx; //start and end position of still available memory to write into int pivotPos; //input pivot pos and output int firstBlock, blockCount; //shared counter of blocks working together(how many are still working) + + __cuda_callable__ TASK(int srcBegin, int srcEnd, int destBegin, int destEnd, int pivotPos) : arrBegin(srcBegin), arrEnd(srcEnd), auxBeginIdx(destBegin), auxEndIdx(destEnd), -- GitLab From 3f4664edc1ba848fbf6e6d4a87186e15fc51b2e7 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Sun, 7 Mar 2021 15:44:21 +0100 Subject: [PATCH 084/258] bug fixes --- GPUSort/quicksort/quicksort.cuh | 59 +++++++++++++++++++++------------ 1 file changed, 37 insertions(+), 22 deletions(-) diff --git a/GPUSort/quicksort/quicksort.cuh b/GPUSort/quicksort/quicksort.cuh index 54e5bbde1..accfea531 100644 --- a/GPUSort/quicksort/quicksort.cuh +++ b/GPUSort/quicksort/quicksort.cuh @@ -3,7 +3,7 @@ #include #include "reduction.cuh" #include "task.h" -#include +#include #define deb(x) std::cout << #x << " = " << x << std::endl; @@ -59,11 +59,12 @@ void cudaPartition(CudaArrayView arr, CudaArrayView aux, int elemPerBlock, } __syncthreads(); + //only works if consecutive blocks work on the same task const int myBegin = myTask.arrBegin + elemPerBlock * (blockIdx.x - myTask.firstBlock); const int myEnd = TNL::min(myTask.arrEnd, myBegin + elemPerBlock); int smaller = 0, bigger = 0; - cmpElem(arr, myBegin, myEnd, pivot, smaller, bigger); + cmpElem(arr, myBegin, myEnd, smaller, bigger, pivot); int smallerOffset = blockInclusivePrefixSum(smaller); int biggerOffset = blockInclusivePrefixSum(bigger); @@ -117,10 +118,11 @@ void cudaPartition(CudaArrayView arr, CudaArrayView aux, int elemPerBlock, } //----------------------------------------------------------- +const int threadsPerBlock = 512, 
maxBlocks = 1 << 14; //16k +const int maxTasks = 1<<20; + class QUICKSORT { - static const int threadsPerBlock = 512, maxBlocks = 1 << 14; //16k - const int maxTasks = 1<<20; CudaArrayView arr; TNL::Containers::Array aux; @@ -128,6 +130,7 @@ class QUICKSORT TNL::Containers::Array host_tasks; TNL::Containers::Array cuda_tasks; TNL::Containers::Array newTasks; + TNL::Containers::Array cuda_newTasksAmount; int tasksAmount; TNL::Containers::Array host_blockToTaskMapping; @@ -136,9 +139,9 @@ class QUICKSORT //-------------------------------------------------------------------------------------- public: - int getSetsNeeded() + int getSetsNeeded() const { - auto view = host_tasks.getView(); + auto view = host_tasks.getConstView(); auto fetch = [=] __cuda_callable__ (int i) { auto task = view.getElement(i); int size = task.arrEnd - task.arrBegin; @@ -148,7 +151,7 @@ public: return TNL::Algorithms::Reduction::reduce(0, tasksAmount, reduction, fetch, 0); } - int getBlockSize() + int getBlockSize() const { int setsNeeded = getSetsNeeded(); @@ -178,17 +181,31 @@ public: } TNL::Algorithms::MultiDeviceMemoryOperations:: - copy(cuda_tasks.getData(), host_tasks.getData(), host_tasks.getSize()); + copy(cuda_tasks.getData(), host_tasks.getData(), tasksAmount); TNL::Algorithms::MultiDeviceMemoryOperations:: - copy(cuda_blockToTaskMapping.getData(), host_blockToTaskMapping.getData(), host_blockToTaskMapping.getSize()); - + copy(cuda_blockToTaskMapping.getData(), host_blockToTaskMapping.getData(), blockToTaskMapping_Cnt); + cuda_newTasksAmount = 0; return blockToTaskMapping_Cnt; } - QUICKSORT(CudaArrayView arr) - : arr(arr.getView()), aux(arr.getSize()), - host_tasks(maxTasks), cuda_tasks(maxTasks), newTasks(maxTasks), + int processNewTasks() + { + TNL::Algorithms::MultiDeviceMemoryOperations:: + copy(arr.getData(), aux.getData(), aux.getSize()); + + TNL::Algorithms::MultiDeviceMemoryOperations:: + copy(host_tasks.getData(), newTasks.getData(), newTasks.getSize()); + + return 
tasksAmount = cuda_newTasksAmount.getElement(0); + } + + //----------------------------------------------------- + + QUICKSORT(CudaArrayView _arr) + : arr(_arr), aux(arr.getSize()), + host_tasks(maxTasks), cuda_tasks(maxTasks), + newTasks(maxTasks), cuda_newTasksAmount(1), host_blockToTaskMapping(maxBlocks), cuda_blockToTaskMapping(maxBlocks) { int pivotIdx = arr.getSize() - 1; @@ -203,16 +220,13 @@ public: int elemPerBlock = getBlockSize(); int blocksCnt = initTasks(elemPerBlock); - /* - partition(arr, aux.getView(), - cuda_tasks.getView(), cuda_blockToTaskMapping.getView() - newTasks.getView()); - */ + cudaPartition<<>> + (arr, aux.getView(), elemPerBlock, + cuda_tasks.getView(), cuda_blockToTaskMapping.getView(), + newTasks.getView(), cuda_newTasksAmount.getData()); - //processTasks(); + tasksAmount = processNewTasks(); } - - //2nd phase to finish } }; @@ -221,6 +235,7 @@ public: void quicksort(CudaArrayView arr) { - //quicksort(arr, 0, arr.getSize()); + QUICKSORT sorter(arr); + sorter.sort(); return; } -- GitLab From 832b5652dbb2157f30080534c9916a8f2b52dcf9 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Sun, 7 Mar 2021 17:05:31 +0100 Subject: [PATCH 085/258] init in GPU --- GPUSort/quicksort/quicksort.cuh | 172 +++++++++++++++----------------- 1 file changed, 83 insertions(+), 89 deletions(-) diff --git a/GPUSort/quicksort/quicksort.cuh b/GPUSort/quicksort/quicksort.cuh index accfea531..c8b933fdc 100644 --- a/GPUSort/quicksort/quicksort.cuh +++ b/GPUSort/quicksort/quicksort.cuh @@ -11,38 +11,37 @@ using CudaArrayView = TNL::Containers::ArrayView; __device__ void cmpElem(CudaArrayView arr, int myBegin, int myEnd, int &smaller, int &bigger, - int pivot) + volatile int pivot) { for (int i = myBegin + threadIdx.x; i < myEnd; i += blockDim.x) { int data = arr[i]; if (data < pivot) smaller++; - else if(data > pivot) + else if (data > pivot) bigger++; } } __device__ void copyData(CudaArrayView arr, int myBegin, int myEnd, CudaArrayView aux, int smallerStart, 
int biggerStart, - int pivot) + volatile int pivot) { for (int i = myBegin + threadIdx.x; i < myEnd; i += blockDim.x) { int data = arr[i]; if (data < pivot) aux[smallerStart++] = data; - else if(data > pivot) + else if (data > pivot) aux[biggerStart++] = data; } } -__global__ -void cudaPartition(CudaArrayView arr, CudaArrayView aux, int elemPerBlock, - TNL::Containers::ArrayView cuda_tasks, - TNL::Containers::ArrayView cuda_blockToTaskMapping, - TNL::Containers::ArrayView cuda_newTasks, - int * newTasksCnt) +__global__ void cudaPartition(CudaArrayView arr, CudaArrayView aux, int elemPerBlock, + TNL::Containers::ArrayView cuda_tasks, + TNL::Containers::ArrayView cuda_blockToTaskMapping, + TNL::Containers::ArrayView cuda_newTasks, + int *newTasksCnt) { static __shared__ int smallerStart, biggerStart; static __shared__ int pivot; @@ -50,7 +49,7 @@ void cudaPartition(CudaArrayView arr, CudaArrayView aux, int elemPerBlock, static __shared__ TASK myTask; static __shared__ bool writePivot; - if(threadIdx.x == 0) + if (threadIdx.x == 0) { myTaskIdx = cuda_blockToTaskMapping[blockIdx.x]; myTask = cuda_tasks[myTaskIdx]; @@ -80,82 +79,120 @@ void cudaPartition(CudaArrayView arr, CudaArrayView aux, int elemPerBlock, int destBigger = biggerStart + biggerOffset - bigger; copyData(arr, myBegin, myEnd, aux, destSmaller, destBigger, pivot); - if(threadIdx.x == 0 && atomicAdd(&(cuda_tasks[myTaskIdx].blockCount), -1) == 1) + if (threadIdx.x == 0 && atomicAdd(&(cuda_tasks[myTaskIdx].blockCount), -1) == 1) { writePivot = true; myTask = cuda_tasks[myTaskIdx]; } __syncthreads(); - if(!writePivot) + if (!writePivot) return; - - for(int i = myTask.auxBeginIdx + threadIdx.x; i < myTask.auxEndIdx; i+= blockDim.x) + + for (int i = myTask.auxBeginIdx + threadIdx.x; i < myTask.auxEndIdx; i += blockDim.x) aux[i] = pivot; //only works if aux array is as big as input array - if(threadIdx.x == 0) + if (threadIdx.x == 0) { - if(myTask.auxBeginIdx - myTask.arrBegin > 1) + if (myTask.auxBeginIdx - 
myTask.arrBegin > 1) { int newTaskIdx = atomicAdd(newTasksCnt, 1); cuda_newTasks[newTaskIdx] = TASK( - myTask.arrBegin, myTask.auxBeginIdx, - myTask.arrBegin, myTask.auxBeginIdx, - myTask.auxBeginIdx - 1); + myTask.arrBegin, myTask.auxBeginIdx, + myTask.arrBegin, myTask.auxBeginIdx, + myTask.auxBeginIdx - 1); } - if(myTask.arrEnd - myTask.auxEndIdx > 1) + if (myTask.arrEnd - myTask.auxEndIdx > 1) { int newTaskIdx = atomicAdd(newTasksCnt, 1); cuda_newTasks[newTaskIdx] = TASK( - myTask.auxEndIdx, myTask.arrEnd, - myTask.auxEndIdx, myTask.arrEnd, - myTask.arrEnd - 1); + myTask.auxEndIdx, myTask.arrEnd, + myTask.auxEndIdx, myTask.arrEnd, + myTask.arrEnd - 1); } } +} +__global__ void cudaInitTask(TNL::Containers::ArrayView cuda_tasks, int *firstAvailBlock, int elemPerBlock, + TNL::Containers::ArrayView cuda_blockToTaskMapping) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + auto &task = cuda_tasks[i]; + int size = task.arrEnd - task.arrBegin; + int blocksNeeded = size / elemPerBlock + (size % elemPerBlock != 0); + int avail = atomicAdd(firstAvailBlock, blocksNeeded); + task.firstBlock = avail; + task.blockCount = blocksNeeded; + + for (int set = 0; set < blocksNeeded; set++) + cuda_blockToTaskMapping[avail++] = i; } //----------------------------------------------------------- const int threadsPerBlock = 512, maxBlocks = 1 << 14; //16k -const int maxTasks = 1<<20; +const int maxTasks = 1 << 20; class QUICKSORT { - CudaArrayView arr; TNL::Containers::Array aux; - TNL::Containers::Array host_tasks; TNL::Containers::Array cuda_tasks; - TNL::Containers::Array newTasks; - TNL::Containers::Array cuda_newTasksAmount; + TNL::Containers::Array cuda_newTasks; + + TNL::Containers::Array cuda_newTasksAmount; //is in reality 1 integer int tasksAmount; - TNL::Containers::Array host_blockToTaskMapping; TNL::Containers::Array cuda_blockToTaskMapping; + TNL::Containers::Array cuda_blockToTaskMapping_Cnt; //is in reality 1 integer 
//-------------------------------------------------------------------------------------- public: + QUICKSORT(CudaArrayView _arr) + : arr(_arr), aux(arr.getSize()), + cuda_tasks(maxTasks), cuda_newTasks(maxTasks), cuda_newTasksAmount(1), + cuda_blockToTaskMapping(maxBlocks), cuda_blockToTaskMapping_Cnt(1) + { + int pivotIdx = arr.getSize() - 1; + cuda_tasks.setElement(0, TASK(0, arr.getSize(), 0, arr.getSize(), pivotIdx)); + tasksAmount = 1; + } + + + void sort() + { + while (tasksAmount > 0) + { + int elemPerBlock = getBlockSize(); + int blocksCnt = initTasks(elemPerBlock); + + cudaPartition<<>>(arr, aux.getView(), elemPerBlock, + cuda_tasks.getView(), cuda_blockToTaskMapping.getView(), + cuda_newTasks.getView(), cuda_newTasksAmount.getData()); + + tasksAmount = processNewTasks(); + } + } int getSetsNeeded() const { - auto view = host_tasks.getConstView(); + auto view = cuda_tasks.getConstView(); auto fetch = [=] __cuda_callable__ (int i) { - auto task = view.getElement(i); + auto & task = view[i]; int size = task.arrEnd - task.arrBegin; return size / threadsPerBlock + (size % threadsPerBlock != 0); }; auto reduction = [] __cuda_callable__(int a, int b) {return a + b;}; - return TNL::Algorithms::Reduction::reduce(0, tasksAmount, reduction, fetch, 0); + return TNL::Algorithms::Reduction::reduce(0, tasksAmount, reduction, fetch, 0); } - + int getBlockSize() const { int setsNeeded = getSetsNeeded(); - if(setsNeeded <= maxBlocks) + if (setsNeeded <= maxBlocks) return threadsPerBlock; int setsPerBlock = setsNeeded / maxBlocks + 1; //+1 to spread out task of the last block @@ -164,71 +201,28 @@ public: int initTasks(int elemPerBlock) { - auto host_tasksView = host_tasks.getView(); - int blockToTaskMapping_Cnt = 0; - - for(int i = 0; i < tasksAmount; ++i) - { - TASK & task = host_tasks[i]; - int size = task.arrEnd - task.arrBegin; - int blocksNeeded = size / elemPerBlock + (size % elemPerBlock != 0); - - task.firstBlock = blockToTaskMapping_Cnt; - task.blockCount = 
blocksNeeded; - - for(int set = 0; set < blocksNeeded; set++) - host_blockToTaskMapping[blockToTaskMapping_Cnt++] = i; - } - - TNL::Algorithms::MultiDeviceMemoryOperations:: - copy(cuda_tasks.getData(), host_tasks.getData(), tasksAmount); + int threads = min(tasksAmount, 512); + int blocks = tasksAmount / threads + (tasksAmount % threads != 0); + cuda_blockToTaskMapping_Cnt = 0; + cudaInitTask<<>>(cuda_tasks.getView(), cuda_blockToTaskMapping_Cnt.getData(), + elemPerBlock, cuda_blockToTaskMapping.getView()); - TNL::Algorithms::MultiDeviceMemoryOperations:: - copy(cuda_blockToTaskMapping.getData(), host_blockToTaskMapping.getData(), blockToTaskMapping_Cnt); cuda_newTasksAmount = 0; - return blockToTaskMapping_Cnt; + cudaDeviceSynchronize(); + return cuda_blockToTaskMapping_Cnt.getElement(0); } int processNewTasks() { - TNL::Algorithms::MultiDeviceMemoryOperations:: - copy(arr.getData(), aux.getData(), aux.getSize()); + TNL::Algorithms::MultiDeviceMemoryOperations:: + copy(arr.getData(), aux.getData(), aux.getSize()); - TNL::Algorithms::MultiDeviceMemoryOperations:: - copy(host_tasks.getData(), newTasks.getData(), newTasks.getSize()); + TNL::Algorithms::MultiDeviceMemoryOperations:: + copy(cuda_tasks.getData(), cuda_newTasks.getData(), cuda_newTasks.getSize()); return tasksAmount = cuda_newTasksAmount.getElement(0); } - //----------------------------------------------------- - - QUICKSORT(CudaArrayView _arr) - : arr(_arr), aux(arr.getSize()), - host_tasks(maxTasks), cuda_tasks(maxTasks), - newTasks(maxTasks), cuda_newTasksAmount(1), - host_blockToTaskMapping(maxBlocks), cuda_blockToTaskMapping(maxBlocks) - { - int pivotIdx = arr.getSize() - 1; - host_tasks[0] = TASK(0, arr.getSize(), 0, arr.getSize(), pivotIdx); - tasksAmount = 1; - } - - void sort() - { - while(tasksAmount > 0) - { - int elemPerBlock = getBlockSize(); - int blocksCnt = initTasks(elemPerBlock); - - cudaPartition<<>> - (arr, aux.getView(), elemPerBlock, - cuda_tasks.getView(), 
cuda_blockToTaskMapping.getView(), - newTasks.getView(), cuda_newTasksAmount.getData()); - - tasksAmount = processNewTasks(); - } - } - }; //----------------------------------------------------------- -- GitLab From de8dde786b9599fbf5df5a1ca8b8a07c200bb133 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Sun, 7 Mar 2021 17:07:44 +0100 Subject: [PATCH 086/258] copy only necessary amount of memory --- GPUSort/quicksort/quicksort.cuh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/GPUSort/quicksort/quicksort.cuh b/GPUSort/quicksort/quicksort.cuh index c8b933fdc..095d2e92a 100644 --- a/GPUSort/quicksort/quicksort.cuh +++ b/GPUSort/quicksort/quicksort.cuh @@ -217,10 +217,12 @@ public: TNL::Algorithms::MultiDeviceMemoryOperations:: copy(arr.getData(), aux.getData(), aux.getSize()); + tasksAmount = cuda_newTasksAmount.getElement(0); + TNL::Algorithms::MultiDeviceMemoryOperations:: - copy(cuda_tasks.getData(), cuda_newTasks.getData(), cuda_newTasks.getSize()); + copy(cuda_tasks.getData(), cuda_newTasks.getData(), tasksAmount); - return tasksAmount = cuda_newTasksAmount.getElement(0); + return tasksAmount; } }; -- GitLab From 2ac17a9ee541677b7778a1d23218668ca0de94ce Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Sun, 7 Mar 2021 17:14:25 +0100 Subject: [PATCH 087/258] use prefix sum to minimize atomic add operation --- GPUSort/quicksort/quicksort.cuh | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/GPUSort/quicksort/quicksort.cuh b/GPUSort/quicksort/quicksort.cuh index 095d2e92a..bde9103da 100644 --- a/GPUSort/quicksort/quicksort.cuh +++ b/GPUSort/quicksort/quicksort.cuh @@ -118,16 +118,25 @@ __global__ void cudaPartition(CudaArrayView arr, CudaArrayView aux, int elemPerB __global__ void cudaInitTask(TNL::Containers::ArrayView cuda_tasks, int *firstAvailBlock, int elemPerBlock, TNL::Containers::ArrayView cuda_blockToTaskMapping) { + static __shared__ int avail; + int i = blockDim.x * blockIdx.x + 
threadIdx.x; auto &task = cuda_tasks[i]; int size = task.arrEnd - task.arrBegin; int blocksNeeded = size / elemPerBlock + (size % elemPerBlock != 0); - int avail = atomicAdd(firstAvailBlock, blocksNeeded); - task.firstBlock = avail; + + int blocksNeeded_total = blockInclusivePrefixSum(blocksNeeded); + if(threadIdx.x == blockDim.x - 1) + avail = atomicAdd(firstAvailBlock, blocksNeeded_total); + __syncthreads(); + + int myFirstAvailBlock = avail + blocksNeeded_total - blocksNeeded; + + task.firstBlock = myFirstAvailBlock; task.blockCount = blocksNeeded; for (int set = 0; set < blocksNeeded; set++) - cuda_blockToTaskMapping[avail++] = i; + cuda_blockToTaskMapping[myFirstAvailBlock++] = i; } //----------------------------------------------------------- -- GitLab From c41b852d254ac818a13690344937093cc2bbf28c Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Sun, 7 Mar 2021 21:57:33 +0100 Subject: [PATCH 088/258] fix pivot selection --- GPUSort/quicksort/quicksort.cuh | 12 +++++++----- GPUSort/quicksort/task.h | 6 +++--- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/GPUSort/quicksort/quicksort.cuh b/GPUSort/quicksort/quicksort.cuh index bde9103da..0bb790f4b 100644 --- a/GPUSort/quicksort/quicksort.cuh +++ b/GPUSort/quicksort/quicksort.cuh @@ -53,7 +53,7 @@ __global__ void cudaPartition(CudaArrayView arr, CudaArrayView aux, int elemPerB { myTaskIdx = cuda_blockToTaskMapping[blockIdx.x]; myTask = cuda_tasks[myTaskIdx]; - pivot = arr[myTask.pivotPos]; + pivot = myTask.pivot; writePivot = false; } __syncthreads(); @@ -101,7 +101,7 @@ __global__ void cudaPartition(CudaArrayView arr, CudaArrayView aux, int elemPerB cuda_newTasks[newTaskIdx] = TASK( myTask.arrBegin, myTask.auxBeginIdx, myTask.arrBegin, myTask.auxBeginIdx, - myTask.auxBeginIdx - 1); + aux[myTask.auxBeginIdx - 1]); } if (myTask.arrEnd - myTask.auxEndIdx > 1) @@ -110,7 +110,7 @@ __global__ void cudaPartition(CudaArrayView arr, CudaArrayView aux, int elemPerB cuda_newTasks[newTaskIdx] = TASK( 
myTask.auxEndIdx, myTask.arrEnd, myTask.auxEndIdx, myTask.arrEnd, - myTask.arrEnd - 1); + aux[myTask.arrEnd - 1]); } } } @@ -164,8 +164,8 @@ public: cuda_tasks(maxTasks), cuda_newTasks(maxTasks), cuda_newTasksAmount(1), cuda_blockToTaskMapping(maxBlocks), cuda_blockToTaskMapping_Cnt(1) { - int pivotIdx = arr.getSize() - 1; - cuda_tasks.setElement(0, TASK(0, arr.getSize(), 0, arr.getSize(), pivotIdx)); + int pivot = arr.getElement(arr.getSize() - 1); + cuda_tasks.setElement(0, TASK(0, arr.getSize(), 0, arr.getSize(), pivot)); tasksAmount = 1; } @@ -182,6 +182,8 @@ public: cuda_newTasks.getView(), cuda_newTasksAmount.getData()); tasksAmount = processNewTasks(); + cudaDeviceSynchronize(); + } } diff --git a/GPUSort/quicksort/task.h b/GPUSort/quicksort/task.h index 33dcf605a..a3226e1de 100644 --- a/GPUSort/quicksort/task.h +++ b/GPUSort/quicksort/task.h @@ -4,15 +4,15 @@ struct TASK { int arrBegin, arrEnd;//start and end position of array to read from int auxBeginIdx, auxEndIdx; //start and end position of still available memory to write into - int pivotPos; //input pivot pos and output + int pivot; int firstBlock, blockCount; //shared counter of blocks working together(how many are still working) __cuda_callable__ - TASK(int srcBegin, int srcEnd, int destBegin, int destEnd, int pivotPos) + TASK(int srcBegin, int srcEnd, int destBegin, int destEnd, int pivot) : arrBegin(srcBegin), arrEnd(srcEnd), auxBeginIdx(destBegin), auxEndIdx(destEnd), - pivotPos(pivotPos), + pivot(pivot), firstBlock(-1), blockCount(-1) {} TASK() = default; -- GitLab From b9d1a8368e151a5d6b03874396a8f9eaf7ea3bee Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Sun, 7 Mar 2021 22:15:42 +0100 Subject: [PATCH 089/258] task indexing fix --- GPUSort/quicksort/quicksort.cuh | 34 ++++++++++++++++++++++----------- 1 file changed, 23 insertions(+), 11 deletions(-) diff --git a/GPUSort/quicksort/quicksort.cuh b/GPUSort/quicksort/quicksort.cuh index 0bb790f4b..ec7e3cce7 100644 --- 
a/GPUSort/quicksort/quicksort.cuh +++ b/GPUSort/quicksort/quicksort.cuh @@ -115,28 +115,37 @@ __global__ void cudaPartition(CudaArrayView arr, CudaArrayView aux, int elemPerB } } -__global__ void cudaInitTask(TNL::Containers::ArrayView cuda_tasks, int *firstAvailBlock, int elemPerBlock, +__global__ void cudaInitTask(TNL::Containers::ArrayView cuda_tasks, + int taskAmount, int elemPerBlock, int *firstAvailBlock, TNL::Containers::ArrayView cuda_blockToTaskMapping) { static __shared__ int avail; int i = blockDim.x * blockIdx.x + threadIdx.x; - auto &task = cuda_tasks[i]; - int size = task.arrEnd - task.arrBegin; - int blocksNeeded = size / elemPerBlock + (size % elemPerBlock != 0); + int blocksNeeded = 0; + + if(i < taskAmount) + { + auto task = cuda_tasks[i]; + int size = task.arrEnd - task.arrBegin; + blocksNeeded = size / elemPerBlock + (size % elemPerBlock != 0); + } int blocksNeeded_total = blockInclusivePrefixSum(blocksNeeded); if(threadIdx.x == blockDim.x - 1) avail = atomicAdd(firstAvailBlock, blocksNeeded_total); __syncthreads(); - int myFirstAvailBlock = avail + blocksNeeded_total - blocksNeeded; + if(i < taskAmount) + { + int myFirstAvailBlock = avail + blocksNeeded_total - blocksNeeded; - task.firstBlock = myFirstAvailBlock; - task.blockCount = blocksNeeded; + cuda_tasks[i].firstBlock = myFirstAvailBlock; + cuda_tasks[i].blockCount = blocksNeeded; - for (int set = 0; set < blocksNeeded; set++) - cuda_blockToTaskMapping[myFirstAvailBlock++] = i; + for (int set = 0; set < blocksNeeded; set++) + cuda_blockToTaskMapping[myFirstAvailBlock++] = i; + } } //----------------------------------------------------------- @@ -215,8 +224,11 @@ public: int threads = min(tasksAmount, 512); int blocks = tasksAmount / threads + (tasksAmount % threads != 0); cuda_blockToTaskMapping_Cnt = 0; - cudaInitTask<<>>(cuda_tasks.getView(), cuda_blockToTaskMapping_Cnt.getData(), - elemPerBlock, cuda_blockToTaskMapping.getView()); + + cudaInitTask<<>>( + cuda_tasks.getView(), 
tasksAmount, elemPerBlock, + cuda_blockToTaskMapping_Cnt.getData(), + cuda_blockToTaskMapping.getView()); cuda_newTasksAmount = 0; cudaDeviceSynchronize(); -- GitLab From cc07c695020eb25c4c01a1886f513638bd29917b Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Sun, 7 Mar 2021 22:21:07 +0100 Subject: [PATCH 090/258] less cudaDeviceSync calls --- GPUSort/quicksort/quicksort.cuh | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/GPUSort/quicksort/quicksort.cuh b/GPUSort/quicksort/quicksort.cuh index ec7e3cce7..50a49aca6 100644 --- a/GPUSort/quicksort/quicksort.cuh +++ b/GPUSort/quicksort/quicksort.cuh @@ -191,9 +191,8 @@ public: cuda_newTasks.getView(), cuda_newTasksAmount.getData()); tasksAmount = processNewTasks(); - cudaDeviceSynchronize(); - } + cudaDeviceSynchronize(); } int getSetsNeeded() const @@ -231,7 +230,6 @@ public: cuda_blockToTaskMapping.getView()); cuda_newTasksAmount = 0; - cudaDeviceSynchronize(); return cuda_blockToTaskMapping_Cnt.getElement(0); } -- GitLab From 15b8fe70960355022f42b86370a270ec014721f1 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Sun, 7 Mar 2021 22:56:21 +0100 Subject: [PATCH 091/258] rotate ptr instead of copying data --- GPUSort/quicksort/quicksort.cuh | 59 +++++++++++++++++++++++++-------- 1 file changed, 46 insertions(+), 13 deletions(-) diff --git a/GPUSort/quicksort/quicksort.cuh b/GPUSort/quicksort/quicksort.cuh index 50a49aca6..972379b3b 100644 --- a/GPUSort/quicksort/quicksort.cuh +++ b/GPUSort/quicksort/quicksort.cuh @@ -90,12 +90,15 @@ __global__ void cudaPartition(CudaArrayView arr, CudaArrayView aux, int elemPerB return; for (int i = myTask.auxBeginIdx + threadIdx.x; i < myTask.auxEndIdx; i += blockDim.x) + { + arr[i] = pivot; aux[i] = pivot; + } //only works if aux array is as big as input array if (threadIdx.x == 0) { - if (myTask.auxBeginIdx - myTask.arrBegin > 1) + if (myTask.auxBeginIdx - myTask.arrBegin > 0) { int newTaskIdx = atomicAdd(newTasksCnt, 1); cuda_newTasks[newTaskIdx] = 
TASK( @@ -104,7 +107,7 @@ __global__ void cudaPartition(CudaArrayView arr, CudaArrayView aux, int elemPerB aux[myTask.auxBeginIdx - 1]); } - if (myTask.arrEnd - myTask.auxEndIdx > 1) + if (myTask.arrEnd - myTask.auxEndIdx > 0) { int newTaskIdx = atomicAdd(newTasksCnt, 1); cuda_newTasks[newTaskIdx] = TASK( @@ -165,7 +168,7 @@ class QUICKSORT TNL::Containers::Array cuda_blockToTaskMapping; TNL::Containers::Array cuda_blockToTaskMapping_Cnt; //is in reality 1 integer - + int iteration = 0; //-------------------------------------------------------------------------------------- public: QUICKSORT(CudaArrayView _arr) @@ -186,18 +189,36 @@ public: int elemPerBlock = getBlockSize(); int blocksCnt = initTasks(elemPerBlock); - cudaPartition<<>>(arr, aux.getView(), elemPerBlock, - cuda_tasks.getView(), cuda_blockToTaskMapping.getView(), - cuda_newTasks.getView(), cuda_newTasksAmount.getData()); + if(iteration%2 == 0) + { + cudaPartition<<>>(arr, aux.getView(), elemPerBlock, + cuda_tasks.getView(), cuda_blockToTaskMapping.getView(), + cuda_newTasks.getView(), cuda_newTasksAmount.getData()); + } + else + { + cudaPartition<<>>(aux.getView(), arr, elemPerBlock, + cuda_newTasks.getView(), cuda_blockToTaskMapping.getView(), + cuda_tasks.getView(), cuda_newTasksAmount.getData()); + } tasksAmount = processNewTasks(); + + iteration++; + } + + if(iteration%2) + { + TNL::Algorithms::MultiDeviceMemoryOperations:: + copy(arr.getData(), aux.getData(), aux.getSize()); } + cudaDeviceSynchronize(); } int getSetsNeeded() const { - auto view = cuda_tasks.getConstView(); + auto view = iteration%2 == 0? 
cuda_tasks.getConstView() : cuda_newTasks.getConstView(); auto fetch = [=] __cuda_callable__ (int i) { auto & task = view[i]; int size = task.arrEnd - task.arrBegin; @@ -224,10 +245,20 @@ public: int blocks = tasksAmount / threads + (tasksAmount % threads != 0); cuda_blockToTaskMapping_Cnt = 0; - cudaInitTask<<>>( - cuda_tasks.getView(), tasksAmount, elemPerBlock, - cuda_blockToTaskMapping_Cnt.getData(), - cuda_blockToTaskMapping.getView()); + if(iteration%2 == 0) + { + cudaInitTask<<>>( + cuda_tasks.getView(), tasksAmount, elemPerBlock, + cuda_blockToTaskMapping_Cnt.getData(), + cuda_blockToTaskMapping.getView()); + } + else + { + cudaInitTask<<>>( + cuda_newTasks.getView(), tasksAmount, elemPerBlock, + cuda_blockToTaskMapping_Cnt.getData(), + cuda_blockToTaskMapping.getView()); + } cuda_newTasksAmount = 0; return cuda_blockToTaskMapping_Cnt.getElement(0); @@ -235,14 +266,16 @@ public: int processNewTasks() { + /* TNL::Algorithms::MultiDeviceMemoryOperations:: copy(arr.getData(), aux.getData(), aux.getSize()); - + */ tasksAmount = cuda_newTasksAmount.getElement(0); + /* TNL::Algorithms::MultiDeviceMemoryOperations:: copy(cuda_tasks.getData(), cuda_newTasks.getData(), tasksAmount); - + */ return tasksAmount; } -- GitLab From 25a311078f42e059717c097445eac1cd44e923c9 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Tue, 9 Mar 2021 00:24:14 +0100 Subject: [PATCH 092/258] write into almostDone if partioned subarray is too small --- GPUSort/quicksort/quicksort.cuh | 118 +++++++++++++++++++------------- 1 file changed, 70 insertions(+), 48 deletions(-) diff --git a/GPUSort/quicksort/quicksort.cuh b/GPUSort/quicksort/quicksort.cuh index 972379b3b..8a5a186d2 100644 --- a/GPUSort/quicksort/quicksort.cuh +++ b/GPUSort/quicksort/quicksort.cuh @@ -8,6 +8,7 @@ #define deb(x) std::cout << #x << " = " << x << std::endl; using CudaArrayView = TNL::Containers::ArrayView; +using CudaTaskArray = TNL::Containers::Array; __device__ void cmpElem(CudaArrayView arr, int myBegin, int 
myEnd, int &smaller, int &bigger, @@ -38,10 +39,12 @@ __device__ void copyData(CudaArrayView arr, int myBegin, int myEnd, } __global__ void cudaPartition(CudaArrayView arr, CudaArrayView aux, int elemPerBlock, - TNL::Containers::ArrayView cuda_tasks, TNL::Containers::ArrayView cuda_blockToTaskMapping, + TNL::Containers::ArrayView cuda_tasks, TNL::Containers::ArrayView cuda_newTasks, - int *newTasksCnt) + int *newTasksCnt, + TNL::Containers::ArrayView cuda_almostDoneTasks, + int *almostDoneTasksCnt, int minElemPerBlock) { static __shared__ int smallerStart, biggerStart; static __shared__ int pivot; @@ -98,7 +101,7 @@ __global__ void cudaPartition(CudaArrayView arr, CudaArrayView aux, int elemPerB //only works if aux array is as big as input array if (threadIdx.x == 0) { - if (myTask.auxBeginIdx - myTask.arrBegin > 0) + if (myTask.auxBeginIdx - myTask.arrBegin > minElemPerBlock) { int newTaskIdx = atomicAdd(newTasksCnt, 1); cuda_newTasks[newTaskIdx] = TASK( @@ -106,8 +109,16 @@ __global__ void cudaPartition(CudaArrayView arr, CudaArrayView aux, int elemPerB myTask.arrBegin, myTask.auxBeginIdx, aux[myTask.auxBeginIdx - 1]); } + else if (myTask.auxBeginIdx - myTask.arrBegin > 1) + { + int newTaskIdx = atomicAdd(almostDoneTasksCnt, 1); + cuda_almostDoneTasks[newTaskIdx] = TASK( + myTask.arrBegin, myTask.auxBeginIdx, + myTask.arrBegin, myTask.auxBeginIdx, + aux[myTask.auxBeginIdx - 1]); + } - if (myTask.arrEnd - myTask.auxEndIdx > 0) + if (myTask.arrEnd - myTask.auxEndIdx > minElemPerBlock) { int newTaskIdx = atomicAdd(newTasksCnt, 1); cuda_newTasks[newTaskIdx] = TASK( @@ -115,19 +126,27 @@ __global__ void cudaPartition(CudaArrayView arr, CudaArrayView aux, int elemPerB myTask.auxEndIdx, myTask.arrEnd, aux[myTask.arrEnd - 1]); } + else if (myTask.arrEnd - myTask.auxEndIdx > 1) + { + int newTaskIdx = atomicAdd(almostDoneTasksCnt, 1); + cuda_almostDoneTasks[newTaskIdx] = TASK( + myTask.auxEndIdx, myTask.arrEnd, + myTask.auxEndIdx, myTask.arrEnd, + aux[myTask.arrEnd - 1]); 
+ } } } __global__ void cudaInitTask(TNL::Containers::ArrayView cuda_tasks, - int taskAmount, int elemPerBlock, int *firstAvailBlock, - TNL::Containers::ArrayView cuda_blockToTaskMapping) + int taskAmount, int elemPerBlock, int *firstAvailBlock, + TNL::Containers::ArrayView cuda_blockToTaskMapping) { static __shared__ int avail; int i = blockDim.x * blockIdx.x + threadIdx.x; int blocksNeeded = 0; - if(i < taskAmount) + if (i < taskAmount) { auto task = cuda_tasks[i]; int size = task.arrEnd - task.arrBegin; @@ -135,11 +154,11 @@ __global__ void cudaInitTask(TNL::Containers::ArrayView aux; - TNL::Containers::Array cuda_tasks; - TNL::Containers::Array cuda_newTasks; + CudaTaskArray cuda_tasks, cuda_newTasks, cuda_AlmostDoneTasks; - TNL::Containers::Array cuda_newTasksAmount; //is in reality 1 integer - int tasksAmount; + //is in reality 2 integer, [0] is newlyCreatedTasks, [1] are almostDoneTasks + TNL::Containers::Array cuda_newTasksAmount; + int tasksAmount; //counter for Host TNL::Containers::Array cuda_blockToTaskMapping; TNL::Containers::Array cuda_blockToTaskMapping_Cnt; //is in reality 1 integer + int iteration = 0; //-------------------------------------------------------------------------------------- public: QUICKSORT(CudaArrayView _arr) : arr(_arr), aux(arr.getSize()), - cuda_tasks(maxTasks), cuda_newTasks(maxTasks), cuda_newTasksAmount(1), + cuda_tasks(maxTasks), cuda_newTasks(maxTasks), cuda_AlmostDoneTasks(maxTasks), //todo: check size + cuda_newTasksAmount(2), cuda_blockToTaskMapping(maxBlocks), cuda_blockToTaskMapping_Cnt(1) { int pivot = arr.getElement(arr.getSize() - 1); @@ -181,36 +203,46 @@ public: tasksAmount = 1; } - void sort() { - while (tasksAmount > 0) + while (tasksAmount > 0 && tasksAmount <= maxTasks) { int elemPerBlock = getBlockSize(); int blocksCnt = initTasks(elemPerBlock); - if(iteration%2 == 0) + if (iteration % 2 == 0) { - cudaPartition<<>>(arr, aux.getView(), elemPerBlock, - cuda_tasks.getView(), cuda_blockToTaskMapping.getView(), 
- cuda_newTasks.getView(), cuda_newTasksAmount.getData()); + cudaPartition<<>>( + arr, aux.getView(), + elemPerBlock, + cuda_blockToTaskMapping.getView(), + cuda_tasks.getView(), cuda_newTasks.getView(), + cuda_newTasksAmount.getData(), + cuda_AlmostDoneTasks.getView(), cuda_newTasksAmount.getData() + 1, maxBitonicArrSize); } else { - cudaPartition<<>>(aux.getView(), arr, elemPerBlock, - cuda_newTasks.getView(), cuda_blockToTaskMapping.getView(), - cuda_tasks.getView(), cuda_newTasksAmount.getData()); + cudaPartition<<>>( + aux.getView(), arr, //swapped order to write back and forth without copying + elemPerBlock, + cuda_blockToTaskMapping.getView(), + cuda_newTasks.getView(), cuda_tasks.getView(), //swapped order to write back and forth without copying + cuda_newTasksAmount.getData(), + cuda_AlmostDoneTasks.getView(), cuda_newTasksAmount.getData() + 1, maxBitonicArrSize); } - + tasksAmount = processNewTasks(); iteration++; } - if(iteration%2) + //insert phase 2 sort for almostDoneTasks + + //todo: is this needed after 2nd phase? + if (iteration % 2) { TNL::Algorithms::MultiDeviceMemoryOperations:: - copy(arr.getData(), aux.getData(), aux.getSize()); + copy(arr.getData(), aux.getData(), aux.getSize()); } cudaDeviceSynchronize(); @@ -218,13 +250,13 @@ public: int getSetsNeeded() const { - auto view = iteration%2 == 0? cuda_tasks.getConstView() : cuda_newTasks.getConstView(); - auto fetch = [=] __cuda_callable__ (int i) { - auto & task = view[i]; + auto view = iteration % 2 == 0 ? 
cuda_tasks.getConstView() : cuda_newTasks.getConstView(); + auto fetch = [=] __cuda_callable__(int i) { + auto &task = view[i]; int size = task.arrEnd - task.arrBegin; - return size / threadsPerBlock + (size % threadsPerBlock != 0); + return size / threadsPerBlock + (size % threadsPerBlock != 0); }; - auto reduction = [] __cuda_callable__(int a, int b) {return a + b;}; + auto reduction = [] __cuda_callable__(int a, int b) { return a + b; }; return TNL::Algorithms::Reduction::reduce(0, tasksAmount, reduction, fetch, 0); } @@ -245,40 +277,30 @@ public: int blocks = tasksAmount / threads + (tasksAmount % threads != 0); cuda_blockToTaskMapping_Cnt = 0; - if(iteration%2 == 0) + if (iteration % 2 == 0) { - cudaInitTask<<>>( + cudaInitTask<<>>( cuda_tasks.getView(), tasksAmount, elemPerBlock, - cuda_blockToTaskMapping_Cnt.getData(), + cuda_blockToTaskMapping_Cnt.getData(), cuda_blockToTaskMapping.getView()); } else { - cudaInitTask<<>>( + cudaInitTask<<>>( cuda_newTasks.getView(), tasksAmount, elemPerBlock, - cuda_blockToTaskMapping_Cnt.getData(), + cuda_blockToTaskMapping_Cnt.getData(), cuda_blockToTaskMapping.getView()); } - cuda_newTasksAmount = 0; + cuda_newTasksAmount.setElement(0, 0); return cuda_blockToTaskMapping_Cnt.getElement(0); } int processNewTasks() { - /* - TNL::Algorithms::MultiDeviceMemoryOperations:: - copy(arr.getData(), aux.getData(), aux.getSize()); - */ tasksAmount = cuda_newTasksAmount.getElement(0); - - /* - TNL::Algorithms::MultiDeviceMemoryOperations:: - copy(cuda_tasks.getData(), cuda_newTasks.getData(), tasksAmount); - */ return tasksAmount; } - }; //----------------------------------------------------------- -- GitLab From bb1c0a929aaf002a6d18d4d74928cc7aa03edc5c Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Tue, 9 Mar 2021 01:24:37 +0100 Subject: [PATCH 093/258] recalculate elem per block --- GPUSort/quicksort/quicksort.cuh | 97 ++++++++++++++------------------- 1 file changed, 41 insertions(+), 56 deletions(-) diff --git 
a/GPUSort/quicksort/quicksort.cuh b/GPUSort/quicksort/quicksort.cuh index 8a5a186d2..80c950ece 100644 --- a/GPUSort/quicksort/quicksort.cuh +++ b/GPUSort/quicksort/quicksort.cuh @@ -42,21 +42,19 @@ __global__ void cudaPartition(CudaArrayView arr, CudaArrayView aux, int elemPerB TNL::Containers::ArrayView cuda_blockToTaskMapping, TNL::Containers::ArrayView cuda_tasks, TNL::Containers::ArrayView cuda_newTasks, - int *newTasksCnt, - TNL::Containers::ArrayView cuda_almostDoneTasks, - int *almostDoneTasksCnt, int minElemPerBlock) + int *newTasksCnt) { + static __shared__ TASK myTask; static __shared__ int smallerStart, biggerStart; static __shared__ int pivot; static __shared__ int myTaskIdx; - static __shared__ TASK myTask; static __shared__ bool writePivot; if (threadIdx.x == 0) { myTaskIdx = cuda_blockToTaskMapping[blockIdx.x]; myTask = cuda_tasks[myTaskIdx]; - pivot = myTask.pivot; + pivot = arr[myTask.arrEnd - 1]; writePivot = false; } __syncthreads(); @@ -65,6 +63,8 @@ __global__ void cudaPartition(CudaArrayView arr, CudaArrayView aux, int elemPerB const int myBegin = myTask.arrBegin + elemPerBlock * (blockIdx.x - myTask.firstBlock); const int myEnd = TNL::min(myTask.arrEnd, myBegin + elemPerBlock); + //------------------------------------------------------------------------- + int smaller = 0, bigger = 0; cmpElem(arr, myBegin, myEnd, smaller, bigger, pivot); @@ -82,6 +82,8 @@ __global__ void cudaPartition(CudaArrayView arr, CudaArrayView aux, int elemPerB int destBigger = biggerStart + biggerOffset - bigger; copyData(arr, myBegin, myEnd, aux, destSmaller, destBigger, pivot); + //----------------------------------------------------------- + if (threadIdx.x == 0 && atomicAdd(&(cuda_tasks[myTaskIdx].blockCount), -1) == 1) { writePivot = true; @@ -99,41 +101,25 @@ __global__ void cudaPartition(CudaArrayView arr, CudaArrayView aux, int elemPerB } //only works if aux array is as big as input array - if (threadIdx.x == 0) + if (threadIdx.x != 0) + return; + + if 
(myTask.auxBeginIdx - myTask.arrBegin > 0) //smaller { - if (myTask.auxBeginIdx - myTask.arrBegin > minElemPerBlock) - { - int newTaskIdx = atomicAdd(newTasksCnt, 1); - cuda_newTasks[newTaskIdx] = TASK( - myTask.arrBegin, myTask.auxBeginIdx, - myTask.arrBegin, myTask.auxBeginIdx, - aux[myTask.auxBeginIdx - 1]); - } - else if (myTask.auxBeginIdx - myTask.arrBegin > 1) - { - int newTaskIdx = atomicAdd(almostDoneTasksCnt, 1); - cuda_almostDoneTasks[newTaskIdx] = TASK( - myTask.arrBegin, myTask.auxBeginIdx, - myTask.arrBegin, myTask.auxBeginIdx, - aux[myTask.auxBeginIdx - 1]); - } + int newTaskIdx = atomicAdd(newTasksCnt, 1); + cuda_newTasks[newTaskIdx] = TASK( + myTask.arrBegin, myTask.auxBeginIdx, + myTask.arrBegin, myTask.auxBeginIdx + ); + } - if (myTask.arrEnd - myTask.auxEndIdx > minElemPerBlock) - { - int newTaskIdx = atomicAdd(newTasksCnt, 1); - cuda_newTasks[newTaskIdx] = TASK( - myTask.auxEndIdx, myTask.arrEnd, - myTask.auxEndIdx, myTask.arrEnd, - aux[myTask.arrEnd - 1]); - } - else if (myTask.arrEnd - myTask.auxEndIdx > 1) - { - int newTaskIdx = atomicAdd(almostDoneTasksCnt, 1); - cuda_almostDoneTasks[newTaskIdx] = TASK( - myTask.auxEndIdx, myTask.arrEnd, - myTask.auxEndIdx, myTask.arrEnd, - aux[myTask.arrEnd - 1]); - } + if (myTask.arrEnd - myTask.auxEndIdx > 0) //greater + { + int newTaskIdx = atomicAdd(newTasksCnt, 1); + cuda_newTasks[newTaskIdx] = TASK( + myTask.auxEndIdx, myTask.arrEnd, + myTask.auxEndIdx, myTask.arrEnd + ); } } @@ -170,9 +156,11 @@ __global__ void cudaInitTask(TNL::Containers::ArrayView aux; - CudaTaskArray cuda_tasks, cuda_newTasks, cuda_AlmostDoneTasks; + CudaTaskArray cuda_tasks, cuda_newTasks; - //is in reality 2 integer, [0] is newlyCreatedTasks, [1] are almostDoneTasks - TNL::Containers::Array cuda_newTasksAmount; + TNL::Containers::Array cuda_newTasksAmount; //is in reality 1 integer int tasksAmount; //counter for Host TNL::Containers::Array cuda_blockToTaskMapping; @@ -194,20 +181,18 @@ class QUICKSORT public: 
QUICKSORT(CudaArrayView _arr) : arr(_arr), aux(arr.getSize()), - cuda_tasks(maxTasks), cuda_newTasks(maxTasks), cuda_AlmostDoneTasks(maxTasks), //todo: check size - cuda_newTasksAmount(2), + cuda_tasks(maxTasks), cuda_newTasks(maxTasks), cuda_newTasksAmount(1), cuda_blockToTaskMapping(maxBlocks), cuda_blockToTaskMapping_Cnt(1) { - int pivot = arr.getElement(arr.getSize() - 1); - cuda_tasks.setElement(0, TASK(0, arr.getSize(), 0, arr.getSize(), pivot)); + cuda_tasks.setElement(0, TASK(0, arr.getSize(), 0, arr.getSize())); tasksAmount = 1; } void sort() { - while (tasksAmount > 0 && tasksAmount <= maxTasks) + while (tasksAmount > 0) { - int elemPerBlock = getBlockSize(); + int elemPerBlock = getElemPerBlock(); int blocksCnt = initTasks(elemPerBlock); if (iteration % 2 == 0) @@ -217,8 +202,8 @@ public: elemPerBlock, cuda_blockToTaskMapping.getView(), cuda_tasks.getView(), cuda_newTasks.getView(), - cuda_newTasksAmount.getData(), - cuda_AlmostDoneTasks.getView(), cuda_newTasksAmount.getData() + 1, maxBitonicArrSize); + cuda_newTasksAmount.getData() + ); } else { @@ -227,8 +212,8 @@ public: elemPerBlock, cuda_blockToTaskMapping.getView(), cuda_newTasks.getView(), cuda_tasks.getView(), //swapped order to write back and forth without copying - cuda_newTasksAmount.getData(), - cuda_AlmostDoneTasks.getView(), cuda_newTasksAmount.getData() + 1, maxBitonicArrSize); + cuda_newTasksAmount.getData() + ); } tasksAmount = processNewTasks(); @@ -254,26 +239,26 @@ public: auto fetch = [=] __cuda_callable__(int i) { auto &task = view[i]; int size = task.arrEnd - task.arrBegin; - return size / threadsPerBlock + (size % threadsPerBlock != 0); + return size / minElemPerBlock + (size % minElemPerBlock != 0); }; auto reduction = [] __cuda_callable__(int a, int b) { return a + b; }; return TNL::Algorithms::Reduction::reduce(0, tasksAmount, reduction, fetch, 0); } - int getBlockSize() const + int getElemPerBlock() const { int setsNeeded = getSetsNeeded(); if (setsNeeded <= maxBlocks) - 
return threadsPerBlock; + return minElemPerBlock; int setsPerBlock = setsNeeded / maxBlocks + 1; //+1 to spread out task of the last block - return setsPerBlock * threadsPerBlock; + return setsPerBlock * minElemPerBlock; } int initTasks(int elemPerBlock) { - int threads = min(tasksAmount, 512); + int threads = min(tasksAmount, threadsPerBlock); int blocks = tasksAmount / threads + (tasksAmount % threads != 0); cuda_blockToTaskMapping_Cnt = 0; -- GitLab From 5b0b58e1e357866c4920226aaae5304911ce808e Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Tue, 9 Mar 2021 01:47:36 +0100 Subject: [PATCH 094/258] clean up --- GPUSort/quicksort/quicksort.cuh | 36 +++++++++++++++++---------------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/GPUSort/quicksort/quicksort.cuh b/GPUSort/quicksort/quicksort.cuh index 80c950ece..d1dc169fd 100644 --- a/GPUSort/quicksort/quicksort.cuh +++ b/GPUSort/quicksort/quicksort.cuh @@ -62,6 +62,17 @@ __global__ void cudaPartition(CudaArrayView arr, CudaArrayView aux, int elemPerB //only works if consecutive blocks work on the same task const int myBegin = myTask.arrBegin + elemPerBlock * (blockIdx.x - myTask.firstBlock); const int myEnd = TNL::min(myTask.arrEnd, myBegin + elemPerBlock); + const int size = myEnd - myBegin; + + //------------------------------------------------------------------------- + + /* + if(size <= blockDim.x*2 && myTask.blockCount == 1) + { + bitonicSort(arr, myTask.arrBegin, myTask.arrEnd, bitonicSortAuxMemory); + return; + } + */ //------------------------------------------------------------------------- @@ -87,7 +98,7 @@ __global__ void cudaPartition(CudaArrayView arr, CudaArrayView aux, int elemPerB if (threadIdx.x == 0 && atomicAdd(&(cuda_tasks[myTaskIdx].blockCount), -1) == 1) { writePivot = true; - myTask = cuda_tasks[myTaskIdx]; + myTask = cuda_tasks[myTaskIdx]; //update auxBeginIdx, auxEndIdx value } __syncthreads(); @@ -95,16 +106,12 @@ __global__ void cudaPartition(CudaArrayView arr, 
CudaArrayView aux, int elemPerB return; for (int i = myTask.auxBeginIdx + threadIdx.x; i < myTask.auxEndIdx; i += blockDim.x) - { - arr[i] = pivot; aux[i] = pivot; - } - //only works if aux array is as big as input array if (threadIdx.x != 0) return; - if (myTask.auxBeginIdx - myTask.arrBegin > 0) //smaller + if (myTask.auxBeginIdx - myTask.arrBegin > 1) //smaller { int newTaskIdx = atomicAdd(newTasksCnt, 1); cuda_newTasks[newTaskIdx] = TASK( @@ -113,7 +120,7 @@ __global__ void cudaPartition(CudaArrayView arr, CudaArrayView aux, int elemPerB ); } - if (myTask.arrEnd - myTask.auxEndIdx > 0) //greater + if (myTask.arrEnd - myTask.auxEndIdx > 1) //greater { int newTaskIdx = atomicAdd(newTasksCnt, 1); cuda_newTasks[newTaskIdx] = TASK( @@ -159,9 +166,8 @@ __global__ void cudaInitTask(TNL::Containers::ArrayView>>( - aux.getView(), arr, //swapped order to write back and forth without copying + arr, aux.getView(), elemPerBlock, cuda_blockToTaskMapping.getView(), cuda_newTasks.getView(), cuda_tasks.getView(), //swapped order to write back and forth without copying @@ -223,13 +229,6 @@ public: //insert phase 2 sort for almostDoneTasks - //todo: is this needed after 2nd phase? 
- if (iteration % 2) - { - TNL::Algorithms::MultiDeviceMemoryOperations:: - copy(arr.getData(), aux.getData(), aux.getSize()); - } - cudaDeviceSynchronize(); } @@ -283,6 +282,9 @@ public: int processNewTasks() { + TNL::Algorithms::MultiDeviceMemoryOperations:: + copy(arr.getData(), aux.getData(), aux.getSize()); + tasksAmount = cuda_newTasksAmount.getElement(0); return tasksAmount; } -- GitLab From c1b733c85e221f1f3479a0991e507eee462626f1 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Tue, 9 Mar 2021 01:53:37 +0100 Subject: [PATCH 095/258] refactor to allow call from kernel --- GPUSort/bitonicGPU/bitonicSort.h | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/GPUSort/bitonicGPU/bitonicSort.h b/GPUSort/bitonicGPU/bitonicSort.h index e8f4f7aba..4e7e02369 100644 --- a/GPUSort/bitonicGPU/bitonicSort.h +++ b/GPUSort/bitonicGPU/bitonicSort.h @@ -119,19 +119,12 @@ __global__ void bitonicMergeSharedMemory(TNL::Containers::ArrayView -__global__ void bitoniSort1stStepSharedMemory(TNL::Containers::ArrayView arr, - int begin, int end, const Function & Cmp) +__device__ void bitoniSort1stStepSharedMemory_device( + TNL::Containers::ArrayView arr, + int begin, int end, Value* sharedMem, const Function & Cmp) { - extern __shared__ int externMem[]; - - Value * sharedMem = (Value *)externMem; int sharedMemLen = 2*blockDim.x; int myBlockStart = begin + blockIdx.x * sharedMemLen; @@ -187,6 +180,20 @@ __global__ void bitoniSort1stStepSharedMemory(TNL::Containers::ArrayView +__global__ void bitoniSort1stStepSharedMemory(TNL::Containers::ArrayView arr, + int begin, int end, const Function & Cmp) +{ + extern __shared__ int externMem[]; + + bitoniSort1stStepSharedMemory_device(arr, begin, end, (Value*) externMem, Cmp); +} //--------------------------------------------- -- GitLab From c3578108bc793e157653e6afe1ffd553e21d2d7c Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Tue, 9 Mar 2021 02:11:15 +0100 Subject: [PATCH 096/258] 
use bitonic sort for small sequences --- GPUSort/quicksort/quicksort.cuh | 45 +++++++++++++++++++++++---------- GPUSort/quicksort/task.h | 4 +-- 2 files changed, 32 insertions(+), 17 deletions(-) diff --git a/GPUSort/quicksort/quicksort.cuh b/GPUSort/quicksort/quicksort.cuh index d1dc169fd..1dd9be285 100644 --- a/GPUSort/quicksort/quicksort.cuh +++ b/GPUSort/quicksort/quicksort.cuh @@ -3,6 +3,7 @@ #include #include "reduction.cuh" #include "task.h" +#include "../bitonicGPU/bitonicSort.h" #include #define deb(x) std::cout << #x << " = " << x << std::endl; @@ -38,8 +39,11 @@ __device__ void copyData(CudaArrayView arr, int myBegin, int myEnd, } } -__global__ void cudaPartition(CudaArrayView arr, CudaArrayView aux, int elemPerBlock, +template +__global__ void cudaPartition(CudaArrayView arr,const Function & Cmp, + CudaArrayView aux, TNL::Containers::ArrayView cuda_blockToTaskMapping, + int elemPerBlock, TNL::Containers::ArrayView cuda_tasks, TNL::Containers::ArrayView cuda_newTasks, int *newTasksCnt) @@ -49,6 +53,7 @@ __global__ void cudaPartition(CudaArrayView arr, CudaArrayView aux, int elemPerB static __shared__ int pivot; static __shared__ int myTaskIdx; static __shared__ bool writePivot; + extern __shared__ int externMem[]; if (threadIdx.x == 0) { @@ -66,13 +71,15 @@ __global__ void cudaPartition(CudaArrayView arr, CudaArrayView aux, int elemPerB //------------------------------------------------------------------------- - /* if(size <= blockDim.x*2 && myTask.blockCount == 1) { - bitonicSort(arr, myTask.arrBegin, myTask.arrEnd, bitonicSortAuxMemory); + bitoniSort1stStepSharedMemory_device( + aux, myTask.arrBegin, myTask.arrEnd, + (int*) externMem, + Cmp + ); return; } - */ //------------------------------------------------------------------------- @@ -183,6 +190,7 @@ class QUICKSORT TNL::Containers::Array cuda_blockToTaskMapping_Cnt; //is in reality 1 integer int iteration = 0; + 
//-------------------------------------------------------------------------------------- public: QUICKSORT(CudaArrayView _arr) @@ -194,8 +202,10 @@ public: tasksAmount = 1; } - void sort() + template + void sort(const Function & Cmp) { + const int auxMemByteSize = minElemPerBlock* sizeof(int); while (tasksAmount > 0) { int elemPerBlock = getElemPerBlock(); @@ -203,20 +213,22 @@ public: if (iteration % 2 == 0) { - cudaPartition<<>>( - arr, aux.getView(), - elemPerBlock, + cudaPartition<<>>( + arr, Cmp, + aux.getView(), cuda_blockToTaskMapping.getView(), + elemPerBlock, cuda_tasks.getView(), cuda_newTasks.getView(), cuda_newTasksAmount.getData() ); } else { - cudaPartition<<>>( - arr, aux.getView(), - elemPerBlock, + cudaPartition<<>>( + arr, Cmp, + aux.getView(), cuda_blockToTaskMapping.getView(), + elemPerBlock, cuda_newTasks.getView(), cuda_tasks.getView(), //swapped order to write back and forth without copying cuda_newTasksAmount.getData() ); @@ -292,9 +304,14 @@ public: //----------------------------------------------------------- -void quicksort(CudaArrayView arr) +template +void quicksort(CudaArrayView arr, const Function & Cmp) { QUICKSORT sorter(arr); - sorter.sort(); - return; + sorter.sort(Cmp); +} + +void quicksort(CudaArrayView arr) +{ + quicksort(arr, []__cuda_callable__(int a, int b){return a < b;}); } diff --git a/GPUSort/quicksort/task.h b/GPUSort/quicksort/task.h index a3226e1de..903428de0 100644 --- a/GPUSort/quicksort/task.h +++ b/GPUSort/quicksort/task.h @@ -4,15 +4,13 @@ struct TASK { int arrBegin, arrEnd;//start and end position of array to read from int auxBeginIdx, auxEndIdx; //start and end position of still available memory to write into - int pivot; int firstBlock, blockCount; //shared counter of blocks working together(how many are still working) __cuda_callable__ - TASK(int srcBegin, int srcEnd, int destBegin, int destEnd, int pivot) + TASK(int srcBegin, int srcEnd, int destBegin, int destEnd) : arrBegin(srcBegin), arrEnd(srcEnd), 
auxBeginIdx(destBegin), auxEndIdx(destEnd), - pivot(pivot), firstBlock(-1), blockCount(-1) {} TASK() = default; -- GitLab From 2d14db254ffb1da7d1811db12adf0c7ffa0c7002 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Tue, 9 Mar 2021 02:21:31 +0100 Subject: [PATCH 097/258] differentiate between counter and amount of working blocks for a task --- GPUSort/quicksort/quicksort.cuh | 9 ++++++--- GPUSort/quicksort/task.h | 10 ++++++++-- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/GPUSort/quicksort/quicksort.cuh b/GPUSort/quicksort/quicksort.cuh index 1dd9be285..7a8430963 100644 --- a/GPUSort/quicksort/quicksort.cuh +++ b/GPUSort/quicksort/quicksort.cuh @@ -74,10 +74,13 @@ __global__ void cudaPartition(CudaArrayView arr,const Function & Cmp, if(size <= blockDim.x*2 && myTask.blockCount == 1) { bitoniSort1stStepSharedMemory_device( - aux, myTask.arrBegin, myTask.arrEnd, + arr, myTask.arrBegin, myTask.arrEnd, (int*) externMem, Cmp ); + + for (int i = myBegin + threadIdx.x; i < myEnd; i += blockDim.x) + aux[i] = arr[i]; return; } @@ -102,7 +105,7 @@ __global__ void cudaPartition(CudaArrayView arr,const Function & Cmp, //----------------------------------------------------------- - if (threadIdx.x == 0 && atomicAdd(&(cuda_tasks[myTaskIdx].blockCount), -1) == 1) + if (threadIdx.x == 0 && atomicAdd(&(cuda_tasks[myTaskIdx].stillWorkingCnt), -1) == 1) { writePivot = true; myTask = cuda_tasks[myTaskIdx]; //update auxBeginIdx, auxEndIdx value @@ -163,7 +166,7 @@ __global__ void cudaInitTask(TNL::Containers::ArrayView Date: Tue, 9 Mar 2021 14:29:48 +0100 Subject: [PATCH 098/258] refactor for blocksort to be independent --- GPUSort/bitonicGPU/bitonicSort.h | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/GPUSort/bitonicGPU/bitonicSort.h b/GPUSort/bitonicGPU/bitonicSort.h index 4e7e02369..1eae30326 100644 --- a/GPUSort/bitonicGPU/bitonicSort.h +++ b/GPUSort/bitonicGPU/bitonicSort.h @@ -121,23 +121,19 @@ 
__global__ void bitonicMergeSharedMemory(TNL::Containers::ArrayView -__device__ void bitoniSort1stStepSharedMemory_device( +__device__ void bitonicSort_Block( TNL::Containers::ArrayView arr, - int begin, int end, Value* sharedMem, const Function & Cmp) + int myBlockStart, int myBlockEnd, Value* sharedMem, const Function & Cmp) { - int sharedMemLen = 2*blockDim.x; - - int myBlockStart = begin + blockIdx.x * sharedMemLen; - int myBlockEnd = end < myBlockStart+sharedMemLen? end : myBlockStart+sharedMemLen; //copy from globalMem into sharedMem int copy1 = myBlockStart + threadIdx.x; int copy2 = copy1 + blockDim.x; { - if(copy1 < end) + if(copy1 < myBlockEnd) sharedMem[threadIdx.x] = arr[copy1]; - if(copy2 < end) + if(copy2 < myBlockEnd) sharedMem[threadIdx.x + blockDim.x] = arr[copy2]; __syncthreads(); @@ -146,7 +142,7 @@ __device__ void bitoniSort1stStepSharedMemory_device( //------------------------------------------ //bitonic activity { - int i = blockIdx.x * blockDim.x + threadIdx.x; + int i = threadIdx.x; int paddedSize = closestPow2(myBlockEnd - myBlockStart); for (int monotonicSeqLen = 2; monotonicSeqLen <= paddedSize; monotonicSeqLen *= 2) @@ -154,7 +150,7 @@ __device__ void bitoniSort1stStepSharedMemory_device( //calculate the direction of swapping int monotonicSeqIdx = i / (monotonicSeqLen/2); bool ascending = (monotonicSeqIdx & 1) != 0; - if ((monotonicSeqIdx + 1) * monotonicSeqLen >= end) //special case for parts with no "partner" + if ((monotonicSeqIdx + 1) * monotonicSeqLen >= myBlockEnd) //special case for parts with no "partner" ascending = true; for (int len = monotonicSeqLen; len > 1; len /= 2) @@ -174,9 +170,9 @@ __device__ void bitoniSort1stStepSharedMemory_device( //------------------------------------------ //writeback to global memory { - if(copy1 < end) + if(copy1 < myBlockEnd) arr[copy1] = sharedMem[threadIdx.x]; - if(copy2 < end) + if(copy2 < myBlockEnd) arr[copy2] = sharedMem[threadIdx.x + blockDim.x]; } } @@ -191,8 +187,16 @@ __global__ void 
bitoniSort1stStepSharedMemory(TNL::Containers::ArrayView Date: Tue, 9 Mar 2021 15:03:48 +0100 Subject: [PATCH 099/258] include guard --- GPUSort/bitonicGPU/bitonicSort.h | 1 + 1 file changed, 1 insertion(+) diff --git a/GPUSort/bitonicGPU/bitonicSort.h b/GPUSort/bitonicGPU/bitonicSort.h index 1eae30326..2d6269ae5 100644 --- a/GPUSort/bitonicGPU/bitonicSort.h +++ b/GPUSort/bitonicGPU/bitonicSort.h @@ -1,3 +1,4 @@ +#pragma once #include //--------------------------------------------- -- GitLab From 434083579f7eefd6070d296973d49e17de68b6cb Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Tue, 9 Mar 2021 15:05:14 +0100 Subject: [PATCH 100/258] big array test --- GPUSort/quicksort/unitTests/unitTests.cu | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/GPUSort/quicksort/unitTests/unitTests.cu b/GPUSort/quicksort/unitTests/unitTests.cu index 265548237..829b01fa0 100644 --- a/GPUSort/quicksort/unitTests/unitTests.cu +++ b/GPUSort/quicksort/unitTests/unitTests.cu @@ -50,6 +50,23 @@ TEST(randomGenerated, smallArray_randomVal) } } + +TEST(randomGenerated, bigArray_randomVal) +{ + std::srand(304); + for(int i = 0; i < 50; i++) + { + int size = (1<<20) + (std::rand()% (1<<19)); + std::vector arr(size); + for(auto & x : arr) x = std::rand(); + TNL::Containers::Array cudaArr(arr); + + auto view = cudaArr.getView(); + quicksort(view); + ASSERT_TRUE(is_sorted(view)); + } +} + //---------------------------------------------------------------------------------- int main(int argc, char **argv) -- GitLab From 490e7b445618d7fbb4c337da368635fe62e51d67 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Tue, 9 Mar 2021 15:05:35 +0100 Subject: [PATCH 101/258] benchmark big arrays --- GPUSort/quicksort/benchmark/benchmark.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/GPUSort/quicksort/benchmark/benchmark.cu b/GPUSort/quicksort/benchmark/benchmark.cu index 66c201d36..764289b35 100644 --- a/GPUSort/quicksort/benchmark/benchmark.cu +++ 
b/GPUSort/quicksort/benchmark/benchmark.cu @@ -16,7 +16,7 @@ typedef Devices::Cuda Device; int main() { srand(8151); - for(int pow = 5; pow <= 10; pow++) + for(int pow = 5; pow <= 23; pow++) { int size =(1<< pow); -- GitLab From cb37004b39360cf3a72f73034cdd3b5e9d074b8e Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Tue, 9 Mar 2021 15:05:47 +0100 Subject: [PATCH 102/258] clean up --- GPUSort/quicksort/quicksort.cuh | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/GPUSort/quicksort/quicksort.cuh b/GPUSort/quicksort/quicksort.cuh index 7a8430963..2e1065f8c 100644 --- a/GPUSort/quicksort/quicksort.cuh +++ b/GPUSort/quicksort/quicksort.cuh @@ -73,17 +73,18 @@ __global__ void cudaPartition(CudaArrayView arr,const Function & Cmp, if(size <= blockDim.x*2 && myTask.blockCount == 1) { - bitoniSort1stStepSharedMemory_device( - arr, myTask.arrBegin, myTask.arrEnd, + bitonicSort_Block( + arr.getView(myBegin, myEnd), 0, size, (int*) externMem, Cmp ); - + __syncthreads(); + for (int i = myBegin + threadIdx.x; i < myEnd; i += blockDim.x) aux[i] = arr[i]; + return; } - //------------------------------------------------------------------------- int smaller = 0, bigger = 0; @@ -175,7 +176,7 @@ __global__ void cudaInitTask(TNL::Containers::ArrayView Date: Tue, 9 Mar 2021 15:19:25 +0100 Subject: [PATCH 103/258] add more interfaces --- GPUSort/bitonicGPU/bitonicSort.h | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/GPUSort/bitonicGPU/bitonicSort.h b/GPUSort/bitonicGPU/bitonicSort.h index 2d6269ae5..1b7087d5a 100644 --- a/GPUSort/bitonicGPU/bitonicSort.h +++ b/GPUSort/bitonicGPU/bitonicSort.h @@ -246,9 +246,15 @@ void bitonicSort(TNL::Containers::ArrayView arr, int //--------------------------------------------- template -void bitonicSort(TNL::Containers::ArrayView arr, const Function & cmp) +void bitonicSort(TNL::Containers::ArrayView arr, int begin, int end) { - bitonicSort(arr, 0, arr.getSize(), cmp); 
+ bitonicSort(arr, begin, end, [] __cuda_callable__ (const Value & a, const Value & b) {return a < b;}); +} + +template +void bitonicSort(TNL::Containers::ArrayView arr, const Function & Cmp) +{ + bitonicSort(arr, 0, arr.getSize(), Cmp); } template @@ -258,17 +264,27 @@ void bitonicSort(TNL::Containers::ArrayView arr) } //--------------------------------------------- - template -void bitonicSort(std::vector & vec,const Function & cmp) +void bitonicSort(std::vector & vec, int begin, int end, const Function & Cmp) { TNL::Containers::Array Arr(vec); auto view = Arr.getView(); - bitonicSort(view, cmp); + bitonicSort(view, begin, end, Cmp); TNL::Algorithms::MultiDeviceMemoryOperations:: copy(vec.data(), view.getData(), view.getSize()); +} +template +void bitonicSort(std::vector & vec, int begin, int end) +{ + bitonicSort(vec, begin, end, [] __cuda_callable__ (const Value & a, const Value & b) {return a < b;}); +} + +template +void bitonicSort(std::vector & vec, const Function & Cmp) +{ + bitonicSort(vec, 0, vec.size(), Cmp); } template -- GitLab From 7a01bb8c31c77a1746a6028036f4b56dd5dee1c7 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Tue, 9 Mar 2021 16:17:45 +0100 Subject: [PATCH 104/258] sort subarray correctly --- GPUSort/bitonicGPU/bitonicSort.h | 101 +++++++++++----------- GPUSort/bitonicGPU/unitTests/unitTests.cu | 16 +++- 2 files changed, 64 insertions(+), 53 deletions(-) diff --git a/GPUSort/bitonicGPU/bitonicSort.h b/GPUSort/bitonicGPU/bitonicSort.h index 1b7087d5a..f5a70c0ac 100644 --- a/GPUSort/bitonicGPU/bitonicSort.h +++ b/GPUSort/bitonicGPU/bitonicSort.h @@ -27,7 +27,7 @@ __host__ __device__ void cmpSwap(Value & a, Value &b, bool ascending, const Func */ template __global__ void bitonicMergeGlobal(TNL::Containers::ArrayView arr, - int begin, int end, const Function & Cmp, + const Function & Cmp, int monotonicSeqLen, int len, int partsInSeq) { int i = blockIdx.x * blockDim.x + threadIdx.x; @@ -35,15 +35,15 @@ __global__ void 
bitonicMergeGlobal(TNL::Containers::ArrayView= end) //arr[e] is virtual padding and will not be exchanged with + if (e >= arr.getSize()) //arr[e] is virtual padding and will not be exchanged with return; //calculate the direction of swapping int monotonicSeqIdx = part / partsInSeq; bool ascending = (monotonicSeqIdx & 1) != 0; - if ((monotonicSeqIdx + 1) * monotonicSeqLen >= end) //special case for part with no "partner" to be merged with in next phase + if ((monotonicSeqIdx + 1) * monotonicSeqLen >= arr.getSize()) //special case for part with no "partner" to be merged with in next phase ascending = true; cmpSwap(arr[s], arr[e], ascending, Cmp); @@ -55,9 +55,10 @@ __global__ void bitonicMergeGlobal(TNL::Containers::ArrayView -__global__ void bitonicMergeSharedMemory(TNL::Containers::ArrayView arr, - int begin, int end, const Function & Cmp, - int monotonicSeqLen, int len, int partsInSeq) +__global__ +void bitonicMergeSharedMemory(TNL::Containers::ArrayView arr, + const Function & Cmp, + int monotonicSeqLen, int len, int partsInSeq) { extern __shared__ int externMem[]; Value * sharedMem = (Value *)externMem; @@ -65,16 +66,16 @@ __global__ void bitonicMergeSharedMemory(TNL::Containers::ArrayView= end) + if ((monotonicSeqIdx + 1) * monotonicSeqLen >= arr.getSize()) ascending = true; //------------------------------------------ @@ -102,7 +103,7 @@ __global__ void bitonicMergeSharedMemory(TNL::Containers::ArrayView -__device__ void bitonicSort_Block( - TNL::Containers::ArrayView arr, - int myBlockStart, int myBlockEnd, Value* sharedMem, const Function & Cmp) +__device__ +void bitonicSort_Block(TNL::Containers::ArrayView arr, Value* sharedMem, const Function & Cmp) { - //copy from globalMem into sharedMem - int copy1 = myBlockStart + threadIdx.x; + int copy1 = threadIdx.x; int copy2 = copy1 + blockDim.x; { - if(copy1 < myBlockEnd) - sharedMem[threadIdx.x] = arr[copy1]; + if(copy1 < arr.getSize()) + sharedMem[copy1] = arr[copy1]; - if(copy2 < myBlockEnd) - 
sharedMem[threadIdx.x + blockDim.x] = arr[copy2]; + if(copy2 < arr.getSize()) + sharedMem[copy2] = arr[copy2]; __syncthreads(); } @@ -144,24 +143,24 @@ __device__ void bitonicSort_Block( //bitonic activity { int i = threadIdx.x; - int paddedSize = closestPow2(myBlockEnd - myBlockStart); + int paddedSize = closestPow2(arr.getSize()); for (int monotonicSeqLen = 2; monotonicSeqLen <= paddedSize; monotonicSeqLen *= 2) { //calculate the direction of swapping int monotonicSeqIdx = i / (monotonicSeqLen/2); bool ascending = (monotonicSeqIdx & 1) != 0; - if ((monotonicSeqIdx + 1) * monotonicSeqLen >= myBlockEnd) //special case for parts with no "partner" + if ((monotonicSeqIdx + 1) * monotonicSeqLen >= arr.getSize()) //special case for parts with no "partner" ascending = true; for (int len = monotonicSeqLen; len > 1; len /= 2) { //calculates which 2 indexes will be compared and swap - int part = threadIdx.x / (len / 2); - int s = part * len + (threadIdx.x & ((len / 2) - 1)); + int part = i / (len / 2); + int s = part * len + (i & ((len / 2) - 1)); int e = s + len / 2; - if(e < myBlockEnd - myBlockStart) //touching virtual padding + if(e < arr.getSize()) //not touching virtual padding cmpSwap(sharedMem[s], sharedMem[e], ascending, Cmp); __syncthreads(); } @@ -171,10 +170,10 @@ __device__ void bitonicSort_Block( //------------------------------------------ //writeback to global memory { - if(copy1 < myBlockEnd) - arr[copy1] = sharedMem[threadIdx.x]; - if(copy2 < myBlockEnd) - arr[copy2] = sharedMem[threadIdx.x + blockDim.x]; + if(copy1 < arr.getSize()) + arr[copy1] = sharedMem[copy1]; + if(copy2 < arr.getSize()) + arr[copy2] = sharedMem[copy2]; } } /** @@ -184,43 +183,41 @@ __device__ void bitonicSort_Block( * this continues until whole sharedMem is sorted * */ template -__global__ void bitoniSort1stStepSharedMemory(TNL::Containers::ArrayView arr, - int begin, int end, const Function & Cmp) +__global__ void bitoniSort1stStepSharedMemory(TNL::Containers::ArrayView arr, const 
Function & Cmp) { extern __shared__ int externMem[]; int sharedMemLen = 2*blockDim.x; - int myBlockStart = begin + blockIdx.x * sharedMemLen; - int myBlockEnd = end < myBlockStart+sharedMemLen? end : myBlockStart+sharedMemLen; + int myBlockStart = blockIdx.x * sharedMemLen; + int myBlockEnd = TNL::min(arr.getSize(), myBlockStart + sharedMemLen); - if(blockIdx.x%2 == 0) - bitonicSort_Block(arr, myBlockStart, myBlockEnd, (Value*) externMem, Cmp); + if(blockIdx.x%2 || blockIdx.x + 1 == gridDim.x) + bitonicSort_Block(arr.getView(myBlockStart, myBlockEnd), (Value*) externMem, Cmp); else - bitonicSort_Block(arr, myBlockStart, myBlockEnd, (Value*) externMem, + bitonicSort_Block(arr.getView(myBlockStart, myBlockEnd), (Value*) externMem, [&] __cuda_callable__ (const Value&a, const Value&b){return Cmp(b, a);} - ); + ); } //--------------------------------------------- template -void bitonicSort(TNL::Containers::ArrayView arr, int begin, int end, const Function& Cmp) +void bitonicSort(TNL::Containers::ArrayView src, int begin, int end, const Function& Cmp) { - int arrSize = end - begin; - int paddedSize = closestPow2(arrSize); + TNL::Containers::ArrayView arr = src.getView(begin, end); + int paddedSize = closestPow2(arr.getSize()); - int threadsNeeded = arrSize / 2 + (arrSize %2 !=0); + int threadsNeeded = arr.getSize() / 2 + (arr.getSize() %2 !=0); const int maxThreadsPerBlock = 512; int threadPerBlock = maxThreadsPerBlock; - int blocks = threadsNeeded / threadPerBlock + (threadsNeeded % threadPerBlock == 0 ? 
0 : 1); + int blocks = threadsNeeded / threadPerBlock + (threadsNeeded % threadPerBlock != 0); const int sharedMemLen = threadPerBlock * 2; const int sharedMemSize = sharedMemLen* sizeof(Value); //--------------------------------------------------------------------------------- - - bitoniSort1stStepSharedMemory<<>>(arr, begin, end, Cmp); + bitoniSort1stStepSharedMemory<<>>(arr, Cmp); for (int monotonicSeqLen = 2*sharedMemLen; monotonicSeqLen <= paddedSize; monotonicSeqLen *= 2) { @@ -228,14 +225,14 @@ void bitonicSort(TNL::Containers::ArrayView arr, int { if(len > sharedMemLen) { - bitonicMergeGlobal<<>>(arr, begin, end, Cmp, - monotonicSeqLen, len, partsInSeq); + bitonicMergeGlobal<<>>( + arr, Cmp, monotonicSeqLen, len, partsInSeq); } else { - bitonicMergeSharedMemory<<>>(arr, begin, end, Cmp, - monotonicSeqLen, len, partsInSeq); + bitonicMergeSharedMemory<<>>( + arr, Cmp, monotonicSeqLen, len, partsInSeq); break; } } diff --git a/GPUSort/bitonicGPU/unitTests/unitTests.cu b/GPUSort/bitonicGPU/unitTests/unitTests.cu index 6f4f5e7b7..323a1a732 100644 --- a/GPUSort/bitonicGPU/unitTests/unitTests.cu +++ b/GPUSort/bitonicGPU/unitTests/unitTests.cu @@ -3,6 +3,7 @@ #include #include #include +#include #include #include @@ -108,7 +109,7 @@ TEST(randomGenerated, bigArray_all0) auto view = cudaArr.getView(); bitonicSort(view); - ASSERT_TRUE(true); + ASSERT_TRUE(is_sorted(view)); } } @@ -180,6 +181,19 @@ TEST(sortstdVector, stdvector) ASSERT_TRUE(std::is_sorted(arr.begin(), arr.end())); } +TEST(sortRange, secondHalf) +{ + std::vector arr(19); + int s = 19/2; + for(size_t i = 0; i < s; i++) arr[i] = -1; + for(size_t i = s; i < 19; i++) arr[i] = -i; + + bitonicSort(arr, s, 19); + + ASSERT_TRUE(std::is_sorted(arr.begin() + s, arr.end())); + ASSERT_TRUE(arr[0] == -1); + ASSERT_TRUE(arr[s-1] == -1); +} //---------------------------------------------------------------------------------- -- GitLab From 94bbe16ae01be69c98e6061a20065c5e247d25b1 Mon Sep 17 00:00:00 2001 From: Xuan 
Thang Nguyen Date: Tue, 9 Mar 2021 16:19:57 +0100 Subject: [PATCH 105/258] bitonic UI change --- GPUSort/quicksort/quicksort.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/GPUSort/quicksort/quicksort.cuh b/GPUSort/quicksort/quicksort.cuh index 2e1065f8c..1002eb8c7 100644 --- a/GPUSort/quicksort/quicksort.cuh +++ b/GPUSort/quicksort/quicksort.cuh @@ -74,7 +74,7 @@ __global__ void cudaPartition(CudaArrayView arr,const Function & Cmp, if(size <= blockDim.x*2 && myTask.blockCount == 1) { bitonicSort_Block( - arr.getView(myBegin, myEnd), 0, size, + arr.getView(myBegin, myEnd), (int*) externMem, Cmp ); -- GitLab From 56eb9c9b8fd7bf81989bce5fc817848071299179 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Tue, 9 Mar 2021 16:36:52 +0100 Subject: [PATCH 106/258] more mid range test --- GPUSort/bitonicGPU/unitTests/unitTests.cu | 45 +++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/GPUSort/bitonicGPU/unitTests/unitTests.cu b/GPUSort/bitonicGPU/unitTests/unitTests.cu index 323a1a732..5911e9e92 100644 --- a/GPUSort/bitonicGPU/unitTests/unitTests.cu +++ b/GPUSort/bitonicGPU/unitTests/unitTests.cu @@ -195,6 +195,51 @@ TEST(sortRange, secondHalf) ASSERT_TRUE(arr[s-1] == -1); } +TEST(sortRange, middle) +{ + std::srand(8705); + + std::vector arr(20); + int s = 5, e = 15; + for(size_t i = 0; i < s; i++) arr[i] = -1; + for(size_t i = e; i < 20; i++) arr[i] = -1; + + for(size_t i = s; i < e; i++) arr[i] = std::rand(); + + bitonicSort(arr, s, e); + + ASSERT_TRUE(std::is_sorted(arr.begin() + s, arr.begin() + e)); + ASSERT_TRUE(arr[0] == -1); + ASSERT_TRUE(arr.back() == -1); + ASSERT_TRUE(arr[s-1] == -1); + ASSERT_TRUE(arr[e] == -1); +} + +TEST(sortRange, middleMultiBlock) +{ + std::srand(4513); + int size = 1<<20; + int s = 2000, e = size - 1512; + + std::vector arr(size); + for(size_t i = 0; i < s; i++) arr[i] = -1; + for(size_t i = e; i < size; i++) arr[i] = -1; + + for(size_t i = s; i < e; i++) arr[i] = std::rand(); + + 
bitonicSort(arr, s, e); + + ASSERT_TRUE(std::is_sorted(arr.begin() + s, arr.begin() + e)); + + ASSERT_TRUE(arr[0] == -1); + ASSERT_TRUE(arr[std::rand() % s] == -1); + ASSERT_TRUE(arr[s-1] == -1); + + ASSERT_TRUE(arr[e] == -1); + ASSERT_TRUE(arr[e + (std::rand() % (size - e))] == -1); + ASSERT_TRUE(arr.back() == -1); +} + //---------------------------------------------------------------------------------- -- GitLab From 633fd4344b919ad15c8bc0ca413bcb0b72e60f1a Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Tue, 9 Mar 2021 20:44:05 +0100 Subject: [PATCH 107/258] init 2nd phase --- GPUSort/quicksort/quicksort.cuh | 138 ++++++++++++++++++++++---------- 1 file changed, 95 insertions(+), 43 deletions(-) diff --git a/GPUSort/quicksort/quicksort.cuh b/GPUSort/quicksort/quicksort.cuh index 1002eb8c7..6f4049367 100644 --- a/GPUSort/quicksort/quicksort.cuh +++ b/GPUSort/quicksort/quicksort.cuh @@ -46,7 +46,10 @@ __global__ void cudaPartition(CudaArrayView arr,const Function & Cmp, int elemPerBlock, TNL::Containers::ArrayView cuda_tasks, TNL::Containers::ArrayView cuda_newTasks, - int *newTasksCnt) + int *newTasksCnt, + TNL::Containers::ArrayView cuda_2ndPhaseTasks, + int * cuda_2ndPhaseCnt +) { static __shared__ TASK myTask; static __shared__ int smallerStart, biggerStart; @@ -67,26 +70,9 @@ __global__ void cudaPartition(CudaArrayView arr,const Function & Cmp, //only works if consecutive blocks work on the same task const int myBegin = myTask.arrBegin + elemPerBlock * (blockIdx.x - myTask.firstBlock); const int myEnd = TNL::min(myTask.arrEnd, myBegin + elemPerBlock); - const int size = myEnd - myBegin; //------------------------------------------------------------------------- - if(size <= blockDim.x*2 && myTask.blockCount == 1) - { - bitonicSort_Block( - arr.getView(myBegin, myEnd), - (int*) externMem, - Cmp - ); - __syncthreads(); - - for (int i = myBegin + threadIdx.x; i < myEnd; i += blockDim.x) - aux[i] = arr[i]; - - return; - } - 
//------------------------------------------------------------------------- - int smaller = 0, bigger = 0; cmpElem(arr, myBegin, myEnd, smaller, bigger, pivot); @@ -121,23 +107,45 @@ __global__ void cudaPartition(CudaArrayView arr,const Function & Cmp, if (threadIdx.x != 0) return; - - if (myTask.auxBeginIdx - myTask.arrBegin > 1) //smaller + + if (myTask.auxBeginIdx - myTask.arrBegin > 0) //smaller { - int newTaskIdx = atomicAdd(newTasksCnt, 1); - cuda_newTasks[newTaskIdx] = TASK( - myTask.arrBegin, myTask.auxBeginIdx, - myTask.arrBegin, myTask.auxBeginIdx - ); + if(myTask.auxBeginIdx - myTask.arrBegin <= blockDim.x*2) + { + int newTaskIdx = atomicAdd(cuda_2ndPhaseCnt, 1); + cuda_2ndPhaseTasks[newTaskIdx] = TASK( + myTask.arrBegin, myTask.auxBeginIdx, + myTask.arrBegin, myTask.auxBeginIdx + ); + } + else + { + int newTaskIdx = atomicAdd(newTasksCnt, 1); + cuda_newTasks[newTaskIdx] = TASK( + myTask.arrBegin, myTask.auxBeginIdx, + myTask.arrBegin, myTask.auxBeginIdx + ); + } } - if (myTask.arrEnd - myTask.auxEndIdx > 1) //greater + if (myTask.arrEnd - myTask.auxEndIdx > 0) //greater { - int newTaskIdx = atomicAdd(newTasksCnt, 1); - cuda_newTasks[newTaskIdx] = TASK( - myTask.auxEndIdx, myTask.arrEnd, - myTask.auxEndIdx, myTask.arrEnd - ); + if (myTask.arrEnd - myTask.auxEndIdx <= blockDim.x*2) + { + int newTaskIdx = atomicAdd(cuda_2ndPhaseCnt, 1); + cuda_2ndPhaseTasks[newTaskIdx] = TASK( + myTask.auxEndIdx, myTask.arrEnd, + myTask.auxEndIdx, myTask.arrEnd + ); + } + else + { + int newTaskIdx = atomicAdd(newTasksCnt, 1); + cuda_newTasks[newTaskIdx] = TASK( + myTask.auxEndIdx, myTask.arrEnd, + myTask.auxEndIdx, myTask.arrEnd + ); + } } } @@ -174,10 +182,21 @@ __global__ void cudaInitTask(TNL::Containers::ArrayView +__global__ +void cudaQuickSort(CudaArrayView arr, const Function & Cmp, + TNL::Containers::ArrayView cuda_tasks, + int *TasksCnt) + {} + + + //----------------------------------------------------------- 
//----------------------------------------------------------- -const int threadsPerBlock = 32, maxBlocks = 1 << 14; //16k -const int maxTasks = maxBlocks; +const int threadsPerBlock = 512, maxBlocks = 1 << 14; //16k +const int maxTasks = maxBlocks/2; const int minElemPerBlock = threadsPerBlock*2; class QUICKSORT @@ -185,10 +204,11 @@ class QUICKSORT CudaArrayView arr; TNL::Containers::Array aux; - CudaTaskArray cuda_tasks, cuda_newTasks; + CudaTaskArray cuda_tasks, cuda_newTasks, cuda_2ndPhaseTasks; - TNL::Containers::Array cuda_newTasksAmount; //is in reality 1 integer + TNL::Containers::Array cuda_newTasksAmount, cuda_2ndPhaseTasksAmount; //is in reality 1 integer int tasksAmount; //counter for Host + int totalTask; TNL::Containers::Array cuda_blockToTaskMapping; TNL::Containers::Array cuda_blockToTaskMapping_Cnt; //is in reality 1 integer @@ -199,18 +219,20 @@ class QUICKSORT public: QUICKSORT(CudaArrayView _arr) : arr(_arr), aux(arr.getSize()), - cuda_tasks(maxTasks), cuda_newTasks(maxTasks), cuda_newTasksAmount(1), + cuda_tasks(maxTasks), cuda_newTasks(maxTasks), cuda_2ndPhaseTasks(maxTasks*2), + cuda_newTasksAmount(1), cuda_2ndPhaseTasksAmount(1), cuda_blockToTaskMapping(maxBlocks), cuda_blockToTaskMapping_Cnt(1) { cuda_tasks.setElement(0, TASK(0, arr.getSize(), 0, arr.getSize())); - tasksAmount = 1; + totalTask = tasksAmount = 1; + cuda_2ndPhaseTasksAmount = 0; } template void sort(const Function & Cmp) { const int auxMemByteSize = minElemPerBlock* sizeof(int); - while (tasksAmount > 0) + while (tasksAmount > 0 && totalTask < maxTasks) { int elemPerBlock = getElemPerBlock(); int blocksCnt = initTasks(elemPerBlock); @@ -223,7 +245,8 @@ public: cuda_blockToTaskMapping.getView(), elemPerBlock, cuda_tasks.getView(), cuda_newTasks.getView(), - cuda_newTasksAmount.getData() + cuda_newTasksAmount.getData(), + cuda_2ndPhaseTasks.getView(), cuda_2ndPhaseTasksAmount.getData() ); } else @@ -234,17 +257,17 @@ public: cuda_blockToTaskMapping.getView(), elemPerBlock, 
cuda_newTasks.getView(), cuda_tasks.getView(), //swapped order to write back and forth without copying - cuda_newTasksAmount.getData() + cuda_newTasksAmount.getData(), + cuda_2ndPhaseTasks.getView(), cuda_2ndPhaseTasksAmount.getData() ); } tasksAmount = processNewTasks(); iteration++; + } - } - - //insert phase 2 sort for almostDoneTasks + _2ndPhase(); cudaDeviceSynchronize(); } @@ -303,8 +326,37 @@ public: copy(arr.getData(), aux.getData(), aux.getSize()); tasksAmount = cuda_newTasksAmount.getElement(0); + totalTask = tasksAmount + cuda_2ndPhaseTasksAmount.getElement(0); return tasksAmount; } + + void _2ndPhase() + { + if(totalTask == 0) return; + + init2ndPhase(); + + } + + void init2ndPhase() + { + TNL::Algorithms::MultiDeviceMemoryOperations:: + copy(cuda_2ndPhaseTasks.getData() + (totalTask - tasksAmount), + (iteration%2? cuda_newTasks.getData() :cuda_tasks.getData() ), + tasksAmount + ); + + int threads = min(totalTask, threadsPerBlock); + int blocks = totalTask / threads + (totalTask % threads != 0); + + cuda_blockToTaskMapping_Cnt = 0; + + cudaInitTask<<>>(cuda_2ndPhaseTasks.getView(), + totalTask, INT_MAX, + cuda_blockToTaskMapping_Cnt.getData(), + cuda_blockToTaskMapping.getView() + ); + } }; //----------------------------------------------------------- -- GitLab From 417787026108db9bb25fc1453741e55db9832ea8 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Tue, 9 Mar 2021 22:39:12 +0100 Subject: [PATCH 108/258] block quicksort --- GPUSort/quicksort/quicksort.cuh | 103 +++++++++++++++++++++++++------- 1 file changed, 82 insertions(+), 21 deletions(-) diff --git a/GPUSort/quicksort/quicksort.cuh b/GPUSort/quicksort/quicksort.cuh index 6f4049367..176c716e8 100644 --- a/GPUSort/quicksort/quicksort.cuh +++ b/GPUSort/quicksort/quicksort.cuh @@ -56,7 +56,6 @@ __global__ void cudaPartition(CudaArrayView arr,const Function & Cmp, static __shared__ int pivot; static __shared__ int myTaskIdx; static __shared__ bool writePivot; - extern __shared__ int 
externMem[]; if (threadIdx.x == 0) { @@ -184,14 +183,83 @@ __global__ void cudaInitTask(TNL::Containers::ArrayView +__device__ void cudaQuickSort_block(CudaArrayView arr, const Function & Cmp, + CudaArrayView aux, int * externMem) +{ + static __shared__ int pivotBegin, pivotEnd; + static __shared__ int pivot; + + if (threadIdx.x == 0) + pivot = arr[arr.getSize() - 1]; + __syncthreads(); + + if(arr.getSize() <= blockDim.x*2) + { + bitonicSort_Block(arr, (int*) externMem, Cmp); + return; + } + + int smaller = 0, bigger = 0; + cmpElem(arr, 0, arr.getSize(), smaller, bigger, pivot); + + int smallerOffset = blockInclusivePrefixSum(smaller); + int biggerOffset = blockInclusivePrefixSum(bigger); + + if (threadIdx.x == blockDim.x - 1) + { + pivotBegin = smallerOffset; + pivotEnd = arr.getSize() - biggerOffset; + } + __syncthreads(); + + int destSmaller = smallerOffset - smaller; + int destBigger = pivotEnd + biggerOffset - bigger; + copyData(arr, 0, arr.getSize(), aux, destSmaller, destBigger, pivot); + __syncthreads(); + + for (int i = threadIdx.x; i < arr.getSize(); i += blockDim.x) + { + if(i >= pivotBegin && i < pivotEnd) + arr[i] = pivot; + else + arr[i] = aux[i]; + } + __syncthreads(); + + if(pivotBegin > 1) //left from pivot are smaller elems + { + cudaQuickSort_block(arr.getView(0, pivotBegin), Cmp, + aux.getView(0, pivotBegin), externMem + ); + } + + if(arr.getSize() - pivotEnd > 1) //right from pivot until arr.size are elem greater than pivot + { + cudaQuickSort_block(arr.getView(pivotEnd, arr.getSize()), Cmp, + aux.getView(pivotEnd, arr.getSize()), externMem + ); + } + +} + template __global__ void cudaQuickSort(CudaArrayView arr, const Function & Cmp, - TNL::Containers::ArrayView cuda_tasks, - int *TasksCnt) - {} + CudaArrayView aux, + TNL::Containers::ArrayView cuda_tasks) +{ + extern __shared__ int externMem[]; + static __shared__ TASK task; + if(threadIdx.x == 0) + task = cuda_tasks[blockIdx.x]; + __syncthreads(); + 
cudaQuickSort_block(arr.getView(task.arrBegin, task.arrEnd), Cmp, + aux.getView(task.auxBeginIdx, task.auxEndIdx), externMem + ); +} //----------------------------------------------------------- //----------------------------------------------------------- @@ -231,7 +299,6 @@ public: template void sort(const Function & Cmp) { - const int auxMemByteSize = minElemPerBlock* sizeof(int); while (tasksAmount > 0 && totalTask < maxTasks) { int elemPerBlock = getElemPerBlock(); @@ -239,7 +306,7 @@ public: if (iteration % 2 == 0) { - cudaPartition<<>>( + cudaPartition<<>>( arr, Cmp, aux.getView(), cuda_blockToTaskMapping.getView(), @@ -251,7 +318,7 @@ public: } else { - cudaPartition<<>>( + cudaPartition<<>>( arr, Cmp, aux.getView(), cuda_blockToTaskMapping.getView(), @@ -267,7 +334,7 @@ public: iteration++; } - _2ndPhase(); + _2ndPhase(Cmp); cudaDeviceSynchronize(); } @@ -330,15 +397,18 @@ public: return tasksAmount; } - void _2ndPhase() + template + void _2ndPhase(const Function & Cmp) { if(totalTask == 0) return; - init2ndPhase(); + int blocks = init2ndPhase(); + int auxMem = threadsPerBlock*2*sizeof(int); + cudaQuickSort<<>>(arr, Cmp, aux.getView(), cuda_2ndPhaseTasks); } - void init2ndPhase() + int init2ndPhase() { TNL::Algorithms::MultiDeviceMemoryOperations:: copy(cuda_2ndPhaseTasks.getData() + (totalTask - tasksAmount), @@ -346,16 +416,7 @@ public: tasksAmount ); - int threads = min(totalTask, threadsPerBlock); - int blocks = totalTask / threads + (totalTask % threads != 0); - - cuda_blockToTaskMapping_Cnt = 0; - - cudaInitTask<<>>(cuda_2ndPhaseTasks.getView(), - totalTask, INT_MAX, - cuda_blockToTaskMapping_Cnt.getData(), - cuda_blockToTaskMapping.getView() - ); + return totalTask; } }; -- GitLab From 228cf9a370647cff094b69fc526f6ef2cde2e76e Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Tue, 9 Mar 2021 23:23:37 +0100 Subject: [PATCH 109/258] task size --- GPUSort/quicksort/quicksort.cuh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/GPUSort/quicksort/quicksort.cuh b/GPUSort/quicksort/quicksort.cuh index 176c716e8..814995546 100644 --- a/GPUSort/quicksort/quicksort.cuh +++ b/GPUSort/quicksort/quicksort.cuh @@ -264,7 +264,7 @@ void cudaQuickSort(CudaArrayView arr, const Function & Cmp, //----------------------------------------------------------- //----------------------------------------------------------- const int threadsPerBlock = 512, maxBlocks = 1 << 14; //16k -const int maxTasks = maxBlocks/2; +const int maxTasks = maxBlocks/4; const int minElemPerBlock = threadsPerBlock*2; class QUICKSORT @@ -287,7 +287,7 @@ class QUICKSORT public: QUICKSORT(CudaArrayView _arr) : arr(_arr), aux(arr.getSize()), - cuda_tasks(maxTasks), cuda_newTasks(maxTasks), cuda_2ndPhaseTasks(maxTasks*2), + cuda_tasks(maxBlocks), cuda_newTasks(maxBlocks), cuda_2ndPhaseTasks(maxBlocks), cuda_newTasksAmount(1), cuda_2ndPhaseTasksAmount(1), cuda_blockToTaskMapping(maxBlocks), cuda_blockToTaskMapping_Cnt(1) { -- GitLab From 3ad278bec96612cfadb1523518bd37770c57de1e Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Wed, 10 Mar 2021 01:12:58 +0100 Subject: [PATCH 110/258] custom stack --- GPUSort/quicksort/quicksort.cuh | 140 +++++++++++++++++++------------- 1 file changed, 83 insertions(+), 57 deletions(-) diff --git a/GPUSort/quicksort/quicksort.cuh b/GPUSort/quicksort/quicksort.cuh index 814995546..284707f49 100644 --- a/GPUSort/quicksort/quicksort.cuh +++ b/GPUSort/quicksort/quicksort.cuh @@ -185,68 +185,90 @@ __global__ void cudaInitTask(TNL::Containers::ArrayView __device__ void cudaQuickSort_block(CudaArrayView arr, const Function & Cmp, - CudaArrayView aux, int * externMem) + CudaArrayView aux, + int * stackArrBegin, int *stackArrEnd, int stackSize, + int * bitonicMem ) { + static __shared__ int begin, end; + static __shared__ int stackTop; static __shared__ int pivotBegin, pivotEnd; static __shared__ int pivot; if (threadIdx.x == 0) - pivot = arr[arr.getSize() - 1]; + { + stackArrBegin[0] = 0; + stackArrEnd[0] 
= arr.getSize(); + stackTop = 1; + } __syncthreads(); - if(arr.getSize() <= blockDim.x*2) + while(stackTop > 0) { - bitonicSort_Block(arr, (int*) externMem, Cmp); - return; - } + if (threadIdx.x == 0) + { + begin = stackArrBegin[stackTop - 1]; + end = stackArrEnd[stackTop - 1]; + stackTop--; + pivot = arr[end - 1]; + } + __syncthreads(); + + int size = end - begin; + if(size<= blockDim.x*2) + { + bitonicSort_Block(arr.getView(begin, end), bitonicMem, Cmp); + continue; + } - int smaller = 0, bigger = 0; - cmpElem(arr, 0, arr.getSize(), smaller, bigger, pivot); + int smaller = 0, bigger = 0; + cmpElem(arr, begin, end, smaller, bigger, pivot); - int smallerOffset = blockInclusivePrefixSum(smaller); - int biggerOffset = blockInclusivePrefixSum(bigger); + int smallerOffset = blockInclusivePrefixSum(smaller); + int biggerOffset = blockInclusivePrefixSum(bigger); - if (threadIdx.x == blockDim.x - 1) - { - pivotBegin = smallerOffset; - pivotEnd = arr.getSize() - biggerOffset; - } - __syncthreads(); + if (threadIdx.x == blockDim.x - 1) + { + pivotBegin = begin + smallerOffset; + pivotEnd = end - biggerOffset; + } + __syncthreads(); - int destSmaller = smallerOffset - smaller; - int destBigger = pivotEnd + biggerOffset - bigger; - copyData(arr, 0, arr.getSize(), aux, destSmaller, destBigger, pivot); - __syncthreads(); + int destSmaller = smallerOffset - smaller; + int destBigger = pivotEnd + biggerOffset - bigger; + copyData(arr, begin, end, aux, destSmaller, destBigger, pivot); + __syncthreads(); - for (int i = threadIdx.x; i < arr.getSize(); i += blockDim.x) - { - if(i >= pivotBegin && i < pivotEnd) - arr[i] = pivot; - else - arr[i] = aux[i]; - } - __syncthreads(); + for (int i = begin + threadIdx.x; i < end; i += blockDim.x) + { + if(i >= pivotBegin && i < pivotEnd) + arr[i] = pivot; + else + arr[i] = aux[i]; + } + __syncthreads(); - if(pivotBegin > 1) //left from pivot are smaller elems - { - cudaQuickSort_block(arr.getView(0, pivotBegin), Cmp, - aux.getView(0, 
pivotBegin), externMem - ); - } + if(threadIdx.x != 0) continue; - if(arr.getSize() - pivotEnd > 1) //right from pivot until arr.size are elem greater than pivot - { - cudaQuickSort_block(arr.getView(pivotEnd, arr.getSize()), Cmp, - aux.getView(pivotEnd, arr.getSize()), externMem - ); - } + if(pivotBegin - begin > 1) //left from pivot are smaller elems + { + stackArrBegin[stackTop] = begin; + stackArrEnd[stackTop] = pivotBegin; + stackTop++; + } + if(end - pivotEnd > 1) //right from pivot until end are elem greater than pivot + { + stackArrBegin[stackTop] = pivotEnd; + stackArrEnd[stackTop] = end; + stackTop++; + } + } } template __global__ void cudaQuickSort(CudaArrayView arr, const Function & Cmp, - CudaArrayView aux, + CudaArrayView aux, int stackSize, TNL::Containers::ArrayView cuda_tasks) { extern __shared__ int externMem[]; @@ -256,14 +278,21 @@ void cudaQuickSort(CudaArrayView arr, const Function & Cmp, task = cuda_tasks[blockIdx.x]; __syncthreads(); + int * bitonicMem = externMem; + int * stackLeft = bitonicMem + (2*blockDim.x); + int * stackRight = stackLeft+ (stackSize/2); + cudaQuickSort_block(arr.getView(task.arrBegin, task.arrEnd), Cmp, - aux.getView(task.auxBeginIdx, task.auxEndIdx), externMem + aux.getView(task.auxBeginIdx, task.auxEndIdx), + stackLeft, stackRight, stackSize/2, + bitonicMem ); + } //----------------------------------------------------------- //----------------------------------------------------------- -const int threadsPerBlock = 512, maxBlocks = 1 << 14; //16k +const int threadsPerBlock = 512, maxBlocks = 1 << 15; //32k const int maxTasks = maxBlocks/4; const int minElemPerBlock = threadsPerBlock*2; @@ -401,22 +430,19 @@ public: void _2ndPhase(const Function & Cmp) { if(totalTask == 0) return; - - int blocks = init2ndPhase(); - - int auxMem = threadsPerBlock*2*sizeof(int); - cudaQuickSort<<>>(arr, Cmp, aux.getView(), cuda_2ndPhaseTasks); - } - - int init2ndPhase() - { + TNL::Algorithms::MultiDeviceMemoryOperations:: - 
copy(cuda_2ndPhaseTasks.getData() + (totalTask - tasksAmount), - (iteration%2? cuda_newTasks.getData() :cuda_tasks.getData() ), - tasksAmount - ); + copy(cuda_2ndPhaseTasks.getData() + (totalTask - tasksAmount), + (iteration%2? cuda_newTasks.getData() :cuda_tasks.getData() ), + tasksAmount + ); + + int blocks = totalTask; - return totalTask; + int stackSize = 256, stackMem = stackSize * sizeof(int); + int bitonicMem = threadsPerBlock*2*sizeof(int); + int auxMem = stackMem + bitonicMem; + cudaQuickSort<<>>(arr, Cmp, aux.getView(), stackSize, cuda_2ndPhaseTasks.getView()); } }; -- GitLab From 64442d5f7e095a2a0da4f589d54878fb358d071b Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Fri, 12 Mar 2021 15:49:03 +0100 Subject: [PATCH 111/258] stack size and access fix --- GPUSort/quicksort/quicksort.cuh | 37 +++++++++++++++++---------------- 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/GPUSort/quicksort/quicksort.cuh b/GPUSort/quicksort/quicksort.cuh index 284707f49..674caca44 100644 --- a/GPUSort/quicksort/quicksort.cuh +++ b/GPUSort/quicksort/quicksort.cuh @@ -245,23 +245,24 @@ __device__ void cudaQuickSort_block(CudaArrayView arr, const Function & Cmp, else arr[i] = aux[i]; } - __syncthreads(); - - if(threadIdx.x != 0) continue; - - if(pivotBegin - begin > 1) //left from pivot are smaller elems - { - stackArrBegin[stackTop] = begin; - stackArrEnd[stackTop] = pivotBegin; - stackTop++; - } - if(end - pivotEnd > 1) //right from pivot until end are elem greater than pivot + if(threadIdx.x == 0) { - stackArrBegin[stackTop] = pivotEnd; - stackArrEnd[stackTop] = end; - stackTop++; + if(pivotBegin - begin > 1) //left from pivot are smaller elems + { + stackArrBegin[stackTop] = begin; + stackArrEnd[stackTop] = pivotBegin; + stackTop++; + } + + if(end - pivotEnd > 1) //right from pivot until end are elem greater than pivot + { + stackArrBegin[stackTop] = pivotEnd; + stackArrEnd[stackTop] = end; + stackTop++; + } } + __syncthreads(); } } @@ -293,7 +294,7 @@ 
void cudaQuickSort(CudaArrayView arr, const Function & Cmp, //----------------------------------------------------------- //----------------------------------------------------------- const int threadsPerBlock = 512, maxBlocks = 1 << 15; //32k -const int maxTasks = maxBlocks/4; +const int maxTasks = 1<<10; const int minElemPerBlock = threadsPerBlock*2; class QUICKSORT @@ -304,8 +305,8 @@ class QUICKSORT CudaTaskArray cuda_tasks, cuda_newTasks, cuda_2ndPhaseTasks; TNL::Containers::Array cuda_newTasksAmount, cuda_2ndPhaseTasksAmount; //is in reality 1 integer - int tasksAmount; //counter for Host - int totalTask; + int tasksAmount; //counter for Host == cuda_newTasksAmount + int totalTask; // cuda_newTasksAmount + cuda_2ndPhaseTasksAmount TNL::Containers::Array cuda_blockToTaskMapping; TNL::Containers::Array cuda_blockToTaskMapping_Cnt; //is in reality 1 integer @@ -439,7 +440,7 @@ public: int blocks = totalTask; - int stackSize = 256, stackMem = stackSize * sizeof(int); + int stackSize = 128, stackMem = stackSize * sizeof(int); int bitonicMem = threadsPerBlock*2*sizeof(int); int auxMem = stackMem + bitonicMem; cudaQuickSort<<>>(arr, Cmp, aux.getView(), stackSize, cuda_2ndPhaseTasks.getView()); -- GitLab From 63df9fb4acfa67d137cde7839978a9925a6bf752 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Sat, 20 Mar 2021 18:48:31 +0100 Subject: [PATCH 112/258] dynamic call --- GPUSort/quicksort_dynamic/Makefile | 31 ++ GPUSort/quicksort_dynamic/benchmark/Makefile | 31 ++ .../quicksort_dynamic/benchmark/benchmark.cu | 92 +++++ GPUSort/quicksort_dynamic/config.mk | 49 +++ GPUSort/quicksort_dynamic/helper.cuh | 89 +++++ GPUSort/quicksort_dynamic/main.cu | 23 ++ GPUSort/quicksort_dynamic/quicksort.cu | 322 ++++++++++++++++++ GPUSort/quicksort_dynamic/quicksort.cuh | 10 + GPUSort/quicksort_dynamic/reduction.cuh | 81 +++++ GPUSort/quicksort_dynamic/task.h | 14 + GPUSort/quicksort_dynamic/unitTests/Makefile | 32 ++ .../quicksort_dynamic/unitTests/unitTests.cu | 77 +++++ 12 
files changed, 851 insertions(+) create mode 100644 GPUSort/quicksort_dynamic/Makefile create mode 100644 GPUSort/quicksort_dynamic/benchmark/Makefile create mode 100644 GPUSort/quicksort_dynamic/benchmark/benchmark.cu create mode 100644 GPUSort/quicksort_dynamic/config.mk create mode 100644 GPUSort/quicksort_dynamic/helper.cuh create mode 100644 GPUSort/quicksort_dynamic/main.cu create mode 100644 GPUSort/quicksort_dynamic/quicksort.cu create mode 100644 GPUSort/quicksort_dynamic/quicksort.cuh create mode 100644 GPUSort/quicksort_dynamic/reduction.cuh create mode 100644 GPUSort/quicksort_dynamic/task.h create mode 100644 GPUSort/quicksort_dynamic/unitTests/Makefile create mode 100644 GPUSort/quicksort_dynamic/unitTests/unitTests.cu diff --git a/GPUSort/quicksort_dynamic/Makefile b/GPUSort/quicksort_dynamic/Makefile new file mode 100644 index 000000000..5e4f268e3 --- /dev/null +++ b/GPUSort/quicksort_dynamic/Makefile @@ -0,0 +1,31 @@ +include config.mk + +TARGET := main +EXTRA_ARCH := -gencode arch=compute_52,code=sm_52 +DEVICE_CODE := -dc + +CUDA_LDLIBS += -lcudadevrt + +## targets definitions follow +.PHONY: all host cuda +all: cuda +cuda: $(TARGET) + +run: cuda + ./$(TARGET) + +.PHONY: clean +clean: + rm -f *.d *.o *.cuo $(TARGET) + +$(TARGET): quicksort.o quicksort_link.o $(TARGET).o + $(CXX) $(TNL_INCLUDE_DIRS) $(CUDA_LDFLAGS) -o $@ $^ $(CUDA_LDLIBS) + +$(TARGET).o: $(TARGET).cu + $(CUDA_CXX) $(CUDA_CXXFLAGS) -c -o $@ $< + +quicksort.o: quicksort.cu + $(CUDA_CXX) $(CUDA_CXXFLAGS) $(EXTRA_ARCH) $(DEVICE_CODE) -c -o $@ $< + +quicksort_link.o: quicksort.o + $(CUDA_CXX) $(CUDA_LDFLAGS) -dlink -o $@ $< $(CUDA_LDLIBS) diff --git a/GPUSort/quicksort_dynamic/benchmark/Makefile b/GPUSort/quicksort_dynamic/benchmark/Makefile new file mode 100644 index 000000000..fc9c1bcc8 --- /dev/null +++ b/GPUSort/quicksort_dynamic/benchmark/Makefile @@ -0,0 +1,31 @@ +include ../config.mk + +TARGET := benchmark +EXTRA_ARCH := -gencode arch=compute_52,code=sm_52 +DEVICE_CODE := -dc + 
+CUDA_LDLIBS += -lcudadevrt + +## targets definitions follow +.PHONY: all host cuda +all: cuda +cuda: $(TARGET) + +run: cuda + ./$(TARGET) + +.PHONY: clean +clean: + rm -f *.d *.o *.cuo $(TARGET) + +$(TARGET): quicksort.o quicksort_link.o $(TARGET).o + $(CXX) $(TNL_INCLUDE_DIRS) $(CUDA_LDFLAGS) -o $@ $^ $(CUDA_LDLIBS) + +$(TARGET).o: $(TARGET).cu + $(CUDA_CXX) $(CUDA_CXXFLAGS) -c -o $@ $< + +quicksort.o: ../quicksort.cu + $(CUDA_CXX) $(CUDA_CXXFLAGS) $(EXTRA_ARCH) $(DEVICE_CODE) -c -o $@ $< + +quicksort_link.o: quicksort.o + $(CUDA_CXX) $(CUDA_LDFLAGS) -dlink -o $@ $< $(CUDA_LDLIBS) diff --git a/GPUSort/quicksort_dynamic/benchmark/benchmark.cu b/GPUSort/quicksort_dynamic/benchmark/benchmark.cu new file mode 100644 index 000000000..764289b35 --- /dev/null +++ b/GPUSort/quicksort_dynamic/benchmark/benchmark.cu @@ -0,0 +1,92 @@ +#include +#include +#include + +#include + +#include "../quicksort.cuh" +#include "../../util/timer.h" + +using namespace TNL; +using namespace TNL::Containers; +using namespace std; + +typedef Devices::Cuda Device; + +int main() +{ + srand(8151); + for(int pow = 5; pow <= 23; pow++) + { + int size =(1<< pow); + + vector vec(size); + iota(vec.begin(), vec.end(), 0); + + Array arr; + vector resAcc; + + //sorted sequence + { + arr = vec; + auto view = arr.getView(); + + { + TIMER t([&](double res){resAcc.push_back(res);}); + quicksort(view); + } + } + + //almost sorted sequence + { + for(int i = 0; i < 3; i++) + { + int s = rand() % (size - 3); + std::swap(vec[s], vec[s + 1]); + } + + arr = vec; + auto view = arr.getView(); + + { + TIMER t([&](double res){resAcc.push_back(res);}); + quicksort(view); + } + } + + //decreasing sequence + { + for(size_t i = 0; i < size; i++) + vec[i] = -i; + + arr = vec; + auto view = arr.getView(); + + { + TIMER t([&](double res){resAcc.push_back(res);}); + quicksort(view); + } + } + + //random sequence + { + random_shuffle(vec.begin(), vec.end()); + + arr = vec; + auto view = arr.getView(); + + { + TIMER 
t([&](double res){resAcc.push_back(res);}); + quicksort(view); + } + } + + + cout << "2^" << pow << " = "; + cout << fixed; + cout << setprecision(3); + cout << (accumulate(resAcc.begin(), resAcc.end(), 0.0) / resAcc.size()) << " ms" << endl; + } + + return 0; +} \ No newline at end of file diff --git a/GPUSort/quicksort_dynamic/config.mk b/GPUSort/quicksort_dynamic/config.mk new file mode 100644 index 000000000..3715986f7 --- /dev/null +++ b/GPUSort/quicksort_dynamic/config.mk @@ -0,0 +1,49 @@ +# configure the include path(s) according to your TNL installation +TNL_INCLUDE_DIRS := -I ~/.local/include + +WITH_OPENMP := no +WITH_DEBUG := no + +# If TNL is installed on your system, the CUDA architecture can be detected +# automatically by tnl-cuda-arch. This is done if CUDA_ARCH is set to "auto". +# Otherwise, CUDA_ARCH has to be set manually to the desired CUDA architecture +# number, e.g. 60, 61, etc. +CUDA_ARCH := auto + +# compilers +CXX := g++ +CUDA_CXX := nvcc + +# host compiler flags +CXXFLAGS := -std=c++14 $(TNL_INCLUDE_DIRS) +ifeq ($(WITH_DEBUG),yes) + CXXFLAGS += -O0 -g +else + CXXFLAGS += -O3 -DNDEBUG +endif + +# CUDA compiler flags +CUDA_CXXFLAGS := -std=c++14 --expt-relaxed-constexpr --expt-extended-lambda $(TNL_INCLUDE_DIRS) +CUDA_CXXFLAGS += -DHAVE_CUDA +ifeq ($(CUDA_ARCH),auto) + CUDA_CXXFLAGS += $(shell tnl-cuda-arch) +else + CUDA_CXXFLAGS += -gencode arch=compute_$(CUDA_ARCH),code=sm_$(CUDA_ARCH) +endif + +# determine path to the CUDA toolkit installation +# (autodetection is attempted, set it manually if it fails) +CUDA_PATH ?= $(abspath $(dir $(shell command -v nvcc))/..) 
+#$(info Detected CUDA_PATH: $(CUDA_PATH)) + +# flags for linking CUDA with the host compiler +CUDA_LDFLAGS := -L $(CUDA_PATH)/lib64 +CUDA_LDLIBS := -lcudart -ldl -lrt + +# enable OpenMP +ifeq ($(WITH_OPENMP),yes) + CXXFLAGS += -fopenmp -DHAVE_OPENMP + LDLIBS += -lgomp + CUDA_CXXFLAGS += -Xcompiler -fopenmp -DHAVE_OPENMP + CUDA_LDLIBS += -lgomp +endif diff --git a/GPUSort/quicksort_dynamic/helper.cuh b/GPUSort/quicksort_dynamic/helper.cuh new file mode 100644 index 000000000..8f8172b44 --- /dev/null +++ b/GPUSort/quicksort_dynamic/helper.cuh @@ -0,0 +1,89 @@ +#pragma once + +#include + +template +__device__ void countElem(TNL::Containers::ArrayView src, + int myBegin, int myEnd, + int &smaller, int &bigger, + const Value &pivot) +{ + for (int i = myBegin + threadIdx.x; i < myEnd; i += blockDim.x) + { + int data = src[i]; + if (data < pivot) + smaller++; + else if (data > pivot) + bigger++; + } +} + +template +__device__ void copyData(TNL::Containers::ArrayView src, + int myBegin, int myEnd, + TNL::Containers::ArrayView dst, + int smallerStart, int biggerStart, + const Value &pivot) + +{ + for (int i = myBegin + threadIdx.x; i < myEnd; i += blockDim.x) + { + int data = src[i]; + if (data < pivot) + dst[smallerStart++] = data; + else if (data > pivot) + dst[biggerStart++] = data; + } +} + +__device__ void calcBlocksNeeded(int elemLeft, int elemRight, int &blocksLeft, int &blocksRight) +{ + int minElemPerBlock = blockDim.x*2; + blocksLeft = elemLeft / minElemPerBlock + (elemLeft% minElemPerBlock != 0); + blocksRight = elemRight / minElemPerBlock + (elemRight% minElemPerBlock != 0); + + + int totalSets = blocksLeft + blocksRight; + if(totalSets<= gridDim.x) + return; + + int multiplier = 1.*gridDim.x / totalSets + 1; + minElemPerBlock *= multiplier; + + blocksLeft = elemLeft / minElemPerBlock + (elemLeft% minElemPerBlock != 0); + blocksRight = elemRight / minElemPerBlock + (elemRight% minElemPerBlock != 0); + +} + +template +__device__ Value 
pickPivot(TNL::Containers::ArrayView src, const Function & Cmp) +{ + return src[0]; + //return src[src.getSize()-1]; + + /* + if(src.getSize() ==1) + return src[0]; + + Value a = src[0], b = src[src.getSize()/2], c = src[src.getSize() - 1]; + + if(Cmp(a, b)) // ..a..b.. + { + if(Cmp(b, c))// ..a..b..c + return b; + else if(Cmp(c, a))//..c..a..b.. + return a; + else //..a..c..b.. + return c; + } + else //..b..a.. + { + if(Cmp(a, c))//..b..a..c + return a; + else if(Cmp(c, b))//..c..b..a.. + return b; + else //..b..c..a.. + return c; + } + */ +} \ No newline at end of file diff --git a/GPUSort/quicksort_dynamic/main.cu b/GPUSort/quicksort_dynamic/main.cu new file mode 100644 index 000000000..6084d8665 --- /dev/null +++ b/GPUSort/quicksort_dynamic/main.cu @@ -0,0 +1,23 @@ +#include +#include "quicksort.cuh" +#include "../util/algorithm.h" + +#include +#include +#include +using namespace std; + +int main() +{ + vector vec(19); + iota(vec.begin(), vec.end(), 0); + random_shuffle(vec.begin(), vec.end()); + + TNL::Containers::Array arr(vec); + auto view = arr.getView(); + cout << view << endl; + quicksort(view); + cout << view << endl; + + return 0; +} \ No newline at end of file diff --git a/GPUSort/quicksort_dynamic/quicksort.cu b/GPUSort/quicksort_dynamic/quicksort.cu new file mode 100644 index 000000000..25e72c2f4 --- /dev/null +++ b/GPUSort/quicksort_dynamic/quicksort.cu @@ -0,0 +1,322 @@ +#include "quicksort.cuh" + +#include +#include "reduction.cuh" +#include "task.h" +#include "../bitonicGPU/bitonicSort.h" +#include "helper.cuh" +#include +#include +#include + +#define deb(x) std::cout << #x << " = " << x << std::endl; + +using CudaArrayView = TNL::Containers::ArrayView; +using CudaTaskArray = TNL::Containers::Array; + +template +__device__ bool cudaPartition(CudaArrayView src, CudaArrayView dst, TASK * task, const int & pivot, const Function & Cmp) +{ + static __shared__ int smallerStart, biggerStart; + static __shared__ bool writePivot; + + int elemPerBlock = 
ceil( ((double)src.getSize()) / gridDim.x); + int myBegin = blockIdx.x * elemPerBlock; + int myEnd = TNL::min(src.getSize(), myBegin + elemPerBlock); + + int smaller = 0, bigger = 0; + countElem(src, myBegin, myEnd, smaller, bigger, pivot); + + int smallerOffset = blockInclusivePrefixSum(smaller); + int biggerOffset = blockInclusivePrefixSum(bigger); + + if (threadIdx.x == blockDim.x - 1) //last thread in block has sum of all values + { + smallerStart = atomicAdd(&(task->begin), smallerOffset); + biggerStart = atomicAdd(&(task->end), -biggerOffset) - biggerOffset; + } + __syncthreads(); + + int destSmaller = smallerStart + smallerOffset - smaller; + int destBigger = biggerStart + biggerOffset - bigger; + copyData(src, myBegin, myEnd, dst, destSmaller, destBigger, pivot); + + if (threadIdx.x == 0) + writePivot = (atomicAdd(&(task->stillWorkingCnt), -1) == 1); + __syncthreads(); + + return writePivot; +} + +template +__device__ void multiBlockQuickSort(CudaArrayView arr, CudaArrayView aux, TASK * task, const Function & Cmp, int depth) +{ + static __shared__ int pivot; + + if(threadIdx.x == 0) + pivot = pickPivot(depth %2 == 0? 
arr: aux, Cmp); + __syncthreads(); + + bool isLast; + if(depth %2 == 0) + isLast = cudaPartition(arr, aux, task, pivot, Cmp); + else + isLast = cudaPartition(aux, arr, task, pivot, Cmp); + + if(!isLast) + return; + + int leftEnd = task->begin, rightBegin = task->end; + + for (int i = leftEnd + threadIdx.x; i < rightBegin; i += blockDim.x) + arr[i] = pivot; + + if(threadIdx.x != 0) + return; + + int blocksLeft = 1, blocksRight = 1; + calcBlocksNeeded(leftEnd - 0, arr.getSize() - rightBegin, blocksLeft, blocksRight); + + bool usedLeft = false; + + if(leftEnd > 0) + { + *task = TASK(0, leftEnd, blocksLeft); + usedLeft = true; + + cudaStream_t s; + cudaStreamCreateWithFlags(&s, cudaStreamNonBlocking); + cudaQuickSort<<>>( + arr.getView(0, leftEnd), + aux.getView(0, leftEnd), + task, + Cmp, depth+1); + cudaStreamDestroy(s); + } + + if((arr.getSize() - rightBegin)> 0) + { + TASK * newTaskRight = nullptr; + + if(usedLeft) + { + newTaskRight = (TASK * )malloc(sizeof(TASK)); + if(!newTaskRight) + { + printf("couldnt allocate memory for right task\n"); + return; + } + *newTaskRight = TASK(0, arr.getSize() - rightBegin, blocksRight); + } + else + { + usedLeft = true; + *task = TASK(0, arr.getSize() - rightBegin, blocksRight); + } + + cudaStream_t s; + cudaStreamCreateWithFlags(&s, cudaStreamNonBlocking); + cudaQuickSort<<>>( + arr.getView(rightBegin, arr.getSize()), + aux.getView(rightBegin, aux.getSize()), + newTaskRight? 
newTaskRight : task, + Cmp, depth+1); + cudaStreamDestroy(s); + } + + if(!usedLeft) + free(task); +} + +//------------------------------------------------------------------------- +template +__device__ void externSort(CudaArrayView src, CudaArrayView dst, const Function & Cmp) +{ + static __shared__ int sharedMem[externMemSize]; + bitonicSort_Block(src, dst, sharedMem, Cmp); +} + +template +__device__ void stackPush(int stackArrBegin[], int stackArrEnd[], + int stackDepth[], int & stackTop, + int begin, int pivotBegin, + int pivotEnd, int end, + int depth) +{ + int sizeL = pivotBegin - begin, sizeR = end - pivotEnd; + + if(sizeL > sizeR) + { + if(sizeL > 0) //left from pivot are smaller elems + { + stackArrBegin[stackTop] = begin; + stackArrEnd[stackTop] = pivotBegin; + stackDepth[stackTop] = depth + 1; + (stackTop)++; + } + + if(sizeR > 0) //right from pivot until end are elem greater than pivot + { + assert(stackTop < stackSize && "Local quicksort stack overflow."); + + stackArrBegin[stackTop] = pivotEnd; + stackArrEnd[stackTop] = end; + stackDepth[stackTop] = depth + 1; + (stackTop)++; + } + } + else + { + if(sizeR > 0) //right from pivot until end are elem greater than pivot + { + stackArrBegin[stackTop] = pivotEnd; + stackArrEnd[stackTop] = end; + stackDepth[stackTop] = depth + 1; + (stackTop)++; + } + + if(sizeL > 0) //left from pivot are smaller elems + { + assert(stackTop < stackSize && "Local quicksort stack overflow."); + + stackArrBegin[stackTop] = begin; + stackArrEnd[stackTop] = pivotBegin; + stackDepth[stackTop] = depth + 1; + (stackTop)++; + } + } +} + +template +__device__ void singleBlockQuickSort(CudaArrayView arr, CudaArrayView aux, const Function & Cmp, int _depth) +{ + static __shared__ int stackTop; + static __shared__ int stackArrBegin[stackSize], stackArrEnd[stackSize], stackDepth[stackSize]; + static __shared__ int begin, end, depth,pivotBegin, pivotEnd; + static __shared__ int pivot; + + if (threadIdx.x == 0) + { + stackTop = 0; + 
stackArrBegin[stackTop] = 0; + stackArrEnd[stackTop] = arr.getSize(); + stackDepth[stackTop] = _depth; + stackTop++; + } + __syncthreads(); + + while(stackTop > 0) + { + if (threadIdx.x == 0) + { + begin = stackArrBegin[stackTop-1]; + end = stackArrEnd[stackTop-1]; + depth = stackDepth[stackTop-1]; + stackTop--; + pivot = pickPivot(depth%2 == 0? + arr.getView(begin, end) : + aux.getView(begin, end), + Cmp + ); + } + __syncthreads(); + + int size = end - begin; + auto src = depth%2 == 0 ? arr.getView(begin, end) : aux.getView(begin, end); + auto dst = depth%2 == 0 ? aux.getView(begin, end) : arr.getView(begin, end); + + if(size <= blockDim.x*2) + { + externSort(src, arr.getView(begin, end), Cmp); + continue; + } + + int smaller = 0, bigger = 0; + countElem(src, 0, size, smaller, bigger, pivot); + + int smallerOffset = blockInclusivePrefixSum(smaller); + int biggerOffset = blockInclusivePrefixSum(bigger); + + if (threadIdx.x == blockDim.x - 1) + { + pivotBegin = smallerOffset; + pivotEnd = size - biggerOffset; + } + __syncthreads(); + + int destSmaller = 0 + smallerOffset - smaller; + int destBigger = pivotEnd + (biggerOffset - bigger); + + copyData(src, 0, size, dst, destSmaller, destBigger, pivot); + __syncthreads(); + + for (int i = pivotBegin + threadIdx.x; i < pivotEnd; i += blockDim.x) + src[i] = dst[i] = pivot; + + if(threadIdx.x == 0) + { + stackPush(stackArrBegin, stackArrEnd, stackDepth, stackTop, + begin, begin+ pivotBegin, + begin +pivotEnd, end, + depth); + } + __syncthreads(); + } //ends while loop +} + +template +__global__ void cudaQuickSort(CudaArrayView arr, CudaArrayView aux, TASK * task, const Function & Cmp, int depth) +{ + if(gridDim.x > 1) + { + multiBlockQuickSort(arr, aux, task, Cmp, depth); + } + else + { + if(threadIdx.x == 0) + free(task); + + singleBlockQuickSort(arr, aux, Cmp, depth); + } +} + +//----------------------------------------------------------- + +/** + * call this kernel using 1 thread only + * */ +template +__global__ void 
cudaQuickSortEntry(CudaArrayView arr, CudaArrayView aux, const Function & Cmp, int blocks, int threadsPerBlock) +{ + TASK * task = (TASK *)malloc(sizeof(TASK)); + *task = TASK(0, arr.getSize(), blocks); + if(!task) + { + printf("couldnt allocate memory for right task\n"); + return; + } + + //task is freed by the block that wrote pivot + cudaQuickSort<<>>(arr, aux, task, Cmp, 0); +} + +//----------------------------------------------------------- + +template +void quicksort(CudaArrayView arr, const Function & Cmp) +{ + TNL::Containers::Array aux(arr.getSize()); + + const int threadsPerBlock = 512, maxBlocks = 1 << 15; //32k + const int minElemPerBlock = threadsPerBlock*2; + int sets = arr.getSize() / minElemPerBlock + (arr.getSize() % minElemPerBlock != 0); + + int blocks = min(sets, maxBlocks); + cudaQuickSortEntry<<<1, 1>>>(arr, aux.getView(), Cmp, blocks, threadsPerBlock); + cudaDeviceSynchronize(); +} + +void quicksort(TNL::Containers::ArrayView arr) +{ + quicksort(arr, []__cuda_callable__(int a, int b){return a < b;}); +} diff --git a/GPUSort/quicksort_dynamic/quicksort.cuh b/GPUSort/quicksort_dynamic/quicksort.cuh new file mode 100644 index 000000000..037e8d339 --- /dev/null +++ b/GPUSort/quicksort_dynamic/quicksort.cuh @@ -0,0 +1,10 @@ +#pragma once + +#include +#include "task.h" + +using CudaArrayView = TNL::Containers::ArrayView; +template +void quicksort(CudaArrayView arr, const Function & Cmp); + +void quicksort(TNL::Containers::ArrayViewarr); \ No newline at end of file diff --git a/GPUSort/quicksort_dynamic/reduction.cuh b/GPUSort/quicksort_dynamic/reduction.cuh new file mode 100644 index 000000000..234871c93 --- /dev/null +++ b/GPUSort/quicksort_dynamic/reduction.cuh @@ -0,0 +1,81 @@ +#pragma once +/** + * https://developer.nvidia.com/blog/faster-parallel-reductions-kepler/ + * */ + +__device__ int warpReduceSum(int initVal) +{ + const unsigned int maskConstant = 0xffffffff; //not used + for (unsigned int mask = warpSize / 2; mask > 0; mask >>= 1) + 
initVal += __shfl_xor_sync(maskConstant, initVal, mask); + + return initVal; +} + +__device__ int blockReduceSum(int val) +{ + static __shared__ int shared[32]; + int lane = threadIdx.x & (warpSize - 1); + int wid = threadIdx.x / warpSize; + + val = warpReduceSum(val); + + if (lane == 0) + shared[wid] = val; + __syncthreads(); + + val = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0; + + if (wid == 0) + val = warpReduceSum(val); + + if(threadIdx.x == 0) + shared[0] = val; + __syncthreads(); + + return shared[0]; +} + + +template +__device__ int warpInclusivePrefixSum(int value) +{ + if(it*2 <= 32) + { + int i = it; + int n = __shfl_up_sync(0xffffffff, value, i); + int laneId = threadIdx.x & 0x1f; + if ((laneId & (warpSize - 1)) >= i) + value += n; + return warpInclusivePrefixSum= 32? 32 : it*2>(value); + + } + + return value; +} + +__device__ int warpInclusivePrefixSum(int value) +{ + return warpInclusivePrefixSum<1>(value); +} + +__device__ int blockInclusivePrefixSum(int value) +{ + static __shared__ int shared[32]; + int lane = threadIdx.x & (warpSize - 1); + int wid = threadIdx.x / warpSize; + + int tmp = warpInclusivePrefixSum(value); + + if (lane == warpSize-1) + shared[wid] = tmp; + __syncthreads(); + + int tmp2 = (threadIdx.x < blockDim.x / warpSize) ? 
shared[lane] : 0; + if (wid == 0) + shared[lane] = warpInclusivePrefixSum(tmp2) - tmp2; + __syncthreads(); + + tmp += shared[wid]; + return tmp; +} \ No newline at end of file diff --git a/GPUSort/quicksort_dynamic/task.h b/GPUSort/quicksort_dynamic/task.h new file mode 100644 index 000000000..632b0dff9 --- /dev/null +++ b/GPUSort/quicksort_dynamic/task.h @@ -0,0 +1,14 @@ +#pragma once + +struct TASK +{ + int begin, end; + int stillWorkingCnt; + + __cuda_callable__ + TASK(int _begin, int _end, int blocks) + : begin(_begin), end(_end), stillWorkingCnt(blocks){} + + __cuda_callable__ + TASK(){}; +}; \ No newline at end of file diff --git a/GPUSort/quicksort_dynamic/unitTests/Makefile b/GPUSort/quicksort_dynamic/unitTests/Makefile new file mode 100644 index 000000000..610e599c4 --- /dev/null +++ b/GPUSort/quicksort_dynamic/unitTests/Makefile @@ -0,0 +1,32 @@ +include ../config.mk + +TARGET := unitTests +GTEST := -lgtest -pthread +EXTRA_ARCH := -gencode arch=compute_52,code=sm_52 +DEVICE_CODE := -dc + +CUDA_LDLIBS += -lcudadevrt + +## targets definitions follow +.PHONY: all host cuda +all: cuda +cuda: $(TARGET) + +run: cuda + ./$(TARGET) + +.PHONY: clean +clean: + rm -f *.d *.o *.cuo $(TARGET) + +$(TARGET): quicksort.o quicksort_link.o $(TARGET).o + $(CXX) $(TNL_INCLUDE_DIRS) $(CUDA_LDFLAGS) -o $@ $^ $(CUDA_LDLIBS) $(GTEST) + +$(TARGET).o: $(TARGET).cu + $(CUDA_CXX) $(CUDA_CXXFLAGS) -c -o $@ $< + +quicksort.o: ../quicksort.cu + $(CUDA_CXX) $(CUDA_CXXFLAGS) $(EXTRA_ARCH) $(DEVICE_CODE) -c -o $@ $< + +quicksort_link.o: quicksort.o + $(CUDA_CXX) $(CUDA_LDFLAGS) -dlink -o $@ $< $(CUDA_LDLIBS) diff --git a/GPUSort/quicksort_dynamic/unitTests/unitTests.cu b/GPUSort/quicksort_dynamic/unitTests/unitTests.cu new file mode 100644 index 000000000..829b01fa0 --- /dev/null +++ b/GPUSort/quicksort_dynamic/unitTests/unitTests.cu @@ -0,0 +1,77 @@ +#include "gtest/gtest.h" +#include +#include +#include +#include + +#include +#include +#include "../quicksort.cuh" +#include 
"../../util/algorithm.h" + +//---------------------------------------------------------------------------------- + +TEST(selectedSize, size15) +{ + TNL::Containers::Array cudaArr{5, 9, 4, 8, 6, 1, 2, 3, 4, 8, 1, 6, 9, 4, 9}; + auto view = cudaArr.getView(); + ASSERT_EQ(15, view.getSize()) << "size not 15" << std::endl; + quicksort(view); + ASSERT_TRUE(is_sorted(view)) << "result " << view << std::endl; +} + +TEST(multiblock, 32768_decreasingNegative) +{ + std::vector arr(1<<15); + for (size_t i = 0; i < arr.size(); i++) + arr[i] = -i; + + TNL::Containers::Array cudaArr(arr); + auto view = cudaArr.getView(); + + quicksort(view); + ASSERT_TRUE(is_sorted(view)) << "result " << view << std::endl; +} + +TEST(randomGenerated, smallArray_randomVal) +{ + std::srand(2006); + for(int i = 0; i < 100; i++) + { + std::vector arr(std::rand()%(1<<10)); + for(auto & x : arr) + x = std::rand(); + + TNL::Containers::Array cudaArr(arr); + + auto view = cudaArr.getView(); + quicksort(view); + ASSERT_TRUE(is_sorted(view)); + } +} + + +TEST(randomGenerated, bigArray_randomVal) +{ + std::srand(304); + for(int i = 0; i < 50; i++) + { + int size = (1<<20) + (std::rand()% (1<<19)); + std::vector arr(size); + for(auto & x : arr) x = std::rand(); + TNL::Containers::Array cudaArr(arr); + + auto view = cudaArr.getView(); + quicksort(view); + ASSERT_TRUE(is_sorted(view)); + } +} + +//---------------------------------------------------------------------------------- + +int main(int argc, char **argv) +{ + testing::InitGoogleTest(&argc, argv); + + return RUN_ALL_TESTS(); +} \ No newline at end of file -- GitLab From 51d159cc9f18f03b134cff005accff307c2711ef Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Sat, 20 Mar 2021 19:24:55 +0100 Subject: [PATCH 113/258] refactor folders --- .../bitonic_benchmark}/Makefile | 2 +- .../bitonic_benchmark}/benchmark.cu | 4 +- .../quicksort_benchmark}/Makefile | 2 +- .../quicksort_benchmark}/benchmark.cu | 4 +- .../quicksort_dynamic_benchmark}/Makefile | 12 
+++-- .../quicksort_dynamic_benchmark}/benchmark.cu | 4 +- GPUSort/quicksort_dynamic/config.mk | 49 ------------------- .../bitonicSort}/bitonicSort.h | 0 .../bitonicSort/sample}/Makefile | 2 +- .../bitonicSort/sample}/main.cu | 6 +-- GPUSort/{ => src}/quicksort/quicksort.cuh | 2 +- GPUSort/{ => src}/quicksort/reduction.cuh | 0 .../quicksort/sample}/Makefile | 2 +- .../quicksort/sample}/main.cu | 3 +- GPUSort/{ => src}/quicksort/task.h | 0 .../{ => src}/quicksort_dynamic/helper.cuh | 0 .../{ => src}/quicksort_dynamic/quicksort.cu | 2 +- .../{ => src}/quicksort_dynamic/quicksort.cuh | 0 .../{ => src}/quicksort_dynamic/reduction.cuh | 0 .../quicksort_dynamic/sample}/Makefile | 8 +-- .../quicksort_dynamic/sample}/main.cu | 4 +- GPUSort/{ => src}/quicksort_dynamic/task.h | 0 GPUSort/{ => src}/util/algorithm.h | 0 GPUSort/{bitonicGPU => src/util}/config.mk | 0 .../{quicksort => src/util/configs}/config.mk | 0 GPUSort/{ => src}/util/timer.h | 0 .../bitonic_tests}/Makefile | 2 +- .../bitonic_tests}/unitTests.cu | 4 +- .../quicksort_dynamic_tests}/Makefile | 11 +++-- .../quicksort_dynamic_tests}/unitTests.cu | 4 +- .../quicksort_unitTests}/Makefile | 2 +- .../quicksort_unitTests}/unitTests.cu | 4 +- 32 files changed, 46 insertions(+), 87 deletions(-) rename GPUSort/{bitonicGPU/benchmark => benchmark/bitonic_benchmark}/Makefile (95%) rename GPUSort/{bitonicGPU/benchmark => benchmark/bitonic_benchmark}/benchmark.cu (95%) rename GPUSort/{quicksort/benchmark => benchmark/quicksort_benchmark}/Makefile (95%) rename GPUSort/{quicksort/benchmark => benchmark/quicksort_benchmark}/benchmark.cu (96%) rename GPUSort/{quicksort_dynamic/benchmark => benchmark/quicksort_dynamic_benchmark}/Makefile (76%) rename GPUSort/{quicksort_dynamic/benchmark => benchmark/quicksort_dynamic_benchmark}/benchmark.cu (95%) delete mode 100644 GPUSort/quicksort_dynamic/config.mk rename GPUSort/{bitonicGPU => src/bitonicSort}/bitonicSort.h (100%) rename GPUSort/{bitonicGPU => 
src/bitonicSort/sample}/Makefile (95%) rename GPUSort/{bitonicGPU => src/bitonicSort/sample}/main.cu (86%) rename GPUSort/{ => src}/quicksort/quicksort.cuh (99%) rename GPUSort/{ => src}/quicksort/reduction.cuh (100%) rename GPUSort/{quicksort => src/quicksort/sample}/Makefile (95%) rename GPUSort/{quicksort_dynamic => src/quicksort/sample}/main.cu (88%) rename GPUSort/{ => src}/quicksort/task.h (100%) rename GPUSort/{ => src}/quicksort_dynamic/helper.cuh (100%) rename GPUSort/{ => src}/quicksort_dynamic/quicksort.cu (99%) rename GPUSort/{ => src}/quicksort_dynamic/quicksort.cuh (100%) rename GPUSort/{ => src}/quicksort_dynamic/reduction.cuh (100%) rename GPUSort/{quicksort_dynamic => src/quicksort_dynamic/sample}/Makefile (82%) rename GPUSort/{quicksort => src/quicksort_dynamic/sample}/main.cu (87%) rename GPUSort/{ => src}/quicksort_dynamic/task.h (100%) rename GPUSort/{ => src}/util/algorithm.h (100%) rename GPUSort/{bitonicGPU => src/util}/config.mk (100%) rename GPUSort/{quicksort => src/util/configs}/config.mk (100%) rename GPUSort/{ => src}/util/timer.h (100%) rename GPUSort/{quicksort/unitTests => tests/bitonic_tests}/Makefile (95%) rename GPUSort/{bitonicGPU/unitTests => tests/bitonic_tests}/unitTests.cu (98%) rename GPUSort/{quicksort_dynamic/unitTests => tests/quicksort_dynamic_tests}/Makefile (78%) rename GPUSort/{quicksort/unitTests => tests/quicksort_dynamic_tests}/unitTests.cu (95%) rename GPUSort/{bitonicGPU/unitTests => tests/quicksort_unitTests}/Makefile (95%) rename GPUSort/{quicksort_dynamic/unitTests => tests/quicksort_unitTests}/unitTests.cu (95%) diff --git a/GPUSort/bitonicGPU/benchmark/Makefile b/GPUSort/benchmark/bitonic_benchmark/Makefile similarity index 95% rename from GPUSort/bitonicGPU/benchmark/Makefile rename to GPUSort/benchmark/bitonic_benchmark/Makefile index 9f523a7de..57736cce3 100644 --- a/GPUSort/bitonicGPU/benchmark/Makefile +++ b/GPUSort/benchmark/bitonic_benchmark/Makefile @@ -1,4 +1,4 @@ -include ../config.mk +include 
../../src/util/config.mk CUDA_SOURCES := $(wildcard *.cu) CUDA_TARGETS := $(CUDA_SOURCES:%.cu=%) diff --git a/GPUSort/bitonicGPU/benchmark/benchmark.cu b/GPUSort/benchmark/bitonic_benchmark/benchmark.cu similarity index 95% rename from GPUSort/bitonicGPU/benchmark/benchmark.cu rename to GPUSort/benchmark/bitonic_benchmark/benchmark.cu index b6c0f6204..0771bb798 100644 --- a/GPUSort/bitonicGPU/benchmark/benchmark.cu +++ b/GPUSort/benchmark/bitonic_benchmark/benchmark.cu @@ -4,8 +4,8 @@ #include -#include "../bitonicSort.h" -#include "../../util/timer.h" +#include "../../src/bitonicSort/bitonicSort.h" +#include "../../src/util/timer.h" using namespace TNL; using namespace TNL::Containers; diff --git a/GPUSort/quicksort/benchmark/Makefile b/GPUSort/benchmark/quicksort_benchmark/Makefile similarity index 95% rename from GPUSort/quicksort/benchmark/Makefile rename to GPUSort/benchmark/quicksort_benchmark/Makefile index 9f523a7de..57736cce3 100644 --- a/GPUSort/quicksort/benchmark/Makefile +++ b/GPUSort/benchmark/quicksort_benchmark/Makefile @@ -1,4 +1,4 @@ -include ../config.mk +include ../../src/util/config.mk CUDA_SOURCES := $(wildcard *.cu) CUDA_TARGETS := $(CUDA_SOURCES:%.cu=%) diff --git a/GPUSort/quicksort/benchmark/benchmark.cu b/GPUSort/benchmark/quicksort_benchmark/benchmark.cu similarity index 96% rename from GPUSort/quicksort/benchmark/benchmark.cu rename to GPUSort/benchmark/quicksort_benchmark/benchmark.cu index 764289b35..954b52979 100644 --- a/GPUSort/quicksort/benchmark/benchmark.cu +++ b/GPUSort/benchmark/quicksort_benchmark/benchmark.cu @@ -4,8 +4,8 @@ #include -#include "../quicksort.cuh" -#include "../../util/timer.h" +#include "../../src/quicksort/quicksort.cuh" +#include "../../src/util/timer.h" using namespace TNL; using namespace TNL::Containers; diff --git a/GPUSort/quicksort_dynamic/benchmark/Makefile b/GPUSort/benchmark/quicksort_dynamic_benchmark/Makefile similarity index 76% rename from GPUSort/quicksort_dynamic/benchmark/Makefile rename to 
GPUSort/benchmark/quicksort_dynamic_benchmark/Makefile index fc9c1bcc8..b58dc0a17 100644 --- a/GPUSort/quicksort_dynamic/benchmark/Makefile +++ b/GPUSort/benchmark/quicksort_dynamic_benchmark/Makefile @@ -1,4 +1,4 @@ -include ../config.mk +include ../../src/util/config.mk TARGET := benchmark EXTRA_ARCH := -gencode arch=compute_52,code=sm_52 @@ -6,11 +6,15 @@ DEVICE_CODE := -dc CUDA_LDLIBS += -lcudadevrt +SRC_FOLDER := ../../src/quicksort_dynamic + ## targets definitions follow -.PHONY: all host cuda +.PHONY: cuda all: cuda + cuda: $(TARGET) +.PHONY: cuda run: cuda ./$(TARGET) @@ -21,10 +25,10 @@ clean: $(TARGET): quicksort.o quicksort_link.o $(TARGET).o $(CXX) $(TNL_INCLUDE_DIRS) $(CUDA_LDFLAGS) -o $@ $^ $(CUDA_LDLIBS) -$(TARGET).o: $(TARGET).cu +$(TARGET).o: $(SRC_FOLDER)/$(TARGET).cu $(CUDA_CXX) $(CUDA_CXXFLAGS) -c -o $@ $< -quicksort.o: ../quicksort.cu +quicksort.o: $(SRC_FOLDER)/quicksort.cu $(CUDA_CXX) $(CUDA_CXXFLAGS) $(EXTRA_ARCH) $(DEVICE_CODE) -c -o $@ $< quicksort_link.o: quicksort.o diff --git a/GPUSort/quicksort_dynamic/benchmark/benchmark.cu b/GPUSort/benchmark/quicksort_dynamic_benchmark/benchmark.cu similarity index 95% rename from GPUSort/quicksort_dynamic/benchmark/benchmark.cu rename to GPUSort/benchmark/quicksort_dynamic_benchmark/benchmark.cu index 764289b35..389aba9a7 100644 --- a/GPUSort/quicksort_dynamic/benchmark/benchmark.cu +++ b/GPUSort/benchmark/quicksort_dynamic_benchmark/benchmark.cu @@ -4,8 +4,8 @@ #include -#include "../quicksort.cuh" -#include "../../util/timer.h" +#include "../../src/quicksort_dynamic/quicksort.cuh" +#include "../../src/util/timer.h" using namespace TNL; using namespace TNL::Containers; diff --git a/GPUSort/quicksort_dynamic/config.mk b/GPUSort/quicksort_dynamic/config.mk deleted file mode 100644 index 3715986f7..000000000 --- a/GPUSort/quicksort_dynamic/config.mk +++ /dev/null @@ -1,49 +0,0 @@ -# configure the include path(s) according to your TNL installation -TNL_INCLUDE_DIRS := -I ~/.local/include - 
-WITH_OPENMP := no -WITH_DEBUG := no - -# If TNL is installed on your system, the CUDA architecture can be detected -# automatically by tnl-cuda-arch. This is done if CUDA_ARCH is set to "auto". -# Otherwise, CUDA_ARCH has to be set manually to the desired CUDA architecture -# number, e.g. 60, 61, etc. -CUDA_ARCH := auto - -# compilers -CXX := g++ -CUDA_CXX := nvcc - -# host compiler flags -CXXFLAGS := -std=c++14 $(TNL_INCLUDE_DIRS) -ifeq ($(WITH_DEBUG),yes) - CXXFLAGS += -O0 -g -else - CXXFLAGS += -O3 -DNDEBUG -endif - -# CUDA compiler flags -CUDA_CXXFLAGS := -std=c++14 --expt-relaxed-constexpr --expt-extended-lambda $(TNL_INCLUDE_DIRS) -CUDA_CXXFLAGS += -DHAVE_CUDA -ifeq ($(CUDA_ARCH),auto) - CUDA_CXXFLAGS += $(shell tnl-cuda-arch) -else - CUDA_CXXFLAGS += -gencode arch=compute_$(CUDA_ARCH),code=sm_$(CUDA_ARCH) -endif - -# determine path to the CUDA toolkit installation -# (autodetection is attempted, set it manually if it fails) -CUDA_PATH ?= $(abspath $(dir $(shell command -v nvcc))/..) 
-#$(info Detected CUDA_PATH: $(CUDA_PATH)) - -# flags for linking CUDA with the host compiler -CUDA_LDFLAGS := -L $(CUDA_PATH)/lib64 -CUDA_LDLIBS := -lcudart -ldl -lrt - -# enable OpenMP -ifeq ($(WITH_OPENMP),yes) - CXXFLAGS += -fopenmp -DHAVE_OPENMP - LDLIBS += -lgomp - CUDA_CXXFLAGS += -Xcompiler -fopenmp -DHAVE_OPENMP - CUDA_LDLIBS += -lgomp -endif diff --git a/GPUSort/bitonicGPU/bitonicSort.h b/GPUSort/src/bitonicSort/bitonicSort.h similarity index 100% rename from GPUSort/bitonicGPU/bitonicSort.h rename to GPUSort/src/bitonicSort/bitonicSort.h diff --git a/GPUSort/bitonicGPU/Makefile b/GPUSort/src/bitonicSort/sample/Makefile similarity index 95% rename from GPUSort/bitonicGPU/Makefile rename to GPUSort/src/bitonicSort/sample/Makefile index 6a7032a04..23593937b 100644 --- a/GPUSort/bitonicGPU/Makefile +++ b/GPUSort/src/bitonicSort/sample/Makefile @@ -1,4 +1,4 @@ -include config.mk +include ../../util/config.mk CUDA_SOURCES := $(wildcard *.cu) CUDA_TARGETS := $(CUDA_SOURCES:%.cu=%) diff --git a/GPUSort/bitonicGPU/main.cu b/GPUSort/src/bitonicSort/sample/main.cu similarity index 86% rename from GPUSort/bitonicGPU/main.cu rename to GPUSort/src/bitonicSort/sample/main.cu index 30903f97c..21ad72b28 100644 --- a/GPUSort/bitonicGPU/main.cu +++ b/GPUSort/src/bitonicSort/sample/main.cu @@ -1,7 +1,7 @@ #include #include -#include "bitonicSort.h" +#include "../bitonicSort.h" //-------------------------------------------------- std::ostream& operator<< (std::ostream&out, std::vector &arr) { @@ -31,10 +31,10 @@ int main( int argc, char* argv[] ) auto view = Arr.getView(); - //std::cout << "unsorted: " << view << std::endl; + std::cout << "unsorted: " << view << std::endl; bitonicSort(a); - //std::cout << "sorted: " << view << std::endl; + std::cout << "sorted: " << view << std::endl; return 0; } \ No newline at end of file diff --git a/GPUSort/quicksort/quicksort.cuh b/GPUSort/src/quicksort/quicksort.cuh similarity index 99% rename from GPUSort/quicksort/quicksort.cuh 
rename to GPUSort/src/quicksort/quicksort.cuh index 674caca44..c06e5ee27 100644 --- a/GPUSort/quicksort/quicksort.cuh +++ b/GPUSort/src/quicksort/quicksort.cuh @@ -3,7 +3,7 @@ #include #include "reduction.cuh" #include "task.h" -#include "../bitonicGPU/bitonicSort.h" +#include "../bitonicSort/bitonicSort.h" #include #define deb(x) std::cout << #x << " = " << x << std::endl; diff --git a/GPUSort/quicksort/reduction.cuh b/GPUSort/src/quicksort/reduction.cuh similarity index 100% rename from GPUSort/quicksort/reduction.cuh rename to GPUSort/src/quicksort/reduction.cuh diff --git a/GPUSort/quicksort/Makefile b/GPUSort/src/quicksort/sample/Makefile similarity index 95% rename from GPUSort/quicksort/Makefile rename to GPUSort/src/quicksort/sample/Makefile index d301117ff..474eb7141 100644 --- a/GPUSort/quicksort/Makefile +++ b/GPUSort/src/quicksort/sample/Makefile @@ -1,4 +1,4 @@ -include config.mk +include ../../util/config.mk CUDA_SOURCES := $(wildcard *.cu) CUDA_TARGETS := $(CUDA_SOURCES:%.cu=%) diff --git a/GPUSort/quicksort_dynamic/main.cu b/GPUSort/src/quicksort/sample/main.cu similarity index 88% rename from GPUSort/quicksort_dynamic/main.cu rename to GPUSort/src/quicksort/sample/main.cu index 6084d8665..38f1f2404 100644 --- a/GPUSort/quicksort_dynamic/main.cu +++ b/GPUSort/src/quicksort/sample/main.cu @@ -1,6 +1,5 @@ #include -#include "quicksort.cuh" -#include "../util/algorithm.h" +#include "../quicksort.cuh" #include #include diff --git a/GPUSort/quicksort/task.h b/GPUSort/src/quicksort/task.h similarity index 100% rename from GPUSort/quicksort/task.h rename to GPUSort/src/quicksort/task.h diff --git a/GPUSort/quicksort_dynamic/helper.cuh b/GPUSort/src/quicksort_dynamic/helper.cuh similarity index 100% rename from GPUSort/quicksort_dynamic/helper.cuh rename to GPUSort/src/quicksort_dynamic/helper.cuh diff --git a/GPUSort/quicksort_dynamic/quicksort.cu b/GPUSort/src/quicksort_dynamic/quicksort.cu similarity index 99% rename from 
GPUSort/quicksort_dynamic/quicksort.cu rename to GPUSort/src/quicksort_dynamic/quicksort.cu index 25e72c2f4..b56aca624 100644 --- a/GPUSort/quicksort_dynamic/quicksort.cu +++ b/GPUSort/src/quicksort_dynamic/quicksort.cu @@ -3,7 +3,7 @@ #include #include "reduction.cuh" #include "task.h" -#include "../bitonicGPU/bitonicSort.h" +#include "../bitonicSort/bitonicSort.h" #include "helper.cuh" #include #include diff --git a/GPUSort/quicksort_dynamic/quicksort.cuh b/GPUSort/src/quicksort_dynamic/quicksort.cuh similarity index 100% rename from GPUSort/quicksort_dynamic/quicksort.cuh rename to GPUSort/src/quicksort_dynamic/quicksort.cuh diff --git a/GPUSort/quicksort_dynamic/reduction.cuh b/GPUSort/src/quicksort_dynamic/reduction.cuh similarity index 100% rename from GPUSort/quicksort_dynamic/reduction.cuh rename to GPUSort/src/quicksort_dynamic/reduction.cuh diff --git a/GPUSort/quicksort_dynamic/Makefile b/GPUSort/src/quicksort_dynamic/sample/Makefile similarity index 82% rename from GPUSort/quicksort_dynamic/Makefile rename to GPUSort/src/quicksort_dynamic/sample/Makefile index 5e4f268e3..922a35af6 100644 --- a/GPUSort/quicksort_dynamic/Makefile +++ b/GPUSort/src/quicksort_dynamic/sample/Makefile @@ -1,4 +1,4 @@ -include config.mk +include ../../util/config.mk TARGET := main EXTRA_ARCH := -gencode arch=compute_52,code=sm_52 @@ -6,6 +6,8 @@ DEVICE_CODE := -dc CUDA_LDLIBS += -lcudadevrt +SRC_FOLDER := .. 
+ ## targets definitions follow .PHONY: all host cuda all: cuda @@ -21,10 +23,10 @@ clean: $(TARGET): quicksort.o quicksort_link.o $(TARGET).o $(CXX) $(TNL_INCLUDE_DIRS) $(CUDA_LDFLAGS) -o $@ $^ $(CUDA_LDLIBS) -$(TARGET).o: $(TARGET).cu +$(TARGET).o: $(SRC_FOLDER)/$(TARGET).cu $(CUDA_CXX) $(CUDA_CXXFLAGS) -c -o $@ $< -quicksort.o: quicksort.cu +quicksort.o: $(SRC_FOLDER)/quicksort.cu $(CUDA_CXX) $(CUDA_CXXFLAGS) $(EXTRA_ARCH) $(DEVICE_CODE) -c -o $@ $< quicksort_link.o: quicksort.o diff --git a/GPUSort/quicksort/main.cu b/GPUSort/src/quicksort_dynamic/sample/main.cu similarity index 87% rename from GPUSort/quicksort/main.cu rename to GPUSort/src/quicksort_dynamic/sample/main.cu index 6084d8665..9ba19cb38 100644 --- a/GPUSort/quicksort/main.cu +++ b/GPUSort/src/quicksort_dynamic/sample/main.cu @@ -1,6 +1,6 @@ #include -#include "quicksort.cuh" -#include "../util/algorithm.h" +#include "../quicksort.cuh" +#include "../../util/algorithm.h" #include #include diff --git a/GPUSort/quicksort_dynamic/task.h b/GPUSort/src/quicksort_dynamic/task.h similarity index 100% rename from GPUSort/quicksort_dynamic/task.h rename to GPUSort/src/quicksort_dynamic/task.h diff --git a/GPUSort/util/algorithm.h b/GPUSort/src/util/algorithm.h similarity index 100% rename from GPUSort/util/algorithm.h rename to GPUSort/src/util/algorithm.h diff --git a/GPUSort/bitonicGPU/config.mk b/GPUSort/src/util/config.mk similarity index 100% rename from GPUSort/bitonicGPU/config.mk rename to GPUSort/src/util/config.mk diff --git a/GPUSort/quicksort/config.mk b/GPUSort/src/util/configs/config.mk similarity index 100% rename from GPUSort/quicksort/config.mk rename to GPUSort/src/util/configs/config.mk diff --git a/GPUSort/util/timer.h b/GPUSort/src/util/timer.h similarity index 100% rename from GPUSort/util/timer.h rename to GPUSort/src/util/timer.h diff --git a/GPUSort/quicksort/unitTests/Makefile b/GPUSort/tests/bitonic_tests/Makefile similarity index 95% rename from 
GPUSort/quicksort/unitTests/Makefile rename to GPUSort/tests/bitonic_tests/Makefile index 4cf4ea6f0..a5dbfa2ab 100644 --- a/GPUSort/quicksort/unitTests/Makefile +++ b/GPUSort/tests/bitonic_tests/Makefile @@ -1,4 +1,4 @@ -include ../config.mk +include ../../src/util/config.mk CUDA_SOURCES := $(wildcard *.cu) CUDA_TARGETS := $(CUDA_SOURCES:%.cu=%) diff --git a/GPUSort/bitonicGPU/unitTests/unitTests.cu b/GPUSort/tests/bitonic_tests/unitTests.cu similarity index 98% rename from GPUSort/bitonicGPU/unitTests/unitTests.cu rename to GPUSort/tests/bitonic_tests/unitTests.cu index 5911e9e92..4649c3fef 100644 --- a/GPUSort/bitonicGPU/unitTests/unitTests.cu +++ b/GPUSort/tests/bitonic_tests/unitTests.cu @@ -7,8 +7,8 @@ #include #include -#include "../bitonicSort.h" -#include "../../util/algorithm.h" +#include "../../src/bitonicSort/bitonicSort.h" +#include "../../src/util/algorithm.h" //---------------------------------------------------------------------------------- diff --git a/GPUSort/quicksort_dynamic/unitTests/Makefile b/GPUSort/tests/quicksort_dynamic_tests/Makefile similarity index 78% rename from GPUSort/quicksort_dynamic/unitTests/Makefile rename to GPUSort/tests/quicksort_dynamic_tests/Makefile index 610e599c4..44e3041ba 100644 --- a/GPUSort/quicksort_dynamic/unitTests/Makefile +++ b/GPUSort/tests/quicksort_dynamic_tests/Makefile @@ -1,4 +1,4 @@ -include ../config.mk +include ../../src/util/config.mk TARGET := unitTests GTEST := -lgtest -pthread @@ -7,9 +7,12 @@ DEVICE_CODE := -dc CUDA_LDLIBS += -lcudadevrt +SRC_FOLDER := ../../src/quicksort_dynamic + ## targets definitions follow -.PHONY: all host cuda +.PHONY: cuda all: cuda + cuda: $(TARGET) run: cuda @@ -22,10 +25,10 @@ clean: $(TARGET): quicksort.o quicksort_link.o $(TARGET).o $(CXX) $(TNL_INCLUDE_DIRS) $(CUDA_LDFLAGS) -o $@ $^ $(CUDA_LDLIBS) $(GTEST) -$(TARGET).o: $(TARGET).cu +$(TARGET).o: $(SRC_FOLDER)/$(TARGET).cu $(CUDA_CXX) $(CUDA_CXXFLAGS) -c -o $@ $< -quicksort.o: ../quicksort.cu +quicksort.o: 
$(SRC_FOLDER)/quicksort.cu $(CUDA_CXX) $(CUDA_CXXFLAGS) $(EXTRA_ARCH) $(DEVICE_CODE) -c -o $@ $< quicksort_link.o: quicksort.o diff --git a/GPUSort/quicksort/unitTests/unitTests.cu b/GPUSort/tests/quicksort_dynamic_tests/unitTests.cu similarity index 95% rename from GPUSort/quicksort/unitTests/unitTests.cu rename to GPUSort/tests/quicksort_dynamic_tests/unitTests.cu index 829b01fa0..9c59a031a 100644 --- a/GPUSort/quicksort/unitTests/unitTests.cu +++ b/GPUSort/tests/quicksort_dynamic_tests/unitTests.cu @@ -6,8 +6,8 @@ #include #include -#include "../quicksort.cuh" -#include "../../util/algorithm.h" +#include "../../src/quicksort_dynamic/quicksort.cuh" +#include "../../src/util/algorithm.h" //---------------------------------------------------------------------------------- diff --git a/GPUSort/bitonicGPU/unitTests/Makefile b/GPUSort/tests/quicksort_unitTests/Makefile similarity index 95% rename from GPUSort/bitonicGPU/unitTests/Makefile rename to GPUSort/tests/quicksort_unitTests/Makefile index 4cf4ea6f0..a5dbfa2ab 100644 --- a/GPUSort/bitonicGPU/unitTests/Makefile +++ b/GPUSort/tests/quicksort_unitTests/Makefile @@ -1,4 +1,4 @@ -include ../config.mk +include ../../src/util/config.mk CUDA_SOURCES := $(wildcard *.cu) CUDA_TARGETS := $(CUDA_SOURCES:%.cu=%) diff --git a/GPUSort/quicksort_dynamic/unitTests/unitTests.cu b/GPUSort/tests/quicksort_unitTests/unitTests.cu similarity index 95% rename from GPUSort/quicksort_dynamic/unitTests/unitTests.cu rename to GPUSort/tests/quicksort_unitTests/unitTests.cu index 829b01fa0..6397e3359 100644 --- a/GPUSort/quicksort_dynamic/unitTests/unitTests.cu +++ b/GPUSort/tests/quicksort_unitTests/unitTests.cu @@ -6,8 +6,8 @@ #include #include -#include "../quicksort.cuh" -#include "../../util/algorithm.h" +#include "../../src/quicksort/quicksort.cuh" +#include "../../src/util/algorithm.h" //---------------------------------------------------------------------------------- -- GitLab From 74bd790f012e6bcbf02c5ae690798f742727a55b Mon Sep 
17 00:00:00 2001 From: Xuan Thang Nguyen Date: Sat, 20 Mar 2021 19:27:47 +0100 Subject: [PATCH 114/258] dst array for block bitonicsort --- GPUSort/src/bitonicSort/bitonicSort.h | 30 ++++++++++++++------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/GPUSort/src/bitonicSort/bitonicSort.h b/GPUSort/src/bitonicSort/bitonicSort.h index f5a70c0ac..b62b3f3b0 100644 --- a/GPUSort/src/bitonicSort/bitonicSort.h +++ b/GPUSort/src/bitonicSort/bitonicSort.h @@ -124,17 +124,19 @@ void bitonicMergeSharedMemory(TNL::Containers::ArrayView __device__ -void bitonicSort_Block(TNL::Containers::ArrayView arr, Value* sharedMem, const Function & Cmp) +void bitonicSort_Block(TNL::Containers::ArrayView src, + TNL::Containers::ArrayView dst, + Value* sharedMem, const Function & Cmp) { //copy from globalMem into sharedMem int copy1 = threadIdx.x; int copy2 = copy1 + blockDim.x; { - if(copy1 < arr.getSize()) - sharedMem[copy1] = arr[copy1]; + if(copy1 < src.getSize()) + sharedMem[copy1] = src[copy1]; - if(copy2 < arr.getSize()) - sharedMem[copy2] = arr[copy2]; + if(copy2 < src.getSize()) + sharedMem[copy2] = src[copy2]; __syncthreads(); } @@ -143,14 +145,14 @@ void bitonicSort_Block(TNL::Containers::ArrayView arr //bitonic activity { int i = threadIdx.x; - int paddedSize = closestPow2(arr.getSize()); + int paddedSize = closestPow2(src.getSize()); for (int monotonicSeqLen = 2; monotonicSeqLen <= paddedSize; monotonicSeqLen *= 2) { //calculate the direction of swapping int monotonicSeqIdx = i / (monotonicSeqLen/2); bool ascending = (monotonicSeqIdx & 1) != 0; - if ((monotonicSeqIdx + 1) * monotonicSeqLen >= arr.getSize()) //special case for parts with no "partner" + if ((monotonicSeqIdx + 1) * monotonicSeqLen >= src.getSize()) //special case for parts with no "partner" ascending = true; for (int len = monotonicSeqLen; len > 1; len /= 2) @@ -160,7 +162,7 @@ void bitonicSort_Block(TNL::Containers::ArrayView arr int s = part * len + (i & ((len / 2) - 1)); int e = s + len / 
2; - if(e < arr.getSize()) //not touching virtual padding + if(e < src.getSize()) //not touching virtual padding cmpSwap(sharedMem[s], sharedMem[e], ascending, Cmp); __syncthreads(); } @@ -170,10 +172,10 @@ void bitonicSort_Block(TNL::Containers::ArrayView arr //------------------------------------------ //writeback to global memory { - if(copy1 < arr.getSize()) - arr[copy1] = sharedMem[copy1]; - if(copy2 < arr.getSize()) - arr[copy2] = sharedMem[copy2]; + if(copy1 < src.getSize()) + dst[copy1] = sharedMem[copy1]; + if(copy2 < src.getSize()) + dst[copy2] = sharedMem[copy2]; } } /** @@ -191,9 +193,9 @@ __global__ void bitoniSort1stStepSharedMemory(TNL::Containers::ArrayView Date: Sat, 20 Mar 2021 19:30:32 +0100 Subject: [PATCH 115/258] makefile for dynamic compilation --- GPUSort/benchmark/quicksort_dynamic_benchmark/Makefile | 2 +- GPUSort/src/quicksort_dynamic/sample/Makefile | 2 +- GPUSort/tests/quicksort_dynamic_tests/Makefile | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/GPUSort/benchmark/quicksort_dynamic_benchmark/Makefile b/GPUSort/benchmark/quicksort_dynamic_benchmark/Makefile index b58dc0a17..daa599112 100644 --- a/GPUSort/benchmark/quicksort_dynamic_benchmark/Makefile +++ b/GPUSort/benchmark/quicksort_dynamic_benchmark/Makefile @@ -25,7 +25,7 @@ clean: $(TARGET): quicksort.o quicksort_link.o $(TARGET).o $(CXX) $(TNL_INCLUDE_DIRS) $(CUDA_LDFLAGS) -o $@ $^ $(CUDA_LDLIBS) -$(TARGET).o: $(SRC_FOLDER)/$(TARGET).cu +$(TARGET).o: $(TARGET).cu $(CUDA_CXX) $(CUDA_CXXFLAGS) -c -o $@ $< quicksort.o: $(SRC_FOLDER)/quicksort.cu diff --git a/GPUSort/src/quicksort_dynamic/sample/Makefile b/GPUSort/src/quicksort_dynamic/sample/Makefile index 922a35af6..62d89d388 100644 --- a/GPUSort/src/quicksort_dynamic/sample/Makefile +++ b/GPUSort/src/quicksort_dynamic/sample/Makefile @@ -23,7 +23,7 @@ clean: $(TARGET): quicksort.o quicksort_link.o $(TARGET).o $(CXX) $(TNL_INCLUDE_DIRS) $(CUDA_LDFLAGS) -o $@ $^ $(CUDA_LDLIBS) -$(TARGET).o: 
$(SRC_FOLDER)/$(TARGET).cu +$(TARGET).o: $(TARGET).cu $(CUDA_CXX) $(CUDA_CXXFLAGS) -c -o $@ $< quicksort.o: $(SRC_FOLDER)/quicksort.cu diff --git a/GPUSort/tests/quicksort_dynamic_tests/Makefile b/GPUSort/tests/quicksort_dynamic_tests/Makefile index 44e3041ba..62fd87b3f 100644 --- a/GPUSort/tests/quicksort_dynamic_tests/Makefile +++ b/GPUSort/tests/quicksort_dynamic_tests/Makefile @@ -25,7 +25,7 @@ clean: $(TARGET): quicksort.o quicksort_link.o $(TARGET).o $(CXX) $(TNL_INCLUDE_DIRS) $(CUDA_LDFLAGS) -o $@ $^ $(CUDA_LDLIBS) $(GTEST) -$(TARGET).o: $(SRC_FOLDER)/$(TARGET).cu +$(TARGET).o: $(TARGET).cu $(CUDA_CXX) $(CUDA_CXXFLAGS) -c -o $@ $< quicksort.o: $(SRC_FOLDER)/quicksort.cu -- GitLab From 7c4029a320a46952a3c9ba8898ec564021fdfb4b Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Sat, 20 Mar 2021 19:32:43 +0100 Subject: [PATCH 116/258] cleanup obsolete folder --- GPUSort/src/util/configs/config.mk | 49 ------------------------------ 1 file changed, 49 deletions(-) delete mode 100644 GPUSort/src/util/configs/config.mk diff --git a/GPUSort/src/util/configs/config.mk b/GPUSort/src/util/configs/config.mk deleted file mode 100644 index 3715986f7..000000000 --- a/GPUSort/src/util/configs/config.mk +++ /dev/null @@ -1,49 +0,0 @@ -# configure the include path(s) according to your TNL installation -TNL_INCLUDE_DIRS := -I ~/.local/include - -WITH_OPENMP := no -WITH_DEBUG := no - -# If TNL is installed on your system, the CUDA architecture can be detected -# automatically by tnl-cuda-arch. This is done if CUDA_ARCH is set to "auto". -# Otherwise, CUDA_ARCH has to be set manually to the desired CUDA architecture -# number, e.g. 60, 61, etc. 
-CUDA_ARCH := auto - -# compilers -CXX := g++ -CUDA_CXX := nvcc - -# host compiler flags -CXXFLAGS := -std=c++14 $(TNL_INCLUDE_DIRS) -ifeq ($(WITH_DEBUG),yes) - CXXFLAGS += -O0 -g -else - CXXFLAGS += -O3 -DNDEBUG -endif - -# CUDA compiler flags -CUDA_CXXFLAGS := -std=c++14 --expt-relaxed-constexpr --expt-extended-lambda $(TNL_INCLUDE_DIRS) -CUDA_CXXFLAGS += -DHAVE_CUDA -ifeq ($(CUDA_ARCH),auto) - CUDA_CXXFLAGS += $(shell tnl-cuda-arch) -else - CUDA_CXXFLAGS += -gencode arch=compute_$(CUDA_ARCH),code=sm_$(CUDA_ARCH) -endif - -# determine path to the CUDA toolkit installation -# (autodetection is attempted, set it manually if it fails) -CUDA_PATH ?= $(abspath $(dir $(shell command -v nvcc))/..) -#$(info Detected CUDA_PATH: $(CUDA_PATH)) - -# flags for linking CUDA with the host compiler -CUDA_LDFLAGS := -L $(CUDA_PATH)/lib64 -CUDA_LDLIBS := -lcudart -ldl -lrt - -# enable OpenMP -ifeq ($(WITH_OPENMP),yes) - CXXFLAGS += -fopenmp -DHAVE_OPENMP - LDLIBS += -lgomp - CUDA_CXXFLAGS += -Xcompiler -fopenmp -DHAVE_OPENMP - CUDA_LDLIBS += -lgomp -endif -- GitLab From 87d3c10dcc3057fbd235c1355fa967903eadc7b2 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Sat, 20 Mar 2021 19:33:13 +0100 Subject: [PATCH 117/258] opt out of OPENMP --- GPUSort/src/util/config.mk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/GPUSort/src/util/config.mk b/GPUSort/src/util/config.mk index 5b2f0d7f6..3715986f7 100644 --- a/GPUSort/src/util/config.mk +++ b/GPUSort/src/util/config.mk @@ -1,7 +1,7 @@ # configure the include path(s) according to your TNL installation TNL_INCLUDE_DIRS := -I ~/.local/include -WITH_OPENMP := yes +WITH_OPENMP := no WITH_DEBUG := no # If TNL is installed on your system, the CUDA architecture can be detected -- GitLab From d8148877a3815035d8edb354b2177735c3e0591e Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Sat, 20 Mar 2021 20:18:10 +0100 Subject: [PATCH 118/258] benchmark change --- .../benchmark/bitonic_benchmark/benchmark.cu | 138 
++++++++++-------- 1 file changed, 75 insertions(+), 63 deletions(-) diff --git a/GPUSort/benchmark/bitonic_benchmark/benchmark.cu b/GPUSort/benchmark/bitonic_benchmark/benchmark.cu index 0771bb798..bb9b9e74b 100644 --- a/GPUSort/benchmark/bitonic_benchmark/benchmark.cu +++ b/GPUSort/benchmark/bitonic_benchmark/benchmark.cu @@ -3,90 +3,102 @@ #include #include +#include "../../src/util/timer.h" +//--------------------------- #include "../../src/bitonicSort/bitonicSort.h" -#include "../../src/util/timer.h" +#define SORTERFUNCTION bitonicSort +//--------------------------- using namespace TNL; using namespace TNL::Containers; using namespace std; -typedef Devices::Cuda Device; +const int lowPow = 15, highLow = 22; +const int tries = 50; -int main() +double measure(const vector&vec) { - srand(2021); - for(int pow = 10; pow <= 20; pow++) + Array arr(vec.size()); + vector resAcc; + + for(int i = 0; i < tries; i++) { - int size =(1<< pow); + arr = vec; + auto view = arr.getView(); - vector vec(size); - iota(vec.begin(), vec.end(), 0); + { + TIMER t([&](double res){resAcc.push_back(res);}); + SORTERFUNCTION(view); + } + } - Array arr; - vector resAcc; + return accumulate(resAcc.begin(), resAcc.end(), 0.0) / resAcc.size(); +} - //sorted sequence - { - arr = vec; - auto view = arr.getView(); +double sorted(int size) +{ + vector vec(size); + iota(vec.begin(), vec.end(), 0); + + return measure(vec); +} - { - TIMER t([&](double res){resAcc.push_back(res);}); - bitonicSort(view); - } - } +double random(int size) +{ + srand(size); - //almost sorted sequence - { - for(int i = 0; i < 3; i++) - { - int s = rand() % (size - 3); - std::swap(vec[s], vec[s + 1]); - } - - arr = vec; - auto view = arr.getView(); - - { - TIMER t([&](double res){resAcc.push_back(res);}); - bitonicSort(view); - } - } + vector vec(size); + iota(vec.begin(), vec.end(), 0); + random_shuffle(vec.begin(), vec.end()); - //decreasing sequence - { - for(size_t i = 0; i < size; i++) - vec[i] = -i; - - arr = vec; - 
auto view = arr.getView(); + return measure(vec); +} - { - TIMER t([&](double res){resAcc.push_back(res);}); - bitonicSort(view); - } - } - - //random sequence - { - random_shuffle(vec.begin(), vec.end()); +double almostSorted(int size) +{ + vector vec(size); + iota(vec.begin(), vec.end(), 0); + for(int i = 0; i < 3; i++) //swaps 3 times in array + { + int s = rand() % (size - 3); + std::swap(vec[s], vec[s + 1]); + } - arr = vec; - auto view = arr.getView(); + return measure(vec); +} - { - TIMER t([&](double res){resAcc.push_back(res);}); - bitonicSort(view); - } - } +double decreasing(int size) +{ + vector vec(size); + for(size_t i = 0; i < size; i++) + vec[i] = -i; + + return measure(vec); +} +int main() +{ + string delim = "\t"; + cout << "size" << delim; + cout << "random" << delim; + cout << "sorted" << delim; + cout << "almost" << delim; + cout << "decreasing" << delim; + cout << endl; + + for(int pow = lowPow; pow <= highLow; pow++) + { + int size =(1<< pow); + vector vec(size); - cout << "2^" << pow << " = "; - cout << fixed; - cout << setprecision(3); - cout << (accumulate(resAcc.begin(), resAcc.end(), 0.0) / resAcc.size()) << " ms" << endl; + cout << "2^" << pow << delim; + cout << fixed << setprecision(3); + cout << random(size) << delim; + cout << sorted(size) << delim; + cout << almostSorted(size) << delim; + cout << decreasing(size) << delim; + cout << endl; } - return 0; } \ No newline at end of file -- GitLab From 453a39ee7856b6ef37b4a7391e4151fd8a268f5c Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Sat, 20 Mar 2021 20:33:43 +0100 Subject: [PATCH 119/258] benchmarker --- GPUSort/benchmark/benchmarker.cpp | 106 ++++++++++++++++++ .../benchmark/bitonic_benchmark/benchmark.cu | 102 +---------------- .../quicksort_benchmark/benchmark.cu | 94 +--------------- .../quicksort_dynamic_benchmark/Makefile | 2 +- .../quicksort_dynamic_benchmark/benchmark.cu | 94 +--------------- 5 files changed, 114 insertions(+), 284 deletions(-) create mode 100644 
GPUSort/benchmark/benchmarker.cpp diff --git a/GPUSort/benchmark/benchmarker.cpp b/GPUSort/benchmark/benchmarker.cpp new file mode 100644 index 000000000..b0171aae1 --- /dev/null +++ b/GPUSort/benchmark/benchmarker.cpp @@ -0,0 +1,106 @@ +#include +#include +#include + +#include +#include "../src/util/timer.h" + +//--------------------------- +/** + * important! to make use of this benchmarker, it is needed to define SORTERFUNCTION + * then include this file + * */ +//--------------------------- + +using namespace TNL; +using namespace TNL::Containers; +using namespace std; + +const int lowPow = 15, highLow = 22; +const int tries = 50; + +double measure(const vector&vec) +{ + Array arr(vec.size()); + vector resAcc; + + for(int i = 0; i < tries; i++) + { + arr = vec; + auto view = arr.getView(); + + { + TIMER t([&](double res){resAcc.push_back(res);}); + SORTERFUNCTION(view); + } + } + + return accumulate(resAcc.begin(), resAcc.end(), 0.0) / resAcc.size(); +} + +double sorted(int size) +{ + vector vec(size); + iota(vec.begin(), vec.end(), 0); + + return measure(vec); +} + +double random(int size) +{ + srand(size); + + vector vec(size); + iota(vec.begin(), vec.end(), 0); + random_shuffle(vec.begin(), vec.end()); + + return measure(vec); +} + +double almostSorted(int size) +{ + vector vec(size); + iota(vec.begin(), vec.end(), 0); + for(int i = 0; i < 3; i++) //swaps 3 times in array + { + int s = rand() % (size - 3); + std::swap(vec[s], vec[s + 1]); + } + + return measure(vec); +} + +double decreasing(int size) +{ + vector vec(size); + for(size_t i = 0; i < size; i++) + vec[i] = -i; + + return measure(vec); +} + +int main() +{ + string delim = "\t"; + cout << "size" << delim; + cout << "random" << delim; + cout << "sorted" << delim; + cout << "almost" << delim; + cout << "decreasing" << delim; + cout << endl; + + for(int pow = lowPow; pow <= highLow; pow++) + { + int size =(1<< pow); + vector vec(size); + + cout << "2^" << pow << delim; + cout << fixed << 
setprecision(3); + cout << random(size) << delim; + cout << sorted(size) << delim; + cout << almostSorted(size) << delim; + cout << decreasing(size) << delim; + cout << endl; + } + return 0; +} \ No newline at end of file diff --git a/GPUSort/benchmark/bitonic_benchmark/benchmark.cu b/GPUSort/benchmark/bitonic_benchmark/benchmark.cu index bb9b9e74b..26e241be2 100644 --- a/GPUSort/benchmark/bitonic_benchmark/benchmark.cu +++ b/GPUSort/benchmark/bitonic_benchmark/benchmark.cu @@ -1,104 +1,4 @@ -#include -#include -#include - -#include -#include "../../src/util/timer.h" - -//--------------------------- #include "../../src/bitonicSort/bitonicSort.h" #define SORTERFUNCTION bitonicSort //--------------------------- - -using namespace TNL; -using namespace TNL::Containers; -using namespace std; - -const int lowPow = 15, highLow = 22; -const int tries = 50; - -double measure(const vector&vec) -{ - Array arr(vec.size()); - vector resAcc; - - for(int i = 0; i < tries; i++) - { - arr = vec; - auto view = arr.getView(); - - { - TIMER t([&](double res){resAcc.push_back(res);}); - SORTERFUNCTION(view); - } - } - - return accumulate(resAcc.begin(), resAcc.end(), 0.0) / resAcc.size(); -} - -double sorted(int size) -{ - vector vec(size); - iota(vec.begin(), vec.end(), 0); - - return measure(vec); -} - -double random(int size) -{ - srand(size); - - vector vec(size); - iota(vec.begin(), vec.end(), 0); - random_shuffle(vec.begin(), vec.end()); - - return measure(vec); -} - -double almostSorted(int size) -{ - vector vec(size); - iota(vec.begin(), vec.end(), 0); - for(int i = 0; i < 3; i++) //swaps 3 times in array - { - int s = rand() % (size - 3); - std::swap(vec[s], vec[s + 1]); - } - - return measure(vec); -} - -double decreasing(int size) -{ - vector vec(size); - for(size_t i = 0; i < size; i++) - vec[i] = -i; - - return measure(vec); -} - -int main() -{ - string delim = "\t"; - cout << "size" << delim; - cout << "random" << delim; - cout << "sorted" << delim; - cout << "almost" << 
delim; - cout << "decreasing" << delim; - cout << endl; - - for(int pow = lowPow; pow <= highLow; pow++) - { - int size =(1<< pow); - vector vec(size); - - cout << "2^" << pow << delim; - cout << fixed << setprecision(3); - cout << random(size) << delim; - cout << sorted(size) << delim; - cout << almostSorted(size) << delim; - cout << decreasing(size) << delim; - cout << endl; - } - return 0; -} \ No newline at end of file +#include "../benchmarker.cpp" \ No newline at end of file diff --git a/GPUSort/benchmark/quicksort_benchmark/benchmark.cu b/GPUSort/benchmark/quicksort_benchmark/benchmark.cu index 954b52979..0a486ce3a 100644 --- a/GPUSort/benchmark/quicksort_benchmark/benchmark.cu +++ b/GPUSort/benchmark/quicksort_benchmark/benchmark.cu @@ -1,92 +1,4 @@ -#include -#include -#include - -#include - #include "../../src/quicksort/quicksort.cuh" -#include "../../src/util/timer.h" - -using namespace TNL; -using namespace TNL::Containers; -using namespace std; - -typedef Devices::Cuda Device; - -int main() -{ - srand(8151); - for(int pow = 5; pow <= 23; pow++) - { - int size =(1<< pow); - - vector vec(size); - iota(vec.begin(), vec.end(), 0); - - Array arr; - vector resAcc; - - //sorted sequence - { - arr = vec; - auto view = arr.getView(); - - { - TIMER t([&](double res){resAcc.push_back(res);}); - quicksort(view); - } - } - - //almost sorted sequence - { - for(int i = 0; i < 3; i++) - { - int s = rand() % (size - 3); - std::swap(vec[s], vec[s + 1]); - } - - arr = vec; - auto view = arr.getView(); - - { - TIMER t([&](double res){resAcc.push_back(res);}); - quicksort(view); - } - } - - //decreasing sequence - { - for(size_t i = 0; i < size; i++) - vec[i] = -i; - - arr = vec; - auto view = arr.getView(); - - { - TIMER t([&](double res){resAcc.push_back(res);}); - quicksort(view); - } - } - - //random sequence - { - random_shuffle(vec.begin(), vec.end()); - - arr = vec; - auto view = arr.getView(); - - { - TIMER t([&](double res){resAcc.push_back(res);}); - 
quicksort(view); - } - } - - - cout << "2^" << pow << " = "; - cout << fixed; - cout << setprecision(3); - cout << (accumulate(resAcc.begin(), resAcc.end(), 0.0) / resAcc.size()) << " ms" << endl; - } - - return 0; -} \ No newline at end of file +#define SORTERFUNCTION quicksort +//--------------------------- +#include "../benchmarker.cpp" \ No newline at end of file diff --git a/GPUSort/benchmark/quicksort_dynamic_benchmark/Makefile b/GPUSort/benchmark/quicksort_dynamic_benchmark/Makefile index daa599112..2578516b3 100644 --- a/GPUSort/benchmark/quicksort_dynamic_benchmark/Makefile +++ b/GPUSort/benchmark/quicksort_dynamic_benchmark/Makefile @@ -25,7 +25,7 @@ clean: $(TARGET): quicksort.o quicksort_link.o $(TARGET).o $(CXX) $(TNL_INCLUDE_DIRS) $(CUDA_LDFLAGS) -o $@ $^ $(CUDA_LDLIBS) -$(TARGET).o: $(TARGET).cu +$(TARGET).o: $(TARGET).cu ../benchmarker.cpp $(CUDA_CXX) $(CUDA_CXXFLAGS) -c -o $@ $< quicksort.o: $(SRC_FOLDER)/quicksort.cu diff --git a/GPUSort/benchmark/quicksort_dynamic_benchmark/benchmark.cu b/GPUSort/benchmark/quicksort_dynamic_benchmark/benchmark.cu index 389aba9a7..64b28f453 100644 --- a/GPUSort/benchmark/quicksort_dynamic_benchmark/benchmark.cu +++ b/GPUSort/benchmark/quicksort_dynamic_benchmark/benchmark.cu @@ -1,92 +1,4 @@ -#include -#include -#include - -#include - #include "../../src/quicksort_dynamic/quicksort.cuh" -#include "../../src/util/timer.h" - -using namespace TNL; -using namespace TNL::Containers; -using namespace std; - -typedef Devices::Cuda Device; - -int main() -{ - srand(8151); - for(int pow = 5; pow <= 23; pow++) - { - int size =(1<< pow); - - vector vec(size); - iota(vec.begin(), vec.end(), 0); - - Array arr; - vector resAcc; - - //sorted sequence - { - arr = vec; - auto view = arr.getView(); - - { - TIMER t([&](double res){resAcc.push_back(res);}); - quicksort(view); - } - } - - //almost sorted sequence - { - for(int i = 0; i < 3; i++) - { - int s = rand() % (size - 3); - std::swap(vec[s], vec[s + 1]); - } - - arr = vec; - 
auto view = arr.getView(); - - { - TIMER t([&](double res){resAcc.push_back(res);}); - quicksort(view); - } - } - - //decreasing sequence - { - for(size_t i = 0; i < size; i++) - vec[i] = -i; - - arr = vec; - auto view = arr.getView(); - - { - TIMER t([&](double res){resAcc.push_back(res);}); - quicksort(view); - } - } - - //random sequence - { - random_shuffle(vec.begin(), vec.end()); - - arr = vec; - auto view = arr.getView(); - - { - TIMER t([&](double res){resAcc.push_back(res);}); - quicksort(view); - } - } - - - cout << "2^" << pow << " = "; - cout << fixed; - cout << setprecision(3); - cout << (accumulate(resAcc.begin(), resAcc.end(), 0.0) / resAcc.size()) << " ms" << endl; - } - - return 0; -} \ No newline at end of file +#define SORTERFUNCTION quicksort +//--------------------------- +#include "../benchmarker.cpp" \ No newline at end of file -- GitLab From 3cd4e57085ac2aa305d38855b97b282b25888290 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Sat, 20 Mar 2021 20:38:07 +0100 Subject: [PATCH 120/258] bitonic interface change --- GPUSort/src/quicksort/quicksort.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/GPUSort/src/quicksort/quicksort.cuh b/GPUSort/src/quicksort/quicksort.cuh index c06e5ee27..a62a31e42 100644 --- a/GPUSort/src/quicksort/quicksort.cuh +++ b/GPUSort/src/quicksort/quicksort.cuh @@ -216,7 +216,7 @@ __device__ void cudaQuickSort_block(CudaArrayView arr, const Function & Cmp, int size = end - begin; if(size<= blockDim.x*2) { - bitonicSort_Block(arr.getView(begin, end), bitonicMem, Cmp); + bitonicSort_Block(arr.getView(begin, end), arr.getView(begin, end), bitonicMem, Cmp); continue; } -- GitLab From e459c313ab9a4943e2bd654c22b633b236dc5a94 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Sun, 21 Mar 2021 15:33:40 +0100 Subject: [PATCH 121/258] refactor reduction function --- GPUSort/src/quicksort/quicksort.cuh | 2 +- GPUSort/src/quicksort_dynamic/quicksort.cu | 4 +- 
GPUSort/src/quicksort_dynamic/reduction.cuh | 81 ------------------- GPUSort/src/{quicksort => util}/reduction.cuh | 0 4 files changed, 4 insertions(+), 83 deletions(-) delete mode 100644 GPUSort/src/quicksort_dynamic/reduction.cuh rename GPUSort/src/{quicksort => util}/reduction.cuh (100%) diff --git a/GPUSort/src/quicksort/quicksort.cuh b/GPUSort/src/quicksort/quicksort.cuh index a62a31e42..6fed813b2 100644 --- a/GPUSort/src/quicksort/quicksort.cuh +++ b/GPUSort/src/quicksort/quicksort.cuh @@ -1,7 +1,7 @@ #pragma once #include -#include "reduction.cuh" +#include "../util/reduction.cuh" #include "task.h" #include "../bitonicSort/bitonicSort.h" #include diff --git a/GPUSort/src/quicksort_dynamic/quicksort.cu b/GPUSort/src/quicksort_dynamic/quicksort.cu index b56aca624..12320a73c 100644 --- a/GPUSort/src/quicksort_dynamic/quicksort.cu +++ b/GPUSort/src/quicksort_dynamic/quicksort.cu @@ -1,7 +1,7 @@ #include "quicksort.cuh" #include -#include "reduction.cuh" +#include "../util/reduction.cuh" #include "task.h" #include "../bitonicSort/bitonicSort.h" #include "helper.cuh" @@ -145,6 +145,8 @@ __device__ void stackPush(int stackArrBegin[], int stackArrEnd[], { int sizeL = pivotBegin - begin, sizeR = end - pivotEnd; + //push the bigger one 1st and then smaller one 2nd + //in next iteration, the smaller part will be handled 1st if(sizeL > sizeR) { if(sizeL > 0) //left from pivot are smaller elems diff --git a/GPUSort/src/quicksort_dynamic/reduction.cuh b/GPUSort/src/quicksort_dynamic/reduction.cuh deleted file mode 100644 index 234871c93..000000000 --- a/GPUSort/src/quicksort_dynamic/reduction.cuh +++ /dev/null @@ -1,81 +0,0 @@ -#pragma once -/** - * https://developer.nvidia.com/blog/faster-parallel-reductions-kepler/ - * */ - -__device__ int warpReduceSum(int initVal) -{ - const unsigned int maskConstant = 0xffffffff; //not used - for (unsigned int mask = warpSize / 2; mask > 0; mask >>= 1) - initVal += __shfl_xor_sync(maskConstant, initVal, mask); - - return initVal; -} 
- -__device__ int blockReduceSum(int val) -{ - static __shared__ int shared[32]; - int lane = threadIdx.x & (warpSize - 1); - int wid = threadIdx.x / warpSize; - - val = warpReduceSum(val); - - if (lane == 0) - shared[wid] = val; - __syncthreads(); - - val = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0; - - if (wid == 0) - val = warpReduceSum(val); - - if(threadIdx.x == 0) - shared[0] = val; - __syncthreads(); - - return shared[0]; -} - - -template -__device__ int warpInclusivePrefixSum(int value) -{ - if(it*2 <= 32) - { - int i = it; - int n = __shfl_up_sync(0xffffffff, value, i); - int laneId = threadIdx.x & 0x1f; - if ((laneId & (warpSize - 1)) >= i) - value += n; - return warpInclusivePrefixSum= 32? 32 : it*2>(value); - - } - - return value; -} - -__device__ int warpInclusivePrefixSum(int value) -{ - return warpInclusivePrefixSum<1>(value); -} - -__device__ int blockInclusivePrefixSum(int value) -{ - static __shared__ int shared[32]; - int lane = threadIdx.x & (warpSize - 1); - int wid = threadIdx.x / warpSize; - - int tmp = warpInclusivePrefixSum(value); - - if (lane == warpSize-1) - shared[wid] = tmp; - __syncthreads(); - - int tmp2 = (threadIdx.x < blockDim.x / warpSize) ? 
shared[lane] : 0; - if (wid == 0) - shared[lane] = warpInclusivePrefixSum(tmp2) - tmp2; - __syncthreads(); - - tmp += shared[wid]; - return tmp; -} \ No newline at end of file diff --git a/GPUSort/src/quicksort/reduction.cuh b/GPUSort/src/util/reduction.cuh similarity index 100% rename from GPUSort/src/quicksort/reduction.cuh rename to GPUSort/src/util/reduction.cuh -- GitLab From cbedfcaef95bbd5e654af86cf2c9456e7a4cf06d Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Sun, 21 Mar 2021 16:33:51 +0100 Subject: [PATCH 122/258] partition rewrite --- GPUSort/src/quicksort/cudaPartition.cuh | 97 ++++++++++++++ GPUSort/src/quicksort/quicksort_1Block.cuh | 149 +++++++++++++++++++++ GPUSort/src/quicksort/task.h | 25 ++-- 3 files changed, 261 insertions(+), 10 deletions(-) create mode 100644 GPUSort/src/quicksort/cudaPartition.cuh create mode 100644 GPUSort/src/quicksort/quicksort_1Block.cuh diff --git a/GPUSort/src/quicksort/cudaPartition.cuh b/GPUSort/src/quicksort/cudaPartition.cuh new file mode 100644 index 000000000..574130178 --- /dev/null +++ b/GPUSort/src/quicksort/cudaPartition.cuh @@ -0,0 +1,97 @@ +#pragma once + +#include +#include "../util/reduction.cuh" +#include "task.h" +#include + +#define deb(x) std::cout << #x << " = " << x << std::endl; + +using namespace TNL; +using namespace TNL::Containers; + +__device__ +void cmpElem(ArrayView arr, + int &smaller, int &bigger, + const int &pivot) +{ + for (int i = threadIdx.x; i < arr.getSize(); i += blockDim.x) + { + int data = arr[i]; + if (data < pivot) + smaller++; + else if (data > pivot) + bigger++; + } +} + +__device__ +void copyData(ArrayView src, + ArrayView dst, + int smallerStart, int biggerStart, + const int &pivot) +{ + for (int i = threadIdx.x; i < src.getSize(); i += blockDim.x) + { + int data = arr[i]; + if (data < pivot) + aux[smallerStart++] = data; + else if (data > pivot) + aux[biggerStart++] = data; + } +} + 
+//---------------------------------------------------------------------------------- + +template +__global__ void cudaPartition(ArrayView src, + ArrayView dst, + const Function &Cmp, + int elemPerBlock, TASK * task + ) +{ + static __shared__ int myBegin, myEnd; + static __shared__ int smallerStart, biggerStart; + static __shared__ int pivot; + static __shared__ bool writePivot; + + if (threadIdx.x == 0) + { + myBegin = elemPerBlock * (blockIdx.x - task->firstBlock); + myEnd = TNL::min(myBegin + elemPerBlock, arr.getSize()); + pivot = src[src.getSize() - 1]; + } + __syncthreads(); + + auto srcView = src.getView(myBegin, myEnd); + + //------------------------------------------------------------------------- + + int smaller = 0, bigger = 0; + cmpElem(srcView, smaller, bigger, pivot); + + int smallerOffset = blockInclusivePrefixSum(smaller); + int biggerOffset = blockInclusivePrefixSum(bigger); + + if (threadIdx.x == blockDim.x - 1) //last thread in block has sum of all values + { + smallerStart = atomicAdd(&(task->dstBegin), smallerOffset); + biggerStart = atomicAdd(&(task->dstEnd), -biggerOffset) - biggerOffset; + } + __syncthreads(); + + //----------------------------------------------------------- + + int destSmaller = smallerStart + smallerOffset - smaller; + int destBigger = biggerStart + biggerOffset - bigger; + copyData(srcView, dst, destSmaller, destBigger, pivot); + __syncthreads(); + + //----------------------------------------------------------- + + if (threadIdx.x == 0) + writePivot = atomicAdd(&(task->tillWorkingCnt), -1) == 1; + __syncthreads(); + + return writePivot; +} \ No newline at end of file diff --git a/GPUSort/src/quicksort/quicksort_1Block.cuh b/GPUSort/src/quicksort/quicksort_1Block.cuh new file mode 100644 index 000000000..2ed5e94f6 --- /dev/null +++ b/GPUSort/src/quicksort/quicksort_1Block.cuh @@ -0,0 +1,149 @@ +#pragma once + +#include +#include "../bitonicSort/bitonicSort.h" +#include "../util/reduction.cuh" + +using namespace TNL; +using 
namespace TNL::Containers; + +template +__device__ void externSort(ArrayView src, + ArrayView dst, + const Function & Cmp) +{ + static __shared__ int sharedMem[externMemSize]; + bitonicSort_Block(src, dst, sharedMem, Cmp); +} + +template +__device__ void stackPush(int stackArrBegin[], int stackArrEnd[], + int stackDepth[], int & stackTop, + int begin, int pivotBegin, + int pivotEnd, int end, + int depth) +{ + int sizeL = pivotBegin - begin, sizeR = end - pivotEnd; + + //push the bigger one 1st and then smaller one 2nd + //in next iteration, the smaller part will be handled 1st + if(sizeL > sizeR) + { + if(sizeL > 0) //left from pivot are smaller elems + { + stackArrBegin[stackTop] = begin; + stackArrEnd[stackTop] = pivotBegin; + stackDepth[stackTop] = depth + 1; + stackTop++; + } + + if(sizeR > 0) //right from pivot until end are elem greater than pivot + { + assert(stackTop < stackSize && "Local quicksort stack overflow."); + + stackArrBegin[stackTop] = pivotEnd; + stackArrEnd[stackTop] = end; + stackDepth[stackTop] = depth + 1; + stackTop++; + } + } + else + { + if(sizeR > 0) //right from pivot until end are elem greater than pivot + { + stackArrBegin[stackTop] = pivotEnd; + stackArrEnd[stackTop] = end; + stackDepth[stackTop] = depth + 1; + stackTop++; + } + + if(sizeL > 0) //left from pivot are smaller elems + { + assert(stackTop < stackSize && "Local quicksort stack overflow."); + + stackArrBegin[stackTop] = begin; + stackArrEnd[stackTop] = pivotBegin; + stackDepth[stackTop] = depth + 1; + stackTop++; + } + } +} + +template +__device__ void singleBlockQuickSort(ArrayView arr, + ArrayView aux, + const Function & Cmp, int _depth) +{ + static __shared__ int stackTop; + static __shared__ int stackArrBegin[stackSize], stackArrEnd[stackSize], stackDepth[stackSize]; + static __shared__ int begin, end, depth,pivotBegin, pivotEnd; + static __shared__ int pivot; + + if (threadIdx.x == 0) + { + stackTop = 0; + stackArrBegin[stackTop] = 0; + stackArrEnd[stackTop] = 
arr.getSize(); + stackDepth[stackTop] = _depth; + stackTop++; + } + __syncthreads(); + + while(stackTop > 0) + { + if (threadIdx.x == 0) + { + begin = stackArrBegin[stackTop-1]; + end = stackArrEnd[stackTop-1]; + depth = stackDepth[stackTop-1]; + stackTop--; + pivot = pickPivot(depth%2 == 0? + arr.getView(begin, end) : + aux.getView(begin, end), + Cmp + ); + } + __syncthreads(); + + int size = end - begin; + auto src = depth%2 == 0 ? arr.getView(begin, end) : aux.getView(begin, end); + auto dst = depth%2 == 0 ? aux.getView(begin, end) : arr.getView(begin, end); + + if(size <= blockDim.x*2) + { + externSort(src, arr.getView(begin, end), Cmp); + continue; + } + + int smaller = 0, bigger = 0; + countElem(src, 0, size, smaller, bigger, pivot); + + int smallerOffset = blockInclusivePrefixSum(smaller); + int biggerOffset = blockInclusivePrefixSum(bigger); + + if (threadIdx.x == blockDim.x - 1) + { + pivotBegin = smallerOffset; + pivotEnd = size - biggerOffset; + } + __syncthreads(); + + int destSmaller = 0 + smallerOffset - smaller; + int destBigger = pivotEnd + (biggerOffset - bigger); + + copyData(src, 0, size, dst, destSmaller, destBigger, pivot); + __syncthreads(); + + for (int i = pivotBegin + threadIdx.x; i < pivotEnd; i += blockDim.x) + src[i] = dst[i] = pivot; + + if(threadIdx.x == 0) + { + stackPush(stackArrBegin, stackArrEnd, stackDepth, stackTop, + begin, begin+ pivotBegin, + begin +pivotEnd, end, + depth); + } + __syncthreads(); + } //ends while loop +} \ No newline at end of file diff --git a/GPUSort/src/quicksort/task.h b/GPUSort/src/quicksort/task.h index aa68fe3a9..7b4416571 100644 --- a/GPUSort/src/quicksort/task.h +++ b/GPUSort/src/quicksort/task.h @@ -2,23 +2,28 @@ struct TASK { - int arrBegin, arrEnd;//start and end position of array to read from - int auxBeginIdx, auxEndIdx; //start and end position of still available memory to write into - int firstBlock, blockCount; + //start and end position of array to read and write from + int partitionBegin, 
partitionEnd; + + //----------------------------------------------- + //helper variables for blocks working on this task + + int dstBegin, dstEnd; + int firstBlock, blockCount;//for workers read only values int stillWorkingCnt;//shared counter of blocks working together(how many are still working) - __cuda_callable__ - TASK(int srcBegin, int srcEnd, int destBegin, int destEnd) - : arrBegin(srcBegin), arrEnd(srcEnd), - auxBeginIdx(destBegin), auxEndIdx(destEnd), - firstBlock(-1), blockCount(-1), stillWorkingCnt(-1) + TASK(int begin, int end) + : partitionBegin(begin), partitionEnd(end), + dstBegin(0), dstEnd(end-begin), + firstBlock(-100), blockCount(-100), stillWorkingCnt(-100) {} - __cuda_callable__ void setBlocks(int blocks) + __cuda_callable__ + void setBlocks(int blocks) { blockCount = stillWorkingCnt = blocks; } - TASK() = default; + TASK() = default; }; \ No newline at end of file -- GitLab From 0b1d1f9cf84d78c5e2cee984e72a082dad0ec77b Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Sun, 21 Mar 2021 17:04:39 +0100 Subject: [PATCH 123/258] write new task --- GPUSort/src/quicksort/quicksort.cuh | 347 +++++++--------------------- 1 file changed, 83 insertions(+), 264 deletions(-) diff --git a/GPUSort/src/quicksort/quicksort.cuh b/GPUSort/src/quicksort/quicksort.cuh index 6fed813b2..d395701b6 100644 --- a/GPUSort/src/quicksort/quicksort.cuh +++ b/GPUSort/src/quicksort/quicksort.cuh @@ -3,154 +3,85 @@ #include #include "../util/reduction.cuh" #include "task.h" +#include "cudaPartition.cuh" #include "../bitonicSort/bitonicSort.h" #include #define deb(x) std::cout << #x << " = " << x << std::endl; -using CudaArrayView = TNL::Containers::ArrayView; -using CudaTaskArray = TNL::Containers::Array; +using namespace TNL; +using namespace TNL::Containers; -__device__ void cmpElem(CudaArrayView arr, int myBegin, int myEnd, - int &smaller, int &bigger, - volatile int pivot) +//----------------------------------------------------------- + +__device__ void 
writeNewTask(int begin, int end, + ArrayView newTasks, int *newTasksCnt, + ArrayView secondPhaseTasks, int *secondPhaseTasksCnt) { - for (int i = myBegin + threadIdx.x; i < myEnd; i += blockDim.x) + int size = end - begin; + if(size == 0) return; + if(size <= blockDim.x*2) + { + int idx = atomicAdd(secondPhaseTasksCnt, 1); + secondPhaseTasks[idx] = TASK(begin, end); + } + else { - int data = arr[i]; - if (data < pivot) - smaller++; - else if (data > pivot) - bigger++; + int idx = atomicAdd(newTasksCnt, 1); + newTasks[idx] = TASK(begin, end); } } -__device__ void copyData(CudaArrayView arr, int myBegin, int myEnd, - CudaArrayView aux, int smallerStart, int biggerStart, - volatile int pivot) +__device__ void writeNewTasks(int leftBegin, int leftEnd, int rightBegin, int rightEnd, + ArrayView newTasks, int *newTasksCnt, + ArrayView secondPhaseTasks, int *secondPhaseTasksCnt) { - for (int i = myBegin + threadIdx.x; i < myEnd; i += blockDim.x) - { - int data = arr[i]; - if (data < pivot) - aux[smallerStart++] = data; - else if (data > pivot) - aux[biggerStart++] = data; - } + writeNewTask(leftBegin, leftEnd, newTasks, newTasksCnt, secondPhaseTasks, secondPhaseTasksCnt); + writeNewTask(rightBegin, rightEnd, newTasks, newTasksCnt, secondPhaseTasks, secondPhaseTasksCnt); } +//---------------------------------------------------- template -__global__ void cudaPartition(CudaArrayView arr,const Function & Cmp, - CudaArrayView aux, - TNL::Containers::ArrayView cuda_blockToTaskMapping, - int elemPerBlock, - TNL::Containers::ArrayView cuda_tasks, - TNL::Containers::ArrayView cuda_newTasks, - int *newTasksCnt, - TNL::Containers::ArrayView cuda_2ndPhaseTasks, - int * cuda_2ndPhaseCnt -) +__global__ void cudaQuickSort1stPhase(ArrayView src, ArrayView dst, + const Function &Cmp, int elemPerBlock, + ArrayView tasks, + ArrayView taskMapping, int *tasksAmount, + ArrayView newTasks, int *newTasksCnt, + ArrayView secondPhaseTasks, int *secondPhaseTasksCnt) { - static __shared__ TASK 
myTask; - static __shared__ int smallerStart, biggerStart; static __shared__ int pivot; - static __shared__ int myTaskIdx; - static __shared__ bool writePivot; + TASK &myTask = tasks[taskMapping[blockIdx.x]]; if (threadIdx.x == 0) - { - myTaskIdx = cuda_blockToTaskMapping[blockIdx.x]; - myTask = cuda_tasks[myTaskIdx]; - pivot = arr[myTask.arrEnd - 1]; - writePivot = false; - } + pivot = src[myTask.partitionEnd - 1]; __syncthreads(); - //only works if consecutive blocks work on the same task - const int myBegin = myTask.arrBegin + elemPerBlock * (blockIdx.x - myTask.firstBlock); - const int myEnd = TNL::min(myTask.arrEnd, myBegin + elemPerBlock); - - //------------------------------------------------------------------------- - - int smaller = 0, bigger = 0; - cmpElem(arr, myBegin, myEnd, smaller, bigger, pivot); - - int smallerOffset = blockInclusivePrefixSum(smaller); - int biggerOffset = blockInclusivePrefixSum(bigger); - - if (threadIdx.x == blockDim.x - 1) //last thread in block has sum of all values - { - smallerStart = atomicAdd(&(cuda_tasks[myTaskIdx].auxBeginIdx), smallerOffset); - biggerStart = atomicAdd(&(cuda_tasks[myTaskIdx].auxEndIdx), -biggerOffset) - biggerOffset; - } - __syncthreads(); - - int destSmaller = smallerStart + smallerOffset - smaller; - int destBigger = biggerStart + biggerOffset - bigger; - copyData(arr, myBegin, myEnd, aux, destSmaller, destBigger, pivot); - - //----------------------------------------------------------- + bool isLast = cudaPartition( + src.getView(myTask.partitionBegin, myTask.partitionEnd), + dst.getView(myTask.partitionBegin, myTask.partitionEnd), + Cmp, pivot, elemPerBlock, myTask); + if (!isLast) + return; - if (threadIdx.x == 0 && atomicAdd(&(cuda_tasks[myTaskIdx].stillWorkingCnt), -1) == 1) - { - writePivot = true; - myTask = cuda_tasks[myTaskIdx]; //update auxBeginIdx, auxEndIdx value - } - __syncthreads(); + myTask = tasks[taskMapping[blockIdx.x]]; - if (!writePivot) - return; + int leftBegin = 
myTask.partitionBegin, leftEnd = myTask.partitionBegin + myTask.dstBegin; + int rightBegin = myTask.partitionBegin + myTask.dstEnd, rightEnd = myTask.partitionEnd; - for (int i = myTask.auxBeginIdx + threadIdx.x; i < myTask.auxEndIdx; i += blockDim.x) - aux[i] = pivot; + for (int i = leftEnd + threadIdx.x; i < rightBegin; i += blockDim.x) + src[i] = dst[i] = pivot; if (threadIdx.x != 0) return; - - if (myTask.auxBeginIdx - myTask.arrBegin > 0) //smaller - { - if(myTask.auxBeginIdx - myTask.arrBegin <= blockDim.x*2) - { - int newTaskIdx = atomicAdd(cuda_2ndPhaseCnt, 1); - cuda_2ndPhaseTasks[newTaskIdx] = TASK( - myTask.arrBegin, myTask.auxBeginIdx, - myTask.arrBegin, myTask.auxBeginIdx - ); - } - else - { - int newTaskIdx = atomicAdd(newTasksCnt, 1); - cuda_newTasks[newTaskIdx] = TASK( - myTask.arrBegin, myTask.auxBeginIdx, - myTask.arrBegin, myTask.auxBeginIdx - ); - } - } - if (myTask.arrEnd - myTask.auxEndIdx > 0) //greater - { - if (myTask.arrEnd - myTask.auxEndIdx <= blockDim.x*2) - { - int newTaskIdx = atomicAdd(cuda_2ndPhaseCnt, 1); - cuda_2ndPhaseTasks[newTaskIdx] = TASK( - myTask.auxEndIdx, myTask.arrEnd, - myTask.auxEndIdx, myTask.arrEnd - ); - } - else - { - int newTaskIdx = atomicAdd(newTasksCnt, 1); - cuda_newTasks[newTaskIdx] = TASK( - myTask.auxEndIdx, myTask.arrEnd, - myTask.auxEndIdx, myTask.arrEnd - ); - } - } + writeNewTasks(leftBegin, leftEnd, rightBegin, rightEnd, + newTasks, newTasksCnt, + secondPhaseTasks, secondPhaseTasksCnt); } - -__global__ void cudaInitTask(TNL::Containers::ArrayView cuda_tasks, +//----------------------------------------------------------- +__global__ void cudaInitTask(ArrayView cuda_tasks, int taskAmount, int elemPerBlock, int *firstAvailBlock, - TNL::Containers::ArrayView cuda_blockToTaskMapping) + ArrayView cuda_blockToTaskMapping) { static __shared__ int avail; @@ -181,153 +112,43 @@ __global__ void cudaInitTask(TNL::Containers::ArrayView -__device__ void cudaQuickSort_block(CudaArrayView arr, const Function & Cmp, - 
CudaArrayView aux, - int * stackArrBegin, int *stackArrEnd, int stackSize, - int * bitonicMem ) -{ - static __shared__ int begin, end; - static __shared__ int stackTop; - static __shared__ int pivotBegin, pivotEnd; - static __shared__ int pivot; - - if (threadIdx.x == 0) - { - stackArrBegin[0] = 0; - stackArrEnd[0] = arr.getSize(); - stackTop = 1; - } - __syncthreads(); - - while(stackTop > 0) - { - if (threadIdx.x == 0) - { - begin = stackArrBegin[stackTop - 1]; - end = stackArrEnd[stackTop - 1]; - stackTop--; - pivot = arr[end - 1]; - } - __syncthreads(); - - int size = end - begin; - if(size<= blockDim.x*2) - { - bitonicSort_Block(arr.getView(begin, end), arr.getView(begin, end), bitonicMem, Cmp); - continue; - } - - int smaller = 0, bigger = 0; - cmpElem(arr, begin, end, smaller, bigger, pivot); - - int smallerOffset = blockInclusivePrefixSum(smaller); - int biggerOffset = blockInclusivePrefixSum(bigger); - - if (threadIdx.x == blockDim.x - 1) - { - pivotBegin = begin + smallerOffset; - pivotEnd = end - biggerOffset; - } - __syncthreads(); - - int destSmaller = smallerOffset - smaller; - int destBigger = pivotEnd + biggerOffset - bigger; - copyData(arr, begin, end, aux, destSmaller, destBigger, pivot); - __syncthreads(); - - for (int i = begin + threadIdx.x; i < end; i += blockDim.x) - { - if(i >= pivotBegin && i < pivotEnd) - arr[i] = pivot; - else - arr[i] = aux[i]; - } - - if(threadIdx.x == 0) - { - if(pivotBegin - begin > 1) //left from pivot are smaller elems - { - stackArrBegin[stackTop] = begin; - stackArrEnd[stackTop] = pivotBegin; - stackTop++; - } - - if(end - pivotEnd > 1) //right from pivot until end are elem greater than pivot - { - stackArrBegin[stackTop] = pivotEnd; - stackArrEnd[stackTop] = end; - stackTop++; - } - } - __syncthreads(); - } -} - -template -__global__ -void cudaQuickSort(CudaArrayView arr, const Function & Cmp, - CudaArrayView aux, int stackSize, - TNL::Containers::ArrayView cuda_tasks) -{ - extern __shared__ int externMem[]; - - 
static __shared__ TASK task; - if(threadIdx.x == 0) - task = cuda_tasks[blockIdx.x]; - __syncthreads(); - - int * bitonicMem = externMem; - int * stackLeft = bitonicMem + (2*blockDim.x); - int * stackRight = stackLeft+ (stackSize/2); - - cudaQuickSort_block(arr.getView(task.arrBegin, task.arrEnd), Cmp, - aux.getView(task.auxBeginIdx, task.auxEndIdx), - stackLeft, stackRight, stackSize/2, - bitonicMem - ); - -} - //----------------------------------------------------------- //----------------------------------------------------------- const int threadsPerBlock = 512, maxBlocks = 1 << 15; //32k -const int maxTasks = 1<<10; -const int minElemPerBlock = threadsPerBlock*2; +const int maxTasks = 1 << 10; +const int minElemPerBlock = threadsPerBlock * 2; class QUICKSORT { - CudaArrayView arr; - TNL::Containers::Array aux; + ArrayView arr; + Array aux; - CudaTaskArray cuda_tasks, cuda_newTasks, cuda_2ndPhaseTasks; + Array cuda_tasks, cuda_newTasks, cuda_2ndPhaseTasks; - TNL::Containers::Array cuda_newTasksAmount, cuda_2ndPhaseTasksAmount; //is in reality 1 integer - int tasksAmount; //counter for Host == cuda_newTasksAmount - int totalTask; // cuda_newTasksAmount + cuda_2ndPhaseTasksAmount + Array cuda_newTasksAmount, cuda_2ndPhaseTasksAmount; //is in reality 1 integer + int tasksAmount; //counter for Host == cuda_newTasksAmount + int totalTask; // cuda_newTasksAmount + cuda_2ndPhaseTasksAmount - TNL::Containers::Array cuda_blockToTaskMapping; - TNL::Containers::Array cuda_blockToTaskMapping_Cnt; //is in reality 1 integer + Array cuda_blockToTaskMapping; + Array cuda_blockToTaskMapping_Cnt; //is in reality 1 integer int iteration = 0; //-------------------------------------------------------------------------------------- public: - QUICKSORT(CudaArrayView _arr) + QUICKSORT(ArrayView _arr) : arr(_arr), aux(arr.getSize()), cuda_tasks(maxBlocks), cuda_newTasks(maxBlocks), cuda_2ndPhaseTasks(maxBlocks), cuda_newTasksAmount(1), cuda_2ndPhaseTasksAmount(1), 
cuda_blockToTaskMapping(maxBlocks), cuda_blockToTaskMapping_Cnt(1) { - cuda_tasks.setElement(0, TASK(0, arr.getSize(), 0, arr.getSize())); + cuda_tasks.setElement(0, TASK(0, arr.getSize())); totalTask = tasksAmount = 1; cuda_2ndPhaseTasksAmount = 0; } - template - void sort(const Function & Cmp) + template + void sort(const Function &Cmp) { while (tasksAmount > 0 && totalTask < maxTasks) { @@ -338,31 +159,29 @@ public: { cudaPartition<<>>( arr, Cmp, - aux.getView(), + aux.getView(), cuda_blockToTaskMapping.getView(), elemPerBlock, cuda_tasks.getView(), cuda_newTasks.getView(), cuda_newTasksAmount.getData(), - cuda_2ndPhaseTasks.getView(), cuda_2ndPhaseTasksAmount.getData() - ); + cuda_2ndPhaseTasks.getView(), cuda_2ndPhaseTasksAmount.getData()); } else { cudaPartition<<>>( arr, Cmp, - aux.getView(), + aux.getView(), cuda_blockToTaskMapping.getView(), elemPerBlock, cuda_newTasks.getView(), cuda_tasks.getView(), //swapped order to write back and forth without copying cuda_newTasksAmount.getData(), - cuda_2ndPhaseTasks.getView(), cuda_2ndPhaseTasksAmount.getData() - ); + cuda_2ndPhaseTasks.getView(), cuda_2ndPhaseTasksAmount.getData()); } tasksAmount = processNewTasks(); iteration++; - } + } _2ndPhase(Cmp); @@ -427,21 +246,21 @@ public: return tasksAmount; } - template - void _2ndPhase(const Function & Cmp) + template + void _2ndPhase(const Function &Cmp) { - if(totalTask == 0) return; - + if (totalTask == 0) + return; + TNL::Algorithms::MultiDeviceMemoryOperations:: - copy(cuda_2ndPhaseTasks.getData() + (totalTask - tasksAmount), - (iteration%2? cuda_newTasks.getData() :cuda_tasks.getData() ), - tasksAmount - ); - + copy(cuda_2ndPhaseTasks.getData() + (totalTask - tasksAmount), + (iteration % 2 ? 
cuda_newTasks.getData() : cuda_tasks.getData()), + tasksAmount); + int blocks = totalTask; - int stackSize = 128, stackMem = stackSize * sizeof(int); - int bitonicMem = threadsPerBlock*2*sizeof(int); + int stackSize = 128, stackMem = stackSize * sizeof(int); + int bitonicMem = threadsPerBlock * 2 * sizeof(int); int auxMem = stackMem + bitonicMem; cudaQuickSort<<>>(arr, Cmp, aux.getView(), stackSize, cuda_2ndPhaseTasks.getView()); } @@ -449,14 +268,14 @@ public: //----------------------------------------------------------- -template -void quicksort(CudaArrayView arr, const Function & Cmp) +template +void quicksort(ArrayView arr, const Function &Cmp) { QUICKSORT sorter(arr); sorter.sort(Cmp); } -void quicksort(CudaArrayView arr) +void quicksort(ArrayView arr) { - quicksort(arr, []__cuda_callable__(int a, int b){return a < b;}); + quicksort(arr, [] __cuda_callable__(int a, int b) { return a < b; }); } -- GitLab From b5430f9093372eeaac0addab2869b736c7711c5e Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Sun, 21 Mar 2021 18:35:51 +0100 Subject: [PATCH 124/258] depth --- GPUSort/src/quicksort/cudaPartition.cuh | 16 +++--- GPUSort/src/quicksort/quicksort.cuh | 73 +++++++++++++++++++------ GPUSort/src/quicksort/task.h | 11 ++-- 3 files changed, 69 insertions(+), 31 deletions(-) diff --git a/GPUSort/src/quicksort/cudaPartition.cuh b/GPUSort/src/quicksort/cudaPartition.cuh index 574130178..f627c1301 100644 --- a/GPUSort/src/quicksort/cudaPartition.cuh +++ b/GPUSort/src/quicksort/cudaPartition.cuh @@ -44,22 +44,20 @@ void copyData(ArrayView src, //---------------------------------------------------------------------------------- template -__global__ void cudaPartition(ArrayView src, +__device__ bool cudaPartition(ArrayView src, ArrayView dst, - const Function &Cmp, - int elemPerBlock, TASK * task + const Function &Cmp, const int & pivot, + int elemPerBlock, TASK & task ) { static __shared__ int myBegin, myEnd; static __shared__ int smallerStart, biggerStart; - static 
__shared__ int pivot; static __shared__ bool writePivot; if (threadIdx.x == 0) { - myBegin = elemPerBlock * (blockIdx.x - task->firstBlock); + myBegin = elemPerBlock * (blockIdx.x - task.firstBlock); myEnd = TNL::min(myBegin + elemPerBlock, arr.getSize()); - pivot = src[src.getSize() - 1]; } __syncthreads(); @@ -75,8 +73,8 @@ __global__ void cudaPartition(ArrayView src, if (threadIdx.x == blockDim.x - 1) //last thread in block has sum of all values { - smallerStart = atomicAdd(&(task->dstBegin), smallerOffset); - biggerStart = atomicAdd(&(task->dstEnd), -biggerOffset) - biggerOffset; + smallerStart = atomicAdd(&(task.dstBegin), smallerOffset); + biggerStart = atomicAdd(&(task.dstEnd), -biggerOffset) - biggerOffset; } __syncthreads(); @@ -90,7 +88,7 @@ __global__ void cudaPartition(ArrayView src, //----------------------------------------------------------- if (threadIdx.x == 0) - writePivot = atomicAdd(&(task->tillWorkingCnt), -1) == 1; + writePivot = atomicAdd(&(task.tillWorkingCnt), -1) == 1; __syncthreads(); return writePivot; diff --git a/GPUSort/src/quicksort/quicksort.cuh b/GPUSort/src/quicksort/quicksort.cuh index d395701b6..83e98e201 100644 --- a/GPUSort/src/quicksort/quicksort.cuh +++ b/GPUSort/src/quicksort/quicksort.cuh @@ -14,36 +14,38 @@ using namespace TNL::Containers; //----------------------------------------------------------- -__device__ void writeNewTask(int begin, int end, +__device__ void writeNewTask(int begin, int end, int depth ArrayView newTasks, int *newTasksCnt, ArrayView secondPhaseTasks, int *secondPhaseTasksCnt) { int size = end - begin; - if(size == 0) return; - if(size <= blockDim.x*2) + if (size == 0) + return; + if (size <= blockDim.x * 2) { int idx = atomicAdd(secondPhaseTasksCnt, 1); - secondPhaseTasks[idx] = TASK(begin, end); + secondPhaseTasks[idx] = TASK(begin, end, depth + 1); } else { int idx = atomicAdd(newTasksCnt, 1); - newTasks[idx] = TASK(begin, end); + newTasks[idx] = TASK(begin, end, depth + 1); } } __device__ void 
writeNewTasks(int leftBegin, int leftEnd, int rightBegin, int rightEnd, + int depth, ArrayView newTasks, int *newTasksCnt, ArrayView secondPhaseTasks, int *secondPhaseTasksCnt) { - writeNewTask(leftBegin, leftEnd, newTasks, newTasksCnt, secondPhaseTasks, secondPhaseTasksCnt); - writeNewTask(rightBegin, rightEnd, newTasks, newTasksCnt, secondPhaseTasks, secondPhaseTasksCnt); + writeNewTask(leftBegin, leftEnd, depth, newTasks, newTasksCnt, secondPhaseTasks, secondPhaseTasksCnt); + writeNewTask(rightBegin, rightEnd, depth, newTasks, newTasksCnt, secondPhaseTasks, secondPhaseTasksCnt); } //---------------------------------------------------- template -__global__ void cudaQuickSort1stPhase(ArrayView src, ArrayView dst, - const Function &Cmp, int elemPerBlock, +__global__ void cudaQuickSort1stPhase(ArrayView arr, ArrayView aux, + const Function &Cmp, int elemPerBlock, int depth, ArrayView tasks, ArrayView taskMapping, int *tasksAmount, ArrayView newTasks, int *newTasksCnt, @@ -53,13 +55,26 @@ __global__ void cudaQuickSort1stPhase(ArrayView src, ArrayVi TASK &myTask = tasks[taskMapping[blockIdx.x]]; if (threadIdx.x == 0) - pivot = src[myTask.partitionEnd - 1]; + pivot = depth % 2 == 0 ? 
arr[myTask.partitionEnd - 1] : aux[myTask.partitionEnd - 1]; __syncthreads(); - bool isLast = cudaPartition( - src.getView(myTask.partitionBegin, myTask.partitionEnd), - dst.getView(myTask.partitionBegin, myTask.partitionEnd), - Cmp, pivot, elemPerBlock, myTask); + bool isLast; + + if (depth % 2 == 0) + { + isLast = cudaPartition( + arr.getView(myTask.partitionBegin, myTask.partitionEnd), + aux.getView(myTask.partitionBegin, myTask.partitionEnd), + Cmp, pivot, elemPerBlock, myTask); + } + else + { + isLast = cudaPartition( + aux.getView(myTask.partitionBegin, myTask.partitionEnd), + arr.getView(myTask.partitionBegin, myTask.partitionEnd), + Cmp, pivot, elemPerBlock, myTask); + } + if (!isLast) return; @@ -69,16 +84,39 @@ __global__ void cudaQuickSort1stPhase(ArrayView src, ArrayVi int rightBegin = myTask.partitionBegin + myTask.dstEnd, rightEnd = myTask.partitionEnd; for (int i = leftEnd + threadIdx.x; i < rightBegin; i += blockDim.x) - src[i] = dst[i] = pivot; + { + /* + #ifdef DEBUG + aux[i] = -1; + #endif + */ + arr[i] = pivot; + } if (threadIdx.x != 0) return; writeNewTasks(leftBegin, leftEnd, rightBegin, rightEnd, + depth, newTasks, newTasksCnt, secondPhaseTasks, secondPhaseTasksCnt); } + //----------------------------------------------------------- + +template +__global__ void cudaQuickSort2ndPhase(ArrayView arr, ArrayView aux, + const Function &Cmp, + ArrayView secondPhaseTasks) +{ + TASK & myTask = secondPhaseTasks[blockIdx.x]; + auto arrView = arr.getView(myTask.partitionBegin, myTask.partitionEnd); + auto auxView = aux.getView(myTask.partitionBegin, myTask.partitionEnd); + + singleBlockQuickSort(arrView, auxView, Cmp, myTask.depth); +} +//----------------------------------------------------------- + __global__ void cudaInitTask(ArrayView cuda_tasks, int taskAmount, int elemPerBlock, int *firstAvailBlock, ArrayView cuda_blockToTaskMapping) @@ -91,7 +129,7 @@ __global__ void cudaInitTask(ArrayView cuda_tasks, if (i < taskAmount) { auto task = cuda_tasks[i]; 
- int size = task.arrEnd - task.arrBegin; + int size = task.partitionEnd - task.partitionBegin; blocksNeeded = size / elemPerBlock + (size % elemPerBlock != 0); } @@ -104,8 +142,7 @@ __global__ void cudaInitTask(ArrayView cuda_tasks, { int myFirstAvailBlock = avail + blocksNeeded_total - blocksNeeded; - cuda_tasks[i].firstBlock = myFirstAvailBlock; - cuda_tasks[i].setBlocks(blocksNeeded); + cuda_tasks[i].initTask(myFirstAvailBlock, blocksNeeded); for (int set = 0; set < blocksNeeded; set++) cuda_blockToTaskMapping[myFirstAvailBlock++] = i; diff --git a/GPUSort/src/quicksort/task.h b/GPUSort/src/quicksort/task.h index 7b4416571..848d7460c 100644 --- a/GPUSort/src/quicksort/task.h +++ b/GPUSort/src/quicksort/task.h @@ -4,24 +4,27 @@ struct TASK { //start and end position of array to read and write from int partitionBegin, partitionEnd; - //----------------------------------------------- //helper variables for blocks working on this task + int depth; int dstBegin, dstEnd; int firstBlock, blockCount;//for workers read only values int stillWorkingCnt;//shared counter of blocks working together(how many are still working) __cuda_callable__ - TASK(int begin, int end) + TASK(int begin, int end, int depth) : partitionBegin(begin), partitionEnd(end), - dstBegin(0), dstEnd(end-begin), + depth(depth), + dstBegin(-151561), dstEnd(-151561), firstBlock(-100), blockCount(-100), stillWorkingCnt(-100) {} __cuda_callable__ - void setBlocks(int blocks) + void initTask(int firstBlock, int blocks) { + dstBegin= 0; dstEnd = partitionEnd - partitionBegin; + this->firstBlock = firstBlock; blockCount = stillWorkingCnt = blocks; } -- GitLab From 801a5ac161fa81babee2126d4b61b55e02b4114b Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Sun, 21 Mar 2021 19:04:49 +0100 Subject: [PATCH 125/258] compilation fix --- GPUSort/src/quicksort/cudaPartition.cuh | 20 ++- GPUSort/src/quicksort/quicksort.cuh | 156 ++++----------------- GPUSort/src/quicksort/quicksort_1Block.cuh | 6 +- 
GPUSort/src/quicksort/sample/main | Bin 0 -> 217896 bytes GPUSort/src/quicksort/sample/main.o | Bin 0 -> 339720 bytes 5 files changed, 43 insertions(+), 139 deletions(-) create mode 100644 GPUSort/src/quicksort/sample/main create mode 100644 GPUSort/src/quicksort/sample/main.o diff --git a/GPUSort/src/quicksort/cudaPartition.cuh b/GPUSort/src/quicksort/cudaPartition.cuh index f627c1301..b3710b406 100644 --- a/GPUSort/src/quicksort/cudaPartition.cuh +++ b/GPUSort/src/quicksort/cudaPartition.cuh @@ -10,8 +10,14 @@ using namespace TNL; using namespace TNL::Containers; +template +__device__ Value pickPivot(TNL::Containers::ArrayView src, const Function & Cmp) +{ + return src[0]; +} + __device__ -void cmpElem(ArrayView arr, +void countElem(ArrayView arr, int &smaller, int &bigger, const int &pivot) { @@ -33,11 +39,11 @@ void copyData(ArrayView src, { for (int i = threadIdx.x; i < src.getSize(); i += blockDim.x) { - int data = arr[i]; + int data = src[i]; if (data < pivot) - aux[smallerStart++] = data; + dst[smallerStart++] = data; else if (data > pivot) - aux[biggerStart++] = data; + dst[biggerStart++] = data; } } @@ -57,7 +63,7 @@ __device__ bool cudaPartition(ArrayView src, if (threadIdx.x == 0) { myBegin = elemPerBlock * (blockIdx.x - task.firstBlock); - myEnd = TNL::min(myBegin + elemPerBlock, arr.getSize()); + myEnd = TNL::min(myBegin + elemPerBlock, src.getSize()); } __syncthreads(); @@ -66,7 +72,7 @@ __device__ bool cudaPartition(ArrayView src, //------------------------------------------------------------------------- int smaller = 0, bigger = 0; - cmpElem(srcView, smaller, bigger, pivot); + countElem(srcView, smaller, bigger, pivot); int smallerOffset = blockInclusivePrefixSum(smaller); int biggerOffset = blockInclusivePrefixSum(bigger); @@ -88,7 +94,7 @@ __device__ bool cudaPartition(ArrayView src, //----------------------------------------------------------- if (threadIdx.x == 0) - writePivot = atomicAdd(&(task.tillWorkingCnt), -1) == 1; + writePivot = 
atomicAdd(&(task.stillWorkingCnt), -1) == 1; __syncthreads(); return writePivot; diff --git a/GPUSort/src/quicksort/quicksort.cuh b/GPUSort/src/quicksort/quicksort.cuh index 83e98e201..4264746eb 100644 --- a/GPUSort/src/quicksort/quicksort.cuh +++ b/GPUSort/src/quicksort/quicksort.cuh @@ -4,6 +4,7 @@ #include "../util/reduction.cuh" #include "task.h" #include "cudaPartition.cuh" +#include "quicksort_1Block.cuh" #include "../bitonicSort/bitonicSort.h" #include @@ -14,8 +15,8 @@ using namespace TNL::Containers; //----------------------------------------------------------- -__device__ void writeNewTask(int begin, int end, int depth - ArrayView newTasks, int *newTasksCnt, +__device__ +void writeNewTask(int begin, int end, int depth, ArrayView newTasks, int *newTasksCnt, ArrayView secondPhaseTasks, int *secondPhaseTasksCnt) { int size = end - begin; @@ -34,7 +35,7 @@ __device__ void writeNewTask(int begin, int end, int depth } __device__ void writeNewTasks(int leftBegin, int leftEnd, int rightBegin, int rightEnd, - int depth, + int depth, ArrayView newTasks, int *newTasksCnt, ArrayView secondPhaseTasks, int *secondPhaseTasksCnt) { @@ -45,7 +46,7 @@ __device__ void writeNewTasks(int leftBegin, int leftEnd, int rightBegin, int ri template __global__ void cudaQuickSort1stPhase(ArrayView arr, ArrayView aux, - const Function &Cmp, int elemPerBlock, int depth, + const Function &Cmp, int elemPerBlock, ArrayView tasks, ArrayView taskMapping, int *tasksAmount, ArrayView newTasks, int *newTasksCnt, @@ -55,12 +56,16 @@ __global__ void cudaQuickSort1stPhase(ArrayView arr, ArrayVi TASK &myTask = tasks[taskMapping[blockIdx.x]]; if (threadIdx.x == 0) - pivot = depth % 2 == 0 ? arr[myTask.partitionEnd - 1] : aux[myTask.partitionEnd - 1]; + pivot = pickPivot(myTask.depth %2 == 0? 
+ arr.getView(myTask.partitionBegin, myTask.partitionEnd ) : + aux.getView(myTask.partitionBegin, myTask.partitionEnd ), + Cmp + ); __syncthreads(); bool isLast; - if (depth % 2 == 0) + if (myTask.depth % 2 == 0) { isLast = cudaPartition( arr.getView(myTask.partitionBegin, myTask.partitionEnd), @@ -97,23 +102,23 @@ __global__ void cudaQuickSort1stPhase(ArrayView arr, ArrayVi return; writeNewTasks(leftBegin, leftEnd, rightBegin, rightEnd, - depth, + myTask.depth, newTasks, newTasksCnt, secondPhaseTasks, secondPhaseTasksCnt); } //----------------------------------------------------------- -template +template __global__ void cudaQuickSort2ndPhase(ArrayView arr, ArrayView aux, const Function &Cmp, ArrayView secondPhaseTasks) { - TASK & myTask = secondPhaseTasks[blockIdx.x]; + TASK &myTask = secondPhaseTasks[blockIdx.x]; auto arrView = arr.getView(myTask.partitionBegin, myTask.partitionEnd); auto auxView = aux.getView(myTask.partitionBegin, myTask.partitionEnd); - singleBlockQuickSort(arrView, auxView, Cmp, myTask.depth); + singleBlockQuickSort(arrView, auxView, Cmp, myTask.depth); } //----------------------------------------------------------- @@ -161,10 +166,10 @@ class QUICKSORT Array aux; Array cuda_tasks, cuda_newTasks, cuda_2ndPhaseTasks; - Array cuda_newTasksAmount, cuda_2ndPhaseTasksAmount; //is in reality 1 integer - int tasksAmount; //counter for Host == cuda_newTasksAmount - int totalTask; // cuda_newTasksAmount + cuda_2ndPhaseTasksAmount + + int tasksAmount; //counter for Host == cuda_newTasksAmount + int totalTask; // cuda_newTasksAmount + cuda_2ndPhaseTasksAmount Array cuda_blockToTaskMapping; Array cuda_blockToTaskMapping_Cnt; //is in reality 1 integer @@ -179,130 +184,21 @@ public: cuda_newTasksAmount(1), cuda_2ndPhaseTasksAmount(1), cuda_blockToTaskMapping(maxBlocks), cuda_blockToTaskMapping_Cnt(1) { - cuda_tasks.setElement(0, TASK(0, arr.getSize())); + cuda_tasks.setElement(0, TASK(0, arr.getSize(), 0)); totalTask = tasksAmount = 1; 
cuda_2ndPhaseTasksAmount = 0; } template - void sort(const Function &Cmp) - { - while (tasksAmount > 0 && totalTask < maxTasks) - { - int elemPerBlock = getElemPerBlock(); - int blocksCnt = initTasks(elemPerBlock); - - if (iteration % 2 == 0) - { - cudaPartition<<>>( - arr, Cmp, - aux.getView(), - cuda_blockToTaskMapping.getView(), - elemPerBlock, - cuda_tasks.getView(), cuda_newTasks.getView(), - cuda_newTasksAmount.getData(), - cuda_2ndPhaseTasks.getView(), cuda_2ndPhaseTasksAmount.getData()); - } - else - { - cudaPartition<<>>( - arr, Cmp, - aux.getView(), - cuda_blockToTaskMapping.getView(), - elemPerBlock, - cuda_newTasks.getView(), cuda_tasks.getView(), //swapped order to write back and forth without copying - cuda_newTasksAmount.getData(), - cuda_2ndPhaseTasks.getView(), cuda_2ndPhaseTasksAmount.getData()); - } - - tasksAmount = processNewTasks(); - - iteration++; - } - - _2ndPhase(Cmp); - - cudaDeviceSynchronize(); - } - - int getSetsNeeded() const - { - auto view = iteration % 2 == 0 ? 
cuda_tasks.getConstView() : cuda_newTasks.getConstView(); - auto fetch = [=] __cuda_callable__(int i) { - auto &task = view[i]; - int size = task.arrEnd - task.arrBegin; - return size / minElemPerBlock + (size % minElemPerBlock != 0); - }; - auto reduction = [] __cuda_callable__(int a, int b) { return a + b; }; - return TNL::Algorithms::Reduction::reduce(0, tasksAmount, reduction, fetch, 0); - } - - int getElemPerBlock() const - { - int setsNeeded = getSetsNeeded(); - - if (setsNeeded <= maxBlocks) - return minElemPerBlock; - - int setsPerBlock = setsNeeded / maxBlocks + 1; //+1 to spread out task of the last block - return setsPerBlock * minElemPerBlock; - } - - int initTasks(int elemPerBlock) - { - int threads = min(tasksAmount, threadsPerBlock); - int blocks = tasksAmount / threads + (tasksAmount % threads != 0); - cuda_blockToTaskMapping_Cnt = 0; - - if (iteration % 2 == 0) - { - cudaInitTask<<>>( - cuda_tasks.getView(), tasksAmount, elemPerBlock, - cuda_blockToTaskMapping_Cnt.getData(), - cuda_blockToTaskMapping.getView()); - } - else - { - cudaInitTask<<>>( - cuda_newTasks.getView(), tasksAmount, elemPerBlock, - cuda_blockToTaskMapping_Cnt.getData(), - cuda_blockToTaskMapping.getView()); - } - - cuda_newTasksAmount.setElement(0, 0); - return cuda_blockToTaskMapping_Cnt.getElement(0); - } - - int processNewTasks() - { - TNL::Algorithms::MultiDeviceMemoryOperations:: - copy(arr.getData(), aux.getData(), aux.getSize()); - - tasksAmount = cuda_newTasksAmount.getElement(0); - totalTask = tasksAmount + cuda_2ndPhaseTasksAmount.getElement(0); - return tasksAmount; - } - - template - void _2ndPhase(const Function &Cmp) - { - if (totalTask == 0) - return; - - TNL::Algorithms::MultiDeviceMemoryOperations:: - copy(cuda_2ndPhaseTasks.getData() + (totalTask - tasksAmount), - (iteration % 2 ? 
cuda_newTasks.getData() : cuda_tasks.getData()), - tasksAmount); - - int blocks = totalTask; - - int stackSize = 128, stackMem = stackSize * sizeof(int); - int bitonicMem = threadsPerBlock * 2 * sizeof(int); - int auxMem = stackMem + bitonicMem; - cudaQuickSort<<>>(arr, Cmp, aux.getView(), stackSize, cuda_2ndPhaseTasks.getView()); - } + void sort(const Function &cmp); }; +template +void QUICKSORT::sort(const Function &cmp) +{ + return; +} + //----------------------------------------------------------- template diff --git a/GPUSort/src/quicksort/quicksort_1Block.cuh b/GPUSort/src/quicksort/quicksort_1Block.cuh index 2ed5e94f6..f4e189b0a 100644 --- a/GPUSort/src/quicksort/quicksort_1Block.cuh +++ b/GPUSort/src/quicksort/quicksort_1Block.cuh @@ -1,8 +1,10 @@ #pragma once #include +#include "cassert" #include "../bitonicSort/bitonicSort.h" #include "../util/reduction.cuh" +#include "cudaPartition.cuh" using namespace TNL; using namespace TNL::Containers; @@ -116,7 +118,7 @@ __device__ void singleBlockQuickSort(ArrayView arr, } int smaller = 0, bigger = 0; - countElem(src, 0, size, smaller, bigger, pivot); + countElem(src, smaller, bigger, pivot); int smallerOffset = blockInclusivePrefixSum(smaller); int biggerOffset = blockInclusivePrefixSum(bigger); @@ -131,7 +133,7 @@ __device__ void singleBlockQuickSort(ArrayView arr, int destSmaller = 0 + smallerOffset - smaller; int destBigger = pivotEnd + (biggerOffset - bigger); - copyData(src, 0, size, dst, destSmaller, destBigger, pivot); + copyData(src, dst, destSmaller, destBigger, pivot); __syncthreads(); for (int i = pivotBegin + threadIdx.x; i < pivotEnd; i += blockDim.x) diff --git a/GPUSort/src/quicksort/sample/main b/GPUSort/src/quicksort/sample/main new file mode 100644 index 0000000000000000000000000000000000000000..027e47b12cf8e10e6d91754dbe5c516063da9351 GIT binary patch literal 217896 zcmb<-^>JfjWMpQ50%is~21W)31_euqfCGeLc;N|G!r;JQ!NAMlz@W;Y!obGBz`(-5 
zzyMQ+PTydH7yzR=AY2BJX&`+PAO-^i13E1ORR^O%ZUPB`XplY-8yhZAg^0pv1_1~k zq>mNEGyo9{3=C*=;&O;Mj7HW6w(kK%o&k-PPX?LHzyPC>^?|}>fhQyY7DS@bD?pMU zBe_5X0|QJSNEJxR!W+b5V6ccrr4=At1{e(sPmmiy*a8}!=(GaV8W@ePF9NC$opyoh zgV7*6KtjP!OHx2=bbDZYm_0BWW?ukQpTa*#xS`V}JRs8<7+^HW4vqD$sao&ds$54+r+)OC$2BKFgui|_nZbuJ;)4@ zogg(JgF*VCQ4HdO^@lPrfb%5CsRj%a859}Bm_TCT3=9kr3=9kxQqSe8=JGgK^1R`7 zdH8~F+Mncf(HT6>U;}Uvo7ou{gs{lzGh-KD%7I;6gd4l~WJc`bD;cnhpJ&4^-i|~4 zG#2dYMRAA=;_$CH4u9FOVmIFsNBGRbVg5B7;n0LboEJxUp21=7G*0a9Y{TK6R2=@z zU4*$yIP`@2VymR8Pw-AT)r+<48X-INV=`Bi`@fNYC6j;;RFP`wMW0+u}%v zB{<^!4i5J(#}SWTairV1INT3v=V7aNKy@Ijt$?LtFz# zx{?L8jgcJ)W$WM&|BWMlqi}?eGCTHkD2F3Gf5wrnB)J(Fl%Q@yrYyncD=;ibgtRgQ zKmpIdz@W&J3z%j?G=#J7e z^BLkjLww^?Qj1d4GmA@7i$Z*zbMo_2LlTp6QW@go({uCl;)_cXi%R0-Axg0-OfF4H z3@#~3P0S6h%uCKF%FoNJ0%-?{d*)@9_~xgS=A<%!xNb$M5QckdiBDp2iEB|&ei2y4 zH!&wCKbavu9;7S?>K?b!yyTM1{5-G<_tX-X)UwQEhylK-xyc2UQ1zjCFo(G%mN;eR zB^JSC0`d!-6LWH$^YhX&(@TpIkxc_z9$ZqCnU{`gK|oP{L26M+W@<6ie7JjHrn}~) zK&1joi!(57K=TGn8_XK8&3=iw5U=6si{VrZD0kysR8Qk0lkQtX-R8l32v?CNTsoS$2eSd{7-;GN769~E3; zWDp-;l2Mdj9-ot%mtK+)p9%_Th-^q$aEWDVMRIBZI5gs;{DMmijf+b2N-}d(AqriL zT+2XCA>AdJ6|T7q@lhe3nP7M16ytVu5Xf4H|MH8$>Blo9KExp2HP}2p$lDL6-N7dD z!KU#chVibh4DnGZIRRxL8!s%gifIEh=%%4e|~)i3cUUip2Qz(!`>a_{8MG(#)b%^2~KM zboB=t7zOcgfH#@Z?ra1~dO_a7#^^q@fW$B)1w*45XV5!CgV`Hr>MXx*cYT1qG>jDIl+s7vALOD`!KHt=IyX$ReV^2V?-m z*%q1k#qmjr#i=Hs@)@4CNiKH4ISeKJxCI602ggHl5Xr{Df)_cFg9|ZGffD9tXaFu1 z6O%H_3=KggWKLpnaePT-L27(vURplL(%=%11XxdeQep}uIe?OM2a@mQQ0pNwHb zd~!u%d{Sz9W?p=9Vo7oa$YxKBL`r&m%z@UUi6yBOnI#Olskz0eB@9K0 zkhWb#VtiUAD1(68vN@SapyUl|3g;$f<}rXQVgL!2rX&`XfJPt<4fPB_BE=;s$=cc= z5i<}sJvlkP7{oIK@sdF-GX{5O=Xhg10|s{=PbcSiBRvy1%Sg`@!j6cDH`X)JGqhlE zj)-t{@{BjsGuE?2kux$w<|3&>7S=N|1oaG<7+4sX7+4vY7}ywC7}yw?7?>EC!6XX< zGgt&92NPpvU}sDlU zU7+z-Mh1H%`3JhUnHd<^z+xa3j0{W+c~E)K)PhK6CNBd6Cqn~NtW+wKlYxPip$l#v z8$%#e4x~Th$8;tJ1}=taP_fF3>$w>iI2e{f`DI{vc7{z*eu?HJ4$#cYK`8(7`fOIv zX!BVpzYwgRiQxv+KG^&NY#g4!3S=t-!wTrU1Wf#lD@1$)nz+9^M0^LD_yOoV(Frv1 
z6VQI)1vK%7^AK}xpot$!frvjq6aN4e|9~cb18V*cH1ShV^BJJ>FnJaP`Aeg~TPC#d)ZH1UM@ z5cLnx#1BBlKcI;#e1NFufcg(+Zvs?Y0Zn`YRNMefdt(WVTKAMafk|tOaqcQXwD5H%D~WpBo3+2@ z1VQ-^Nn8v`Tmng497$XONn8?1Tmwm53Q61mNn9F9+yY4)wiW>-?SLc>8WRSIfv^XX zxEx3TiUW|u<)LCADgsFy)b0R@fp7wnxFSdZiZhVJk;iunki?atYC%*5lDG;;0E!!s z#Gy+jz%m_3;%Zg9M;> z2a-5wj2$Y>Z~#eM7beKSz;FUdTn|b70+P5slK2fIaZtMrCiMVG9NeshO1wZ4hpYsE zNPIvNH-QL&$sb7KreF~$0WDWS>E8@01fn>Q#Gy?qkT8P)k~p;K1{Rk<61N76KnMjS zaa)KGnAAWLw*!kn2m>T>dx#L2v_KMvt?dCxI3S5Tf&`%014-NoDh8qgki?M}CPW~K zLz|u;0!iEpNxT6`+#5-}14-NmNqhp5 zcmR_43?%VDB=H4E;=xGbqs-9|7!85Z5Eu=C(GVC7fzc2c4S~@R7|0O#%rE!LqxlVo zM>p$1eI^Ev)&nI>|1WqnAK^F*cGiDWD_tgrPybb|b(t9WE%T@A2iVN>E%HE%W^A2hJ?>E%K=A2guy>E%Q?A2g8i>189F59%a+ zdRYkPg9c7My-bAjK?5eAUIxPXpn;N4FCF20&;ZG&mxgdYXkg^iOGP*zG$8Wnr68OS z8hHEkk`c}a4UBwx`SBmjzn}q;PcI+B`JjQ2PcJXR`Je%iPcIL``JjQ1PcJvZ`Je%h zPcIk3`JjQ0PcJ9J`Je%gPcIwce9*wgr180C4;t9`^wJT| z2MuU^dT9vfg9b7_y;OwrK?4|{UJAnbpn;1|FB#!{(169KmmmMa{0|za`1JB2oDUkH z`1JB3oDUk9`1JB1oDUk1`1EokoDUj^`1EoioDUj+`1EojoDUj!`1G<7&Ib)Re0o_3 z=Ys|sKD|tY^Fad)pI!#S`JjP?PcI$ee9(Zxr7}(@RA-A2fjQ>7^i?4;nc5 z^pX+I2Mri}din7W%>ST)f=@3W!ug;9f=@3m!ugW`JjP;PcIqae9(ZvrA5!A)F5y0QmIs zBAgHE_ee2g!4fi{!cF(Vf;`3RsU*%N*@;n zaQXA;zv@;ERQ_5t{!%plTr~bvG=47{zZH#Ni^eZS!R^h(fG1xd{Hz$FB+c}jsI62HN3x~@!z8HpQ7>aqVcby@z0|1kD~GSqVcz) z@zKNpRkipGycvb!pfsp6jYvhbg~F`vs#1H%<$;tISQ4M0ts}xet<~JXfQD}A7Jcc71m&4 z=w=0#Yt26x`CDf)Ffeqp{!?dS@R;GzD+1Nb3MzLzjvkn3bkRA*wCfExUrwQr8Q z{sA?&yIucu9)I!fH&VnkzfnM!KSw}*9Rc}1RCx!F&JUiQKYcpS`E*|Q=sfPxS^B`E zv-X8YXXp)&PTvbYovtTP+~7}rPCMH3@#)yI&0rF z*S=vWW#0jc{MX_L%dB1B)UsYo{qy>Bx9cC2Ky&De0N2{-`@p5s^^Q-c?*)XNoyT9~f?^Mxm^*8Kbe4YU4E@mU`o-Gy z1OJ=@D6Zh>to_qj`lCDa2Po!_?FYI4#o_<||DQZO0jA%hm-W>D|Nl|+DR^}L@aX&q zar6UFwoCEoW>E#TfIT{0e;6L{=ym-D%B~=DYC&9gEe3{NAb)yv-UIP3gQR<1!Ai?P>`u5+888>DicuSE z;{uOf-whtUAWP1HG(Q6iyIuerxDR6L@fUeIARAp5cyxxY@aP56pn_(FA``=lr(mf= z9^I@b6+p$=F^|scFYJE&|L@Tqy1=8`b%h5g2_Jhg;otxN2(7swt*hW#`w&{0fBgTy z8*DGwQO93gQUN&>;vEr928PZX9^Jk#QarSIz_C*j@j4I`!X98>f&()^1LWEpAn&OD 
zgd{X1*(MbRhS%%}E#Pq4jnMMx$N&E?&VC1ZAod5=0>eQWwZQ1LHB)9{cyZ}JNDr%) zGAJ01zc}^n|9{kCMF3o?gA1MGFK&NFP8%N0M;bgjYiD$pPU#Gt;L+*Z;nV5bfSg!< z7#=w8x&$0B-L5l0e*3}7z|bAK1XO@`x?H``?7D=p+jR-3*bSZH(HU|SMPveoNC#ZR zquaFsWZ+gz13h|KA2NVKlQmk2iQ%~G4p5nR+;tCxIsl=LK&TTC>I{g2q$*HiDLvxR z>$(GEeB}54|6336Pd(6bptQ2Nb_qkNqepk`29NI26&}5z7a(pw_<-4`mxsxt+xLh| zx9b^??$90Gt_Q-yJUXv+9`fk51wveOk@RI#G@OO-HyMgVgbeW7HDjP z6?^ou&H-iDPS#@zpvXavfa9(gK=E+g^$LW#0io`ID34y>10KDhJ3M+zuXyyjo&dRi z&sUVl*#e2210LO_J3M+rA3)rNh@2}f-L7{$xSU%*a7%71Ocg4t>#iuJibd8qjD8?V{l` z6Sin@1(&O=?(&pIgXtG4M8kGX28I{k7)gqTW{{pkP(7d^MXty8f1*M(AeDZsYH~~r zAWJ|gFY6nq*mV7o#Q=ehJ3(2)amRm9aoWqvJO@<#i~gAnqD=3A%jx4Ud_VpF@6mij z0b~e7TNVSF#vq7BZ!l$A4Am$?SmQ#l4$)a)%JdPqm3sWe?T-)}k=u~Hy!U>BYBf>2 z)nM1TOM!YMw(S4^|9_$P1zh}t+8m&YU&Eub_6Bk(0IK}EUBQ*qT5t(_$fMWwxku+U zP?)S@fV3t+)f~JvVfpd@e^BKIs+B;-f(j*&|6?Jod}xCf%4>dO0G97O;nH~!)SU7F zx6M9ybh_T?^nKvd?FuR{Z*1k5%M0T8NUytbw8+3doc&KNe6OX zuPCoDNVn?`k4}LJv4_E<=*U3_>HzU?W3bUG2w?+Ku>-Om8Wc$FOsFVwvj8O2?fL^%QD^CwZr3j!Gf+jrl`ldFl&mZ~I%`jK zmL5S%S>3Kj4!)4@aJ}D9d!M0%z1#K3_ZzR7yIqfTx<2T3eXtKSaNl|S#X$xJhECTf z9=)#MCLza%|Nr^7h2HOWeZqL5^O#3(=sk~aQ1x(e7ii=blrUA_g3>!gdFDHaa)=sm zx>frC>e3zT22~hOI*)mDo`3O71S2UszqUa%zrZ%jff8CLI5QN#N3__$mLGpH;SI=C zux#l2|NmbYfK-FzxcL+BX%C!J-J3;C{ctF&vfkb*u{h$iOAzVi&mjlEF ztA7Di|Lrv>`IyQ<6+DM)b zI}q1sfD@`%dh_bN6uf*LD_cSD|ikP zhB_G1BLd4VK*)Xt1xdFnO1*OsBn@&ST6P2Xvt9qp{9gN`^I$iWaRJ=I=ihdK@dUQA z6Ei=2euX`CW2&6_64FpOi73ZGAs*q;S-ZodvvdQfbBYn_p`gy~0cf?F3o5d^LA?@J zm{1Ey2%IW*Koxm_rXxLiK`M^FXa^0=fx7vK=8)=ZIZ%-U3BLoN&`tUS58bV={{Me5 z0pidd9w-5Zr%YH6(eQ(OpG^k!t_k@p57ab<`D`s{)`yVKI>3z|aLIf7<^TULw!HvH z0h-T1u^<5+p8}7E9Di{V)cx-j)!+tKjW0Yp1zzO5`2Qafa8PyTAmbp#ehoMXkjry$ z^QD(ndm;k^Qc4Q;XnbP<9(mn+NfVaCj9^ZA6Zxv)EfgE=h55h@c;jRhzW^c z6IvBOiK=%hNZg~>^zsBq>E$Q@DtcQPz>>WXHKK>WlHhWlLl7+g02GzIQ$aR*^!9>P zL+XJJKCt`&uzW8>nP@9Q{|Sis3&8TB4IaI{V5Jbv-Vn_VV98#H8c`pF=5C1Q1h9PQ z0*~Hauu_O-K8R)quw*Ypji@|AvmZpW0!Y3a92FkDy zuz-Ee2a$gOipt*510ZjMltMI5hED0*-@4qN`2C79`AtJ|L6g~lS!KQZ}e~|>? 
zf^F$M{vsH{g}CPU3l|6%?1#?dFH9j^h+mJtP=j#6!P9yCg*b!@3C80uI3Qeb$afxp z@$)gnUQm2?9)Ix)!Ue}`=kXV}AzW}8=sf=741^1g)z0HD_JX;PVV8{%_5x5*vJ}Dw z7b`QtY|$bPP%df(C*n>4k8Usvnic~DKx2-G(we`ei;;ohWdkE*0vB9H!NbY(@&Erw z17EGM05e1rhlQLZnm8;78PUXHVfpeAs`;=$y@)0b3*DV);;>*|h$apT?@lyvNY*(1 zq7Y3S76XxJ;;^W2L=%U_i6*KzByvR11i`WN{~?M`AyvhzhoFK8Tvo1s^8f#fE>QHL z)BvDz65KxMEWLtWN`d=+$6vHQ`TxJ$^-AaQ7uJtJ#vSy4H4q?~Ve8}n|0lqj11}zd zI4HF>O63Hu+z*4z0Xs_q-M26MOT14s#R z4$>|6t^^ewrWvfDw%Yv{KORBaYMz}(JUZ{c=zQ@1zfb1_P`3LAc5d?#foR7VL@Ry= zsB?+nf+}!G#}zueaIN$Bi?`oUhDM;hIgjI@F$|PhC6ElLO#_vsJBX-9r9ec{o0 z0m+Ka+8>|!1zo>@+T@@41zf*;<`;yFlpJ>et;PZwiM1DtY6f`f5Y#^a_d5bO&^w$U zhk!ci-2xzsJh~Y?EDsg4d33vec+CP5L6`;#dr%8&g-2&8$|&Xw(0KG7kT*S!yB+}3 z-K^(8WmTu^3Gkq0ukRj@-p~W^NrcY(ptu4hn-fSA4hy+JaRi!wIR3(sn*p>k4m?C$ zVhbCZECgjN=wyIANCi0Qf$UfT?IF#70Llo^k<3f?|Nnn&0Ml#&(%kL4z@s-5Sw}9i zjs+lxK`qDtIR_<6xxKbWwD-Vcr3D}xgwX_}kql7=83Jk7EP3$%|BHH%9B3*8TpS?c zulbDtaybB*W%>e2kZ13J%7HH)-Jw5HJhVk1Gaali%LkH1jhVgL=a zpqLMCMG+~#1l00HYJ-9jJhFab6o9XLRJa`yXKJyE(nnJsS&k<@s3&B8M1BE%Lr-kmaECMcf<3tOY#y20B7#KSD z%AhqEe;6L%-^Ti!lZm0_0RNN&EeA^6J$h}gb22f!_;n9HGz;p(*>-^hvq6=AukAJv z`{!*?dq7l*4Loj@%?YAxK`whS{|-o|6=WHx>)C7T2U7nMG?Um0wZ^%7DpfM^2=c=S$a0S(UfA{oL7GUOZw14DP~l7IjIzn1VomVM2^#PEWNlYs#=Htq{b zMbP;&))yb{prj)BjB2Ov8&Di|hkk)XTzBY)&H!+E&>i}wGXOj&cHDvI`})8CLF$e> z2q2gupy4-&s04y317ac-KA_Sh!=tlyg-7Q_kIoMsoxdOxt`j^uLuVjo1@P#o?+TA@ z*9jiop)))>89h1!K@M2q(Hj9Ov{ry}i7y9e{0cOeu?N(?2Ze3t{TCon@Wf-UhXzFW zkVmiS`c_bL^Zbj4pil?p#a<5wgzQA9>|um#1VXkLDhoQC7ix0BaaZu{mq%|X!tV1g zF0wO#((($A-i{6i1_tbk)*uwE01f?u`@Pe4>-~a!|T_I&RxVi1R2O4x`Q12gaSODGM4Kk~H z0*D7sQt`_Aq6(N^Y{zUp~Z-nD@a%KfdY_m+c5MrABX@+_+!}9e82%DVTMIQ10*4f zMFLz@1+jq=4!D(a{T8_Ui)e3`u)bJ)6C{S#afP;2x?P_%zhLZi{nA|fAi1ovJM=;4 z1&_=F@~y7}Jr2H5IQW7!`$S6T1g1!M(iP!1#uDpGrS`61FA9Zb!+UzlDuHrGDjEVJ%*eE_oO zPq*uzUJpi4*WaTzfDvroy?%swe>{3c&w=?Opp5pSL*O-Ex9fxE0|ww>k{2DIk#tx= z49d_RpfPBV&e8)OouNC>GVw9j`wXz*Wy6>M|F;|{VLRq}pYe4xShPD7HtNE24OB$B z-u37Xg$>R8yb2Nuz30*EdKWr2!^_XW0G{W+3yQKA*Fg#(gVA@c|Nq}|yMzT~IB3wD 
zzXf!nwMRFkZC(d5z4Nd~H)JL{AH-$6;Q^k@g{a^GS!U__piB~`=+~A1|2-^SAC$9$ z$K7AEg4e>d94IjayRqAK2RuXN-vG62j=yLH^^?J&bOP$&qgVg`f2|9e#fCTqT&{nB zDi6a@J`Lmwl-3993n!2-F-E&F$Bn^LI4|ICiUv8q^Z1L8pzbr+O+TP+GK5%yytV;8 zTZCoIxZ70%+)PyHX0SX|YwFYOdjT@}3!2moec{nv3Z6Ui=(XL~$iNW(zf=J-jRcCJ zHjw4bM-FC+vR z7(ktiURQ9h;yY+qqnp8}^AWUoZ3Z<{K+~4sZU%BY4Pqp8SWFEx5Y!2B3kZW+#3j<; z?Am$!g$76t93MYG&34eLH&Cq$>u}&{z3~V#FrWl1s7v(&thw{}i!~6BBD)DiehvZo z9`3vWxp|k>U+@qA~+%QWU%;+beqUE2>$Y$6ss%O`hS2(`}&UXSeH*eV|4(D8p^N z1ZvtL{DGdrdqthmY$5F8UWg-Lc>u}9-@jnExJwl5J!BU^!W>jHfqaDSVrC2%gQl(;M2?NP|LvJxMLZ(q0DPl3+h;kGS`79 zQxh;{8(7D{@PZXIsts8@@B$QQMxvl*%eB4%0mpb z02#_83Q2O{jM@!uEI;T5ErLDxfYpN$R6pMUmjc&cxPpQRltK`53al?;E`lNsBaFc* z6{V2e53&c;tp^7vxD6qpFKg%yI}gBw=g z5PWcZ>KTF$Zbn^0@WHJo5zyuyM6m-JzLx+6W#>4?=2sW5LFyo53mz9>&c{eMpjhra{(?;mWD_Xy{qX4a0QH|BwFY?9{Wr)Y zs6uy$H<8Q%*GOo&4>ZOGTJi=huwOt7_~6lv+P|D|4lONXH_P>fM{fWJl6f<5nCF1a zJn-`P&d@g=y`djGdO;NrILt~#898QVLKr7TMFFeki1i1Z&ZWfGyx}SqSQI^}3$e2Oiq!JpQ5tDH3anEZBn4`s_J%${ zGH^1)Kw%F^Vu2Yn52PAw?eQ1uMHm=fmVg@@Xes{d8Ca5mr&w(DY_F)xdvMVXn!5v+ zXoza!1)^dDw{kK-r6(wfy?|73H+aBZV6dEzC`u(=4RRy684fOY!$hHVHh2w48z>3F z8%MB|2#=Z0(j%Rr2hifCx%L1EeB z9y2@+K49_ytuO6$z0i3Job`^sI0VWi-~~~jY>K?LnDxbx)35+Tb`dCt-vG_>gY$5& zX;cL$pB;Z8238Fj%-jH3?m8bdh*S!i{)acsv&6yu@^04~kPs_41L}{1*R6omfvTk9 z@BjZ>y1wC`1JZ2{wdH#`#Fh=9l+<~+I}|ira{*$87|6_HpkZyLGyngC68r1TP;G~y z+A2ZX;KS?i_y@IhI!i&_%O4)y0^nsw;7|hBS)dkBClmG&oYDoIp>r@IWDY|q+k}^| z7#J9ixn5v|)~Oc)JbGPM1bFna`V>Q|YQu6+tqNMr>(R@r4{~9z=ngPt`V&mqN>?y2 zys!|5*QyIZOFctDLqaRS`wF^U7j!d#^PB4ikKR)7%J1zU<-MVxIsXeDkbvQr0M*!_ z<;8P4q1yqx!9tK&Isw)JSzW$IoPpuBF({rv;}*~X;pHGf^yT7{K)N7>@L@=(qKpaj zpF~eIphgp9?!coHRGov$EZ-jvIu$a%2%bTA>1;jo7c@80djO=Q8??T;bq9ng z;nCT;0m4-9=xkj9VQP4Ewk|*$N(C34kV(ewsbDRT2+@!PMMx`nma%&(*Z_zGKSTmH zop|dQsCVD`A7n%4@fX>kFzuZR=63gjECbEFgWC+?+HCd-uyXJuWv}V$Qb<;51Fe}n z3|D#lg$t^wOOJzw4ftEYZIa$ru-%{`>R!_|P!pVwgZlP|Ji1%KfdQIdy8c2BB-lL_ zEYb_{Bg7BFlHk%_-vh<0La15aL6(C?17?#zf+)&Epvq5xWI(-INZ>=P-V74# zo(hhRUPvfHtezo(&FXI@kbtNGX#-mg4tR)J;vhk&)x7}%9^I_0pkf~)S#|9H|Cf)! 
zwH$J|oP-*v4blsC95@spdc;72y`~$XYS=+suo`fDLDYOY3W_Mxllc&bJqB^XYQSEC zs5u7`>^04SnzJ9mO@wkcfVg14K&G(4!RQSV>@{_Rs_6l7!D^8Fs}2?HW|e)%$N+7Q zPXY<{wn7rlRB#G_1gIsFMCb7rvkrq2!f}^waOUvn24@wIZb-)IhGe7eR#2w$1T}7t zzsLd&^FR&mJpSU;;s5_%ECr1gLPNas_zSfops;|(;_(;DK~})h)A1KG4uht(kZW-M zmTm?HhL?>hgU9CGu>|NpP$K_v*d_=5PT^Z1Lv!zi=WkjU&j{$eGXI3%EszfeRI zhXnHR7n~s1K?*%k{JdBRay)oO3uGs_)WRHM!6O2%fk1UUs76I=GlQB?$6ve!w;4c- zqalj|?}J4q;8XCe27I*-2~EbGyYG*kj|9%zgceQeXC*Ea1LBg2d3 zmq7^wX3<}$^3Fb4vTP*JmfGRZbR32!g z0W_!qn$`mka7@6}JA!nAe|UiEZBY9hTuj@6T6w5CK>dAaV=5H1n-$ig1^0+LT|rA> zL0t@}6xft1kek6HWTBuDESQd&5a9mWMA*lOqIH)uHlk_%3PhE2dtT5!917nlp`LB8;SH)D5# ztO6$vl$-^cZ=2!KSvmo&spbLRM&`P~qdOGjt_{%k(o11bx$L?FGG^@yl>&!43%DWe zdcdRG7bHEwqr3EgM>nMHBn55}xt{Q7u06p3+KcT9ZhHrVgnJ`cUbcc_0c2AzXh$G8 z;e~?be3yIlf?9Q@;6|Hg=Lyi%V6W(v43IO9zlZ`U>aGRt`2*Pr+D`;_lH*QLzsnP{ zPvJdCQK#z)kK?Z3xiC;;6V!_S4{qp1ut1g@9DmUZR(Au`B~S6tmVwNwqvSkrD>4nL ztn>Jbrw9K3e{o0rL#vbF9*bV2eUx#GW`VR*=mEFw-B@pqxpyeBuRrNm*KlqChr4v{^2PQ z)bNF+bI^tg(7{KlpusPXUJnrv-R&UYkqpX=klpB@`v3fkAh7GfogY@g;ZFub?}T4+c~EIb0GB;sR1 zKMmr%DzNjQv2Y3OAW>_G^K!sE+od4q<$^*9do1wor9v$10hte~2@swCA6Q}mTE%xe z2q1Mlh@MfZ!8fDSmBWBGqhtaa?}N-JA*V5LZUjxqpe2N3uHRvO(C?r==nGI@Jdpwk z=Cv81Q~)|;$)lH7862>pCXiq*1yi=uGZ`3Ouz?~GnhHF+OCNwso?T#VoyT7k@B07$ zMc?lK|2q%u1D&Vj(QEr5gMs0N?M+A>kCHjcc7vh?UW$N1uCw$5TK5`M0f6g6WWRuV zQ=qMVU%(R_aVZ%v-Sr{`vSs;c?r~3!tMT0(EfJV9yc?P)1b>94?ybC z8~oEjTh+nq7Q0`8dS)d&9=*04*$fOXymx?{=L)I_K+c9QW>JPa8{tN@Hje{jive67 z=18RWrt1rjZqV5Z;EL%O$Tg7BwHL5j5!+B~54aWvhx1+nuIL8!u0XS093Gt)JTy;u z9Q?uL0qXTY#6X!0GM;@4Y&WR6bo_-5XaotS1P8`T(CMZr(Mg%ZswNNJ)){B8BbYKw-F3TbIfAE0JYeDwb!4!Z37qS@ubZ7;% zvYG;Rj0CJv)C%S@cvv3hpLzhTB@cBUXmYa`viQgYvKk4ga6za+ot=ZXB$0CtXjL63 z=YSo10W!38!J`v2`W*TKY4lkN6r12hk*K5DhqfclIwxb{UHI68*w+95Uj%LiS02y7POwc*XJB~2a2Zxaqf{T}TR}+?BNc#a zUW6#5>IKJIXXqWwNCOph4?H>rxFZe@bIVC>zlcUljW@PRvIC469B9ZFxR!}T*X zYQBJy&1aCc8nF;-BfwD&sZ@ELA=VxNQ>L{LYwv-qWr9XEs9%5*TpU|K!G)CS!L0=1 
z+wA9|jpCPaAn!xshj%mBW>Fi6JMPAVlZZB`ov;qHToaN+zpchz{Z05r5?S!F%V;)f)k+WF^IAE;~5xUtOV_%h8YVQpa(VbLDLwF;LgfcP=5_= z0*-$(n#U8syR$$AGV}x?&G4M{KX-VLqRP! zP-7l>>N{~0nrlJP1fJdl9mCTJTG|g<@N@jdT~Ldm^B8ooKgbwRrn?0aMP2Cwum44_ zg7O-;pmYFDNB_Tsb+iJbKJx&bECHUw{{de2D%uJ$=Yt2Ry8>Bs1Fl*Hkg8V5L|Op$ zLqA>^9_V&mVePtxzmJm%v@moBsCiGsNh;m0J3w1pS7^Jg=>%;!$U6^8RTo}_gXt}x zlT$o;UAI8nDT$zIAlD;MO-DKpc<`@3)E&CQBl%Ld?;4NflOCF#JKs&2Hb9D!QtliXCOC&nwpR~p)G&^|A!nLrMB_^ z{}&tAgUjb!(6WVITkli`h8GEE|NnpO0xDTNJCC&7Mrf@B&zpe9XjW~2?>0eJ5spyN zu>sOW0JUe~{paBI;6gM2ymSd<1886sG;6a0WPj)02=J80)&MNaBoO1u^VWhI%dH?q z-3%VRkWuaCLjs`Q_4OAF8^E$)lb~{taCTn@auC9p_O&2B@&I&+#%o28qaY*9kiqif zFXBPgK}LqRtpET2MgKa41t?7)c*_Hv<3QtZpjIq6L4g)c>}b}1hhyKx-7Kw_zM@*Rwei#J7|3YZj?XpIQW3sgYg2Wh5-%gHrM`ODB%Q` z@7)a0b+?3C|Jc$-C%AF~jV-~(Ykz?Qr35s(3$CKUi?+bSs$i}le7OR+YMln&XMD`3 z^Z5%=cwoS0A#|a`#^4!Os3UQwFIc#O+v}jy9YA3WjtpLK;J{mNj-eo*G`@kX$J$!~ z>XdYDEdWzfGybEljk;lYz@xhrB6OfWFWU$B~kM6BtbHHc9T?7vp zAAbQ}UDDkOmgqeGLJZW10%?YvUb<(d`RT0U9k{0M~!~#Y<3Y5;SJpT)Tpy#ORo72j~nZ(CKopS(KL` zi+e*mAhRZ58yO%GCOeM%>kbXm&gbzJ|6gVzVhgfy@6x67_8D9YzcT6VB4zy z|6fd8`TxI1;}K8{Mmxqh#v=E}klNfJva<( zz}#b?a?PjnAvCptSFJ$@LMM1MA29$|Z}3!)QVxUWp;`}=NI?q#&>lFC?jR0mTVN?a z>Uc!+n*!tmAUryK7x;9#&hY8P|uQLk3>1 zf&33X`_Oj-*jR}0@n4|+pX&;6I|`g7KsgPZ{G`CcA)zxoy1_f1c0iYeNP~qzZF2_+ z&`u4AVIa4K&Ollwb_lZC`h-V!DX8~j0-4odeZjRHtyv8z;Xx^-8+`1nyD+G-gU10R z4S))CY=e(4Rxbm$f*e8Fw%4{chJoQl6lenYxa$LOlM%Fj3f3^IPT zoF2Wl3!@nrUerM>`v6KT0w9xCL)3uo9`oq6tp%xx0?p4OK|6lxG0`?O$0MAA- zFuYiG2-GYCH7$_WxSi;=t9&O2Dt(pyRILQ+}8Ytj@fU4A1J#avR{`ch~Ihq#T8IR zhb{X}SOD?_QY?T*kr7dbbjGtsx2pssu_%BV|CWb}@y!Otd85w;t^+L@1UUiGl<}ZW z;P7u_X#T+1a-f9JcvmC1G(rSVwg)tL@?{}QS22bq3+ID^2fcK9ftIEqgQ#yn>*K*A zqu`lKJ`c#y;b(8qh$QH`5|3V9%}`M3FIwdTqD&`)dA4_b7#Loh1~t&ZLx-iHnT{_z zLFcOP0-b0Es@8VS{r~^Pm3iQz5Oi~hN3X3JNLTY7=-C6P^B}wCft(GCRnUmUz^)_B zQ2_ZDvUUz-9ck_+P^Ac8<~C^#mZ5`<|kRY~oq^IA3_sXKJBh`eO zwH~Am)J#BLN9qLy+=Sz5wN7u9IWd|8=(e12I&Pm4rv|fIgntlX(m+7eh?R|25BAX zDv)5WX`?H|VY5M8uo@&!wt)nDO;w=gltZ|ZP;Lf@3-$|Si6hv#%pk#D(|?{2V_iU8 
zuo@))-k$|(fq~|7t{#T_(+i{$y4(|GRP7^3HB_SW_=}*KAk8GKBUJ*8rXln<&HVrW zMJ#BP5E|K?$6wq7nF(7BeEdZ`gbVT2@fQIz(TDNyt|Of|3;Q}!&Y9TPktU*vLjvmf zi<>i0)x(@~a0Vyp^Yj<1aRV+BKbr_k-3bzUTu5NasP1Zf1|}U=GA-5-&VDJp?>@SzpP3 zVw-jTQAW^>A`+mIyOVV$gl7%mO+Ly9I^ZAMDljbTPC&h1&_WE*wu%=Z#s`ny&@V`{ z&GNF?PUZ-kiCo6Q+Lwr$A0re{#y>&lG=c`|?!R~k@(*b4y*C21=?mN+S_Tdf@Q@C> zEci@TG`~RFO34(gZTiw!yS@CJ3$QCd{srZZ^AAILGFln&cT2ww722A(*E z)!neL0}oxIj!vPZfg3XT(m;?jR#`}#fcwp>!2Ue`VhgG{pv(vZqm)0Odsv_a5NI+TW|so! z+yQXH2D?)b%smJy4!|Q}-~_`3au0kk3|#Ud(lzceySp+deL9efaPC=zo_moD+93*d z4S3*gC&*&(YOo6k*K7j`c7u)s2hHOmj|{NBI5q`K%0x|j&_45auqmJ^3kHu)P@fQ- zuOFWW9T5%c0R89)04+m8_9v)U058rF!3k20IwSA_+&zS?=k5iKK!I!pjbec&9DG5? zD_`*FE(MLkfvtTGnuI)10y)qu0h~1uOSyl5+P>?bgX$nq*A&*}_5`iCwRC+@g0eiJ z9F$;mP(__DcmhV8(11~c@I>CdP;8c_?&cFcQ^~xs= zsyRWa2Qqi61L1?B0<_;9QvE(#1PblrFF^NS!4`~Lg7YY7m>7IHhy<7m%Cg|i58qKU z^c?V{Cn&pvUGP$jfx)-)gHPv6@Yx#RdgG`Rc-!fX*WRGu0tNideo*2CW$xoIwu8n} z!Fm0{F7T{1sGdCkfQeL6UfD2$)BMQlhJ=L2D+V zLXaG00#ODUKnI7S8|chbP_x_>meC%7L_iaMy&}wzsar^K0d@w$;?Nf!&4&a)YYQQ3 z-(bNo@Ey-GP)dU~woy8sUqA!1h(@_&A4(T-)dUb1cgOPaDH2h_kUkY4m~2?YuEntrf`guWYy3s!^F@iYVp_L_2-L)0jNxL`F% z9Zz16V6W+Fs5$?7K#^v;0Lpz2;)4AG=@Wtd(ghOiHEo5eISt~1)gbvd4Jz2p8nYGd z&x;^ISi*rU^?)qrs6dkFJpSTl7pQC_q2sv{JX8W7^5E?L|Nq641t6cm2Uk+N!CZ(F zkH2^ZvI4Pz@p>0Y9Z!iK>>bYuU8o&Na7qKMc}Ek6xgr5g91_UKUwD9A2Ps}b@$=#Z z$nmIUEL!auhB}@MErF3sUsx^e0Xgv%+}1$aF5v;&FM;Tfz~?$aZD8=YH)yjMazo%B zXt6ouxG|J*O|*s=betU2)PPJee!)6c4sK!{bN$Z~FNxK15 zYhx>EyC8#fu!OL_1DpzBgK-X^c*V0-ya1FAU>U~r05l&*Z$>L=B~eYS>cmyjHbPC1 z=mcdjv>`rbkRY~__Qfr1#hn|}tTQ0X!Py^@dcj3RI!F*&+(AlOO{ns@AQ|*Rr~@R3 zt)$JxYV~O|NP!jz(gwB~lA*v>zi9^tF*w8_CGD*Z@Dj+s18Ye;4{9I-)IgLi^sm7N znzljJ+yQaH9zZH-kAnnzP3IXx9JT|*1*<_SX%~Y8drcjo=1hTb&7j;?5Etwhq+nD4 z3HF*wL)9dLxL`F%{{7PijZ4-~>*4-P2dRW59OROg4@si)_>25jkY*A}TKiex&;{>7 z?>zoue(V4LFRCEpiQo|LJpST0$V_Nn09|ti;X-_M{6%gnNhR&NHtZ#>S}XRFwhm1k z5>Urqd}~2f4|C4F7El0y8-Sqrc~J)nO_Y*$;G1!>p!LYm07YrWu|OnX0UreN3u2f3 zV~|(y6fVMGc}S(%Ynp5fNxJ{mp%pHBn!zy+E3#x;aTP8qP!o28oB>Y!NaN(oK!Vr` 
zm+ouW3YV8g5ZB~`w1HDRB*MTsoC_p~Qn;LfDt7?MfUQQ#ciJF9Y=w&eR;%-%W_NfYv7sgQ#01W#Sv=YVvt_2kQCW@{Do8F|Nk#!K|_wv z5br$xVjjp$XkO?%{z4wYh4||D3xP(G3YWkp?1jt82Gqg=mcbMoQNqtQcqmc?D15vJDb3;CiCh zluaL!bQi8dD_r7HO+C>tSPO9x z*xQhnOFmTjV~`9eFCh03&w&K76)u}EVzXKhYSwg+Hn7#;fQO8-7=Z+#NgYzSD6D{I zx~cV83zsL*LctBB7wkBs!o?6I*lT(QszwRK1*<_STzEl(y{1o~w@&`610`8d_#zc9 zFF}I6rtMI3Zb7&;Q0{3E7wi{Ex(64OX&}K~(?qD6;#7lA{>8KqlXms-53|EL2(E!AJyvq z|9`;<_ChPz0u-M>S7CvUFMa{K5)m?3n*?$Xc&PM;2Y9(U18lyu^Tf%+u+Aa6UU0{^ z^*{;lKG1|KX!unN0tXn8{$AJqn7>u4<7yd0qL10LB_1%RzbU_;7wVe z!OI_@$)Z}w=spX`VzjYYq}fLB+05V>$E9aMp#q-%d0y3(IIW7+Bm4jv%Kqoqa zj`4vw3^Wdj;upjqI&_%-7$?^0FmUhu_={b*Wuf}FfmI;f1M)QLSU=j>#44}`cp$-K z!R|*{3pg3%T@=TI1`@!-0+4yB<1d;yLG=J=G6ghDd;CQQc&->UZS@0rxcnkB>I?kt8{uGo$;FByZ z;EifXGc2Ky%OQ|vSjx)~Gc2K?wPxUA17(I~9wP(83;9aK3`=q)$QE$v)B$2PAF)86 zYXy&~LBb0X$QAEVS2$TAbWlww?08e>+&$oLh%)3H92S&4w6-&J9nXbGQjg-Umt_w6BL2q z{YqCrH;#d4d%?lG8O%Kd+gIWXy6WmCDBY#7gO-0lrnXSVCDB3=V}S8HNFz!~;sC$< z0d#Kzc;y6m**Y^Qt$@y001F*|@f37ABj~VB5Fa`w>H%B$-duZwfxqP^WL_8SbOF$z zn&2tdP|#^ppxML|&;`MFL3b&Ew#I-89mH{EFQEI!S`Ui-6KcLA?P`Gp+f3X>~b`it|1t(-R6UgnLV|bBv8iNK;NNIxf7lZOC?k327h&;H3 z0BxT@YGShzP?H1E1Su;9@dL>yfz1n~*S(JGLIkU08Z{Qv)p zTSyjw;|NwRlIjyhkimEwyc_dz_~ZoGw^?6A=pPUp)XfKl1lR)5u3ewrT+r6$86cOv0jcPY-QdxA z53OmX5z{{l=K>4TJ6|~6}z7ge2 z9(dzi1V-BoGwfb?9DKk6IfDw8c=YnXi3bq}uy_ZBRdekJM)*?A5;0KRcpP^CU-1WK zHyca@AhqM_%jt5s~pmoyFg>~R& zRr6|4L3tRqRT$GR4jEFMA6p8CNE%1;>E|3LnNg+4nX^g;O-Yv21T zxWjV%1ve-FL8Ef_U+nk-uBAYWUXfD?$mx)9?+twqKFthV$bswJ`yj_4auvuncwY%Q z7rg-Oiia%wI`9GHVbIaM*P&~8<35679(2dFD@^Dl*maocmWZF z;A{l$bRgmY6t`d@xT7$0Ja~1l*-H4R4X7;(E=iOSVgW2j4OVzr336!X@fRl;2<8(+ zN&xvB7me2XIc&bdCKXB8vZGss~AaZ|78^DYe2S%@_qwAUOIREyI=UP3k-XDLF3QA<%t|>N zzIqNxRxc1G0;sM317u(4UJkUwMnJ`CFQh@D3fjHV+X}l-9W*TH(c3By8gA)4{^B6W zZM{>$+}>93!G^u2s^EPI$6v63#(a8P!FR6rP6eCL+Y43$85Q{h8h!3{ZSd&rg{Tqz z4_a9ckz|EPf-Zbr;L+>b0IhXTfP}kSK@Rrl^*!LxD|!c^2Q(TEx-lI*;SSzvzQ-!Jgwpi}zi zBcIX_xn=3@A|{5<(D!3k$Rz;6Ou52r}RW;>Tb728|u!NeG{-K)yjv z2;V^KZXpTb4>%RMq9=s?AU#muKoY`bgq{sK 
z{(wpxw6e*YR6~40YXZ>>VZ$&4)M^Cn2E%fhLt`r=!;4#>!3%I-3w8|Arv^}Hg2o|1 z2eyE_U!ss1Z2mh@yokFZgX}F_vy90+{DP>80Oh| z%`w!o^Q&WsW9Oexk6zs?O^gh|KAL|)hk3r>kNp4N>kJB z;?$y&%=|nBbp~~Xw8YGu)D#6pMMW+K-_+vb#Pn1v1qLn#x6B+6!>u$g8Kl-qfx#y; z4`D zex5=}Myi5ysEeaQN@`hVa;lX=ZhmozLQZCOYEC6cvM5y{vsfW7zeFK5FTXTBL!r1J zF*y~c6QT)foE`(%AlINE{~!eh1q}vGD+LBUuHgLAB9Os3nR%%S3<{Zf3Jg}Dn^GA5 zqk(@=KIjx|kOI(f4Tv4$=i}=K3MNo+fm)8Bfjt%mhWIEy;}Aa|Lj%X0^!%dCl8oG9 zL-T;dqQsn>)Eu|`BJb3qywn^|pCm)q%qYKLgLp$TU$DRZ3sQ>`K_OV|=@%St?gI9C zv59kON}{W)adB#iXJ)3WYgs@@e2{mrd3=aLylZGqrfa;HtE+2Suv2`nQ+$YFyh}ld zQ9Q_5M7zU)*6uJgN-j-F^vugF2}vx@hK7!FeqKppW?pJhv87{CQDS9SW@@>oNr+>x zH-_gkU0pLX12X-B4dXpCgH7TyU0oUMY#A~^zGPr1%g;xzko1;cWM^enoLb_VlbV~FSE7-epI2N0QKP1#0O5fRP}5Y1 zF3&8is3GRs<#1|Xcr6i zkbXvfZmNDoX<}Z9z8)xJE@-TCzfQErRp(cD%ja7Xn=!3Q^9}%!D3)wK-HgFtdL(?qL81akd$AV zmr@MUYokz-ky)&3m!4V@oLQBsp~-;Gk5*vNQD9JrWk^XZ0SzgF99CSCSdy8nfQTkU zzOb@FPx&_Bh(rlXNF-@Mou>th87N;38mq7J2NJpr3TU!PN zFmSCXNKGzDO#v05U;%~jjLhT=h0J1*NJu^?@=^;+6LSYBiK*PaQ$PeJS1qU1~P+{pFoR}ez153#e9yA$~oa{66N}#Ep zfdQPF6+qs=k$AyUDCrlH{6OK(z`ziaky)%zoRMFelcJE9T9#U*kdatWkea8U3#(xA zixg5aQ^2(bD4NPa)kR{SLTXV_evuvn!Z3Jbfs!xf8&!t$35LgWmQdy8{XJwU~Ur?#xg0Kzh zc6iYOX&N9zQSt(&K1lImt$?BpCJ5H6V5^XmpO>ysnpd2eo|l>esTCm32e}CpYAAJx zM}Bb$C=5a2Ktg$iJ5cm8P|GV&5I_s8dLfjkY$&!7-N)OOG+9n?&~TCLk5DbN5H zSXyXR16U9iFOa~;2zXEtk5Y=o5~?o{E`-#_AP0gg32=D^;z7!^^2B0=%)HE!%*33` zs?-z(aDyUI0aiDI73qOXaZnv?zyMxl0tyNZaG=@R!YUL6H8rqeP!JnvD%8{{KvXDV zs<2Zq(1et$V0YkfLP36UCa8I!keHHElv-S@keHXEP?cJg53we{2*N>@g=+z2U1)?s zTtT)wLEZt^kdSBsw*wGiixj!IVm}?xk^^TtO=uGiwdqDu6&4F=FhQb%_f6<9~mGO`cSga0Q0NTaNFN#&L19^_2!5&OuAq)%*3=ND7j15c-ObyHo%nd9I zEDa3|4GoP9jSWo~TOij#8%uOszEKLne4NZ+qjZIBVO-;>A%}p&#EzJzf z49$$pjLl5UOwG*9%*`y!EX@tf4b6?rjm=HWP0h{B&CM;$EiDWz3@wZ-j4ezoOfAeT z%q=V|EG-Qz4K0l8-P-$LCW^Sr0 zs4<1zVG!*MnEH)C`kfL}z*zx9FIYWlZPNk%#Da4x^#Fyk2RHT)}8yFZE zm?Rn|gBc)ByhVI&VrE`^a(-?>X-R6lsZo4#L4jd>W@fyZNupU|aw@Fnf^IT|PpR1f 
zZ$o!6G6*n8Kyx}110$mY0|Ud2bcp#dlAVE(!34%(U|RR%YXJtXVlb;rSJ!~dVAJ?uvv_zpW@vJzniin0n@eIzB0gKNmAxCveba0)U?cs;L==#bw)ZeL%)2W;+ zk&LEB>O(S)D#;JYJbdX7Ng4LE1yYa*UNgYJKyoLPTHQ}li#oWygt|ktoZd(4fe)xt z5QPM^92jWVKw5!_%4{IrLv(7PXs3)iWU~cN710+3JY(lH$Fh=gEoT>)NRScM;W5Et6@QcDjXk0 zjdl*yY$_NPA4Qdh64XJ|Fe^R^UsDTe5gsM+QG|zIaI1)q!qS3+T8b$H&P$0!1<1oS zFvS@C2*fx}d{jtSaEWCqY+{K4Dv*?z5)Ym~1PeeL8A#JlXwyd^U6_in%u#_A8XCmM zCs$M?CS{fx8k)z)C+8#<7sr=W7No|5<~m#f>GKYv8SulWmS!hEo(8wj! zjG<}!d+<3az96wEF_)46qh-s6%&|^dI!PryK0b;T?JQT*#InT9ocMS{Q!@js5kiXw z40;ID!VYMVp^wtTN{sk;3o~%30V+8W{VzmmNaXx5Qd5%ZEhkVgL7hkKc^f2Wqqc64 z?4&~51j#gNG%S#e!_#~~Rz>~6MCG!aoJ!85o!u z5aKWeH<&=``CwcI(2^SvCzctc9!VS|$iTpm$_f(ahw>1V0vkvia~04}PLQ||LLrpJ z!o|R#$RNZJ0Oe~yX;G*+=wu_15KLSRDh?6@VVJl#RJ;Nt$iTn=6EA^^e}IZdKM^97#LvUC!pf6byHkW^=F~tFmp|z;#Z*Jd!c*_D193${uRpifYOhk z;*+87OoocTgo;DC3^4mYK*dv_5-L#o2UHvu^CD1j1|CQ_xIp=!y=fpO2UJ`Y#9?4y zkcZL&P;pzRd%~dN>v=%o4D&rykb!~W0WV0L8zBH?Iq`$Ud7wN5)gS^A2k(wU2r@7* zu(L8SV5Uw627ZvZ08%=ag=k|D0L;c6&R(s@F7#u+UGGSnz0rE5hD6KFsY+zwv0N(@)<1;cafR5({DFG#m4U7y7 zCqN9S{0=4th7Hhs0}=-DSAh08Lghi`Wk3@!NFEdnKhXFUpvZ%&KfuAjumg>s!3nY* z;=Twj28IA=QUsX?;uj$C6Sx@|BA`hUBoE?)@(0KS5Whm0fgu7KWgvb4F9U-Iv`B{W zGm!WZd<+Z$&>|Ql58_Wi;&0$%U|501KY_&G!Oy_30h)wC`a%2)Nc;l=3=BJ0jDdke43%$z#5WLUV9*dp$bL(a*>L(a*>L(a*>L(a*>L(a*>L(a!|c&ji)a1l7+3)z1Xg&ji)a1l7+3)emcb zXfrS{fRX^HOa%Ex2g<(z;xI5UfaF0*;02Tqk_Yi&?GBJUC`)jF6F36{NFKx&fbv1| zpd>5-<%8rwd<`fcBoC^3ETDXlJcu6w<%8tq7#J7|pnQ-#h(7_!*J5A*r;7_vK1e+% zN#20+LFz&L7f?P(9+X5sK=~kf5dR0150VGFA6o8$A0!Xr?|||_@*wx0fbv1|ApQj?A0!WQ{|hJ|BoE?qKl)^<%8rw?*9PggXBSc0cgDjk_Wk81Ih=LGmE}4=5ib4{|@O%LI}K@eQE$Fw8#%P(DaKC_XBne2{t&zXQq#$%Eo!0+bJu z2k~b>`5<|)`=NZ0Jcz#o$_L4V-4Er1K1d$K=YZDxAbF7cC7^te zJcw@r<-`1Q0m=ud2gSz?C?BLA#D4+hgXBT+@d3&Q$%FVmpnQ-#$o;UUGDsf8mw>i2 zK=L5>Ye4xRc@WBoA``3n(8X z58`ux3LORp2AF?5pnQ;eP<#YH`5^Tmegu>ck_W{{29yty2k{G_e2_fIstzb0BoE?G zfbv1|VE04$AbAjf1(Xkx2f2R-ln;^z@ozx+AbF7cKS22)c@SR!+HQsUM*`YT1*r$c zhX#}nQV-%AK=~kfP<&WG`5<`^-vi1A$%EV<0p)|_LHrCTA0!WQe*u&ak_YiCpnQ-# 
z$o&(be2_edzX8ez$%EW~0?G%;gZM9?e3*YUpzUmsdQf~AK=~l`Aif2Z50VGPhX<4o zk_YhvpnQ-#*!@sGNFKy5fbv1|VE04$AbAkK1Ih=m`O<%8rw?mq$L zgXBT{7f?P(9_0QXP(DZ=#Fv2fvta%yfbv1=LGe)m<%86N_#IF_NFEd)6QF#MJcvI7 z$_L4V+`j?J2g!r@JD_}!JjnegpnQ-#h<^df2g!ro59Nd8L3|F7BcSmIc0ZI4k_YiE zpnRBrDxiFjd7${{fbv1=LHr3&K1d!EA2XnQkUWUL0?G%;gWV71gXBT{6Hq=#9_)T7 zA0!Xr-+=N#@*wwrfbv1|Aie-7(2)G20p)|_LH_Z8@?rj&0p)|t0~xgf$_J?j@i##E zAbAjT2Z(^Ce-QHoh=AG$a{mn|AEY0|e*xu#^n=|00m=u-gZMw7e2_fY{h&aEnh)X| zK=~kfu=}BWkbV$91ImZ_X9biGQV(Ko01;60K+GK=0_t8+e4K#tLFR$@7odEQez5zY ze2_ed{{hMe$%EYw<%8rwd=8KyQ1e0Vmw@s?@*utiln>Gma(@7n50VG*3!r?Mf1usX zQED^gomjzp)?q$Fv1BixbW;gV7+GfkB;tfk7aKok5(nfjQUv3HyTo z3`gV{82CX0?;8T3-LA)M4B=pVzOos^7y?{S68^|06zmM!q z{}@ilGq5v&u9MNUU}8{bWoVS}VPJCy+xLmxL5^XIJOeYkIV%H)fDbDZI|GBlop&(% z|FR$W!|+F*fuEt{*MEM8M+kR-+{5trKRd%8h6a#4`+sD4n7Rj0c^Ds^rc?NF!0d&E zA2T#OKC&+WhYvhlpy9~O3<-~~><|7j?2%_+fQAdq92otS{lR}w_`vx5kQu+n{0!~j z@F3OyjEwAm|NrNg5rU?l4gXg z#C&64@ShRnKWO-Z&gx)bPX7h*hQnWw`;UO+ zLBsApKA?xgM|OvQ3}cWe7h3+n!UK}Ne|%sN1C|Ns9PWLUYv85#sP#C+sG z_J`qyJO>-=a)yR_hG=%ytkbFHuv5 znc2)46qpXgeB@XAcVL1%0}JzXu>Gve(-}HI_Je7KXi)jga6_JfnRz+`1CvHHGjlkD z0`q|w7G{WhKC&+Zm(Q#$%NZJ27^2x&Ans>jNoSbBWDxz3z2P6Id}U(@XJue<_hI-S z&dR{^fEG(eAS_EU5nL)h+1{r>~@B9tSxzV2?`Bws@A5^}JL-m2| zV_*=B`3>^l1$hQ`kp0X#(X6cQ3=FIbVwjmh>6by~6MMrSh7*87+~?vEbtvPlfl9!C?3#I zFaG`m`+{en^v%E^&Il@>Sh(1k7#Me0urq_)pB?>?z2Pa-1@LiDhM;l-mJS%$Kr=W3 zF&{wbrz4($jTuxwF|e>RsDsUgrIQcr3C|eLfa2#k*j`w=;Rn@o%sFPD^y_p)UP7D& z6rMR|+^paE8Mn4AFuh|AW#8gFDz> zkUmg4Vr465P+&O_^O0Tc-+>lT`eJ8+{UAOAi-zA9en|U(fngHZUN$y&h6T(AVmKMv!S;dE#|e1`cCdZ{KUOep;K#zo z&It0yNA?78{lv~u&ai;#KnyE$J2)KlS=gBxm~+e?g6fAW@*EuO>I@A`Qa3RkTGZe&e*`5Z3fDp3@!2u`cV7W*uVp0GU_bsj39S>g_Ms1EFVDm zpMjB`9b!K#n>zyoqeKkYf1vV-8Nz1<) z`5zR&;QR^kFU+6((0W`DR6pI2XJBPz2m70u`TKuRJTovbuz~X1TXu$j;QCF#oS}g^ zJDLrojzJ&>R7rs9GjRGiBFDhOkPdD?vNDu|X;Asaz`*$S3%i1x0?7T)dXt42-TevR z_94vOa2g&z(DoC!T{Po8Tprq_o27jVC@ue`Z*xafZlF{g~LZiNc#sC9*A&&v{ODYGW=sa!p;B- 
z5A<*Vw+}(-8ybG-?UcXb3;u)JAL#8iSpNC|E`RJ8*agKI8klp^PYI!Qk*m;RrK>pjbPY4{KjAh_QqDpz?(g-u`2F%peGA ze}EQ{HB^Gq=~rm`Sr}Tbf!kNi+0p#q*aA(zN)iGviL;}nGkTY=*u$3{DSRRQKzR7ls(gi} zvw+02z`_Gd`EJK>z@7oUT!W>vkI?oLEWKeHe__Hq{-OZtU%~sIuyiIY0xq9q zKJpV$zHO0bKu>q-KSbX9xKSt9xXqoZ{b_W`#Zw7Gsf|Yxs(E1s2B=7`K`_C4XKG{Vf z?K>d}(6}3@o`#fz;PgQ-|8rB4|KkYc|2Rnghol$udgdb&r2m93|FhxE{|w;#0}CH^ zQ7X6Zpzg!pzvG7Y?_l``Rt|#8*Av`G?GH%51{Mw<_!Z&^_wVkL+`khQ0QYlc!1E_t zK#NBk1isZ~HJyCmSEwT*d>CQ<0dW6`eMStd zzXu+l#@7FZrW4He1E^d_j01r45B~BM>^@Tazp!-jkr}oBi_tz~2K9gW8DQxJR&T=c z3&HkTIiddV0dV_XQ~=yB1=UX|>EtalWPVXdq8;4-C8PeC2Z}$k!-oY|_(0OhCl*ln z2$LT^-wA~ebbbq(UdU@-2~$$O&LdF1&V!V%sO1VQoqS}0UjZ2b0Z2QC`TulA1`dIkZ=mrv@cb}GK4|_IQcg1v zJ^z^rxeo<&(+Gr>sepMAnO}a1LUCz9L4FbFLaxj_$h|Vz;H&nE^$6Z=2E7ym@7-zO zTN=RksDPb<<1!Vr`&;d}$0xSIR(cfl(v<8kl_WEsFU?(Xp`G z?I^kchI9wO0|#;ok%AU=t|6jXfRJ_x8SGXnP}`C`3>vkE?Zrhl0a9NxFtmcEZ;&KF z>XEn%pq30a{YRnt`oYsn2p#D9LCegcTb&S!7#J8LK%4s*7#J#`hJ(}~ZL$OH+ykrE z1e4(H!Vngeyr2eQ>K--NWEGTIMZmu!F4F)-b7qm|pDi5QUf*giW z1`}pj1Jw_`Rs?K6C`h2&b>ZsdAp#ioi$FKtUjSJQ+ULjso)7>HRKxXx7-f76pg6$L z{{pnIl7WE%wvP{_7KA~qDP+2mhk*gQl?x;c$2&j^F&P*b7JwX$WIs$lc($5>f#DGw zWUnhE7~t9&BCH|o1f&SZ7JfV&*!AxKU7E+hz<{hD%{GG6!0-a_6gX)ACqx7k4WK=mpgrCY5eO;81hF4o{{vVC0vQes zKV}97Hgx@mnPK{2Aqp|~fg^-w04Q`Z4kV4mg2^#(&_d#`lsRwajieUnp1k^bY zNd^XnIu_8fBhXm^a3L@Qc0WxEm5rd|fvJwY8yAPQmk z1Z{|f@Yg_T&^gI&zVdpr zdV}u;VqkD!a0kt;A|1)h!Jt%`E5N|W!SKNIvIvQ@WFEe z4E~@KoAMYm?2A%!HZth?7p3MTx-jT+nK1s!pOki$B@hPmyzuW zBmV@3-1+e95+*a0@Y^s+%Ny7*Rq^*R`10E@$?{JC(+d2P8N8Tmn0|7`gKk2OFU~bL zh%e4a%PFqROO7ush%dOq!mrA}#mvCUufo92EXT&L%)rUi!|{&_}C5%cK`3o7X_@fwXIP)0!V;JmNvl;oL z8EjZG7;mDSd3A$@TZ)0%kYPUjz#t}OPX2yw#!lEZB&iJiehh}}s*L>p3?{6*Soi}N z44Ew$*CUPuvSgHt2H!Jc2~uOkZpFwSz+lR1&B!0fV8m?0_>&9gaX&sx{AvuG97~z` zH5hnU3Yq!U88}&HFvNq8uA0uk%dg77&N+>NU!8%Abt(hD8Us7i6b61nPDcNvw4BNs zP62jx2IhPQeqRPoRxN&hRR&I`DGY{Dh9(CXb0DW$ZfE3|WnkctVB`m_{94GsAH~4P zQo;}qKHsvKfrVFwfwPE#Uygy1wUB{VmVq%)ATd6zC^0wHgMn9&K`K#*;UMS`!Ni=L z^rFOq3_gbW1x5KK`Jj_0Wds%Y#RS!v)fxGFS!DS_1-~$Yj)X)$8&i;BBK%lPK?ddw 
z1`$^^hX0JkhNy>Osxq*AX2Ll7Qj_7o0P=~JSkApvV_?XK9CWG4z`_$HBxo8Xr01E* z!xQKSx{q!%gHvKjeo>B4uqWucxMBu=F9sI=C>|DmKL#elV59f|2Htf-`teaySvZtj z%?x=MgPE8<2r*p|+KxCm(vXKi+=z!E5pwDyD}$1XAv=SR5j%sBDLaFZ89RfJB|C$% zp#cX2vnqp-p#y`Fijlbl!$Knq*g45P4EzDYn^-lO`E`V{_^%58|F4w~IS26slQ_Ss zPyowCasFUtUjA+dFV^E6{Cy0iOvhOHCoq(AqzW=0W#`|>n8}~d#l#dX!X(GaufV|0 z9xe&ah_5+88Ije5iC>w4jemlW5C27x|A*qi2jJaibi9ai#@}8hj?2va7NU;)Wg<>Y z_n7&ML>yVqGxNtXII@?r@W(T_Ft2Cfk7ID)FBNg;KOm18vgY3_ z%Eog=l--h%ONxR2w}=(LvDkk(ep^v%{!URNHccjee+DD|O0i!4f1<*I#@9snr-_yE z%ZUl_D=~1gtrp}@731I#VC7e5;AXka0!q{Tr@5I}TE&>&v-2l%GqNZ#uy=!!JP&g> zD9LlO{$=1-W#HiN5o2On!NA`k#=v}mljoNh%W`3N&{2Nn41AgloDUiJfPqDVLHN1^1G6{-s|17a9|;EG3Q3Tlk}6{Y zgYX9l2H^+}24Qar1|g4fO^}ij2?iDi1{Mwm;e8Se!V4Tgii`P}K&=y3F$Oj!hRO0w z3=F&ut_&=~48jc@41%r(48qQQV5v}$l!7zEw+Rfw9nuWKi4qJ-?974;LY?dk$_%*@ z3`%uOf*=!xDLeIgA{c~CB^ZQTB^ZSF3NQ#;a4-n>NH7S06<`qd z;9wA*EWsc=c?E-TvIK+hEC~i?eg@VC2H}Mg48kdr48kfL48kiV7=#4{7=%SQ7=$-U zFt9vg6jqR65Z)=lAgn6DAS}&>3XU7>pSh8Wb4PKwSW4 z25_L3oB$s;W(Wz^Xf_50-bO=)1P2C|2@Iu4Y#?oB8Vte=3JmGb!3U5@FfcPQ6i?s) z1(6Xr9GDUqWQ`ISN|iYo7?c>yH5phOn8IyAr;Y*E<*K9EUPf?y{^3V=A) zU>U*l5)5KC5OH52khm>aylj~;h+_wli2|QcWe=7qeJ28Pt%D!~O8{dymnZ`RGuX9) zn59>U|^fU zsMH`_%@EEm1QBLn3-1&LGnE>63>YHpM8F&&Rtbjid7=yqLK%!o4Qvt&5nf^p3_<~n z%r976{{R2~zlKTpKO@6uA!Y`ankfwp8a#~+4H^s_{~H@LI6zDekRZp@hK9AQVD&Fp zd6*cK3OIxq0$D)KR!;`O1a5{e;;d5}8Z_8gKs^{%kQR29J&X(tY#=5J=yV4*kUWbR z3sVZHwiITV&#>@6$WD+O_#+tD7cwvvGcal}uv}r_VPX(2h+tuCVqjd#%;d@-%<`F$ zzktD(MT4EUfPr}-1AhafE{hB!Se+mP$72Rz5gA4n69)bOMk8hgW~K)WtZx~O3|Krk znfV_u=<+8pu_}F#d&s~T%EWk>(PaW7i%KGspin;pVDBnGwx3?>CE0`nQfxaKqXakCs?p3fj;V#~n1fst98iSYj;I%0%lQ`Hy~%Q zaxjZ9EMPWfU~x%d;4fg%6)^G;?047(y4J?Wc zjB6MeH?oNFZDcg%XIZh4g)M+d#36x^e*uFV^AA?WP6oygOky1K83Y-{7#QC(xiGOj zV6tZv<#^8|D)1Df%vJD1CJWaB1`(F~jGPM?M0xBP*%mOcTCgxaVd8IK(&YcZ#Kydk zk#RW#qXQ$G04vC8!VwXW5UyZnoW;mkzyNZqsE+_68w0z%#0Exx4i0wa1t3voy#R&) zCQ+6J42CTF9P4Br69a=0j}nVA52*AIcT!;xbS&U! 
zQej}x;{+Qb+_8lr7}QPGn!v#FkQpL2Lxn-yehUMO8n-&gB1L_UeVoDy9ta85ms{S$P;-^f?$5IT(dN zZpmQ~Qsib}RAC4Pb!~Za7(`?x7{mp67!>t67V`>AsW346Gcl?#Yy@?bK*J|2W-KZU zhAieRODqjp_*sOw6;Kaaiz=?su!I^>KfHMPwg9`)00T%`a z2UpNRcmfQdeJ!BP`~sQ`3;_;D7&-W#F!J&%F!1t6Fu4AA(g*+#ae$6H0v$Otfr)|P z0}}&-0xJVU0V@N;237`!53CFf25bxr0?=c$KnF{KY77BR1_sbEK>}P144~t7Ku6w8 z;9kSP@lZp9A%TH`;Q(kwG$X?ke&G!j4DUfF&lxZ?Fl=CEU@%}&VP@d5VfxRcA;17S zdg%c(1H%Cp1_lS#8%!KR8h@CC9eSAyK_w3(149Y3xD5w`p(+DI7GE%ET*yv>LD+$V zLD-RlLD(sWMcA2x!O(?);h2E1D+j}6P>1^8|Ns9%^xq5yrL$ro3`$($DGW;d;u#D| zm&Gy|3?GXzY~oY;C|1Cr^c6%fiWe|IxF5xGKng2B+zJM@k7As{!d5F7gdH;2CxdQ* z2JI_W0M+sW48jj2_`zbJy~_t6V&V$~7=l3?oErofz;dAR%LIs=u!8`Du%!WmxPSnI zxEKegu!R7F(1q75j8+1{pjGl4Aj*^;F#9mFTwo9u5MTgN4_5Glr~?q*0ti(gz`$~V zK{xA%1r{K1AP3FK5^1_Ndm1qLNyh7(LI7Z?nM7#QX-u_!PEgU;$m zXkZk!JS;3Muv0`hK!QQqLV`iu;R8c3DA#-tU=TLo0HsqF2?pkHUd8~9U{GjEYyh=? znEUw{cd)QTFe<4soMGV*U=Vg_F@uF1EGL*?ya$^_kOCEC z=LH5~P@rCb1nLC_VFQr<2jFNBwhZAC{$MQ%iZS5}(3k}+NP^j`0JcLd!BPR@C6Kc( zFfdv&GP5%TgObOC03js%ppn=B)+G$`D%3v935+a@MOYRv3V*mQ2BIzqFn}nik_Exy zARbg7DDoFDg51e+fq^lRoB0DnFete_V3xqN3*@&2jNk+V)w6&RoWd3`3O6VVf;4PU zH-5e!Y(j?!W`r%tQH*5mSnlWz+w#|g&!>C1DTA~ z^H6&r;RuZ+#S8lP85yrL1xqk6Fo06W0!ZpuzzB}11&m4$^c=uh1fq?RmyfxDB@vXQ zF6i;$MSxKa{}12Z%bl|E<|AX`?T2?~6*4eACE<~mL_8}RZ22foN@+&+x(bDDt18GpQ ze#y`Lftm3GhoBrk17ndaI0is&!5^dG*tx(UeBr79C@?^ggNRT_N>!R5vjH457o;J6 zf+_;ZLJNTo9%$wQN8JSmwGUDW42&B%nIZWW}vWEv$c!3MC1p*AL3mBO($+dDIBOzs`(1m;eo-zl9mid6ak=W(cwz;8n6_*doZ%!3U1tfDV3^3yd5y_@$+x zSY!r2xM-WOf}e!}lKmMVN&Nz&FxUxz{1z+wQZ_=O7s_*o_}2pcr;BLuY$ zm?bcPf?Ie4*uN|i3|beA4H$$cIPeJ{fC&jJOyC7+1z7=ij+TKQXdFj?0TDjp1qwW1 zw-_Yw2q(aFi(g3K28%Wv;6^f2K!6+6209?X!00H&tii@1!65!XfeWm1LIaoZ1qlY0 z1hB&uE^x9WK;j2(yYPn#9H0PR;J_*T0B(~(0|!|D0|5q5>_R>MVFEjA0s}Jx2eL{B zcE$uyZ3$H;{6K(#Lqbg4rb3KEL5xF!fn$Xz;|50N53C#;L^&it#xV+8dJD4zaB*;O zfdmo^I6>xb;N-~Qv#@CdeDk#CU+8 z(St#VPk;gJNpS2aC2%W%{0nl2Z~#0Reee(iS(2azG6KYs0JU?#HhmCa;Mf4Nh-CvK zvp*YagqZMw8H_9|*cn|Vm|rk3?%-kf;09+%P_|gX4$ej^*f~15rA0cp!TC|afI-;O 
zf(u;Su3%?b!Oc>^A-+L?8QdrXmF^x~;JUhkgK@7YvxIm!sQK>T!Nhn%091%pa43Ed z*Joh7!N_OBi+u8rA1C~Giq=N2`>Pb%pli6%LHhxDZvFUN)|9Orb;qTU=0Q} zA}3@pgBmb5xZ$L5!vutoxPbzs%>{Cv1ef9mfqDkU2aLg>$yjJrhpUzVm6IPpCJAp4 zU;&l6PzQnP9|uP900Btr1yqhh8lkK|xR@ZjW}@W467iNF0d_NV3~l@4uLlSI3i>iGkBR_fX&3zy?{~p z!3kcL0A{pW6w>Md2Nk!6LjM1Kb6CFoPK+2MYBLCXOG>($YVe z86%iMS)+mpoLViIS$=?05rh=p@Pk>{Qi27XRXdo#S)l`F1xo`b%LN9G8_eK5{b2@J z+YM$G4`voHDcoSdz_NjX#X^B)1B1AO0t4dC&ZLEeI9K6shKufW17&9A@$!XSkVEYea6EYgw+ zEQ}TG%mQqntOH8l(9&rEBTEEGjN=3^;|f*w4mQRe;KHzgiKBxA*6JJ0t*WV*fU^KJb{CqaRby{EDE4rB#6AgApYP1b1*+>Tn5rK zL@IBf?cECu91f}+Gq?pUxl)Z87&ffn2Gwn#p#hEz4)9Q)K?5_(3{a)SBEbL?D_~$} z`N0OsFf1#;ErbsWOe_<)AVM57IKf&cEMOGBkO9sy3=%98m{=~bvK(M!nZXGnCor;3 zU}6qn1hFCF6Ts{Vj4TYSED$zC9GU#V$ufb7V*#7E;|VU73MP<4Suz-vEE!@nStf9B zBnX0P)fM0ztZ;ylV+9AOh0svI!sr1?(Ht{aK=KnfKvlZ|1IGj&P<<}JAl%Tv0?PCY z1Q>)9I2c4MZh%w^KX3pQ3gC$uc(aD1fr)Vi1G5GLw88~9>=&?r33rE zwT}uIK^zGP#{p74VKG}s~_z#u$f2Loh?g++r|*ru7C&;1hSrkYzal3T&1gtSlcGc}_5~Y>?prxsC}$KHwI% znXSRn!6$6BL67ACJIJmI29_5LptR$_z@otf5}5$@Ob5tvVS@*Z!j>Z7AskSn5@e6? 
zh6#*3Cm8qzm{^4!JV2vnEDoR}jpbG67+U-2`@FCJsU29R|$8o+l(gwG@*80}Dvu4mn{90|v0h z8xt6XJu_HY6qrC<76*AlPX>kpQ{f1RJd1?gWKRYL@FK4l2N=O>A}oYG4fI(qxPo}% z9tI4-pu25eTwoMt@)2W+V3YO$MXLaV_>B#W5*8W4+!74JA6765&)@*H9$7e8gnz7H zWRYNCd0-$NA;G}%LrD0=3{W0oIlv0i%QAsgd_@JLxCMtI%L5h`4+h~635>!gIKT~U zP@z158Kip#v-ATGMwSU~{7ej>aokQrl_KU+^H1uG41MejkN6n`Di|3TfMg#0XJc%LN)>nT7nh=3@V_Wwjg5w zg9@nT7i1P-P!{tMU`T$%$`D#C?8Fu*z`&p;r@){hV4}bf#LmD}B;L#h9ydIuufU+f zWT3zh$jqQpB=&$|BOAy`Py88FiUb50Kt=`jGpH0X2{0_;0Lfg?XJ{>!;`zoS_F8~} z;gKK%PXz-L$fX5NADBU22`&CVf#D&@PgO#UppE|jAAELTRBA2W=5mqgB}fk=gKH%l z!-5jFFPsg`3{UtO1gZr8gBDKyfAXJ!;S4{6JLezn<_2blSNsf36))Ko85j~6CaaV% z{0Ck9^M3)Oa#MvRJBtE?Qys^D76pc4evrLQ74sRG6&Q?|YB;$$jrbWj6d0H{vNJ0% z7^>89U1VT$U|??KWmRCXJ118=CM#Um_1twv^21aG3iih%(K{d}2eg>urLxms^`$+)9Ck7tS zGF2{62r)A-6>(K_ECfkCl4npU;tpUCVF_R`EaGI?#K@??5XjGwne@3_>Rw7z8=yI|~ZPaw<7jN;5KU;tPJI&k$3& z#c3jg0Rt$A9V_b?SR0uBF!B^Jh=C>w3If0}$?}1TRe?cK;gbNfv1=t;j*wu30D~h> z5d)J0!^WU5pqOLfDPj=6YONXkL4YA$K)HyO;fVpmM5h7 zSy)Jofl-M40TY)3vn5jvhXW&z0<)P)9j5~$qXRR)0yDP>Qw?_k3v&Q7mjE+Y0299e zv%C>g4Qn;17-oFO#KgqV^a$iJw*TB87??LOa49fHs?;(6=THDKc^sH|44566YIqrp zKqVU21QsKvBKF<7%o~_^3mEwom}LxAinuN?F-~Ajc_h!kRK&f3k?8{ij{q~T0W-S* zGqVFDBNG$z0R}UbIu-?1;ZqEZEDFr>&lH&D9h)mf7@rEHf->JCCI@z5fd;0auwn)_ z;dP9g85o|hGc;FMj#3n=Ma;AfC* zV3-KXHA;L8OjYL?nG_fTkMT1|R-R*EV&xA?;bTxKyUVwTm8p*%v=K+8;)m^5E@lw# zls*Gb6$1l9Bm+Z1(g$XS8~hBu>zSDf*sYi<*bA5f%Q*it#0aoI1Q$=LWlRhVQ49?1 z2@Hw+3@TL&tbw56G{?4L2Bu&J<`3*V1q^}ioeT^c(!EQXnHgTmGx$DWU~XXGZ(z(# zQz>Ko&%pJ8LHTgy2WI95MvV zuz;47vN84XgU)9Xs9|DSz!CW1KR?5X35@bg6BxOf`vMs^Fop3msFblzU=%ztfzg;K%ru|4hftmjSBR@|ABkKbu<_5+PQ2Bjf0{3K40xgmJ zufVW@k!b=K^F}sdfeDO)LcI)142llSmI76R3%CMZE7=(gg4vihusbppi8Cx6F)%l<32|&-VBWyaqrffBs=zJFufT1{RKq6E%65TG zo}Gb%i{}A5p8~felLEIrQw{S6c5aRf?2HPWjCssVObpVG7`T}mxWt(lxLF>sGkxHS z2IVo14_t!W3>=IfxDua$iY!h624VI9_7J8bb_M|kHg*PX#tGa)PZAgz6Ig5+`uGL7 zg^vg@vV35dfAfJ|p78;baMS}PrUGWh-^?rp%)AGfAafdTiFfjbQz`zs0$bW&C zmwy8nuL)BPR{|sB0xrG-y!;24tR0zZSQ$PsF&8lKJ>ZpOd%(-Lfs5w?6Js40$gc^^ 
zjGGvkKkynd)i8PZun90rGGE~3F<{|8z+^ARZNSDjfr)7YFJpp0<|BRvrXn^4Zr%$_ zJPWv(6ZjY%K;|$9F!NvF)l_UO-mZO-f#Ibb_%f}=VpfTd9G5`6MSKjxRqPB5vH6Te zpel{6nB@bT;uM7kY|0$P4SXL#Gtmqxats0$Of3KY|L0c_N;)IYz*@|_SZ3J=MFxfx z1_q`grT{j^4}44uxDPUO^|3QN(PvPpU|@9?2bCtE3YwKIL0~c{vmWDT5H5;mX1>55 zs9wa(%*rmySit7Uz*EGKz+hOA^do@JP^gOG0E5zh8wCzfCGlkga}Y=+TLUZO0>+Ol zLRAd^8Dbe2{u}qPGn@j&6iWdM;{t}J6CgGdJIetMMg@kAJVgxbpv?&d4xb#j7OBi& zVqyecgU2<2p;2H4Gc$-Iz$o~{fTa;^T0zjK4}uFpH3sP1tbzmveleZ^2ElI%42=Rs z;KNoM92g2u@H4O%v9kVWo6gX{3aU5+I1(5VK{@FV>jnlTR;~{WN-B#E2s0jF5Pamo zz_frV5Omt$i3SEC)(QNK4GfM9>_rUwxfp}A6HkDeQsN>0j13GyXZRTenglo)1sIYp z@H048viC4DZ{Sj5tE>qyWSF3^fP0Zpg|rap^x6Lh1Q?Y$iuD=T44C8@4461s8iWP^ zUl3R*&!AGl%KYH}|NjCES|{`wm|8CIGpHBIG7DW`+<1ncL8XFO79_iXL-14qhYGVo z0i)0j2S&yK2Id8fo&p@QvKts#4H%dnFschkC@=^;I>0HoiN)e0Qv)-@JAQ`FN)~Yj z1_hS?416D$TzQHZq!Sn!1y}@y7cdHo7cdzyRWKYw8xTl17iqi4-#7f zgCScb>w37#{I4fXW~RCgu%{Awm^0@&XNDuXt24F!VDqGw=lRR4~X0OkiM2 zVBt{^4s@+#V-PUnW=h~z<|zKaIvq5A_^FRUK&gPS^hiE~Km{AC0(0V%eg@_W)(rxT zObkgU_!(G>*)Omweo$0kWC>tpa$x$xq0b;##LNzA(EQf~rD}f$^&)0=<_TE&(P8-CC|&CD8Vp65ETEi6PQ^XL>Lp9nF4qMK{b5R zA5Xys0a4e^N*1XL%=$~5E-*6)N^fA^CiOE4wWKsYFlLCvV z^a5tBCoB!jf;Su(#H0n71FFiv9#i5eEkbrC0n7 z42E$G46VGsn4j=7tf*x84@zz>j`AD~i~Kf71|8#P=&1DPpTWYxz`^jznZYqYK@i_1swNx@Fu;fZGa4;n0O_1U&kP>|KfPqoiB1!zke|Clo4;c6p z_%-<#@bK~%NLBMs;9-*QV3K7q5MmOL64W;kVk{D3b>J4{p1{r+ zAjEDUB*^B#Bg(J9$+m$>ki~#C`#(b;Kf@6NS>_G0YzjjB2bkpf68I&V6Zn&PDj38W z861QJk8KbWlxP$ddbvP0tF4$pLRdtAL0eo#fI(P7fFbb_GlM`8^FhvM{~v-v!Jh%7 zgjqm=LBuyRhj{~UxKIUyIP(QA$I#;cocGz79oUmyD>)eogc{fw5AX;IB``7?aB&py z3k5N7sJM5&abgPKbmXaEV0gfw81%t`K{2%WJQI5YgCPfl!D1Qa2cp8AoN>$x7?gM_ zT-4be7zEkv8CVxE2(!;IXLDc>ydcQL+Q1+$oFJynRUp8ZAjaI!z&t@fm8s@{2xEdU zPk^waN*(9_3j(YkM49s$nH|9WJVi#v2~3O|WTd$rgqc5ZFdH!N1;{b}WdK$BA7mK? 
zxEKo<$ zJKq5&Yo;0&1!jH+6>h!+F&+VSAzmA1_5^0O1hHsl2BsRG1TiiH6`lpmJR3N97BKTO zuFOtbYNjl5MZ9b!r~ywD8SA?L5z_@fL-WJ0t+hxyZjRdc6n8m zk^lzA1NMLo1%N=D<)O}Hba#XCIMl_ z1~&c%5eEJbtomFDtRbLQ(T@WH{03YM{2zFD`6o!m^BXY#U&H@_)fOypfQgrX1AjE% z16KYAJRs)(I<60_^7A83ZbX7!(;V zaLXV2z|G6ez!vg^nZdDAfr<5iprH5!Nuh)UA)W(5CMqRN2ZWgucm$aec$Ao!HW>3O zu<;rSR0w^LWZb~OtRN|PAyJV@L5gL9yf71koFP+@bb>Hr0WZ@614SY61Hw!WQjW|O z694TPm>u{y4;ah8Q{ZFZ4G`ilV5=2mIjzkSpvvbU#J@m;o9BV5JyQ+)0b>dN1~EoK zCV5d71!0~IygVONxf)pb0wnnx__>6RIWVwo&|q?qWK`l~EZ}EzkYp|3S1RIEP-RqL zV4J|tTENf0!I*`0f*6N^fY6f%T&xR(g^o;MV4c7(|7HTeJbwY(dr6iAQT_`|vit?w z+Tc80pzY1(Ajzm8$?PD>7%0NJfeB>61txX}Nl|tNR;~+@ECnF`pr0$2p~4=C|72rcJ(z{GDL`oD*n z!I=MpxCK9hP(ObH$NxQy3s}Wi3gkirDwu?s7Ktouqo!O4)R zh-E?@;{`EB2M4AN4vY!Pj0t)|><>&B8Ccjp2(l?K@CUGPXkXy}pv#nCtn%Q*1O~PQ zOA+A&M(qk#0dpn;7XAVTCdLg6g6wwAYzhqQ7g%^37zE`5jQJZFxRi?678_3`R^vEX6sD42**Q1&oXzO!*lYJ=6?NH86oqDb8hVU=ZYQU|`(9$KSx< zBxv8jB+k&lz|z1dDqX-J#u^~aSfI<#z--CXz@T>EB&a*cz|X+Uz%_x1pMhDEsewVf z0o)yDWPHHFD!|C!zyeY#?$E%%pWx2Ra==rVaRLLlpIg9S$rxbG#}KHjUX$R-%n+!h zRw2NkrCK7uprukEz@R0cBfy{~ogl!VB^4vUz+Vu^%PJ7aJb{6gA&^;sK}9wrfPtwX zP(>z1fT5WgJhZ9b0~@TA{iw%Q5U8Tpp}-Kx&JbEG!_UR2z@V6>=fI$%+oHe_$jP8m zr0-x*$PH=$9rIUUP|;~nUsKk#bzl(WcVKAYX9Nw# zNrWgclyEXI73nbW9bjQj0F9R_Foc0LOCFGAPGArbQ}9lF#Ll2nB$2?t_<#$fMA*Rr zVy!r+5e^m-vQc0N0_hT)V9kDjf%QQUV*&#cgPkHf!v`x7#sEXcgrG$57^+|b!$BU9 zQ!n&`hJqv;Wfy@4hB6Epnk&_WEH-ipFfe2=GBj7JNHV$}1o4s>LH#yy2?GwM0z*-^ z0#l|7a+U&B(jVOU0}L6J19(0drac3V7D#eo)Ny=}Wei~8 z4^ZainZUs1V8kCNOd{=yNQP;Z@+0 z=bvD~uUN#u;Gk5Hl)%hCfr*icrI&4j34a3ZgAqC z5TnleL70C54=4YG7z5BavFZdL{tY}FOeJ}21PHZ3exE$>G6O6R^4lwaM*r~Ey;Nw5Qq|0?dR$iXLgF%|(f&!BQh-53^=VK7$ zRFLLBz+~XBTEU|Ffrm}#$%0t3@9Y0AkmC5jBXpu5H5}Bt5?jCnjw4|KP;1DSDW`_1 zfQ|2gki5VE5Z{Xtd=P6>)G2rFpFJP18S5VjGpI|M^Z@~Hg zEdK!}K9EH^1-xvG1#ID*40HbfFJNOY5ausn%jGTL<)4toE+~0HCRv^VG$btD&$vL? 
zIjoq0y?O@Y1}XjpY1}eQ3)1A77Nkjo2<{f}5U~U!!vkqX0R}}$#}XMMD_tA z;{rKhsf|)>3*t)bi`4_E)ECxC(7nt}HL}d9F2pV#2 zU}AT0U=&bf-Qdi>K+u$NfrOaM1UXTG337}E-24j!g*he&#)Aqdp#_3v`b<0pZ0<}O znE5y4c=9h0{LjX+fSKn2GyewWv)l!2{0WLo;8edr@P88H;aSWRI6!%i-+)t&_k$_Z z0zE;A00AZgNyhuihWrd14_G)1IQa!6jQI=LctBS17qBJqFA$Uv6boQvS|G^uQJZan zAmarY{sn@{{0`6)XjLi`W<#K%t^qf+6DuAz|5vJZu|;SU(6cZV(cb z+0O&Y4;zF;)DuL~9&s{+7E6B+Qe5UIAS|LbK_u-FH$!N#)CVEOXMG9`%BmF&!ikR< z8B~hYA22a~U}WAPB%<;GtdFTk&4H=q7(WxJ86&SCn)ryDL8VC5fkBXSf}x;%0)vQL zfG9{WQ<2JfK}H5<<^%>d17UD;N#22h`GO%MSRkC6fvHIDf*})_7YMRT$bo^8ftfMD zkjcT2Q9;<`2|ts8Fr$O8+68%s8b*f7n0E{URbmRlY!et6FEAT&G8_mrVi7PD5>^ma zWb0F4aOEjtVE7=VctgIxK%~KdLGcxTff3^*KIRSVEDlCOycdL73ycIAJ{U3vsEA5D zkmX;%DafhNA|+ptrT>Wkg3$kTLE!`J{0le}cp5nQF9v|dd_zf7C7%vMk88GlXC;`o*h%aE~ z7&2zwnkVV6u=4!9!Uq*N=^Yreg{?tOO-md0v2ng8ukw$el)*=o~|QP4G%*REAs{> zJ_A-pIaW}i#=y!b#K3I8%71{#mfwMupM8S_;{#Uq3ljVXm=r+Xk#u0S z^MQ@?0t0^o69fMNc3Yl=Jf07Xd{pu%`Tjl)1K8Z>_^ zXP_n|zfqVm0K!!PBY5g^RU;L7+xNKC## zmHC0Qh>QSJ;v-P~DOVuGC?Lk{;KFu6Seo5IjS0fybWmerP-ER7!kEAyD4oE-q~OYw zz`z=yuKI|5LK5QzA?5&KHUUUsAnCxsm>|S_fPr;^2xEeph{S;c<^(mS05vuTVL>Jb z25`prFX&-35SF#OFpb~Ag;8*UA%|iEvqP;?m7syJ;*b3qg122fccl3>7~ z`ifscopAz#-~|IN{tNjUd;-G!1^m4H37oq84d$_Y49bkQf}o`TK~aP?z@6EFlb@kV zQl9Zb4Ih7lxgaC+HpT)bz5os$242n&Jp2vjLOdHxm<}W`E?{P6U}QYR!NS1E0UGfV zWjY|k$iU9HA%VR>Q2N9KUfCB9RQWgX^BJmEusE~|$r#abg~Et3$p%WViaIt z5)fwE!^rxeg2iS5(*+@Z11Vlmb-}`@AQgTCR5Cn}6ndnYq0_YV}KJggA-H95~c z@(d~!%WjLGDljg9zIJ2A&CB zjzSe01`2`90loYUT?~R64+InwzIR2G!8IuF6h!e(?6k$n}J#Xk^F=%0j38EO8*%On9Nj4m>aqSPk{R8eNoH@ zW^z0LEo5P3YG`1bz{VrM%p|~|q9yNO7=DGHK|sQQ0W`&H$inErd>Az0S)r^1DvC8f zXel#S6to`!^<&QQGjudFC@To~^GxWfc*W14GJ`?O0bJ`U1u!R_<7a4@sp!Ly2-+98 ziorpODP#qc0lSfVCBwoF<^yb?AXnVLto6iY0W+gRh@jE|7E#3oU4lvz{0kL96IYM= z7ceV3u`G}hd>6nVsRl3m2i|1_u6sNJ%OE0tT%|{0$Pk1)@w14DtpI z3|x$T?92fyObm?76PoxE82A|&Rm24v7{Fs@4b033SojZwaZBkhVCVV3z}TS5-@wSP z|KVo?BPSmNBjW*9eg;Mh`3nsU42&O`^&a^)-e82K5P z<(L{6q%SZqGj3pEXJFTq>p@D%hfLEAd0=KS`@`Zjz2Qiin 
z+=@!f1!Bz2XS^5~o^c+Smk>Nhh_&`{)T;1RkYAi!3@!!$vJF^G||fJc_G!ICq8hq-`Tut8bBj0)`h0id)Jz`*=~opHicZ3e{zZbn8q{s0C}*-rtWl|jq_3{nys zxETwSK!PGl1>Q^tI7Af~I24yK2{4K@fExHB{0rDQ1?3m8@GoE&(>lU$z|O0{%A~-q zrtnXJjeh|fBbNXxqaPFV0cLsm1xw)Bi|&d11dz2%AGrAiSS9%xmZ}KyEEVKuSZcu8z{<}cWF(~3&&bap zBx+cq_jZ8wWdQ~r0ii@CIR>E$jR3y-|DeU4Py7{xxOo)#c@>1@{S|~5Of=cG z95~K{rll*GH9@7TmV%I&qNe5p7UmCg*aCzEKQXX@i&dcu4lLrA99SoJma-@)%QNsW zED$i1tWZ8^V5BU@sKBgPp^(7GIDx(57`Vr*yn#*FEl!tFfSFmLmNCFd@qhCIRwK4b zwgcR$pjmn)z6+pr>Y&!VMgRjpgCGOT2R;@91|9}M(DJ{B^O!ye7;zM{2`IB3SfItq zc0rigK|=9^;squ*l?rCo1HvWppb2(XW&;t%fZ2j93k4V(82AqeFfq!ra~u$0{J_B0 zz{ry@o5>-beFJE1OPBL=39^Gt>e{0s}?om4uQH5*miK~weo3<4Ss41&iN2rx3t zXAEWMUm(Ds#8b3DfN=r?(}Oxu%>qW3EBp*hGk3BSFie`Ep#ds2S(z5_IkwDH(wM}+ zSTLLEfB@?TZJ}2IOd?DS983XBDoTn0Oo>li7$hqc1DFIA1DFCqHJDNWladlsfh-RL zCw~AFFIU1WX!7APVC8CHSJ7ukFcww5z{9m13rGe1AheAnGATP z`~~9K8yMsjL1XkAc*Q3OaBxjvOH~485v>o5{0YnqTnpG7nHZQV-ZL=?FfltQv0mUV ze8&zd)|n^pFfN!Y@Be@sG~>?rfLrL%g;lH%*mx(b6P$m6lm7slq^N{}ApeGS%A5@n z{0uz0j0#Nb0c^|(tb#>z8JH9}TtTrfz^uK5Nq|{iKY^8zQ-FtGfSFgzy^=wOkAc~o zjY0kbBMSpF8>r~w;#eTWFTf=tqA-D%r-4apk$?iX^sxda#U%_4JJ=MMq~#xQ$onfW z*=<5Po0^N^uOqcjrdDA1D41JXT%wSas*snTrwcn=S`X<2=5B^B95_y4Heg^>W%w$q z$3Xlk)bR`-IrTCM3K*~)Kpn)eNWs=t!9Y`?rba=DK~7PL;j_9Oh;M7FP*PNy%0TpK z)0_;2(37Uc7(TLMJ7`*xft{O?pN)~_6Bo|Iq>nNvF)(Q{D8@13wfIoO#P`ZGxcdZJ|%bs9#eqRwz=1*D+<}2X{usyyOB@))-#9SHTR1Q@`0|y2{op`WzN09bP2L=`;9$QNW zMqO3r0}S#`4onP+41yvoybWNxSR@&E7+6@^^c497=diGVPb?Fj;lRLgfEhfQ=)hof zpMfDEkcWwZ@##$?28IO+EQ}3|;wB8NOy3#&85oopuP`tKf=)WNXJ_DKVBv6JV7bAh z|Buyyftk@{H+u#v3j+(w4$#4L`pRqv7?_169NAb_Fz`5lHcaTZu{$s@gFG!%kif*q zz{tq1%Qy|Ra$o-$`vC@Kr5XO33`!2H91OzDGfwgxV9-zEbYNgsx)EZ*AU>mkQQWeD zk%fU#{~#wwaY2{^1Iqz6Vdf|ISsEBX;up9Y7?@d7HVNHuXJBMt~u>8 z@gEFK;&u(-z?jPiGE^bcfkEg1I|JhZP*MQt=jI3Lf8f9%&e6aiZ{NVcpl<{cQhJe9 z$-n^#;06Zyss;uYC686sL4Gqx;D-YPM*|Zm zVjCFb?}8GPM($4r@ed44jOubs;tZ?}%<}IJFt8{|2Y`HifKid3S$~;u!E-KF zCI;b*OKeOzs!U7_{0&@;`re?lq4c8A1Y|9L0~eD#2PhF_L}&?5IKUvxaDbJ$p9Q4N 
zD1?Wp9h~V)c^HHnHZln@YcPmN2{5rRF({@nGH@_`5o2d!P->{<1?_9<0ZnPjGJrg; zzla$W<_z@?3`{YiOb!e_iVPsz^zX4cFt9Ldg0m9?vx_PNgMyex^j8REqAdf1kpqLiJQpa3K6PMV z=HO#yV3kpDV9;*|6^26JK;gzHZ_mKUAiu+bfknCDy8{CwpFUFqgO39PgH{8B{s$gV zBI2%O;&5Q{ab#dna$r!=;d5Z%ZeSD^c3@yq;9)$#=;Oq|VCKLedc=W&X#>bTjAlA4 zKI%*d7<`-=7*re>R3bpj0Q4Bxm>7gr8yJ{{7=)NDuJAfAiN0}Q;64CKZp@Jk;NWp# zV9;w|5Kd^|XI2NP7t(29VDwdFW?+&w)XDf+PbI_~b~T z!r;rGVt0a}76Xd|HwzO3Ge0Pf;%gWf7`Pl5I9LuavoJC6G_dHavO6$va~LpiGw=t1 zdb3~*JMTHd3(RGJ(V+95O&~&GQV=xz31&eEfjtl!*LlxM0vq@lzVe!|n!t~HHlX8i z&k2yzp1Bz=pd9ur$iOTEKku2Lgn=J%+%wk)28xb(7A5_dX8}eA@G;Lj8QB()c+4}i z4byu=BhWF=o_U!iA&JG=WvrZ#Gn-Rcez0IUv)L1LM64_8Jx2aS1}FCGto#WKPTc%V z{3#4hEDsnbz>jKvz^G81$;iO}j8UmFAT!uBKG;m14SY~@H6woMEIf=CQI2Mw!^AAg$%A$}vpW+f|6)!?0atkz%gi9}RHVb14VhmstVf*6 z{E|UVAV`Oq`y~VDJX-cwpd@R={F(vJY0Rq`L8mdZ&tw1}!(6}sIfgkOatt%GJAcmB^wkQ8nyE}I3=bysxZjAsxq($n5i-x4l2zn z$;?f4Eh@?{(q#C+j{TTqS4M&OZqO0C$cH3b@d=t*@$sASF&g*<$A|J1<>V)W4oUXm zLpdawixc^f_hWIT(YPIJWRn`GjPK35;xidHCn?%ds~xGrKVHJ2S|# zS~BqS@MUooF!OgaWHDO`vQ1^=2OVp^iwSf%GJhqn0)LvIC(>EUtbBpYJkpGRKFn^3t2*06_ zAb+qB2U`{+zbXSezqt?>Z;ueOKRame@NIq%$g#)#lY|)9*E90TF)*HEBdSPjF5AOQI0nj zTAe$Mk_gSPxuBK~Fc#>5)8uVM;rf8 z1RZV6sKH>$tDplr+t>hnwlM>f5(5+XbYp?9ybPc#COp_c#}Gr0I2K42WpHL-V3;Mr zz{ty>%D@0Q=U6aB47Bf3g%vEhQi4HHkPjpv0uk6K!64`Y7Epi)?37>-ROMq3Wk_Oz zoqH@0Cd)8|fq@~68FYrR9mq}x1qOxN%o`Zlz$g7Gw6IDrh{%9Lyi^=~@Ue^o1B(@d zf*e}{gQ%hg1B0dN3@rOV z#|kS1^GPtUOan0mPf9R|sewC#3g7t+7=+a!ivJ5pFtFq?voSF!bPGx_h-reA2tJcw z5Z2-YZ9NhY;$Y;fU{)v+@?a3x)&w=e1Yb%p2nq3mWCg!VFo+8afkYKgG9@$cGBKpz z=Kvj5yn&JVHKRfew*doN0HbijM<#`*JQ57-0gOVN!3+vcd=emWp#$O!iVQ7aw(tQ1 z28E6MQ1w3<6&?vd#T{4_@&&3IhWV0|OHSg9_-ZS!aC?20H^51`iZbkYZZ~1_mBfVUS{* zazE9J>XOUxJ5MXj)039v6hIN4hXrfe7fq_N*7~@n>SaL8hd`J{v5MbQJqWVD)e4gy0 z572XJWm$3<8)P`Z7ZClJ0y^@TgCUQZzktD#;SU3^45K0!2Mb5310(pf+8qo6ESuO= zKR6072r%zrPz9Y;E5P)Mjr|4sNwp1(-=#G)728Ijl3_6S-IRqFaI2jmDa5FH>5N2SwAj81GV#2^YBawlj0_3<2 zEDQ`gco~>aFa~U3WMEh!3G%u42?2)S9ZU=i8wxlW#2FX_7=%|8uz)rY1~3R``~+>J 
zWO&Rd{X>93pkgT#a{@#30S*R+9Rdu(88=uM92kNlxEL4$4uVc26)zBA2ml?V8xhOG z8~`#({D%NTP=zo9Lqt9k;~%EP0A2@G&r~ z5MThgDQyQo1H%S>2e8})0fw{#0w8N4YJUhY1WO1qFbF8HF?;~4Fc4q}RuE!fkXX&e z08wKhzz}R8%)p?r875KyI--|RSRj*=L4hHfL6m{vhX8|`gy;f>00A)u1`b0OW&=hB z0S0jk0fs~aaRvqrad`&D0I-mU07HO<1OtPCDl0R@{0sqx#0W_Sh5$)<21bZ8!Ty(G zV2F@~DViX_kk}y2z)&F#S2RO_A$Woe14GAUPKetXRQ^3NXtvFs@(;U%(6=A&^#J zVPsHX5T5}~X8bJ-Y>FBh9UKgMxENM&d}b@qvNCi5FBN@i2j;6&M&+vx#SbgNGr34Jwq) z&XU8%0M3WP9tI4cLSQvJ=l~}W3v|e`09focJIfwec_5*{z!1&BPy@CUY#hrU(BV)l zKUf$*`=uBdIzcl@pdv8IM}9t-}YK$?%37>crC= zEGIx0LV!;ET)_%b$IZzw1?DFK27!X54D68ep+IdmZUzAhRH>dCl2n$r4 z3xJQx^$=hHS-u*4C^Lv906vx(WEwYvz>ic0W^jQEwUrTk6gD*2Z-C8$3W5DD47Kt$ zC+J{PkQt!r1+0ugnoD>Fgax|119DE52H0|_2?n54EbxJqk-32p$36Cje@3<4djpwmP^wt(zwU|^WQ2s$bb8f2gm z!3B&0KSUXr4Pb_X{SCU51ypArfEf?+BQ&xOFd&5-)TRRr!W9rZ1we;Rg46m0M&S&& z7^sX_U|`4wpQZ{83(!f57Z?OSgtCJC2nt^VupdET3C;irKSF{M8Z!_-LP8Qn2x?t6 zqcGGu0r2@yAnSHB2!oOqSgXK?udM6^OyE?C=52^Qpy-Ae5ArtDV+Y_KgQQlFsR!WZ zLlO~4Yyu-VlY(u6Wzq?Z!l1MT787`p3eHuaddvWHbUrw3LP8x9nkaq(wHg#aH#Z7o zure?+FoU8p0>om1Dua|c7LYVK0ena}$Qc3*Fe@i8f|C}+a)G66?4U58%Env(wj3Jl z0u12H5&#y1MzjDtqO-9iVQk?s0ZYhFU_^vGr~qMcU>10>lpPf1pkxh+a!`qY7UfvN z0umEYe}nx4i|7f+=?WUrC=szjhL7a}gD@yoA%PAGS8$-S`0#i~ne1;j`LNTNLe z2|!5jq85Q*b>Jc}fDx3TFp9tljKZLB1*>EE1Gc$B0bH;hU;t+du+7kLLGceHnV<*> zK-|O4ApAoBT7E-|Kaky^LKkcg%*F}e!w^AU0-b6PDcC^Hhj<%QFF-7Rp(6lNbbtYz z4(xd`734rn-L<|%)5HV=x03AaIanb}vaDalGP=p|<5Jd2W7=Kpp({F{s=+}R0aVd zP#p;I_lEp(6-tymT`$Y!GB%0?ie2 zY~W+~zyKlz7&s?q#DJLB0L~sC7!-am)-iy#Y=O>>hQ$8? 
zMus~AppXS;67d_LWWvbO0rDUyW+o_rj^AZa09gx)`2!3rJHRm_z`$|2Wq0xLrTNS3{T6=ePaR)z_T90x#b76nOA^*Di1 zdI-;cV)?)d(jLGFVnDp}0o>X`5)obz z0QNTMW==Mc^$ctbFg8nr6e!=kP+*0G8928TfNX;$!~aVA;URa)E&(ftBF^14{raDBpmpIR#eGfXM*{mIE+@Jir=R z3Sb5qfcdbx2HFM%-3h}8I^Y&+_yVwM21fB80^kt-AqMiIfdJT^1K=YXK%NCPW^}|r zdXU=~3_jwZG^W775WvWi0XkwAl%AK!!Ue?(!12UlAPdn7I@%G`$Xx?9N*v^70dVUT zWbKa&P-|HpfPDw*K!Q>(q+|xi7=w)j$aDu#QNt1;0Wua;szBQHDD?_zWdYXAG6(F& z0}SE<5IzU!J_LmsEPV`2;G_vIG{GqnTrorHTu{LbRxbR*1FU}nBTI!O#AfjqkU)l~ zF>rwe$^eiCAw(af2nD4@Na+E}f(4L*2vjmctOoTYz~vx|0W@?NPQZd!fPo%u0p=DUb$8i)TiU6hx^oyhvr(fMg)3rk=pa@&}xn zKqEz93t%k`Xp=GlTuVY*8aQkPc@!MEEOVf_hrt5u4zS0>H8y}cyeuccu9?8d@0UIzsu>f4KEnowct8g|0Bg+G3(3MdF3?K#4Tp&__fu%-)cV0>=?U3XCnlz+wQ)6%d91gKz}o z))Z*2gZAIS`H#bZh2a1Lhy-OP@X)~oMwUBDpwI&~ycJkLo}IwRAOH?iu%}pLltF@^ z&K4xMVY8D3eAGNkjWRT{9ArTZNN9qc3=K^KPz?-_p2Pu;F_r}kus8&b)`6^90Iu;s?f^3m zFo4uRyakcEz`$|`YAc5WKf?hA@eFWZpCy2mB|w11Kmb(in;5YefSNTRK>-GS$ceL- zA5=fGANU74sYJ~A(u&(LDez{FT(8W{3kh_PLQ!BCW;Jb+>2M{$LJEH~sCMHtcr z7z_m%%sD_;yR~yLe9->DuJn)Lh&&S;%XfZ;hU?J`3=KjI3=C44WAkKe8+QV>}@b8mks&<6sbG=U|xpfn7n4p+%m7OMpRvh0%h+IP5*Q0K?>u z><<4J4#+crPA2B$U=ZfvU{L@e|_$dj-Y=JdEv3 z3=E;~85r4_G#EJ+FsL)!SioSwpw8IA=)lhKou7lD@UQrS{|sN`U5r32l}86S^c@)% zJm$AD`yle;!$BVTBq;0;3@_1A7Of(no#+{%2^AXV}1aK;bbzL;L^#{0@&8_%oO`Z3y_r&hU@H zMTUV(0y9ei1Cs)?5yN9P14ad(uM8{& zN9;LR8QfVb7?fDpA{Z2y+1Mll8T>x7HzhEBWMOA`&d9)^X~D#x&dSgz;lse@Ucm5$ z^Ao#+9K#lQ24;41Rt63MAMmk-3U}T$FeLnCKk$d)4+En{$FCm@OahF_43Gb_GyG9t z&-#2!JKMY6YIfR7F86+4CqS=Mq zBbW>sgtiMX3H)wge0W2iK}?99wV_GEN1Q>NgNaFl(cs624-G6qU*s9sg$}TSYyh1K z2kIy=%N{n4a)g;7fQ6-lg~fw~rGZ75e+COn1`AsPi-LmO0!9%L1qNmY z7GY5V27wO;m=X@iGcbH;pUj}lD6)c;#ehZeBLf2?YXb|51&aV%12d>R`o_NCKjQ-i z<^)C_4;F#H2N(plFfcQGU=I56fn7-5fsvVmk*$GMfMb)X@&}F?pe8s=0~?C~BM%27 z%K}h_05KdGAtInt4^A*M2rybQe3;GwI>b+`#etQjgOvru;V}7P&mh3avH-*t{@cL9 zvx7~6K}>@s;mW&>>$VZmn;U8m(3` zI|BnlLjWtwbI|RIGOS$T3=M)CVm|U8`@?WUo`a2bIYUD|Lo_?uK_PkpD?nPGq5o)U<+hmVP{Z}U|`W;O8CH@@QmTi0!G2(6$~sEOalBb*bJC+%nmRmI31Cf 
z5N8QsyqIIg&HA0cfk`TcjdlA4#*L5I8~!mKkeA?K-EP2S$iPr|gF%2@033ya42%p4 zOo|Nd6$}g(3=AGj4_MjC85CF!#C&8|`*)xvfk}y-v4Y8vov~vD=;qY}3elh0lm0mz zk>}uGtzcDP@Y4WK9|$lcF|cU(ec@N&Fl1nuRKc*3jm@250rP<$cu0)DI<3<(B)ENtwICm4)AvL`4oEaWI>Sip23hLyRUgV8~sg`MdGgTO-twue{b zIXKwW85)?ReB@c%B^Z^=ITRQe8(0c1*fX%Ou>Ol+Q)FZ7)NqtxV3<6Cfw6(nxka8q zJ%GW1lYu>O27@672b%$dz{dyNj17zo3fztiEbQzvH7CfJGk<4nV9qvMz|7FVEv#R_ zpuomvCz!w>qt3$4xPU?6YXXj;z;=UEiG%q7(}FMj4<0jIuxDW7XtN0vh+$__ zpTVfa#-_m}An(8wTqDE4&ZfYm$X3+Bpv2C0fRWLGA^CL+Dz@s|jy#fQX1GnLOc7_BF9tj2oeun2v1`LlsIIuD-0O@0TZ=f>a z(F6_w-3CsF1Mv(32@HV(KJP_7{NS3#C?Jr|(7MEybIToA zz|OLQL*VTL76t}R#_b{uJ2<&kaDYxVWZ>hqX_05x!3i#|nQRU~*(?eCECP&d9E?mW zI9fh2GW=sa!p;D$0vHvz7#=V&J23LB;NVZ-=N0_Wz_P@Sfn89Xp@BKqYzGI+29Av% z85HC|?alvU$A9oR2#C2eGVmG-iXG=**!Y9N;g7-*W(GmAb^!*K1P((6F?In4h6ENt z#s+?d2u25n#|(lmm>HhhD>PIVFftzCcm5*JAbf*GfSrSZ;Q>Er4a)-1;GY2d0YL@k z>}XSg01E~Oeg+P|3;b*f48rOP3`z~^3JjVZstOG90t>jag~b*yvbqU z_JfJ#1tSXsHxHP>zkpj#UtswH?kpY-MkWT(CBXuWEI*i7I2idCaGS_-EZ`Op2;gLR zz$duq0ka{CK>H7VApuDvM;TUz=`VPdJ~AY*2?)(#Gf}wnZU&oz$Okn^(D`Ld3<>Nk z5)3>UO#BI~vHS^)%;M|`j7%JStO<<#39PQnV2M-~4o02?R-OzdmIPK70Y;Vz(D~c^ z39Pyd3S2IZSL7KAn0PXn1U@lvn!q_97O*)SkY^B)V6|bN5tG5hm%zwynvLxQJO2ST zSN;S>ekPFFjJ~W4Y|I5rYz=};4s47Jj0_7nm1XLJg7$jLVSPegZPiSBekPzT@W}guw*}&t#FD%aZfS-|pdH)f4 z1`z@F2Mx>}0S}n|KM-U{U}AZ|BrLjsoq<<~;RUC|2lfTF3z(GHMIUeq2z9D3u3%^4 zVC83E5tip>V9}J1V_@N7W{_eMVCBhRQ~Jo{z{}6T!YR+jz{17dz#%XCfYSzat9gSc zwdspP-C`q`;Q5dJGIQ zY7-u5FfmLJWmv$RLz{JnMFDJx&fRSH;kz1Z$fssd?UxATJg9&6xodTl(s{tF2 z1{>o7F@6SqMrI8rJ_io272=?485sDv85c0{Gw{o@XfP>#WKNLdv0!J>U{d(R?7*kO z&tSmBV!_U0z|6mZlTW@pL5|z-1N(sm{EDIiGx!)5uyJZIDZOP*kg*h!Xpdk4wPO_+ z1q9}SU7{w>qQJyOW@=ZVp+fp@>T*Tqr6-KE3fJi;RoVM>>@kFm4qY&1Pwp3CnPW{Fi0?n zISI%J2)y8!!2EwYBLjy(%(n(%Mg=(qj(i0xzM405~-2gDf|7}*af3H(?g z#&tj`L7qXKv7Nm_(NGQ4`E`(FY-bfYaDh5?Vhbb<*d?ZK5I2Yx7s*gIc6l%6Ah%#6gMt{t3wZ`9kpylB*Y_$S zFBliNzGq-yC}&_;@W_Epf$@q1TM_86JBP;%7P1CD%#s}v0zwY5j0c36!ZZUtEZCXZ zZ%8@uGc;H(5E4Ahz^(FMML~tZJ5*u9BLRkh141{V{~nNXJi^4l-=t^k@QArWKtV=j 
z1B(LV+Xe;&2GB111q@4md|*)d{{KHe1B(WOfT4qm!-04fnFCTL3=D5H7?jv$3^ z*ccn67-y)kaIgzA81OPWFp3{}AjT}gz~aFy^szyjKY^D=l$n8rg+r8ufrU|lgIz() zkc}Z-Lz+cH+2Jp{!XF0_mK$usTpB#W+y@w05=4bLZZHaSzF=fkU=?CaU|^9@=5b&X zewF~b-i$ASmpg$|QKozb4`^h)fstVW0}}%iXr-rz3X20HO9cZ<1_O%&8w&#~&kQ~m z16D=`VX+eqVk{PnECJ$-4JtNY z#AX~|;PGJPPY~lZ{K&7=z^ue96QKBj<-_rx|NrwdGRQ1vY-oBQHQ~TNhZcDTW@&R) zN23@v1`f~=p5p<=8_@}@K}VPw7z8iyO=SL1{{Pm0env?Jb(IMV7KjN*7)U!YOjy96 zq3*bVne8}e{1nvt-N76@VF3%9I|l>90?N6Ft8n9 zW}d;Ep!S{rfbfk4VvG}*1wJ{j2z*IUWL&^tz$mBI!K}pi)`Nkefv2nC9|P#f3bh%` z!g3Rs84{QzlpGX=p+(np2`oIGGP9v>MO{8!LqxWUBmLxW`kGiZC>19d_3 z1#BRt6PTGCgcuGmaLr(5J-}eVc;F9%G)UhG7J3rZE~>AU{;i4 zpTVre%rt@7@Eg0*1F+%=%mTj;h}axqW&qpB(!dNdVgl&?cnE_*fmxJMfjQ9OF}slY z3U(I;hKAw>cF?ezFmnUDC4;bf1G@q<^8#iA4-0k+HpT{ahDK%v2Pysrc4c{$26h<% zm3#({26l!O?EDSv94srC7!{aoj>t3I&}IqX;cs9!5^jO9U54p#vks1AS(&CWnuV2mU#*u}om*Z(w5LcMxI}5-(t9Il##8 zpcu>2!OZkPTE(6LtQ1s8Okf7x4(Gtg*uX7z;{gk3z>b5F{{fc}gM*TgxCJ{)2eT5h z_yT5uuL+7KGvq;bM}PzkKe9J47_mUiNdRRRo(@nlgorRb-~wC1(!uP+%;3%lI*)~w z=>|U=g8^u=gTa7Ffq_XvK*{C3hJdh(g8-WX15<V& z<}--@5MbFMAT0cWZ}LZXg?|D^hWKtP=FfdGpKuYv%d0UOf;0X4=84_KA>_&j)7G#FSO z2rwBius;wG61c$1w}6GAos-MKlp#TwKCbtc(rJoDWn5!*?*VE?{AJV9f7e z%E79@t018MKvg613%LSW|j;N)&mUu4g7-q4xEfaVkdYQ7cep?FtRp?GIX%Ay%6MS-~v^g z8(8=m_>9;e@PI1I2`nrLTp-2{0hSG*o9cOX2(V-rvovsm#1AmAykOyP5as87pd=-h z!D;xBf58DZC1$Y=%sd&K0?!&aZ8pe*%IgeH76(qAA3Q7>oGc0q5C)?H1A~JU_YWQ> z4`ySB#~&I*SqeCrUa+%efbM=WIU>(cz{#Y+!3NgCQozZR!O6nF$<)BX&%kNV$RH|k z-~l(&1S>`kCIjZ|=m16r2QGoX3LH!h90IHi94s0vECwnJ4C+n{Z{!(%NEqivvt}?0 zvrG_Ve4s3NLjhEDI`A`e{LzpWW@%s+`0pSs(6E4+L4boV!HVBOT7jWKjE%vLk%3>7 z{QxIV1E<4BCWSf%X{H87wi}#GD;U`ha55bLvtKZR^6(8VrI7b8xY#(P85y{YTI?Al zxIryVZsU;mVgihe33kq3=gOIV;lP~hPLM&}wC z2BrvUVetmgkhG=JTYd&MLw06&2T=!phK8yWjKVSn(u@sEi8tgKq!_L*U=)^g04+k` zVoKofw`X9N1YO>=K#)IyBVMpwgOfjj!<>nMjpqalKLeWxUjm1SWP=XV0~OFNn8XR( zEew+Bj0}wuF$#hV4Ll4F)b*J~6F3+RZ$`g)!0jZ*z$Wm6eG^+T{@kE%YmPPL+=G6V*?w*0TxCH27$~Q5{xHgm@^}B7K}>FQWDID3{vJC%#03# 
z0XO71q@)grdojo`3vn4FIJ4<^(IM~1oja?Z3Ffm*ZW_kePw|wA#@L%DNJcEG?(+g&{ z6P)}9I2HICgqc_@IGH%O*-mh>98h7h;O1W-%)op=h2;V#&k0V(1)S^)gau_@Ffbh8 zWJ*wFSs*MdA;BpuzJpU(Yy+q8-vz?LqBDerMRsrs3vb{AUF|8fgHu@WhKjI21Lx## z><=C@Y_VrxYTy)R*ukm9&d?yvpuo$}VDG@sz@T}7U0CJ@pN!N3PUa8F{0yw+g3C8B z@h{+PV?M#mzJOCuZ~-T%Y0SpEfKy6r1{=!+HitLt3f~mWSQc;!vCQBUX;{I@yMR-a zWdWxE+XY5O2M&e>oQx6-d={_uW;k!;V4Kds z!K@I?!mcb~x^ z$e!V-z~J}5QNiI%gNvYf0R!j;Uqb;uP7Vem1HX?7N(vHO0t^An8h-4o8qNwD(FP0# z9rX+goEZ}w1z80ct|>%waJ4fmU^x)O!m=4uSc@pKryp=s;$pFIP~zm+;3)9Lfg$mT zJO?{N`vwNk9dkb%9n3k|nH(G)zOXa=PtY}FXK&xZprpi2U zOmJp+pvtyDQHg`Yfx&>4RrHFmpv(cMpd<1O`V0b2it5ZO7#yRSS<@dl3Vc^!PW&U! zz`{}{z-Y+A%e_(fDf4~JA zhn*m(z{;Lk?%?yDpJ8HH!;cRiAd1||)w02?Jn4+WElJzof!mT zSUX&m*cdcGlZOHh4}=&Qn4NOWSeYZ3m6(||m>3_p2$8K~@X?(t$iCn? zLx~Lo!w(i=js^~f0LB8)9M>oQ1rMB!IG7oBuoyCPnXh0`;$mLG!r;J?1e$STWj^3x z$i}=p!VNSMD$MBMVR$57LV+QXNh6w-HJ!mtfNh2Aqk4wukL+##4$J^uoz-puTFIxN z5Us$#kio*xz{S9zYsAdPZowe1O^3z|T>>fmJ#B*#Q>=_79v5 zjE;=#43!xS0t^=z4B2PIJdkBbU{GY|{K0I<&%m_6ge3vg;9_V1-A2vB!Jrf?=n!DY z%8=dwn&D_*WODE_&W`@zAS=r7!&R^mBmkOd{V4w6fU7X`4+fSOj0_pfj0;#pA3SIH zQ_sN2$l%7ff`LJRiN8TXpS6LB`2rKq0|tHuMqZW%CPP*ha|T8x4oy*w1~0}2CWZz! z1_^J*A8t$yO#BBJOi@o0`zS-gX=28!4T|*`4lVr4m6}(gkeQc~TA`3xtdL(?qL81a zkd$AVm!iP%l^y$8Vl`_76}lN5K!=Dquld4|nV6GVm71bZo>`Kikf;DYMC=1Q_Cv&y z7M7)?>WBFG=>M7{svnQ{@Gu5ph9u;(!yN1s3^dglKFgDObeKVDUa}qoqZosMPiCGN z!^IRXm(0ZUy!_&l%;aMDxnT~VW5diCKQSpJrlb_578ffd=A|f9r55F*92>?E$M_8A zp|IFi5hfffjVlWn}>GPBnSh z>Bzvq;KIhh~sL5{t8;lRL9(gj+? 
zrp_RsxRQqfyu)1J8#gz|OyK}YMwTCm3=RzPN}$!4SFW>xgcTHY#X&Z+RDv!flUD{w zJ#%LVNeQc*?h_IL^#vH@LCs>pP3jzs|2sLE7z8Zl`!V-)gRF93V6f}uabQ#s=aOUK zb6^k-a9|K;X<%RwX5w*RU_1!QObiMkhG0utSQ(f>r}PN&I55Z`a$sN)z5zO_i;>|n zBglt_3Ct|&Osq@{0uG^x%qE~M(x8F{EGb_AQl8!}G+=swnWch(QI02`fq`Kg=q_#nj$9T{RA1l&op;5=;K0n| zz@TpkI{ZoLhO-5OQUe=ka+3in;lSy@z^oMEWx*ieU~F#;WV1Qrnk zonNL<5o*W47Lf`H5CP&1*3=Hvt3{@fwmW<2{px!rw*bxQ} zCI&?%MhON67Di4c2Bn55eg*-DSO#VX1|M0-NmX&66RZRdL}@WFf#&sn6d6FrqAAT{ zRb~)mdlApZ#2|3slL9C^ten7boWaMOfkDcFL8Xe_fq_96bhMMwfl3|*mWas=4h%jP z3=A3$4Ep`xuy*JMo#qSLC+=g(z#!qkpg)__fq{jgJ`|K^4uC^W-im>dLH>*b1B(K~ zR5J#)3zHcd7<{Z57-Sn5^tW&|Ffc0|m}kep_5)-*_*6TU`Jh9uF0n8%F(^4K=3rn^ zm;$odmVtq%fkBA>04uKplK?}0B!iU_I|GxC9RtW$qG6!3t{4PY*fVe*VDhnNVBmIO z5M2$rjFN$ci9zYWYAyzj21a>@21W*6yvzz`({+$RNUE$-vCSpdZQxJ)6pb1#&hOqcpR12@eB9 zFX+@~s%r^i2EATQf zgxmc4@4(BzkYW4pKWNi+lkLC%YlImX7TNy$e?pjn;gIdW|4)P&7#`XF`_CZ4z`$hp z@4t-*1A~*@zyA>;3=C~{|NfVVFfeSl`}co}2m`|>yMO;Th%hjy*#GCzyAk#85oXx{`>!cmx1A~=fD39d<+a9JpcVy;A3E5^7{ARfscVf!t3Av z1U?1^ZLfd-7w|DKSb6{Ze}a#J;h6Wo|1Uu5z5o5^;Adb^^7;2)gP(!H(dXZP4}J!Q z3BLdSXYeyH?D75gzlEQHVWr={|7-Xe7_RyK`+tU?fq}vQ-~TuK3=Dey|Net6le*>q z@4txv0|Q6EzyBcu3=CTW{{623oqrSf@Bb1|dq42s|04nn40l5R{eL6Cz)&9c@4tv3 z1A|cbzyCIZ3=BQt|Nf^4GBC^z|M$N|kbz-e_`m;41Q{4^g#Y`0M38~uU--ZOPXrkl zbRz!!XAxpxaEkc%Uqy(4Au;0Le;3fA*oc4sQ-l~8-bDWU-y_7ppcD1){~93%2A`;Z z|IY|9Fyuu2`~OCWfuSer-+vxq28K0J|NiR;Gcc${|NHMF%)sCl{qKK{Fatwc^uPZ- z!VC;WG5`L{h%hkRi~0Bej0gk6;rM_5|A;X#tWWs&Uj}q?Lejtg4&n?9rAhz(Cx|mJ zJWTrczd@XVfj{}*{{`X<3{lDd{)47PbCUo4e<04lFfaMvf6$ijFUkM@Ye+CKc%}UN z?<2v$kdgB5e~ttLLr2QL|2+~63@cLp{a+)&!0;gD-~S5|3=CYU|NeiFU|@(z{r6u) zl7XQu_1}LJNd|^FssH|mNHQ>dNd5P}M3R9)F74m{8IlYPerf;y?~r6*Sd{kf{|!k7 zh6`!`{{N9=U|>l9_g_Yefq^aK-+vn^1_rl`fBze#7#Qwn{QJK^ih-da^WXm?QVa|W zGXMR5BE`UPAoJgU7HI|syR3izb)*>>+Oq!r50GYHxRmwpe}OaugHrat{}ZGc7|OE$ z{of$Xz)+O)@BbBP1_puLfB%0-GcYX5{r6u+hJm3d@85q183u+0dH?<=$S^Qm&;R$I zLzaPoz2M(}4Os>T^MZf>J!Bae{0jd4&yZzcNGSOCzeARRVQA(L0atsVbrT_jH$T2WXF8%j^f*b>b 
zLD|3mJLDJ`9LoOvzahuK5K#8-{|`9^hJ9uK{>y+CQKgz3mr-V5c-;8!zl|~jgId$S|1ru841P`j{?{loFr+m7`#(pSfuX1A-~T%adpDhv$4ZU6q;s4y@vw*UJdqr$-8(*Ey%jS2%pPW!+A z3se{wcDMige?WzSL9FZF{}(C@4EMVJ{TEPWVBqTh_uoL3fkCGG-~Rwr28O)ufB!2~ z85nxH|NWn#%D}Ly```Z^stgP#y8r#Zp~}E;ulwKsAF2!tT0Q^%OQj@{O`Yw1_Q&M$^ZVxXfQA=o$~L0iv|Nj$kc!TmuN6BteN`n z{}BxahC@^T{ePmtz@Rhj-+vZO28OU{|Ng6JGBET^`}f~PlY!ycw159oG#MD=r~muk zqRGJEGyUKHC7KKjWz+xtKcdON&@uhr|0kLZ46~;H`_H1qz`!@--+vV?1_qTG|NgsZ zF)&=3@$Y|%76XIh%zyt|v=|sZ&HVR&i53IHidp~uAJJl9V4eT({~Ij^28sFq{_|)v zFszyX@4taI0|U>3fByrt85rai{QF;^&A?Ex;NSl#+6)ZS7X16aMVo&!Ee|@MzJ${|dSc3|x!<{ddr1V3@J^-~Sw41_p*D z|Nc+VWnj=+^6&o!T?U2)OaA@8qRYVWV#&Y%UvwE5_AUMQUqp|A;la{>|4sB57`T@G z`yT>YeY5P}{}Me02EAqf{!h_kU~pLW@BbD(28M)X|NdXmV_;}l_V51}JqCs)%m4jX z&}U$fTJi6{hdu*?(TacnGxQl46j%QH-=ojKP`CEq|1J6q40G20`+r5BfnmeifB(Pe zGcY__`|rPq0Rw~Nx_|#o3>X-^*Zun+V!*&qvF_jh5(5T??sfnEPcdL%c(Li<|2+l_ z46fV%{eNP>z_4idzyCaj3=BDY{{1&GWMEja=imPjLk5PMd;a||F=Sv6-uv(W6hj6E zhrR#)Z!u(GsM!1O{}n?9hNXM|{r_Ufz;Jx;zyBge3=B{A{`+rY#K6F@@8ACrBL)Vk zegFQK7%?zd?ECkBiV*`t(!PKHw-_-ntl9VP{}m$!hEx0g{r_Ub!0>M0zyBh}3=F*c z|NS>HW?*pG|L=c@F#|);{(t{Vj2RdT_W%1o#h8JiY5%|fTZ|bPrtJUs|B5jK!>0ZJ z{(muMV7RdV-+vJk28LfC{U!_yEC>JnPcUI%P&)YUe}f4FgUP{v{}-4rFjOD>_y2$i z14G}zfBzqtFfgn+`0qc1DFef)ga7_3m@+W@IQZ|sgDC?8*P(y^6HFNxst*18-(t$Z zu;$Re|4U367*vn^`+vlgf#JiEfB)Z@GBEHR{r8{8jDf-C=)eCuW(*8PNB{lzF=Jpj zdGz1^95V)n$;bZvpJK+q5O@6F|2<|54Bf~7{l8KgXPbLHWeL|9i|C7@SW0`+vurfg$L` zzyE*C85ojI{QECs!N4%%#J~SG77PrlPW<~HW5K|1=ET4MH5LpE_dxnB7#RM5^jk16 z1fBf%|A7SqL(0j2|3T+`tvdPdzk($L!3{#vSTZoQoc{O!jU@xanbZIN^H?!3+&lg6zm63H!=Kat{`*)lFz}uE z_dmyqfkEfYzyCc}3=DQ>{{3HL#lW!S%)kE^tQZ*focZ_v14#XufByxn85p$A{`+rW z&A{;Y?7#mJ)(i|h=l=b#ux4P$J@@bb3~L64hI9Y^@33ZIm~`&n{~Oi}4A;*6`~Sn5 zf#J)!fBz+H7#P^j|NC!Y!@%Hi{@?!y8wQ55^Z))=*f213od5TKh7AM5uJix?@33KD zIDh`%{~I<84DZhW`~Sm+fx+a$zyA`p3=DY}{{6SGWnk#J@b7~Fs!=x@4tl|1H+e#|NckVF)&zN z`uD%Wj)9^6(!c*R>=+n2uKfFdz>a}o;|4N{|o~I1Hb;i|AL@Clp|C> zGY=z30Z9H0NS^_2^LZE<7!Fwf`!5Sp2eUr}q?-Y3zYZew=j0_By 
ztpEK772ymp{V53f97YC)`_}*diy`axMacIsGB8}V{`X%PS>6jFzlM>4;e_?S|3b*} zo(TCfj0_CB@Rz~5J z!0^E4-+v=y`AmfT7A6LUPd5MlgYIyGn;(Xdzrw`8@X_Yqe}7OqfSJ$i26iCms-iDU z3=9f(|NiSBo6i6`5s`s`gOP#3!0zAwx1a?E>p_(mG#$A%Gc(;s*kQrU!0;8_jAVp- z1TzBzlRa)fRxmR#@YrKdE1+-&bt6A85$Pz~SHjFp$6B=%d6flLx{+0~Q8`IER1#MUlhK2OHN)VcRCH?uH-b15k8ePCf=P;mSAAJh$pryVzleIR+zVT$~2|Nbi@ z`!5P1Z@|jHAmoO-3<+RmVBm54_aD@(hMOOY&|d)6kE@KBzzT66t}+62z?_&H?zDe_ zm4QLj4O(Wy(taAmeo(l-U}j)Yar^f_nvsElWuYEQJTv=(vj{j3O0Y37M7aO^4_ZY5 z_eVLz98fq}fDW2;$L)>?0`e6E^Msv&;gROQ|1QXS86b5UXiE^6*1!LT z$Z{FrUxtc z1t5Pla4;}vX#M*S8fIaD)!9gO?g9=521PC0`3ZF1C`i8`$bB&V(DDdm{sRsM21%`d z|I8OT$&jG@07?h(TL1p<1LcS3+Nkc_0FG{Oe$e1#VEC$yJKuY7GBCW?#_i4w zsD3$QcZ7n20c?H;Cj-N0?SKC}Q0lCDumeE;Si#A_5Uu;~Kls`LxIY}h27u&Ga56Bo z>i+v5jH2HQuKxum1H%rzfB!*kH+Xu^25Se~&%wpOa7GWeehn@LhX4Aw^?PtJFbEs` z`+pVLe@th<7J%HB!NtIE!T_z#g(P3F{T*Bk4EsUjxG4G`gY|>**a|KNhHlS)|0|LG z2Wod@gS`kg;|v!A!vn8>|9w%+Z~>bEa>E-g1_oYl?0EF^Mhk-%J=ih(OTpT=2 zLi`F&*dTu_;9+2Jz+nbR{s0dHgO?BX_6SJ+0aQP>J_E=NKe!keN_-&gFcw}Ue;~En zRd^W~>U^-*sUSOCco`V_e6aURKz77%GcX*2*|Am~5mrccbnr4ToWWzq3SI_=J3jyZ zvmvK{roZ681i9e^F9X9NU$pWcT3>_Q@Pe0tp+xuJ|8P)Tz{*KbTMMcG!NbSEu+R73 z|Ct~)@G@os*ba~xI(!Ta=6?VFgVuAwWIA*%tI(*jZfK97ZA1wR8rUEsg}=h4+1gs1`e2Q;^& z6@pnw0(L3N$2DoWTgYa+@~5dj8<`P%>f^MUMwr4^)pCTOmz7mqw> z4y#%l8g4NCO#Wbpg4|gmz`(E`kNznF3=GGE{{1%q*$<1Cbg*`C8rvekz;GRp{1pKP zhUY>5{);2)_e1FaBEZ0KF9>@%4GJe7eg=jMLI3{8g8DFeDu{3b^Xq$;SvXFnkZgEng$Z!0-}}{2W0BhC5;Z{yQQ2!?l^Ygom92Wd0t|++G;&GX9Pr z1H&IY@_z&w7=DHQ`>%*>egq_(K;ZyN2fxGq{htfUW0A^;Z~&!Eq&yFrLo5i#-KRmAaRG7zhcE*}Rm8vl{>b`4{sh-Vkg)+3K?a6p5&!;2fbvhA5=wY5 zLz-IP^cf<|z;G?{-+xWyvYJ@|oWVeDDG_F1co6mPKWKamUXC&a!`(VXn1Mki8n^x} zpt;u=-1@HwGcY8@;I0$D2s1E*$KY-UiHI;TEQ$H|e+P0{Fs+BX&qRcQ!6p{_I29-? 
zLqr%D^ke`1??u)R3QMH2xk7}2VW&FoGG>Md1H%e+XnPt~h9Hd*?GRyLh``|nP#E73 zVPNoy#hoU9K>dJi902482~h?H?bv_+ThRSb1`Y;rJllveF!aP?uj@eh2b2y#@~NP7 z?WBN`UYNom=78*~5oTaG6$@zxv8W-bgATla{IdddAweu!AFdi=21x#dC0->bAZ-m#9=QBK=K-53=Fby|NevS5P_#z*Jfr=j~gWKA;!R96Zh{w z$lq}JXoUMR#26SH;&7+Q4lxD>vpDSiG?4i##26TA;{N>yU26k3-?f>!6j~OZ5MyAN zABVfFd?Ciba47EIe|6;ej6=AOL!5!(R2=THSq*Uph68c`{%auXk4NbD5NBXG5%=%^ ze3U#t4W8#S#2FYo67cw6oPohE;opA=6#e{g{VT*77|as?{r`zEHi#DDq3;@-M_080IFSg%Nom;o4xsQ+kz!zIO2eLZLFJc=6azy=+Q0wPkj;mL zaT~}MkQo_L3=FT*u-D(9Jk=n-6F-n zur338It1BuMT&vpUIzB~1iAN(1OvmJ49M6#3mcMM(E0_WMnal_;ZerF|Mi%D$_M!g zqz1H>$S(8We=AV>IwXZ~GpJC6j;|$1Gcbf?{`;?ilCR~#o&mWPbpK0C7WQ%%l!j+W zGBCtt{rmq4WJeN`9gy(71#%0>oDjg3l4A*l0{m(=WS0>1y07(A? z83qQ0Tu9u&%Xw(LfaEvGFfeH4;!ew;dyO1&|NW0bF&{FC2-5#ShJm3f_uqfeHB@l( zq5VpbynrkN!{l7ZcmP};(hLQsJp)+=h6Q-s7a+^PuqqdOnFKPwK$d}F4j%alvJ4D; zx&QtbqqrYDeF3%~w65z>F7EIJt@pZ-i#z^6>%h|T{{7EKF&{EP0CFE_ec0B#fB!e2 z$S(&a53v29bz^S%|Nd`7kzWgz2cfZCAY5(P58v`3DCp`_&B{{$4bKpIvczub{yVAz7kjz4k?49iN; z(hA59$S@k%zY6jU40lSfmn9%O9OM}oZsCznkY`}HfkPhT&jxu0hHE8|F+UW4LWUv0 zZdfAEz`#+8Jx_qbe2**xgHY+e|64$Ld#V^B%t3xx296rAId9|{80MAY_6v^!1H+tB z-1UGCXpLn z9Fb#SI8+Mhcd*PCMTCPVqz++%wwFQS{zHL*L8a{9|8(TE0}8K5M0m+4GB9M6VGnnZ z88(Ux3`J%C{+FSf0i8bqxj8|Rf#FEmzyF}k0dP0hA?#>SWMFtxhI`z6fg%ILqVj+L z!HX(jAKnz;LMI-+#~%L-05Oneh-3FCe=F6d4$LE1~6BKayQ&<(Pvq z1H++O+<7WNnStRzE%v@LD0~`}85p+IVxJQL$uCf5U^rI$?>}gq9qv}7G4=z>3=DVi z=zpNh!0^5ncYOw$WdDQ1eV}lVP=e?OkA>|LLc|YJIDpiF(t%17G`x=DR+pi~z!2F6 zRd)fmx(P}Q41Hby{+|MsX}3|;G3|uJ6DWK@>t_wR|NXZ?&V$7W`3EWt3_ji1+npf& z45|zavE7h196WrH+HVS~3=HMnxceXustgP*-DvUS+RU5@?UyE~GBC{P#vT3*stgP( zyK(351*!}TtGjWhuLA`1KTu_0SksLs|En=D%)?{8f*QpAxYp`8s4*}s>&87VnV`nN zZ~>3`4WM<%-MH6VEKp-$xYUiid^w=Tz;Lk}cm8>x#=!6dpa0bv7_Q-QpMp9A!xcRG z9n={Zj^U9{P-kG+fyev?bq0oQc;pwTGcauF#$LXI%1_Wb@O|C?{&RxL&*S`v{0Xb` zz9aI&8+8VTIX$@35|0K0!>k_MX+=kafuW}dcU|VA!N9N?kNzAD28LyL?C;TFU|51j zevJkL!=fJCV_0W27#KF-upd%pC#3_z2ay=e2$ z%mv{79H<-wt(%YRgO*$2e2Dk}jcp*6TNRoN41fA?$Hfdy28LgKxc$0AlYxPwA9p|K 
zh9(1pTtD`H8pxd=)EO8;`=RbU$cxpTAaxQN3=H34>Q-Y{2MRY2Ee3{J6L6Pv8Cnbs z%O~JYTOC>q3^OO-4v!UD3=Grp=s%&wz|e(9{)H9;Lk%8z4s8a8ya~ALX$@@#hD1E> z^U!8sh{q$Jq0PV$Hv#u}bcZ$rgC8FG71|68b`x;N_X%wVh7dgdd!fz1;5GqwJ;R~H zz~F&LzlIJ2gB1?{f%2Az4g-V51pNJ&932LREIfAf=rAzEO!)WT4LN_-BFe8dIt&a= z6L6P>puHVE6aM`_4Qew8awF0-tSm&ze?N2>7<4A$9y62BWngfchLYIM|XyU*B zptV=;$G~s~PZ-tc zF)$pNh+g1H-0CxaV$K^cfhgO~M_Q zOY|8SuHZ5Mh&}_uu}NriAC*Y{*JogOF$sG(g8au~z`*bgkNGN~Jw20f*NH9$3=F>} zVILm`nV(|7!0;ZA|5^+f7~W07omZ9^FffQr#@$9gV!*(lHyL;Mf%YKjOvas8SqvE% zR3_uDyHpGr7>p+4S^s0mz~F>Oe~KXkLjWHAErtvXkGOEJTU!F!8#Vdge@~S3=>U#k zQ1~7(WMIg`BmcyZfnmzzfB#*P_1A#4gUc!wBL;?DIOIY3OvZqL;r`@*|2aYPI@eec z`3%$t`~hzIfVzX^YYF&&O7#_^{ z_a8LR%izW*(9h(_C(+03%BRrF;>xGd!|K9k(8lJ$XVJ{=&gYQFXW__a;K--p#HZlI zC*i~=;KauP-Y?T*%)r1g^WT3~P&`2~Xz$Dt(1w$l(DDZ={{KItKsDnjmNK$ly8ze7UX2QU*gOT+=Bg1J%)>n)S zGobF$?*w^i5hLqH4u*}4tTQq~ZqHg?uV4u-QF5c7B~ zSs567SPK}yTL1rN1X&CF?;(hJHqf2pN9lvu9;s$YAYeWLN?UAFwB9%QA{;ffnvC zN}gt9yvL~cpOIlA$g7~qG6shKvqc$s)LHcyLs%mjvROe#$beJ-|No3+IAG9f0mo3d z+;m2U&7hD0`CFDz)R2_{l&&{3GTvq6dd|r36C?)>m;a1tdj1n9!rj5h+Rw3gs4+DceH^dYe zKM;+dfX2^*@>$Mq3=H5s4-5>T!>z$0F!2{qkHXyj0DATUCs-9^-zta+ z-Y>?$0NNG`;xhdH4_PY22o?GV<#RyI`w!(qjbr!#Liyk`6~HD^m#~25 zA9Pa>dfYJ}3^am)-02-TMU|RC50Hq6{ zbOV&00Hqf|=?zf&0F=G}r5`})4^Wx`yrGqWK>$iCKxqRg?Es|%pfu>*9#C2-fbv1- z_kj2lpnMRUmY5>}mV+U-Y;;a?1BoPjgzzsUfp`oIAaPJx*AWRKA#xzn-PzenK_fJ& zG_Ryo!O%d@NY6mm(6|)LH`IiRfE41v%nXd6v#2ptFf%YQfX=l*mtbaKW&oXMfho?y zPy;KVkR+KISV6ZsA&Eek%nWSsau6Z{A(?LSQB{ z13$wCEa?Yykv-`ABd{VQf|)^(0apJY2{SM=2r+;z>O>X;v6&f!84f_}Ll7T^nHfYF zVCe=X2BMi6L>bW24MbdwLE#^2dV+|HGtA(Eq&pBFhM5^87y|6DiAyppfYygFwIG_A zK?+_^g7`4Z%plEh0IPZ#25jXwGlML=zJr+oqL~@w7#Ps<4@6v^K?19|0z(5UXsCDtns^>m z9KD?h+IPgrpun&JS{{Jh2eNkp185iwGn}_W&0&DHV?k;`_%K*KW_&%u;eOB=Ng#U< zK<$OK7eM=tK^Hr5Ld}P@ixe3_?m?taP_-XL=@qv%_7h?De6^E%;hKe&hgt!A1zqU|u z4m9y7sJH-{csW#D0xAx(cQsU822K1vR9pc~TpB#AE5x9JCJvg8U}O+t(140Vo4X9j zQ1u2-ahSaoP;nD9@iwTq1)BI|sJIQ9_zG4?`gE~{gb%ct%kUK{9t#y$gGhr;c4q+P 
zix#Lj_#6@j28L;D5OaP&`;*{(_Y4dSYuG_)K!HI38jrB>zr+qvzZ1N{4v~I7fYl2# zC_vi>;PZGG7#LvrR07Q$Rt|_cm!ams+^Gf?{|Oa`weunQ3$woP=YY6V0nL0CPKf#b z;6)k?pktH4ArTH0k3|z-1{KeSii6K#U|?W)#tAWB1I>IlZix9WK}UcyFff46GXeF_ zcp&2Yq2_@1rZX@wMDRkyZNM7?A!pNp90WSQ8B~rLpqambAELehsvc(kF9C@7VQ2vc ztIi(@LfmPAW{!{$ME!TDdhod?3=9lbP;uBe0{DzE1_p*msQ6hl_n(7`-+_w5;;U2` zVm`V!=v-xxzZ}rqvr`1(&Hyy=1W|~%2bwsm7(_e*O?;IY#9k)o_y_nb6$S=|TTpS> zxCks?;Hn2f=RJe2GV#IEe!y1Va)CRpnC%l;afo{yp#635*>?;K43m06;+X9>70`9M z44CZ-U9kC*80`}~s5p9iCJ<~6W;-NZoPj}!L4ctQyfKA=0lcS47?GF0o3vXi*JXD!^Yjf=cs|kD`X(r-^}q;iVWTT?sHO zg0?GQ?az9sdMh;Z??S~7pt+M#9%B9sX!?PLgMmE6{OjNa^b8D8YZw}#>Ni8fAL=ZI zg-~&QXvBigLSkTGcn>v499ltv&#?l9vjWI`h@X(jtqKtLM1eP0GBAM8=VD-B_@KbR zAiyBc;DOd||Bk~PPeq71lRyW?FfcHH&u(I1V90`sYb8T$1)nj*z`*bcDn6|bA`Z2l z!9^Kjjtpq=Ad)%LpyE@Z5duDcje&vTiZXV8J;NbxsRA)SE(2mO=-e6*D+nsSVk(G- zFtY_JUIsqkj)4JuRs;hB186@3sQfvFmf!VNA?BNPK+FN{tp>4Dq2l+T83NWW?SzUa z>;|bqDo>!|51{EE8mtVLq2f2dhyNhT6LmF^n*|t>py>x%9WvyA&KqYCU_j4bbD`q9 zp#Fs>F9u6>i2DyLg184-{V?=YgU-Lf?Dus+)oVc0KdfH3st$4w#0+He8&o}P93EC~ zIBG!b)q<7-Fmw7f7#IXWB|5bI3~Oi3gR1A-3o#v5Zk*J>Ztr`j`ZN0>>S5)8h$h6| zf5%Y8qoCrw(DD=Jo&}oN&EE%Ap8z^k7j*tD#5e{XEr|I(paVS_7(mxq2jNg@d7?alYxPuUmKwp><}pN z0IL1~G$TTrWem1D5cBUr{RJEDRMCZqYeCaLEPP^hv728FRi6a)7p&agi$gt+9>n}* z(0mF$JBopUK@%#DUY{01#iK4jJOryZFF?hoLmRxX`c70IWIjYKGHD1F7hte~`U`Ys z0Z2YUAG^O+LDe712B~6TfKD4Re20p|=AEF`KSP`W$X)@41<-sAOSjoj@iox$0Tw=8 zQ1MgH`VM(-Xy9t^fVddLesCXWD^CtrX=u9AxPSAKZ$Q(6jg$At-8EzOu%t>hlsbXLNpF_gH zz)%f3N1j1|!KVo#4x9fu3l+Co1`!80QyCZ-Doi2foPf5MVC_IJGf+4~)FP9mW)Sty zq4fx?{@M%`e+6x4!s^G%P;vC~|07g91X_;4%4bJ&i23N^6QSY@py3bN-wAS;js-;h zQSf1Bi1ue8RQ$v)kSavG$k7s_J^`ApVDZvs$-uCXL5QK@9Yj5}8OpE^s{RV7W5U3| z04@F)oU)+Z4D9ciGY|7%U_`JAW(V` zfyNg!n=!0}s?UH1By4_cuQdaMAd?&e`aHoYsQNn-A?Ab5aR9NtLdEl-^UBcThQZec z;+_i7fFT2@jS5oDz|amA*M){BY#Qg54fb$Qw}qH91KLi2)tiA(ah-V(b7Aq)4i(=8 zZ70C;m#7`YoCeTq#S@_E6Bb{4?6HRfn*+rBUC{V~ zg@Y7Sya-xO!ond4Dh``RhpDfDieG^CM_}Q<9xA>D+HQxn&wU*s_8x#X#9$-W-B9s& 
zM%V9_L+nNG=SD)smq0T#teso|73V<E|v~TVH7RwGKdh3*JM?z`!us1ERhWY7Y3W2nGg*l^)pL{}ZZy z2ekf$rCS3}?CSkIA?6oB)x+8+6QJU3(0&ALT4g^}9JU?@<{kkrh&dV1c!c#6W1!*z z&~_%w{8do#1JDW+bY>>V=Rct0EztZ98}|tChM2DaZO4N47l71vLB-{u;R7oNHbcc< zf+lDf7#N_*g5fPxd_J^23EFc3GRM^iVm^BRF%BwT2Q3F+=6CyG56>k~^)Ar#4;%k^ zhC@AsQ5oh~!Dq}eFfdp{#jBbj z=7Y~SV_;yYg^F`*gowvNJ77nl;%lJml3?YGus_6n3uu7>D?crv;tJ4#Oj!8%LB)40 zgqRO)7BdtFK+HiO-&-1hJv=W!)!&DfOVD982GF6hpn5+Ctz3-^gqZ&ZTE4-;tph54 zJ`Um@@Hy9@aR#XPH)#74R-b|nO$M271FaWe?NZ$!?C$gkg4k;h%`dQUXn=~hLF+Zx zvWgi{aoKAS_duhW;RIB?586+Gxkoe@Vt(pYhk8={)396w;P4RA@-uDXY&Y%xDqs9!0J0Ms5pANaZv=szXE9fJq}fm-mZQN6=!Y$ zg$e@$tbVkPgxKqYmj0(g#nJmO=}{2%cF^`D`1~#g28KmYarAcDtte3VL+nH*)uJKh zptn~8pyKG`Hq}t^CTRTv8_z!t70-jVS7B+`AO>Q-6LkC*)=tQPilevdW<$mKpyd^; zzL-!Aa=!orY<&ox-modH#kKAx}%D&7O_7sK*hWIV(@=;L)upyKphKK|JU6<2_ci^9fX9zew>K+73e`qWH@ znDY;sfk5|ffTAV`D$W3H--3>@0EzE}iXVWs_rPaJGB7ZRra;U=@8{lwicjkT>1F_3 zZwF=+rb5&g%?EQCKxdJGSg%sC`6-RIXzk`b3flh$D1$h>9_eloG{Q?X^&~`X%+_C~H z&IBDNhxLP|WjF?}L`NuyB}= z4KW|K4h>e1Kgb4|j~X^09#;;;{P)oCfteEo6&HifheE4=2BTbv`WI0D!s=O`Jczgx z_`pjB23R{GEDyWA8F>(QR-o0tsW{p{ZBX;o(8gQ%@*(!3mt(>CAoGP7VC(mx(Z;YC zs@@HneqinD%TV!u;KRHb7+~e|_5z6c3ebcAS|0`qH_1YX_zLKF7pz|y02NmNUkb;- z06OOtq`tNY;vU%gf7p8VsYMWTen1O&*t&>)P;m#SLqPY8fz0^|6)%|#;xRD5*4wBT zL(D;+A1Q^3`$6X~VDY=57<;_%l|and08PlSbgl~(KMHNX!RAe3q2g}y+zRS4Q9>)s5pB0b`&ZuGy`Hj_}o)a z{a6VxCkvX+Vdhvu#nHzT+M(j;{ok2TajAU}^Fil$fmrNS5cAQ;MSY;+J<#?KZ2WL` z754Cd16AJu?I*&*LB1MdK6*QWq?>BHe7UW`7tWX0n-wWDK zfMy#8`C9Dmv=j%~tH7`TI^GL2Ew>h84*EFibf|a^v_A`LKdgs}e}eWSpv4D+U>(F9 z^mb859VETM_I1J1+Y+eyX;AZF;r0kBj=uiHxE^A@I<&t4Uq0RdGGB-Rwm%Bi{`YEt zs26~a|G?5u8B|;Z+AoInrxruSAICyM2DJVU#JbP`3Qqxs`Ox)p(B=z+S|h}K^!Zs& zsJIujJqhjZGJJ-Le}Lu-Sa^CgLCl$y1+tBS0UE3f8=&GY(D5!9Sf5Oh8Zwq4Ak1tOjVo&N%#f6u_cPzV)wf!1rVdY`iu#PhtBORY8tqU|&5tnuc7V+hU^oCBAA>f77-oRQp_-AX<4|*cLc<5v zZUn7E0*&LHftHi7@rRmzh`p1b5;wzy23s`;H3l+Zv zO|YqnSRgV?(b&0j50aoGM{Sbkpz6-OVJXqb-OUvs8I%nyO~r(o&! 
zFI3zQI<5{4J_g|#5cQLw9Xn_u^((`escm#C( z4LXg%uwW*{UIw)C5OkRxD1Y%l$6;Xgi1sXq`b21j3BD^Dv|be|jvmgNq2eCUb%0QB zGceAEn1k-G0H}BiwElvv->{hjQIEbJH3cgE5ZX?It*coF75`8H3N^$!K$f}K!%b!` z#QfXPaXeVKc|gTmpz8-<(!wr?L+ZumgOwV?A#uzK?sR9p{Q zp2PZm8cQJNb3n@n=x{%S@KT8Q5orGfR<7nk#nI>McY)@G83Y&%pzGve`_NB8)vrcN z4_~3;P0)CUW&?)kW!S@~8Z0irunbzU!^X9GmO!!cmOG&0 z=<8%yS3uOGuPf7nifcgE^T5g(C#X33I71~=9DTjbEU0)3w4V#BceX>t(Z?ZgL&eei zSz0S0?u>!PqZ!283{g<=D`@5LVyHO!dhFvXv4`_xsQL}i`C`yL-5@LYS3&G8fX-7x zhcy|VuEK5(-)e|@^z~*Nq2ec?{aIN1{1;SQ0NUS&l^eoqAm*UYU)w;%C7}5QroIU( zj=qli${OtM{0vne4(&I=%1M^B5PNq++i$RSr0P)ddC>JUuzEZcD((y||6$@iP;uC~ z8?butI#iqqnonWt@jgSv(dVJ`)?qJaY}Y~DiQbO&g^I(@34somFzkYgqxVO6*F((t z3vFM)+WBHoarE`HIZ$!*@@EoM{1>!>f|W}v*JF48VW@iad1~1W*yAg21H_#*&~^xH z9$_0)Tm;&lfu)BxP;vD3B=bgyIgg<2H)!?1pbZtbgl2SDJ0T7#{tDXffR&%yHiG(7 z3JePpA^S05<7Q`}>e2hHrkf!4-iEeIp~KA#(wlMkcQZsi3OC4h6eh2dQgNR8Jt{H zVyKtT5FekIlo?-=n9h(~R8m}0nwF-Q%#e~=l$xGdT#{N8Uy>W2oRgoITFek1pOPP+ zo|B)Hm=m8;l3!FDpIBPKker`ekds=HnxbcbT}6CaW?p7|Vo_0IWqfL0Nl_(3T2W$d zYJ5s*Zf+$*d^{2_J~J<~1Vw31W?r^laY<zktF$CF-qa{QxuC!>J~K1k%p}n)F*!AnAwDX~!V+R@ z5Hy&4GV`)iQy?}|XuPX&e0%}O?FjHTw2Y5WODu89%yW(pFG?&Z0L4{+TUkIr znQO37JlFsuhyejbsi1I7O>s*sNy^MCW=I3mE{P=|zxWuMf=zNsEz3+!4N0vi0eR4e zBHLXJL2ggX$w^8~&W1!N6$~|wkI!Vl6>+*|;E2;hM4TQdt)!(I>LnLI!qX@|KCcX% z#!^$_a})Ct(^FI8i$LD>F^0%N0wx2TuHw`3i_le>gH0$-EK3E2eNtu~R;2*}Weo9A z!6iP1mIaxq$*JX;#i{Yh`FX`9MWx9g>p)V*i8(p>$%!TTMe&J6=@9j1ndy1?MX3;u zMSOf_J}5&l#7FsA#>c1UmBuGmRQMPf#>c0mrX`loqb#(d%AN3d#b( znN^T*0=dt}&?q-CJu^8zuQWF)6>N@=p=k-KZ$Np=GB>dTVmXLs0rGNUadt6?V`!S2 zndh35nhWziDCt`kmS!er7v~q1_<`&(G;q$(D@n}EOD!t4bSx@LtPIOcE%(gy3ywF3 zWW-_<=hBo!*GyMeS3mF2oXkwu_~abd%uI?6gP3I!?-uGu4dY;LD+_Q-%FJ|i1tony zBU5lj$weyCz=jOh`74Cge2z_7x@`Maydv}MrsbYTrEn> zEQ!xAElDkcJ0D~L#0TgKJ%cUdL*iXsjnh+0(hA~TUHyaILFI9jkC9P)JlIX(x-Y&U zu_!S&z9=;f48AKZ#SvY^5mr7{Bv!pdrFbs#7{V8t7#G(jrT;>$BjGN4r> zq|A!4v;-9{po$S?2Rn}XJ!Uu`UM-tdu9fk z#Amv?29a$_5VA3#iZYEMJ_-~hAU8Uuq!gtV7yGAq5@Egpmedpw5+4+74vH$$jLh^4 z#%Y`x*f=wg`+O2h^O7^XQ;YIab3Bt>EkR0Rd&X0fvUo 
zp~WRdsfoGq#i05k%E!Ex!nA4Jfl^MuFUCXy%)m zn_pDvUyxdq2r8zFJu%|S)wnpd#1ov0L1Ez?Y#twC5bqkASC)~IV-Rl;Zxjz|VPpn* zXJ)zvo5h2a8pfApx&}MP2fM_3xw^WR1v|wDJH>|>#=8`R7{zC11{qp_lRPMwpceuL zn1w(9#BfN)#`GdKPne-aC5A6tz`lT#3vOkZu3o{0@xey%uCA~YG#b2+S{!Q{AwPJr z=1rO>2XjKn!4)Gp7{v!0gOWqATYONkD=5ceq&y!JgLrr)7M}ucD?^G%9GQ+1e+C=G zV~q;ZeP$9LY#I-)4PDEEf<5DdedB|`c{3;k+()3KWHEFdEm<&<67@=!(clfnS#X2; zI*?{GqWU8(hZ?$O4wW2g3~6S9n+tg*nI)C5K?i4O2OHGoN3@rIx_ZC+VCs2v9B)_9WI12aOae?WaxNJ9;&2WAfIfhBo?8j^+~!6e#n`bagc}NGpg>Nd*mP4rXUsrWM2|=NDu`Mkoep zSb`l1k8l%1P;rx3l9>!1fy+$H$*cnRL*f(jQsPrmA%kO};UfdkAO|er7MCW)dxE;5 z#^s4c1>o_U)Zo(GOosS)w;)Gf*LYVym-u*w_;`1}(0Er5sGx^S5JO@~YDH!VL%d6* zpQEp*GebPsUdUi%d}>|_sz7l`B4{KKYOTMUTd-?Le2Aly52QI5Tw-VxAC{U78m>!9 zEKc>z3{La}r5CtpW^O@_vyp3%cd!AtjPMRFF^>lg*cloc8YLwbXC{M&o-^~(J(Gh= z3=NYr5{u$XK;sIt`^CO1&PU-C6%sa4i4@#H1Wh!a<6I;6zVQM-(l-f>U!+ zlS>SXOI(9XEK&@5{olDGhO|HP2vqL;(hZ|;)@dV(o^F>;+c8r zU3|I#gN-u>rxUOvVF?qIL?GA@lv0d9F`1bM z3P5n=m8ODIm1|J2VSH|G0N50WSHa>SuYq*CLLwb8gl%Y?ms$=>68S}*nXVRy^bz0< ziuB+TLxcGEw9K5G_`LW;&w$L_OxKY3U_(%QJIFi4Fh01%$gC(aFC{-WJ~0_I=o_C| z0vg!JF9HoGf)W`hwhRqiE0R+Sz{4GeW}pGppwhgO%-mG)xQj_%Vs5IFX>vt{p&`<6 z3`jGHnaJ4?9@jV%wpqL{D78ZrLL7%^NtuGjfFa2UV^{_>DhwKIht3y3MutJ5>+0(2 zg0R^P8i=4EfD|kw`oh%`K1l&847`FegDpYb;&SNvf=r_Md4rQ8s0amT4bM#EGIVI=UqjP)-}t1`SP?FmkQ@dRriL-TlFNIbw(FeI1Y zO8`h&&M4jm)O^UV1WibLf*lGT#x5=?%CGbc08L7SfJS&g<55QCMVTe3eyQc4(d^93 zOmOXjV>Sj{sAJ|vSJ!~dVAJ?u@X#(;zj=CU3Ah@_#3=rQOA2yu*Z8=rvJg;77h(vW zJ+O!m@D4TwWvHPvJP%4h!TCAGuDK|U8IZ6yQpAELTk?x?6La7NxMx6SaEVzNxUmIm zXSo*SxaKg#r=~#WtZ|1M&c>A)q~dbT4e(AzaS)_+1xm=!ROA_undw>vu5E&X6FosA z;2=i_mlztC=H(ZG)<%G)j|)6AL1h7`9tGFUC^dB=N>j_#)jTy9JR$1|DpVaog=(-# zJZd}5B;GGSB^410!DjKG+7pCbU6Wivg)b-xfO^u9N*0{chIDyk6d#8IARaV<=t_iRTna!_eZi*jo?b2mprnG))HH-QHQil76DE*$rZK2M3O0_< z3=et73k1)dC*~m2)x#x)7dxQ4| zl5;$RkQ*P632R9BWE!}-Cg+%^|lt(${OOp41B3&8!A63f&)Q2!@BGtD!ixG1>9EFRqP_w)iY zK}%0OgG)eTRGG!`kciF6%mxn;g0o>}aXfhN5Z*$~1a+oBUDn_dLkpN}VsUY1I@lgi z1J>2m)wC=zr!>{I43-W+ojX{M9+K%%A{V0!j)!-ZVdb#1k!Mh_NqiExnE{GWSew`z 
zDR_(wk=jD3N%`^65jUSq15p3a&>$WjtKhW|uAmlBa(ZfsYZ(J*FxEY_1k$2)aLCC_ z0*Mrr=oRPd85)9C*+pT*A~eT=Itr*wVUP%B*8wze2hZ;CM1!aB1^F`wl;<$Va6vO? zkU?CAcu)WM;*yliy!g`M)D&=vN%SPt=!Uz)&^$iAB%>(5JU%Hg1=4^-9R>j#2!u95>EJu8>{_Qkel7=fG1~KvF=60cg6~6_!MxO)bzMO?-SJWKBVS z8fsk)ser-N7D`1Ac8IGhX!0DC@jwyh3vScmh)1M$5L%xYC0P=%4%!z2byeU6hmk=% zmY%*vyl;GQNn%MVxNrq&Aa#t&5LDhkVh6N#4eLl1$XtX&AXxz%R`KB62#GqBlFTyR z7bI6uQUq}rxX%?28mfv0IQN5vKr!nIsWo9H1sj9f%;5SLQ4a)pXS#xl zUr=@hVOMA(@e3xLG@T7WOC<0Yb*`?UA-_aWi3>@eqz0(75vYa^@(wnRhgSQ@gMLU0 z|8g_2HjKgL36@4QxC7y8Xqu9lmIj$_i3d-$fUCk_Cva^D4(4IewFM7NrKE!M8fdv0 zq?cO^PEElj@FEUcANz;I6H%JQCs!oKC#9x?Rs$!NBxj(HzT(cjgvvZvn}LjGohf)2 z8GE+FXjYhlT@MZfaFqcamvlCSw4g{$s^F{zYC&OX?13s8b4Z;98J}_miD0jKu#cf4 zoQZ$b&d@L=wK%yb6FjO1Yp5X>Q-PL)S0sWKor7i=QlWVY(Ue9M4T$!nv!Q2DumPw= z4k;r1L*ikzI_YE0pwbv>D@smA54fCS?8`lag7d)x&!9zHNUbw)eg+Q+fXW$fbkoRa z0T_XXEii_%!!nCXN)vM+u>qdsfTdQDmEc9Gh9>E$CBBIj?nRj?;MKFB5nDe*z(UGg zBUgXu5E{5DMQIuvxO$;0!qR-gUuT$r>I~O12GDYS@QP`Z)Vvg|V=b;hpuRg~4kjP6 zuFex2oUXxUsD&;x@xY6bRM_Y-dfo>I47BM2$y$(!1*A?z0G?h2Bw&yVCbYoA*WNOL zO`3z!K`@qKPEwinxaL9PaZZDPWU#c2z;+Ux1OfXgvjRRm49XM~ z7Ke$!o|#^*Wv;HE#iyVq4Jcyqtb`coRg*H10+*kl?NyKtSWaqQdPzonDr9jo>ToG2ae@*(bQHx9F}&pyLd5V^d=#h~ z37<8GMl5L72Wct`ZSdFG5NUJ-*4jrY<%sLIW9_z+ZJr@$hyimL0BID!5XYD#TDn4- z4+f`1S69&XDa4#PB>53LHVRtE8te=o8x1y%hmRa1M;rPCIC!oDHVJMDS~VJM8V{c4 zFa^!_fG}jnXAl=m=8!#IMV?W9pb!8pKPn4$jSqH@5B7+64b91Pjd!(7$%n2ka0Tzu z!q!&>XX$v9VH-oF$#-xSfHYte9|cO&@R|)2KqdL{pamYqpqK`)ZpZ>HFG$V`iFXC5 z%S_KK0c{lmw;WJfyGEcnYNx~$N6_Xl=r9^o6=J0$QEP(@4T@7sT%lv5;MfH@0koO{ zw8a55GJ(7w0o(XQZfb6EY6-L{i)cWD{0Ch}HrUqlI~(DPde>mjc>iD*&?*9qQC+fg zDy(q~>Pmwaw3McTCiy)b<6(og$cAkMj2ZOF0q6Un!{=hq~R&hTuV-7dR}Tud{QZBI1pvf8a(*`8t8`f z`9X;XrB1?FHwA4mf-)7>hMb`}Y^fxu?u!S_*~7eS85rv6>>cbM6k-C}3+sueG9i7z zw=ps6lPzHDlVJ2?X%D@sk% zi;qvQsE98}Eh^5>OU%hEsf;f(055%owt;T*y9kl<-)&@diE5ZiDc zxEl*@?15%eAS0)sb;Yi(kV!4GqSRtg_c$onG9Ee^?M-^Xn}E$golS#Oj3HruhM-e5 zDk>6_GRq7>b8yKyiN(d>xq|r2ytI5*^hD&1G0bcTS{Dk+<>01ICV2iU*aTdIKucgm 
zI3bN_f(D4uhKyZN#{-Z{O=m;U$`8=O4A5c#S1<4ajUdFh8Dt>_XlfR`5(Bhy3cMoO z6SU9`GNbJ26;TXXqv0Ct5)a{lw}FBp6e(0KAVY=4;3db<@j{&Iia^CJT9Ir99Y#zm zN==Q2H~P@S5vj1DVD1lj$);&MXmt@}$!0(ZxZelqvw_#)A?1C@$OX8PLa7PB4I)=) z{RBR01-2{54Sdo_Vo{}QUJACp0`ew&XkH~c&OsR$b2*A}YF-MYLPF|MLalI5@Zjyks;NJUbg~1lnm$N&W(r$7uNrv>O7{_%Mh^n<6%iN17W3ZRUq| zSzt56pkA+UYHo5tCH6FoG~0^4Mx9Vc2INeLH*r)lpy5TNu1-KEI2zHb5ok>TZg*m? zJjGgDfEO0Ix){JRiy_u@LCF0`OYlLxFtFpOQ$oR-eh{ycIEsOOAb}ZZiXRlt)Nvb9 zzZ$ad067H^*#s;ofQ3Aw7l3u~g|m?>=(q}S@d)kHAon%A@y!=O`eTqFL=>#hMu$0M z^MF6D?E~Oq9p9uYMvact!bj~k8@fWK(7{GJ8$vcAz*e<^7W)Qxqc$@@aRqWV)WaAX zZ@l3nl&UQx$m_w;+CqfNEpXd2H?blf(!>QNH_(m^ zbI{I@^rFNZ$hljIc_p5Zq59;qBuI7zyB)IHv%m+m1P7!QG>V5NgTNtqbM>oj}MMd&PYwphV0+WNzFr< z4MG&UkcDR8Mjz_ny#Z){CAjVc$1TowFSr9vT+s&_{?ABFOaTvBf`<*MvCRXz3m*4+ z2_$PFORq743%U>ray%U9yeIHRMex2$@c2MVYHng)dQK`@ZzQ+`bm~=U9zq9bPcFn1 z%nf*;T@i-y;C8!raET>oT^YEq2;NYK$X=k%9C%v{MoR|EHW|>mLhwRA4(&NflXd&dJ*hz1*gcJG+P8)D3oKsK0w+eb*dc0$6ChiFma12kAr znvw|XX;bkeQE<|N=PS_AGk7ioYlecX-a%Um4QUL48>|$raRC(tpc4)uCk{fph?!_z zL{OECy_xC>-F}FH0 zu@Ab58L4E(JzP&x+X3oMP^|?SdI2{^hH^6!)>pvVB*B=I@B(cHDFaWXy1@5}V+J$0 z0fu{a44#j1bZ&5Vi6HHGl)@D<$_0iif3fRq{TtLLO09XqV_YefZP7Cs+ zcR;W)c)}ar=OklZ(3Ob2dXNop$dgpiauOUJnB%~pwq{;x1$2TGJeH2_02C~9eb5Lb zKYTEUWxUB9o;NNA?I|HLO~AGifxHfF-9wt$6f9V(Lr}$sHfaU!Sz?>RgT_6$^uf9|$l1s< z0JMUk%+)m{-Y*zsk7Qm6XqTj+WoB`F4rJR@Q7UvJ4|q#DbZ4V~up?~P2;AHQ9SaYt zBtg@O@yUrLspsR8 zBRQbb54xuobb>y7<%zSAD>NY>M$%EIrO}6TGhM-rR8Xg%{Pj?#$@!&uC9XlxP8IGd z1tm5h?MNI|H0a10hU8lAF;_AVAt?1=$)6ssxOSO2J29L6-Mpngkv0 z@kuO(Ou1kk=K<{^PR@=`&d82WOU%rHx+UI7&jgfdL1i2J}OUs9>KZzAwI;{8FuU=?p0mT z=m{>d1TAd@-3mhf9)8&DBV@V=YlesdcaY*s@?l2?5Ip!GAOv*OAh^+nSaJ-jkwKLt zzQxDrn*iZcFUT8eP)-#AB^^W4BGC0Np#?6PWtqj9`FW7xIFt$p$HWk5Cj-)6D5NwG zuE#-pRzQnY5VaAw$|`mZfL3?l;KAO1^o89KQe2W>l$Z{ltB4ALOlC90M_EGZ&79O+ ze2ErIyWG_^2(*C(x@->8u0t6DfOYqr4WWkufE@{1LIOXpBiFz+$UD=^)io%@ARaUl zk8;Kf=*StbU?b2~2cY$4pf)CKvK}Rt_PIz<81s&Z6URnmOjgaoZBD;`8-sT1>s)M{C zd+R~xyP(u-kPrpeN|q?Ssb1 zpg9F~I1HmWz_QgLIma~@DH$M5`(UiBssqnwu2QC 
zP>|>&V`eGH8~fJ65KrWeIZC8J+JlBj`zt_O?29ww32m`}6hcUCRn%pdMkvcJA(4bp zSz&1qg9qBM4OD|>=Rv0f!cqw=cq~iup@-H(*83r)dqb0WP-h3S%Q+`C&oj%_H7g_@ zoIR0S1K3yW8-j8~UMlGNP4GeFuDQ9WQyY4Q7SPr&=;k?J@a?VG+r56sT_(_GPHZQS zK#z$=?(jlu98!V_JP3$Xl8`dy4T>db&P3|D=i=_UgUUy6euZ@?^V5jwQi2w5gVH#1 z9KZ&YK;u~8bw{2kdm_Ne0_i|P#L@te43^>>w1W-D=_=sPAh<;aX-t6zoUr#zkrR z#y~*xCh$>0(7G}30m9%dKA@g8tR@BpGx;OaFi$|v-voslXd@SR-zMx@2GD>y2)jCa z8lou!E%x+G2JJ$C6iWfgu=U_2uyTah3ox7wK?`xQ?+}F4omkTnc%&#jwIn{I0QqD> zP&NRSq#zGrY|D2xr0&!kIJ7{cqM%zuz@t)Fw_%Wywoscg;4}DxO(6sPpahQEjDYTC zMz4QB2?Nsd9rEp2d}HP&xChNiPJ`e(T(F-a04b;NHCLf6L{LSEI87Y1<_JFiPK{Cl z)bB?dX@d{6Axa`=Lsw|q3eh+~8s^1*YbmJB;)$URT3Lcy*d(n516P^I9bh6i)tG?B zF(CytB+wCFAloglE*``{r1mD(*@;1UK^(^H1hflAjfM?!ETPqinXaGZ(>A_4^_a;brov#`WDc%265TDBQx?)J{NL76O ziDrq(sfnP7M|Uv#K3^<36r7@PyDtiPQiy`Ygt#mabgeWfS)k@HQ0b3U0pU4l+Y~kP z43{Z6&?GqIay`&4MbID)v}#4_HK1&NfNb24Pbw}(^d51F8h3 z^f98Kr@XL)?`{Sqa7xzwKmrG;`HpR{i2O_I&51Zf(mW*}bp2{2^ax3qi%_Q{P;NB@ zc^bBmD9Ae)Grb&wY{jsadGIpn+XXp{2LiCa(*!q!zG2|*V48iKd~ zLS`L614Nm~r@e#5S}Zd2i@}ww38?1)8qPu=2Z8sKkg_4hTob6X3g3kR?akoWtRDg^ zz5KyV%6QN%E%C|4rQkiv(2?ta(&7xz72VGHd1;yHrA6S|ZgC7^4~Gnqn3LlgPzIX; z7%l}m=B0`7No@R^rVYV;Ojl6MLiSb1r$BFI%>$h&23qnDIhO`pgBluv&#pl%Jc3R6 z;9q-WjI{O$sqjV|i{%R5*#s)g4YAzagK~x?sEolnR%&Pn8eE0liVZ%g8|16OdI&7& zb_=9U9MEC~JkJU~j{vlf27LA}d{zjlP(Z1hK?Q;zs6!9+YI0F3=yDNIE3W`_uc#-e z)dj*BH>s0xSvg5nmRGPL=wKyRS0m&2c&x6&yUf}pBsr(J$PcvVAF3I0K^5fU-}wAe z(5+}7kAX14Ea0WRdgw`fB4+rm=-Qf|Xm z1%r<1yR8zme4EA6Ae86Ln?bHJt_%&rWt5P1e9_>?t>?9BMOr@sDy%7xzJ@Z zkoX#W>Cn(D-W8M#5qs1yZ!3hQL|4oB;CN`F1zj}j4jzC&8jeL963lc(9(YBl%S*+6 z2O;<{x_Hp(!Fixdb&>A!2A@Rd>Kg0_wh4l&+4yZ*7 zKJgr~($O0_F%NMW`mk>%>KY8_p`GAmS?KqfAr8X;jd{ZihpeLnZ==pcACmx0r3{zq z5WF~}v^cdmzBm(fycl#c$`!Ps5!>PnP|gMQH;3bx7WkxU;$~SuyN0NDkvZrxIMDUs z`ML4M8Kr4yIjLmr1pzNtfy@en))_&jm%ybUYDEm1+Z{q@DI?7tLZ?VUi4cE- zHWW!KUCfJ9OTaC5@BkRZSbR-lXCqg{FeYSSyJv`Rd`fCjYIEgeByMF?8= z3tdSFZdO5V6G==iEX^!JKQh$NFgYi`IJLMWAivxQ?1N!cTbM#N9fOAYLB$DjfD<(p zN^orlvWaAGxdxT`5a;1M`2ln#7UV1+w84TPZ^&Rc$s6UM({rHYigmUClD@$Ug<$Kz 
zGQm@yt|p*6LO`n}K;0}nE{3OaL*ye?L7K5FM>oPc{s&sGfwmkS(ThPp+aGHyAM0Ll zvtilykh8FM{ia{_* z<5rgG3ZFcIWeZSjLPue+uVREBh;ITqA0pT!J`;43uLZ@F5KByRV!L!50xgQaI9y6{OUTpwlZLeFWrnZQ!m?FzAvVS6A2I5+lQc z%+%!6^2}n;rMh{=B}JvlCGjPQ*-6NyJ;A2&;5&N^;z8#uL2Ab0lEk7C@M0-wvlf2# za&Sp%VURbt)e9{sKtT$bxd9EE`<3P5V}SXbUbeNlq;)f|hKEjmaPn zV#$7_Od~rR60ry$Vh_?1I>Ug(qQsn>)Eu|`B2czK-O}#^y2H-U6iJP5eoCsVfvc-Y zYDH?YXC@`pF7{)*KnDq<-~HnOy8Fk~)hEl))fG~;TE+*1HWzrgy1JGHdw_0v4)%de zPlK{6`Kcc~!G;n$ko0d^SelufP5p~2jnHL?>9tP_7q|8k4?RRCt zpc?=%uC7FDWD=^tpq&V$)kAov?F@$Kv>j;pFSQWd`v7g(H$>e&i4xhM#DlN50NvzA z##~cog)7=VcO&p+W}plSYP*2i_mG3HFmA{IFC)gjz#B`BO)_-FFafc&F`0XX4$Sx{i(96p&N!3fwE7dE2T!vAJB$QN|nUkWM znF1AZbn?_KNla$|E6+$Q&S20>smv>`%!SY;MG%>?)FRMvY$%-gA_l#p)SN_+1}Li_ zr-VTdbVUn;UO`TYUV45Bgi)ND%%E42T2aEF2Odq*E6NA;mKngEp2f2EFv;Watf+@!)Yoi1qP_MMa5~5X)d}kcA-k z>J{aK{g#-UnarRE5@pcKD}!7x5}cY2x)YT_FR8egK`%K!7tCXz7)UcPFfcMOFfc() zl3<1~V6+T`3nxKp;T#4A24w~Y26iO<7Et{#S^-%x0|NsG8s7{}e*{!Nj8;KY&w|GH zMAKgb)eoc5?MK%i&cML%?f?ILnEw|*^}}czsC{fu8r}Vg3=9n1NdA8S)eoaPpc+77 z1PVJSmq7w*UM2$r14s+Z{tO<71dM(TG8`fVCP8L^8Bn5-fq~&alKTUo2E*uWP<1f( z!|Z|4bqov)|B&=^NJ2Eg=m!ukIF3PK2XjAEn4y<}fdM2AGw%gdKa73=l4M|DfY}ez z4|OKPbf|tOsDUTsAPgA20;&;2$s(m6nEr*(^b3*)>ofq93=B{jl)j-NU<&FSFdIb7 zLDR1Q)d!;+&@>2uG%+wR!06>@`WKi$6u@YhI*2GjA544$n*I$?{V=+Kfc-ns^eb3G zY=Y57^B{Z}39}c*-;bt09~65H6fV*MA31f5JzIQW&j~0pY_)So(wUpQGvTh=k-Y7(HVNOeKUy*Z&PozrsI= zei&T>&Cd|skh}t8gJ>2;)be)*7bGNM^bTW?Dg;KipASucfE`3Xj6RPRVQ5?i1x5x2 zK4=*PvSmReME?R$2o1|;AUP0**$<<27#SG&k@W9?>fZs?4^s!y3&JoyjJANfA5;c{ zQ zA;K3zPk>M$tzZn(52O9i+|Q5z)er!oafKh)Fa`#O5H$S~Q2i26{m|kFrW~XGOl5@B zsWA69K=pe-^((+6pe8s#X&99SwIAIcn0_^wLMRQ34rChQQ>X$cRRFbr1JpuLJE0Ow o`OS%xcA#3pl)^-aslnU9TsQ%<7bYNxLw~Uc#N)+i8ql~507*)(9RL6T literal 0 HcmV?d00001 diff --git a/GPUSort/src/quicksort/sample/main.o b/GPUSort/src/quicksort/sample/main.o new file mode 100644 index 0000000000000000000000000000000000000000..c45eb7f9de62b40c47e94c75f7e0010bff0385d9 GIT binary patch literal 339720 
zcmb<-^>JfjWMpQ50!9Wq21Y0wnexzLfpZzu)14Ap6-Nwkk&<6Y&&a?q0m`2Ur6(~mFieKBr!X=wOog(iF)}brXJlZQ0p-tR zWMG&DWzS}0V3@2+@9EP%wFfuS4g|d$^ zGB6y6vQIEFFr0+4PeJL^j0_BCpzO1Z3=HR>?DJ6i0wV*%MMegOOHlqTD1Doef#D96 zeV37e;U1KIpOJy#0hIlak%8e6l>L~If#C_1{gjb`;Te?u97?}nWMFs+Wxrx%V0aB> zzhPuxcnf8}V`N}>4`qK~WMKFRWq)F1VE7DWe_>=`_zGozV`O0X4rTvfWMKFSW&dJi zVE7GX|AEqf85tP;7~ zP9_EhE-0IuiGhI!%I0NaVBmwY`I#6P1fXm|CI$u}C|j6`fk6by7G+{!5QDPCnHU%( zplnGd1_mi8TbhZ1K?cf}Wny5EgR5lUZz(wCVS7_LCs zSE2MZCI*J~ z7#O}l*-%FvvmK^2`hj3Q)EplvaY$ z%FGN5Dp0m6GXsMfl&#Lpz@PzTYeH!)W(EdrC|ie_fk79_)?;R1(1)@Om>C!hp=={& z1_om&+k}~c!4%3iV`gA5hq5i8v?VhGgB6r*&CI}H17+JXGcede+4jr~3=UAXBQpbo z6O`=?rCpdA7+j%jH)aL~cPQI~nSsF*%JyPrVDN^reV7>-e4%VVW(Ed-C_8|efgupe z4uaCb%nS@6PP*85nY*>^x=$hI}ZyfSG}z5XvrM zW?(3WvP+m57)qh+GG+#bawxlknSr4a%C2H&V5o+&YnT}rYN6~pW(J0OD7yhlH!?FY zG(p+TP`U+5w?gSQDBaG?z|a9@cQP|DbV1qOP`U?7_cAju^g-GEPx@Ud_zFum;Lr3#HdFGcc@&vNu5Kjm!)T zo1pB?%nS@$pzN*83=G?#?Cnr`2Qvf1PAGd9l->=c_b@Xs?1i%TF*7jihq4bq>4VG+ z42PiX!^{i}N1*JZQ2H1%1H*AB`vjCe$;`lT3d%ms%)oF4%03IF&oMJFoQJY6KC#uGBYsTf{NW{W?;AjW#46HV7SN3z;K_L zf#CsE>>)D)!y_pBF_eD7%)syz%6^D&QEi(hdJ1F}- zGXuj1DElKb1H&gM`!h2G!xt#~D>DPbHz@l%GXuj9DElWf1H&&U`!_QK!yhR7FEazf zKPdY@sHe{YsY4i97#NtKY-Sb)1{Nrr6-u+QFfgz~*&I-slZAnSi-m#VGr!!gP>;^1 z9-Xy6Ji1+fcyzLYMg=Cs9)`;_zftI{eRJFu)Foo*cKy?N`~^q}9pxQ7IzM=J{`BcQ z=hJ!Jqw}~&XXyiv&e|6qouM~8I(;wrbh@6v=RS!3Up}4Jd^+!=>c8O8>3hSa)Afu? 
zr!Q_ZI&0rF*S=vWW#0kv`D<~6W!A25YFRI){(1em+w~7hpgDBbzUeG|(HZ*0qw|DM z=RuFoQy!hJ7eH?I!Dp98^BV)W)=u9CE}gD-d^&wEAnb%jzyw&lb(VhV4E@mU`o-Gy z1OJ=@D6Zh>to_qj`lCDa2Po!_?FYI4#o_<||DQYz*YDBGdg}lG|0wztJUV}Pbbf?5 z`hiDh=!+DOZWdJy1_lO?PS+oX2RwRRzj<^Xf1%C6z~Ipv`rV`3_kl;Z>kE(0^B$eY zz$s(`C_$xoXhXD>u!03o!V^}oNAsHqWK&;&O}zjz7355h-p~sky{;Sf34<+#q&kmY zP!Ph|8{mpN4};XgxVyjxbl!vUk>xy4m3knn3SHpQ>$?GB$vGs=7r+MYgP01nz0-Ap zM`!2?k6sXc%%j&9Bn$SVN9Q4rZg4*G=sf1pc^x9y9lF4y+jWHpC<#LXh*+(=K_){S z1+}}^74GxS8y?-hFH$_TdBCw#67f0^lr=SoxUADovsbYiRFjk zf#a@AzyZ_kIs@dlAFK=v-Jwf7k}r95x?H``?7D=p+jWV@43FN>DIT36M^Qv3V2E_U zMLfD)8$bqb#Wc{P7aFZ#>bNUtXpZ5yD`+&1;kYYk;Emz9D`=dJ;kYYkSdHPhD`*sr z0g|dfiKX<2N3ZJ+knte5w;tf1dZ6V%X=QWm5{6PokM7zH9^IuYKuI6s_Ja?YeR_GA zJi2|4xOBUo@#qfS(d~L5Jj|o>TIV5Hnv-H>VDM-@BJkP>W`Iwx?*;sNp0Pmm;LoET z&2JPi3*Qf*i0ce})9L%dr_=QTDC>8JzUVG}!IF}Vd48r`m2z|pk^6crDeUAHj8L{`8>Ji0;I?f8o-7Eo+&fyOpiF?!@6N5FAc z(1<0&an~!LLA>LxHy{*fs1g)82RwR1cX;%cUh(L4Jppq)QsiubM9u+^?$R9|y`c{v z?m|S)6_;+;J09JkC%RoPphXU-l?#s?umL{3prX_Q22$9hM8axN-0R<`U zdJK|BNR9@i(hppifGh!}ysU4aV$=0U76Sx2?gV8G#~uGc#c3}uh~v>K3ZYEF6tq3y z(R@S!WC%oC76W}WBB!lh-g`g)|3`COuPyuk|Np@?$b{I#pf(Gr;@9x#ti6F;3V(dZW|#fls$9sJy(u$g z8Kl-qfx#y;4$jdKL zNX^SHP0vs$E=WvHh3SN7f*PmC05-@qD9ArZfk8ooLDNcsL60jqzqANsa871ksse*T zW}X6rl>#VYLAe?k|ATTsldvEK93VxE3=AQDKE8e+Hpnv#3=9mQX)Eu|`BJb3qywn^|pCm)q%qYKLgLp$TU$DRZ3sQ>` zK_OV|=@%St?gI9Cv59kON}{W)adB#iXJ)3WYgs@@e2{mrd3=aLylZGqrfa;HtE+2S zuv2`nQ+$YFyh}ldQ9Q_5M7zU)*6uJgN-j-F^vugF2}vx@hK7!FeqKppW?pJhv87{C zQDS9SW@@>oNr+>xH-_gkU0pLX12X-B4dXpCgH7TyU0oUMY#A~^zGPr1%g;xzko1;cWM^enoLb_VlbV~FSE7-epI2N0 zQKP1#0O5fRP}5Y1F3&8 zis3GRs<#1|Xcr6ikbXvfZmNDoX<}Z9z8)xJE@-TCzfQErRp(cD%ja7Xn=!3Q^9}%!D3)wK-HgF ztdL(?qL81akd$AVmr@MUYokz-ky)&3m!4V@oLQBsp~-;Gk5*vNQD9JrWk^XZNd&u! 
zfuXo0u_QBD0TE4zd|_pUp7L$L5s4C(kVw*iI!_B0Gf=)7G*)3z1QmvbJlLxc4kUCL z7zl(f!gc!SjswLkJh&MgOG;963rbQ`6iV_Hic?DzKz>#z$tVJsM~Mn~r8zkTB}IDR zuu!nIRR9-&wzdokVBlI&keXbQngS|9!2$~58JWo$3YoL@TICYO{Z=2$5(D3lc|FcfE2rGi5o5zvUFLQ;MJ$1OPEV1Wut_u#|~ zi5yr;hVY=tnB-)inO6c$^$ZN))T{vV29CrFmO@FtkmLsncLoNAkc`Y?h2o6-(wr29 zywtMPB87~^f`ZgM1zlJLlV7Bel9>XoH9*l+4yrB^^Au8xit>x}7!ZcRBa7DI3Ci^} zDMxG=sFm-a`3YzCgC!BDI#9B34E6@)wdBM+Q2m@-lnScai%U|htjbc8OY(~pauSQu zQ;QTzG7|F?auX}!K@qE=$&iv-T#}gwt^q*-kH6d~%_+%*q&@UpilqpF)c_%t1*vvc zR>}DVl^QMx+n{cT7cG#c0YVfdFJS6}6ff2aDB56xV7&^q3OV_C=?bNJ#hK}OsVR_J z0pfg+n?Rw4Qipit7ngv-5EKq1lvlU|MK1%jyaELQw7|+&KrX8gp#skZ#o(3_F8ygm z`MJm@KvcsuGcbe{CFT{Ur51te5>QrE0PDyAwdYFmksEo?2Bsd!)1dqe3K5JV4Ae}( zTCLkb%Q%n%4RC>_g;q6y1!3_534Dxz2Nm%srC2PX`U2rXNPP^lA6!X*%QFxUQl^zB z7As`tWtLSzN7186WXXn+IF))rQwD5$A{6@!A< zKvSWnMggKi5mSYof`KNaTm`!WhZ73&i!(vZ1BJwtl%mw)Vui%K6osnPqI`%o`9%;8 zvMgK+DCPv~UrP6JncANTC90(L&AB!YsHm^T1^e zO1mZrfj!t@4;tS_9V9>&=`8)y?fS)I z25Q$7Jh+Px0u5VOcy!jD=qx>gHgeJJdgR~>`3~3n4Yl_fO4z$yk9@!Jnz`HcNT=(A zZr2ClBr@!~FB@DRvx@Tdv8 za)=u6xCqE|9-RlfK?9OcKvNCp!8*`}N}ONYfIJJC5rEr_G`tU!1zQfa6l3t;^@K+^ zXsG4?e@207)+3Azvlv+)F*4j`R7T7qcGlkMEWL5u6|_2&q1*Ka|M~-+t{=KxZ*+%# z;9q~J)Ab8j(Dh5_0njW`x9bf{*E_|m-L7w5yLG$Xv3C7X%+c-o1~lYg2IhZ(@|j+% zfd*JDU4N886fnOQ?RLEbo|Wd^_G213;V8QlR7bU;E3RT?_l1Ra<~n#S|!g$&w3CrLm~ z0Z$i!rd|sS9 zAY-5iLy&@VKWMB2oUo6BhD8`4v5sm9C<%k|Hz<>W#}++0OD~}1Qjc!e3;f#{S`L&L zLBHa(t%>+ z04ylc$~e{6a)=OxD?%#c!0E@M+jR%L6yr5xWnl0WuV}U?Bl;xFG@;EC^MHs3PED%Aw6_#Nf?}s1d*ezmQ2&q_N{*kH$Bk zRo@<+dvE;x|G#tVmB0W0dvs1c0pj#t`1}7qMv|EeQs~j^dZ2qMNHL-y2Th`PPX*~g z5o^Fy*P8JE|9^C02;ZZ()#3mD|Hvk^Du5?krh>%L3*g>X2C!r=L=7ao5H*EIZ|j49 zpox&FAR9e;d%>zv2rk4m(B#+K3ibm+7UEa9EI4=&vXEee%Ys85Aq$BixGXqc z5whSkfZ&2-70!h$766xG;H4liLqMe&j139r?pAOj?iBEVupnu#Ge7{fFyn6l?Sgs< z+C~K`l)$w_D>zu8b|Nj;KnXBpapaIg7Do<3WO3xML>5O5RAh1F&_xzU4rXL=WO3v;K@vwq4zl0^SS^Aqh^N+owtLh4Ss56TStoNcl(YWhVCZ9H zeaXSFl#%r!2g4yo)}0&-j~H1OaxgS8v37DWEMj8)$j-2ziS;Hs!+j>!gX|1{m{?b` 
zGt6OToyg9xhne*!8^axD)`x5iKbcuivN3eCux?~ySi^}_c!G*&aAT&k^a^@`2%a2; zB(HAQE6{}D(RmQwHbLY&P`wFnWS}Os5|nxqt_VxLc>`X?bDBu!F|>o~Oi=0sRhdJ@ zosJ_I(0s_jkKKKrW^r`4`qegcM#N z-+Oi*@#utbeL5e&Qq=^|ByqH3Ozh$R{~0-~S;H6tr!5`fJ6|cfg%FbMqJ_1S&Fh6=mlt9*B+2JJ&wB`0MoG9Zcq=@6|_Kak4JCl0kp+< zpoD({vPKTxV}o!ZnZyI!0&Y1_VvD@Q2So)q^PGUJ(Sw@-$~^EDL9Y!^GGEJQ4B$}x*$mww4@N$D-aE0Wnfst$lA-q zaF3C75fj64CZys7)Urh`Mj)#=zra#0XauA?^hb(^wg_Y;A#A}WC=x)f0;dgFuNssJ zpb2#Xa!MTlc~Hvo=nXIc)1b=L^#>?fgI0Kfg_;k*7o>st#~nZ!9+Lc<4=8-*7XY>Y z8DQZ6RRbP}L+XWsLJyuZkbDjb9Yp5Fr3xN?jc-6(jXL+rpfz%T7#`r?26jiw0sbim zS`L)BBQF4frCuXsK~y$M8HGH&(Ax^K4AkL97Hx%E(JEZy6x0BU{1ds`k50gv7Z zpncmOSPkiJUGne$|JM>YWEniVeL?97x-u1(p5S58S^ERBcDd8{4Jg#RL%%?xuRHWZ zX8^dw=nj3;830~>a@>LE`})8CLF$e>2q2gupoK0FQ3(W72E;@vo`A;2y(y*kKPDSv9Ec7V!W{8k-rxB*H0psWN69u2Vfpm_tdJk|98c<~xaQGMJMQo4h? z2(Ek3g6??30tU#g29R0Z6F@u=t8)TqtuMSra9!ciI|1ZRNKnBC!d_lyfaFC`2?{D1 zSwY@_#TcTs2bKZ{0o1i8xq%1mWuIK)`cz|8QQ$%295|9?hom`7WRfgzYRilLYlu12)X zgn?lPBkO-ghSQ9!uNWC-fU2JV{~7f=K~2R)jI0|u7&bDp&g5Wtz{q-$onZ&lf96IG zhEHs)FWDK|*jXDn7|wD)%;U9WWnl1OEnom^g*rmO3Ut21J4V*^ObjQWW^-7vIx<8- zG{GDy-OU)p!0?4p_9rvLDyDP>h84`RvsoB!F(-rF%4g2Xzz`}ol@a6^un0&ae*r54 zL;Y>WHOvf0nNiK-Q?q7ZsA4_D$S{G?2dsfl&5D7clJy`XLqAl63_tSOvobJbuy!*t zECGcN*pstm8AY{(KsC7JX-39-jEesm876|f3aZK&82--|W#mz3)ng1{jbzAXg=7;3 zu#?Dez@XIvj-hb5>5L4UK_LV3w=AQmAuH(64c5(!jCUEio-;E11j#|encWu!%&{KE-M3r4eL#2h78tC%nTijth1OwoxLVzhCPg| zZ=Vst!VlbRw$SA<@ zmDiiqo56v>f`Ng-fx#U#YOhzEQCw0~l9<|Mi>=yI7b{?$QDd6K+}GLrGX0*MPJ zLl_zO`xtVW{xY&XVdS5{kUJkdYnqr>VlbJZgx`iqTHe5hsfxdk!I$5LNtS;Cm{#DQ z%;3dj!}OCg9<tf11D1t z!+(Y-)Wwl*Y@D&+MUWp^zO!Ii_gK%sAJ5>%n#II_fWe7 zO#DX~oLC&0{<9dD7iE^D`lXhGR$LZmf`B`dQe{A9uxWg-nFu?7EQ14w0|$RRgEQ+b zM*h7F_ADig5=JGA{Dq8G{80=xoOz7=F%0&s*^KmWnkx= z#=x)6z{NV1fnSY*ooNaKzab~1e^OdbWeukQyE+4NJ_Elm11GB%KffviC({%L!ze?O z1B^LAsVSw&soNR(Wf>SaBpCVS8JL+DGVn(+FtU^|#2bTbC}v>cm0{p4V&Io!U}P<1 z;FV=y3=~L=Pb*5yP4!^l6=aY~6k<4-SEiSon3I!Ulvt3#$1uO3D8D4Xq_QAYMo@uY zOi-O!osqwnMV2pA@C##nR7hBGiDhaAU?i0Q6_EDPm<%QZZy_5HezC5He+F5He$D5VB-vP&PE+U|?2d 
z5HfUNFj6rxmta_EWWi8YP?VWhlGek(A0WJmRg;-tM<|Q`s__5+TKVw>iA9OICz!da7KL13Cf79CQSUw3~c-pgnalfiu^wmuM!_0A9b72@uI6~Vp(ElPJFzfshPoE zCXUO@{1&2){AD6eO!t`ii$ok*&olGKGB~oAvhc?r=kxPn!|F?)0zp>bVIeuGFYyM7ABQ{MYet!ld{z|c4 z{(qvvg2vZG_@{}L@ym$`@GCKJvaJ^6PZi_f5Mbq3XW(YJ%>qi({HM8@SX#xH-m~*3 zax=0hF|c=ok~|M{Hz>(-vi@b@S7qSf?-65STEW2IA;!Rbfs^N#7|U{D_xN}VvvLML zO$N?~4E*v8EX+4SX_PNrq;bnK?WR?{ji;7?d+XL{4UAW|FuKKNAC6 zsSyJMgA#*@0)tWm2RB1_Hs}-%3DAj6f~zDLL|ph8SeY0mZ)IR$&{c3@2oPcr*5F_e zJ|@8+d_sVMMS?;2x&#BWI0LH$gYX{-2H^@xkf4$(V*-Ql2MGq@2o45eZwUq=k8(|r zk`f6976%3v4hG?U5)8r%96*YTL8m-1Gq{R@j)s~HI*N(c!IgnUm_fLKgF(>MfI--q z4=e>bO^8>)nc>?62H_5A2H`{r1|@c8K?b2tb_QjJTnPrHIwnDoiNci<3@im&LR|6; z$_#Z93@i~0!ln`o!mSbv!g~c6ge^E2gnJ|ygueYOoKs~L4hItIU7i&1n2;*;t3oewh=fSm=YLdjS?72l{rBX zXRgV>;=mMc%gMmN46#9;3#8TpY=ht}2?kL&0Z??7{^4d|;B|0Yz{IkGsU(>Pq{vtu z6uV3-7(|U%FbEq+F%(Plg2XMs7L`5V197Yb!A^)20CB9rGJ@wN7{qKK;=V#4aa*u> z*)m}e#||PBB?98ugJnwJiGW<|AjrTHz!=UY3UjUCCJ6=+N3dJ<&xqb&W-g5t1L<^@8icDE!r6r&!VGNTox)(IQUi|x zLxi0Om?Ok0!4N)Alz~AggHfr0O@bl9ON@a*D1edq1&ho7|NsBjFbV%>WcVz^%)nAJ zrJ+HCr?H_ygMs6JV}k|(qt@4K~o>1PrVoE$l3yZL@43CJPS(0|OgKo<)p>DFswp3Ny@SSoj}gC&&%_5e)1L z8JLP07&RDJu7FNa6E28gVQgYxT*}Pk${@`0nUTMM!InjXowtAiRDm`y>axf%@-Q(7 z7YH(NJZ2CUkzr&pVc-v7G-6g@W_rND`j*kifW?E8ng0QUE`I_OtI`L#hYXCNOpJ#a zT_!NHs3bB83iUHECNjAQv#_uyGKmT#G6}`lG4L;7@DLJ6VqjaqU{b&$FrPt;Yd(V? 
zH_HL$`3yoPwhYW07@4)17;iAKFfj8kU>0S019Ao{2eSyn0%l_d7MBzT{sIO~{tb+* z!UZo_*aX1VNeD7>ZeY|iVG(3x4QJpFU~=W#z@pf|xQ2moBa0Z{Mn+S9mK7UW*aDbD z9191#Ht;R<%f zS&WPY3?R3P`Uo(xF|f-^Y+&@~;9zH701{=^3t$Lf5@lJyV927+v5q%M1++6snTLUq zgGoq`hv70P{#o=n7+e?_81z&aSbBJrSOj?(SX3B-jTsmiBvcrLjNKU+Js4Pd7~J(a z7@Tt$#GO?b#GO4D44oJljChn-lzBjypKa&ariykM~5aEt348fp2jMfAO zmWRv`u^B21;`Un@Sk$=HK^7_MbL`_3R`6h8N#cemsPJGAwq+53n7f5R+%|_H7_@+0 z)q{b>i3_5}z=J_tbpnHssyhRt3nMEJgNr@~gCYl`5Xdb#3_^dXJE)E8H1_p)%Mg<0DNd*QL@vkhUObi?h3?C8&7)lvIo4h^<3V;qAT=+qdfm!-L z1B(pD8)o?r84Nt~${7p>@u00>7fvuR^88>>YjK83e%QDt&PL!N3y3_&-b|fMEkC1A_q<1H%H)dPQyq zh6HW~h6Y{+h6}t53<7)%3;}!$3={Ym7!EivFgQ3eFdT4XU~q6^U^w8!z~JD_z;M8s zfx*Frf#HA)1A~Jr0|SeU00V;n14Doq1A~Aj14Dqr5k?OFCyc!O3Jkpb5e%;XoiqX% z3K$s}K%28cM+btAb_0!%E3h&!fQDo?ure@wU}a!1U}InqU}s>Mz|O!Rz`?)(sxbsO z85kyTGB5~mF)&QvVqg&9W?-1Wy@r9~p@s%S0s{jBXy{LXk>Lry@P-P8_n_0(KqpFp zPRTJ~QDJ7_v0?hpq#*!4QWSI~%mEe#1_#y~OdLWQf0%?FdYKDBB@ZJ5LkY9E4F`jv zDg#3nU$8X;1B0CegRlb!gRmn9gRoN$i?A~XgP{up!!ZG2R}O~DppG(V8y|@No57%T zRxE@;iAy|%L5W{HgF)%CSO$aPV=;zJd`ch13K*2Wf(S4>Yo&cc| zAd~}yQV?KB26>5};U_cm4aP)%1_p);w)_l?E0}_%7#J9!K?l+d4s}Te28IGqa0oC6 zGeAPjU?B&~2__it!DbPpKn2-(fk7A)s23oCdVxXM0Hpr`I2wd4L%4)LSc`&UO!xvc zX2p^0RRG(emSCv>@e;_{7Z?~V8JXD`fUpR=kZ^=XlHvvZ`;3g&nSv!CAq}zx6jKWr!7;UfQR#u6 z12~I7v@!DXF*mRzf|Aq)U490}0!Yw<($xh9VQ8SC`U|2_>4Q!HIG8~mTfhkR*aApO zT)-#{QKJ^19l!uit|0ybMz9+}PS7#{I{|77$bHyS5{m;fG!T_OXciz_R-g$Ae66 z>@{edA)JXV@Rc4YgLMjn%AEy_!k}0KB^5+^T)?Q*pagNxhPmJ(3gnOr3}A;`U;sM= zQHp&?2B!g3mxKHYk4?0+dB8v#l&oL!Gk;)a{Jw32MGlRmIz*OU|wJV2PPu;LHQVx|3PkrM)d=6NGibye%1wy zLZXmle?d?Yl%YTl#|RDsh#R4%eGo2S5Vq{$0To{0LTrHm1M31tW=wLe9LPvWnJIK3 zUz*i{87Qbc>pc#~rDN>qAfk9mI z1xT-O!2>>)3yi`Wz$SreSKg$FLVS@&KgrL>|vjhfEa0_n$`X`(K65jjpGO~Ai_tyK!FGB7J~#H;RKj&@e2vuV9|yH+(>2$2ylbiKnDaE7#*dU zHP|>L7{nhaaDi1$Xy6jQAi=-1x}U(Nc_NU7yfX80~Eju95{s^z-=;U-~j7? 
zAiw~MU8u)DOkihCU|?q8KvwC%&X@qIEurd!9|$mTNQjBsRETjXh;c|TaI6qz+`!2E zft6!}D2D{dI7VShZ()`IE)EVZkU)X~C&>H_oE#aP91;vHAHeQO;9wMApuh-H@ZkeH z*o7R7AbW)^R)7Y=1bM@m7!U9>dN2s_2{3>?363451a1Y8e?jgL4uB`44<2G5OA^#T zMu1onpmq+}rVj!P92-Cuv20*u_Ge>_5EDKygOOzgJEN-v^9u&X9X#wF+~5oe$`&iw z!P#g9J4Xk%v`7axI6o>FFbG>(aDj{473?f4xLGPV#5V{qgBxX_(%pj#Tvu0cFzyv) zmJkmIHQyaPm>5q8fC|wH4#f}R`V5RW7#S^~r7F~PMh8h|NW}`RoIts?fY!62rm7g)lq^6E6B+Q0>Bi=9ncDB0i$>V)O2XXp;Qn`7uXgsuuMQ{hrk;E z91${%8NAFdz-D6VUce~)-~=yA05e)G3TbtK0}RyqMe^|iMwS~gpqAGK4GxwCj4U(Q znB95R84i443I^>k!6LjM1Kb6C zFoPK+2MYBLCXOG>($YVe86%iMS)+mpoLViIS$=?05rh=p@Pk>{Qi27XRXdo#S)l`F z1xo`b%LN9G8_eK5{b2@J+YM$G4`voHDcoSdz_NjX#X^B)1B1AO0t4dC&ZLEeI9K6shKufW17 z&9A@$!XSkVEYea6EYgw+EQ}TG%mQqntOH8l(9&rEBTEEGjN=3^;|f*w4mQRe;KHzg ziKBxA*6JJ0t*WV*fU^KJb{CqaRby{EDE4r zB#6AgApYP1b1*-s{RJtQkjfiqd-nnZhl47|3~oV7u2f?Nh7BvYL3JBwXn-Sw13c7c z(7?S6Vo`cteTX@0_ zMq#VhqAU~mSOgeYGPqbk?V|!l5Jv*S@c?mzC;VUl4YmjfFbGfB!2lUzVbNd~wrOT( zIl;iPgM&kZRopp4nq>hqC}4yatN@h^3@jF`!VMDO5=?@Dqk|dbG7Scf3Cy6d=#U0E zy@Hb^LmR}9U=U`2G$d9?i$5@shNzV{IKjxmvVeufgOR0y1>~*=tRP2j2mm>V<%Kq@ z0B8=IC4-UqJ1egP6N>}`s|Eve2CJ~84g*VuEXxN|kb^h`_{3c@WLXY?0-I$AE6WE) zo)b(g8)SGuu44j`54eSGW^1r?@CjRO&|^8k4zjC)f#n4QDD5~fuxK!WL?(be(*d$v z*x&)9u%!ri2nW=t1lc3JVFDx12?l-vCRSkw574L?ivwr|jv4G+;U5*u!k!fjEC+an znFJWbBNQ0KU4HNjGhJX5eo?_9?CHSBvVa9F<)Og9c!3Et$i~#cEWE;kNyKvnD4T&K zg+aq$o;%oBIt0Poj2rC2o-;UDUa*0=KXxz+ds^$VOh8y-H-TN4i9=9$hXJ#&=Lrc= zEyX0jzyeaZLr&PjfB~%W#so%T&kR-;1tt)e#X;WClYybYR5$`6&mv(r*^_~R0W=c+ z;s7I9O@xK8r-44p1y>MH+{1t&7_yd?nMfk@GMivPMmInsX5fTh6KZJx|%mC#fmIJIHy(|-0#aC1?id%3f zvOHj6@n8`CkiaN>f&<*p1{KN^m_fQ{FiSu1U}Tx##?J)WV)Vb$P^E~u)clh=BSRlM z!y|qMfeJ>(1)ve$2mjd_PVh4bRWLGs0Pzp>u`^uYXJ{{GU=;OWU|`@7U|@J5z@R1K zBEX;}WFx?!C1@eQpaSY?3o-^UsDNsIL1qC4WicNChU7=A457usPHce!3=C>=3JfX& zCJGEe>~Q}al>Q!3JfYt1_}&;%nT|;VhNI-xAWK>{3gGv#T z0K*~rhSp*!o^L#2uLT$w9tkq=R4_1sTw37tff?kL(Bl6S7#@QBR3*d+nm+jd z;IjjxQfu)xmy1j4k;cQ@Lc*4&hP$l@Efs=vZ|C9d=3}^Tm+&TYn zH#aadyy9nQs(8t!$iR@mFj=L9;lCO}$0W-K 
zCRPOoMTJiS%*L*jY&k-L4FU{~JVgvl4h$QEzJOwmg{O!?{HnEP@CN~gbOGfeR)!}A z3=^FS7(i)QrlOajo}obW0|Ubi1!iF(H3mi@_6JN{3e1*FH5?9%JPOQaDs`L=jEoM< z{0hw6CQLQl1uV<~%v=J@Tmek{0?hJ8Of{_4pkkQu9TO81L(?OW$JqXJe_&wVz`&)z z9H~;r{GUSs#N=^c<}qM)WUAq1FanimToYJ~n2OkU>oRX(;w@n0S74SgR4L-Rz{EI# zHRX{!15*+A21ce23_JqNyavqd0?f<~jEqc7%m*0ERO(n1ScOk9FtR8x%Rf_KmUnEf z6k&WSkP6Cti8f{CHIl8G^ve}Dn^EnOf8_KbAg{hvVmbDDAy?QF)&q~V`Nfb2t3BmAX#~ifr*trD20zf zrR*->B37n8c7_x33@Q~rY`=0bgLtR(8F;D~7#JcM7z&a;Ff-iXXYgIm%v8W`#Z>2L1-d+%%Ok#{Udl9~hJmSAJk-ZeSGI*}y2o@qo>6XBArk zBjZ>0z(*h_-{oMM!1$3PiH~7P5eo}wu{ax3A3wtxc?N+RCZ+`(fe-%kGn|;fD9<#3 zk(;?MkZ}W37(atb8S4Z_!4nf0jhPst9xw`BXkcViVq$Dyl;=-ilrv!}VpiZ}Y+#ab zTfms~h@ZiunBfEKM+SxlW`-C144uU+p!;xG%-9oE_!!tLSVf!}gT%BS$tN)LKVanN zX<%f1z{K3Z7y>H4FHGQ`3`(FSlK&MLHZU?x;9}m$CM+<4QBbItL5V@pf!R`^N^k*J zplc;NgF!GG(*|}&rXq2M1?-Ft+(96h3M}AKWb3=YVW?6iR>&+U^nvN1JcDYPgcw5v z1A`=!0Z$nVPZfh0Lrem5pgSi6!+}W*Ob?j+c`6vh9Jo0RT`SpaTp0}*guXa12o?l0 zF*z{sG_Zw(3eF!4jJyqO{2$o;7;PDum>3iv@fYy0IdD0GN?1M?CME{v1~ws%4Ghd1 z*m)GV#aR`&W%(7j4Vh}#5;LRK(68 zz`(}Nz|A;;Tj)swBVz)KEkhr_0JrcF0Y;V&?DB6uu*)+(U=ogcz{FI*%=nv`rGT0D z0F!*@0VeKnrV0jzpBET-0vP!(@bdC+;Nmr5s^LmtWL&_-cYv4w0F$*NQw=M_Cnn|s z2EGTpl57un`8IIzTwr3X;{y3Lfthg=1M>%7L#7%g4<9xGW=ZA?ygUXh{0Erq#kdXF z7$-0>ZQx~05XgMQ&%ji~rohd6fr)1U7jpt1qXWns<^X2?3%r_&jm6uwFETK^lw)96 z!N<^8%qsDb;}VFsh>toxw zXF0&ZsKBt1rwDwXAVYz}CkL)YDl?dv7#SEC*tsS!Gz!dMW(IKt7zLjgurz{AD+v1Z zL2x0c#$X3kD+vtzVmtv1g5MMv8U>12nb;W^8XOo3Pw+Fa7qPPbXPeH@zzV831UM2H z5IOO{NVrp{{jqJC-fPZS}yQ2s29mH z3teE`c!r-rrGi-&B)fn^@KgbZ3bR51qtFcpM#caJ<^_zN0vxik8yHy)7?>U~stZUc zFbF+5z$v(i#o{AV12e-reumCU7I6j!1(yE|d>@!xd5RdM6Brok$eV$3N}^+=ENuc49pd*8w41c7?Mu#Gq4u3Utm}Kps2ve62Qvj!1RSfpFyyQ znH|)i`L78|)&30XMa=BX6W9_#kyypt$G~X75DbdNvj5C14_JgME-E&(P8-CC|&C zD8Vp65ETEi6PQ^XL>Lp9nF4qMK{b5RA5Xys0a4e^N*1XL%=$~5E-*6)N^fA^CiOE4wWKsYFlLCvV^a5tBCoB!jf;Su(#H0n711H%L*2BrpP#sl1p3-|;-EnsovsbLV>z_Kvt z12?!=P$Znd#Q1?t@rwQgNf8GJ2Bla04Ge~H3=FNjznGu!Gpwj&_zy~ME{^gX42%3W zNCq9_XXvQ(=bypC!N9@r$(g}1KtS%*2L@r^087RV405jyaPc_s8?{t2D6r&8FmNy= 
z^nig;*dj^%#ea5&3lA9h6Zkdx7x3`%7f4m}PvBvcWSzjnUmztd&m6$d zz+WJxDaXpd$05MOa{wgFC(T+QC9hu~WyV&(&tD+LY0Apr{DCR{05ij@_yQ?@{sexT z5KspAsldg*L4c88L8y*rgCviF5Ql>--vuTf11WozIyMC%3H}RA{QLq^f*cD3c`h*V zA7GMYF%V)BkP_545MnG6Vs+paOfkDJKGlzKtZ@5qegE;dAF2~T~|D5;P zm>t-YT`M^m3WOTi7!U9W3MDWy8gOwG@CyYoaHzOm;{|f@FA4Hk+8JQix{X9iR#tBS} z8)T%p9fX-ba4;J%@CC>*{srBT%ltu>QGkoFfI(j0frUwkRe+VTfPrU$Fuw!K|GZ?L zDh6SOBn1{G2WIApOe_nSxfLY&6U6p$FtGC-V6tYaVNqb_cTnNxOAzA`U>D-GVP;QY zW=jx@W@ccj;YkqVGEm`Jz|6COlV<@lKLfkIAoqGvHU@U40x?DdNk#`2<^%!e2`nrQ zl8gfE{1e0&IRw~+&Lps~GO)`(VPKb6RVfKzU_8K|qFTlx%#iRw(21W#pn^e&F@d-6 zoIWTjSc+YlFR&@PSH5Cm6<{+|DPa;2W^7>NZxCVN|G=uxmB1PTY8CxBAi!_H#lZi8 zhnIhXWIVqC^ZzycA6RX{0tc9Q`8V)K^F3hYf4~D`{;%Wuz$&l*ft82x0}uZPRxVBh z8O9H+jD75^0&E~{d=I#J0=W4bzez#7Z{LC2qE0W;GA1HJ`(JRkV@H*j$C zGqCaUKM?w#XU0^)(!i7WM4v&RLWn_;@dCH}u@Bt5+zf0XPna1TD;1bn4+sj1PmmN! zND$&VAY`Ia!gN5GIe|xzDS=0ciD`o|zXBVtp+JSu2T8^a49p6Wf)^4MnG~d0Cddmj zF~}J*6-g%uGZyeNEih0N57F=Lr zcaRiiXJF;JAjxt-o%H|{hXW|*I(W1dGe{{2846S|IjFE4VC64RX5>h)G=uKZslKGYIwbCvg1V!?=J|jHN&>M4*C6 zn2AA`|9}(={{mJ`eg|$|{ttZJd<$6lKk)tU;a|XN&;P;rzm7cf2jdui2X00o?f^qp z2X3YYS=JA{{2z?H_!;=*85gihaV%hEWKdyzpwI8XZO^}em6tz2yqbRkt2}>$R4?ZS zUPc8i_607C3-s9+xG)Epvn_C8`XIolXu$Y@iS2^`lLH^q2LUbzJ|+fcMh8Cr1_dU5 z13nJk3ugQem}J4u068szLzjPor7eE|w=Vw!0bBkBtg-wHg!y?s2=D~h@N8gZdf+Z1 zz#zzTK$t(loeNY*GB02i<5|EO!c)W`@!!C>@Cm39$RyQZqWIwB0zq|w3Ih|y4F;2# z|AW?^{eM#L;1xWRf#Ji81J1gy^b?#6nTl8@)G=NVV{~v}+Tg&Lpv;({C&d21gpq-T z?Smkj0t0^l3y1av{tvoL3C1c9PE256ORy9XPGHooU==WDGGO5^U|?e0z#zzO=gg+S zzMgd0l4-EVb3@YLm7#JOyidYy9n6MWx zFbg;^7l1m$mP`zc;u{(mSRPn$G%$(_d=TPqV009p(7<5CRK!x8!^prW=wHCd_`#H) zfzd*p%X2#s&sK{ssod4Sf6!3{Haf4NT$;4Gb&|jH1#73}UPS(u@VV{0z*N zObrZb2Tp>zgADu(%nV!;nD`l(HJKV1#2difaYn`mEUW^I{0%H1rQ!|^4Ezc1yetPi zg&8L>fcv=x43>-m)_e?s%IY-1K$A_<^<4qsRBb7NVDVt zS>^-=5itet#7FE5Dn$|r42%!BKuUxi93a+;gBsyrAt4(Dh9HnGu?g1f2N+l%1TiKs zFfrIEvNL?J5@8H5WK0N31dpK#CNLc20Xg+TKWHdOvQc&sXkaMAkfFI!O~_&+mjDAp z1|vgrrHUk@>p>7Ni4oLq6PGaHU@9;abt^Dsx*%sMP$m7roj<^kQ8|F;gJIe;&}f0w 
z2X~=U33?(53Jfd^maGky#_|j*7mS1@7#s~}Rs=&u#`7=Aj!tmAjtnflAHB_4zq$G^L-}f2lAXB%or7P80F-cm>3utctzL*n3x5O z7~g_wTt)>3L2dy?mIQe-l{$_OvWx)?`~k|mJQEnW9E^A-Fj%Y9ab5`Kn!sS9QpfXw zgSml$`8_l12L@^01)$*|Yn3|o2S!{842%sPECHtc2P}AbCh#$DU|}p`NaPVgO z(222t&4{73IG;sOBEdqCQGg*FbPlMv07E#aRnIKI5DwZk!Y;rdEg--U4w`!BRA7*n z5MU6I5nxb!#K^!c_&}J2{{t(lI+M-^R{jHaOxzz>*%~7GKd}D)uPD#-AwocrN$`O> znD4^hz{EFSwSt8SWl2 ztOYzJ{0ss`3}V6`?Dz|KL<9?rxtRhOELBRF11#*jid74E_*ge6Fct8~s21?Z7ZmVt z@C!)u@&`!h@@$Cbn-Ievz{AVZpv!bXpKF0WpMfTSfSN3SfU7kBf)GZo09XD7CqcFZ z5xxmLoEL-`6{Oi6#F!bJ*aW1dnH!w=C&Z|;eh}tgz{ANuA;thSPOLh?hkpYP2UAHN zTLCjasPW<8>d7x4Z7;}dFUXR>&vSsuP^FILqZ8W)J}w74{sbd!z5`7B4tA<67x?%O zFzIq#kd>Eb@L-VUxS+tK03z86`1u$FITfV&4=@?{t5&e6e&Asfda@wa?ECt^3#2$c z@CcnKNDT+|uEZAbfa6G50Mr`tWy+~xDq!P#ASAEzfrpQ~fK6VffUS~$1M~m8{2RFV z{CSEPbPRZT`3u-2`4!Z4`6pP*@*8meKg)lBi4SCvP600)V*y(@C&Qfo{|ngI3xxR# z*m8Lbc=;!!u?tFGkV%$j01XLC_cJaKb`C3MV6UFRxIv14K^nIV(}Fa4rUhxzAcDIE zJVY$P$nZd#QGh{FlJV_M(1@@UBZGh}sHl@>i0d0c5C$IwM1k0E3AR$OuLS1`}OI z1_qE46Fo)-1299Mks(2re?oeLl1h<734OvV2TuT38wi zgv39{vNRN^buvs4{va#Cl+w*4z|Gkp2pU6at`rp8$jkUaE)Z1m z%Dgvc`5@=tRm{YvAi>LRz|E^5!T*4dU0(i!`F~w`#s_?AECTWT5BPNX7YNGnGzjv4 z;A3E1Jc&g?jl)5XSHV}B`GW?30S7OmfG9_T1pfn2yM|kme}Nz`e}TTeAeXiXBNGE- zL69^zg92B8Adi7Q&jb-Z1re?ZB0Lua`3?v&&Q$`9N;)VoeiC6;NaK0n&l4cRcR_*c zfj?J(gDmp~CKdx7mJ3Y$2_mxm3j__hHZZX}I4}w*vTktZUm$48xIjWoW`dlkzyvu) z18)8Wg2EgV1mi&kl+XgfGJPhV0ycN14b1!-ayxedS-{M5fSG>-^I7f!HvR-f zCUB}>AoxFt@$f9>2^^rj$8W$X$NRyQX@Q=gM1TO3fh6O7WkY@jjt49p2Auo?62|-m zY&;;V_zT#Q_!kID2#N(TGA$5f`l!veK#=i*4F3W_WqyYW5UndMv_Md(X@Vfj1b)T| z3}Uhi1o;^Rn*Nzes#5mT801Q;EB3>g?M2rw{B&;hm13>X+M z=m}kN&}I74%EZ907+PGf!0y1nR4>G+z`&Sb!nHs_iKl449`ghhBNmmS54ns50-(Ws z$qJT&yhUsb44_d(rXprWX(fIJ0W;2m3P;d@V4f~(KroAd5#s|r#)4o*S<%bPIBI*euX^%J=LW`w82q`Y}6A%_r zn;??*h?^m_Sn7k2;JOM0KQJcAk#Il)j+K7m0*EIqZtOZ7b3?B>`15`vM9?0@9;1uLkXpxdH$kKnre?jPfx}fj@cK!vN z2|Nv){1=267z0JYOIbi1etrc*zHrqF773OGdPWVE%o}nif+msI@i8=1zOQ8xU|@Qn z%l3eUv4Gi-gF(Pph{=JSUqORWq+kLYQ$v~{lRyFEfqG|u2B%JbR%QnV75C1R{Y(Yy 
zpmn_re*6XuOpKR>m<$+r9+ZIQQN$N8^Kvp632`zUVhLd2O%UQg;K9SsU?k7Nz{qq! zou|QsZ-Fs$JtJcP7n1=y+lMy34K<*thYPHX54d?RFlg6t2dFag@h~wlFe->hb0nBB z&u0V8b4}>rm*zQO!pLU=l1gBZ=4}w+YG44*BO5U=3y3H)Rh;MG-_Xv;;vmAZ!GyU$ zibFtyD}b4;!Gwi@g}=c>nm>V=A5=|=Ur^`eY+#b_Y+&N&zo5JC0}JN_CSC?}{st!gNvahr z;-I2J<%6gri$I0=ftq`vnR91564a??^hZTJkf9^70&D;Q7GDd4Ykyfr)|t0J|+uLLSctM!tePE(dlH zpWlJ?fs~{JtE!}=fi#Z;E2Du3O9Bg%1FNAxmBf8%#sXF*fod^{20o?_>}nf6K5*s< zi02njV`Vg$p!kSiK#e0o#1T|4f6!%Iz$wKkpjNhs2~=fsIH;v^gBY9+YWhzYFZ9^( z8>rdJG#RJ~GAsO! zh&J&tsL3}lsPXVS2+OiMI5P(bD+^jI04?}qJ>bmDpvJaASdh6uh;4#UFlcdxI3q7x zfH2zxA#iU;>^~c*xCjttWpHKuAS5Q=pvwHfSwu#FDe)1g{*)^aViXW#c5q?4AS})9 zpvDAYaXP3mF{rU_5MfMU5R^_}U{Y{pN?>3OP*;7#J|T(mf)I0nFq;6RFpzX$U`!BV zKES}bK!hu8VJkUU6{u2;KC@lz>q_+f!U!} zsY=j5Sn!&j=7+TaRaEV5J@m#P<_R(pw2jfLGXeB7ypHP4L$*3{sMkp{sc~4 z{s!|{J_cn*TR~9L|DY(s8sN_Cz{$^0B`ME%p@xsY!Ca7$c^hK^6JG!a4+Agf2OjpRtASiue0FPgJMIa`(=Iy22RF!4n_wCMgdO7y$nnbgj-&LR<{TXGbF?? z32=%DC$KPmV9?m`TY!^Uo{c%7ieG?}mFYtuQ-ds{fhc=}EMtQ_qX2^xy8tKS1sRS5 zQ+XBi`5R<)q=XJk)nC!1pl>71q@d6LfrE#C14jV3H~yo5gHb?IO#1-4q8=Xuhl(hR z0SA)+2jeASA&!77t_fX1atxA{%6k|U6&VVcc?@(7Rf-fC*clyI5|4o9_7nve7#&>s z4|Fl9vN0@ZXZ+B^KA~Op2}400qXR?B6L|)e3TE~P6I`=S^fRazu}^4sDN8XA zVpU+YuV65q(8bM{z#w-hfkA}r00Ym2E=QpX4FiQh=73)QhAsv{jRyjX312?2@+|N& z6ROaZGnJIzIh^azTfth&&JLiI~sCRt~DjEkEI2UwjpXr~_#m&I1|44p9 zmjKfP1*QKC1x#itCCm-ofhR!y^S&tN12Z`ufEKc_GBq?XPGI8^U}h3vP|=ciFbu!K z&mbUSzyO-!HDqCQU_K0*@T^c)0u{xYAGDO2D+=0=fci1#_!&By8I%(TP&hay}%vAJYNCfQ*T*crZ#T2rF$$;I+y^>*J2lD|oP>?HbVAgu# zvVfVy+YxdAjK1nSB;FtErTa$sOi01E^#^E)uGNHIAuiz;2{WpZF(7GP!; zU{tHn1kFEovtM8cI?>OlQX#+~rnsR?MN{blqxKW_4P8tV7@3MVnHGQ+F0m9a3M(+k zaX#Que59Wc#<+l$Re^!Mz=;vOjzZ+Wf`VZQ`+r6TA%+PIqRtlr1rIQArh(iQ@aq8w zUjqYwK%}IUegT8lBmM>n-U3mk1_pV91_mz1K6d5+7A6Kp<_S%F2@L!Uj4I*+4GiEh zvj%4711$Up!nmdM7qIhuU|?)eD;)urn}o6)gFn=C}m;@L_8bA$v z5&i}2oPzQTSojyPi)kIz{bCTjgd=$mC=uh`2e%L`~qejP{1)X zFz_)j@++`PO38m<;4xt5XJBMeyYLq>@L3Qi#oxf7EWeduYaA61a0xw`z{1MFs&z%4fkpR3egeo^rVrfw0<4n! 
z3`SyF<5E3;kQT)Gvk#z#Mn6dx^kAP63k{p9jg+>5h z{eRHn&L{o~LfkwG{JaW6^8N}!3?`cFS`HlNLDSL|%$lImRZBrgOi@$w0SohoIcxz! zf}a@Jz{RT21qT-KOAf4)J4;y(TzG%!*YV^m;PtWZecW1PTVaSYsJ zR^GrS>=viXD8S6DP|FzLr1-yi0jm*PCEEe+RM0HF65j>TI(1NMUL$~kpFxm;>#1|LGc2Un@R;U>jB{sdC&wqE3<(J zW58@dmW2Y04GjDT1eh4***OjfFn(ZQYhdI_n9bx6&%Ob)+MCf~y%7Ub$9X1027ZPG z@lGlo%$kj=?Vzc8eg**z2L{1o3j`P$<}-$}^Dhu!P~s_CAiy|*f$2e=sAd79%N2eG zrkOig3K%BM(9i&tnygF<_#9hiDrrn&U@Vx;bU=W0gSODC045P81`ehGCKV;c0H(wz zE)0?tiUCZ5iUCZ4pc+gmfJsS-sX&&8fs;RgiI*#37Bu?>`Vr{QvL$*>{O9Cm zVqm(m(2#*aW~~w%69c0*2RjpkJhuY_i*SqugL;yvNJIVP7qmc z%)lTsmx1{jJJVV&b|wa)xjYQK4h%|bEF~EjYs8sV80AeI7#M^;P6P=HpK)Li%&=ns z)x4$-3=D$0ce(f+7?ez`EE$9n*cljIW#r93%H0zfc^w#pr#LW(3pg;yn>#Qt$R7jw zV2J~R_!|cXc?$;y21Qm*MF!pm2E_^15)6!YxLFt&RV*2LcC#@tD3)=7-66jLq|*vy z*$j z@(v)|*%ZP4f8f9%s1py??g-Le>A=9E#A9p8z^JRre1JjT$$^PMkwH*|g|`807mFkV z4+9HJo1P+{;2ahf@QG!@GaMK=4lsi!6CD_g?lUkX1oAL3Fh0F$#K5pXfrYVwQQU-q zmFYW!KLdjj;}r&mKn4be1@`O=oD3`+4h$?enDqa#IxsLZn(SuJU}a%oVc7vXm`-1r z?EnL_u!JKU%L)b_2hfHI{Wf+724;|_g$fdw7#SED*>xGGfmZJ8KVv_@z^pXGUz0(} zft7o(E*QMXCaW&UZ-tFt7-vSuzM+07VUh zxNQR?%K--c+q@v-e>gD68#pkqfXpuC1Dl<0$sqoNfl1u10UQ`}`9Ov$WI8Yi9bji* zJOD}xApP9@ApH*<7{oam807667#Q@8Ktf6{vML!kAOYOKAYawMz@p@l?ZCkDf{zgZxyGZcywyFv>eMFfhn(1_}IdVBlzA0!3^CgZy1kg3`$S$sqoLfr(LFj!B$> zwSigw-2ny`C5e1Z2KfMxuMaRP@-yo%6E1kp#mdAWoNAo)Ftvj-eJKxvaKlC>A!ZE*5h(#C z7A6M8G)4vvrY~abObkj5wY;ExO+Czztw1~u4El?hL1E5N@4&zmBg*8!;G@U@vQ7UU zs{;cIvnDt@F)+KRGB8L8fX>2G$zpS0U*7=*x3~a_CbB24)UEW(HOn1qTNGc2Hp`^bHhljPmvj zj12NS92i)X8@@X*F!Jd$H8A)%FfeE}FzA2a0VN{tN+u2mCLc!z1|7X}8s1_t4T27YFBka{7V1_nl7MP>#@ zdC+YqLcJlZ!UfAY*q9iE^c)zZCrC0dflrPUDh$31Dt0FrYB8`laI-KmF!O`rD87b) zfq~0`frI4$GYb;~PXmj-D!T&%H-`ZOHv@kFs5cA7Obn3oo+G@#Tm~2oI=Y2@DtelWDn^ReSuwXf}*%Nd` ztSjq1M*c(wC-&>C{0R(B-26=ZDGW|54;UxFk7|Cvs8F2A$iV-MQK>QjbS`qRnK~Qz zpyp~u{x}AE7EwkCBT+_vA;<~M9Q>eTWo=k^7%!q6%{+&RS(K9p?Q~{$CQkmvoQwjl z@+_8Uhcg>8zhGF8IF$BpfD`jq zcK)vd4s3r|Kxvr0gcFp88TK>s`v_XIq%rcp6R=@V2A`A6kpN1`)_ijXS!5ZxX9=?3 
zWdI$KEGTHjA0ueUwhD5Rcs1yha0C7jL2Ld6f<|omAg&R+0qD?hL;g2{!u&BpA^e6y zg8ack9Bf&P{HhG>{N_Sjygfq9{_LQ=!?*c8Ajcl_PZDBaU(d)V$G~`wm3x&Ci#!Xj zBm>($23{Ek2hah>Yzz#N5&_`jjae9&1Q-~&7#LU>3>cI%l^7-=op4;3lam8F;W$&C zpH-NhpNWARdb+W^DuYr3Gv?{Wy5N&zp{E<`l6|_dk_RK?++mdCjfEEIGq6@LDDW|T zU=Y>=*HO^ZjrCAYH`W24WGfJAz~HQ)gM7HLFna={aHj-=F#6HPKNLYn8#8J!nDQ#< zz|J-{0H1Blz@)^$1U}tZ;43eKBj^enHqbG|&?Alol0_Mu85kI5NiZ<-GN>{zK+ZW9 zOc4X^yHsHXORkh)5ESGC35Y-hHcBuEx_|`~AObrj7z9=M7(^M8m|*7~3xvrsOkrSP z2xA7FVQdGo(?Nkj;WqOIMmF%d#0o8}5)2|T;1Dkr2OoSaf{r8>{2;*~tPhq|*vM7D zB&uJ)#8SbeAi?dyASMS^!c@T^Dp$b(I_XL9f&qiDGDNi*uLlFmKG3nk3c-953@p^93}R~F&Y;3~egg(!b%^5s0ul@?dCY7~3<}+X5)5LRU?qajBp8IX_&{5a1cW#k z`6`$diiA8E#I-d+jWEHN5)48@ydYV@?-C5+!a^WX#gk0Q47^MX>GwH6M-^{iWPZ)4 zP{VD&z!tzL-0+b};VF*<1A73Y5N9xhf)k$vNL=WEID;ZX3z#i@z<@zvBR^FAPez4D z0#I=W7KMC4Fq=h!S@05sb%9kOUdVuf&4Q6-0>dNFA+w;fU_mPnKu5+hFfo7+fNj?2 zV6bOkVBl6^VBle3U}9iU0i8ALtk1z>SWFHw;aT3=qax4r2 zOfC$dqh;5yE^q)%l!A|`J;pc{6qX#2gK7mBcd@8KJX0eViYEK3e!gA4~l0RscW zk13!dk2x6fnE4ABEE)bV@X9bMa&fS5lsYhiPpjR*Ai%PTP4$DL0D}PYE(TT5S+xR8 zuh`gMpr2IR!1!HSLxW)l3j+h_*0v5-28JK33=9+47#J$pA?L%cU}s?1!Op;Nft^8z z@gs)-g9Il7!wGH%h8e;P3>Rb=7+6dgm}ew1FjRmXw}FL$VFxb*^9ja)4U7y7D1MX{6!>0t^A5gLETeS(pPrMv4CrUkQMjtVPGg=5Z@ud5Dhv)_=Nz200%D{^B0D21$G7o zi4>43P=JI-a4;|gK)4(!%pg4)0u1310$>xPZ*Vez)Ez@XWj!xiZV2BW45U&tmke)FGB+D>^8C3EVForMSXJD8CF#u#^0RsaI zle_{86PLjUh8KYh%qffkp!+yGVnIi~F#KU-&|wAX1Sdg(3!I?S7L*P!$sh#9Z>TVZ z|6pcd_#nc>Ai%)k!3uK!2NkAp&=JZKEFeLMC)pSn0>ElOF~Xq2Cf)(zS%@$(6fp3M zFtYLMF!C8ZV5oS?!2E|HdjK76}O;JOmgM(oY z7sCpU&rF~rq+fuJk!E6G039FAk;A~y!f421qOgS>6n^U&q!TJYjs+da{e+MA1BZf! 
z$SDp+4K4;34h9Pjfg2s1JSJR<8XO^l0w)rfHBA(Fn8F#D85n-ZfI<)yKrxI=sv0aM z0-y_s|8HPm=;30J;07JG;Q}TLz~lii`36ifF*tyZf#zTk{_DV?v|LDrLFtK*1B239 zAqNJ-y+RCkxRp)|c`zuQ7xG{*ye-5~!NVZIz|g}3GIIx*`~xNv7(rgJ;$^5{V0pm6 zz$M5~%_jWf0|SE-FQ`P~VFF1jFfgoU6VCt#4?_YQR4ALBC5MdxTwDly7%+eefz|Aw z1DrrC&>_nLV6ofmEPG((frJ7BLo^3N4cJn!aV&p8heNUaU||65mttV(1kEIYio^hj z?gI>96TxL0=oB%K*`UEqh%98Q8&nLm4j*hL!y9g>6HjxnoB&-20Xp$>1uIA$Hz&gs zn4bg~1PYciutUy=0=3z=83Zg?!2%Kx^A9k9&c0y)+b-~f6_i654loFRfXE6k2)_^j z-9g3B&IUS-3uL|k_;L>h1qSdTfji&|H$Yh2oWd(0EKqSS06r?$Lx2Hf`D*Z?%pjHk z_*iC;Y1|9~KT;W(!38eVRz~nq*wA3V0X7RN1opcy)XLkOpo2|8W`L>}urdZ|F5wvv z7U=R0$T?XWV9TK<7=Ti-zz0@F<_1QPKOx40V&ekDp9f&R1BpRxI>3PBC8+t;;DeK) zws%16a{&`*pCZV^D8c+fl@SyrAR84J7%o6;bO4Q^GD8d!03Txq zI(7~c6cB$;fJkyP2z0Q5P7?vy0s9LLO9rusARayjaQ(igHl0hD152L_mvjEMWnO38=rp{((jG1mtuDjcAmJ zSRup5a)Ch@6swRx2Zbv*&{=%=K;d@m2PJ(_h=W`YvHFLMAf)nux)jBe5c^Ps zAgK^V2o!w^;4%!76hJ`^ib#kUH2Z_H;v3LO>nwi+Aq6UffDot-g!ucz0a2*G;ZXz& z0c<6@H8W<3J^_5fB*+v2NRoosk8qO(o?zzz#UYBXAfb*T1POE$AxM4zmEX`%5P+At z&@vqy?ksnN8H^Me)MSKLyb%B$*7rh35Y%|-W@6YN$iM`eE9BU~$MAsxL<%r)OaQT2 z8u%C*K=mDnv4DZ$0|STwF|PrfJw7lf{9vqO0BzX;T-2Ic||P-}YwAHxAgh?5zPfDLqDU^u|Y!k`RsxIo8NX7&IMaNPz@ZafcI#SIi- zl^VFw!f}8VVZ{Yjh6Io-djTuR`~$2E6Bs!TfY>YwlA!8w0;Bj2u)jbx8_NPn2@WZ( zKxujdD7XYZEM;O=U|Q3g886K z3Q@qaftBR~14jZY!vO}C09H`G0abGfte^pt0}Lz&UK#u3u?^hh=KGVw=o!e#6f9Hfq@}_ zktG9k#4ad3FOh`{iWh+6iN!z`q7`(sBdC$P25OWz$jt)a)+@-`9~Yq3vOEC$4%C4J zrCdnK4304d8wrr<4xpljB|-vZET~k0wChpo71YWCteIsF*o_Am#04OH4$yrF3Nu*x z7?{9G6I^J5QzW=zhSa&Bf*Gt__=g8r{{%*s3Q35~;x8b93{PX=0t=J@APqu@K1dM? 
zN{f)v1C#{|AO#VqWQ14^>PdjhK^6mO=rEjs1+M@DLj;UHfsy3`*cqT^xje`j6Bvb8 zd;k}n0t_q<*g--E7+BUoohaM^37i8AEDRhVIZz&F0$noFz{CKynq>o+A;7@Wzywlo zfISc1m&Uw3>*rQp!C0h3*^rR29^a}ptd|HCZQt290D*?Ac`(9 zf)s%R6v{vr>5&KJ8;=4uXiQE3Ge84n3Cy5io&fFAGW-Dzyn}o(fstbYxL{kr1}azK zYz9V_2h5-=qXZa03Z%I}qyPg;jRMOB262H6pipI*zy|UL$hpi6U=fZ9ps^j69daQ5 z1R#uN3}9qNHtqle$aJtTI39pp1~vTvGw1;51I&=JjAH}HG+~Yga1ekhC3#5j2)}@M z1iAGF&S~KIf+bS{28JK1pwtA8BZd?hTY!PZ0G2Bt3;_n=2*|A|&|C-Yzk~B1hXD)2 z0R|8W%1+>+g9(f*ca%V(2WogLuz)-}fssJ~9HwATvB)Tc1VNoGNN&SsCkyzfd6pVw zXk=}0aV3v902>}0BAgk0~}*43m9N=2pX*eS+f9KYDsMt3#VleQGlMuo z1N)2^Mn?AE|NryLFfy|L|Noz#8?1sL5q7#4hF zUNE1b#h!tQvC1?sAMy99%wC_{Mw!^V%|3jbJc$TNyCqzf<@3NV;+fUb6H=V17t z{efNSAHxxOCN`Gu{0t4(qZt?)gcukmJQ89QXJBA*_xZxU;4i}wW(Ibq@BjZZXPUFK zeCI#FbTe9n&5`p1^M)lK85aC!I3mx$#MaKhz%XG+3~K@d!;cRv9~2liGcbN+SNO+x zLLM|$EzHKjAk5CeF!=+!f*eDOJOh^ig8~bq1%q+edu{=S$sgGr{xKYo2OXa)%*nwZ z%)`N;{E<`LfT9JR#sOyFvrw6b63K#c2!(Mq!QwMivdmjUU(@;usFtGcbHeHvkE+a4@!fWLof_p(UQd zf{{gn(U6P5-GPa50Yl;mc?NEVZ~@TiOwTzOggG;qK*q%-FtXfWU|GS)@_@nNulj=j z4;UF%sBt7P8ZtAmcQ7h_ z*uc>ApZ(!~h6Z_t4U7jA9`iG_|NqbL@Q8swgK5)-fN$&!{}^0k7#IbZ1$@4kCN!|m z@L^zZV7hqXgFU;#KgS#L983)20t_G6nHktw8yG=HeW)`u)C>53V?XeZX-k4tQaHm5 zMuX^m5{Cq0*crrG8<=ywpRg-1vlK8eDKHx`JZ3XsRPg!Az*2C;o`aRaowb5NiG?kK zL4lc#O)`+d?<0Ft0^>&(c82GS3=EnUObqI*42=>#3~cTN3|}}uu{+2yY>{VRW;bVL z;1KWuA6uw!=UoFs!e90Se;EESFlu!C`oX{?z?jVN_&+J5 zU^0p45J+GW5(2fNgw!=nKeGCLV^{dYa73O%NXVQ)g25n~UC2Fx$&f*4y8x5G?*_() zH{=<_gxFadnk0P08N@l5m^2s-eth`Qz!LODo`GHH04vA_(5Y~sjsmmXkpxy24;Fz> z2@EDjm>B|CSUOl(JXly7ScLg!u&`vXuqCi4D99~f6cJHiU}j(u78PI+_;7$J;eb2? z!-w|C49bimD_B_!SQI}pFfg(BQ8dwE5Hkm4a;Ftkwg0nQRu?R5oa4@ng0A&ae!+{YZ0y_2J z1T%vGqb0+K=^UU#{KQ%uSXnw)SwI{PlP~rR0*ovRKwRO!4J zH{=;Otqw36Jc^Nj!T6D#IX9Y_&747j=|Id!ezkuGCdf0eFi&q_Fl1$(ZoweP#=)SV z5WRqvp@HoaGc)sa1_maLXlCYc1_kB=F)Yjz7z{tMFFU}oh?QkI=s?YAHkJbn8(CP= z8D=mUL_cJ2aA5kv$;J@Q%E038!|*?xm4V&eho8%xp@BIg+JSii3(HCd1_r?xW@bJ? 
z2L>5_HVzJ>-005-m<1#p7#JR~3X5BSD>efLf!_}p94^Q+us1MVV9tqVWp!s@U|kTy z%zRAv0)xya_J%(UC*(C)S${iOJH{|G|Kwnpz{(KLz`*Pp{pA58g9Ec7gSi011lIrJ z3=K@KW`YU~h71BR49v0v{Zh0y^Ru*q9fv1v0R( zGpI)}uxKzPd|*#_#&Bit=hvji|+%rWC;{m$RO zBo)KPy8Qy<#z*W8{}>O*OK`AmH()YkU?{x7AiypFjzU2OMg|2YMF#f@1_lcT1`nnO ztZd~B3M>a=KC-L*JJ6EAq{Plx!DPtJ*s%h1^XdVG=+Ep){~V6Ub8xU$uqrV4X@I8> z1Q?PSSTy{;@GEc_GB8Z4VA#mU=FYHy`9KUOL%RbbqXLuT33&!~2L?j{KUNNg1Oq=7 zHg?7n3`QT>6BHO0a+EVHU^)=P%G}Pu=%CNS&h&vn;GqKB!z=O}9PH{04NOu#@~rI= zj7sJl3Ji=5ECm{e_<) zfnAYd$_54n23AdZg_!qMjtvdcD?WT>S718;8VR|`z%b)IGYc;RgMiO_a|dRH?C1kb z3d}VJm>3*IjemS#yTPf%!F+&e!597qj~Oo5Gq7>A*#rv2u(PSpU{qpb(_j*icVG&x zkzrtGQ(#hLE9ziSVrM%5TCU>44loENC~&dtU|{6{O>eL|a58@2 zQJwK#fq~h9+weU*Ljnhn1cL%U!*eDBhQ}WqSQ!?8^fA3RP?_*(0*8QZ1E<4*cm{z4 zhCl(I_aYyDa7|+r5J+ce;5Zbs0F-3-7qH6-GCFWFDDW}lIrA@IXAxyzz%DGYf`MfP z2h##}V|E5f2@qRYVg-l82Yv-R2WA!pW|jm#t_AG!@eA0w0L zj0#)~515!87z?^HggM(!Q$HtEg3UZ+K=6|u{KX@Dj z#M~Jfcnt-`j&m?<{K4SxN8t!FgP>Tu00T<`harO)y8r`20*fGH13yCqqXWZZ2EiB1 z3{UM98Y&AI84vI~f01VpzQH2E&cVR&fFHDmWdUgLPk{Y^paOGtw5dRV1%m@W1Bc%Q zel`ULVRZ!tr3Q5c2F(st1qOM61>D)fVhb2qUN8!a3o!63;1(A9!Nl@{k%fVq2h8AK zz%8dQuzUe`77qs_69edyU;##!A51J9jQk6@O=LM1a0>_oa56mL6I}Fw*^ot`{Rh90 zfTWS53@gL*7raUz84}n8gl4drDBO8BgH1u?gPJ7h{4yqn1a=k)2A&Ki{sh)o{scy5 zarOj8CJsK<1V;V@R##@QL@EmhBToV=PX-f90xOFEBTEJ7{B8aOR$T@KE*Hlu@(cw` zJQ+*^pBOkz;G7Q&*c=YXGl)pA+OW@v$zbA3U}QMW#&&|8{{WjSe*z;v6Ub~vUseV- z<^m?R20I&xnz1;Bnv=7H53G&&a^M|A;(;hyeS82Ih`{2TcDT z2r?uvu{>ZB7G1#3z$?V?g45vx`vThqOiJvc4>$#cI#n1~urqP6@-wgq%X2fZXv)Vi zu<$T5NHGbp@?@|nePnXrRHD)53&P{u)0V9QxO1_l|m36C_G7$%4^EZ|{c zkY+Jp;%DHO6JkEV$gjZ2EzhsO$Rp0Lz{sV+1Tv*gfl+|ffQ?6kjd6h(KLbA_vj!8N z0|(a%anQ944E)@T3mEtr_+?o%n3O&;C&=+wu(N0|DSTpf;8WpeFkoV_U}rI4=3l_c zCtsc*$8Gq5{lEf#MNxqnd<+ZNI5n7*-ZCf1SPDtBN3ej}u?ma=0`tHwQIls;U}R+i znZ=^OC@-wQs4o9qfsvQFfuGrci9dl;Sx1=Rh9rvtGvfh9{s&Ur^79f{dF1CMaB>N; zEMNwCD}j?yUM_)^SM`YS192sGksabnLJ|Ukh9B7z5||YjBpAe;1Y`sRUT{oc{y&|O zfkPnXTZ1s8f}8?JzJeUn2{us%4S9YBIbMbX;tULo><5$teykAVI-ryw&!EoO&R(Ht 
zs0Ql%I!H3MvkEXcd}LRMQ(!QZV00H?FcxKOf1xDk&cVQVfXVrYJcBTUvak??vasL_ zC5Mm93iA#yGD#>Xg}m2L2n>0z%E+FUDkE@8D73L8JFFZj=KM4m~M z;kX2-x4j(P+ujax2ZKNhgB(+Vvarw$Wntk7%8UvO0yh%41j7v&ge4;sg+(&BK%F|V z1(F8r64N(`8$^qXWGEZEycctjTduSpy$t$qoqtAqQE;142w;nt>h`?9A*pq@4H}8Y~wG2_9zP zR{5`@pu*rCsxaY^07Jk5p&QYE4@fy4VPfEK(ld5=#9SeuAfvK@MS<~c1A_tsXqWv0 zh9y5fFsOY0|DT_MMT0@W&_TuFKs<}g0Vxv(hBq1vO6)QQoJepx(9tqwvEA$~+c~EC!5>4UA?d*cmJsnHF$@xCsnmGY&BDc(C#(i18YJ zBjm^CvKEWZJ>OpTK0wx`0DdE_?Ex}f<2HjvT@%uEhK3qH9fNf-HUdBDtafPtleU08vElVt;Fq%)ZD1Uu*~dX^Q;!U`v(866lI6qK27 zaImakW(i62q)&zEuHwT!R6WA4mSza(2PFNr`L5*zzsFf_i?KEKl8%qSc5*teZJIeuPC3cnw zc0+cS_5^k%c9sBkh69WW4mTDsGDI-2MX)o8#V|A|G9~b{MX)nSnI$+pVqr*U(qLt5 zU`;&2#K6i>F2KNYgG*hUg@Ie-Kmvo#j}IB_0$&)E85_8()lC=}K70`GU^e_9zHkAz z7Q1)_J4*pOD9cV@=ILN&Y+&SIP+;a@;&))vW$9pMaFBFiIKs{_ftkgEiN%A7MS+_o zf(xY3fsx^XJ~LR8!$-yg{~Xv@CNT3iFfs8v2r&wY7qGJ&VB~jDjAiLyW_lp4V$T3p z3MwQfFoSM~b6{j_;1;{_fCV&Q$HB<|fJ=zMK}ks5f}N#@*uk- zK!S!J*&7&)SRm#kfU*lu2PhdrM3^3Mfh}R_V0L0=aAyRa$HK~VgP)DT05sXbV8Eon zz$77{#P_vY&K|t7LhJdgO2Uu1?K-k4XK-k3t z%u5gucBv2$c6lMN$mRWh28Mr(f8<#h1leX7Fuh=qS7^}JV_d+>)gUPMt3gnSo8be4 zaEJt>aEJyYn}Z;e10!36AWMKCi-I6i1EX+=10&lCL52kOV5TGU8N`1Gu^GvtJOm9G z7#j8^2|RirAkO$efW?DXL4ePIjp>1a8smirtV(=*9=t3X3@i@>m<$-$9|#BuTwvu} zz{1eZ$>m_mkRZ(RgPG?6E9lNW1|DIS7iugVJgf<<+&@@Z95@s{GAZx~*ej@WJD3Wz zcQXVqeqve>&tSsxfR$wi3(E^u#s+522daYMJD6D)urNF@=65jVU{&B%5Kw=hDiR&R z$Mk`b?ExFV0&hIi2?l-z-eNg<1zsV61FQ@Uj0_KiS#Ge}GYF(7a54*t8}PE+U}d?$ z%Add)C&bgh%(6j%k%4*I33-P9Jm2{buA}hQfQh|oF|fR(L*pJ@X# zO9lt)0S5jCenEZ*PDUZI6FiIy7#S28SsO$dI#}6W2=X*=fhx`oEc^_7M(huGK$YbL z7M27q5Mze`%LdR*^*lQSSTc-R8aP4X2N+mhu<$pC@^e2>k`l|{H2lcF-~gKvv)Be^ zo(xWbXAPV-8{|Rdbp|Jk11HZ99+nJF76k?fgHeHj!9j}q2M?16v$4bD4-KL$1)NMT z*x52bcR!gNk!L92WYXYZ18ZR^;N;2RWMSZBYT)2!;IwCC5EVG^fSYN86(a|e0dsbA z03(9~m%v{I4kiZ<0agYM77Z2_0~H1abti^5@(e#DjB}$|Gnj>0CI~VXh;jQG%ySNcaRonSisC6z`>Vb#qS`kz|bJZ#$dz3|vMn_6!o-pq3`Lamafy0Y=6I zJLfO*jG_XJEE#sf!Vfr=KejL6W`4jaEU3XHEKndQ@Nfa6bBzoGQ-rjzcmrri+EVE) 
zKLeW~JF~lkr~^MkL)8gJVVMGH#s;Ru8}bZN4A&Ph3QIbG79nskC2;uLGq6j7E^k^O z$e+LwFW9cZ$)CVs&cwjRbApARflY)jfkQ;HL5Jyq3TPKh;sov%2FY|rhDM1P1wn=e z9)<_%`plvU9E^rHqu)H>c9LUY6ZpYCiCy43zXRiqXz3Gz0WF)@7yMmdB*bPQ>Da(7 zozB7Kz|X*;_kxkJfsNq+3!?;sK;{hz#uGA31 z5qSkpVV4CQY~Y2)E{uPe7%m7iJpl1rKJY*Iukc5n!N7&-1vA?TPW}U&3j7ViOe_|h zOdQ;7CpcLSs4!V@^Dhu)U_PM2a)FcQ1SjJHPWA=Df-)}{7!GhUC8)A25Ehn@;1m|$ z!6_`Zfm8VJ0%2j%8N$LMJ2-`fH*kWk_7vK|DJ*zHMOdJLbMiO#2ag%H*fTISa0)Z* z;8bE~Xb@*m;ALpAci?AW(7eDdEOUcTMrr{k^9N;q2G(-H!W zG-TsS7jO~a++kn9Bo)KXHl3lsk>P+;(t!*146JN3oHuf?O=sX>R)}U{Sp-_X>ELih zo`K7NA&6Ncnww>Xy#hPK0$GLy3<3)j;BcnF zMbNx}0d#|}p@1JJ2ZND;-$w-{1qm(zh5%*_KXz6PX9bOD0|tYRdWHqgj0ui{tO5+z z6rwq}+8GwG9Ef3I*$gVIMHJc74>&4uu~;}LadK>M6!_x6ka$F%gPoy$0|V%exgU-W z=A7(I4vr3A*ctvO=o+%Kx9?z3QfHH3P~qfU;K=yENr{2uV1%{*h;3VJQ<}G-P4sb#P!f z;lOagfsuh*phtm^;R18ujQ8veITj28`X`(Vn6sllusg6n-~x@qP7qXJWzQ^k@OjVA zFfpv*$A=H{3Xd5691v!b&{X1JUBRTp#?ZjPxWF;-f;|H}M_>bk5(j$)lOa2M6KJ7< zqcR7(1(PCsvjj+d1$fwZhCKs2dxHc6lLwu)xLji{e zLW~T|PB~_*%n{5=%*+~0j1OD{O!A!s6C4ph1mjJc@-E0SrnL*%~%;+*%TNU zq}rVXV%V80n1oq7Tnzv6GdM7@C@=`KFmSMBaPl|!=uQ@7U+|ov#D;<42a7OA0|!F@ zV*vw0L%=8g1rMB!IG7oBuoyCPnXh0`;$mLG!r;J?1e$STWj^3x$i}=p!VNSMD$MBM zVR$57LV+QXNh6w-HJ!mtfNh2Aqk4wukL+##4$J^uoz-puTFIxN5Us$#kio*xz{S9z zYsAdPZowe1O^wyM6kW*O;*n*g(eE3-z7!6rj85vj^ z6IcXpIIuE3U=R^kU=a9F%wWL4_(XwK;Lia@!H5bL1r}Bb2JQtc3=E762`+&QGv2ea zOXexah=b;>vkz!1{Bv;dXJ(L6V(0k5V8G8&zJXOa`q=>&1NIM`4UCSA>4NUw8 z7)()56ZdCeDw%*33` zs?-#P^30M9g+vALAz~lcu^%Fqw6H8CRX@bfNB`FxQT=$lhleo;GbACO9p+%CV4$ha z@L8VJqr(hJ^OE%#7{wS2d@}RI7%pNzH_QQaY?vA2Cng2F$A&S)F+Rh2XxIj%GsA8( zI)aW2<69d@?E}LYF0bSlVw7Y^W#wmL&}3-kW?mi7$;804LCctd;g3WC3lnI?KL-;7 zLzgu_=-6DF?_8kcqgaL68CaP>hZZr&GcYi*@h~wcc_;`mu&hXBP+<@urV+>FfdHx zE+W3OH~Fff#Kffli;GYBZIuexl1qEGkkj*TWpbN?5l|fR^+}S}=!YZfxghW7n z0S0+cvsiGGItSzbPEIBU0gL&5%st&8s~i{@?0R_|7!}01KVr00?2=bv}0yB#`6Dt#g zfJ0~^vk7R6G^n5fOUf63lxH|FusoOuTBX*L1Ty0YNN|G#14{-o$Ya_E7#S2F@LXZ$ z1ugn_VBlx~nZeS)z+}$k$iVQ9=MFQk1A~Hqu^a=_j3$ut!3uaB7@Y1iFfceXae)qd 
za$pb#4VWHaW~pFcl;eqKU|`q=x{F(YBbNmf)ff0c=Up)|I52}wYBB^J{-kun*@8i- zfekdd$pDpb;B;VMR*LYlU=VPyWMFQ9iKlWoFfc35@HJ;pW)LuBWB{$o0Gs5`4LXQ+ zg}nd+10y3O<{3Z|3>=004UCKm6+vbUY#EH8s0WMd_wsdMZ`eomnl?)+A*+2q=Ev( zfr+`Fk(G&oL7kalF$1$Y6B`o)L%bkEl?a0+BQpc2_st-7gn@&JK~afOf`NgBk&}r* zsUeD=LBJuFf!TqpuwV5P*)z~p1c0P>Y+80f4k2Ei5f44em;eC!z*xE&ZoSA#C2WME-pP&%-h zi-DtoQQo0}kwK_ego`CUixm`HYq%L$R%C%rp*jyLHG~RZ@$)(`usG}n4e>ECfJWY! z7=&tiX7W2Qu<;Z!h_F~PFf%dehq6J>rgC6`oK3aSImr!lU@rLZDLjOMfq|ick%6&+ ziGitsnSr^1g@L7^fuW(Hk)g4niJ_^XnW4F%g`uU9fsvt+k&&^HiIJ(1nUT4Xg^{JP zfw7^nk+HF{iLt4%nX$RCg|VfHfr+7sk%_U1iHWI+nTffHg^8u9fvKUXk*Tq%iK(fn znW?#{g{h^PftjJ1k(sfXiJ7UHnVGqng_)(ffw`f%k-4$CiMgq{nYp>Sg}J4LfrX)k zk%h5^iG`_!nT5H9g@vW1fu*6Pk)^SviKVHfnWed~V+WFC-jeI4j=@P)#`7p&PQQaUeao?txC>H4GF z^+D$WROwFF2i>kedOaAsL%(=*yT0&XJk)v8qZf2tHv|7R7RC;yZr3j$D-Jd^L4vEXg*-j3AUv}U_$I+m|ufEn%{VMbk?5m=qx?p(HXiU#iN@=RReT77P)$KyB>g>-g(%g+jR$=%Xq`1)Aa$03QN}q zWs)e0EL|Uzv+n>Ufl4GF~(R&cO+bi1DL=#Kqy5+3j#&2JPuI%|J;be4Yb=nQ?~(H;5$q!W*uejvHY zquUkLFas3BI(;9wbh_U0>GZwe(d{bH4N3?K-3*q8YE6B*eJ{9lyWa8W_I=>d4YIft zWbqG=UeNg}4B`Jv75{4W)N2Tb!3g$XDQ7l7LZaWt~SS`Y9~Ie?-X-J35wx?MjY zdj%yVk6J{_yDb{Q&k>=Q)@MkR0zA0!r{87rpT4EPde78G6H` zI~L@k8!*+Loku)+T_3=>9^DK+osT>^k9l<72S?=ukLDu^(T*{(hYlLJ7DJYh zZbI^4hd^yYW@FU{O1B`Ffsz0sb0fK*$gu84gf(hFV-07Bp9$IxPN3bcKe`<_Fyjf4 z=O6_o#H*kz^26ge=!_2r29VHk2hdg9Am#^1d?9>^l%hPE-$2}iX%;jjv4$Qf5( zyY_+ZD}W^rq%g!t;h44zJ{Nw|6vj~N~Z zAFz5bf@;ni;8Nf^*!l^eSVJ%6vE)v0N(Gnfs0AAqA+!P=lBz)Ig!KTpI)>JlV38FV zA|2==%?Ao#2@2#^kLCjrXnY4Wz6Kf}UIQQ{_h664Hxi(r?ELA``N5;}ut#U_hkyV7 zdvwnI0j8$D`1k)ma-_d7JmAp{QPVvYtfse>;Xg?LCq~ z(kDE6K`{?^uLr0(-g(1grpNc6Xf}bHsJ)>N&~&sMC=rGx7G!%Msvw?zS@Qq?e_~QR zwt5y`O}y~v1XXM=kSewph>8u~$^o}|Umz+tP~is)6HtQ)sgg!Z%HTR1-aSAxj$kPo z9y6V#M><0fpv6sd?E!{TesJp;n%X;E4|Ka8&~`lnZtb6dIl`m!P`B$5aB0ls(aXc+ z(d&Al^OgrF>p_Ccqucd@M>i;&`htoGl(dYR$3QLu&^q-(fJd+EiU5qN8gxMcu~qegZVr#`&=nrt zt}8q`L6POUpql}l-&{9%^p;+LM|E%L22k`OpAIkDFjNr&qO znC{ocpp*wnq(};0vm*rIDp3kyS5Trs>$0JB*91Ci|8$oA@aP29-Jmkd_lHNX>mQ^v 
zh*ThfnuR2(2X$^bOFw|Ssz^N?kT6Q02UG!r&e`+m-237$s21J&08CB20j7E{{Qdvm zrL*5!9>#8ZdyW zgywZrQ(JEHw}9Iuy{%xon-2(}XK7GwZ3PDgD33v!a30-L!6HyUBK**DphVvTuURlH z$2>ZrX5m%d4GDY{tHIF$4Mozd1_wNnSx~Ec0|Y#}!2?T3&D58V!L=M|;es0apcYOy zIB_7Ei#2?~UP5*lq|pwKdsJ@cAy9h{oK}#65sMmh|AOKY-JkG;GZmZy5CIC4fH|V` zxJx%Ub9i)vvx-MIB;#~LvQc*{C{uZY8n^JI21%n3F0|-@geS~QSS&)6z z-~kTg-VdnSf2ULU|cZG%@c+m2LM>n+T22Iv22TIsInq6No zzGnAmu6@B!8U${^c7w;>RKRSgN=VqDG`cNaf0Uq&5toXh>T!MWx@E#^MsS=X5*93^ z5Sas%)Ie=Zq?`ciDL_&TD7T|Dq(ME%7anNMSWv42C1-(JYco7LODCW;)jYagXLxkF zZt&<11-WYja(k)Mbq8e3+7~JXk2{Y}*8?8iz98ud9^Iv&B#zV~ay{YETzi6{1mt~0 z+q*Y{~KInH)AM^!gUQ_^ODq<2lC=Ypbmp%ZOJiEZ!pam2-y>=ejrv^3( zJP!sI!VxVf&85!L4`|(MPz3<450U)>8i)e*-Jv#ur^#9llv+Y+KSJ)?W zJ7S2i^B|}c0@V}8JUStn3zBgOby`6&3vnMJtkK&S5I)RHunG*j`$I2y^twJru4uqq zi0eHpU2pJD2Tkrihqd`zZkO<&w6Z`U(2Z+=2H{35Z64${2h5SkwJc}?6{S`ucPJLS zE4o3wE6{8nheziH56u%E2Y)a@ni5#rN1$R676i!dhPST3vjU(d6*S~PT^f+#4B**I zd-Zd|PV;ZH&j%ozBUjvIrQ0hmT0!A7a1)G3K zTksG7Pc(tk5IkKWhY4IAXd)AqN#W@T6w9zUgg42M+NY2Zg=Z7w@(32Htq1s9LKs1V zeIH;2?FSqM?HzFG2wt3mztMBd71gWpn1VrTlDWdV# z>d@E)*Sv_dffQ?mC^^8xaFR1$_-~sOK zxk^AfgRTmoo)l)P*#Ih$TMv{vfSOc3-M$B0x?Rsey5*o*lTygUB_y$jd-T?R5C0G9 z(t*=+I4n3ov+l%26TCX=^aV9$KEO@rJk|{o0WCa2ssYiLGeP?t;Dv19It0|s04dl6$3&R7^U8$4o$7@XkYED|UlKTMv|Kdvu2`fb^r`HN|d_ z`#=up_Fd8Kx~B6S)Onx*boDgEB2;%5N`*OjZVsLr}3%zy$)u*1FMSplhg^X%99})oduAwsy-K}7g zpmK=uvk9=N3#c(@4%B$92yzr?EDFs!i~$pf1>lAYNDIgx@RkR-1qqr61La|Of@=Q2 zh}8Ok)^3PSB_xA^yZoT89wl<_i1sbF*(S@{$L9KVt`VdHF0nhq_3K($n86G^Z&!ClFprC*kl+YER$m@h! 
z4wOoQ7c+tvHG+Z~K92XA5zQ<}i3yGukV2S4Kx;=nbTc3l4(iwrnhQYQ04*~CmCn#* zp_rlCSqt$$$aCP<=L3&}512uXa?pZC*B>6uwLch2I3dM019aUjQW(NR7PkN0@xprK-@EZk!rovsT!x_v<^K%>PAJkTw4UE$GOyMm#_=$LB<19*BL zJo|w#zBjZ3GHU`C^6Wg~*?IMKxJP&G4yf4+Afv)y+p!G{cX)v6TLw@srJd2EH?-ZO z*B5Fm+^Ig@sRw+zb3to!-~-w|oez9EUwL%L-oPp_DJl2#gzfMPJ(F~%_# zxj%-~HU}49;3Nu~+CXYULCx_1&2M1K3P`C1p1$yxU$9&a>e(Ze^oX(yl;*HF&8HKz z?A#UBVL)DIgON=@nFwjf=@_V7^XYtulG;LFfJY-dnvWQOvk7va1ycTi`~eyUZaq*U z1uX!;Ev@b#4iBs@_h^1o;L%yT!=tlwgGXoR3Xe|T1wNgwGkiLIQ6@Yf{nZ^F%?A_^ zIl~n+9t9c;xB+Tfc=T2%fa&f4&`8A&@OU&RS$7@-$7$z5kIoC7z7IfS2mISyAG91O zG4km41%(boKgz%hDBW)W8;2wenxg=>EJDFiJ`Rm9vl%04Xg&(g3JH#~4C+VR!%% z0*CfFf;)kTR*=VW*9V{`B4{O&2drNV9@BtlSXhr(dp~F_A8Zh)q=Bx7M;i)&ih@;j z9@^)MZXKv{M;QTBd@Tj4%|YoAsu!vK1vUm-sSNTayygU}>O8d1AJZ~;zXPTQw95{8 zND{dgr9udT77K%?k{=)?_rVx|L!d{AT)NOCz7b=UXrpVOsgE1Ty+2Utv*SMm$b-rc zS~p6)W<}Ur>J=Yrh`ZUyxE0T)tzprjfd)@Z67GCAg%8Ry(BV0(F6zz?#AB zxE~(i&N8T(0~)dY02ylqwID$<$X#V@B>Z1fEH82`+(q}dC@5V+OGhsdca~2 zIt4(TD^Q`0bM_SEF0kueKcKi|2xLD{sSNG&f${?)uF<9fAp`RYpyY0OsF>{8KqTv^ z5jgzY7@9vYwj3y-)p!?jE(RrO^tJZjsd~tu^%ooi>Y!W1i5xmC1+ClvvXhB{fngU5 zsG5VOR#>&>3=)G3-hqWc%Yd;KKCoaOT}KMqWQ4Nz4m9jFx{h>o9qIr7un{*zk7aZn zDQwL%q~C$jUq&4!1XX+BNosg037RwoEsTH+9d(A@==8k;o@fE>*?{a(=?=XDX@9`0 zUXRYh`$20I!9|(}XvY<^M|UuXM<=64XCP=d)C-SJ4*}@%Cy!3Z%p7EQJyzZXw7F`$ zErJKImcR=Tu~6clsFGv-K+0gV~J2i`n7;nUsFtxd?SVW@NA=>gpK zhO~@+cr+g{@UV3Kz(3^x|F#1j;O-KlwFI&sw0sxW5d|nG06lb^~Ipe zB0(Fn?jto(J$ha5!y0#>Mjg_SKFaz+xFP}>fKYgYS_~*HDsa;o8gJl~4%CsSa!8|3pbb@^F-TmCt6(J%M$ZkjUlSI_;3*3R zk4{jZ5T393w}Co?KRN6Tq{jNMmU5el|#H^8pSlsUMcd2*xkCgacQC@UREvTS6i*^Fc0v#tZTo6^aY7 z$UBC?@*OC8;AN&qr!QzQ6P}9DcfEoZ@FJyu&~hu#!u4aw)h{$Tc7tXiAZ*W0&@eIl zNES$zg&)|0mZ3pohM-|isH1&5KlpUM1W)iG)f=uiUVDRr3l{LmL)MUdcwrZK)*3XS z1z8FSE)yVn4ujc9MKg4y+A-EK4yj}U{kAhn&y#b)%fd~+T4S= zt^=uo2yLo?n^?zO|1)CgRUsJfGJx0oKpJeYkvxorb1(^L z`_yA}t2n#?2yWz#HscPXCy~)}C=eYX*s{~nX545qj?!k_XdiL3j|fUbqb--wmJ0&| z19D%G@Q9KV( z3>sv|KgAY*J3!1OVm1Okcf$07RY3Oq>;f;92hB7?T!&O7fXqVi6=XFzXci7Uf(P3! 
z0^O4VpNWIak|E8#fo6N*J#X;T3uJGuFKAohO;~vgI%yTM2?OqZqB9$k4hPtwxzIZi zkWa~lu5*UF6nr=?Xa)~DCJNs=&|G_ifxqP^WL_7^>ENTkK&MTCCU#FCueJbfjR9ph z#BpUWp!>&K50qwtGY#a#4d_hO3GmUbkeR9Pe$phG@jXElJz1o)m5)LINwdVn(-+PZGoxd@=8Tj1+1 zTwfrkLV^oqhetDUIHopmnG8x1ka8H3Wq+Wx@-T{I_|y%k?1AQ1Y(+RCbz>TVT?BH< z5>mm6XkTL51IrXBu0XL9eTE8!k3E;b&V59lghI|GnC?b)2KGiUemxi-!(7P#SrkM> z<>t|eRI`DXO`wDasCR-g0|(0S$n6Ehq9D*Z!s(y`Tp&B>K%ESagAZ6>^(Inu!~73P zOXw4LNMQ$S*+pR6sEAzTfcNu)&PRl$O4kK2dqEXB_(W=W5$4mI3tDwE1FoVwc7sRf zJ;WwE&(0%0o$p{P-CoOrQWt26P3JwBm`^8sxfH1C!?+ru`A7st+YDRk0f!H$dPGY+ zh&VtFtLEAdjPRwJC1Rks@i^{a0B#zAl63O{gV!S9b#EXZ?k*vGLls3a%LEHQl$b?L zAK>c+;BkV~hKKf&LG?3yEgh0MB&L6?En9pA0fzaIHY3RK;OY$4W&}kp%$ty9RPZbX zX+DBBwIdP^>Y^%8OA&Hm5hww{S0ceepB)kUp!|!c?+pz=(2zf5oh8PaUgWgZ8~Pr6 zni)6^zylX4SAlGU_mz-y5#*>mq-N(Kk6zd7sB3sZH#(pQfew;KT0IJC-+>nfBU+db z&`!gR1+4+TjtExJ5v8DIztF|pD9e6PltIfqER$lO1b{RF51qhAlvAM90;QmFj~}nC zzy%X*!!0O~K(`uz*93sN+VB9wP{w!xTLMMoRE#u*;R>YD5y;>OWcfWTB%opeEJ$%h zY{dyrVaO=~e7hM^T@4LCs6C)|^pE2XpiNDn>#vYZfaDx-=}da>3)9!oZHb_uMs(ni z{Rj;?@Tqhd&H!tJgo#GOpj>+F7rVq5o2~H zPLm;aLCtrqNKP#%$;{6yHZ%jdG^jMMBr`YFwWuh+2)n}|+8HqQ8-esYC8mIV=?Xat zA6zd%N)2#cK&}u#yRVT4HoILxn{H6{+=6loX1=7NNz`!}_=bpEv_4usnKO+2I_Vm{+5-X8yMyz-@t%4)E{Z4$+ZENtC0j-!TyH22w5E2X~^Q>fQG6EtsVgd z8IK1tjReU82#104h>Epy?JGj2MY~1SJG$Bj*K0#me9@1k@t{?erUz z7j+Cu$AR(#ItJIbs1-lD5-=a!{s6O}guAn|m4ZfSQfXdEse++_o{^q`uAy-$m~W^F z6=7fiwbVg##Gu`_?tY=5o81q9#F(MRh(Kw^svri&3IRrG9(Ilij0_Ai3=j;GmI4v3 zd;)Du&b(}aJnW!G6iD6%B+kIVAPb^l@*xQM7?3yv1A{S$hRJ6kpB5@%pw5ChRLd0&J)=+sM)lrV^Z$$KH>K}R5iq=Y~OOx_bA{|6+F$9x&k-X#VG z23+QYv@=!(_<%!>eKv?I4GTXN1_lNY1_|;rLgIk|<_6bhW@bi4utPxM*aEc&bax6| z4O25*ehE}w3uG9~j6|?BIG(`gm@+Uh7$M7NBIKVy^~)p6hau!yKnGPYFfjOo)WOVW zb^|*QbODVDR9**JKh%F5AUzBW3~w1Q{m675VMhiwGm;VV9Z-2(ep~^S$Cg$=;S5T% zAm^>Y=8ky?dp-~_=LXb2_{;&>2g*nw`@*o<=Yg;<0h-akjW1Z(`atAC=6FEevxfyO z{5+bOnOC!b6@t_iK-Ga#87Tk4(}`;{GgB{E892SoAX41{sJbe2_q+EnH?uH-b15io zDu69xVBiON7Uq68h0hLFIFQ|J& z*g={Y7+gRUOfR%PG=Rz(BFklfgAweP0I0k*vOLt>9H<;A+oqGW)k1IcYfXWLZ>xY&{AoB%4l{*6iLmEgP=65DYh6LpY 
zSh#_f$AGgIOby7L8^F;G&JPh#b8zMR3aC6Tcg}#y%OSfX6dVj-^LIeyJ5bVFJ=g&t zf82n|dm-y*as(RylK%mf4@S}N1=laZ3DU^GU;?6Ge#{1I2itD}mB*z&0xFM7e+5+j zDzf=ZXTTPK+&2R%k5cDCk}uf)9Z>l92vx<7%JHfy!fR z*MQPu3)KCg*y}J?kVT-dSc6L)6C~k-+`_>PawG!-uJon>mB&>+df<@7D!%|(-lLhBsUNHaoJSg@`0CH;q#2%1aHbB*R zpsTBgr~!ou4(3=BRX3hp0gumX_04pcq>Mcy4Q?*o;`6|Xr^`QOO;LGFADHi7}< z&IwR8^N`g*(s&<86-dn%sG3R0YCz_+fK)Jm&A9uZ3UcQXs5~z7k3i+YllriD zO9yKQr?Dplcm;htRJAmB&_2gTjdiYJM!XK8z2f4+HW?4jywr@;wCP z*AS3D1C@6|_J?aTa|sVS2gv+4P)E&}-zToXaYK2)G~L}1H5kfs(meYQZ& z(nKz+nH9hp4CIz2Pq(P)Gih-JeQbsaC3VKkyyFlF#jo#-6w|AJm zLA`#Ex*Vvw6X^K@)Rx>1aTCZ-XQ1}s>NCE9%419OAbYMr?FmA+2kah5#|>nU31|R? z0e3waLO{NRfczAwJlHv~xJOD`TcGl|`juA*$bW&#nyz=JCc3}(pkxd?d^ zs64j511Nk#pz_$#E~xw}fyz%qHXjnkZ6I4fX3T)9!B&5R@>B!V4|mb)R-azx9+qZS z<`dwo04^(@K+VFI4ncOYfCp)?$0x|WZ=iO8#^zDdI<$TPsquiCQIF}Te2||&YGC1J zg`U1Zg(7r(Z35IB4U}?09_$&ATQ@-Ev6Z`^Gz<&3SLk*?!uJ-)Eg*A#K+OU7?_g<+ z3DW2VnezY|W-qaua}#EcgEYv)3=Fu+kp!qbHhVzsXn@M&O6v=t@|hrO;O>MB3V`$< zfXaj7h5;rIjTeyo1E@T%vZ`9S&=Km-E=gC(*&v|kC5asUx{=I|1r z^0?gB0F}p9CV`|DfCvWc?NE@^0T98!z)+0rKFIV1*lQ1<^0>kmGI@zR{uIEIn+y#3 zDCR>Z2tYP4I4VZ$ZN2GDv z0urwkYNaLm|uaa!ItJgVZH|%4qMRkHprZ1;HUwc zBLi-z;&!VIR32A75CfG5SG%yVfD|!c^J}2;VCTZ*k-};Y0r@>pd2Id!h2oOVFr6^RHhAE+8^?g2^VfCvT#hB94}jzi2*?KzkS~DB zgQEr(=SXw$6Y$7`;;aJder5D}-@A{wm!*fbnGMni2Bq^QP_>}&Mu}fgydi}-3plW_ zkN<#-P=U(7M-Eev84n@x0?C|mdX^i~?R34Xp0q{gN?lma}Pv508-~bq`C|u)xpB; z6ndEk>Lcug#1kldK0w_MUb_Pe-(rNk0LXC+3=G)Xogn=NP;E0;oK$dC3V-d0ggifXaiT5$1oS^%fVP^0>;E4^VlW=^s3Rgvb9- zd0g%bfXZWQr-0H=0Rj05PC9jH96F|0pOd2IHB%6%D7fH5#I6o9e>EUiKM zQQ!^;C_WvaYH+2e1gJd9JT!9wxIYKViXKom5GuD;K+VAw7AK(cxcvG8Dvzrl!~vc# zh0gcE{EE~c{6K^|LH?0|+C#`c6;S(dm2)$o^4Q`A6z)5q^0>m|22>uG{vQP7CBPX7 zcR64ImB&^Og4`DYmB;123Ig&opz^rJqjwOHzX6rU72iLg^0>lBLK~Fm85nTY6BbZ; zT>2xR^4RxD48AsOE55; zMsG8~%0i@eg9f-lXJEiJX66Bv$2B*c0hI?2CBfVRnc@W3>m5*eTz**rl?TmV!`lT& z^CKsq^0>n41ymkee1Os`2RI|*PVX8}d0hHEpz_%42gP>=R32qr6{IW$xBfwH=z*%i z6-H~I^0?*{&Oqg{m4TqLX$v$AE~Cdcs4agKk#0o52H;-vX#$l`L{4{1kcm)`KSH4L 
z*unxNUqV2B3RE7~+|3rKJg&I70+q*Q{}-q{%G^gKlK;VrN^sBpm_X%mnI8g`$5khm zK;^NG4}-#I3RE7K|F%HoapjdOP|EH5Lq z(LluM??Lve)kb9Tlu@5AFgn;}LJn|s- zo`Je!F1B(3x^4la4i;|s)PdX+0#5i04B#PjSoj4%!VeTB*f40gpM~M$e{6Eh422+R z3@K)YVjSWn7~&QT3=E762yvJK(3%^NICMEE1B?c(lV(D)7dl-4b^|j*DNH+<4QCShKZQg5G!F5z7~-Ik4OBi0KxM$=_6!URppXQK zi=c_Wfu;xWIvAvUs=)+FZ?HX#Fmpgx%z(^MfSLo{b^`J)12e;Us3@2VCP4enKVTO)87dA+*B}F6;tQbSCTQwGYM2==Kt(}31ZHNqgdx6_iGcxBFYkfM zfZG@h3=B-n5b>)}ap*80Lm*WA4^$kwB#~h;RJ;JXo*g_t54D<^;WAV;RDgkjVJ8bH zJWCkCeKQ6Icst=f3&dVSs7F!T30E-8`3p7&Ar7891kG==Ld*yC6+yPb^0f{t#C&h4 z`7nR^Ld6TA;^4MCNFxIS12Y3G6rcj2IXCeBFHCWe3TB3D818I@x(75)08#^Uj~N@p zJ;$Np240WIz`)Q66~Brm4%&wWim%5|aZq~;WS#^&#J|v{CpKRMwjE@z zH77_lbl(pH1A_=Z#2i>W!OQ`z`39-ahpGq9M=~%laESlFA^r=8IIJ`Qxf+3)8UEr>{~w1qBdocNFa(@`nQ@4- z;t*%YAr9(+q8JV4Ff(vtsPBfRTUWIF#UKDl&mmB8Sos4oh?#*G!yJe>KMrw09OA+_ z#6@w4i{lWN#33$?LtGYzxI7MVMI7SFIK)+Ph^yle2i4l35Jbhy3|ctU>);UA!y&GZ zL)-v|xDgIER4%_XI z9=~2V)cfEN_roC`fI~b8hj<7M@lYJ%;W)%2afnA@h%d)cp7;qu>RIskK2rM%6jIC# zF&O4R#A9)Y$Keo<$044GLp&LWcq$I@bR6QDIK;DXi09xC&&460k3$^PdIyCRDrROV z#G$?zhj=Lt@p2sE6*$Bzafnyr5U<4{UWY@x0f%@K4)GQo;%yk>uc7HP0veRCb|8q$ z%m6#R1jK{jNkWkH44x0eNY7mu=7V%GGxXpP@53QJ0f+cR9O9tf8OSD7%*-$qhx+L_ z#Ao6VpM^tw4i52oIK)A{MN~I}c+3ooaHwC5LwpGi@ntx~VW-T1Y(rpXhE+J!ufZX{ z4u|*#9O9dBh;P9mz72=?4jkgUaER~0A-)fXIPBCIMA(2>%nXNcs6UKD{3s6b<2b}m z;t)TLL;Nfb@$)#uFX9ltghTu?4)Lov#INBHzkx&i77p<{IK=Pa5PyI}{2_+8xiF-> z0>1rG67IKPG57sDYgfkRvhhqyEjaakPV@;Jm5afmD75Ld<_u8Kok9f!Ck4sk6U;yO6Q z^>BzA;1D;$A#Q?0+zf}f1rBjb9OBkE#BFhi+v5;-#3AmCL)-<2xEl^}4;|e3 zTX2ZC;t+4cA>NKdyc36bHxBV09OAt=#QSlGPrxBQ35WP(9O6@Ph)>5MJ_CpNEF9u< zaEQ;tA-({I_#zzQpxHXqJPqP8Gc3iSemM^D6*$CK;SgVqLwqd`@pU-FH{cN8h(mlc z4)Lux#JA%R-+@DX7Y^~=IK=nh5Z{MG`~VK|gE+(w;}AcJL;M&H@e?@2PvQ_ijYIq_ z4)Jq1#4q3wzlcNpG7j;pIK;2x5Wk5-{5B5pyEw$};}Cy{L;Nug@h3RMpWzUHjzjz< z4)Iqw#NS|u-w}q)TU9~lt-xzVL4^i*e(f!WdXN+|!+RX!A909(#v%R{hxm6K;y-bS z|HdKy7l-(N9O8`7RRSPiA}})pGY)Z99O7&^#5pj;c|;)Le;gYAuyqw`Jn>I4?Y44yEsIMk6KM?mpBusHP4Jg69mdJ7hZ9t;N(fMM`r 
zZt#k5m3|kaLrz{q5jOyf`=N+|IJRJM@QE%UE*yJ-#bL7!aAAfF zusG<%EQA<{)d&^`onj8+A@EDEcrro&$`S*gosa?LA*d9vIQWzjgdhV0!zQpeXf-v8 zFvB&lcr~gZ0|Ub^uy`$oxCVHE6X=vnR4HGuIQY~V6p380cr%I^h_f0j-Vb7;;3r`5 z$tYqVPCGa&f=|@|ap71Gyif~#@+Vvj#E1opgKn(>@esHHEWQdM0A*bQXX({Y9)j8g z-gveZApm6^0*ixB6M>3As4rk~@U@~40Vuf_yipi*auiemO8o$fUx)G#lq`6EENC@2 zLJY*p2aDeVF;VaVu=rgRF%ahoSo|S~iGpRo3rHTJh=Dl1U~$lCJ0Kncmx9GXr}-en zK&%yD@$Voe3cdmshpnbXNc{tg!{<5yxsg z12gO}e~=<1%*+5?Oo$`~Vlp#87XgAeNSGPE8WBkl+-`&}_CpnBU}k_W_QMdzY*&Dk zFf(AbDd*y>kgQP6r(%ytV%1vBL4 z6A%*xGc$l2K`3G%4m0EgO%M|WGs6zcLlFhp3vNW9iGswzl|L47aHWk!99%hL5eHYc zSj54V8y0bJ<%2~WTsdG72UiYQ#KDC<7IAQ4k3}3@cw-R<7t&b7LA$Xqgc+C_z=bY` z1TzD;@WUbwF3hlqg9{rh;^0C8i#WJ2z#@*>o&+gkX25JuLc}rKlMr#}hD{I$2{S`( z?LZQNFqs)(tBN6f7|9Gd)da?c)69^YbKra!gBfyj5sV9`nIX3V!1*u+Gh}x(j0>lk zAtwpK`7j1E1Nama7#B`6Gr(3Y!-c>MX2>aQU^X(r%mCi$i7W(WGc#bepFtAL44Ca_ zh&X2Z86s{6qA)Ns17^D#q==aTvt11l$81+a#KAjpK};0P3^{!hMGVAYhTO~nVxnMX z$c{}EF%XBD0kho>5yxz|L&P!L?GSOyb~{8Ie3}c0iGrCKz$a9qh=DlFSlamzapX}7 zkO%}bGhnv=K{Cvc)0aU^6wJ(k+5QJ9U}nH<|3kz(BX7I_$7{Uz9klS1^B$y$mxnU8UjGXv~qNmMyz2GGd{sA3={ z1eh5x$4epNCqWbjW@Z4L_J$$MzzjZF3!C^EOhIt^Ig3O591d~t?M4`K%nTPW#6SYf z3>QHZ24-fsgdqkJU}gZF6a`|TVrGUbr~*(PGXrcFEK~qWF*96)vQQ~z2F&qikP2o7 z%<*T4_)QRnfteXF$E87vm>F(^C=ATZ0J|9&Rf?Gba~vC_f|=nSh{C|k4EHg_KmyDR z4?q+KW@f+~4+kk?X22W|hloE0Q5cw+;R%KqNPw9EbZQESg^HOOo}mgrdCUyYp)6F2 znE`y8J*pr$ePWK!gA_3{V2;m2#9xCb49v^`zO@jGIOrB(3}FUlhIg2PVE2GeB*u_q zX83?11`=Rq_z0pfFf+p^3^9-ZGs9;Pg@KtFK&PW%2s1DV*5jH$qxKl{1t2}l44CsT5OK`;7l=4B4s%#=h_m7l2j2n+G7<$dGqB@O4?A52 zML7dA1L)LPR51{bnSl$3IovqJG3S*a=J4WBk2&82Q4hLV9K=G!%nY#8L(s)B=chm_ zm>Gm{*ei@f9Ciu`NH+pAGl=3)FNQ;09EUjUG!ul`AQm%&Bo6gbIK)Ax?tpZnVrB-| zsVL~;vN+7aoWBFDqVA0gu48`eNf6wJ(^g~NPp9OAH3V^EA?U}n(8pbaw7~wF-7>79Q6dRCk1ZHM1#i8B|hdAhFNrZkFiOF9XW6qC5%mJO& z1!AFMW(IE@=J?jyw4GBbqZ zP>*X}K_m|KQ8>h-afrv@5Rb(n9*09b9*1}W4)H`B;z>BflW~Zr;1Ey6A)baq9CVr- zC?-%bGeZUr^_e)tvv7!K;}FlmA)bpvJP(I>J`V8$4Dl>2&{^Jyb30)({dHOlSXWE* z<50gCERHzm(-<0byTRg!b3S1++2?VX{{V;jFJN)RxuD=>nG6gJf5761b3uin_Ofb& 
z{DnB@6Sk5=4lIs1w-aWLAy^!7ZYRu~aIiSy+)h}_vQV3WL6HfuUKe)mK`&T6;@nQy zi1rq3?BRI|hxm0I;^I2k%~8Q29;X8e2gG?w(O{R6hwUjyUfWwnAko z4s-6~Q2z-mjyN9`R?}$cV|R}c4)H7;;-E9Iv4!V)9O@6~gZzs)mld{xokC zuvf+q7YA1_nh2#J&>P z3ZYjx%>Ru;oZkrKeoXUC!QzN>V!_MW85kJi!QzN>V)sCo{!IgmBhGh)&Gg&`izCj7 zg^5cTgX~3|j|W?6kp>n=oOcIXv9Sd#jyPuze18cji<@8%Pd^h-_#pQ0z{0281Y|F! z``6(xXFm?{OJH*l=hMPwe1C(*5&MH+@vdqLat~sE4Qxf7pDA|tC!2!Yi8#-eALLgC z28JfEIO058*ow5JU~$BGv#=F-Cvk|s1&brjp@pqT6Eg$Zi#UfC=6-t|;u&CZ#QC(a zdTl0H9C1D^tlT>Z7Dt><3)`7+A1n?!RTbhridIO6<6nELZzam4wzuoYClaoEdg!N9PP0dX!aY{i(H1;{;!b8%teumCKM zI7bmCejO~1sou~MWDepSU6}fAusGt}MVR;@usEiA87q)Eh;w&g>N9bOZv%@X&f|rt zXSN2JgDD;W7Dt@Z3(FTuHX!ww;$Ohxi1T}4=3Cl=)MJX**@D6uajq|{{j(RW9&xTO z%p4{=kU5C+Ct)l6u7Jfc)l1rg)FaLThK&sR*kezJ2{^<{K;l?V326k0Ly|ZO*$1{4 zaULdYW#?@i>OX_kBhJZ$tt6{<0J$G=PA1G8X%Wy}Dus-&asjp?wiAc>haDj5VJjJ7 z>aRn^!FP@_FhK6!W?*3W3>AmPH%z^NBgj37bBuGKzLv)!uIC7`7q)@`mhVcS;;@+| z@LeRJbAX}Zun|Gnj;2#kaacPEw(?Ti31U91B?VKT?*wuu;@o4{NjjUr;)wG_VfEk# zusGs;QP?@UPR<~65a)%$#AkuU5$AuxM!^4o#S!O!!u%WJ0x}12PARM$o(~pBoI45= z{{R-pbic7H#J{kULBaRkFfcF_x`NC{oC^wDA;IkiQ4c$L4ptHgxPjCo&I^U*FIliS z;#^SJ$|P&BIN}^pn0gPeIO6JjIK!e(}LJV53n&O3#zOm+3ZZcaQ7@d6J9 z20^5Ao?-2QJsyzo?}xUlU_FOyPl&iUbn66ch3i32?DpRA1i2G&PAO~!_DV01IO3dA zn0T2tc5`}hh(GrRmBWa0l3_FP@;=zjN%sNSi#P`rw$q>jhx%0MoYxAvXB3olUxLLE=Q+bx%$xaRH@^~x z_%45ty@+$3Vg7mt7Dt@(4C|lT27uHf&Ub~aeC?anc%*v#(2 zFpxOn99meoT?30F&Y^|vAblJTQja)?7B&LAEdnHtS?_Q}g8CPTa|~epouEjNdc?W4 zuo?V~~F zBhJf(xpNv=9J9WA4i-n8qYHDVa}3BF#QEDWcTNF|BhJ@_)#Go#;)wHgVfk7s7Gw_M zd|lWIum-R=;(T4$c-=3sIO2TWUQi*xz`#%v2Qmj%`!c{+Ot#?=p8*y}oUaQr-z*+v z4rcx93>HV6y9-O_>0oip`nLoujyR7OrhW!k9J3zZ1{O!0(+gYa`xGpWIFC0K>R+w| zkUJ6Q?!ww>7GQD2IlZvY=?|5SR8R4FDyNDfW;B#_ri8|9RiCZ&gq4nRu`` z;{0A%`Mee^j@fPzO$M2RIM)}pL(B*)jyS&;mXG|w;)wHpVg2z1U~$B`zVo3Y@~kN! 
z^AYFz!ondOERHzW7q$Xm0$3bzt}ko_%q6fm;v98Yyy&Ka%txG~4x3@E28$!k0bU19 zKYPI9h;x9Qq2kQxAafAs`ohY|RbX+%`MuH5KG%c{kb1M&cq&HE}0VC*mAiDBXgz~YE=;9)DdAArR%>v6qY zkolPPbq!b?alSDuJ%{Ci)FaL}hK&>M%)=i32f*qP=N`k<@6QLBgE;pXe4i=<14B>& zNE~tQG5FqE1_lO;LXbG(++$dOO{xeajyT^Kw!;21SR8R4GAw^-7lYIz&OL^;AJj`g z;)rvPVI!U)r66&{xyP{dZ&3ykN1S^M3;$xUIO5!6*tqvJusGt}W7v+77v&&x5a%1i zR-AmO0Er{cH-?p~T9qJi#5u{ZaRRO?kT~MpW7tYWL$EmF++*1NKCxhN#JR_?6^$KW zam2aDu=3$7SR8TgG0dF*RoLU5s~Y4U#CgcD^pgk{$E-J7!Qz`2hfyFWF-veNA#CguJa_Jvf9C2=Q zG1NV?>Ok&6oZAeGuj62G#JSC|^uSpUQja*dIUj0{J6IgE{%Qb=BhGV%?MyfT7Dt@t z4BH_f&;T+Yah@}*e-{ZBe*mp*Kz(zF#rMGCi1VCbJ9Cydg3Q6JkGz^d;+XZ?N3b~J zoM+e$M&V}c=}I4mxN9@We9ZE{1S}34-5~Xd^PtA**beGiusGtp zXxL7Gj6RTh#Cg%MlMYsZ#WCxP(_nGLInprqya$V8)+3G+K;|RPi-zU*2Cz8dd})~Y zA+R{&JZhM@z(kNai1VUhD=$}r#WCxh?n&75#nDM1a}eiC!@_ORWRN)Gd}&yE@SOq@ zN1QJWThSgc6}$P-Q$gwx=lH?O$;PRmelFsCJlOc&G92ct2b+U9zZzD~`~iz2&aZ~0 zL#=5bdok-HE3i1?Tx(eWF$pY=S&vKsizCjvhK(Z~2a6-lyN1o1d;*Ij&bx-GSDg-W z58}LQSU3cM#S!OS!`#yd7Dt@t4~zHBU~$Ab*syT;3l>M5gAEgRodL2JaSk?2ycaBv zI0qZHBj^}d9J74xpNTzwr_KbKk2p6Qc2Yy~ERZP4iot8o;J+<60kVpJZ+fx2Cz8dJZ)IGu`U9cgIWG-Ee44r&e?|TaF_uW$E+7# zfyEK$Z^P`hT>>%(asD>U{foiki1W8$^~GnfIA%TKvJ_+v;#_W+Ijh0qh;zAN_C5lO zBhKZ9iL);QnS)u6XoJNO_Xxn^HyMZcY_K@uegT;JGdRTmg2fT%fy2fnLYITwgE$Wy z=C9dcam0Dx1<-odYz6l6!D9u;9K<={uynf|hx)BJ#E*i_L7XcNTi@QdxajrNloHbX0)MJ+afnagWa=ryDjyQ)L7CxK6;)rv|Vf~IrU~$Ab)})g#U)hn0KRz~Y$YB-0v@y_n^s4pJY@BuiSRAuF+zl2-oO2FS{}(KdIOiM|kG|_c<|EE&hxuy(SR8TwIZXUI zSR8TwIjp{r*Z?vIasD|>eI!^MasD~Xzw^Q3i1W{3?SI~lAagLwu>`O8GLj{V!M?ab7xXTqtrg$Q;CZ>9F~0yDcDb z#Chqk_Ff)X9C2Pc%-%U*am0D)uzBR&Td?PgqhR%z`Qj5;9C5xnEF2WKg4}~RUmX_y zDPVEL`RXwB^T6Vm>Fohn95Y=hZ3CH)IFB7>eiT?7aUMI&{8?ae#Chzn@pIPgAafAs zvBSbGemnMXr~s=+oO=(ePbY!J5$CtV#`(8{#WB@i1dAiib%&LE$~!>rL7eLj+i~3t z7Dt@x4$EJcz~YE=-C-vRiR=WKgE-e677kHham2aqF!9M?am2aquz8amU~$B`?yzuv z3>HV6>kczVau>*6%y6&+i(~qC0$3bz4m@l=;tyCHaSl8zd^~o8%txF953{!kERHw_ z9##)70E=V#>nK`vusGs8d6>A$VURh9bK_y=q=Cf| z=gh;z=YYi#=gGsw&w<4e=gGt7Q$B&k5$DOn#FdVK!WnTsJS-jRfW;B#%)`W8z~YE= 
z=3ys_#(>2U=gC8tA2BeL&ZN_=i&USfJ`N_2 zn3s~18lRS5gs#dQY(jBjS!z5Ur2zqD4DnIHB|e6h1(~VIspXl)sqxAAdBr6~rO6=c zKvKquIXU^si6!|(@rgz05cOu6>3R7@sSu7ue0*j;$gvFZQGS;3@#%S`@yQhxK1PP| z@hPckiKRIu@j3a)+3^MWIho0ot{`Qmxdn+u**+GTdGQ4~iOH$Rf`&$AiA9--c_s0A z1^LBDGG-~MC5f3iK9+g;CGlzbrFkjXBolL!GSf@*OCdUqL;QRU4IFdQ^NTV|GIEQ3 zj4XmuQ%aLdGV}A|a}z5G3ViDakA-%1=%$F3vB)uGh!}MX!57sY_x>VsK^^B%DC*^D#8aO-#>Bj?XL2 zO-cou<6~%Ag6bPkg0aj^tbkY!;#q*aoLHP)4B{A?=4R%(=A`Dre9sUc6=hjinwgwk zoL^Mp2eQY|z&Ss!Br!8DwW!$Av8X7qGAuK-+%wZJINsbPwJb9^wb;bDG$qkB)7910 z&pR|HGt)IbImb0KlVZalW|_phh5Av$IGEeY0^E`^GhJOlN#D=N6r52qQgY&xi&7Iy zQsc{u5(^4ai{eW>eUc1ZeUc1a4GqC+QgY%;64N~c+{y?9j3JhgiAYTGiAM_CV6%8v zSD!3HS644rS0iJv1(=TTMAHEY5m#52kmQ`=B0nQYE(ht$NX-E|x+pQTBtE~iB((_c ze2@hYAD}Dr47Q99iFb81PERdKD~NY>^$&KB2SppW1OmG$r8GCUGQJ?OC^0v_C^aph z%+)j=!~i89P%(vF5lGh1G(J8zKczG$H9j*XzPO|aBxqz19}g<*+!9NiOOrD5JirAQ zSjG$@6PgFhU{0BNiA9y6C7C&(nDT)Ze4ug$sj!PL&n(G++5;-hAcY+$w}AX%X$dNU zK#4TIxTG{GK0eCO2xPryUS>&1VsSRUh%^at4EDyB$}=+qGW~)L<2^HjP2w|MU4zIr zB?#FVSL68jG=}&nP`H8I=$MjHlv-TupXNz~`36{$TtG;CP_Q{DK1nk&(=QmOab{rS z%s}q*Ni5Av&hSnx%1h1hOm?*diMbXP)Eu|`BB(W>9GMvfa-X4@Z)$FSQKf%D zYEdGnEHC!Nh$~m);?xpPa8?I}g?F%de2783YiM3sMoNxByg|HCe1Kb7KxUA4W~OVf zSv*LoVSHJpYp`>CuuHs`tE+2Suv2`nQ+$YFyh}ldQG8}*kf8-Q$%C>IdP!k`SyDg^ zhva@tFJkkA8Cq0g_`(J33rOMNR+j1N6>JzEY!vV63QIww!3(MCv8EC7gBNSwqQbgj&bd>lr*dQKjRFLj7 zllWlMcyN{JS{4-S86WH$9|X>uK_NyUe^OGi7`l#@EEq|NdL_$f@CM^7xIrBvNLw3G z{gIYK4P7&bN)9!Kv_-+qg}jo?lFA@xhsrrICnqT}IXeKQkx7hlaQo3DKEAjlGd>>D zjD#8Lm;-J-2A7!R=9i^<1|c=5p@kx@?hUp!5VZa6?-yKRXjYU8G7{7nPAy8!OHTC+ ziFY+AsVqo!^$)fLl}3JeqJ-AH3UC+4%hlD{5LX`}*fQQfWY9R%hzMsI#k>3XJ30Eq z$NR+_g4(osW$~bP7^LsxNoqIE2(A7BbzmV4HA{FGIM^K4O-u3uH6#r!(hB19^7E6j zQj<&KlR?c>aM#<>FR1|kX8_%lA4oR0vRC~q|UTVD~M0dFUTDH zj)X_Ji6N-CNi4}s1`pF^Cgx;Tf%_rxiFqmUDXEYVG|(860ca!ymT-$plj1!=-B9E5 z#G(T5AWv#=X)b82#4otSGPNQ(wE)zcaW;hvPXw1(CMBlCgGbVSX((Gy)AEIVGkzf(9Cq zy@{gA45TWkG_NExHx=B`bT&olbtXeY(E`-x2aOXNn#Ox17H7og<)@^^CnXlA!i)|s zfvYhxhz}^r%uS6iO3eVNg$z+To4SI!;(o{q4Z(($Bqrsg#-}Fb$3sgUpG*T#p!x;J 
z8ybLBfUWZkE-@)c%q;Q*buB@?R##W>P?9S)Cs4;^SfC@h+0YE^B8Z#8X~;7;5!6p6 z>-Zwubz9=y-JvAOAo|%^pmM}7icY%&SgC$eIvM?zu@eYf=ZhxKg+;SPiOC7|DX_OQ&-X(I-mw=a0#f2EzQd;j4voDicd)_N%TwxjhB>x zQ!;2^)YaM36ir!XZb6P`GAJs+@#z|n3~gitmlzs=3@c3q8RzL34>ilx+0-*2IoLS9 z%+)m{9$X0Kf%8~qnrB2_NpVqdiJ@g?aePi$VooV2o2J29TL$s2reHx=|6oU0(+lcy z>{h`73!DiMWdJl);>@`4JPXP@#A}1wRsywP$R~T4k3i)*^1Og4cwPY8oB+631Q+X| z`6I|ofhVY;1+EV=gS>;mBZ?R)lnfIII1ZdaFop<0(-@%HA?O4b)PbPn3(lWG-XJG} z$53%Ok#uum4n^cqm`HGA;!z%370v>`-61tlTX&Ze%&k&Ms`YRtfl zgc*Y5EYP%2eo<~>PJCtwXm~uo$TJ`l5rNPo1WH3e-ob`QZ8BVaVT_)Vv#DzUq^7~s z#~tYLa9GF;Vkd%&!C-H26a<%;mXsFcqswdPC;buOYUGn5-#hDPzh$@vATDd0-l+0@euR3aG}Vp+8iUxE}e z!G`g!uE9R>o}kGbSJwb=a}h_Y*aWQwZxjzLYFu4i3Xobc!!bl*Sp{6VKoTW1O=2%} z5Tc-R459&&EC+RF^NjKfb_UP$xt0aH#s|B{2YbZ3hUR3tf|7}6ynnDue6V3Wv?FO~ z7#{-4kntctdivxVxMJo^FVIv=nJXw`f)WWN42E^S%MA7eO)gO(E5SSAh9>cTpguah zwnFY1f-)fpgQ}|laC;J*0l@`wdY(*$0V z<7|4E!x|)@X_;Px9z37DFg0`VItR|F)3L(1Fp0%Via@W1k}z<1dXl}X)3%hL+YzpqNXs2IG00fmRPt zDTxK+NMQ!?9w?z1aJc~*%0bEP*sKL5a91y|^+AY+1;m=5U=xTv;8q5x$?ut$l3EcDZN^0uLtNk* z>=N(k1>%8MX@F8Oq+^CO1d1`Z2@6XMl_-OyFjXjH$420I3<*OP1X+sAcQyno^79TZ zu>{Ro#b@TF<(uRs=B7f#Q3jro41|wHLk)!Up$6ua=YrRirsM>a1%sLp!6k;~@$to( zRjKhM`SF=~C8_DDMY$l|-iT2=(0W0rkfAwPO;KVFWFcx|UWq59WKJ$ig1HSg-fdZu z4;|}v&17&1*DFd*(~FN!uc(MGNG&SP&r8h7EUAnyGXR;5JT7Vy4{{7>WV|>&CpFJA z%hfe21XS%q$Lamu+=5+0;zJyrd?4#5z{BQHFOo4*?ri9q3mPdm28D zpIPAw4nJfMSs;1H&;Zh4$#Qiq3rNla&GCiim1pLq#0RAom*#>N?ULaIhxMU`#=j9idWF{AT z{?S|gyWiI}u<4GJzXG=@xV;Fp)wwc&ma@2~miQzVmw@Lb zAwh$fxJAw{xXpppE@0QqS@G1rj*3hJe~y0ia1||6mJH(NDQ)AcMdrg156! 
z>Nb#R(Dgr{Qmd#GJYgN53R)TsuCZ`?5bu0FwuFcg;qYP$B>`b5b2daz7tk~ePQQMM zxv8*34HkeD2e4=dZ}$MLD@aYwhU@?V<=#wE(+8wM5fuVk-j1&^>D4T?5s z48o*A`a#2F*r$XIsXZm^jngjB-WN!5ic<+Rr6Q#@14z2}%uCBRG)+$}aZE1BEK7w5 zx|YH64ulWdhk{+Jkx_bT320FZ>Xxu#*GyOQqSWHjoD$G3EtJTDj4**Gk||k)0;$?S z9T8;Hq0`jh@hhxTZZK_#aTVy07sU>OHXo7GF?GBOt9wbD@`r1qx>sR|3CXMEudKmV zKH-?%KpWJ;GRg)k$f)8nECEQqHxXe5O`_oRHk9YH5D_&+OzestI{3UNNLlDg}q?#0WbqH-cq?aZZrNk#D7nWv%ww{AK;1IE* z)ST4B;#7wC_>|)Oc*w4JsHVh{)QU{du0zmJCj~KTnw(#nSK=B3TI~vLEJNz|fTH|@ z)S{BiRA^;QfgR?lx!}$4o?bzjpe1dtuAl*ClqwoyI?D_^0S2B4a7{v)s3t#}GAkhd zjE^!jNdoPPM_H(BWCU8f?3tIGQ(Bx^mKso$nwD7sUBql;3|5a_(hs~=%Gn6END;y{ zgmNt*0i2VX3(mI4>&po3I}geXHjNMR295t%B+Gez_6IR70ckqRf(1ztnQjuHnqgOnjz;y@k2b%+)mjTPtVjcmd?0jt8sgrjP5Ra=(sl~LwqCiG1)Il%qYZT2 z3n=a&t7{<56F-dgc%Vrl_}C;^DJ)zulp3ezrNBC^XbVG4lTy<&^T3k9B_^5qC5fI$ zXR?6i;_^V_&jgkRx@NkTg~UTP%!5b83{8p>^HTD29W2M)wRj!@zd`x|QaE4|;;eB}~{S zGcP+e1(J~{&nhUxsi4>{$&UxkUlxOgTtI8BK+6jtGx4sVwPKm+nUH}!Z_?-Y3_-({ zuvy3aB4{z6S{URFo;-mr7$U=96VM6`uyLS%5~;Io7RiYPiOHEIl@Jq1_m^2_9%#-O zyxbi;X$Rl1P0nnIE8z{=;He`r%yu?(^@lDwAj6lSlcn-fK=YblPm*CjJfJ}1>hX{f z4l~GvyP;unZh;5*94L2J&{;OFWdYuxomIicpatZhHBZ6D;08&MH*^g-s1Sqcwt(yQ zFMv*S6E-9mJkK0ll3M7AbMh()RP*85qKc>lVKZ>#q(5^|WtLc!3Yv%lm1pEf4J25D zOU%>qi^>y=Qam$3OM8Pr3)e6={Dan|L3aE@EG1^coDtT&GN4WhWEmfPn+#TUkad5q z&PMnZ@gXrODZ;CB=~J4$8vACibAC*x+4%NYz{r-g?dueS7urh;@|lb`Q|5 zmSBT;Pw*ZwjB$3zZZIt4?4Tw+_*h!Yc;CbV$i@QDApr2>A3(chjpJRxgOfy^Xn^ZL zgLs_h8GvN4A7>Cx_%s9Xwiob_Kh;Jq!Rwshqqy*uONi~IWQ>M_2KX=r1<5eSyg0Q4 zJY)?T*r$epu*we9n8q@k3ff|B4w))~Y%g~OiC}Iz2knYCG=$D%fo3oug%1@>buj=N z1zM61UVwzdNYclB;n55pRL337q??Q{vPn0Ms3<4hSOO7Gx`D(;Kj~&e6M-Mb3=cF5 zk!~VrTojytg)IjD+pe=2D8ek!UG1l&E1Ri;UEjCA+QZY1-4~|dH zNKMWLpY;XWF@&NVu^AfqTn7ARLK~cr!|_39T7ypzO9d~t1huQNo*seI{Rj!{ZZWis z_XRD}j1MWm7(BFycLAMNmS5=^kO`UfF^mriPV@wAo&#kfqyQ!MTnX^L0dNZwY3eC2 z73(1suHfMzP%|YSePRkcHstCW>o}u!Jkb?}nx+nQ71ikK#ew^gu(07N8TU!IN^9pbgXU#pQ_wWKG9`&Y&*? 
zTLf|{qzFb7mSmKN&PGJc$RV9113icYb?g}}^O=CgvLKN`1s8(qEcCV&Vh+d19J~@2 z(pV#WzPMkoQM?IwQznv77V8l&}nFpo?7AxUEBa_EUE_js(kf(Qo+oJL%>{9LL|TDCJP3nskwBcH0h$kk zY6eYAgYT$--b^t7;fHcuK6ptG^z0pss4_%42OV@`B5F?c0W}y6O_9|2=BK2(8o0We zq*kORd(z-AX3#k!=&9WUoR5%i0>r;sPsEFk+vi@^t`!Zwd$?lA?=pMuItP;Vxd^54wa3KIN7~?35eQ_$6%mC?p=&T%9rM7$)q53Z%2P48iA@U_TlHWoi#R zsSi7&%h?chE-ttPG)IqqToBk*#^8e?;!`0ts%Hdf3p4!CFX(X<&?CfyAe$ROT>+f4 zM}`LAL2JnA&X8n_)oRcQ!G@&R?G3dYR4Rg+9-!S_pq8Cqa6EL!FeJ*6+SO=B(vUP` z1>2=C-JYA8Tbv4B z;0D?eH#|oAq1|na`JDjBF;?jNJ|W$3Z2HpH>_Xve7jt7&10LjHaRtP2+vzA@gkUnV@5HT!VrQkSK6<+toC&4oTRMLIx!?Ak&GEw%hP2X2C5Ip9VUZ2~_*K=EBQAq#d}>4c6d1g!4>6lX&0wGDwpQaYPy9R5im9bSkXj=&7(K zxK{$0fX3KCM`}YV8tfMvfZIuswYcDgaHXl>ln1GrK}A2VLvk@|HH;M$NX;cs69ocT;w3s{R3(&EIW%+LVQRCf)?1P3;DwUB90EXLv* z32AFBI;jpB2`XJGiI<|Y?Z!bWkBhpfq%TrdQk zmI1zK05O1%y91gD8_GeNfkN+AX1cEI9~YP87bT_>n1w{$rv^Ip65B!d#^A9Dc=CoOT3pVCMnrH4=t|?#Jj8@3 z=q@G5BxzoJqGv#6Zl-Gp=!oVJ1IQ$$A^4;=P=$$@#RP9NgN%40B@ir=Y*-J)2Ola3 z(M`sbqY0>Q2(F5QOH9BsbD%qFN-R?mmzsdCG%JivW0F;+mO)Y(0UmekXE14*e=aj4Oqq7bAnuLC3KIA9NEQWUqrikmhR8dPz?<^|N{cfvw^V}WeuBZ5 zga=pVC1(`n=VewwkE+EoRBwc9s2*ubASfUi-$a5D>U;o}lirc01|arfJM14^qk-4f zz%GQ#HE<0`h993E1g@z{P>lzT{CfqPKxXE^g8=YZ1JKFqu&D)CGG`44x(T#A0d#=| zAy-jptQj3nx zLv|qDfo%xdC?4aKMra5^`puyB9iiZ%?*Of#S-dOwYDCbrgW%}`(1Hc%%YjO^J;yWbXHLU=&4aFaFN)XtG;Mjs(dzqZ$8srT< zG#f03e$y$vGB9Sw3GjxL5}*Mlv@{1Etr)HYJkFp&$AP;N#}NO0CBY?@xry1S@!-8k zD3v);qjRoFuE9p24N)ba)o`E{GzEz3&f}r$xj?t}nWm&BCsu;4Xn}MQT+QMk%T@h? z%|QCie8Cd9XMp8I0_0u|R_)+vA$2v@R^3T>IzX` zlfQBWv=tvVt_Z1Wyzn><5pY4EC6ung&hX0sgH7Y%(~85jeu9lIBezblANg$vnnlb@ zO-+Gbh3=XQYX}dQTn#GrAlJNuw(o$OCCI@7ZRXRj2?-h^2aUL=f~~|;lY^Z)d{U?p zXxoJ6-5t>B1Z;N;ps0h~-QiZ03La?! 
znTdCJmZ-Th&=NY(F18XwGq7tDOY(~#E9Q$m!ShV67P*NPs1r%>J#RtY;7hacPFg_* z+#oX);E^6|(=VvgK;StIg3~}yb3oHfkm~`VgO4~Z0Zkb|{fL+|!sTm}IYQ7yfsiSo zU~tgjD-Tmw$q07r<+0oFS7E;O+plY{cxMAngz` z6{t7q7`h_vB?Zr6f)Y$JXigur#RkO?diNMWgWnm@Mj3L?1-EAmT!TxHXCnxB2%kZa z@!jCmoYdqJ!{QQGOrwxT3s6S9{c-QgLLF^@jEd(PxPq?!1rKpx-L?c7DGV`yY=47{ zDh5HDU0A{eJV-#y;#p+9$fHxBA$-tnwcue?kkg18!v~*=14{FdQUjdUhS{hXEk?e{ zFWRw<3}psef@V}e1tij7527q4Y9Ji6qzID!vCMCPi%X;umdLBdjqr5%!F4QrS_q{E zBFeqsHUT*QU>PDbLmJ@ogXL5Fc3>TL$8n)KBy*B%9;h*cWf&j1Ou=s^TGaz?BH;}O zbL4P9S||uwo0pTBo|l>upHvDueFNn#G|ZDgaoyjJJT8vex&n{GIvXO!&f!TCS_tDW zwcy8mA`%{P`k>S9$m*6t9AV5Q|07@jzS};5J|%06~dGyjHuqBCU@GHKjoP zH}D7m==Sc!l$4^>;$qPB6FlhfyUf)Uw5AbWDpTItvxKk51TDDqB5M>H)dAFH!6`z9}K0CHQWS- z`^@7p2JZ-D!%Wo7WdPZEiO(S@2?#QH=ZAm6He~aWD`><8GQby~k_uhS4IbG84ec3& z&al7}*xvXCs4zyzkfJFUcMO5b5kJUn-`?Qx$LM6ky0g$C5VFG`(IZDI&OvMT+{yyV zTwPsZrvV_;8lcq=0m%?UA>DCs4F&Q(h7;iZd{F-b+WUs50^ANjQVj_mVhr|%UL1f_ zD?n=#aHdD?AQ`xVFVh6wZh+JpLaTmVT_GI=P?HSacfy^zNU0PMeQeWs&_O=1{xxRL zmlRXcdcn|B#x0?>On#atp+0IrNIYnU5geC<`=riBBzK;1haMrP!Rv6s?t;x1l5ZL1 zCZHBH!N!oH3AuBIJ8>F;&)LOt{W2suf^r9Bs|Rj5f#{ayBTUg^y%7S{+YHcq8rX3xL#J)G-%18cpMgZOWR& zyCN;9C<_PyFQ@=DW25j)VUprsw6eeqQWii0Jh3O2hMrmoZXH3|NhP3O6;k3xZ5u&P zKf=}CK~W6KHPDs}xNQ<#f>ghfk}AM;E%=5=@Kj1XC@ZEG7n4>?;|?!^^(nM%O}+{6 zA!#Hd2oyP>mK12H9~{@<41iX185+jNgFB?)$_!G0d4o3a!p3-zsxTyLDJYE43k_J= zk5Ufc=?9S#<(LT`)O(AM2lrd!lM+)P^EuGafo&OrjItQH`a|ck!O110v>+!Ha=nXB zu7Rr;rXp1Bpevsss{>#m2?|{B%3sh8K%m3+f=h6$jYBLyb14V`m6oRQo?g(Sz94ll zV(^0eflr);NN|avWqds7m{{;Kl;p&c)b#u!&{4mxpn+*qxD50-fRK1sLvy$wOgU)7 zGdKVs$qrI*U~I<)6&=APMrNS1Z}W5G6O)rui;EF#^Ndkymk8& z{swU)Qg|47;!F(aU3ggI4m~x%8>HaW1&=bC1Taz`08)T~hx@Tkm%&OiXt4v|4GykA z@wC$j1v8dLg=Ks^B)AR1r->sQZWterwh10o5@Z&~7bhp?Bo=`;2$;u1`cSTA@Ersp z@xd0L6aAs_iLvm&0JJ3r|EdEc3vKFajOi7@S%JK36`mBr`t`bc$nGW>HCLVh*@0#O-)2r$}RUIOu49NRJpa zQi{AqpN#W82s;?O6b@z4rM*UA@WvpdjJL?lF9vT3Gy&a~47=zqxCB)k)*^?*85Kg=2y}QlMg|1u zJX}#pO3;&HJoNSvbMWaop8k-#QZNfA@VGQ3=3;R_QD$Pd6Er|-4mpb-dT|n@p@x16 
zEhxrtm;f$?N!A5Dx&WI!ph6W?jzWUa6*Sn0aw0CsEpRm$A&j9u1Y!;-b$fyn1*(5R z;*flgMI7um1_lOU1_)qcU|?W@VvvX#h=B1~7#RNihk!5~;$=9*r{NI4hC^J65h4L{ z3&=q*d)sh`Z^t1X0rC*k{h~q`HKl;AvC=qn-5A$pqK$kBbyIm!^Dxz zQ3ol6nuDA^U7+Hi@BuMJK$;mC7{Wjt1_lP0xC~T035R$VR2)PpBl))zBmlKn1xeh9 z6=ERBe308<`Oy+d9HdnRqKu&$Dh?9UKr-h5R2n*&qgj%s~!^C?s(;B=gTh z#X;t4B8gu^5(k;312L9Cf*ryDi6Of)7AlTz&O{vI>!9KwbCBH`#sM)OWR4b+dt#Bq zL3&Ie${6ND#X(}o?s)j`;6QU7a+!QJfG6&f`ZBTKLdXStAL_5PqBynB_1_l#o zdAb9K_O)s>Wf$;aaAP$WG@vnSlXjjs#R3q`nnNy*yMLq`nJDy%$s*q<%7z`bZ@4sYv2A zNaCQn09Gz_Ac;>$Qa>9>dUJ}YrhSKQjOL2(9;vHrVEWBE9sPBe~!_=!n`IDhE zy1h$rh_8W)!_0x%y9FwaZq9C~I7~gP-Z=~vM^}Fphxi?+ILsWFy-%Rx=;pkJio?{K zL;0VfG`f0bq;$9nDc-rD;xKby_6kA8(an*Bio?{~LG4q7ileJH#vyJC6$h0+$n8lF zs5rXFT_fQrM+4}|g?pft!FWb=EG#F5QkghPBSk~nhta{x&k z*__iz;>hanBZ(7JUcu4}$o<=Kg!5J;apdxU8Rok;FM&KG-;>R(v-1X2&`6T#B6BU1S6 zM=}SN4?yaX!xOnZ0Lm{g{|6wMa~R2-6eRJ9NaC=30Wu$1eLs@=BS`9(A&DcKgI*3F zgQ`O=hmRtea|0?5%a50!{KrrlXFOO<4EEX(DDjq4y=4vhKi$`hhNEga&OIK(}0h==14Ps1TzibK2whxlY1;)`&IZ^R*f0EhT_ z9OAceh(E_6{uzh(FQ_<(I*C+1I3taFAjb=A{09{8$l}E~)OX_$UyMV1HxBX3IK*G$ z5N8HC5SreQ-3b~)10`XQG_trcNC1m^Zye&uIK(S(i1*?UUxY(^Es{8LdAkWo9J&5N z?k^y>H%vjEh5G9>Qoi#>5=U-tL?DSHw-ah{h$HvEkll&gZXwm3X9j^gVf_YBa3i-f zVdE$u@l#0dbcTixC?B0e5{I=DK;q|-#Dk#fLFOR0OB0dAFCeMUK@vv}XIUii%}DBD zObKW4SM>gjNlK2B8_sbyFQ^@KKafrkEDIotMn}gg>L5>$+By*6{KXNUogH zIdhT3d6C4Mq2eHSg3^@+)Li6ph7U>oLLBN>Ld8MuL{`5aDvoacai}=DdRTiIB#!J( zKB&9U#nqwW=;rG~#nIJ=;1DlE5=RcVJ|uBgq;Ob)B+ic{{u)VK07;w^yg!i#E(MHB4)VD(22~v+--%Z3Jz8*>30Lh$vIK(d^i5nuRNA5=>izDYF zBP8`=pa_AcPf(j2R$su%1CaYokkljB%ce-;t~ksIM-oRiCmTr|IsIrN`O6H+d_yF0 zWOpLho5<$-L)C-g9aO%}0x=jE7?9h07D(nJw-c<8#9{R#C>*Sj#Py-(gT#^DgFK#t z>>lL)f*q1M$mTmDiNo5HAoHD&#NCnXwMP<1uJ=K6*|2y-PUo&T)VtvjcgG>_i9_56 zNgOs`4RXIPlDGkq``;ktJLK^9LsE|%KK@AJ$l-%54x7gV*&BeQ9@+drBynk|dqBw} zm<2Jv<_i@EnG=Gf9(g?vvU)Wn^`JEau<%EY7vypkIh{u!nUCzAC>-vIMpBRL9%S{% z?ukWGkL(`g`U01GY@z-Fg%2nnf%+IAA=tbXOnfeg!N9-(>t}(uAoWW?3)GGiWUeEWIJ;3(%MeOk57hUgUI$oF9?Jop7j!#V^R6$m)}j)FY=W 
zSUVh~9$CBzhk96ifz%_L4@*BFabcwJK{h`PNxTus{8S`yZV}#@$Wc9G|7IgK<97*XoViH;MK&LKJ{sA4SUVHso+>1Bt|OU)>@Vc?49MXO z%cr1n2DyImKr*KW$$VHp8e}hWJ%!v}MOI&oWKJ!TImqEqha^4$Nqs$%IC8t90ZAM= z9+AyK4*y0Z^{{?7$oBBo6a0$hvkU@iQPtLBq2HNgOtB2vQGPO9_i#nYcM=o!53G`Dy;Q+!Lpz9Q2?I@5K2%AIK+aafuZX|y#Msg>z_$DOr9whb1 z=>d71_92pb!pu*Dn%|0~9@+d^NaB4+;kE`U4l}0$YR(p@I4B&D z%{hrf{1K8kvO9l3#bM^d+zDPd3{B6-=8J+9LdB8YsSXtf%_AVM&oP3EgWL&fBg4YS z6Dkg(K=IoJ^;aNB0BSz6y~y*h$l|bd`yg!~bEZMfL7tyQR*yWsge*Q0WB^jT7dGxW z6DkgJKeGCDIK;O?#X%IvJu{*1*$WcD;xE{~0FX3D{amQ}Gavyh>K`JBBZvPBs5poM znX?RP&IgbH7IRpk{TPtHkj>$TilfJ)6jU6f9yuP3k;MCv(ybGc_yiLFo`wzQMxn6jU5!{wgH(u=WE; zd^M6dJG8z7iLXHtM{Xy8)>OdEw?k5oEbfUUj+_pW+X=|&5SA}s?uU&>!Nx0K;;W(M zHu^a57N|HZoMGx=<2mT<fq*H@_dFmaf_VC&>z;%lJhz}DBn#NR;0k=udD?Rn(&b;$98 zoKM#y`HRx{Vu7}+=^0vCs+9E;SBg}DcLT<9!P{fpdRL(Uh-hd7jv$F6rw3$l*mw@eUgYsr*!&1c963BmH6K|$^0+IqdyvJE z+rvkZ+^>WbzsHcoJ)z>T{0J)tVDsrP@my&7i9=Ig0~Jq26K{czpTN>T%$ze&aoBh! zOdPiF5qW((a{V4jYewiMK$_fvxj|iPu2g`3#3U z(bM4(s5xKI)ayVutRj!co4s$=Q$*Chk_i6oBf{)0&3$mbGVL=s1K|7|4k%Shq#07)FVe0~QN z2gMgS-GT@P28Pd2aZq?7m(R%in~?1l2N?j(kI3ynJtT2t^~n31kk$JksYgzqu>KCn zJ;?JiuyO_@jy$giE9XJt$m6=m<8sL3pRn~OAoa-N$o&gscWy^YpUCb+&PSlW7p#AW z++IZ<|3sdbzls!J$m^-DA&Dcm=da@szkx&iCXzVvx;W%XF@l8%Z2l z{T(E6l_V)Fam~kCDWY!|e%@IP!W(hxFxI|2(tGDlKP2A z>XFAMcjFL0h9r)hA5SBRBj-otboCO+UgUiA3P~I}|Gq&IN6x43ki?P4ncpLc!|Fqj ze?K6JBe!QhB8kK5O^|xze1|-4a04m7AeV2Sk<39(4_}bPVf`kM`N;8(Jn#P%Nj>s< z=5I*iu=5i@<{+mBB^==oTUQBEj~vd(HwrV8t14!WsYtMu1{eff-Y@8n?{u44U8G9R=C6IPEPug5@6|97G0gVZCZf8_o0$nJrKKe~I6$0d;6 zgIxdqLJ9}iI#rOp$mYY=BcYp*JdT8HKFmGn=8&2mkjr`G^njdSkkbS5xEXSKfURQ$ zxgR+_z}8KH#F5hj@_H@g^Z*<8K=&7H{Rzk%KT#5k=K7RA&DdB7iJ`JWOG=M#F5)AtVrU>`wEf8k@G3CIPy9@ zS5^|Isbz49W1_)%X1DSb713; zAoG#^3tNW?GaqISEIeW2pgsVoXht8WtpQoiz`y_tXPA1}d zxd&3e5h;Hm_p^}uccbg>AOSqO?hbO&+vvJGNH~nHyMu(o=(;;dIN(}$hdh3ZJg$qp z--H{fd`^QDH{j*7JV@fOdLL9Dfy!=JeFxhw0}|&&QV;8Afy6=TVf|s`eik2+dgOi< zNIk3_k30^>kE9-X-5qkhgKM23@;HD1k~zrcBagqq<~2ZW5kyjtJZ>P2LtF%hxG0i1 
z^1Lu|zZkjyC5EIPd0to?NgR1TT>?oQ*}sxV;;{80Ab%m74{Kk6#HEnbBac@|BZ(uQ zmjYTF0_&e5k3S%bBd?1??x)~tUm=IHERwy*`%vVN#F57bt|GN7kj;@tQjaXIfFzDQ zZmEbQj%=?Ik~ngDLv|0WeFzFq>9LMlv6Hd_x6E964T)$5HE``4?m_ zayTHT2jp=``w$9_K_Jmqs4vMBaymT)x5PRYCEB>@VbYz8R8xkk2bNM-oS#KSeg@8&dpQAgTY2 zBn~>`9TpDA>uIf!)FaQ2BbQgm>!z%c)FY=uWOpK`^P5QJ67swNvU`x%`ysmrIbR^V z2R45V3Qy$p30n^X5=TzY$o(=KB=?W@%OLHj(fv@6a2VYW1qp}I{ZNo_7~KyA2?uKJ zheEFRZIQ}h(0V&iRRt=y9gxJ4+XIeB;>hE`PDtX&<-9YJ_-Hu~iO11$9uf|t8!-g&e=3<$C1Zp zkk5}rp7*gqn*Tyx*WHXHj(na8@_rEHc{Deq{sQuR6Y@MYvbZObdgSpXFC=kf_1;M0 z$m)HN#F72wizJS0jvtaZvip(eWsv7Rk>_QQ?e#}82ig2UBynW*!ARoB<6|L6;>hYj zXQaW>6|#634)x(k;>hlaKoUou7mh>{N1hi(4hQ7*oXGQc$opuJ=kH+a072;xdHxP| zKLtn}dHxP|ejrF3c|I3;oinmHY~2M&J@Pyq>>LJ=II{bZ^H&s-e_{I{LF$p`zmV@8 zKsE>YzER}#fV}Sjd0q_pJRIbCG1&fVkiE$H8hO4LdEOa$o(_4w7&$$}BKZq>zAqk0 z967xq&yymrLqMJ>|t`ex z&{zwsoE)h77rDGet{;)(Hv=hNki|1`h-V>*wl2N zk=JYGBbkF-UKJpTBd^;jL=s0{$A#QJK`tMPk<`Q1rGwHfa`}V2ZUni!LSA22f@Dq> zXmSd=&lkDegRS=inU7pgA=fXZNan!K{Q#-&MOqhuy#61#Jcpg@4pNVt?~wP0BIjS^ z^0o}gUgU5_E+0q@XJm8Ak<3SK|MVgG3%Py=oq-C=U&!gI63HB7@hT*7A4M^&d!w0!NfV`gG_(coHf7Bd0^;eAJBO9^`a_ z952Y>hAfU8@2xn@X-5)AuGf&`1zEfUNj-AAr4vaU*f^`zz(Wb={J z74-Na@O2A4&@%!-K`a0w7#P6fy-4CPb3n}$(7gswWehNPg2dxMkq?zXHop(a9AxwR zk;IYB2i=1JH3r4}a2)1??mdL5M>ZdHPa;el+58De_9DA~B9b`rIUC6LmLi|?KN%@r zVCI9I2(lLzkI4BKxtv4}AJBeAm^+c(58D3-6Ni}(3g;0SxgTUNA@_sq0AY}OKxZez+>Pw7=}7JbsRdzt>Sy3k4>}7OoB6Yl)WhUK z=FdhF2iXb2AaPvg%t2C*Y!2*9WsrX4{JQ`t-jUbwB9}|Z>K78B9=Y6JgrpvM9WS!^ z$m$mpp&q%uUV@|^xxKvv! 
zGl3EgwB3a)E{H>10ZAPBTxjHb`jE}BMpBO~jyx}bEM9{{{S+MHuyf!+;ec$;5hV4< z?!1g7j;#I#4sqo3YLM$|*!l1vdy&&EG3gL=201L9jHW}#IPhpXgoFdN(jjvBxg4q7 zBc{AvL8AJZNaN4A%!lR9{i}wc`=QkWxZR7ZJp()A z5T+JHgNj95>enE(D?n;N7^HqKQacUSegnlj2!qssFl?LwBnE3!fy80wqk_btM}mTc z8DRTFK;qD&48h_Hz$E1UQ0S3`U=i5)cp&xAqXWU>55OcuJwKEUra%sd*4IK{0SE!B z??C3j+MFO6ZzT1Tko=o~BrXC{2*sI5;-XM75Y>SsE(Q{SV%WX3AbVkLSCG^RB=xW} zF+k$5as#AZ3M2r<93TOxzhGzPfTT2##9?DoAaNHYaoAo$khm|BIBX6JB%XsLE)Nob z;vOV%SQ-FHtw9opjR}CnVfT7~q+w^$g2bi$38YB+uJA?S3JFG!8NE~+8Du@rd z`xr!n#9?zrAifHyAp;F(*qA>^9Coi1NIh(90VJM+q#m{x7$gol%Nj(3)Wgbf5Fa#0 z0HQ(SusQ<72c4M=qCw)Yu|yCbGPs5w{A#IHidzo3c#fr^Vj zD=m;2Aj}IQ7#J8#(8P0niCaVYHBcI41_NnC@w`32iXg&gC`(~!|q%F zY1x1z4mn|S9ClU`NL&QkI0N|$HkJYsH$f7It+51&+aigB+zG48 zBap;lceQ}jS0IVQ&i)07&p;CI1qnd$LL_mJ`LH^E50ZEvR4s_QgCq{yvkVgdgCq_+ z!wn=Z16p(rt!F`gE`Y|j1Csb;kU}UPVRL*SsU1k-u)X0R@jFQ3^FabI3~ls-+zIPDfTR?V#211DpjZ=0 z9OPfve47W7ILNHy(0I&16Tb=-??4lO3>9C2CT?~Q3cny;HIgkJp&p{H0&1Hb3_8^I000}_x z9VGFKP%#ko2T2@u)-*_520Ca0azE@$50JPGlK2&n02Id{iC=|^fv6fJ@oOLfD4v5P z4m%SSB((=g9CmgtNc;|xIP6XVkoX@YaabP+BrYQkA|c~Zw;@b0X@evVJ98Bz;ff>< zig#EVN^E54&<*lP%#i?fg}#QV+AB0 zfg}#Q`wt{ufg}!Fn+g)2gCza|Bml*Gki=nUt%IbFB8h{-6Fl(Cz`*bVN&FK?Arx~! 
zJ0>9a!{!P>QW{9&u)APD;)Y1#AoF48x%(iA!|u)isV_hhhn=MZ60bxO2blxAZ*mHf zIP7j~koql1;y*wFP<#bR9CkMiNa`+G)AhuzfzGN%Pe9Cmj+NW2$G z92CFyP_xz`iNo&F0;xZPBrX6|3ljf;Bn~_42_!B6?Z|`t1`noYI45*+0wfOdR}0h*9VBtsotq%_0Z8I1P_-cO0wi%&B=IRo;;_3dK2 zzk(#Lfh7I~Nn8_2Tm(9a0&>3=lDG+yIPC5rki8*D;yOs`OOV82WgAHS6eMvyB=uX6 z#9?bcK)AAoZ|$FAnGc6v$tONa{6^#9?;6*MG^<8ht2QmmrBd zBZ*Hz5_dro--0CWiX?soN!$%d{0ow}JCe8vbaD~oeh(yZ6C`oiorxg-h9HT1A*nAx z5{JzxgVaw!5{KOd1`^+bB<_o3&J`qa*j?Ko^I~+mcCP?Ce zNalwii3cHxmmrA;BZ*Hz5)VNV--09#yCV{0?-e9**j+y$@h?c?;YjBEMG^<4w*?@N zFfcGEKo`D%xDiO|9gxH$k;D^_#G{bJ8<50ddzV1&S%4%SgQWfdl6WkV_yZ*II3#ff z=wvg zWKJxSI7s~ysQMZtaoAm=AoX*Q#Iup?-Gd~agCu?jNjw)x{11|N9+J2WbbO(Te0!h3dNjw5cd;*eq1(G=I zPB)OfGmyk5A*tViBt98Q{05RZY%MX!oF7QyQ<2n5Ko{A7+&>LT+yY5_I+Az^5-F(p1RCnrA{MRQ(xZUDlll$?OF;CKeTqI`%e zKrYgYj|wg^G>?xj&a6s}FUgP3%qvMvPc6ztaezriVsVCNrfYJJYbJ_>p*dK6QDRPf za(-TMNn&1!XOd5np{r|hSrQg^n#ac{S5z1pT9)L;7nc-e=B0xSOixaZhxk6eBrz!` z73A&U66|WBj`j;Kv51dPNi9jt%rP{H2e~{sBeAGBJ|{KLGt1RADEr^Xn{Cv!U z!InB3xdwUrA*?VoN=huwOolqiGdZ}#&=6!?d`VGaW=XMUvTJanXR@oSD;^^~jmR<3 zHNZPL$UE2+6fLk+0`fkRV_m?>0VTr0vW~^YsYNA51_ec#c_l8HiRpRy#U+`^#UA;^ zB_Lyijp89FGuQ~L@el(HjnWcJ5_8-VGjmFdFw+*2RhDk~MY)M3C8-sjfvu`9;n~u9nHA zDT!cCfOm2x?lgpAIWY#IxSz07z$Gj!H6t8Fu7hwIN20R~4O3E!lZ!IJ6$JI1N6=Ud zXPSWG4pcg(CTF{(mSrZVg0e{_F(E~SR^%KHQErl+TH>2n;a-%P5}a9uy?lbGG*3@0 z@hiD-6?kkHq4P_`Lj-)cB;t;#8DM)6W}OsYSeReoA~% zVqSV`Jg64T%u7$sam_`lACXiW8N>$^W#*>F7o~#K#uw!0WF}WS8@Z;T)SB>`3%h!Y zc;EQqlEf0QXR$aRx2~ep;?xpXgxi8kjB_*dJab)xyhEUs0a}R%&JZT?@#(20A>c+v zZb6PkN@{LmUV2U{ES&{-C*w)~MCrznjtn44$TKf3-_SHYwZt*GB(p3PA_yu+l8TG< z;-jMcK+c53GLjA>qx94gP;&Gw%_+$&D9TSxEiTS4Dt66uH7`mnF3l;yY-J&W-VmGy z!0nCr)TI3QjMSWh)FMxxOaoA5?-v|zXaH6LP9>hfB_;)lnMI!7nVGIWNd~U2uCCx_ z4kYEE_zpQeP{$H1EdV4-49&pqg!ly92J{S0^d!OquJ|2KzPVW43k^=NGd(kr!!tP1 zGuSBJ6{SgM5FebHlbT#&SX|;7Tw;-0Selpc=^9)DYmNE^o5UMp$)Bzu2_u7e zm(;Yx(wq`V=1l?TUYHb?Tnd&lG)D?cXyXqQ-4JXDigv7lW@ucRmtT~UT9ld+pPN|V znF;bBs0aW%7{&RCC~3(R8VQhu?jA8*T)|n~(7-uAuOu-uFSV%H(y^#0u`(<(wcIn)FW4a798xM2gNp&zOi)`K 
zTa$+BhBCy*=at3h=BJeAq{e5aK)6MzIcbR{Ntt=j;xw3W;|Gl~$SQnfS^vL%b$H9RjUH zf=dc=JaKkzaCcuq;zJDLLk#0xUHyYC;)5;Gn$?sW8vyD7fy_pAFE~w@LAxo2$+-m{ z;4<3X)iX2G)wL|ZJ2}`m9)dC<6sVpH@(w|7_y?Dy7UK4J62875!tY>5qB@XrpMyLN zH5eXA=4tsw<%vZpo|&%xA@MwjE5TQ>TKj00E&t-SJ#ku@K8n`s2d-jndTXhS5jOQTw-XMSsb5JmY7qD z?1vD8cvn-fpsRndBX0j-w-rkv|!2l+LlX>xvPUWscE=IBUp2?dp#d1`J!PGWMZr&mxWDAl>T2Ajlt zqO?9t;{8BfgUpiDqQsK?BF|v6ct{HX)bLAkbt%A_Imq{EW(Cxj&@#Xe+Rw&S8)8(3 zpuP{z?h$gWY6`AZK{YD68Q=yZc)S4QWk@yandu5{pagkm26+dYp%)0CHUWi(V@WjB zFx3Fmc|d8E<1iK6I`Io8DlkB$BT{gf6B!(&TS0uFkYP3rgT~MR)cwp!1vP<_NC;+mn5cp2Dp`h+akygM{S=Pp{F2MSJ#Nd6rcDggpI-G z@vg2uS%$8zZb_M$u3oOLu4UML5S-|l3GEo5Bx0~QrlQH2@#MP z1R5QJJ@AfA8Z=$3N0=?)pp$0MRE?)Q8f=X0dUyvfiO3EfJUF2K4oD`#7bsT3G=lqqL}kmhf)`Uk~So94f*kNc&{F%K!+EJL3jtf z3}J;dS^+(LQUuEAgh#xqt81_UXmr9P-VkF{!ib1b3D6KSc$JK0yl-Lwc(xOBkjWS{ z`vRKqB4*Mh9><(ZJkBW>kPP-2mw3VxF5m%6@Bj&p1`lG8gp8)Td2wn9c;E!vViM%; z9vNmE8WyGIq$U=px+In)Vq|Gh#e>BH(tGrzdVzEsFgyaQ!Z3!ZprJ&%6`(2@91mq^ zIU5vAu%;Jc3cx3XaGMK@K_vHCkViB;!M*~`JAk_{py3}_IDj099G7O$g=vuBa4as) zOwR)iig<#$1rR}OoiLn3aITr4aS2zmqEyf_J=dUM%lIH~LMfJ3jz|Q12jK`I}SG9LX1kRZ7#GC7(?^;;P~W>)a2}V@HAjf zYMv{OAsIuX_%QJ34XktqjsCbIL_uw4tX3kFfEKZM5*+)%t{$t=Xug5AQy}x{Apc}0 z=73h?AeNed8tvHqVHxiWT3!+#Qh;bSqnC!DDN&cy;*z5LO3#2y$iRok>Rj{0#G2p zlAAZeL7>&R#Eg%nq=F1U@f8`S6y&((pp}#er{rd04;xTHg(dtU0b^*Il9>jbri=&A zPzHH}rj4D@#><0CEJ5Rc@x|qd1!Rri1)If#hVa0%z3^~EjJZLxg9*6EM680Q#sD9v zw!&JeBMte%tOrfNVnh_A!axkw7@33DFhQ0$5FShO3pR>30k89QHA&7dsPxRt%!O1; z0ien65TvFx=2Q|H^$*CuIBRF*iVtbvFcWiD1vD%jfUyh&qY**C5JLmdTrPB$0b3;q zYc3eWM`}WfNS-EGhXMC_rysIl{ zc0H{iKCQq2j6v%}T;hWbF_z#{GN4NRpmYNduR?NsjddDt|qA!smY!+7$^=lj>kye9!SX>5+|1N;E8$244DTg z{RaEQyA*)SHIxbulqjIt9cgz$Chpw{Xyrw4NnR?o76w79px_etSO#>M6VDZRbiId87*oMW)jKE+fv{@V)WRpVkRhn82_5(>F3B%SOi#r~9l^Nt8iPj!;He5DfdrR;h9FDx z5OYZ>@hQ;#ntAbwo&lM;nXVzAF~kr9$UKl?JlaktQ2mCOGXl*7fa@dZFpM7=b0H?6 z4j!lmgqi_gXP8OEiaGR-T5yR8cxEFLw6WDP6|o}(bWK0YxoCEn93xWo*!8oDIj z6EeV(SsV`;rpd|7PW23lcXbUeF*3|7jt4DxL{vGMNja&W!6u*)977A3EM#I9wDSrq z{$wM(3a&GsinK66_i8 
zAB=U@0+igr=O|=mWPBlifm+|76>iAOGC_l;&@oeJ zWP=6?hs#(5xQSX?oLU@ToC!Mi13DzGtkW&=t|h|*q%49J1UbRj)&(D@MX14SU?U&w1ODIG-yxe2nC7(TFS6p!Q3 z8ffSa*3m*kvv^nV=^vouG{7MUn#hFaJ#b&jGZ;2r=xP}s91k5@1TD65M{g)&iIly zfR;S_gZCOE6}89%)X6!Zy`@+uMi7JO1D@dmydk+7X?O>gO40ku!)LI=5OdHhGuRUp zqtqP-L&P&t10l{vpm8{~Y9AIGsI^&eiDhnLc4|DhY({BJ5M`5Vl54OLXt{m~Xz>u} zgv)~XqEy&oIOy^r(4HaFl+@(JN>B$6(v)^Ji-)Y0^9wcu={JuDuY5$41fRbGYNet& z+7Fh$@H;pjetH#nc*r#gvZTNy9&%zQs6mOjpkO#Pbii}r;8F9W)U^B}=tvw@2G2or zT4jkj1tnN_D`B?kp@Zp=A#`X#hqQ78)au1C)(na!@Olo+EsUU!0VKu1S4tq6i8M$J z8p+3JjtQcJg6TGdIj-PN2rkFq?&ZLr2l;DUjcpE)Q?j$(!i`Ph`HKhS~7L{W5MF#OOsCjN;USfJ`5x&7t z^OXE}(7pm!(CP6g;XTAXW|@)?Jw9OAc?}$Ai8(p00cD6Y`k@B^XW~8p7XtHDlq*^z!#F+wb=d=PsoN0g2xk208reMPC^L8f7gA=>b_g6ae3}8iIfK-^i+bEN zh7X~}A$6=#hd1%;_CV@ZqZV$E(a>B2SI_}c;Qln$%^Hv+$YE#kLVEf^n9VHYV22h` z;4UhOEB}y~C^@fc&fgB9kRR-|`?h###sbcDsde}PRnZcIeF-Jt-8X9RRnb{Kc zkWWaWMD`c7I>XaBgtxSjQUi(|#0?ta8;UVQ48HG^p@L+R6(VB|&M01bIWwN^mxUjIE&s znjzMb6D7_F)qjZPYoMwRF`xlyqm<;wC#Iwnr51yZz(cBi2$%|5XM-h&P{FyD;ES0+ z>r_Fjg1qpIEFuQ>fv)KRK*Y9u@^(o_!DGQ z4Ad`o&BeEt9j#S@)Qd*$pMpnPaBqCX?B^mcki!@-L5#9sHde7GO3=^@eB=W(f`J}; zNI4zDI+W>TE_+bu*TpLOK2UGpGL3;6A%&)D(>M;89d{tggO|Ifl|U@Nyr>5 zeIDEqL&z}Hq9xcEd7U>n?}6ufL1PQxm2$48;LDJ}Q>Cs!ps<0)Z!owBL0f|injM9= zlAy^Fe_I2xji)zZioHbzU#({V=}q9?X^piz0lJ9;t(*kiKLIV- zQ3m5lNwKcz%@AlC9ld1%4G`o+NT68=D!D+Z9b=^o?j%Es!x62$+yHOPx)8R19yXur zk2w}h9d{sypJ_a?%{UX3$wp8~gtZAr9VepYG&4|6L$84#&0uhifO!nz(76N(ypjN# zClIH0BKiU-yALp1JD^!2_%s%JqQV^+nXdR!V{nOCT4qj8d>*LLgS2_V8+>dAEMtL- zJmi5}l;#E~GeO%Apk@Ga?4eXDq~suQ1p~S}0_p_th%>B{L`nsP>>88+CQw3v>UnG( zN3?<;bmTv{uLQ0OzylkQeMPV_0qivZ?&u;VkkGR?tauCv0WS{#b-JLzhvIXzq(zF2 z;H@4w6CpT~(3hg)Ddb>b1PV6rW=YWfM4-+iX!{C*J(Dg4;C(Kp@t$7LV@NP6Ab1^1 z{$MhpEDO4%C?0+zL40yzNosn25$HHN&;}kuQ_xvupgnW2+31jXS3`5SAWS)E=N9Hp zAVY)rcu3yF*xCfjoxvqWX3+Z#5|crf7$Tfel9&!}nStX3DdUh5+vu4PB^43KKcGSZ zYYmGQ=|P}NxWMgq(0~?}rDR|aU`@>_grAyYmFd{9Moh9ns;*b z0-vZKgc$UIBty{L6F41$_wj=F;*8G(ixF%m5HJ~E-U zDpL6lYLg@C6O6GXLMDO6lR*1~kjqT)fGao|g7(+J4_O55vVf;CY<)}EkQ!Pfp&AVt 
z{skR71TG~&!-piD?*m@(i)7le+03AME@1sE}alnwy%Fs0TK>C^ZL^j`6vi)RaZo z&7_)$$F)ca0Bc&sa10?0#MyP0YdQ+~cyBl(-?yMo6xwLIG(6I=?g+-J=*j#9<#v9wg5u=)vLU z;L{5{{UPU`W7YxS@o6$WgKh;DUy*7&c9%dL4{4O5UhRU?)&K=I4g)YePKjaA{lf%Y z0?G-X#1C=2D`>17=}sC9hr^X)I2Ef_tOY0PU0q1m6+??|40%u?fNl~v)1&BxiJ+*4 zRD&+zdPS*edhzk;6&3LXsYS*4d5JlhC6)1I1`K-1`MIF;@)`8<@=H?n-2Fmzi%Sxd zv!VR-yi&b_qWpr?qLNBz%R4nAKCLJ*H{tl>q0T|7~z%Y>n{WGBYko`ZI1pN!3`jO-R7zy#e28;gV zBn;Mzko&mS`zf%f$B$&zl|j5e*@Kz9Dkch(EkOhA36R`k`R6j zj1U8m^=~7=ejccPWdH9VLB9-CKeGKhNzktW)sJldZW8ocV9|eygz)o#>PL3}K@#i_ zf$B$g|7jBJPr;)92nqI=V9|e=1pO^o^q(a`{}iZx<5*#pb`Mt|2Ij{e*ugAw@A=`1F9c6{of`* z{|l&o1PN4Ea~3Hql% z^&{KQPeS-Dz@i@%2C#e!qjBZmHCXgNB*Fclz9J~-k=-v!g8M;z9d!MVNU;9`7WY3U zLH`{r`k#=X{{IkeibbG zpOFxL23YjVlVCrnu0{91A_@9Ep!Oq|f6XN54}j`N4!;%>^v7V)uStUb3@rM!NYGyb z)sO6deG>H7V9{?(f__j~p~s&E3Hm2sv40W?`e$I#??i(B1z7YulAwPDR6lb1c}_z9 z*@8vC3kmigz@p!k1pQ~Q=yxMQ{|zkqXOf`*2~`u#}IF9Fq$?0Fw^k-nvUr&Pm5~zM;|2L4JzXqxw+5I0# zNdFyJ^f!@U{}e3xTS?Hr0E_-M67;XZqW>ca{@;N`|0fdkAHkx(g9P_qz@opK1pRlg z=a53HC=o^&^MhA`iz@mQx3Hn)}85cSK zY$QRy02ck5NYF0>)eoh>^IxDbJz~?J3REMq`*)CFzX2Bg-$_V+HdyrkAVI$e7X5ok zaDNCE{rgGKpMXXGK@#-mV9|ew1pO6I{ZI-V|3^sB-vHGJrNH`GS&3gi(gD?ooPLgy zVE+`TekcXD{}>7S=Rh?g+kcz{{VTBOKS6^2Em-uQB0>KFEc#EAp#KaO{h%>*V)OqE zsD3B~4*#&FqpMevz{rrmr_w!)Uf1L#T zC9voR&1n!De=1n?-z33)11$P)k)YoOsvkN0Z!-@u~(B?wiyzejY6P-;tnS1gal7{r)Gx{|Z?2GcXa~{?fss|049Ofkpps67;8F(f@}8{RL3{$o^*}A^s|` z=w~89e*;uMa`-Wl(0=d1qMwNb{WGxWXC^`a5~zOU@M9rC{~D-%WdE~~pnnGz{p=*@ zKY~R+2MPKwVA0P>g8n;L^n=!P5SxErVA0P@g8g5x=;tFrKLa;r`OQy)eh#R9l{Q?TderXcizW}NqIsRoy(7ytz zA36TzNYK9psvp^Yc@p&R!J=Q01pP;#`jOqQM1uYcSoABCp#KgQ{VF8re}P568VUNp zK=mVsKRXHO?++IJnk3lI0u6X51s=Zv?WrU-{|Z1gBD){7CXiVD5>Wk63T(eF3GP>c zYDBi5lZ5cof$E1+VEau-u-^iz5!rrI67)Mj^+PGJ{bnTS_kn7JQegd{HI>AMe*{z` zlmhFwCc*v`s75FS)^9_C{sO2*CoOu ztlx0@aV~e$bu>V(YInQ2og6 z&m_VA8&Lho_Ggix{{d7#vi-Rv=zjy%k8FP)3HrZ4^&{J#PlA31K1lhGY(HpE39;eN z0o9LeKOYJCPXvp8eiHO6V9{Sjg8y}(`jP!#PJ(_DsD5PsSCF9J0jeL_{z?+`dtlLD zMS}hSsD5Pk3y={0F?f*$a`mKQKN46idr<2(7qXDWP+5TP<+~0#me?JNO 
zXJFC4fCT+Zu;^b%g8mIy^e-Yo{~j#*7n7j>1Qz{ENzi`$@0T%tsNU;A6 z7X8af(EkIA{uLzXXMr9(hn#*_k)U4ysvkN3tR_Lf1XMq={VPe(uL9MNY(HpEG_mog z1J#dg|0FEsmjVL=_^4gzb}dMy!@$5`0oDHiIu8PqfF5fG<1$DyK-ToY&I^Z2JcFoo zfO-IQc2glE0|S)HAOTgV2DKl0To6>0fk8qB&O~8AkF|u0=`cXd#HAm0jyFsL%zhZ( z3~E1UZW1O2qG9&Sf;dPR;wyxJAw;Aw2z32!~W?|`^BL8VHA4&&BmdBEmXfS zOe2&=*S`v?AKhOEp!yS_?J%fzc>06!H$nBI$G?I$M1>H9f|Ka>U&Nu`2&y05|LE>~ zg+spwR6n}^(e+C+Lh~=0L^M=CKbnKlxC~;@HGK|f`a7WU&jHl{ivI>I@ox>aAGEd$ z9!Lxf3|XLz$-uw>Yk$JzpvJ+t4Czq)=;2=n)!zeG31Oh?pNT_17c}8(LA1e1G;cD1 z&Uprvsp$Tn@E)QPIsbyrl!lcbFmceC5Fj_9yWa$AzY*MK2m@j&gbgBLXBwf~zXFT> zZCJwJjR|}Fr$OzP2U&uI(d~DH_6gAKKY+#l4lMSA&WXk5|5m8|{vb&t`_Z@z@i^?i z0k!`H)WM+i*M-IYr8xY5MGs;rG#NoS(7XocFfcGIz+wM^PY{bepo&5E_h7OAJP!LA z^dTDGLMS+i?*CIb?C1Clu^)a8JPQLu9~S$$nIZ9yp8wuL?ME*^(CrU~>PJt%4A6p~ zQ215h(60m4PbmMDLfa(h>CfQ{#Q(5-1`7X4So}W+hy7tt`=QAY9{vzh;arC4IP8zW zVn3*?K+pesaM<4hwI6m510NHE390!csU@$UrHPbmCA2U%e&KPA3Ge2N@@pt2f0{M2#SUjVhAQ1~f9`^4z! z#{g=f*bvA9FSUYz(EPnnKB?T^zhSz>JNjegHe!Bf$aT(r11CX#g>x7f7_Q-Pzs6ro_pia?elB+G?w<~| zpHTjDh3ZFl|6-_q=r9P}$>{FS$Dw}{RKGG@Cxn5nKZ_lE{9SAENtTj2nCSKY-egUjLvaGX_C!?EV*E zfQ)Fu?lA=AKhXVm==OicVSkh@B$X11|93d-FM!&Qoc=&}GNIdV$AdlpG(hc#sfV}* zq87#m(d|6g!w;sPko%i>u!mm<7WaehFN3)U=CA8G>|X$NKkObkkXu0*J^n7?uzvy6 zeo&Z!^nv0Jbbl7Q{h)Q8*vgMJb|7603^@=APNLhd#fv@sB|5VKrg>v;?S?)2GKub3514`F#p5&f1&yzp#>9VVAuf~*kxdV&vU?J z(Dna-g+ELIl>Pv{00x#$LE#U&KMAHECN9j!0A6Z_ZvO?S{m^L~sHuqj2Q!C{4;ue) z6%YmoE5tr@A36Pi?q5N-e<}|9k3;Q;4&&nV{{$TNuYlSQOQ)dx1G+y2-Tt>Y?Eeb2 z-y3c-gn{n=7dY%c0JR@E{6Y7ppxf^#fIa*j-b3soH2>6qLw^udKXe!cVi*Hv|EXF4 zd;D>5K++F#_<`;ZKzIKw9QL{o!=j~srW^TW~Y4-mv2ez%CU z-&+v7|1+TWBd1@`To1baH8|`y_yF;*9E5_C=>9LqVgC%M{m9`DI{zEp{?$0__k-F` zsQt7Ihy5F{*bh2?7~TH2IPA|O(*74X>_36Ue$e^7==SRiVUPblsQu7kIGo|HC4@cx zA3*I#PJf{DFVXGi5yo!+F{u5TaGN0v^!R5N#%{j`CuaTwoqvgLe+mx!KS1pdgjfqF z(e01NVZQ^^eq{fH&M!o_|2q!*ojyXWB;@~3IPC9$+K-(6LFbpD+wUrZJ^XW^_7fVv z?!%$K3927<4?V;;nB`}u2=@3}0Chid`2jlr2;Kd!aoB$iYCoazt7kauzktPl(D^;+ z_B)DV4?mVq5FewDKcS}|8&T~3XW)XQUu6G-&W}O2e>x8PO`!G@3jaws>^Hz-Kj{1n 
zbo+ndu)hOpKcVpdio^Z{EcS!WPe8XnObmPYZ-d%TDEtG(u!sKyEcS!;zoXl~35Wg9 zp!Tz&HF(hDe=QFC4?yilZvTSzkE7c!FOJ>+Z$3lPAA0`{-F_)??Ea77!AyUk{gdeS zH{q~f2YP}1Oo%h!B)a{zIPC9$+J6A56V(0$?Vm%pe?AWTk3#KV0M!qp(4vE377qJ8 zcp?5r?*D-H|DfCNB7r^p)4oD%B-DPem%twWE1>oxm*1fMBk1S-W`=zni&o7BR{d0T=TgQN2|DfkTE=la^{|D55GSV#vXocQ2pjm`yr-bmLD$C*uyVE2;zRwnop2k zkoz^TxPK82`}aZ1clk@lC$VE6wA zEcWYQvENq~yZy3%K)z*Q5Qk8365anPIP{x9^%ENZxq?H#8&p5&&OEr~SjOMa%VPIG zhX^G6L3<8C`a$7ufW`kxa@hU97`&^1fq{_wV{qu-N~Hd&IP@PUQvY=v`fow?L#H7? zK>)_+`R|e(_VAk^2?;+~`Ui!dDVFf#m&YD{GK`RvLTLPmOCG!Z2cY&N=O1${_B-RS z--<~4?QqzC1B?BZSnRLDVSfbFenoJoBM9{HFU4X12Q2nmW3hid4*M&i_RmFF4Q4@N z1_lLV2!axtd0%h#( zzW{YVa{PHTf{~JS^e&Q4@RkRY1dU0#rY!{R!G@ zg0JR^v z|DTM-{+T%J-vG6r(EQmH9QHRr?MKf4saWhkg2R3$=)yA>n8%?sy8rj%uzv#<`_r-5 z{{x5pZczKt+aKumn`vWDKM_#<=;IfVR0Z`60|SGhHum)M0P23^^pl0f{joUgZ-d%T zsQ<7GhyLYI{m^A}a0jEie<2R{|A4w5IsS98xc?gt`=3GWSAp9MVPJ&64)*Zp;)S@B zQ2kefL%#x4e;UMEIEik5xeoUD(@=rre`Nm`V)6eW9QL{V5=IaIg*fz^ zK=l*azwj4_elMtg4J5lkOmzEy;PC$isQ;1kZzUH0tLkA7|01aU^C7_ivKoxh?N`vl z9{w+&_9O2fuEt{j1RVAY2|z3+lz-mh(60g2F99(I>~;`=?*11z+SGT-2SJEN=rA_OYA{B(Uq>H%_*rN|>_=|@H(;^94u}1gLJ<4cfo(+)5K#ob z5{La4v>^5)*WXQ8>_3IW{^wBp36&p5aoBI54Y41&{A$5szl@ro6*t5l z{s~a~k@HU*7W+@(us;ZDKcW8rQ5^PHV6h*x7973(SYw3U{|lk^N24V;NT?vKfe-({S|r;`;p_n4~zYbCfNP|0ct;b`wiXyt~m5Fi9=KdfkO#FphYKx zg9-NVn*en`a{V_Ei~H+w*dGG5AAS5A;wprDs&LqU18P5V_)W%Q|8*Sp7vZoU-T#+x z*#7~G{Zp~nFJOv2{wCnCAKm}lrr5)uLmxB!Ovhq>Dh~VCLG35he(*BG?*1cC{pjnb z(ZkQx47>XkpzcR5e`jHF{|+4X3qUJgLiTUQVSfM?`{!V>|ByL$|HnY>H$@~>Fbm!P zd(E-?zX6N=^RU>TV1eEKNl^Pi=Uag-h7#!ZM_XXGe*@Hh|1WWiwTVr?s7O4G% z<}bpmvD+VEfSG@nVX;5S2D|;wp!S3IoWT4JqS5^yYlGeX0;v7S?Y|XR?4M zgQ*x87znjLrrKh+pTh{#|EsXr-)V>4ei0c+`a!Qh(EZbu-G5rgx!8S zIf%uC#-Da{k+g#r_@~_FE|u7yj)y?5}{@j~xC7u-JbDhy9bF_7iG9 zs<~i~zZFpZnedo{Fd)8yutB7v3-@UG#zYWxWLhZMF9QHRr?MH4uAID<F)s+ z`_E#rKMRNbMo|0F(;vG1sW|NafyI8%S%v8B_YNHP$3X2zA3s13|1&uB=i$(g?tV@; z?BU-4)ekz~77^MIwa9!HH|*grUVLTUhKrg2VnBQ2V!`*^i!n_v5fX0ctG@RKLw!_%t8;ppE%sl;Rta* 
za{Kuq7WZHB#2$W=v>`5qQ6H-?C$>obwAAipz&|ek?QE>cOwq_*u#Gok@l;3V-NohEcU;_Vt+jj`*%R?Cv^Tn z6%P9sK{@C5m;R1<2Nk7Q_ zZ$@U!{jW*E*zJD?wVzP@#Rg-yKLTn$a{6V)V*h>|_6rz5{9lIfI+z6!1vA0KE*$n3 zKQ$+h2jje$bY1bo;Gw*#7})KcV(#KMws&Mv(9$r2je&{USu_R}9A< zeriztgw_wsgkulC30T4pv?Uci{8Dh(??$Bk@i^>XfyI8%mPmB_7vQiz4QfAh7y+Ju z(bMm29QGf8+K)W{CWfW_a*Mzo{$0k9_(w0l(Cv4Oz#je)9+3QtJpUldX+n>`w^7*L&u$8FKcW8nizw{& zOL#)U54ryi+ERpWziTvh`%^3-_M^AoAfoW#U~q`WZvO&5i2carA81Pty8V?n>~DhF zPiX(davb`nLG=?lf9@#`{U@OM37y~d0Eho~VDUd_NjAFwrDL#%|9`0c=({i;~y?mvK*grM8c8izgngFGNM zL65-(+XW&Zq97I&$6*hDhXjcI$mJJkN*>+*dpPXxgW8Y2{s!Itn>g%`z+%4%miXTp zkKOge`MCt?r3Zm9j}{SQc}K+`S* z1A|y1cK^S?V!thx_zS~f|5~X1gzEo59QHFLV&)%vEcP$MVgG5U{e=9#5QqH|SnLN4 zX`_e#T^#nmgW8YYe?gCb<0S0y$KVBtDfI9|*RPj^J^l=^xF0m6g6{q~IP8~$+K=9U zgt!Wkex~EF-vf*Npe_Nr{cOqD{qF&_-wRG;* zQ2WutAKm{&IP6~mwI4bEdSOYwr*YW70ct<`{Ws|TKZe8p16b_$!D7Eo3ij~70=1t| z|5rT)d-&hLVn3)!haUb(IPCunwVzP?H5P~cAF$XTfW`m2aoDfs4T)(&`IkEtd-z#E z^`oaB^!Q^-#U6efNtopy=&nQb@SBOl{xGQhgxvoEhyDy4`XQkL&Wd2dI1Ri1tDySP z{g19+FAcl@6|nd}983H);jn)O4*Su=zZQr67Fg_$#A5$R9QJR6+K*m-qT7E2hy4Li z`;psE(OB%)OvfJn51{r#k1+tp6^KB$UnL!T_%}f9M?OC-7K{CJaoGRL2NKih@sDo* zHyrvod?EVL(=S96ocFM;YO6n5xc?&#`xE>i zHW8}--r%tR0Mvfu_|L&&e{?o>|4)M2kA8m$dj1W|#_sK4dQ=f`%AFcFPV!y{FMSB;ZJD&u1GHS@Yld%e;F40LvYw10kt1}{XN80@a)Lo zkHdZksQt+0cLf&vSK+XKJ=A_e>2E0x`wO7G~pHTX%DZ(E90a)A*x=Rqf{I3$WPFR)*bvo^Xi&VeW?pBgXyRjyUwIK=l(!zcyvq-7f)kKXU$C zjK%#`IPA}Y+7HtYb3epaFdhR#DGvJ+u-Ly8i~aL)*uM#CKYIR0xBn&%{YRks36-B$ zaJYX3)cvsd1C1Z7z~X+Ta_sT{2x>ne_ow5~{{^Za-TmnKH>n(Z{2##L{?%CAzZ8f4 zLePVY3Az6R4*e=n{e;fXdyT{WH=yoE9{*T}#r=sD*u%dIYCkOgU?~<_NQ z9)1FmkeqfI7I07+-G0AH?EaqtwI8|u*owveojB}&47DG<{R?pw)G-VU48m2|-Tw)y zKM|o6%tF`CSB2gE1)#w$wDNxk7WYTvus=8oViTeKvmA&1G#vWT!~Zo7{S{FCgw9WW zhQt3IQ2!&3f9%2Hf9Y!M;lBrJKYIBE2^DZufC;f`?BTxyYCm%N-G{|~9~}1Ig4$0g z{Cjcee*@Kze*PtT_;=uN{{^V~k;CsG7WcE%U=KgNXh=w*=YNPOIOxE{e;oERL)b*Jgo+M{7YbQ|1m7?-;cxo1yK9Z&rd=Rzi&A7 zZ-we7lzuB^MKlqT>qTLV*hL$_DjV;LW?hk?LClr26b=ciM0qTC_^m`7A`~7j)Uj(%urXQSHu-w0}0;(T$HzrICh@Jq| 
zkAD6utp5yB1Hy}OxPJxI{mAW?i&)$*P!9<|bo+M@X+L*8_VD8XO_ne)Fd*B18H@cz zIP8Bwr2V-#?9aer|5YsZuZ7x=zJH%17Lxu5jUTVTVgCuJ{mA9-bu9Kj!ePG|)P6$u zkKV;${|_wo-^60ScmwwM&w$!bDE$dFV2^)|d`S6^9RIhm*dGG5A3gjhKuN6&vBp!N%ZGC2}P&;J{7*#82H z{SUF&FWd-;eVD+QFthd}K|cmDyX z{i>iqM#AXs?}F+_cmE5henReV#o_)5Sls^#i~ILL?MHV%Lp&t?pr5~n?*4aB{pk5$ z398={$?qU0dNO^9!~Hj)?nh2PZ?U*Ptr-&k==SFjX@6of_W0*0gp_~C_P@ts{|y}W zZ-d(J3i31(M)&_^9QIpavHv3$`#oB)`~MG-_B*#=_kRW!`#)o`e-jS-9TOnw2XtpO zlJ}vv0wP4#;;?@P)PB&J*&x-R@rSQi>=$jt?tiZ&;_Me_#qNIv&|(D!1_osNzhkk# z1c&{>DG>Y7;~ydl@(~2*z~>e1`(cR1Yt0*m|Eu(+SA9g=?0-9H0rKO5ZT5C$YvA#4z-2-T17{smC| z=V3aI_)=V!p&52M4N`ayb-@p-6z zWHAsMrVqph;k0%J26im{;tNpy= Date: Sun, 21 Mar 2021 20:02:51 +0100 Subject: [PATCH 126/258] init task --- GPUSort/src/quicksort/quicksort.cuh | 99 +++++++++++++++++++++++++++-- 1 file changed, 94 insertions(+), 5 deletions(-) diff --git a/GPUSort/src/quicksort/quicksort.cuh b/GPUSort/src/quicksort/quicksort.cuh index 4264746eb..03dd88d8b 100644 --- a/GPUSort/src/quicksort/quicksort.cuh +++ b/GPUSort/src/quicksort/quicksort.cuh @@ -48,7 +48,7 @@ template __global__ void cudaQuickSort1stPhase(ArrayView arr, ArrayView aux, const Function &Cmp, int elemPerBlock, ArrayView tasks, - ArrayView taskMapping, int *tasksAmount, + ArrayView taskMapping, ArrayView newTasks, int *newTasksCnt, ArrayView secondPhaseTasks, int *secondPhaseTasksCnt) { @@ -56,11 +56,14 @@ __global__ void cudaQuickSort1stPhase(ArrayView arr, ArrayVi TASK &myTask = tasks[taskMapping[blockIdx.x]]; if (threadIdx.x == 0) + { pivot = pickPivot(myTask.depth %2 == 0? 
arr.getView(myTask.partitionBegin, myTask.partitionEnd ) : aux.getView(myTask.partitionBegin, myTask.partitionEnd ), Cmp ); + printf("pivot %d\n", pivot); + } __syncthreads(); bool isLast; @@ -156,9 +159,9 @@ __global__ void cudaInitTask(ArrayView cuda_tasks, //----------------------------------------------------------- //----------------------------------------------------------- -const int threadsPerBlock = 512, maxBlocks = 1 << 15; //32k +const int threadsPerBlock = 32, maxBlocks = 1 << 15; //32k const int maxTasks = 1 << 10; -const int minElemPerBlock = threadsPerBlock * 2; +const int minElemPerBlock = threadsPerBlock; class QUICKSORT { @@ -187,18 +190,104 @@ public: cuda_tasks.setElement(0, TASK(0, arr.getSize(), 0)); totalTask = tasksAmount = 1; cuda_2ndPhaseTasksAmount = 0; + iteration = 0; } template - void sort(const Function &cmp); + void sort(const Function &Cmp); + + int getSetsNeeded() const; + int getElemPerBlock() const; + + /** + * returns the amount of blocks needed + * */ + int initTasks(int elemPerBlock); }; template -void QUICKSORT::sort(const Function &cmp) +void QUICKSORT::sort(const Function &Cmp) { + int elemPerBlock = getElemPerBlock(); + int blocksCnt = initTasks(elemPerBlock); + + if (iteration % 2 == 0) + { + cudaQuickSort1stPhase + <<>>( + arr, aux.getView(), Cmp, elemPerBlock, + cuda_tasks.getView(), + cuda_blockToTaskMapping.getView(), + cuda_newTasks.getView(), + cuda_newTasksAmount.getData(), + cuda_2ndPhaseTasks.getView(), cuda_2ndPhaseTasksAmount.getData()); + } + + else + { + cudaQuickSort1stPhase<<>>( + arr, aux.getView(), Cmp, elemPerBlock, + cuda_newTasks.getView(), + cuda_blockToTaskMapping.getView(), + cuda_tasks.getView(), + cuda_newTasksAmount.getData(), + cuda_2ndPhaseTasks.getView(), cuda_2ndPhaseTasksAmount.getData()); + } + + + cudaDeviceSynchronize(); return; } +int QUICKSORT::getSetsNeeded() const +{ + auto view = iteration % 2 == 0 ? 
cuda_tasks.getConstView() : cuda_newTasks.getConstView(); + auto fetch = [=] __cuda_callable__(int i) { + auto &task = view[i]; + int size = task.partitionEnd - task.partitionBegin; + return size / minElemPerBlock + (size % minElemPerBlock != 0); + }; + auto reduction = [] __cuda_callable__(int a, int b) { return a + b; }; + return Algorithms::Reduction::reduce(0, tasksAmount, reduction, fetch, 0); +} + +int QUICKSORT::getElemPerBlock() const +{ + int setsNeeded = getSetsNeeded(); + + if (setsNeeded <= maxBlocks) + return minElemPerBlock; + + int setsPerBlock = ceil( 1.*setsNeeded / maxBlocks); + return setsPerBlock * minElemPerBlock; +} + +int QUICKSORT::initTasks(int elemPerBlock) +{ + int threads = min(tasksAmount, threadsPerBlock); + int blocks = tasksAmount / threads + (tasksAmount % threads != 0); + cuda_blockToTaskMapping_Cnt = 0; + + if (iteration % 2 == 0) + { + cudaInitTask<<>>( + cuda_tasks.getView(), tasksAmount, elemPerBlock, + cuda_blockToTaskMapping_Cnt.getData(), + cuda_blockToTaskMapping.getView()); + } + else + { + cudaInitTask<<>>( + cuda_newTasks.getView(), tasksAmount, elemPerBlock, + cuda_blockToTaskMapping_Cnt.getData(), + cuda_blockToTaskMapping.getView()); + } + + cuda_newTasksAmount.setElement(0, 0); + return cuda_blockToTaskMapping_Cnt.getElement(0); +} +//----------------------------------------------------------- +//----------------------------------------------------------- //----------------------------------------------------------- template -- GitLab From 2276e80875ea35860746e9dc42028fde244a7c37 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Sun, 21 Mar 2021 22:07:05 +0100 Subject: [PATCH 127/258] 2nd phase --- GPUSort/src/quicksort/quicksort.cuh | 67 ++++++++++++++++++----------- 1 file changed, 41 insertions(+), 26 deletions(-) diff --git a/GPUSort/src/quicksort/quicksort.cuh b/GPUSort/src/quicksort/quicksort.cuh index 03dd88d8b..93411fd12 100644 --- a/GPUSort/src/quicksort/quicksort.cuh +++ 
b/GPUSort/src/quicksort/quicksort.cuh @@ -62,7 +62,6 @@ __global__ void cudaQuickSort1stPhase(ArrayView arr, ArrayVi aux.getView(myTask.partitionBegin, myTask.partitionEnd ), Cmp ); - printf("pivot %d\n", pivot); } __syncthreads(); @@ -98,6 +97,7 @@ __global__ void cudaQuickSort1stPhase(ArrayView arr, ArrayVi aux[i] = -1; #endif */ + aux[i] = -1; arr[i] = pivot; } @@ -159,7 +159,7 @@ __global__ void cudaInitTask(ArrayView cuda_tasks, //----------------------------------------------------------- //----------------------------------------------------------- -const int threadsPerBlock = 32, maxBlocks = 1 << 15; //32k +const int threadsPerBlock = 4, maxBlocks = 1 << 15; //32k const int maxTasks = 1 << 10; const int minElemPerBlock = threadsPerBlock; @@ -203,37 +203,45 @@ public: * returns the amount of blocks needed * */ int initTasks(int elemPerBlock); + + void processNewTasks(); }; template void QUICKSORT::sort(const Function &Cmp) { - int elemPerBlock = getElemPerBlock(); - int blocksCnt = initTasks(elemPerBlock); - - if (iteration % 2 == 0) - { - cudaQuickSort1stPhase - <<>>( - arr, aux.getView(), Cmp, elemPerBlock, - cuda_tasks.getView(), - cuda_blockToTaskMapping.getView(), - cuda_newTasks.getView(), - cuda_newTasksAmount.getData(), - cuda_2ndPhaseTasks.getView(), cuda_2ndPhaseTasksAmount.getData()); - } - - else + while(tasksAmount > 0) { - cudaQuickSort1stPhase<<>>( - arr, aux.getView(), Cmp, elemPerBlock, - cuda_newTasks.getView(), - cuda_blockToTaskMapping.getView(), - cuda_tasks.getView(), - cuda_newTasksAmount.getData(), - cuda_2ndPhaseTasks.getView(), cuda_2ndPhaseTasksAmount.getData()); + int elemPerBlock = getElemPerBlock(); + int blocksCnt = initTasks(elemPerBlock); + if (iteration % 2 == 0) + { + cudaQuickSort1stPhase + <<>>( + arr, aux.getView(), Cmp, elemPerBlock, + cuda_tasks.getView(), + cuda_blockToTaskMapping.getView(), + cuda_newTasks.getView(), + cuda_newTasksAmount.getData(), + cuda_2ndPhaseTasks.getView(), cuda_2ndPhaseTasksAmount.getData()); 
+ } + else + { + cudaQuickSort1stPhase<<>>( + arr, aux.getView(), Cmp, elemPerBlock, + cuda_newTasks.getView(), + cuda_blockToTaskMapping.getView(), + cuda_tasks.getView(), + cuda_newTasksAmount.getData(), + cuda_2ndPhaseTasks.getView(), cuda_2ndPhaseTasksAmount.getData()); + } + processNewTasks(); + iteration++; } - + + cudaQuickSort2ndPhase + <<>> + (arr, aux, Cmp, cuda_2ndPhaseTasks.getView()); cudaDeviceSynchronize(); return; @@ -286,6 +294,13 @@ int QUICKSORT::initTasks(int elemPerBlock) cuda_newTasksAmount.setElement(0, 0); return cuda_blockToTaskMapping_Cnt.getElement(0); } + +void QUICKSORT::processNewTasks() +{ + tasksAmount = cuda_newTasksAmount.getElement(0); + totalTask = tasksAmount + cuda_2ndPhaseTasksAmount.getElement(0); +} + //----------------------------------------------------------- //----------------------------------------------------------- //----------------------------------------------------------- -- GitLab From 9f2d54ba57b151dadf624479c34278e6e2b9a8d2 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Sun, 21 Mar 2021 22:59:32 +0100 Subject: [PATCH 128/258] malloc more variables at once --- GPUSort/src/quicksort/quicksort.cuh | 69 +++++++++++++++-------------- 1 file changed, 35 insertions(+), 34 deletions(-) diff --git a/GPUSort/src/quicksort/quicksort.cuh b/GPUSort/src/quicksort/quicksort.cuh index 93411fd12..1d71ad4ab 100644 --- a/GPUSort/src/quicksort/quicksort.cuh +++ b/GPUSort/src/quicksort/quicksort.cuh @@ -15,8 +15,7 @@ using namespace TNL::Containers; //----------------------------------------------------------- -__device__ -void writeNewTask(int begin, int end, int depth, ArrayView newTasks, int *newTasksCnt, +__device__ void writeNewTask(int begin, int end, int depth, ArrayView newTasks, int *newTasksCnt, ArrayView secondPhaseTasks, int *secondPhaseTasksCnt) { int size = end - begin; @@ -57,11 +56,8 @@ __global__ void cudaQuickSort1stPhase(ArrayView arr, ArrayVi if (threadIdx.x == 0) { - pivot = pickPivot(myTask.depth %2 
== 0? - arr.getView(myTask.partitionBegin, myTask.partitionEnd ) : - aux.getView(myTask.partitionBegin, myTask.partitionEnd ), - Cmp - ); + pivot = pickPivot(myTask.depth % 2 == 0 ? arr.getView(myTask.partitionBegin, myTask.partitionEnd) : aux.getView(myTask.partitionBegin, myTask.partitionEnd), + Cmp); } __syncthreads(); @@ -159,23 +155,25 @@ __global__ void cudaInitTask(ArrayView cuda_tasks, //----------------------------------------------------------- //----------------------------------------------------------- -const int threadsPerBlock = 4, maxBlocks = 1 << 15; //32k -const int maxTasks = 1 << 10; +const int threadsPerBlock = 512, maxBlocks = 1 << 15; //32k +const int g_maxTasks = 1 << 10; const int minElemPerBlock = threadsPerBlock; class QUICKSORT { ArrayView arr; Array aux; - + int maxTasks; Array cuda_tasks, cuda_newTasks, cuda_2ndPhaseTasks; - Array cuda_newTasksAmount, cuda_2ndPhaseTasksAmount; //is in reality 1 integer + Array cudaCounters; + + ArrayView cuda_newTasksAmount, cuda_2ndPhaseTasksAmount; //is in reality 1 integer int tasksAmount; //counter for Host == cuda_newTasksAmount int totalTask; // cuda_newTasksAmount + cuda_2ndPhaseTasksAmount Array cuda_blockToTaskMapping; - Array cuda_blockToTaskMapping_Cnt; //is in reality 1 integer + ArrayView cuda_blockToTaskMapping_Cnt; //is in reality 1 integer int iteration = 0; @@ -183,9 +181,13 @@ class QUICKSORT public: QUICKSORT(ArrayView _arr) : arr(_arr), aux(arr.getSize()), + maxTasks(min(arr.getSize(), g_maxTasks)), cuda_tasks(maxBlocks), cuda_newTasks(maxBlocks), cuda_2ndPhaseTasks(maxBlocks), - cuda_newTasksAmount(1), cuda_2ndPhaseTasksAmount(1), - cuda_blockToTaskMapping(maxBlocks), cuda_blockToTaskMapping_Cnt(1) + cudaCounters(3), + cuda_newTasksAmount(cudaCounters.getView(0, 1)), + cuda_2ndPhaseTasksAmount(cudaCounters.getView(1, 2)), + cuda_blockToTaskMapping(maxBlocks), + cuda_blockToTaskMapping_Cnt(cudaCounters.getView(2, 3)) { cuda_tasks.setElement(0, TASK(0, arr.getSize(), 0)); totalTask = 
tasksAmount = 1; @@ -210,38 +212,37 @@ public: template void QUICKSORT::sort(const Function &Cmp) { - while(tasksAmount > 0) + while (tasksAmount > 0) { int elemPerBlock = getElemPerBlock(); int blocksCnt = initTasks(elemPerBlock); if (iteration % 2 == 0) { cudaQuickSort1stPhase - <<>>( - arr, aux.getView(), Cmp, elemPerBlock, - cuda_tasks.getView(), - cuda_blockToTaskMapping.getView(), - cuda_newTasks.getView(), - cuda_newTasksAmount.getData(), - cuda_2ndPhaseTasks.getView(), cuda_2ndPhaseTasksAmount.getData()); + <<>>( + arr, aux, Cmp, elemPerBlock, + cuda_tasks, + cuda_blockToTaskMapping, + cuda_newTasks, + cuda_newTasksAmount.getData(), + cuda_2ndPhaseTasks, cuda_2ndPhaseTasksAmount.getData()); } else { cudaQuickSort1stPhase<<>>( - arr, aux.getView(), Cmp, elemPerBlock, - cuda_newTasks.getView(), - cuda_blockToTaskMapping.getView(), - cuda_tasks.getView(), + arr, aux, Cmp, elemPerBlock, + cuda_newTasks, + cuda_blockToTaskMapping, + cuda_tasks, cuda_newTasksAmount.getData(), - cuda_2ndPhaseTasks.getView(), cuda_2ndPhaseTasksAmount.getData()); + cuda_2ndPhaseTasks, cuda_2ndPhaseTasksAmount.getData()); } processNewTasks(); iteration++; } cudaQuickSort2ndPhase - <<>> - (arr, aux, Cmp, cuda_2ndPhaseTasks.getView()); + <<>>(arr, aux, Cmp, cuda_2ndPhaseTasks); cudaDeviceSynchronize(); return; @@ -266,7 +267,7 @@ int QUICKSORT::getElemPerBlock() const if (setsNeeded <= maxBlocks) return minElemPerBlock; - int setsPerBlock = ceil( 1.*setsNeeded / maxBlocks); + int setsPerBlock = ceil(1. 
* setsNeeded / maxBlocks); return setsPerBlock * minElemPerBlock; } @@ -279,16 +280,16 @@ int QUICKSORT::initTasks(int elemPerBlock) if (iteration % 2 == 0) { cudaInitTask<<>>( - cuda_tasks.getView(), tasksAmount, elemPerBlock, + cuda_tasks, tasksAmount, elemPerBlock, cuda_blockToTaskMapping_Cnt.getData(), - cuda_blockToTaskMapping.getView()); + cuda_blockToTaskMapping); } else { cudaInitTask<<>>( - cuda_newTasks.getView(), tasksAmount, elemPerBlock, + cuda_newTasks, tasksAmount, elemPerBlock, cuda_blockToTaskMapping_Cnt.getData(), - cuda_blockToTaskMapping.getView()); + cuda_blockToTaskMapping); } cuda_newTasksAmount.setElement(0, 0); -- GitLab From bb1cbb6b90400abed0336ac47699b44c7372ae91 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Tue, 23 Mar 2021 22:05:42 +0100 Subject: [PATCH 129/258] remove executable --- GPUSort/src/quicksort/sample/main | Bin 217896 -> 0 bytes GPUSort/src/quicksort/sample/main.o | Bin 339720 -> 0 bytes 2 files changed, 0 insertions(+), 0 deletions(-) delete mode 100644 GPUSort/src/quicksort/sample/main delete mode 100644 GPUSort/src/quicksort/sample/main.o diff --git a/GPUSort/src/quicksort/sample/main b/GPUSort/src/quicksort/sample/main deleted file mode 100644 index 027e47b12cf8e10e6d91754dbe5c516063da9351..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 217896 zcmb<-^>JfjWMpQ50%is~21W)31_euqfCGeLc;N|G!r;JQ!NAMlz@W;Y!obGBz`(-5 zzyMQ+PTydH7yzR=AY2BJX&`+PAO-^i13E1ORR^O%ZUPB`XplY-8yhZAg^0pv1_1~k zq>mNEGyo9{3=C*=;&O;Mj7HW6w(kK%o&k-PPX?LHzyPC>^?|}>fhQyY7DS@bD?pMU zBe_5X0|QJSNEJxR!W+b5V6ccrr4=At1{e(sPmmiy*a8}!=(GaV8W@ePF9NC$opyoh zgV7*6KtjP!OHx2=bbDZYm_0BWW?ukQpTa*#xS`V}JRs8<7+^HW4vqD$sao&ds$54+r+)OC$2BKFgui|_nZbuJ;)4@ zogg(JgF*VCQ4HdO^@lPrfb%5CsRj%a859}Bm_TCT3=9kr3=9kxQqSe8=JGgK^1R`7 zdH8~F+Mncf(HT6>U;}Uvo7ou{gs{lzGh-KD%7I;6gd4l~WJc`bD;cnhpJ&4^-i|~4 zG#2dYMRAA=;_$CH4u9FOVmIFsNBGRbVg5B7;n0LboEJxUp21=7G*0a9Y{TK6R2=@z zU4*$yIP`@2VymR8Pw-AT)r+<48X-INV=`Bi`@fNYC6j;;RFP`wMW0+u}%v 
zB{<^!4i5J(#}SWTairV1INT3v=V7aNKy@Ijt$?LtFz# zx{?L8jgcJ)W$WM&|BWMlqi}?eGCTHkD2F3Gf5wrnB)J(Fl%Q@yrYyncD=;ibgtRgQ zKmpIdz@W&J3z%j?G=#J7e z^BLkjLww^?Qj1d4GmA@7i$Z*zbMo_2LlTp6QW@go({uCl;)_cXi%R0-Axg0-OfF4H z3@#~3P0S6h%uCKF%FoNJ0%-?{d*)@9_~xgS=A<%!xNb$M5QckdiBDp2iEB|&ei2y4 zH!&wCKbavu9;7S?>K?b!yyTM1{5-G<_tX-X)UwQEhylK-xyc2UQ1zjCFo(G%mN;eR zB^JSC0`d!-6LWH$^YhX&(@TpIkxc_z9$ZqCnU{`gK|oP{L26M+W@<6ie7JjHrn}~) zK&1joi!(57K=TGn8_XK8&3=iw5U=6si{VrZD0kysR8Qk0lkQtX-R8l32v?CNTsoS$2eSd{7-;GN769~E3; zWDp-;l2Mdj9-ot%mtK+)p9%_Th-^q$aEWDVMRIBZI5gs;{DMmijf+b2N-}d(AqriL zT+2XCA>AdJ6|T7q@lhe3nP7M16ytVu5Xf4H|MH8$>Blo9KExp2HP}2p$lDL6-N7dD z!KU#chVibh4DnGZIRRxL8!s%gifIEh=%%4e|~)i3cUUip2Qz(!`>a_{8MG(#)b%^2~KM zboB=t7zOcgfH#@Z?ra1~dO_a7#^^q@fW$B)1w*45XV5!CgV`Hr>MXx*cYT1qG>jDIl+s7vALOD`!KHt=IyX$ReV^2V?-m z*%q1k#qmjr#i=Hs@)@4CNiKH4ISeKJxCI602ggHl5Xr{Df)_cFg9|ZGffD9tXaFu1 z6O%H_3=KggWKLpnaePT-L27(vURplL(%=%11XxdeQep}uIe?OM2a@mQQ0pNwHb zd~!u%d{Sz9W?p=9Vo7oa$YxKBL`r&m%z@UUi6yBOnI#Olskz0eB@9K0 zkhWb#VtiUAD1(68vN@SapyUl|3g;$f<}rXQVgL!2rX&`XfJPt<4fPB_BE=;s$=cc= z5i<}sJvlkP7{oIK@sdF-GX{5O=Xhg10|s{=PbcSiBRvy1%Sg`@!j6cDH`X)JGqhlE zj)-t{@{BjsGuE?2kux$w<|3&>7S=N|1oaG<7+4sX7+4vY7}ywC7}yw?7?>EC!6XX< zGgt&92NPpvU}sDlU zU7+z-Mh1H%`3JhUnHd<^z+xa3j0{W+c~E)K)PhK6CNBd6Cqn~NtW+wKlYxPip$l#v z8$%#e4x~Th$8;tJ1}=taP_fF3>$w>iI2e{f`DI{vc7{z*eu?HJ4$#cYK`8(7`fOIv zX!BVpzYwgRiQxv+KG^&NY#g4!3S=t-!wTrU1Wf#lD@1$)nz+9^M0^LD_yOoV(Frv1 z6VQI)1vK%7^AK}xpot$!frvjq6aN4e|9~cb18V*cH1ShV^BJJ>FnJaP`Aeg~TPC#d)ZH1UM@ z5cLnx#1BBlKcI;#e1NFufcg(+Zvs?Y0Zn`YRNMefdt(WVTKAMafk|tOaqcQXwD5H%D~WpBo3+2@ z1VQ-^Nn8v`Tmng497$XONn8?1Tmwm53Q61mNn9F9+yY4)wiW>-?SLc>8WRSIfv^XX zxEx3TiUW|u<)LCADgsFy)b0R@fp7wnxFSdZiZhVJk;iunki?atYC%*5lDG;;0E!!s z#Gy+jz%m_3;%Zg9M;> z2a-5wj2$Y>Z~#eM7beKSz;FUdTn|b70+P5slK2fIaZtMrCiMVG9NeshO1wZ4hpYsE zNPIvNH-QL&$sb7KreF~$0WDWS>E8@01fn>Q#Gy?qkT8P)k~p;K1{Rk<61N76KnMjS zaa)KGnAAWLw*!kn2m>T>dx#L2v_KMvt?dCxI3S5Tf&`%014-NoDh8qgki?M}CPW~K zLz|u;0!iEpNxT6`+#5-}14-NmNqhp5 zcmR_43?%VDB=H4E;=xGbqs-9|7!85Z5Eu=C(GVC7fzc2c4S~@R7|0O#%rE!LqxlVo 
zM>p$1eI^Ev)&nI>|1WqnAK^F*cGiDWD_tgrPybb|b(t9WE%T@A2iVN>E%HE%W^A2hJ?>E%K=A2guy>E%Q?A2g8i>189F59%a+ zdRYkPg9c7My-bAjK?5eAUIxPXpn;N4FCF20&;ZG&mxgdYXkg^iOGP*zG$8Wnr68OS z8hHEkk`c}a4UBwx`SBmjzn}q;PcI+B`JjQ2PcJXR`Je%iPcIL``JjQ1PcJvZ`Je%h zPcIk3`JjQ0PcJ9J`Je%gPcIwce9*wgr180C4;t9`^wJT| z2MuU^dT9vfg9b7_y;OwrK?4|{UJAnbpn;1|FB#!{(169KmmmMa{0|za`1JB2oDUkH z`1JB3oDUk9`1JB1oDUk1`1EokoDUj^`1EoioDUj+`1EojoDUj!`1G<7&Ib)Re0o_3 z=Ys|sKD|tY^Fad)pI!#S`JjP?PcI$ee9(Zxr7}(@RA-A2fjQ>7^i?4;nc5 z^pX+I2Mri}din7W%>ST)f=@3W!ug;9f=@3m!ugW`JjP;PcIqae9(ZvrA5!A)F5y0QmIs zBAgHE_ee2g!4fi{!cF(Vf;`3RsU*%N*@;n zaQXA;zv@;ERQ_5t{!%plTr~bvG=47{zZH#Ni^eZS!R^h(fG1xd{Hz$FB+c}jsI62HN3x~@!z8HpQ7>aqVcby@z0|1kD~GSqVcz) z@zKNpRkipGycvb!pfsp6jYvhbg~F`vs#1H%<$;tISQ4M0ts}xet<~JXfQD}A7Jcc71m&4 z=w=0#Yt26x`CDf)Ffeqp{!?dS@R;GzD+1Nb3MzLzjvkn3bkRA*wCfExUrwQr8Q z{sA?&yIucu9)I!fH&VnkzfnM!KSw}*9Rc}1RCx!F&JUiQKYcpS`E*|Q=sfPxS^B`E zv-X8YXXp)&PTvbYovtTP+~7}rPCMH3@#)yI&0rF z*S=vWW#0jc{MX_L%dB1B)UsYo{qy>Bx9cC2Ky&De0N2{-`@p5s^^Q-c?*)XNoyT9~f?^Mxm^*8Kbe4YU4E@mU`o-Gy z1OJ=@D6Zh>to_qj`lCDa2Po!_?FYI4#o_<||DQZO0jA%hm-W>D|Nl|+DR^}L@aX&q zar6UFwoCEoW>E#TfIT{0e;6L{=ym-D%B~=DYC&9gEe3{NAb)yv-UIP3gQR<1!Ai?P>`u5+888>DicuSE z;{uOf-whtUAWP1HG(Q6iyIuerxDR6L@fUeIARAp5cyxxY@aP56pn_(FA``=lr(mf= z9^I@b6+p$=F^|scFYJE&|L@Tqy1=8`b%h5g2_Jhg;otxN2(7swt*hW#`w&{0fBgTy z8*DGwQO93gQUN&>;vEr928PZX9^Jk#QarSIz_C*j@j4I`!X98>f&()^1LWEpAn&OD zgd{X1*(MbRhS%%}E#Pq4jnMMx$N&E?&VC1ZAod5=0>eQWwZQ1LHB)9{cyZ}JNDr%) zGAJ01zc}^n|9{kCMF3o?gA1MGFK&NFP8%N0M;bgjYiD$pPU#Gt;L+*Z;nV5bfSg!< z7#=w8x&$0B-L5l0e*3}7z|bAK1XO@`x?H``?7D=p+jR-3*bSZH(HU|SMPveoNC#ZR zquaFsWZ+gz13h|KA2NVKlQmk2iQ%~G4p5nR+;tCxIsl=LK&TTC>I{g2q$*HiDLvxR z>$(GEeB}54|6336Pd(6bptQ2Nb_qkNqepk`29NI26&}5z7a(pw_<-4`mxsxt+xLh| zx9b^??$90Gt_Q-yJUXv+9`fk51wveOk@RI#G@OO-HyMgVgbeW7HDjP z6?^ou&H-iDPS#@zpvXavfa9(gK=E+g^$LW#0io`ID34y>10KDhJ3M+zuXyyjo&dRi z&sUVl*#e2210LO_J3M+rA3)rNh@2}f-L7{$xSU%*a7%71Ocg4t>#iuJibd8qjD8?V{l` z6Sin@1(&O=?(&pIgXtG4M8kGX28I{k7)gqTW{{pkP(7d^MXty8f1*M(AeDZsYH~~r 
zAWJ|gFY6nq*mV7o#Q=ehJ3(2)amRm9aoWqvJO@<#i~gAnqD=3A%jx4Ud_VpF@6mij z0b~e7TNVSF#vq7BZ!l$A4Am$?SmQ#l4$)a)%JdPqm3sWe?T-)}k=u~Hy!U>BYBf>2 z)nM1TOM!YMw(S4^|9_$P1zh}t+8m&YU&Eub_6Bk(0IK}EUBQ*qT5t(_$fMWwxku+U zP?)S@fV3t+)f~JvVfpd@e^BKIs+B;-f(j*&|6?Jod}xCf%4>dO0G97O;nH~!)SU7F zx6M9ybh_T?^nKvd?FuR{Z*1k5%M0T8NUytbw8+3doc&KNe6OX zuPCoDNVn?`k4}LJv4_E<=*U3_>HzU?W3bUG2w?+Ku>-Om8Wc$FOsFVwvj8O2?fL^%QD^CwZr3j!Gf+jrl`ldFl&mZ~I%`jK zmL5S%S>3Kj4!)4@aJ}D9d!M0%z1#K3_ZzR7yIqfTx<2T3eXtKSaNl|S#X$xJhECTf z9=)#MCLza%|Nr^7h2HOWeZqL5^O#3(=sk~aQ1x(e7ii=blrUA_g3>!gdFDHaa)=sm zx>frC>e3zT22~hOI*)mDo`3O71S2UszqUa%zrZ%jff8CLI5QN#N3__$mLGpH;SI=C zux#l2|NmbYfK-FzxcL+BX%C!J-J3;C{ctF&vfkb*u{h$iOAzVi&mjlEF ztA7Di|Lrv>`IyQ<6+DM)b zI}q1sfD@`%dh_bN6uf*LD_cSD|ikP zhB_G1BLd4VK*)Xt1xdFnO1*OsBn@&ST6P2Xvt9qp{9gN`^I$iWaRJ=I=ihdK@dUQA z6Ei=2euX`CW2&6_64FpOi73ZGAs*q;S-ZodvvdQfbBYn_p`gy~0cf?F3o5d^LA?@J zm{1Ey2%IW*Koxm_rXxLiK`M^FXa^0=fx7vK=8)=ZIZ%-U3BLoN&`tUS58bV={{Me5 z0pidd9w-5Zr%YH6(eQ(OpG^k!t_k@p57ab<`D`s{)`yVKI>3z|aLIf7<^TULw!HvH z0h-T1u^<5+p8}7E9Di{V)cx-j)!+tKjW0Yp1zzO5`2Qafa8PyTAmbp#ehoMXkjry$ z^QD(ndm;k^Qc4Q;XnbP<9(mn+NfVaCj9^ZA6Zxv)EfgE=h55h@c;jRhzW^c z6IvBOiK=%hNZg~>^zsBq>E$Q@DtcQPz>>WXHKK>WlHhWlLl7+g02GzIQ$aR*^!9>P zL+XJJKCt`&uzW8>nP@9Q{|Sis3&8TB4IaI{V5Jbv-Vn_VV98#H8c`pF=5C1Q1h9PQ z0*~Hauu_O-K8R)quw*Ypji@|AvmZpW0!Y3a92FkDy zuz-Ee2a$gOipt*510ZjMltMI5hED0*-@4qN`2C79`AtJ|L6g~lS!KQZ}e~|>? 
zf^F$M{vsH{g}CPU3l|6%?1#?dFH9j^h+mJtP=j#6!P9yCg*b!@3C80uI3Qeb$afxp z@$)gnUQm2?9)Ix)!Ue}`=kXV}AzW}8=sf=741^1g)z0HD_JX;PVV8{%_5x5*vJ}Dw z7b`QtY|$bPP%df(C*n>4k8Usvnic~DKx2-G(we`ei;;ohWdkE*0vB9H!NbY(@&Erw z17EGM05e1rhlQLZnm8;78PUXHVfpeAs`;=$y@)0b3*DV);;>*|h$apT?@lyvNY*(1 zq7Y3S76XxJ;;^W2L=%U_i6*KzByvR11i`WN{~?M`AyvhzhoFK8Tvo1s^8f#fE>QHL z)BvDz65KxMEWLtWN`d=+$6vHQ`TxJ$^-AaQ7uJtJ#vSy4H4q?~Ve8}n|0lqj11}zd zI4HF>O63Hu+z*4z0Xs_q-M26MOT14s#R z4$>|6t^^ewrWvfDw%Yv{KORBaYMz}(JUZ{c=zQ@1zfb1_P`3LAc5d?#foR7VL@Ry= zsB?+nf+}!G#}zueaIN$Bi?`oUhDM;hIgjI@F$|PhC6ElLO#_vsJBX-9r9ec{o0 z0m+Ka+8>|!1zo>@+T@@41zf*;<`;yFlpJ>et;PZwiM1DtY6f`f5Y#^a_d5bO&^w$U zhk!ci-2xzsJh~Y?EDsg4d33vec+CP5L6`;#dr%8&g-2&8$|&Xw(0KG7kT*S!yB+}3 z-K^(8WmTu^3Gkq0ukRj@-p~W^NrcY(ptu4hn-fSA4hy+JaRi!wIR3(sn*p>k4m?C$ zVhbCZECgjN=wyIANCi0Qf$UfT?IF#70Llo^k<3f?|Nnn&0Ml#&(%kL4z@s-5Sw}9i zjs+lxK`qDtIR_<6xxKbWwD-Vcr3D}xgwX_}kql7=83Jk7EP3$%|BHH%9B3*8TpS?c zulbDtaybB*W%>e2kZ13J%7HH)-Jw5HJhVk1Gaali%LkH1jhVgL=a zpqLMCMG+~#1l00HYJ-9jJhFab6o9XLRJa`yXKJyE(nnJsS&k<@s3&B8M1BE%Lr-kmaECMcf<3tOY#y20B7#KSD z%AhqEe;6L%-^Ti!lZm0_0RNN&EeA^6J$h}gb22f!_;n9HGz;p(*>-^hvq6=AukAJv z`{!*?dq7l*4Loj@%?YAxK`whS{|-o|6=WHx>)C7T2U7nMG?Um0wZ^%7DpfM^2=c=S$a0S(UfA{oL7GUOZw14DP~l7IjIzn1VomVM2^#PEWNlYs#=Htq{b zMbP;&))yb{prj)BjB2Ov8&Di|hkk)XTzBY)&H!+E&>i}wGXOj&cHDvI`})8CLF$e> z2q2gupy4-&s04y317ac-KA_Sh!=tlyg-7Q_kIoMsoxdOxt`j^uLuVjo1@P#o?+TA@ z*9jiop)))>89h1!K@M2q(Hj9Ov{ry}i7y9e{0cOeu?N(?2Ze3t{TCon@Wf-UhXzFW zkVmiS`c_bL^Zbj4pil?p#a<5wgzQA9>|um#1VXkLDhoQC7ix0BaaZu{mq%|X!tV1g zF0wO#((($A-i{6i1_tbk)*uwE01f?u`@Pe4>-~a!|T_I&RxVi1R2O4x`Q12gaSODGM4Kk~H z0*D7sQt`_Aq6(N^Y{zUp~Z-nD@a%KfdY_m+c5MrABX@+_+!}9e82%DVTMIQ10*4f zMFLz@1+jq=4!D(a{T8_Ui)e3`u)bJ)6C{S#afP;2x?P_%zhLZi{nA|fAi1ovJM=;4 z1&_=F@~y7}Jr2H5IQW7!`$S6T1g1!M(iP!1#uDpGrS`61FA9Zb!+UzlDuHrGDjEVJ%*eE_oO zPq*uzUJpi4*WaTzfDvroy?%swe>{3c&w=?Opp5pSL*O-Ex9fxE0|ww>k{2DIk#tx= z49d_RpfPBV&e8)OouNC>GVw9j`wXz*Wy6>M|F;|{VLRq}pYe4xShPD7HtNE24OB$B z-u37Xg$>R8yb2Nuz30*EdKWr2!^_XW0G{W+3yQKA*Fg#(gVA@c|Nq}|yMzT~IB3wD 
zzXf!nwMRFkZC(d5z4Nd~H)JL{AH-$6;Q^k@g{a^GS!U__piB~`=+~A1|2-^SAC$9$ z$K7AEg4e>d94IjayRqAK2RuXN-vG62j=yLH^^?J&bOP$&qgVg`f2|9e#fCTqT&{nB zDi6a@J`Lmwl-3993n!2-F-E&F$Bn^LI4|ICiUv8q^Z1L8pzbr+O+TP+GK5%yytV;8 zTZCoIxZ70%+)PyHX0SX|YwFYOdjT@}3!2moec{nv3Z6Ui=(XL~$iNW(zf=J-jRcCJ zHjw4bM-FC+vR z7(ktiURQ9h;yY+qqnp8}^AWUoZ3Z<{K+~4sZU%BY4Pqp8SWFEx5Y!2B3kZW+#3j<; z?Am$!g$76t93MYG&34eLH&Cq$>u}&{z3~V#FrWl1s7v(&thw{}i!~6BBD)DiehvZo z9`3vWxp|k>U+@qA~+%QWU%;+beqUE2>$Y$6ss%O`hS2(`}&UXSeH*eV|4(D8p^N z1ZvtL{DGdrdqthmY$5F8UWg-Lc>u}9-@jnExJwl5J!BU^!W>jHfqaDSVrC2%gQl(;M2?NP|LvJxMLZ(q0DPl3+h;kGS`79 zQxh;{8(7D{@PZXIsts8@@B$QQMxvl*%eB4%0mpb z02#_83Q2O{jM@!uEI;T5ErLDxfYpN$R6pMUmjc&cxPpQRltK`53al?;E`lNsBaFc* z6{V2e53&c;tp^7vxD6qpFKg%yI}gBw=g z5PWcZ>KTF$Zbn^0@WHJo5zyuyM6m-JzLx+6W#>4?=2sW5LFyo53mz9>&c{eMpjhra{(?;mWD_Xy{qX4a0QH|BwFY?9{Wr)Y zs6uy$H<8Q%*GOo&4>ZOGTJi=huwOt7_~6lv+P|D|4lONXH_P>fM{fWJl6f<5nCF1a zJn-`P&d@g=y`djGdO;NrILt~#898QVLKr7TMFFeki1i1Z&ZWfGyx}SqSQI^}3$e2Oiq!JpQ5tDH3anEZBn4`s_J%${ zGH^1)Kw%F^Vu2Yn52PAw?eQ1uMHm=fmVg@@Xes{d8Ca5mr&w(DY_F)xdvMVXn!5v+ zXoza!1)^dDw{kK-r6(wfy?|73H+aBZV6dEzC`u(=4RRy684fOY!$hHVHh2w48z>3F z8%MB|2#=Z0(j%Rr2hifCx%L1EeB z9y2@+K49_ytuO6$z0i3Job`^sI0VWi-~~~jY>K?LnDxbx)35+Tb`dCt-vG_>gY$5& zX;cL$pB;Z8238Fj%-jH3?m8bdh*S!i{)acsv&6yu@^04~kPs_41L}{1*R6omfvTk9 z@BjZ>y1wC`1JZ2{wdH#`#Fh=9l+<~+I}|ira{*$87|6_HpkZyLGyngC68r1TP;G~y z+A2ZX;KS?i_y@IhI!i&_%O4)y0^nsw;7|hBS)dkBClmG&oYDoIp>r@IWDY|q+k}^| z7#J9ixn5v|)~Oc)JbGPM1bFna`V>Q|YQu6+tqNMr>(R@r4{~9z=ngPt`V&mqN>?y2 zys!|5*QyIZOFctDLqaRS`wF^U7j!d#^PB4ikKR)7%J1zU<-MVxIsXeDkbvQr0M*!_ z<;8P4q1yqx!9tK&Isw)JSzW$IoPpuBF({rv;}*~X;pHGf^yT7{K)N7>@L@=(qKpaj zpF~eIphgp9?!coHRGov$EZ-jvIu$a%2%bTA>1;jo7c@80djO=Q8??T;bq9ng z;nCT;0m4-9=xkj9VQP4Ewk|*$N(C34kV(ewsbDRT2+@!PMMx`nma%&(*Z_zGKSTmH zop|dQsCVD`A7n%4@fX>kFzuZR=63gjECbEFgWC+?+HCd-uyXJuWv}V$Qb<;51Fe}n z3|D#lg$t^wOOJzw4ftEYZIa$ru-%{`>R!_|P!pVwgZlP|Ji1%KfdQIdy8c2BB-lL_ zEYb_{Bg7BFlHk%_-vh<0La15aL6(C?17?#zf+)&Epvq5xWI(-INZ>=P-V74# zo(hhRUPvfHtezo(&FXI@kbtNGX#-mg4tR)J;vhk&)x7}%9^I_0pkf~)S#|9H|Cf)! 
zwH$J|oP-*v4blsC95@spdc;72y`~$XYS=+suo`fDLDYOY3W_Mxllc&bJqB^XYQSEC zs5u7`>^04SnzJ9mO@wkcfVg14K&G(4!RQSV>@{_Rs_6l7!D^8Fs}2?HW|e)%$N+7Q zPXY<{wn7rlRB#G_1gIsFMCb7rvkrq2!f}^waOUvn24@wIZb-)IhGe7eR#2w$1T}7t zzsLd&^FR&mJpSU;;s5_%ECr1gLPNas_zSfops;|(;_(;DK~})h)A1KG4uht(kZW-M zmTm?HhL?>hgU9CGu>|NpP$K_v*d_=5PT^Z1Lv!zi=WkjU&j{$eGXI3%EszfeRI zhXnHR7n~s1K?*%k{JdBRay)oO3uGs_)WRHM!6O2%fk1UUs76I=GlQB?$6ve!w;4c- zqalj|?}J4q;8XCe27I*-2~EbGyYG*kj|9%zgceQeXC*Ea1LBg2d3 zmq7^wX3<}$^3Fb4vTP*JmfGRZbR32!g z0W_!qn$`mka7@6}JA!nAe|UiEZBY9hTuj@6T6w5CK>dAaV=5H1n-$ig1^0+LT|rA> zL0t@}6xft1kek6HWTBuDESQd&5a9mWMA*lOqIH)uHlk_%3PhE2dtT5!917nlp`LB8;SH)D5# ztO6$vl$-^cZ=2!KSvmo&spbLRM&`P~qdOGjt_{%k(o11bx$L?FGG^@yl>&!43%DWe zdcdRG7bHEwqr3EgM>nMHBn55}xt{Q7u06p3+KcT9ZhHrVgnJ`cUbcc_0c2AzXh$G8 z;e~?be3yIlf?9Q@;6|Hg=Lyi%V6W(v43IO9zlZ`U>aGRt`2*Pr+D`;_lH*QLzsnP{ zPvJdCQK#z)kK?Z3xiC;;6V!_S4{qp1ut1g@9DmUZR(Au`B~S6tmVwNwqvSkrD>4nL ztn>Jbrw9K3e{o0rL#vbF9*bV2eUx#GW`VR*=mEFw-B@pqxpyeBuRrNm*KlqChr4v{^2PQ z)bNF+bI^tg(7{KlpusPXUJnrv-R&UYkqpX=klpB@`v3fkAh7GfogY@g;ZFub?}T4+c~EIb0GB;sR1 zKMmr%DzNjQv2Y3OAW>_G^K!sE+od4q<$^*9do1wor9v$10hte~2@swCA6Q}mTE%xe z2q1Mlh@MfZ!8fDSmBWBGqhtaa?}N-JA*V5LZUjxqpe2N3uHRvO(C?r==nGI@Jdpwk z=Cv81Q~)|;$)lH7862>pCXiq*1yi=uGZ`3Ouz?~GnhHF+OCNwso?T#VoyT7k@B07$ zMc?lK|2q%u1D&Vj(QEr5gMs0N?M+A>kCHjcc7vh?UW$N1uCw$5TK5`M0f6g6WWRuV zQ=qMVU%(R_aVZ%v-Sr{`vSs;c?r~3!tMT0(EfJV9yc?P)1b>94?ybC z8~oEjTh+nq7Q0`8dS)d&9=*04*$fOXymx?{=L)I_K+c9QW>JPa8{tN@Hje{jive67 z=18RWrt1rjZqV5Z;EL%O$Tg7BwHL5j5!+B~54aWvhx1+nuIL8!u0XS093Gt)JTy;u z9Q?uL0qXTY#6X!0GM;@4Y&WR6bo_-5XaotS1P8`T(CMZr(Mg%ZswNNJ)){B8BbYKw-F3TbIfAE0JYeDwb!4!Z37qS@ubZ7;% zvYG;Rj0CJv)C%S@cvv3hpLzhTB@cBUXmYa`viQgYvKk4ga6za+ot=ZXB$0CtXjL63 z=YSo10W!38!J`v2`W*TKY4lkN6r12hk*K5DhqfclIwxb{UHI68*w+95Uj%LiS02y7POwc*XJB~2a2Zxaqf{T}TR}+?BNc#a zUW6#5>IKJIXXqWwNCOph4?H>rxFZe@bIVC>zlcUljW@PRvIC469B9ZFxR!}T*X zYQBJy&1aCc8nF;-BfwD&sZ@ELA=VxNQ>L{LYwv-qWr9XEs9%5*TpU|K!G)CS!L0=1 
z+wA9|jpCPaAn!xshj%mBW>Fi6JMPAVlZZB`ov;qHToaN+zpchz{Z05r5?S!F%V;)f)k+WF^IAE;~5xUtOV_%h8YVQpa(VbLDLwF;LgfcP=5_= z0*-$(n#U8syR$$AGV}x?&G4M{KX-VLqRP! zP-7l>>N{~0nrlJP1fJdl9mCTJTG|g<@N@jdT~Ldm^B8ooKgbwRrn?0aMP2Cwum44_ zg7O-;pmYFDNB_Tsb+iJbKJx&bECHUw{{de2D%uJ$=Yt2Ry8>Bs1Fl*Hkg8V5L|Op$ zLqA>^9_V&mVePtxzmJm%v@moBsCiGsNh;m0J3w1pS7^Jg=>%;!$U6^8RTo}_gXt}x zlT$o;UAI8nDT$zIAlD;MO-DKpc<`@3)E&CQBl%Ld?;4NflOCF#JKs&2Hb9D!QtliXCOC&nwpR~p)G&^|A!nLrMB_^ z{}&tAgUjb!(6WVITkli`h8GEE|NnpO0xDTNJCC&7Mrf@B&zpe9XjW~2?>0eJ5spyN zu>sOW0JUe~{paBI;6gM2ymSd<1886sG;6a0WPj)02=J80)&MNaBoO1u^VWhI%dH?q z-3%VRkWuaCLjs`Q_4OAF8^E$)lb~{taCTn@auC9p_O&2B@&I&+#%o28qaY*9kiqif zFXBPgK}LqRtpET2MgKa41t?7)c*_Hv<3QtZpjIq6L4g)c>}b}1hhyKx-7Kw_zM@*Rwei#J7|3YZj?XpIQW3sgYg2Wh5-%gHrM`ODB%Q` z@7)a0b+?3C|Jc$-C%AF~jV-~(Ykz?Qr35s(3$CKUi?+bSs$i}le7OR+YMln&XMD`3 z^Z5%=cwoS0A#|a`#^4!Os3UQwFIc#O+v}jy9YA3WjtpLK;J{mNj-eo*G`@kX$J$!~ z>XdYDEdWzfGybEljk;lYz@xhrB6OfWFWU$B~kM6BtbHHc9T?7vp zAAbQ}UDDkOmgqeGLJZW10%?YvUb<(d`RT0U9k{0M~!~#Y<3Y5;SJpT)Tpy#ORo72j~nZ(CKopS(KL` zi+e*mAhRZ58yO%GCOeM%>kbXm&gbzJ|6gVzVhgfy@6x67_8D9YzcT6VB4zy z|6fd8`TxI1;}K8{Mmxqh#v=E}klNfJva<( zz}#b?a?PjnAvCptSFJ$@LMM1MA29$|Z}3!)QVxUWp;`}=NI?q#&>lFC?jR0mTVN?a z>Uc!+n*!tmAUryK7x;9#&hY8P|uQLk3>1 zf&33X`_Oj-*jR}0@n4|+pX&;6I|`g7KsgPZ{G`CcA)zxoy1_f1c0iYeNP~qzZF2_+ z&`u4AVIa4K&Ollwb_lZC`h-V!DX8~j0-4odeZjRHtyv8z;Xx^-8+`1nyD+G-gU10R z4S))CY=e(4Rxbm$f*e8Fw%4{chJoQl6lenYxa$LOlM%Fj3f3^IPT zoF2Wl3!@nrUerM>`v6KT0w9xCL)3uo9`oq6tp%xx0?p4OK|6lxG0`?O$0MAA- zFuYiG2-GYCH7$_WxSi;=t9&O2Dt(pyRILQ+}8Ytj@fU4A1J#avR{`ch~Ihq#T8IR zhb{X}SOD?_QY?T*kr7dbbjGtsx2pssu_%BV|CWb}@y!Otd85w;t^+L@1UUiGl<}ZW z;P7u_X#T+1a-f9JcvmC1G(rSVwg)tL@?{}QS22bq3+ID^2fcK9ftIEqgQ#yn>*K*A zqu`lKJ`c#y;b(8qh$QH`5|3V9%}`M3FIwdTqD&`)dA4_b7#Loh1~t&ZLx-iHnT{_z zLFcOP0-b0Es@8VS{r~^Pm3iQz5Oi~hN3X3JNLTY7=-C6P^B}wCft(GCRnUmUz^)_B zQ2_ZDvUUz-9ck_+P^Ac8<~C^#mZ5`<|kRY~oq^IA3_sXKJBh`eO zwH~Am)J#BLN9qLy+=Sz5wN7u9IWd|8=(e12I&Pm4rv|fIgntlX(m+7eh?R|25BAX zDv)5WX`?H|VY5M8uo@&!wt)nDO;w=gltZ|ZP;Lf@3-$|Si6hv#%pk#D(|?{2V_iU8 
zuo@))-k$|(fq~|7t{#T_(+i{$y4(|GRP7^3HB_SW_=}*KAk8GKBUJ*8rXln<&HVrW zMJ#BP5E|K?$6wq7nF(7BeEdZ`gbVT2@fQIz(TDNyt|Of|3;Q}!&Y9TPktU*vLjvmf zi<>i0)x(@~a0Vyp^Yj<1aRV+BKbr_k-3bzUTu5NasP1Zf1|}U=GA-5-&VDJp?>@SzpP3 zVw-jTQAW^>A`+mIyOVV$gl7%mO+Ly9I^ZAMDljbTPC&h1&_WE*wu%=Z#s`ny&@V`{ z&GNF?PUZ-kiCo6Q+Lwr$A0re{#y>&lG=c`|?!R~k@(*b4y*C21=?mN+S_Tdf@Q@C> zEci@TG`~RFO34(gZTiw!yS@CJ3$QCd{srZZ^AAILGFln&cT2ww722A(*E z)!neL0}oxIj!vPZfg3XT(m;?jR#`}#fcwp>!2Ue`VhgG{pv(vZqm)0Odsv_a5NI+TW|so! z+yQXH2D?)b%smJy4!|Q}-~_`3au0kk3|#Ud(lzceySp+deL9efaPC=zo_moD+93*d z4S3*gC&*&(YOo6k*K7j`c7u)s2hHOmj|{NBI5q`K%0x|j&_45auqmJ^3kHu)P@fQ- zuOFWW9T5%c0R89)04+m8_9v)U058rF!3k20IwSA_+&zS?=k5iKK!I!pjbec&9DG5? zD_`*FE(MLkfvtTGnuI)10y)qu0h~1uOSyl5+P>?bgX$nq*A&*}_5`iCwRC+@g0eiJ z9F$;mP(__DcmhV8(11~c@I>CdP;8c_?&cFcQ^~xs= zsyRWa2Qqi61L1?B0<_;9QvE(#1PblrFF^NS!4`~Lg7YY7m>7IHhy<7m%Cg|i58qKU z^c?V{Cn&pvUGP$jfx)-)gHPv6@Yx#RdgG`Rc-!fX*WRGu0tNideo*2CW$xoIwu8n} z!Fm0{F7T{1sGdCkfQeL6UfD2$)BMQlhJ=L2D+V zLXaG00#ODUKnI7S8|chbP_x_>meC%7L_iaMy&}wzsar^K0d@w$;?Nf!&4&a)YYQQ3 z-(bNo@Ey-GP)dU~woy8sUqA!1h(@_&A4(T-)dUb1cgOPaDH2h_kUkY4m~2?YuEntrf`guWYy3s!^F@iYVp_L_2-L)0jNxL`F% z9Zz16V6W+Fs5$?7K#^v;0Lpz2;)4AG=@Wtd(ghOiHEo5eISt~1)gbvd4Jz2p8nYGd z&x;^ISi*rU^?)qrs6dkFJpSTl7pQC_q2sv{JX8W7^5E?L|Nq641t6cm2Uk+N!CZ(F zkH2^ZvI4Pz@p>0Y9Z!iK>>bYuU8o&Na7qKMc}Ek6xgr5g91_UKUwD9A2Ps}b@$=#Z z$nmIUEL!auhB}@MErF3sUsx^e0Xgv%+}1$aF5v;&FM;Tfz~?$aZD8=YH)yjMazo%B zXt6ouxG|J*O|*s=betU2)PPJee!)6c4sK!{bN$Z~FNxK15 zYhx>EyC8#fu!OL_1DpzBgK-X^c*V0-ya1FAU>U~r05l&*Z$>L=B~eYS>cmyjHbPC1 z=mcdjv>`rbkRY~__Qfr1#hn|}tTQ0X!Py^@dcj3RI!F*&+(AlOO{ns@AQ|*Rr~@R3 zt)$JxYV~O|NP!jz(gwB~lA*v>zi9^tF*w8_CGD*Z@Dj+s18Ye;4{9I-)IgLi^sm7N znzljJ+yQaH9zZH-kAnnzP3IXx9JT|*1*<_SX%~Y8drcjo=1hTb&7j;?5Etwhq+nD4 z3HF*wL)9dLxL`F%{{7PijZ4-~>*4-P2dRW59OROg4@si)_>25jkY*A}TKiex&;{>7 z?>zoue(V4LFRCEpiQo|LJpST0$V_Nn09|ti;X-_M{6%gnNhR&NHtZ#>S}XRFwhm1k z5>Urqd}~2f4|C4F7El0y8-Sqrc~J)nO_Y*$;G1!>p!LYm07YrWu|OnX0UreN3u2f3 zV~|(y6fVMGc}S(%Ynp5fNxJ{mp%pHBn!zy+E3#x;aTP8qP!o28oB>Y!NaN(oK!Vr` 
zm+ouW3YV8g5ZB~`w1HDRB*MTsoC_p~Qn;LfDt7?MfUQQ#ciJF9Y=w&eR;%-%W_NfYv7sgQ#01W#Sv=YVvt_2kQCW@{Do8F|Nk#!K|_wv z5br$xVjjp$XkO?%{z4wYh4||D3xP(G3YWkp?1jt82Gqg=mcbMoQNqtQcqmc?D15vJDb3;CiCh zluaL!bQi8dD_r7HO+C>tSPO9x z*xQhnOFmTjV~`9eFCh03&w&K76)u}EVzXKhYSwg+Hn7#;fQO8-7=Z+#NgYzSD6D{I zx~cV83zsL*LctBB7wkBs!o?6I*lT(QszwRK1*<_STzEl(y{1o~w@&`610`8d_#zc9 zFF}I6rtMI3Zb7&;Q0{3E7wi{Ex(64OX&}K~(?qD6;#7lA{>8KqlXms-53|EL2(E!AJyvq z|9`;<_ChPz0u-M>S7CvUFMa{K5)m?3n*?$Xc&PM;2Y9(U18lyu^Tf%+u+Aa6UU0{^ z^*{;lKG1|KX!unN0tXn8{$AJqn7>u4<7yd0qL10LB_1%RzbU_;7wVe z!OI_@$)Z}w=spX`VzjYYq}fLB+05V>$E9aMp#q-%d0y3(IIW7+Bm4jv%Kqoqa zj`4vw3^Wdj;upjqI&_%-7$?^0FmUhu_={b*Wuf}FfmI;f1M)QLSU=j>#44}`cp$-K z!R|*{3pg3%T@=TI1`@!-0+4yB<1d;yLG=J=G6ghDd;CQQc&->UZS@0rxcnkB>I?kt8{uGo$;FByZ z;EifXGc2Ky%OQ|vSjx)~Gc2K?wPxUA17(I~9wP(83;9aK3`=q)$QE$v)B$2PAF)86 zYXy&~LBb0X$QAEVS2$TAbWlww?08e>+&$oLh%)3H92S&4w6-&J9nXbGQjg-Umt_w6BL2q z{YqCrH;#d4d%?lG8O%Kd+gIWXy6WmCDBY#7gO-0lrnXSVCDB3=V}S8HNFz!~;sC$< z0d#Kzc;y6m**Y^Qt$@y001F*|@f37ABj~VB5Fa`w>H%B$-duZwfxqP^WL_8SbOF$z zn&2tdP|#^ppxML|&;`MFL3b&Ew#I-89mH{EFQEI!S`Ui-6KcLA?P`Gp+f3X>~b`it|1t(-R6UgnLV|bBv8iNK;NNIxf7lZOC?k327h&;H3 z0BxT@YGShzP?H1E1Su;9@dL>yfz1n~*S(JGLIkU08Z{Qv)p zTSyjw;|NwRlIjyhkimEwyc_dz_~ZoGw^?6A=pPUp)XfKl1lR)5u3ewrT+r6$86cOv0jcPY-QdxA z53OmX5z{{l=K>4TJ6|~6}z7ge2 z9(dzi1V-BoGwfb?9DKk6IfDw8c=YnXi3bq}uy_ZBRdekJM)*?A5;0KRcpP^CU-1WK zHyca@AhqM_%jt5s~pmoyFg>~R& zRr6|4L3tRqRT$GR4jEFMA6p8CNE%1;>E|3LnNg+4nX^g;O-Yv21T zxWjV%1ve-FL8Ef_U+nk-uBAYWUXfD?$mx)9?+twqKFthV$bswJ`yj_4auvuncwY%Q z7rg-Oiia%wI`9GHVbIaM*P&~8<35679(2dFD@^Dl*maocmWZF z;A{l$bRgmY6t`d@xT7$0Ja~1l*-H4R4X7;(E=iOSVgW2j4OVzr336!X@fRl;2<8(+ zN&xvB7me2XIc&bdCKXB8vZGss~AaZ|78^DYe2S%@_qwAUOIREyI=UP3k-XDLF3QA<%t|>N zzIqNxRxc1G0;sM317u(4UJkUwMnJ`CFQh@D3fjHV+X}l-9W*TH(c3By8gA)4{^B6W zZM{>$+}>93!G^u2s^EPI$6v63#(a8P!FR6rP6eCL+Y43$85Q{h8h!3{ZSd&rg{Tqz z4_a9ckz|EPf-Zbr;L+>b0IhXTfP}kSK@Rrl^*!LxD|!c^2Q(TEx-lI*;SSzvzQ-!Jgwpi}zi zBcIX_xn=3@A|{5<(D!3k$Rz;6Ou52r}RW;>Tb728|u!NeG{-K)yjv z2;V^KZXpTb4>%RMq9=s?AU#muKoY`bgq{sK 
z{(wpxw6e*YR6~40YXZ>>VZ$&4)M^Cn2E%fhLt`r=!;4#>!3%I-3w8|Arv^}Hg2o|1 z2eyE_U!ss1Z2mh@yokFZgX}F_vy90+{DP>80Oh| z%`w!o^Q&WsW9Oexk6zs?O^gh|KAL|)hk3r>kNp4N>kJB z;?$y&%=|nBbp~~Xw8YGu)D#6pMMW+K-_+vb#Pn1v1qLn#x6B+6!>u$g8Kl-qfx#y; z4`D zex5=}Myi5ysEeaQN@`hVa;lX=ZhmozLQZCOYEC6cvM5y{vsfW7zeFK5FTXTBL!r1J zF*y~c6QT)foE`(%AlINE{~!eh1q}vGD+LBUuHgLAB9Os3nR%%S3<{Zf3Jg}Dn^GA5 zqk(@=KIjx|kOI(f4Tv4$=i}=K3MNo+fm)8Bfjt%mhWIEy;}Aa|Lj%X0^!%dCl8oG9 zL-T;dqQsn>)Eu|`BJb3qywn^|pCm)q%qYKLgLp$TU$DRZ3sQ>`K_OV|=@%St?gI9C zv59kON}{W)adB#iXJ)3WYgs@@e2{mrd3=aLylZGqrfa;HtE+2Suv2`nQ+$YFyh}ld zQ9Q_5M7zU)*6uJgN-j-F^vugF2}vx@hK7!FeqKppW?pJhv87{CQDS9SW@@>oNr+>x zH-_gkU0pLX12X-B4dXpCgH7TyU0oUMY#A~^zGPr1%g;xzko1;cWM^enoLb_VlbV~FSE7-epI2N0QKP1#0O5fRP}5Y1 zF3&8is3GRs<#1|Xcr6i zkbXvfZmNDoX<}Z9z8)xJE@-TCzfQErRp(cD%ja7Xn=!3Q^9}%!D3)wK-HgFtdL(?qL81akd$AV zmr@MUYokz-ky)&3m!4V@oLQBsp~-;Gk5*vNQD9JrWk^XZ0SzgF99CSCSdy8nfQTkU zzOb@FPx&_Bh(rlXNF-@Mou>th87N;38mq7J2NJpr3TU!PN zFmSCXNKGzDO#v05U;%~jjLhT=h0J1*NJu^?@=^;+6LSYBiK*PaQ$PeJS1qU1~P+{pFoR}ez153#e9yA$~oa{66N}#Ep zfdQPF6+qs=k$AyUDCrlH{6OK(z`ziaky)%zoRMFelcJE9T9#U*kdatWkea8U3#(xA zixg5aQ^2(bD4NPa)kR{SLTXV_evuvn!Z3Jbfs!xf8&!t$35LgWmQdy8{XJwU~Ur?#xg0Kzh zc6iYOX&N9zQSt(&K1lImt$?BpCJ5H6V5^XmpO>ysnpd2eo|l>esTCm32e}CpYAAJx zM}Bb$C=5a2Ktg$iJ5cm8P|GV&5I_s8dLfjkY$&!7-N)OOG+9n?&~TCLk5DbN5H zSXyXR16U9iFOa~;2zXEtk5Y=o5~?o{E`-#_AP0gg32=D^;z7!^^2B0=%)HE!%*33` zs?-z(aDyUI0aiDI73qOXaZnv?zyMxl0tyNZaG=@R!YUL6H8rqeP!JnvD%8{{KvXDV zs<2Zq(1et$V0YkfLP36UCa8I!keHHElv-S@keHXEP?cJg53we{2*N>@g=+z2U1)?s zTtT)wLEZt^kdSBsw*wGiixj!IVm}?xk^^TtO=uGiwdqDu6&4F=FhQb%_f6<9~mGO`cSga0Q0NTaNFN#&L19^_2!5&OuAq)%*3=ND7j15c-ObyHo%nd9I zEDa3|4GoP9jSWo~TOij#8%uOszEKLne4NZ+qjZIBVO-;>A%}p&#EzJzf z49$$pjLl5UOwG*9%*`y!EX@tf4b6?rjm=HWP0h{B&CM;$EiDWz3@wZ-j4ezoOfAeT z%q=V|EG-Qz4K0l8-P-$LCW^Sr0 zs4<1zVG!*MnEH)C`kfL}z*zx9FIYWlZPNk%#Da4x^#Fyk2RHT)}8yFZE zm?Rn|gBc)ByhVI&VrE`^a(-?>X-R6lsZo4#L4jd>W@fyZNupU|aw@Fnf^IT|PpR1f 
zZ$o!6G6*n8Kyx}110$mY0|Ud2bcp#dlAVE(!34%(U|RR%YXJtXVlb;rSJ!~dVAJ?uvv_zpW@vJzniin0n@eIzB0gKNmAxCveba0)U?cs;L==#bw)ZeL%)2W;+ zk&LEB>O(S)D#;JYJbdX7Ng4LE1yYa*UNgYJKyoLPTHQ}li#oWygt|ktoZd(4fe)xt z5QPM^92jWVKw5!_%4{IrLv(7PXs3)iWU~cN710+3JY(lH$Fh=gEoT>)NRScM;W5Et6@QcDjXk0 zjdl*yY$_NPA4Qdh64XJ|Fe^R^UsDTe5gsM+QG|zIaI1)q!qS3+T8b$H&P$0!1<1oS zFvS@C2*fx}d{jtSaEWCqY+{K4Dv*?z5)Ym~1PeeL8A#JlXwyd^U6_in%u#_A8XCmM zCs$M?CS{fx8k)z)C+8#<7sr=W7No|5<~m#f>GKYv8SulWmS!hEo(8wj! zjG<}!d+<3az96wEF_)46qh-s6%&|^dI!PryK0b;T?JQT*#InT9ocMS{Q!@js5kiXw z40;ID!VYMVp^wtTN{sk;3o~%30V+8W{VzmmNaXx5Qd5%ZEhkVgL7hkKc^f2Wqqc64 z?4&~51j#gNG%S#e!_#~~Rz>~6MCG!aoJ!85o!u z5aKWeH<&=``CwcI(2^SvCzctc9!VS|$iTpm$_f(ahw>1V0vkvia~04}PLQ||LLrpJ z!o|R#$RNZJ0Oe~yX;G*+=wu_15KLSRDh?6@VVJl#RJ;Nt$iTn=6EA^^e}IZdKM^97#LvUC!pf6byHkW^=F~tFmp|z;#Z*Jd!c*_D193${uRpifYOhk z;*+87OoocTgo;DC3^4mYK*dv_5-L#o2UHvu^CD1j1|CQ_xIp=!y=fpO2UJ`Y#9?4y zkcZL&P;pzRd%~dN>v=%o4D&rykb!~W0WV0L8zBH?Iq`$Ud7wN5)gS^A2k(wU2r@7* zu(L8SV5Uw627ZvZ08%=ag=k|D0L;c6&R(s@F7#u+UGGSnz0rE5hD6KFsY+zwv0N(@)<1;cafR5({DFG#m4U7y7 zCqN9S{0=4th7Hhs0}=-DSAh08Lghi`Wk3@!NFEdnKhXFUpvZ%&KfuAjumg>s!3nY* z;=Twj28IA=QUsX?;uj$C6Sx@|BA`hUBoE?)@(0KS5Whm0fgu7KWgvb4F9U-Iv`B{W zGm!WZd<+Z$&>|Ql58_Wi;&0$%U|501KY_&G!Oy_30h)wC`a%2)Nc;l=3=BJ0jDdke43%$z#5WLUV9*dp$bL(a*>L(a*>L(a*>L(a*>L(a*>L(a!|c&ji)a1l7+3)z1Xg&ji)a1l7+3)emcb zXfrS{fRX^HOa%Ex2g<(z;xI5UfaF0*;02Tqk_Yi&?GBJUC`)jF6F36{NFKx&fbv1| zpd>5-<%8rwd<`fcBoC^3ETDXlJcu6w<%8tq7#J7|pnQ-#h(7_!*J5A*r;7_vK1e+% zN#20+LFz&L7f?P(9+X5sK=~kf5dR0150VGFA6o8$A0!Xr?|||_@*wx0fbv1|ApQj?A0!WQ{|hJ|BoE?qKl)^<%8rw?*9PggXBSc0cgDjk_Wk81Ih=LGmE}4=5ib4{|@O%LI}K@eQE$Fw8#%P(DaKC_XBne2{t&zXQq#$%Eo!0+bJu z2k~b>`5<|)`=NZ0Jcz#o$_L4V-4Er1K1d$K=YZDxAbF7cC7^te zJcw@r<-`1Q0m=ud2gSz?C?BLA#D4+hgXBT+@d3&Q$%FVmpnQ-#$o;UUGDsf8mw>i2 zK=L5>Ye4xRc@WBoA``3n(8X z58`ux3LORp2AF?5pnQ;eP<#YH`5^Tmegu>ck_W{{29yty2k{G_e2_fIstzb0BoE?G zfbv1|VE04$AbAjf1(Xkx2f2R-ln;^z@ozx+AbF7cKS22)c@SR!+HQsUM*`YT1*r$c zhX#}nQV-%AK=~kfP<&WG`5<`^-vi1A$%EV<0p)|_LHrCTA0!WQe*u&ak_YiCpnQ-# 
z$o&(be2_edzX8ez$%EW~0?G%;gZM9?e3*YUpzUmsdQf~AK=~l`Aif2Z50VGPhX<4o zk_YhvpnQ-#*!@sGNFKy5fbv1|VE04$AbAkK1Ih=m`O<%8rw?mq$L zgXBT{7f?P(9_0QXP(DZ=#Fv2fvta%yfbv1=LGe)m<%86N_#IF_NFEd)6QF#MJcvI7 z$_L4V+`j?J2g!r@JD_}!JjnegpnQ-#h<^df2g!ro59Nd8L3|F7BcSmIc0ZI4k_YiE zpnRBrDxiFjd7${{fbv1=LHr3&K1d!EA2XnQkUWUL0?G%;gWV71gXBT{6Hq=#9_)T7 zA0!Xr-+=N#@*wwrfbv1|Aie-7(2)G20p)|_LH_Z8@?rj&0p)|t0~xgf$_J?j@i##E zAbAjT2Z(^Ce-QHoh=AG$a{mn|AEY0|e*xu#^n=|00m=u-gZMw7e2_fY{h&aEnh)X| zK=~kfu=}BWkbV$91ImZ_X9biGQV(Ko01;60K+GK=0_t8+e4K#tLFR$@7odEQez5zY ze2_ed{{hMe$%EYw<%8rwd=8KyQ1e0Vmw@s?@*utiln>Gma(@7n50VG*3!r?Mf1usX zQED^gomjzp)?q$Fv1BixbW;gV7+GfkB;tfk7aKok5(nfjQUv3HyTo z3`gV{82CX0?;8T3-LA)M4B=pVzOos^7y?{S68^|06zmM!q z{}@ilGq5v&u9MNUU}8{bWoVS}VPJCy+xLmxL5^XIJOeYkIV%H)fDbDZI|GBlop&(% z|FR$W!|+F*fuEt{*MEM8M+kR-+{5trKRd%8h6a#4`+sD4n7Rj0c^Ds^rc?NF!0d&E zA2T#OKC&+WhYvhlpy9~O3<-~~><|7j?2%_+fQAdq92otS{lR}w_`vx5kQu+n{0!~j z@F3OyjEwAm|NrNg5rU?l4gXg z#C&64@ShRnKWO-Z&gx)bPX7h*hQnWw`;UO+ zLBsApKA?xgM|OvQ3}cWe7h3+n!UK}Ne|%sN1C|Ns9PWLUYv85#sP#C+sG z_J`qyJO>-=a)yR_hG=%ytkbFHuv5 znc2)46qpXgeB@XAcVL1%0}JzXu>Gve(-}HI_Je7KXi)jga6_JfnRz+`1CvHHGjlkD z0`q|w7G{WhKC&+Zm(Q#$%NZJ27^2x&Ans>jNoSbBWDxz3z2P6Id}U(@XJue<_hI-S z&dR{^fEG(eAS_EU5nL)h+1{r>~@B9tSxzV2?`Bws@A5^}JL-m2| zV_*=B`3>^l1$hQ`kp0X#(X6cQ3=FIbVwjmh>6by~6MMrSh7*87+~?vEbtvPlfl9!C?3#I zFaG`m`+{en^v%E^&Il@>Sh(1k7#Me0urq_)pB?>?z2Pa-1@LiDhM;l-mJS%$Kr=W3 zF&{wbrz4($jTuxwF|e>RsDsUgrIQcr3C|eLfa2#k*j`w=;Rn@o%sFPD^y_p)UP7D& z6rMR|+^paE8Mn4AFuh|AW#8gFDz> zkUmg4Vr465P+&O_^O0Tc-+>lT`eJ8+{UAOAi-zA9en|U(fngHZUN$y&h6T(AVmKMv!S;dE#|e1`cCdZ{KUOep;K#zo z&It0yNA?78{lv~u&ai;#KnyE$J2)KlS=gBxm~+e?g6fAW@*EuO>I@A`Qa3RkTGZe&e*`5Z3fDp3@!2u`cV7W*uVp0GU_bsj39S>g_Ms1EFVDm zpMjB`9b!K#n>zyoqeKkYf1vV-8Nz1<) z`5zR&;QR^kFU+6((0W`DR6pI2XJBPz2m70u`TKuRJTovbuz~X1TXu$j;QCF#oS}g^ zJDLrojzJ&>R7rs9GjRGiBFDhOkPdD?vNDu|X;Asaz`*$S3%i1x0?7T)dXt42-TevR z_94vOa2g&z(DoC!T{Po8Tprq_o27jVC@ue`Z*xafZlF{g~LZiNc#sC9*A&&v{ODYGW=sa!p;B- 
z5A<*Vw+}(-8ybG-?UcXb3;u)JAL#8iSpNC|E`RJ8*agKI8klp^PYI!Qk*m;RrK>pjbPY4{KjAh_QqDpz?(g-u`2F%peGA ze}EQ{HB^Gq=~rm`Sr}Tbf!kNi+0p#q*aA(zN)iGviL;}nGkTY=*u$3{DSRRQKzR7ls(gi} zvw+02z`_Gd`EJK>z@7oUT!W>vkI?oLEWKeHe__Hq{-OZtU%~sIuyiIY0xq9q zKJpV$zHO0bKu>q-KSbX9xKSt9xXqoZ{b_W`#Zw7Gsf|Yxs(E1s2B=7`K`_C4XKG{Vf z?K>d}(6}3@o`#fz;PgQ-|8rB4|KkYc|2Rnghol$udgdb&r2m93|FhxE{|w;#0}CH^ zQ7X6Zpzg!pzvG7Y?_l``Rt|#8*Av`G?GH%51{Mw<_!Z&^_wVkL+`khQ0QYlc!1E_t zK#NBk1isZ~HJyCmSEwT*d>CQ<0dW6`eMStd zzXu+l#@7FZrW4He1E^d_j01r45B~BM>^@Tazp!-jkr}oBi_tz~2K9gW8DQxJR&T=c z3&HkTIiddV0dV_XQ~=yB1=UX|>EtalWPVXdq8;4-C8PeC2Z}$k!-oY|_(0OhCl*ln z2$LT^-wA~ebbbq(UdU@-2~$$O&LdF1&V!V%sO1VQoqS}0UjZ2b0Z2QC`TulA1`dIkZ=mrv@cb}GK4|_IQcg1v zJ^z^rxeo<&(+Gr>sepMAnO}a1LUCz9L4FbFLaxj_$h|Vz;H&nE^$6Z=2E7ym@7-zO zTN=RksDPb<<1!Vr`&;d}$0xSIR(cfl(v<8kl_WEsFU?(Xp`G z?I^kchI9wO0|#;ok%AU=t|6jXfRJ_x8SGXnP}`C`3>vkE?Zrhl0a9NxFtmcEZ;&KF z>XEn%pq30a{YRnt`oYsn2p#D9LCegcTb&S!7#J8LK%4s*7#J#`hJ(}~ZL$OH+ykrE z1e4(H!Vngeyr2eQ>K--NWEGTIMZmu!F4F)-b7qm|pDi5QUf*giW z1`}pj1Jw_`Rs?K6C`h2&b>ZsdAp#ioi$FKtUjSJQ+ULjso)7>HRKxXx7-f76pg6$L z{{pnIl7WE%wvP{_7KA~qDP+2mhk*gQl?x;c$2&j^F&P*b7JwX$WIs$lc($5>f#DGw zWUnhE7~t9&BCH|o1f&SZ7JfV&*!AxKU7E+hz<{hD%{GG6!0-a_6gX)ACqx7k4WK=mpgrCY5eO;81hF4o{{vVC0vQes zKV}97Hgx@mnPK{2Aqp|~fg^-w04Q`Z4kV4mg2^#(&_d#`lsRwajieUnp1k^bY zNd^XnIu_8fBhXm^a3L@Qc0WxEm5rd|fvJwY8yAPQmk z1Z{|f@Yg_T&^gI&zVdpr zdV}u;VqkD!a0kt;A|1)h!Jt%`E5N|W!SKNIvIvQ@WFEe z4E~@KoAMYm?2A%!HZth?7p3MTx-jT+nK1s!pOki$B@hPmyzuW zBmV@3-1+e95+*a0@Y^s+%Ny7*Rq^*R`10E@$?{JC(+d2P8N8Tmn0|7`gKk2OFU~bL zh%e4a%PFqROO7ush%dOq!mrA}#mvCUufo92EXT&L%)rUi!|{&_}C5%cK`3o7X_@fwXIP)0!V;JmNvl;oL z8EjZG7;mDSd3A$@TZ)0%kYPUjz#t}OPX2yw#!lEZB&iJiehh}}s*L>p3?{6*Soi}N z44Ew$*CUPuvSgHt2H!Jc2~uOkZpFwSz+lR1&B!0fV8m?0_>&9gaX&sx{AvuG97~z` zH5hnU3Yq!U88}&HFvNq8uA0uk%dg77&N+>NU!8%Abt(hD8Us7i6b61nPDcNvw4BNs zP62jx2IhPQeqRPoRxN&hRR&I`DGY{Dh9(CXb0DW$ZfE3|WnkctVB`m_{94GsAH~4P zQo;}qKHsvKfrVFwfwPE#Uygy1wUB{VmVq%)ATd6zC^0wHgMn9&K`K#*;UMS`!Ni=L z^rFOq3_gbW1x5KK`Jj_0Wds%Y#RS!v)fxGFS!DS_1-~$Yj)X)$8&i;BBK%lPK?ddw 
z1`$^^hX0JkhNy>Osxq*AX2Ll7Qj_7o0P=~JSkApvV_?XK9CWG4z`_$HBxo8Xr01E* z!xQKSx{q!%gHvKjeo>B4uqWucxMBu=F9sI=C>|DmKL#elV59f|2Htf-`teaySvZtj z%?x=MgPE8<2r*p|+KxCm(vXKi+=z!E5pwDyD}$1XAv=SR5j%sBDLaFZ89RfJB|C$% zp#cX2vnqp-p#y`Fijlbl!$Knq*g45P4EzDYn^-lO`E`V{_^%58|F4w~IS26slQ_Ss zPyowCasFUtUjA+dFV^E6{Cy0iOvhOHCoq(AqzW=0W#`|>n8}~d#l#dX!X(GaufV|0 z9xe&ah_5+88Ije5iC>w4jemlW5C27x|A*qi2jJaibi9ai#@}8hj?2va7NU;)Wg<>Y z_n7&ML>yVqGxNtXII@?r@W(T_Ft2Cfk7ID)FBNg;KOm18vgY3_ z%Eog=l--h%ONxR2w}=(LvDkk(ep^v%{!URNHccjee+DD|O0i!4f1<*I#@9snr-_yE z%ZUl_D=~1gtrp}@731I#VC7e5;AXka0!q{Tr@5I}TE&>&v-2l%GqNZ#uy=!!JP&g> zD9LlO{$=1-W#HiN5o2On!NA`k#=v}mljoNh%W`3N&{2Nn41AgloDUiJfPqDVLHN1^1G6{-s|17a9|;EG3Q3Tlk}6{Y zgYX9l2H^+}24Qar1|g4fO^}ij2?iDi1{Mwm;e8Se!V4Tgii`P}K&=y3F$Oj!hRO0w z3=F&ut_&=~48jc@41%r(48qQQV5v}$l!7zEw+Rfw9nuWKi4qJ-?974;LY?dk$_%*@ z3`%uOf*=!xDLeIgA{c~CB^ZQTB^ZSF3NQ#;a4-n>NH7S06<`qd z;9wA*EWsc=c?E-TvIK+hEC~i?eg@VC2H}Mg48kdr48kfL48kiV7=#4{7=%SQ7=$-U zFt9vg6jqR65Z)=lAgn6DAS}&>3XU7>pSh8Wb4PKwSW4 z25_L3oB$s;W(Wz^Xf_50-bO=)1P2C|2@Iu4Y#?oB8Vte=3JmGb!3U5@FfcPQ6i?s) z1(6Xr9GDUqWQ`ISN|iYo7?c>yH5phOn8IyAr;Y*E<*K9EUPf?y{^3V=A) zU>U*l5)5KC5OH52khm>aylj~;h+_wli2|QcWe=7qeJ28Pt%D!~O8{dymnZ`RGuX9) zn59>U|^fU zsMH`_%@EEm1QBLn3-1&LGnE>63>YHpM8F&&Rtbjid7=yqLK%!o4Qvt&5nf^p3_<~n z%r976{{R2~zlKTpKO@6uA!Y`ankfwp8a#~+4H^s_{~H@LI6zDekRZp@hK9AQVD&Fp zd6*cK3OIxq0$D)KR!;`O1a5{e;;d5}8Z_8gKs^{%kQR29J&X(tY#=5J=yV4*kUWbR z3sVZHwiITV&#>@6$WD+O_#+tD7cwvvGcal}uv}r_VPX(2h+tuCVqjd#%;d@-%<`F$ zzktD(MT4EUfPr}-1AhafE{hB!Se+mP$72Rz5gA4n69)bOMk8hgW~K)WtZx~O3|Krk znfV_u=<+8pu_}F#d&s~T%EWk>(PaW7i%KGspin;pVDBnGwx3?>CE0`nQfxaKqXakCs?p3fj;V#~n1fst98iSYj;I%0%lQ`Hy~%Q zaxjZ9EMPWfU~x%d;4fg%6)^G;?047(y4J?Wc zjB6MeH?oNFZDcg%XIZh4g)M+d#36x^e*uFV^AA?WP6oygOky1K83Y-{7#QC(xiGOj zV6tZv<#^8|D)1Df%vJD1CJWaB1`(F~jGPM?M0xBP*%mOcTCgxaVd8IK(&YcZ#Kydk zk#RW#qXQ$G04vC8!VwXW5UyZnoW;mkzyNZqsE+_68w0z%#0Exx4i0wa1t3voy#R&) zCQ+6J42CTF9P4Br69a=0j}nVA52*AIcT!;xbS&U! 
zQej}x;{+Qb+_8lr7}QPGn!v#FkQpL2Lxn-yehUMO8n-&gB1L_UeVoDy9ta85ms{S$P;-^f?$5IT(dN zZpmQ~Qsib}RAC4Pb!~Za7(`?x7{mp67!>t67V`>AsW346Gcl?#Yy@?bK*J|2W-KZU zhAieRODqjp_*sOw6;Kaaiz=?su!I^>KfHMPwg9`)00T%`a z2UpNRcmfQdeJ!BP`~sQ`3;_;D7&-W#F!J&%F!1t6Fu4AA(g*+#ae$6H0v$Otfr)|P z0}}&-0xJVU0V@N;237`!53CFf25bxr0?=c$KnF{KY77BR1_sbEK>}P144~t7Ku6w8 z;9kSP@lZp9A%TH`;Q(kwG$X?ke&G!j4DUfF&lxZ?Fl=CEU@%}&VP@d5VfxRcA;17S zdg%c(1H%Cp1_lS#8%!KR8h@CC9eSAyK_w3(149Y3xD5w`p(+DI7GE%ET*yv>LD+$V zLD-RlLD(sWMcA2x!O(?);h2E1D+j}6P>1^8|Ns9%^xq5yrL$ro3`$($DGW;d;u#D| zm&Gy|3?GXzY~oY;C|1Cr^c6%fiWe|IxF5xGKng2B+zJM@k7As{!d5F7gdH;2CxdQ* z2JI_W0M+sW48jj2_`zbJy~_t6V&V$~7=l3?oErofz;dAR%LIs=u!8`Du%!WmxPSnI zxEKegu!R7F(1q75j8+1{pjGl4Aj*^;F#9mFTwo9u5MTgN4_5Glr~?q*0ti(gz`$~V zK{xA%1r{K1AP3FK5^1_Ndm1qLNyh7(LI7Z?nM7#QX-u_!PEgU;$m zXkZk!JS;3Muv0`hK!QQqLV`iu;R8c3DA#-tU=TLo0HsqF2?pkHUd8~9U{GjEYyh=? znEUw{cd)QTFe<4soMGV*U=Vg_F@uF1EGL*?ya$^_kOCEC z=LH5~P@rCb1nLC_VFQr<2jFNBwhZAC{$MQ%iZS5}(3k}+NP^j`0JcLd!BPR@C6Kc( zFfdv&GP5%TgObOC03js%ppn=B)+G$`D%3v935+a@MOYRv3V*mQ2BIzqFn}nik_Exy zARbg7DDoFDg51e+fq^lRoB0DnFete_V3xqN3*@&2jNk+V)w6&RoWd3`3O6VVf;4PU zH-5e!Y(j?!W`r%tQH*5mSnlWz+w#|g&!>C1DTA~ z^H6&r;RuZ+#S8lP85yrL1xqk6Fo06W0!ZpuzzB}11&m4$^c=uh1fq?RmyfxDB@vXQ zF6i;$MSxKa{}12Z%bl|E<|AX`?T2?~6*4eACE<~mL_8}RZ22foN@+&+x(bDDt18GpQ ze#y`Lftm3GhoBrk17ndaI0is&!5^dG*tx(UeBr79C@?^ggNRT_N>!R5vjH457o;J6 zf+_;ZLJNTo9%$wQN8JSmwGUDW42&B%nIZWW}vWEv$c!3MC1p*AL3mBO($+dDIBOzs`(1m;eo-zl9mid6ak=W(cwz;8n6_*doZ%!3U1tfDV3^3yd5y_@$+x zSY!r2xM-WOf}e!}lKmMVN&Nz&FxUxz{1z+wQZ_=O7s_*o_}2pcr;BLuY$ zm?bcPf?Ie4*uN|i3|beA4H$$cIPeJ{fC&jJOyC7+1z7=ij+TKQXdFj?0TDjp1qwW1 zw-_Yw2q(aFi(g3K28%Wv;6^f2K!6+6209?X!00H&tii@1!65!XfeWm1LIaoZ1qlY0 z1hB&uE^x9WK;j2(yYPn#9H0PR;J_*T0B(~(0|!|D0|5q5>_R>MVFEjA0s}Jx2eL{B zcE$uyZ3$H;{6K(#Lqbg4rb3KEL5xF!fn$Xz;|50N53C#;L^&it#xV+8dJD4zaB*;O zfdmo^I6>xb;N-~Qv#@CdeDk#CU+8 z(St#VPk;gJNpS2aC2%W%{0nl2Z~#0Reee(iS(2azG6KYs0JU?#HhmCa;Mf4Nh-CvK zvp*YagqZMw8H_9|*cn|Vm|rk3?%-kf;09+%P_|gX4$ej^*f~15rA0cp!TC|afI-;O 
zf(u;Su3%?b!Oc>^A-+L?8QdrXmF^x~;JUhkgK@7YvxIm!sQK>T!Nhn%091%pa43Ed z*Joh7!N_OBi+u8rA1C~Giq=N2`>Pb%pli6%LHhxDZvFUN)|9Orb;qTU=0Q} zA}3@pgBmb5xZ$L5!vutoxPbzs%>{Cv1ef9mfqDkU2aLg>$yjJrhpUzVm6IPpCJAp4 zU;&l6PzQnP9|uP900Btr1yqhh8lkK|xR@ZjW}@W467iNF0d_NV3~l@4uLlSI3i>iGkBR_fX&3zy?{~p z!3kcL0A{pW6w>Md2Nk!6LjM1Kb6CFoPK+2MYBLCXOG>($YVe z86%iMS)+mpoLViIS$=?05rh=p@Pk>{Qi27XRXdo#S)l`F1xo`b%LN9G8_eK5{b2@J z+YM$G4`voHDcoSdz_NjX#X^B)1B1AO0t4dC&ZLEeI9K6shKufW17&9A@$!XSkVEYea6EYgw+ zEQ}TG%mQqntOH8l(9&rEBTEEGjN=3^;|f*w4mQRe;KHzgiKBxA*6JJ0t*WV*fU^KJb{CqaRby{EDE4rB#6AgApYP1b1*+>Tn5rK zL@IBf?cECu91f}+Gq?pUxl)Z87&ffn2Gwn#p#hEz4)9Q)K?5_(3{a)SBEbL?D_~$} z`N0OsFf1#;ErbsWOe_<)AVM57IKf&cEMOGBkO9sy3=%98m{=~bvK(M!nZXGnCor;3 zU}6qn1hFCF6Ts{Vj4TYSED$zC9GU#V$ufb7V*#7E;|VU73MP<4Suz-vEE!@nStf9B zBnX0P)fM0ztZ;ylV+9AOh0svI!sr1?(Ht{aK=KnfKvlZ|1IGj&P<<}JAl%Tv0?PCY z1Q>)9I2c4MZh%w^KX3pQ3gC$uc(aD1fr)Vi1G5GLw88~9>=&?r33rE zwT}uIK^zGP#{p74VKG}s~_z#u$f2Loh?g++r|*ru7C&;1hSrkYzal3T&1gtSlcGc}_5~Y>?prxsC}$KHwI% znXSRn!6$6BL67ACJIJmI29_5LptR$_z@otf5}5$@Ob5tvVS@*Z!j>Z7AskSn5@e6? 
zh6#*3Cm8qzm{^4!JV2vnEDoR}jpbG67+U-2`@FCJsU29R|$8o+l(gwG@*80}Dvu4mn{90|v0h z8xt6XJu_HY6qrC<76*AlPX>kpQ{f1RJd1?gWKRYL@FK4l2N=O>A}oYG4fI(qxPo}% z9tI4-pu25eTwoMt@)2W+V3YO$MXLaV_>B#W5*8W4+!74JA6765&)@*H9$7e8gnz7H zWRYNCd0-$NA;G}%LrD0=3{W0oIlv0i%QAsgd_@JLxCMtI%L5h`4+h~635>!gIKT~U zP@z158Kip#v-ATGMwSU~{7ej>aokQrl_KU+^H1uG41MejkN6n`Di|3TfMg#0XJc%LN)>nT7nh=3@V_Wwjg5w zg9@nT7i1P-P!{tMU`T$%$`D#C?8Fu*z`&p;r@){hV4}bf#LmD}B;L#h9ydIuufU+f zWT3zh$jqQpB=&$|BOAy`Py88FiUb50Kt=`jGpH0X2{0_;0Lfg?XJ{>!;`zoS_F8~} z;gKK%PXz-L$fX5NADBU22`&CVf#D&@PgO#UppE|jAAELTRBA2W=5mqgB}fk=gKH%l z!-5jFFPsg`3{UtO1gZr8gBDKyfAXJ!;S4{6JLezn<_2blSNsf36))Ko85j~6CaaV% z{0Ck9^M3)Oa#MvRJBtE?Qys^D76pc4evrLQ74sRG6&Q?|YB;$$jrbWj6d0H{vNJ0% z7^>89U1VT$U|??KWmRCXJ118=CM#Um_1twv^21aG3iih%(K{d}2eg>urLxms^`$+)9Ck7tS zGF2{62r)A-6>(K_ECfkCl4npU;tpUCVF_R`EaGI?#K@??5XjGwne@3_>Rw7z8=yI|~ZPaw<7jN;5KU;tPJI&k$3& z#c3jg0Rt$A9V_b?SR0uBF!B^Jh=C>w3If0}$?}1TRe?cK;gbNfv1=t;j*wu30D~h> z5d)J0!^WU5pqOLfDPj=6YONXkL4YA$K)HyO;fVpmM5h7 zSy)Jofl-M40TY)3vn5jvhXW&z0<)P)9j5~$qXRR)0yDP>Qw?_k3v&Q7mjE+Y0299e zv%C>g4Qn;17-oFO#KgqV^a$iJw*TB87??LOa49fHs?;(6=THDKc^sH|44566YIqrp zKqVU21QsKvBKF<7%o~_^3mEwom}LxAinuN?F-~Ajc_h!kRK&f3k?8{ij{q~T0W-S* zGqVFDBNG$z0R}UbIu-?1;ZqEZEDFr>&lH&D9h)mf7@rEHf->JCCI@z5fd;0auwn)_ z;dP9g85o|hGc;FMj#3n=Ma;AfC* zV3-KXHA;L8OjYL?nG_fTkMT1|R-R*EV&xA?;bTxKyUVwTm8p*%v=K+8;)m^5E@lw# zls*Gb6$1l9Bm+Z1(g$XS8~hBu>zSDf*sYi<*bA5f%Q*it#0aoI1Q$=LWlRhVQ49?1 z2@Hw+3@TL&tbw56G{?4L2Bu&J<`3*V1q^}ioeT^c(!EQXnHgTmGx$DWU~XXGZ(z(# zQz>Ko&%pJ8LHTgy2WI95MvV zuz;47vN84XgU)9Xs9|DSz!CW1KR?5X35@bg6BxOf`vMs^Fop3msFblzU=%ztfzg;K%ru|4hftmjSBR@|ABkKbu<_5+PQ2Bjf0{3K40xgmJ zufVW@k!b=K^F}sdfeDO)LcI)142llSmI76R3%CMZE7=(gg4vihusbppi8Cx6F)%l<32|&-VBWyaqrffBs=zJFufT1{RKq6E%65TG zo}Gb%i{}A5p8~felLEIrQw{S6c5aRf?2HPWjCssVObpVG7`T}mxWt(lxLF>sGkxHS z2IVo14_t!W3>=IfxDua$iY!h624VI9_7J8bb_M|kHg*PX#tGa)PZAgz6Ig5+`uGL7 zg^vg@vV35dfAfJ|p78;baMS}PrUGWh-^?rp%)AGfAafdTiFfjbQz`zs0$bW&C zmwy8nuL)BPR{|sB0xrG-y!;24tR0zZSQ$PsF&8lKJ>ZpOd%(-Lfs5w?6Js40$gc^^ 
zjGGvkKkynd)i8PZun90rGGE~3F<{|8z+^ARZNSDjfr)7YFJpp0<|BRvrXn^4Zr%$_ zJPWv(6ZjY%K;|$9F!NvF)l_UO-mZO-f#Ibb_%f}=VpfTd9G5`6MSKjxRqPB5vH6Te zpel{6nB@bT;uM7kY|0$P4SXL#Gtmqxats0$Of3KY|L0c_N;)IYz*@|_SZ3J=MFxfx z1_q`grT{j^4}44uxDPUO^|3QN(PvPpU|@9?2bCtE3YwKIL0~c{vmWDT5H5;mX1>55 zs9wa(%*rmySit7Uz*EGKz+hOA^do@JP^gOG0E5zh8wCzfCGlkga}Y=+TLUZO0>+Ol zLRAd^8Dbe2{u}qPGn@j&6iWdM;{t}J6CgGdJIetMMg@kAJVgxbpv?&d4xb#j7OBi& zVqyecgU2<2p;2H4Gc$-Iz$o~{fTa;^T0zjK4}uFpH3sP1tbzmveleZ^2ElI%42=Rs z;KNoM92g2u@H4O%v9kVWo6gX{3aU5+I1(5VK{@FV>jnlTR;~{WN-B#E2s0jF5Pamo zz_frV5Omt$i3SEC)(QNK4GfM9>_rUwxfp}A6HkDeQsN>0j13GyXZRTenglo)1sIYp z@H048viC4DZ{Sj5tE>qyWSF3^fP0Zpg|rap^x6Lh1Q?Y$iuD=T44C8@4461s8iWP^ zUl3R*&!AGl%KYH}|NjCES|{`wm|8CIGpHBIG7DW`+<1ncL8XFO79_iXL-14qhYGVo z0i)0j2S&yK2Id8fo&p@QvKts#4H%dnFschkC@=^;I>0HoiN)e0Qv)-@JAQ`FN)~Yj z1_hS?416D$TzQHZq!Sn!1y}@y7cdHo7cdzyRWKYw8xTl17iqi4-#7f zgCScb>w37#{I4fXW~RCgu%{Awm^0@&XNDuXt24F!VDqGw=lRR4~X0OkiM2 zVBt{^4s@+#V-PUnW=h~z<|zKaIvq5A_^FRUK&gPS^hiE~Km{AC0(0V%eg@_W)(rxT zObkgU_!(G>*)Omweo$0kWC>tpa$x$xq0b;##LNzA(EQf~rD}f$^&)0=<_TE&(P8-CC|&CD8Vp65ETEi6PQ^XL>Lp9nF4qMK{b5R zA5Xys0a4e^N*1XL%=$~5E-*6)N^fA^CiOE4wWKsYFlLCvV z^a5tBCoB!jf;Su(#H0n71FFiv9#i5eEkbrC0n7 z42E$G46VGsn4j=7tf*x84@zz>j`AD~i~Kf71|8#P=&1DPpTWYxz`^jznZYqYK@i_1swNx@Fu;fZGa4;n0O_1U&kP>|KfPqoiB1!zke|Clo4;c6p z_%-<#@bK~%NLBMs;9-*QV3K7q5MmOL64W;kVk{D3b>J4{p1{r+ zAjEDUB*^B#Bg(J9$+m$>ki~#C`#(b;Kf@6NS>_G0YzjjB2bkpf68I&V6Zn&PDj38W z861QJk8KbWlxP$ddbvP0tF4$pLRdtAL0eo#fI(P7fFbb_GlM`8^FhvM{~v-v!Jh%7 zgjqm=LBuyRhj{~UxKIUyIP(QA$I#;cocGz79oUmyD>)eogc{fw5AX;IB``7?aB&py z3k5N7sJM5&abgPKbmXaEV0gfw81%t`K{2%WJQI5YgCPfl!D1Qa2cp8AoN>$x7?gM_ zT-4be7zEkv8CVxE2(!;IXLDc>ydcQL+Q1+$oFJynRUp8ZAjaI!z&t@fm8s@{2xEdU zPk^waN*(9_3j(YkM49s$nH|9WJVi#v2~3O|WTd$rgqc5ZFdH!N1;{b}WdK$BA7mK? 
zxEKo<$ zJKq5&Yo;0&1!jH+6>h!+F&+VSAzmA1_5^0O1hHsl2BsRG1TiiH6`lpmJR3N97BKTO zuFOtbYNjl5MZ9b!r~ywD8SA?L5z_@fL-WJ0t+hxyZjRdc6n8m zk^lzA1NMLo1%N=D<)O}Hba#XCIMl_ z1~&c%5eEJbtomFDtRbLQ(T@WH{03YM{2zFD`6o!m^BXY#U&H@_)fOypfQgrX1AjE% z16KYAJRs)(I<60_^7A83ZbX7!(;V zaLXV2z|G6ez!vg^nZdDAfr<5iprH5!Nuh)UA)W(5CMqRN2ZWgucm$aec$Ao!HW>3O zu<;rSR0w^LWZb~OtRN|PAyJV@L5gL9yf71koFP+@bb>Hr0WZ@614SY61Hw!WQjW|O z694TPm>u{y4;ah8Q{ZFZ4G`ilV5=2mIjzkSpvvbU#J@m;o9BV5JyQ+)0b>dN1~EoK zCV5d71!0~IygVONxf)pb0wnnx__>6RIWVwo&|q?qWK`l~EZ}EzkYp|3S1RIEP-RqL zV4J|tTENf0!I*`0f*6N^fY6f%T&xR(g^o;MV4c7(|7HTeJbwY(dr6iAQT_`|vit?w z+Tc80pzY1(Ajzm8$?PD>7%0NJfeB>61txX}Nl|tNR;~+@ECnF`pr0$2p~4=C|72rcJ(z{GDL`oD*n z!I=MpxCK9hP(ObH$NxQy3s}Wi3gkirDwu?s7Ktouqo!O4)R zh-E?@;{`EB2M4AN4vY!Pj0t)|><>&B8Ccjp2(l?K@CUGPXkXy}pv#nCtn%Q*1O~PQ zOA+A&M(qk#0dpn;7XAVTCdLg6g6wwAYzhqQ7g%^37zE`5jQJZFxRi?678_3`R^vEX6sD42**Q1&oXzO!*lYJ=6?NH86oqDb8hVU=ZYQU|`(9$KSx< zBxv8jB+k&lz|z1dDqX-J#u^~aSfI<#z--CXz@T>EB&a*cz|X+Uz%_x1pMhDEsewVf z0o)yDWPHHFD!|C!zyeY#?$E%%pWx2Ra==rVaRLLlpIg9S$rxbG#}KHjUX$R-%n+!h zRw2NkrCK7uprukEz@R0cBfy{~ogl!VB^4vUz+Vu^%PJ7aJb{6gA&^;sK}9wrfPtwX zP(>z1fT5WgJhZ9b0~@TA{iw%Q5U8Tpp}-Kx&JbEG!_UR2z@V6>=fI$%+oHe_$jP8m zr0-x*$PH=$9rIUUP|;~nUsKk#bzl(WcVKAYX9Nw# zNrWgclyEXI73nbW9bjQj0F9R_Foc0LOCFGAPGArbQ}9lF#Ll2nB$2?t_<#$fMA*Rr zVy!r+5e^m-vQc0N0_hT)V9kDjf%QQUV*&#cgPkHf!v`x7#sEXcgrG$57^+|b!$BU9 zQ!n&`hJqv;Wfy@4hB6Epnk&_WEH-ipFfe2=GBj7JNHV$}1o4s>LH#yy2?GwM0z*-^ z0#l|7a+U&B(jVOU0}L6J19(0drac3V7D#eo)Ny=}Wei~8 z4^ZainZUs1V8kCNOd{=yNQP;Z@+0 z=bvD~uUN#u;Gk5Hl)%hCfr*icrI&4j34a3ZgAqC z5TnleL70C54=4YG7z5BavFZdL{tY}FOeJ}21PHZ3exE$>G6O6R^4lwaM*r~Ey;Nw5Qq|0?dR$iXLgF%|(f&!BQh-53^=VK7$ zRFLLBz+~XBTEU|Ffrm}#$%0t3@9Y0AkmC5jBXpu5H5}Bt5?jCnjw4|KP;1DSDW`_1 zfQ|2gki5VE5Z{Xtd=P6>)G2rFpFJP18S5VjGpI|M^Z@~Hg zEdK!}K9EH^1-xvG1#ID*40HbfFJNOY5ausn%jGTL<)4toE+~0HCRv^VG$btD&$vL? 
zIjoq0y?O@Y1}XjpY1}eQ3)1A77Nkjo2<{f}5U~U!!vkqX0R}}$#}XMMD_tA z;{rKhsf|)>3*t)bi`4_E)ECxC(7nt}HL}d9F2pV#2 zU}AT0U=&bf-Qdi>K+u$NfrOaM1UXTG337}E-24j!g*he&#)Aqdp#_3v`b<0pZ0<}O znE5y4c=9h0{LjX+fSKn2GyewWv)l!2{0WLo;8edr@P88H;aSWRI6!%i-+)t&_k$_Z z0zE;A00AZgNyhuihWrd14_G)1IQa!6jQI=LctBS17qBJqFA$Uv6boQvS|G^uQJZan zAmarY{sn@{{0`6)XjLi`W<#K%t^qf+6DuAz|5vJZu|;SU(6cZV(cb z+0O&Y4;zF;)DuL~9&s{+7E6B+Qe5UIAS|LbK_u-FH$!N#)CVEOXMG9`%BmF&!ikR< z8B~hYA22a~U}WAPB%<;GtdFTk&4H=q7(WxJ86&SCn)ryDL8VC5fkBXSf}x;%0)vQL zfG9{WQ<2JfK}H5<<^%>d17UD;N#22h`GO%MSRkC6fvHIDf*})_7YMRT$bo^8ftfMD zkjcT2Q9;<`2|ts8Fr$O8+68%s8b*f7n0E{URbmRlY!et6FEAT&G8_mrVi7PD5>^ma zWb0F4aOEjtVE7=VctgIxK%~KdLGcxTff3^*KIRSVEDlCOycdL73ycIAJ{U3vsEA5D zkmX;%DafhNA|+ptrT>Wkg3$kTLE!`J{0le}cp5nQF9v|dd_zf7C7%vMk88GlXC;`o*h%aE~ z7&2zwnkVV6u=4!9!Uq*N=^Yreg{?tOO-md0v2ng8ukw$el)*=o~|QP4G%*REAs{> zJ_A-pIaW}i#=y!b#K3I8%71{#mfwMupM8S_;{#Uq3ljVXm=r+Xk#u0S z^MQ@?0t0^o69fMNc3Yl=Jf07Xd{pu%`Tjl)1K8Z>_^ zXP_n|zfqVm0K!!PBY5g^RU;L7+xNKC## zmHC0Qh>QSJ;v-P~DOVuGC?Lk{;KFu6Seo5IjS0fybWmerP-ER7!kEAyD4oE-q~OYw zz`z=yuKI|5LK5QzA?5&KHUUUsAnCxsm>|S_fPr;^2xEeph{S;c<^(mS05vuTVL>Jb z25`prFX&-35SF#OFpb~Ag;8*UA%|iEvqP;?m7syJ;*b3qg122fccl3>7~ z`ifscopAz#-~|IN{tNjUd;-G!1^m4H37oq84d$_Y49bkQf}o`TK~aP?z@6EFlb@kV zQl9Zb4Ih7lxgaC+HpT)bz5os$242n&Jp2vjLOdHxm<}W`E?{P6U}QYR!NS1E0UGfV zWjY|k$iU9HA%VR>Q2N9KUfCB9RQWgX^BJmEusE~|$r#abg~Et3$p%WViaIt z5)fwE!^rxeg2iS5(*+@Z11Vlmb-}`@AQgTCR5Cn}6ndnYq0_YV}KJggA-H95~c z@(d~!%WjLGDljg9zIJ2A&CB zjzSe01`2`90loYUT?~R64+InwzIR2G!8IuF6h!e(?6k$n}J#Xk^F=%0j38EO8*%On9Nj4m>aqSPk{R8eNoH@ zW^z0LEo5P3YG`1bz{VrM%p|~|q9yNO7=DGHK|sQQ0W`&H$inErd>Az0S)r^1DvC8f zXel#S6to`!^<&QQGjudFC@To~^GxWfc*W14GJ`?O0bJ`U1u!R_<7a4@sp!Ly2-+98 ziorpODP#qc0lSfVCBwoF<^yb?AXnVLto6iY0W+gRh@jE|7E#3oU4lvz{0kL96IYM= z7ceV3u`G}hd>6nVsRl3m2i|1_u6sNJ%OE0tT%|{0$Pk1)@w14DtpI z3|x$T?92fyObm?76PoxE82A|&Rm24v7{Fs@4b033SojZwaZBkhVCVV3z}TS5-@wSP z|KVo?BPSmNBjW*9eg;Mh`3nsU42&O`^&a^)-e82K5P z<(L{6q%SZqGj3pEXJFTq>p@D%hfLEAd0=KS`@`Zjz2Qiin 
z+=@!f1!Bz2XS^5~o^c+Smk>Nhh_&`{)T;1RkYAi!3@!!$vJF^G||fJc_G!ICq8hq-`Tut8bBj0)`h0id)Jz`*=~opHicZ3e{zZbn8q{s0C}*-rtWl|jq_3{nys zxETwSK!PGl1>Q^tI7Af~I24yK2{4K@fExHB{0rDQ1?3m8@GoE&(>lU$z|O0{%A~-q zrtnXJjeh|fBbNXxqaPFV0cLsm1xw)Bi|&d11dz2%AGrAiSS9%xmZ}KyEEVKuSZcu8z{<}cWF(~3&&bap zBx+cq_jZ8wWdQ~r0ii@CIR>E$jR3y-|DeU4Py7{xxOo)#c@>1@{S|~5Of=cG z95~K{rll*GH9@7TmV%I&qNe5p7UmCg*aCzEKQXX@i&dcu4lLrA99SoJma-@)%QNsW zED$i1tWZ8^V5BU@sKBgPp^(7GIDx(57`Vr*yn#*FEl!tFfSFmLmNCFd@qhCIRwK4b zwgcR$pjmn)z6+pr>Y&!VMgRjpgCGOT2R;@91|9}M(DJ{B^O!ye7;zM{2`IB3SfItq zc0rigK|=9^;squ*l?rCo1HvWppb2(XW&;t%fZ2j93k4V(82AqeFfq!ra~u$0{J_B0 zz{ry@o5>-beFJE1OPBL=39^Gt>e{0s}?om4uQH5*miK~weo3<4Ss41&iN2rx3t zXAEWMUm(Ds#8b3DfN=r?(}Oxu%>qW3EBp*hGk3BSFie`Ep#ds2S(z5_IkwDH(wM}+ zSTLLEfB@?TZJ}2IOd?DS983XBDoTn0Oo>li7$hqc1DFIA1DFCqHJDNWladlsfh-RL zCw~AFFIU1WX!7APVC8CHSJ7ukFcww5z{9m13rGe1AheAnGATP z`~~9K8yMsjL1XkAc*Q3OaBxjvOH~485v>o5{0YnqTnpG7nHZQV-ZL=?FfltQv0mUV ze8&zd)|n^pFfN!Y@Be@sG~>?rfLrL%g;lH%*mx(b6P$m6lm7slq^N{}ApeGS%A5@n z{0uz0j0#Nb0c^|(tb#>z8JH9}TtTrfz^uK5Nq|{iKY^8zQ-FtGfSFgzy^=wOkAc~o zjY0kbBMSpF8>r~w;#eTWFTf=tqA-D%r-4apk$?iX^sxda#U%_4JJ=MMq~#xQ$onfW z*=<5Po0^N^uOqcjrdDA1D41JXT%wSas*snTrwcn=S`X<2=5B^B95_y4Heg^>W%w$q z$3Xlk)bR`-IrTCM3K*~)Kpn)eNWs=t!9Y`?rba=DK~7PL;j_9Oh;M7FP*PNy%0TpK z)0_;2(37Uc7(TLMJ7`*xft{O?pN)~_6Bo|Iq>nNvF)(Q{D8@13wfIoO#P`ZGxcdZJ|%bs9#eqRwz=1*D+<}2X{usyyOB@))-#9SHTR1Q@`0|y2{op`WzN09bP2L=`;9$QNW zMqO3r0}S#`4onP+41yvoybWNxSR@&E7+6@^^c497=diGVPb?Fj;lRLgfEhfQ=)hof zpMfDEkcWwZ@##$?28IO+EQ}3|;wB8NOy3#&85oopuP`tKf=)WNXJ_DKVBv6JV7bAh z|Buyyftk@{H+u#v3j+(w4$#4L`pRqv7?_169NAb_Fz`5lHcaTZu{$s@gFG!%kif*q zz{tq1%Qy|Ra$o-$`vC@Kr5XO33`!2H91OzDGfwgxV9-zEbYNgsx)EZ*AU>mkQQWeD zk%fU#{~#wwaY2{^1Iqz6Vdf|ISsEBX;up9Y7?@d7HVNHuXJBMt~u>8 z@gEFK;&u(-z?jPiGE^bcfkEg1I|JhZP*MQt=jI3Lf8f9%&e6aiZ{NVcpl<{cQhJe9 z$-n^#;06Zyss;uYC686sL4Gqx;D-YPM*|Zm zVjCFb?}8GPM($4r@ed44jOubs;tZ?}%<}IJFt8{|2Y`HifKid3S$~;u!E-KF zCI;b*OKeOzs!U7_{0&@;`re?lq4c8A1Y|9L0~eD#2PhF_L}&?5IKUvxaDbJ$p9Q4N 
zD1?Wp9h~V)c^HHnHZln@YcPmN2{5rRF({@nGH@_`5o2d!P->{<1?_9<0ZnPjGJrg; zzla$W<_z@?3`{YiOb!e_iVPsz^zX4cFt9Ldg0m9?vx_PNgMyex^j8REqAdf1kpqLiJQpa3K6PMV z=HO#yV3kpDV9;*|6^26JK;gzHZ_mKUAiu+bfknCDy8{CwpFUFqgO39PgH{8B{s$gV zBI2%O;&5Q{ab#dna$r!=;d5Z%ZeSD^c3@yq;9)$#=;Oq|VCKLedc=W&X#>bTjAlA4 zKI%*d7<`-=7*re>R3bpj0Q4Bxm>7gr8yJ{{7=)NDuJAfAiN0}Q;64CKZp@Jk;NWp# zV9;w|5Kd^|XI2NP7t(29VDwdFW?+&w)XDf+PbI_~b~T z!r;rGVt0a}76Xd|HwzO3Ge0Pf;%gWf7`Pl5I9LuavoJC6G_dHavO6$va~LpiGw=t1 zdb3~*JMTHd3(RGJ(V+95O&~&GQV=xz31&eEfjtl!*LlxM0vq@lzVe!|n!t~HHlX8i z&k2yzp1Bz=pd9ur$iOTEKku2Lgn=J%+%wk)28xb(7A5_dX8}eA@G;Lj8QB()c+4}i z4byu=BhWF=o_U!iA&JG=WvrZ#Gn-Rcez0IUv)L1LM64_8Jx2aS1}FCGto#WKPTc%V z{3#4hEDsnbz>jKvz^G81$;iO}j8UmFAT!uBKG;m14SY~@H6woMEIf=CQI2Mw!^AAg$%A$}vpW+f|6)!?0atkz%gi9}RHVb14VhmstVf*6 z{E|UVAV`Oq`y~VDJX-cwpd@R={F(vJY0Rq`L8mdZ&tw1}!(6}sIfgkOatt%GJAcmB^wkQ8nyE}I3=bysxZjAsxq($n5i-x4l2zn z$;?f4Eh@?{(q#C+j{TTqS4M&OZqO0C$cH3b@d=t*@$sASF&g*<$A|J1<>V)W4oUXm zLpdawixc^f_hWIT(YPIJWRn`GjPK35;xidHCn?%ds~xGrKVHJ2S|# zS~BqS@MUooF!OgaWHDO`vQ1^=2OVp^iwSf%GJhqn0)LvIC(>EUtbBpYJkpGRKFn^3t2*06_ zAb+qB2U`{+zbXSezqt?>Z;ueOKRame@NIq%$g#)#lY|)9*E90TF)*HEBdSPjF5AOQI0nj zTAe$Mk_gSPxuBK~Fc#>5)8uVM;rf8 z1RZV6sKH>$tDplr+t>hnwlM>f5(5+XbYp?9ybPc#COp_c#}Gr0I2K42WpHL-V3;Mr zz{ty>%D@0Q=U6aB47Bf3g%vEhQi4HHkPjpv0uk6K!64`Y7Epi)?37>-ROMq3Wk_Oz zoqH@0Cd)8|fq@~68FYrR9mq}x1qOxN%o`Zlz$g7Gw6IDrh{%9Lyi^=~@Ue^o1B(@d zf*e}{gQ%hg1B0dN3@rOV z#|kS1^GPtUOan0mPf9R|sewC#3g7t+7=+a!ivJ5pFtFq?voSF!bPGx_h-reA2tJcw z5Z2-YZ9NhY;$Y;fU{)v+@?a3x)&w=e1Yb%p2nq3mWCg!VFo+8afkYKgG9@$cGBKpz z=Kvj5yn&JVHKRfew*doN0HbijM<#`*JQ57-0gOVN!3+vcd=emWp#$O!iVQ7aw(tQ1 z28E6MQ1w3<6&?vd#T{4_@&&3IhWV0|OHSg9_-ZS!aC?20H^51`iZbkYZZ~1_mBfVUS{* zazE9J>XOUxJ5MXj)039v6hIN4hXrfe7fq_N*7~@n>SaL8hd`J{v5MbQJqWVD)e4gy0 z572XJWm$3<8)P`Z7ZClJ0y^@TgCUQZzktD#;SU3^45K0!2Mb5310(pf+8qo6ESuO= zKR6072r%zrPz9Y;E5P)Mjr|4sNwp1(-=#G)728Ijl3_6S-IRqFaI2jmDa5FH>5N2SwAj81GV#2^YBawlj0_3<2 zEDQ`gco~>aFa~U3WMEh!3G%u42?2)S9ZU=i8wxlW#2FX_7=%|8uz)rY1~3R``~+>J 
zWO&Rd{X>93pkgT#a{@#30S*R+9Rdu(88=uM92kNlxEL4$4uVc26)zBA2ml?V8xhOG z8~`#({D%NTP=zo9Lqt9k;~%EP0A2@G&r~ z5MThgDQyQo1H%S>2e8})0fw{#0w8N4YJUhY1WO1qFbF8HF?;~4Fc4q}RuE!fkXX&e z08wKhzz}R8%)p?r875KyI--|RSRj*=L4hHfL6m{vhX8|`gy;f>00A)u1`b0OW&=hB z0S0jk0fs~aaRvqrad`&D0I-mU07HO<1OtPCDl0R@{0sqx#0W_Sh5$)<21bZ8!Ty(G zV2F@~DViX_kk}y2z)&F#S2RO_A$Woe14GAUPKetXRQ^3NXtvFs@(;U%(6=A&^#J zVPsHX5T5}~X8bJ-Y>FBh9UKgMxENM&d}b@qvNCi5FBN@i2j;6&M&+vx#SbgNGr34Jwq) z&XU8%0M3WP9tI4cLSQvJ=l~}W3v|e`09focJIfwec_5*{z!1&BPy@CUY#hrU(BV)l zKUf$*`=uBdIzcl@pdv8IM}9t-}YK$?%37>crC= zEGIx0LV!;ET)_%b$IZzw1?DFK27!X54D68ep+IdmZUzAhRH>dCl2n$r4 z3xJQx^$=hHS-u*4C^Lv906vx(WEwYvz>ic0W^jQEwUrTk6gD*2Z-C8$3W5DD47Kt$ zC+J{PkQt!r1+0ugnoD>Fgax|119DE52H0|_2?n54EbxJqk-32p$36Cje@3<4djpwmP^wt(zwU|^WQ2s$bb8f2gm z!3B&0KSUXr4Pb_X{SCU51ypArfEf?+BQ&xOFd&5-)TRRr!W9rZ1we;Rg46m0M&S&& z7^sX_U|`4wpQZ{83(!f57Z?OSgtCJC2nt^VupdET3C;irKSF{M8Z!_-LP8Qn2x?t6 zqcGGu0r2@yAnSHB2!oOqSgXK?udM6^OyE?C=52^Qpy-Ae5ArtDV+Y_KgQQlFsR!WZ zLlO~4Yyu-VlY(u6Wzq?Z!l1MT787`p3eHuaddvWHbUrw3LP8x9nkaq(wHg#aH#Z7o zure?+FoU8p0>om1Dua|c7LYVK0ena}$Qc3*Fe@i8f|C}+a)G66?4U58%Env(wj3Jl z0u12H5&#y1MzjDtqO-9iVQk?s0ZYhFU_^vGr~qMcU>10>lpPf1pkxh+a!`qY7UfvN z0umEYe}nx4i|7f+=?WUrC=szjhL7a}gD@yoA%PAGS8$-S`0#i~ne1;j`LNTNLe z2|!5jq85Q*b>Jc}fDx3TFp9tljKZLB1*>EE1Gc$B0bH;hU;t+du+7kLLGceHnV<*> zK-|O4ApAoBT7E-|Kaky^LKkcg%*F}e!w^AU0-b6PDcC^Hhj<%QFF-7Rp(6lNbbtYz z4(xd`734rn-L<|%)5HV=x03AaIanb}vaDalGP=p|<5Jd2W7=Kpp({F{s=+}R0aVd zP#p;I_lEp(6-tymT`$Y!GB%0?ie2 zY~W+~zyKlz7&s?q#DJLB0L~sC7!-am)-iy#Y=O>>hQ$8? 
zMus~AppXS;67d_LWWvbO0rDUyW+o_rj^AZa09gx)`2!3rJHRm_z`$|2Wq0xLrTNS3{T6=ePaR)z_T90x#b76nOA^*Di1 zdI-;cV)?)d(jLGFVnDp}0o>X`5)obz z0QNTMW==Mc^$ctbFg8nr6e!=kP+*0G8928TfNX;$!~aVA;URa)E&(ftBF^14{raDBpmpIR#eGfXM*{mIE+@Jir=R z3Sb5qfcdbx2HFM%-3h}8I^Y&+_yVwM21fB80^kt-AqMiIfdJT^1K=YXK%NCPW^}|r zdXU=~3_jwZG^W775WvWi0XkwAl%AK!!Ue?(!12UlAPdn7I@%G`$Xx?9N*v^70dVUT zWbKa&P-|HpfPDw*K!Q>(q+|xi7=w)j$aDu#QNt1;0Wua;szBQHDD?_zWdYXAG6(F& z0}SE<5IzU!J_LmsEPV`2;G_vIG{GqnTrorHTu{LbRxbR*1FU}nBTI!O#AfjqkU)l~ zF>rwe$^eiCAw(af2nD4@Na+E}f(4L*2vjmctOoTYz~vx|0W@?NPQZd!fPo%u0p=DUb$8i)TiU6hx^oyhvr(fMg)3rk=pa@&}xn zKqEz93t%k`Xp=GlTuVY*8aQkPc@!MEEOVf_hrt5u4zS0>H8y}cyeuccu9?8d@0UIzsu>f4KEnowct8g|0Bg+G3(3MdF3?K#4Tp&__fu%-)cV0>=?U3XCnlz+wQ)6%d91gKz}o z))Z*2gZAIS`H#bZh2a1Lhy-OP@X)~oMwUBDpwI&~ycJkLo}IwRAOH?iu%}pLltF@^ z&K4xMVY8D3eAGNkjWRT{9ArTZNN9qc3=K^KPz?-_p2Pu;F_r}kus8&b)`6^90Iu;s?f^3m zFo4uRyakcEz`$|`YAc5WKf?hA@eFWZpCy2mB|w11Kmb(in;5YefSNTRK>-GS$ceL- zA5=fGANU74sYJ~A(u&(LDez{FT(8W{3kh_PLQ!BCW;Jb+>2M{$LJEH~sCMHtcr z7z_m%%sD_;yR~yLe9->DuJn)Lh&&S;%XfZ;hU?J`3=KjI3=C44WAkKe8+QV>}@b8mks&<6sbG=U|xpfn7n4p+%m7OMpRvh0%h+IP5*Q0K?>u z><<4J4#+crPA2B$U=ZfvU{L@e|_$dj-Y=JdEv3 z3=E;~85r4_G#EJ+FsL)!SioSwpw8IA=)lhKou7lD@UQrS{|sN`U5r32l}86S^c@)% zJm$AD`yle;!$BVTBq;0;3@_1A7Of(no#+{%2^AXV}1aK;bbzL;L^#{0@&8_%oO`Z3y_r&hU@H zMTUV(0y9ei1Cs)?5yN9P14ad(uM8{& zN9;LR8QfVb7?fDpA{Z2y+1Mll8T>x7HzhEBWMOA`&d9)^X~D#x&dSgz;lse@Ucm5$ z^Ao#+9K#lQ24;41Rt63MAMmk-3U}T$FeLnCKk$d)4+En{$FCm@OahF_43Gb_GyG9t z&-#2!JKMY6YIfR7F86+4CqS=Mq zBbW>sgtiMX3H)wge0W2iK}?99wV_GEN1Q>NgNaFl(cs624-G6qU*s9sg$}TSYyh1K z2kIy=%N{n4a)g;7fQ6-lg~fw~rGZ75e+COn1`AsPi-LmO0!9%L1qNmY z7GY5V27wO;m=X@iGcbH;pUj}lD6)c;#ehZeBLf2?YXb|51&aV%12d>R`o_NCKjQ-i z<^)C_4;F#H2N(plFfcQGU=I56fn7-5fsvVmk*$GMfMb)X@&}F?pe8s=0~?C~BM%27 z%K}h_05KdGAtInt4^A*M2rybQe3;GwI>b+`#etQjgOvru;V}7P&mh3avH-*t{@cL9 zvx7~6K}>@s;mW&>>$VZmn;U8m(3` zI|BnlLjWtwbI|RIGOS$T3=M)CVm|U8`@?WUo`a2bIYUD|Lo_?uK_PkpD?nPGq5o)U<+hmVP{Z}U|`W;O8CH@@QmTi0!G2(6$~sEOalBb*bJC+%nmRmI31Cf 
z5N8QsyqIIg&HA0cfk`TcjdlA4#*L5I8~!mKkeA?K-EP2S$iPr|gF%2@033ya42%p4 zOo|Nd6$}g(3=AGj4_MjC85CF!#C&8|`*)xvfk}y-v4Y8vov~vD=;qY}3elh0lm0mz zk>}uGtzcDP@Y4WK9|$lcF|cU(ec@N&Fl1nuRKc*3jm@250rP<$cu0)DI<3<(B)ENtwICm4)AvL`4oEaWI>Sip23hLyRUgV8~sg`MdGgTO-twue{b zIXKwW85)?ReB@c%B^Z^=ITRQe8(0c1*fX%Ou>Ol+Q)FZ7)NqtxV3<6Cfw6(nxka8q zJ%GW1lYu>O27@672b%$dz{dyNj17zo3fztiEbQzvH7CfJGk<4nV9qvMz|7FVEv#R_ zpuomvCz!w>qt3$4xPU?6YXXj;z;=UEiG%q7(}FMj4<0jIuxDW7XtN0vh+$__ zpTVfa#-_m}An(8wTqDE4&ZfYm$X3+Bpv2C0fRWLGA^CL+Dz@s|jy#fQX1GnLOc7_BF9tj2oeun2v1`LlsIIuD-0O@0TZ=f>a z(F6_w-3CsF1Mv(32@HV(KJP_7{NS3#C?Jr|(7MEybIToA zz|OLQL*VTL76t}R#_b{uJ2<&kaDYxVWZ>hqX_05x!3i#|nQRU~*(?eCECP&d9E?mW zI9fh2GW=sa!p;D$0vHvz7#=V&J23LB;NVZ-=N0_Wz_P@Sfn89Xp@BKqYzGI+29Av% z85HC|?alvU$A9oR2#C2eGVmG-iXG=**!Y9N;g7-*W(GmAb^!*K1P((6F?In4h6ENt z#s+?d2u25n#|(lmm>HhhD>PIVFftzCcm5*JAbf*GfSrSZ;Q>Er4a)-1;GY2d0YL@k z>}XSg01E~Oeg+P|3;b*f48rOP3`z~^3JjVZstOG90t>jag~b*yvbqU z_JfJ#1tSXsHxHP>zkpj#UtswH?kpY-MkWT(CBXuWEI*i7I2idCaGS_-EZ`Op2;gLR zz$duq0ka{CK>H7VApuDvM;TUz=`VPdJ~AY*2?)(#Gf}wnZU&oz$Okn^(D`Ld3<>Nk z5)3>UO#BI~vHS^)%;M|`j7%JStO<<#39PQnV2M-~4o02?R-OzdmIPK70Y;Vz(D~c^ z39Pyd3S2IZSL7KAn0PXn1U@lvn!q_97O*)SkY^B)V6|bN5tG5hm%zwynvLxQJO2ST zSN;S>ekPFFjJ~W4Y|I5rYz=};4s47Jj0_7nm1XLJg7$jLVSPegZPiSBekPzT@W}guw*}&t#FD%aZfS-|pdH)f4 z1`z@F2Mx>}0S}n|KM-U{U}AZ|BrLjsoq<<~;RUC|2lfTF3z(GHMIUeq2z9D3u3%^4 zVC83E5tip>V9}J1V_@N7W{_eMVCBhRQ~Jo{z{}6T!YR+jz{17dz#%XCfYSzat9gSc zwdspP-C`q`;Q5dJGIQ zY7-u5FfmLJWmv$RLz{JnMFDJx&fRSH;kz1Z$fssd?UxATJg9&6xodTl(s{tF2 z1{>o7F@6SqMrI8rJ_io272=?485sDv85c0{Gw{o@XfP>#WKNLdv0!J>U{d(R?7*kO z&tSmBV!_U0z|6mZlTW@pL5|z-1N(sm{EDIiGx!)5uyJZIDZOP*kg*h!Xpdk4wPO_+ z1q9}SU7{w>qQJyOW@=ZVp+fp@>T*Tqr6-KE3fJi;RoVM>>@kFm4qY&1Pwp3CnPW{Fi0?n zISI%J2)y8!!2EwYBLjy(%(n(%Mg=(qj(i0xzM405~-2gDf|7}*af3H(?g z#&tj`L7qXKv7Nm_(NGQ4`E`(FY-bfYaDh5?Vhbb<*d?ZK5I2Yx7s*gIc6l%6Ah%#6gMt{t3wZ`9kpylB*Y_$S zFBliNzGq-yC}&_;@W_Epf$@q1TM_86JBP;%7P1CD%#s}v0zwY5j0c36!ZZUtEZCXZ zZ%8@uGc;H(5E4Ahz^(FMML~tZJ5*u9BLRkh141{V{~nNXJi^4l-=t^k@QArWKtV=j 
z1B(LV+Xe;&2GB111q@4md|*)d{{KHe1B(WOfT4qm!-04fnFCTL3=D5H7?jv$3^ z*ccn67-y)kaIgzA81OPWFp3{}AjT}gz~aFy^szyjKY^D=l$n8rg+r8ufrU|lgIz() zkc}Z-Lz+cH+2Jp{!XF0_mK$usTpB#W+y@w05=4bLZZHaSzF=fkU=?CaU|^9@=5b&X zewF~b-i$ASmpg$|QKozb4`^h)fstVW0}}%iXr-rz3X20HO9cZ<1_O%&8w&#~&kQ~m z16D=`VX+eqVk{PnECJ$-4JtNY z#AX~|;PGJPPY~lZ{K&7=z^ue96QKBj<-_rx|NrwdGRQ1vY-oBQHQ~TNhZcDTW@&R) zN23@v1`f~=p5p<=8_@}@K}VPw7z8iyO=SL1{{Pm0env?Jb(IMV7KjN*7)U!YOjy96 zq3*bVne8}e{1nvt-N76@VF3%9I|l>90?N6Ft8n9 zW}d;Ep!S{rfbfk4VvG}*1wJ{j2z*IUWL&^tz$mBI!K}pi)`Nkefv2nC9|P#f3bh%` z!g3Rs84{QzlpGX=p+(np2`oIGGP9v>MO{8!LqxWUBmLxW`kGiZC>19d_3 z1#BRt6PTGCgcuGmaLr(5J-}eVc;F9%G)UhG7J3rZE~>AU{;i4 zpTVre%rt@7@Eg0*1F+%=%mTj;h}axqW&qpB(!dNdVgl&?cnE_*fmxJMfjQ9OF}slY z3U(I;hKAw>cF?ezFmnUDC4;bf1G@q<^8#iA4-0k+HpT{ahDK%v2Pysrc4c{$26h<% zm3#({26l!O?EDSv94srC7!{aoj>t3I&}IqX;cs9!5^jO9U54p#vks1AS(&CWnuV2mU#*u}om*Z(w5LcMxI}5-(t9Il##8 zpcu>2!OZkPTE(6LtQ1s8Okf7x4(Gtg*uX7z;{gk3z>b5F{{fc}gM*TgxCJ{)2eT5h z_yT5uuL+7KGvq;bM}PzkKe9J47_mUiNdRRRo(@nlgorRb-~wC1(!uP+%;3%lI*)~w z=>|U=g8^u=gTa7Ffq_XvK*{C3hJdh(g8-WX15<V& z<}--@5MbFMAT0cWZ}LZXg?|D^hWKtP=FfdGpKuYv%d0UOf;0X4=84_KA>_&j)7G#FSO z2rwBius;wG61c$1w}6GAos-MKlp#TwKCbtc(rJoDWn5!*?*VE?{AJV9f7e z%E79@t018MKvg613%LSW|j;N)&mUu4g7-q4xEfaVkdYQ7cep?FtRp?GIX%Ay%6MS-~v^g z8(8=m_>9;e@PI1I2`nrLTp-2{0hSG*o9cOX2(V-rvovsm#1AmAykOyP5as87pd=-h z!D;xBf58DZC1$Y=%sd&K0?!&aZ8pe*%IgeH76(qAA3Q7>oGc0q5C)?H1A~JU_YWQ> z4`ySB#~&I*SqeCrUa+%efbM=WIU>(cz{#Y+!3NgCQozZR!O6nF$<)BX&%kNV$RH|k z-~l(&1S>`kCIjZ|=m16r2QGoX3LH!h90IHi94s0vECwnJ4C+n{Z{!(%NEqivvt}?0 zvrG_Ve4s3NLjhEDI`A`e{LzpWW@%s+`0pSs(6E4+L4boV!HVBOT7jWKjE%vLk%3>7 z{QxIV1E<4BCWSf%X{H87wi}#GD;U`ha55bLvtKZR^6(8VrI7b8xY#(P85y{YTI?Al zxIryVZsU;mVgihe33kq3=gOIV;lP~hPLM&}wC z2BrvUVetmgkhG=JTYd&MLw06&2T=!phK8yWjKVSn(u@sEi8tgKq!_L*U=)^g04+k` zVoKofw`X9N1YO>=K#)IyBVMpwgOfjj!<>nMjpqalKLeWxUjm1SWP=XV0~OFNn8XR( zEew+Bj0}wuF$#hV4Ll4F)b*J~6F3+RZ$`g)!0jZ*z$Wm6eG^+T{@kE%YmPPL+=G6V*?w*0TxCH27$~Q5{xHgm@^}B7K}>FQWDID3{vJC%#03# 
z0XO71q@)grdojo`3vn4FIJ4<^(IM~1oja?Z3Ffm*ZW_kePw|wA#@L%DNJcEG?(+g&{ z6P)}9I2HICgqc_@IGH%O*-mh>98h7h;O1W-%)op=h2;V#&k0V(1)S^)gau_@Ffbh8 zWJ*wFSs*MdA;BpuzJpU(Yy+q8-vz?LqBDerMRsrs3vb{AUF|8fgHu@WhKjI21Lx## z><=C@Y_VrxYTy)R*ukm9&d?yvpuo$}VDG@sz@T}7U0CJ@pN!N3PUa8F{0yw+g3C8B z@h{+PV?M#mzJOCuZ~-T%Y0SpEfKy6r1{=!+HitLt3f~mWSQc;!vCQBUX;{I@yMR-a zWdWxE+XY5O2M&e>oQx6-d={_uW;k!;V4Kds z!K@I?!mcb~x^ z$e!V-z~J}5QNiI%gNvYf0R!j;Uqb;uP7Vem1HX?7N(vHO0t^An8h-4o8qNwD(FP0# z9rX+goEZ}w1z80ct|>%waJ4fmU^x)O!m=4uSc@pKryp=s;$pFIP~zm+;3)9Lfg$mT zJO?{N`vwNk9dkb%9n3k|nH(G)zOXa=PtY}FXK&xZprpi2U zOmJp+pvtyDQHg`Yfx&>4RrHFmpv(cMpd<1O`V0b2it5ZO7#yRSS<@dl3Vc^!PW&U! zz`{}{z-Y+A%e_(fDf4~JA zhn*m(z{;Lk?%?yDpJ8HH!;cRiAd1||)w02?Jn4+WElJzof!mT zSUX&m*cdcGlZOHh4}=&Qn4NOWSeYZ3m6(||m>3_p2$8K~@X?(t$iCn? zLx~Lo!w(i=js^~f0LB8)9M>oQ1rMB!IG7oBuoyCPnXh0`;$mLG!r;J?1e$STWj^3x z$i}=p!VNSMD$MBMVR$57LV+QXNh6w-HJ!mtfNh2Aqk4wukL+##4$J^uoz-puTFIxN z5Us$#kio*xz{S9zYsAdPZowe1O^3z|T>>fmJ#B*#Q>=_79v5 zjE;=#43!xS0t^=z4B2PIJdkBbU{GY|{K0I<&%m_6ge3vg;9_V1-A2vB!Jrf?=n!DY z%8=dwn&D_*WODE_&W`@zAS=r7!&R^mBmkOd{V4w6fU7X`4+fSOj0_pfj0;#pA3SIH zQ_sN2$l%7ff`LJRiN8TXpS6LB`2rKq0|tHuMqZW%CPP*ha|T8x4oy*w1~0}2CWZz! z1_^J*A8t$yO#BBJOi@o0`zS-gX=28!4T|*`4lVr4m6}(gkeQc~TA`3xtdL(?qL81a zkd$AVm!iP%l^y$8Vl`_76}lN5K!=Dquld4|nV6GVm71bZo>`Kikf;DYMC=1Q_Cv&y z7M7)?>WBFG=>M7{svnQ{@Gu5ph9u;(!yN1s3^dglKFgDObeKVDUa}qoqZosMPiCGN z!^IRXm(0ZUy!_&l%;aMDxnT~VW5diCKQSpJrlb_578ffd=A|f9r55F*92>?E$M_8A zp|IFi5hfffjVlWn}>GPBnSh z>Bzvq;KIhh~sL5{t8;lRL9(gj+? 
zrp_RsxRQqfyu)1J8#gz|OyK}YMwTCm3=RzPN}$!4SFW>xgcTHY#X&Z+RDv!flUD{w zJ#%LVNeQc*?h_IL^#vH@LCs>pP3jzs|2sLE7z8Zl`!V-)gRF93V6f}uabQ#s=aOUK zb6^k-a9|K;X<%RwX5w*RU_1!QObiMkhG0utSQ(f>r}PN&I55Z`a$sN)z5zO_i;>|n zBglt_3Ct|&Osq@{0uG^x%qE~M(x8F{EGb_AQl8!}G+=swnWch(QI02`fq`Kg=q_#nj$9T{RA1l&op;5=;K0n| zz@TpkI{ZoLhO-5OQUe=ka+3in;lSy@z^oMEWx*ieU~F#;WV1Qrnk zonNL<5o*W47Lf`H5CP&1*3=Hvt3{@fwmW<2{px!rw*bxQ} zCI&?%MhON67Di4c2Bn55eg*-DSO#VX1|M0-NmX&66RZRdL}@WFf#&sn6d6FrqAAT{ zRb~)mdlApZ#2|3slL9C^ten7boWaMOfkDcFL8Xe_fq_96bhMMwfl3|*mWas=4h%jP z3=A3$4Ep`xuy*JMo#qSLC+=g(z#!qkpg)__fq{jgJ`|K^4uC^W-im>dLH>*b1B(K~ zR5J#)3zHcd7<{Z57-Sn5^tW&|Ffc0|m}kep_5)-*_*6TU`Jh9uF0n8%F(^4K=3rn^ zm;$odmVtq%fkBA>04uKplK?}0B!iU_I|GxC9RtW$qG6!3t{4PY*fVe*VDhnNVBmIO z5M2$rjFN$ci9zYWYAyzj21a>@21W*6yvzz`({+$RNUE$-vCSpdZQxJ)6pb1#&hOqcpR12@eB9 zFX+@~s%r^i2EATQf zgxmc4@4(BzkYW4pKWNi+lkLC%YlImX7TNy$e?pjn;gIdW|4)P&7#`XF`_CZ4z`$hp z@4t-*1A~*@zyA>;3=C~{|NfVVFfeSl`}co}2m`|>yMO;Th%hjy*#GCzyAk#85oXx{`>!cmx1A~=fD39d<+a9JpcVy;A3E5^7{ARfscVf!t3Av z1U?1^ZLfd-7w|DKSb6{Ze}a#J;h6Wo|1Uu5z5o5^;Adb^^7;2)gP(!H(dXZP4}J!Q z3BLdSXYeyH?D75gzlEQHVWr={|7-Xe7_RyK`+tU?fq}vQ-~TuK3=Dey|Net6le*>q z@4txv0|Q6EzyBcu3=CTW{{623oqrSf@Bb1|dq42s|04nn40l5R{eL6Cz)&9c@4tv3 z1A|cbzyCIZ3=BQt|Nf^4GBC^z|M$N|kbz-e_`m;41Q{4^g#Y`0M38~uU--ZOPXrkl zbRz!!XAxpxaEkc%Uqy(4Au;0Le;3fA*oc4sQ-l~8-bDWU-y_7ppcD1){~93%2A`;Z z|IY|9Fyuu2`~OCWfuSer-+vxq28K0J|NiR;Gcc${|NHMF%)sCl{qKK{Fatwc^uPZ- z!VC;WG5`L{h%hkRi~0Bej0gk6;rM_5|A;X#tWWs&Uj}q?Lejtg4&n?9rAhz(Cx|mJ zJWTrczd@XVfj{}*{{`X<3{lDd{)47PbCUo4e<04lFfaMvf6$ijFUkM@Ye+CKc%}UN z?<2v$kdgB5e~ttLLr2QL|2+~63@cLp{a+)&!0;gD-~S5|3=CYU|NeiFU|@(z{r6u) zl7XQu_1}LJNd|^FssH|mNHQ>dNd5P}M3R9)F74m{8IlYPerf;y?~r6*Sd{kf{|!k7 zh6`!`{{N9=U|>l9_g_Yefq^aK-+vn^1_rl`fBze#7#Qwn{QJK^ih-da^WXm?QVa|W zGXMR5BE`UPAoJgU7HI|syR3izb)*>>+Oq!r50GYHxRmwpe}OaugHrat{}ZGc7|OE$ z{of$Xz)+O)@BbBP1_puLfB%0-GcYX5{r6u+hJm3d@85q183u+0dH?<=$S^Qm&;R$I zLzaPoz2M(}4Os>T^MZf>J!Bae{0jd4&yZzcNGSOCzeARRVQA(L0atsVbrT_jH$T2WXF8%j^f*b>b 
zLD|3mJLDJ`9LoOvzahuK5K#8-{|`9^hJ9uK{>y+CQKgz3mr-V5c-;8!zl|~jgId$S|1ru841P`j{?{loFr+m7`#(pSfuX1A-~T%adpDhv$4ZU6q;s4y@vw*UJdqr$-8(*Ey%jS2%pPW!+A z3se{wcDMige?WzSL9FZF{}(C@4EMVJ{TEPWVBqTh_uoL3fkCGG-~Rwr28O)ufB!2~ z85nxH|NWn#%D}Ly```Z^stgP#y8r#Zp~}E;ulwKsAF2!tT0Q^%OQj@{O`Yw1_Q&M$^ZVxXfQA=o$~L0iv|Nj$kc!TmuN6BteN`n z{}BxahC@^T{ePmtz@Rhj-+vZO28OU{|Ng6JGBET^`}f~PlY!ycw159oG#MD=r~muk zqRGJEGyUKHC7KKjWz+xtKcdON&@uhr|0kLZ46~;H`_H1qz`!@--+vV?1_qTG|NgsZ zF)&=3@$Y|%76XIh%zyt|v=|sZ&HVR&i53IHidp~uAJJl9V4eT({~Ij^28sFq{_|)v zFszyX@4taI0|U>3fByrt85rai{QF;^&A?Ex;NSl#+6)ZS7X16aMVo&!Ee|@MzJ${|dSc3|x!<{ddr1V3@J^-~Sw41_p*D z|Nc+VWnj=+^6&o!T?U2)OaA@8qRYVWV#&Y%UvwE5_AUMQUqp|A;la{>|4sB57`T@G z`yT>YeY5P}{}Me02EAqf{!h_kU~pLW@BbD(28M)X|NdXmV_;}l_V51}JqCs)%m4jX z&}U$fTJi6{hdu*?(TacnGxQl46j%QH-=ojKP`CEq|1J6q40G20`+r5BfnmeifB(Pe zGcY__`|rPq0Rw~Nx_|#o3>X-^*Zun+V!*&qvF_jh5(5T??sfnEPcdL%c(Li<|2+l_ z46fV%{eNP>z_4idzyCaj3=BDY{{1&GWMEja=imPjLk5PMd;a||F=Sv6-uv(W6hj6E zhrR#)Z!u(GsM!1O{}n?9hNXM|{r_Ufz;Jx;zyBge3=B{A{`+rY#K6F@@8ACrBL)Vk zegFQK7%?zd?ECkBiV*`t(!PKHw-_-ntl9VP{}m$!hEx0g{r_Ub!0>M0zyBh}3=F*c z|NS>HW?*pG|L=c@F#|);{(t{Vj2RdT_W%1o#h8JiY5%|fTZ|bPrtJUs|B5jK!>0ZJ z{(muMV7RdV-+vJk28LfC{U!_yEC>JnPcUI%P&)YUe}f4FgUP{v{}-4rFjOD>_y2$i z14G}zfBzqtFfgn+`0qc1DFef)ga7_3m@+W@IQZ|sgDC?8*P(y^6HFNxst*18-(t$Z zu;$Re|4U367*vn^`+vlgf#JiEfB)Z@GBEHR{r8{8jDf-C=)eCuW(*8PNB{lzF=Jpj zdGz1^95V)n$;bZvpJK+q5O@6F|2<|54Bf~7{l8KgXPbLHWeL|9i|C7@SW0`+vurfg$L` zzyE*C85ojI{QECs!N4%%#J~SG77PrlPW<~HW5K|1=ET4MH5LpE_dxnB7#RM5^jk16 z1fBf%|A7SqL(0j2|3T+`tvdPdzk($L!3{#vSTZoQoc{O!jU@xanbZIN^H?!3+&lg6zm63H!=Kat{`*)lFz}uE z_dmyqfkEfYzyCc}3=DQ>{{3HL#lW!S%)kE^tQZ*focZ_v14#XufByxn85p$A{`+rW z&A{;Y?7#mJ)(i|h=l=b#ux4P$J@@bb3~L64hI9Y^@33ZIm~`&n{~Oi}4A;*6`~Sn5 zf#J)!fBz+H7#P^j|NC!Y!@%Hi{@?!y8wQ55^Z))=*f213od5TKh7AM5uJix?@33KD zIDh`%{~I<84DZhW`~Sm+fx+a$zyA`p3=DY}{{6SGWnk#J@b7~Fs!=x@4tl|1H+e#|NckVF)&zN z`uD%Wj)9^6(!c*R>=+n2uKfFdz>a}o;|4N{|o~I1Hb;i|AL@Clp|C> zGY=z30Z9H0NS^_2^LZE<7!Fwf`!5Sp2eUr}q?-Y3zYZew=j0_By 
ztpEK772ymp{V53f97YC)`_}*diy`axMacIsGB8}V{`X%PS>6jFzlM>4;e_?S|3b*} zo(TCfj0_CB@Rz~5J z!0^E4-+v=y`AmfT7A6LUPd5MlgYIyGn;(Xdzrw`8@X_Yqe}7OqfSJ$i26iCms-iDU z3=9f(|NiSBo6i6`5s`s`gOP#3!0zAwx1a?E>p_(mG#$A%Gc(;s*kQrU!0;8_jAVp- z1TzBzlRa)fRxmR#@YrKdE1+-&bt6A85$Pz~SHjFp$6B=%d6flLx{+0~Q8`IER1#MUlhK2OHN)VcRCH?uH-b15k8ePCf=P;mSAAJh$pryVzleIR+zVT$~2|Nbi@ z`!5P1Z@|jHAmoO-3<+RmVBm54_aD@(hMOOY&|d)6kE@KBzzT66t}+62z?_&H?zDe_ zm4QLj4O(Wy(taAmeo(l-U}j)Yar^f_nvsElWuYEQJTv=(vj{j3O0Y37M7aO^4_ZY5 z_eVLz98fq}fDW2;$L)>?0`e6E^Msv&;gROQ|1QXS86b5UXiE^6*1!LT z$Z{FrUxtc z1t5Pla4;}vX#M*S8fIaD)!9gO?g9=521PC0`3ZF1C`i8`$bB&V(DDdm{sRsM21%`d z|I8OT$&jG@07?h(TL1p<1LcS3+Nkc_0FG{Oe$e1#VEC$yJKuY7GBCW?#_i4w zsD3$QcZ7n20c?H;Cj-N0?SKC}Q0lCDumeE;Si#A_5Uu;~Kls`LxIY}h27u&Ga56Bo z>i+v5jH2HQuKxum1H%rzfB!*kH+Xu^25Se~&%wpOa7GWeehn@LhX4Aw^?PtJFbEs` z`+pVLe@th<7J%HB!NtIE!T_z#g(P3F{T*Bk4EsUjxG4G`gY|>**a|KNhHlS)|0|LG z2Wod@gS`kg;|v!A!vn8>|9w%+Z~>bEa>E-g1_oYl?0EF^Mhk-%J=ih(OTpT=2 zLi`F&*dTu_;9+2Jz+nbR{s0dHgO?BX_6SJ+0aQP>J_E=NKe!keN_-&gFcw}Ue;~En zRd^W~>U^-*sUSOCco`V_e6aURKz77%GcX*2*|Am~5mrccbnr4ToWWzq3SI_=J3jyZ zvmvK{roZ681i9e^F9X9NU$pWcT3>_Q@Pe0tp+xuJ|8P)Tz{*KbTMMcG!NbSEu+R73 z|Ct~)@G@os*ba~xI(!Ta=6?VFgVuAwWIA*%tI(*jZfK97ZA1wR8rUEsg}=h4+1gs1`e2Q;^& z6@pnw0(L3N$2DoWTgYa+@~5dj8<`P%>f^MUMwr4^)pCTOmz7mqw> z4y#%l8g4NCO#Wbpg4|gmz`(E`kNznF3=GGE{{1%q*$<1Cbg*`C8rvekz;GRp{1pKP zhUY>5{);2)_e1FaBEZ0KF9>@%4GJe7eg=jMLI3{8g8DFeDu{3b^Xq$;SvXFnkZgEng$Z!0-}}{2W0BhC5;Z{yQQ2!?l^Ygom92Wd0t|++G;&GX9Pr z1H&IY@_z&w7=DHQ`>%*>egq_(K;ZyN2fxGq{htfUW0A^;Z~&!Eq&yFrLo5i#-KRmAaRG7zhcE*}Rm8vl{>b`4{sh-Vkg)+3K?a6p5&!;2fbvhA5=wY5 zLz-IP^cf<|z;G?{-+xWyvYJ@|oWVeDDG_F1co6mPKWKamUXC&a!`(VXn1Mki8n^x} zpt;u=-1@HwGcY8@;I0$D2s1E*$KY-UiHI;TEQ$H|e+P0{Fs+BX&qRcQ!6p{_I29-? 
zLqr%D^ke`1??u)R3QMH2xk7}2VW&FoGG>Md1H%e+XnPt~h9Hd*?GRyLh``|nP#E73 zVPNoy#hoU9K>dJi902482~h?H?bv_+ThRSb1`Y;rJllveF!aP?uj@eh2b2y#@~NP7 z?WBN`UYNom=78*~5oTaG6$@zxv8W-bgATla{IdddAweu!AFdi=21x#dC0->bAZ-m#9=QBK=K-53=Fby|NevS5P_#z*Jfr=j~gWKA;!R96Zh{w z$lq}JXoUMR#26SH;&7+Q4lxD>vpDSiG?4i##26TA;{N>yU26k3-?f>!6j~OZ5MyAN zABVfFd?Ciba47EIe|6;ej6=AOL!5!(R2=THSq*Uph68c`{%auXk4NbD5NBXG5%=%^ ze3U#t4W8#S#2FYo67cw6oPohE;opA=6#e{g{VT*77|as?{r`zEHi#DDq3;@-M_080IFSg%Nom;o4xsQ+kz!zIO2eLZLFJc=6azy=+Q0wPkj;mL zaT~}MkQo_L3=FT*u-D(9Jk=n-6F-n zur338It1BuMT&vpUIzB~1iAN(1OvmJ49M6#3mcMM(E0_WMnal_;ZerF|Mi%D$_M!g zqz1H>$S(8We=AV>IwXZ~GpJC6j;|$1Gcbf?{`;?ilCR~#o&mWPbpK0C7WQ%%l!j+W zGBCtt{rmq4WJeN`9gy(71#%0>oDjg3l4A*l0{m(=WS0>1y07(A? z83qQ0Tu9u&%Xw(LfaEvGFfeH4;!ew;dyO1&|NW0bF&{FC2-5#ShJm3f_uqfeHB@l( zq5VpbynrkN!{l7ZcmP};(hLQsJp)+=h6Q-s7a+^PuqqdOnFKPwK$d}F4j%alvJ4D; zx&QtbqqrYDeF3%~w65z>F7EIJt@pZ-i#z^6>%h|T{{7EKF&{EP0CFE_ec0B#fB!e2 z$S(&a53v29bz^S%|Nd`7kzWgz2cfZCAY5(P58v`3DCp`_&B{{$4bKpIvczub{yVAz7kjz4k?49iN; z(hA59$S@k%zY6jU40lSfmn9%O9OM}oZsCznkY`}HfkPhT&jxu0hHE8|F+UW4LWUv0 zZdfAEz`#+8Jx_qbe2**xgHY+e|64$Ld#V^B%t3xx296rAId9|{80MAY_6v^!1H+tB z-1UGCXpLn z9Fb#SI8+Mhcd*PCMTCPVqz++%wwFQS{zHL*L8a{9|8(TE0}8K5M0m+4GB9M6VGnnZ z88(Ux3`J%C{+FSf0i8bqxj8|Rf#FEmzyF}k0dP0hA?#>SWMFtxhI`z6fg%ILqVj+L z!HX(jAKnz;LMI-+#~%L-05Oneh-3FCe=F6d4$LE1~6BKayQ&<(Pvq z1H++O+<7WNnStRzE%v@LD0~`}85p+IVxJQL$uCf5U^rI$?>}gq9qv}7G4=z>3=DVi z=zpNh!0^5ncYOw$WdDQ1eV}lVP=e?OkA>|LLc|YJIDpiF(t%17G`x=DR+pi~z!2F6 zRd)fmx(P}Q41Hby{+|MsX}3|;G3|uJ6DWK@>t_wR|NXZ?&V$7W`3EWt3_ji1+npf& z45|zavE7h196WrH+HVS~3=HMnxceXustgP*-DvUS+RU5@?UyE~GBC{P#vT3*stgP( zyK(351*!}TtGjWhuLA`1KTu_0SksLs|En=D%)?{8f*QpAxYp`8s4*}s>&87VnV`nN zZ~>3`4WM<%-MH6VEKp-$xYUiid^w=Tz;Lk}cm8>x#=!6dpa0bv7_Q-QpMp9A!xcRG z9n={Zj^U9{P-kG+fyev?bq0oQc;pwTGcauF#$LXI%1_Wb@O|C?{&RxL&*S`v{0Xb` zz9aI&8+8VTIX$@35|0K0!>k_MX+=kafuW}dcU|VA!N9N?kNzAD28LyL?C;TFU|51j zevJkL!=fJCV_0W27#KF-upd%pC#3_z2ay=e2$ z%mv{79H<-wt(%YRgO*$2e2Dk}jcp*6TNRoN41fA?$Hfdy28LgKxc$0AlYxPwA9p|K 
zh9(1pTtD`H8pxd=)EO8;`=RbU$cxpTAaxQN3=H34>Q-Y{2MRY2Ee3{J6L6Pv8Cnbs z%O~JYTOC>q3^OO-4v!UD3=Grp=s%&wz|e(9{)H9;Lk%8z4s8a8ya~ALX$@@#hD1E> z^U!8sh{q$Jq0PV$Hv#u}bcZ$rgC8FG71|68b`x;N_X%wVh7dgdd!fz1;5GqwJ;R~H zz~F&LzlIJ2gB1?{f%2Az4g-V51pNJ&932LREIfAf=rAzEO!)WT4LN_-BFe8dIt&a= z6L6P>puHVE6aM`_4Qew8awF0-tSm&ze?N2>7<4A$9y62BWngfchLYIM|XyU*B zptV=;$G~s~PZ-tc zF)$pNh+g1H-0CxaV$K^cfhgO~M_Q zOY|8SuHZ5Mh&}_uu}NriAC*Y{*JogOF$sG(g8au~z`*bgkNGN~Jw20f*NH9$3=F>} zVILm`nV(|7!0;ZA|5^+f7~W07omZ9^FffQr#@$9gV!*(lHyL;Mf%YKjOvas8SqvE% zR3_uDyHpGr7>p+4S^s0mz~F>Oe~KXkLjWHAErtvXkGOEJTU!F!8#Vdge@~S3=>U#k zQ1~7(WMIg`BmcyZfnmzzfB#*P_1A#4gUc!wBL;?DIOIY3OvZqL;r`@*|2aYPI@eec z`3%$t`~hzIfVzX^YYF&&O7#_^{ z_a8LR%izW*(9h(_C(+03%BRrF;>xGd!|K9k(8lJ$XVJ{=&gYQFXW__a;K--p#HZlI zC*i~=;KauP-Y?T*%)r1g^WT3~P&`2~Xz$Dt(1w$l(DDZ={{KItKsDnjmNK$ly8ze7UX2QU*gOT+=Bg1J%)>n)S zGobF$?*w^i5hLqH4u*}4tTQq~ZqHg?uV4u-QF5c7B~ zSs567SPK}yTL1rN1X&CF?;(hJHqf2pN9lvu9;s$YAYeWLN?UAFwB9%QA{;ffnvC zN}gt9yvL~cpOIlA$g7~qG6shKvqc$s)LHcyLs%mjvROe#$beJ-|No3+IAG9f0mo3d z+;m2U&7hD0`CFDz)R2_{l&&{3GTvq6dd|r36C?)>m;a1tdj1n9!rj5h+Rw3gs4+DceH^dYe zKM;+dfX2^*@>$Mq3=H5s4-5>T!>z$0F!2{qkHXyj0DATUCs-9^-zta+ z-Y>?$0NNG`;xhdH4_PY22o?GV<#RyI`w!(qjbr!#Liyk`6~HD^m#~25 zA9Pa>dfYJ}3^am)-02-TMU|RC50Hq6{ zbOV&00Hqf|=?zf&0F=G}r5`})4^Wx`yrGqWK>$iCKxqRg?Es|%pfu>*9#C2-fbv1- z_kj2lpnMRUmY5>}mV+U-Y;;a?1BoPjgzzsUfp`oIAaPJx*AWRKA#xzn-PzenK_fJ& zG_Ryo!O%d@NY6mm(6|)LH`IiRfE41v%nXd6v#2ptFf%YQfX=l*mtbaKW&oXMfho?y zPy;KVkR+KISV6ZsA&Eek%nWSsau6Z{A(?LSQB{ z13$wCEa?Yykv-`ABd{VQf|)^(0apJY2{SM=2r+;z>O>X;v6&f!84f_}Ll7T^nHfYF zVCe=X2BMi6L>bW24MbdwLE#^2dV+|HGtA(Eq&pBFhM5^87y|6DiAyppfYygFwIG_A zK?+_^g7`4Z%plEh0IPZ#25jXwGlML=zJr+oqL~@w7#Ps<4@6v^K?19|0z(5UXsCDtns^>m z9KD?h+IPgrpun&JS{{Jh2eNkp185iwGn}_W&0&DHV?k;`_%K*KW_&%u;eOB=Ng#U< zK<$OK7eM=tK^Hr5Ld}P@ixe3_?m?taP_-XL=@qv%_7h?De6^E%;hKe&hgt!A1zqU|u z4m9y7sJH-{csW#D0xAx(cQsU822K1vR9pc~TpB#AE5x9JCJvg8U}O+t(140Vo4X9j zQ1u2-ahSaoP;nD9@iwTq1)BI|sJIQ9_zG4?`gE~{gb%ct%kUK{9t#y$gGhr;c4q+P 
zix#Lj_#6@j28L;D5OaP&`;*{(_Y4dSYuG_)K!HI38jrB>zr+qvzZ1N{4v~I7fYl2# zC_vi>;PZGG7#LvrR07Q$Rt|_cm!ams+^Gf?{|Oa`weunQ3$woP=YY6V0nL0CPKf#b z;6)k?pktH4ArTH0k3|z-1{KeSii6K#U|?W)#tAWB1I>IlZix9WK}UcyFff46GXeF_ zcp&2Yq2_@1rZX@wMDRkyZNM7?A!pNp90WSQ8B~rLpqambAELehsvc(kF9C@7VQ2vc ztIi(@LfmPAW{!{$ME!TDdhod?3=9lbP;uBe0{DzE1_p*msQ6hl_n(7`-+_w5;;U2` zVm`V!=v-xxzZ}rqvr`1(&Hyy=1W|~%2bwsm7(_e*O?;IY#9k)o_y_nb6$S=|TTpS> zxCks?;Hn2f=RJe2GV#IEe!y1Va)CRpnC%l;afo{yp#635*>?;K43m06;+X9>70`9M z44CZ-U9kC*80`}~s5p9iCJ<~6W;-NZoPj}!L4ctQyfKA=0lcS47?GF0o3vXi*JXD!^Yjf=cs|kD`X(r-^}q;iVWTT?sHO zg0?GQ?az9sdMh;Z??S~7pt+M#9%B9sX!?PLgMmE6{OjNa^b8D8YZw}#>Ni8fAL=ZI zg-~&QXvBigLSkTGcn>v499ltv&#?l9vjWI`h@X(jtqKtLM1eP0GBAM8=VD-B_@KbR zAiyBc;DOd||Bk~PPeq71lRyW?FfcHH&u(I1V90`sYb8T$1)nj*z`*bcDn6|bA`Z2l z!9^Kjjtpq=Ad)%LpyE@Z5duDcje&vTiZXV8J;NbxsRA)SE(2mO=-e6*D+nsSVk(G- zFtY_JUIsqkj)4JuRs;hB186@3sQfvFmf!VNA?BNPK+FN{tp>4Dq2l+T83NWW?SzUa z>;|bqDo>!|51{EE8mtVLq2f2dhyNhT6LmF^n*|t>py>x%9WvyA&KqYCU_j4bbD`q9 zp#Fs>F9u6>i2DyLg184-{V?=YgU-Lf?Dus+)oVc0KdfH3st$4w#0+He8&o}P93EC~ zIBG!b)q<7-Fmw7f7#IXWB|5bI3~Oi3gR1A-3o#v5Zk*J>Ztr`j`ZN0>>S5)8h$h6| zf5%Y8qoCrw(DD=Jo&}oN&EE%Ap8z^k7j*tD#5e{XEr|I(paVS_7(mxq2jNg@d7?alYxPuUmKwp><}pN z0IL1~G$TTrWem1D5cBUr{RJEDRMCZqYeCaLEPP^hv728FRi6a)7p&agi$gt+9>n}* z(0mF$JBopUK@%#DUY{01#iK4jJOryZFF?hoLmRxX`c70IWIjYKGHD1F7hte~`U`Ys z0Z2YUAG^O+LDe712B~6TfKD4Re20p|=AEF`KSP`W$X)@41<-sAOSjoj@iox$0Tw=8 zQ1MgH`VM(-Xy9t^fVddLesCXWD^CtrX=u9AxPSAKZ$Q(6jg$At-8EzOu%t>hlsbXLNpF_gH zz)%f3N1j1|!KVo#4x9fu3l+Co1`!80QyCZ-Doi2foPf5MVC_IJGf+4~)FP9mW)Sty zq4fx?{@M%`e+6x4!s^G%P;vC~|07g91X_;4%4bJ&i23N^6QSY@py3bN-wAS;js-;h zQSf1Bi1ue8RQ$v)kSavG$k7s_J^`ApVDZvs$-uCXL5QK@9Yj5}8OpE^s{RV7W5U3| z04@F)oU)+Z4D9ciGY|7%U_`JAW(V` zfyNg!n=!0}s?UH1By4_cuQdaMAd?&e`aHoYsQNn-A?Ab5aR9NtLdEl-^UBcThQZec z;+_i7fFT2@jS5oDz|amA*M){BY#Qg54fb$Qw}qH91KLi2)tiA(ah-V(b7Aq)4i(=8 zZ70C;m#7`YoCeTq#S@_E6Bb{4?6HRfn*+rBUC{V~ zg@Y7Sya-xO!ond4Dh``RhpDfDieG^CM_}Q<9xA>D+HQxn&wU*s_8x#X#9$-W-B9s& 
zM%V9_L+nNG=SD)smq0T#teso|73V<E|v~TVH7RwGKdh3*JM?z`!us1ERhWY7Y3W2nGg*l^)pL{}ZZy z2ekf$rCS3}?CSkIA?6oB)x+8+6QJU3(0&ALT4g^}9JU?@<{kkrh&dV1c!c#6W1!*z z&~_%w{8do#1JDW+bY>>V=Rct0EztZ98}|tChM2DaZO4N47l71vLB-{u;R7oNHbcc< zf+lDf7#N_*g5fPxd_J^23EFc3GRM^iVm^BRF%BwT2Q3F+=6CyG56>k~^)Ar#4;%k^ zhC@AsQ5oh~!Dq}eFfdp{#jBbj z=7Y~SV_;yYg^F`*gowvNJ77nl;%lJml3?YGus_6n3uu7>D?crv;tJ4#Oj!8%LB)40 zgqRO)7BdtFK+HiO-&-1hJv=W!)!&DfOVD982GF6hpn5+Ctz3-^gqZ&ZTE4-;tph54 zJ`Um@@Hy9@aR#XPH)#74R-b|nO$M271FaWe?NZ$!?C$gkg4k;h%`dQUXn=~hLF+Zx zvWgi{aoKAS_duhW;RIB?586+Gxkoe@Vt(pYhk8={)396w;P4RA@-uDXY&Y%xDqs9!0J0Ms5pANaZv=szXE9fJq}fm-mZQN6=!Y$ zg$e@$tbVkPgxKqYmj0(g#nJmO=}{2%cF^`D`1~#g28KmYarAcDtte3VL+nH*)uJKh zptn~8pyKG`Hq}t^CTRTv8_z!t70-jVS7B+`AO>Q-6LkC*)=tQPilevdW<$mKpyd^; zzL-!Aa=!orY<&ox-modH#kKAx}%D&7O_7sK*hWIV(@=;L)upyKphKK|JU6<2_ci^9fX9zew>K+73e`qWH@ znDY;sfk5|ffTAV`D$W3H--3>@0EzE}iXVWs_rPaJGB7ZRra;U=@8{lwicjkT>1F_3 zZwF=+rb5&g%?EQCKxdJGSg%sC`6-RIXzk`b3flh$D1$h>9_eloG{Q?X^&~`X%+_C~H z&IBDNhxLP|WjF?}L`NuyB}= z4KW|K4h>e1Kgb4|j~X^09#;;;{P)oCfteEo6&HifheE4=2BTbv`WI0D!s=O`Jczgx z_`pjB23R{GEDyWA8F>(QR-o0tsW{p{ZBX;o(8gQ%@*(!3mt(>CAoGP7VC(mx(Z;YC zs@@HneqinD%TV!u;KRHb7+~e|_5z6c3ebcAS|0`qH_1YX_zLKF7pz|y02NmNUkb;- z06OOtq`tNY;vU%gf7p8VsYMWTen1O&*t&>)P;m#SLqPY8fz0^|6)%|#;xRD5*4wBT zL(D;+A1Q^3`$6X~VDY=57<;_%l|and08PlSbgl~(KMHNX!RAe3q2g}y+zRS4Q9>)s5pB0b`&ZuGy`Hj_}o)a z{a6VxCkvX+Vdhvu#nHzT+M(j;{ok2TajAU}^Fil$fmrNS5cAQ;MSY;+J<#?KZ2WL` z754Cd16AJu?I*&*LB1MdK6*QWq?>BHe7UW`7tWX0n-wWDK zfMy#8`C9Dmv=j%~tH7`TI^GL2Ew>h84*EFibf|a^v_A`LKdgs}e}eWSpv4D+U>(F9 z^mb859VETM_I1J1+Y+eyX;AZF;r0kBj=uiHxE^A@I<&t4Uq0RdGGB-Rwm%Bi{`YEt zs26~a|G?5u8B|;Z+AoInrxruSAICyM2DJVU#JbP`3Qqxs`Ox)p(B=z+S|h}K^!Zs& zsJIujJqhjZGJJ-Le}Lu-Sa^CgLCl$y1+tBS0UE3f8=&GY(D5!9Sf5Oh8Zwq4Ak1tOjVo&N%#f6u_cPzV)wf!1rVdY`iu#PhtBORY8tqU|&5tnuc7V+hU^oCBAA>f77-oRQp_-AX<4|*cLc<5v zZUn7E0*&LHftHi7@rRmzh`p1b5;wzy23s`;H3l+Zv zO|YqnSRgV?(b&0j50aoGM{Sbkpz6-OVJXqb-OUvs8I%nyO~r(o&! 
zFI3zQI<5{4J_g|#5cQLw9Xn_u^((`escm#C( z4LXg%uwW*{UIw)C5OkRxD1Y%l$6;Xgi1sXq`b21j3BD^Dv|be|jvmgNq2eCUb%0QB zGceAEn1k-G0H}BiwElvv->{hjQIEbJH3cgE5ZX?It*coF75`8H3N^$!K$f}K!%b!` z#QfXPaXeVKc|gTmpz8-<(!wr?L+ZumgOwV?A#uzK?sR9p{Q zp2PZm8cQJNb3n@n=x{%S@KT8Q5orGfR<7nk#nI>McY)@G83Y&%pzGve`_NB8)vrcN z4_~3;P0)CUW&?)kW!S@~8Z0irunbzU!^X9GmO!!cmOG&0 z=<8%yS3uOGuPf7nifcgE^T5g(C#X33I71~=9DTjbEU0)3w4V#BceX>t(Z?ZgL&eei zSz0S0?u>!PqZ!283{g<=D`@5LVyHO!dhFvXv4`_xsQL}i`C`yL-5@LYS3&G8fX-7x zhcy|VuEK5(-)e|@^z~*Nq2ec?{aIN1{1;SQ0NUS&l^eoqAm*UYU)w;%C7}5QroIU( zj=qli${OtM{0vne4(&I=%1M^B5PNq++i$RSr0P)ddC>JUuzEZcD((y||6$@iP;uC~ z8?butI#iqqnonWt@jgSv(dVJ`)?qJaY}Y~DiQbO&g^I(@34somFzkYgqxVO6*F((t z3vFM)+WBHoarE`HIZ$!*@@EoM{1>!>f|W}v*JF48VW@iad1~1W*yAg21H_#*&~^xH z9$_0)Tm;&lfu)BxP;vD3B=bgyIgg<2H)!?1pbZtbgl2SDJ0T7#{tDXffR&%yHiG(7 z3JePpA^S05<7Q`}>e2hHrkf!4-iEeIp~KA#(wlMkcQZsi3OC4h6eh2dQgNR8Jt{H zVyKtT5FekIlo?-=n9h(~R8m}0nwF-Q%#e~=l$xGdT#{N8Uy>W2oRgoITFek1pOPP+ zo|B)Hm=m8;l3!FDpIBPKker`ekds=HnxbcbT}6CaW?p7|Vo_0IWqfL0Nl_(3T2W$d zYJ5s*Zf+$*d^{2_J~J<~1Vw31W?r^laY<zktF$CF-qa{QxuC!>J~K1k%p}n)F*!AnAwDX~!V+R@ z5Hy&4GV`)iQy?}|XuPX&e0%}O?FjHTw2Y5WODu89%yW(pFG?&Z0L4{+TUkIr znQO37JlFsuhyejbsi1I7O>s*sNy^MCW=I3mE{P=|zxWuMf=zNsEz3+!4N0vi0eR4e zBHLXJL2ggX$w^8~&W1!N6$~|wkI!Vl6>+*|;E2;hM4TQdt)!(I>LnLI!qX@|KCcX% z#!^$_a})Ct(^FI8i$LD>F^0%N0wx2TuHw`3i_le>gH0$-EK3E2eNtu~R;2*}Weo9A z!6iP1mIaxq$*JX;#i{Yh`FX`9MWx9g>p)V*i8(p>$%!TTMe&J6=@9j1ndy1?MX3;u zMSOf_J}5&l#7FsA#>c1UmBuGmRQMPf#>c0mrX`loqb#(d%AN3d#b( znN^T*0=dt}&?q-CJu^8zuQWF)6>N@=p=k-KZ$Np=GB>dTVmXLs0rGNUadt6?V`!S2 zndh35nhWziDCt`kmS!er7v~q1_<`&(G;q$(D@n}EOD!t4bSx@LtPIOcE%(gy3ywF3 zWW-_<=hBo!*GyMeS3mF2oXkwu_~abd%uI?6gP3I!?-uGu4dY;LD+_Q-%FJ|i1tony zBU5lj$weyCz=jOh`74Cge2z_7x@`Maydv}MrsbYTrEn> zEQ!xAElDkcJ0D~L#0TgKJ%cUdL*iXsjnh+0(hA~TUHyaILFI9jkC9P)JlIX(x-Y&U zu_!S&z9=;f48AKZ#SvY^5mr7{Bv!pdrFbs#7{V8t7#G(jrT;>$BjGN4r> zq|A!4v;-9{po$S?2Rn}XJ!Uu`UM-tdu9fk z#Amv?29a$_5VA3#iZYEMJ_-~hAU8Uuq!gtV7yGAq5@Egpmedpw5+4+74vH$$jLh^4 z#%Y`x*f=wg`+O2h^O7^XQ;YIab3Bt>EkR0Rd&X0fvUo 
zp~WRdsfoGq#i05k%E!Ex!nA4Jfl^MuFUCXy%)m zn_pDvUyxdq2r8zFJu%|S)wnpd#1ov0L1Ez?Y#twC5bqkASC)~IV-Rl;Zxjz|VPpn* zXJ)zvo5h2a8pfApx&}MP2fM_3xw^WR1v|wDJH>|>#=8`R7{zC11{qp_lRPMwpceuL zn1w(9#BfN)#`GdKPne-aC5A6tz`lT#3vOkZu3o{0@xey%uCA~YG#b2+S{!Q{AwPJr z=1rO>2XjKn!4)Gp7{v!0gOWqATYONkD=5ceq&y!JgLrr)7M}ucD?^G%9GQ+1e+C=G zV~q;ZeP$9LY#I-)4PDEEf<5DdedB|`c{3;k+()3KWHEFdEm<&<67@=!(clfnS#X2; zI*?{GqWU8(hZ?$O4wW2g3~6S9n+tg*nI)C5K?i4O2OHGoN3@rIx_ZC+VCs2v9B)_9WI12aOae?WaxNJ9;&2WAfIfhBo?8j^+~!6e#n`bagc}NGpg>Nd*mP4rXUsrWM2|=NDu`Mkoep zSb`l1k8l%1P;rx3l9>!1fy+$H$*cnRL*f(jQsPrmA%kO};UfdkAO|er7MCW)dxE;5 z#^s4c1>o_U)Zo(GOosS)w;)Gf*LYVym-u*w_;`1}(0Er5sGx^S5JO@~YDH!VL%d6* zpQEp*GebPsUdUi%d}>|_sz7l`B4{KKYOTMUTd-?Le2Aly52QI5Tw-VxAC{U78m>!9 zEKc>z3{La}r5CtpW^O@_vyp3%cd!AtjPMRFF^>lg*cloc8YLwbXC{M&o-^~(J(Gh= z3=NYr5{u$XK;sIt`^CO1&PU-C6%sa4i4@#H1Wh!a<6I;6zVQM-(l-f>U!+ zlS>SXOI(9XEK&@5{olDGhO|HP2vqL;(hZ|;)@dV(o^F>;+c8r zU3|I#gN-u>rxUOvVF?qIL?GA@lv0d9F`1bM z3P5n=m8ODIm1|J2VSH|G0N50WSHa>SuYq*CLLwb8gl%Y?ms$=>68S}*nXVRy^bz0< ziuB+TLxcGEw9K5G_`LW;&w$L_OxKY3U_(%QJIFi4Fh01%$gC(aFC{-WJ~0_I=o_C| z0vg!JF9HoGf)W`hwhRqiE0R+Sz{4GeW}pGppwhgO%-mG)xQj_%Vs5IFX>vt{p&`<6 z3`jGHnaJ4?9@jV%wpqL{D78ZrLL7%^NtuGjfFa2UV^{_>DhwKIht3y3MutJ5>+0(2 zg0R^P8i=4EfD|kw`oh%`K1l&847`FegDpYb;&SNvf=r_Md4rQ8s0amT4bM#EGIVI=UqjP)-}t1`SP?FmkQ@dRriL-TlFNIbw(FeI1Y zO8`h&&M4jm)O^UV1WibLf*lGT#x5=?%CGbc08L7SfJS&g<55QCMVTe3eyQc4(d^93 zOmOXjV>Sj{sAJ|vSJ!~dVAJ?u@X#(;zj=CU3Ah@_#3=rQOA2yu*Z8=rvJg;77h(vW zJ+O!m@D4TwWvHPvJP%4h!TCAGuDK|U8IZ6yQpAELTk?x?6La7NxMx6SaEVzNxUmIm zXSo*SxaKg#r=~#WtZ|1M&c>A)q~dbT4e(AzaS)_+1xm=!ROA_undw>vu5E&X6FosA z;2=i_mlztC=H(ZG)<%G)j|)6AL1h7`9tGFUC^dB=N>j_#)jTy9JR$1|DpVaog=(-# zJZd}5B;GGSB^410!DjKG+7pCbU6Wivg)b-xfO^u9N*0{chIDyk6d#8IARaV<=t_iRTna!_eZi*jo?b2mprnG))HH-QHQil76DE*$rZK2M3O0_< z3=et73k1)dC*~m2)x#x)7dxQ4| zl5;$RkQ*P632R9BWE!}-Cg+%^|lt(${OOp41B3&8!A63f&)Q2!@BGtD!ixG1>9EFRqP_w)iY zK}%0OgG)eTRGG!`kciF6%mxn;g0o>}aXfhN5Z*$~1a+oBUDn_dLkpN}VsUY1I@lgi z1J>2m)wC=zr!>{I43-W+ojX{M9+K%%A{V0!j)!-ZVdb#1k!Mh_NqiExnE{GWSew`z 
zDR_(wk=jD3N%`^65jUSq15p3a&>$WjtKhW|uAmlBa(ZfsYZ(J*FxEY_1k$2)aLCC_ z0*Mrr=oRPd85)9C*+pT*A~eT=Itr*wVUP%B*8wze2hZ;CM1!aB1^F`wl;<$Va6vO? zkU?CAcu)WM;*yliy!g`M)D&=vN%SPt=!Uz)&^$iAB%>(5JU%Hg1=4^-9R>j#2!u95>EJu8>{_Qkel7=fG1~KvF=60cg6~6_!MxO)bzMO?-SJWKBVS z8fsk)ser-N7D`1Ac8IGhX!0DC@jwyh3vScmh)1M$5L%xYC0P=%4%!z2byeU6hmk=% zmY%*vyl;GQNn%MVxNrq&Aa#t&5LDhkVh6N#4eLl1$XtX&AXxz%R`KB62#GqBlFTyR z7bI6uQUq}rxX%?28mfv0IQN5vKr!nIsWo9H1sj9f%;5SLQ4a)pXS#xl zUr=@hVOMA(@e3xLG@T7WOC<0Yb*`?UA-_aWi3>@eqz0(75vYa^@(wnRhgSQ@gMLU0 z|8g_2HjKgL36@4QxC7y8Xqu9lmIj$_i3d-$fUCk_Cva^D4(4IewFM7NrKE!M8fdv0 zq?cO^PEElj@FEUcANz;I6H%JQCs!oKC#9x?Rs$!NBxj(HzT(cjgvvZvn}LjGohf)2 z8GE+FXjYhlT@MZfaFqcamvlCSw4g{$s^F{zYC&OX?13s8b4Z;98J}_miD0jKu#cf4 zoQZ$b&d@L=wK%yb6FjO1Yp5X>Q-PL)S0sWKor7i=QlWVY(Ue9M4T$!nv!Q2DumPw= z4k;r1L*ikzI_YE0pwbv>D@smA54fCS?8`lag7d)x&!9zHNUbw)eg+Q+fXW$fbkoRa z0T_XXEii_%!!nCXN)vM+u>qdsfTdQDmEc9Gh9>E$CBBIj?nRj?;MKFB5nDe*z(UGg zBUgXu5E{5DMQIuvxO$;0!qR-gUuT$r>I~O12GDYS@QP`Z)Vvg|V=b;hpuRg~4kjP6 zuFex2oUXxUsD&;x@xY6bRM_Y-dfo>I47BM2$y$(!1*A?z0G?h2Bw&yVCbYoA*WNOL zO`3z!K`@qKPEwinxaL9PaZZDPWU#c2z;+Ux1OfXgvjRRm49XM~ z7Ke$!o|#^*Wv;HE#iyVq4Jcyqtb`coRg*H10+*kl?NyKtSWaqQdPzonDr9jo>ToG2ae@*(bQHx9F}&pyLd5V^d=#h~ z37<8GMl5L72Wct`ZSdFG5NUJ-*4jrY<%sLIW9_z+ZJr@$hyimL0BID!5XYD#TDn4- z4+f`1S69&XDa4#PB>53LHVRtE8te=o8x1y%hmRa1M;rPCIC!oDHVJMDS~VJM8V{c4 zFa^!_fG}jnXAl=m=8!#IMV?W9pb!8pKPn4$jSqH@5B7+64b91Pjd!(7$%n2ka0Tzu z!q!&>XX$v9VH-oF$#-xSfHYte9|cO&@R|)2KqdL{pamYqpqK`)ZpZ>HFG$V`iFXC5 z%S_KK0c{lmw;WJfyGEcnYNx~$N6_Xl=r9^o6=J0$QEP(@4T@7sT%lv5;MfH@0koO{ zw8a55GJ(7w0o(XQZfb6EY6-L{i)cWD{0Ch}HrUqlI~(DPde>mjc>iD*&?*9qQC+fg zDy(q~>Pmwaw3McTCiy)b<6(og$cAkMj2ZOF0q6Un!{=hq~R&hTuV-7dR}Tud{QZBI1pvf8a(*`8t8`f z`9X;XrB1?FHwA4mf-)7>hMb`}Y^fxu?u!S_*~7eS85rv6>>cbM6k-C}3+sueG9i7z zw=ps6lPzHDlVJ2?X%D@sk% zi;qvQsE98}Eh^5>OU%hEsf;f(055%owt;T*y9kl<-)&@diE5ZiDc zxEl*@?15%eAS0)sb;Yi(kV!4GqSRtg_c$onG9Ee^?M-^Xn}E$golS#Oj3HruhM-e5 zDk>6_GRq7>b8yKyiN(d>xq|r2ytI5*^hD&1G0bcTS{Dk+<>01ICV2iU*aTdIKucgm 
zI3bN_f(D4uhKyZN#{-Z{O=m;U$`8=O4A5c#S1<4ajUdFh8Dt>_XlfR`5(Bhy3cMoO z6SU9`GNbJ26;TXXqv0Ct5)a{lw}FBp6e(0KAVY=4;3db<@j{&Iia^CJT9Ir99Y#zm zN==Q2H~P@S5vj1DVD1lj$);&MXmt@}$!0(ZxZelqvw_#)A?1C@$OX8PLa7PB4I)=) z{RBR01-2{54Sdo_Vo{}QUJACp0`ew&XkH~c&OsR$b2*A}YF-MYLPF|MLalI5@Zjyks;NJUbg~1lnm$N&W(r$7uNrv>O7{_%Mh^n<6%iN17W3ZRUq| zSzt56pkA+UYHo5tCH6FoG~0^4Mx9Vc2INeLH*r)lpy5TNu1-KEI2zHb5ok>TZg*m? zJjGgDfEO0Ix){JRiy_u@LCF0`OYlLxFtFpOQ$oR-eh{ycIEsOOAb}ZZiXRlt)Nvb9 zzZ$ad067H^*#s;ofQ3Aw7l3u~g|m?>=(q}S@d)kHAon%A@y!=O`eTqFL=>#hMu$0M z^MF6D?E~Oq9p9uYMvact!bj~k8@fWK(7{GJ8$vcAz*e<^7W)Qxqc$@@aRqWV)WaAX zZ@l3nl&UQx$m_w;+CqfNEpXd2H?blf(!>QNH_(m^ zbI{I@^rFNZ$hljIc_p5Zq59;qBuI7zyB)IHv%m+m1P7!QG>V5NgTNtqbM>oj}MMd&PYwphV0+WNzFr< z4MG&UkcDR8Mjz_ny#Z){CAjVc$1TowFSr9vT+s&_{?ABFOaTvBf`<*MvCRXz3m*4+ z2_$PFORq743%U>ray%U9yeIHRMex2$@c2MVYHng)dQK`@ZzQ+`bm~=U9zq9bPcFn1 z%nf*;T@i-y;C8!raET>oT^YEq2;NYK$X=k%9C%v{MoR|EHW|>mLhwRA4(&NflXd&dJ*hz1*gcJG+P8)D3oKsK0w+eb*dc0$6ChiFma12kAr znvw|XX;bkeQE<|N=PS_AGk7ioYlecX-a%Um4QUL48>|$raRC(tpc4)uCk{fph?!_z zL{OECy_xC>-F}FH0 zu@Ab58L4E(JzP&x+X3oMP^|?SdI2{^hH^6!)>pvVB*B=I@B(cHDFaWXy1@5}V+J$0 z0fu{a44#j1bZ&5Vi6HHGl)@D<$_0iif3fRq{TtLLO09XqV_YefZP7Cs+ zcR;W)c)}ar=OklZ(3Ob2dXNop$dgpiauOUJnB%~pwq{;x1$2TGJeH2_02C~9eb5Lb zKYTEUWxUB9o;NNA?I|HLO~AGifxHfF-9wt$6f9V(Lr}$sHfaU!Sz?>RgT_6$^uf9|$l1s< z0JMUk%+)m{-Y*zsk7Qm6XqTj+WoB`F4rJR@Q7UvJ4|q#DbZ4V~up?~P2;AHQ9SaYt zBtg@O@yUrLspsR8 zBRQbb54xuobb>y7<%zSAD>NY>M$%EIrO}6TGhM-rR8Xg%{Pj?#$@!&uC9XlxP8IGd z1tm5h?MNI|H0a10hU8lAF;_AVAt?1=$)6ssxOSO2J29L6-Mpngkv0 z@kuO(Ou1kk=K<{^PR@=`&d82WOU%rHx+UI7&jgfdL1i2J}OUs9>KZzAwI;{8FuU=?p0mT z=m{>d1TAd@-3mhf9)8&DBV@V=YlesdcaY*s@?l2?5Ip!GAOv*OAh^+nSaJ-jkwKLt zzQxDrn*iZcFUT8eP)-#AB^^W4BGC0Np#?6PWtqj9`FW7xIFt$p$HWk5Cj-)6D5NwG zuE#-pRzQnY5VaAw$|`mZfL3?l;KAO1^o89KQe2W>l$Z{ltB4ALOlC90M_EGZ&79O+ ze2ErIyWG_^2(*C(x@->8u0t6DfOYqr4WWkufE@{1LIOXpBiFz+$UD=^)io%@ARaUl zk8;Kf=*StbU?b2~2cY$4pf)CKvK}Rt_PIz<81s&Z6URnmOjgaoZBD;`8-sT1>s)M{C zd+R~xyP(u-kPrpeN|q?Ssb1 zpg9F~I1HmWz_QgLIma~@DH$M5`(UiBssqnwu2QC 
zP>|>&V`eGH8~fJ65KrWeIZC8J+JlBj`zt_O?29ww32m`}6hcUCRn%pdMkvcJA(4bp zSz&1qg9qBM4OD|>=Rv0f!cqw=cq~iup@-H(*83r)dqb0WP-h3S%Q+`C&oj%_H7g_@ zoIR0S1K3yW8-j8~UMlGNP4GeFuDQ9WQyY4Q7SPr&=;k?J@a?VG+r56sT_(_GPHZQS zK#z$=?(jlu98!V_JP3$Xl8`dy4T>db&P3|D=i=_UgUUy6euZ@?^V5jwQi2w5gVH#1 z9KZ&YK;u~8bw{2kdm_Ne0_i|P#L@te43^>>w1W-D=_=sPAh<;aX-t6zoUr#zkrR z#y~*xCh$>0(7G}30m9%dKA@g8tR@BpGx;OaFi$|v-voslXd@SR-zMx@2GD>y2)jCa z8lou!E%x+G2JJ$C6iWfgu=U_2uyTah3ox7wK?`xQ?+}F4omkTnc%&#jwIn{I0QqD> zP&NRSq#zGrY|D2xr0&!kIJ7{cqM%zuz@t)Fw_%Wywoscg;4}DxO(6sPpahQEjDYTC zMz4QB2?Nsd9rEp2d}HP&xChNiPJ`e(T(F-a04b;NHCLf6L{LSEI87Y1<_JFiPK{Cl z)bB?dX@d{6Axa`=Lsw|q3eh+~8s^1*YbmJB;)$URT3Lcy*d(n516P^I9bh6i)tG?B zF(CytB+wCFAloglE*``{r1mD(*@;1UK^(^H1hflAjfM?!ETPqinXaGZ(>A_4^_a;brov#`WDc%265TDBQx?)J{NL76O ziDrq(sfnP7M|Uv#K3^<36r7@PyDtiPQiy`Ygt#mabgeWfS)k@HQ0b3U0pU4l+Y~kP z43{Z6&?GqIay`&4MbID)v}#4_HK1&NfNb24Pbw}(^d51F8h3 z^f98Kr@XL)?`{Sqa7xzwKmrG;`HpR{i2O_I&51Zf(mW*}bp2{2^ax3qi%_Q{P;NB@ zc^bBmD9Ae)Grb&wY{jsadGIpn+XXp{2LiCa(*!q!zG2|*V48iKd~ zLS`L614Nm~r@e#5S}Zd2i@}ww38?1)8qPu=2Z8sKkg_4hTob6X3g3kR?akoWtRDg^ zz5KyV%6QN%E%C|4rQkiv(2?ta(&7xz72VGHd1;yHrA6S|ZgC7^4~Gnqn3LlgPzIX; z7%l}m=B0`7No@R^rVYV;Ojl6MLiSb1r$BFI%>$h&23qnDIhO`pgBluv&#pl%Jc3R6 z;9q-WjI{O$sqjV|i{%R5*#s)g4YAzagK~x?sEolnR%&Pn8eE0liVZ%g8|16OdI&7& zb_=9U9MEC~JkJU~j{vlf27LA}d{zjlP(Z1hK?Q;zs6!9+YI0F3=yDNIE3W`_uc#-e z)dj*BH>s0xSvg5nmRGPL=wKyRS0m&2c&x6&yUf}pBsr(J$PcvVAF3I0K^5fU-}wAe z(5+}7kAX14Ea0WRdgw`fB4+rm=-Qf|Xm z1%r<1yR8zme4EA6Ae86Ln?bHJt_%&rWt5P1e9_>?t>?9BMOr@sDy%7xzJ@Z zkoX#W>Cn(D-W8M#5qs1yZ!3hQL|4oB;CN`F1zj}j4jzC&8jeL963lc(9(YBl%S*+6 z2O;<{x_Hp(!Fixdb&>A!2A@Rd>Kg0_wh4l&+4yZ*7 zKJgr~($O0_F%NMW`mk>%>KY8_p`GAmS?KqfAr8X;jd{ZihpeLnZ==pcACmx0r3{zq z5WF~}v^cdmzBm(fycl#c$`!Ps5!>PnP|gMQH;3bx7WkxU;$~SuyN0NDkvZrxIMDUs z`ML4M8Kr4yIjLmr1pzNtfy@en))_&jm%ybUYDEm1+Z{q@DI?7tLZ?VUi4cE- zHWW!KUCfJ9OTaC5@BkRZSbR-lXCqg{FeYSSyJv`Rd`fCjYIEgeByMF?8= z3tdSFZdO5V6G==iEX^!JKQh$NFgYi`IJLMWAivxQ?1N!cTbM#N9fOAYLB$DjfD<(p zN^orlvWaAGxdxT`5a;1M`2ln#7UV1+w84TPZ^&Rc$s6UM({rHYigmUClD@$Ug<$Kz 
zGQm@yt|p*6LO`n}K;0}nE{3OaL*ye?L7K5FM>oPc{s&sGfwmkS(ThPp+aGHyAM0Ll zvtilykh8FM{ia{_* z<5rgG3ZFcIWeZSjLPue+uVREBh;ITqA0pT!J`;43uLZ@F5KByRV!L!50xgQaI9y6{OUTpwlZLeFWrnZQ!m?FzAvVS6A2I5+lQc z%+%!6^2}n;rMh{=B}JvlCGjPQ*-6NyJ;A2&;5&N^;z8#uL2Ab0lEk7C@M0-wvlf2# za&Sp%VURbt)e9{sKtT$bxd9EE`<3P5V}SXbUbeNlq;)f|hKEjmaPn zV#$7_Od~rR60ry$Vh_?1I>Ug(qQsn>)Eu|`B2czK-O}#^y2H-U6iJP5eoCsVfvc-Y zYDH?YXC@`pF7{)*KnDq<-~HnOy8Fk~)hEl))fG~;TE+*1HWzrgy1JGHdw_0v4)%de zPlK{6`Kcc~!G;n$ko0d^SelufP5p~2jnHL?>9tP_7q|8k4?RRCt zpc?=%uC7FDWD=^tpq&V$)kAov?F@$Kv>j;pFSQWd`v7g(H$>e&i4xhM#DlN50NvzA z##~cog)7=VcO&p+W}plSYP*2i_mG3HFmA{IFC)gjz#B`BO)_-FFafc&F`0XX4$Sx{i(96p&N!3fwE7dE2T!vAJB$QN|nUkWM znF1AZbn?_KNla$|E6+$Q&S20>smv>`%!SY;MG%>?)FRMvY$%-gA_l#p)SN_+1}Li_ zr-VTdbVUn;UO`TYUV45Bgi)ND%%E42T2aEF2Odq*E6NA;mKngEp2f2EFv;Watf+@!)Yoi1qP_MMa5~5X)d}kcA-k z>J{aK{g#-UnarRE5@pcKD}!7x5}cY2x)YT_FR8egK`%K!7tCXz7)UcPFfcMOFfc() zl3<1~V6+T`3nxKp;T#4A24w~Y26iO<7Et{#S^-%x0|NsG8s7{}e*{!Nj8;KY&w|GH zMAKgb)eoc5?MK%i&cML%?f?ILnEw|*^}}czsC{fu8r}Vg3=9n1NdA8S)eoaPpc+77 z1PVJSmq7w*UM2$r14s+Z{tO<71dM(TG8`fVCP8L^8Bn5-fq~&alKTUo2E*uWP<1f( z!|Z|4bqov)|B&=^NJ2Eg=m!ukIF3PK2XjAEn4y<}fdM2AGw%gdKa73=l4M|DfY}ez z4|OKPbf|tOsDUTsAPgA20;&;2$s(m6nEr*(^b3*)>ofq93=B{jl)j-NU<&FSFdIb7 zLDR1Q)d!;+&@>2uG%+wR!06>@`WKi$6u@YhI*2GjA544$n*I$?{V=+Kfc-ns^eb3G zY=Y57^B{Z}39}c*-;bt09~65H6fV*MA31f5JzIQW&j~0pY_)So(wUpQGvTh=k-Y7(HVNOeKUy*Z&PozrsI= zei&T>&Cd|skh}t8gJ>2;)be)*7bGNM^bTW?Dg;KipASucfE`3Xj6RPRVQ5?i1x5x2 zK4=*PvSmReME?R$2o1|;AUP0**$<<27#SG&k@W9?>fZs?4^s!y3&JoyjJANfA5;c{ zQ zA;K3zPk>M$tzZn(52O9i+|Q5z)er!oafKh)Fa`#O5H$S~Q2i26{m|kFrW~XGOl5@B zsWA69K=pe-^((+6pe8s#X&99SwIAIcn0_^wLMRQ34rChQQ>X$cRRFbr1JpuLJE0Ow o`OS%xcA#3pl)^-aslnU9TsQ%<7bYNxLw~Uc#N)+i8ql~507*)(9RL6T diff --git a/GPUSort/src/quicksort/sample/main.o b/GPUSort/src/quicksort/sample/main.o deleted file mode 100644 index c45eb7f9de62b40c47e94c75f7e0010bff0385d9..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 339720 
zcmb<-^>JfjWMpQ50!9Wq21Y0wnexzLfpZzu)14Ap6-Nwkk&<6Y&&a?q0m`2Ur6(~mFieKBr!X=wOog(iF)}brXJlZQ0p-tR zWMG&DWzS}0V3@2+@9EP%wFfuS4g|d$^ zGB6y6vQIEFFr0+4PeJL^j0_BCpzO1Z3=HR>?DJ6i0wV*%MMegOOHlqTD1Doef#D96 zeV37e;U1KIpOJy#0hIlak%8e6l>L~If#C_1{gjb`;Te?u97?}nWMFs+Wxrx%V0aB> zzhPuxcnf8}V`N}>4`qK~WMKFRWq)F1VE7DWe_>=`_zGozV`O0X4rTvfWMKFSW&dJi zVE7GX|AEqf85tP;7~ zP9_EhE-0IuiGhI!%I0NaVBmwY`I#6P1fXm|CI$u}C|j6`fk6by7G+{!5QDPCnHU%( zplnGd1_mi8TbhZ1K?cf}Wny5EgR5lUZz(wCVS7_LCs zSE2MZCI*J~ z7#O}l*-%FvvmK^2`hj3Q)EplvaY$ z%FGN5Dp0m6GXsMfl&#Lpz@PzTYeH!)W(EdrC|ie_fk79_)?;R1(1)@Om>C!hp=={& z1_om&+k}~c!4%3iV`gA5hq5i8v?VhGgB6r*&CI}H17+JXGcede+4jr~3=UAXBQpbo z6O`=?rCpdA7+j%jH)aL~cPQI~nSsF*%JyPrVDN^reV7>-e4%VVW(Ed-C_8|efgupe z4uaCb%nS@6PP*85nY*>^x=$hI}ZyfSG}z5XvrM zW?(3WvP+m57)qh+GG+#bawxlknSr4a%C2H&V5o+&YnT}rYN6~pW(J0OD7yhlH!?FY zG(p+TP`U+5w?gSQDBaG?z|a9@cQP|DbV1qOP`U?7_cAju^g-GEPx@Ud_zFum;Lr3#HdFGcc@&vNu5Kjm!)T zo1pB?%nS@$pzN*83=G?#?Cnr`2Qvf1PAGd9l->=c_b@Xs?1i%TF*7jihq4bq>4VG+ z42PiX!^{i}N1*JZQ2H1%1H*AB`vjCe$;`lT3d%ms%)oF4%03IF&oMJFoQJY6KC#uGBYsTf{NW{W?;AjW#46HV7SN3z;K_L zf#CsE>>)D)!y_pBF_eD7%)syz%6^D&QEi(hdJ1F}- zGXuj1DElKb1H&gM`!h2G!xt#~D>DPbHz@l%GXuj9DElWf1H&&U`!_QK!yhR7FEazf zKPdY@sHe{YsY4i97#NtKY-Sb)1{Nrr6-u+QFfgz~*&I-slZAnSi-m#VGr!!gP>;^1 z9-Xy6Ji1+fcyzLYMg=Cs9)`;_zftI{eRJFu)Foo*cKy?N`~^q}9pxQ7IzM=J{`BcQ z=hJ!Jqw}~&XXyiv&e|6qouM~8I(;wrbh@6v=RS!3Up}4Jd^+!=>c8O8>3hSa)Afu? 
zr!Q_ZI&0rF*S=vWW#0kv`D<~6W!A25YFRI){(1em+w~7hpgDBbzUeG|(HZ*0qw|DM z=RuFoQy!hJ7eH?I!Dp98^BV)W)=u9CE}gD-d^&wEAnb%jzyw&lb(VhV4E@mU`o-Gy z1OJ=@D6Zh>to_qj`lCDa2Po!_?FYI4#o_<||DQYz*YDBGdg}lG|0wztJUV}Pbbf?5 z`hiDh=!+DOZWdJy1_lO?PS+oX2RwRRzj<^Xf1%C6z~Ipv`rV`3_kl;Z>kE(0^B$eY zz$s(`C_$xoXhXD>u!03o!V^}oNAsHqWK&;&O}zjz7355h-p~sky{;Sf34<+#q&kmY zP!Ph|8{mpN4};XgxVyjxbl!vUk>xy4m3knn3SHpQ>$?GB$vGs=7r+MYgP01nz0-Ap zM`!2?k6sXc%%j&9Bn$SVN9Q4rZg4*G=sf1pc^x9y9lF4y+jWHpC<#LXh*+(=K_){S z1+}}^74GxS8y?-hFH$_TdBCw#67f0^lr=SoxUADovsbYiRFjk zf#a@AzyZ_kIs@dlAFK=v-Jwf7k}r95x?H``?7D=p+jWV@43FN>DIT36M^Qv3V2E_U zMLfD)8$bqb#Wc{P7aFZ#>bNUtXpZ5yD`+&1;kYYk;Emz9D`=dJ;kYYkSdHPhD`*sr z0g|dfiKX<2N3ZJ+knte5w;tf1dZ6V%X=QWm5{6PokM7zH9^IuYKuI6s_Ja?YeR_GA zJi2|4xOBUo@#qfS(d~L5Jj|o>TIV5Hnv-H>VDM-@BJkP>W`Iwx?*;sNp0Pmm;LoET z&2JPi3*Qf*i0ce})9L%dr_=QTDC>8JzUVG}!IF}Vd48r`m2z|pk^6crDeUAHj8L{`8>Ji0;I?f8o-7Eo+&fyOpiF?!@6N5FAc z(1<0&an~!LLA>LxHy{*fs1g)82RwR1cX;%cUh(L4Jppq)QsiubM9u+^?$R9|y`c{v z?m|S)6_;+;J09JkC%RoPphXU-l?#s?umL{3prX_Q22$9hM8axN-0R<`U zdJK|BNR9@i(hppifGh!}ysU4aV$=0U76Sx2?gV8G#~uGc#c3}uh~v>K3ZYEF6tq3y z(R@S!WC%oC76W}WBB!lh-g`g)|3`COuPyuk|Np@?$b{I#pf(Gr;@9x#ti6F;3V(dZW|#fls$9sJy(u$g z8Kl-qfx#y;4$jdKL zNX^SHP0vs$E=WvHh3SN7f*PmC05-@qD9ArZfk8ooLDNcsL60jqzqANsa871ksse*T zW}X6rl>#VYLAe?k|ATTsldvEK93VxE3=AQDKE8e+Hpnv#3=9mQX)Eu|`BJb3qywn^|pCm)q%qYKLgLp$TU$DRZ3sQ>` zK_OV|=@%St?gI9Cv59kON}{W)adB#iXJ)3WYgs@@e2{mrd3=aLylZGqrfa;HtE+2S zuv2`nQ+$YFyh}ldQ9Q_5M7zU)*6uJgN-j-F^vugF2}vx@hK7!FeqKppW?pJhv87{C zQDS9SW@@>oNr+>xH-_gkU0pLX12X-B4dXpCgH7TyU0oUMY#A~^zGPr1%g;xzko1;cWM^enoLb_VlbV~FSE7-epI2N0 zQKP1#0O5fRP}5Y1F3&8 zis3GRs<#1|Xcr6ikbXvfZmNDoX<}Z9z8)xJE@-TCzfQErRp(cD%ja7Xn=!3Q^9}%!D3)wK-HgF ztdL(?qL81akd$AVmr@MUYokz-ky)&3m!4V@oLQBsp~-;Gk5*vNQD9JrWk^XZNd&u! 
zfuXo0u_QBD0TE4zd|_pUp7L$L5s4C(kVw*iI!_B0Gf=)7G*)3z1QmvbJlLxc4kUCL z7zl(f!gc!SjswLkJh&MgOG;963rbQ`6iV_Hic?DzKz>#z$tVJsM~Mn~r8zkTB}IDR zuu!nIRR9-&wzdokVBlI&keXbQngS|9!2$~58JWo$3YoL@TICYO{Z=2$5(D3lc|FcfE2rGi5o5zvUFLQ;MJ$1OPEV1Wut_u#|~ zi5yr;hVY=tnB-)inO6c$^$ZN))T{vV29CrFmO@FtkmLsncLoNAkc`Y?h2o6-(wr29 zywtMPB87~^f`ZgM1zlJLlV7Bel9>XoH9*l+4yrB^^Au8xit>x}7!ZcRBa7DI3Ci^} zDMxG=sFm-a`3YzCgC!BDI#9B34E6@)wdBM+Q2m@-lnScai%U|htjbc8OY(~pauSQu zQ;QTzG7|F?auX}!K@qE=$&iv-T#}gwt^q*-kH6d~%_+%*q&@UpilqpF)c_%t1*vvc zR>}DVl^QMx+n{cT7cG#c0YVfdFJS6}6ff2aDB56xV7&^q3OV_C=?bNJ#hK}OsVR_J z0pfg+n?Rw4Qipit7ngv-5EKq1lvlU|MK1%jyaELQw7|+&KrX8gp#skZ#o(3_F8ygm z`MJm@KvcsuGcbe{CFT{Ur51te5>QrE0PDyAwdYFmksEo?2Bsd!)1dqe3K5JV4Ae}( zTCLkb%Q%n%4RC>_g;q6y1!3_534Dxz2Nm%srC2PX`U2rXNPP^lA6!X*%QFxUQl^zB z7As`tWtLSzN7186WXXn+IF))rQwD5$A{6@!A< zKvSWnMggKi5mSYof`KNaTm`!WhZ73&i!(vZ1BJwtl%mw)Vui%K6osnPqI`%o`9%;8 zvMgK+DCPv~UrP6JncANTC90(L&AB!YsHm^T1^e zO1mZrfj!t@4;tS_9V9>&=`8)y?fS)I z25Q$7Jh+Px0u5VOcy!jD=qx>gHgeJJdgR~>`3~3n4Yl_fO4z$yk9@!Jnz`HcNT=(A zZr2ClBr@!~FB@DRvx@Tdv8 za)=u6xCqE|9-RlfK?9OcKvNCp!8*`}N}ONYfIJJC5rEr_G`tU!1zQfa6l3t;^@K+^ zXsG4?e@207)+3Azvlv+)F*4j`R7T7qcGlkMEWL5u6|_2&q1*Ka|M~-+t{=KxZ*+%# z;9q~J)Ab8j(Dh5_0njW`x9bf{*E_|m-L7w5yLG$Xv3C7X%+c-o1~lYg2IhZ(@|j+% zfd*JDU4N886fnOQ?RLEbo|Wd^_G213;V8QlR7bU;E3RT?_l1Ra<~n#S|!g$&w3CrLm~ z0Z$i!rd|sS9 zAY-5iLy&@VKWMB2oUo6BhD8`4v5sm9C<%k|Hz<>W#}++0OD~}1Qjc!e3;f#{S`L&L zLBHa(t%>+ z04ylc$~e{6a)=OxD?%#c!0E@M+jR%L6yr5xWnl0WuV}U?Bl;xFG@;EC^MHs3PED%Aw6_#Nf?}s1d*ezmQ2&q_N{*kH$Bk zRo@<+dvE;x|G#tVmB0W0dvs1c0pj#t`1}7qMv|EeQs~j^dZ2qMNHL-y2Th`PPX*~g z5o^Fy*P8JE|9^C02;ZZ()#3mD|Hvk^Du5?krh>%L3*g>X2C!r=L=7ao5H*EIZ|j49 zpox&FAR9e;d%>zv2rk4m(B#+K3ibm+7UEa9EI4=&vXEee%Ys85Aq$BixGXqc z5whSkfZ&2-70!h$766xG;H4liLqMe&j139r?pAOj?iBEVupnu#Ge7{fFyn6l?Sgs< z+C~K`l)$w_D>zu8b|Nj;KnXBpapaIg7Do<3WO3xML>5O5RAh1F&_xzU4rXL=WO3v;K@vwq4zl0^SS^Aqh^N+owtLh4Ss56TStoNcl(YWhVCZ9H zeaXSFl#%r!2g4yo)}0&-j~H1OaxgS8v37DWEMj8)$j-2ziS;Hs!+j>!gX|1{m{?b` 
zGt6OToyg9xhne*!8^axD)`x5iKbcuivN3eCux?~ySi^}_c!G*&aAT&k^a^@`2%a2; zB(HAQE6{}D(RmQwHbLY&P`wFnWS}Os5|nxqt_VxLc>`X?bDBu!F|>o~Oi=0sRhdJ@ zosJ_I(0s_jkKKKrW^r`4`qegcM#N z-+Oi*@#utbeL5e&Qq=^|ByqH3Ozh$R{~0-~S;H6tr!5`fJ6|cfg%FbMqJ_1S&Fh6=mlt9*B+2JJ&wB`0MoG9Zcq=@6|_Kak4JCl0kp+< zpoD({vPKTxV}o!ZnZyI!0&Y1_VvD@Q2So)q^PGUJ(Sw@-$~^EDL9Y!^GGEJQ4B$}x*$mww4@N$D-aE0Wnfst$lA-q zaF3C75fj64CZys7)Urh`Mj)#=zra#0XauA?^hb(^wg_Y;A#A}WC=x)f0;dgFuNssJ zpb2#Xa!MTlc~Hvo=nXIc)1b=L^#>?fgI0Kfg_;k*7o>st#~nZ!9+Lc<4=8-*7XY>Y z8DQZ6RRbP}L+XWsLJyuZkbDjb9Yp5Fr3xN?jc-6(jXL+rpfz%T7#`r?26jiw0sbim zS`L)BBQF4frCuXsK~y$M8HGH&(Ax^K4AkL97Hx%E(JEZy6x0BU{1ds`k50gv7Z zpncmOSPkiJUGne$|JM>YWEniVeL?97x-u1(p5S58S^ERBcDd8{4Jg#RL%%?xuRHWZ zX8^dw=nj3;830~>a@>LE`})8CLF$e>2q2gupoK0FQ3(W72E;@vo`A;2y(y*kKPDSv9Ec7V!W{8k-rxB*H0psWN69u2Vfpm_tdJk|98c<~xaQGMJMQo4h? z2(Ek3g6??30tU#g29R0Z6F@u=t8)TqtuMSra9!ciI|1ZRNKnBC!d_lyfaFC`2?{D1 zSwY@_#TcTs2bKZ{0o1i8xq%1mWuIK)`cz|8QQ$%295|9?hom`7WRfgzYRilLYlu12)X zgn?lPBkO-ghSQ9!uNWC-fU2JV{~7f=K~2R)jI0|u7&bDp&g5Wtz{q-$onZ&lf96IG zhEHs)FWDK|*jXDn7|wD)%;U9WWnl1OEnom^g*rmO3Ut21J4V*^ObjQWW^-7vIx<8- zG{GDy-OU)p!0?4p_9rvLDyDP>h84`RvsoB!F(-rF%4g2Xzz`}ol@a6^un0&ae*r54 zL;Y>WHOvf0nNiK-Q?q7ZsA4_D$S{G?2dsfl&5D7clJy`XLqAl63_tSOvobJbuy!*t zECGcN*pstm8AY{(KsC7JX-39-jEesm876|f3aZK&82--|W#mz3)ng1{jbzAXg=7;3 zu#?Dez@XIvj-hb5>5L4UK_LV3w=AQmAuH(64c5(!jCUEio-;E11j#|encWu!%&{KE-M3r4eL#2h78tC%nTijth1OwoxLVzhCPg| zZ=Vst!VlbRw$SA<@ zmDiiqo56v>f`Ng-fx#U#YOhzEQCw0~l9<|Mi>=yI7b{?$QDd6K+}GLrGX0*MPJ zLl_zO`xtVW{xY&XVdS5{kUJkdYnqr>VlbJZgx`iqTHe5hsfxdk!I$5LNtS;Cm{#DQ z%;3dj!}OCg9<tf11D1t z!+(Y-)Wwl*Y@D&+MUWp^zO!Ii_gK%sAJ5>%n#II_fWe7 zO#DX~oLC&0{<9dD7iE^D`lXhGR$LZmf`B`dQe{A9uxWg-nFu?7EQ14w0|$RRgEQ+b zM*h7F_ADig5=JGA{Dq8G{80=xoOz7=F%0&s*^KmWnkx= z#=x)6z{NV1fnSY*ooNaKzab~1e^OdbWeukQyE+4NJ_Elm11GB%KffviC({%L!ze?O z1B^LAsVSw&soNR(Wf>SaBpCVS8JL+DGVn(+FtU^|#2bTbC}v>cm0{p4V&Io!U}P<1 z;FV=y3=~L=Pb*5yP4!^l6=aY~6k<4-SEiSon3I!Ulvt3#$1uO3D8D4Xq_QAYMo@uY zOi-O!osqwnMV2pA@C##nR7hBGiDhaAU?i0Q6_EDPm<%QZZy_5HezC5He+F5He$D5VB-vP&PE+U|?2d 
z5HfUNFj6rxmta_EWWi8YP?VWhlGek(A0WJmRg;-tM<|Q`s__5+TKVw>iA9OICz!da7KL13Cf79CQSUw3~c-pgnalfiu^wmuM!_0A9b72@uI6~Vp(ElPJFzfshPoE zCXUO@{1&2){AD6eO!t`ii$ok*&olGKGB~oAvhc?r=kxPn!|F?)0zp>bVIeuGFYyM7ABQ{MYet!ld{z|c4 z{(qvvg2vZG_@{}L@ym$`@GCKJvaJ^6PZi_f5Mbq3XW(YJ%>qi({HM8@SX#xH-m~*3 zax=0hF|c=ok~|M{Hz>(-vi@b@S7qSf?-65STEW2IA;!Rbfs^N#7|U{D_xN}VvvLML zO$N?~4E*v8EX+4SX_PNrq;bnK?WR?{ji;7?d+XL{4UAW|FuKKNAC6 zsSyJMgA#*@0)tWm2RB1_Hs}-%3DAj6f~zDLL|ph8SeY0mZ)IR$&{c3@2oPcr*5F_e zJ|@8+d_sVMMS?;2x&#BWI0LH$gYX{-2H^@xkf4$(V*-Ql2MGq@2o45eZwUq=k8(|r zk`f6976%3v4hG?U5)8r%96*YTL8m-1Gq{R@j)s~HI*N(c!IgnUm_fLKgF(>MfI--q z4=e>bO^8>)nc>?62H_5A2H`{r1|@c8K?b2tb_QjJTnPrHIwnDoiNci<3@im&LR|6; z$_#Z93@i~0!ln`o!mSbv!g~c6ge^E2gnJ|ygueYOoKs~L4hItIU7i&1n2;*;t3oewh=fSm=YLdjS?72l{rBX zXRgV>;=mMc%gMmN46#9;3#8TpY=ht}2?kL&0Z??7{^4d|;B|0Yz{IkGsU(>Pq{vtu z6uV3-7(|U%FbEq+F%(Plg2XMs7L`5V197Yb!A^)20CB9rGJ@wN7{qKK;=V#4aa*u> z*)m}e#||PBB?98ugJnwJiGW<|AjrTHz!=UY3UjUCCJ6=+N3dJ<&xqb&W-g5t1L<^@8icDE!r6r&!VGNTox)(IQUi|x zLxi0Om?Ok0!4N)Alz~AggHfr0O@bl9ON@a*D1edq1&ho7|NsBjFbV%>WcVz^%)nAJ zrJ+HCr?H_ygMs6JV}k|(qt@4K~o>1PrVoE$l3yZL@43CJPS(0|OgKo<)p>DFswp3Ny@SSoj}gC&&%_5e)1L z8JLP07&RDJu7FNa6E28gVQgYxT*}Pk${@`0nUTMM!InjXowtAiRDm`y>axf%@-Q(7 z7YH(NJZ2CUkzr&pVc-v7G-6g@W_rND`j*kifW?E8ng0QUE`I_OtI`L#hYXCNOpJ#a zT_!NHs3bB83iUHECNjAQv#_uyGKmT#G6}`lG4L;7@DLJ6VqjaqU{b&$FrPt;Yd(V? 
zH_HL$`3yoPwhYW07@4)17;iAKFfj8kU>0S019Ao{2eSyn0%l_d7MBzT{sIO~{tb+* z!UZo_*aX1VNeD7>ZeY|iVG(3x4QJpFU~=W#z@pf|xQ2moBa0Z{Mn+S9mK7UW*aDbD z9191#Ht;R<%f zS&WPY3?R3P`Uo(xF|f-^Y+&@~;9zH701{=^3t$Lf5@lJyV927+v5q%M1++6snTLUq zgGoq`hv70P{#o=n7+e?_81z&aSbBJrSOj?(SX3B-jTsmiBvcrLjNKU+Js4Pd7~J(a z7@Tt$#GO?b#GO4D44oJljChn-lzBjypKa&ariykM~5aEt348fp2jMfAO zmWRv`u^B21;`Un@Sk$=HK^7_MbL`_3R`6h8N#cemsPJGAwq+53n7f5R+%|_H7_@+0 z)q{b>i3_5}z=J_tbpnHssyhRt3nMEJgNr@~gCYl`5Xdb#3_^dXJE)E8H1_p)%Mg<0DNd*QL@vkhUObi?h3?C8&7)lvIo4h^<3V;qAT=+qdfm!-L z1B(pD8)o?r84Nt~${7p>@u00>7fvuR^88>>YjK83e%QDt&PL!N3y3_&-b|fMEkC1A_q<1H%H)dPQyq zh6HW~h6Y{+h6}t53<7)%3;}!$3={Ym7!EivFgQ3eFdT4XU~q6^U^w8!z~JD_z;M8s zfx*Frf#HA)1A~Jr0|SeU00V;n14Doq1A~Aj14Dqr5k?OFCyc!O3Jkpb5e%;XoiqX% z3K$s}K%28cM+btAb_0!%E3h&!fQDo?ure@wU}a!1U}InqU}s>Mz|O!Rz`?)(sxbsO z85kyTGB5~mF)&QvVqg&9W?-1Wy@r9~p@s%S0s{jBXy{LXk>Lry@P-P8_n_0(KqpFp zPRTJ~QDJ7_v0?hpq#*!4QWSI~%mEe#1_#y~OdLWQf0%?FdYKDBB@ZJ5LkY9E4F`jv zDg#3nU$8X;1B0CegRlb!gRmn9gRoN$i?A~XgP{up!!ZG2R}O~DppG(V8y|@No57%T zRxE@;iAy|%L5W{HgF)%CSO$aPV=;zJd`ch13K*2Wf(S4>Yo&cc| zAd~}yQV?KB26>5};U_cm4aP)%1_p);w)_l?E0}_%7#J9!K?l+d4s}Te28IGqa0oC6 zGeAPjU?B&~2__it!DbPpKn2-(fk7A)s23oCdVxXM0Hpr`I2wd4L%4)LSc`&UO!xvc zX2p^0RRG(emSCv>@e;_{7Z?~V8JXD`fUpR=kZ^=XlHvvZ`;3g&nSv!CAq}zx6jKWr!7;UfQR#u6 z12~I7v@!DXF*mRzf|Aq)U490}0!Yw<($xh9VQ8SC`U|2_>4Q!HIG8~mTfhkR*aApO zT)-#{QKJ^19l!uit|0ybMz9+}PS7#{I{|77$bHyS5{m;fG!T_OXciz_R-g$Ae66 z>@{edA)JXV@Rc4YgLMjn%AEy_!k}0KB^5+^T)?Q*pagNxhPmJ(3gnOr3}A;`U;sM= zQHp&?2B!g3mxKHYk4?0+dB8v#l&oL!Gk;)a{Jw32MGlRmIz*OU|wJV2PPu;LHQVx|3PkrM)d=6NGibye%1wy zLZXmle?d?Yl%YTl#|RDsh#R4%eGo2S5Vq{$0To{0LTrHm1M31tW=wLe9LPvWnJIK3 zUz*i{87Qbc>pc#~rDN>qAfk9mI z1xT-O!2>>)3yi`Wz$SreSKg$FLVS@&KgrL>|vjhfEa0_n$`X`(K65jjpGO~Ai_tyK!FGB7J~#H;RKj&@e2vuV9|yH+(>2$2ylbiKnDaE7#*dU zHP|>L7{nhaaDi1$Xy6jQAi=-1x}U(Nc_NU7yfX80~Eju95{s^z-=;U-~j7? 
zAiw~MU8u)DOkihCU|?q8KvwC%&X@qIEurd!9|$mTNQjBsRETjXh;c|TaI6qz+`!2E zft6!}D2D{dI7VShZ()`IE)EVZkU)X~C&>H_oE#aP91;vHAHeQO;9wMApuh-H@ZkeH z*o7R7AbW)^R)7Y=1bM@m7!U9>dN2s_2{3>?363451a1Y8e?jgL4uB`44<2G5OA^#T zMu1onpmq+}rVj!P92-Cuv20*u_Ge>_5EDKygOOzgJEN-v^9u&X9X#wF+~5oe$`&iw z!P#g9J4Xk%v`7axI6o>FFbG>(aDj{473?f4xLGPV#5V{qgBxX_(%pj#Tvu0cFzyv) zmJkmIHQyaPm>5q8fC|wH4#f}R`V5RW7#S^~r7F~PMh8h|NW}`RoIts?fY!62rm7g)lq^6E6B+Q0>Bi=9ncDB0i$>V)O2XXp;Qn`7uXgsuuMQ{hrk;E z91${%8NAFdz-D6VUce~)-~=yA05e)G3TbtK0}RyqMe^|iMwS~gpqAGK4GxwCj4U(Q znB95R84i443I^>k!6LjM1Kb6C zFoPK+2MYBLCXOG>($YVe86%iMS)+mpoLViIS$=?05rh=p@Pk>{Qi27XRXdo#S)l`F z1xo`b%LN9G8_eK5{b2@J+YM$G4`voHDcoSdz_NjX#X^B)1B1AO0t4dC&ZLEeI9K6shKufW17 z&9A@$!XSkVEYea6EYgw+EQ}TG%mQqntOH8l(9&rEBTEEGjN=3^;|f*w4mQRe;KHzg ziKBxA*6JJ0t*WV*fU^KJb{CqaRby{EDE4r zB#6AgApYP1b1*-s{RJtQkjfiqd-nnZhl47|3~oV7u2f?Nh7BvYL3JBwXn-Sw13c7c z(7?S6Vo`cteTX@0_ zMq#VhqAU~mSOgeYGPqbk?V|!l5Jv*S@c?mzC;VUl4YmjfFbGfB!2lUzVbNd~wrOT( zIl;iPgM&kZRopp4nq>hqC}4yatN@h^3@jF`!VMDO5=?@Dqk|dbG7Scf3Cy6d=#U0E zy@Hb^LmR}9U=U`2G$d9?i$5@shNzV{IKjxmvVeufgOR0y1>~*=tRP2j2mm>V<%Kq@ z0B8=IC4-UqJ1egP6N>}`s|Eve2CJ~84g*VuEXxN|kb^h`_{3c@WLXY?0-I$AE6WE) zo)b(g8)SGuu44j`54eSGW^1r?@CjRO&|^8k4zjC)f#n4QDD5~fuxK!WL?(be(*d$v z*x&)9u%!ri2nW=t1lc3JVFDx12?l-vCRSkw574L?ivwr|jv4G+;U5*u!k!fjEC+an znFJWbBNQ0KU4HNjGhJX5eo?_9?CHSBvVa9F<)Og9c!3Et$i~#cEWE;kNyKvnD4T&K zg+aq$o;%oBIt0Poj2rC2o-;UDUa*0=KXxz+ds^$VOh8y-H-TN4i9=9$hXJ#&=Lrc= zEyX0jzyeaZLr&PjfB~%W#so%T&kR-;1tt)e#X;WClYybYR5$`6&mv(r*^_~R0W=c+ z;s7I9O@xK8r-44p1y>MH+{1t&7_yd?nMfk@GMivPMmInsX5fTh6KZJx|%mC#fmIJIHy(|-0#aC1?id%3f zvOHj6@n8`CkiaN>f&<*p1{KN^m_fQ{FiSu1U}Tx##?J)WV)Vb$P^E~u)clh=BSRlM z!y|qMfeJ>(1)ve$2mjd_PVh4bRWLGs0Pzp>u`^uYXJ{{GU=;OWU|`@7U|@J5z@R1K zBEX;}WFx?!C1@eQpaSY?3o-^UsDNsIL1qC4WicNChU7=A457usPHce!3=C>=3JfX& zCJGEe>~Q}al>Q!3JfYt1_}&;%nT|;VhNI-xAWK>{3gGv#T z0K*~rhSp*!o^L#2uLT$w9tkq=R4_1sTw37tff?kL(Bl6S7#@QBR3*d+nm+jd z;IjjxQfu)xmy1j4k;cQ@Lc*4&hP$l@Efs=vZ|C9d=3}^Tm+&TYn zH#aadyy9nQs(8t!$iR@mFj=L9;lCO}$0W-K 
zCRPOoMTJiS%*L*jY&k-L4FU{~JVgvl4h$QEzJOwmg{O!?{HnEP@CN~gbOGfeR)!}A z3=^FS7(i)QrlOajo}obW0|Ubi1!iF(H3mi@_6JN{3e1*FH5?9%JPOQaDs`L=jEoM< z{0hw6CQLQl1uV<~%v=J@Tmek{0?hJ8Of{_4pkkQu9TO81L(?OW$JqXJe_&wVz`&)z z9H~;r{GUSs#N=^c<}qM)WUAq1FanimToYJ~n2OkU>oRX(;w@n0S74SgR4L-Rz{EI# zHRX{!15*+A21ce23_JqNyavqd0?f<~jEqc7%m*0ERO(n1ScOk9FtR8x%Rf_KmUnEf z6k&WSkP6Cti8f{CHIl8G^ve}Dn^EnOf8_KbAg{hvVmbDDAy?QF)&q~V`Nfb2t3BmAX#~ifr*trD20zf zrR*->B37n8c7_x33@Q~rY`=0bgLtR(8F;D~7#JcM7z&a;Ff-iXXYgIm%v8W`#Z>2L1-d+%%Ok#{Udl9~hJmSAJk-ZeSGI*}y2o@qo>6XBArk zBjZ>0z(*h_-{oMM!1$3PiH~7P5eo}wu{ax3A3wtxc?N+RCZ+`(fe-%kGn|;fD9<#3 zk(;?MkZ}W37(atb8S4Z_!4nf0jhPst9xw`BXkcViVq$Dyl;=-ilrv!}VpiZ}Y+#ab zTfms~h@ZiunBfEKM+SxlW`-C144uU+p!;xG%-9oE_!!tLSVf!}gT%BS$tN)LKVanN zX<%f1z{K3Z7y>H4FHGQ`3`(FSlK&MLHZU?x;9}m$CM+<4QBbItL5V@pf!R`^N^k*J zplc;NgF!GG(*|}&rXq2M1?-Ft+(96h3M}AKWb3=YVW?6iR>&+U^nvN1JcDYPgcw5v z1A`=!0Z$nVPZfh0Lrem5pgSi6!+}W*Ob?j+c`6vh9Jo0RT`SpaTp0}*guXa12o?l0 zF*z{sG_Zw(3eF!4jJyqO{2$o;7;PDum>3iv@fYy0IdD0GN?1M?CME{v1~ws%4Ghd1 z*m)GV#aR`&W%(7j4Vh}#5;LRK(68 zz`(}Nz|A;;Tj)swBVz)KEkhr_0JrcF0Y;V&?DB6uu*)+(U=ogcz{FI*%=nv`rGT0D z0F!*@0VeKnrV0jzpBET-0vP!(@bdC+;Nmr5s^LmtWL&_-cYv4w0F$*NQw=M_Cnn|s z2EGTpl57un`8IIzTwr3X;{y3Lfthg=1M>%7L#7%g4<9xGW=ZA?ygUXh{0Erq#kdXF z7$-0>ZQx~05XgMQ&%ji~rohd6fr)1U7jpt1qXWns<^X2?3%r_&jm6uwFETK^lw)96 z!N<^8%qsDb;}VFsh>toxw zXF0&ZsKBt1rwDwXAVYz}CkL)YDl?dv7#SEC*tsS!Gz!dMW(IKt7zLjgurz{AD+v1Z zL2x0c#$X3kD+vtzVmtv1g5MMv8U>12nb;W^8XOo3Pw+Fa7qPPbXPeH@zzV831UM2H z5IOO{NVrp{{jqJC-fPZS}yQ2s29mH z3teE`c!r-rrGi-&B)fn^@KgbZ3bR51qtFcpM#caJ<^_zN0vxik8yHy)7?>U~stZUc zFbF+5z$v(i#o{AV12e-reumCU7I6j!1(yE|d>@!xd5RdM6Brok$eV$3N}^+=ENuc49pd*8w41c7?Mu#Gq4u3Utm}Kps2ve62Qvj!1RSfpFyyQ znH|)i`L78|)&30XMa=BX6W9_#kyypt$G~X75DbdNvj5C14_JgME-E&(P8-CC|&C zD8Vp65ETEi6PQ^XL>Lp9nF4qMK{b5RA5Xys0a4e^N*1XL%=$~5E-*6)N^fA^CiOE4wWKsYFlLCvV^a5tBCoB!jf;Su(#H0n711H%L*2BrpP#sl1p3-|;-EnsovsbLV>z_Kvt z12?!=P$Znd#Q1?t@rwQgNf8GJ2Bla04Ge~H3=FNjznGu!Gpwj&_zy~ME{^gX42%3W zNCq9_XXvQ(=bypC!N9@r$(g}1KtS%*2L@r^087RV405jyaPc_s8?{t2D6r&8FmNy= 
z^nig;*dj^%#ea5&3lA9h6Zkdx7x3`%7f4m}PvBvcWSzjnUmztd&m6$d zz+WJxDaXpd$05MOa{wgFC(T+QC9hu~WyV&(&tD+LY0Apr{DCR{05ij@_yQ?@{sexT z5KspAsldg*L4c88L8y*rgCviF5Ql>--vuTf11WozIyMC%3H}RA{QLq^f*cD3c`h*V zA7GMYF%V)BkP_545MnG6Vs+paOfkDJKGlzKtZ@5qegE;dAF2~T~|D5;P zm>t-YT`M^m3WOTi7!U9W3MDWy8gOwG@CyYoaHzOm;{|f@FA4Hk+8JQix{X9iR#tBS} z8)T%p9fX-ba4;J%@CC>*{srBT%ltu>QGkoFfI(j0frUwkRe+VTfPrU$Fuw!K|GZ?L zDh6SOBn1{G2WIApOe_nSxfLY&6U6p$FtGC-V6tYaVNqb_cTnNxOAzA`U>D-GVP;QY zW=jx@W@ccj;YkqVGEm`Jz|6COlV<@lKLfkIAoqGvHU@U40x?DdNk#`2<^%!e2`nrQ zl8gfE{1e0&IRw~+&Lps~GO)`(VPKb6RVfKzU_8K|qFTlx%#iRw(21W#pn^e&F@d-6 zoIWTjSc+YlFR&@PSH5Cm6<{+|DPa;2W^7>NZxCVN|G=uxmB1PTY8CxBAi!_H#lZi8 zhnIhXWIVqC^ZzycA6RX{0tc9Q`8V)K^F3hYf4~D`{;%Wuz$&l*ft82x0}uZPRxVBh z8O9H+jD75^0&E~{d=I#J0=W4bzez#7Z{LC2qE0W;GA1HJ`(JRkV@H*j$C zGqCaUKM?w#XU0^)(!i7WM4v&RLWn_;@dCH}u@Bt5+zf0XPna1TD;1bn4+sj1PmmN! zND$&VAY`Ia!gN5GIe|xzDS=0ciD`o|zXBVtp+JSu2T8^a49p6Wf)^4MnG~d0Cddmj zF~}J*6-g%uGZyeNEih0N57F=Lr zcaRiiXJF;JAjxt-o%H|{hXW|*I(W1dGe{{2846S|IjFE4VC64RX5>h)G=uKZslKGYIwbCvg1V!?=J|jHN&>M4*C6 zn2AA`|9}(={{mJ`eg|$|{ttZJd<$6lKk)tU;a|XN&;P;rzm7cf2jdui2X00o?f^qp z2X3YYS=JA{{2z?H_!;=*85gihaV%hEWKdyzpwI8XZO^}em6tz2yqbRkt2}>$R4?ZS zUPc8i_607C3-s9+xG)Epvn_C8`XIolXu$Y@iS2^`lLH^q2LUbzJ|+fcMh8Cr1_dU5 z13nJk3ugQem}J4u068szLzjPor7eE|w=Vw!0bBkBtg-wHg!y?s2=D~h@N8gZdf+Z1 zz#zzTK$t(loeNY*GB02i<5|EO!c)W`@!!C>@Cm39$RyQZqWIwB0zq|w3Ih|y4F;2# z|AW?^{eM#L;1xWRf#Ji81J1gy^b?#6nTl8@)G=NVV{~v}+Tg&Lpv;({C&d21gpq-T z?Smkj0t0^l3y1av{tvoL3C1c9PE256ORy9XPGHooU==WDGGO5^U|?e0z#zzO=gg+S zzMgd0l4-EVb3@YLm7#JOyidYy9n6MWx zFbg;^7l1m$mP`zc;u{(mSRPn$G%$(_d=TPqV009p(7<5CRK!x8!^prW=wHCd_`#H) zfzd*p%X2#s&sK{ssod4Sf6!3{Haf4NT$;4Gb&|jH1#73}UPS(u@VV{0z*N zObrZb2Tp>zgADu(%nV!;nD`l(HJKV1#2difaYn`mEUW^I{0%H1rQ!|^4Ezc1yetPi zg&8L>fcv=x43>-m)_e?s%IY-1K$A_<^<4qsRBb7NVDVt zS>^-=5itet#7FE5Dn$|r42%!BKuUxi93a+;gBsyrAt4(Dh9HnGu?g1f2N+l%1TiKs zFfrIEvNL?J5@8H5WK0N31dpK#CNLc20Xg+TKWHdOvQc&sXkaMAkfFI!O~_&+mjDAp z1|vgrrHUk@>p>7Ni4oLq6PGaHU@9;abt^Dsx*%sMP$m7roj<^kQ8|F;gJIe;&}f0w 
z2X~=U33?(53Jfd^maGky#_|j*7mS1@7#s~}Rs=&u#`7=Aj!tmAjtnflAHB_4zq$G^L-}f2lAXB%or7P80F-cm>3utctzL*n3x5O z7~g_wTt)>3L2dy?mIQe-l{$_OvWx)?`~k|mJQEnW9E^A-Fj%Y9ab5`Kn!sS9QpfXw zgSml$`8_l12L@^01)$*|Yn3|o2S!{842%sPECHtc2P}AbCh#$DU|}p`NaPVgO z(222t&4{73IG;sOBEdqCQGg*FbPlMv07E#aRnIKI5DwZk!Y;rdEg--U4w`!BRA7*n z5MU6I5nxb!#K^!c_&}J2{{t(lI+M-^R{jHaOxzz>*%~7GKd}D)uPD#-AwocrN$`O> znD4^hz{EFSwSt8SWl2 ztOYzJ{0ss`3}V6`?Dz|KL<9?rxtRhOELBRF11#*jid74E_*ge6Fct8~s21?Z7ZmVt z@C!)u@&`!h@@$Cbn-Ievz{AVZpv!bXpKF0WpMfTSfSN3SfU7kBf)GZo09XD7CqcFZ z5xxmLoEL-`6{Oi6#F!bJ*aW1dnH!w=C&Z|;eh}tgz{ANuA;thSPOLh?hkpYP2UAHN zTLCjasPW<8>d7x4Z7;}dFUXR>&vSsuP^FILqZ8W)J}w74{sbd!z5`7B4tA<67x?%O zFzIq#kd>Eb@L-VUxS+tK03z86`1u$FITfV&4=@?{t5&e6e&Asfda@wa?ECt^3#2$c z@CcnKNDT+|uEZAbfa6G50Mr`tWy+~xDq!P#ASAEzfrpQ~fK6VffUS~$1M~m8{2RFV z{CSEPbPRZT`3u-2`4!Z4`6pP*@*8meKg)lBi4SCvP600)V*y(@C&Qfo{|ngI3xxR# z*m8Lbc=;!!u?tFGkV%$j01XLC_cJaKb`C3MV6UFRxIv14K^nIV(}Fa4rUhxzAcDIE zJVY$P$nZd#QGh{FlJV_M(1@@UBZGh}sHl@>i0d0c5C$IwM1k0E3AR$OuLS1`}OI z1_qE46Fo)-1299Mks(2re?oeLl1h<734OvV2TuT38wi zgv39{vNRN^buvs4{va#Cl+w*4z|Gkp2pU6at`rp8$jkUaE)Z1m z%Dgvc`5@=tRm{YvAi>LRz|E^5!T*4dU0(i!`F~w`#s_?AECTWT5BPNX7YNGnGzjv4 z;A3E1Jc&g?jl)5XSHV}B`GW?30S7OmfG9_T1pfn2yM|kme}Nz`e}TTeAeXiXBNGE- zL69^zg92B8Adi7Q&jb-Z1re?ZB0Lua`3?v&&Q$`9N;)VoeiC6;NaK0n&l4cRcR_*c zfj?J(gDmp~CKdx7mJ3Y$2_mxm3j__hHZZX}I4}w*vTktZUm$48xIjWoW`dlkzyvu) z18)8Wg2EgV1mi&kl+XgfGJPhV0ycN14b1!-ayxedS-{M5fSG>-^I7f!HvR-f zCUB}>AoxFt@$f9>2^^rj$8W$X$NRyQX@Q=gM1TO3fh6O7WkY@jjt49p2Auo?62|-m zY&;;V_zT#Q_!kID2#N(TGA$5f`l!veK#=i*4F3W_WqyYW5UndMv_Md(X@Vfj1b)T| z3}Uhi1o;^Rn*Nzes#5mT801Q;EB3>g?M2rw{B&;hm13>X+M z=m}kN&}I74%EZ907+PGf!0y1nR4>G+z`&Sb!nHs_iKl449`ghhBNmmS54ns50-(Ws z$qJT&yhUsb44_d(rXprWX(fIJ0W;2m3P;d@V4f~(KroAd5#s|r#)4o*S<%bPIBI*euX^%J=LW`w82q`Y}6A%_r zn;??*h?^m_Sn7k2;JOM0KQJcAk#Il)j+K7m0*EIqZtOZ7b3?B>`15`vM9?0@9;1uLkXpxdH$kKnre?jPfx}fj@cK!vN z2|Nv){1=267z0JYOIbi1etrc*zHrqF773OGdPWVE%o}nif+msI@i8=1zOQ8xU|@Qn z%l3eUv4Gi-gF(Pph{=JSUqORWq+kLYQ$v~{lRyFEfqG|u2B%JbR%QnV75C1R{Y(Yy 
zpmn_re*6XuOpKR>m<$+r9+ZIQQN$N8^Kvp632`zUVhLd2O%UQg;K9SsU?k7Nz{qq! zou|QsZ-Fs$JtJcP7n1=y+lMy34K<*thYPHX54d?RFlg6t2dFag@h~wlFe->hb0nBB z&u0V8b4}>rm*zQO!pLU=l1gBZ=4}w+YG44*BO5U=3y3H)Rh;MG-_Xv;;vmAZ!GyU$ zibFtyD}b4;!Gwi@g}=c>nm>V=A5=|=Ur^`eY+#b_Y+&N&zo5JC0}JN_CSC?}{st!gNvahr z;-I2J<%6gri$I0=ftq`vnR91564a??^hZTJkf9^70&D;Q7GDd4Ykyfr)|t0J|+uLLSctM!tePE(dlH zpWlJ?fs~{JtE!}=fi#Z;E2Du3O9Bg%1FNAxmBf8%#sXF*fod^{20o?_>}nf6K5*s< zi02njV`Vg$p!kSiK#e0o#1T|4f6!%Iz$wKkpjNhs2~=fsIH;v^gBY9+YWhzYFZ9^( z8>rdJG#RJ~GAsO! zh&J&tsL3}lsPXVS2+OiMI5P(bD+^jI04?}qJ>bmDpvJaASdh6uh;4#UFlcdxI3q7x zfH2zxA#iU;>^~c*xCjttWpHKuAS5Q=pvwHfSwu#FDe)1g{*)^aViXW#c5q?4AS})9 zpvDAYaXP3mF{rU_5MfMU5R^_}U{Y{pN?>3OP*;7#J|T(mf)I0nFq;6RFpzX$U`!BV zKES}bK!hu8VJkUU6{u2;KC@lz>q_+f!U!} zsY=j5Sn!&j=7+TaRaEV5J@m#P<_R(pw2jfLGXeB7ypHP4L$*3{sMkp{sc~4 z{s!|{J_cn*TR~9L|DY(s8sN_Cz{$^0B`ME%p@xsY!Ca7$c^hK^6JG!a4+Agf2OjpRtASiue0FPgJMIa`(=Iy22RF!4n_wCMgdO7y$nnbgj-&LR<{TXGbF?? z32=%DC$KPmV9?m`TY!^Uo{c%7ieG?}mFYtuQ-ds{fhc=}EMtQ_qX2^xy8tKS1sRS5 zQ+XBi`5R<)q=XJk)nC!1pl>71q@d6LfrE#C14jV3H~yo5gHb?IO#1-4q8=Xuhl(hR z0SA)+2jeASA&!77t_fX1atxA{%6k|U6&VVcc?@(7Rf-fC*clyI5|4o9_7nve7#&>s z4|Fl9vN0@ZXZ+B^KA~Op2}400qXR?B6L|)e3TE~P6I`=S^fRazu}^4sDN8XA zVpU+YuV65q(8bM{z#w-hfkA}r00Ym2E=QpX4FiQh=73)QhAsv{jRyjX312?2@+|N& z6ROaZGnJIzIh^azTfth&&JLiI~sCRt~DjEkEI2UwjpXr~_#m&I1|44p9 zmjKfP1*QKC1x#itCCm-ofhR!y^S&tN12Z`ufEKc_GBq?XPGI8^U}h3vP|=ciFbu!K z&mbUSzyO-!HDqCQU_K0*@T^c)0u{xYAGDO2D+=0=fci1#_!&By8I%(TP&hay}%vAJYNCfQ*T*crZ#T2rF$$;I+y^>*J2lD|oP>?HbVAgu# zvVfVy+YxdAjK1nSB;FtErTa$sOi01E^#^E)uGNHIAuiz;2{WpZF(7GP!; zU{tHn1kFEovtM8cI?>OlQX#+~rnsR?MN{blqxKW_4P8tV7@3MVnHGQ+F0m9a3M(+k zaX#Que59Wc#<+l$Re^!Mz=;vOjzZ+Wf`VZQ`+r6TA%+PIqRtlr1rIQArh(iQ@aq8w zUjqYwK%}IUegT8lBmM>n-U3mk1_pV91_mz1K6d5+7A6Kp<_S%F2@L!Uj4I*+4GiEh zvj%4711$Up!nmdM7qIhuU|?)eD;)urn}o6)gFn=C}m;@L_8bA$v z5&i}2oPzQTSojyPi)kIz{bCTjgd=$mC=uh`2e%L`~qejP{1)X zFz_)j@++`PO38m<;4xt5XJBMeyYLq>@L3Qi#oxf7EWeduYaA61a0xw`z{1MFs&z%4fkpR3egeo^rVrfw0<4n! 
z3`SyF<5E3;kQT)Gvk#z#Mn6dx^kAP63k{p9jg+>5h z{eRHn&L{o~LfkwG{JaW6^8N}!3?`cFS`HlNLDSL|%$lImRZBrgOi@$w0SohoIcxz! zf}a@Jz{RT21qT-KOAf4)J4;y(TzG%!*YV^m;PtWZecW1PTVaSYsJ zR^GrS>=viXD8S6DP|FzLr1-yi0jm*PCEEe+RM0HF65j>TI(1NMUL$~kpFxm;>#1|LGc2Un@R;U>jB{sdC&wqE3<(J zW58@dmW2Y04GjDT1eh4***OjfFn(ZQYhdI_n9bx6&%Ob)+MCf~y%7Ub$9X1027ZPG z@lGlo%$kj=?Vzc8eg**z2L{1o3j`P$<}-$}^Dhu!P~s_CAiy|*f$2e=sAd79%N2eG zrkOig3K%BM(9i&tnygF<_#9hiDrrn&U@Vx;bU=W0gSODC045P81`ehGCKV;c0H(wz zE)0?tiUCZ5iUCZ4pc+gmfJsS-sX&&8fs;RgiI*#37Bu?>`Vr{QvL$*>{O9Cm zVqm(m(2#*aW~~w%69c0*2RjpkJhuY_i*SqugL;yvNJIVP7qmc z%)lTsmx1{jJJVV&b|wa)xjYQK4h%|bEF~EjYs8sV80AeI7#M^;P6P=HpK)Li%&=ns z)x4$-3=D$0ce(f+7?ez`EE$9n*cljIW#r93%H0zfc^w#pr#LW(3pg;yn>#Qt$R7jw zV2J~R_!|cXc?$;y21Qm*MF!pm2E_^15)6!YxLFt&RV*2LcC#@tD3)=7-66jLq|*vy z*$j z@(v)|*%ZP4f8f9%s1py??g-Le>A=9E#A9p8z^JRre1JjT$$^PMkwH*|g|`807mFkV z4+9HJo1P+{;2ahf@QG!@GaMK=4lsi!6CD_g?lUkX1oAL3Fh0F$#K5pXfrYVwQQU-q zmFYW!KLdjj;}r&mKn4be1@`O=oD3`+4h$?enDqa#IxsLZn(SuJU}a%oVc7vXm`-1r z?EnL_u!JKU%L)b_2hfHI{Wf+724;|_g$fdw7#SED*>xGGfmZJ8KVv_@z^pXGUz0(} zft7o(E*QMXCaW&UZ-tFt7-vSuzM+07VUh zxNQR?%K--c+q@v-e>gD68#pkqfXpuC1Dl<0$sqoNfl1u10UQ`}`9Ov$WI8Yi9bji* zJOD}xApP9@ApH*<7{oam807667#Q@8Ktf6{vML!kAOYOKAYawMz@p@l?ZCkDf{zgZxyGZcywyFv>eMFfhn(1_}IdVBlzA0!3^CgZy1kg3`$S$sqoLfr(LFj!B$> zwSigw-2ny`C5e1Z2KfMxuMaRP@-yo%6E1kp#mdAWoNAo)Ftvj-eJKxvaKlC>A!ZE*5h(#C z7A6M8G)4vvrY~abObkj5wY;ExO+Czztw1~u4El?hL1E5N@4&zmBg*8!;G@U@vQ7UU zs{;cIvnDt@F)+KRGB8L8fX>2G$zpS0U*7=*x3~a_CbB24)UEW(HOn1qTNGc2Hp`^bHhljPmvj zj12NS92i)X8@@X*F!Jd$H8A)%FfeE}FzA2a0VN{tN+u2mCLc!z1|7X}8s1_t4T27YFBka{7V1_nl7MP>#@ zdC+YqLcJlZ!UfAY*q9iE^c)zZCrC0dflrPUDh$31Dt0FrYB8`laI-KmF!O`rD87b) zfq~0`frI4$GYb;~PXmj-D!T&%H-`ZOHv@kFs5cA7Obn3oo+G@#Tm~2oI=Y2@DtelWDn^ReSuwXf}*%Nd` ztSjq1M*c(wC-&>C{0R(B-26=ZDGW|54;UxFk7|Cvs8F2A$iV-MQK>QjbS`qRnK~Qz zpyp~u{x}AE7EwkCBT+_vA;<~M9Q>eTWo=k^7%!q6%{+&RS(K9p?Q~{$CQkmvoQwjl z@+_8Uhcg>8zhGF8IF$BpfD`jq zcK)vd4s3r|Kxvr0gcFp88TK>s`v_XIq%rcp6R=@V2A`A6kpN1`)_ijXS!5ZxX9=?3 
zWdI$KEGTHjA0ueUwhD5Rcs1yha0C7jL2Ld6f<|omAg&R+0qD?hL;g2{!u&BpA^e6y zg8ack9Bf&P{HhG>{N_Sjygfq9{_LQ=!?*c8Ajcl_PZDBaU(d)V$G~`wm3x&Ci#!Xj zBm>($23{Ek2hah>Yzz#N5&_`jjae9&1Q-~&7#LU>3>cI%l^7-=op4;3lam8F;W$&C zpH-NhpNWARdb+W^DuYr3Gv?{Wy5N&zp{E<`l6|_dk_RK?++mdCjfEEIGq6@LDDW|T zU=Y>=*HO^ZjrCAYH`W24WGfJAz~HQ)gM7HLFna={aHj-=F#6HPKNLYn8#8J!nDQ#< zz|J-{0H1Blz@)^$1U}tZ;43eKBj^enHqbG|&?Alol0_Mu85kI5NiZ<-GN>{zK+ZW9 zOc4X^yHsHXORkh)5ESGC35Y-hHcBuEx_|`~AObrj7z9=M7(^M8m|*7~3xvrsOkrSP z2xA7FVQdGo(?Nkj;WqOIMmF%d#0o8}5)2|T;1Dkr2OoSaf{r8>{2;*~tPhq|*vM7D zB&uJ)#8SbeAi?dyASMS^!c@T^Dp$b(I_XL9f&qiDGDNi*uLlFmKG3nk3c-953@p^93}R~F&Y;3~egg(!b%^5s0ul@?dCY7~3<}+X5)5LRU?qajBp8IX_&{5a1cW#k z`6`$diiA8E#I-d+jWEHN5)48@ydYV@?-C5+!a^WX#gk0Q47^MX>GwH6M-^{iWPZ)4 zP{VD&z!tzL-0+b};VF*<1A73Y5N9xhf)k$vNL=WEID;ZX3z#i@z<@zvBR^FAPez4D z0#I=W7KMC4Fq=h!S@05sb%9kOUdVuf&4Q6-0>dNFA+w;fU_mPnKu5+hFfo7+fNj?2 zV6bOkVBl6^VBle3U}9iU0i8ALtk1z>SWFHw;aT3=qax4r2 zOfC$dqh;5yE^q)%l!A|`J;pc{6qX#2gK7mBcd@8KJX0eViYEK3e!gA4~l0RscW zk13!dk2x6fnE4ABEE)bV@X9bMa&fS5lsYhiPpjR*Ai%PTP4$DL0D}PYE(TT5S+xR8 zuh`gMpr2IR!1!HSLxW)l3j+h_*0v5-28JK33=9+47#J$pA?L%cU}s?1!Op;Nft^8z z@gs)-g9Il7!wGH%h8e;P3>Rb=7+6dgm}ew1FjRmXw}FL$VFxb*^9ja)4U7y7D1MX{6!>0t^A5gLETeS(pPrMv4CrUkQMjtVPGg=5Z@ud5Dhv)_=Nz200%D{^B0D21$G7o zi4>43P=JI-a4;|gK)4(!%pg4)0u1310$>xPZ*Vez)Ez@XWj!xiZV2BW45U&tmke)FGB+D>^8C3EVForMSXJD8CF#u#^0RsaI zle_{86PLjUh8KYh%qffkp!+yGVnIi~F#KU-&|wAX1Sdg(3!I?S7L*P!$sh#9Z>TVZ z|6pcd_#nc>Ai%)k!3uK!2NkAp&=JZKEFeLMC)pSn0>ElOF~Xq2Cf)(zS%@$(6fp3M zFtYLMF!C8ZV5oS?!2E|HdjK76}O;JOmgM(oY z7sCpU&rF~rq+fuJk!E6G039FAk;A~y!f421qOgS>6n^U&q!TJYjs+da{e+MA1BZf! 
z$SDp+4K4;34h9Pjfg2s1JSJR<8XO^l0w)rfHBA(Fn8F#D85n-ZfI<)yKrxI=sv0aM z0-y_s|8HPm=;30J;07JG;Q}TLz~lii`36ifF*tyZf#zTk{_DV?v|LDrLFtK*1B239 zAqNJ-y+RCkxRp)|c`zuQ7xG{*ye-5~!NVZIz|g}3GIIx*`~xNv7(rgJ;$^5{V0pm6 zz$M5~%_jWf0|SE-FQ`P~VFF1jFfgoU6VCt#4?_YQR4ALBC5MdxTwDly7%+eefz|Aw z1DrrC&>_nLV6ofmEPG((frJ7BLo^3N4cJn!aV&p8heNUaU||65mttV(1kEIYio^hj z?gI>96TxL0=oB%K*`UEqh%98Q8&nLm4j*hL!y9g>6HjxnoB&-20Xp$>1uIA$Hz&gs zn4bg~1PYciutUy=0=3z=83Zg?!2%Kx^A9k9&c0y)+b-~f6_i654loFRfXE6k2)_^j z-9g3B&IUS-3uL|k_;L>h1qSdTfji&|H$Yh2oWd(0EKqSS06r?$Lx2Hf`D*Z?%pjHk z_*iC;Y1|9~KT;W(!38eVRz~nq*wA3V0X7RN1opcy)XLkOpo2|8W`L>}urdZ|F5wvv z7U=R0$T?XWV9TK<7=Ti-zz0@F<_1QPKOx40V&ekDp9f&R1BpRxI>3PBC8+t;;DeK) zws%16a{&`*pCZV^D8c+fl@SyrAR84J7%o6;bO4Q^GD8d!03Txq zI(7~c6cB$;fJkyP2z0Q5P7?vy0s9LLO9rusARayjaQ(igHl0hD152L_mvjEMWnO38=rp{((jG1mtuDjcAmJ zSRup5a)Ch@6swRx2Zbv*&{=%=K;d@m2PJ(_h=W`YvHFLMAf)nux)jBe5c^Ps zAgK^V2o!w^;4%!76hJ`^ib#kUH2Z_H;v3LO>nwi+Aq6UffDot-g!ucz0a2*G;ZXz& z0c<6@H8W<3J^_5fB*+v2NRoosk8qO(o?zzz#UYBXAfb*T1POE$AxM4zmEX`%5P+At z&@vqy?ksnN8H^Me)MSKLyb%B$*7rh35Y%|-W@6YN$iM`eE9BU~$MAsxL<%r)OaQT2 z8u%C*K=mDnv4DZ$0|STwF|PrfJw7lf{9vqO0BzX;T-2Ic||P-}YwAHxAgh?5zPfDLqDU^u|Y!k`RsxIo8NX7&IMaNPz@ZafcI#SIi- zl^VFw!f}8VVZ{Yjh6Io-djTuR`~$2E6Bs!TfY>YwlA!8w0;Bj2u)jbx8_NPn2@WZ( zKxujdD7XYZEM;O=U|Q3g886K z3Q@qaftBR~14jZY!vO}C09H`G0abGfte^pt0}Lz&UK#u3u?^hh=KGVw=o!e#6f9Hfq@}_ zktG9k#4ad3FOh`{iWh+6iN!z`q7`(sBdC$P25OWz$jt)a)+@-`9~Yq3vOEC$4%C4J zrCdnK4304d8wrr<4xpljB|-vZET~k0wChpo71YWCteIsF*o_Am#04OH4$yrF3Nu*x z7?{9G6I^J5QzW=zhSa&Bf*Gt__=g8r{{%*s3Q35~;x8b93{PX=0t=J@APqu@K1dM? 
zN{f)v1C#{|AO#VqWQ14^>PdjhK^6mO=rEjs1+M@DLj;UHfsy3`*cqT^xje`j6Bvb8 zd;k}n0t_q<*g--E7+BUoohaM^37i8AEDRhVIZz&F0$noFz{CKynq>o+A;7@Wzywlo zfISc1m&Uw3>*rQp!C0h3*^rR29^a}ptd|HCZQt290D*?Ac`(9 zf)s%R6v{vr>5&KJ8;=4uXiQE3Ge84n3Cy5io&fFAGW-Dzyn}o(fstbYxL{kr1}azK zYz9V_2h5-=qXZa03Z%I}qyPg;jRMOB262H6pipI*zy|UL$hpi6U=fZ9ps^j69daQ5 z1R#uN3}9qNHtqle$aJtTI39pp1~vTvGw1;51I&=JjAH}HG+~Yga1ekhC3#5j2)}@M z1iAGF&S~KIf+bS{28JK1pwtA8BZd?hTY!PZ0G2Bt3;_n=2*|A|&|C-Yzk~B1hXD)2 z0R|8W%1+>+g9(f*ca%V(2WogLuz)-}fssJ~9HwATvB)Tc1VNoGNN&SsCkyzfd6pVw zXk=}0aV3v902>}0BAgk0~}*43m9N=2pX*eS+f9KYDsMt3#VleQGlMuo z1N)2^Mn?AE|NryLFfy|L|Noz#8?1sL5q7#4hF zUNE1b#h!tQvC1?sAMy99%wC_{Mw!^V%|3jbJc$TNyCqzf<@3NV;+fUb6H=V17t z{efNSAHxxOCN`Gu{0t4(qZt?)gcukmJQ89QXJBA*_xZxU;4i}wW(Ibq@BjZZXPUFK zeCI#FbTe9n&5`p1^M)lK85aC!I3mx$#MaKhz%XG+3~K@d!;cRv9~2liGcbN+SNO+x zLLM|$EzHKjAk5CeF!=+!f*eDOJOh^ig8~bq1%q+edu{=S$sgGr{xKYo2OXa)%*nwZ z%)`N;{E<`LfT9JR#sOyFvrw6b63K#c2!(Mq!QwMivdmjUU(@;usFtGcbHeHvkE+a4@!fWLof_p(UQd zf{{gn(U6P5-GPa50Yl;mc?NEVZ~@TiOwTzOggG;qK*q%-FtXfWU|GS)@_@nNulj=j z4;UF%sBt7P8ZtAmcQ7h_ z*uc>ApZ(!~h6Z_t4U7jA9`iG_|NqbL@Q8swgK5)-fN$&!{}^0k7#IbZ1$@4kCN!|m z@L^zZV7hqXgFU;#KgS#L983)20t_G6nHktw8yG=HeW)`u)C>53V?XeZX-k4tQaHm5 zMuX^m5{Cq0*crrG8<=ywpRg-1vlK8eDKHx`JZ3XsRPg!Az*2C;o`aRaowb5NiG?kK zL4lc#O)`+d?<0Ft0^>&(c82GS3=EnUObqI*42=>#3~cTN3|}}uu{+2yY>{VRW;bVL z;1KWuA6uw!=UoFs!e90Se;EESFlu!C`oX{?z?jVN_&+J5 zU^0p45J+GW5(2fNgw!=nKeGCLV^{dYa73O%NXVQ)g25n~UC2Fx$&f*4y8x5G?*_() zH{=<_gxFadnk0P08N@l5m^2s-eth`Qz!LODo`GHH04vA_(5Y~sjsmmXkpxy24;Fz> z2@EDjm>B|CSUOl(JXly7ScLg!u&`vXuqCi4D99~f6cJHiU}j(u78PI+_;7$J;eb2? z!-w|C49bimD_B_!SQI}pFfg(BQ8dwE5Hkm4a;Ftkwg0nQRu?R5oa4@ng0A&ae!+{YZ0y_2J z1T%vGqb0+K=^UU#{KQ%uSXnw)SwI{PlP~rR0*ovRKwRO!4J zH{=;Otqw36Jc^Nj!T6D#IX9Y_&747j=|Id!ezkuGCdf0eFi&q_Fl1$(ZoweP#=)SV z5WRqvp@HoaGc)sa1_maLXlCYc1_kB=F)Yjz7z{tMFFU}oh?QkI=s?YAHkJbn8(CP= z8D=mUL_cJ2aA5kv$;J@Q%E038!|*?xm4V&eho8%xp@BIg+JSii3(HCd1_r?xW@bJ? 
z2L>5_HVzJ>-005-m<1#p7#JR~3X5BSD>efLf!_}p94^Q+us1MVV9tqVWp!s@U|kTy z%zRAv0)xya_J%(UC*(C)S${iOJH{|G|Kwnpz{(KLz`*Pp{pA58g9Ec7gSi011lIrJ z3=K@KW`YU~h71BR49v0v{Zh0y^Ru*q9fv1v0R( zGpI)}uxKzPd|*#_#&Bit=hvji|+%rWC;{m$RO zBo)KPy8Qy<#z*W8{}>O*OK`AmH()YkU?{x7AiypFjzU2OMg|2YMF#f@1_lcT1`nnO ztZd~B3M>a=KC-L*JJ6EAq{Plx!DPtJ*s%h1^XdVG=+Ep){~V6Ub8xU$uqrV4X@I8> z1Q?PSSTy{;@GEc_GB8Z4VA#mU=FYHy`9KUOL%RbbqXLuT33&!~2L?j{KUNNg1Oq=7 zHg?7n3`QT>6BHO0a+EVHU^)=P%G}Pu=%CNS&h&vn;GqKB!z=O}9PH{04NOu#@~rI= zj7sJl3Ji=5ECm{e_<) zfnAYd$_54n23AdZg_!qMjtvdcD?WT>S718;8VR|`z%b)IGYc;RgMiO_a|dRH?C1kb z3d}VJm>3*IjemS#yTPf%!F+&e!597qj~Oo5Gq7>A*#rv2u(PSpU{qpb(_j*icVG&x zkzrtGQ(#hLE9ziSVrM%5TCU>44loENC~&dtU|{6{O>eL|a58@2 zQJwK#fq~h9+weU*Ljnhn1cL%U!*eDBhQ}WqSQ!?8^fA3RP?_*(0*8QZ1E<4*cm{z4 zhCl(I_aYyDa7|+r5J+ce;5Zbs0F-3-7qH6-GCFWFDDW}lIrA@IXAxyzz%DGYf`MfP z2h##}V|E5f2@qRYVg-l82Yv-R2WA!pW|jm#t_AG!@eA0w0L zj0#)~515!87z?^HggM(!Q$HtEg3UZ+K=6|u{KX@Dj z#M~Jfcnt-`j&m?<{K4SxN8t!FgP>Tu00T<`harO)y8r`20*fGH13yCqqXWZZ2EiB1 z3{UM98Y&AI84vI~f01VpzQH2E&cVR&fFHDmWdUgLPk{Y^paOGtw5dRV1%m@W1Bc%Q zel`ULVRZ!tr3Q5c2F(st1qOM61>D)fVhb2qUN8!a3o!63;1(A9!Nl@{k%fVq2h8AK zz%8dQuzUe`77qs_69edyU;##!A51J9jQk6@O=LM1a0>_oa56mL6I}Fw*^ot`{Rh90 zfTWS53@gL*7raUz84}n8gl4drDBO8BgH1u?gPJ7h{4yqn1a=k)2A&Ki{sh)o{scy5 zarOj8CJsK<1V;V@R##@QL@EmhBToV=PX-f90xOFEBTEJ7{B8aOR$T@KE*Hlu@(cw` zJQ+*^pBOkz;G7Q&*c=YXGl)pA+OW@v$zbA3U}QMW#&&|8{{WjSe*z;v6Ub~vUseV- z<^m?R20I&xnz1;Bnv=7H53G&&a^M|A;(;hyeS82Ih`{2TcDT z2r?uvu{>ZB7G1#3z$?V?g45vx`vThqOiJvc4>$#cI#n1~urqP6@-wgq%X2fZXv)Vi zu<$T5NHGbp@?@|nePnXrRHD)53&P{u)0V9QxO1_l|m36C_G7$%4^EZ|{c zkY+Jp;%DHO6JkEV$gjZ2EzhsO$Rp0Lz{sV+1Tv*gfl+|ffQ?6kjd6h(KLbA_vj!8N z0|(a%anQ944E)@T3mEtr_+?o%n3O&;C&=+wu(N0|DSTpf;8WpeFkoV_U}rI4=3l_c zCtsc*$8Gq5{lEf#MNxqnd<+ZNI5n7*-ZCf1SPDtBN3ej}u?ma=0`tHwQIls;U}R+i znZ=^OC@-wQs4o9qfsvQFfuGrci9dl;Sx1=Rh9rvtGvfh9{s&Ur^79f{dF1CMaB>N; zEMNwCD}j?yUM_)^SM`YS192sGksabnLJ|Ukh9B7z5||YjBpAe;1Y`sRUT{oc{y&|O zfkPnXTZ1s8f}8?JzJeUn2{us%4S9YBIbMbX;tULo><5$teykAVI-ryw&!EoO&R(Ht 
zs0Ql%I!H3MvkEXcd}LRMQ(!QZV00H?FcxKOf1xDk&cVQVfXVrYJcBTUvak??vasL_ zC5Mm93iA#yGD#>Xg}m2L2n>0z%E+FUDkE@8D73L8JFFZj=KM4m~M z;kX2-x4j(P+ujax2ZKNhgB(+Vvarw$Wntk7%8UvO0yh%41j7v&ge4;sg+(&BK%F|V z1(F8r64N(`8$^qXWGEZEycctjTduSpy$t$qoqtAqQE;142w;nt>h`?9A*pq@4H}8Y~wG2_9zP zR{5`@pu*rCsxaY^07Jk5p&QYE4@fy4VPfEK(ld5=#9SeuAfvK@MS<~c1A_tsXqWv0 zh9y5fFsOY0|DT_MMT0@W&_TuFKs<}g0Vxv(hBq1vO6)QQoJepx(9tqwvEA$~+c~EC!5>4UA?d*cmJsnHF$@xCsnmGY&BDc(C#(i18YJ zBjm^CvKEWZJ>OpTK0wx`0DdE_?Ex}f<2HjvT@%uEhK3qH9fNf-HUdBDtafPtleU08vElVt;Fq%)ZD1Uu*~dX^Q;!U`v(866lI6qK27 zaImakW(i62q)&zEuHwT!R6WA4mSza(2PFNr`L5*zzsFf_i?KEKl8%qSc5*teZJIeuPC3cnw zc0+cS_5^k%c9sBkh69WW4mTDsGDI-2MX)o8#V|A|G9~b{MX)nSnI$+pVqr*U(qLt5 zU`;&2#K6i>F2KNYgG*hUg@Ie-Kmvo#j}IB_0$&)E85_8()lC=}K70`GU^e_9zHkAz z7Q1)_J4*pOD9cV@=ILN&Y+&SIP+;a@;&))vW$9pMaFBFiIKs{_ftkgEiN%A7MS+_o zf(xY3fsx^XJ~LR8!$-yg{~Xv@CNT3iFfs8v2r&wY7qGJ&VB~jDjAiLyW_lp4V$T3p z3MwQfFoSM~b6{j_;1;{_fCV&Q$HB<|fJ=zMK}ks5f}N#@*uk- zK!S!J*&7&)SRm#kfU*lu2PhdrM3^3Mfh}R_V0L0=aAyRa$HK~VgP)DT05sXbV8Eon zz$77{#P_vY&K|t7LhJdgO2Uu1?K-k4XK-k3t z%u5gucBv2$c6lMN$mRWh28Mr(f8<#h1leX7Fuh=qS7^}JV_d+>)gUPMt3gnSo8be4 zaEJt>aEJyYn}Z;e10!36AWMKCi-I6i1EX+=10&lCL52kOV5TGU8N`1Gu^GvtJOm9G z7#j8^2|RirAkO$efW?DXL4ePIjp>1a8smirtV(=*9=t3X3@i@>m<$-$9|#BuTwvu} zz{1eZ$>m_mkRZ(RgPG?6E9lNW1|DIS7iugVJgf<<+&@@Z95@s{GAZx~*ej@WJD3Wz zcQXVqeqve>&tSsxfR$wi3(E^u#s+522daYMJD6D)urNF@=65jVU{&B%5Kw=hDiR&R z$Mk`b?ExFV0&hIi2?l-z-eNg<1zsV61FQ@Uj0_KiS#Ge}GYF(7a54*t8}PE+U}d?$ z%Add)C&bgh%(6j%k%4*I33-P9Jm2{buA}hQfQh|oF|fR(L*pJ@X# zO9lt)0S5jCenEZ*PDUZI6FiIy7#S28SsO$dI#}6W2=X*=fhx`oEc^_7M(huGK$YbL z7M27q5Mze`%LdR*^*lQSSTc-R8aP4X2N+mhu<$pC@^e2>k`l|{H2lcF-~gKvv)Be^ zo(xWbXAPV-8{|Rdbp|Jk11HZ99+nJF76k?fgHeHj!9j}q2M?16v$4bD4-KL$1)NMT z*x52bcR!gNk!L92WYXYZ18ZR^;N;2RWMSZBYT)2!;IwCC5EVG^fSYN86(a|e0dsbA z03(9~m%v{I4kiZ<0agYM77Z2_0~H1abti^5@(e#DjB}$|Gnj>0CI~VXh;jQG%ySNcaRonSisC6z`>Vb#qS`kz|bJZ#$dz3|vMn_6!o-pq3`Lamafy0Y=6I zJLfO*jG_XJEE#sf!Vfr=KejL6W`4jaEU3XHEKndQ@Nfa6bBzoGQ-rjzcmrri+EVE) 
zKLeW~JF~lkr~^MkL)8gJVVMGH#s;Ru8}bZN4A&Ph3QIbG79nskC2;uLGq6j7E^k^O z$e+LwFW9cZ$)CVs&cwjRbApARflY)jfkQ;HL5Jyq3TPKh;sov%2FY|rhDM1P1wn=e z9)<_%`plvU9E^rHqu)H>c9LUY6ZpYCiCy43zXRiqXz3Gz0WF)@7yMmdB*bPQ>Da(7 zozB7Kz|X*;_kxkJfsNq+3!?;sK;{hz#uGA31 z5qSkpVV4CQY~Y2)E{uPe7%m7iJpl1rKJY*Iukc5n!N7&-1vA?TPW}U&3j7ViOe_|h zOdQ;7CpcLSs4!V@^Dhu)U_PM2a)FcQ1SjJHPWA=Df-)}{7!GhUC8)A25Ehn@;1m|$ z!6_`Zfm8VJ0%2j%8N$LMJ2-`fH*kWk_7vK|DJ*zHMOdJLbMiO#2ag%H*fTISa0)Z* z;8bE~Xb@*m;ALpAci?AW(7eDdEOUcTMrr{k^9N;q2G(-H!W zG-TsS7jO~a++kn9Bo)KXHl3lsk>P+;(t!*146JN3oHuf?O=sX>R)}U{Sp-_X>ELih zo`K7NA&6Ncnww>Xy#hPK0$GLy3<3)j;BcnF zMbNx}0d#|}p@1JJ2ZND;-$w-{1qm(zh5%*_KXz6PX9bOD0|tYRdWHqgj0ui{tO5+z z6rwq}+8GwG9Ef3I*$gVIMHJc74>&4uu~;}LadK>M6!_x6ka$F%gPoy$0|V%exgU-W z=A7(I4vr3A*ctvO=o+%Kx9?z3QfHH3P~qfU;K=yENr{2uV1%{*h;3VJQ<}G-P4sb#P!f z;lOagfsuh*phtm^;R18ujQ8veITj28`X`(Vn6sllusg6n-~x@qP7qXJWzQ^k@OjVA zFfpv*$A=H{3Xd5691v!b&{X1JUBRTp#?ZjPxWF;-f;|H}M_>bk5(j$)lOa2M6KJ7< zqcR7(1(PCsvjj+d1$fwZhCKs2dxHc6lLwu)xLji{e zLW~T|PB~_*%n{5=%*+~0j1OD{O!A!s6C4ph1mjJc@-E0SrnL*%~%;+*%TNU zq}rVXV%V80n1oq7Tnzv6GdM7@C@=`KFmSMBaPl|!=uQ@7U+|ov#D;<42a7OA0|!F@ zV*vw0L%=8g1rMB!IG7oBuoyCPnXh0`;$mLG!r;J?1e$STWj^3x$i}=p!VNSMD$MBM zVR$57LV+QXNh6w-HJ!mtfNh2Aqk4wukL+##4$J^uoz-puTFIxN5Us$#kio*xz{S9z zYsAdPZowe1O^wyM6kW*O;*n*g(eE3-z7!6rj85vj^ z6IcXpIIuE3U=R^kU=a9F%wWL4_(XwK;Lia@!H5bL1r}Bb2JQtc3=E762`+&QGv2ea zOXexah=b;>vkz!1{Bv;dXJ(L6V(0k5V8G8&zJXOa`q=>&1NIM`4UCSA>4NUw8 z7)()56ZdCeDw%*33` zs?-#P^30M9g+vALAz~lcu^%Fqw6H8CRX@bfNB`FxQT=$lhleo;GbACO9p+%CV4$ha z@L8VJqr(hJ^OE%#7{wS2d@}RI7%pNzH_QQaY?vA2Cng2F$A&S)F+Rh2XxIj%GsA8( zI)aW2<69d@?E}LYF0bSlVw7Y^W#wmL&}3-kW?mi7$;804LCctd;g3WC3lnI?KL-;7 zLzgu_=-6DF?_8kcqgaL68CaP>hZZr&GcYi*@h~wcc_;`mu&hXBP+<@urV+>FfdHx zE+W3OH~Fff#Kffli;GYBZIuexl1qEGkkj*TWpbN?5l|fR^+}S}=!YZfxghW7n z0S0+cvsiGGItSzbPEIBU0gL&5%st&8s~i{@?0R_|7!}01KVr00?2=bv}0yB#`6Dt#g zfJ0~^vk7R6G^n5fOUf63lxH|FusoOuTBX*L1Ty0YNN|G#14{-o$Ya_E7#S2F@LXZ$ z1ugn_VBlx~nZeS)z+}$k$iVQ9=MFQk1A~Hqu^a=_j3$ut!3uaB7@Y1iFfceXae)qd 
za$pb#4VWHaW~pFcl;eqKU|`q=x{F(YBbNmf)ff0c=Up)|I52}wYBB^J{-kun*@8i- zfekdd$pDpb;B;VMR*LYlU=VPyWMFQ9iKlWoFfc35@HJ;pW)LuBWB{$o0Gs5`4LXQ+ zg}nd+10y3O<{3Z|3>=004UCKm6+vbUY#EH8s0WMd_wsdMZ`eomnl?)+A*+2q=Ev( zfr+`Fk(G&oL7kalF$1$Y6B`o)L%bkEl?a0+BQpc2_st-7gn@&JK~afOf`NgBk&}r* zsUeD=LBJuFf!TqpuwV5P*)z~p1c0P>Y+80f4k2Ei5f44em;eC!z*xE&ZoSA#C2WME-pP&%-h zi-DtoQQo0}kwK_ego`CUixm`HYq%L$R%C%rp*jyLHG~RZ@$)(`usG}n4e>ECfJWY! z7=&tiX7W2Qu<;Z!h_F~PFf%dehq6J>rgC6`oK3aSImr!lU@rLZDLjOMfq|ick%6&+ ziGitsnSr^1g@L7^fuW(Hk)g4niJ_^XnW4F%g`uU9fsvt+k&&^HiIJ(1nUT4Xg^{JP zfw7^nk+HF{iLt4%nX$RCg|VfHfr+7sk%_U1iHWI+nTffHg^8u9fvKUXk*Tq%iK(fn znW?#{g{h^PftjJ1k(sfXiJ7UHnVGqng_)(ffw`f%k-4$CiMgq{nYp>Sg}J4LfrX)k zk%h5^iG`_!nT5H9g@vW1fu*6Pk)^SviKVHfnWed~V+WFC-jeI4j=@P)#`7p&PQQaUeao?txC>H4GF z^+D$WROwFF2i>kedOaAsL%(=*yT0&XJk)v8qZf2tHv|7R7RC;yZr3j$D-Jd^L4vEXg*-j3AUv}U_$I+m|ufEn%{VMbk?5m=qx?p(HXiU#iN@=RReT77P)$KyB>g>-g(%g+jR$=%Xq`1)Aa$03QN}q zWs)e0EL|Uzv+n>Ufl4GF~(R&cO+bi1DL=#Kqy5+3j#&2JPuI%|J;be4Yb=nQ?~(H;5$q!W*uejvHY zquUkLFas3BI(;9wbh_U0>GZwe(d{bH4N3?K-3*q8YE6B*eJ{9lyWa8W_I=>d4YIft zWbqG=UeNg}4B`Jv75{4W)N2Tb!3g$XDQ7l7LZaWt~SS`Y9~Ie?-X-J35wx?MjY zdj%yVk6J{_yDb{Q&k>=Q)@MkR0zA0!r{87rpT4EPde78G6H` zI~L@k8!*+Loku)+T_3=>9^DK+osT>^k9l<72S?=ukLDu^(T*{(hYlLJ7DJYh zZbI^4hd^yYW@FU{O1B`Ffsz0sb0fK*$gu84gf(hFV-07Bp9$IxPN3bcKe`<_Fyjf4 z=O6_o#H*kz^26ge=!_2r29VHk2hdg9Am#^1d?9>^l%hPE-$2}iX%;jjv4$Qf5( zyY_+ZD}W^rq%g!t;h44zJ{Nw|6vj~N~Z zAFz5bf@;ni;8Nf^*!l^eSVJ%6vE)v0N(Gnfs0AAqA+!P=lBz)Ig!KTpI)>JlV38FV zA|2==%?Ao#2@2#^kLCjrXnY4Wz6Kf}UIQQ{_h664Hxi(r?ELA``N5;}ut#U_hkyV7 zdvwnI0j8$D`1k)ma-_d7JmAp{QPVvYtfse>;Xg?LCq~ z(kDE6K`{?^uLr0(-g(1grpNc6Xf}bHsJ)>N&~&sMC=rGx7G!%Msvw?zS@Qq?e_~QR zwt5y`O}y~v1XXM=kSewph>8u~$^o}|Umz+tP~is)6HtQ)sgg!Z%HTR1-aSAxj$kPo z9y6V#M><0fpv6sd?E!{TesJp;n%X;E4|Ka8&~`lnZtb6dIl`m!P`B$5aB0ls(aXc+ z(d&Al^OgrF>p_Ccqucd@M>i;&`htoGl(dYR$3QLu&^q-(fJd+EiU5qN8gxMcu~qegZVr#`&=nrt zt}8q`L6POUpql}l-&{9%^p;+LM|E%L22k`OpAIkDFjNr&qO znC{ocpp*wnq(};0vm*rIDp3kyS5Trs>$0JB*91Ci|8$oA@aP29-Jmkd_lHNX>mQ^v 
zh*ThfnuR2(2X$^bOFw|Ssz^N?kT6Q02UG!r&e`+m-237$s21J&08CB20j7E{{Qdvm zrL*5!9>#8ZdyW zgywZrQ(JEHw}9Iuy{%xon-2(}XK7GwZ3PDgD33v!a30-L!6HyUBK**DphVvTuURlH z$2>ZrX5m%d4GDY{tHIF$4Mozd1_wNnSx~Ec0|Y#}!2?T3&D58V!L=M|;es0apcYOy zIB_7Ei#2?~UP5*lq|pwKdsJ@cAy9h{oK}#65sMmh|AOKY-JkG;GZmZy5CIC4fH|V` zxJx%Ub9i)vvx-MIB;#~LvQc*{C{uZY8n^JI21%n3F0|-@geS~QSS&)6z z-~kTg-VdnSf2ULU|cZG%@c+m2LM>n+T22Iv22TIsInq6No zzGnAmu6@B!8U${^c7w;>RKRSgN=VqDG`cNaf0Uq&5toXh>T!MWx@E#^MsS=X5*93^ z5Sas%)Ie=Zq?`ciDL_&TD7T|Dq(ME%7anNMSWv42C1-(JYco7LODCW;)jYagXLxkF zZt&<11-WYja(k)Mbq8e3+7~JXk2{Y}*8?8iz98ud9^Iv&B#zV~ay{YETzi6{1mt~0 z+q*Y{~KInH)AM^!gUQ_^ODq<2lC=Ypbmp%ZOJiEZ!pam2-y>=ejrv^3( zJP!sI!VxVf&85!L4`|(MPz3<450U)>8i)e*-Jv#ur^#9llv+Y+KSJ)?W zJ7S2i^B|}c0@V}8JUStn3zBgOby`6&3vnMJtkK&S5I)RHunG*j`$I2y^twJru4uqq zi0eHpU2pJD2Tkrihqd`zZkO<&w6Z`U(2Z+=2H{35Z64${2h5SkwJc}?6{S`ucPJLS zE4o3wE6{8nheziH56u%E2Y)a@ni5#rN1$R676i!dhPST3vjU(d6*S~PT^f+#4B**I zd-Zd|PV;ZH&j%ozBUjvIrQ0hmT0!A7a1)G3K zTksG7Pc(tk5IkKWhY4IAXd)AqN#W@T6w9zUgg42M+NY2Zg=Z7w@(32Htq1s9LKs1V zeIH;2?FSqM?HzFG2wt3mztMBd71gWpn1VrTlDWdV# z>d@E)*Sv_dffQ?mC^^8xaFR1$_-~sOK zxk^AfgRTmoo)l)P*#Ih$TMv{vfSOc3-M$B0x?Rsey5*o*lTygUB_y$jd-T?R5C0G9 z(t*=+I4n3ov+l%26TCX=^aV9$KEO@rJk|{o0WCa2ssYiLGeP?t;Dv19It0|s04dl6$3&R7^U8$4o$7@XkYED|UlKTMv|Kdvu2`fb^r`HN|d_ z`#=up_Fd8Kx~B6S)Onx*boDgEB2;%5N`*OjZVsLr}3%zy$)u*1FMSplhg^X%99})oduAwsy-K}7g zpmK=uvk9=N3#c(@4%B$92yzr?EDFs!i~$pf1>lAYNDIgx@RkR-1qqr61La|Of@=Q2 zh}8Ok)^3PSB_xA^yZoT89wl<_i1sbF*(S@{$L9KVt`VdHF0nhq_3K($n86G^Z&!ClFprC*kl+YER$m@h! 
z4wOoQ7c+tvHG+Z~K92XA5zQ<}i3yGukV2S4Kx;=nbTc3l4(iwrnhQYQ04*~CmCn#* zp_rlCSqt$$$aCP<=L3&}512uXa?pZC*B>6uwLch2I3dM019aUjQW(NR7PkN0@xprK-@EZk!rovsT!x_v<^K%>PAJkTw4UE$GOyMm#_=$LB<19*BL zJo|w#zBjZ3GHU`C^6Wg~*?IMKxJP&G4yf4+Afv)y+p!G{cX)v6TLw@srJd2EH?-ZO z*B5Fm+^Ig@sRw+zb3to!-~-w|oez9EUwL%L-oPp_DJl2#gzfMPJ(F~%_# zxj%-~HU}49;3Nu~+CXYULCx_1&2M1K3P`C1p1$yxU$9&a>e(Ze^oX(yl;*HF&8HKz z?A#UBVL)DIgON=@nFwjf=@_V7^XYtulG;LFfJY-dnvWQOvk7va1ycTi`~eyUZaq*U z1uX!;Ev@b#4iBs@_h^1o;L%yT!=tlwgGXoR3Xe|T1wNgwGkiLIQ6@Yf{nZ^F%?A_^ zIl~n+9t9c;xB+Tfc=T2%fa&f4&`8A&@OU&RS$7@-$7$z5kIoC7z7IfS2mISyAG91O zG4km41%(boKgz%hDBW)W8;2wenxg=>EJDFiJ`Rm9vl%04Xg&(g3JH#~4C+VR!%% z0*CfFf;)kTR*=VW*9V{`B4{O&2drNV9@BtlSXhr(dp~F_A8Zh)q=Bx7M;i)&ih@;j z9@^)MZXKv{M;QTBd@Tj4%|YoAsu!vK1vUm-sSNTayygU}>O8d1AJZ~;zXPTQw95{8 zND{dgr9udT77K%?k{=)?_rVx|L!d{AT)NOCz7b=UXrpVOsgE1Ty+2Utv*SMm$b-rc zS~p6)W<}Ur>J=Yrh`ZUyxE0T)tzprjfd)@Z67GCAg%8Ry(BV0(F6zz?#AB zxE~(i&N8T(0~)dY02ylqwID$<$X#V@B>Z1fEH82`+(q}dC@5V+OGhsdca~2 zIt4(TD^Q`0bM_SEF0kueKcKi|2xLD{sSNG&f${?)uF<9fAp`RYpyY0OsF>{8KqTv^ z5jgzY7@9vYwj3y-)p!?jE(RrO^tJZjsd~tu^%ooi>Y!W1i5xmC1+ClvvXhB{fngU5 zsG5VOR#>&>3=)G3-hqWc%Yd;KKCoaOT}KMqWQ4Nz4m9jFx{h>o9qIr7un{*zk7aZn zDQwL%q~C$jUq&4!1XX+BNosg037RwoEsTH+9d(A@==8k;o@fE>*?{a(=?=XDX@9`0 zUXRYh`$20I!9|(}XvY<^M|UuXM<=64XCP=d)C-SJ4*}@%Cy!3Z%p7EQJyzZXw7F`$ zErJKImcR=Tu~6clsFGv-K+0gV~J2i`n7;nUsFtxd?SVW@NA=>gpK zhO~@+cr+g{@UV3Kz(3^x|F#1j;O-KlwFI&sw0sxW5d|nG06lb^~Ipe zB0(Fn?jto(J$ha5!y0#>Mjg_SKFaz+xFP}>fKYgYS_~*HDsa;o8gJl~4%CsSa!8|3pbb@^F-TmCt6(J%M$ZkjUlSI_;3*3R zk4{jZ5T393w}Co?KRN6Tq{jNMmU5el|#H^8pSlsUMcd2*xkCgacQC@UREvTS6i*^Fc0v#tZTo6^aY7 z$UBC?@*OC8;AN&qr!QzQ6P}9DcfEoZ@FJyu&~hu#!u4aw)h{$Tc7tXiAZ*W0&@eIl zNES$zg&)|0mZ3pohM-|isH1&5KlpUM1W)iG)f=uiUVDRr3l{LmL)MUdcwrZK)*3XS z1z8FSE)yVn4ujc9MKg4y+A-EK4yj}U{kAhn&y#b)%fd~+T4S= zt^=uo2yLo?n^?zO|1)CgRUsJfGJx0oKpJeYkvxorb1(^L z`_yA}t2n#?2yWz#HscPXCy~)}C=eYX*s{~nX545qj?!k_XdiL3j|fUbqb--wmJ0&| z19D%G@Q9KV( z3>sv|KgAY*J3!1OVm1Okcf$07RY3Oq>;f;92hB7?T!&O7fXqVi6=XFzXci7Uf(P3! 
z0^O4VpNWIak|E8#fo6N*J#X;T3uJGuFKAohO;~vgI%yTM2?OqZqB9$k4hPtwxzIZi zkWa~lu5*UF6nr=?Xa)~DCJNs=&|G_ifxqP^WL_7^>ENTkK&MTCCU#FCueJbfjR9ph z#BpUWp!>&K50qwtGY#a#4d_hO3GmUbkeR9Pe$phG@jXElJz1o)m5)LINwdVn(-+PZGoxd@=8Tj1+1 zTwfrkLV^oqhetDUIHopmnG8x1ka8H3Wq+Wx@-T{I_|y%k?1AQ1Y(+RCbz>TVT?BH< z5>mm6XkTL51IrXBu0XL9eTE8!k3E;b&V59lghI|GnC?b)2KGiUemxi-!(7P#SrkM> z<>t|eRI`DXO`wDasCR-g0|(0S$n6Ehq9D*Z!s(y`Tp&B>K%ESagAZ6>^(Inu!~73P zOXw4LNMQ$S*+pR6sEAzTfcNu)&PRl$O4kK2dqEXB_(W=W5$4mI3tDwE1FoVwc7sRf zJ;WwE&(0%0o$p{P-CoOrQWt26P3JwBm`^8sxfH1C!?+ru`A7st+YDRk0f!H$dPGY+ zh&VtFtLEAdjPRwJC1Rks@i^{a0B#zAl63O{gV!S9b#EXZ?k*vGLls3a%LEHQl$b?L zAK>c+;BkV~hKKf&LG?3yEgh0MB&L6?En9pA0fzaIHY3RK;OY$4W&}kp%$ty9RPZbX zX+DBBwIdP^>Y^%8OA&Hm5hww{S0ceepB)kUp!|!c?+pz=(2zf5oh8PaUgWgZ8~Pr6 zni)6^zylX4SAlGU_mz-y5#*>mq-N(Kk6zd7sB3sZH#(pQfew;KT0IJC-+>nfBU+db z&`!gR1+4+TjtExJ5v8DIztF|pD9e6PltIfqER$lO1b{RF51qhAlvAM90;QmFj~}nC zzy%X*!!0O~K(`uz*93sN+VB9wP{w!xTLMMoRE#u*;R>YD5y;>OWcfWTB%opeEJ$%h zY{dyrVaO=~e7hM^T@4LCs6C)|^pE2XpiNDn>#vYZfaDx-=}da>3)9!oZHb_uMs(ni z{Rj;?@Tqhd&H!tJgo#GOpj>+F7rVq5o2~H zPLm;aLCtrqNKP#%$;{6yHZ%jdG^jMMBr`YFwWuh+2)n}|+8HqQ8-esYC8mIV=?Xat zA6zd%N)2#cK&}u#yRVT4HoILxn{H6{+=6loX1=7NNz`!}_=bpEv_4usnKO+2I_Vm{+5-X8yMyz-@t%4)E{Z4$+ZENtC0j-!TyH22w5E2X~^Q>fQG6EtsVgd z8IK1tjReU82#104h>Epy?JGj2MY~1SJG$Bj*K0#me9@1k@t{?erUz z7j+Cu$AR(#ItJIbs1-lD5-=a!{s6O}guAn|m4ZfSQfXdEse++_o{^q`uAy-$m~W^F z6=7fiwbVg##Gu`_?tY=5o81q9#F(MRh(Kw^svri&3IRrG9(Ilij0_Ai3=j;GmI4v3 zd;)Du&b(}aJnW!G6iD6%B+kIVAPb^l@*xQM7?3yv1A{S$hRJ6kpB5@%pw5ChRLd0&J)=+sM)lrV^Z$$KH>K}R5iq=Y~OOx_bA{|6+F$9x&k-X#VG z23+QYv@=!(_<%!>eKv?I4GTXN1_lNY1_|;rLgIk|<_6bhW@bi4utPxM*aEc&bax6| z4O25*ehE}w3uG9~j6|?BIG(`gm@+Uh7$M7NBIKVy^~)p6hau!yKnGPYFfjOo)WOVW zb^|*QbODVDR9**JKh%F5AUzBW3~w1Q{m675VMhiwGm;VV9Z-2(ep~^S$Cg$=;S5T% zAm^>Y=8ky?dp-~_=LXb2_{;&>2g*nw`@*o<=Yg;<0h-akjW1Z(`atAC=6FEevxfyO z{5+bOnOC!b6@t_iK-Ga#87Tk4(}`;{GgB{E892SoAX41{sJbe2_q+EnH?uH-b15io zDu69xVBiON7Uq68h0hLFIFQ|J& z*g={Y7+gRUOfR%PG=Rz(BFklfgAweP0I0k*vOLt>9H<;A+oqGW)k1IcYfXWLZ>xY&{AoB%4l{*6iLmEgP=65DYh6LpY 
zSh#_f$AGgIOby7L8^F;G&JPh#b8zMR3aC6Tcg}#y%OSfX6dVj-^LIeyJ5bVFJ=g&t zf82n|dm-y*as(RylK%mf4@S}N1=laZ3DU^GU;?6Ge#{1I2itD}mB*z&0xFM7e+5+j zDzf=ZXTTPK+&2R%k5cDCk}uf)9Z>l92vx<7%JHfy!fR z*MQPu3)KCg*y}J?kVT-dSc6L)6C~k-+`_>PawG!-uJon>mB&>+df<@7D!%|(-lLhBsUNHaoJSg@`0CH;q#2%1aHbB*R zpsTBgr~!ou4(3=BRX3hp0gumX_04pcq>Mcy4Q?*o;`6|Xr^`QOO;LGFADHi7}< z&IwR8^N`g*(s&<86-dn%sG3R0YCz_+fK)Jm&A9uZ3UcQXs5~z7k3i+YllriD zO9yKQr?Dplcm;htRJAmB&_2gTjdiYJM!XK8z2f4+HW?4jywr@;wCP z*AS3D1C@6|_J?aTa|sVS2gv+4P)E&}-zToXaYK2)G~L}1H5kfs(meYQZ& z(nKz+nH9hp4CIz2Pq(P)Gih-JeQbsaC3VKkyyFlF#jo#-6w|AJm zLA`#Ex*Vvw6X^K@)Rx>1aTCZ-XQ1}s>NCE9%419OAbYMr?FmA+2kah5#|>nU31|R? z0e3waLO{NRfczAwJlHv~xJOD`TcGl|`juA*$bW&#nyz=JCc3}(pkxd?d^ zs64j511Nk#pz_$#E~xw}fyz%qHXjnkZ6I4fX3T)9!B&5R@>B!V4|mb)R-azx9+qZS z<`dwo04^(@K+VFI4ncOYfCp)?$0x|WZ=iO8#^zDdI<$TPsquiCQIF}Te2||&YGC1J zg`U1Zg(7r(Z35IB4U}?09_$&ATQ@-Ev6Z`^Gz<&3SLk*?!uJ-)Eg*A#K+OU7?_g<+ z3DW2VnezY|W-qaua}#EcgEYv)3=Fu+kp!qbHhVzsXn@M&O6v=t@|hrO;O>MB3V`$< zfXaj7h5;rIjTeyo1E@T%vZ`9S&=Km-E=gC(*&v|kC5asUx{=I|1r z^0?gB0F}p9CV`|DfCvWc?NE@^0T98!z)+0rKFIV1*lQ1<^0>kmGI@zR{uIEIn+y#3 zDCR>Z2tYP4I4VZ$ZN2GDv z0urwkYNaLm|uaa!ItJgVZH|%4qMRkHprZ1;HUwc zBLi-z;&!VIR32A75CfG5SG%yVfD|!c^J}2;VCTZ*k-};Y0r@>pd2Id!h2oOVFr6^RHhAE+8^?g2^VfCvT#hB94}jzi2*?KzkS~DB zgQEr(=SXw$6Y$7`;;aJder5D}-@A{wm!*fbnGMni2Bq^QP_>}&Mu}fgydi}-3plW_ zkN<#-P=U(7M-Eev84n@x0?C|mdX^i~?R34Xp0q{gN?lma}Pv508-~bq`C|u)xpB; z6ndEk>Lcug#1kldK0w_MUb_Pe-(rNk0LXC+3=G)Xogn=NP;E0;oK$dC3V-d0ggifXaiT5$1oS^%fVP^0>;E4^VlW=^s3Rgvb9- zd0g%bfXZWQr-0H=0Rj05PC9jH96F|0pOd2IHB%6%D7fH5#I6o9e>EUiKM zQQ!^;C_WvaYH+2e1gJd9JT!9wxIYKViXKom5GuD;K+VAw7AK(cxcvG8Dvzrl!~vc# zh0gcE{EE~c{6K^|LH?0|+C#`c6;S(dm2)$o^4Q`A6z)5q^0>m|22>uG{vQP7CBPX7 zcR64ImB&^Og4`DYmB;123Ig&opz^rJqjwOHzX6rU72iLg^0>lBLK~Fm85nTY6BbZ; zT>2xR^4RxD48AsOE55; zMsG8~%0i@eg9f-lXJEiJX66Bv$2B*c0hI?2CBfVRnc@W3>m5*eTz**rl?TmV!`lT& z^CKsq^0>n41ymkee1Os`2RI|*PVX8}d0hHEpz_%42gP>=R32qr6{IW$xBfwH=z*%i z6-H~I^0?*{&Oqg{m4TqLX$v$AE~Cdcs4agKk#0o52H;-vX#$l`L{4{1kcm)`KSH4L 
z*unxNUqV2B3RE7~+|3rKJg&I70+q*Q{}-q{%G^gKlK;VrN^sBpm_X%mnI8g`$5khm zK;^NG4}-#I3RE7K|F%HoapjdOP|EH5Lq z(LluM??Lve)kb9Tlu@5AFgn;}LJn|s- zo`Je!F1B(3x^4la4i;|s)PdX+0#5i04B#PjSoj4%!VeTB*f40gpM~M$e{6Eh422+R z3@K)YVjSWn7~&QT3=E762yvJK(3%^NICMEE1B?c(lV(D)7dl-4b^|j*DNH+<4QCShKZQg5G!F5z7~-Ik4OBi0KxM$=_6!URppXQK zi=c_Wfu;xWIvAvUs=)+FZ?HX#Fmpgx%z(^MfSLo{b^`J)12e;Us3@2VCP4enKVTO)87dA+*B}F6;tQbSCTQwGYM2==Kt(}31ZHNqgdx6_iGcxBFYkfM zfZG@h3=B-n5b>)}ap*80Lm*WA4^$kwB#~h;RJ;JXo*g_t54D<^;WAV;RDgkjVJ8bH zJWCkCeKQ6Icst=f3&dVSs7F!T30E-8`3p7&Ar7891kG==Ld*yC6+yPb^0f{t#C&h4 z`7nR^Ld6TA;^4MCNFxIS12Y3G6rcj2IXCeBFHCWe3TB3D818I@x(75)08#^Uj~N@p zJ;$Np240WIz`)Q66~Brm4%&wWim%5|aZq~;WS#^&#J|v{CpKRMwjE@z zH77_lbl(pH1A_=Z#2i>W!OQ`z`39-ahpGq9M=~%laESlFA^r=8IIJ`Qxf+3)8UEr>{~w1qBdocNFa(@`nQ@4- z;t*%YAr9(+q8JV4Ff(vtsPBfRTUWIF#UKDl&mmB8Sos4oh?#*G!yJe>KMrw09OA+_ z#6@w4i{lWN#33$?LtGYzxI7MVMI7SFIK)+Ph^yle2i4l35Jbhy3|ctU>);UA!y&GZ zL)-v|xDgIER4%_XI z9=~2V)cfEN_roC`fI~b8hj<7M@lYJ%;W)%2afnA@h%d)cp7;qu>RIskK2rM%6jIC# zF&O4R#A9)Y$Keo<$044GLp&LWcq$I@bR6QDIK;DXi09xC&&460k3$^PdIyCRDrROV z#G$?zhj=Lt@p2sE6*$Bzafnyr5U<4{UWY@x0f%@K4)GQo;%yk>uc7HP0veRCb|8q$ z%m6#R1jK{jNkWkH44x0eNY7mu=7V%GGxXpP@53QJ0f+cR9O9tf8OSD7%*-$qhx+L_ z#Ao6VpM^tw4i52oIK)A{MN~I}c+3ooaHwC5LwpGi@ntx~VW-T1Y(rpXhE+J!ufZX{ z4u|*#9O9dBh;P9mz72=?4jkgUaER~0A-)fXIPBCIMA(2>%nXNcs6UKD{3s6b<2b}m z;t)TLL;Nfb@$)#uFX9ltghTu?4)Lov#INBHzkx&i77p<{IK=Pa5PyI}{2_+8xiF-> z0>1rG67IKPG57sDYgfkRvhhqyEjaakPV@;Jm5afmD75Ld<_u8Kok9f!Ck4sk6U;yO6Q z^>BzA;1D;$A#Q?0+zf}f1rBjb9OBkE#BFhi+v5;-#3AmCL)-<2xEl^}4;|e3 zTX2ZC;t+4cA>NKdyc36bHxBV09OAt=#QSlGPrxBQ35WP(9O6@Ph)>5MJ_CpNEF9u< zaEQ;tA-({I_#zzQpxHXqJPqP8Gc3iSemM^D6*$CK;SgVqLwqd`@pU-FH{cN8h(mlc z4)Lux#JA%R-+@DX7Y^~=IK=nh5Z{MG`~VK|gE+(w;}AcJL;M&H@e?@2PvQ_ijYIq_ z4)Jq1#4q3wzlcNpG7j;pIK;2x5Wk5-{5B5pyEw$};}Cy{L;Nug@h3RMpWzUHjzjz< z4)Iqw#NS|u-w}q)TU9~lt-xzVL4^i*e(f!WdXN+|!+RX!A909(#v%R{hxm6K;y-bS z|HdKy7l-(N9O8`7RRSPiA}})pGY)Z99O7&^#5pj;c|;)Le;gYAuyqw`Jn>I4?Y44yEsIMk6KM?mpBusHP4Jg69mdJ7hZ9t;N(fMM`r 
zZt#k5m3|kaLrz{q5jOyf`=N+|IJRJM@QE%UE*yJ-#bL7!aAAfF zusG<%EQA<{)d&^`onj8+A@EDEcrro&$`S*gosa?LA*d9vIQWzjgdhV0!zQpeXf-v8 zFvB&lcr~gZ0|Ub^uy`$oxCVHE6X=vnR4HGuIQY~V6p380cr%I^h_f0j-Vb7;;3r`5 z$tYqVPCGa&f=|@|ap71Gyif~#@+Vvj#E1opgKn(>@esHHEWQdM0A*bQXX({Y9)j8g z-gveZApm6^0*ixB6M>3As4rk~@U@~40Vuf_yipi*auiemO8o$fUx)G#lq`6EENC@2 zLJY*p2aDeVF;VaVu=rgRF%ahoSo|S~iGpRo3rHTJh=Dl1U~$lCJ0Kncmx9GXr}-en zK&%yD@$Voe3cdmshpnbXNc{tg!{<5yxsg z12gO}e~=<1%*+5?Oo$`~Vlp#87XgAeNSGPE8WBkl+-`&}_CpnBU}k_W_QMdzY*&Dk zFf(AbDd*y>kgQP6r(%ytV%1vBL4 z6A%*xGc$l2K`3G%4m0EgO%M|WGs6zcLlFhp3vNW9iGswzl|L47aHWk!99%hL5eHYc zSj54V8y0bJ<%2~WTsdG72UiYQ#KDC<7IAQ4k3}3@cw-R<7t&b7LA$Xqgc+C_z=bY` z1TzD;@WUbwF3hlqg9{rh;^0C8i#WJ2z#@*>o&+gkX25JuLc}rKlMr#}hD{I$2{S`( z?LZQNFqs)(tBN6f7|9Gd)da?c)69^YbKra!gBfyj5sV9`nIX3V!1*u+Gh}x(j0>lk zAtwpK`7j1E1Nama7#B`6Gr(3Y!-c>MX2>aQU^X(r%mCi$i7W(WGc#bepFtAL44Ca_ zh&X2Z86s{6qA)Ns17^D#q==aTvt11l$81+a#KAjpK};0P3^{!hMGVAYhTO~nVxnMX z$c{}EF%XBD0kho>5yxz|L&P!L?GSOyb~{8Ie3}c0iGrCKz$a9qh=DlFSlamzapX}7 zkO%}bGhnv=K{Cvc)0aU^6wJ(k+5QJ9U}nH<|3kz(BX7I_$7{Uz9klS1^B$y$mxnU8UjGXv~qNmMyz2GGd{sA3={ z1eh5x$4epNCqWbjW@Z4L_J$$MzzjZF3!C^EOhIt^Ig3O591d~t?M4`K%nTPW#6SYf z3>QHZ24-fsgdqkJU}gZF6a`|TVrGUbr~*(PGXrcFEK~qWF*96)vQQ~z2F&qikP2o7 z%<*T4_)QRnfteXF$E87vm>F(^C=ATZ0J|9&Rf?Gba~vC_f|=nSh{C|k4EHg_KmyDR z4?q+KW@f+~4+kk?X22W|hloE0Q5cw+;R%KqNPw9EbZQESg^HOOo}mgrdCUyYp)6F2 znE`y8J*pr$ePWK!gA_3{V2;m2#9xCb49v^`zO@jGIOrB(3}FUlhIg2PVE2GeB*u_q zX83?11`=Rq_z0pfFf+p^3^9-ZGs9;Pg@KtFK&PW%2s1DV*5jH$qxKl{1t2}l44CsT5OK`;7l=4B4s%#=h_m7l2j2n+G7<$dGqB@O4?A52 zML7dA1L)LPR51{bnSl$3IovqJG3S*a=J4WBk2&82Q4hLV9K=G!%nY#8L(s)B=chm_ zm>Gm{*ei@f9Ciu`NH+pAGl=3)FNQ;09EUjUG!ul`AQm%&Bo6gbIK)Ax?tpZnVrB-| zsVL~;vN+7aoWBFDqVA0gu48`eNf6wJ(^g~NPp9OAH3V^EA?U}n(8pbaw7~wF-7>79Q6dRCk1ZHM1#i8B|hdAhFNrZkFiOF9XW6qC5%mJO& z1!AFMW(IE@=J?jyw4GBbqZ zP>*X}K_m|KQ8>h-afrv@5Rb(n9*09b9*1}W4)H`B;z>BflW~Zr;1Ey6A)baq9CVr- zC?-%bGeZUr^_e)tvv7!K;}FlmA)bpvJP(I>J`V8$4Dl>2&{^Jyb30)({dHOlSXWE* z<50gCERHzm(-<0byTRg!b3S1++2?VX{{V;jFJN)RxuD=>nG6gJf5761b3uin_Ofb& 
z{DnB@6Sk5=4lIs1w-aWLAy^!7ZYRu~aIiSy+)h}_vQV3WL6HfuUKe)mK`&T6;@nQy zi1rq3?BRI|hxm0I;^I2k%~8Q29;X8e2gG?w(O{R6hwUjyUfWwnAko z4s-6~Q2z-mjyN9`R?}$cV|R}c4)H7;;-E9Iv4!V)9O@6~gZzs)mld{xokC zuvf+q7YA1_nh2#J&>P z3ZYjx%>Ru;oZkrKeoXUC!QzN>V!_MW85kJi!QzN>V)sCo{!IgmBhGh)&Gg&`izCj7 zg^5cTgX~3|j|W?6kp>n=oOcIXv9Sd#jyPuze18cji<@8%Pd^h-_#pQ0z{0281Y|F! z``6(xXFm?{OJH*l=hMPwe1C(*5&MH+@vdqLat~sE4Qxf7pDA|tC!2!Yi8#-eALLgC z28JfEIO058*ow5JU~$BGv#=F-Cvk|s1&brjp@pqT6Eg$Zi#UfC=6-t|;u&CZ#QC(a zdTl0H9C1D^tlT>Z7Dt><3)`7+A1n?!RTbhridIO6<6nELZzam4wzuoYClaoEdg!N9PP0dX!aY{i(H1;{;!b8%teumCKM zI7bmCejO~1sou~MWDepSU6}fAusGt}MVR;@usEiA87q)Eh;w&g>N9bOZv%@X&f|rt zXSN2JgDD;W7Dt@Z3(FTuHX!ww;$Ohxi1T}4=3Cl=)MJX**@D6uajq|{{j(RW9&xTO z%p4{=kU5C+Ct)l6u7Jfc)l1rg)FaLThK&sR*kezJ2{^<{K;l?V326k0Ly|ZO*$1{4 zaULdYW#?@i>OX_kBhJZ$tt6{<0J$G=PA1G8X%Wy}Dus-&asjp?wiAc>haDj5VJjJ7 z>aRn^!FP@_FhK6!W?*3W3>AmPH%z^NBgj37bBuGKzLv)!uIC7`7q)@`mhVcS;;@+| z@LeRJbAX}Zun|Gnj;2#kaacPEw(?Ti31U91B?VKT?*wuu;@o4{NjjUr;)wG_VfEk# zusGs;QP?@UPR<~65a)%$#AkuU5$AuxM!^4o#S!O!!u%WJ0x}12PARM$o(~pBoI45= z{{R-pbic7H#J{kULBaRkFfcF_x`NC{oC^wDA;IkiQ4c$L4ptHgxPjCo&I^U*FIliS z;#^SJ$|P&BIN}^pn0gPeIO6JjIK!e(}LJV53n&O3#zOm+3ZZcaQ7@d6J9 z20^5Ao?-2QJsyzo?}xUlU_FOyPl&iUbn66ch3i32?DpRA1i2G&PAO~!_DV01IO3dA zn0T2tc5`}hh(GrRmBWa0l3_FP@;=zjN%sNSi#P`rw$q>jhx%0MoYxAvXB3olUxLLE=Q+bx%$xaRH@^~x z_%45ty@+$3Vg7mt7Dt@(4C|lT27uHf&Ub~aeC?anc%*v#(2 zFpxOn99meoT?30F&Y^|vAblJTQja)?7B&LAEdnHtS?_Q}g8CPTa|~epouEjNdc?W4 zuo?V~~F zBhJf(xpNv=9J9WA4i-n8qYHDVa}3BF#QEDWcTNF|BhJ@_)#Go#;)wHgVfk7s7Gw_M zd|lWIum-R=;(T4$c-=3sIO2TWUQi*xz`#%v2Qmj%`!c{+Ot#?=p8*y}oUaQr-z*+v z4rcx93>HV6y9-O_>0oip`nLoujyR7OrhW!k9J3zZ1{O!0(+gYa`xGpWIFC0K>R+w| zkUJ6Q?!ww>7GQD2IlZvY=?|5SR8R4FDyNDfW;B#_ri8|9RiCZ&gq4nRu`` z;{0A%`Mee^j@fPzO$M2RIM)}pL(B*)jyS&;mXG|w;)wHpVg2z1U~$B`zVo3Y@~kN! 
z^AYFz!ondOERHzW7q$Xm0$3bzt}ko_%q6fm;v98Yyy&Ka%txG~4x3@E28$!k0bU19 zKYPI9h;x9Qq2kQxAafAs`ohY|RbX+%`MuH5KG%c{kb1M&cq&HE}0VC*mAiDBXgz~YE=;9)DdAArR%>v6qY zkolPPbq!b?alSDuJ%{Ci)FaL}hK&>M%)=i32f*qP=N`k<@6QLBgE;pXe4i=<14B>& zNE~tQG5FqE1_lO;LXbG(++$dOO{xeajyT^Kw!;21SR8R4GAw^-7lYIz&OL^;AJj`g z;)rvPVI!U)r66&{xyP{dZ&3ykN1S^M3;$xUIO5!6*tqvJusGt}W7v+77v&&x5a%1i zR-AmO0Er{cH-?p~T9qJi#5u{ZaRRO?kT~MpW7tYWL$EmF++*1NKCxhN#JR_?6^$KW zam2aDu=3$7SR8TgG0dF*RoLU5s~Y4U#CgcD^pgk{$E-J7!Qz`2hfyFWF-veNA#CguJa_Jvf9C2=Q zG1NV?>Ok&6oZAeGuj62G#JSC|^uSpUQja*dIUj0{J6IgE{%Qb=BhGV%?MyfT7Dt@t z4BH_f&;T+Yah@}*e-{ZBe*mp*Kz(zF#rMGCi1VCbJ9Cydg3Q6JkGz^d;+XZ?N3b~J zoM+e$M&V}c=}I4mxN9@We9ZE{1S}34-5~Xd^PtA**beGiusGtp zXxL7Gj6RTh#Cg%MlMYsZ#WCxP(_nGLInprqya$V8)+3G+K;|RPi-zU*2Cz8dd})~Y zA+R{&JZhM@z(kNai1VUhD=$}r#WCxh?n&75#nDM1a}eiC!@_ORWRN)Gd}&yE@SOq@ zN1QJWThSgc6}$P-Q$gwx=lH?O$;PRmelFsCJlOc&G92ct2b+U9zZzD~`~iz2&aZ~0 zL#=5bdok-HE3i1?Tx(eWF$pY=S&vKsizCjvhK(Z~2a6-lyN1o1d;*Ij&bx-GSDg-W z58}LQSU3cM#S!OS!`#yd7Dt@t4~zHBU~$Ab*syT;3l>M5gAEgRodL2JaSk?2ycaBv zI0qZHBj^}d9J74xpNTzwr_KbKk2p6Qc2Yy~ERZP4iot8o;J+<60kVpJZ+fx2Cz8dJZ)IGu`U9cgIWG-Ee44r&e?|TaF_uW$E+7# zfyEK$Z^P`hT>>%(asD>U{foiki1W8$^~GnfIA%TKvJ_+v;#_W+Ijh0qh;zAN_C5lO zBhKZ9iL);QnS)u6XoJNO_Xxn^HyMZcY_K@uegT;JGdRTmg2fT%fy2fnLYITwgE$Wy z=C9dcam0Dx1<-odYz6l6!D9u;9K<={uynf|hx)BJ#E*i_L7XcNTi@QdxajrNloHbX0)MJ+afnagWa=ryDjyQ)L7CxK6;)rv|Vf~IrU~$Ab)})g#U)hn0KRz~Y$YB-0v@y_n^s4pJY@BuiSRAuF+zl2-oO2FS{}(KdIOiM|kG|_c<|EE&hxuy(SR8TwIZXUI zSR8TwIjp{r*Z?vIasD|>eI!^MasD~Xzw^Q3i1W{3?SI~lAagLwu>`O8GLj{V!M?ab7xXTqtrg$Q;CZ>9F~0yDcDb z#Chqk_Ff)X9C2Pc%-%U*am0D)uzBR&Td?PgqhR%z`Qj5;9C5xnEF2WKg4}~RUmX_y zDPVEL`RXwB^T6Vm>Fohn95Y=hZ3CH)IFB7>eiT?7aUMI&{8?ae#Chzn@pIPgAafAs zvBSbGemnMXr~s=+oO=(ePbY!J5$CtV#`(8{#WB@i1dAiib%&LE$~!>rL7eLj+i~3t z7Dt@x4$EJcz~YE=-C-vRiR=WKgE-e677kHham2aqF!9M?am2aquz8amU~$B`?yzuv z3>HV6>kczVau>*6%y6&+i(~qC0$3bz4m@l=;tyCHaSl8zd^~o8%txF953{!kERHw_ z9##)70E=V#>nK`vusGs8d6>A$VURh9bK_y=q=Cf| z=gh;z=YYi#=gGsw&w<4e=gGt7Q$B&k5$DOn#FdVK!WnTsJS-jRfW;B#%)`W8z~YE= 
z=3ys_#(>2U=gC8tA2BeL&ZN_=i&USfJ`N_2 zn3s~18lRS5gs#dQY(jBjS!z5Ur2zqD4DnIHB|e6h1(~VIspXl)sqxAAdBr6~rO6=c zKvKquIXU^si6!|(@rgz05cOu6>3R7@sSu7ue0*j;$gvFZQGS;3@#%S`@yQhxK1PP| z@hPckiKRIu@j3a)+3^MWIho0ot{`Qmxdn+u**+GTdGQ4~iOH$Rf`&$AiA9--c_s0A z1^LBDGG-~MC5f3iK9+g;CGlzbrFkjXBolL!GSf@*OCdUqL;QRU4IFdQ^NTV|GIEQ3 zj4XmuQ%aLdGV}A|a}z5G3ViDakA-%1=%$F3vB)uGh!}MX!57sY_x>VsK^^B%DC*^D#8aO-#>Bj?XL2 zO-cou<6~%Ag6bPkg0aj^tbkY!;#q*aoLHP)4B{A?=4R%(=A`Dre9sUc6=hjinwgwk zoL^Mp2eQY|z&Ss!Br!8DwW!$Av8X7qGAuK-+%wZJINsbPwJb9^wb;bDG$qkB)7910 z&pR|HGt)IbImb0KlVZalW|_phh5Av$IGEeY0^E`^GhJOlN#D=N6r52qQgY&xi&7Iy zQsc{u5(^4ai{eW>eUc1ZeUc1a4GqC+QgY%;64N~c+{y?9j3JhgiAYTGiAM_CV6%8v zSD!3HS644rS0iJv1(=TTMAHEY5m#52kmQ`=B0nQYE(ht$NX-E|x+pQTBtE~iB((_c ze2@hYAD}Dr47Q99iFb81PERdKD~NY>^$&KB2SppW1OmG$r8GCUGQJ?OC^0v_C^aph z%+)j=!~i89P%(vF5lGh1G(J8zKczG$H9j*XzPO|aBxqz19}g<*+!9NiOOrD5JirAQ zSjG$@6PgFhU{0BNiA9y6C7C&(nDT)Ze4ug$sj!PL&n(G++5;-hAcY+$w}AX%X$dNU zK#4TIxTG{GK0eCO2xPryUS>&1VsSRUh%^at4EDyB$}=+qGW~)L<2^HjP2w|MU4zIr zB?#FVSL68jG=}&nP`H8I=$MjHlv-TupXNz~`36{$TtG;CP_Q{DK1nk&(=QmOab{rS z%s}q*Ni5Av&hSnx%1h1hOm?*diMbXP)Eu|`BB(W>9GMvfa-X4@Z)$FSQKf%D zYEdGnEHC!Nh$~m);?xpPa8?I}g?F%de2783YiM3sMoNxByg|HCe1Kb7KxUA4W~OVf zSv*LoVSHJpYp`>CuuHs`tE+2Suv2`nQ+$YFyh}ldQG8}*kf8-Q$%C>IdP!k`SyDg^ zhva@tFJkkA8Cq0g_`(J33rOMNR+j1N6>JzEY!vV63QIww!3(MCv8EC7gBNSwqQbgj&bd>lr*dQKjRFLj7 zllWlMcyN{JS{4-S86WH$9|X>uK_NyUe^OGi7`l#@EEq|NdL_$f@CM^7xIrBvNLw3G z{gIYK4P7&bN)9!Kv_-+qg}jo?lFA@xhsrrICnqT}IXeKQkx7hlaQo3DKEAjlGd>>D zjD#8Lm;-J-2A7!R=9i^<1|c=5p@kx@?hUp!5VZa6?-yKRXjYU8G7{7nPAy8!OHTC+ ziFY+AsVqo!^$)fLl}3JeqJ-AH3UC+4%hlD{5LX`}*fQQfWY9R%hzMsI#k>3XJ30Eq z$NR+_g4(osW$~bP7^LsxNoqIE2(A7BbzmV4HA{FGIM^K4O-u3uH6#r!(hB19^7E6j zQj<&KlR?c>aM#<>FR1|kX8_%lA4oR0vRC~q|UTVD~M0dFUTDH zj)X_Ji6N-CNi4}s1`pF^Cgx;Tf%_rxiFqmUDXEYVG|(860ca!ymT-$plj1!=-B9E5 z#G(T5AWv#=X)b82#4otSGPNQ(wE)zcaW;hvPXw1(CMBlCgGbVSX((Gy)AEIVGkzf(9Cq zy@{gA45TWkG_NExHx=B`bT&olbtXeY(E`-x2aOXNn#Ox17H7og<)@^^CnXlA!i)|s zfvYhxhz}^r%uS6iO3eVNg$z+To4SI!;(o{q4Z(($Bqrsg#-}Fb$3sgUpG*T#p!x;J 
z8ybLBfUWZkE-@)c%q;Q*buB@?R##W>P?9S)Cs4;^SfC@h+0YE^B8Z#8X~;7;5!6p6 z>-Zwubz9=y-JvAOAo|%^pmM}7icY%&SgC$eIvM?zu@eYf=ZhxKg+;SPiOC7|DX_OQ&-X(I-mw=a0#f2EzQd;j4voDicd)_N%TwxjhB>x zQ!;2^)YaM36ir!XZb6P`GAJs+@#z|n3~gitmlzs=3@c3q8RzL34>ilx+0-*2IoLS9 z%+)m{9$X0Kf%8~qnrB2_NpVqdiJ@g?aePi$VooV2o2J29TL$s2reHx=|6oU0(+lcy z>{h`73!DiMWdJl);>@`4JPXP@#A}1wRsywP$R~T4k3i)*^1Og4cwPY8oB+631Q+X| z`6I|ofhVY;1+EV=gS>;mBZ?R)lnfIII1ZdaFop<0(-@%HA?O4b)PbPn3(lWG-XJG} z$53%Ok#uum4n^cqm`HGA;!z%370v>`-61tlTX&Ze%&k&Ms`YRtfl zgc*Y5EYP%2eo<~>PJCtwXm~uo$TJ`l5rNPo1WH3e-ob`QZ8BVaVT_)Vv#DzUq^7~s z#~tYLa9GF;Vkd%&!C-H26a<%;mXsFcqswdPC;buOYUGn5-#hDPzh$@vATDd0-l+0@euR3aG}Vp+8iUxE}e z!G`g!uE9R>o}kGbSJwb=a}h_Y*aWQwZxjzLYFu4i3Xobc!!bl*Sp{6VKoTW1O=2%} z5Tc-R459&&EC+RF^NjKfb_UP$xt0aH#s|B{2YbZ3hUR3tf|7}6ynnDue6V3Wv?FO~ z7#{-4kntctdivxVxMJo^FVIv=nJXw`f)WWN42E^S%MA7eO)gO(E5SSAh9>cTpguah zwnFY1f-)fpgQ}|laC;J*0l@`wdY(*$0V z<7|4E!x|)@X_;Px9z37DFg0`VItR|F)3L(1Fp0%Via@W1k}z<1dXl}X)3%hL+YzpqNXs2IG00fmRPt zDTxK+NMQ!?9w?z1aJc~*%0bEP*sKL5a91y|^+AY+1;m=5U=xTv;8q5x$?ut$l3EcDZN^0uLtNk* z>=N(k1>%8MX@F8Oq+^CO1d1`Z2@6XMl_-OyFjXjH$420I3<*OP1X+sAcQyno^79TZ zu>{Ro#b@TF<(uRs=B7f#Q3jro41|wHLk)!Up$6ua=YrRirsM>a1%sLp!6k;~@$to( zRjKhM`SF=~C8_DDMY$l|-iT2=(0W0rkfAwPO;KVFWFcx|UWq59WKJ$ig1HSg-fdZu z4;|}v&17&1*DFd*(~FN!uc(MGNG&SP&r8h7EUAnyGXR;5JT7Vy4{{7>WV|>&CpFJA z%hfe21XS%q$Lamu+=5+0;zJyrd?4#5z{BQHFOo4*?ri9q3mPdm28D zpIPAw4nJfMSs;1H&;Zh4$#Qiq3rNla&GCiim1pLq#0RAom*#>N?ULaIhxMU`#=j9idWF{AT z{?S|gyWiI}u<4GJzXG=@xV;Fp)wwc&ma@2~miQzVmw@Lb zAwh$fxJAw{xXpppE@0QqS@G1rj*3hJe~y0ia1||6mJH(NDQ)AcMdrg156! 
z>Nb#R(Dgr{Qmd#GJYgN53R)TsuCZ`?5bu0FwuFcg;qYP$B>`b5b2daz7tk~ePQQMM zxv8*34HkeD2e4=dZ}$MLD@aYwhU@?V<=#wE(+8wM5fuVk-j1&^>D4T?5s z48o*A`a#2F*r$XIsXZm^jngjB-WN!5ic<+Rr6Q#@14z2}%uCBRG)+$}aZE1BEK7w5 zx|YH64ulWdhk{+Jkx_bT320FZ>Xxu#*GyOQqSWHjoD$G3EtJTDj4**Gk||k)0;$?S z9T8;Hq0`jh@hhxTZZK_#aTVy07sU>OHXo7GF?GBOt9wbD@`r1qx>sR|3CXMEudKmV zKH-?%KpWJ;GRg)k$f)8nECEQqHxXe5O`_oRHk9YH5D_&+OzestI{3UNNLlDg}q?#0WbqH-cq?aZZrNk#D7nWv%ww{AK;1IE* z)ST4B;#7wC_>|)Oc*w4JsHVh{)QU{du0zmJCj~KTnw(#nSK=B3TI~vLEJNz|fTH|@ z)S{BiRA^;QfgR?lx!}$4o?bzjpe1dtuAl*ClqwoyI?D_^0S2B4a7{v)s3t#}GAkhd zjE^!jNdoPPM_H(BWCU8f?3tIGQ(Bx^mKso$nwD7sUBql;3|5a_(hs~=%Gn6END;y{ zgmNt*0i2VX3(mI4>&po3I}geXHjNMR295t%B+Gez_6IR70ckqRf(1ztnQjuHnqgOnjz;y@k2b%+)mjTPtVjcmd?0jt8sgrjP5Ra=(sl~LwqCiG1)Il%qYZT2 z3n=a&t7{<56F-dgc%Vrl_}C;^DJ)zulp3ezrNBC^XbVG4lTy<&^T3k9B_^5qC5fI$ zXR?6i;_^V_&jgkRx@NkTg~UTP%!5b83{8p>^HTD29W2M)wRj!@zd`x|QaE4|;;eB}~{S zGcP+e1(J~{&nhUxsi4>{$&UxkUlxOgTtI8BK+6jtGx4sVwPKm+nUH}!Z_?-Y3_-({ zuvy3aB4{z6S{URFo;-mr7$U=96VM6`uyLS%5~;Io7RiYPiOHEIl@Jq1_m^2_9%#-O zyxbi;X$Rl1P0nnIE8z{=;He`r%yu?(^@lDwAj6lSlcn-fK=YblPm*CjJfJ}1>hX{f z4l~GvyP;unZh;5*94L2J&{;OFWdYuxomIicpatZhHBZ6D;08&MH*^g-s1Sqcwt(yQ zFMv*S6E-9mJkK0ll3M7AbMh()RP*85qKc>lVKZ>#q(5^|WtLc!3Yv%lm1pEf4J25D zOU%>qi^>y=Qam$3OM8Pr3)e6={Dan|L3aE@EG1^coDtT&GN4WhWEmfPn+#TUkad5q z&PMnZ@gXrODZ;CB=~J4$8vACibAC*x+4%NYz{r-g?dueS7urh;@|lb`Q|5 zmSBT;Pw*ZwjB$3zZZIt4?4Tw+_*h!Yc;CbV$i@QDApr2>A3(chjpJRxgOfy^Xn^ZL zgLs_h8GvN4A7>Cx_%s9Xwiob_Kh;Jq!Rwshqqy*uONi~IWQ>M_2KX=r1<5eSyg0Q4 zJY)?T*r$epu*we9n8q@k3ff|B4w))~Y%g~OiC}Iz2knYCG=$D%fo3oug%1@>buj=N z1zM61UVwzdNYclB;n55pRL337q??Q{vPn0Ms3<4hSOO7Gx`D(;Kj~&e6M-Mb3=cF5 zk!~VrTojytg)IjD+pe=2D8ek!UG1l&E1Ri;UEjCA+QZY1-4~|dH zNKMWLpY;XWF@&NVu^AfqTn7ARLK~cr!|_39T7ypzO9d~t1huQNo*seI{Rj!{ZZWis z_XRD}j1MWm7(BFycLAMNmS5=^kO`UfF^mriPV@wAo&#kfqyQ!MTnX^L0dNZwY3eC2 z73(1suHfMzP%|YSePRkcHstCW>o}u!Jkb?}nx+nQ71ikK#ew^gu(07N8TU!IN^9pbgXU#pQ_wWKG9`&Y&*? 
zTLf|{qzFb7mSmKN&PGJc$RV9113icYb?g}}^O=CgvLKN`1s8(qEcCV&Vh+d19J~@2 z(pV#WzPMkoQM?IwQznv77V8l&}nFpo?7AxUEBa_EUE_js(kf(Qo+oJL%>{9LL|TDCJP3nskwBcH0h$kk zY6eYAgYT$--b^t7;fHcuK6ptG^z0pss4_%42OV@`B5F?c0W}y6O_9|2=BK2(8o0We zq*kORd(z-AX3#k!=&9WUoR5%i0>r;sPsEFk+vi@^t`!Zwd$?lA?=pMuItP;Vxd^54wa3KIN7~?35eQ_$6%mC?p=&T%9rM7$)q53Z%2P48iA@U_TlHWoi#R zsSi7&%h?chE-ttPG)IqqToBk*#^8e?;!`0ts%Hdf3p4!CFX(X<&?CfyAe$ROT>+f4 zM}`LAL2JnA&X8n_)oRcQ!G@&R?G3dYR4Rg+9-!S_pq8Cqa6EL!FeJ*6+SO=B(vUP` z1>2=C-JYA8Tbv4B z;0D?eH#|oAq1|na`JDjBF;?jNJ|W$3Z2HpH>_Xve7jt7&10LjHaRtP2+vzA@gkUnV@5HT!VrQkSK6<+toC&4oTRMLIx!?Ak&GEw%hP2X2C5Ip9VUZ2~_*K=EBQAq#d}>4c6d1g!4>6lX&0wGDwpQaYPy9R5im9bSkXj=&7(K zxK{$0fX3KCM`}YV8tfMvfZIuswYcDgaHXl>ln1GrK}A2VLvk@|HH;M$NX;cs69ocT;w3s{R3(&EIW%+LVQRCf)?1P3;DwUB90EXLv* z32AFBI;jpB2`XJGiI<|Y?Z!bWkBhpfq%TrdQk zmI1zK05O1%y91gD8_GeNfkN+AX1cEI9~YP87bT_>n1w{$rv^Ip65B!d#^A9Dc=CoOT3pVCMnrH4=t|?#Jj8@3 z=q@G5BxzoJqGv#6Zl-Gp=!oVJ1IQ$$A^4;=P=$$@#RP9NgN%40B@ir=Y*-J)2Ola3 z(M`sbqY0>Q2(F5QOH9BsbD%qFN-R?mmzsdCG%JivW0F;+mO)Y(0UmekXE14*e=aj4Oqq7bAnuLC3KIA9NEQWUqrikmhR8dPz?<^|N{cfvw^V}WeuBZ5 zga=pVC1(`n=VewwkE+EoRBwc9s2*ubASfUi-$a5D>U;o}lirc01|arfJM14^qk-4f zz%GQ#HE<0`h993E1g@z{P>lzT{CfqPKxXE^g8=YZ1JKFqu&D)CGG`44x(T#A0d#=| zAy-jptQj3nx zLv|qDfo%xdC?4aKMra5^`puyB9iiZ%?*Of#S-dOwYDCbrgW%}`(1Hc%%YjO^J;yWbXHLU=&4aFaFN)XtG;Mjs(dzqZ$8srT< zG#f03e$y$vGB9Sw3GjxL5}*Mlv@{1Etr)HYJkFp&$AP;N#}NO0CBY?@xry1S@!-8k zD3v);qjRoFuE9p24N)ba)o`E{GzEz3&f}r$xj?t}nWm&BCsu;4Xn}MQT+QMk%T@h? z%|QCie8Cd9XMp8I0_0u|R_)+vA$2v@R^3T>IzX` zlfQBWv=tvVt_Z1Wyzn><5pY4EC6ung&hX0sgH7Y%(~85jeu9lIBezblANg$vnnlb@ zO-+Gbh3=XQYX}dQTn#GrAlJNuw(o$OCCI@7ZRXRj2?-h^2aUL=f~~|;lY^Z)d{U?p zXxoJ6-5t>B1Z;N;ps0h~-QiZ03La?! 
znTdCJmZ-Th&=NY(F18XwGq7tDOY(~#E9Q$m!ShV67P*NPs1r%>J#RtY;7hacPFg_* z+#oX);E^6|(=VvgK;StIg3~}yb3oHfkm~`VgO4~Z0Zkb|{fL+|!sTm}IYQ7yfsiSo zU~tgjD-Tmw$q07r<+0oFS7E;O+plY{cxMAngz` z6{t7q7`h_vB?Zr6f)Y$JXigur#RkO?diNMWgWnm@Mj3L?1-EAmT!TxHXCnxB2%kZa z@!jCmoYdqJ!{QQGOrwxT3s6S9{c-QgLLF^@jEd(PxPq?!1rKpx-L?c7DGV`yY=47{ zDh5HDU0A{eJV-#y;#p+9$fHxBA$-tnwcue?kkg18!v~*=14{FdQUjdUhS{hXEk?e{ zFWRw<3}psef@V}e1tij7527q4Y9Ji6qzID!vCMCPi%X;umdLBdjqr5%!F4QrS_q{E zBFeqsHUT*QU>PDbLmJ@ogXL5Fc3>TL$8n)KBy*B%9;h*cWf&j1Ou=s^TGaz?BH;}O zbL4P9S||uwo0pTBo|l>upHvDueFNn#G|ZDgaoyjJJT8vex&n{GIvXO!&f!TCS_tDW zwcy8mA`%{P`k>S9$m*6t9AV5Q|07@jzS};5J|%06~dGyjHuqBCU@GHKjoP zH}D7m==Sc!l$4^>;$qPB6FlhfyUf)Uw5AbWDpTItvxKk51TDDqB5M>H)dAFH!6`z9}K0CHQWS- z`^@7p2JZ-D!%Wo7WdPZEiO(S@2?#QH=ZAm6He~aWD`><8GQby~k_uhS4IbG84ec3& z&al7}*xvXCs4zyzkfJFUcMO5b5kJUn-`?Qx$LM6ky0g$C5VFG`(IZDI&OvMT+{yyV zTwPsZrvV_;8lcq=0m%?UA>DCs4F&Q(h7;iZd{F-b+WUs50^ANjQVj_mVhr|%UL1f_ zD?n=#aHdD?AQ`xVFVh6wZh+JpLaTmVT_GI=P?HSacfy^zNU0PMeQeWs&_O=1{xxRL zmlRXcdcn|B#x0?>On#atp+0IrNIYnU5geC<`=riBBzK;1haMrP!Rv6s?t;x1l5ZL1 zCZHBH!N!oH3AuBIJ8>F;&)LOt{W2suf^r9Bs|Rj5f#{ayBTUg^y%7S{+YHcq8rX3xL#J)G-%18cpMgZOWR& zyCN;9C<_PyFQ@=DW25j)VUprsw6eeqQWii0Jh3O2hMrmoZXH3|NhP3O6;k3xZ5u&P zKf=}CK~W6KHPDs}xNQ<#f>ghfk}AM;E%=5=@Kj1XC@ZEG7n4>?;|?!^^(nM%O}+{6 zA!#Hd2oyP>mK12H9~{@<41iX185+jNgFB?)$_!G0d4o3a!p3-zsxTyLDJYE43k_J= zk5Ufc=?9S#<(LT`)O(AM2lrd!lM+)P^EuGafo&OrjItQH`a|ck!O110v>+!Ha=nXB zu7Rr;rXp1Bpevsss{>#m2?|{B%3sh8K%m3+f=h6$jYBLyb14V`m6oRQo?g(Sz94ll zV(^0eflr);NN|avWqds7m{{;Kl;p&c)b#u!&{4mxpn+*qxD50-fRK1sLvy$wOgU)7 zGdKVs$qrI*U~I<)6&=APMrNS1Z}W5G6O)rui;EF#^Ndkymk8& z{swU)Qg|47;!F(aU3ggI4m~x%8>HaW1&=bC1Taz`08)T~hx@Tkm%&OiXt4v|4GykA z@wC$j1v8dLg=Ks^B)AR1r->sQZWterwh10o5@Z&~7bhp?Bo=`;2$;u1`cSTA@Ersp z@xd0L6aAs_iLvm&0JJ3r|EdEc3vKFajOi7@S%JK36`mBr`t`bc$nGW>HCLVh*@0#O-)2r$}RUIOu49NRJpa zQi{AqpN#W82s;?O6b@z4rM*UA@WvpdjJL?lF9vT3Gy&a~47=zqxCB)k)*^?*85Kg=2y}QlMg|1u zJX}#pO3;&HJoNSvbMWaop8k-#QZNfA@VGQ3=3;R_QD$Pd6Er|-4mpb-dT|n@p@x16 
zEhxrtm;f$?N!A5Dx&WI!ph6W?jzWUa6*Sn0aw0CsEpRm$A&j9u1Y!;-b$fyn1*(5R z;*flgMI7um1_lOU1_)qcU|?W@VvvX#h=B1~7#RNihk!5~;$=9*r{NI4hC^J65h4L{ z3&=q*d)sh`Z^t1X0rC*k{h~q`HKl;AvC=qn-5A$pqK$kBbyIm!^Dxz zQ3ol6nuDA^U7+Hi@BuMJK$;mC7{Wjt1_lP0xC~T035R$VR2)PpBl))zBmlKn1xeh9 z6=ERBe308<`Oy+d9HdnRqKu&$Dh?9UKr-h5R2n*&qgj%s~!^C?s(;B=gTh z#X;t4B8gu^5(k;312L9Cf*ryDi6Of)7AlTz&O{vI>!9KwbCBH`#sM)OWR4b+dt#Bq zL3&Ie${6ND#X(}o?s)j`;6QU7a+!QJfG6&f`ZBTKLdXStAL_5PqBynB_1_l#o zdAb9K_O)s>Wf$;aaAP$WG@vnSlXjjs#R3q`nnNy*yMLq`nJDy%$s*q<%7z`bZ@4sYv2A zNaCQn09Gz_Ac;>$Qa>9>dUJ}YrhSKQjOL2(9;vHrVEWBE9sPBe~!_=!n`IDhE zy1h$rh_8W)!_0x%y9FwaZq9C~I7~gP-Z=~vM^}Fphxi?+ILsWFy-%Rx=;pkJio?{K zL;0VfG`f0bq;$9nDc-rD;xKby_6kA8(an*Bio?{~LG4q7ileJH#vyJC6$h0+$n8lF zs5rXFT_fQrM+4}|g?pft!FWb=EG#F5QkghPBSk~nhta{x&k z*__iz;>hanBZ(7JUcu4}$o<=Kg!5J;apdxU8Rok;FM&KG-;>R(v-1X2&`6T#B6BU1S6 zM=}SN4?yaX!xOnZ0Lm{g{|6wMa~R2-6eRJ9NaC=30Wu$1eLs@=BS`9(A&DcKgI*3F zgQ`O=hmRtea|0?5%a50!{KrrlXFOO<4EEX(DDjq4y=4vhKi$`hhNEga&OIK(}0h==14Ps1TzibK2whxlY1;)`&IZ^R*f0EhT_ z9OAceh(E_6{uzh(FQ_<(I*C+1I3taFAjb=A{09{8$l}E~)OX_$UyMV1HxBX3IK*G$ z5N8HC5SreQ-3b~)10`XQG_trcNC1m^Zye&uIK(S(i1*?UUxY(^Es{8LdAkWo9J&5N z?k^y>H%vjEh5G9>Qoi#>5=U-tL?DSHw-ah{h$HvEkll&gZXwm3X9j^gVf_YBa3i-f zVdE$u@l#0dbcTixC?B0e5{I=DK;q|-#Dk#fLFOR0OB0dAFCeMUK@vv}XIUii%}DBD zObKW4SM>gjNlK2B8_sbyFQ^@KKafrkEDIotMn}gg>L5>$+By*6{KXNUogH zIdhT3d6C4Mq2eHSg3^@+)Li6ph7U>oLLBN>Ld8MuL{`5aDvoacai}=DdRTiIB#!J( zKB&9U#nqwW=;rG~#nIJ=;1DlE5=RcVJ|uBgq;Ob)B+ic{{u)VK07;w^yg!i#E(MHB4)VD(22~v+--%Z3Jz8*>30Lh$vIK(d^i5nuRNA5=>izDYF zBP8`=pa_AcPf(j2R$su%1CaYokkljB%ce-;t~ksIM-oRiCmTr|IsIrN`O6H+d_yF0 zWOpLho5<$-L)C-g9aO%}0x=jE7?9h07D(nJw-c<8#9{R#C>*Sj#Py-(gT#^DgFK#t z>>lL)f*q1M$mTmDiNo5HAoHD&#NCnXwMP<1uJ=K6*|2y-PUo&T)VtvjcgG>_i9_56 zNgOs`4RXIPlDGkq``;ktJLK^9LsE|%KK@AJ$l-%54x7gV*&BeQ9@+drBynk|dqBw} zm<2Jv<_i@EnG=Gf9(g?vvU)Wn^`JEau<%EY7vypkIh{u!nUCzAC>-vIMpBRL9%S{% z?ukWGkL(`g`U01GY@z-Fg%2nnf%+IAA=tbXOnfeg!N9-(>t}(uAoWW?3)GGiWUeEWIJ;3(%MeOk57hUgUI$oF9?Jop7j!#V^R6$m)}j)FY=W 
zSUVh~9$CBzhk96ifz%_L4@*BFabcwJK{h`PNxTus{8S`yZV}#@$Wc9G|7IgK<97*XoViH;MK&LKJ{sA4SUVHso+>1Bt|OU)>@Vc?49MXO z%cr1n2DyImKr*KW$$VHp8e}hWJ%!v}MOI&oWKJ!TImqEqha^4$Nqs$%IC8t90ZAM= z9+AyK4*y0Z^{{?7$oBBo6a0$hvkU@iQPtLBq2HNgOtB2vQGPO9_i#nYcM=o!53G`Dy;Q+!Lpz9Q2?I@5K2%AIK+aafuZX|y#Msg>z_$DOr9whb1 z=>d71_92pb!pu*Dn%|0~9@+d^NaB4+;kE`U4l}0$YR(p@I4B&D z%{hrf{1K8kvO9l3#bM^d+zDPd3{B6-=8J+9LdB8YsSXtf%_AVM&oP3EgWL&fBg4YS z6Dkg(K=IoJ^;aNB0BSz6y~y*h$l|bd`yg!~bEZMfL7tyQR*yWsge*Q0WB^jT7dGxW z6DkgJKeGCDIK;O?#X%IvJu{*1*$WcD;xE{~0FX3D{amQ}Gavyh>K`JBBZvPBs5poM znX?RP&IgbH7IRpk{TPtHkj>$TilfJ)6jU6f9yuP3k;MCv(ybGc_yiLFo`wzQMxn6jU5!{wgH(u=WE; zd^M6dJG8z7iLXHtM{Xy8)>OdEw?k5oEbfUUj+_pW+X=|&5SA}s?uU&>!Nx0K;;W(M zHu^a57N|HZoMGx=<2mT<fq*H@_dFmaf_VC&>z;%lJhz}DBn#NR;0k=udD?Rn(&b;$98 zoKM#y`HRx{Vu7}+=^0vCs+9E;SBg}DcLT<9!P{fpdRL(Uh-hd7jv$F6rw3$l*mw@eUgYsr*!&1c963BmH6K|$^0+IqdyvJE z+rvkZ+^>WbzsHcoJ)z>T{0J)tVDsrP@my&7i9=Ig0~Jq26K{czpTN>T%$ze&aoBh! zOdPiF5qW((a{V4jYewiMK$_fvxj|iPu2g`3#3U z(bM4(s5xKI)ayVutRj!co4s$=Q$*Chk_i6oBf{)0&3$mbGVL=s1K|7|4k%Shq#07)FVe0~QN z2gMgS-GT@P28Pd2aZq?7m(R%in~?1l2N?j(kI3ynJtT2t^~n31kk$JksYgzqu>KCn zJ;?JiuyO_@jy$giE9XJt$m6=m<8sL3pRn~OAoa-N$o&gscWy^YpUCb+&PSlW7p#AW z++IZ<|3sdbzls!J$m^-DA&Dcm=da@szkx&iCXzVvx;W%XF@l8%Z2l z{T(E6l_V)Fam~kCDWY!|e%@IP!W(hxFxI|2(tGDlKP2A z>XFAMcjFL0h9r)hA5SBRBj-otboCO+UgUiA3P~I}|Gq&IN6x43ki?P4ncpLc!|Fqj ze?K6JBe!QhB8kK5O^|xze1|-4a04m7AeV2Sk<39(4_}bPVf`kM`N;8(Jn#P%Nj>s< z=5I*iu=5i@<{+mBB^==oTUQBEj~vd(HwrV8t14!WsYtMu1{eff-Y@8n?{u44U8G9R=C6IPEPug5@6|97G0gVZCZf8_o0$nJrKKe~I6$0d;6 zgIxdqLJ9}iI#rOp$mYY=BcYp*JdT8HKFmGn=8&2mkjr`G^njdSkkbS5xEXSKfURQ$ zxgR+_z}8KH#F5hj@_H@g^Z*<8K=&7H{Rzk%KT#5k=K7RA&DdB7iJ`JWOG=M#F5)AtVrU>`wEf8k@G3CIPy9@ zS5^|Isbz49W1_)%X1DSb713; zAoG#^3tNW?GaqISEIeW2pgsVoXht8WtpQoiz`y_tXPA1}d zxd&3e5h;Hm_p^}uccbg>AOSqO?hbO&+vvJGNH~nHyMu(o=(;;dIN(}$hdh3ZJg$qp z--H{fd`^QDH{j*7JV@fOdLL9Dfy!=JeFxhw0}|&&QV;8Afy6=TVf|s`eik2+dgOi< zNIk3_k30^>kE9-X-5qkhgKM23@;HD1k~zrcBagqq<~2ZW5kyjtJZ>P2LtF%hxG0i1 
z^1Lu|zZkjyC5EIPd0to?NgR1TT>?oQ*}sxV;;{80Ab%m74{Kk6#HEnbBac@|BZ(uQ zmjYTF0_&e5k3S%bBd?1??x)~tUm=IHERwy*`%vVN#F57bt|GN7kj;@tQjaXIfFzDQ zZmEbQj%=?Ik~ngDLv|0WeFzFq>9LMlv6Hd_x6E964T)$5HE``4?m_ zayTHT2jp=``w$9_K_Jmqs4vMBaymT)x5PRYCEB>@VbYz8R8xkk2bNM-oS#KSeg@8&dpQAgTY2 zBn~>`9TpDA>uIf!)FaQ2BbQgm>!z%c)FY=uWOpK`^P5QJ67swNvU`x%`ysmrIbR^V z2R45V3Qy$p30n^X5=TzY$o(=KB=?W@%OLHj(fv@6a2VYW1qp}I{ZNo_7~KyA2?uKJ zheEFRZIQ}h(0V&iRRt=y9gxJ4+XIeB;>hE`PDtX&<-9YJ_-Hu~iO11$9uf|t8!-g&e=3<$C1Zp zkk5}rp7*gqn*Tyx*WHXHj(na8@_rEHc{Deq{sQuR6Y@MYvbZObdgSpXFC=kf_1;M0 z$m)HN#F72wizJS0jvtaZvip(eWsv7Rk>_QQ?e#}82ig2UBynW*!ARoB<6|L6;>hYj zXQaW>6|#634)x(k;>hlaKoUou7mh>{N1hi(4hQ7*oXGQc$opuJ=kH+a072;xdHxP| zKLtn}dHxP|ejrF3c|I3;oinmHY~2M&J@Pyq>>LJ=II{bZ^H&s-e_{I{LF$p`zmV@8 zKsE>YzER}#fV}Sjd0q_pJRIbCG1&fVkiE$H8hO4LdEOa$o(_4w7&$$}BKZq>zAqk0 z967xq&yymrLqMJ>|t`ex z&{zwsoE)h77rDGet{;)(Hv=hNki|1`h-V>*wl2N zk=JYGBbkF-UKJpTBd^;jL=s0{$A#QJK`tMPk<`Q1rGwHfa`}V2ZUni!LSA22f@Dq> zXmSd=&lkDegRS=inU7pgA=fXZNan!K{Q#-&MOqhuy#61#Jcpg@4pNVt?~wP0BIjS^ z^0o}gUgU5_E+0q@XJm8Ak<3SK|MVgG3%Py=oq-C=U&!gI63HB7@hT*7A4M^&d!w0!NfV`gG_(coHf7Bd0^;eAJBO9^`a_ z952Y>hAfU8@2xn@X-5)AuGf&`1zEfUNj-AAr4vaU*f^`zz(Wb={J z74-Na@O2A4&@%!-K`a0w7#P6fy-4CPb3n}$(7gswWehNPg2dxMkq?zXHop(a9AxwR zk;IYB2i=1JH3r4}a2)1??mdL5M>ZdHPa;el+58De_9DA~B9b`rIUC6LmLi|?KN%@r zVCI9I2(lLzkI4BKxtv4}AJBeAm^+c(58D3-6Ni}(3g;0SxgTUNA@_sq0AY}OKxZez+>Pw7=}7JbsRdzt>Sy3k4>}7OoB6Yl)WhUK z=FdhF2iXb2AaPvg%t2C*Y!2*9WsrX4{JQ`t-jUbwB9}|Z>K78B9=Y6JgrpvM9WS!^ z$m$mpp&q%uUV@|^xxKvv! 
zGl3EgwB3a)E{H>10ZAPBTxjHb`jE}BMpBO~jyx}bEM9{{{S+MHuyf!+;ec$;5hV4< z?!1g7j;#I#4sqo3YLM$|*!l1vdy&&EG3gL=201L9jHW}#IPhpXgoFdN(jjvBxg4q7 zBc{AvL8AJZNaN4A%!lR9{i}wc`=QkWxZR7ZJp()A z5T+JHgNj95>enE(D?n;N7^HqKQacUSegnlj2!qssFl?LwBnE3!fy80wqk_btM}mTc z8DRTFK;qD&48h_Hz$E1UQ0S3`U=i5)cp&xAqXWU>55OcuJwKEUra%sd*4IK{0SE!B z??C3j+MFO6ZzT1Tko=o~BrXC{2*sI5;-XM75Y>SsE(Q{SV%WX3AbVkLSCG^RB=xW} zF+k$5as#AZ3M2r<93TOxzhGzPfTT2##9?DoAaNHYaoAo$khm|BIBX6JB%XsLE)Nob z;vOV%SQ-FHtw9opjR}CnVfT7~q+w^$g2bi$38YB+uJA?S3JFG!8NE~+8Du@rd z`xr!n#9?zrAifHyAp;F(*qA>^9Coi1NIh(90VJM+q#m{x7$gol%Nj(3)Wgbf5Fa#0 z0HQ(SusQ<72c4M=qCw)Yu|yCbGPs5w{A#IHidzo3c#fr^Vj zD=m;2Aj}IQ7#J8#(8P0niCaVYHBcI41_NnC@w`32iXg&gC`(~!|q%F zY1x1z4mn|S9ClU`NL&QkI0N|$HkJYsH$f7It+51&+aigB+zG48 zBap;lceQ}jS0IVQ&i)07&p;CI1qnd$LL_mJ`LH^E50ZEvR4s_QgCq{yvkVgdgCq_+ z!wn=Z16p(rt!F`gE`Y|j1Csb;kU}UPVRL*SsU1k-u)X0R@jFQ3^FabI3~ls-+zIPDfTR?V#211DpjZ=0 z9OPfve47W7ILNHy(0I&16Tb=-??4lO3>9C2CT?~Q3cny;HIgkJp&p{H0&1Hb3_8^I000}_x z9VGFKP%#ko2T2@u)-*_520Ca0azE@$50JPGlK2&n02Id{iC=|^fv6fJ@oOLfD4v5P z4m%SSB((=g9CmgtNc;|xIP6XVkoX@YaabP+BrYQkA|c~Zw;@b0X@evVJ98Bz;ff>< zig#EVN^E54&<*lP%#i?fg}#QV+AB0 zfg}#Q`wt{ufg}!Fn+g)2gCza|Bml*Gki=nUt%IbFB8h{-6Fl(Cz`*bVN&FK?Arx~! 
zJ0>9a!{!P>QW{9&u)APD;)Y1#AoF48x%(iA!|u)isV_hhhn=MZ60bxO2blxAZ*mHf zIP7j~koql1;y*wFP<#bR9CkMiNa`+G)AhuzfzGN%Pe9Cmj+NW2$G z92CFyP_xz`iNo&F0;xZPBrX6|3ljf;Bn~_42_!B6?Z|`t1`noYI45*+0wfOdR}0h*9VBtsotq%_0Z8I1P_-cO0wi%&B=IRo;;_3dK2 zzk(#Lfh7I~Nn8_2Tm(9a0&>3=lDG+yIPC5rki8*D;yOs`OOV82WgAHS6eMvyB=uX6 z#9?bcK)AAoZ|$FAnGc6v$tONa{6^#9?;6*MG^<8ht2QmmrBd zBZ*Hz5_dro--0CWiX?soN!$%d{0ow}JCe8vbaD~oeh(yZ6C`oiorxg-h9HT1A*nAx z5{JzxgVaw!5{KOd1`^+bB<_o3&J`qa*j?Ko^I~+mcCP?Ce zNalwii3cHxmmrA;BZ*Hz5)VNV--09#yCV{0?-e9**j+y$@h?c?;YjBEMG^<4w*?@N zFfcGEKo`D%xDiO|9gxH$k;D^_#G{bJ8<50ddzV1&S%4%SgQWfdl6WkV_yZ*II3#ff z=wvg zWKJxSI7s~ysQMZtaoAm=AoX*Q#Iup?-Gd~agCu?jNjw)x{11|N9+J2WbbO(Te0!h3dNjw5cd;*eq1(G=I zPB)OfGmyk5A*tViBt98Q{05RZY%MX!oF7QyQ<2n5Ko{A7+&>LT+yY5_I+Az^5-F(p1RCnrA{MRQ(xZUDlll$?OF;CKeTqI`%e zKrYgYj|wg^G>?xj&a6s}FUgP3%qvMvPc6ztaezriVsVCNrfYJJYbJ_>p*dK6QDRPf za(-TMNn&1!XOd5np{r|hSrQg^n#ac{S5z1pT9)L;7nc-e=B0xSOixaZhxk6eBrz!` z73A&U66|WBj`j;Kv51dPNi9jt%rP{H2e~{sBeAGBJ|{KLGt1RADEr^Xn{Cv!U z!InB3xdwUrA*?VoN=huwOolqiGdZ}#&=6!?d`VGaW=XMUvTJanXR@oSD;^^~jmR<3 zHNZPL$UE2+6fLk+0`fkRV_m?>0VTr0vW~^YsYNA51_ec#c_l8HiRpRy#U+`^#UA;^ zB_Lyijp89FGuQ~L@el(HjnWcJ5_8-VGjmFdFw+*2RhDk~MY)M3C8-sjfvu`9;n~u9nHA zDT!cCfOm2x?lgpAIWY#IxSz07z$Gj!H6t8Fu7hwIN20R~4O3E!lZ!IJ6$JI1N6=Ud zXPSWG4pcg(CTF{(mSrZVg0e{_F(E~SR^%KHQErl+TH>2n;a-%P5}a9uy?lbGG*3@0 z@hiD-6?kkHq4P_`Lj-)cB;t;#8DM)6W}OsYSeReoA~% zVqSV`Jg64T%u7$sam_`lACXiW8N>$^W#*>F7o~#K#uw!0WF}WS8@Z;T)SB>`3%h!Y zc;EQqlEf0QXR$aRx2~ep;?xpXgxi8kjB_*dJab)xyhEUs0a}R%&JZT?@#(20A>c+v zZb6PkN@{LmUV2U{ES&{-C*w)~MCrznjtn44$TKf3-_SHYwZt*GB(p3PA_yu+l8TG< z;-jMcK+c53GLjA>qx94gP;&Gw%_+$&D9TSxEiTS4Dt66uH7`mnF3l;yY-J&W-VmGy z!0nCr)TI3QjMSWh)FMxxOaoA5?-v|zXaH6LP9>hfB_;)lnMI!7nVGIWNd~U2uCCx_ z4kYEE_zpQeP{$H1EdV4-49&pqg!ly92J{S0^d!OquJ|2KzPVW43k^=NGd(kr!!tP1 zGuSBJ6{SgM5FebHlbT#&SX|;7Tw;-0Selpc=^9)DYmNE^o5UMp$)Bzu2_u7e zm(;Yx(wq`V=1l?TUYHb?Tnd&lG)D?cXyXqQ-4JXDigv7lW@ucRmtT~UT9ld+pPN|V znF;bBs0aW%7{&RCC~3(R8VQhu?jA8*T)|n~(7-uAuOu-uFSV%H(y^#0u`(<(wcIn)FW4a798xM2gNp&zOi)`K 
zTa$+BhBCy*=at3h=BJeAq{e5aK)6MzIcbR{Ntt=j;xw3W;|Gl~$SQnfS^vL%b$H9RjUH zf=dc=JaKkzaCcuq;zJDLLk#0xUHyYC;)5;Gn$?sW8vyD7fy_pAFE~w@LAxo2$+-m{ z;4<3X)iX2G)wL|ZJ2}`m9)dC<6sVpH@(w|7_y?Dy7UK4J62875!tY>5qB@XrpMyLN zH5eXA=4tsw<%vZpo|&%xA@MwjE5TQ>TKj00E&t-SJ#ku@K8n`s2d-jndTXhS5jOQTw-XMSsb5JmY7qD z?1vD8cvn-fpsRndBX0j-w-rkv|!2l+LlX>xvPUWscE=IBUp2?dp#d1`J!PGWMZr&mxWDAl>T2Ajlt zqO?9t;{8BfgUpiDqQsK?BF|v6ct{HX)bLAkbt%A_Imq{EW(Cxj&@#Xe+Rw&S8)8(3 zpuP{z?h$gWY6`AZK{YD68Q=yZc)S4QWk@yandu5{pagkm26+dYp%)0CHUWi(V@WjB zFx3Fmc|d8E<1iK6I`Io8DlkB$BT{gf6B!(&TS0uFkYP3rgT~MR)cwp!1vP<_NC;+mn5cp2Dp`h+akygM{S=Pp{F2MSJ#Nd6rcDggpI-G z@vg2uS%$8zZb_M$u3oOLu4UML5S-|l3GEo5Bx0~QrlQH2@#MP z1R5QJJ@AfA8Z=$3N0=?)pp$0MRE?)Q8f=X0dUyvfiO3EfJUF2K4oD`#7bsT3G=lqqL}kmhf)`Uk~So94f*kNc&{F%K!+EJL3jtf z3}J;dS^+(LQUuEAgh#xqt81_UXmr9P-VkF{!ib1b3D6KSc$JK0yl-Lwc(xOBkjWS{ z`vRKqB4*Mh9><(ZJkBW>kPP-2mw3VxF5m%6@Bj&p1`lG8gp8)Td2wn9c;E!vViM%; z9vNmE8WyGIq$U=px+In)Vq|Gh#e>BH(tGrzdVzEsFgyaQ!Z3!ZprJ&%6`(2@91mq^ zIU5vAu%;Jc3cx3XaGMK@K_vHCkViB;!M*~`JAk_{py3}_IDj099G7O$g=vuBa4as) zOwR)iig<#$1rR}OoiLn3aITr4aS2zmqEyf_J=dUM%lIH~LMfJ3jz|Q12jK`I}SG9LX1kRZ7#GC7(?^;;P~W>)a2}V@HAjf zYMv{OAsIuX_%QJ34XktqjsCbIL_uw4tX3kFfEKZM5*+)%t{$t=Xug5AQy}x{Apc}0 z=73h?AeNed8tvHqVHxiWT3!+#Qh;bSqnC!DDN&cy;*z5LO3#2y$iRok>Rj{0#G2p zlAAZeL7>&R#Eg%nq=F1U@f8`S6y&((pp}#er{rd04;xTHg(dtU0b^*Il9>jbri=&A zPzHH}rj4D@#><0CEJ5Rc@x|qd1!Rri1)If#hVa0%z3^~EjJZLxg9*6EM680Q#sD9v zw!&JeBMte%tOrfNVnh_A!axkw7@33DFhQ0$5FShO3pR>30k89QHA&7dsPxRt%!O1; z0ien65TvFx=2Q|H^$*CuIBRF*iVtbvFcWiD1vD%jfUyh&qY**C5JLmdTrPB$0b3;q zYc3eWM`}WfNS-EGhXMC_rysIl{ zc0H{iKCQq2j6v%}T;hWbF_z#{GN4NRpmYNduR?NsjddDt|qA!smY!+7$^=lj>kye9!SX>5+|1N;E8$244DTg z{RaEQyA*)SHIxbulqjIt9cgz$Chpw{Xyrw4NnR?o76w79px_etSO#>M6VDZRbiId87*oMW)jKE+fv{@V)WRpVkRhn82_5(>F3B%SOi#r~9l^Nt8iPj!;He5DfdrR;h9FDx z5OYZ>@hQ;#ntAbwo&lM;nXVzAF~kr9$UKl?JlaktQ2mCOGXl*7fa@dZFpM7=b0H?6 z4j!lmgqi_gXP8OEiaGR-T5yR8cxEFLw6WDP6|o}(bWK0YxoCEn93xWo*!8oDIj z6EeV(SsV`;rpd|7PW23lcXbUeF*3|7jt4DxL{vGMNja&W!6u*)977A3EM#I9wDSrq z{$wM(3a&GsinK66_i8 
zAB=U@0+igr=O|=mWPBlifm+|76>iAOGC_l;&@oeJ zWP=6?hs#(5xQSX?oLU@ToC!Mi13DzGtkW&=t|h|*q%49J1UbRj)&(D@MX14SU?U&w1ODIG-yxe2nC7(TFS6p!Q3 z8ffSa*3m*kvv^nV=^vouG{7MUn#hFaJ#b&jGZ;2r=xP}s91k5@1TD65M{g)&iIly zfR;S_gZCOE6}89%)X6!Zy`@+uMi7JO1D@dmydk+7X?O>gO40ku!)LI=5OdHhGuRUp zqtqP-L&P&t10l{vpm8{~Y9AIGsI^&eiDhnLc4|DhY({BJ5M`5Vl54OLXt{m~Xz>u} zgv)~XqEy&oIOy^r(4HaFl+@(JN>B$6(v)^Ji-)Y0^9wcu={JuDuY5$41fRbGYNet& z+7Fh$@H;pjetH#nc*r#gvZTNy9&%zQs6mOjpkO#Pbii}r;8F9W)U^B}=tvw@2G2or zT4jkj1tnN_D`B?kp@Zp=A#`X#hqQ78)au1C)(na!@Olo+EsUU!0VKu1S4tq6i8M$J z8p+3JjtQcJg6TGdIj-PN2rkFq?&ZLr2l;DUjcpE)Q?j$(!i`Ph`HKhS~7L{W5MF#OOsCjN;USfJ`5x&7t z^OXE}(7pm!(CP6g;XTAXW|@)?Jw9OAc?}$Ai8(p00cD6Y`k@B^XW~8p7XtHDlq*^z!#F+wb=d=PsoN0g2xk208reMPC^L8f7gA=>b_g6ae3}8iIfK-^i+bEN zh7X~}A$6=#hd1%;_CV@ZqZV$E(a>B2SI_}c;Qln$%^Hv+$YE#kLVEf^n9VHYV22h` z;4UhOEB}y~C^@fc&fgB9kRR-|`?h###sbcDsde}PRnZcIeF-Jt-8X9RRnb{Kc zkWWaWMD`c7I>XaBgtxSjQUi(|#0?ta8;UVQ48HG^p@L+R6(VB|&M01bIWwN^mxUjIE&s znjzMb6D7_F)qjZPYoMwRF`xlyqm<;wC#Iwnr51yZz(cBi2$%|5XM-h&P{FyD;ES0+ z>r_Fjg1qpIEFuQ>fv)KRK*Y9u@^(o_!DGQ z4Ad`o&BeEt9j#S@)Qd*$pMpnPaBqCX?B^mcki!@-L5#9sHde7GO3=^@eB=W(f`J}; zNI4zDI+W>TE_+bu*TpLOK2UGpGL3;6A%&)D(>M;89d{tggO|Ifl|U@Nyr>5 zeIDEqL&z}Hq9xcEd7U>n?}6ufL1PQxm2$48;LDJ}Q>Cs!ps<0)Z!owBL0f|injM9= zlAy^Fe_I2xji)zZioHbzU#({V=}q9?X^piz0lJ9;t(*kiKLIV- zQ3m5lNwKcz%@AlC9ld1%4G`o+NT68=D!D+Z9b=^o?j%Es!x62$+yHOPx)8R19yXur zk2w}h9d{sypJ_a?%{UX3$wp8~gtZAr9VepYG&4|6L$84#&0uhifO!nz(76N(ypjN# zClIH0BKiU-yALp1JD^!2_%s%JqQV^+nXdR!V{nOCT4qj8d>*LLgS2_V8+>dAEMtL- zJmi5}l;#E~GeO%Apk@Ga?4eXDq~suQ1p~S}0_p_th%>B{L`nsP>>88+CQw3v>UnG( zN3?<;bmTv{uLQ0OzylkQeMPV_0qivZ?&u;VkkGR?tauCv0WS{#b-JLzhvIXzq(zF2 z;H@4w6CpT~(3hg)Ddb>b1PV6rW=YWfM4-+iX!{C*J(Dg4;C(Kp@t$7LV@NP6Ab1^1 z{$MhpEDO4%C?0+zL40yzNosn25$HHN&;}kuQ_xvupgnW2+31jXS3`5SAWS)E=N9Hp zAVY)rcu3yF*xCfjoxvqWX3+Z#5|crf7$Tfel9&!}nStX3DdUh5+vu4PB^43KKcGSZ zYYmGQ=|P}NxWMgq(0~?}rDR|aU`@>_grAyYmFd{9Moh9ns;*b z0-vZKgc$UIBty{L6F41$_wj=F;*8G(ixF%m5HJ~E-U zDpL6lYLg@C6O6GXLMDO6lR*1~kjqT)fGao|g7(+J4_O55vVf;CY<)}EkQ!Pfp&AVt 
z{skR71TG~&!-piD?*m@(i)7le+03AME@1sE}alnwy%Fs0TK>C^ZL^j`6vi)RaZo z&7_)$$F)ca0Bc&sa10?0#MyP0YdQ+~cyBl(-?yMo6xwLIG(6I=?g+-J=*j#9<#v9wg5u=)vLU z;L{5{{UPU`W7YxS@o6$WgKh;DUy*7&c9%dL4{4O5UhRU?)&K=I4g)YePKjaA{lf%Y z0?G-X#1C=2D`>17=}sC9hr^X)I2Ef_tOY0PU0q1m6+??|40%u?fNl~v)1&BxiJ+*4 zRD&+zdPS*edhzk;6&3LXsYS*4d5JlhC6)1I1`K-1`MIF;@)`8<@=H?n-2Fmzi%Sxd zv!VR-yi&b_qWpr?qLNBz%R4nAKCLJ*H{tl>q0T|7~z%Y>n{WGBYko`ZI1pN!3`jO-R7zy#e28;gV zBn;Mzko&mS`zf%f$B$&zl|j5e*@Kz9Dkch(EkOhA36R`k`R6j zj1U8m^=~7=ejccPWdH9VLB9-CKeGKhNzktW)sJldZW8ocV9|eygz)o#>PL3}K@#i_ zf$B$g|7jBJPr;)92nqI=V9|e=1pO^o^q(a`{}iZx<5*#pb`Mt|2Ij{e*ugAw@A=`1F9c6{of`* z{|l&o1PN4Ea~3Hql% z^&{KQPeS-Dz@i@%2C#e!qjBZmHCXgNB*Fclz9J~-k=-v!g8M;z9d!MVNU;9`7WY3U zLH`{r`k#=X{{IkeibbG zpOFxL23YjVlVCrnu0{91A_@9Ep!Oq|f6XN54}j`N4!;%>^v7V)uStUb3@rM!NYGyb z)sO6deG>H7V9{?(f__j~p~s&E3Hm2sv40W?`e$I#??i(B1z7YulAwPDR6lb1c}_z9 z*@8vC3kmigz@p!k1pQ~Q=yxMQ{|zkqXOf`*2~`u#}IF9Fq$?0Fw^k-nvUr&Pm5~zM;|2L4JzXqxw+5I0# zNdFyJ^f!@U{}e3xTS?Hr0E_-M67;XZqW>ca{@;N`|0fdkAHkx(g9P_qz@opK1pRlg z=a53HC=o^&^MhA`iz@mQx3Hn)}85cSK zY$QRy02ck5NYF0>)eoh>^IxDbJz~?J3REMq`*)CFzX2Bg-$_V+HdyrkAVI$e7X5ok zaDNCE{rgGKpMXXGK@#-mV9|ew1pO6I{ZI-V|3^sB-vHGJrNH`GS&3gi(gD?ooPLgy zVE+`TekcXD{}>7S=Rh?g+kcz{{VTBOKS6^2Em-uQB0>KFEc#EAp#KaO{h%>*V)OqE zsD3B~4*#&FqpMevz{rrmr_w!)Uf1L#T zC9voR&1n!De=1n?-z33)11$P)k)YoOsvkN0Z!-@u~(B?wiyzejY6P-;tnS1gal7{r)Gx{|Z?2GcXa~{?fss|049Ofkpps67;8F(f@}8{RL3{$o^*}A^s|` z=w~89e*;uMa`-Wl(0=d1qMwNb{WGxWXC^`a5~zOU@M9rC{~D-%WdE~~pnnGz{p=*@ zKY~R+2MPKwVA0P>g8n;L^n=!P5SxErVA0P@g8g5x=;tFrKLa;r`OQy)eh#R9l{Q?TderXcizW}NqIsRoy(7ytz zA36TzNYK9psvp^Yc@p&R!J=Q01pP;#`jOqQM1uYcSoABCp#KgQ{VF8re}P568VUNp zK=mVsKRXHO?++IJnk3lI0u6X51s=Zv?WrU-{|Z1gBD){7CXiVD5>Wk63T(eF3GP>c zYDBi5lZ5cof$E1+VEau-u-^iz5!rrI67)Mj^+PGJ{bnTS_kn7JQegd{HI>AMe*{z` zlmhFwCc*v`s75FS)^9_C{sO2*CoOu ztlx0@aV~e$bu>V(YInQ2og6 z&m_VA8&Lho_Ggix{{d7#vi-Rv=zjy%k8FP)3HrZ4^&{J#PlA31K1lhGY(HpE39;eN z0o9LeKOYJCPXvp8eiHO6V9{Sjg8y}(`jP!#PJ(_DsD5PsSCF9J0jeL_{z?+`dtlLD zMS}hSsD5Pk3y={0F?f*$a`mKQKN46idr<2(7qXDWP+5TP<+~0#me?JNO 
zXJFC4fCT+Zu;^b%g8mIy^e-Yo{~j#*7n7j>1Qz{ENzi`$@0T%tsNU;A6 z7X8af(EkIA{uLzXXMr9(hn#*_k)U4ysvkN3tR_Lf1XMq={VPe(uL9MNY(HpEG_mog z1J#dg|0FEsmjVL=_^4gzb}dMy!@$5`0oDHiIu8PqfF5fG<1$DyK-ToY&I^Z2JcFoo zfO-IQc2glE0|S)HAOTgV2DKl0To6>0fk8qB&O~8AkF|u0=`cXd#HAm0jyFsL%zhZ( z3~E1UZW1O2qG9&Sf;dPR;wyxJAw;Aw2z32!~W?|`^BL8VHA4&&BmdBEmXfS zOe2&=*S`v?AKhOEp!yS_?J%fzc>06!H$nBI$G?I$M1>H9f|Ka>U&Nu`2&y05|LE>~ zg+spwR6n}^(e+C+Lh~=0L^M=CKbnKlxC~;@HGK|f`a7WU&jHl{ivI>I@ox>aAGEd$ z9!Lxf3|XLz$-uw>Yk$JzpvJ+t4Czq)=;2=n)!zeG31Oh?pNT_17c}8(LA1e1G;cD1 z&Uprvsp$Tn@E)QPIsbyrl!lcbFmceC5Fj_9yWa$AzY*MK2m@j&gbgBLXBwf~zXFT> zZCJwJjR|}Fr$OzP2U&uI(d~DH_6gAKKY+#l4lMSA&WXk5|5m8|{vb&t`_Z@z@i^?i z0k!`H)WM+i*M-IYr8xY5MGs;rG#NoS(7XocFfcGIz+wM^PY{bepo&5E_h7OAJP!LA z^dTDGLMS+i?*CIb?C1Clu^)a8JPQLu9~S$$nIZ9yp8wuL?ME*^(CrU~>PJt%4A6p~ zQ215h(60m4PbmMDLfa(h>CfQ{#Q(5-1`7X4So}W+hy7tt`=QAY9{vzh;arC4IP8zW zVn3*?K+pesaM<4hwI6m510NHE390!csU@$UrHPbmCA2U%e&KPA3Ge2N@@pt2f0{M2#SUjVhAQ1~f9`^4z! z#{g=f*bvA9FSUYz(EPnnKB?T^zhSz>JNjegHe!Bf$aT(r11CX#g>x7f7_Q-Pzs6ro_pia?elB+G?w<~| zpHTjDh3ZFl|6-_q=r9P}$>{FS$Dw}{RKGG@Cxn5nKZ_lE{9SAENtTj2nCSKY-egUjLvaGX_C!?EV*E zfQ)Fu?lA=AKhXVm==OicVSkh@B$X11|93d-FM!&Qoc=&}GNIdV$AdlpG(hc#sfV}* zq87#m(d|6g!w;sPko%i>u!mm<7WaehFN3)U=CA8G>|X$NKkObkkXu0*J^n7?uzvy6 zeo&Z!^nv0Jbbl7Q{h)Q8*vgMJb|7603^@=APNLhd#fv@sB|5VKrg>v;?S?)2GKub3514`F#p5&f1&yzp#>9VVAuf~*kxdV&vU?J z(Dna-g+ELIl>Pv{00x#$LE#U&KMAHECN9j!0A6Z_ZvO?S{m^L~sHuqj2Q!C{4;ue) z6%YmoE5tr@A36Pi?q5N-e<}|9k3;Q;4&&nV{{$TNuYlSQOQ)dx1G+y2-Tt>Y?Eeb2 z-y3c-gn{n=7dY%c0JR@E{6Y7ppxf^#fIa*j-b3soH2>6qLw^udKXe!cVi*Hv|EXF4 zd;D>5K++F#_<`;ZKzIKw9QL{o!=j~srW^TW~Y4-mv2ez%CU z-&+v7|1+TWBd1@`To1baH8|`y_yF;*9E5_C=>9LqVgC%M{m9`DI{zEp{?$0__k-F` zsQt7Ihy5F{*bh2?7~TH2IPA|O(*74X>_36Ue$e^7==SRiVUPblsQu7kIGo|HC4@cx zA3*I#PJf{DFVXGi5yo!+F{u5TaGN0v^!R5N#%{j`CuaTwoqvgLe+mx!KS1pdgjfqF z(e01NVZQ^^eq{fH&M!o_|2q!*ojyXWB;@~3IPC9$+K-(6LFbpD+wUrZJ^XW^_7fVv z?!%$K3927<4?V;;nB`}u2=@3}0Chid`2jlr2;Kd!aoB$iYCoazt7kauzktPl(D^;+ z_B)DV4?mVq5FewDKcS}|8&T~3XW)XQUu6G-&W}O2e>x8PO`!G@3jaws>^Hz-Kj{1n 
zbo+ndu)hOpKcVpdio^Z{EcS!WPe8XnObmPYZ-d%TDEtG(u!sKyEcS!;zoXl~35Wg9 zp!Tz&HF(hDe=QFC4?yilZvTSzkE7c!FOJ>+Z$3lPAA0`{-F_)??Ea77!AyUk{gdeS zH{q~f2YP}1Oo%h!B)a{zIPC9$+J6A56V(0$?Vm%pe?AWTk3#KV0M!qp(4vE377qJ8 zcp?5r?*D-H|DfCNB7r^p)4oD%B-DPem%twWE1>oxm*1fMBk1S-W`=zni&o7BR{d0T=TgQN2|DfkTE=la^{|D55GSV#vXocQ2pjm`yr-bmLD$C*uyVE2;zRwnop2k zkoz^TxPK82`}aZ1clk@lC$VE6wA zEcWYQvENq~yZy3%K)z*Q5Qk8365anPIP{x9^%ENZxq?H#8&p5&&OEr~SjOMa%VPIG zhX^G6L3<8C`a$7ufW`kxa@hU97`&^1fq{_wV{qu-N~Hd&IP@PUQvY=v`fow?L#H7? zK>)_+`R|e(_VAk^2?;+~`Ui!dDVFf#m&YD{GK`RvLTLPmOCG!Z2cY&N=O1${_B-RS z--<~4?QqzC1B?BZSnRLDVSfbFenoJoBM9{HFU4X12Q2nmW3hid4*M&i_RmFF4Q4@N z1_lLV2!axtd0%h#( zzW{YVa{PHTf{~JS^e&Q4@RkRY1dU0#rY!{R!G@ zg0JR^v z|DTM-{+T%J-vG6r(EQmH9QHRr?MKf4saWhkg2R3$=)yA>n8%?sy8rj%uzv#<`_r-5 z{{x5pZczKt+aKumn`vWDKM_#<=;IfVR0Z`60|SGhHum)M0P23^^pl0f{joUgZ-d%T zsQ<7GhyLYI{m^A}a0jEie<2R{|A4w5IsS98xc?gt`=3GWSAp9MVPJ&64)*Zp;)S@B zQ2kefL%#x4e;UMEIEik5xeoUD(@=rre`Nm`V)6eW9QL{V5=IaIg*fz^ zK=l*azwj4_elMtg4J5lkOmzEy;PC$isQ;1kZzUH0tLkA7|01aU^C7_ivKoxh?N`vl z9{w+&_9O2fuEt{j1RVAY2|z3+lz-mh(60g2F99(I>~;`=?*11z+SGT-2SJEN=rA_OYA{B(Uq>H%_*rN|>_=|@H(;^94u}1gLJ<4cfo(+)5K#ob z5{La4v>^5)*WXQ8>_3IW{^wBp36&p5aoBI54Y41&{A$5szl@ro6*t5l z{s~a~k@HU*7W+@(us;ZDKcW8rQ5^PHV6h*x7973(SYw3U{|lk^N24V;NT?vKfe-({S|r;`;p_n4~zYbCfNP|0ct;b`wiXyt~m5Fi9=KdfkO#FphYKx zg9-NVn*en`a{V_Ei~H+w*dGG5AAS5A;wprDs&LqU18P5V_)W%Q|8*Sp7vZoU-T#+x z*#7~G{Zp~nFJOv2{wCnCAKm}lrr5)uLmxB!Ovhq>Dh~VCLG35he(*BG?*1cC{pjnb z(ZkQx47>XkpzcR5e`jHF{|+4X3qUJgLiTUQVSfM?`{!V>|ByL$|HnY>H$@~>Fbm!P zd(E-?zX6N=^RU>TV1eEKNl^Pi=Uag-h7#!ZM_XXGe*@Hh|1WWiwTVr?s7O4G% z<}bpmvD+VEfSG@nVX;5S2D|;wp!S3IoWT4JqS5^yYlGeX0;v7S?Y|XR?4M zgQ*x87znjLrrKh+pTh{#|EsXr-)V>4ei0c+`a!Qh(EZbu-G5rgx!8S zIf%uC#-Da{k+g#r_@~_FE|u7yj)y?5}{@j~xC7u-JbDhy9bF_7iG9 zs<~i~zZFpZnedo{Fd)8yutB7v3-@UG#zYWxWLhZMF9QHRr?MH4uAID<F)s+ z`_E#rKMRNbMo|0F(;vG1sW|NafyI8%S%v8B_YNHP$3X2zA3s13|1&uB=i$(g?tV@; z?BU-4)ekz~77^MIwa9!HH|*grUVLTUhKrg2VnBQ2V!`*^i!n_v5fX0ctG@RKLw!_%t8;ppE%sl;Rta* 
za{Kuq7WZHB#2$W=v>`5qQ6H-?C$>obwAAipz&|ek?QE>cOwq_*u#Gok@l;3V-NohEcU;_Vt+jj`*%R?Cv^Tn z6%P9sK{@C5m;R1<2Nk7Q_ zZ$@U!{jW*E*zJD?wVzP@#Rg-yKLTn$a{6V)V*h>|_6rz5{9lIfI+z6!1vA0KE*$n3 zKQ$+h2jje$bY1bo;Gw*#7})KcV(#KMws&Mv(9$r2je&{USu_R}9A< zeriztgw_wsgkulC30T4pv?Uci{8Dh(??$Bk@i^>XfyI8%mPmB_7vQiz4QfAh7y+Ju z(bMm29QGf8+K)W{CWfW_a*Mzo{$0k9_(w0l(Cv4Oz#je)9+3QtJpUldX+n>`w^7*L&u$8FKcW8nizw{& zOL#)U54ryi+ERpWziTvh`%^3-_M^AoAfoW#U~q`WZvO&5i2carA81Pty8V?n>~DhF zPiX(davb`nLG=?lf9@#`{U@OM37y~d0Eho~VDUd_NjAFwrDL#%|9`0c=({i;~y?mvK*grM8c8izgngFGNM zL65-(+XW&Zq97I&$6*hDhXjcI$mJJkN*>+*dpPXxgW8Y2{s!Itn>g%`z+%4%miXTp zkKOge`MCt?r3Zm9j}{SQc}K+`S* z1A|y1cK^S?V!thx_zS~f|5~X1gzEo59QHFLV&)%vEcP$MVgG5U{e=9#5QqH|SnLN4 zX`_e#T^#nmgW8YYe?gCb<0S0y$KVBtDfI9|*RPj^J^l=^xF0m6g6{q~IP8~$+K=9U zgt!Wkex~EF-vf*Npe_Nr{cOqD{qF&_-wRG;* zQ2WutAKm{&IP6~mwI4bEdSOYwr*YW70ct<`{Ws|TKZe8p16b_$!D7Eo3ij~70=1t| z|5rT)d-&hLVn3)!haUb(IPCunwVzP?H5P~cAF$XTfW`m2aoDfs4T)(&`IkEtd-z#E z^`oaB^!Q^-#U6efNtopy=&nQb@SBOl{xGQhgxvoEhyDy4`XQkL&Wd2dI1Ri1tDySP z{g19+FAcl@6|nd}983H);jn)O4*Su=zZQr67Fg_$#A5$R9QJR6+K*m-qT7E2hy4Li z`;psE(OB%)OvfJn51{r#k1+tp6^KB$UnL!T_%}f9M?OC-7K{CJaoGRL2NKih@sDo* zHyrvod?EVL(=S96ocFM;YO6n5xc?&#`xE>i zHW8}--r%tR0Mvfu_|L&&e{?o>|4)M2kA8m$dj1W|#_sK4dQ=f`%AFcFPV!y{FMSB;ZJD&u1GHS@Yld%e;F40LvYw10kt1}{XN80@a)Lo zkHdZksQt+0cLf&vSK+XKJ=A_e>2E0x`wO7G~pHTX%DZ(E90a)A*x=Rqf{I3$WPFR)*bvo^Xi&VeW?pBgXyRjyUwIK=l(!zcyvq-7f)kKXU$C zjK%#`IPA}Y+7HtYb3epaFdhR#DGvJ+u-Ly8i~aL)*uM#CKYIR0xBn&%{YRks36-B$ zaJYX3)cvsd1C1Z7z~X+Ta_sT{2x>ne_ow5~{{^Za-TmnKH>n(Z{2##L{?%CAzZ8f4 zLePVY3Az6R4*e=n{e;fXdyT{WH=yoE9{*T}#r=sD*u%dIYCkOgU?~<_NQ z9)1FmkeqfI7I07+-G0AH?EaqtwI8|u*owveojB}&47DG<{R?pw)G-VU48m2|-Tw)y zKM|o6%tF`CSB2gE1)#w$wDNxk7WYTvus=8oViTeKvmA&1G#vWT!~Zo7{S{FCgw9WW zhQt3IQ2!&3f9%2Hf9Y!M;lBrJKYIBE2^DZufC;f`?BTxyYCm%N-G{|~9~}1Ig4$0g z{Cjcee*@Kze*PtT_;=uN{{^V~k;CsG7WcE%U=KgNXh=w*=YNPOIOxE{e;oERL)b*Jgo+M{7YbQ|1m7?-;cxo1yK9Z&rd=Rzi&A7 zZ-we7lzuB^MKlqT>qTLV*hL$_DjV;LW?hk?LClr26b=ciM0qTC_^m`7A`~7j)Uj(%urXQSHu-w0}0;(T$HzrICh@Jq| 
zkAD6utp5yB1Hy}OxPJxI{mAW?i&)$*P!9<|bo+M@X+L*8_VD8XO_ne)Fd*B18H@cz zIP8Bwr2V-#?9aer|5YsZuZ7x=zJH%17Lxu5jUTVTVgCuJ{mA9-bu9Kj!ePG|)P6$u zkKV;${|_wo-^60ScmwwM&w$!bDE$dFV2^)|d`S6^9RIhm*dGG5A3gjhKuN6&vBp!N%ZGC2}P&;J{7*#82H z{SUF&FWd-;eVD+QFthd}K|cmDyX z{i>iqM#AXs?}F+_cmE5henReV#o_)5Sls^#i~ILL?MHV%Lp&t?pr5~n?*4aB{pk5$ z398={$?qU0dNO^9!~Hj)?nh2PZ?U*Ptr-&k==SFjX@6of_W0*0gp_~C_P@ts{|y}W zZ-d(J3i31(M)&_^9QIpavHv3$`#oB)`~MG-_B*#=_kRW!`#)o`e-jS-9TOnw2XtpO zlJ}vv0wP4#;;?@P)PB&J*&x-R@rSQi>=$jt?tiZ&;_Me_#qNIv&|(D!1_osNzhkk# z1c&{>DG>Y7;~ydl@(~2*z~>e1`(cR1Yt0*m|Eu(+SA9g=?0-9H0rKO5ZT5C$YvA#4z-2-T17{smC| z=V3aI_)=V!p&52M4N`ayb-@p-6z zWHAsMrVqph;k0%J26im{;tNpy= Date: Tue, 23 Mar 2021 22:07:13 +0100 Subject: [PATCH 130/258] check on task amount --- GPUSort/src/quicksort/quicksort.cuh | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/GPUSort/src/quicksort/quicksort.cuh b/GPUSort/src/quicksort/quicksort.cuh index 1d71ad4ab..15c3dce52 100644 --- a/GPUSort/src/quicksort/quicksort.cuh +++ b/GPUSort/src/quicksort/quicksort.cuh @@ -156,7 +156,7 @@ __global__ void cudaInitTask(ArrayView cuda_tasks, //----------------------------------------------------------- //----------------------------------------------------------- const int threadsPerBlock = 512, maxBlocks = 1 << 15; //32k -const int g_maxTasks = 1 << 10; +const int g_maxTasks = 1 << 14; const int minElemPerBlock = threadsPerBlock; class QUICKSORT @@ -212,7 +212,8 @@ public: template void QUICKSORT::sort(const Function &Cmp) { - while (tasksAmount > 0) + + while (tasksAmount > 0 && tasksAmount*2 < maxTasks) { int elemPerBlock = getElemPerBlock(); int blocksCnt = initTasks(elemPerBlock); @@ -240,10 +241,21 @@ void QUICKSORT::sort(const Function &Cmp) processNewTasks(); iteration++; } + + if(tasksAmount > 0) + { + cudaQuickSort2ndPhase + <<>>(arr, aux, Cmp, + iteration % 2 == 0? 
cuda_newTasks : cuda_tasks + ); + } - cudaQuickSort2ndPhase - <<>>(arr, aux, Cmp, cuda_2ndPhaseTasks); - + if(totalTask - tasksAmount > 0) + { + cudaQuickSort2ndPhase + <<>>(arr, aux, Cmp, cuda_2ndPhaseTasks); + } + cudaDeviceSynchronize(); return; } -- GitLab From 9171d9bfcebd746e41f3f6d81de8e25834908ba5 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Tue, 23 Mar 2021 22:12:03 +0100 Subject: [PATCH 131/258] maxTasks --- GPUSort/src/quicksort/quicksort.cuh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/GPUSort/src/quicksort/quicksort.cuh b/GPUSort/src/quicksort/quicksort.cuh index 15c3dce52..c63f0279a 100644 --- a/GPUSort/src/quicksort/quicksort.cuh +++ b/GPUSort/src/quicksort/quicksort.cuh @@ -182,11 +182,11 @@ public: QUICKSORT(ArrayView _arr) : arr(_arr), aux(arr.getSize()), maxTasks(min(arr.getSize(), g_maxTasks)), - cuda_tasks(maxBlocks), cuda_newTasks(maxBlocks), cuda_2ndPhaseTasks(maxBlocks), + cuda_tasks(maxTasks), cuda_newTasks(maxTasks), cuda_2ndPhaseTasks(maxTasks), cudaCounters(3), cuda_newTasksAmount(cudaCounters.getView(0, 1)), cuda_2ndPhaseTasksAmount(cudaCounters.getView(1, 2)), - cuda_blockToTaskMapping(maxBlocks), + cuda_blockToTaskMapping(maxTasks), cuda_blockToTaskMapping_Cnt(cudaCounters.getView(2, 3)) { cuda_tasks.setElement(0, TASK(0, arr.getSize(), 0)); -- GitLab From 6acbfe7e4179e2f0196f2bc140325d65efb4b083 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Wed, 24 Mar 2021 03:19:02 +0100 Subject: [PATCH 132/258] replace modulo with bitwise and --- GPUSort/src/quicksort/quicksort.cuh | 4 ++-- GPUSort/src/quicksort/quicksort_1Block.cuh | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/GPUSort/src/quicksort/quicksort.cuh b/GPUSort/src/quicksort/quicksort.cuh index c63f0279a..c199a9eff 100644 --- a/GPUSort/src/quicksort/quicksort.cuh +++ b/GPUSort/src/quicksort/quicksort.cuh @@ -56,14 +56,14 @@ __global__ void cudaQuickSort1stPhase(ArrayView arr, ArrayVi if (threadIdx.x == 0) { - pivot = 
pickPivot(myTask.depth % 2 == 0 ? arr.getView(myTask.partitionBegin, myTask.partitionEnd) : aux.getView(myTask.partitionBegin, myTask.partitionEnd), + pivot = pickPivot( (myTask.depth&1) == 0 ? arr.getView(myTask.partitionBegin, myTask.partitionEnd) : aux.getView(myTask.partitionBegin, myTask.partitionEnd), Cmp); } __syncthreads(); bool isLast; - if (myTask.depth % 2 == 0) + if ( (myTask.depth&1) == 0) { isLast = cudaPartition( arr.getView(myTask.partitionBegin, myTask.partitionEnd), diff --git a/GPUSort/src/quicksort/quicksort_1Block.cuh b/GPUSort/src/quicksort/quicksort_1Block.cuh index f4e189b0a..16ca01123 100644 --- a/GPUSort/src/quicksort/quicksort_1Block.cuh +++ b/GPUSort/src/quicksort/quicksort_1Block.cuh @@ -99,7 +99,7 @@ __device__ void singleBlockQuickSort(ArrayView arr, end = stackArrEnd[stackTop-1]; depth = stackDepth[stackTop-1]; stackTop--; - pivot = pickPivot(depth%2 == 0? + pivot = pickPivot((depth&1) == 0? arr.getView(begin, end) : aux.getView(begin, end), Cmp @@ -108,8 +108,8 @@ __device__ void singleBlockQuickSort(ArrayView arr, __syncthreads(); int size = end - begin; - auto src = depth%2 == 0 ? arr.getView(begin, end) : aux.getView(begin, end); - auto dst = depth%2 == 0 ? aux.getView(begin, end) : arr.getView(begin, end); + auto src = (depth&1) == 0 ? arr.getView(begin, end) : aux.getView(begin, end); + auto dst = (depth&1) == 0 ? 
aux.getView(begin, end) : arr.getView(begin, end); if(size <= blockDim.x*2) { -- GitLab From 866c77e11f17cfedba0e986c1c47d4e0fccfd4c0 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Wed, 24 Mar 2021 03:22:16 +0100 Subject: [PATCH 133/258] pic pivot as median of 3 --- GPUSort/src/quicksort/cudaPartition.cuh | 28 ++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/GPUSort/src/quicksort/cudaPartition.cuh b/GPUSort/src/quicksort/cudaPartition.cuh index b3710b406..33240a8a1 100644 --- a/GPUSort/src/quicksort/cudaPartition.cuh +++ b/GPUSort/src/quicksort/cudaPartition.cuh @@ -13,7 +13,33 @@ using namespace TNL::Containers; template __device__ Value pickPivot(TNL::Containers::ArrayView src, const Function & Cmp) { - return src[0]; + //return src[0]; + //return src[src.getSize()-1]; + + if(src.getSize() ==1) + return src[0]; + + Value a = src[0], b = src[src.getSize()/2], c = src[src.getSize() - 1]; + + if(Cmp(a, b)) // ..a..b.. + { + if(Cmp(b, c))// ..a..b..c + return b; + else if(Cmp(c, a))//..c..a..b.. + return a; + else //..a..c..b.. + return c; + } + else //..b..a.. + { + if(Cmp(a, c))//..b..a..c + return a; + else if(Cmp(c, b))//..c..b..a.. + return b; + else //..b..c..a.. 
+ return c; + } + } __device__ -- GitLab From 804fcc91b73bb0d1f994b928127f5787764e583e Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Wed, 24 Mar 2021 19:28:42 +0100 Subject: [PATCH 134/258] bug fix --- GPUSort/src/quicksort/quicksort.cuh | 25 +++++++++++----------- GPUSort/src/quicksort/quicksort_1Block.cuh | 1 + 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/GPUSort/src/quicksort/quicksort.cuh b/GPUSort/src/quicksort/quicksort.cuh index c199a9eff..6ecfd390a 100644 --- a/GPUSort/src/quicksort/quicksort.cuh +++ b/GPUSort/src/quicksort/quicksort.cuh @@ -93,7 +93,7 @@ __global__ void cudaQuickSort1stPhase(ArrayView arr, ArrayVi aux[i] = -1; #endif */ - aux[i] = -1; + //aux[i] = -1; arr[i] = pivot; } @@ -170,7 +170,7 @@ class QUICKSORT ArrayView cuda_newTasksAmount, cuda_2ndPhaseTasksAmount; //is in reality 1 integer int tasksAmount; //counter for Host == cuda_newTasksAmount - int totalTask; // cuda_newTasksAmount + cuda_2ndPhaseTasksAmount + int host_2ndPhaseTasksAmount; // cuda_2ndPhaseTasksAmount Array cuda_blockToTaskMapping; ArrayView cuda_blockToTaskMapping_Cnt; //is in reality 1 integer @@ -186,11 +186,12 @@ public: cudaCounters(3), cuda_newTasksAmount(cudaCounters.getView(0, 1)), cuda_2ndPhaseTasksAmount(cudaCounters.getView(1, 2)), - cuda_blockToTaskMapping(maxTasks), + cuda_blockToTaskMapping(maxBlocks), cuda_blockToTaskMapping_Cnt(cudaCounters.getView(2, 3)) { cuda_tasks.setElement(0, TASK(0, arr.getSize(), 0)); - totalTask = tasksAmount = 1; + tasksAmount = 1; + host_2ndPhaseTasksAmount = 0; cuda_2ndPhaseTasksAmount = 0; iteration = 0; } @@ -212,8 +213,7 @@ public: template void QUICKSORT::sort(const Function &Cmp) { - - while (tasksAmount > 0 && tasksAmount*2 < maxTasks) + while (tasksAmount > 0 && tasksAmount*2 < maxTasks && host_2ndPhaseTasksAmount + tasksAmount*2 < maxTasks) { int elemPerBlock = getElemPerBlock(); int blocksCnt = initTasks(elemPerBlock); @@ -241,19 +241,19 @@ void QUICKSORT::sort(const Function &Cmp) 
processNewTasks(); iteration++; } - + if(tasksAmount > 0) { cudaQuickSort2ndPhase <<>>(arr, aux, Cmp, - iteration % 2 == 0? cuda_newTasks : cuda_tasks + iteration % 2 == 0? cuda_tasks : cuda_newTasks ); } - - if(totalTask - tasksAmount > 0) + + if(host_2ndPhaseTasksAmount > 0) { cudaQuickSort2ndPhase - <<>>(arr, aux, Cmp, cuda_2ndPhaseTasks); + <<>>(arr, aux, Cmp, cuda_2ndPhaseTasks); } cudaDeviceSynchronize(); @@ -311,7 +311,8 @@ int QUICKSORT::initTasks(int elemPerBlock) void QUICKSORT::processNewTasks() { tasksAmount = cuda_newTasksAmount.getElement(0); - totalTask = tasksAmount + cuda_2ndPhaseTasksAmount.getElement(0); + cuda_newTasksAmount = 0; + host_2ndPhaseTasksAmount = cuda_2ndPhaseTasksAmount.getElement(0); } //----------------------------------------------------------- diff --git a/GPUSort/src/quicksort/quicksort_1Block.cuh b/GPUSort/src/quicksort/quicksort_1Block.cuh index 16ca01123..fede2ca67 100644 --- a/GPUSort/src/quicksort/quicksort_1Block.cuh +++ b/GPUSort/src/quicksort/quicksort_1Block.cuh @@ -114,6 +114,7 @@ __device__ void singleBlockQuickSort(ArrayView arr, if(size <= blockDim.x*2) { externSort(src, arr.getView(begin, end), Cmp); + __syncthreads(); continue; } -- GitLab From 99f09130908f3ea98cbeb230ad06879d5b8a1dc5 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Wed, 24 Mar 2021 19:53:15 +0100 Subject: [PATCH 135/258] improved memory access --- GPUSort/src/quicksort/quicksort.cuh | 29 +++++++++++----------- GPUSort/src/quicksort/quicksort_1Block.cuh | 2 +- 2 files changed, 15 insertions(+), 16 deletions(-) diff --git a/GPUSort/src/quicksort/quicksort.cuh b/GPUSort/src/quicksort/quicksort.cuh index 6ecfd390a..d3bd9a7b9 100644 --- a/GPUSort/src/quicksort/quicksort.cuh +++ b/GPUSort/src/quicksort/quicksort.cuh @@ -56,8 +56,11 @@ __global__ void cudaQuickSort1stPhase(ArrayView arr, ArrayVi if (threadIdx.x == 0) { - pivot = pickPivot( (myTask.depth&1) == 0 ? 
arr.getView(myTask.partitionBegin, myTask.partitionEnd) : aux.getView(myTask.partitionBegin, myTask.partitionEnd), - Cmp); + if((myTask.depth&1) == 0) + pivot = pickPivot(arr.getView(myTask.partitionBegin, myTask.partitionEnd), Cmp); + else + pivot = pickPivot(aux.getView(myTask.partitionBegin, myTask.partitionEnd), Cmp); + } __syncthreads(); @@ -81,8 +84,6 @@ __global__ void cudaQuickSort1stPhase(ArrayView arr, ArrayVi if (!isLast) return; - myTask = tasks[taskMapping[blockIdx.x]]; - int leftBegin = myTask.partitionBegin, leftEnd = myTask.partitionBegin + myTask.dstBegin; int rightBegin = myTask.partitionBegin + myTask.dstEnd, rightEnd = myTask.partitionEnd; @@ -93,7 +94,6 @@ __global__ void cudaQuickSort1stPhase(ArrayView arr, ArrayVi aux[i] = -1; #endif */ - //aux[i] = -1; arr[i] = pivot; } @@ -242,18 +242,17 @@ void QUICKSORT::sort(const Function &Cmp) iteration++; } - if(tasksAmount > 0) - { - cudaQuickSort2ndPhase - <<>>(arr, aux, Cmp, - iteration % 2 == 0? cuda_tasks : cuda_newTasks - ); - } - - if(host_2ndPhaseTasksAmount > 0) + Algorithms::MultiDeviceMemoryOperations::copy( + cuda_2ndPhaseTasks.getData() + host_2ndPhaseTasksAmount, + iteration % 2 == 0? 
cuda_tasks.getData() : cuda_newTasks.getData(), + tasksAmount + ); + + int total = tasksAmount + host_2ndPhaseTasksAmount; + if(total > 0) { cudaQuickSort2ndPhase - <<>>(arr, aux, Cmp, cuda_2ndPhaseTasks); + <<>>(arr, aux, Cmp, cuda_2ndPhaseTasks); } cudaDeviceSynchronize(); diff --git a/GPUSort/src/quicksort/quicksort_1Block.cuh b/GPUSort/src/quicksort/quicksort_1Block.cuh index fede2ca67..098cabbf5 100644 --- a/GPUSort/src/quicksort/quicksort_1Block.cuh +++ b/GPUSort/src/quicksort/quicksort_1Block.cuh @@ -138,7 +138,7 @@ __device__ void singleBlockQuickSort(ArrayView arr, __syncthreads(); for (int i = pivotBegin + threadIdx.x; i < pivotEnd; i += blockDim.x) - src[i] = dst[i] = pivot; + arr[begin + i] = pivot; if(threadIdx.x == 0) { -- GitLab From 93296481bdf61f5ab81d8ef0b53cc94f70ad0a6b Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Wed, 24 Mar 2021 19:56:19 +0100 Subject: [PATCH 136/258] extend mapping --- GPUSort/src/quicksort/quicksort.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/GPUSort/src/quicksort/quicksort.cuh b/GPUSort/src/quicksort/quicksort.cuh index d3bd9a7b9..51c77d865 100644 --- a/GPUSort/src/quicksort/quicksort.cuh +++ b/GPUSort/src/quicksort/quicksort.cuh @@ -186,7 +186,7 @@ public: cudaCounters(3), cuda_newTasksAmount(cudaCounters.getView(0, 1)), cuda_2ndPhaseTasksAmount(cudaCounters.getView(1, 2)), - cuda_blockToTaskMapping(maxBlocks), + cuda_blockToTaskMapping(maxBlocks*2), cuda_blockToTaskMapping_Cnt(cudaCounters.getView(2, 3)) { cuda_tasks.setElement(0, TASK(0, arr.getSize(), 0)); -- GitLab From bee8c4b7498505c99445e2b5cdddeb594039cdb8 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Wed, 24 Mar 2021 20:32:25 +0100 Subject: [PATCH 137/258] benchmark update --- GPUSort/benchmark/benchmarker.cpp | 44 ++++++++++++------- GPUSort/benchmark/bitonic_benchmark/Makefile | 4 +- .../benchmark/quicksort_benchmark/Makefile | 2 + 3 files changed, 33 insertions(+), 17 deletions(-) diff --git 
a/GPUSort/benchmark/benchmarker.cpp b/GPUSort/benchmark/benchmarker.cpp index b0171aae1..05b6d62d2 100644 --- a/GPUSort/benchmark/benchmarker.cpp +++ b/GPUSort/benchmark/benchmarker.cpp @@ -16,7 +16,7 @@ using namespace TNL; using namespace TNL::Containers; using namespace std; -const int lowPow = 15, highLow = 22; +const int lowPow = 15, highLow = 25; const int tries = 50; double measure(const vector&vec) @@ -79,28 +79,40 @@ double decreasing(int size) return measure(vec); } -int main() +void start(ostream & out, string delim) { - string delim = "\t"; - cout << "size" << delim; - cout << "random" << delim; - cout << "sorted" << delim; - cout << "almost" << delim; - cout << "decreasing" << delim; - cout << endl; + out << "size" << delim; + out << "random" << delim; + out << "sorted" << delim; + out << "almost" << delim; + out << "decreasing"; + out << endl; for(int pow = lowPow; pow <= highLow; pow++) { int size =(1<< pow); vector vec(size); - cout << "2^" << pow << delim; - cout << fixed << setprecision(3); - cout << random(size) << delim; - cout << sorted(size) << delim; - cout << almostSorted(size) << delim; - cout << decreasing(size) << delim; - cout << endl; + out << "2^" << pow << delim; + out << fixed << setprecision(3); + out << random(size) << delim; + out << sorted(size) << delim; + out << almostSorted(size) << delim; + out << decreasing(size); + out << endl; + } +} + +int main(int argc, char *argv[]) +{ + if(argc == 1) + { + start(cout, "\t"); + } + else + { + ofstream out(argv[1]); + start(out, ","); } return 0; } \ No newline at end of file diff --git a/GPUSort/benchmark/bitonic_benchmark/Makefile b/GPUSort/benchmark/bitonic_benchmark/Makefile index 57736cce3..8e4060e61 100644 --- a/GPUSort/benchmark/bitonic_benchmark/Makefile +++ b/GPUSort/benchmark/bitonic_benchmark/Makefile @@ -10,7 +10,9 @@ cuda: $(CUDA_TARGETS) run: cuda ./$(CUDA_TARGETS) - + +measure: cuda + ./$(CUDA_TARGETS) ../bitonic.csv .PHONY: clean clean: diff --git 
a/GPUSort/benchmark/quicksort_benchmark/Makefile b/GPUSort/benchmark/quicksort_benchmark/Makefile index 57736cce3..3444b640f 100644 --- a/GPUSort/benchmark/quicksort_benchmark/Makefile +++ b/GPUSort/benchmark/quicksort_benchmark/Makefile @@ -11,6 +11,8 @@ cuda: $(CUDA_TARGETS) run: cuda ./$(CUDA_TARGETS) +measure: cuda + ./$(CUDA_TARGETS) ../quicksort.csv .PHONY: clean clean: -- GitLab From 2d8dcad999635938c938998a954596327b6ec7a7 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Wed, 24 Mar 2021 20:32:48 +0100 Subject: [PATCH 138/258] ignore files --- GPUSort/.gitignore | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/GPUSort/.gitignore b/GPUSort/.gitignore index 58e73e001..07c569033 100644 --- a/GPUSort/.gitignore +++ b/GPUSort/.gitignore @@ -1,2 +1,5 @@ .vscode -backup \ No newline at end of file +backup +*.csv +*.o +*.cuo \ No newline at end of file -- GitLab From dba7a501fb86bdffd157b46e574ccf747bd17691 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Thu, 25 Mar 2021 20:23:57 +0100 Subject: [PATCH 139/258] zero entropy and count wrong sorteds --- GPUSort/benchmark/benchmarker.cpp | 50 ++++++++++++++++++++++++++----- 1 file changed, 43 insertions(+), 7 deletions(-) diff --git a/GPUSort/benchmark/benchmarker.cpp b/GPUSort/benchmark/benchmarker.cpp index 05b6d62d2..5a03567a4 100644 --- a/GPUSort/benchmark/benchmarker.cpp +++ b/GPUSort/benchmark/benchmarker.cpp @@ -1,8 +1,11 @@ #include +#include #include #include +#include +#include +using namespace std; -#include #include "../src/util/timer.h" //--------------------------- @@ -12,27 +15,49 @@ * */ //--------------------------- +#ifdef HAVE_CUDA + +#include +#include "../src/util/algorithm.h" using namespace TNL; using namespace TNL::Containers; -using namespace std; -const int lowPow = 15, highLow = 25; +#endif + +static int notCorrectCounters = 0; + +const int lowPow = 13, highLow = 25; const int tries = 50; double measure(const vector&vec) { - Array arr(vec.size()); vector 
resAcc; + for(int i = 0; i < tries; i++) { - arr = vec; + #ifdef HAVE_CUDA + Array arr(vec); auto view = arr.getView(); { TIMER t([&](double res){resAcc.push_back(res);}); SORTERFUNCTION(view); } + + if(!is_sorted(view)) + notCorrectCounters++; + #else + vector tmp = vec; + + { + TIMER t([&](double res){resAcc.push_back(res);}); + SORTERFUNCTION(tmp); + } + + if(!std::is_sorted(tmp.begin(), tmp.end())) + notCorrectCounters++; + #endif } return accumulate(resAcc.begin(), resAcc.end(), 0.0) / resAcc.size(); @@ -79,13 +104,23 @@ double decreasing(int size) return measure(vec); } +double zero_entropy(int size) +{ + vector vec(size); + for(auto & x : vec) + x = size; + + return measure(vec); +} + void start(ostream & out, string delim) { out << "size" << delim; out << "random" << delim; out << "sorted" << delim; out << "almost" << delim; - out << "decreasing"; + out << "decreas" << delim; + out << "zero_entropy"; out << endl; for(int pow = lowPow; pow <= highLow; pow++) @@ -98,7 +133,8 @@ void start(ostream & out, string delim) out << random(size) << delim; out << sorted(size) << delim; out << almostSorted(size) << delim; - out << decreasing(size); + out << decreasing(size) << delim; + out << zero_entropy(size); out << endl; } } -- GitLab From 0c36031e97d1095512b14aaccdd34b7394d71d30 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Thu, 25 Mar 2021 21:15:38 +0100 Subject: [PATCH 140/258] print counter --- GPUSort/benchmark/benchmarker.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/GPUSort/benchmark/benchmarker.cpp b/GPUSort/benchmark/benchmarker.cpp index 5a03567a4..35a3a8bc0 100644 --- a/GPUSort/benchmark/benchmarker.cpp +++ b/GPUSort/benchmark/benchmarker.cpp @@ -150,5 +150,10 @@ int main(int argc, char *argv[]) ofstream out(argv[1]); start(out, ","); } + + if(notCorrectCounters > 0) + { + std::cerr << notCorrectCounters << " tries were sorted incorrectly" << std::endl; + } return 0; } \ No newline at end of file -- GitLab From 
9de7ca9a60c74e0db49f90c1f489e0bca48345da Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Thu, 25 Mar 2021 23:11:23 +0100 Subject: [PATCH 141/258] coalesced write through shared memory --- GPUSort/src/quicksort/cudaPartition.cuh | 52 ++++++++++++++++++++++--- GPUSort/src/quicksort/quicksort.cuh | 22 +++++++---- 2 files changed, 60 insertions(+), 14 deletions(-) diff --git a/GPUSort/src/quicksort/cudaPartition.cuh b/GPUSort/src/quicksort/cudaPartition.cuh index 33240a8a1..5e1eec9dd 100644 --- a/GPUSort/src/quicksort/cudaPartition.cuh +++ b/GPUSort/src/quicksort/cudaPartition.cuh @@ -57,6 +57,34 @@ void countElem(ArrayView arr, } } +__device__ +void copyDataShared(ArrayView src, + ArrayView dst, + int *sharedMem, + int smallerStart, int biggerStart, + int smallerTotal, int biggerTotal, + int smallerOffset, int biggerOffset, //exclusive prefix sum of elements + const int &pivot) +{ + + for (int i = threadIdx.x; i < src.getSize(); i += blockDim.x) + { + int data = src[i]; + if (data < pivot) + sharedMem[smallerOffset++] = data; + else if (data > pivot) + sharedMem[smallerTotal + biggerOffset++] = data; + + } + __syncthreads(); + + for (int i = threadIdx.x; i < smallerTotal; i += blockDim.x) + dst[smallerStart + i] = sharedMem[i]; + + for (int i = threadIdx.x; i < biggerTotal; i += blockDim.x) + dst[biggerStart + i] = sharedMem[smallerTotal + i]; +} + __device__ void copyData(ArrayView src, ArrayView dst, @@ -78,12 +106,14 @@ void copyData(ArrayView src, template __device__ bool cudaPartition(ArrayView src, ArrayView dst, + int * sharedMem, const Function &Cmp, const int & pivot, int elemPerBlock, TASK & task ) { static __shared__ int myBegin, myEnd; static __shared__ int smallerStart, biggerStart; + static __shared__ int smallerTotal, biggerTotal; static __shared__ bool writePivot; if (threadIdx.x == 0) @@ -100,21 +130,31 @@ __device__ bool cudaPartition(ArrayView src, int smaller = 0, bigger = 0; countElem(srcView, smaller, bigger, pivot); - int smallerOffset = 
blockInclusivePrefixSum(smaller); - int biggerOffset = blockInclusivePrefixSum(bigger); + int smallerPrefSumInc = blockInclusivePrefixSum(smaller); + int biggerPrefSumInc = blockInclusivePrefixSum(bigger); if (threadIdx.x == blockDim.x - 1) //last thread in block has sum of all values { - smallerStart = atomicAdd(&(task.dstBegin), smallerOffset); - biggerStart = atomicAdd(&(task.dstEnd), -biggerOffset) - biggerOffset; + smallerStart = atomicAdd(&(task.dstBegin), smallerPrefSumInc); + biggerStart = atomicAdd(&(task.dstEnd), -biggerPrefSumInc) - biggerPrefSumInc; + smallerTotal = smallerPrefSumInc; + biggerTotal = biggerPrefSumInc; } __syncthreads(); //----------------------------------------------------------- - int destSmaller = smallerStart + smallerOffset - smaller; - int destBigger = biggerStart + biggerOffset - bigger; + /* + int destSmaller = smallerStart + smallerPrefSumInc - smaller; + int destBigger = biggerStart + biggerPrefSumInc - bigger; copyData(srcView, dst, destSmaller, destBigger, pivot); + */ + + copyDataShared(srcView, dst, sharedMem, + smallerStart, biggerStart, + smallerTotal, biggerTotal, + smallerPrefSumInc - smaller, biggerPrefSumInc - bigger, //exclusive prefix sum of elements + pivot); __syncthreads(); //----------------------------------------------------------- diff --git a/GPUSort/src/quicksort/quicksort.cuh b/GPUSort/src/quicksort/quicksort.cuh index 51c77d865..094ac92e8 100644 --- a/GPUSort/src/quicksort/quicksort.cuh +++ b/GPUSort/src/quicksort/quicksort.cuh @@ -51,6 +51,8 @@ __global__ void cudaQuickSort1stPhase(ArrayView arr, ArrayVi ArrayView newTasks, int *newTasksCnt, ArrayView secondPhaseTasks, int *secondPhaseTasksCnt) { + extern __shared__ int externMem[]; + int * sharedMem = externMem; static __shared__ int pivot; TASK &myTask = tasks[taskMapping[blockIdx.x]]; @@ -71,6 +73,7 @@ __global__ void cudaQuickSort1stPhase(ArrayView arr, ArrayVi isLast = cudaPartition( arr.getView(myTask.partitionBegin, myTask.partitionEnd), 
aux.getView(myTask.partitionBegin, myTask.partitionEnd), + sharedMem, Cmp, pivot, elemPerBlock, myTask); } else @@ -78,6 +81,7 @@ __global__ void cudaQuickSort1stPhase(ArrayView arr, ArrayVi isLast = cudaPartition( aux.getView(myTask.partitionBegin, myTask.partitionEnd), arr.getView(myTask.partitionBegin, myTask.partitionEnd), + sharedMem, Cmp, pivot, elemPerBlock, myTask); } @@ -217,10 +221,11 @@ void QUICKSORT::sort(const Function &Cmp) { int elemPerBlock = getElemPerBlock(); int blocksCnt = initTasks(elemPerBlock); + int externMemByteSize = elemPerBlock*sizeof(int); if (iteration % 2 == 0) { cudaQuickSort1stPhase - <<>>( + <<>>( arr, aux, Cmp, elemPerBlock, cuda_tasks, cuda_blockToTaskMapping, @@ -230,13 +235,14 @@ void QUICKSORT::sort(const Function &Cmp) } else { - cudaQuickSort1stPhase<<>>( - arr, aux, Cmp, elemPerBlock, - cuda_newTasks, - cuda_blockToTaskMapping, - cuda_tasks, - cuda_newTasksAmount.getData(), - cuda_2ndPhaseTasks, cuda_2ndPhaseTasksAmount.getData()); + cudaQuickSort1stPhase + <<>>( + arr, aux, Cmp, elemPerBlock, + cuda_newTasks, + cuda_blockToTaskMapping, + cuda_tasks, + cuda_newTasksAmount.getData(), + cuda_2ndPhaseTasks, cuda_2ndPhaseTasksAmount.getData()); } processNewTasks(); iteration++; -- GitLab From 085be8b83082c5a58c498afeb13838f5d743645a Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Thu, 25 Mar 2021 23:17:48 +0100 Subject: [PATCH 142/258] insert new task safely --- GPUSort/src/quicksort/quicksort.cuh | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/GPUSort/src/quicksort/quicksort.cuh b/GPUSort/src/quicksort/quicksort.cuh index 094ac92e8..2a7c92519 100644 --- a/GPUSort/src/quicksort/quicksort.cuh +++ b/GPUSort/src/quicksort/quicksort.cuh @@ -24,12 +24,24 @@ __device__ void writeNewTask(int begin, int end, int depth, ArrayView Date: Thu, 25 Mar 2021 23:51:13 +0100 Subject: [PATCH 143/258] fix overflow of tasks --- GPUSort/src/quicksort/quicksort.cuh | 91 +++++++++++++++++++---------- 1 file 
changed, 59 insertions(+), 32 deletions(-) diff --git a/GPUSort/src/quicksort/quicksort.cuh b/GPUSort/src/quicksort/quicksort.cuh index 2a7c92519..59e57126d 100644 --- a/GPUSort/src/quicksort/quicksort.cuh +++ b/GPUSort/src/quicksort/quicksort.cuh @@ -24,20 +24,26 @@ __device__ void writeNewTask(int begin, int end, int depth, ArrayView arr, ArrayVi ArrayView secondPhaseTasks, int *secondPhaseTasksCnt) { extern __shared__ int externMem[]; - int * sharedMem = externMem; + int *sharedMem = externMem; static __shared__ int pivot; TASK &myTask = tasks[taskMapping[blockIdx.x]]; if (threadIdx.x == 0) { - if((myTask.depth&1) == 0) + if ((myTask.depth & 1) == 0) pivot = pickPivot(arr.getView(myTask.partitionBegin, myTask.partitionEnd), Cmp); else pivot = pickPivot(aux.getView(myTask.partitionBegin, myTask.partitionEnd), Cmp); - } __syncthreads(); bool isLast; - if ( (myTask.depth&1) == 0) + if ((myTask.depth & 1) == 0) { isLast = cudaPartition( arr.getView(myTask.partitionBegin, myTask.partitionEnd), @@ -185,8 +190,8 @@ class QUICKSORT ArrayView cuda_newTasksAmount, cuda_2ndPhaseTasksAmount; //is in reality 1 integer - int tasksAmount; //counter for Host == cuda_newTasksAmount - int host_2ndPhaseTasksAmount; // cuda_2ndPhaseTasksAmount + int tasksAmount; //counter for Host == cuda_newTasksAmount + int host_2ndPhaseTasksAmount; // cuda_2ndPhaseTasksAmount Array cuda_blockToTaskMapping; ArrayView cuda_blockToTaskMapping_Cnt; //is in reality 1 integer @@ -202,7 +207,7 @@ public: cudaCounters(3), cuda_newTasksAmount(cudaCounters.getView(0, 1)), cuda_2ndPhaseTasksAmount(cudaCounters.getView(1, 2)), - cuda_blockToTaskMapping(maxBlocks*2), + cuda_blockToTaskMapping(maxBlocks * 2), cuda_blockToTaskMapping_Cnt(cudaCounters.getView(2, 3)) { cuda_tasks.setElement(0, TASK(0, arr.getSize(), 0)); @@ -229,11 +234,25 @@ public: template void QUICKSORT::sort(const Function &Cmp) { - while (tasksAmount > 0 && tasksAmount*2 < maxTasks && host_2ndPhaseTasksAmount + tasksAmount*2 < maxTasks) + 
while (tasksAmount > 0) { + //by partitioning with n=tasksAmount, max 2n new tasks can be created + //quicksort1stPhase will 1st try to insert into newTasks + //if not enough space then insert into 2nd phase as last resort, and vice versa when inserting into 2ndphase + int maxNewTasks = 2 * tasksAmount; + int spaceLeft = cuda_newTasks.getSize() + (cuda_2ndPhaseTasks.getSize() - host_2ndPhaseTasksAmount); + + if (maxNewTasks >= spaceLeft) + break; + //in case all new tasks are written into newTasks, theres still space in 2ndphase to save it + + //2ndphase task is now full + if (host_2ndPhaseTasksAmount >= cuda_2ndPhaseTasks.getSize()) + break; + int elemPerBlock = getElemPerBlock(); int blocksCnt = initTasks(elemPerBlock); - int externMemByteSize = elemPerBlock*sizeof(int); + int externMemByteSize = elemPerBlock * sizeof(int); if (iteration % 2 == 0) { cudaQuickSort1stPhase @@ -247,32 +266,40 @@ void QUICKSORT::sort(const Function &Cmp) } else { - cudaQuickSort1stPhase - <<>>( - arr, aux, Cmp, elemPerBlock, - cuda_newTasks, - cuda_blockToTaskMapping, - cuda_tasks, - cuda_newTasksAmount.getData(), - cuda_2ndPhaseTasks, cuda_2ndPhaseTasksAmount.getData()); + cudaQuickSort1stPhase<<>>( + arr, aux, Cmp, elemPerBlock, + cuda_newTasks, + cuda_blockToTaskMapping, + cuda_tasks, + cuda_newTasksAmount.getData(), + cuda_2ndPhaseTasks, cuda_2ndPhaseTasksAmount.getData()); } processNewTasks(); iteration++; } - Algorithms::MultiDeviceMemoryOperations::copy( - cuda_2ndPhaseTasks.getData() + host_2ndPhaseTasksAmount, - iteration % 2 == 0? cuda_tasks.getData() : cuda_newTasks.getData(), - tasksAmount - ); + if (tasksAmount > 0) + { + cudaStream_t s; + cudaStreamCreateWithFlags(&s, cudaStreamNonBlocking); + + cudaQuickSort2ndPhase + <<>>(arr, aux, Cmp, + iteration % 2 == 0 ? 
cuda_tasks : cuda_newTasks); + cudaStreamDestroy(s); + } - int total = tasksAmount + host_2ndPhaseTasksAmount; - if(total > 0) + if (host_2ndPhaseTasksAmount > 0) { + cudaStream_t s; + cudaStreamCreateWithFlags(&s, cudaStreamNonBlocking); + cudaQuickSort2ndPhase - <<>>(arr, aux, Cmp, cuda_2ndPhaseTasks); + <<>>(arr, aux, Cmp, cuda_2ndPhaseTasks); + + cudaStreamDestroy(s); } - + cudaDeviceSynchronize(); return; } @@ -327,9 +354,9 @@ int QUICKSORT::initTasks(int elemPerBlock) void QUICKSORT::processNewTasks() { - tasksAmount = cuda_newTasksAmount.getElement(0); + tasksAmount = min(cuda_newTasksAmount.getElement(0), maxTasks); cuda_newTasksAmount = 0; - host_2ndPhaseTasksAmount = cuda_2ndPhaseTasksAmount.getElement(0); + host_2ndPhaseTasksAmount = min(cuda_2ndPhaseTasksAmount.getElement(0), maxTasks); } //----------------------------------------------------------- -- GitLab From 471fee90a12177eb5d57fbc2ed5658e55f4276cd Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Sat, 27 Mar 2021 17:26:37 +0100 Subject: [PATCH 144/258] dynamic fix --- GPUSort/src/quicksort_dynamic/helper.cuh | 17 +- GPUSort/src/quicksort_dynamic/quicksort.cu | 245 ++++++++------------ GPUSort/src/quicksort_dynamic/quicksort.cuh | 12 +- GPUSort/src/quicksort_dynamic/task.h | 5 +- 4 files changed, 110 insertions(+), 169 deletions(-) diff --git a/GPUSort/src/quicksort_dynamic/helper.cuh b/GPUSort/src/quicksort_dynamic/helper.cuh index 8f8172b44..41cd87f4a 100644 --- a/GPUSort/src/quicksort_dynamic/helper.cuh +++ b/GPUSort/src/quicksort_dynamic/helper.cuh @@ -4,11 +4,10 @@ template __device__ void countElem(TNL::Containers::ArrayView src, - int myBegin, int myEnd, int &smaller, int &bigger, const Value &pivot) { - for (int i = myBegin + threadIdx.x; i < myEnd; i += blockDim.x) + for (int i = threadIdx.x; i < src.getSize(); i += blockDim.x) { int data = src[i]; if (data < pivot) @@ -20,13 +19,12 @@ __device__ void countElem(TNL::Containers::ArrayView src, template __device__ void 
copyData(TNL::Containers::ArrayView src, - int myBegin, int myEnd, TNL::Containers::ArrayView dst, int smallerStart, int biggerStart, const Value &pivot) { - for (int i = myBegin + threadIdx.x; i < myEnd; i += blockDim.x) + for (int i = threadIdx.x; i < src.getSize(); i += blockDim.x) { int data = src[i]; if (data < pivot) @@ -36,7 +34,7 @@ __device__ void copyData(TNL::Containers::ArrayView src, } } -__device__ void calcBlocksNeeded(int elemLeft, int elemRight, int &blocksLeft, int &blocksRight) +__device__ void calcBlocksNeeded(int totalBlocks, int elemLeft, int elemRight, int &blocksLeft, int &blocksRight) { int minElemPerBlock = blockDim.x*2; blocksLeft = elemLeft / minElemPerBlock + (elemLeft% minElemPerBlock != 0); @@ -44,10 +42,10 @@ __device__ void calcBlocksNeeded(int elemLeft, int elemRight, int &blocksLeft, i int totalSets = blocksLeft + blocksRight; - if(totalSets<= gridDim.x) + if(totalSets<= totalBlocks) return; - int multiplier = 1.*gridDim.x / totalSets + 1; + int multiplier = 1.*totalBlocks/ totalSets + 1; minElemPerBlock *= multiplier; blocksLeft = elemLeft / minElemPerBlock + (elemLeft% minElemPerBlock != 0); @@ -58,10 +56,9 @@ __device__ void calcBlocksNeeded(int elemLeft, int elemRight, int &blocksLeft, i template __device__ Value pickPivot(TNL::Containers::ArrayView src, const Function & Cmp) { - return src[0]; + //return src[0]; //return src[src.getSize()-1]; - /* if(src.getSize() ==1) return src[0]; @@ -85,5 +82,5 @@ __device__ Value pickPivot(TNL::Containers::ArrayView src, const else //..b..c..a.. 
return c; } - */ + } \ No newline at end of file diff --git a/GPUSort/src/quicksort_dynamic/quicksort.cu b/GPUSort/src/quicksort_dynamic/quicksort.cu index 12320a73c..5877a064a 100644 --- a/GPUSort/src/quicksort_dynamic/quicksort.cu +++ b/GPUSort/src/quicksort_dynamic/quicksort.cu @@ -6,158 +6,124 @@ #include "../bitonicSort/bitonicSort.h" #include "helper.cuh" #include -#include #include +#include #define deb(x) std::cout << #x << " = " << x << std::endl; -using CudaArrayView = TNL::Containers::ArrayView; -using CudaTaskArray = TNL::Containers::Array; +using namespace TNL; +using namespace TNL::Containers; template -__device__ bool cudaPartition(CudaArrayView src, CudaArrayView dst, TASK * task, const int & pivot, const Function & Cmp) +__global__ void cudaPartition(ArrayView src, ArrayView dst, int pivot, TASK *task, const Function &Cmp) { static __shared__ int smallerStart, biggerStart; - static __shared__ bool writePivot; - int elemPerBlock = ceil( ((double)src.getSize()) / gridDim.x); + int elemPerBlock = ceil(((double)src.getSize()) / gridDim.x); int myBegin = blockIdx.x * elemPerBlock; int myEnd = TNL::min(src.getSize(), myBegin + elemPerBlock); int smaller = 0, bigger = 0; - countElem(src, myBegin, myEnd, smaller, bigger, pivot); + countElem(src.getView(myBegin, myEnd), smaller, bigger, pivot); - int smallerOffset = blockInclusivePrefixSum(smaller); - int biggerOffset = blockInclusivePrefixSum(bigger); + int smallerInclusiveSum = blockInclusivePrefixSum(smaller); + int biggerInclusiveSum = blockInclusivePrefixSum(bigger); if (threadIdx.x == blockDim.x - 1) //last thread in block has sum of all values { - smallerStart = atomicAdd(&(task->begin), smallerOffset); - biggerStart = atomicAdd(&(task->end), -biggerOffset) - biggerOffset; + smallerStart = atomicAdd(&(task->begin), smallerInclusiveSum); + biggerStart = atomicAdd(&(task->end), -biggerInclusiveSum) - biggerInclusiveSum; } __syncthreads(); - int destSmaller = smallerStart + smallerOffset - smaller; - 
int destBigger = biggerStart + biggerOffset - bigger; - copyData(src, myBegin, myEnd, dst, destSmaller, destBigger, pivot); - - if (threadIdx.x == 0) - writePivot = (atomicAdd(&(task->stillWorkingCnt), -1) == 1); - __syncthreads(); - - return writePivot; + int destSmaller = smallerStart + (smallerInclusiveSum - smaller); + int destBigger = biggerStart + (biggerInclusiveSum - bigger); + copyData(src.getView(myBegin, myEnd), dst, destSmaller, destBigger, pivot); } template -__device__ void multiBlockQuickSort(CudaArrayView arr, CudaArrayView aux, TASK * task, const Function & Cmp, int depth) +__device__ void multiBlockQuickSort(ArrayView arr, ArrayView aux, const Function &Cmp, int depth, int availblocks) { static __shared__ int pivot; + static __shared__ int leftEnd, rightBegin; - if(threadIdx.x == 0) - pivot = pickPivot(depth %2 == 0? arr: aux, Cmp); - __syncthreads(); - - bool isLast; - if(depth %2 == 0) - isLast = cudaPartition(arr, aux, task, pivot, Cmp); - else - isLast = cudaPartition(aux, arr, task, pivot, Cmp); - - if(!isLast) - return; - - int leftEnd = task->begin, rightBegin = task->end; - - for (int i = leftEnd + threadIdx.x; i < rightBegin; i += blockDim.x) - arr[i] = pivot; + if (threadIdx.x == 0) + { + pivot = pickPivot(depth % 2 == 0 ? 
arr : aux, Cmp); - if(threadIdx.x != 0) - return; - - int blocksLeft = 1, blocksRight = 1; - calcBlocksNeeded(leftEnd - 0, arr.getSize() - rightBegin, blocksLeft, blocksRight); + TASK *task = (TASK *)malloc(sizeof(TASK)); + *task = TASK(0, arr.getSize()); - bool usedLeft = false; + if (depth % 2 == 0) + cudaPartition<<>>(arr, aux, pivot, task, Cmp); + else + cudaPartition<<>>(aux, arr, pivot, task, Cmp); + cudaDeviceSynchronize(); - if(leftEnd > 0) - { - *task = TASK(0, leftEnd, blocksLeft); - usedLeft = true; - - cudaStream_t s; - cudaStreamCreateWithFlags(&s, cudaStreamNonBlocking); - cudaQuickSort<<>>( - arr.getView(0, leftEnd), - aux.getView(0, leftEnd), - task, - Cmp, depth+1); - cudaStreamDestroy(s); + leftEnd = task->begin, rightBegin = task->end; + free(task); } + __syncthreads(); - if((arr.getSize() - rightBegin)> 0) + for (int i = leftEnd + threadIdx.x; i < rightBegin; i += blockDim.x) + arr[i] = pivot; + + if (threadIdx.x == 0) { - TASK * newTaskRight = nullptr; + int blocksLeft = 0, blocksRight = 0; + calcBlocksNeeded(availblocks, leftEnd - 0, arr.getSize() - rightBegin, blocksLeft, blocksRight); - if(usedLeft) + if(leftEnd > 0) { - newTaskRight = (TASK * )malloc(sizeof(TASK)); - if(!newTaskRight) - { - printf("couldnt allocate memory for right task\n"); - return; - } - *newTaskRight = TASK(0, arr.getSize() - rightBegin, blocksRight); + cudaStream_t s; + cudaStreamCreateWithFlags(&s, cudaStreamNonBlocking); + + cudaQuickSort<<<1, blockDim.x>>>(arr.getView(0, leftEnd), aux.getView(0, leftEnd), Cmp, blocksLeft, depth + 1); + + cudaStreamDestroy(s); } - else + if(arr.getSize() - rightBegin > 0) { - usedLeft = true; - *task = TASK(0, arr.getSize() - rightBegin, blocksRight); - } + cudaStream_t s; + cudaStreamCreateWithFlags(&s, cudaStreamNonBlocking); - cudaStream_t s; - cudaStreamCreateWithFlags(&s, cudaStreamNonBlocking); - cudaQuickSort<<>>( - arr.getView(rightBegin, arr.getSize()), - aux.getView(rightBegin, aux.getSize()), - newTaskRight? 
newTaskRight : task, - Cmp, depth+1); - cudaStreamDestroy(s); + cudaQuickSort<<<1, blockDim.x>>>(arr.getView(rightBegin, arr.getSize()), aux.getView(rightBegin, aux.getSize()), Cmp, blocksRight, depth + 1); + cudaStreamDestroy(s); + } } - - if(!usedLeft) - free(task); } //------------------------------------------------------------------------- + template -__device__ void externSort(CudaArrayView src, CudaArrayView dst, const Function & Cmp) +__device__ void externSort(ArrayView src, ArrayView dst, const Function &Cmp) { static __shared__ int sharedMem[externMemSize]; bitonicSort_Block(src, dst, sharedMem, Cmp); } -template +template __device__ void stackPush(int stackArrBegin[], int stackArrEnd[], - int stackDepth[], int & stackTop, - int begin, int pivotBegin, - int pivotEnd, int end, - int depth) + int stackDepth[], int &stackTop, + int begin, int pivotBegin, + int pivotEnd, int end, + int depth) { int sizeL = pivotBegin - begin, sizeR = end - pivotEnd; - + //push the bigger one 1st and then smaller one 2nd //in next iteration, the smaller part will be handled 1st - if(sizeL > sizeR) + if (sizeL > sizeR) { - if(sizeL > 0) //left from pivot are smaller elems + if (sizeL > 0) //left from pivot are smaller elems { stackArrBegin[stackTop] = begin; stackArrEnd[stackTop] = pivotBegin; stackDepth[stackTop] = depth + 1; (stackTop)++; } - - if(sizeR > 0) //right from pivot until end are elem greater than pivot + + if (sizeR > 0) //right from pivot until end are elem greater than pivot { assert(stackTop < stackSize && "Local quicksort stack overflow."); @@ -169,7 +135,7 @@ __device__ void stackPush(int stackArrBegin[], int stackArrEnd[], } else { - if(sizeR > 0) //right from pivot until end are elem greater than pivot + if (sizeR > 0) //right from pivot until end are elem greater than pivot { stackArrBegin[stackTop] = pivotEnd; stackArrEnd[stackTop] = end; @@ -177,7 +143,7 @@ __device__ void stackPush(int stackArrBegin[], int stackArrEnd[], (stackTop)++; } - if(sizeL > 0) 
//left from pivot are smaller elems + if (sizeL > 0) //left from pivot are smaller elems { assert(stackTop < stackSize && "Local quicksort stack overflow."); @@ -190,11 +156,11 @@ __device__ void stackPush(int stackArrBegin[], int stackArrEnd[], } template -__device__ void singleBlockQuickSort(CudaArrayView arr, CudaArrayView aux, const Function & Cmp, int _depth) +__device__ void singleBlockQuickSort(ArrayView arr, ArrayView aux, const Function &Cmp, int _depth) { static __shared__ int stackTop; static __shared__ int stackArrBegin[stackSize], stackArrEnd[stackSize], stackDepth[stackSize]; - static __shared__ int begin, end, depth,pivotBegin, pivotEnd; + static __shared__ int begin, end, depth, pivotBegin, pivotEnd; static __shared__ int pivot; if (threadIdx.x == 0) @@ -207,34 +173,31 @@ __device__ void singleBlockQuickSort(CudaArrayView arr, CudaArrayView aux, const } __syncthreads(); - while(stackTop > 0) + while (stackTop > 0) { if (threadIdx.x == 0) { - begin = stackArrBegin[stackTop-1]; - end = stackArrEnd[stackTop-1]; - depth = stackDepth[stackTop-1]; + begin = stackArrBegin[stackTop - 1]; + end = stackArrEnd[stackTop - 1]; + depth = stackDepth[stackTop - 1]; stackTop--; - pivot = pickPivot(depth%2 == 0? - arr.getView(begin, end) : - aux.getView(begin, end), - Cmp - ); + pivot = pickPivot(depth % 2 == 0 ? arr.getView(begin, end) : aux.getView(begin, end), + Cmp); } __syncthreads(); int size = end - begin; - auto src = depth%2 == 0 ? arr.getView(begin, end) : aux.getView(begin, end); - auto dst = depth%2 == 0 ? aux.getView(begin, end) : arr.getView(begin, end); + auto src = depth % 2 == 0 ? arr.getView(begin, end) : aux.getView(begin, end); + auto dst = depth % 2 == 0 ? 
aux.getView(begin, end) : arr.getView(begin, end); - if(size <= blockDim.x*2) + if (size <= blockDim.x * 2) { - externSort(src, arr.getView(begin, end), Cmp); + externSort(src, arr.getView(begin, end), Cmp); continue; } int smaller = 0, bigger = 0; - countElem(src, 0, size, smaller, bigger, pivot); + countElem(src, smaller, bigger, pivot); int smallerOffset = blockInclusivePrefixSum(smaller); int biggerOffset = blockInclusivePrefixSum(bigger); @@ -247,78 +210,54 @@ __device__ void singleBlockQuickSort(CudaArrayView arr, CudaArrayView aux, const __syncthreads(); int destSmaller = 0 + smallerOffset - smaller; - int destBigger = pivotEnd + (biggerOffset - bigger); + int destBigger = pivotEnd + (biggerOffset - bigger); - copyData(src, 0, size, dst, destSmaller, destBigger, pivot); + copyData(src, dst, destSmaller, destBigger, pivot); __syncthreads(); for (int i = pivotBegin + threadIdx.x; i < pivotEnd; i += blockDim.x) src[i] = dst[i] = pivot; - if(threadIdx.x == 0) + if (threadIdx.x == 0) { stackPush(stackArrBegin, stackArrEnd, stackDepth, stackTop, - begin, begin+ pivotBegin, - begin +pivotEnd, end, - depth); + begin, begin + pivotBegin, + begin + pivotEnd, end, + depth); } __syncthreads(); } //ends while loop } +//------------------------------------------------------------------------- + template -__global__ void cudaQuickSort(CudaArrayView arr, CudaArrayView aux, TASK * task, const Function & Cmp, int depth) +__global__ void cudaQuickSort(ArrayView arr, ArrayView aux, + const Function &Cmp, int availBlocks, int depth) { - if(gridDim.x > 1) - { - multiBlockQuickSort(arr, aux, task, Cmp, depth); - } - else - { - if(threadIdx.x == 0) - free(task); - + if (availBlocks == 0 || arr.getSize() <= blockDim.x * 2 || depth >= 20) //todo: determine max depth singleBlockQuickSort(arr, aux, Cmp, depth); - } + else + multiBlockQuickSort(arr, aux, Cmp, depth, availBlocks); } //----------------------------------------------------------- -/** - * call this kernel using 1 thread 
only - * */ template -__global__ void cudaQuickSortEntry(CudaArrayView arr, CudaArrayView aux, const Function & Cmp, int blocks, int threadsPerBlock) -{ - TASK * task = (TASK *)malloc(sizeof(TASK)); - *task = TASK(0, arr.getSize(), blocks); - if(!task) - { - printf("couldnt allocate memory for right task\n"); - return; - } - - //task is freed by the block that wrote pivot - cudaQuickSort<<>>(arr, aux, task, Cmp, 0); -} - -//----------------------------------------------------------- - -template -void quicksort(CudaArrayView arr, const Function & Cmp) +void quicksort(ArrayView arr, const Function &Cmp) { TNL::Containers::Array aux(arr.getSize()); - + const int threadsPerBlock = 512, maxBlocks = 1 << 15; //32k - const int minElemPerBlock = threadsPerBlock*2; + const int minElemPerBlock = threadsPerBlock * 2; int sets = arr.getSize() / minElemPerBlock + (arr.getSize() % minElemPerBlock != 0); int blocks = min(sets, maxBlocks); - cudaQuickSortEntry<<<1, 1>>>(arr, aux.getView(), Cmp, blocks, threadsPerBlock); + cudaQuickSort<<<1, threadsPerBlock>>>(arr, aux.getView(), Cmp, blocks, 0); cudaDeviceSynchronize(); } void quicksort(TNL::Containers::ArrayView arr) { - quicksort(arr, []__cuda_callable__(int a, int b){return a < b;}); + quicksort(arr, [] __cuda_callable__(int a, int b) { return a < b; }); } diff --git a/GPUSort/src/quicksort_dynamic/quicksort.cuh b/GPUSort/src/quicksort_dynamic/quicksort.cuh index 037e8d339..d6f563a4d 100644 --- a/GPUSort/src/quicksort_dynamic/quicksort.cuh +++ b/GPUSort/src/quicksort_dynamic/quicksort.cuh @@ -3,8 +3,14 @@ #include #include "task.h" -using CudaArrayView = TNL::Containers::ArrayView; +using namespace TNL; +using namespace TNL::Containers; + +template +__global__ void cudaQuickSort(ArrayView arr, ArrayView aux, + const Function &Cmp, int availBlocks, int depth); + template -void quicksort(CudaArrayView arr, const Function & Cmp); +void quicksort(ArrayView arr, const Function & Cmp); -void quicksort(TNL::Containers::ArrayViewarr); 
\ No newline at end of file +void quicksort(ArrayViewarr); \ No newline at end of file diff --git a/GPUSort/src/quicksort_dynamic/task.h b/GPUSort/src/quicksort_dynamic/task.h index 632b0dff9..b6c897e01 100644 --- a/GPUSort/src/quicksort_dynamic/task.h +++ b/GPUSort/src/quicksort_dynamic/task.h @@ -3,11 +3,10 @@ struct TASK { int begin, end; - int stillWorkingCnt; __cuda_callable__ - TASK(int _begin, int _end, int blocks) - : begin(_begin), end(_end), stillWorkingCnt(blocks){} + TASK(int _begin, int _end) + : begin(_begin), end(_end){} __cuda_callable__ TASK(){}; -- GitLab From c379fa3716dbecda9b5d59e605e252d1e66e9194 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Sun, 28 Mar 2021 00:42:32 +0100 Subject: [PATCH 145/258] swap fetch and reduce order --- GPUSort/src/quicksort/quicksort.cuh | 2 +- GPUSort/src/util/algorithm.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/GPUSort/src/quicksort/quicksort.cuh b/GPUSort/src/quicksort/quicksort.cuh index 59e57126d..663324299 100644 --- a/GPUSort/src/quicksort/quicksort.cuh +++ b/GPUSort/src/quicksort/quicksort.cuh @@ -313,7 +313,7 @@ int QUICKSORT::getSetsNeeded() const return size / minElemPerBlock + (size % minElemPerBlock != 0); }; auto reduction = [] __cuda_callable__(int a, int b) { return a + b; }; - return Algorithms::Reduction::reduce(0, tasksAmount, reduction, fetch, 0); + return Algorithms::Reduction::reduce(0, tasksAmount, fetch, reduction, 0); } int QUICKSORT::getElemPerBlock() const diff --git a/GPUSort/src/util/algorithm.h b/GPUSort/src/util/algorithm.h index 92f4ec264..0715c5f42 100644 --- a/GPUSort/src/util/algorithm.h +++ b/GPUSort/src/util/algorithm.h @@ -9,7 +9,7 @@ bool is_sorted(TNL::Containers::ArrayView arr, const auto fetch = [=] __cuda_callable__(int i) { return Cmp(arr[i - 1], arr[i]); }; auto reduction = [] __cuda_callable__(bool a, bool b) { return a && b; }; - return TNL::Algorithms::Reduction::reduce(1, arr.getSize(), reduction, fetch, true); + return 
TNL::Algorithms::Reduction::reduce(1, arr.getSize(), fetch, reduction, true); } template -- GitLab From 696749dfb2184ddf39bf7267a54ff3505a770416 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Mon, 29 Mar 2021 03:03:59 +0200 Subject: [PATCH 146/258] use loop unrolling instead of template to limit use of recursion --- GPUSort/src/util/reduction.cuh | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/GPUSort/src/util/reduction.cuh b/GPUSort/src/util/reduction.cuh index 234871c93..d913f93e0 100644 --- a/GPUSort/src/util/reduction.cuh +++ b/GPUSort/src/util/reduction.cuh @@ -36,29 +36,21 @@ __device__ int blockReduceSum(int val) return shared[0]; } - -template __device__ int warpInclusivePrefixSum(int value) { - if(it*2 <= 32) + int laneId = threadIdx.x & 0x1f; + + #pragma unroll + for (int i = 1; i*2 <= 32; i *= 2)//32 here is warp size { - int i = it; int n = __shfl_up_sync(0xffffffff, value, i); - int laneId = threadIdx.x & 0x1f; if ((laneId & (warpSize - 1)) >= i) value += n; - return warpInclusivePrefixSum= 32? 
32 : it*2>(value); - } return value; } -__device__ int warpInclusivePrefixSum(int value) -{ - return warpInclusivePrefixSum<1>(value); -} - __device__ int blockInclusivePrefixSum(int value) { static __shared__ int shared[32]; -- GitLab From 0606df2fa01cab07fb519fad96b71e4fab0c67b5 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Mon, 29 Mar 2021 03:11:44 +0200 Subject: [PATCH 147/258] set max depth limit and use streams --- GPUSort/src/quicksort_dynamic/quicksort.cu | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/GPUSort/src/quicksort_dynamic/quicksort.cu b/GPUSort/src/quicksort_dynamic/quicksort.cu index 5877a064a..ca50dd624 100644 --- a/GPUSort/src/quicksort_dynamic/quicksort.cu +++ b/GPUSort/src/quicksort_dynamic/quicksort.cu @@ -78,7 +78,7 @@ __device__ void multiBlockQuickSort(ArrayView arr, ArrayView cudaStream_t s; cudaStreamCreateWithFlags(&s, cudaStreamNonBlocking); - cudaQuickSort<<<1, blockDim.x>>>(arr.getView(0, leftEnd), aux.getView(0, leftEnd), Cmp, blocksLeft, depth + 1); + cudaQuickSort<<<1, blockDim.x, 0, s>>>(arr.getView(0, leftEnd), aux.getView(0, leftEnd), Cmp, blocksLeft, depth + 1); cudaStreamDestroy(s); } @@ -87,7 +87,7 @@ __device__ void multiBlockQuickSort(ArrayView arr, ArrayView cudaStream_t s; cudaStreamCreateWithFlags(&s, cudaStreamNonBlocking); - cudaQuickSort<<<1, blockDim.x>>>(arr.getView(rightBegin, arr.getSize()), aux.getView(rightBegin, aux.getSize()), Cmp, blocksRight, depth + 1); + cudaQuickSort<<<1, blockDim.x, 0, s>>>(arr.getView(rightBegin, arr.getSize()), aux.getView(rightBegin, aux.getSize()), Cmp, blocksRight, depth + 1); cudaStreamDestroy(s); } } @@ -235,7 +235,7 @@ template __global__ void cudaQuickSort(ArrayView arr, ArrayView aux, const Function &Cmp, int availBlocks, int depth) { - if (availBlocks == 0 || arr.getSize() <= blockDim.x * 2 || depth >= 20) //todo: determine max depth + if (availBlocks == 0 || arr.getSize() <= blockDim.x * 2 || depth >= 4) //todo: determine max depth 
singleBlockQuickSort(arr, aux, Cmp, depth); else multiBlockQuickSort(arr, aux, Cmp, depth, availBlocks); @@ -253,6 +253,7 @@ void quicksort(ArrayView arr, const Function &Cmp) int sets = arr.getSize() / minElemPerBlock + (arr.getSize() % minElemPerBlock != 0); int blocks = min(sets, maxBlocks); + cudaDeviceSetLimit(cudaLimitDevRuntimeSyncDepth, 10); cudaQuickSort<<<1, threadsPerBlock>>>(arr, aux.getView(), Cmp, blocks, 0); cudaDeviceSynchronize(); } -- GitLab From 741e3368a490dee92f14f4a34be11c7e3a6fdae7 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Mon, 29 Mar 2021 20:52:15 +0200 Subject: [PATCH 148/258] add different inputs and better interface for measuring --- GPUSort/benchmark/benchmarker.cpp | 129 +++++++++++++++++++++++++++--- 1 file changed, 120 insertions(+), 9 deletions(-) diff --git a/GPUSort/benchmark/benchmarker.cpp b/GPUSort/benchmark/benchmarker.cpp index 35a3a8bc0..6c3d43e4d 100644 --- a/GPUSort/benchmark/benchmarker.cpp +++ b/GPUSort/benchmark/benchmarker.cpp @@ -4,6 +4,7 @@ #include #include #include +#include using namespace std; #include "../src/util/timer.h" @@ -26,15 +27,27 @@ using namespace TNL::Containers; static int notCorrectCounters = 0; -const int lowPow = 13, highLow = 25; -const int tries = 50; +#ifndef LOW_POW + #define LOW_POW 10 +#endif + +#ifndef HIGH_POW + #define HIGH_POW 25 +#endif + +#ifndef TRIES + #define TRIES 20 +#endif + +double measure(const vector&vec); +#ifndef MY_OWN_MEASURE double measure(const vector&vec) { vector resAcc; - for(int i = 0; i < tries; i++) + for(int i = 0; i < TRIES; i++) { #ifdef HAVE_CUDA Array arr(vec); @@ -62,6 +75,7 @@ double measure(const vector&vec) return accumulate(resAcc.begin(), resAcc.end(), 0.0) / resAcc.size(); } +#endif double sorted(int size) { @@ -72,6 +86,16 @@ double sorted(int size) } double random(int size) +{ + srand(size + 2021); + + vector vec(size); + generate(vec.begin(), vec.end(), [=](){return std::rand() % (2*size);}); + + return measure(vec); +} + +double 
shuffle(int size) { srand(size); @@ -98,8 +122,8 @@ double almostSorted(int size) double decreasing(int size) { vector vec(size); - for(size_t i = 0; i < size; i++) - vec[i] = -i; + for(int i = 0; i < size; i++) + vec[i] = size - i; return measure(vec); } @@ -113,17 +137,101 @@ double zero_entropy(int size) return measure(vec); } +double gaussian(int size) +{ + srand(size + 2000); + + vector vec(size); + for (int i = 0; i < size; ++i) + { + int value = 0; + for (int j = 0; j < 4; ++j) + value += rand()%16384; + + vec[i] = value /4; + } + return measure(vec); +} + +double bucket(int size) +{ + srand (size + 94215); + vector vec(size); + + double tmp = ((double)size)*3000000; //(RAND_MAX)/p; --> ((double)N)*30000; + double tmp2 = sqrt(tmp); + + int p= (size+tmp2-1)/tmp2; + + const int VALUE = 8192/p; //(RAND_MAX)/p; + + int i=0; int x=0; + //the array of size N is split into 'p' buckets + while(i < p) + { + for (int z = 0; z < p; ++z) + for (int j = 0; j < size/(p*p); ++j) + { + //every bucket has N/(p*p) items and the range is [min : VALUE-1 ] + int min = VALUE*z; + + vec[x]= min + ( rand() % (VALUE-1) ) ; + x++; + } + i++; + } + + return measure(vec); +} + +double staggared(int size) +{ + srand (size + 815618); + vector vec(size); + + int tmp=4096; //(RAND_MAX)/p; --> size=2048 + int p= (size+tmp-1)/tmp; + + const int VALUE = 16384/p; //(RAND_MAX)/p; + + int i=1; int x=0; + //the array of size N is split into 'p' buckets + while(i <= p) + { + //every bucket has N/(p) items + for (int j = 0; j < size/(p); ++j) + { + int min; + + if(i<=(p/2)) + min = (2*i -1)*VALUE; + + else + min = (2*i-p-1)*VALUE; + + vec[x++]= min + ( rand() % (VALUE - 1) ); + } + i++; + } + + return measure(vec); +} + void start(ostream & out, string delim) { out << "size" << delim; out << "random" << delim; + out << "shuffle" << delim; out << "sorted" << delim; out << "almost" << delim; out << "decreas" << delim; + out << "gauss" << delim; + out << "bucket" << delim; + out << "stagger" << delim; 
out << "zero_entropy"; out << endl; - for(int pow = lowPow; pow <= highLow; pow++) + for(int pow = LOW_POW; pow <= HIGH_POW; pow++) { int size =(1<< pow); vector vec(size); @@ -131,9 +239,13 @@ void start(ostream & out, string delim) out << "2^" << pow << delim; out << fixed << setprecision(3); out << random(size) << delim; + out << shuffle(size) << delim; out << sorted(size) << delim; out << almostSorted(size) << delim; out << decreasing(size) << delim; + out << gaussian(size) << delim; + out << bucket(size) << delim; + out << staggared(size) << delim; out << zero_entropy(size); out << endl; } @@ -147,13 +259,12 @@ int main(int argc, char *argv[]) } else { - ofstream out(argv[1]); + std::ofstream out(argv[1]); start(out, ","); } - if(notCorrectCounters > 0) { - std::cerr << notCorrectCounters << " tries were sorted incorrectly" << std::endl; + std::cerr << notCorrectCounters << "tries were sorted incorrectly" << std::endl; } return 0; } \ No newline at end of file -- GitLab From 0c281e72fb4dcfa325a1de45bc111c1ab3ab3145 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Tue, 30 Mar 2021 21:00:30 +0200 Subject: [PATCH 149/258] host sync instead of atomic add for pivot writing --- GPUSort/src/quicksort/quicksort.cuh | 43 +++++++++++++++++++++++++---- 1 file changed, 38 insertions(+), 5 deletions(-) diff --git a/GPUSort/src/quicksort/quicksort.cuh b/GPUSort/src/quicksort/quicksort.cuh index 663324299..be6d9a4a9 100644 --- a/GPUSort/src/quicksort/quicksort.cuh +++ b/GPUSort/src/quicksort/quicksort.cuh @@ -83,11 +83,10 @@ __global__ void cudaQuickSort1stPhase(ArrayView arr, ArrayVi } __syncthreads(); - bool isLast; if ((myTask.depth & 1) == 0) { - isLast = cudaPartition( + cudaPartition( arr.getView(myTask.partitionBegin, myTask.partitionEnd), aux.getView(myTask.partitionBegin, myTask.partitionEnd), sharedMem, @@ -95,15 +94,33 @@ __global__ void cudaQuickSort1stPhase(ArrayView arr, ArrayVi } else { - isLast = cudaPartition( + cudaPartition( 
aux.getView(myTask.partitionBegin, myTask.partitionEnd), arr.getView(myTask.partitionBegin, myTask.partitionEnd), sharedMem, Cmp, pivot, elemPerBlock, myTask); } +} - if (!isLast) - return; +template +__global__ void cudaWritePivot(ArrayView arr, ArrayView aux, + const Function &Cmp, int elemPerBlock, + ArrayView tasks, + ArrayView taskMapping, + ArrayView newTasks, int *newTasksCnt, + ArrayView secondPhaseTasks, int *secondPhaseTasksCnt) +{ + static __shared__ int pivot; + TASK &myTask = tasks[blockIdx.x]; + + if (threadIdx.x == 0) + { + if ((myTask.depth & 1) == 0) + pivot = pickPivot(arr.getView(myTask.partitionBegin, myTask.partitionEnd), Cmp); + else + pivot = pickPivot(aux.getView(myTask.partitionBegin, myTask.partitionEnd), Cmp); + } + __syncthreads(); int leftBegin = myTask.partitionBegin, leftEnd = myTask.partitionBegin + myTask.dstBegin; int rightBegin = myTask.partitionBegin + myTask.dstEnd, rightEnd = myTask.partitionEnd; @@ -263,6 +280,14 @@ void QUICKSORT::sort(const Function &Cmp) cuda_newTasks, cuda_newTasksAmount.getData(), cuda_2ndPhaseTasks, cuda_2ndPhaseTasksAmount.getData()); + + cudaWritePivot<<>>( + arr, aux, Cmp, elemPerBlock, + cuda_tasks, + cuda_blockToTaskMapping, + cuda_newTasks, + cuda_newTasksAmount.getData(), + cuda_2ndPhaseTasks, cuda_2ndPhaseTasksAmount.getData()); } else { @@ -273,6 +298,14 @@ void QUICKSORT::sort(const Function &Cmp) cuda_tasks, cuda_newTasksAmount.getData(), cuda_2ndPhaseTasks, cuda_2ndPhaseTasksAmount.getData()); + + cudaWritePivot<<>>( + arr, aux, Cmp, elemPerBlock, + cuda_newTasks, + cuda_blockToTaskMapping, + cuda_tasks, + cuda_newTasksAmount.getData(), + cuda_2ndPhaseTasks, cuda_2ndPhaseTasksAmount.getData()); } processNewTasks(); iteration++; -- GitLab From a99e7835f7e196b9d0fa13cc65874fd7fc89d118 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Tue, 30 Mar 2021 21:11:25 +0200 Subject: [PATCH 150/258] remove unnecessary params --- GPUSort/src/quicksort/quicksort.cuh | 19 +++---------------- 1 file 
changed, 3 insertions(+), 16 deletions(-) diff --git a/GPUSort/src/quicksort/quicksort.cuh b/GPUSort/src/quicksort/quicksort.cuh index be6d9a4a9..b27b7561c 100644 --- a/GPUSort/src/quicksort/quicksort.cuh +++ b/GPUSort/src/quicksort/quicksort.cuh @@ -65,9 +65,7 @@ template __global__ void cudaQuickSort1stPhase(ArrayView arr, ArrayView aux, const Function &Cmp, int elemPerBlock, ArrayView tasks, - ArrayView taskMapping, - ArrayView newTasks, int *newTasksCnt, - ArrayView secondPhaseTasks, int *secondPhaseTasksCnt) + ArrayView taskMapping) { extern __shared__ int externMem[]; int *sharedMem = externMem; @@ -106,7 +104,6 @@ template __global__ void cudaWritePivot(ArrayView arr, ArrayView aux, const Function &Cmp, int elemPerBlock, ArrayView tasks, - ArrayView taskMapping, ArrayView newTasks, int *newTasksCnt, ArrayView secondPhaseTasks, int *secondPhaseTasksCnt) { @@ -275,16 +272,11 @@ void QUICKSORT::sort(const Function &Cmp) cudaQuickSort1stPhase <<>>( arr, aux, Cmp, elemPerBlock, - cuda_tasks, - cuda_blockToTaskMapping, - cuda_newTasks, - cuda_newTasksAmount.getData(), - cuda_2ndPhaseTasks, cuda_2ndPhaseTasksAmount.getData()); + cuda_tasks, cuda_blockToTaskMapping); cudaWritePivot<<>>( arr, aux, Cmp, elemPerBlock, cuda_tasks, - cuda_blockToTaskMapping, cuda_newTasks, cuda_newTasksAmount.getData(), cuda_2ndPhaseTasks, cuda_2ndPhaseTasksAmount.getData()); @@ -293,16 +285,11 @@ void QUICKSORT::sort(const Function &Cmp) { cudaQuickSort1stPhase<<>>( arr, aux, Cmp, elemPerBlock, - cuda_newTasks, - cuda_blockToTaskMapping, - cuda_tasks, - cuda_newTasksAmount.getData(), - cuda_2ndPhaseTasks, cuda_2ndPhaseTasksAmount.getData()); + cuda_newTasks, cuda_blockToTaskMapping); cudaWritePivot<<>>( arr, aux, Cmp, elemPerBlock, cuda_newTasks, - cuda_blockToTaskMapping, cuda_tasks, cuda_newTasksAmount.getData(), cuda_2ndPhaseTasks, cuda_2ndPhaseTasksAmount.getData()); -- GitLab From 2f53f7dfe3841202713404f5599b9f2dad616a55 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Tue, 
30 Mar 2021 21:41:38 +0200 Subject: [PATCH 151/258] pick pivot once to minimize global access --- GPUSort/src/quicksort/cudaPartition.cuh | 43 ++++++++++++----- GPUSort/src/quicksort/quicksort.cuh | 64 ++++++++++++++----------- GPUSort/src/quicksort/task.h | 10 ++-- 3 files changed, 72 insertions(+), 45 deletions(-) diff --git a/GPUSort/src/quicksort/cudaPartition.cuh b/GPUSort/src/quicksort/cudaPartition.cuh index 5e1eec9dd..98ec5b505 100644 --- a/GPUSort/src/quicksort/cudaPartition.cuh +++ b/GPUSort/src/quicksort/cudaPartition.cuh @@ -42,6 +42,37 @@ __device__ Value pickPivot(TNL::Containers::ArrayView src, const } +template +__device__ Value pickPivotIdx(TNL::Containers::ArrayView src, const Function & Cmp) +{ + //return 0; + //return src.getSize()-1; + + if(src.getSize() <= 1) + return 0; + + Value a = src[0], b = src[src.getSize()/2], c = src[src.getSize() - 1]; + + if(Cmp(a, b)) // ..a..b.. + { + if(Cmp(b, c))// ..a..b..c + return src.getSize()/2; + else if(Cmp(c, a))//..c..a..b.. + return 0; + else //..a..c..b.. + return src.getSize() - 1; + } + else //..b..a.. + { + if(Cmp(a, c))//..b..a..c + return 0; + else if(Cmp(c, b))//..c..b..a.. + return src.getSize()/2; + else //..b..c..a.. 
+ return src.getSize() - 1; + } +} + __device__ void countElem(ArrayView arr, int &smaller, int &bigger, @@ -104,7 +135,7 @@ void copyData(ArrayView src, //---------------------------------------------------------------------------------- template -__device__ bool cudaPartition(ArrayView src, +__device__ void cudaPartition(ArrayView src, ArrayView dst, int * sharedMem, const Function &Cmp, const int & pivot, @@ -114,7 +145,6 @@ __device__ bool cudaPartition(ArrayView src, static __shared__ int myBegin, myEnd; static __shared__ int smallerStart, biggerStart; static __shared__ int smallerTotal, biggerTotal; - static __shared__ bool writePivot; if (threadIdx.x == 0) { @@ -155,13 +185,4 @@ __device__ bool cudaPartition(ArrayView src, smallerTotal, biggerTotal, smallerPrefSumInc - smaller, biggerPrefSumInc - bigger, //exclusive prefix sum of elements pivot); - __syncthreads(); - - //----------------------------------------------------------- - - if (threadIdx.x == 0) - writePivot = atomicAdd(&(task.stillWorkingCnt), -1) == 1; - __syncthreads(); - - return writePivot; } \ No newline at end of file diff --git a/GPUSort/src/quicksort/quicksort.cuh b/GPUSort/src/quicksort/quicksort.cuh index b27b7561c..15f3c0655 100644 --- a/GPUSort/src/quicksort/quicksort.cuh +++ b/GPUSort/src/quicksort/quicksort.cuh @@ -15,7 +15,8 @@ using namespace TNL::Containers; //----------------------------------------------------------- -__device__ void writeNewTask(int begin, int end, int depth, ArrayView newTasks, int *newTasksCnt, +__device__ void writeNewTask(int begin, int end, int depth, int pivotIdx, + ArrayView newTasks, int *newTasksCnt, ArrayView secondPhaseTasks, int *secondPhaseTasksCnt) { int size = end - begin; @@ -25,12 +26,12 @@ __device__ void writeNewTask(int begin, int end, int depth, ArrayView newTasks, int *newTasksCnt, - ArrayView secondPhaseTasks, int *secondPhaseTasksCnt) -{ - writeNewTask(leftBegin, leftEnd, depth, newTasks, newTasksCnt, secondPhaseTasks, 
secondPhaseTasksCnt); - writeNewTask(rightBegin, rightEnd, depth, newTasks, newTasksCnt, secondPhaseTasks, secondPhaseTasksCnt); -} //---------------------------------------------------- template @@ -75,13 +67,12 @@ __global__ void cudaQuickSort1stPhase(ArrayView arr, ArrayVi if (threadIdx.x == 0) { if ((myTask.depth & 1) == 0) - pivot = pickPivot(arr.getView(myTask.partitionBegin, myTask.partitionEnd), Cmp); + pivot = arr[myTask.pivotIdx]; else - pivot = pickPivot(aux.getView(myTask.partitionBegin, myTask.partitionEnd), Cmp); + pivot = aux[myTask.pivotIdx]; } __syncthreads(); - if ((myTask.depth & 1) == 0) { cudaPartition( @@ -102,10 +93,10 @@ __global__ void cudaQuickSort1stPhase(ArrayView arr, ArrayVi template __global__ void cudaWritePivot(ArrayView arr, ArrayView aux, - const Function &Cmp, int elemPerBlock, - ArrayView tasks, - ArrayView newTasks, int *newTasksCnt, - ArrayView secondPhaseTasks, int *secondPhaseTasksCnt) + const Function &Cmp, int elemPerBlock, + ArrayView tasks, + ArrayView newTasks, int *newTasksCnt, + ArrayView secondPhaseTasks, int *secondPhaseTasksCnt) { static __shared__ int pivot; TASK &myTask = tasks[blockIdx.x]; @@ -113,9 +104,9 @@ __global__ void cudaWritePivot(ArrayView arr, ArrayView arr, ArrayView 0) + { + int leftPivotIdx = pickPivotIdx((myTask.depth & 1) == 0? + aux.getView(leftBegin, leftEnd) : + arr.getView(leftBegin, leftEnd) + , Cmp) + leftBegin; + + writeNewTask(leftBegin, leftEnd, myTask.depth, leftPivotIdx, newTasks, newTasksCnt, secondPhaseTasks, secondPhaseTasksCnt); + } + + if(rightEnd - rightBegin) + { + int rightPivotIdx = pickPivotIdx((myTask.depth & 1) == 0? 
+ aux.getView(rightBegin, rightEnd) : + arr.getView(rightBegin, rightEnd) + , Cmp) + rightBegin; + + writeNewTask(rightBegin, rightEnd, myTask.depth, rightPivotIdx, newTasks, newTasksCnt, secondPhaseTasks, secondPhaseTasksCnt); + } } //----------------------------------------------------------- @@ -224,7 +230,7 @@ public: cuda_blockToTaskMapping(maxBlocks * 2), cuda_blockToTaskMapping_Cnt(cudaCounters.getView(2, 3)) { - cuda_tasks.setElement(0, TASK(0, arr.getSize(), 0)); + cuda_tasks.setElement(0, TASK(0, arr.getSize(), 0, arr.getSize()/2)); tasksAmount = 1; host_2ndPhaseTasksAmount = 0; cuda_2ndPhaseTasksAmount = 0; @@ -259,7 +265,7 @@ void QUICKSORT::sort(const Function &Cmp) if (maxNewTasks >= spaceLeft) break; //in case all new tasks are written into newTasks, theres still space in 2ndphase to save it - + //2ndphase task is now full if (host_2ndPhaseTasksAmount >= cuda_2ndPhaseTasks.getSize()) break; diff --git a/GPUSort/src/quicksort/task.h b/GPUSort/src/quicksort/task.h index 848d7460c..30168d96f 100644 --- a/GPUSort/src/quicksort/task.h +++ b/GPUSort/src/quicksort/task.h @@ -8,16 +8,16 @@ struct TASK //helper variables for blocks working on this task int depth; + int pivotIdx; int dstBegin, dstEnd; int firstBlock, blockCount;//for workers read only values - int stillWorkingCnt;//shared counter of blocks working together(how many are still working) __cuda_callable__ - TASK(int begin, int end, int depth) + TASK(int begin, int end, int depth, int pivotIdx) : partitionBegin(begin), partitionEnd(end), - depth(depth), + depth(depth), pivotIdx(pivotIdx), dstBegin(-151561), dstEnd(-151561), - firstBlock(-100), blockCount(-100), stillWorkingCnt(-100) + firstBlock(-100), blockCount(-100) {} __cuda_callable__ @@ -25,7 +25,7 @@ struct TASK { dstBegin= 0; dstEnd = partitionEnd - partitionBegin; this->firstBlock = firstBlock; - blockCount = stillWorkingCnt = blocks; + blockCount = blocks; } TASK() = default; -- GitLab From c82f9964b9b682f9f58b6feff9b66c5c63695dab Mon Sep 
17 00:00:00 2001 From: Xuan Thang Nguyen Date: Tue, 30 Mar 2021 22:07:45 +0200 Subject: [PATCH 152/258] remove useless syncs, small optimization --- GPUSort/src/quicksort/cudaPartition.cuh | 20 ++++++++------------ GPUSort/src/quicksort/quicksort.cuh | 14 +++++--------- 2 files changed, 13 insertions(+), 21 deletions(-) diff --git a/GPUSort/src/quicksort/cudaPartition.cuh b/GPUSort/src/quicksort/cudaPartition.cuh index 98ec5b505..4752e73ea 100644 --- a/GPUSort/src/quicksort/cudaPartition.cuh +++ b/GPUSort/src/quicksort/cudaPartition.cuh @@ -105,15 +105,16 @@ void copyDataShared(ArrayView src, sharedMem[smallerOffset++] = data; else if (data > pivot) sharedMem[smallerTotal + biggerOffset++] = data; - } __syncthreads(); - for (int i = threadIdx.x; i < smallerTotal; i += blockDim.x) + for (int i = threadIdx.x; i < smallerTotal + biggerTotal; i += blockDim.x) + { + if(i < smallerTotal) dst[smallerStart + i] = sharedMem[i]; - - for (int i = threadIdx.x; i < biggerTotal; i += blockDim.x) - dst[biggerStart + i] = sharedMem[smallerTotal + i]; + else + dst[biggerStart + i - smallerTotal] = sharedMem[i]; + } } __device__ @@ -142,16 +143,11 @@ __device__ void cudaPartition(ArrayView src, int elemPerBlock, TASK & task ) { - static __shared__ int myBegin, myEnd; static __shared__ int smallerStart, biggerStart; static __shared__ int smallerTotal, biggerTotal; - if (threadIdx.x == 0) - { - myBegin = elemPerBlock * (blockIdx.x - task.firstBlock); - myEnd = TNL::min(myBegin + elemPerBlock, src.getSize()); - } - __syncthreads(); + int myBegin = elemPerBlock * (blockIdx.x - task.firstBlock); + int myEnd = TNL::min(myBegin + elemPerBlock, src.getSize()); auto srcView = src.getView(myBegin, myEnd); diff --git a/GPUSort/src/quicksort/quicksort.cuh b/GPUSort/src/quicksort/quicksort.cuh index 15f3c0655..c1bf9e5b6 100644 --- a/GPUSort/src/quicksort/quicksort.cuh +++ b/GPUSort/src/quicksort/quicksort.cuh @@ -61,17 +61,13 @@ __global__ void cudaQuickSort1stPhase(ArrayView arr, ArrayVi { 
extern __shared__ int externMem[]; int *sharedMem = externMem; - static __shared__ int pivot; + int pivot; TASK &myTask = tasks[taskMapping[blockIdx.x]]; - if (threadIdx.x == 0) - { - if ((myTask.depth & 1) == 0) - pivot = arr[myTask.pivotIdx]; - else - pivot = aux[myTask.pivotIdx]; - } - __syncthreads(); + if ((myTask.depth & 1) == 0) + pivot = arr[myTask.pivotIdx]; + else + pivot = aux[myTask.pivotIdx]; if ((myTask.depth & 1) == 0) { -- GitLab From a177000603d44ee7b2fe97c6229601ab73de41a3 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Tue, 30 Mar 2021 23:17:48 +0200 Subject: [PATCH 153/258] small optimization --- GPUSort/src/quicksort/quicksort.cuh | 14 ++++---------- GPUSort/src/util/reduction.cuh | 9 +++++---- 2 files changed, 9 insertions(+), 14 deletions(-) diff --git a/GPUSort/src/quicksort/quicksort.cuh b/GPUSort/src/quicksort/quicksort.cuh index c1bf9e5b6..61f54f907 100644 --- a/GPUSort/src/quicksort/quicksort.cuh +++ b/GPUSort/src/quicksort/quicksort.cuh @@ -194,7 +194,7 @@ __global__ void cudaInitTask(ArrayView cuda_tasks, //----------------------------------------------------------- const int threadsPerBlock = 512, maxBlocks = 1 << 15; //32k const int g_maxTasks = 1 << 14; -const int minElemPerBlock = threadsPerBlock; +const int minElemPerBlock = threadsPerBlock*2; class QUICKSORT { @@ -253,14 +253,8 @@ void QUICKSORT::sort(const Function &Cmp) while (tasksAmount > 0) { //by partitioning with n=tasksAmount, max 2n new tasks can be created - //quicksort1stPhase will 1st try to insert into newTasks - //if not enough space then insert into 2nd phase as last resort, and vice versa when inserting into 2ndphase - int maxNewTasks = 2 * tasksAmount; - int spaceLeft = cuda_newTasks.getSize() + (cuda_2ndPhaseTasks.getSize() - host_2ndPhaseTasksAmount); - - if (maxNewTasks >= spaceLeft) + if (2 * tasksAmount >= maxTasks) break; - //in case all new tasks are written into newTasks, theres still space in 2ndphase to save it //2ndphase task is now full if 
(host_2ndPhaseTasksAmount >= cuda_2ndPhaseTasks.getSize()) @@ -306,7 +300,7 @@ void QUICKSORT::sort(const Function &Cmp) cudaStreamCreateWithFlags(&s, cudaStreamNonBlocking); cudaQuickSort2ndPhase - <<>>(arr, aux, Cmp, + <<>>(arr, aux, Cmp, iteration % 2 == 0 ? cuda_tasks : cuda_newTasks); cudaStreamDestroy(s); } @@ -317,7 +311,7 @@ void QUICKSORT::sort(const Function &Cmp) cudaStreamCreateWithFlags(&s, cudaStreamNonBlocking); cudaQuickSort2ndPhase - <<>>(arr, aux, Cmp, cuda_2ndPhaseTasks); + <<>>(arr, aux, Cmp, cuda_2ndPhaseTasks); cudaStreamDestroy(s); } diff --git a/GPUSort/src/util/reduction.cuh b/GPUSort/src/util/reduction.cuh index d913f93e0..70d932e77 100644 --- a/GPUSort/src/util/reduction.cuh +++ b/GPUSort/src/util/reduction.cuh @@ -38,13 +38,14 @@ __device__ int blockReduceSum(int val) __device__ int warpInclusivePrefixSum(int value) { - int laneId = threadIdx.x & 0x1f; + int laneId = threadIdx.x & (32-1); #pragma unroll - for (int i = 1; i*2 <= 32; i *= 2)//32 here is warp size + for (int i = 0; i < 6; i++) //iterates until x == 1<<5 == 32 which is warpSize { - int n = __shfl_up_sync(0xffffffff, value, i); - if ((laneId & (warpSize - 1)) >= i) + int x = 1<= x) value += n; } -- GitLab From 2116bdc2bc6d90dc62d8772d3ffcbde17de700ee Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Wed, 31 Mar 2021 19:12:09 +0200 Subject: [PATCH 154/258] warp optimization and error checking --- GPUSort/src/quicksort/quicksort.cuh | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/GPUSort/src/quicksort/quicksort.cuh b/GPUSort/src/quicksort/quicksort.cuh index 61f54f907..5cc87ed8d 100644 --- a/GPUSort/src/quicksort/quicksort.cuh +++ b/GPUSort/src/quicksort/quicksort.cuh @@ -316,7 +316,10 @@ void QUICKSORT::sort(const Function &Cmp) cudaStreamDestroy(s); } - cudaDeviceSynchronize(); + auto error = cudaDeviceSynchronize(); + if(error != cudaSuccess) + deb(error); + return; } -- GitLab From db348d8f04ee5585eb8ea6f2f6f32d210eca0e4f Mon Sep 17 00:00:00 2001 
From: Xuan Thang Nguyen Date: Wed, 31 Mar 2021 19:13:25 +0200 Subject: [PATCH 155/258] warp optimization --- GPUSort/src/util/reduction.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/GPUSort/src/util/reduction.cuh b/GPUSort/src/util/reduction.cuh index 70d932e77..00caa521b 100644 --- a/GPUSort/src/util/reduction.cuh +++ b/GPUSort/src/util/reduction.cuh @@ -41,7 +41,7 @@ __device__ int warpInclusivePrefixSum(int value) int laneId = threadIdx.x & (32-1); #pragma unroll - for (int i = 0; i < 6; i++) //iterates until x == 1<<5 == 32 which is warpSize + for (int i = 0; i < 5; i++) //iterates until x == 1<<5 == 32 which is warpSize { int x = 1< Date: Wed, 31 Mar 2021 19:51:05 +0200 Subject: [PATCH 156/258] small changes --- GPUSort/src/quicksort/cudaPartition.cuh | 6 ++---- GPUSort/src/quicksort/quicksort.cuh | 14 +++++++++++++- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/GPUSort/src/quicksort/cudaPartition.cuh b/GPUSort/src/quicksort/cudaPartition.cuh index 4752e73ea..2171f66d4 100644 --- a/GPUSort/src/quicksort/cudaPartition.cuh +++ b/GPUSort/src/quicksort/cudaPartition.cuh @@ -81,10 +81,8 @@ void countElem(ArrayView arr, for (int i = threadIdx.x; i < arr.getSize(); i += blockDim.x) { int data = arr[i]; - if (data < pivot) - smaller++; - else if (data > pivot) - bigger++; + smaller += (data < pivot); + bigger += (data > pivot); } } diff --git a/GPUSort/src/quicksort/quicksort.cuh b/GPUSort/src/quicksort/quicksort.cuh index 5cc87ed8d..bacf24f68 100644 --- a/GPUSort/src/quicksort/quicksort.cuh +++ b/GPUSort/src/quicksort/quicksort.cuh @@ -186,7 +186,16 @@ __global__ void cudaInitTask(ArrayView cuda_tasks, cuda_tasks[i].initTask(myFirstAvailBlock, blocksNeeded); for (int set = 0; set < blocksNeeded; set++) - cuda_blockToTaskMapping[myFirstAvailBlock++] = i; + { + if(myFirstAvailBlock >= cuda_blockToTaskMapping.getSize()) + { + printf("ran out of memory for mapping\n"); + } + else + { + cuda_blockToTaskMapping[myFirstAvailBlock++] 
= i; + } + } } } @@ -262,6 +271,9 @@ void QUICKSORT::sort(const Function &Cmp) int elemPerBlock = getElemPerBlock(); int blocksCnt = initTasks(elemPerBlock); + if(blocksCnt > cuda_blockToTaskMapping.getSize()) + break; + int externMemByteSize = elemPerBlock * sizeof(int); if (iteration % 2 == 0) { -- GitLab From 5205a6df8fd438715426a1f699021dea4ab6072d Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Wed, 31 Mar 2021 21:41:53 +0200 Subject: [PATCH 157/258] checking error and maxTasks calc --- GPUSort/src/quicksort/quicksort.cuh | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/GPUSort/src/quicksort/quicksort.cuh b/GPUSort/src/quicksort/quicksort.cuh index bacf24f68..7a0bd6767 100644 --- a/GPUSort/src/quicksort/quicksort.cuh +++ b/GPUSort/src/quicksort/quicksort.cuh @@ -201,7 +201,7 @@ __global__ void cudaInitTask(ArrayView cuda_tasks, //----------------------------------------------------------- //----------------------------------------------------------- -const int threadsPerBlock = 512, maxBlocks = 1 << 15; //32k +const int threadsPerBlock = 512, g_maxBlocks = 1 << 15; //32k const int g_maxTasks = 1 << 14; const int minElemPerBlock = threadsPerBlock*2; @@ -209,7 +209,7 @@ class QUICKSORT { ArrayView arr; Array aux; - int maxTasks; + int maxTasks, maxBlocks; Array cuda_tasks, cuda_newTasks, cuda_2ndPhaseTasks; Array cudaCounters; @@ -227,7 +227,7 @@ class QUICKSORT public: QUICKSORT(ArrayView _arr) : arr(_arr), aux(arr.getSize()), - maxTasks(min(arr.getSize(), g_maxTasks)), + maxTasks(min(arr.getSize(), g_maxTasks)), maxBlocks(min(arr.getSize()/minElemPerBlock, g_maxBlocks)), cuda_tasks(maxTasks), cuda_newTasks(maxTasks), cuda_2ndPhaseTasks(maxTasks), cudaCounters(3), cuda_newTasksAmount(cudaCounters.getView(0, 1)), @@ -240,6 +240,10 @@ public: host_2ndPhaseTasksAmount = 0; cuda_2ndPhaseTasksAmount = 0; iteration = 0; + + auto error = cudaGetLastError(); + if(error != cudaSuccess) + deb(error); } template -- GitLab From 
6ca42b4705728ff802e1949ac3938a503be14059 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Wed, 31 Mar 2021 22:30:57 +0200 Subject: [PATCH 158/258] revert, non modulo input fails --- GPUSort/src/quicksort/quicksort.cuh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/GPUSort/src/quicksort/quicksort.cuh b/GPUSort/src/quicksort/quicksort.cuh index 7a0bd6767..818517d36 100644 --- a/GPUSort/src/quicksort/quicksort.cuh +++ b/GPUSort/src/quicksort/quicksort.cuh @@ -227,7 +227,8 @@ class QUICKSORT public: QUICKSORT(ArrayView _arr) : arr(_arr), aux(arr.getSize()), - maxTasks(min(arr.getSize(), g_maxTasks)), maxBlocks(min(arr.getSize()/minElemPerBlock, g_maxBlocks)), + maxTasks(min(arr.getSize(), g_maxTasks)), + maxBlocks(g_maxBlocks), cuda_tasks(maxTasks), cuda_newTasks(maxTasks), cuda_2ndPhaseTasks(maxTasks), cudaCounters(3), cuda_newTasksAmount(cudaCounters.getView(0, 1)), -- GitLab From 0848f4f5ca679df65b0ec7006dacc75cabdaf684 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Wed, 31 Mar 2021 22:33:21 +0200 Subject: [PATCH 159/258] refactor 1st phase --- GPUSort/src/quicksort/quicksort.cuh | 37 ++++++++++------------------- 1 file changed, 12 insertions(+), 25 deletions(-) diff --git a/GPUSort/src/quicksort/quicksort.cuh b/GPUSort/src/quicksort/quicksort.cuh index 818517d36..d5e510231 100644 --- a/GPUSort/src/quicksort/quicksort.cuh +++ b/GPUSort/src/quicksort/quicksort.cuh @@ -280,33 +280,20 @@ void QUICKSORT::sort(const Function &Cmp) break; int externMemByteSize = elemPerBlock * sizeof(int); - if (iteration % 2 == 0) - { - cudaQuickSort1stPhase - <<>>( - arr, aux, Cmp, elemPerBlock, - cuda_tasks, cuda_blockToTaskMapping); - - cudaWritePivot<<>>( + auto & task = iteration % 2 == 0? 
cuda_tasks : cuda_newTasks; + cudaQuickSort1stPhase + <<>>( arr, aux, Cmp, elemPerBlock, - cuda_tasks, - cuda_newTasks, - cuda_newTasksAmount.getData(), - cuda_2ndPhaseTasks, cuda_2ndPhaseTasksAmount.getData()); - } - else - { - cudaQuickSort1stPhase<<>>( - arr, aux, Cmp, elemPerBlock, - cuda_newTasks, cuda_blockToTaskMapping); + task, cuda_blockToTaskMapping); + + auto & newTask = iteration % 2 == 0? cuda_newTasks : cuda_tasks; + cudaWritePivot<<>>( + arr, aux, Cmp, elemPerBlock, + task, + newTask, + cuda_newTasksAmount.getData(), + cuda_2ndPhaseTasks, cuda_2ndPhaseTasksAmount.getData()); - cudaWritePivot<<>>( - arr, aux, Cmp, elemPerBlock, - cuda_newTasks, - cuda_tasks, - cuda_newTasksAmount.getData(), - cuda_2ndPhaseTasks, cuda_2ndPhaseTasksAmount.getData()); - } processNewTasks(); iteration++; } -- GitLab From cd302687873292aff3493cccf26037ac63614c46 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Thu, 1 Apr 2021 02:04:45 +0200 Subject: [PATCH 160/258] fix modulo 0 during generation --- GPUSort/benchmark/benchmarker.cpp | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/GPUSort/benchmark/benchmarker.cpp b/GPUSort/benchmark/benchmarker.cpp index 6c3d43e4d..c3fb3632b 100644 --- a/GPUSort/benchmark/benchmarker.cpp +++ b/GPUSort/benchmark/benchmarker.cpp @@ -192,7 +192,7 @@ double staggared(int size) int tmp=4096; //(RAND_MAX)/p; --> size=2048 int p= (size+tmp-1)/tmp; - const int VALUE = 16384/p; //(RAND_MAX)/p; + const int VALUE = (1<<31)/p; //(RAND_MAX)/p; int i=1; int x=0; //the array of size N is split into 'p' buckets @@ -205,10 +205,9 @@ double staggared(int size) if(i<=(p/2)) min = (2*i -1)*VALUE; - else min = (2*i-p-1)*VALUE; - + vec[x++]= min + ( rand() % (VALUE - 1) ); } i++; @@ -237,14 +236,14 @@ void start(ostream & out, string delim) vector vec(size); out << "2^" << pow << delim; - out << fixed << setprecision(3); - out << random(size) << delim; - out << shuffle(size) << delim; - out << sorted(size) << 
delim; - out << almostSorted(size) << delim; - out << decreasing(size) << delim; - out << gaussian(size) << delim; - out << bucket(size) << delim; + //out << fixed << setprecision(3); + //out << random(size) << delim; + //out << shuffle(size) << delim; + //out << sorted(size) << delim; + //out << almostSorted(size) << delim; + //out << decreasing(size) << delim; + //out << gaussian(size) << delim; + //out << bucket(size) << delim; out << staggared(size) << delim; out << zero_entropy(size); out << endl; -- GitLab From fda4519fed3c4dbc5f9116de24f0ed8c8474ef02 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Thu, 1 Apr 2021 02:05:37 +0200 Subject: [PATCH 161/258] accidental comment of tests --- GPUSort/benchmark/benchmarker.cpp | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/GPUSort/benchmark/benchmarker.cpp b/GPUSort/benchmark/benchmarker.cpp index c3fb3632b..36dab036e 100644 --- a/GPUSort/benchmark/benchmarker.cpp +++ b/GPUSort/benchmark/benchmarker.cpp @@ -205,9 +205,10 @@ double staggared(int size) if(i<=(p/2)) min = (2*i -1)*VALUE; + else min = (2*i-p-1)*VALUE; - + vec[x++]= min + ( rand() % (VALUE - 1) ); } i++; @@ -236,14 +237,14 @@ void start(ostream & out, string delim) vector vec(size); out << "2^" << pow << delim; - //out << fixed << setprecision(3); - //out << random(size) << delim; - //out << shuffle(size) << delim; - //out << sorted(size) << delim; - //out << almostSorted(size) << delim; - //out << decreasing(size) << delim; - //out << gaussian(size) << delim; - //out << bucket(size) << delim; + out << fixed << setprecision(3); + out << random(size) << delim; + out << shuffle(size) << delim; + out << sorted(size) << delim; + out << almostSorted(size) << delim; + out << decreasing(size) << delim; + out << gaussian(size) << delim; + out << bucket(size) << delim; out << staggared(size) << delim; out << zero_entropy(size); out << endl; -- GitLab From 9a8b25135967d724f01fa44a93621f9ef6ba4d1c Mon Sep 17 00:00:00 
2001 From: Xuan Thang Nguyen Date: Thu, 1 Apr 2021 02:17:41 +0200 Subject: [PATCH 162/258] debug out comment, some syncs and small fixes --- GPUSort/src/quicksort/cudaPartition.cuh | 12 +++++++ GPUSort/src/quicksort/quicksort.cuh | 43 ++++++++++++++++++------- 2 files changed, 43 insertions(+), 12 deletions(-) diff --git a/GPUSort/src/quicksort/cudaPartition.cuh b/GPUSort/src/quicksort/cudaPartition.cuh index 2171f66d4..21f5169c0 100644 --- a/GPUSort/src/quicksort/cudaPartition.cuh +++ b/GPUSort/src/quicksort/cudaPartition.cuh @@ -125,9 +125,21 @@ void copyData(ArrayView src, { int data = src[i]; if (data < pivot) + { + /* + if(smallerStart >= dst.getSize() || smallerStart < 0) + printf("failed here: b:%d t:%d: tried to write into [%d]/%d\n", blockDim.x, threadIdx.x, smallerStart, dst.getSize()); + */ dst[smallerStart++] = data; + } else if (data > pivot) + { + /* + if(biggerStart >= dst.getSize() || biggerStart < 0) + printf("failed here: b:%d t:%d: tried to write into [%d]/%d\n", blockDim.x, threadIdx.x, biggerStart, dst.getSize()); + */ dst[biggerStart++] = data; + } } } diff --git a/GPUSort/src/quicksort/quicksort.cuh b/GPUSort/src/quicksort/quicksort.cuh index d5e510231..90db6d0d7 100644 --- a/GPUSort/src/quicksort/quicksort.cuh +++ b/GPUSort/src/quicksort/quicksort.cuh @@ -20,8 +20,15 @@ __device__ void writeNewTask(int begin, int end, int depth, int pivotIdx, ArrayView secondPhaseTasks, int *secondPhaseTasksCnt) { int size = end - begin; + if(size < 0) + { + printf("negative size, something went really wrong\n"); + return; + } + if (size == 0) return; + if (size <= blockDim.x * 2) { int idx = atomicAdd(secondPhaseTasksCnt, 1); @@ -29,6 +36,7 @@ __device__ void writeNewTask(int begin, int end, int depth, int pivotIdx, secondPhaseTasks[idx] = TASK(begin, end, depth + 1, pivotIdx); else { + //printf("ran out of memory, trying backup\n"); int idx = atomicAdd(newTasksCnt, 1); if (idx < newTasks.getSize()) newTasks[idx] = TASK(begin, end, depth + 1, pivotIdx); @@ 
-43,6 +51,7 @@ __device__ void writeNewTask(int begin, int end, int depth, int pivotIdx, newTasks[idx] = TASK(begin, end, depth + 1, pivotIdx); else { + //printf("ran out of memory, trying backup\n"); int idx = atomicAdd(secondPhaseTasksCnt, 1); if (idx < secondPhaseTasks.getSize()) secondPhaseTasks[idx] = TASK(begin, end, depth + 1, pivotIdx); @@ -132,7 +141,7 @@ __global__ void cudaWritePivot(ArrayView arr, ArrayView 0) { int rightPivotIdx = pickPivotIdx((myTask.depth & 1) == 0? aux.getView(rightBegin, rightEnd) : @@ -151,6 +160,9 @@ __global__ void cudaQuickSort2ndPhase(ArrayView arr, ArrayVi ArrayView secondPhaseTasks) { TASK &myTask = secondPhaseTasks[blockIdx.x]; + if(myTask.partitionEnd - myTask.partitionBegin <= 0 ) + return; + auto arrView = arr.getView(myTask.partitionBegin, myTask.partitionEnd); auto auxView = aux.getView(myTask.partitionBegin, myTask.partitionEnd); @@ -266,12 +278,15 @@ void QUICKSORT::sort(const Function &Cmp) { while (tasksAmount > 0) { - //by partitioning with n=tasksAmount, max 2n new tasks can be created - if (2 * tasksAmount >= maxTasks) + //2ndphase task is now full or tasksAmount is full, as backup during writing, overflowing tasks were written into the other array + if (tasksAmount >= maxTasks || host_2ndPhaseTasksAmount >= maxTasks) + { + //deb("task overflow") break; + } - //2ndphase task is now full - if (host_2ndPhaseTasksAmount >= cuda_2ndPhaseTasks.getSize()) + //just in case newly created tasks wouldnt fit + if(tasksAmount*2 >= maxTasks + (maxTasks - host_2ndPhaseTasksAmount)) break; int elemPerBlock = getElemPerBlock(); @@ -298,14 +313,18 @@ void QUICKSORT::sort(const Function &Cmp) iteration++; } + auto error = cudaDeviceSynchronize(); + if(error != cudaSuccess) + deb(error); + if (tasksAmount > 0) { + auto & tasks = iteration % 2 == 0 ? cuda_tasks : cuda_newTasks; cudaStream_t s; cudaStreamCreateWithFlags(&s, cudaStreamNonBlocking); cudaQuickSort2ndPhase - <<>>(arr, aux, Cmp, - iteration % 2 == 0 ? 
cuda_tasks : cuda_newTasks); + <<>>(arr, aux, Cmp, tasks); cudaStreamDestroy(s); } @@ -315,15 +334,15 @@ void QUICKSORT::sort(const Function &Cmp) cudaStreamCreateWithFlags(&s, cudaStreamNonBlocking); cudaQuickSort2ndPhase - <<>>(arr, aux, Cmp, cuda_2ndPhaseTasks); + <<>> + (arr, aux, Cmp, cuda_2ndPhaseTasks); cudaStreamDestroy(s); } - auto error = cudaDeviceSynchronize(); + error = cudaDeviceSynchronize(); if(error != cudaSuccess) deb(error); - return; } @@ -377,9 +396,9 @@ int QUICKSORT::initTasks(int elemPerBlock) void QUICKSORT::processNewTasks() { - tasksAmount = min(cuda_newTasksAmount.getElement(0), maxTasks); + tasksAmount = cuda_newTasksAmount.getElement(0); cuda_newTasksAmount = 0; - host_2ndPhaseTasksAmount = min(cuda_2ndPhaseTasksAmount.getElement(0), maxTasks); + host_2ndPhaseTasksAmount = cuda_2ndPhaseTasksAmount.getElement(0); } //----------------------------------------------------------- -- GitLab From 2119bf9c21d2d79c4d7a5d7f0db46e8b5ec9c54c Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Thu, 1 Apr 2021 02:17:55 +0200 Subject: [PATCH 163/258] fix comment --- GPUSort/src/util/reduction.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/GPUSort/src/util/reduction.cuh b/GPUSort/src/util/reduction.cuh index 00caa521b..aeafe831d 100644 --- a/GPUSort/src/util/reduction.cuh +++ b/GPUSort/src/util/reduction.cuh @@ -41,7 +41,7 @@ __device__ int warpInclusivePrefixSum(int value) int laneId = threadIdx.x & (32-1); #pragma unroll - for (int i = 0; i < 5; i++) //iterates until x == 1<<5 == 32 which is warpSize + for (int i = 0; i < 5; i++) //iterates until x == 1<<4 == 16 which is half warpSize { int x = 1< Date: Fri, 2 Apr 2021 18:33:33 +0200 Subject: [PATCH 164/258] better synching and use default stream --- GPUSort/src/quicksort/quicksort.cuh | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/GPUSort/src/quicksort/quicksort.cuh b/GPUSort/src/quicksort/quicksort.cuh index 90db6d0d7..51d3e6202 
100644 --- a/GPUSort/src/quicksort/quicksort.cuh +++ b/GPUSort/src/quicksort/quicksort.cuh @@ -276,6 +276,8 @@ public: template void QUICKSORT::sort(const Function &Cmp) { + cudaError_t error; + while (tasksAmount > 0) { //2ndphase task is now full or tasksAmount is full, as backup during writing, overflowing tasks were written into the other array @@ -313,19 +315,17 @@ void QUICKSORT::sort(const Function &Cmp) iteration++; } - auto error = cudaDeviceSynchronize(); - if(error != cudaSuccess) + if((error = cudaDeviceSynchronize()) != cudaSuccess) + { deb(error); + return; + } if (tasksAmount > 0) { auto & tasks = iteration % 2 == 0 ? cuda_tasks : cuda_newTasks; - cudaStream_t s; - cudaStreamCreateWithFlags(&s, cudaStreamNonBlocking); - cudaQuickSort2ndPhase - <<>>(arr, aux, Cmp, tasks); - cudaStreamDestroy(s); + <<>>(arr, aux, Cmp, tasks); } if (host_2ndPhaseTasksAmount > 0) @@ -340,9 +340,12 @@ void QUICKSORT::sort(const Function &Cmp) cudaStreamDestroy(s); } - error = cudaDeviceSynchronize(); - if(error != cudaSuccess) + + if((error = cudaDeviceSynchronize()) != cudaSuccess) + { deb(error); + return; + } return; } -- GitLab From 33964fcc28b7816ca2b2e07706645cc59a2fe6d2 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Fri, 2 Apr 2021 19:20:55 +0200 Subject: [PATCH 165/258] clean up init tasks and process new tasks --- GPUSort/src/quicksort/quicksort.cuh | 20 +++++--------------- 1 file changed, 5 insertions(+), 15 deletions(-) diff --git a/GPUSort/src/quicksort/quicksort.cuh b/GPUSort/src/quicksort/quicksort.cuh index 51d3e6202..0a3906c7a 100644 --- a/GPUSort/src/quicksort/quicksort.cuh +++ b/GPUSort/src/quicksort/quicksort.cuh @@ -378,20 +378,11 @@ int QUICKSORT::initTasks(int elemPerBlock) int blocks = tasksAmount / threads + (tasksAmount % threads != 0); cuda_blockToTaskMapping_Cnt = 0; - if (iteration % 2 == 0) - { - cudaInitTask<<>>( - cuda_tasks, tasksAmount, elemPerBlock, - cuda_blockToTaskMapping_Cnt.getData(), - cuda_blockToTaskMapping); - } - else - 
{ - cudaInitTask<<>>( - cuda_newTasks, tasksAmount, elemPerBlock, - cuda_blockToTaskMapping_Cnt.getData(), - cuda_blockToTaskMapping); - } + auto &tasks = iteration % 2 == 0? cuda_tasks : cuda_newTasks; + cudaInitTask<<>>( + tasks, tasksAmount, elemPerBlock, + cuda_blockToTaskMapping_Cnt.getData(), + cuda_blockToTaskMapping); cuda_newTasksAmount.setElement(0, 0); return cuda_blockToTaskMapping_Cnt.getElement(0); @@ -400,7 +391,6 @@ int QUICKSORT::initTasks(int elemPerBlock) void QUICKSORT::processNewTasks() { tasksAmount = cuda_newTasksAmount.getElement(0); - cuda_newTasksAmount = 0; host_2ndPhaseTasksAmount = cuda_2ndPhaseTasksAmount.getElement(0); } -- GitLab From 5ad62a5c6d28d285884ae6418a9e11f4d08a3e18 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Fri, 2 Apr 2021 19:22:21 +0200 Subject: [PATCH 166/258] allow the set blockSize during init calc --- GPUSort/src/quicksort/quicksort.cuh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/GPUSort/src/quicksort/quicksort.cuh b/GPUSort/src/quicksort/quicksort.cuh index 0a3906c7a..a8b51897a 100644 --- a/GPUSort/src/quicksort/quicksort.cuh +++ b/GPUSort/src/quicksort/quicksort.cuh @@ -262,7 +262,7 @@ public: template void sort(const Function &Cmp); - int getSetsNeeded() const; + int getSetsNeeded(int elemPerBlock) const; int getElemPerBlock() const; /** @@ -349,13 +349,13 @@ void QUICKSORT::sort(const Function &Cmp) return; } -int QUICKSORT::getSetsNeeded() const +int QUICKSORT::getSetsNeeded(int elemPerBlock) const { auto view = iteration % 2 == 0 ? 
cuda_tasks.getConstView() : cuda_newTasks.getConstView(); auto fetch = [=] __cuda_callable__(int i) { auto &task = view[i]; int size = task.partitionEnd - task.partitionBegin; - return size / minElemPerBlock + (size % minElemPerBlock != 0); + return size / elemPerBlock + (size % elemPerBlock != 0); }; auto reduction = [] __cuda_callable__(int a, int b) { return a + b; }; return Algorithms::Reduction::reduce(0, tasksAmount, fetch, reduction, 0); @@ -363,7 +363,7 @@ int QUICKSORT::getSetsNeeded() const int QUICKSORT::getElemPerBlock() const { - int setsNeeded = getSetsNeeded(); + int setsNeeded = getSetsNeeded(minElemPerBlock); if (setsNeeded <= maxBlocks) return minElemPerBlock; -- GitLab From 6e1d4d287b52fe692a1a29bd655dcdf2037dc0c1 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Fri, 2 Apr 2021 21:18:32 +0200 Subject: [PATCH 167/258] set lowerbound of single block quicksort size --- GPUSort/src/quicksort/cudaPartition.cuh | 4 ++-- GPUSort/src/quicksort/quicksort.cuh | 32 ++++++++++++------------- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/GPUSort/src/quicksort/cudaPartition.cuh b/GPUSort/src/quicksort/cudaPartition.cuh index 21f5169c0..d3dfc658a 100644 --- a/GPUSort/src/quicksort/cudaPartition.cuh +++ b/GPUSort/src/quicksort/cudaPartition.cuh @@ -134,10 +134,10 @@ void copyData(ArrayView src, } else if (data > pivot) { - /* + if(biggerStart >= dst.getSize() || biggerStart < 0) printf("failed here: b:%d t:%d: tried to write into [%d]/%d\n", blockDim.x, threadIdx.x, biggerStart, dst.getSize()); - */ + dst[biggerStart++] = data; } } diff --git a/GPUSort/src/quicksort/quicksort.cuh b/GPUSort/src/quicksort/quicksort.cuh index a8b51897a..d87f4fcb3 100644 --- a/GPUSort/src/quicksort/quicksort.cuh +++ b/GPUSort/src/quicksort/quicksort.cuh @@ -15,7 +15,7 @@ using namespace TNL::Containers; //----------------------------------------------------------- -__device__ void writeNewTask(int begin, int end, int depth, int pivotIdx, +__device__ void 
writeNewTask(int begin, int end, int depth, int pivotIdx, int maxElemFor2ndPhase, ArrayView newTasks, int *newTasksCnt, ArrayView secondPhaseTasks, int *secondPhaseTasksCnt) { @@ -29,7 +29,7 @@ __device__ void writeNewTask(int begin, int end, int depth, int pivotIdx, if (size == 0) return; - if (size <= blockDim.x * 2) + if (size <= maxElemFor2ndPhase) { int idx = atomicAdd(secondPhaseTasksCnt, 1); if (idx < secondPhaseTasks.getSize()) @@ -98,7 +98,7 @@ __global__ void cudaQuickSort1stPhase(ArrayView arr, ArrayVi template __global__ void cudaWritePivot(ArrayView arr, ArrayView aux, - const Function &Cmp, int elemPerBlock, + const Function &Cmp, int maxElemFor2ndPhase, ArrayView tasks, ArrayView newTasks, int *newTasksCnt, ArrayView secondPhaseTasks, int *secondPhaseTasksCnt) @@ -138,7 +138,8 @@ __global__ void cudaWritePivot(ArrayView arr, ArrayView 0) @@ -148,7 +149,8 @@ __global__ void cudaWritePivot(ArrayView arr, ArrayView cuda_tasks, const int threadsPerBlock = 512, g_maxBlocks = 1 << 15; //32k const int g_maxTasks = 1 << 14; const int minElemPerBlock = threadsPerBlock*2; +const int maxBitonicSize = threadsPerBlock*2; +const int desired_2ndPhasElemPerBlock = maxBitonicSize*8; class QUICKSORT { @@ -277,19 +281,20 @@ template void QUICKSORT::sort(const Function &Cmp) { cudaError_t error; - + while (tasksAmount > 0) { //2ndphase task is now full or tasksAmount is full, as backup during writing, overflowing tasks were written into the other array if (tasksAmount >= maxTasks || host_2ndPhaseTasksAmount >= maxTasks) { - //deb("task overflow") break; } //just in case newly created tasks wouldnt fit if(tasksAmount*2 >= maxTasks + (maxTasks - host_2ndPhaseTasksAmount)) + { break; + } int elemPerBlock = getElemPerBlock(); int blocksCnt = initTasks(elemPerBlock); @@ -305,7 +310,7 @@ void QUICKSORT::sort(const Function &Cmp) auto & newTask = iteration % 2 == 0? 
cuda_newTasks : cuda_tasks; cudaWritePivot<<>>( - arr, aux, Cmp, elemPerBlock, + arr, aux, Cmp, desired_2ndPhasElemPerBlock, task, newTask, cuda_newTasksAmount.getData(), @@ -320,24 +325,19 @@ void QUICKSORT::sort(const Function &Cmp) deb(error); return; } - + if (tasksAmount > 0) { auto & tasks = iteration % 2 == 0 ? cuda_tasks : cuda_newTasks; cudaQuickSort2ndPhase <<>>(arr, aux, Cmp, tasks); } - + if (host_2ndPhaseTasksAmount > 0) { - cudaStream_t s; - cudaStreamCreateWithFlags(&s, cudaStreamNonBlocking); - cudaQuickSort2ndPhase - <<>> + <<>> (arr, aux, Cmp, cuda_2ndPhaseTasks); - - cudaStreamDestroy(s); } -- GitLab From 5fb8f1f83787c1ef9408b8203cef69e44beab0f9 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Fri, 2 Apr 2021 23:18:49 +0200 Subject: [PATCH 168/258] bitonic sorter inplace with fetch and swap, need to implement tests --- GPUSort/src/bitonicSort/bitonicSort.h | 65 +++++++++++++++++++++++- GPUSort/tests/bitonic_tests/unitTests.cu | 32 ++++++++++++ 2 files changed, 96 insertions(+), 1 deletion(-) diff --git a/GPUSort/src/bitonicSort/bitonicSort.h b/GPUSort/src/bitonicSort/bitonicSort.h index b62b3f3b0..4fd3b926a 100644 --- a/GPUSort/src/bitonicSort/bitonicSort.h +++ b/GPUSort/src/bitonicSort/bitonicSort.h @@ -292,4 +292,67 @@ void bitonicSort(std::vector & vec) bitonicSort(vec, [] __cuda_callable__ (const Value & a, const Value & b) {return a < b;}); } -//--------------------------------------------- \ No newline at end of file +//--------------------------------------------- +//--------------------------------------------- + +template +__global__ void bitonicMergeGlobal(int size, const FETCH & Fetch, + const CMP & Cmp, const SWAP & Swap, + int monotonicSeqLen, int len, int partsInSeq) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + + int part = i / (len / 2); //computes which sorting block this thread belongs to + + //the index of 2 elements that should be compared and swapped + int s = part * len + (i & ((len / 2) - 1) ); + int e = s + 
len / 2; + if (e >= size) //arr[e] is virtual padding and will not be exchanged with + return; + + //calculate the direction of swapping + int monotonicSeqIdx = part / partsInSeq; + bool ascending = (monotonicSeqIdx & 1) != 0; + if ((monotonicSeqIdx + 1) * monotonicSeqLen >= size) //special case for part with no "partner" to be merged with in next phase + ascending = true; + + if( (ascending == Cmp(Fetch(e), Fetch(s)))) + Swap(s, e); +} + + + +template +void bitonicSort(int begin, int end, const FETCH & Fetch, const CMP& Cmp, const SWAP & Swap) +{ + int size = end - begin; + int paddedSize = closestPow2(size); + + int threadsNeeded = size / 2 + (size %2 !=0); + + const int maxThreadsPerBlock = 512; + int threadPerBlock = maxThreadsPerBlock; + int blocks = threadsNeeded / threadPerBlock + (threadsNeeded % threadPerBlock != 0); + + auto fetchWithOffset = + [=] __cuda_callable__(int i) + { + return Fetch(i + begin); + }; + + auto swapWithOffset = + [=] __cuda_callable__(int i, int j) + { + return Swap(i+begin, i+begin); + }; + + for (int monotonicSeqLen = 2; monotonicSeqLen <= paddedSize; monotonicSeqLen *= 2) + { + for (int len = monotonicSeqLen, partsInSeq = 1; len > 1; len /= 2, partsInSeq *= 2) + { + bitonicMergeGlobal<<>>( + size, fetchWithOffset, Cmp, swapWithOffset, monotonicSeqLen, len, partsInSeq); + } + } + cudaDeviceSynchronize(); +} \ No newline at end of file diff --git a/GPUSort/tests/bitonic_tests/unitTests.cu b/GPUSort/tests/bitonic_tests/unitTests.cu index 4649c3fef..024fa56ed 100644 --- a/GPUSort/tests/bitonic_tests/unitTests.cu +++ b/GPUSort/tests/bitonic_tests/unitTests.cu @@ -239,7 +239,39 @@ TEST(sortRange, middleMultiBlock) ASSERT_TRUE(arr[e + (std::rand() % (size - e))] == -1); ASSERT_TRUE(arr.back() == -1); } +/* +void fetchAndSwapSorter(TNL::Containers::ArrayView view) +{ + + //auto Fetch = [=]__cuda_callable__(int i){return view[i];}; + //auto Cmp = [=]__cuda_callable__(const int & a, const int & b){return a < b;}; + //auto Swap = [=] 
__device__ (int i, int j){TNL::swap(view[i], view[j]);}; + //bitonicSort(0, view.getSize(), Fetch, Cmp, Swap); + +} + +TEST(fetchAndSwap, oneBlockSort) +{ + int size = 9; + const int stride = 227; + int i = 0; + + std::vector orig(size); + std::iota(orig.begin(), orig.end(), 0); + do + { + if ((i++) % stride != 0) + continue; + + TNL::Containers::Array cudaArr(orig); + auto view = cudaArr.getView(); + fetchAndSwapSorter(view); + ASSERT_TRUE(is_sorted(view)) << "result " << view << std::endl; + } + while (std::next_permutation(orig.begin(), orig.end())); +} +*/ //---------------------------------------------------------------------------------- -- GitLab From f718c1ae953217eaa4544968f894e5a556d35ea1 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Sat, 3 Apr 2021 22:12:45 +0200 Subject: [PATCH 169/258] fix fetch and add tests for bitonic --- GPUSort/src/bitonicSort/bitonicSort.h | 12 ++--- GPUSort/tests/bitonic_tests/unitTests.cu | 58 ++++++++++++++++++++---- 2 files changed, 55 insertions(+), 15 deletions(-) diff --git a/GPUSort/src/bitonicSort/bitonicSort.h b/GPUSort/src/bitonicSort/bitonicSort.h index 4fd3b926a..a066ef5d8 100644 --- a/GPUSort/src/bitonicSort/bitonicSort.h +++ b/GPUSort/src/bitonicSort/bitonicSort.h @@ -296,8 +296,8 @@ void bitonicSort(std::vector & vec) //--------------------------------------------- template -__global__ void bitonicMergeGlobal(int size, const FETCH & Fetch, - const CMP & Cmp, const SWAP & Swap, +__global__ void bitonicMergeGlobal(int size, FETCH Fetch, + const CMP & Cmp, SWAP Swap, int monotonicSeqLen, int len, int partsInSeq) { int i = blockIdx.x * blockDim.x + threadIdx.x; @@ -316,14 +316,14 @@ __global__ void bitonicMergeGlobal(int size, const FETCH & Fetch, if ((monotonicSeqIdx + 1) * monotonicSeqLen >= size) //special case for part with no "partner" to be merged with in next phase ascending = true; - if( (ascending == Cmp(Fetch(e), Fetch(s)))) + if( ascending == Cmp(Fetch(e), Fetch(s))) Swap(s, e); } template -void 
bitonicSort(int begin, int end, const FETCH & Fetch, const CMP& Cmp, const SWAP & Swap) +void bitonicSort(int begin, int end, FETCH Fetch, const CMP& Cmp, SWAP Swap) { int size = end - begin; int paddedSize = closestPow2(size); @@ -341,9 +341,9 @@ void bitonicSort(int begin, int end, const FETCH & Fetch, const CMP& Cmp, const }; auto swapWithOffset = - [=] __cuda_callable__(int i, int j) + [=] __cuda_callable__(int i, int j) mutable { - return Swap(i+begin, i+begin); + Swap(i+begin, j+begin); }; for (int monotonicSeqLen = 2; monotonicSeqLen <= paddedSize; monotonicSeqLen *= 2) diff --git a/GPUSort/tests/bitonic_tests/unitTests.cu b/GPUSort/tests/bitonic_tests/unitTests.cu index 024fa56ed..53fe8b3b5 100644 --- a/GPUSort/tests/bitonic_tests/unitTests.cu +++ b/GPUSort/tests/bitonic_tests/unitTests.cu @@ -239,15 +239,14 @@ TEST(sortRange, middleMultiBlock) ASSERT_TRUE(arr[e + (std::rand() % (size - e))] == -1); ASSERT_TRUE(arr.back() == -1); } -/* -void fetchAndSwapSorter(TNL::Containers::ArrayView view) + +template +void fetchAndSwapSorter(TNL::Containers::ArrayView view) { - - //auto Fetch = [=]__cuda_callable__(int i){return view[i];}; - //auto Cmp = [=]__cuda_callable__(const int & a, const int & b){return a < b;}; - //auto Swap = [=] __device__ (int i, int j){TNL::swap(view[i], view[j]);}; - //bitonicSort(0, view.getSize(), Fetch, Cmp, Swap); - + auto Fetch = [=]__cuda_callable__(int i){return view[i];}; + auto Cmp = [=]__cuda_callable__(const TYPE & a, const TYPE & b){return a < b;}; + auto Swap = [=] __cuda_callable__ (int i, int j) mutable {TNL::swap(view[i], view[j]);}; + bitonicSort(0, view.getSize(), Fetch, Cmp, Swap); } TEST(fetchAndSwap, oneBlockSort) @@ -271,7 +270,48 @@ TEST(fetchAndSwap, oneBlockSort) } while (std::next_permutation(orig.begin(), orig.end())); } -*/ + +TEST(fetchAndSwap, typeDouble) +{ + int size = 5; + std::vector orig(size); + std::iota(orig.begin(), orig.end(), 0); + + do + { + TNL::Containers::Array cudaArr(orig); + auto view = 
cudaArr.getView(); + fetchAndSwapSorter(view); + ASSERT_TRUE(is_sorted(view)) << "result " << view << std::endl; + } + while (std::next_permutation(orig.begin(), orig.end())); +} + +void fetchAndSwap_sortMiddle(TNL::Containers::ArrayView view, int from, int to) +{ + auto Fetch = [=]__cuda_callable__(int i){return view[i];}; + auto Cmp = [=]__cuda_callable__(const int & a, const int & b){return a < b;}; + auto Swap = [=] __cuda_callable__ (int i, int j) mutable {TNL::swap(view[i], view[j]);}; + bitonicSort(from, to, Fetch, Cmp, Swap); +} + +TEST(fetchAndSwap, sortMiddle) +{ + std::vector orig{5, 9, 4, 54, 21, 6, 7, 9, 0, 9, 42, 4}; + TNL::Containers::Array cudaArr(orig); + auto view = cudaArr.getView(); + int from = 3, to = 8; + + fetchAndSwap_sortMiddle(view, from, to); + ASSERT_TRUE(is_sorted(view.getView(3, 8))) << "result " << view << std::endl; + + for(size_t i = 0; i < orig.size(); i++) + { + if(i < from || i >= to) + ASSERT_TRUE(view.getElement(i) == orig[i]); + } +} + //---------------------------------------------------------------------------------- -- GitLab From 058057a0c9cef1b48fe64c4b702f0a43c41c1911 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Sat, 3 Apr 2021 22:43:42 +0200 Subject: [PATCH 170/258] choose pivot during task initialization instead of during writePivot --- GPUSort/src/quicksort/quicksort.cuh | 69 +++++++++++++---------------- GPUSort/src/quicksort/task.h | 7 +-- 2 files changed, 36 insertions(+), 40 deletions(-) diff --git a/GPUSort/src/quicksort/quicksort.cuh b/GPUSort/src/quicksort/quicksort.cuh index d87f4fcb3..4f9c7a168 100644 --- a/GPUSort/src/quicksort/quicksort.cuh +++ b/GPUSort/src/quicksort/quicksort.cuh @@ -15,7 +15,7 @@ using namespace TNL::Containers; //----------------------------------------------------------- -__device__ void writeNewTask(int begin, int end, int depth, int pivotIdx, int maxElemFor2ndPhase, +__device__ void writeNewTask(int begin, int end, int depth, int maxElemFor2ndPhase, ArrayView newTasks, int 
*newTasksCnt, ArrayView secondPhaseTasks, int *secondPhaseTasksCnt) { @@ -33,13 +33,13 @@ __device__ void writeNewTask(int begin, int end, int depth, int pivotIdx, int ma { int idx = atomicAdd(secondPhaseTasksCnt, 1); if (idx < secondPhaseTasks.getSize()) - secondPhaseTasks[idx] = TASK(begin, end, depth + 1, pivotIdx); + secondPhaseTasks[idx] = TASK(begin, end, depth + 1); else { //printf("ran out of memory, trying backup\n"); int idx = atomicAdd(newTasksCnt, 1); if (idx < newTasks.getSize()) - newTasks[idx] = TASK(begin, end, depth + 1, pivotIdx); + newTasks[idx] = TASK(begin, end, depth + 1); else printf("ran out of memory for second phase task, there isnt even space in newTask list\nPart of array may stay unsorted!!!\n"); } @@ -48,13 +48,13 @@ __device__ void writeNewTask(int begin, int end, int depth, int pivotIdx, int ma { int idx = atomicAdd(newTasksCnt, 1); if (idx < newTasks.getSize()) - newTasks[idx] = TASK(begin, end, depth + 1, pivotIdx); + newTasks[idx] = TASK(begin, end, depth + 1); else { //printf("ran out of memory, trying backup\n"); int idx = atomicAdd(secondPhaseTasksCnt, 1); if (idx < secondPhaseTasks.getSize()) - secondPhaseTasks[idx] = TASK(begin, end, depth + 1, pivotIdx); + secondPhaseTasks[idx] = TASK(begin, end, depth + 1); else printf("ran out of memory for newtask, there isnt even space in second phase task list\nPart of array may stay unsorted!!!\n"); } @@ -96,11 +96,8 @@ __global__ void cudaQuickSort1stPhase(ArrayView arr, ArrayVi } } -template -__global__ void cudaWritePivot(ArrayView arr, ArrayView aux, - const Function &Cmp, int maxElemFor2ndPhase, - ArrayView tasks, - ArrayView newTasks, int *newTasksCnt, +__global__ void cudaWritePivot(ArrayView arr, ArrayView aux, int maxElemFor2ndPhase, + ArrayView tasks, ArrayView newTasks, int *newTasksCnt, ArrayView secondPhaseTasks, int *secondPhaseTasksCnt) { static __shared__ int pivot; @@ -133,24 +130,18 @@ __global__ void cudaWritePivot(ArrayView arr, ArrayView 0) { - int leftPivotIdx = 
pickPivotIdx((myTask.depth & 1) == 0? - aux.getView(leftBegin, leftEnd) : - arr.getView(leftBegin, leftEnd) - , Cmp) + leftBegin; - - writeNewTask(leftBegin, leftEnd, myTask.depth, leftPivotIdx, maxElemFor2ndPhase, - newTasks, newTasksCnt, secondPhaseTasks, secondPhaseTasksCnt); + writeNewTask(leftBegin, leftEnd, myTask.depth, + maxElemFor2ndPhase, + newTasks, newTasksCnt, + secondPhaseTasks, secondPhaseTasksCnt); } if(rightEnd - rightBegin > 0) { - int rightPivotIdx = pickPivotIdx((myTask.depth & 1) == 0? - aux.getView(rightBegin, rightEnd) : - arr.getView(rightBegin, rightEnd) - , Cmp) + rightBegin; - - writeNewTask(rightBegin, rightEnd, myTask.depth, rightPivotIdx, maxElemFor2ndPhase, - newTasks, newTasksCnt, secondPhaseTasks, secondPhaseTasksCnt); + writeNewTask(rightBegin, rightEnd, + myTask.depth, maxElemFor2ndPhase, + newTasks, newTasksCnt, + secondPhaseTasks, secondPhaseTasksCnt); } } @@ -171,10 +162,11 @@ __global__ void cudaQuickSort2ndPhase(ArrayView arr, ArrayVi singleBlockQuickSort(arrView, auxView, Cmp, myTask.depth); } //----------------------------------------------------------- - +template __global__ void cudaInitTask(ArrayView cuda_tasks, int taskAmount, int elemPerBlock, int *firstAvailBlock, - ArrayView cuda_blockToTaskMapping) + ArrayView cuda_blockToTaskMapping, + ArrayView src, const Function &Cmp) { static __shared__ int avail; @@ -195,9 +187,10 @@ __global__ void cudaInitTask(ArrayView cuda_tasks, if (i < taskAmount) { + auto task = cuda_tasks[i]; int myFirstAvailBlock = avail + blocksNeeded_total - blocksNeeded; - - cuda_tasks[i].initTask(myFirstAvailBlock, blocksNeeded); + int pivotIdx = task.partitionBegin + pickPivotIdx(src.getView(task.partitionBegin, task.partitionEnd), Cmp); + cuda_tasks[i].initTask(myFirstAvailBlock, blocksNeeded, pivotIdx); for (int set = 0; set < blocksNeeded; set++) { @@ -252,7 +245,7 @@ public: cuda_blockToTaskMapping(maxBlocks * 2), cuda_blockToTaskMapping_Cnt(cudaCounters.getView(2, 3)) { - 
cuda_tasks.setElement(0, TASK(0, arr.getSize(), 0, arr.getSize()/2)); + cuda_tasks.setElement(0, TASK(0, arr.getSize(), 0)); tasksAmount = 1; host_2ndPhaseTasksAmount = 0; cuda_2ndPhaseTasksAmount = 0; @@ -272,7 +265,8 @@ public: /** * returns the amount of blocks needed * */ - int initTasks(int elemPerBlock); + template + int initTasks(int elemPerBlock, const Function & Cmp); void processNewTasks(); }; @@ -297,7 +291,7 @@ void QUICKSORT::sort(const Function &Cmp) } int elemPerBlock = getElemPerBlock(); - int blocksCnt = initTasks(elemPerBlock); + int blocksCnt = initTasks(elemPerBlock, Cmp); if(blocksCnt > cuda_blockToTaskMapping.getSize()) break; @@ -310,10 +304,8 @@ void QUICKSORT::sort(const Function &Cmp) auto & newTask = iteration % 2 == 0? cuda_newTasks : cuda_tasks; cudaWritePivot<<>>( - arr, aux, Cmp, desired_2ndPhasElemPerBlock, - task, - newTask, - cuda_newTasksAmount.getData(), + arr, aux, desired_2ndPhasElemPerBlock, + task, newTask, cuda_newTasksAmount.getData(), cuda_2ndPhaseTasks, cuda_2ndPhaseTasksAmount.getData()); processNewTasks(); @@ -372,17 +364,20 @@ int QUICKSORT::getElemPerBlock() const return setsPerBlock * minElemPerBlock; } -int QUICKSORT::initTasks(int elemPerBlock) +template +int QUICKSORT::initTasks(int elemPerBlock, const Function & Cmp) { int threads = min(tasksAmount, threadsPerBlock); int blocks = tasksAmount / threads + (tasksAmount % threads != 0); cuda_blockToTaskMapping_Cnt = 0; + auto src = iteration % 2 == 0? arr : aux.getView(); auto &tasks = iteration % 2 == 0? 
cuda_tasks : cuda_newTasks; cudaInitTask<<>>( tasks, tasksAmount, elemPerBlock, cuda_blockToTaskMapping_Cnt.getData(), - cuda_blockToTaskMapping); + cuda_blockToTaskMapping, + src, Cmp); cuda_newTasksAmount.setElement(0, 0); return cuda_blockToTaskMapping_Cnt.getElement(0); diff --git a/GPUSort/src/quicksort/task.h b/GPUSort/src/quicksort/task.h index 30168d96f..8a0b687ca 100644 --- a/GPUSort/src/quicksort/task.h +++ b/GPUSort/src/quicksort/task.h @@ -13,19 +13,20 @@ struct TASK int firstBlock, blockCount;//for workers read only values __cuda_callable__ - TASK(int begin, int end, int depth, int pivotIdx) + TASK(int begin, int end, int depth) : partitionBegin(begin), partitionEnd(end), - depth(depth), pivotIdx(pivotIdx), + depth(depth), pivotIdx(-1), dstBegin(-151561), dstEnd(-151561), firstBlock(-100), blockCount(-100) {} __cuda_callable__ - void initTask(int firstBlock, int blocks) + void initTask(int firstBlock, int blocks, int pivotIdx) { dstBegin= 0; dstEnd = partitionEnd - partitionBegin; this->firstBlock = firstBlock; blockCount = blocks; + this->pivotIdx = pivotIdx; } TASK() = default; -- GitLab From 62f3f491c3d4feca21b298892e5b7b27031885df Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Sat, 3 Apr 2021 23:48:37 +0200 Subject: [PATCH 171/258] tests to prevent lost of elements --- .../tests/quicksort_unitTests/unitTests.cu | 55 +++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/GPUSort/tests/quicksort_unitTests/unitTests.cu b/GPUSort/tests/quicksort_unitTests/unitTests.cu index 6397e3359..927a3d7f0 100644 --- a/GPUSort/tests/quicksort_unitTests/unitTests.cu +++ b/GPUSort/tests/quicksort_unitTests/unitTests.cu @@ -3,6 +3,8 @@ #include #include #include +#include +#include #include #include @@ -67,6 +69,59 @@ TEST(randomGenerated, bigArray_randomVal) } } +TEST(noLostElement, smallArray) +{ + std::srand(9151); + + int size = (1<<7); + std::vector arr(size); + for(auto & x : arr) x = std::rand(); + + TNL::Containers::Array cudaArr(arr); + 
auto view = cudaArr.getView(); + quicksort(view); + + std::sort(arr.begin(), arr.end()); + TNL::Containers::Array cudaArr2(arr); + ASSERT_TRUE(view == cudaArr2.getView()); +} + +TEST(noLostElement, midSizedArray) +{ + std::srand(91503); + + int size = (1<<15); + std::vector arr(size); + for(auto & x : arr) x = std::rand(); + + TNL::Containers::Array cudaArr(arr); + auto view = cudaArr.getView(); + quicksort(view); + + std::sort(arr.begin(), arr.end()); + TNL::Containers::Array cudaArr2(arr); + ASSERT_TRUE(view == cudaArr2.getView()); +} + +TEST(noLostElement, bigSizedArray) +{ + std::srand(15611); + + int size = (1<<22); + std::vector arr(size); + for(auto & x : arr) x = std::rand(); + for(int i = 0; i < 10000; i++) + arr[std::rand() % arr.size()] = (1<<10); + + TNL::Containers::Array cudaArr(arr); + auto view = cudaArr.getView(); + quicksort(view); + + TNL::Containers::Array cudaArr2(arr); + thrust::sort(thrust::device, cudaArr2.getData(), cudaArr2.getData() + cudaArr2.getSize()); + ASSERT_TRUE(view == cudaArr2.getView()); +} + //---------------------------------------------------------------------------------- int main(int argc, char **argv) -- GitLab From d82c4a45f2c66d569b499a17042c470f9df98d77 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Sun, 4 Apr 2021 00:18:12 +0200 Subject: [PATCH 172/258] add compare reduction to find min and max --- GPUSort/src/util/reduction.cuh | 41 +++++++++++++++++++++++++++++++++- 1 file changed, 40 insertions(+), 1 deletion(-) diff --git a/GPUSort/src/util/reduction.cuh b/GPUSort/src/util/reduction.cuh index aeafe831d..ee11a5a71 100644 --- a/GPUSort/src/util/reduction.cuh +++ b/GPUSort/src/util/reduction.cuh @@ -36,6 +36,8 @@ __device__ int blockReduceSum(int val) return shared[0]; } +//------------------------------------------------------------------------------- + __device__ int warpInclusivePrefixSum(int value) { int laneId = threadIdx.x & (32-1); @@ -71,4 +73,41 @@ __device__ int blockInclusivePrefixSum(int value) tmp 
+= shared[wid]; return tmp; -} \ No newline at end of file +} + +//-------------------------------------------------------------------- + +template +__device__ int warpCmpReduce(int initVal, const Operator & Cmp) +{ + const unsigned int maskConstant = 0xffffffff; //not used + for (unsigned int mask = warpSize / 2; mask > 0; mask >>= 1) + initVal = Cmp(initVal, __shfl_xor_sync(maskConstant, initVal, mask)); + + return initVal; +} + +template +__device__ int blockCmpReduce(int val, const Operator & Cmp) +{ + static __shared__ int shared[32]; + int lane = threadIdx.x & (warpSize - 1); + int wid = threadIdx.x / warpSize; + + val = warpCmpReduce(val, Cmp); + + if (lane == 0) + shared[wid] = val; + __syncthreads(); + + val = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : shared[0]; + + if (wid == 0) + val = warpReduceSum(val, Cmp); + + if(threadIdx.x == 0) + shared[0] = val; + __syncthreads(); + + return shared[0]; +} -- GitLab From de26bc589799d0023015f79be4f584ad06b77646 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Sun, 4 Apr 2021 00:23:30 +0200 Subject: [PATCH 173/258] fix naming --- GPUSort/src/util/reduction.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/GPUSort/src/util/reduction.cuh b/GPUSort/src/util/reduction.cuh index ee11a5a71..12ea62285 100644 --- a/GPUSort/src/util/reduction.cuh +++ b/GPUSort/src/util/reduction.cuh @@ -103,7 +103,7 @@ __device__ int blockCmpReduce(int val, const Operator & Cmp) val = (threadIdx.x < blockDim.x / warpSize) ? 
shared[lane] : shared[0]; if (wid == 0) - val = warpReduceSum(val, Cmp); + val = warpCmpReduce(val, Cmp); if(threadIdx.x == 0) shared[0] = val; -- GitLab From a5ec78acdfc4b79ec6605d0b92f4aa4df5475a17 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Sun, 4 Apr 2021 19:03:07 +0200 Subject: [PATCH 174/258] change value to fix cederman breaking --- GPUSort/benchmark/benchmarker.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/GPUSort/benchmark/benchmarker.cpp b/GPUSort/benchmark/benchmarker.cpp index 36dab036e..e5856b920 100644 --- a/GPUSort/benchmark/benchmarker.cpp +++ b/GPUSort/benchmark/benchmarker.cpp @@ -192,7 +192,7 @@ double staggared(int size) int tmp=4096; //(RAND_MAX)/p; --> size=2048 int p= (size+tmp-1)/tmp; - const int VALUE = (1<<31)/p; //(RAND_MAX)/p; + const int VALUE = (1<<30)/p; //(RAND_MAX)/p; int i=1; int x=0; //the array of size N is split into 'p' buckets -- GitLab From 735449938e177cac3ad794336c767c08b4ff2ef4 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Sun, 4 Apr 2021 19:18:45 +0200 Subject: [PATCH 175/258] flushing after every calc --- GPUSort/benchmark/benchmarker.cpp | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/GPUSort/benchmark/benchmarker.cpp b/GPUSort/benchmark/benchmarker.cpp index e5856b920..0e3dac93c 100644 --- a/GPUSort/benchmark/benchmarker.cpp +++ b/GPUSort/benchmark/benchmarker.cpp @@ -238,14 +238,31 @@ void start(ostream & out, string delim) out << "2^" << pow << delim; out << fixed << setprecision(3); + out << random(size) << delim; + out.flush(); + out << shuffle(size) << delim; + out.flush(); + out << sorted(size) << delim; + out.flush(); + out << almostSorted(size) << delim; + out.flush(); + out << decreasing(size) << delim; + out.flush(); + out << gaussian(size) << delim; + out.flush(); + out << bucket(size) << delim; + out.flush(); + out << staggared(size) << delim; + out.flush(); + out << zero_entropy(size); out << endl; } -- GitLab From 
dda2f497831274bf82b49d66c6251154f227df95 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Mon, 5 Apr 2021 14:02:17 +0200 Subject: [PATCH 176/258] proper checking --- GPUSort/src/quicksort/quicksort.cuh | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/GPUSort/src/quicksort/quicksort.cuh b/GPUSort/src/quicksort/quicksort.cuh index 4f9c7a168..5e0562046 100644 --- a/GPUSort/src/quicksort/quicksort.cuh +++ b/GPUSort/src/quicksort/quicksort.cuh @@ -251,9 +251,7 @@ public: cuda_2ndPhaseTasksAmount = 0; iteration = 0; - auto error = cudaGetLastError(); - if(error != cudaSuccess) - deb(error); + TNL_CHECK_CUDA_DEVICE; } template @@ -274,7 +272,6 @@ public: template void QUICKSORT::sort(const Function &Cmp) { - cudaError_t error; while (tasksAmount > 0) { @@ -295,12 +292,17 @@ void QUICKSORT::sort(const Function &Cmp) if(blocksCnt > cuda_blockToTaskMapping.getSize()) break; + TNL_CHECK_CUDA_DEVICE; + int externMemByteSize = elemPerBlock * sizeof(int); auto & task = iteration % 2 == 0? cuda_tasks : cuda_newTasks; + cudaQuickSort1stPhase <<>>( arr, aux, Cmp, elemPerBlock, task, cuda_blockToTaskMapping); + + TNL_CHECK_CUDA_DEVICE; auto & newTask = iteration % 2 == 0? 
cuda_newTasks : cuda_tasks; cudaWritePivot<<>>( @@ -308,15 +310,14 @@ void QUICKSORT::sort(const Function &Cmp) task, newTask, cuda_newTasksAmount.getData(), cuda_2ndPhaseTasks, cuda_2ndPhaseTasksAmount.getData()); + TNL_CHECK_CUDA_DEVICE; + processNewTasks(); iteration++; } - if((error = cudaDeviceSynchronize()) != cudaSuccess) - { - deb(error); - return; - } + cudaDeviceSynchronize(); + TNL_CHECK_CUDA_DEVICE; if (tasksAmount > 0) { @@ -324,6 +325,7 @@ void QUICKSORT::sort(const Function &Cmp) cudaQuickSort2ndPhase <<>>(arr, aux, Cmp, tasks); } + TNL_CHECK_CUDA_DEVICE; if (host_2ndPhaseTasksAmount > 0) { @@ -331,13 +333,10 @@ void QUICKSORT::sort(const Function &Cmp) <<>> (arr, aux, Cmp, cuda_2ndPhaseTasks); } + TNL_CHECK_CUDA_DEVICE; - - if((error = cudaDeviceSynchronize()) != cudaSuccess) - { - deb(error); - return; - } + cudaDeviceSynchronize(); + TNL_CHECK_CUDA_DEVICE; return; } -- GitLab From a122d092e8f9421e99f8ac2fa6247cde322bbb91 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Mon, 5 Apr 2021 14:23:01 +0200 Subject: [PATCH 177/258] track max sharedMem size --- GPUSort/src/quicksort/cudaPartition.cuh | 46 +++++++++++--- GPUSort/src/quicksort/quicksort.cuh | 82 +++++++++++++++++-------- 2 files changed, 95 insertions(+), 33 deletions(-) diff --git a/GPUSort/src/quicksort/cudaPartition.cuh b/GPUSort/src/quicksort/cudaPartition.cuh index d3dfc658a..92d9f0579 100644 --- a/GPUSort/src/quicksort/cudaPartition.cuh +++ b/GPUSort/src/quicksort/cudaPartition.cuh @@ -146,7 +146,7 @@ void copyData(ArrayView src, //---------------------------------------------------------------------------------- template -__device__ void cudaPartition(ArrayView src, +__device__ void cudaPartition_1(ArrayView src, ArrayView dst, int * sharedMem, const Function &Cmp, const int & pivot, @@ -180,15 +180,47 @@ __device__ void cudaPartition(ArrayView src, //----------------------------------------------------------- - /* - int destSmaller = smallerStart + smallerPrefSumInc - smaller; 
- int destBigger = biggerStart + biggerPrefSumInc - bigger; - copyData(srcView, dst, destSmaller, destBigger, pivot); - */ - copyDataShared(srcView, dst, sharedMem, smallerStart, biggerStart, smallerTotal, biggerTotal, smallerPrefSumInc - smaller, biggerPrefSumInc - bigger, //exclusive prefix sum of elements pivot); +} + +//------------------------------------------------------------------ + +template +__device__ void cudaPartition_2(ArrayView src, + ArrayView dst, + const Function &Cmp, const int & pivot, + int elemPerBlock, TASK & task + ) +{ + static __shared__ int smallerStart, biggerStart; + + int myBegin = elemPerBlock * (blockIdx.x - task.firstBlock); + int myEnd = TNL::min(myBegin + elemPerBlock, src.getSize()); + + auto srcView = src.getView(myBegin, myEnd); + + //------------------------------------------------------------------------- + + int smaller = 0, bigger = 0; + countElem(srcView, smaller, bigger, pivot); + + int smallerPrefSumInc = blockInclusivePrefixSum(smaller); + int biggerPrefSumInc = blockInclusivePrefixSum(bigger); + + if (threadIdx.x == blockDim.x - 1) //last thread in block has sum of all values + { + smallerStart = atomicAdd(&(task.dstBegin), smallerPrefSumInc); + biggerStart = atomicAdd(&(task.dstEnd), -biggerPrefSumInc) - biggerPrefSumInc; + } + __syncthreads(); + + //----------------------------------------------------------- + + int destSmaller = smallerStart + smallerPrefSumInc - smaller; + int destBigger = biggerStart + biggerPrefSumInc - bigger; + copyData(srcView, dst, destSmaller, destBigger, pivot); } \ No newline at end of file diff --git a/GPUSort/src/quicksort/quicksort.cuh b/GPUSort/src/quicksort/quicksort.cuh index 5e0562046..38bd88ad4 100644 --- a/GPUSort/src/quicksort/quicksort.cuh +++ b/GPUSort/src/quicksort/quicksort.cuh @@ -60,42 +60,60 @@ __device__ void writeNewTask(int begin, int end, int depth, int maxElemFor2ndPha } } } + //---------------------------------------------------- template -__global__ void 
cudaQuickSort1stPhase(ArrayView arr, ArrayView aux, +__global__ void cudaQuickSort1stPhase_1(ArrayView arr, ArrayView aux, const Function &Cmp, int elemPerBlock, ArrayView tasks, ArrayView taskMapping) { extern __shared__ int externMem[]; int *sharedMem = externMem; - int pivot; + + static __shared__ int pivot; + TASK &myTask = tasks[taskMapping[blockIdx.x]]; + auto & src = (myTask.depth & 1) == 0? arr : aux; + auto & dst = (myTask.depth & 1) == 0? aux : arr; - if ((myTask.depth & 1) == 0) - pivot = arr[myTask.pivotIdx]; - else - pivot = aux[myTask.pivotIdx]; + if (threadIdx.x == 0) + pivot = src[myTask.pivotIdx]; + __syncthreads(); - if ((myTask.depth & 1) == 0) - { - cudaPartition( - arr.getView(myTask.partitionBegin, myTask.partitionEnd), - aux.getView(myTask.partitionBegin, myTask.partitionEnd), - sharedMem, - Cmp, pivot, elemPerBlock, myTask); - } - else - { - cudaPartition( - aux.getView(myTask.partitionBegin, myTask.partitionEnd), - arr.getView(myTask.partitionBegin, myTask.partitionEnd), - sharedMem, - Cmp, pivot, elemPerBlock, myTask); - } + cudaPartition_1( + src.getView(myTask.partitionBegin, myTask.partitionEnd), + dst.getView(myTask.partitionBegin, myTask.partitionEnd), + sharedMem, + Cmp, pivot, elemPerBlock, myTask); +} + +template +__global__ void cudaQuickSort1stPhase_2(ArrayView arr, ArrayView aux, + const Function &Cmp, int elemPerBlock, + ArrayView tasks, + ArrayView taskMapping) +{ + static __shared__ int pivot; + + TASK &myTask = tasks[taskMapping[blockIdx.x]]; + auto & src = (myTask.depth & 1) == 0? arr : aux; + auto & dst = (myTask.depth & 1) == 0? 
aux : arr; + + if (threadIdx.x == 0) + pivot = src[myTask.pivotIdx]; + __syncthreads(); + + cudaPartition_2( + src.getView(myTask.partitionBegin, myTask.partitionEnd), + dst.getView(myTask.partitionBegin, myTask.partitionEnd), + Cmp, pivot, elemPerBlock, myTask); } +//---------------------------------------------------- + + __global__ void cudaWritePivot(ArrayView arr, ArrayView aux, int maxElemFor2ndPhase, ArrayView tasks, ArrayView newTasks, int *newTasksCnt, ArrayView secondPhaseTasks, int *secondPhaseTasksCnt) @@ -231,7 +249,8 @@ class QUICKSORT ArrayView cuda_blockToTaskMapping_Cnt; //is in reality 1 integer int iteration = 0; - + //-------------------------------------------------------------------------------------- + cudaDeviceProp deviceProp; //-------------------------------------------------------------------------------------- public: QUICKSORT(ArrayView _arr) @@ -251,6 +270,7 @@ public: cuda_2ndPhaseTasksAmount = 0; iteration = 0; + cudaGetDeviceProperties(&deviceProp, 0); //change device TNL_CHECK_CUDA_DEVICE; } @@ -297,10 +317,20 @@ void QUICKSORT::sort(const Function &Cmp) int externMemByteSize = elemPerBlock * sizeof(int); auto & task = iteration % 2 == 0? 
cuda_tasks : cuda_newTasks; - cudaQuickSort1stPhase - <<>>( - arr, aux, Cmp, elemPerBlock, + if(externMemByteSize <= deviceProp.sharedMemPerBlock) + { + cudaQuickSort1stPhase_1 + <<>>( + arr, aux, Cmp, elemPerBlock, + task, cuda_blockToTaskMapping); + } + else + { + cudaQuickSort1stPhase_2 + <<>>( + arr, aux, Cmp, elemPerBlock, task, cuda_blockToTaskMapping); + } TNL_CHECK_CUDA_DEVICE; -- GitLab From 9cf49553a23ff85938a9e4e799ed32d4bb68aea6 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Mon, 5 Apr 2021 15:24:30 +0200 Subject: [PATCH 178/258] roll back to 88782e514fffa0c135adc798457b6706277ab9f9 for warp scan --- GPUSort/src/util/reduction.cuh | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/GPUSort/src/util/reduction.cuh b/GPUSort/src/util/reduction.cuh index 12ea62285..47a934001 100644 --- a/GPUSort/src/util/reduction.cuh +++ b/GPUSort/src/util/reduction.cuh @@ -43,11 +43,10 @@ __device__ int warpInclusivePrefixSum(int value) int laneId = threadIdx.x & (32-1); #pragma unroll - for (int i = 0; i < 5; i++) //iterates until x == 1<<4 == 16 which is half warpSize + for (int i = 1; i*2 <= 32; i *= 2)//32 here is warp size { - int x = 1<= x) + int n = __shfl_up_sync(0xffffffff, value, i); + if ((laneId & (warpSize - 1)) >= i) value += n; } -- GitLab From 353b3a251eda1f462c824668972e27a486b98980 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Mon, 5 Apr 2021 16:56:35 +0200 Subject: [PATCH 179/258] add comments and reorder pivot calc place --- GPUSort/src/quicksort/quicksort_1Block.cuh | 37 ++++++++++++---------- 1 file changed, 21 insertions(+), 16 deletions(-) diff --git a/GPUSort/src/quicksort/quicksort_1Block.cuh b/GPUSort/src/quicksort/quicksort_1Block.cuh index 098cabbf5..2828b2ea3 100644 --- a/GPUSort/src/quicksort/quicksort_1Block.cuh +++ b/GPUSort/src/quicksort/quicksort_1Block.cuh @@ -78,8 +78,8 @@ __device__ void singleBlockQuickSort(ArrayView arr, { static __shared__ int stackTop; static __shared__ int 
stackArrBegin[stackSize], stackArrEnd[stackSize], stackDepth[stackSize]; - static __shared__ int begin, end, depth,pivotBegin, pivotEnd; - static __shared__ int pivot; + static __shared__ int begin, end, depth; + static __shared__ int pivot, pivotBegin, pivotEnd; if (threadIdx.x == 0) { @@ -93,53 +93,58 @@ __device__ void singleBlockQuickSort(ArrayView arr, while(stackTop > 0) { + //pick up partition to break up if (threadIdx.x == 0) { begin = stackArrBegin[stackTop-1]; end = stackArrEnd[stackTop-1]; depth = stackDepth[stackTop-1]; stackTop--; - pivot = pickPivot((depth&1) == 0? - arr.getView(begin, end) : - aux.getView(begin, end), - Cmp - ); } __syncthreads(); int size = end - begin; - auto src = (depth&1) == 0 ? arr.getView(begin, end) : aux.getView(begin, end); - auto dst = (depth&1) == 0 ? aux.getView(begin, end) : arr.getView(begin, end); + auto &src = (depth&1) == 0 ? arr : aux; + //small enough for for bitonic if(size <= blockDim.x*2) { - externSort(src, arr.getView(begin, end), Cmp); + externSort(src.getView(begin, end), arr.getView(begin, end), Cmp); __syncthreads(); continue; } + //------------------------------------------------------ + + //actually do partitioning from here on out + if(threadIdx.x == 0) + pivot = pickPivot(src.getView(begin, end),Cmp); + __syncthreads(); int smaller = 0, bigger = 0; - countElem(src, smaller, bigger, pivot); + countElem(src.getView(begin, end), smaller, bigger, pivot); + //synchronization is in this function already int smallerOffset = blockInclusivePrefixSum(smaller); int biggerOffset = blockInclusivePrefixSum(bigger); - if (threadIdx.x == blockDim.x - 1) + if (threadIdx.x == blockDim.x - 1) //has sum of all smaller and greater elements than pivot in src { - pivotBegin = smallerOffset; + pivotBegin = 0 + smallerOffset; pivotEnd = size - biggerOffset; } __syncthreads(); - int destSmaller = 0 + smallerOffset - smaller; + int destSmaller = 0 + (smallerOffset - smaller); int destBigger = pivotEnd + (biggerOffset - bigger); 
+ auto &dst = (depth&1) == 0 ? aux : arr; - copyData(src, dst, destSmaller, destBigger, pivot); + copyData(src.getView(begin, end), dst.getView(begin, end), destSmaller, destBigger, pivot); __syncthreads(); for (int i = pivotBegin + threadIdx.x; i < pivotEnd; i += blockDim.x) arr[begin + i] = pivot; + //creates new tasks if(threadIdx.x == 0) { stackPush(stackArrBegin, stackArrEnd, stackDepth, stackTop, @@ -147,6 +152,6 @@ __device__ void singleBlockQuickSort(ArrayView arr, begin +pivotEnd, end, depth); } - __syncthreads(); + __syncthreads(); //sync to update stackTop } //ends while loop } \ No newline at end of file -- GitLab From 8f3146f4f098383d1e5e6b8ecc3b62788578e004 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Mon, 5 Apr 2021 17:18:55 +0200 Subject: [PATCH 180/258] refactor out variables and better error checking --- GPUSort/src/quicksort/quicksort.cuh | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/GPUSort/src/quicksort/quicksort.cuh b/GPUSort/src/quicksort/quicksort.cuh index 38bd88ad4..c579a36d4 100644 --- a/GPUSort/src/quicksort/quicksort.cuh +++ b/GPUSort/src/quicksort/quicksort.cuh @@ -238,15 +238,14 @@ class QUICKSORT Array aux; int maxTasks, maxBlocks; Array cuda_tasks, cuda_newTasks, cuda_2ndPhaseTasks; - Array cudaCounters; - ArrayView cuda_newTasksAmount, cuda_2ndPhaseTasksAmount; //is in reality 1 integer + Array cuda_newTasksAmount, cuda_2ndPhaseTasksAmount; //is in reality 1 integer int tasksAmount; //counter for Host == cuda_newTasksAmount int host_2ndPhaseTasksAmount; // cuda_2ndPhaseTasksAmount Array cuda_blockToTaskMapping; - ArrayView cuda_blockToTaskMapping_Cnt; //is in reality 1 integer + Array cuda_blockToTaskMapping_Cnt; //is in reality 1 integer int iteration = 0; //-------------------------------------------------------------------------------------- @@ -258,11 +257,10 @@ public: maxTasks(min(arr.getSize(), g_maxTasks)), maxBlocks(g_maxBlocks), cuda_tasks(maxTasks), 
cuda_newTasks(maxTasks), cuda_2ndPhaseTasks(maxTasks), - cudaCounters(3), - cuda_newTasksAmount(cudaCounters.getView(0, 1)), - cuda_2ndPhaseTasksAmount(cudaCounters.getView(1, 2)), + cuda_newTasksAmount(1), + cuda_2ndPhaseTasksAmount(1), cuda_blockToTaskMapping(maxBlocks * 2), - cuda_blockToTaskMapping_Cnt(cudaCounters.getView(2, 3)) + cuda_blockToTaskMapping_Cnt(1) { cuda_tasks.setElement(0, TASK(0, arr.getSize(), 0)); tasksAmount = 1; @@ -335,7 +333,7 @@ void QUICKSORT::sort(const Function &Cmp) TNL_CHECK_CUDA_DEVICE; auto & newTask = iteration % 2 == 0? cuda_newTasks : cuda_tasks; - cudaWritePivot<<>>( + cudaWritePivot<<>>( arr, aux, desired_2ndPhasElemPerBlock, task, newTask, cuda_newTasksAmount.getData(), cuda_2ndPhaseTasks, cuda_2ndPhaseTasksAmount.getData()); @@ -345,26 +343,26 @@ void QUICKSORT::sort(const Function &Cmp) processNewTasks(); iteration++; } - - cudaDeviceSynchronize(); - TNL_CHECK_CUDA_DEVICE; if (tasksAmount > 0) { auto & tasks = iteration % 2 == 0 ? cuda_tasks : cuda_newTasks; cudaQuickSort2ndPhase <<>>(arr, aux, Cmp, tasks); + + TNL_CHECK_CUDA_DEVICE; + cudaDeviceSynchronize(); + TNL_CHECK_CUDA_DEVICE; } - TNL_CHECK_CUDA_DEVICE; if (host_2ndPhaseTasksAmount > 0) { cudaQuickSort2ndPhase <<>> (arr, aux, Cmp, cuda_2ndPhaseTasks); - } - TNL_CHECK_CUDA_DEVICE; + TNL_CHECK_CUDA_DEVICE; + } cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; return; -- GitLab From 0ba39e3c161be6db07d666a0468fc942dd27e003 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Mon, 5 Apr 2021 18:20:34 +0200 Subject: [PATCH 181/258] improve task initialization --- GPUSort/src/quicksort/quicksort.cuh | 96 ++++++++++++++++------------- 1 file changed, 52 insertions(+), 44 deletions(-) diff --git a/GPUSort/src/quicksort/quicksort.cuh b/GPUSort/src/quicksort/quicksort.cuh index c579a36d4..2e9df8997 100644 --- a/GPUSort/src/quicksort/quicksort.cuh +++ b/GPUSort/src/quicksort/quicksort.cuh @@ -8,6 +8,9 @@ #include "../bitonicSort/bitonicSort.h" #include +#include +#include + 
#define deb(x) std::cout << #x << " = " << x << std::endl; using namespace TNL; @@ -179,51 +182,42 @@ __global__ void cudaQuickSort2ndPhase(ArrayView arr, ArrayVi singleBlockQuickSort(arrView, auxView, Cmp, myTask.depth); } + //----------------------------------------------------------- + +__global__ void cudaCalcBlocksNeeded(ArrayView cuda_tasks, int elemPerBlock, + ArrayView blocksNeeded) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + if(i >= cuda_tasks.getSize()) + return; + + auto task = cuda_tasks[i]; + int size = task.partitionEnd - task.partitionBegin; + blocksNeeded[i] = size / elemPerBlock + (size % elemPerBlock != 0); +} + template -__global__ void cudaInitTask(ArrayView cuda_tasks, - int taskAmount, int elemPerBlock, int *firstAvailBlock, +__global__ void cudaInitTask2(ArrayView cuda_tasks, int elemPerBlock, ArrayView cuda_blockToTaskMapping, + ArrayView cuda_reductionTaskInitMem, ArrayView src, const Function &Cmp) { - static __shared__ int avail; - - int i = blockDim.x * blockIdx.x + threadIdx.x; - int blocksNeeded = 0; - - if (i < taskAmount) - { - auto task = cuda_tasks[i]; - int size = task.partitionEnd - task.partitionBegin; - blocksNeeded = size / elemPerBlock + (size % elemPerBlock != 0); - } + if(blockIdx.x >= cuda_tasks.getSize()) + return; - int blocksNeeded_total = blockInclusivePrefixSum(blocksNeeded); - if (threadIdx.x == blockDim.x - 1) - avail = atomicAdd(firstAvailBlock, blocksNeeded_total); - __syncthreads(); + int start = blockIdx.x == 0? 
0 : cuda_reductionTaskInitMem[blockIdx.x -1]; + int end = cuda_reductionTaskInitMem[blockIdx.x]; + for(int i = start + threadIdx.x; i < end; i += blockDim.x) + cuda_blockToTaskMapping[i] = blockIdx.x; - if (i < taskAmount) + if(threadIdx.x == 0) { - auto task = cuda_tasks[i]; - int myFirstAvailBlock = avail + blocksNeeded_total - blocksNeeded; + TASK & task = cuda_tasks[blockIdx.x]; int pivotIdx = task.partitionBegin + pickPivotIdx(src.getView(task.partitionBegin, task.partitionEnd), Cmp); - cuda_tasks[i].initTask(myFirstAvailBlock, blocksNeeded, pivotIdx); - - for (int set = 0; set < blocksNeeded; set++) - { - if(myFirstAvailBlock >= cuda_blockToTaskMapping.getSize()) - { - printf("ran out of memory for mapping\n"); - } - else - { - cuda_blockToTaskMapping[myFirstAvailBlock++] = i; - } - } + task.initTask(start, end-start, pivotIdx); } } - //----------------------------------------------------------- //----------------------------------------------------------- const int threadsPerBlock = 512, g_maxBlocks = 1 << 15; //32k @@ -245,7 +239,7 @@ class QUICKSORT int host_2ndPhaseTasksAmount; // cuda_2ndPhaseTasksAmount Array cuda_blockToTaskMapping; - Array cuda_blockToTaskMapping_Cnt; //is in reality 1 integer + Array cuda_reductionTaskInitMem; int iteration = 0; //-------------------------------------------------------------------------------------- @@ -260,7 +254,7 @@ public: cuda_newTasksAmount(1), cuda_2ndPhaseTasksAmount(1), cuda_blockToTaskMapping(maxBlocks * 2), - cuda_blockToTaskMapping_Cnt(1) + cuda_reductionTaskInitMem(maxTasks) { cuda_tasks.setElement(0, TASK(0, arr.getSize(), 0)); tasksAmount = 1; @@ -307,7 +301,7 @@ void QUICKSORT::sort(const Function &Cmp) int elemPerBlock = getElemPerBlock(); int blocksCnt = initTasks(elemPerBlock, Cmp); - if(blocksCnt > cuda_blockToTaskMapping.getSize()) + if(blocksCnt >= cuda_blockToTaskMapping.getSize()) break; TNL_CHECK_CUDA_DEVICE; @@ -396,18 +390,32 @@ int QUICKSORT::initTasks(int elemPerBlock, const Function & 
Cmp) { int threads = min(tasksAmount, threadsPerBlock); int blocks = tasksAmount / threads + (tasksAmount % threads != 0); - cuda_blockToTaskMapping_Cnt = 0; auto src = iteration % 2 == 0? arr : aux.getView(); auto &tasks = iteration % 2 == 0? cuda_tasks : cuda_newTasks; - cudaInitTask<<>>( - tasks, tasksAmount, elemPerBlock, - cuda_blockToTaskMapping_Cnt.getData(), - cuda_blockToTaskMapping, - src, Cmp); + + //[i] == how many blocks task i needs + cudaCalcBlocksNeeded<<>>(tasks.getView(0, tasksAmount), + elemPerBlock, cuda_reductionTaskInitMem.getView(0, tasksAmount)); + + thrust::inclusive_scan(thrust::device, + cuda_reductionTaskInitMem.getData(), + cuda_reductionTaskInitMem.getData() + tasksAmount, + cuda_reductionTaskInitMem.getData()); + + int blocksNeeded = cuda_reductionTaskInitMem.getElement(tasksAmount - 1); + if(blocksNeeded >= cuda_blockToTaskMapping.getSize()) + return blocksNeeded; + + cudaInitTask2<<>>( + tasks.getView(0, tasksAmount), elemPerBlock, + cuda_blockToTaskMapping.getView(0, blocksNeeded), + cuda_reductionTaskInitMem.getView(0, tasksAmount), + src, Cmp + ); cuda_newTasksAmount.setElement(0, 0); - return cuda_blockToTaskMapping_Cnt.getElement(0); + return blocksNeeded; } void QUICKSORT::processNewTasks() -- GitLab From 167814500d6ea684c744155865cde4982f6e223f Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Mon, 5 Apr 2021 18:43:25 +0200 Subject: [PATCH 182/258] small variable changes --- GPUSort/src/quicksort/quicksort.cuh | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/GPUSort/src/quicksort/quicksort.cuh b/GPUSort/src/quicksort/quicksort.cuh index 2e9df8997..3b1a38ad6 100644 --- a/GPUSort/src/quicksort/quicksort.cuh +++ b/GPUSort/src/quicksort/quicksort.cuh @@ -198,7 +198,7 @@ __global__ void cudaCalcBlocksNeeded(ArrayView cuda_tasks, } template -__global__ void cudaInitTask2(ArrayView cuda_tasks, int elemPerBlock, +__global__ void cudaInitTask(ArrayView cuda_tasks, ArrayView cuda_blockToTaskMapping, 
ArrayView cuda_reductionTaskInitMem, ArrayView src, const Function &Cmp) @@ -253,7 +253,7 @@ public: cuda_tasks(maxTasks), cuda_newTasks(maxTasks), cuda_2ndPhaseTasks(maxTasks), cuda_newTasksAmount(1), cuda_2ndPhaseTasksAmount(1), - cuda_blockToTaskMapping(maxBlocks * 2), + cuda_blockToTaskMapping(maxBlocks), cuda_reductionTaskInitMem(maxTasks) { cuda_tasks.setElement(0, TASK(0, arr.getSize(), 0)); @@ -404,11 +404,12 @@ int QUICKSORT::initTasks(int elemPerBlock, const Function & Cmp) cuda_reductionTaskInitMem.getData()); int blocksNeeded = cuda_reductionTaskInitMem.getElement(tasksAmount - 1); + //need too many blocks, give back control if(blocksNeeded >= cuda_blockToTaskMapping.getSize()) return blocksNeeded; - cudaInitTask2<<>>( - tasks.getView(0, tasksAmount), elemPerBlock, + cudaInitTask<<>>( + tasks.getView(0, tasksAmount), cuda_blockToTaskMapping.getView(0, blocksNeeded), cuda_reductionTaskInitMem.getView(0, tasksAmount), src, Cmp -- GitLab From 08be407cacbdcdf33c32d482615efbe103f55c34 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Tue, 6 Apr 2021 18:42:37 +0200 Subject: [PATCH 183/258] switch to bitonic in the beginning if possible --- GPUSort/src/quicksort/quicksort_1Block.cuh | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/GPUSort/src/quicksort/quicksort_1Block.cuh b/GPUSort/src/quicksort/quicksort_1Block.cuh index 2828b2ea3..4ee834ffc 100644 --- a/GPUSort/src/quicksort/quicksort_1Block.cuh +++ b/GPUSort/src/quicksort/quicksort_1Block.cuh @@ -76,6 +76,13 @@ __device__ void singleBlockQuickSort(ArrayView arr, ArrayView aux, const Function & Cmp, int _depth) { + if(arr.getSize() <= blockDim.x*2) + { + auto src = (_depth &1) == 0? 
arr : aux; + externSort(src, arr, Cmp); + return; + } + static __shared__ int stackTop; static __shared__ int stackArrBegin[stackSize], stackArrEnd[stackSize], stackDepth[stackSize]; static __shared__ int begin, end, depth; -- GitLab From 28f085aa590f594c6ea5c6ee3c4991ec30e3cb4a Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Tue, 6 Apr 2021 22:22:46 +0200 Subject: [PATCH 184/258] sorting different types --- GPUSort/src/quicksort/cudaPartition.cuh | 45 ++++++----- GPUSort/src/quicksort/quicksort.cuh | 76 ++++++++++--------- GPUSort/src/quicksort/quicksort_1Block.cuh | 21 ++--- .../tests/quicksort_unitTests/unitTests.cu | 19 +++++ 4 files changed, 96 insertions(+), 65 deletions(-) diff --git a/GPUSort/src/quicksort/cudaPartition.cuh b/GPUSort/src/quicksort/cudaPartition.cuh index 92d9f0579..16fb8c5cc 100644 --- a/GPUSort/src/quicksort/cudaPartition.cuh +++ b/GPUSort/src/quicksort/cudaPartition.cuh @@ -73,32 +73,34 @@ __device__ Value pickPivotIdx(TNL::Containers::ArrayView src, con } } +template __device__ -void countElem(ArrayView arr, +void countElem(ArrayView arr, int &smaller, int &bigger, - const int &pivot) + const Value &pivot) { for (int i = threadIdx.x; i < arr.getSize(); i += blockDim.x) { - int data = arr[i]; + const Value data = arr[i]; smaller += (data < pivot); bigger += (data > pivot); } } +template __device__ -void copyDataShared(ArrayView src, - ArrayView dst, - int *sharedMem, +void copyDataShared(ArrayView src, + ArrayView dst, + Value *sharedMem, int smallerStart, int biggerStart, int smallerTotal, int biggerTotal, int smallerOffset, int biggerOffset, //exclusive prefix sum of elements - const int &pivot) + const Value &pivot) { for (int i = threadIdx.x; i < src.getSize(); i += blockDim.x) { - int data = src[i]; + const Value data = src[i]; if (data < pivot) sharedMem[smallerOffset++] = data; else if (data > pivot) @@ -115,15 +117,16 @@ void copyDataShared(ArrayView src, } } +template __device__ -void copyData(ArrayView src, - ArrayView 
dst, +void copyData(ArrayView src, + ArrayView dst, int smallerStart, int biggerStart, - const int &pivot) + const Value &pivot) { for (int i = threadIdx.x; i < src.getSize(); i += blockDim.x) { - int data = src[i]; + const Value data = src[i]; if (data < pivot) { /* @@ -145,11 +148,11 @@ void copyData(ArrayView src, //---------------------------------------------------------------------------------- -template -__device__ void cudaPartition_1(ArrayView src, - ArrayView dst, - int * sharedMem, - const Function &Cmp, const int & pivot, +template +__device__ void cudaPartition_1(ArrayView src, + ArrayView dst, + Value * sharedMem, + const Function &Cmp, const Value & pivot, int elemPerBlock, TASK & task ) { @@ -189,10 +192,10 @@ __device__ void cudaPartition_1(ArrayView src, //------------------------------------------------------------------ -template -__device__ void cudaPartition_2(ArrayView src, - ArrayView dst, - const Function &Cmp, const int & pivot, +template +__device__ void cudaPartition_2(ArrayView src, + ArrayView dst, + const Function &Cmp, const Value & pivot, int elemPerBlock, TASK & task ) { diff --git a/GPUSort/src/quicksort/quicksort.cuh b/GPUSort/src/quicksort/quicksort.cuh index 3b1a38ad6..554773913 100644 --- a/GPUSort/src/quicksort/quicksort.cuh +++ b/GPUSort/src/quicksort/quicksort.cuh @@ -66,16 +66,16 @@ __device__ void writeNewTask(int begin, int end, int depth, int maxElemFor2ndPha //---------------------------------------------------- -template -__global__ void cudaQuickSort1stPhase_1(ArrayView arr, ArrayView aux, +template +__global__ void cudaQuickSort1stPhase_1(ArrayView arr, ArrayView aux, const Function &Cmp, int elemPerBlock, ArrayView tasks, ArrayView taskMapping) { extern __shared__ int externMem[]; - int *sharedMem = externMem; + Value *sharedMem = (Value*)externMem; - static __shared__ int pivot; + static __shared__ Value pivot; TASK &myTask = tasks[taskMapping[blockIdx.x]]; auto & src = (myTask.depth & 1) == 0? 
arr : aux; @@ -92,13 +92,13 @@ __global__ void cudaQuickSort1stPhase_1(ArrayView arr, Array Cmp, pivot, elemPerBlock, myTask); } -template -__global__ void cudaQuickSort1stPhase_2(ArrayView arr, ArrayView aux, +template +__global__ void cudaQuickSort1stPhase_2(ArrayView arr, ArrayView aux, const Function &Cmp, int elemPerBlock, ArrayView tasks, ArrayView taskMapping) { - static __shared__ int pivot; + static __shared__ Value pivot; TASK &myTask = tasks[taskMapping[blockIdx.x]]; auto & src = (myTask.depth & 1) == 0? arr : aux; @@ -116,12 +116,12 @@ __global__ void cudaQuickSort1stPhase_2(ArrayView arr, Array //---------------------------------------------------- - -__global__ void cudaWritePivot(ArrayView arr, ArrayView aux, int maxElemFor2ndPhase, +template +__global__ void cudaWritePivot(ArrayView arr, ArrayView aux, int maxElemFor2ndPhase, ArrayView tasks, ArrayView newTasks, int *newTasksCnt, ArrayView secondPhaseTasks, int *secondPhaseTasksCnt) { - static __shared__ int pivot; + static __shared__ Value pivot; TASK &myTask = tasks[blockIdx.x]; if (threadIdx.x == 0) @@ -168,8 +168,8 @@ __global__ void cudaWritePivot(ArrayView arr, ArrayView -__global__ void cudaQuickSort2ndPhase(ArrayView arr, ArrayView aux, +template +__global__ void cudaQuickSort2ndPhase(ArrayView arr, ArrayView aux, const Function &Cmp, ArrayView secondPhaseTasks) { @@ -180,7 +180,7 @@ __global__ void cudaQuickSort2ndPhase(ArrayView arr, ArrayVi auto arrView = arr.getView(myTask.partitionBegin, myTask.partitionEnd); auto auxView = aux.getView(myTask.partitionBegin, myTask.partitionEnd); - singleBlockQuickSort(arrView, auxView, Cmp, myTask.depth); + singleBlockQuickSort(arrView, auxView, Cmp, myTask.depth); } //----------------------------------------------------------- @@ -197,11 +197,11 @@ __global__ void cudaCalcBlocksNeeded(ArrayView cuda_tasks, blocksNeeded[i] = size / elemPerBlock + (size % elemPerBlock != 0); } -template +template __global__ void cudaInitTask(ArrayView cuda_tasks, 
ArrayView cuda_blockToTaskMapping, ArrayView cuda_reductionTaskInitMem, - ArrayView src, const Function &Cmp) + ArrayView src, const Function &Cmp) { if(blockIdx.x >= cuda_tasks.getSize()) return; @@ -226,10 +226,11 @@ const int minElemPerBlock = threadsPerBlock*2; const int maxBitonicSize = threadsPerBlock*2; const int desired_2ndPhasElemPerBlock = maxBitonicSize*8; +template class QUICKSORT { - ArrayView arr; - Array aux; + ArrayView arr; + Array aux; int maxTasks, maxBlocks; Array cuda_tasks, cuda_newTasks, cuda_2ndPhaseTasks; @@ -246,7 +247,7 @@ class QUICKSORT cudaDeviceProp deviceProp; //-------------------------------------------------------------------------------------- public: - QUICKSORT(ArrayView _arr) + QUICKSORT(ArrayView _arr) : arr(_arr), aux(arr.getSize()), maxTasks(min(arr.getSize(), g_maxTasks)), maxBlocks(g_maxBlocks), @@ -281,8 +282,9 @@ public: void processNewTasks(); }; +template template -void QUICKSORT::sort(const Function &Cmp) +void QUICKSORT::sort(const Function &Cmp) { while (tasksAmount > 0) @@ -306,19 +308,19 @@ void QUICKSORT::sort(const Function &Cmp) TNL_CHECK_CUDA_DEVICE; - int externMemByteSize = elemPerBlock * sizeof(int); + int externMemByteSize = elemPerBlock * sizeof(Value); auto & task = iteration % 2 == 0? cuda_tasks : cuda_newTasks; if(externMemByteSize <= deviceProp.sharedMemPerBlock) { - cudaQuickSort1stPhase_1 + cudaQuickSort1stPhase_1 <<>>( arr, aux, Cmp, elemPerBlock, task, cuda_blockToTaskMapping); } else { - cudaQuickSort1stPhase_2 + cudaQuickSort1stPhase_2 <<>>( arr, aux, Cmp, elemPerBlock, task, cuda_blockToTaskMapping); @@ -327,7 +329,8 @@ void QUICKSORT::sort(const Function &Cmp) TNL_CHECK_CUDA_DEVICE; auto & newTask = iteration % 2 == 0? 
cuda_newTasks : cuda_tasks; - cudaWritePivot<<>>( + cudaWritePivot + <<>>( arr, aux, desired_2ndPhasElemPerBlock, task, newTask, cuda_newTasksAmount.getData(), cuda_2ndPhaseTasks, cuda_2ndPhaseTasksAmount.getData()); @@ -341,7 +344,7 @@ void QUICKSORT::sort(const Function &Cmp) if (tasksAmount > 0) { auto & tasks = iteration % 2 == 0 ? cuda_tasks : cuda_newTasks; - cudaQuickSort2ndPhase + cudaQuickSort2ndPhase <<>>(arr, aux, Cmp, tasks); TNL_CHECK_CUDA_DEVICE; @@ -351,7 +354,7 @@ void QUICKSORT::sort(const Function &Cmp) if (host_2ndPhaseTasksAmount > 0) { - cudaQuickSort2ndPhase + cudaQuickSort2ndPhase <<>> (arr, aux, Cmp, cuda_2ndPhaseTasks); @@ -362,7 +365,8 @@ void QUICKSORT::sort(const Function &Cmp) return; } -int QUICKSORT::getSetsNeeded(int elemPerBlock) const +template +int QUICKSORT::getSetsNeeded(int elemPerBlock) const { auto view = iteration % 2 == 0 ? cuda_tasks.getConstView() : cuda_newTasks.getConstView(); auto fetch = [=] __cuda_callable__(int i) { @@ -374,7 +378,8 @@ int QUICKSORT::getSetsNeeded(int elemPerBlock) const return Algorithms::Reduction::reduce(0, tasksAmount, fetch, reduction, 0); } -int QUICKSORT::getElemPerBlock() const +template +int QUICKSORT::getElemPerBlock() const { int setsNeeded = getSetsNeeded(minElemPerBlock); @@ -385,8 +390,9 @@ int QUICKSORT::getElemPerBlock() const return setsPerBlock * minElemPerBlock; } +template template -int QUICKSORT::initTasks(int elemPerBlock, const Function & Cmp) +int QUICKSORT::initTasks(int elemPerBlock, const Function & Cmp) { int threads = min(tasksAmount, threadsPerBlock); int blocks = tasksAmount / threads + (tasksAmount % threads != 0); @@ -419,7 +425,8 @@ int QUICKSORT::initTasks(int elemPerBlock, const Function & Cmp) return blocksNeeded; } -void QUICKSORT::processNewTasks() +template +void QUICKSORT::processNewTasks() { tasksAmount = cuda_newTasksAmount.getElement(0); host_2ndPhaseTasksAmount = cuda_2ndPhaseTasksAmount.getElement(0); @@ -429,14 +436,15 @@ void 
QUICKSORT::processNewTasks() //----------------------------------------------------------- //----------------------------------------------------------- -template -void quicksort(ArrayView arr, const Function &Cmp) +template +void quicksort(ArrayView arr, const Function &Cmp) { - QUICKSORT sorter(arr); + QUICKSORT sorter(arr); sorter.sort(Cmp); } -void quicksort(ArrayView arr) +template +void quicksort(ArrayView arr) { - quicksort(arr, [] __cuda_callable__(int a, int b) { return a < b; }); + quicksort(arr, [] __cuda_callable__(const Value & a, const Value & b) { return a < b; }); } diff --git a/GPUSort/src/quicksort/quicksort_1Block.cuh b/GPUSort/src/quicksort/quicksort_1Block.cuh index 4ee834ffc..f28509c66 100644 --- a/GPUSort/src/quicksort/quicksort_1Block.cuh +++ b/GPUSort/src/quicksort/quicksort_1Block.cuh @@ -9,12 +9,12 @@ using namespace TNL; using namespace TNL::Containers; -template -__device__ void externSort(ArrayView src, - ArrayView dst, +template +__device__ void externSort(ArrayView src, + ArrayView dst, const Function & Cmp) { - static __shared__ int sharedMem[externMemSize]; + static __shared__ Value sharedMem[externMemSize]; bitonicSort_Block(src, dst, sharedMem, Cmp); } @@ -71,22 +71,23 @@ __device__ void stackPush(int stackArrBegin[], int stackArrEnd[], } } -template -__device__ void singleBlockQuickSort(ArrayView arr, - ArrayView aux, +template +__device__ void singleBlockQuickSort(ArrayView arr, + ArrayView aux, const Function & Cmp, int _depth) { if(arr.getSize() <= blockDim.x*2) { auto src = (_depth &1) == 0? 
arr : aux; - externSort(src, arr, Cmp); + externSort(src, arr, Cmp); return; } static __shared__ int stackTop; static __shared__ int stackArrBegin[stackSize], stackArrEnd[stackSize], stackDepth[stackSize]; static __shared__ int begin, end, depth; - static __shared__ int pivot, pivotBegin, pivotEnd; + static __shared__ int pivotBegin, pivotEnd; + static __shared__ Value pivot; if (threadIdx.x == 0) { @@ -116,7 +117,7 @@ __device__ void singleBlockQuickSort(ArrayView arr, //small enough for for bitonic if(size <= blockDim.x*2) { - externSort(src.getView(begin, end), arr.getView(begin, end), Cmp); + externSort(src.getView(begin, end), arr.getView(begin, end), Cmp); __syncthreads(); continue; } diff --git a/GPUSort/tests/quicksort_unitTests/unitTests.cu b/GPUSort/tests/quicksort_unitTests/unitTests.cu index 927a3d7f0..e87493add 100644 --- a/GPUSort/tests/quicksort_unitTests/unitTests.cu +++ b/GPUSort/tests/quicksort_unitTests/unitTests.cu @@ -122,6 +122,25 @@ TEST(noLostElement, bigSizedArray) ASSERT_TRUE(view == cudaArr2.getView()); } +TEST(types, type_double) +{ + std::srand(8451); + + int size = (1<<16); + std::vector arr(size); + for(auto & x : arr) x = std::rand(); + for(int i = 0; i < 10000; i++) + arr[std::rand() % arr.size()] = (1<<10); + + TNL::Containers::Array cudaArr(arr); + auto view = cudaArr.getView(); + quicksort(view); + + TNL::Containers::Array cudaArr2(arr); + thrust::sort(thrust::device, cudaArr2.getData(), cudaArr2.getData() + cudaArr2.getSize()); + ASSERT_TRUE(view == cudaArr2.getView()); +} + //---------------------------------------------------------------------------------- int main(int argc, char **argv) -- GitLab From cac28578e781e1da7394bc127ecde6189306cb17 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Wed, 7 Apr 2021 01:27:32 +0200 Subject: [PATCH 185/258] merge the 2 2nd phase launches and change elemPerBlock --- GPUSort/src/quicksort/quicksort.cuh | 63 +++++++++++++++++++++-------- 1 file changed, 46 insertions(+), 17 
deletions(-) diff --git a/GPUSort/src/quicksort/quicksort.cuh b/GPUSort/src/quicksort/quicksort.cuh index 554773913..4b729ac81 100644 --- a/GPUSort/src/quicksort/quicksort.cuh +++ b/GPUSort/src/quicksort/quicksort.cuh @@ -183,6 +183,28 @@ __global__ void cudaQuickSort2ndPhase(ArrayView arr, Array singleBlockQuickSort(arrView, auxView, Cmp, myTask.depth); } + +template +__global__ void cudaQuickSort2ndPhase(ArrayView arr, ArrayView aux, + const Function &Cmp, + ArrayView secondPhaseTasks1, + ArrayView secondPhaseTasks2) +{ + TASK myTask; + if(blockIdx.x < secondPhaseTasks1.getSize()) + myTask = secondPhaseTasks1[blockIdx.x]; + else + myTask = secondPhaseTasks2[blockIdx.x - secondPhaseTasks1.getSize()]; + + if(myTask.partitionEnd - myTask.partitionBegin <= 0 ) + return; + + auto arrView = arr.getView(myTask.partitionBegin, myTask.partitionEnd); + auto auxView = aux.getView(myTask.partitionBegin, myTask.partitionEnd); + + singleBlockQuickSort(arrView, auxView, Cmp, myTask.depth); +} + //----------------------------------------------------------- __global__ void cudaCalcBlocksNeeded(ArrayView cuda_tasks, int elemPerBlock, @@ -222,9 +244,9 @@ __global__ void cudaInitTask(ArrayView cuda_tasks, //----------------------------------------------------------- const int threadsPerBlock = 512, g_maxBlocks = 1 << 15; //32k const int g_maxTasks = 1 << 14; -const int minElemPerBlock = threadsPerBlock*2; +const int minElemPerBlock = threadsPerBlock*10; const int maxBitonicSize = threadsPerBlock*2; -const int desired_2ndPhasElemPerBlock = maxBitonicSize*8; +const int desired_2ndPhasElemPerBlock = maxBitonicSize; template class QUICKSORT @@ -341,24 +363,31 @@ void QUICKSORT::sort(const Function &Cmp) iteration++; } - if (tasksAmount > 0) + int total2ndPhase = tasksAmount + host_2ndPhaseTasksAmount; + if (total2ndPhase > 0) { - auto & tasks = iteration % 2 == 0 ? 
cuda_tasks : cuda_newTasks; - cudaQuickSort2ndPhase - <<>>(arr, aux, Cmp, tasks); + const int stackSize = 32; + if(tasksAmount >0 && host_2ndPhaseTasksAmount > 0) + { + auto tasks = iteration % 2 == 0 ? cuda_tasks.getView(0, tasksAmount) : cuda_newTasks.getView(0, tasksAmount); + auto tasks2 = cuda_2ndPhaseTasks.getView(0, host_2ndPhaseTasksAmount); - TNL_CHECK_CUDA_DEVICE; - cudaDeviceSynchronize(); - TNL_CHECK_CUDA_DEVICE; - } - - if (host_2ndPhaseTasksAmount > 0) - { - cudaQuickSort2ndPhase - <<>> - (arr, aux, Cmp, cuda_2ndPhaseTasks); + cudaQuickSort2ndPhase + <<>>(arr, aux, Cmp, tasks, tasks2); + } + else if(tasksAmount >0) + { + auto tasks = iteration % 2 == 0 ? cuda_tasks.getView(0, tasksAmount) : cuda_newTasks.getView(0, tasksAmount); + cudaQuickSort2ndPhase + <<>>(arr, aux, Cmp, tasks); + } + else + { + auto tasks2 = cuda_2ndPhaseTasks.getView(0, host_2ndPhaseTasksAmount); - TNL_CHECK_CUDA_DEVICE; + cudaQuickSort2ndPhase + <<>>(arr, aux, Cmp, tasks2); + } } cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; -- GitLab From 74f93d94813c482b0a945fcfdd1bd38d7b4bf950 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Wed, 7 Apr 2021 14:23:43 +0200 Subject: [PATCH 186/258] calc sharedMem for 1st phase --- GPUSort/src/quicksort/quicksort.cuh | 330 +++++++++++++++++----------- 1 file changed, 204 insertions(+), 126 deletions(-) diff --git a/GPUSort/src/quicksort/quicksort.cuh b/GPUSort/src/quicksort/quicksort.cuh index 4b729ac81..f1886a84c 100644 --- a/GPUSort/src/quicksort/quicksort.cuh +++ b/GPUSort/src/quicksort/quicksort.cuh @@ -19,11 +19,11 @@ using namespace TNL::Containers; //----------------------------------------------------------- __device__ void writeNewTask(int begin, int end, int depth, int maxElemFor2ndPhase, - ArrayView newTasks, int *newTasksCnt, + ArrayView newTasks, int *newTasksCnt, ArrayView secondPhaseTasks, int *secondPhaseTasksCnt) { int size = end - begin; - if(size < 0) + if (size < 0) { printf("negative size, something went really 
wrong\n"); return; @@ -68,18 +68,18 @@ __device__ void writeNewTask(int begin, int end, int depth, int maxElemFor2ndPha template __global__ void cudaQuickSort1stPhase_1(ArrayView arr, ArrayView aux, - const Function &Cmp, int elemPerBlock, - ArrayView tasks, - ArrayView taskMapping) + const Function &Cmp, int elemPerBlock, + ArrayView tasks, + ArrayView taskMapping) { extern __shared__ int externMem[]; - Value *sharedMem = (Value*)externMem; + Value *sharedMem = (Value *)externMem; static __shared__ Value pivot; TASK &myTask = tasks[taskMapping[blockIdx.x]]; - auto & src = (myTask.depth & 1) == 0? arr : aux; - auto & dst = (myTask.depth & 1) == 0? aux : arr; + auto &src = (myTask.depth & 1) == 0 ? arr : aux; + auto &dst = (myTask.depth & 1) == 0 ? aux : arr; if (threadIdx.x == 0) pivot = src[myTask.pivotIdx]; @@ -94,15 +94,15 @@ __global__ void cudaQuickSort1stPhase_1(ArrayView arr, Arr template __global__ void cudaQuickSort1stPhase_2(ArrayView arr, ArrayView aux, - const Function &Cmp, int elemPerBlock, - ArrayView tasks, - ArrayView taskMapping) + const Function &Cmp, int elemPerBlock, + ArrayView tasks, + ArrayView taskMapping) { static __shared__ Value pivot; TASK &myTask = tasks[taskMapping[blockIdx.x]]; - auto & src = (myTask.depth & 1) == 0? arr : aux; - auto & dst = (myTask.depth & 1) == 0? aux : arr; + auto &src = (myTask.depth & 1) == 0 ? arr : aux; + auto &dst = (myTask.depth & 1) == 0 ? 
aux : arr; if (threadIdx.x == 0) pivot = src[myTask.pivotIdx]; @@ -116,7 +116,7 @@ __global__ void cudaQuickSort1stPhase_2(ArrayView arr, Arr //---------------------------------------------------- -template +template __global__ void cudaWritePivot(ArrayView arr, ArrayView aux, int maxElemFor2ndPhase, ArrayView tasks, ArrayView newTasks, int *newTasksCnt, ArrayView secondPhaseTasks, int *secondPhaseTasksCnt) @@ -149,20 +149,20 @@ __global__ void cudaWritePivot(ArrayView arr, ArrayView 0) + if (leftEnd - leftBegin > 0) { writeNewTask(leftBegin, leftEnd, myTask.depth, - maxElemFor2ndPhase, - newTasks, newTasksCnt, - secondPhaseTasks, secondPhaseTasksCnt); + maxElemFor2ndPhase, + newTasks, newTasksCnt, + secondPhaseTasks, secondPhaseTasksCnt); } - if(rightEnd - rightBegin > 0) + if (rightEnd - rightBegin > 0) { writeNewTask(rightBegin, rightEnd, - myTask.depth, maxElemFor2ndPhase, - newTasks, newTasksCnt, - secondPhaseTasks, secondPhaseTasksCnt); + myTask.depth, maxElemFor2ndPhase, + newTasks, newTasksCnt, + secondPhaseTasks, secondPhaseTasksCnt); } } @@ -174,7 +174,7 @@ __global__ void cudaQuickSort2ndPhase(ArrayView arr, Array ArrayView secondPhaseTasks) { TASK &myTask = secondPhaseTasks[blockIdx.x]; - if(myTask.partitionEnd - myTask.partitionBegin <= 0 ) + if (myTask.partitionEnd - myTask.partitionBegin <= 0) return; auto arrView = arr.getView(myTask.partitionBegin, myTask.partitionEnd); @@ -183,7 +183,6 @@ __global__ void cudaQuickSort2ndPhase(ArrayView arr, Array singleBlockQuickSort(arrView, auxView, Cmp, myTask.depth); } - template __global__ void cudaQuickSort2ndPhase(ArrayView arr, ArrayView aux, const Function &Cmp, @@ -191,12 +190,12 @@ __global__ void cudaQuickSort2ndPhase(ArrayView arr, Array ArrayView secondPhaseTasks2) { TASK myTask; - if(blockIdx.x < secondPhaseTasks1.getSize()) + if (blockIdx.x < secondPhaseTasks1.getSize()) myTask = secondPhaseTasks1[blockIdx.x]; else myTask = secondPhaseTasks2[blockIdx.x - secondPhaseTasks1.getSize()]; - 
if(myTask.partitionEnd - myTask.partitionBegin <= 0 ) + if (myTask.partitionEnd - myTask.partitionBegin <= 0) return; auto arrView = arr.getView(myTask.partitionBegin, myTask.partitionEnd); @@ -208,16 +207,16 @@ __global__ void cudaQuickSort2ndPhase(ArrayView arr, Array //----------------------------------------------------------- __global__ void cudaCalcBlocksNeeded(ArrayView cuda_tasks, int elemPerBlock, - ArrayView blocksNeeded) + ArrayView blocksNeeded) { int i = blockIdx.x * blockDim.x + threadIdx.x; - if(i >= cuda_tasks.getSize()) + if (i >= cuda_tasks.getSize()) return; auto task = cuda_tasks[i]; int size = task.partitionEnd - task.partitionBegin; blocksNeeded[i] = size / elemPerBlock + (size % elemPerBlock != 0); -} +} template __global__ void cudaInitTask(ArrayView cuda_tasks, @@ -225,40 +224,42 @@ __global__ void cudaInitTask(ArrayView cuda_tasks, ArrayView cuda_reductionTaskInitMem, ArrayView src, const Function &Cmp) { - if(blockIdx.x >= cuda_tasks.getSize()) + if (blockIdx.x >= cuda_tasks.getSize()) return; - int start = blockIdx.x == 0? 0 : cuda_reductionTaskInitMem[blockIdx.x -1]; + int start = blockIdx.x == 0 ? 
0 : cuda_reductionTaskInitMem[blockIdx.x - 1]; int end = cuda_reductionTaskInitMem[blockIdx.x]; - for(int i = start + threadIdx.x; i < end; i += blockDim.x) + for (int i = start + threadIdx.x; i < end; i += blockDim.x) cuda_blockToTaskMapping[i] = blockIdx.x; - if(threadIdx.x == 0) + if (threadIdx.x == 0) { - TASK & task = cuda_tasks[blockIdx.x]; + TASK &task = cuda_tasks[blockIdx.x]; int pivotIdx = task.partitionBegin + pickPivotIdx(src.getView(task.partitionBegin, task.partitionEnd), Cmp); - task.initTask(start, end-start, pivotIdx); + task.initTask(start, end - start, pivotIdx); } } //----------------------------------------------------------- //----------------------------------------------------------- -const int threadsPerBlock = 512, g_maxBlocks = 1 << 15; //32k -const int g_maxTasks = 1 << 14; -const int minElemPerBlock = threadsPerBlock*10; -const int maxBitonicSize = threadsPerBlock*2; -const int desired_2ndPhasElemPerBlock = maxBitonicSize; -template +template class QUICKSORT { ArrayView arr; Array aux; - int maxTasks, maxBlocks; + + int maxBlocks, threadsPerBlock, desiredElemPerBlock, maxSharable; + + const int maxBitonicSize = threadsPerBlock * 2; + const int desired_2ndPhasElemPerBlock = maxBitonicSize; + const int g_maxTasks = 1 << 14; + + int maxTasks; Array cuda_tasks, cuda_newTasks, cuda_2ndPhaseTasks; - Array cuda_newTasksAmount, cuda_2ndPhaseTasksAmount; //is in reality 1 integer + Array cuda_newTasksAmount, cuda_2ndPhaseTasksAmount; //is in reality 1 integer each - int tasksAmount; //counter for Host == cuda_newTasksAmount + int host_1stPhaseTasksAmount; //counter for Host == cuda_newTasksAmount int host_2ndPhaseTasksAmount; // cuda_2ndPhaseTasksAmount Array cuda_blockToTaskMapping; @@ -266,32 +267,40 @@ class QUICKSORT int iteration = 0; //-------------------------------------------------------------------------------------- - cudaDeviceProp deviceProp; //-------------------------------------------------------------------------------------- 
public: - QUICKSORT(ArrayView _arr) - : arr(_arr), aux(arr.getSize()), + QUICKSORT(ArrayView arr, int gridDim, int blockDim, int desiredElemPerBlock, int maxSharable) + : arr(arr.getView()), aux(arr.getSize()), + maxBlocks(gridDim), threadsPerBlock(blockDim), + desiredElemPerBlock(desiredElemPerBlock), maxSharable(maxSharable), + maxTasks(min(arr.getSize(), g_maxTasks)), - maxBlocks(g_maxBlocks), + cuda_tasks(maxTasks), cuda_newTasks(maxTasks), cuda_2ndPhaseTasks(maxTasks), - cuda_newTasksAmount(1), - cuda_2ndPhaseTasksAmount(1), + cuda_newTasksAmount(1), cuda_2ndPhaseTasksAmount(1), + cuda_blockToTaskMapping(maxBlocks), cuda_reductionTaskInitMem(maxTasks) { cuda_tasks.setElement(0, TASK(0, arr.getSize(), 0)); - tasksAmount = 1; + host_1stPhaseTasksAmount = 1; + host_2ndPhaseTasksAmount = 0; cuda_2ndPhaseTasksAmount = 0; iteration = 0; - cudaGetDeviceProperties(&deviceProp, 0); //change device TNL_CHECK_CUDA_DEVICE; } template void sort(const Function &Cmp); + template + void firstPhase(const Function &Cmp); + + template + void secondPhase(const Function &Cmp); + int getSetsNeeded(int elemPerBlock) const; int getElemPerBlock() const; @@ -299,41 +308,66 @@ public: * returns the amount of blocks needed * */ template - int initTasks(int elemPerBlock, const Function & Cmp); + int initTasks(int elemPerBlock, const Function &Cmp); void processNewTasks(); }; +//--------------------------------------------------------------------------------------------- + template template void QUICKSORT::sort(const Function &Cmp) { - - while (tasksAmount > 0) + firstPhase(Cmp); + + int total2ndPhase = host_1stPhaseTasksAmount + host_2ndPhaseTasksAmount; + if (total2ndPhase > 0) + secondPhase(Cmp); + + cudaDeviceSynchronize(); + TNL_CHECK_CUDA_DEVICE; + return; +} + +//--------------------------------------------------------------------------------------------- + +template +template +void QUICKSORT::firstPhase(const Function &Cmp) +{ + while (host_1stPhaseTasksAmount > 0) { - //2ndphase 
task is now full or tasksAmount is full, as backup during writing, overflowing tasks were written into the other array - if (tasksAmount >= maxTasks || host_2ndPhaseTasksAmount >= maxTasks) - { + //2ndphase task is now full or host_1stPhaseTasksAmount is full, as backup during writing, overflowing tasks were written into the other array + if (host_1stPhaseTasksAmount >= maxTasks || host_2ndPhaseTasksAmount >= maxTasks) break; - } //just in case newly created tasks wouldnt fit - if(tasksAmount*2 >= maxTasks + (maxTasks - host_2ndPhaseTasksAmount)) - { + if (host_1stPhaseTasksAmount * 2 >= maxTasks + (maxTasks - host_2ndPhaseTasksAmount)) break; - } int elemPerBlock = getElemPerBlock(); int blocksCnt = initTasks(elemPerBlock, Cmp); - if(blocksCnt >= cuda_blockToTaskMapping.getSize()) + TNL_CHECK_CUDA_DEVICE; + + if (blocksCnt >= maxBlocks) //too many blocks needed, switch to 2nd phase break; - TNL_CHECK_CUDA_DEVICE; + //----------------------------------------------- + //do the partitioning + auto &task = iteration % 2 == 0 ? cuda_tasks : cuda_newTasks; int externMemByteSize = elemPerBlock * sizeof(Value); - auto & task = iteration % 2 == 0? cuda_tasks : cuda_newTasks; - if(externMemByteSize <= deviceProp.sharedMemPerBlock) + /** + * check if can partition using shared memory for coalesced read and write + * 1st phase of partitioning + * sets of blocks work on a task + * + * using the atomicAdd intristic, each block reserves a chunk of memory where to move elements + * smaller and bigger than pivot move to + * */ + if (externMemByteSize <= maxSharable) { cudaQuickSort1stPhase_1 <<>>( @@ -345,110 +379,124 @@ void QUICKSORT::sort(const Function &Cmp) cudaQuickSort1stPhase_2 <<>>( arr, aux, Cmp, elemPerBlock, - task, cuda_blockToTaskMapping); + task, cuda_blockToTaskMapping); } - TNL_CHECK_CUDA_DEVICE; - auto & newTask = iteration % 2 == 0? 
cuda_newTasks : cuda_tasks; + /** + * fill in the gap between smaller and bigger with elements == pivot + * after writing also create new tasks, each task generates at max 2 tasks + * + * tasks smaller than desired_2ndPhasElemPerBlock go into 2nd phase + * bigger need more blocks to partition and are written into newTask + * with iteration %2, rotate between the 2 tasks array to save from copying + * */ + auto &newTask = iteration % 2 == 0 ? cuda_newTasks : cuda_tasks; cudaWritePivot - <<>>( - arr, aux, desired_2ndPhasElemPerBlock, - task, newTask, cuda_newTasksAmount.getData(), - cuda_2ndPhaseTasks, cuda_2ndPhaseTasksAmount.getData()); - + <<>>( + arr, aux, desired_2ndPhasElemPerBlock, + task, newTask, cuda_newTasksAmount.getData(), + cuda_2ndPhaseTasks, cuda_2ndPhaseTasksAmount.getData()); TNL_CHECK_CUDA_DEVICE; processNewTasks(); iteration++; } - - int total2ndPhase = tasksAmount + host_2ndPhaseTasksAmount; - if (total2ndPhase > 0) +} + +//---------------------------------------------------------------------- + +template +template +void QUICKSORT::secondPhase(const Function &Cmp) +{ + int total2ndPhase = host_1stPhaseTasksAmount + host_2ndPhaseTasksAmount; + const int stackSize = 32; + auto &leftoverTasks = iteration % 2 == 0 ? cuda_tasks : cuda_newTasks; + + if (host_1stPhaseTasksAmount > 0 && host_2ndPhaseTasksAmount > 0) { - const int stackSize = 32; - if(tasksAmount >0 && host_2ndPhaseTasksAmount > 0) - { - auto tasks = iteration % 2 == 0 ? cuda_tasks.getView(0, tasksAmount) : cuda_newTasks.getView(0, tasksAmount); - auto tasks2 = cuda_2ndPhaseTasks.getView(0, host_2ndPhaseTasksAmount); + auto tasks2 = cuda_2ndPhaseTasks.getView(0, host_2ndPhaseTasksAmount); - cudaQuickSort2ndPhase - <<>>(arr, aux, Cmp, tasks, tasks2); - } - else if(tasksAmount >0) - { - auto tasks = iteration % 2 == 0 ? 
cuda_tasks.getView(0, tasksAmount) : cuda_newTasks.getView(0, tasksAmount); - cudaQuickSort2ndPhase - <<>>(arr, aux, Cmp, tasks); - } - else - { - auto tasks2 = cuda_2ndPhaseTasks.getView(0, host_2ndPhaseTasksAmount); + cudaQuickSort2ndPhase + <<>>(arr, aux, Cmp, leftoverTasks, tasks2); + } + else if (host_1stPhaseTasksAmount > 0) + { + auto tasks = leftoverTasks.getView(0, host_1stPhaseTasksAmount); + cudaQuickSort2ndPhase + <<>>(arr, aux, Cmp, tasks); + } + else + { + auto tasks2 = cuda_2ndPhaseTasks.getView(0, host_2ndPhaseTasksAmount); - cudaQuickSort2ndPhase - <<>>(arr, aux, Cmp, tasks2); - } + cudaQuickSort2ndPhase + <<>>(arr, aux, Cmp, tasks2); } - cudaDeviceSynchronize(); - TNL_CHECK_CUDA_DEVICE; - return; } +//---------------------------------------------------------------------- + template int QUICKSORT::getSetsNeeded(int elemPerBlock) const { auto view = iteration % 2 == 0 ? cuda_tasks.getConstView() : cuda_newTasks.getConstView(); auto fetch = [=] __cuda_callable__(int i) { - auto &task = view[i]; + const auto &task = view[i]; int size = task.partitionEnd - task.partitionBegin; return size / elemPerBlock + (size % elemPerBlock != 0); }; auto reduction = [] __cuda_callable__(int a, int b) { return a + b; }; - return Algorithms::Reduction::reduce(0, tasksAmount, fetch, reduction, 0); + return Algorithms::Reduction::reduce(0, host_1stPhaseTasksAmount, fetch, reduction, 0); } template int QUICKSORT::getElemPerBlock() const { - int setsNeeded = getSetsNeeded(minElemPerBlock); + int setsNeeded = getSetsNeeded(desiredElemPerBlock); if (setsNeeded <= maxBlocks) - return minElemPerBlock; + return desiredElemPerBlock; + + //want multiplier*minElemPerBLock <= x*threadPerBlock + //find smallest x so that this inequality holds + double multiplier = 1. * setsNeeded / maxBlocks; + int elemPerBlock = multiplier * desiredElemPerBlock; + setsNeeded = elemPerBlock / threadsPerBlock + (elemPerBlock % threadsPerBlock != 0); - int setsPerBlock = ceil(1. 
* setsNeeded / maxBlocks); - return setsPerBlock * minElemPerBlock; + return setsNeeded * threadsPerBlock; } template template -int QUICKSORT::initTasks(int elemPerBlock, const Function & Cmp) +int QUICKSORT::initTasks(int elemPerBlock, const Function &Cmp) { - int threads = min(tasksAmount, threadsPerBlock); - int blocks = tasksAmount / threads + (tasksAmount % threads != 0); + int threads = min(host_1stPhaseTasksAmount, threadsPerBlock); + int blocks = host_1stPhaseTasksAmount / threads + (host_1stPhaseTasksAmount % threads != 0); - auto src = iteration % 2 == 0? arr : aux.getView(); - auto &tasks = iteration % 2 == 0? cuda_tasks : cuda_newTasks; + auto src = iteration % 2 == 0 ? arr : aux.getView(); + auto &tasks = iteration % 2 == 0 ? cuda_tasks : cuda_newTasks; //[i] == how many blocks task i needs - cudaCalcBlocksNeeded<<>>(tasks.getView(0, tasksAmount), - elemPerBlock, cuda_reductionTaskInitMem.getView(0, tasksAmount)); + cudaCalcBlocksNeeded<<>>(tasks.getView(0, host_1stPhaseTasksAmount), + elemPerBlock, cuda_reductionTaskInitMem.getView(0, host_1stPhaseTasksAmount)); thrust::inclusive_scan(thrust::device, - cuda_reductionTaskInitMem.getData(), - cuda_reductionTaskInitMem.getData() + tasksAmount, - cuda_reductionTaskInitMem.getData()); + cuda_reductionTaskInitMem.getData(), + cuda_reductionTaskInitMem.getData() + host_1stPhaseTasksAmount, + cuda_reductionTaskInitMem.getData()); - int blocksNeeded = cuda_reductionTaskInitMem.getElement(tasksAmount - 1); + int blocksNeeded = cuda_reductionTaskInitMem.getElement(host_1stPhaseTasksAmount - 1); //need too many blocks, give back control - if(blocksNeeded >= cuda_blockToTaskMapping.getSize()) + if (blocksNeeded >= cuda_blockToTaskMapping.getSize()) return blocksNeeded; - cudaInitTask<<>>( - tasks.getView(0, tasksAmount), + cudaInitTask<<>>( + tasks.getView(0, host_1stPhaseTasksAmount), cuda_blockToTaskMapping.getView(0, blocksNeeded), - cuda_reductionTaskInitMem.getView(0, tasksAmount), - src, Cmp - ); + 
cuda_reductionTaskInitMem.getView(0, host_1stPhaseTasksAmount), + src, Cmp); cuda_newTasksAmount.setElement(0, 0); return blocksNeeded; @@ -457,7 +505,7 @@ int QUICKSORT::initTasks(int elemPerBlock, const Function & Cmp) template void QUICKSORT::processNewTasks() { - tasksAmount = cuda_newTasksAmount.getElement(0); + host_1stPhaseTasksAmount = cuda_newTasksAmount.getElement(0); host_2ndPhaseTasksAmount = cuda_2ndPhaseTasksAmount.getElement(0); } @@ -468,12 +516,42 @@ void QUICKSORT::processNewTasks() template void quicksort(ArrayView arr, const Function &Cmp) { - QUICKSORT sorter(arr); + const int maxBlocks = (1 << 20); + + cudaDeviceProp deviceProp; + cudaGetDeviceProperties(&deviceProp, 0); + int sharedReserve = sizeof(Value) + sizeof(int) * 16; //1pivot + 16 other shared vars reserved + int maxSharable = deviceProp.sharedMemPerBlock - sharedReserve; + + //blockDim*multiplier*sizeof(Value) <= maxSharable + + int blockDim = 512; //best case + int elemPerBlock = maxSharable / sizeof(Value); + const int maxMultiplier = 8; + int multiplier = min(elemPerBlock / blockDim, maxMultiplier); + if (multiplier <= 0) + { + blockDim = 256; + multiplier = min(elemPerBlock / blockDim, maxMultiplier); + if (multiplier <= 0) + { + //worst case scenario, shared memory cant be utilized at all because of the sheer size of Value + //sort has to be done with the use of global memory alone + + QUICKSORT sorter(arr, maxBlocks, 512, 0, maxSharable); + sorter.sort(Cmp); + return; + } + } + + assert(blockDim * multiplier * sizeof(Value) <= maxSharable); + + QUICKSORT sorter(arr, maxBlocks, blockDim, multiplier*blockDim, maxSharable); sorter.sort(Cmp); } template void quicksort(ArrayView arr) { - quicksort(arr, [] __cuda_callable__(const Value & a, const Value & b) { return a < b; }); + quicksort(arr, [] __cuda_callable__(const Value &a, const Value &b) { return a < b; }); } -- GitLab From fae1bc6dda8556a8c3bf93e7493b9b59b68344f6 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Wed, 7 
Apr 2021 14:35:10 +0200 Subject: [PATCH 187/258] unify partitioning function --- GPUSort/src/quicksort/cudaPartition.cuh | 77 ++++++++----------------- GPUSort/src/quicksort/quicksort.cuh | 36 +++--------- 2 files changed, 31 insertions(+), 82 deletions(-) diff --git a/GPUSort/src/quicksort/cudaPartition.cuh b/GPUSort/src/quicksort/cudaPartition.cuh index 16fb8c5cc..e5d61f728 100644 --- a/GPUSort/src/quicksort/cudaPartition.cuh +++ b/GPUSort/src/quicksort/cudaPartition.cuh @@ -129,18 +129,14 @@ void copyData(ArrayView src, const Value data = src[i]; if (data < pivot) { - /* if(smallerStart >= dst.getSize() || smallerStart < 0) - printf("failed here: b:%d t:%d: tried to write into [%d]/%d\n", blockDim.x, threadIdx.x, smallerStart, dst.getSize()); - */ + printf("failed smaller: b:%d t:%d: tried to write into [%d]/%d\n", blockDim.x, threadIdx.x, smallerStart, dst.getSize()); dst[smallerStart++] = data; } else if (data > pivot) { - if(biggerStart >= dst.getSize() || biggerStart < 0) - printf("failed here: b:%d t:%d: tried to write into [%d]/%d\n", blockDim.x, threadIdx.x, biggerStart, dst.getSize()); - + printf("failed bigger: b:%d t:%d: tried to write into [%d]/%d\n", blockDim.x, threadIdx.x, biggerStart, dst.getSize()); dst[biggerStart++] = data; } } @@ -148,8 +144,8 @@ void copyData(ArrayView src, //---------------------------------------------------------------------------------- -template -__device__ void cudaPartition_1(ArrayView src, +template +__device__ void cudaPartition(ArrayView src, ArrayView dst, Value * sharedMem, const Function &Cmp, const Value & pivot, @@ -157,7 +153,6 @@ __device__ void cudaPartition_1(ArrayView src, ) { static __shared__ int smallerStart, biggerStart; - static __shared__ int smallerTotal, biggerTotal; int myBegin = elemPerBlock * (blockIdx.x - task.firstBlock); int myEnd = TNL::min(myBegin + elemPerBlock, src.getSize()); @@ -168,7 +163,7 @@ __device__ void cudaPartition_1(ArrayView src, int smaller = 0, bigger = 0; 
countElem(srcView, smaller, bigger, pivot); - + int smallerPrefSumInc = blockInclusivePrefixSum(smaller); int biggerPrefSumInc = blockInclusivePrefixSum(bigger); @@ -176,54 +171,30 @@ __device__ void cudaPartition_1(ArrayView src, { smallerStart = atomicAdd(&(task.dstBegin), smallerPrefSumInc); biggerStart = atomicAdd(&(task.dstEnd), -biggerPrefSumInc) - biggerPrefSumInc; - smallerTotal = smallerPrefSumInc; - biggerTotal = biggerPrefSumInc; } __syncthreads(); //----------------------------------------------------------- + if(useShared) + { + static __shared__ int smallerTotal, biggerTotal; + if (threadIdx.x == blockDim.x - 1) + { + smallerTotal = smallerPrefSumInc; + biggerTotal = biggerPrefSumInc; + } + __syncthreads(); - copyDataShared(srcView, dst, sharedMem, - smallerStart, biggerStart, - smallerTotal, biggerTotal, - smallerPrefSumInc - smaller, biggerPrefSumInc - bigger, //exclusive prefix sum of elements - pivot); -} - -//------------------------------------------------------------------ - -template -__device__ void cudaPartition_2(ArrayView src, - ArrayView dst, - const Function &Cmp, const Value & pivot, - int elemPerBlock, TASK & task - ) -{ - static __shared__ int smallerStart, biggerStart; - - int myBegin = elemPerBlock * (blockIdx.x - task.firstBlock); - int myEnd = TNL::min(myBegin + elemPerBlock, src.getSize()); - - auto srcView = src.getView(myBegin, myEnd); - - //------------------------------------------------------------------------- - - int smaller = 0, bigger = 0; - countElem(srcView, smaller, bigger, pivot); - - int smallerPrefSumInc = blockInclusivePrefixSum(smaller); - int biggerPrefSumInc = blockInclusivePrefixSum(bigger); - - if (threadIdx.x == blockDim.x - 1) //last thread in block has sum of all values + copyDataShared(srcView, dst, sharedMem, + smallerStart, biggerStart, + smallerTotal, biggerTotal, + smallerPrefSumInc - smaller, biggerPrefSumInc - bigger, //exclusive prefix sum of elements + pivot); + } + else { - smallerStart = 
atomicAdd(&(task.dstBegin), smallerPrefSumInc); - biggerStart = atomicAdd(&(task.dstEnd), -biggerPrefSumInc) - biggerPrefSumInc; + int destSmaller = smallerStart + smallerPrefSumInc - smaller; + int destBigger = biggerStart + biggerPrefSumInc - bigger; + copyData(srcView, dst, destSmaller, destBigger, pivot); } - __syncthreads(); - - //----------------------------------------------------------- - - int destSmaller = smallerStart + smallerPrefSumInc - smaller; - int destBigger = biggerStart + biggerPrefSumInc - bigger; - copyData(srcView, dst, destSmaller, destBigger, pivot); } \ No newline at end of file diff --git a/GPUSort/src/quicksort/quicksort.cuh b/GPUSort/src/quicksort/quicksort.cuh index f1886a84c..96a1622fc 100644 --- a/GPUSort/src/quicksort/quicksort.cuh +++ b/GPUSort/src/quicksort/quicksort.cuh @@ -66,8 +66,8 @@ __device__ void writeNewTask(int begin, int end, int depth, int maxElemFor2ndPha //---------------------------------------------------- -template -__global__ void cudaQuickSort1stPhase_1(ArrayView arr, ArrayView aux, +template +__global__ void cudaQuickSort1stPhase(ArrayView arr, ArrayView aux, const Function &Cmp, int elemPerBlock, ArrayView tasks, ArrayView taskMapping) @@ -85,35 +85,13 @@ __global__ void cudaQuickSort1stPhase_1(ArrayView arr, Arr pivot = src[myTask.pivotIdx]; __syncthreads(); - cudaPartition_1( + cudaPartition( src.getView(myTask.partitionBegin, myTask.partitionEnd), dst.getView(myTask.partitionBegin, myTask.partitionEnd), sharedMem, Cmp, pivot, elemPerBlock, myTask); } -template -__global__ void cudaQuickSort1stPhase_2(ArrayView arr, ArrayView aux, - const Function &Cmp, int elemPerBlock, - ArrayView tasks, - ArrayView taskMapping) -{ - static __shared__ Value pivot; - - TASK &myTask = tasks[taskMapping[blockIdx.x]]; - auto &src = (myTask.depth & 1) == 0 ? arr : aux; - auto &dst = (myTask.depth & 1) == 0 ? 
aux : arr; - - if (threadIdx.x == 0) - pivot = src[myTask.pivotIdx]; - __syncthreads(); - - cudaPartition_2( - src.getView(myTask.partitionBegin, myTask.partitionEnd), - dst.getView(myTask.partitionBegin, myTask.partitionEnd), - Cmp, pivot, elemPerBlock, myTask); -} - //---------------------------------------------------- template @@ -358,7 +336,7 @@ void QUICKSORT::firstPhase(const Function &Cmp) auto &task = iteration % 2 == 0 ? cuda_tasks : cuda_newTasks; int externMemByteSize = elemPerBlock * sizeof(Value); - + /** * check if can partition using shared memory for coalesced read and write * 1st phase of partitioning @@ -369,15 +347,15 @@ void QUICKSORT::firstPhase(const Function &Cmp) * */ if (externMemByteSize <= maxSharable) { - cudaQuickSort1stPhase_1 + cudaQuickSort1stPhase <<>>( arr, aux, Cmp, elemPerBlock, task, cuda_blockToTaskMapping); } else { - cudaQuickSort1stPhase_2 - <<>>( + cudaQuickSort1stPhase + <<>>( arr, aux, Cmp, elemPerBlock, task, cuda_blockToTaskMapping); } -- GitLab From ac4815b0c7fd0183b24f12fe99bc233e3884f20d Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Wed, 7 Apr 2021 15:14:37 +0200 Subject: [PATCH 188/258] fix tasks cnt bug --- GPUSort/src/quicksort/quicksort.cuh | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/GPUSort/src/quicksort/quicksort.cuh b/GPUSort/src/quicksort/quicksort.cuh index 96a1622fc..063d9becc 100644 --- a/GPUSort/src/quicksort/quicksort.cuh +++ b/GPUSort/src/quicksort/quicksort.cuh @@ -394,10 +394,11 @@ void QUICKSORT::secondPhase(const Function &Cmp) if (host_1stPhaseTasksAmount > 0 && host_2ndPhaseTasksAmount > 0) { + auto tasks = leftoverTasks.getView(0, host_1stPhaseTasksAmount); auto tasks2 = cuda_2ndPhaseTasks.getView(0, host_2ndPhaseTasksAmount); cudaQuickSort2ndPhase - <<>>(arr, aux, Cmp, leftoverTasks, tasks2); + <<>>(arr, aux, Cmp, tasks, tasks2); } else if (host_1stPhaseTasksAmount > 0) { @@ -483,8 +484,8 @@ int QUICKSORT::initTasks(int elemPerBlock, const Function &Cmp) 
template void QUICKSORT::processNewTasks() { - host_1stPhaseTasksAmount = cuda_newTasksAmount.getElement(0); - host_2ndPhaseTasksAmount = cuda_2ndPhaseTasksAmount.getElement(0); + host_1stPhaseTasksAmount = min(cuda_newTasksAmount.getElement(0), maxTasks); + host_2ndPhaseTasksAmount = min(cuda_2ndPhaseTasksAmount.getElement(0), maxTasks); } //----------------------------------------------------------- -- GitLab From cc506dc3c99953ee5880cec645139a66a7f24883 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Wed, 7 Apr 2021 15:40:48 +0200 Subject: [PATCH 189/258] use shared mem in 1block sort too --- GPUSort/src/quicksort/cudaPartition.cuh | 10 +++- GPUSort/src/quicksort/quicksort.cuh | 52 ++++++++++++++++++--- GPUSort/src/quicksort/quicksort_1Block.cuh | 54 ++++++++++++++++------ 3 files changed, 94 insertions(+), 22 deletions(-) diff --git a/GPUSort/src/quicksort/cudaPartition.cuh b/GPUSort/src/quicksort/cudaPartition.cuh index e5d61f728..0381d2299 100644 --- a/GPUSort/src/quicksort/cudaPartition.cuh +++ b/GPUSort/src/quicksort/cudaPartition.cuh @@ -82,8 +82,10 @@ void countElem(ArrayView arr, for (int i = threadIdx.x; i < arr.getSize(); i += blockDim.x) { const Value data = arr[i]; - smaller += (data < pivot); - bigger += (data > pivot); + if(data < pivot) + smaller++; + else if(data > pivot) + bigger++; } } @@ -129,14 +131,18 @@ void copyData(ArrayView src, const Value data = src[i]; if (data < pivot) { + /* if(smallerStart >= dst.getSize() || smallerStart < 0) printf("failed smaller: b:%d t:%d: tried to write into [%d]/%d\n", blockDim.x, threadIdx.x, smallerStart, dst.getSize()); + */ dst[smallerStart++] = data; } else if (data > pivot) { + /* if(biggerStart >= dst.getSize() || biggerStart < 0) printf("failed bigger: b:%d t:%d: tried to write into [%d]/%d\n", blockDim.x, threadIdx.x, biggerStart, dst.getSize()); + */ dst[biggerStart++] = data; } } diff --git a/GPUSort/src/quicksort/quicksort.cuh b/GPUSort/src/quicksort/quicksort.cuh index 
063d9becc..faca01882 100644 --- a/GPUSort/src/quicksort/quicksort.cuh +++ b/GPUSort/src/quicksort/quicksort.cuh @@ -149,8 +149,12 @@ __global__ void cudaWritePivot(ArrayView arr, ArrayView __global__ void cudaQuickSort2ndPhase(ArrayView arr, ArrayView aux, const Function &Cmp, - ArrayView secondPhaseTasks) + ArrayView secondPhaseTasks, + int elemInShared) { + extern __shared__ int externMem[]; + Value * sharedMem = (Value *) externMem; + TASK &myTask = secondPhaseTasks[blockIdx.x]; if (myTask.partitionEnd - myTask.partitionBegin <= 0) return; @@ -158,15 +162,29 @@ __global__ void cudaQuickSort2ndPhase(ArrayView arr, Array auto arrView = arr.getView(myTask.partitionBegin, myTask.partitionEnd); auto auxView = aux.getView(myTask.partitionBegin, myTask.partitionEnd); - singleBlockQuickSort(arrView, auxView, Cmp, myTask.depth); + if(elemInShared == 0) + { + singleBlockQuickSort + (arrView, auxView, Cmp, myTask.depth, sharedMem, elemInShared); + } + else + { + singleBlockQuickSort + (arrView, auxView, Cmp, myTask.depth, sharedMem, elemInShared); + } + } template __global__ void cudaQuickSort2ndPhase(ArrayView arr, ArrayView aux, const Function &Cmp, ArrayView secondPhaseTasks1, - ArrayView secondPhaseTasks2) + ArrayView secondPhaseTasks2, + int elemInShared) { + extern __shared__ int externMem[]; + Value * sharedMem = (Value *) externMem; + TASK myTask; if (blockIdx.x < secondPhaseTasks1.getSize()) myTask = secondPhaseTasks1[blockIdx.x]; @@ -174,12 +192,24 @@ __global__ void cudaQuickSort2ndPhase(ArrayView arr, Array myTask = secondPhaseTasks2[blockIdx.x - secondPhaseTasks1.getSize()]; if (myTask.partitionEnd - myTask.partitionBegin <= 0) + { + printf("empty task???\n"); return; + } auto arrView = arr.getView(myTask.partitionBegin, myTask.partitionEnd); auto auxView = aux.getView(myTask.partitionBegin, myTask.partitionEnd); - singleBlockQuickSort(arrView, auxView, Cmp, myTask.depth); + if(elemInShared == 0) + { + singleBlockQuickSort + (arrView, auxView, Cmp, 
myTask.depth, sharedMem, elemInShared); + } + else + { + singleBlockQuickSort + (arrView, auxView, Cmp, myTask.depth, sharedMem, elemInShared); + } } //----------------------------------------------------------- @@ -392,26 +422,34 @@ void QUICKSORT::secondPhase(const Function &Cmp) const int stackSize = 32; auto &leftoverTasks = iteration % 2 == 0 ? cuda_tasks : cuda_newTasks; + int elemInShared = desiredElemPerBlock; + int externSharedByteSize = sizeof(Value) * elemInShared; + if(externSharedByteSize > maxSharable) + { + externSharedByteSize = 0; + elemInShared = 0; + } + if (host_1stPhaseTasksAmount > 0 && host_2ndPhaseTasksAmount > 0) { auto tasks = leftoverTasks.getView(0, host_1stPhaseTasksAmount); auto tasks2 = cuda_2ndPhaseTasks.getView(0, host_2ndPhaseTasksAmount); cudaQuickSort2ndPhase - <<>>(arr, aux, Cmp, tasks, tasks2); + <<>>(arr, aux, Cmp, tasks, tasks2, elemInShared); } else if (host_1stPhaseTasksAmount > 0) { auto tasks = leftoverTasks.getView(0, host_1stPhaseTasksAmount); cudaQuickSort2ndPhase - <<>>(arr, aux, Cmp, tasks); + <<>>(arr, aux, Cmp, tasks, elemInShared); } else { auto tasks2 = cuda_2ndPhaseTasks.getView(0, host_2ndPhaseTasksAmount); cudaQuickSort2ndPhase - <<>>(arr, aux, Cmp, tasks2); + <<>>(arr, aux, Cmp, tasks2, elemInShared); } } diff --git a/GPUSort/src/quicksort/quicksort_1Block.cuh b/GPUSort/src/quicksort/quicksort_1Block.cuh index f28509c66..0067daaf8 100644 --- a/GPUSort/src/quicksort/quicksort_1Block.cuh +++ b/GPUSort/src/quicksort/quicksort_1Block.cuh @@ -9,12 +9,12 @@ using namespace TNL; using namespace TNL::Containers; -template +template __device__ void externSort(ArrayView src, ArrayView dst, + Value * sharedMem, const Function & Cmp) { - static __shared__ Value sharedMem[externMemSize]; bitonicSort_Block(src, dst, sharedMem, Cmp); } @@ -71,15 +71,16 @@ __device__ void stackPush(int stackArrBegin[], int stackArrEnd[], } } -template +template __device__ void singleBlockQuickSort(ArrayView arr, ArrayView aux, - const 
Function & Cmp, int _depth) + const Function & Cmp, int _depth, + Value * sharedMem, int memSize) { if(arr.getSize() <= blockDim.x*2) { auto src = (_depth &1) == 0? arr : aux; - externSort(src, arr, Cmp); + externSort(src, arr, sharedMem, Cmp); return; } @@ -117,7 +118,7 @@ __device__ void singleBlockQuickSort(ArrayView arr, //small enough for for bitonic if(size <= blockDim.x*2) { - externSort(src.getView(begin, end), arr.getView(begin, end), Cmp); + externSort(src.getView(begin, end), arr.getView(begin, end), sharedMem, Cmp); __syncthreads(); continue; } @@ -132,21 +133,48 @@ __device__ void singleBlockQuickSort(ArrayView arr, countElem(src.getView(begin, end), smaller, bigger, pivot); //synchronization is in this function already - int smallerOffset = blockInclusivePrefixSum(smaller); - int biggerOffset = blockInclusivePrefixSum(bigger); + int smallerPrefSumInc = blockInclusivePrefixSum(smaller); + int biggerPrefSumInc = blockInclusivePrefixSum(bigger); if (threadIdx.x == blockDim.x - 1) //has sum of all smaller and greater elements than pivot in src { - pivotBegin = 0 + smallerOffset; - pivotEnd = size - biggerOffset; + pivotBegin = 0 + smallerPrefSumInc; + pivotEnd = size - biggerPrefSumInc; } __syncthreads(); - int destSmaller = 0 + (smallerOffset - smaller); - int destBigger = pivotEnd + (biggerOffset - bigger); + //-------------------------------------------------------------- + /** + * move elements, either use shared mem for coalesced access or without shared mem if data is too big + * */ + auto &dst = (depth&1) == 0 ? 
aux : arr; - copyData(src.getView(begin, end), dst.getView(begin, end), destSmaller, destBigger, pivot); + if(useShared && size <= memSize) + { + static __shared__ int smallerTotal, biggerTotal; + if (threadIdx.x == blockDim.x - 1) + { + smallerTotal = smallerPrefSumInc; + biggerTotal = biggerPrefSumInc; + } + __syncthreads(); + + copyDataShared(src.getView(begin, end), dst.getView(begin, end), + sharedMem, + 0, pivotEnd, + smallerTotal, biggerTotal, + smallerPrefSumInc - smaller, biggerPrefSumInc - bigger, //exclusive prefix sum of elements + pivot); + } + else + { + int destSmaller = 0 + (smallerPrefSumInc - smaller); + int destBigger = pivotEnd + (biggerPrefSumInc - bigger); + + copyData(src.getView(begin, end), dst.getView(begin, end), destSmaller, destBigger, pivot); + } + __syncthreads(); for (int i = pivotBegin + threadIdx.x; i < pivotEnd; i += blockDim.x) -- GitLab From 898ed316ab984572efe576a88a910ad7cb51a595 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Wed, 7 Apr 2021 22:59:00 +0200 Subject: [PATCH 190/258] add cmp function --- GPUSort/src/quicksort/cudaPartition.cuh | 32 ++++++++++------------ GPUSort/src/quicksort/quicksort_1Block.cuh | 6 ++-- 2 files changed, 18 insertions(+), 20 deletions(-) diff --git a/GPUSort/src/quicksort/cudaPartition.cuh b/GPUSort/src/quicksort/cudaPartition.cuh index 0381d2299..fcca2c328 100644 --- a/GPUSort/src/quicksort/cudaPartition.cuh +++ b/GPUSort/src/quicksort/cudaPartition.cuh @@ -3,9 +3,6 @@ #include #include "../util/reduction.cuh" #include "task.h" -#include - -#define deb(x) std::cout << #x << " = " << x << std::endl; using namespace TNL; using namespace TNL::Containers; @@ -73,26 +70,26 @@ __device__ Value pickPivotIdx(TNL::Containers::ArrayView src, con } } -template +template __device__ -void countElem(ArrayView arr, +void countElem(ArrayView arr, const Function & Cmp, int &smaller, int &bigger, const Value &pivot) { for (int i = threadIdx.x; i < arr.getSize(); i += blockDim.x) { const Value data = 
arr[i]; - if(data < pivot) + if(Cmp(data, pivot)) smaller++; - else if(data > pivot) + else if(Cmp(pivot, data) ) bigger++; } } -template +template __device__ void copyDataShared(ArrayView src, - ArrayView dst, + ArrayView dst, const Function & Cmp, Value *sharedMem, int smallerStart, int biggerStart, int smallerTotal, int biggerTotal, @@ -103,9 +100,9 @@ void copyDataShared(ArrayView src, for (int i = threadIdx.x; i < src.getSize(); i += blockDim.x) { const Value data = src[i]; - if (data < pivot) + if (Cmp(data, pivot)) sharedMem[smallerOffset++] = data; - else if (data > pivot) + else if (Cmp(pivot, data)) sharedMem[smallerTotal + biggerOffset++] = data; } __syncthreads(); @@ -119,17 +116,18 @@ void copyDataShared(ArrayView src, } } -template +template __device__ void copyData(ArrayView src, ArrayView dst, + const Function & Cmp, int smallerStart, int biggerStart, const Value &pivot) { for (int i = threadIdx.x; i < src.getSize(); i += blockDim.x) { const Value data = src[i]; - if (data < pivot) + if ( Cmp(data, pivot) ) { /* if(smallerStart >= dst.getSize() || smallerStart < 0) @@ -137,7 +135,7 @@ void copyData(ArrayView src, */ dst[smallerStart++] = data; } - else if (data > pivot) + else if ( Cmp(pivot, data) ) { /* if(biggerStart >= dst.getSize() || biggerStart < 0) @@ -168,7 +166,7 @@ __device__ void cudaPartition(ArrayView src, //------------------------------------------------------------------------- int smaller = 0, bigger = 0; - countElem(srcView, smaller, bigger, pivot); + countElem(srcView, Cmp, smaller, bigger, pivot); int smallerPrefSumInc = blockInclusivePrefixSum(smaller); int biggerPrefSumInc = blockInclusivePrefixSum(bigger); @@ -191,7 +189,7 @@ __device__ void cudaPartition(ArrayView src, } __syncthreads(); - copyDataShared(srcView, dst, sharedMem, + copyDataShared(srcView, dst, Cmp, sharedMem, smallerStart, biggerStart, smallerTotal, biggerTotal, smallerPrefSumInc - smaller, biggerPrefSumInc - bigger, //exclusive prefix sum of elements @@ 
-201,6 +199,6 @@ __device__ void cudaPartition(ArrayView src, { int destSmaller = smallerStart + smallerPrefSumInc - smaller; int destBigger = biggerStart + biggerPrefSumInc - bigger; - copyData(srcView, dst, destSmaller, destBigger, pivot); + copyData(srcView, dst, Cmp, destSmaller, destBigger, pivot); } } \ No newline at end of file diff --git a/GPUSort/src/quicksort/quicksort_1Block.cuh b/GPUSort/src/quicksort/quicksort_1Block.cuh index 0067daaf8..6503ab3c8 100644 --- a/GPUSort/src/quicksort/quicksort_1Block.cuh +++ b/GPUSort/src/quicksort/quicksort_1Block.cuh @@ -130,7 +130,7 @@ __device__ void singleBlockQuickSort(ArrayView arr, __syncthreads(); int smaller = 0, bigger = 0; - countElem(src.getView(begin, end), smaller, bigger, pivot); + countElem(src.getView(begin, end), Cmp, smaller, bigger, pivot); //synchronization is in this function already int smallerPrefSumInc = blockInclusivePrefixSum(smaller); @@ -160,7 +160,7 @@ __device__ void singleBlockQuickSort(ArrayView arr, } __syncthreads(); - copyDataShared(src.getView(begin, end), dst.getView(begin, end), + copyDataShared(src.getView(begin, end), dst.getView(begin, end), Cmp, sharedMem, 0, pivotEnd, smallerTotal, biggerTotal, @@ -172,7 +172,7 @@ __device__ void singleBlockQuickSort(ArrayView arr, int destSmaller = 0 + (smallerPrefSumInc - smaller); int destBigger = pivotEnd + (biggerPrefSumInc - bigger); - copyData(src.getView(begin, end), dst.getView(begin, end), destSmaller, destBigger, pivot); + copyData(src.getView(begin, end), dst.getView(begin, end), Cmp, destSmaller, destBigger, pivot); } __syncthreads(); -- GitLab From c7b915a6c9cb0863e4edd9e3f3c7feb17d0a33c6 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Wed, 7 Apr 2021 23:10:02 +0200 Subject: [PATCH 191/258] file formating and switch Cmp position in function call --- GPUSort/src/quicksort/cudaPartition.cuh | 106 ++++++++++----------- GPUSort/src/quicksort/quicksort.cuh | 37 ++++--- GPUSort/src/quicksort/quicksort_1Block.cuh | 87 
+++++++++-------- 3 files changed, 111 insertions(+), 119 deletions(-) diff --git a/GPUSort/src/quicksort/cudaPartition.cuh b/GPUSort/src/quicksort/cudaPartition.cuh index fcca2c328..ebc28137e 100644 --- a/GPUSort/src/quicksort/cudaPartition.cuh +++ b/GPUSort/src/quicksort/cudaPartition.cuh @@ -8,93 +8,92 @@ using namespace TNL; using namespace TNL::Containers; template -__device__ Value pickPivot(TNL::Containers::ArrayView src, const Function & Cmp) +__device__ Value pickPivot(TNL::Containers::ArrayView src, const Function &Cmp) { //return src[0]; //return src[src.getSize()-1]; - if(src.getSize() ==1) + if (src.getSize() == 1) return src[0]; - - Value a = src[0], b = src[src.getSize()/2], c = src[src.getSize() - 1]; - if(Cmp(a, b)) // ..a..b.. + Value a = src[0], b = src[src.getSize() / 2], c = src[src.getSize() - 1]; + + if (Cmp(a, b)) // ..a..b.. { - if(Cmp(b, c))// ..a..b..c + if (Cmp(b, c)) // ..a..b..c return b; - else if(Cmp(c, a))//..c..a..b.. + else if (Cmp(c, a)) //..c..a..b.. return a; else //..a..c..b.. return c; } else //..b..a.. { - if(Cmp(a, c))//..b..a..c + if (Cmp(a, c)) //..b..a..c return a; - else if(Cmp(c, b))//..c..b..a.. + else if (Cmp(c, b)) //..c..b..a.. return b; else //..b..c..a.. return c; } - } template -__device__ Value pickPivotIdx(TNL::Containers::ArrayView src, const Function & Cmp) +__device__ Value pickPivotIdx(TNL::Containers::ArrayView src, const Function &Cmp) { //return 0; //return src.getSize()-1; - if(src.getSize() <= 1) + if (src.getSize() <= 1) return 0; - - Value a = src[0], b = src[src.getSize()/2], c = src[src.getSize() - 1]; - if(Cmp(a, b)) // ..a..b.. + Value a = src[0], b = src[src.getSize() / 2], c = src[src.getSize() - 1]; + + if (Cmp(a, b)) // ..a..b.. { - if(Cmp(b, c))// ..a..b..c - return src.getSize()/2; - else if(Cmp(c, a))//..c..a..b.. + if (Cmp(b, c)) // ..a..b..c + return src.getSize() / 2; + else if (Cmp(c, a)) //..c..a..b.. return 0; else //..a..c..b.. return src.getSize() - 1; } else //..b..a.. 
{ - if(Cmp(a, c))//..b..a..c + if (Cmp(a, c)) //..b..a..c return 0; - else if(Cmp(c, b))//..c..b..a.. - return src.getSize()/2; + else if (Cmp(c, b)) //..c..b..a.. + return src.getSize() / 2; else //..b..c..a.. return src.getSize() - 1; } } template -__device__ -void countElem(ArrayView arr, const Function & Cmp, - int &smaller, int &bigger, - const Value &pivot) +__device__ void countElem(ArrayView arr, + const Function &Cmp, + int &smaller, int &bigger, + const Value &pivot) { for (int i = threadIdx.x; i < arr.getSize(); i += blockDim.x) { const Value data = arr[i]; - if(Cmp(data, pivot)) + if (Cmp(data, pivot)) smaller++; - else if(Cmp(pivot, data) ) + else if (Cmp(pivot, data)) bigger++; } } template -__device__ -void copyDataShared(ArrayView src, - ArrayView dst, const Function & Cmp, - Value *sharedMem, - int smallerStart, int biggerStart, - int smallerTotal, int biggerTotal, - int smallerOffset, int biggerOffset, //exclusive prefix sum of elements - const Value &pivot) +__device__ void copyDataShared(ArrayView src, + ArrayView dst, + const Function &Cmp, + Value *sharedMem, + int smallerStart, int biggerStart, + int smallerTotal, int biggerTotal, + int smallerOffset, int biggerOffset, //exclusive prefix sum of elements + const Value &pivot) { for (int i = threadIdx.x; i < src.getSize(); i += blockDim.x) @@ -109,7 +108,7 @@ void copyDataShared(ArrayView src, for (int i = threadIdx.x; i < smallerTotal + biggerTotal; i += blockDim.x) { - if(i < smallerTotal) + if (i < smallerTotal) dst[smallerStart + i] = sharedMem[i]; else dst[biggerStart + i - smallerTotal] = sharedMem[i]; @@ -117,17 +116,16 @@ void copyDataShared(ArrayView src, } template -__device__ -void copyData(ArrayView src, - ArrayView dst, - const Function & Cmp, - int smallerStart, int biggerStart, - const Value &pivot) +__device__ void copyData(ArrayView src, + ArrayView dst, + const Function &Cmp, + int smallerStart, int biggerStart, + const Value &pivot) { for (int i = threadIdx.x; i < 
src.getSize(); i += blockDim.x) { const Value data = src[i]; - if ( Cmp(data, pivot) ) + if (Cmp(data, pivot)) { /* if(smallerStart >= dst.getSize() || smallerStart < 0) @@ -135,7 +133,7 @@ void copyData(ArrayView src, */ dst[smallerStart++] = data; } - else if ( Cmp(pivot, data) ) + else if (Cmp(pivot, data)) { /* if(biggerStart >= dst.getSize() || biggerStart < 0) @@ -151,10 +149,10 @@ void copyData(ArrayView src, template __device__ void cudaPartition(ArrayView src, ArrayView dst, - Value * sharedMem, - const Function &Cmp, const Value & pivot, - int elemPerBlock, TASK & task - ) + const Function &Cmp, + Value *sharedMem, + const Value &pivot, + int elemPerBlock, TASK &task) { static __shared__ int smallerStart, biggerStart; @@ -167,7 +165,7 @@ __device__ void cudaPartition(ArrayView src, int smaller = 0, bigger = 0; countElem(srcView, Cmp, smaller, bigger, pivot); - + int smallerPrefSumInc = blockInclusivePrefixSum(smaller); int biggerPrefSumInc = blockInclusivePrefixSum(bigger); @@ -179,7 +177,7 @@ __device__ void cudaPartition(ArrayView src, __syncthreads(); //----------------------------------------------------------- - if(useShared) + if (useShared) { static __shared__ int smallerTotal, biggerTotal; if (threadIdx.x == blockDim.x - 1) @@ -190,10 +188,10 @@ __device__ void cudaPartition(ArrayView src, __syncthreads(); copyDataShared(srcView, dst, Cmp, sharedMem, - smallerStart, biggerStart, - smallerTotal, biggerTotal, - smallerPrefSumInc - smaller, biggerPrefSumInc - bigger, //exclusive prefix sum of elements - pivot); + smallerStart, biggerStart, + smallerTotal, biggerTotal, + smallerPrefSumInc - smaller, biggerPrefSumInc - bigger, //exclusive prefix sum of elements + pivot); } else { diff --git a/GPUSort/src/quicksort/quicksort.cuh b/GPUSort/src/quicksort/quicksort.cuh index faca01882..22dd4cbd2 100644 --- a/GPUSort/src/quicksort/quicksort.cuh +++ b/GPUSort/src/quicksort/quicksort.cuh @@ -68,9 +68,9 @@ __device__ void writeNewTask(int begin, int end, int 
depth, int maxElemFor2ndPha template __global__ void cudaQuickSort1stPhase(ArrayView arr, ArrayView aux, - const Function &Cmp, int elemPerBlock, - ArrayView tasks, - ArrayView taskMapping) + const Function &Cmp, int elemPerBlock, + ArrayView tasks, + ArrayView taskMapping) { extern __shared__ int externMem[]; Value *sharedMem = (Value *)externMem; @@ -88,8 +88,8 @@ __global__ void cudaQuickSort1stPhase(ArrayView arr, Array cudaPartition( src.getView(myTask.partitionBegin, myTask.partitionEnd), dst.getView(myTask.partitionBegin, myTask.partitionEnd), - sharedMem, - Cmp, pivot, elemPerBlock, myTask); + Cmp, sharedMem, pivot, + elemPerBlock, myTask); } //---------------------------------------------------- @@ -153,7 +153,7 @@ __global__ void cudaQuickSort2ndPhase(ArrayView arr, Array int elemInShared) { extern __shared__ int externMem[]; - Value * sharedMem = (Value *) externMem; + Value *sharedMem = (Value *)externMem; TASK &myTask = secondPhaseTasks[blockIdx.x]; if (myTask.partitionEnd - myTask.partitionBegin <= 0) @@ -162,17 +162,14 @@ __global__ void cudaQuickSort2ndPhase(ArrayView arr, Array auto arrView = arr.getView(myTask.partitionBegin, myTask.partitionEnd); auto auxView = aux.getView(myTask.partitionBegin, myTask.partitionEnd); - if(elemInShared == 0) + if (elemInShared == 0) { - singleBlockQuickSort - (arrView, auxView, Cmp, myTask.depth, sharedMem, elemInShared); + singleBlockQuickSort(arrView, auxView, Cmp, myTask.depth, sharedMem, elemInShared); } else { - singleBlockQuickSort - (arrView, auxView, Cmp, myTask.depth, sharedMem, elemInShared); + singleBlockQuickSort(arrView, auxView, Cmp, myTask.depth, sharedMem, elemInShared); } - } template @@ -183,7 +180,7 @@ __global__ void cudaQuickSort2ndPhase(ArrayView arr, Array int elemInShared) { extern __shared__ int externMem[]; - Value * sharedMem = (Value *) externMem; + Value *sharedMem = (Value *)externMem; TASK myTask; if (blockIdx.x < secondPhaseTasks1.getSize()) @@ -200,15 +197,13 @@ __global__ void 
cudaQuickSort2ndPhase(ArrayView arr, Array auto arrView = arr.getView(myTask.partitionBegin, myTask.partitionEnd); auto auxView = aux.getView(myTask.partitionBegin, myTask.partitionEnd); - if(elemInShared == 0) + if (elemInShared == 0) { - singleBlockQuickSort - (arrView, auxView, Cmp, myTask.depth, sharedMem, elemInShared); + singleBlockQuickSort(arrView, auxView, Cmp, myTask.depth, sharedMem, elemInShared); } else { - singleBlockQuickSort - (arrView, auxView, Cmp, myTask.depth, sharedMem, elemInShared); + singleBlockQuickSort(arrView, auxView, Cmp, myTask.depth, sharedMem, elemInShared); } } @@ -366,7 +361,7 @@ void QUICKSORT::firstPhase(const Function &Cmp) auto &task = iteration % 2 == 0 ? cuda_tasks : cuda_newTasks; int externMemByteSize = elemPerBlock * sizeof(Value); - + /** * check if can partition using shared memory for coalesced read and write * 1st phase of partitioning @@ -424,7 +419,7 @@ void QUICKSORT::secondPhase(const Function &Cmp) int elemInShared = desiredElemPerBlock; int externSharedByteSize = sizeof(Value) * elemInShared; - if(externSharedByteSize > maxSharable) + if (externSharedByteSize > maxSharable) { externSharedByteSize = 0; elemInShared = 0; @@ -563,7 +558,7 @@ void quicksort(ArrayView arr, const Function &Cmp) assert(blockDim * multiplier * sizeof(Value) <= maxSharable); - QUICKSORT sorter(arr, maxBlocks, blockDim, multiplier*blockDim, maxSharable); + QUICKSORT sorter(arr, maxBlocks, blockDim, multiplier * blockDim, maxSharable); sorter.sort(Cmp); } diff --git a/GPUSort/src/quicksort/quicksort_1Block.cuh b/GPUSort/src/quicksort/quicksort_1Block.cuh index 6503ab3c8..e63e4e006 100644 --- a/GPUSort/src/quicksort/quicksort_1Block.cuh +++ b/GPUSort/src/quicksort/quicksort_1Block.cuh @@ -11,35 +11,34 @@ using namespace TNL::Containers; template __device__ void externSort(ArrayView src, - ArrayView dst, - Value * sharedMem, - const Function & Cmp) + ArrayView dst, + const Function &Cmp, Value *sharedMem) { bitonicSort_Block(src, dst, 
sharedMem, Cmp); } -template +template __device__ void stackPush(int stackArrBegin[], int stackArrEnd[], - int stackDepth[], int & stackTop, - int begin, int pivotBegin, - int pivotEnd, int end, - int depth) + int stackDepth[], int &stackTop, + int begin, int pivotBegin, + int pivotEnd, int end, + int depth) { int sizeL = pivotBegin - begin, sizeR = end - pivotEnd; - + //push the bigger one 1st and then smaller one 2nd //in next iteration, the smaller part will be handled 1st - if(sizeL > sizeR) + if (sizeL > sizeR) { - if(sizeL > 0) //left from pivot are smaller elems + if (sizeL > 0) //left from pivot are smaller elems { stackArrBegin[stackTop] = begin; stackArrEnd[stackTop] = pivotBegin; stackDepth[stackTop] = depth + 1; stackTop++; } - - if(sizeR > 0) //right from pivot until end are elem greater than pivot + + if (sizeR > 0) //right from pivot until end are elem greater than pivot { assert(stackTop < stackSize && "Local quicksort stack overflow."); @@ -51,7 +50,7 @@ __device__ void stackPush(int stackArrBegin[], int stackArrEnd[], } else { - if(sizeR > 0) //right from pivot until end are elem greater than pivot + if (sizeR > 0) //right from pivot until end are elem greater than pivot { stackArrBegin[stackTop] = pivotEnd; stackArrEnd[stackTop] = end; @@ -59,7 +58,7 @@ __device__ void stackPush(int stackArrBegin[], int stackArrEnd[], stackTop++; } - if(sizeL > 0) //left from pivot are smaller elems + if (sizeL > 0) //left from pivot are smaller elems { assert(stackTop < stackSize && "Local quicksort stack overflow."); @@ -73,14 +72,14 @@ __device__ void stackPush(int stackArrBegin[], int stackArrEnd[], template __device__ void singleBlockQuickSort(ArrayView arr, - ArrayView aux, - const Function & Cmp, int _depth, - Value * sharedMem, int memSize) + ArrayView aux, + const Function &Cmp, int _depth, + Value *sharedMem, int memSize) { - if(arr.getSize() <= blockDim.x*2) + if (arr.getSize() <= blockDim.x * 2) { - auto src = (_depth &1) == 0? 
arr : aux; - externSort(src, arr, sharedMem, Cmp); + auto src = (_depth & 1) == 0 ? arr : aux; + externSort(src, arr, Cmp, sharedMem); return; } @@ -100,33 +99,33 @@ __device__ void singleBlockQuickSort(ArrayView arr, } __syncthreads(); - while(stackTop > 0) + while (stackTop > 0) { //pick up partition to break up if (threadIdx.x == 0) { - begin = stackArrBegin[stackTop-1]; - end = stackArrEnd[stackTop-1]; - depth = stackDepth[stackTop-1]; + begin = stackArrBegin[stackTop - 1]; + end = stackArrEnd[stackTop - 1]; + depth = stackDepth[stackTop - 1]; stackTop--; } __syncthreads(); int size = end - begin; - auto &src = (depth&1) == 0 ? arr : aux; + auto &src = (depth & 1) == 0 ? arr : aux; //small enough for for bitonic - if(size <= blockDim.x*2) + if (size <= blockDim.x * 2) { - externSort(src.getView(begin, end), arr.getView(begin, end), sharedMem, Cmp); + externSort(src.getView(begin, end), arr.getView(begin, end), Cmp, sharedMem); __syncthreads(); continue; } //------------------------------------------------------ //actually do partitioning from here on out - if(threadIdx.x == 0) - pivot = pickPivot(src.getView(begin, end),Cmp); + if (threadIdx.x == 0) + pivot = pickPivot(src.getView(begin, end), Cmp); __syncthreads(); int smaller = 0, bigger = 0; @@ -148,9 +147,9 @@ __device__ void singleBlockQuickSort(ArrayView arr, * move elements, either use shared mem for coalesced access or without shared mem if data is too big * */ - auto &dst = (depth&1) == 0 ? aux : arr; + auto &dst = (depth & 1) == 0 ? 
aux : arr; - if(useShared && size <= memSize) + if (useShared && size <= memSize) { static __shared__ int smallerTotal, biggerTotal; if (threadIdx.x == blockDim.x - 1) @@ -160,17 +159,17 @@ __device__ void singleBlockQuickSort(ArrayView arr, } __syncthreads(); - copyDataShared(src.getView(begin, end), dst.getView(begin, end), Cmp, - sharedMem, - 0, pivotEnd, - smallerTotal, biggerTotal, - smallerPrefSumInc - smaller, biggerPrefSumInc - bigger, //exclusive prefix sum of elements - pivot); + copyDataShared(src.getView(begin, end), dst.getView(begin, end), + Cmp, sharedMem, + 0, pivotEnd, + smallerTotal, biggerTotal, + smallerPrefSumInc - smaller, biggerPrefSumInc - bigger, //exclusive prefix sum of elements + pivot); } else { int destSmaller = 0 + (smallerPrefSumInc - smaller); - int destBigger = pivotEnd + (biggerPrefSumInc - bigger); + int destBigger = pivotEnd + (biggerPrefSumInc - bigger); copyData(src.getView(begin, end), dst.getView(begin, end), Cmp, destSmaller, destBigger, pivot); } @@ -181,13 +180,13 @@ __device__ void singleBlockQuickSort(ArrayView arr, arr[begin + i] = pivot; //creates new tasks - if(threadIdx.x == 0) + if (threadIdx.x == 0) { stackPush(stackArrBegin, stackArrEnd, stackDepth, stackTop, - begin, begin+ pivotBegin, - begin +pivotEnd, end, - depth); + begin, begin + pivotBegin, + begin + pivotEnd, end, + depth); } __syncthreads(); //sync to update stackTop - } //ends while loop + } //ends while loop } \ No newline at end of file -- GitLab From 0d4157b11716bbac75b8da6ddb9663377e8942ff Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Wed, 7 Apr 2021 23:56:08 +0200 Subject: [PATCH 192/258] support for structure sorting --- GPUSort/src/quicksort/cudaPartition.cuh | 2 +- GPUSort/src/quicksort/quicksort.cuh | 15 ++--------- GPUSort/src/quicksort/quicksort_1Block.cuh | 25 +++++++++++++------ .../tests/quicksort_unitTests/unitTests.cu | 22 ++++++++++++++++ 4 files changed, 43 insertions(+), 21 deletions(-) diff --git 
a/GPUSort/src/quicksort/cudaPartition.cuh b/GPUSort/src/quicksort/cudaPartition.cuh index ebc28137e..93f4e5fe3 100644 --- a/GPUSort/src/quicksort/cudaPartition.cuh +++ b/GPUSort/src/quicksort/cudaPartition.cuh @@ -39,7 +39,7 @@ __device__ Value pickPivot(TNL::Containers::ArrayView src, const } template -__device__ Value pickPivotIdx(TNL::Containers::ArrayView src, const Function &Cmp) +__device__ int pickPivotIdx(TNL::Containers::ArrayView src, const Function &Cmp) { //return 0; //return src.getSize()-1; diff --git a/GPUSort/src/quicksort/quicksort.cuh b/GPUSort/src/quicksort/quicksort.cuh index 22dd4cbd2..1c4e4a3dc 100644 --- a/GPUSort/src/quicksort/quicksort.cuh +++ b/GPUSort/src/quicksort/quicksort.cuh @@ -75,15 +75,12 @@ __global__ void cudaQuickSort1stPhase(ArrayView arr, Array extern __shared__ int externMem[]; Value *sharedMem = (Value *)externMem; - static __shared__ Value pivot; TASK &myTask = tasks[taskMapping[blockIdx.x]]; auto &src = (myTask.depth & 1) == 0 ? arr : aux; auto &dst = (myTask.depth & 1) == 0 ? aux : arr; - if (threadIdx.x == 0) - pivot = src[myTask.pivotIdx]; - __syncthreads(); + Value pivot = src[myTask.pivotIdx]; cudaPartition( src.getView(myTask.partitionBegin, myTask.partitionEnd), @@ -99,17 +96,9 @@ __global__ void cudaWritePivot(ArrayView arr, ArrayView tasks, ArrayView newTasks, int *newTasksCnt, ArrayView secondPhaseTasks, int *secondPhaseTasksCnt) { - static __shared__ Value pivot; TASK &myTask = tasks[blockIdx.x]; - if (threadIdx.x == 0) - { - if ((myTask.depth & 1) == 0) - pivot = arr[myTask.pivotIdx]; - else - pivot = aux[myTask.pivotIdx]; - } - __syncthreads(); + Value pivot = (myTask.depth & 1) == 0 ? 
arr[myTask.pivotIdx] : aux[myTask.pivotIdx]; int leftBegin = myTask.partitionBegin, leftEnd = myTask.partitionBegin + myTask.dstBegin; int rightBegin = myTask.partitionBegin + myTask.dstEnd, rightEnd = myTask.partitionEnd; diff --git a/GPUSort/src/quicksort/quicksort_1Block.cuh b/GPUSort/src/quicksort/quicksort_1Block.cuh index e63e4e006..45967366c 100644 --- a/GPUSort/src/quicksort/quicksort_1Block.cuh +++ b/GPUSort/src/quicksort/quicksort_1Block.cuh @@ -17,6 +17,14 @@ __device__ void externSort(ArrayView src, bitonicSort_Block(src, dst, sharedMem, Cmp); } +template +__device__ void externSort(ArrayView src, + ArrayView dst, + const Function &Cmp) +{ + bitonicSort_Block(src, dst, Cmp); +} + template __device__ void stackPush(int stackArrBegin[], int stackArrEnd[], int stackDepth[], int &stackTop, @@ -79,7 +87,11 @@ __device__ void singleBlockQuickSort(ArrayView arr, if (arr.getSize() <= blockDim.x * 2) { auto src = (_depth & 1) == 0 ? arr : aux; - externSort(src, arr, Cmp, sharedMem); + if(useShared) + externSort(src, arr, Cmp, sharedMem); + else + externSort(src, arr, Cmp); + return; } @@ -87,7 +99,6 @@ __device__ void singleBlockQuickSort(ArrayView arr, static __shared__ int stackArrBegin[stackSize], stackArrEnd[stackSize], stackDepth[stackSize]; static __shared__ int begin, end, depth; static __shared__ int pivotBegin, pivotEnd; - static __shared__ Value pivot; if (threadIdx.x == 0) { @@ -117,16 +128,16 @@ __device__ void singleBlockQuickSort(ArrayView arr, //small enough for for bitonic if (size <= blockDim.x * 2) { - externSort(src.getView(begin, end), arr.getView(begin, end), Cmp, sharedMem); + if(useShared) + externSort(src.getView(begin, end), arr.getView(begin, end), Cmp, sharedMem); + else + externSort(src.getView(begin, end), arr.getView(begin, end), Cmp); __syncthreads(); continue; } //------------------------------------------------------ - //actually do partitioning from here on out - if (threadIdx.x == 0) - pivot = pickPivot(src.getView(begin, end), 
Cmp); - __syncthreads(); + Value pivot = pickPivot(src.getView(begin, end), Cmp); int smaller = 0, bigger = 0; countElem(src.getView(begin, end), Cmp, smaller, bigger, pivot); diff --git a/GPUSort/tests/quicksort_unitTests/unitTests.cu b/GPUSort/tests/quicksort_unitTests/unitTests.cu index e87493add..dc3518e49 100644 --- a/GPUSort/tests/quicksort_unitTests/unitTests.cu +++ b/GPUSort/tests/quicksort_unitTests/unitTests.cu @@ -141,6 +141,28 @@ TEST(types, type_double) ASSERT_TRUE(view == cudaArr2.getView()); } +struct TMPSTRUCT{ + uint8_t m_data[16]; + + __cuda_callable__ TMPSTRUCT(){m_data[0] = 0;} + __cuda_callable__ TMPSTRUCT(int first){m_data[0] = first;}; + __cuda_callable__ bool operator <(const TMPSTRUCT& other) const { return m_data[0] < other.m_data[0];} +}; + + +TEST(types, struct) +{ + std::srand(8451); + + int size = (1<<13); + std::vector arr(size); + for(auto & x : arr) x = TMPSTRUCT(std::rand()); + + TNL::Containers::Array cudaArr(arr); + auto view = cudaArr.getView(); + quicksort(view); +} + //---------------------------------------------------------------------------------- int main(int argc, char **argv) -- GitLab From 2094e943bad5d95a0b124907a7611518db19a272 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Wed, 7 Apr 2021 23:56:55 +0200 Subject: [PATCH 193/258] sort without shared memory --- GPUSort/src/bitonicSort/bitonicSort.h | 32 +++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/GPUSort/src/bitonicSort/bitonicSort.h b/GPUSort/src/bitonicSort/bitonicSort.h index a066ef5d8..9cff095db 100644 --- a/GPUSort/src/bitonicSort/bitonicSort.h +++ b/GPUSort/src/bitonicSort/bitonicSort.h @@ -178,6 +178,38 @@ void bitonicSort_Block(TNL::Containers::ArrayView src dst[copy2] = sharedMem[copy2]; } } + +template +__device__ +void bitonicSort_Block(TNL::Containers::ArrayView src, + TNL::Containers::ArrayView dst, + const Function & Cmp) +{ + int i = threadIdx.x; + int paddedSize = closestPow2(src.getSize()); + + for (int 
monotonicSeqLen = 2; monotonicSeqLen <= paddedSize; monotonicSeqLen *= 2) + { + //calculate the direction of swapping + int monotonicSeqIdx = i / (monotonicSeqLen/2); + bool ascending = (monotonicSeqIdx & 1) != 0; + if ((monotonicSeqIdx + 1) * monotonicSeqLen >= src.getSize()) //special case for parts with no "partner" + ascending = true; + + for (int len = monotonicSeqLen; len > 1; len /= 2) + { + //calculates which 2 indexes will be compared and swap + int part = i / (len / 2); + int s = part * len + (i & ((len / 2) - 1)); + int e = s + len / 2; + + if(e < src.getSize()) //not touching virtual padding + cmpSwap(src[s], src[e], ascending, Cmp); + __syncthreads(); + } + } +} + /** * very similar to bitonicMergeSharedMemory * does bitonicMergeSharedMemory but afterwards increases monotoncSeqLen -- GitLab From b0fe360620fb10103c6f46d3d738b3526b9cce49 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Thu, 8 Apr 2021 00:10:31 +0200 Subject: [PATCH 194/258] versions that dont use shared memory --- GPUSort/src/bitonicSort/bitonicSort.h | 80 ++++++++++++++++++++++++--- 1 file changed, 73 insertions(+), 7 deletions(-) diff --git a/GPUSort/src/bitonicSort/bitonicSort.h b/GPUSort/src/bitonicSort/bitonicSort.h index 9cff095db..793fb2b70 100644 --- a/GPUSort/src/bitonicSort/bitonicSort.h +++ b/GPUSort/src/bitonicSort/bitonicSort.h @@ -120,6 +120,44 @@ void bitonicMergeSharedMemory(TNL::Containers::ArrayView +__global__ +void bitonicMerge(TNL::Containers::ArrayView arr, + const Function & Cmp, + int monotonicSeqLen, int len, int partsInSeq) +{ + //1st index and last index of subarray that this threadBlock should merge + int myBlockStart = blockIdx.x * (2*blockDim.x); + int myBlockEnd = TNL::min(arr.getSize(), myBlockStart+(2*blockDim.x)); + + auto src = arr.getView(myBlockStart, myBlockEnd); + + //calculate the direction of swapping + int i = blockIdx.x * blockDim.x + threadIdx.x; + int part = i / (len / 2); + int monotonicSeqIdx = part / partsInSeq; + + bool ascending = 
(monotonicSeqIdx & 1) != 0; + //special case for parts with no "partner" + if ((monotonicSeqIdx + 1) * monotonicSeqLen >= arr.getSize()) + ascending = true; + //------------------------------------------ + + //do bitonic merge + for (; len > 1; len /= 2) + { + //calculates which 2 indexes will be compared and swap + int part = threadIdx.x / (len / 2); + int s = part * len + (threadIdx.x & ((len /2) - 1)); + int e = s + len / 2; + + if(e < myBlockEnd - myBlockStart) //not touching virtual padding + cmpSwap(src[s], src[e], ascending, Cmp); + __syncthreads(); + } +} + //--------------------------------------------- template @@ -232,6 +270,19 @@ __global__ void bitoniSort1stStepSharedMemory(TNL::Containers::ArrayView +__global__ void bitoniSort1stStep(TNL::Containers::ArrayView arr, const Function & Cmp) +{ + int myBlockStart = blockIdx.x * (2*blockDim.x); + int myBlockEnd = TNL::min(arr.getSize(), myBlockStart + (2*blockDim.x)); + + if(blockIdx.x%2 || blockIdx.x + 1 == gridDim.x) + bitonicSort_Block(arr.getView(myBlockStart, myBlockEnd), arr.getView(myBlockStart, myBlockEnd), Cmp); + else + bitonicSort_Block(arr.getView(myBlockStart, myBlockEnd), arr.getView(myBlockStart, myBlockEnd), + [&] __cuda_callable__ (const Value&a, const Value&b){return Cmp(b, a);} + ); +} //--------------------------------------------- template @@ -246,27 +297,42 @@ void bitonicSort(TNL::Containers::ArrayView src, int int threadPerBlock = maxThreadsPerBlock; int blocks = threadsNeeded / threadPerBlock + (threadsNeeded % threadPerBlock != 0); - const int sharedMemLen = threadPerBlock * 2; - const int sharedMemSize = sharedMemLen* sizeof(Value); + int sharedMemLen = threadPerBlock * 2; + int sharedMemSize = sharedMemLen* sizeof(Value); //--------------------------------------------------------------------------------- - bitoniSort1stStepSharedMemory<<>>(arr, Cmp); + cudaDeviceProp deviceProp; + cudaGetDeviceProperties(&deviceProp, 0); + + 
//--------------------------------------------------------------------------------- + + if(sharedMemSize <= deviceProp.sharedMemPerBlock) + bitoniSort1stStepSharedMemory<<>>(arr, Cmp); + else + bitoniSort1stStep<<>>(arr, Cmp); for (int monotonicSeqLen = 2*sharedMemLen; monotonicSeqLen <= paddedSize; monotonicSeqLen *= 2) { for (int len = monotonicSeqLen, partsInSeq = 1; len > 1; len /= 2, partsInSeq *= 2) { - if(len > sharedMemLen) + if(len > sharedMemLen ) { bitonicMergeGlobal<<>>( arr, Cmp, monotonicSeqLen, len, partsInSeq); } else { - - bitonicMergeSharedMemory<<>>( - arr, Cmp, monotonicSeqLen, len, partsInSeq); + if(sharedMemSize <= deviceProp.sharedMemPerBlock) + { + bitonicMergeSharedMemory<<>>( + arr, Cmp, monotonicSeqLen, len, partsInSeq); + } + else + { + bitonicMerge<<>>( + arr, Cmp, monotonicSeqLen, len, partsInSeq); + } break; } } -- GitLab From f94d17441e80dbae5918873533c5f92ff86b983a Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Thu, 8 Apr 2021 00:28:51 +0200 Subject: [PATCH 195/258] reformating and adding function description --- GPUSort/src/bitonicSort/bitonicSort.h | 228 ++++++++++++++------------ 1 file changed, 124 insertions(+), 104 deletions(-) diff --git a/GPUSort/src/bitonicSort/bitonicSort.h b/GPUSort/src/bitonicSort/bitonicSort.h index 793fb2b70..24f3628a5 100644 --- a/GPUSort/src/bitonicSort/bitonicSort.h +++ b/GPUSort/src/bitonicSort/bitonicSort.h @@ -16,26 +16,29 @@ __host__ __device__ int closestPow2(int x) } template -__host__ __device__ void cmpSwap(Value & a, Value &b, bool ascending, const Function & Cmp) +__host__ __device__ void cmpSwap(Value &a, Value &b, bool ascending, const Function &Cmp) { - if( (ascending == Cmp(b, a))) + if (ascending == Cmp(b, a)) TNL::swap(a, b); } + //--------------------------------------------- + /** * this kernel simulates 1 exchange + * splits input arr that is bitonic into 2 bitonic sequences */ template __global__ void bitonicMergeGlobal(TNL::Containers::ArrayView arr, - const Function & 
Cmp, - int monotonicSeqLen, int len, int partsInSeq) + const Function &Cmp, + int monotonicSeqLen, int len, int partsInSeq) { int i = blockIdx.x * blockDim.x + threadIdx.x; int part = i / (len / 2); //computes which sorting block this thread belongs to //the index of 2 elements that should be compared and swapped - int s = part * len + (i & ((len / 2) - 1) ); + int s = part * len + (i & ((len / 2) - 1)); int e = s + len / 2; if (e >= arr.getSize()) //arr[e] is virtual padding and will not be exchanged with return; @@ -48,39 +51,41 @@ __global__ void bitonicMergeGlobal(TNL::Containers::ArrayView -__global__ -void bitonicMergeSharedMemory(TNL::Containers::ArrayView arr, - const Function & Cmp, - int monotonicSeqLen, int len, int partsInSeq) +__global__ void bitonicMergeSharedMemory(TNL::Containers::ArrayView arr, + const Function &Cmp, + int monotonicSeqLen, int len, int partsInSeq) { extern __shared__ int externMem[]; - Value * sharedMem = (Value *)externMem; + Value *sharedMem = (Value *)externMem; - int sharedMemLen = 2*blockDim.x; + int sharedMemLen = 2 * blockDim.x; //1st index and last index of subarray that this threadBlock should merge int myBlockStart = blockIdx.x * sharedMemLen; - int myBlockEnd = TNL::min(arr.getSize(), myBlockStart+sharedMemLen); + int myBlockEnd = TNL::min(arr.getSize(), myBlockStart + sharedMemLen); //copy from globalMem into sharedMem int copy1 = myBlockStart + threadIdx.x; int copy2 = copy1 + blockDim.x; { - if(copy1 < myBlockEnd) + if (copy1 < myBlockEnd) sharedMem[threadIdx.x] = arr[copy1]; - if(copy2 < myBlockEnd) + if (copy2 < myBlockEnd) sharedMem[threadIdx.x + blockDim.x] = arr[copy2]; __syncthreads(); } - + //------------------------------------------ //bitonic activity { @@ -100,36 +105,40 @@ void bitonicMergeSharedMemory(TNL::Containers::ArrayView -__global__ -void bitonicMerge(TNL::Containers::ArrayView arr, - const Function & Cmp, - int monotonicSeqLen, int len, int partsInSeq) +__global__ void 
bitonicMerge(TNL::Containers::ArrayView arr, + const Function &Cmp, + int monotonicSeqLen, int len, int partsInSeq) { //1st index and last index of subarray that this threadBlock should merge - int myBlockStart = blockIdx.x * (2*blockDim.x); - int myBlockEnd = TNL::min(arr.getSize(), myBlockStart+(2*blockDim.x)); + int myBlockStart = blockIdx.x * (2 * blockDim.x); + int myBlockEnd = TNL::min(arr.getSize(), myBlockStart + (2 * blockDim.x)); auto src = arr.getView(myBlockStart, myBlockEnd); @@ -149,10 +158,10 @@ void bitonicMerge(TNL::Containers::ArrayView arr, { //calculates which 2 indexes will be compared and swap int part = threadIdx.x / (len / 2); - int s = part * len + (threadIdx.x & ((len /2) - 1)); + int s = part * len + (threadIdx.x & ((len / 2) - 1)); int e = s + len / 2; - if(e < myBlockEnd - myBlockStart) //not touching virtual padding + if (e < myBlockEnd - myBlockStart) //not touching virtual padding cmpSwap(src[s], src[e], ascending, Cmp); __syncthreads(); } @@ -160,25 +169,31 @@ void bitonicMerge(TNL::Containers::ArrayView arr, //--------------------------------------------- +/** + * IMPORTANT: all threads in block have to call this function to work properly + * IMPORTANT: input can be at max size of blockDim.x*2, bigger size will lead to part of input unsorted + * Description: sorts src and writes into dst within a block + * works independently from other concurrent blocks + * @param sharedMem sharedMem pointer has to be able to store blockDim.x*2 elements + * */ template -__device__ -void bitonicSort_Block(TNL::Containers::ArrayView src, - TNL::Containers::ArrayView dst, - Value* sharedMem, const Function & Cmp) +__device__ void bitonicSort_Block(TNL::Containers::ArrayView src, + TNL::Containers::ArrayView dst, + Value *sharedMem, const Function &Cmp) { //copy from globalMem into sharedMem int copy1 = threadIdx.x; int copy2 = copy1 + blockDim.x; { - if(copy1 < src.getSize()) + if (copy1 < src.getSize()) sharedMem[copy1] = src[copy1]; - if(copy2 < 
src.getSize()) + if (copy2 < src.getSize()) sharedMem[copy2] = src[copy2]; __syncthreads(); } - + //------------------------------------------ //bitonic activity { @@ -188,7 +203,7 @@ void bitonicSort_Block(TNL::Containers::ArrayView src for (int monotonicSeqLen = 2; monotonicSeqLen <= paddedSize; monotonicSeqLen *= 2) { //calculate the direction of swapping - int monotonicSeqIdx = i / (monotonicSeqLen/2); + int monotonicSeqIdx = i / (monotonicSeqLen / 2); bool ascending = (monotonicSeqIdx & 1) != 0; if ((monotonicSeqIdx + 1) * monotonicSeqLen >= src.getSize()) //special case for parts with no "partner" ascending = true; @@ -200,7 +215,7 @@ void bitonicSort_Block(TNL::Containers::ArrayView src int s = part * len + (i & ((len / 2) - 1)); int e = s + len / 2; - if(e < src.getSize()) //not touching virtual padding + if (e < src.getSize()) //not touching virtual padding cmpSwap(sharedMem[s], sharedMem[e], ascending, Cmp); __syncthreads(); } @@ -210,18 +225,25 @@ void bitonicSort_Block(TNL::Containers::ArrayView src //------------------------------------------ //writeback to global memory { - if(copy1 < src.getSize()) + if (copy1 < src.getSize()) dst[copy1] = sharedMem[copy1]; - if(copy2 < src.getSize()) + if (copy2 < src.getSize()) dst[copy2] = sharedMem[copy2]; } } + +/** + * IMPORTANT: all threads in block have to call this function to work properly + * IMPORTANT: input can be at max size of blockDim.x*2, bigger size will lead to part of input unsorted + * Description: sorts src and writes into dst within a block + * works independently from other concurrent blocks + * this version doesnt use shared memory and is prefered for Value with big size + * */ template -__device__ -void bitonicSort_Block(TNL::Containers::ArrayView src, - TNL::Containers::ArrayView dst, - const Function & Cmp) +__device__ void bitonicSort_Block(TNL::Containers::ArrayView src, + TNL::Containers::ArrayView dst, + const Function &Cmp) { int i = threadIdx.x; int paddedSize = 
closestPow2(src.getSize()); @@ -229,7 +251,7 @@ void bitonicSort_Block(TNL::Containers::ArrayView src for (int monotonicSeqLen = 2; monotonicSeqLen <= paddedSize; monotonicSeqLen *= 2) { //calculate the direction of swapping - int monotonicSeqIdx = i / (monotonicSeqLen/2); + int monotonicSeqIdx = i / (monotonicSeqLen / 2); bool ascending = (monotonicSeqIdx & 1) != 0; if ((monotonicSeqIdx + 1) * monotonicSeqLen >= src.getSize()) //special case for parts with no "partner" ascending = true; @@ -241,7 +263,7 @@ void bitonicSort_Block(TNL::Containers::ArrayView src int s = part * len + (i & ((len / 2) - 1)); int e = s + len / 2; - if(e < src.getSize()) //not touching virtual padding + if (e < src.getSize()) //not touching virtual padding cmpSwap(src[s], src[e], ascending, Cmp); __syncthreads(); } @@ -249,56 +271,58 @@ void bitonicSort_Block(TNL::Containers::ArrayView src } /** - * very similar to bitonicMergeSharedMemory - * does bitonicMergeSharedMemory but afterwards increases monotoncSeqLen - * then trickles down again - * this continues until whole sharedMem is sorted + * entrypoint for bitonicSort_Block + * sorts @param arr in alternating order to create bitonic sequences + * sharedMem has to be able to store at least blockDim.x*2 elements * */ template -__global__ void bitoniSort1stStepSharedMemory(TNL::Containers::ArrayView arr, const Function & Cmp) +__global__ void bitoniSort1stStepSharedMemory(TNL::Containers::ArrayView arr, const Function &Cmp) { extern __shared__ int externMem[]; - int sharedMemLen = 2*blockDim.x; + int sharedMemLen = 2 * blockDim.x; int myBlockStart = blockIdx.x * sharedMemLen; int myBlockEnd = TNL::min(arr.getSize(), myBlockStart + sharedMemLen); - if(blockIdx.x%2 || blockIdx.x + 1 == gridDim.x) - bitonicSort_Block(arr.getView(myBlockStart, myBlockEnd), arr.getView(myBlockStart, myBlockEnd), (Value*) externMem, Cmp); + if (blockIdx.x % 2 || blockIdx.x + 1 == gridDim.x) + bitonicSort_Block(arr.getView(myBlockStart, myBlockEnd), 
arr.getView(myBlockStart, myBlockEnd), (Value *)externMem, Cmp); else - bitonicSort_Block(arr.getView(myBlockStart, myBlockEnd), arr.getView(myBlockStart, myBlockEnd), (Value*) externMem, - [&] __cuda_callable__ (const Value&a, const Value&b){return Cmp(b, a);} - ); + bitonicSort_Block(arr.getView(myBlockStart, myBlockEnd), arr.getView(myBlockStart, myBlockEnd), (Value *)externMem, + [&] __cuda_callable__(const Value &a, const Value &b) { return Cmp(b, a); }); } +/** + * entrypoint for bitonicSort_Block + * sorts @param arr in alternating order to create bitonic sequences + * doesn't use shared memory + * */ template -__global__ void bitoniSort1stStep(TNL::Containers::ArrayView arr, const Function & Cmp) +__global__ void bitoniSort1stStep(TNL::Containers::ArrayView arr, const Function &Cmp) { - int myBlockStart = blockIdx.x * (2*blockDim.x); - int myBlockEnd = TNL::min(arr.getSize(), myBlockStart + (2*blockDim.x)); + int myBlockStart = blockIdx.x * (2 * blockDim.x); + int myBlockEnd = TNL::min(arr.getSize(), myBlockStart + (2 * blockDim.x)); - if(blockIdx.x%2 || blockIdx.x + 1 == gridDim.x) + if (blockIdx.x % 2 || blockIdx.x + 1 == gridDim.x) bitonicSort_Block(arr.getView(myBlockStart, myBlockEnd), arr.getView(myBlockStart, myBlockEnd), Cmp); else bitonicSort_Block(arr.getView(myBlockStart, myBlockEnd), arr.getView(myBlockStart, myBlockEnd), - [&] __cuda_callable__ (const Value&a, const Value&b){return Cmp(b, a);} - ); + [&] __cuda_callable__(const Value &a, const Value &b) { return Cmp(b, a); }); } //--------------------------------------------- template -void bitonicSort(TNL::Containers::ArrayView src, int begin, int end, const Function& Cmp) +void bitonicSort(TNL::Containers::ArrayView src, int begin, int end, const Function &Cmp) { TNL::Containers::ArrayView arr = src.getView(begin, end); int paddedSize = closestPow2(arr.getSize()); - int threadsNeeded = arr.getSize() / 2 + (arr.getSize() %2 !=0); + int threadsNeeded = arr.getSize() / 2 + (arr.getSize() % 2 != 
0); const int maxThreadsPerBlock = 512; int threadPerBlock = maxThreadsPerBlock; int blocks = threadsNeeded / threadPerBlock + (threadsNeeded % threadPerBlock != 0); int sharedMemLen = threadPerBlock * 2; - int sharedMemSize = sharedMemLen* sizeof(Value); + int sharedMemSize = sharedMemLen * sizeof(Value); //--------------------------------------------------------------------------------- @@ -307,23 +331,23 @@ void bitonicSort(TNL::Containers::ArrayView src, int //--------------------------------------------------------------------------------- - if(sharedMemSize <= deviceProp.sharedMemPerBlock) + if (sharedMemSize <= deviceProp.sharedMemPerBlock) bitoniSort1stStepSharedMemory<<>>(arr, Cmp); else bitoniSort1stStep<<>>(arr, Cmp); - - for (int monotonicSeqLen = 2*sharedMemLen; monotonicSeqLen <= paddedSize; monotonicSeqLen *= 2) + + for (int monotonicSeqLen = 2 * sharedMemLen; monotonicSeqLen <= paddedSize; monotonicSeqLen *= 2) { for (int len = monotonicSeqLen, partsInSeq = 1; len > 1; len /= 2, partsInSeq *= 2) { - if(len > sharedMemLen ) + if (len > sharedMemLen) { bitonicMergeGlobal<<>>( arr, Cmp, monotonicSeqLen, len, partsInSeq); } else { - if(sharedMemSize <= deviceProp.sharedMemPerBlock) + if (sharedMemSize <= deviceProp.sharedMemPerBlock) { bitonicMergeSharedMemory<<>>( arr, Cmp, monotonicSeqLen, len, partsInSeq); @@ -345,11 +369,11 @@ void bitonicSort(TNL::Containers::ArrayView src, int template void bitonicSort(TNL::Containers::ArrayView arr, int begin, int end) { - bitonicSort(arr, begin, end, [] __cuda_callable__ (const Value & a, const Value & b) {return a < b;}); + bitonicSort(arr, begin, end, [] __cuda_callable__(const Value &a, const Value &b) { return a < b; }); } template -void bitonicSort(TNL::Containers::ArrayView arr, const Function & Cmp) +void bitonicSort(TNL::Containers::ArrayView arr, const Function &Cmp) { bitonicSort(arr, 0, arr.getSize(), Cmp); } @@ -357,53 +381,53 @@ void bitonicSort(TNL::Containers::ArrayView arr, cons template void 
bitonicSort(TNL::Containers::ArrayView arr) { - bitonicSort(arr, [] __cuda_callable__ (const Value & a, const Value & b) {return a < b;}); + bitonicSort(arr, [] __cuda_callable__(const Value &a, const Value &b) { return a < b; }); } //--------------------------------------------- template -void bitonicSort(std::vector & vec, int begin, int end, const Function & Cmp) +void bitonicSort(std::vector &vec, int begin, int end, const Function &Cmp) { TNL::Containers::Array Arr(vec); auto view = Arr.getView(); bitonicSort(view, begin, end, Cmp); - TNL::Algorithms::MultiDeviceMemoryOperations:: - copy(vec.data(), view.getData(), view.getSize()); + TNL::Algorithms::MultiDeviceMemoryOperations:: + copy(vec.data(), view.getData(), view.getSize()); } template -void bitonicSort(std::vector & vec, int begin, int end) +void bitonicSort(std::vector &vec, int begin, int end) { - bitonicSort(vec, begin, end, [] __cuda_callable__ (const Value & a, const Value & b) {return a < b;}); + bitonicSort(vec, begin, end, [] __cuda_callable__(const Value &a, const Value &b) { return a < b; }); } template -void bitonicSort(std::vector & vec, const Function & Cmp) +void bitonicSort(std::vector &vec, const Function &Cmp) { bitonicSort(vec, 0, vec.size(), Cmp); } template -void bitonicSort(std::vector & vec) +void bitonicSort(std::vector &vec) { - bitonicSort(vec, [] __cuda_callable__ (const Value & a, const Value & b) {return a < b;}); + bitonicSort(vec, [] __cuda_callable__(const Value &a, const Value &b) { return a < b; }); } //--------------------------------------------- //--------------------------------------------- -template -__global__ void bitonicMergeGlobal(int size, FETCH Fetch, - const CMP & Cmp, SWAP Swap, - int monotonicSeqLen, int len, int partsInSeq) +template +__global__ void bitonicMergeGlobal(int size, FETCH Fetch, + const CMP &Cmp, SWAP Swap, + int monotonicSeqLen, int len, int partsInSeq) { int i = blockIdx.x * blockDim.x + threadIdx.x; int part = i / (len / 2); //computes 
which sorting block this thread belongs to //the index of 2 elements that should be compared and swapped - int s = part * len + (i & ((len / 2) - 1) ); + int s = part * len + (i & ((len / 2) - 1)); int e = s + len / 2; if (e >= size) //arr[e] is virtual padding and will not be exchanged with return; @@ -414,34 +438,30 @@ __global__ void bitonicMergeGlobal(int size, FETCH Fetch, if ((monotonicSeqIdx + 1) * monotonicSeqLen >= size) //special case for part with no "partner" to be merged with in next phase ascending = true; - if( ascending == Cmp(Fetch(e), Fetch(s))) + if (ascending == Cmp(Fetch(e), Fetch(s))) Swap(s, e); } - - -template -void bitonicSort(int begin, int end, FETCH Fetch, const CMP& Cmp, SWAP Swap) +template +void bitonicSort(int begin, int end, FETCH Fetch, const CMP &Cmp, SWAP Swap) { int size = end - begin; int paddedSize = closestPow2(size); - int threadsNeeded = size / 2 + (size %2 !=0); + int threadsNeeded = size / 2 + (size % 2 != 0); const int maxThreadsPerBlock = 512; int threadPerBlock = maxThreadsPerBlock; int blocks = threadsNeeded / threadPerBlock + (threadsNeeded % threadPerBlock != 0); - auto fetchWithOffset = - [=] __cuda_callable__(int i) - { + auto fetchWithOffset = + [=] __cuda_callable__(int i) { return Fetch(i + begin); }; - - auto swapWithOffset = - [=] __cuda_callable__(int i, int j) mutable - { - Swap(i+begin, j+begin); + + auto swapWithOffset = + [=] __cuda_callable__(int i, int j) mutable { + Swap(i + begin, j + begin); }; for (int monotonicSeqLen = 2; monotonicSeqLen <= paddedSize; monotonicSeqLen *= 2) -- GitLab From 151674279c906349d34b8835f2a55152c751f8cc Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Fri, 9 Apr 2021 01:16:41 +0200 Subject: [PATCH 196/258] refactoring --- GPUSort/src/quicksort/cudaPartition.cuh | 14 +- GPUSort/src/quicksort/quicksort.cuh | 387 ++++++--------------- GPUSort/src/quicksort/quicksort_1Block.cuh | 119 ++++--- GPUSort/src/quicksort/quicksort_kernel.cuh | 248 +++++++++++++ 4 files changed, 
437 insertions(+), 331 deletions(-) create mode 100644 GPUSort/src/quicksort/quicksort_kernel.cuh diff --git a/GPUSort/src/quicksort/cudaPartition.cuh b/GPUSort/src/quicksort/cudaPartition.cuh index 93f4e5fe3..e6b9ad0b8 100644 --- a/GPUSort/src/quicksort/cudaPartition.cuh +++ b/GPUSort/src/quicksort/cudaPartition.cuh @@ -16,7 +16,7 @@ __device__ Value pickPivot(TNL::Containers::ArrayView src, const if (src.getSize() == 1) return src[0]; - Value a = src[0], b = src[src.getSize() / 2], c = src[src.getSize() - 1]; + const Value &a = src[0], &b = src[src.getSize() / 2], &c = src[src.getSize() - 1]; if (Cmp(a, b)) // ..a..b.. { @@ -47,7 +47,7 @@ __device__ int pickPivotIdx(TNL::Containers::ArrayView src, const if (src.getSize() <= 1) return 0; - Value a = src[0], b = src[src.getSize() / 2], c = src[src.getSize() - 1]; + const Value &a = src[0], &b = src[src.getSize() / 2], &c = src[src.getSize() - 1]; if (Cmp(a, b)) // ..a..b.. { @@ -69,6 +69,8 @@ __device__ int pickPivotIdx(TNL::Containers::ArrayView src, const } } +//----------------------------------------------------------- + template __device__ void countElem(ArrayView arr, const Function &Cmp, @@ -77,7 +79,7 @@ __device__ void countElem(ArrayView arr, { for (int i = threadIdx.x; i < arr.getSize(); i += blockDim.x) { - const Value data = arr[i]; + const Value &data = arr[i]; if (Cmp(data, pivot)) smaller++; else if (Cmp(pivot, data)) @@ -85,6 +87,8 @@ __device__ void countElem(ArrayView arr, } } +//----------------------------------------------------------- + template __device__ void copyDataShared(ArrayView src, ArrayView dst, @@ -98,7 +102,7 @@ __device__ void copyDataShared(ArrayView src, for (int i = threadIdx.x; i < src.getSize(); i += blockDim.x) { - const Value data = src[i]; + const Value &data = src[i]; if (Cmp(data, pivot)) sharedMem[smallerOffset++] = data; else if (Cmp(pivot, data)) @@ -124,7 +128,7 @@ __device__ void copyData(ArrayView src, { for (int i = threadIdx.x; i < src.getSize(); i += 
blockDim.x) { - const Value data = src[i]; + const Value &data = src[i]; if (Cmp(data, pivot)) { /* diff --git a/GPUSort/src/quicksort/quicksort.cuh b/GPUSort/src/quicksort/quicksort.cuh index 1c4e4a3dc..62ae9a8f6 100644 --- a/GPUSort/src/quicksort/quicksort.cuh +++ b/GPUSort/src/quicksort/quicksort.cuh @@ -1,271 +1,58 @@ #pragma once #include -#include "../util/reduction.cuh" #include "task.h" -#include "cudaPartition.cuh" -#include "quicksort_1Block.cuh" -#include "../bitonicSort/bitonicSort.h" -#include +#include "quicksort_kernel.cuh" #include #include +#include #define deb(x) std::cout << #x << " = " << x << std::endl; using namespace TNL; using namespace TNL::Containers; -//----------------------------------------------------------- - -__device__ void writeNewTask(int begin, int end, int depth, int maxElemFor2ndPhase, - ArrayView newTasks, int *newTasksCnt, - ArrayView secondPhaseTasks, int *secondPhaseTasksCnt) -{ - int size = end - begin; - if (size < 0) - { - printf("negative size, something went really wrong\n"); - return; - } - - if (size == 0) - return; - - if (size <= maxElemFor2ndPhase) - { - int idx = atomicAdd(secondPhaseTasksCnt, 1); - if (idx < secondPhaseTasks.getSize()) - secondPhaseTasks[idx] = TASK(begin, end, depth + 1); - else - { - //printf("ran out of memory, trying backup\n"); - int idx = atomicAdd(newTasksCnt, 1); - if (idx < newTasks.getSize()) - newTasks[idx] = TASK(begin, end, depth + 1); - else - printf("ran out of memory for second phase task, there isnt even space in newTask list\nPart of array may stay unsorted!!!\n"); - } - } - else - { - int idx = atomicAdd(newTasksCnt, 1); - if (idx < newTasks.getSize()) - newTasks[idx] = TASK(begin, end, depth + 1); - else - { - //printf("ran out of memory, trying backup\n"); - int idx = atomicAdd(secondPhaseTasksCnt, 1); - if (idx < secondPhaseTasks.getSize()) - secondPhaseTasks[idx] = TASK(begin, end, depth + 1); - else - printf("ran out of memory for newtask, there isnt even space in second 
phase task list\nPart of array may stay unsorted!!!\n"); - } - } -} - -//---------------------------------------------------- - -template -__global__ void cudaQuickSort1stPhase(ArrayView arr, ArrayView aux, - const Function &Cmp, int elemPerBlock, - ArrayView tasks, - ArrayView taskMapping) -{ - extern __shared__ int externMem[]; - Value *sharedMem = (Value *)externMem; - - - TASK &myTask = tasks[taskMapping[blockIdx.x]]; - auto &src = (myTask.depth & 1) == 0 ? arr : aux; - auto &dst = (myTask.depth & 1) == 0 ? aux : arr; - - Value pivot = src[myTask.pivotIdx]; - - cudaPartition( - src.getView(myTask.partitionBegin, myTask.partitionEnd), - dst.getView(myTask.partitionBegin, myTask.partitionEnd), - Cmp, sharedMem, pivot, - elemPerBlock, myTask); -} - -//---------------------------------------------------- - template -__global__ void cudaWritePivot(ArrayView arr, ArrayView aux, int maxElemFor2ndPhase, - ArrayView tasks, ArrayView newTasks, int *newTasksCnt, - ArrayView secondPhaseTasks, int *secondPhaseTasksCnt) -{ - TASK &myTask = tasks[blockIdx.x]; - - Value pivot = (myTask.depth & 1) == 0 ? 
arr[myTask.pivotIdx] : aux[myTask.pivotIdx]; - - int leftBegin = myTask.partitionBegin, leftEnd = myTask.partitionBegin + myTask.dstBegin; - int rightBegin = myTask.partitionBegin + myTask.dstEnd, rightEnd = myTask.partitionEnd; - - for (int i = leftEnd + threadIdx.x; i < rightBegin; i += blockDim.x) - { - /* - #ifdef DEBUG - aux[i] = -1; - #endif - */ - arr[i] = pivot; - } - - if (threadIdx.x != 0) - return; - - if (leftEnd - leftBegin > 0) - { - writeNewTask(leftBegin, leftEnd, myTask.depth, - maxElemFor2ndPhase, - newTasks, newTasksCnt, - secondPhaseTasks, secondPhaseTasksCnt); - } - - if (rightEnd - rightBegin > 0) - { - writeNewTask(rightBegin, rightEnd, - myTask.depth, maxElemFor2ndPhase, - newTasks, newTasksCnt, - secondPhaseTasks, secondPhaseTasksCnt); - } -} - -//----------------------------------------------------------- - -template -__global__ void cudaQuickSort2ndPhase(ArrayView arr, ArrayView aux, - const Function &Cmp, - ArrayView secondPhaseTasks, - int elemInShared) -{ - extern __shared__ int externMem[]; - Value *sharedMem = (Value *)externMem; - - TASK &myTask = secondPhaseTasks[blockIdx.x]; - if (myTask.partitionEnd - myTask.partitionBegin <= 0) - return; - - auto arrView = arr.getView(myTask.partitionBegin, myTask.partitionEnd); - auto auxView = aux.getView(myTask.partitionBegin, myTask.partitionEnd); - - if (elemInShared == 0) - { - singleBlockQuickSort(arrView, auxView, Cmp, myTask.depth, sharedMem, elemInShared); - } - else - { - singleBlockQuickSort(arrView, auxView, Cmp, myTask.depth, sharedMem, elemInShared); - } -} - -template -__global__ void cudaQuickSort2ndPhase(ArrayView arr, ArrayView aux, - const Function &Cmp, - ArrayView secondPhaseTasks1, - ArrayView secondPhaseTasks2, - int elemInShared) -{ - extern __shared__ int externMem[]; - Value *sharedMem = (Value *)externMem; - - TASK myTask; - if (blockIdx.x < secondPhaseTasks1.getSize()) - myTask = secondPhaseTasks1[blockIdx.x]; - else - myTask = secondPhaseTasks2[blockIdx.x - 
secondPhaseTasks1.getSize()]; - - if (myTask.partitionEnd - myTask.partitionBegin <= 0) - { - printf("empty task???\n"); - return; - } - - auto arrView = arr.getView(myTask.partitionBegin, myTask.partitionEnd); - auto auxView = aux.getView(myTask.partitionBegin, myTask.partitionEnd); - - if (elemInShared == 0) - { - singleBlockQuickSort(arrView, auxView, Cmp, myTask.depth, sharedMem, elemInShared); - } - else - { - singleBlockQuickSort(arrView, auxView, Cmp, myTask.depth, sharedMem, elemInShared); - } -} - -//----------------------------------------------------------- - -__global__ void cudaCalcBlocksNeeded(ArrayView cuda_tasks, int elemPerBlock, - ArrayView blocksNeeded) -{ - int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= cuda_tasks.getSize()) - return; - - auto task = cuda_tasks[i]; - int size = task.partitionEnd - task.partitionBegin; - blocksNeeded[i] = size / elemPerBlock + (size % elemPerBlock != 0); -} - -template -__global__ void cudaInitTask(ArrayView cuda_tasks, - ArrayView cuda_blockToTaskMapping, - ArrayView cuda_reductionTaskInitMem, - ArrayView src, const Function &Cmp) +class QUICKSORT { - if (blockIdx.x >= cuda_tasks.getSize()) - return; + int maxBlocks, threadsPerBlock, desiredElemPerBlock, maxSharable; //kernel config - int start = blockIdx.x == 0 ? 
0 : cuda_reductionTaskInitMem[blockIdx.x - 1]; - int end = cuda_reductionTaskInitMem[blockIdx.x]; - for (int i = start + threadIdx.x; i < end; i += blockDim.x) - cuda_blockToTaskMapping[i] = blockIdx.x; + //-------------------------------------- - if (threadIdx.x == 0) - { - TASK &task = cuda_tasks[blockIdx.x]; - int pivotIdx = task.partitionBegin + pickPivotIdx(src.getView(task.partitionBegin, task.partitionEnd), Cmp); - task.initTask(start, end - start, pivotIdx); - } -} -//----------------------------------------------------------- -//----------------------------------------------------------- - -template -class QUICKSORT -{ - ArrayView arr; - Array aux; + Array auxMem; + ArrayView arr, aux; - int maxBlocks, threadsPerBlock, desiredElemPerBlock, maxSharable; + //-------------------------------------- const int maxBitonicSize = threadsPerBlock * 2; const int desired_2ndPhasElemPerBlock = maxBitonicSize; const int g_maxTasks = 1 << 14; - int maxTasks; - Array cuda_tasks, cuda_newTasks, cuda_2ndPhaseTasks; - Array cuda_newTasksAmount, cuda_2ndPhaseTasksAmount; //is in reality 1 integer each + //-------------------------------------- - int host_1stPhaseTasksAmount; //counter for Host == cuda_newTasksAmount - int host_2ndPhaseTasksAmount; // cuda_2ndPhaseTasksAmount + //cuda side task initialization and storing + Array cuda_tasks, cuda_newTasks, cuda_2ndPhaseTasks; //1 set of 2 rotating tasks and 2nd phase + Array cuda_newTasksAmount, cuda_2ndPhaseTasksAmount; //is in reality 1 integer each Array cuda_blockToTaskMapping; Array cuda_reductionTaskInitMem; + //-------------------------------------- + + int host_1stPhaseTasksAmount = 0, host_2ndPhaseTasksAmount = 0; int iteration = 0; + //-------------------------------------------------------------------------------------- //-------------------------------------------------------------------------------------- public: QUICKSORT(ArrayView arr, int gridDim, int blockDim, int desiredElemPerBlock, int maxSharable) - : 
arr(arr.getView()), aux(arr.getSize()), - maxBlocks(gridDim), threadsPerBlock(blockDim), + : maxBlocks(gridDim), threadsPerBlock(blockDim), desiredElemPerBlock(desiredElemPerBlock), maxSharable(maxSharable), + arr(arr.getView()), auxMem(arr.getSize()), aux(auxMem.getView()), + maxTasks(min(arr.getSize(), g_maxTasks)), cuda_tasks(maxTasks), cuda_newTasks(maxTasks), cuda_2ndPhaseTasks(maxTasks), @@ -274,37 +61,63 @@ public: cuda_blockToTaskMapping(maxBlocks), cuda_reductionTaskInitMem(maxTasks) { - cuda_tasks.setElement(0, TASK(0, arr.getSize(), 0)); - host_1stPhaseTasksAmount = 1; + if (arr.getSize() > desired_2ndPhasElemPerBlock) + { + cuda_tasks.setElement(0, TASK(0, arr.getSize(), 0)); + host_1stPhaseTasksAmount = 1; + } + else + { + cuda_2ndPhaseTasks.setElement(0, TASK(0, arr.getSize(), 0)); + host_2ndPhaseTasksAmount = 1; + } - host_2ndPhaseTasksAmount = 0; cuda_2ndPhaseTasksAmount = 0; - iteration = 0; - TNL_CHECK_CUDA_DEVICE; } + //-------------------------------------------------------------------------------------- template void sort(const Function &Cmp); - template - void firstPhase(const Function &Cmp); - - template - void secondPhase(const Function &Cmp); + //-------------------------------------------------------------------------------------- + /** + * returns how many blocks are needed to start sort phase 1 if @param elemPerBlock were to be used + * */ int getSetsNeeded(int elemPerBlock) const; + + /** + * returns the optimal amount of elements per thread needed for phase + * */ int getElemPerBlock() const; /** - * returns the amount of blocks needed + * returns the amount of blocks needed to start phase 1 while also initializing all tasks * */ template int initTasks(int elemPerBlock, const Function &Cmp); + /** + * does the 1st phase of quicksort until out of task memory or each task is small enough + * for correctness, secondphase method needs to be called to sort each subsequences + * */ + template + void firstPhase(const Function &Cmp); + + /** + 
* update necessary variables after 1 phase1 sort + * */ void processNewTasks(); + + /** + * sorts all leftover tasks + * */ + template + void secondPhase(const Function &Cmp); }; +//--------------------------------------------------------------------------------------------- //--------------------------------------------------------------------------------------------- template @@ -339,25 +152,29 @@ void QUICKSORT::firstPhase(const Function &Cmp) break; int elemPerBlock = getElemPerBlock(); + + /** + * initializes tasks so that each block knows which task to work on and which part of array to split + * also sets pivot needed for partitioning, this is why Cmp is needed + * */ int blocksCnt = initTasks(elemPerBlock, Cmp); TNL_CHECK_CUDA_DEVICE; - if (blocksCnt >= maxBlocks) //too many blocks needed, switch to 2nd phase + //not enough or too many blocks needed, switch to 2nd phase + if (blocksCnt <= 1 || blocksCnt > cuda_blockToTaskMapping.getSize()) break; //----------------------------------------------- //do the partitioning auto &task = iteration % 2 == 0 ? 
cuda_tasks : cuda_newTasks; - int externMemByteSize = elemPerBlock * sizeof(Value); + int externMemByteSize = elemPerBlock * sizeof(Value) + sizeof(Value); //elems + 1 for pivot /** - * check if can partition using shared memory for coalesced read and write - * 1st phase of partitioning - * sets of blocks work on a task + * check if partition procedure can use shared memory for coalesced write after reordering * - * using the atomicAdd intristic, each block reserves a chunk of memory where to move elements - * smaller and bigger than pivot move to + * move elements smaller than pivot to the left and bigger to the right + * note: pivot isnt inserted in the middle yet * */ if (externMemByteSize <= maxSharable) { @@ -369,7 +186,7 @@ void QUICKSORT::firstPhase(const Function &Cmp) else { cudaQuickSort1stPhase - <<>>( + <<>>( arr, aux, Cmp, elemPerBlock, task, cuda_blockToTaskMapping); } @@ -385,12 +202,14 @@ void QUICKSORT::firstPhase(const Function &Cmp) * */ auto &newTask = iteration % 2 == 0 ? cuda_newTasks : cuda_tasks; cudaWritePivot - <<>>( + <<>>( arr, aux, desired_2ndPhasElemPerBlock, task, newTask, cuda_newTasksAmount.getData(), cuda_2ndPhaseTasks, cuda_2ndPhaseTasksAmount.getData()); TNL_CHECK_CUDA_DEVICE; + //---------------------------------------- + processNewTasks(); iteration++; } @@ -407,10 +226,10 @@ void QUICKSORT::secondPhase(const Function &Cmp) auto &leftoverTasks = iteration % 2 == 0 ? 
cuda_tasks : cuda_newTasks; int elemInShared = desiredElemPerBlock; - int externSharedByteSize = sizeof(Value) * elemInShared; + int externSharedByteSize = elemInShared * sizeof(Value) + sizeof(Value); //reserve space for storing elements + 1 pivot if (externSharedByteSize > maxSharable) { - externSharedByteSize = 0; + externSharedByteSize = sizeof(Value); elemInShared = 0; } @@ -473,33 +292,38 @@ template template int QUICKSORT::initTasks(int elemPerBlock, const Function &Cmp) { - int threads = min(host_1stPhaseTasksAmount, threadsPerBlock); - int blocks = host_1stPhaseTasksAmount / threads + (host_1stPhaseTasksAmount % threads != 0); - auto src = iteration % 2 == 0 ? arr : aux.getView(); + auto &src = iteration % 2 == 0 ? arr : aux; auto &tasks = iteration % 2 == 0 ? cuda_tasks : cuda_newTasks; - //[i] == how many blocks task i needs - cudaCalcBlocksNeeded<<>>(tasks.getView(0, host_1stPhaseTasksAmount), - elemPerBlock, cuda_reductionTaskInitMem.getView(0, host_1stPhaseTasksAmount)); + //-------------------------------------------------------- + int blocks = host_1stPhaseTasksAmount / threadsPerBlock + (host_1stPhaseTasksAmount % threadsPerBlock != 0); + + cudaCalcBlocksNeeded<<>>(tasks.getView(0, host_1stPhaseTasksAmount), elemPerBlock, + cuda_reductionTaskInitMem.getView(0, host_1stPhaseTasksAmount)); + //cuda_reductionTaskInitMem[i] == how many blocks task i needs thrust::inclusive_scan(thrust::device, cuda_reductionTaskInitMem.getData(), cuda_reductionTaskInitMem.getData() + host_1stPhaseTasksAmount, cuda_reductionTaskInitMem.getData()); + //cuda_reductionTaskInitMem[i] == how many blocks task [0..i] need int blocksNeeded = cuda_reductionTaskInitMem.getElement(host_1stPhaseTasksAmount - 1); + //need too many blocks, give back control - if (blocksNeeded >= cuda_blockToTaskMapping.getSize()) + if (blocksNeeded > cuda_blockToTaskMapping.getSize()) return blocksNeeded; - cudaInitTask<<>>( - tasks.getView(0, host_1stPhaseTasksAmount), - 
cuda_blockToTaskMapping.getView(0, blocksNeeded), - cuda_reductionTaskInitMem.getView(0, host_1stPhaseTasksAmount), - src, Cmp); + //-------------------------------------------------------- - cuda_newTasksAmount.setElement(0, 0); + cudaInitTask<<>>( + tasks.getView(0, host_1stPhaseTasksAmount), //task to read from + cuda_blockToTaskMapping.getView(0, blocksNeeded), //maps block to a certain task + cuda_reductionTaskInitMem.getView(0, host_1stPhaseTasksAmount), //has how many each task need blocks precalculated + src, Cmp); //used to pick pivot + + cuda_newTasksAmount.setElement(0, 0); //resets new element counter return blocksNeeded; } @@ -517,29 +341,40 @@ void QUICKSORT::processNewTasks() template void quicksort(ArrayView arr, const Function &Cmp) { - const int maxBlocks = (1 << 20); - cudaDeviceProp deviceProp; cudaGetDeviceProperties(&deviceProp, 0); - int sharedReserve = sizeof(Value) + sizeof(int) * 16; //1pivot + 16 other shared vars reserved - int maxSharable = deviceProp.sharedMemPerBlock - sharedReserve; - //blockDim*multiplier*sizeof(Value) <= maxSharable + /** + * for every block there is a bit of shared memory reserved, the actual value can slightly differ + * */ + int sharedReserve = sizeof(int) * (16 + 3 * 32); + int maxSharable = deviceProp.sharedMemPerBlock - sharedReserve; int blockDim = 512; //best case - int elemPerBlock = maxSharable / sizeof(Value); + + /** + * the goal is to use shared memory as often as possible + * each thread in a block will process n elements, n==multiplier + * + 1 reserved for pivot (statically allocating Value type throws weird error, hence it needs to be dynamic) + * + * blockDim*multiplier*sizeof(Value) + 1*sizeof(Value) <= maxSharable + * */ + int elemPerBlock = (maxSharable - sizeof(Value)) / sizeof(Value); //try to use up all of shared memory to store elements + const int maxBlocks = (1 << 20); const int maxMultiplier = 8; int multiplier = min(elemPerBlock / blockDim, maxMultiplier); - if (multiplier <= 0) + + if 
(multiplier <= 0) //a block cant store 512 elements, sorting some really big data { - blockDim = 256; + blockDim = 256; //try to fit 256 elements multiplier = min(elemPerBlock / blockDim, maxMultiplier); + if (multiplier <= 0) { //worst case scenario, shared memory cant be utilized at all because of the sheer size of Value //sort has to be done with the use of global memory alone - QUICKSORT sorter(arr, maxBlocks, 512, 0, maxSharable); + QUICKSORT sorter(arr, maxBlocks, 512, 0, 0); sorter.sort(Cmp); return; } diff --git a/GPUSort/src/quicksort/quicksort_1Block.cuh b/GPUSort/src/quicksort/quicksort_1Block.cuh index 45967366c..9bbe49e8e 100644 --- a/GPUSort/src/quicksort/quicksort_1Block.cuh +++ b/GPUSort/src/quicksort/quicksort_1Block.cuh @@ -25,58 +25,16 @@ __device__ void externSort(ArrayView src, bitonicSort_Block(src, dst, Cmp); } +//--------------------------------------------------------------- + template __device__ void stackPush(int stackArrBegin[], int stackArrEnd[], int stackDepth[], int &stackTop, int begin, int pivotBegin, int pivotEnd, int end, - int depth) -{ - int sizeL = pivotBegin - begin, sizeR = end - pivotEnd; - - //push the bigger one 1st and then smaller one 2nd - //in next iteration, the smaller part will be handled 1st - if (sizeL > sizeR) - { - if (sizeL > 0) //left from pivot are smaller elems - { - stackArrBegin[stackTop] = begin; - stackArrEnd[stackTop] = pivotBegin; - stackDepth[stackTop] = depth + 1; - stackTop++; - } - - if (sizeR > 0) //right from pivot until end are elem greater than pivot - { - assert(stackTop < stackSize && "Local quicksort stack overflow."); - - stackArrBegin[stackTop] = pivotEnd; - stackArrEnd[stackTop] = end; - stackDepth[stackTop] = depth + 1; - stackTop++; - } - } - else - { - if (sizeR > 0) //right from pivot until end are elem greater than pivot - { - stackArrBegin[stackTop] = pivotEnd; - stackArrEnd[stackTop] = end; - stackDepth[stackTop] = depth + 1; - stackTop++; - } - - if (sizeL > 0) //left from pivot 
are smaller elems - { - assert(stackTop < stackSize && "Local quicksort stack overflow."); + int depth); - stackArrBegin[stackTop] = begin; - stackArrEnd[stackTop] = pivotBegin; - stackDepth[stackTop] = depth + 1; - stackTop++; - } - } -} +//--------------------------------------------------------------- template __device__ void singleBlockQuickSort(ArrayView arr, @@ -86,8 +44,8 @@ __device__ void singleBlockQuickSort(ArrayView arr, { if (arr.getSize() <= blockDim.x * 2) { - auto src = (_depth & 1) == 0 ? arr : aux; - if(useShared) + auto &src = (_depth & 1) == 0 ? arr : aux; + if (useShared && arr.getSize() <= memSize) externSort(src, arr, Cmp, sharedMem); else externSort(src, arr, Cmp); @@ -99,6 +57,8 @@ __device__ void singleBlockQuickSort(ArrayView arr, static __shared__ int stackArrBegin[stackSize], stackArrEnd[stackSize], stackDepth[stackSize]; static __shared__ int begin, end, depth; static __shared__ int pivotBegin, pivotEnd; + Value *piv = sharedMem; + sharedMem += 1; if (threadIdx.x == 0) { @@ -128,16 +88,20 @@ __device__ void singleBlockQuickSort(ArrayView arr, //small enough for for bitonic if (size <= blockDim.x * 2) { - if(useShared) + if (useShared && size <= memSize) externSort(src.getView(begin, end), arr.getView(begin, end), Cmp, sharedMem); else externSort(src.getView(begin, end), arr.getView(begin, end), Cmp); __syncthreads(); continue; } + //------------------------------------------------------ - Value pivot = pickPivot(src.getView(begin, end), Cmp); + if (threadIdx.x == 0) + *piv = pickPivot(src.getView(begin, end), Cmp); + __syncthreads(); + Value &pivot = *piv; int smaller = 0, bigger = 0; countElem(src.getView(begin, end), Cmp, smaller, bigger, pivot); @@ -200,4 +164,59 @@ __device__ void singleBlockQuickSort(ArrayView arr, } __syncthreads(); //sync to update stackTop } //ends while loop +} + +//-------------------------------------------------------------- + +template +__device__ void stackPush(int stackArrBegin[], int stackArrEnd[], + 
int stackDepth[], int &stackTop, + int begin, int pivotBegin, + int pivotEnd, int end, + int depth) +{ + int sizeL = pivotBegin - begin, sizeR = end - pivotEnd; + + //push the bigger one 1st and then smaller one 2nd + //in next iteration, the smaller part will be handled 1st + if (sizeL > sizeR) + { + if (sizeL > 0) //left from pivot are smaller elems + { + stackArrBegin[stackTop] = begin; + stackArrEnd[stackTop] = pivotBegin; + stackDepth[stackTop] = depth + 1; + stackTop++; + } + + if (sizeR > 0) //right from pivot until end are elem greater than pivot + { + assert(stackTop < stackSize && "Local quicksort stack overflow."); + + stackArrBegin[stackTop] = pivotEnd; + stackArrEnd[stackTop] = end; + stackDepth[stackTop] = depth + 1; + stackTop++; + } + } + else + { + if (sizeR > 0) //right from pivot until end are elem greater than pivot + { + stackArrBegin[stackTop] = pivotEnd; + stackArrEnd[stackTop] = end; + stackDepth[stackTop] = depth + 1; + stackTop++; + } + + if (sizeL > 0) //left from pivot are smaller elems + { + assert(stackTop < stackSize && "Local quicksort stack overflow."); + + stackArrBegin[stackTop] = begin; + stackArrEnd[stackTop] = pivotBegin; + stackDepth[stackTop] = depth + 1; + stackTop++; + } + } } \ No newline at end of file diff --git a/GPUSort/src/quicksort/quicksort_kernel.cuh b/GPUSort/src/quicksort/quicksort_kernel.cuh new file mode 100644 index 000000000..b9d0dd75b --- /dev/null +++ b/GPUSort/src/quicksort/quicksort_kernel.cuh @@ -0,0 +1,248 @@ +#pragma once + +#include +#include "../util/reduction.cuh" +#include "task.h" +#include "cudaPartition.cuh" +#include "quicksort_1Block.cuh" + +using namespace TNL; +using namespace TNL::Containers; + +//----------------------------------------------------------- + +__device__ void writeNewTask(int begin, int end, int depth, int maxElemFor2ndPhase, + ArrayView newTasks, int *newTasksCnt, + ArrayView secondPhaseTasks, int *secondPhaseTasksCnt); + 
+//----------------------------------------------------------- + +__global__ void cudaCalcBlocksNeeded(ArrayView cuda_tasks, int elemPerBlock, + ArrayView blocksNeeded) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= cuda_tasks.getSize()) + return; + + TASK &task = cuda_tasks[i]; + int size = task.partitionEnd - task.partitionBegin; + blocksNeeded[i] = size / elemPerBlock + (size % elemPerBlock != 0); +} + +//----------------------------------------------------------- + +template +__global__ void cudaInitTask(ArrayView cuda_tasks, + ArrayView cuda_blockToTaskMapping, + ArrayView cuda_reductionTaskInitMem, + ArrayView src, const Function &Cmp) +{ + if (blockIdx.x >= cuda_tasks.getSize()) + return; + + int start = blockIdx.x == 0 ? 0 : cuda_reductionTaskInitMem[blockIdx.x - 1]; + int end = cuda_reductionTaskInitMem[blockIdx.x]; + for (int i = start + threadIdx.x; i < end; i += blockDim.x) + cuda_blockToTaskMapping[i] = blockIdx.x; + + if (threadIdx.x == 0) + { + TASK &task = cuda_tasks[blockIdx.x]; + int pivotIdx = task.partitionBegin + pickPivotIdx(src.getView(task.partitionBegin, task.partitionEnd), Cmp); + task.initTask(start, end - start, pivotIdx); + } +} + +//---------------------------------------------------- + +template +__global__ void cudaQuickSort1stPhase(ArrayView arr, ArrayView aux, + const Function &Cmp, int elemPerBlock, + ArrayView tasks, + ArrayView taskMapping) +{ + extern __shared__ int externMem[]; + Value *piv = (Value *)externMem; + Value *sharedMem = piv + 1; + + TASK &myTask = tasks[taskMapping[blockIdx.x]]; + auto &src = (myTask.depth & 1) == 0 ? arr : aux; + auto &dst = (myTask.depth & 1) == 0 ? 
aux : arr; + + if (threadIdx.x == 0) + *piv = src[myTask.pivotIdx]; + __syncthreads(); + Value &pivot = *piv; + + cudaPartition( + src.getView(myTask.partitionBegin, myTask.partitionEnd), + dst.getView(myTask.partitionBegin, myTask.partitionEnd), + Cmp, sharedMem, pivot, + elemPerBlock, myTask); +} + +//---------------------------------------------------- + +template +__global__ void cudaWritePivot(ArrayView arr, ArrayView aux, int maxElemFor2ndPhase, + ArrayView tasks, ArrayView newTasks, int *newTasksCnt, + ArrayView secondPhaseTasks, int *secondPhaseTasksCnt) +{ + extern __shared__ int externMem[]; + Value *piv = (Value *)externMem; + + TASK &myTask = tasks[blockIdx.x]; + + if (threadIdx.x == 0) + *piv = (myTask.depth & 1) == 0 ? arr[myTask.pivotIdx] : aux[myTask.pivotIdx]; + __syncthreads(); + Value &pivot = *piv; + + int leftBegin = myTask.partitionBegin, leftEnd = myTask.partitionBegin + myTask.dstBegin; + int rightBegin = myTask.partitionBegin + myTask.dstEnd, rightEnd = myTask.partitionEnd; + + for (int i = leftEnd + threadIdx.x; i < rightBegin; i += blockDim.x) + { + /* + #ifdef DEBUG + aux[i] = -1; + #endif + */ + arr[i] = pivot; + } + + if (threadIdx.x != 0) + return; + + if (leftEnd - leftBegin > 0) + { + writeNewTask(leftBegin, leftEnd, myTask.depth, + maxElemFor2ndPhase, + newTasks, newTasksCnt, + secondPhaseTasks, secondPhaseTasksCnt); + } + + if (rightEnd - rightBegin > 0) + { + writeNewTask(rightBegin, rightEnd, + myTask.depth, maxElemFor2ndPhase, + newTasks, newTasksCnt, + secondPhaseTasks, secondPhaseTasksCnt); + } +} + +//----------------------------------------------------------- + +__device__ void writeNewTask(int begin, int end, int depth, int maxElemFor2ndPhase, + ArrayView newTasks, int *newTasksCnt, + ArrayView secondPhaseTasks, int *secondPhaseTasksCnt) +{ + int size = end - begin; + if (size < 0) + { + printf("negative size, something went really wrong\n"); + return; + } + + if (size == 0) + return; + + if (size <= maxElemFor2ndPhase) + 
{ + int idx = atomicAdd(secondPhaseTasksCnt, 1); + if (idx < secondPhaseTasks.getSize()) + secondPhaseTasks[idx] = TASK(begin, end, depth + 1); + else + { + //printf("ran out of memory, trying backup\n"); + int idx = atomicAdd(newTasksCnt, 1); + if (idx < newTasks.getSize()) + newTasks[idx] = TASK(begin, end, depth + 1); + else + printf("ran out of memory for second phase task, there isnt even space in newTask list\nPart of array may stay unsorted!!!\n"); + } + } + else + { + int idx = atomicAdd(newTasksCnt, 1); + if (idx < newTasks.getSize()) + newTasks[idx] = TASK(begin, end, depth + 1); + else + { + //printf("ran out of memory, trying backup\n"); + int idx = atomicAdd(secondPhaseTasksCnt, 1); + if (idx < secondPhaseTasks.getSize()) + secondPhaseTasks[idx] = TASK(begin, end, depth + 1); + else + printf("ran out of memory for newtask, there isnt even space in second phase task list\nPart of array may stay unsorted!!!\n"); + } + } +} + +//----------------------------------------------------------- + +template +__global__ void cudaQuickSort2ndPhase(ArrayView arr, ArrayView aux, + const Function &Cmp, + ArrayView secondPhaseTasks, + int elemInShared) +{ + extern __shared__ int externMem[]; + Value *sharedMem = (Value *)externMem; + + TASK &myTask = secondPhaseTasks[blockIdx.x]; + if (myTask.partitionEnd - myTask.partitionBegin <= 0) + { + //printf("empty task???\n"); + return; + } + + auto arrView = arr.getView(myTask.partitionBegin, myTask.partitionEnd); + auto auxView = aux.getView(myTask.partitionBegin, myTask.partitionEnd); + + if (elemInShared == 0) + { + singleBlockQuickSort(arrView, auxView, Cmp, myTask.depth, sharedMem, 0); + } + else + { + singleBlockQuickSort(arrView, auxView, Cmp, myTask.depth, sharedMem, elemInShared); + } +} + +template +__global__ void cudaQuickSort2ndPhase(ArrayView arr, ArrayView aux, + const Function &Cmp, + ArrayView secondPhaseTasks1, + ArrayView secondPhaseTasks2, + int elemInShared) +{ + extern __shared__ int externMem[]; + Value 
*sharedMem = (Value *)externMem; + + TASK myTask; + if (blockIdx.x < secondPhaseTasks1.getSize()) + myTask = secondPhaseTasks1[blockIdx.x]; + else + myTask = secondPhaseTasks2[blockIdx.x - secondPhaseTasks1.getSize()]; + + if (myTask.partitionEnd - myTask.partitionBegin <= 0) + { + //printf("empty task???\n"); + return; + } + + auto arrView = arr.getView(myTask.partitionBegin, myTask.partitionEnd); + auto auxView = aux.getView(myTask.partitionBegin, myTask.partitionEnd); + + if (elemInShared == 0) + { + singleBlockQuickSort(arrView, auxView, Cmp, myTask.depth, sharedMem, 0); + } + else + { + singleBlockQuickSort(arrView, auxView, Cmp, myTask.depth, sharedMem, elemInShared); + } +} + +//----------------------------------------------------------- \ No newline at end of file -- GitLab From bfa8bf6141d9d2f0ecdd0e4ac0242048f5cdb118 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Fri, 9 Apr 2021 01:38:53 +0200 Subject: [PATCH 197/258] task printing and getting size --- GPUSort/src/quicksort/task.h | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/GPUSort/src/quicksort/task.h b/GPUSort/src/quicksort/task.h index 8a0b687ca..ecbc8522c 100644 --- a/GPUSort/src/quicksort/task.h +++ b/GPUSort/src/quicksort/task.h @@ -29,5 +29,20 @@ struct TASK this->pivotIdx = pivotIdx; } + __cuda_callable__ + int getSize() const + { + return end - begin; + } + TASK() = default; -}; \ No newline at end of file +}; + +std::ostream& operator<<(std::ostream & out, const TASK & task) +{ + out << "[ "; + out << task.partitionBegin << " - " << task.partitionEnd; + out << " | " << "depth: " << task.depth; + out << " | " << "pivotIdx: " << task.pivotIdx; + return out << " ] "; +} \ No newline at end of file -- GitLab From fdda43b213744e43acac3a7ad30ba74630b31844 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Fri, 9 Apr 2021 01:39:07 +0200 Subject: [PATCH 198/258] debug option --- GPUSort/src/quicksort/quicksort.cuh | 23 +++++++++++++++++++++++ 1 file 
changed, 23 insertions(+) diff --git a/GPUSort/src/quicksort/quicksort.cuh b/GPUSort/src/quicksort/quicksort.cuh index 62ae9a8f6..3f6bee5fc 100644 --- a/GPUSort/src/quicksort/quicksort.cuh +++ b/GPUSort/src/quicksort/quicksort.cuh @@ -10,6 +10,11 @@ #include #define deb(x) std::cout << #x << " = " << x << std::endl; +#ifdef CHECK_RESULT_SORT +#include "../util/algorithm.h" +#include +#endif + using namespace TNL; using namespace TNL::Containers; @@ -132,6 +137,24 @@ void QUICKSORT::sort(const Function &Cmp) cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; + +#ifdef CHECK_RESULT_SORT +if(!is_sorted(arr)) +{ + std::ofstream out("error.txt"); + out << arr << std::endl; + out << aux << std::endl; + out << cuda_tasks << std::endl; + out << cuda_newTasks << std::endl; + out << cuda_2ndPhaseTasks << std::endl; + + out << cuda_newTasksAmount << std::endl; + out << cuda_2ndPhaseTasksAmount << std::endl; + + out << iteration << std::endl; +} +#endif + return; } -- GitLab From cb6fbce7401fe08176a95b5f32d22ea272647ee3 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Fri, 9 Apr 2021 01:51:49 +0200 Subject: [PATCH 199/258] fix size --- GPUSort/src/quicksort/task.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/GPUSort/src/quicksort/task.h b/GPUSort/src/quicksort/task.h index ecbc8522c..49a514c31 100644 --- a/GPUSort/src/quicksort/task.h +++ b/GPUSort/src/quicksort/task.h @@ -32,7 +32,7 @@ struct TASK __cuda_callable__ int getSize() const { - return end - begin; + return partitionEnd - partitionBegin; } TASK() = default; -- GitLab From 432f983c0c90db43ab9929ed4eb87f8cc390337d Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Fri, 9 Apr 2021 02:01:07 +0200 Subject: [PATCH 200/258] refactor out generator from benchmark --- GPUSort/benchmark/benchmarker.cpp | 118 +++--------------------- GPUSort/benchmark/generators.cpp | 148 ++++++++++++++++++++++++++++++ 2 files changed, 160 insertions(+), 106 deletions(-) create mode 100644 
GPUSort/benchmark/generators.cpp diff --git a/GPUSort/benchmark/benchmarker.cpp b/GPUSort/benchmark/benchmarker.cpp index 0e3dac93c..4dea4a94f 100644 --- a/GPUSort/benchmark/benchmarker.cpp +++ b/GPUSort/benchmark/benchmarker.cpp @@ -8,6 +8,7 @@ using namespace std; #include "../src/util/timer.h" +#include "generators.cpp" //--------------------------- /** @@ -78,143 +79,48 @@ double measure(const vector&vec) #endif double sorted(int size) -{ - vector vec(size); - iota(vec.begin(), vec.end(), 0); - - return measure(vec); +{ + return measure(generateSorted(size)); } double random(int size) { - srand(size + 2021); - - vector vec(size); - generate(vec.begin(), vec.end(), [=](){return std::rand() % (2*size);}); - - return measure(vec); + return measure(generateRandom(size)); } double shuffle(int size) { - srand(size); - - vector vec(size); - iota(vec.begin(), vec.end(), 0); - random_shuffle(vec.begin(), vec.end()); - - return measure(vec); + return measure(generateShuffle(size)); } double almostSorted(int size) { - vector vec(size); - iota(vec.begin(), vec.end(), 0); - for(int i = 0; i < 3; i++) //swaps 3 times in array - { - int s = rand() % (size - 3); - std::swap(vec[s], vec[s + 1]); - } - - return measure(vec); + return measure(generateAlmostSorted(size)); } double decreasing(int size) { - vector vec(size); - for(int i = 0; i < size; i++) - vec[i] = size - i; - - return measure(vec); + return measure(generateDecreasing(size)); } double zero_entropy(int size) -{ - vector vec(size); - for(auto & x : vec) - x = size; - - return measure(vec); +{ + return measure(generateZero_entropy(size)); } double gaussian(int size) { - srand(size + 2000); - - vector vec(size); - for (int i = 0; i < size; ++i) - { - int value = 0; - for (int j = 0; j < 4; ++j) - value += rand()%16384; - - vec[i] = value /4; - } - return measure(vec); + return measure(generateZero_entropy(size)); } double bucket(int size) { - srand (size + 94215); - vector vec(size); - - double tmp = 
((double)size)*3000000; //(RAND_MAX)/p; --> ((double)N)*30000; - double tmp2 = sqrt(tmp); - - int p= (size+tmp2-1)/tmp2; - - const int VALUE = 8192/p; //(RAND_MAX)/p; - - int i=0; int x=0; - //the array of size N is split into 'p' buckets - while(i < p) - { - for (int z = 0; z < p; ++z) - for (int j = 0; j < size/(p*p); ++j) - { - //every bucket has N/(p*p) items and the range is [min : VALUE-1 ] - int min = VALUE*z; - - vec[x]= min + ( rand() % (VALUE-1) ) ; - x++; - } - i++; - } - - return measure(vec); + return measure(generateBucket(size)); } double staggared(int size) { - srand (size + 815618); - vector vec(size); - - int tmp=4096; //(RAND_MAX)/p; --> size=2048 - int p= (size+tmp-1)/tmp; - - const int VALUE = (1<<30)/p; //(RAND_MAX)/p; - - int i=1; int x=0; - //the array of size N is split into 'p' buckets - while(i <= p) - { - //every bucket has N/(p) items - for (int j = 0; j < size/(p); ++j) - { - int min; - - if(i<=(p/2)) - min = (2*i -1)*VALUE; - - else - min = (2*i-p-1)*VALUE; - - vec[x++]= min + ( rand() % (VALUE - 1) ); - } - i++; - } - - return measure(vec); + return measure(generateStaggered(size)); } void start(ostream & out, string delim) diff --git a/GPUSort/benchmark/generators.cpp b/GPUSort/benchmark/generators.cpp new file mode 100644 index 000000000..e117a5abe --- /dev/null +++ b/GPUSort/benchmark/generators.cpp @@ -0,0 +1,148 @@ +#pragma once +#include +#include +#include +#include +using namespace std; + +vector generateSorted(int size) +{ + vector vec(size); + + iota(vec.begin(), vec.end(), 0); + + return vec; +} + +vector generateRandom(int size) +{ + vector vec(size); + + srand(size + 2021); + generate(vec.begin(), vec.end(), [=](){return std::rand() % (2*size);}); + + return vec; +} + +vector generateShuffle(int size) +{ + vector vec(size); + + iota(vec.begin(), vec.end(), 0); + srand(size); + random_shuffle(vec.begin(), vec.end()); + + return vec; +} + +vector generateAlmostSorted(int size) +{ + vector vec(size); + + iota(vec.begin(), 
vec.end(), 0); + srand(9451); + for(int i = 0; i < 3; i++) //swaps 3 times in array + { + int s = rand() % (size - 3); + std::swap(vec[s], vec[s + 1]); + } + + return vec; +} + +vector generateDecreasing(int size) +{ + vector vec(size); + + for(int i = 0; i < size; i++) + vec[i] = size - i; + + return vec; +} + +vector generateZero_entropy(int size) +{ + vector vec(size, 515); + return vec; +} + +vector generateGaussian(int size) +{ + vector vec(size); + srand(size + 2000); + + for (int i = 0; i < size; ++i) + { + int value = 0; + for (int j = 0; j < 4; ++j) + value += rand()%16384; + + vec[i] = value /4; + } + + return vec; +} + +vector generateBucket(int size) +{ + vector vec(size); + + srand (size + 94215); + double tmp = ((double)size)*3000000; //(RAND_MAX)/p; --> ((double)N)*30000; + double tmp2 = sqrt(tmp); + + int p= (size+tmp2-1)/tmp2; + + const int VALUE = 8192/p; //(RAND_MAX)/p; + + int i=0; int x=0; + //the array of size N is split into 'p' buckets + while(i < p) + { + for (int z = 0; z < p; ++z) + for (int j = 0; j < size/(p*p); ++j) + { + //every bucket has N/(p*p) items and the range is [min : VALUE-1 ] + int min = VALUE*z; + + vec[x]= min + ( rand() % (VALUE-1) ) ; + x++; + } + i++; + } + + return vec; +} + +vector generateStaggered(int size) +{ + vector vec(size); + + srand (size + 815618); + int tmp=4096; //(RAND_MAX)/p; --> size=2048 + int p= (size+tmp-1)/tmp; + + const int VALUE = (1<<30)/p; //(RAND_MAX)/p; + + int i=1; int x=0; + //the array of size N is split into 'p' buckets + while(i <= p) + { + //every bucket has N/(p) items + for (int j = 0; j < size/(p); ++j) + { + int min; + + if(i<=(p/2)) + min = (2*i -1)*VALUE; + + else + min = (2*i-p-1)*VALUE; + + vec[x++]= min + ( rand() % (VALUE - 1) ); + } + i++; + } + + return vec; +} \ No newline at end of file -- GitLab From 14292184d8ed75230325a16daaeb62634c24e4d1 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Fri, 9 Apr 2021 02:12:32 +0200 Subject: [PATCH 201/258] sort 3D points --- 
.../tests/quicksort_unitTests/unitTests.cu | 26 +++++++++++-------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/GPUSort/tests/quicksort_unitTests/unitTests.cu b/GPUSort/tests/quicksort_unitTests/unitTests.cu index dc3518e49..85b22674b 100644 --- a/GPUSort/tests/quicksort_unitTests/unitTests.cu +++ b/GPUSort/tests/quicksort_unitTests/unitTests.cu @@ -141,26 +141,30 @@ TEST(types, type_double) ASSERT_TRUE(view == cudaArr2.getView()); } -struct TMPSTRUCT{ - uint8_t m_data[16]; - - __cuda_callable__ TMPSTRUCT(){m_data[0] = 0;} - __cuda_callable__ TMPSTRUCT(int first){m_data[0] = first;}; - __cuda_callable__ bool operator <(const TMPSTRUCT& other) const { return m_data[0] < other.m_data[0];} +struct TMPSTRUCT_xyz{ + double x, y, z; + __cuda_callable__ TMPSTRUCT_xyz(): x(0){} + __cuda_callable__ TMPSTRUCT_xyz(int first){x = first;}; + __cuda_callable__ bool operator <(const TMPSTRUCT_xyz& other) const { return x< other.x;} + __cuda_callable__ TMPSTRUCT_xyz& operator =(const TMPSTRUCT_xyz& other) {x = other.x; return *this;} }; +std::ostream & operator<<(std::ostream & out, const TMPSTRUCT_xyz & data){return out << data.x;} TEST(types, struct) { - std::srand(8451); + std::srand(46151); - int size = (1<<13); - std::vector arr(size); - for(auto & x : arr) x = TMPSTRUCT(std::rand()); + int size = (1<<18); + std::vector arr(size); + for(auto & x : arr) x = TMPSTRUCT_xyz(std::rand()); - TNL::Containers::Array cudaArr(arr); + TNL::Containers::Array cudaArr(arr); auto view = cudaArr.getView(); + //thrust::sort(thrust::device, cudaArr.getData(), cudaArr.getData() + cudaArr.getSize()); + //std::cout << view << std::endl; quicksort(view); + ASSERT_TRUE(is_sorted(view)); } //---------------------------------------------------------------------------------- -- GitLab From 8690b88797c2fd705e75ae082c0c063400a5a637 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Fri, 9 Apr 2021 02:37:36 +0200 Subject: [PATCH 202/258] fix sort checking --- 
GPUSort/src/util/algorithm.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/GPUSort/src/util/algorithm.h b/GPUSort/src/util/algorithm.h index 0715c5f42..ac3cac57c 100644 --- a/GPUSort/src/util/algorithm.h +++ b/GPUSort/src/util/algorithm.h @@ -5,9 +5,10 @@ template bool is_sorted(TNL::Containers::ArrayView arr, const Function &Cmp) { - if(arr.getSize() <= 1) return true; + if (arr.getSize() <= 1) + return true; - auto fetch = [=] __cuda_callable__(int i) { return Cmp(arr[i - 1], arr[i]); }; + auto fetch = [=] __cuda_callable__(int i) { return !Cmp(arr[i], arr[i - 1]); }; auto reduction = [] __cuda_callable__(bool a, bool b) { return a && b; }; return TNL::Algorithms::Reduction::reduce(1, arr.getSize(), fetch, reduction, true); } @@ -15,5 +16,5 @@ bool is_sorted(TNL::Containers::ArrayView arr, const template bool is_sorted(TNL::Containers::ArrayView arr) { - return is_sorted(arr, [] __cuda_callable__(const Value &a, const Value &b) { return a <= b; }); + return is_sorted(arr, [] __cuda_callable__(const Value &a, const Value &b) { return a < b; }); } -- GitLab From 7693fd1da8f4394f002b15771f5ba1c3d5636a66 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Fri, 9 Apr 2021 03:32:12 +0200 Subject: [PATCH 203/258] empty 2nd phase when filled with enough tasks --- GPUSort/src/quicksort/quicksort.cuh | 30 ++++++++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/GPUSort/src/quicksort/quicksort.cuh b/GPUSort/src/quicksort/quicksort.cuh index 3f6bee5fc..1ffdd9127 100644 --- a/GPUSort/src/quicksort/quicksort.cuh +++ b/GPUSort/src/quicksort/quicksort.cuh @@ -166,13 +166,35 @@ void QUICKSORT::firstPhase(const Function &Cmp) { while (host_1stPhaseTasksAmount > 0) { - //2ndphase task is now full or host_1stPhaseTasksAmount is full, as backup during writing, overflowing tasks were written into the other array - if (host_1stPhaseTasksAmount >= maxTasks || host_2ndPhaseTasksAmount >= maxTasks) + if 
(host_1stPhaseTasksAmount >= maxTasks) break; + if(host_2ndPhaseTasksAmount >= maxTasks) //2nd phase occupies enoughs tasks to warrant premature 2nd phase sort + { + int tmp = host_1stPhaseTasksAmount; + host_1stPhaseTasksAmount = 0; + secondPhase(Cmp); + cuda_2ndPhaseTasksAmount = host_2ndPhaseTasksAmount = 0; + host_1stPhaseTasksAmount = tmp; + } + //just in case newly created tasks wouldnt fit + //bite the bullet and sort with single blocks if (host_1stPhaseTasksAmount * 2 >= maxTasks + (maxTasks - host_2ndPhaseTasksAmount)) - break; + { + if(host_2ndPhaseTasksAmount >= 0.75*maxTasks) //2nd phase occupies enoughs tasks to warrant premature 2nd phase sort + { + int tmp = host_1stPhaseTasksAmount; + host_1stPhaseTasksAmount = 0; + secondPhase(Cmp); + cuda_2ndPhaseTasksAmount = host_2ndPhaseTasksAmount = 0; + host_1stPhaseTasksAmount = tmp; + } + else + break; + } + + //--------------------------------------------------------------- int elemPerBlock = getElemPerBlock(); @@ -297,6 +319,8 @@ int QUICKSORT::getSetsNeeded(int elemPerBlock) const template int QUICKSORT::getElemPerBlock() const { + return desiredElemPerBlock; + int setsNeeded = getSetsNeeded(desiredElemPerBlock); if (setsNeeded <= maxBlocks) -- GitLab From 3eb29381de79fd9a3295754e95365d88a171d0ce Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Fri, 9 Apr 2021 16:41:30 +0200 Subject: [PATCH 204/258] use ptx to calc closest pow 2 --- GPUSort/src/bitonicSort/bitonicSort.h | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/GPUSort/src/bitonicSort/bitonicSort.h b/GPUSort/src/bitonicSort/bitonicSort.h index 24f3628a5..8bec9f472 100644 --- a/GPUSort/src/bitonicSort/bitonicSort.h +++ b/GPUSort/src/bitonicSort/bitonicSort.h @@ -3,6 +3,19 @@ //--------------------------------------------- +// Inline PTX call to return index of highest non-zero bit in a word +static __device__ __forceinline__ unsigned int __btflo(unsigned int word) +{ + unsigned int ret; + asm 
volatile("bfind.u32 %0, %1;" : "=r"(ret) : "r"(word)); + return ret; +} + +__device__ int closestPow2_ptx(int len) +{ + return 1 << (__btflo((unsigned)len-1U)+1); +} + __host__ __device__ int closestPow2(int x) { if (x == 0) @@ -198,7 +211,7 @@ __device__ void bitonicSort_Block(TNL::Containers::ArrayView Date: Fri, 9 Apr 2021 16:46:52 +0200 Subject: [PATCH 205/258] fix gaussian test --- GPUSort/benchmark/benchmarker.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/GPUSort/benchmark/benchmarker.cpp b/GPUSort/benchmark/benchmarker.cpp index 4dea4a94f..162df150e 100644 --- a/GPUSort/benchmark/benchmarker.cpp +++ b/GPUSort/benchmark/benchmarker.cpp @@ -110,7 +110,7 @@ double zero_entropy(int size) double gaussian(int size) { - return measure(generateZero_entropy(size)); + return measure(generateGaussian(size)); } double bucket(int size) -- GitLab From faf647ce0bb9db0175e5488aae3cc50ad285d3be Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Fri, 9 Apr 2021 16:55:22 +0200 Subject: [PATCH 206/258] refactor out permutation 8 --- GPUSort/tests/bitonic_tests/unitTests.cu | 28 ++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/GPUSort/tests/bitonic_tests/unitTests.cu b/GPUSort/tests/bitonic_tests/unitTests.cu index 53fe8b3b5..3e0ce07b7 100644 --- a/GPUSort/tests/bitonic_tests/unitTests.cu +++ b/GPUSort/tests/bitonic_tests/unitTests.cu @@ -12,9 +12,9 @@ //---------------------------------------------------------------------------------- -TEST(permutations, allPermutationSize_1_to_8) +TEST(permutations, allPermutationSize_2_to_7) { - for(int i = 2; i<=8; i++ ) + for(int i = 2; i<=7; i++ ) { int size = i; std::vector orig(size); @@ -33,6 +33,30 @@ TEST(permutations, allPermutationSize_1_to_8) } } +TEST(permutations, allPermutationSize_8) +{ + int size = 9; + const int stride = 151; + int i = 0; + + std::vector orig(size); + std::iota(orig.begin(), orig.end(), 0); + + do + { + if ((i++) % stride != 0) + continue; + + 
TNL::Containers::Array cudaArr(orig); + auto view = cudaArr.getView(); + + bitonicSort(view); + + ASSERT_TRUE(is_sorted(view)) << "result " << view << std::endl; + } + while (std::next_permutation(orig.begin(), orig.end())); +} + TEST(permutations, somePermutationSize9) { int size = 9; -- GitLab From 8eedaddb44cc85d313436ed5e11e6e548c961689 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Fri, 9 Apr 2021 16:55:46 +0200 Subject: [PATCH 207/258] use for loop to copy for better readability --- GPUSort/src/bitonicSort/bitonicSort.h | 43 +++++++-------------------- 1 file changed, 10 insertions(+), 33 deletions(-) diff --git a/GPUSort/src/bitonicSort/bitonicSort.h b/GPUSort/src/bitonicSort/bitonicSort.h index 8bec9f472..ac1c67f95 100644 --- a/GPUSort/src/bitonicSort/bitonicSort.h +++ b/GPUSort/src/bitonicSort/bitonicSort.h @@ -88,16 +88,9 @@ __global__ void bitonicMergeSharedMemory(TNL::Containers::ArrayView Date: Fri, 9 Apr 2021 17:12:56 +0200 Subject: [PATCH 208/258] fix copy and add support for block bitonic sort of bigger size --- GPUSort/src/bitonicSort/bitonicSort.h | 64 +++++++++++++++------------ 1 file changed, 36 insertions(+), 28 deletions(-) diff --git a/GPUSort/src/bitonicSort/bitonicSort.h b/GPUSort/src/bitonicSort/bitonicSort.h index ac1c67f95..958e8e489 100644 --- a/GPUSort/src/bitonicSort/bitonicSort.h +++ b/GPUSort/src/bitonicSort/bitonicSort.h @@ -191,27 +191,32 @@ __device__ void bitonicSort_Block(TNL::Containers::ArrayView= src.getSize()) //special case for parts with no "partner" - ascending = true; - for (int len = monotonicSeqLen; len > 1; len /= 2) { - //calculates which 2 indexes will be compared and swap - int part = i / (len / 2); - int s = part * len + (i & ((len / 2) - 1)); - int e = s + len / 2; + for(int i = threadIdx.x; ; i+=blockDim.x) //simulates other blocks in case src.size > blockDim.x*2 + { + //calculates which 2 indexes will be compared and swap + int part = i / (len / 2); + int s = part * len + (i & ((len / 2) - 1)); + int 
e = s + len / 2; + + if(e >= src.getSize()) //touching virtual padding, the order dont swap + break; + + //calculate the direction of swapping + int monotonicSeqIdx = i / (monotonicSeqLen / 2); + bool ascending = (monotonicSeqIdx & 1) != 0; + if ((monotonicSeqIdx + 1) * monotonicSeqLen >= src.getSize()) //special case for parts with no "partner" + ascending = true; - if (e < src.getSize()) //not touching virtual padding cmpSwap(sharedMem[s], sharedMem[e], ascending, Cmp); - __syncthreads(); + } + + __syncthreads(); //only 1 synchronization needed } } } @@ -232,29 +237,32 @@ __device__ void bitonicSort_Block(TNL::Containers::ArrayView __device__ void bitonicSort_Block(TNL::Containers::ArrayView src, - TNL::Containers::ArrayView dst, const Function &Cmp) { - int i = threadIdx.x; int paddedSize = closestPow2_ptx(src.getSize()); for (int monotonicSeqLen = 2; monotonicSeqLen <= paddedSize; monotonicSeqLen *= 2) { - //calculate the direction of swapping - int monotonicSeqIdx = i / (monotonicSeqLen / 2); - bool ascending = (monotonicSeqIdx & 1) != 0; - if ((monotonicSeqIdx + 1) * monotonicSeqLen >= src.getSize()) //special case for parts with no "partner" - ascending = true; - for (int len = monotonicSeqLen; len > 1; len /= 2) { - //calculates which 2 indexes will be compared and swap - int part = i / (len / 2); - int s = part * len + (i & ((len / 2) - 1)); - int e = s + len / 2; + for(int i = threadIdx.x; ; i+=blockDim.x) //simulates other blocks in case src.size > blockDim.x*2 + { + //calculates which 2 indexes will be compared and swap + int part = i / (len / 2); + int s = part * len + (i & ((len / 2) - 1)); + int e = s + len / 2; + + if(e >= src.getSize()) + break; + + //calculate the direction of swapping + int monotonicSeqIdx = i / (monotonicSeqLen / 2); + bool ascending = (monotonicSeqIdx & 1) != 0; + if ((monotonicSeqIdx + 1) * monotonicSeqLen >= src.getSize()) //special case for parts with no "partner" + ascending = true; - if (e < src.getSize()) //not touching 
virtual padding cmpSwap(src[s], src[e], ascending, Cmp); + } __syncthreads(); } } @@ -292,9 +300,9 @@ __global__ void bitoniSort1stStep(TNL::Containers::ArrayView Date: Fri, 9 Apr 2021 17:16:43 +0200 Subject: [PATCH 209/258] comments on functions --- GPUSort/src/bitonicSort/bitonicSort.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/GPUSort/src/bitonicSort/bitonicSort.h b/GPUSort/src/bitonicSort/bitonicSort.h index 958e8e489..5e0905268 100644 --- a/GPUSort/src/bitonicSort/bitonicSort.h +++ b/GPUSort/src/bitonicSort/bitonicSort.h @@ -173,10 +173,10 @@ __global__ void bitonicMerge(TNL::Containers::ArrayView __device__ void bitonicSort_Block(TNL::Containers::ArrayView src, @@ -230,8 +230,9 @@ __device__ void bitonicSort_Block(TNL::Containers::ArrayView Date: Fri, 9 Apr 2021 17:24:16 +0200 Subject: [PATCH 210/258] fix copy after no shared bitonic sort --- GPUSort/src/quicksort/quicksort_1Block.cuh | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/GPUSort/src/quicksort/quicksort_1Block.cuh b/GPUSort/src/quicksort/quicksort_1Block.cuh index 9bbe49e8e..900d3b9ad 100644 --- a/GPUSort/src/quicksort/quicksort_1Block.cuh +++ b/GPUSort/src/quicksort/quicksort_1Block.cuh @@ -19,10 +19,9 @@ __device__ void externSort(ArrayView src, template __device__ void externSort(ArrayView src, - ArrayView dst, const Function &Cmp) { - bitonicSort_Block(src, dst, Cmp); + bitonicSort_Block(src, Cmp); } //--------------------------------------------------------------- @@ -48,7 +47,13 @@ __device__ void singleBlockQuickSort(ArrayView arr, if (useShared && arr.getSize() <= memSize) externSort(src, arr, Cmp, sharedMem); else - externSort(src, arr, Cmp); + { + externSort(src, Cmp); + //extern sort without shared memory only works in-place, need to copy into from aux + if ((_depth & 1) != 0) + for (int i = threadIdx.x; i < arr.getSize(); i += blockDim.x) + arr[i] = src[i]; + } return; } @@ -91,7 +96,13 @@ __device__ void 
singleBlockQuickSort(ArrayView arr, if (useShared && size <= memSize) externSort(src.getView(begin, end), arr.getView(begin, end), Cmp, sharedMem); else - externSort(src.getView(begin, end), arr.getView(begin, end), Cmp); + { + externSort(src.getView(begin, end), Cmp); + //extern sort without shared memory only works in-place, need to copy into from aux + if ((depth & 1) != 0) + for (int i = threadIdx.x; i < src.getSize(); i += blockDim.x) + arr[begin + i] = src[i]; + } __syncthreads(); continue; } -- GitLab From 0f8c950af66d85babc9e3dba3a7898dbe1a5b65c Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Fri, 9 Apr 2021 17:27:54 +0200 Subject: [PATCH 211/258] sort 64b data --- .../tests/quicksort_unitTests/unitTests.cu | 28 +++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/GPUSort/tests/quicksort_unitTests/unitTests.cu b/GPUSort/tests/quicksort_unitTests/unitTests.cu index 85b22674b..abcdfc3c0 100644 --- a/GPUSort/tests/quicksort_unitTests/unitTests.cu +++ b/GPUSort/tests/quicksort_unitTests/unitTests.cu @@ -150,8 +150,7 @@ struct TMPSTRUCT_xyz{ }; std::ostream & operator<<(std::ostream & out, const TMPSTRUCT_xyz & data){return out << data.x;} - -TEST(types, struct) +TEST(types, struct_3D_points) { std::srand(46151); @@ -167,6 +166,31 @@ TEST(types, struct) ASSERT_TRUE(is_sorted(view)); } +struct TMPSTRUCT_64b{ + uint8_t m_Data[64]; + __cuda_callable__ TMPSTRUCT_64b() {m_Data[0] = 0;} + __cuda_callable__ TMPSTRUCT_64b(int first){m_Data[0] = first;}; + __cuda_callable__ bool operator <(const TMPSTRUCT_64b& other) const { return m_Data[0]< other.m_Data[0];} + __cuda_callable__ TMPSTRUCT_64b& operator =(const TMPSTRUCT_64b& other) {m_Data[0] = other.m_Data[0]; return *this;} +}; +std::ostream & operator<<(std::ostream & out, const TMPSTRUCT_64b & data){return out << (unsigned) data.m_Data[0];} + +TEST(types, struct_64b) +{ + std::srand(96); + + int size = (1<<18); + std::vector arr(size); + for(auto & x : arr) x = 
TMPSTRUCT_64b(std::rand() % 512); + + TNL::Containers::Array cudaArr(arr); + auto view = cudaArr.getView(); + //thrust::sort(thrust::device, cudaArr.getData(), cudaArr.getData() + cudaArr.getSize()); + //std::cout << view << std::endl; + quicksort(view); + ASSERT_TRUE(is_sorted(view)); +} + //---------------------------------------------------------------------------------- int main(int argc, char **argv) -- GitLab From e08acbb0e02a3cd388d748d640d128dd78c9f357 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Fri, 9 Apr 2021 17:45:49 +0200 Subject: [PATCH 212/258] change bitonic threshold size --- GPUSort/src/quicksort/quicksort.cuh | 8 ++++---- GPUSort/src/quicksort/quicksort_1Block.cuh | 7 ++++--- GPUSort/src/quicksort/quicksort_kernel.cuh | 16 ++++++++++------ 3 files changed, 18 insertions(+), 13 deletions(-) diff --git a/GPUSort/src/quicksort/quicksort.cuh b/GPUSort/src/quicksort/quicksort.cuh index 1ffdd9127..170f08738 100644 --- a/GPUSort/src/quicksort/quicksort.cuh +++ b/GPUSort/src/quicksort/quicksort.cuh @@ -30,7 +30,7 @@ class QUICKSORT //-------------------------------------- - const int maxBitonicSize = threadsPerBlock * 2; + const int maxBitonicSize = threadsPerBlock * 8; const int desired_2ndPhasElemPerBlock = maxBitonicSize; const int g_maxTasks = 1 << 14; int maxTasks; @@ -284,20 +284,20 @@ void QUICKSORT::secondPhase(const Function &Cmp) auto tasks2 = cuda_2ndPhaseTasks.getView(0, host_2ndPhaseTasksAmount); cudaQuickSort2ndPhase - <<>>(arr, aux, Cmp, tasks, tasks2, elemInShared); + <<>>(arr, aux, Cmp, tasks, tasks2, elemInShared, maxBitonicSize); } else if (host_1stPhaseTasksAmount > 0) { auto tasks = leftoverTasks.getView(0, host_1stPhaseTasksAmount); cudaQuickSort2ndPhase - <<>>(arr, aux, Cmp, tasks, elemInShared); + <<>>(arr, aux, Cmp, tasks, elemInShared, maxBitonicSize); } else { auto tasks2 = cuda_2ndPhaseTasks.getView(0, host_2ndPhaseTasksAmount); cudaQuickSort2ndPhase - <<>>(arr, aux, Cmp, tasks2, elemInShared); + <<>>(arr, aux, Cmp, 
tasks2, elemInShared, maxBitonicSize); } } diff --git a/GPUSort/src/quicksort/quicksort_1Block.cuh b/GPUSort/src/quicksort/quicksort_1Block.cuh index 900d3b9ad..05621d697 100644 --- a/GPUSort/src/quicksort/quicksort_1Block.cuh +++ b/GPUSort/src/quicksort/quicksort_1Block.cuh @@ -39,9 +39,10 @@ template __device__ void singleBlockQuickSort(ArrayView arr, ArrayView aux, const Function &Cmp, int _depth, - Value *sharedMem, int memSize) + Value *sharedMem, int memSize, + int maxBitonicSize) { - if (arr.getSize() <= blockDim.x * 2) + if (arr.getSize() <= maxBitonicSize) { auto &src = (_depth & 1) == 0 ? arr : aux; if (useShared && arr.getSize() <= memSize) @@ -91,7 +92,7 @@ __device__ void singleBlockQuickSort(ArrayView arr, auto &src = (depth & 1) == 0 ? arr : aux; //small enough for for bitonic - if (size <= blockDim.x * 2) + if (size <= maxBitonicSize) { if (useShared && size <= memSize) externSort(src.getView(begin, end), arr.getView(begin, end), Cmp, sharedMem); diff --git a/GPUSort/src/quicksort/quicksort_kernel.cuh b/GPUSort/src/quicksort/quicksort_kernel.cuh index b9d0dd75b..1d31cca46 100644 --- a/GPUSort/src/quicksort/quicksort_kernel.cuh +++ b/GPUSort/src/quicksort/quicksort_kernel.cuh @@ -185,7 +185,7 @@ template __global__ void cudaQuickSort2ndPhase(ArrayView arr, ArrayView aux, const Function &Cmp, ArrayView secondPhaseTasks, - int elemInShared) + int elemInShared, int maxBitonicSize) { extern __shared__ int externMem[]; Value *sharedMem = (Value *)externMem; @@ -202,11 +202,13 @@ __global__ void cudaQuickSort2ndPhase(ArrayView arr, Array if (elemInShared == 0) { - singleBlockQuickSort(arrView, auxView, Cmp, myTask.depth, sharedMem, 0); + singleBlockQuickSort + (arrView, auxView, Cmp, myTask.depth, sharedMem, 0, maxBitonicSize); } else { - singleBlockQuickSort(arrView, auxView, Cmp, myTask.depth, sharedMem, elemInShared); + singleBlockQuickSort + (arrView, auxView, Cmp, myTask.depth, sharedMem, elemInShared, maxBitonicSize); } } @@ -215,7 +217,7 @@ 
__global__ void cudaQuickSort2ndPhase(ArrayView arr, Array const Function &Cmp, ArrayView secondPhaseTasks1, ArrayView secondPhaseTasks2, - int elemInShared) + int elemInShared, int maxBitonicSize) { extern __shared__ int externMem[]; Value *sharedMem = (Value *)externMem; @@ -237,11 +239,13 @@ __global__ void cudaQuickSort2ndPhase(ArrayView arr, Array if (elemInShared == 0) { - singleBlockQuickSort(arrView, auxView, Cmp, myTask.depth, sharedMem, 0); + singleBlockQuickSort + (arrView, auxView, Cmp, myTask.depth, sharedMem, 0, maxBitonicSize); } else { - singleBlockQuickSort(arrView, auxView, Cmp, myTask.depth, sharedMem, elemInShared); + singleBlockQuickSort + (arrView, auxView, Cmp, myTask.depth, sharedMem, elemInShared, maxBitonicSize); } } -- GitLab From 5111e24f24a65995170d601b32178de3df7d1be2 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Sat, 10 Apr 2021 22:16:52 +0200 Subject: [PATCH 213/258] refactor out measuring part --- GPUSort/benchmark/benchmarker.cpp | 162 ++++-------------- .../benchmark/bitonic_benchmark/benchmark.cu | 12 +- GPUSort/benchmark/measure.cpp | 31 ++++ GPUSort/benchmark/measure.cu | 39 +++++ GPUSort/benchmark/measure.h | 6 + .../quicksort_benchmark/benchmark.cu | 12 +- .../quicksort_dynamic_benchmark/benchmark.cu | 12 +- 7 files changed, 135 insertions(+), 139 deletions(-) create mode 100644 GPUSort/benchmark/measure.cpp create mode 100644 GPUSort/benchmark/measure.cu create mode 100644 GPUSort/benchmark/measure.h diff --git a/GPUSort/benchmark/benchmarker.cpp b/GPUSort/benchmark/benchmarker.cpp index 162df150e..f31ce9fa0 100644 --- a/GPUSort/benchmark/benchmarker.cpp +++ b/GPUSort/benchmark/benchmarker.cpp @@ -7,26 +7,8 @@ #include using namespace std; -#include "../src/util/timer.h" #include "generators.cpp" - -//--------------------------- -/** - * important! 
to make use of this benchmarker, it is needed to define SORTERFUNCTION - * then include this file - * */ -//--------------------------- - -#ifdef HAVE_CUDA - -#include -#include "../src/util/algorithm.h" -using namespace TNL; -using namespace TNL::Containers; - -#endif - -static int notCorrectCounters = 0; +#include "measure.h" #ifndef LOW_POW #define LOW_POW 10 @@ -40,88 +22,7 @@ static int notCorrectCounters = 0; #define TRIES 20 #endif -double measure(const vector&vec); - -#ifndef MY_OWN_MEASURE -double measure(const vector&vec) -{ - vector resAcc; - - - for(int i = 0; i < TRIES; i++) - { - #ifdef HAVE_CUDA - Array arr(vec); - auto view = arr.getView(); - - { - TIMER t([&](double res){resAcc.push_back(res);}); - SORTERFUNCTION(view); - } - - if(!is_sorted(view)) - notCorrectCounters++; - #else - vector tmp = vec; - - { - TIMER t([&](double res){resAcc.push_back(res);}); - SORTERFUNCTION(tmp); - } - - if(!std::is_sorted(tmp.begin(), tmp.end())) - notCorrectCounters++; - #endif - } - - return accumulate(resAcc.begin(), resAcc.end(), 0.0) / resAcc.size(); -} -#endif - -double sorted(int size) -{ - return measure(generateSorted(size)); -} - -double random(int size) -{ - return measure(generateRandom(size)); -} - -double shuffle(int size) -{ - return measure(generateShuffle(size)); -} - -double almostSorted(int size) -{ - return measure(generateAlmostSorted(size)); -} - -double decreasing(int size) -{ - return measure(generateDecreasing(size)); -} - -double zero_entropy(int size) -{ - return measure(generateZero_entropy(size)); -} - -double gaussian(int size) -{ - return measure(generateGaussian(size)); -} - -double bucket(int size) -{ - return measure(generateBucket(size)); -} - -double staggared(int size) -{ - return measure(generateStaggered(size)); -} +//------------------------------------------------------------ void start(ostream & out, string delim) { @@ -137,6 +38,8 @@ void start(ostream & out, string delim) out << "zero_entropy"; out << endl; + int 
wrongAnsCnt = 0; + for(int pow = LOW_POW; pow <= HIGH_POW; pow++) { int size =(1<< pow); @@ -145,33 +48,36 @@ void start(ostream & out, string delim) out << "2^" << pow << delim; out << fixed << setprecision(3); - out << random(size) << delim; - out.flush(); - - out << shuffle(size) << delim; - out.flush(); - - out << sorted(size) << delim; - out.flush(); - - out << almostSorted(size) << delim; - out.flush(); - - out << decreasing(size) << delim; - out.flush(); - - out << gaussian(size) << delim; - out.flush(); - - out << bucket(size) << delim; - out.flush(); - - out << staggared(size) << delim; - out.flush(); - - out << zero_entropy(size); + out << measure(generateRandom(size), TRIES, wrongAnsCnt); + out << delim; + + out << measure(generateShuffle(size), TRIES, wrongAnsCnt); + out << delim; + + out << measure(generateSorted(size), TRIES, wrongAnsCnt); + out << delim; + + out << measure(generateAlmostSorted(size), TRIES, wrongAnsCnt); + out << delim; + + out << measure(generateDecreasing(size), TRIES, wrongAnsCnt); + out << delim; + + out << measure(generateGaussian(size), TRIES, wrongAnsCnt) ; + out << delim; + + out << measure(generateBucket(size), TRIES, wrongAnsCnt); + out << delim; + + out << measure(generateStaggered(size), TRIES, wrongAnsCnt); + out << delim; + + out << measure(generateZero_entropy(size), TRIES, wrongAnsCnt); out << endl; } + + if(wrongAnsCnt > 0) + std::cerr << wrongAnsCnt << "tries were sorted incorrectly" << std::endl; } int main(int argc, char *argv[]) @@ -185,9 +91,5 @@ int main(int argc, char *argv[]) std::ofstream out(argv[1]); start(out, ","); } - if(notCorrectCounters > 0) - { - std::cerr << notCorrectCounters << "tries were sorted incorrectly" << std::endl; - } return 0; } \ No newline at end of file diff --git a/GPUSort/benchmark/bitonic_benchmark/benchmark.cu b/GPUSort/benchmark/bitonic_benchmark/benchmark.cu index 26e241be2..88c612fe8 100644 --- a/GPUSort/benchmark/bitonic_benchmark/benchmark.cu +++ 
b/GPUSort/benchmark/bitonic_benchmark/benchmark.cu @@ -1,4 +1,10 @@ #include "../../src/bitonicSort/bitonicSort.h" -#define SORTERFUNCTION bitonicSort -//--------------------------- -#include "../benchmarker.cpp" \ No newline at end of file + +#include "../benchmarker.cpp" +#include "../measure.cu" + +template +void sorter(ArrayView arr) +{ + bitonicSort(arr); +} \ No newline at end of file diff --git a/GPUSort/benchmark/measure.cpp b/GPUSort/benchmark/measure.cpp new file mode 100644 index 000000000..efc91af70 --- /dev/null +++ b/GPUSort/benchmark/measure.cpp @@ -0,0 +1,31 @@ +#pragma once + +#include "measure.h" +#include "../src/util/timer.h" + +//-------------------------------------------------------- + +template +void sorter(std::vector&vec); + +//-------------------------------------------------------- + +template +double measure(const std::vector&vec, int tries, int & wrongAnsCnt) +{ + vector resAcc; + + for(int i = 0; i < tries; i++) + { + vector tmp = vec; + { + TIMER t([&](double res){resAcc.push_back(res);}); + sorter(tmp); + } + + if(!std::is_sorted(tmp.begin(), tmp.end())) + wrongAnsCnt++; + } + + return accumulate(resAcc.begin(), resAcc.end(), 0.0) / resAcc.size(); +} \ No newline at end of file diff --git a/GPUSort/benchmark/measure.cu b/GPUSort/benchmark/measure.cu new file mode 100644 index 000000000..50033aadc --- /dev/null +++ b/GPUSort/benchmark/measure.cu @@ -0,0 +1,39 @@ +#pragma once + +#include + +#include "measure.h" +#include "../src/util/timer.h" + +#include +#include "../src/util/algorithm.h" +using namespace TNL; +using namespace TNL::Containers; + +//-------------------------------------------------------- + +template +void sorter(ArrayView arr); + +//-------------------------------------------------------- + +template +double measure(const std::vector&vec, int tries, int & wrongAnsCnt) +{ + vector resAcc; + + for(int i = 0; i < tries; i++) + { + Array arr(vec); + auto view = arr.getView(); + { + TIMER t([&](double 
res){resAcc.push_back(res);}); + sorter(view); + } + + if(!is_sorted(view)) + wrongAnsCnt++; + } + + return accumulate(resAcc.begin(), resAcc.end(), 0.0) / resAcc.size(); +} \ No newline at end of file diff --git a/GPUSort/benchmark/measure.h b/GPUSort/benchmark/measure.h new file mode 100644 index 000000000..3ebd633a1 --- /dev/null +++ b/GPUSort/benchmark/measure.h @@ -0,0 +1,6 @@ +#pragma once + +#include + +template +double measure(const std::vector&vec, int tries, int & wrongAnsCnt); \ No newline at end of file diff --git a/GPUSort/benchmark/quicksort_benchmark/benchmark.cu b/GPUSort/benchmark/quicksort_benchmark/benchmark.cu index 0a486ce3a..d361a1ee2 100644 --- a/GPUSort/benchmark/quicksort_benchmark/benchmark.cu +++ b/GPUSort/benchmark/quicksort_benchmark/benchmark.cu @@ -1,4 +1,10 @@ #include "../../src/quicksort/quicksort.cuh" -#define SORTERFUNCTION quicksort -//--------------------------- -#include "../benchmarker.cpp" \ No newline at end of file + +#include "../benchmarker.cpp" +#include "../measure.cu" + +template +void sorter(ArrayView arr) +{ + quicksort(arr); +} \ No newline at end of file diff --git a/GPUSort/benchmark/quicksort_dynamic_benchmark/benchmark.cu b/GPUSort/benchmark/quicksort_dynamic_benchmark/benchmark.cu index 64b28f453..5bbf2d5af 100644 --- a/GPUSort/benchmark/quicksort_dynamic_benchmark/benchmark.cu +++ b/GPUSort/benchmark/quicksort_dynamic_benchmark/benchmark.cu @@ -1,4 +1,10 @@ #include "../../src/quicksort_dynamic/quicksort.cuh" -#define SORTERFUNCTION quicksort -//--------------------------- -#include "../benchmarker.cpp" \ No newline at end of file + +#include "../benchmarker.cpp" +#include "../measure.cu" + +template +void sorter(ArrayView arr) +{ + quicksort(arr); +} -- GitLab From afd1046ec6b55874a46ee26f6d92d370559f0744 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Sat, 10 Apr 2021 22:40:44 +0200 Subject: [PATCH 214/258] flush after print and remove not used libraries --- GPUSort/benchmark/benchmarker.cpp | 21 
+++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/GPUSort/benchmark/benchmarker.cpp b/GPUSort/benchmark/benchmarker.cpp index f31ce9fa0..55e2c618c 100644 --- a/GPUSort/benchmark/benchmarker.cpp +++ b/GPUSort/benchmark/benchmarker.cpp @@ -1,10 +1,7 @@ #include #include -#include #include #include -#include -#include using namespace std; #include "generators.cpp" @@ -45,32 +42,32 @@ void start(ostream & out, string delim) int size =(1<< pow); vector vec(size); - out << "2^" << pow << delim; + out << "2^" << pow << delim << flush; out << fixed << setprecision(3); out << measure(generateRandom(size), TRIES, wrongAnsCnt); - out << delim; + out << delim << flush; out << measure(generateShuffle(size), TRIES, wrongAnsCnt); - out << delim; + out << delim << flush; out << measure(generateSorted(size), TRIES, wrongAnsCnt); - out << delim; + out << delim << flush; out << measure(generateAlmostSorted(size), TRIES, wrongAnsCnt); - out << delim; + out << delim << flush; out << measure(generateDecreasing(size), TRIES, wrongAnsCnt); - out << delim; + out << delim << flush; out << measure(generateGaussian(size), TRIES, wrongAnsCnt) ; - out << delim; + out << delim << flush; out << measure(generateBucket(size), TRIES, wrongAnsCnt); - out << delim; + out << delim << flush; out << measure(generateStaggered(size), TRIES, wrongAnsCnt); - out << delim; + out << delim << flush; out << measure(generateZero_entropy(size), TRIES, wrongAnsCnt); out << endl; -- GitLab From 8224d0da90a5c2227fb1425f4e01805eb9af7b47 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Sat, 10 Apr 2021 23:50:31 +0200 Subject: [PATCH 215/258] calculating max eleme per block --- GPUSort/src/quicksort/quicksort.cuh | 38 +++++++++++----------- GPUSort/src/quicksort/quicksort_kernel.cuh | 2 +- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/GPUSort/src/quicksort/quicksort.cuh b/GPUSort/src/quicksort/quicksort.cuh index 170f08738..e2cf2857a 100644 --- 
a/GPUSort/src/quicksort/quicksort.cuh +++ b/GPUSort/src/quicksort/quicksort.cuh @@ -30,8 +30,7 @@ class QUICKSORT //-------------------------------------- - const int maxBitonicSize = threadsPerBlock * 8; - const int desired_2ndPhasElemPerBlock = maxBitonicSize; + int desired_2ndPhasElemPerBlock; const int g_maxTasks = 1 << 14; int maxTasks; @@ -58,6 +57,7 @@ public: arr(arr.getView()), auxMem(arr.getSize()), aux(auxMem.getView()), + desired_2ndPhasElemPerBlock(desiredElemPerBlock), maxTasks(min(arr.getSize(), g_maxTasks)), cuda_tasks(maxTasks), cuda_newTasks(maxTasks), cuda_2ndPhaseTasks(maxTasks), @@ -139,20 +139,20 @@ void QUICKSORT::sort(const Function &Cmp) TNL_CHECK_CUDA_DEVICE; #ifdef CHECK_RESULT_SORT -if(!is_sorted(arr)) -{ - std::ofstream out("error.txt"); - out << arr << std::endl; - out << aux << std::endl; - out << cuda_tasks << std::endl; - out << cuda_newTasks << std::endl; - out << cuda_2ndPhaseTasks << std::endl; + if (!is_sorted(arr)) + { + std::ofstream out("error.txt"); + out << arr << std::endl; + out << aux << std::endl; + out << cuda_tasks << std::endl; + out << cuda_newTasks << std::endl; + out << cuda_2ndPhaseTasks << std::endl; - out << cuda_newTasksAmount << std::endl; - out << cuda_2ndPhaseTasksAmount << std::endl; + out << cuda_newTasksAmount << std::endl; + out << cuda_2ndPhaseTasksAmount << std::endl; - out << iteration << std::endl; -} + out << iteration << std::endl; + } #endif return; @@ -169,7 +169,7 @@ void QUICKSORT::firstPhase(const Function &Cmp) if (host_1stPhaseTasksAmount >= maxTasks) break; - if(host_2ndPhaseTasksAmount >= maxTasks) //2nd phase occupies enoughs tasks to warrant premature 2nd phase sort + if (host_2ndPhaseTasksAmount >= maxTasks) //2nd phase occupies enoughs tasks to warrant premature 2nd phase sort { int tmp = host_1stPhaseTasksAmount; host_1stPhaseTasksAmount = 0; @@ -182,7 +182,7 @@ void QUICKSORT::firstPhase(const Function &Cmp) //bite the bullet and sort with single blocks if (host_1stPhaseTasksAmount 
* 2 >= maxTasks + (maxTasks - host_2ndPhaseTasksAmount)) { - if(host_2ndPhaseTasksAmount >= 0.75*maxTasks) //2nd phase occupies enoughs tasks to warrant premature 2nd phase sort + if (host_2ndPhaseTasksAmount >= 0.75 * maxTasks) //2nd phase occupies enoughs tasks to warrant premature 2nd phase sort { int tmp = host_1stPhaseTasksAmount; host_1stPhaseTasksAmount = 0; @@ -284,20 +284,20 @@ void QUICKSORT::secondPhase(const Function &Cmp) auto tasks2 = cuda_2ndPhaseTasks.getView(0, host_2ndPhaseTasksAmount); cudaQuickSort2ndPhase - <<>>(arr, aux, Cmp, tasks, tasks2, elemInShared, maxBitonicSize); + <<>>(arr, aux, Cmp, tasks, tasks2, elemInShared, desired_2ndPhasElemPerBlock); } else if (host_1stPhaseTasksAmount > 0) { auto tasks = leftoverTasks.getView(0, host_1stPhaseTasksAmount); cudaQuickSort2ndPhase - <<>>(arr, aux, Cmp, tasks, elemInShared, maxBitonicSize); + <<>>(arr, aux, Cmp, tasks, elemInShared, desired_2ndPhasElemPerBlock); } else { auto tasks2 = cuda_2ndPhaseTasks.getView(0, host_2ndPhaseTasksAmount); cudaQuickSort2ndPhase - <<>>(arr, aux, Cmp, tasks2, elemInShared, maxBitonicSize); + <<>>(arr, aux, Cmp, tasks2, elemInShared, desired_2ndPhasElemPerBlock); } } diff --git a/GPUSort/src/quicksort/quicksort_kernel.cuh b/GPUSort/src/quicksort/quicksort_kernel.cuh index 1d31cca46..60b6dd7ef 100644 --- a/GPUSort/src/quicksort/quicksort_kernel.cuh +++ b/GPUSort/src/quicksort/quicksort_kernel.cuh @@ -237,7 +237,7 @@ __global__ void cudaQuickSort2ndPhase(ArrayView arr, Array auto arrView = arr.getView(myTask.partitionBegin, myTask.partitionEnd); auto auxView = aux.getView(myTask.partitionBegin, myTask.partitionEnd); - if (elemInShared == 0) + if (elemInShared <= 0) { singleBlockQuickSort (arrView, auxView, Cmp, myTask.depth, sharedMem, 0, maxBitonicSize); -- GitLab From 20a478a4ada3c397589af31234f2cc3a72fe8c5d Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Wed, 14 Apr 2021 01:41:30 +0200 Subject: [PATCH 216/258] change cmp template name and remove refernce to 
cmp in kernel --- GPUSort/src/bitonicSort/bitonicSort.h | 56 +++++++++++++-------------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/GPUSort/src/bitonicSort/bitonicSort.h b/GPUSort/src/bitonicSort/bitonicSort.h index 5e0905268..a133c50df 100644 --- a/GPUSort/src/bitonicSort/bitonicSort.h +++ b/GPUSort/src/bitonicSort/bitonicSort.h @@ -28,8 +28,8 @@ __host__ __device__ int closestPow2(int x) return ret; } -template -__host__ __device__ void cmpSwap(Value &a, Value &b, bool ascending, const Function &Cmp) +template +__host__ __device__ void cmpSwap(Value &a, Value &b, bool ascending, const CMP &Cmp) { if (ascending == Cmp(b, a)) TNL::swap(a, b); @@ -41,9 +41,9 @@ __host__ __device__ void cmpSwap(Value &a, Value &b, bool ascending, const Funct * this kernel simulates 1 exchange * splits input arr that is bitonic into 2 bitonic sequences */ -template +template __global__ void bitonicMergeGlobal(TNL::Containers::ArrayView arr, - const Function &Cmp, + CMP Cmp, int monotonicSeqLen, int len, int partsInSeq) { int i = blockIdx.x * blockDim.x + threadIdx.x; @@ -73,9 +73,9 @@ __global__ void bitonicMergeGlobal(TNL::Containers::ArrayView +template __global__ void bitonicMergeSharedMemory(TNL::Containers::ArrayView arr, - const Function &Cmp, + CMP Cmp, int monotonicSeqLen, int len, int partsInSeq) { extern __shared__ int externMem[]; @@ -133,9 +133,9 @@ __global__ void bitonicMergeSharedMemory(TNL::Containers::ArrayView +template __global__ void bitonicMerge(TNL::Containers::ArrayView arr, - const Function &Cmp, + CMP Cmp, int monotonicSeqLen, int len, int partsInSeq) { //1st index and last index of subarray that this threadBlock should merge @@ -178,10 +178,10 @@ __global__ void bitonicMerge(TNL::Containers::ArrayView +template __device__ void bitonicSort_Block(TNL::Containers::ArrayView src, TNL::Containers::ArrayView dst, - Value *sharedMem, const Function &Cmp) + Value *sharedMem, const CMP &Cmp) { //copy from globalMem into sharedMem for(int i = 
threadIdx.x; i < src.getSize(); i += blockDim.x) @@ -236,9 +236,9 @@ __device__ void bitonicSort_Block(TNL::Containers::ArrayView +template __device__ void bitonicSort_Block(TNL::Containers::ArrayView src, - const Function &Cmp) + const CMP &Cmp) { int paddedSize = closestPow2_ptx(src.getSize()); @@ -274,8 +274,8 @@ __device__ void bitonicSort_Block(TNL::Containers::ArrayView -__global__ void bitoniSort1stStepSharedMemory(TNL::Containers::ArrayView arr, const Function &Cmp) +template +__global__ void bitoniSort1stStepSharedMemory(TNL::Containers::ArrayView arr, CMP Cmp) { extern __shared__ int externMem[]; int sharedMemLen = 2 * blockDim.x; @@ -294,8 +294,8 @@ __global__ void bitoniSort1stStepSharedMemory(TNL::Containers::ArrayView -__global__ void bitoniSort1stStep(TNL::Containers::ArrayView arr, const Function &Cmp) +template +__global__ void bitoniSort1stStep(TNL::Containers::ArrayView arr, CMP Cmp) { int myBlockStart = blockIdx.x * (2 * blockDim.x); int myBlockEnd = TNL::min(arr.getSize(), myBlockStart + (2 * blockDim.x)); @@ -308,8 +308,8 @@ __global__ void bitoniSort1stStep(TNL::Containers::ArrayView -void bitonicSort(TNL::Containers::ArrayView src, int begin, int end, const Function &Cmp) +template +void bitonicSort(TNL::Containers::ArrayView src, int begin, int end, const CMP &Cmp) { TNL::Containers::ArrayView arr = src.getView(begin, end); int paddedSize = closestPow2(arr.getSize()); @@ -365,14 +365,14 @@ void bitonicSort(TNL::Containers::ArrayView src, int //--------------------------------------------- -template +template void bitonicSort(TNL::Containers::ArrayView arr, int begin, int end) { bitonicSort(arr, begin, end, [] __cuda_callable__(const Value &a, const Value &b) { return a < b; }); } -template -void bitonicSort(TNL::Containers::ArrayView arr, const Function &Cmp) +template +void bitonicSort(TNL::Containers::ArrayView arr, const CMP &Cmp) { bitonicSort(arr, 0, arr.getSize(), Cmp); } @@ -384,8 +384,8 @@ void bitonicSort(TNL::Containers::ArrayView 
arr) } //--------------------------------------------- -template -void bitonicSort(std::vector &vec, int begin, int end, const Function &Cmp) +template +void bitonicSort(std::vector &vec, int begin, int end, const CMP &Cmp) { TNL::Containers::Array Arr(vec); auto view = Arr.getView(); @@ -401,8 +401,8 @@ void bitonicSort(std::vector &vec, int begin, int end) bitonicSort(vec, begin, end, [] __cuda_callable__(const Value &a, const Value &b) { return a < b; }); } -template -void bitonicSort(std::vector &vec, const Function &Cmp) +template +void bitonicSort(std::vector &vec, const CMP &Cmp) { bitonicSort(vec, 0, vec.size(), Cmp); } @@ -417,8 +417,7 @@ void bitonicSort(std::vector &vec) //--------------------------------------------- template -__global__ void bitonicMergeGlobal(int size, FETCH Fetch, - const CMP &Cmp, SWAP Swap, +__global__ void bitonicMergeGlobal(int size, FETCH Fetch, CMP Cmp, SWAP Swap, int monotonicSeqLen, int len, int partsInSeq) { int i = blockIdx.x * blockDim.x + threadIdx.x; @@ -468,7 +467,8 @@ void bitonicSort(int begin, int end, FETCH Fetch, const CMP &Cmp, SWAP Swap) for (int len = monotonicSeqLen, partsInSeq = 1; len > 1; len /= 2, partsInSeq *= 2) { bitonicMergeGlobal<<>>( - size, fetchWithOffset, Cmp, swapWithOffset, monotonicSeqLen, len, partsInSeq); + size, fetchWithOffset, Cmp, swapWithOffset, + monotonicSeqLen, len, partsInSeq); } } cudaDeviceSynchronize(); -- GitLab From fa28344c34e9d1140db5777753dd46dff94c18a4 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Wed, 14 Apr 2021 02:00:46 +0200 Subject: [PATCH 217/258] refactor out big size check --- GPUSort/src/bitonicSort/bitonicSort.h | 130 ++++++++++++++++---------- 1 file changed, 83 insertions(+), 47 deletions(-) diff --git a/GPUSort/src/bitonicSort/bitonicSort.h b/GPUSort/src/bitonicSort/bitonicSort.h index a133c50df..e3afe783d 100644 --- a/GPUSort/src/bitonicSort/bitonicSort.h +++ b/GPUSort/src/bitonicSort/bitonicSort.h @@ -7,13 +7,15 @@ static __device__ __forceinline__ 
unsigned int __btflo(unsigned int word) { unsigned int ret; - asm volatile("bfind.u32 %0, %1;" : "=r"(ret) : "r"(word)); + asm volatile("bfind.u32 %0, %1;" + : "=r"(ret) + : "r"(word)); return ret; } __device__ int closestPow2_ptx(int len) { - return 1 << (__btflo((unsigned)len-1U)+1); + return 1 << (__btflo((unsigned)len - 1U) + 1); } __host__ __device__ int closestPow2(int x) @@ -88,7 +90,7 @@ __global__ void bitonicMergeSharedMemory(TNL::Containers::ArrayView 1; len /= 2) { - for(int i = threadIdx.x; ; i+=blockDim.x) //simulates other blocks in case src.size > blockDim.x*2 + for (int i = threadIdx.x;; i += blockDim.x) //simulates other blocks in case src.size > blockDim.x*2 { //calculates which 2 indexes will be compared and swap int part = i / (len / 2); int s = part * len + (i & ((len / 2) - 1)); int e = s + len / 2; - if(e >= src.getSize()) //touching virtual padding, the order dont swap + if (e >= src.getSize()) //touching virtual padding, the order dont swap break; //calculate the direction of swapping @@ -215,7 +217,7 @@ __device__ void bitonicSort_Block(TNL::Containers::ArrayView 1; len /= 2) { - for(int i = threadIdx.x; ; i+=blockDim.x) //simulates other blocks in case src.size > blockDim.x*2 + for (int i = threadIdx.x;; i += blockDim.x) //simulates other blocks in case src.size > blockDim.x*2 { //calculates which 2 indexes will be compared and swap int part = i / (len / 2); int s = part * len + (i & ((len / 2) - 1)); int e = s + len / 2; - if(e >= src.getSize()) + if (e >= src.getSize()) break; //calculate the direction of swapping @@ -307,55 +308,33 @@ __global__ void bitoniSort1stStep(TNL::Containers::ArrayView -void bitonicSort(TNL::Containers::ArrayView src, int begin, int end, const CMP &Cmp) +void bitonicSortWithShared(TNL::Containers::ArrayView view, const CMP &Cmp, + int gridDim, int blockDim, int sharedMemLen, int sharedMemSize) { - TNL::Containers::ArrayView arr = src.getView(begin, end); - int paddedSize = closestPow2(arr.getSize()); - - int 
threadsNeeded = arr.getSize() / 2 + (arr.getSize() % 2 != 0); - - const int maxThreadsPerBlock = 512; - int threadPerBlock = maxThreadsPerBlock; - int blocks = threadsNeeded / threadPerBlock + (threadsNeeded % threadPerBlock != 0); - - int sharedMemLen = threadPerBlock * 2; - int sharedMemSize = sharedMemLen * sizeof(Value); - - //--------------------------------------------------------------------------------- + int paddedSize = closestPow2(view.getSize()); - cudaDeviceProp deviceProp; - cudaGetDeviceProperties(&deviceProp, 0); - - //--------------------------------------------------------------------------------- - - if (sharedMemSize <= deviceProp.sharedMemPerBlock) - bitoniSort1stStepSharedMemory<<>>(arr, Cmp); - else - bitoniSort1stStep<<>>(arr, Cmp); + bitoniSort1stStepSharedMemory<<>>(view, Cmp); + //now alternating monotonic sequences with lenght of sharedMemLen + // \/ has length of 2 * sharedMemLen for (int monotonicSeqLen = 2 * sharedMemLen; monotonicSeqLen <= paddedSize; monotonicSeqLen *= 2) { for (int len = monotonicSeqLen, partsInSeq = 1; len > 1; len /= 2, partsInSeq *= 2) { if (len > sharedMemLen) { - bitonicMergeGlobal<<>>( - arr, Cmp, monotonicSeqLen, len, partsInSeq); + bitonicMergeGlobal<<>>( + view, Cmp, monotonicSeqLen, len, partsInSeq); } else { - if (sharedMemSize <= deviceProp.sharedMemPerBlock) - { - bitonicMergeSharedMemory<<>>( - arr, Cmp, monotonicSeqLen, len, partsInSeq); - } - else - { - bitonicMerge<<>>( - arr, Cmp, monotonicSeqLen, len, partsInSeq); - } + bitonicMergeSharedMemory<<>>( + view, Cmp, monotonicSeqLen, len, partsInSeq); + + //simulates sorts until len == 2 already, no need to continue this loop break; } } @@ -365,6 +344,63 @@ void bitonicSort(TNL::Containers::ArrayView src, int //--------------------------------------------- +template +void bitonicSort(TNL::Containers::ArrayView view, + const CMP &Cmp, + int gridDim, int blockDim) + +{ + int paddedSize = closestPow2(view.getSize()); + + for (int monotonicSeqLen = 2; 
monotonicSeqLen <= paddedSize; monotonicSeqLen *= 2) + { + for (int len = monotonicSeqLen, partsInSeq = 1; len > 1; len /= 2, partsInSeq *= 2) + { + bitonicMergeGlobal<<>>(view, Cmp, monotonicSeqLen, len, partsInSeq); + } + } + cudaDeviceSynchronize(); +} + +//--------------------------------------------- +template +void bitonicSort(TNL::Containers::ArrayView src, int begin, int end, const CMP &Cmp) +{ + auto view = src.getView(begin, end); + + int threadsNeeded = view.getSize() / 2 + (view.getSize() % 2 != 0); + + cudaDeviceProp deviceProp; + cudaGetDeviceProperties(&deviceProp, 0); + + const int maxThreadsPerBlock = 512; + + int sharedMemLen = maxThreadsPerBlock * 2; + int sharedMemSize = sharedMemLen * sizeof(Value); + + if (sharedMemSize <= deviceProp.sharedMemPerBlock) + { + int blockDim = maxThreadsPerBlock; + int gridDim = threadsNeeded / blockDim + (threadsNeeded % blockDim != 0); + bitonicSortWithShared(view, Cmp, gridDim, blockDim, sharedMemLen, sharedMemSize); + } + else if (sharedMemSize / 2 <= deviceProp.sharedMemPerBlock) + { + int blockDim = maxThreadsPerBlock / 2; //256 + int gridDim = threadsNeeded / blockDim + (threadsNeeded % blockDim != 0); + sharedMemSize /= 2; + sharedMemLen /= 2; + bitonicSortWithShared(view, Cmp, gridDim, blockDim, sharedMemLen, sharedMemSize); + } + else + { + int gridDim = threadsNeeded / maxThreadsPerBlock + (threadsNeeded % maxThreadsPerBlock != 0); + bitonicSort(view, Cmp, gridDim, maxThreadsPerBlock); + } +} + +//--------------------------------------------- + template void bitonicSort(TNL::Containers::ArrayView arr, int begin, int end) { -- GitLab From 442a1fc176068bc598d9e702ee0d38b4e02a814a Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Wed, 14 Apr 2021 02:13:03 +0200 Subject: [PATCH 218/258] bitonic test for big size structs --- GPUSort/tests/bitonic_tests/unitTests.cu | 52 ++++++++++++++++++++++-- 1 file changed, 48 insertions(+), 4 deletions(-) diff --git a/GPUSort/tests/bitonic_tests/unitTests.cu 
b/GPUSort/tests/bitonic_tests/unitTests.cu index 3e0ce07b7..75cd5a0af 100644 --- a/GPUSort/tests/bitonic_tests/unitTests.cu +++ b/GPUSort/tests/bitonic_tests/unitTests.cu @@ -156,10 +156,11 @@ TEST(nonIntegerType, double_notPow2) struct TMPSTRUCT{ uint8_t m_data[6]; - TMPSTRUCT(){m_data[0] = 0;} - TMPSTRUCT(int first){m_data[0] = first;}; - bool operator <(const TMPSTRUCT& other) const { return m_data[0] < other.m_data[0];} - bool operator <=(const TMPSTRUCT& other) const { return m_data[0] <= other.m_data[0];} + __cuda_callable__ TMPSTRUCT(){m_data[0] = 0;} + __cuda_callable__ TMPSTRUCT(int first){m_data[0] = first;}; + __cuda_callable__ bool operator <(const TMPSTRUCT& other) const { return m_data[0] < other.m_data[0];} + __cuda_callable__ TMPSTRUCT& operator =(const TMPSTRUCT& other) {m_data[0] = other.m_data[0]; return *this;} + }; TEST(nonIntegerType, struct) @@ -170,6 +171,49 @@ TEST(nonIntegerType, struct) ASSERT_TRUE(is_sorted(view)); } +struct TMPSTRUCT_64b{ + uint8_t m_data[64]; + __cuda_callable__ TMPSTRUCT_64b(){m_data[0] = 0;} + __cuda_callable__ TMPSTRUCT_64b(int first){m_data[0] = first;}; + __cuda_callable__ bool operator <(const TMPSTRUCT_64b& other) const { return m_data[0] < other.m_data[0];} + __cuda_callable__ TMPSTRUCT_64b& operator =(const TMPSTRUCT_64b& other) {m_data[0] = other.m_data[0]; return *this;} +}; + +TEST(nonIntegerType, struct_64b) +{ + std::srand(61513); + int size = std::rand() % (1<<15); + std::vector vec(size); + for(auto & x : vec) + x = TMPSTRUCT_64b(std::rand()); + + TNL::Containers::Array cudaArr(vec); + auto view = cudaArr.getView(); + bitonicSort(view); + ASSERT_TRUE(is_sorted(view)); +} + +struct TMPSTRUCT_128b{ + uint8_t m_data[128]; + __cuda_callable__ TMPSTRUCT_128b(){m_data[0] = 0;} + __cuda_callable__ TMPSTRUCT_128b(int first){m_data[0] = first;}; + __cuda_callable__ bool operator <(const TMPSTRUCT_128b& other) const { return m_data[0] < other.m_data[0];} + __cuda_callable__ TMPSTRUCT_128b& operator =(const 
TMPSTRUCT_128b& other) {m_data[0] = other.m_data[0]; return *this;} +}; + +TEST(nonIntegerType, struct_128b) +{ + std::srand(98451); + int size = std::rand() % (1<<14); + std::vector vec(size); + for(auto & x : vec) + x = TMPSTRUCT_128b(std::rand()); + + TNL::Containers::Array cudaArr(vec); + auto view = cudaArr.getView(); + bitonicSort(view); + ASSERT_TRUE(is_sorted(view)); +} //error bypassing //https://mmg-gitlab.fjfi.cvut.cz/gitlab/tnl/tnl-dev/blob/fbc34f6a97c13ec865ef7969b9704533222ed408/src/UnitTests/Containers/VectorTest-8.h -- GitLab From c82a99aa450f9c72323a1a6b231929e474556bcf Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Wed, 14 Apr 2021 02:17:12 +0200 Subject: [PATCH 219/258] rename template FUNCTION to CMP --- GPUSort/src/quicksort/cudaPartition.cuh | 24 +++++------ GPUSort/src/quicksort/quicksort.cuh | 46 +++++++++++----------- GPUSort/src/quicksort/quicksort_1Block.cuh | 20 +++++----- GPUSort/src/quicksort/quicksort_kernel.cuh | 26 ++++++------ 4 files changed, 58 insertions(+), 58 deletions(-) diff --git a/GPUSort/src/quicksort/cudaPartition.cuh b/GPUSort/src/quicksort/cudaPartition.cuh index e6b9ad0b8..d220dfcdb 100644 --- a/GPUSort/src/quicksort/cudaPartition.cuh +++ b/GPUSort/src/quicksort/cudaPartition.cuh @@ -7,8 +7,8 @@ using namespace TNL; using namespace TNL::Containers; -template -__device__ Value pickPivot(TNL::Containers::ArrayView src, const Function &Cmp) +template +__device__ Value pickPivot(TNL::Containers::ArrayView src, const CMP &Cmp) { //return src[0]; //return src[src.getSize()-1]; @@ -38,8 +38,8 @@ __device__ Value pickPivot(TNL::Containers::ArrayView src, const } } -template -__device__ int pickPivotIdx(TNL::Containers::ArrayView src, const Function &Cmp) +template +__device__ int pickPivotIdx(TNL::Containers::ArrayView src, const CMP &Cmp) { //return 0; //return src.getSize()-1; @@ -71,9 +71,9 @@ __device__ int pickPivotIdx(TNL::Containers::ArrayView src, const 
//----------------------------------------------------------- -template +template __device__ void countElem(ArrayView arr, - const Function &Cmp, + const CMP &Cmp, int &smaller, int &bigger, const Value &pivot) { @@ -89,10 +89,10 @@ __device__ void countElem(ArrayView arr, //----------------------------------------------------------- -template +template __device__ void copyDataShared(ArrayView src, ArrayView dst, - const Function &Cmp, + const CMP &Cmp, Value *sharedMem, int smallerStart, int biggerStart, int smallerTotal, int biggerTotal, @@ -119,10 +119,10 @@ __device__ void copyDataShared(ArrayView src, } } -template +template __device__ void copyData(ArrayView src, ArrayView dst, - const Function &Cmp, + const CMP &Cmp, int smallerStart, int biggerStart, const Value &pivot) { @@ -150,10 +150,10 @@ __device__ void copyData(ArrayView src, //---------------------------------------------------------------------------------- -template +template __device__ void cudaPartition(ArrayView src, ArrayView dst, - const Function &Cmp, + const CMP &Cmp, Value *sharedMem, const Value &pivot, int elemPerBlock, TASK &task) diff --git a/GPUSort/src/quicksort/quicksort.cuh b/GPUSort/src/quicksort/quicksort.cuh index e2cf2857a..5b09db8c7 100644 --- a/GPUSort/src/quicksort/quicksort.cuh +++ b/GPUSort/src/quicksort/quicksort.cuh @@ -82,8 +82,8 @@ public: } //-------------------------------------------------------------------------------------- - template - void sort(const Function &Cmp); + template + void sort(const CMP &Cmp); //-------------------------------------------------------------------------------------- @@ -100,15 +100,15 @@ public: /** * returns the amount of blocks needed to start phase 1 while also initializing all tasks * */ - template - int initTasks(int elemPerBlock, const Function &Cmp); + template + int initTasks(int elemPerBlock, const CMP &Cmp); /** * does the 1st phase of quicksort until out of task memory or each task is small enough * for correctness, 
secondphase method needs to be called to sort each subsequences * */ - template - void firstPhase(const Function &Cmp); + template + void firstPhase(const CMP &Cmp); /** * update necessary variables after 1 phase1 sort @@ -118,16 +118,16 @@ public: /** * sorts all leftover tasks * */ - template - void secondPhase(const Function &Cmp); + template + void secondPhase(const CMP &Cmp); }; //--------------------------------------------------------------------------------------------- //--------------------------------------------------------------------------------------------- template -template -void QUICKSORT::sort(const Function &Cmp) +template +void QUICKSORT::sort(const CMP &Cmp) { firstPhase(Cmp); @@ -161,8 +161,8 @@ void QUICKSORT::sort(const Function &Cmp) //--------------------------------------------------------------------------------------------- template -template -void QUICKSORT::firstPhase(const Function &Cmp) +template +void QUICKSORT::firstPhase(const CMP &Cmp) { while (host_1stPhaseTasksAmount > 0) { @@ -223,14 +223,14 @@ void QUICKSORT::firstPhase(const Function &Cmp) * */ if (externMemByteSize <= maxSharable) { - cudaQuickSort1stPhase + cudaQuickSort1stPhase <<>>( arr, aux, Cmp, elemPerBlock, task, cuda_blockToTaskMapping); } else { - cudaQuickSort1stPhase + cudaQuickSort1stPhase <<>>( arr, aux, Cmp, elemPerBlock, task, cuda_blockToTaskMapping); @@ -263,8 +263,8 @@ void QUICKSORT::firstPhase(const Function &Cmp) //---------------------------------------------------------------------- template -template -void QUICKSORT::secondPhase(const Function &Cmp) +template +void QUICKSORT::secondPhase(const CMP &Cmp) { int total2ndPhase = host_1stPhaseTasksAmount + host_2ndPhaseTasksAmount; const int stackSize = 32; @@ -283,20 +283,20 @@ void QUICKSORT::secondPhase(const Function &Cmp) auto tasks = leftoverTasks.getView(0, host_1stPhaseTasksAmount); auto tasks2 = cuda_2ndPhaseTasks.getView(0, host_2ndPhaseTasksAmount); - cudaQuickSort2ndPhase + 
cudaQuickSort2ndPhase <<>>(arr, aux, Cmp, tasks, tasks2, elemInShared, desired_2ndPhasElemPerBlock); } else if (host_1stPhaseTasksAmount > 0) { auto tasks = leftoverTasks.getView(0, host_1stPhaseTasksAmount); - cudaQuickSort2ndPhase + cudaQuickSort2ndPhase <<>>(arr, aux, Cmp, tasks, elemInShared, desired_2ndPhasElemPerBlock); } else { auto tasks2 = cuda_2ndPhaseTasks.getView(0, host_2ndPhaseTasksAmount); - cudaQuickSort2ndPhase + cudaQuickSort2ndPhase <<>>(arr, aux, Cmp, tasks2, elemInShared, desired_2ndPhasElemPerBlock); } } @@ -336,8 +336,8 @@ int QUICKSORT::getElemPerBlock() const } template -template -int QUICKSORT::initTasks(int elemPerBlock, const Function &Cmp) +template +int QUICKSORT::initTasks(int elemPerBlock, const CMP &Cmp) { auto &src = iteration % 2 == 0 ? arr : aux; @@ -385,8 +385,8 @@ void QUICKSORT::processNewTasks() //----------------------------------------------------------- //----------------------------------------------------------- -template -void quicksort(ArrayView arr, const Function &Cmp) +template +void quicksort(ArrayView arr, const CMP &Cmp) { cudaDeviceProp deviceProp; cudaGetDeviceProperties(&deviceProp, 0); diff --git a/GPUSort/src/quicksort/quicksort_1Block.cuh b/GPUSort/src/quicksort/quicksort_1Block.cuh index 05621d697..de88eef01 100644 --- a/GPUSort/src/quicksort/quicksort_1Block.cuh +++ b/GPUSort/src/quicksort/quicksort_1Block.cuh @@ -9,17 +9,17 @@ using namespace TNL; using namespace TNL::Containers; -template +template __device__ void externSort(ArrayView src, ArrayView dst, - const Function &Cmp, Value *sharedMem) + const CMP &Cmp, Value *sharedMem) { bitonicSort_Block(src, dst, sharedMem, Cmp); } -template +template __device__ void externSort(ArrayView src, - const Function &Cmp) + const CMP &Cmp) { bitonicSort_Block(src, Cmp); } @@ -35,10 +35,10 @@ __device__ void stackPush(int stackArrBegin[], int stackArrEnd[], //--------------------------------------------------------------- -template +template __device__ void 
singleBlockQuickSort(ArrayView arr, ArrayView aux, - const Function &Cmp, int _depth, + const CMP &Cmp, int _depth, Value *sharedMem, int memSize, int maxBitonicSize) { @@ -46,10 +46,10 @@ __device__ void singleBlockQuickSort(ArrayView arr, { auto &src = (_depth & 1) == 0 ? arr : aux; if (useShared && arr.getSize() <= memSize) - externSort(src, arr, Cmp, sharedMem); + externSort(src, arr, Cmp, sharedMem); else { - externSort(src, Cmp); + externSort(src, Cmp); //extern sort without shared memory only works in-place, need to copy into from aux if ((_depth & 1) != 0) for (int i = threadIdx.x; i < arr.getSize(); i += blockDim.x) @@ -95,10 +95,10 @@ __device__ void singleBlockQuickSort(ArrayView arr, if (size <= maxBitonicSize) { if (useShared && size <= memSize) - externSort(src.getView(begin, end), arr.getView(begin, end), Cmp, sharedMem); + externSort(src.getView(begin, end), arr.getView(begin, end), Cmp, sharedMem); else { - externSort(src.getView(begin, end), Cmp); + externSort(src.getView(begin, end), Cmp); //extern sort without shared memory only works in-place, need to copy into from aux if ((depth & 1) != 0) for (int i = threadIdx.x; i < src.getSize(); i += blockDim.x) diff --git a/GPUSort/src/quicksort/quicksort_kernel.cuh b/GPUSort/src/quicksort/quicksort_kernel.cuh index 60b6dd7ef..877e7a406 100644 --- a/GPUSort/src/quicksort/quicksort_kernel.cuh +++ b/GPUSort/src/quicksort/quicksort_kernel.cuh @@ -31,11 +31,11 @@ __global__ void cudaCalcBlocksNeeded(ArrayView cuda_tasks, //----------------------------------------------------------- -template +template __global__ void cudaInitTask(ArrayView cuda_tasks, ArrayView cuda_blockToTaskMapping, ArrayView cuda_reductionTaskInitMem, - ArrayView src, const Function &Cmp) + ArrayView src, CMP Cmp) { if (blockIdx.x >= cuda_tasks.getSize()) return; @@ -55,9 +55,9 @@ __global__ void cudaInitTask(ArrayView cuda_tasks, //---------------------------------------------------- -template +template __global__ void 
cudaQuickSort1stPhase(ArrayView arr, ArrayView aux, - const Function &Cmp, int elemPerBlock, + const CMP &Cmp, int elemPerBlock, ArrayView tasks, ArrayView taskMapping) { @@ -74,7 +74,7 @@ __global__ void cudaQuickSort1stPhase(ArrayView arr, Array __syncthreads(); Value &pivot = *piv; - cudaPartition( + cudaPartition( src.getView(myTask.partitionBegin, myTask.partitionEnd), dst.getView(myTask.partitionBegin, myTask.partitionEnd), Cmp, sharedMem, pivot, @@ -181,9 +181,9 @@ __device__ void writeNewTask(int begin, int end, int depth, int maxElemFor2ndPha //----------------------------------------------------------- -template +template __global__ void cudaQuickSort2ndPhase(ArrayView arr, ArrayView aux, - const Function &Cmp, + CMP Cmp, ArrayView secondPhaseTasks, int elemInShared, int maxBitonicSize) { @@ -202,19 +202,19 @@ __global__ void cudaQuickSort2ndPhase(ArrayView arr, Array if (elemInShared == 0) { - singleBlockQuickSort + singleBlockQuickSort (arrView, auxView, Cmp, myTask.depth, sharedMem, 0, maxBitonicSize); } else { - singleBlockQuickSort + singleBlockQuickSort (arrView, auxView, Cmp, myTask.depth, sharedMem, elemInShared, maxBitonicSize); } } -template +template __global__ void cudaQuickSort2ndPhase(ArrayView arr, ArrayView aux, - const Function &Cmp, + CMP Cmp, ArrayView secondPhaseTasks1, ArrayView secondPhaseTasks2, int elemInShared, int maxBitonicSize) @@ -239,12 +239,12 @@ __global__ void cudaQuickSort2ndPhase(ArrayView arr, Array if (elemInShared <= 0) { - singleBlockQuickSort + singleBlockQuickSort (arrView, auxView, Cmp, myTask.depth, sharedMem, 0, maxBitonicSize); } else { - singleBlockQuickSort + singleBlockQuickSort (arrView, auxView, Cmp, myTask.depth, sharedMem, elemInShared, maxBitonicSize); } } -- GitLab From 43b7de3cdccf933d014a00723add8cf2292b40d9 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Wed, 14 Apr 2021 02:50:01 +0200 Subject: [PATCH 220/258] use TNL::scan instead of thrust --- GPUSort/src/quicksort/quicksort.cuh | 15 
+++++++-------- GPUSort/src/quicksort/quicksort_kernel.cuh | 17 +++++++---------- 2 files changed, 14 insertions(+), 18 deletions(-) diff --git a/GPUSort/src/quicksort/quicksort.cuh b/GPUSort/src/quicksort/quicksort.cuh index 5b09db8c7..e19aef9a5 100644 --- a/GPUSort/src/quicksort/quicksort.cuh +++ b/GPUSort/src/quicksort/quicksort.cuh @@ -1,12 +1,11 @@ #pragma once #include +#include +#include #include "task.h" #include "quicksort_kernel.cuh" -#include -#include - #include #define deb(x) std::cout << #x << " = " << x << std::endl; @@ -41,7 +40,7 @@ class QUICKSORT Array cuda_newTasksAmount, cuda_2ndPhaseTasksAmount; //is in reality 1 integer each Array cuda_blockToTaskMapping; - Array cuda_reductionTaskInitMem; + Vector cuda_reductionTaskInitMem; //-------------------------------------- @@ -349,11 +348,11 @@ int QUICKSORT::initTasks(int elemPerBlock, const CMP &Cmp) cudaCalcBlocksNeeded<<>>(tasks.getView(0, host_1stPhaseTasksAmount), elemPerBlock, cuda_reductionTaskInitMem.getView(0, host_1stPhaseTasksAmount)); //cuda_reductionTaskInitMem[i] == how many blocks task i needs + + auto reduce = [] __cuda_callable__(const int &a, const int &b) { return a + b; }; - thrust::inclusive_scan(thrust::device, - cuda_reductionTaskInitMem.getData(), - cuda_reductionTaskInitMem.getData() + host_1stPhaseTasksAmount, - cuda_reductionTaskInitMem.getData()); + Algorithms::Scan:: + perform(cuda_reductionTaskInitMem, 0, cuda_reductionTaskInitMem.getSize(), reduce, 0); //cuda_reductionTaskInitMem[i] == how many blocks task [0..i] need int blocksNeeded = cuda_reductionTaskInitMem.getElement(host_1stPhaseTasksAmount - 1); diff --git a/GPUSort/src/quicksort/quicksort_kernel.cuh b/GPUSort/src/quicksort/quicksort_kernel.cuh index 877e7a406..a764eb161 100644 --- a/GPUSort/src/quicksort/quicksort_kernel.cuh +++ b/GPUSort/src/quicksort/quicksort_kernel.cuh @@ -1,6 +1,7 @@ #pragma once #include +#include #include "../util/reduction.cuh" #include "task.h" #include "cudaPartition.cuh" @@ -18,7 
+19,7 @@ __device__ void writeNewTask(int begin, int end, int depth, int maxElemFor2ndPha //----------------------------------------------------------- __global__ void cudaCalcBlocksNeeded(ArrayView cuda_tasks, int elemPerBlock, - ArrayView blocksNeeded) + VectorView blocksNeeded) { int i = blockIdx.x * blockDim.x + threadIdx.x; if (i >= cuda_tasks.getSize()) @@ -34,7 +35,7 @@ __global__ void cudaCalcBlocksNeeded(ArrayView cuda_tasks, template __global__ void cudaInitTask(ArrayView cuda_tasks, ArrayView cuda_blockToTaskMapping, - ArrayView cuda_reductionTaskInitMem, + VectorView cuda_reductionTaskInitMem, ArrayView src, CMP Cmp) { if (blockIdx.x >= cuda_tasks.getSize()) @@ -202,13 +203,11 @@ __global__ void cudaQuickSort2ndPhase(ArrayView arr, Array if (elemInShared == 0) { - singleBlockQuickSort - (arrView, auxView, Cmp, myTask.depth, sharedMem, 0, maxBitonicSize); + singleBlockQuickSort(arrView, auxView, Cmp, myTask.depth, sharedMem, 0, maxBitonicSize); } else { - singleBlockQuickSort - (arrView, auxView, Cmp, myTask.depth, sharedMem, elemInShared, maxBitonicSize); + singleBlockQuickSort(arrView, auxView, Cmp, myTask.depth, sharedMem, elemInShared, maxBitonicSize); } } @@ -239,13 +238,11 @@ __global__ void cudaQuickSort2ndPhase(ArrayView arr, Array if (elemInShared <= 0) { - singleBlockQuickSort - (arrView, auxView, Cmp, myTask.depth, sharedMem, 0, maxBitonicSize); + singleBlockQuickSort(arrView, auxView, Cmp, myTask.depth, sharedMem, 0, maxBitonicSize); } else { - singleBlockQuickSort - (arrView, auxView, Cmp, myTask.depth, sharedMem, elemInShared, maxBitonicSize); + singleBlockQuickSort(arrView, auxView, Cmp, myTask.depth, sharedMem, elemInShared, maxBitonicSize); } } -- GitLab From f06e3aaf5c695fd388da211706113eba3a665b56 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Wed, 5 May 2021 19:15:43 +0200 Subject: [PATCH 221/258] calculate partsInSeq inside kernel --- GPUSort/src/bitonicSort/bitonicSort.h | 12 +++++++----- 1 file changed, 7 insertions(+), 5 
deletions(-) diff --git a/GPUSort/src/bitonicSort/bitonicSort.h b/GPUSort/src/bitonicSort/bitonicSort.h index e3afe783d..41c8dd789 100644 --- a/GPUSort/src/bitonicSort/bitonicSort.h +++ b/GPUSort/src/bitonicSort/bitonicSort.h @@ -46,7 +46,7 @@ __host__ __device__ void cmpSwap(Value &a, Value &b, bool ascending, const CMP & template __global__ void bitonicMergeGlobal(TNL::Containers::ArrayView arr, CMP Cmp, - int monotonicSeqLen, int len, int partsInSeq) + int monotonicSeqLen, int len) { int i = blockIdx.x * blockDim.x + threadIdx.x; @@ -58,6 +58,7 @@ __global__ void bitonicMergeGlobal(TNL::Containers::ArrayView= arr.getSize()) //arr[e] is virtual padding and will not be exchanged with return; + int partsInSeq = monotonicSeqLen / len; //calculate the direction of swapping int monotonicSeqIdx = part / partsInSeq; bool ascending = (monotonicSeqIdx & 1) != 0; @@ -327,7 +328,7 @@ void bitonicSortWithShared(TNL::Containers::ArrayView if (len > sharedMemLen) { bitonicMergeGlobal<<>>( - view, Cmp, monotonicSeqLen, len, partsInSeq); + view, Cmp, monotonicSeqLen, len); } else { @@ -356,7 +357,7 @@ void bitonicSort(TNL::Containers::ArrayView view, { for (int len = monotonicSeqLen, partsInSeq = 1; len > 1; len /= 2, partsInSeq *= 2) { - bitonicMergeGlobal<<>>(view, Cmp, monotonicSeqLen, len, partsInSeq); + bitonicMergeGlobal<<>>(view, Cmp, monotonicSeqLen, len); } } cudaDeviceSynchronize(); @@ -454,7 +455,7 @@ void bitonicSort(std::vector &vec) template __global__ void bitonicMergeGlobal(int size, FETCH Fetch, CMP Cmp, SWAP Swap, - int monotonicSeqLen, int len, int partsInSeq) + int monotonicSeqLen, int len) { int i = blockIdx.x * blockDim.x + threadIdx.x; @@ -467,6 +468,7 @@ __global__ void bitonicMergeGlobal(int size, FETCH Fetch, CMP Cmp, SWAP Swap, return; //calculate the direction of swapping + int partsInSeq = monotonicSeqLen / len; int monotonicSeqIdx = part / partsInSeq; bool ascending = (monotonicSeqIdx & 1) != 0; if ((monotonicSeqIdx + 1) * monotonicSeqLen >= size) 
//special case for part with no "partner" to be merged with in next phase @@ -504,7 +506,7 @@ void bitonicSort(int begin, int end, FETCH Fetch, const CMP &Cmp, SWAP Swap) { bitonicMergeGlobal<<>>( size, fetchWithOffset, Cmp, swapWithOffset, - monotonicSeqLen, len, partsInSeq); + monotonicSeqLen, len); } } cudaDeviceSynchronize(); -- GitLab From 47fe0ee7611057750352f6e4ce94bc26ec599c08 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Wed, 5 May 2021 19:16:54 +0200 Subject: [PATCH 222/258] threadPerBlock -> threadsPerBlock --- GPUSort/src/bitonicSort/bitonicSort.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/GPUSort/src/bitonicSort/bitonicSort.h b/GPUSort/src/bitonicSort/bitonicSort.h index 41c8dd789..5d128f7af 100644 --- a/GPUSort/src/bitonicSort/bitonicSort.h +++ b/GPUSort/src/bitonicSort/bitonicSort.h @@ -487,8 +487,8 @@ void bitonicSort(int begin, int end, FETCH Fetch, const CMP &Cmp, SWAP Swap) int threadsNeeded = size / 2 + (size % 2 != 0); const int maxThreadsPerBlock = 512; - int threadPerBlock = maxThreadsPerBlock; - int blocks = threadsNeeded / threadPerBlock + (threadsNeeded % threadPerBlock != 0); + int threadsPerBlock = maxThreadsPerBlock; + int blocks = threadsNeeded / threadsPerBlock + (threadsNeeded % threadsPerBlock != 0); auto fetchWithOffset = [=] __cuda_callable__(int i) { @@ -504,7 +504,7 @@ void bitonicSort(int begin, int end, FETCH Fetch, const CMP &Cmp, SWAP Swap) { for (int len = monotonicSeqLen, partsInSeq = 1; len > 1; len /= 2, partsInSeq *= 2) { - bitonicMergeGlobal<<>>( + bitonicMergeGlobal<<>>( size, fetchWithOffset, Cmp, swapWithOffset, monotonicSeqLen, len); } -- GitLab From 95a05956654e8fc0d9f4dd626001f995ac62c70a Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Sat, 8 May 2021 19:01:06 +0200 Subject: [PATCH 223/258] refactor len -> bitonicLen --- GPUSort/src/bitonicSort/bitonicSort.h | 86 +++++++++++++-------------- GPUSort/src/util/config.mk | 4 ++ 2 files changed, 47 insertions(+), 43 
deletions(-) diff --git a/GPUSort/src/bitonicSort/bitonicSort.h b/GPUSort/src/bitonicSort/bitonicSort.h index 5d128f7af..a4b8b1533 100644 --- a/GPUSort/src/bitonicSort/bitonicSort.h +++ b/GPUSort/src/bitonicSort/bitonicSort.h @@ -13,9 +13,9 @@ static __device__ __forceinline__ unsigned int __btflo(unsigned int word) return ret; } -__device__ int closestPow2_ptx(int len) +__device__ int closestPow2_ptx(int bitonicLen) { - return 1 << (__btflo((unsigned)len - 1U) + 1); + return 1 << (__btflo((unsigned)bitonicLen - 1U) + 1); } __host__ __device__ int closestPow2(int x) @@ -46,19 +46,19 @@ __host__ __device__ void cmpSwap(Value &a, Value &b, bool ascending, const CMP & template __global__ void bitonicMergeGlobal(TNL::Containers::ArrayView arr, CMP Cmp, - int monotonicSeqLen, int len) + int monotonicSeqLen, int bitonicLen) { int i = blockIdx.x * blockDim.x + threadIdx.x; - int part = i / (len / 2); //computes which sorting block this thread belongs to + int part = i / (bitonicLen / 2); //computes which sorting block this thread belongs to //the index of 2 elements that should be compared and swapped - int s = part * len + (i & ((len / 2) - 1)); - int e = s + len / 2; + int s = part * bitonicLen + (i & ((bitonicLen / 2) - 1)); + int e = s + bitonicLen / 2; if (e >= arr.getSize()) //arr[e] is virtual padding and will not be exchanged with return; - int partsInSeq = monotonicSeqLen / len; + int partsInSeq = monotonicSeqLen / bitonicLen; //calculate the direction of swapping int monotonicSeqIdx = part / partsInSeq; bool ascending = (monotonicSeqIdx & 1) != 0; @@ -79,7 +79,7 @@ __global__ void bitonicMergeGlobal(TNL::Containers::ArrayView __global__ void bitonicMergeSharedMemory(TNL::Containers::ArrayView arr, CMP Cmp, - int monotonicSeqLen, int len, int partsInSeq) + int monotonicSeqLen, int bitonicLen, int partsInSeq) { extern __shared__ int externMem[]; Value *sharedMem = (Value *)externMem; @@ -100,7 +100,7 @@ __global__ void 
bitonicMergeSharedMemory(TNL::Containers::ArrayView 1; len /= 2) + for (; bitonicLen > 1; bitonicLen /= 2) { //calculates which 2 indexes will be compared and swap - int part = threadIdx.x / (len / 2); - int s = part * len + (threadIdx.x & ((len / 2) - 1)); - int e = s + len / 2; + int part = threadIdx.x / (bitonicLen / 2); + int s = part * bitonicLen + (threadIdx.x & ((bitonicLen / 2) - 1)); + int e = s + bitonicLen / 2; if (e < myBlockEnd - myBlockStart) //not touching virtual padding cmpSwap(sharedMem[s], sharedMem[e], ascending, Cmp); @@ -139,7 +139,7 @@ __global__ void bitonicMergeSharedMemory(TNL::Containers::ArrayView __global__ void bitonicMerge(TNL::Containers::ArrayView arr, CMP Cmp, - int monotonicSeqLen, int len, int partsInSeq) + int monotonicSeqLen, int bitonicLen, int partsInSeq) { //1st index and last index of subarray that this threadBlock should merge int myBlockStart = blockIdx.x * (2 * blockDim.x); @@ -149,7 +149,7 @@ __global__ void bitonicMerge(TNL::Containers::ArrayView 1; len /= 2) + for (; bitonicLen > 1; bitonicLen /= 2) { //calculates which 2 indexes will be compared and swap - int part = threadIdx.x / (len / 2); - int s = part * len + (threadIdx.x & ((len / 2) - 1)); - int e = s + len / 2; + int part = threadIdx.x / (bitonicLen / 2); + int s = part * bitonicLen + (threadIdx.x & ((bitonicLen / 2) - 1)); + int e = s + bitonicLen / 2; if (e < myBlockEnd - myBlockStart) //not touching virtual padding cmpSwap(src[s], src[e], ascending, Cmp); @@ -198,14 +198,14 @@ __device__ void bitonicSort_Block(TNL::Containers::ArrayView 1; len /= 2) + for (int bitonicLen = monotonicSeqLen; bitonicLen > 1; bitonicLen /= 2) { for (int i = threadIdx.x;; i += blockDim.x) //simulates other blocks in case src.size > blockDim.x*2 { //calculates which 2 indexes will be compared and swap - int part = i / (len / 2); - int s = part * len + (i & ((len / 2) - 1)); - int e = s + len / 2; + int part = i / (bitonicLen / 2); + int s = part * bitonicLen + (i & ((bitonicLen 
/ 2) - 1)); + int e = s + bitonicLen / 2; if (e >= src.getSize()) //touching virtual padding, the order dont swap break; @@ -246,14 +246,14 @@ __device__ void bitonicSort_Block(TNL::Containers::ArrayView 1; len /= 2) + for (int bitonicLen = monotonicSeqLen; bitonicLen > 1; bitonicLen /= 2) { for (int i = threadIdx.x;; i += blockDim.x) //simulates other blocks in case src.size > blockDim.x*2 { //calculates which 2 indexes will be compared and swap - int part = i / (len / 2); - int s = part * len + (i & ((len / 2) - 1)); - int e = s + len / 2; + int part = i / (bitonicLen / 2); + int s = part * bitonicLen + (i & ((bitonicLen / 2) - 1)); + int e = s + bitonicLen / 2; if (e >= src.getSize()) break; @@ -318,24 +318,24 @@ void bitonicSortWithShared(TNL::Containers::ArrayView int paddedSize = closestPow2(view.getSize()); bitoniSort1stStepSharedMemory<<>>(view, Cmp); - //now alternating monotonic sequences with lenght of sharedMemLen + //now alternating monotonic sequences with bitonicLenght of sharedMemLen - // \/ has length of 2 * sharedMemLen + // \/ has bitonicLength of 2 * sharedMemLen for (int monotonicSeqLen = 2 * sharedMemLen; monotonicSeqLen <= paddedSize; monotonicSeqLen *= 2) { - for (int len = monotonicSeqLen, partsInSeq = 1; len > 1; len /= 2, partsInSeq *= 2) + for (int bitonicLen = monotonicSeqLen, partsInSeq = 1; bitonicLen > 1; bitonicLen /= 2, partsInSeq *= 2) { - if (len > sharedMemLen) + if (bitonicLen > sharedMemLen) { bitonicMergeGlobal<<>>( - view, Cmp, monotonicSeqLen, len); + view, Cmp, monotonicSeqLen, bitonicLen); } else { bitonicMergeSharedMemory<<>>( - view, Cmp, monotonicSeqLen, len, partsInSeq); + view, Cmp, monotonicSeqLen, bitonicLen, partsInSeq); - //simulates sorts until len == 2 already, no need to continue this loop + //simulates sorts until bitonicLen == 2 already, no need to continue this loop break; } } @@ -355,9 +355,9 @@ void bitonicSort(TNL::Containers::ArrayView view, for (int monotonicSeqLen = 2; monotonicSeqLen <= paddedSize; 
monotonicSeqLen *= 2) { - for (int len = monotonicSeqLen, partsInSeq = 1; len > 1; len /= 2, partsInSeq *= 2) + for (int bitonicLen = monotonicSeqLen, partsInSeq = 1; bitonicLen > 1; bitonicLen /= 2, partsInSeq *= 2) { - bitonicMergeGlobal<<>>(view, Cmp, monotonicSeqLen, len); + bitonicMergeGlobal<<>>(view, Cmp, monotonicSeqLen, bitonicLen); } } cudaDeviceSynchronize(); @@ -455,20 +455,20 @@ void bitonicSort(std::vector &vec) template __global__ void bitonicMergeGlobal(int size, FETCH Fetch, CMP Cmp, SWAP Swap, - int monotonicSeqLen, int len) + int monotonicSeqLen, int bitonicLen) { int i = blockIdx.x * blockDim.x + threadIdx.x; - int part = i / (len / 2); //computes which sorting block this thread belongs to + int part = i / (bitonicLen / 2); //computes which sorting block this thread belongs to //the index of 2 elements that should be compared and swapped - int s = part * len + (i & ((len / 2) - 1)); - int e = s + len / 2; + int s = part * bitonicLen + (i & ((bitonicLen / 2) - 1)); + int e = s + bitonicLen / 2; if (e >= size) //arr[e] is virtual padding and will not be exchanged with return; //calculate the direction of swapping - int partsInSeq = monotonicSeqLen / len; + int partsInSeq = monotonicSeqLen / bitonicLen; int monotonicSeqIdx = part / partsInSeq; bool ascending = (monotonicSeqIdx & 1) != 0; if ((monotonicSeqIdx + 1) * monotonicSeqLen >= size) //special case for part with no "partner" to be merged with in next phase @@ -502,11 +502,11 @@ void bitonicSort(int begin, int end, FETCH Fetch, const CMP &Cmp, SWAP Swap) for (int monotonicSeqLen = 2; monotonicSeqLen <= paddedSize; monotonicSeqLen *= 2) { - for (int len = monotonicSeqLen, partsInSeq = 1; len > 1; len /= 2, partsInSeq *= 2) + for (int bitonicLen = monotonicSeqLen, partsInSeq = 1; bitonicLen > 1; bitonicLen /= 2, partsInSeq *= 2) { bitonicMergeGlobal<<>>( size, fetchWithOffset, Cmp, swapWithOffset, - monotonicSeqLen, len); + monotonicSeqLen, bitonicLen); } } cudaDeviceSynchronize(); diff --git 
a/GPUSort/src/util/config.mk b/GPUSort/src/util/config.mk index 3715986f7..e7db43570 100644 --- a/GPUSort/src/util/config.mk +++ b/GPUSort/src/util/config.mk @@ -25,6 +25,10 @@ endif # CUDA compiler flags CUDA_CXXFLAGS := -std=c++14 --expt-relaxed-constexpr --expt-extended-lambda $(TNL_INCLUDE_DIRS) CUDA_CXXFLAGS += -DHAVE_CUDA +ifeq ($(WITH_DEBUG), no) + CUDA_CXXFLAGS += -O3 -DNDEBUG +endif + ifeq ($(CUDA_ARCH),auto) CUDA_CXXFLAGS += $(shell tnl-cuda-arch) else -- GitLab From af64fb386cafcff00f409894729fc3ee511cc753 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Sat, 8 May 2021 20:44:57 +0200 Subject: [PATCH 224/258] remove useless param --- GPUSort/src/bitonicSort/bitonicSort.h | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/GPUSort/src/bitonicSort/bitonicSort.h b/GPUSort/src/bitonicSort/bitonicSort.h index a4b8b1533..0f14f7eb1 100644 --- a/GPUSort/src/bitonicSort/bitonicSort.h +++ b/GPUSort/src/bitonicSort/bitonicSort.h @@ -31,7 +31,7 @@ __host__ __device__ int closestPow2(int x) } template -__host__ __device__ void cmpSwap(Value &a, Value &b, bool ascending, const CMP &Cmp) +__cuda_callable__ void cmpSwap(Value &a, Value &b, bool ascending, const CMP &Cmp) { if (ascending == Cmp(b, a)) TNL::swap(a, b); @@ -79,7 +79,7 @@ __global__ void bitonicMergeGlobal(TNL::Containers::ArrayView __global__ void bitonicMergeSharedMemory(TNL::Containers::ArrayView arr, CMP Cmp, - int monotonicSeqLen, int bitonicLen, int partsInSeq) + int monotonicSeqLen, int bitonicLen) { extern __shared__ int externMem[]; Value *sharedMem = (Value *)externMem; @@ -101,6 +101,7 @@ __global__ void bitonicMergeSharedMemory(TNL::Containers::ArrayView __global__ void bitonicMerge(TNL::Containers::ArrayView arr, CMP Cmp, - int monotonicSeqLen, int bitonicLen, int partsInSeq) + int monotonicSeqLen, int bitonicLen) { //1st index and last index of subarray that this threadBlock should merge int myBlockStart = blockIdx.x * (2 * blockDim.x); @@ -150,6 +151,7 @@ 
__global__ void bitonicMerge(TNL::Containers::ArrayView // \/ has bitonicLength of 2 * sharedMemLen for (int monotonicSeqLen = 2 * sharedMemLen; monotonicSeqLen <= paddedSize; monotonicSeqLen *= 2) { - for (int bitonicLen = monotonicSeqLen, partsInSeq = 1; bitonicLen > 1; bitonicLen /= 2, partsInSeq *= 2) + for (int bitonicLen = monotonicSeqLen; bitonicLen > 1; bitonicLen /= 2) { if (bitonicLen > sharedMemLen) { @@ -333,7 +335,7 @@ void bitonicSortWithShared(TNL::Containers::ArrayView else { bitonicMergeSharedMemory<<>>( - view, Cmp, monotonicSeqLen, bitonicLen, partsInSeq); + view, Cmp, monotonicSeqLen, bitonicLen); //simulates sorts until bitonicLen == 2 already, no need to continue this loop break; @@ -355,7 +357,7 @@ void bitonicSort(TNL::Containers::ArrayView view, for (int monotonicSeqLen = 2; monotonicSeqLen <= paddedSize; monotonicSeqLen *= 2) { - for (int bitonicLen = monotonicSeqLen, partsInSeq = 1; bitonicLen > 1; bitonicLen /= 2, partsInSeq *= 2) + for (int bitonicLen = monotonicSeqLen; bitonicLen > 1; bitonicLen /= 2) { bitonicMergeGlobal<<>>(view, Cmp, monotonicSeqLen, bitonicLen); } @@ -502,7 +504,7 @@ void bitonicSort(int begin, int end, FETCH Fetch, const CMP &Cmp, SWAP Swap) for (int monotonicSeqLen = 2; monotonicSeqLen <= paddedSize; monotonicSeqLen *= 2) { - for (int bitonicLen = monotonicSeqLen, partsInSeq = 1; bitonicLen > 1; bitonicLen /= 2, partsInSeq *= 2) + for (int bitonicLen = monotonicSeqLen; bitonicLen > 1; bitonicLen /= 2) { bitonicMergeGlobal<<>>( size, fetchWithOffset, Cmp, swapWithOffset, -- GitLab From abdb1f63384aa3ceb7cbb1465f79f7f057978e72 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Sat, 8 May 2021 23:16:29 +0200 Subject: [PATCH 225/258] refactor depth -> iteration --- GPUSort/src/quicksort/quicksort_1Block.cuh | 32 +++++++++++----------- GPUSort/src/quicksort/quicksort_kernel.cuh | 30 ++++++++++---------- GPUSort/src/quicksort/task.h | 8 +++--- 3 files changed, 35 insertions(+), 35 deletions(-) diff --git 
a/GPUSort/src/quicksort/quicksort_1Block.cuh b/GPUSort/src/quicksort/quicksort_1Block.cuh index de88eef01..b0a310cdf 100644 --- a/GPUSort/src/quicksort/quicksort_1Block.cuh +++ b/GPUSort/src/quicksort/quicksort_1Block.cuh @@ -31,27 +31,27 @@ __device__ void stackPush(int stackArrBegin[], int stackArrEnd[], int stackDepth[], int &stackTop, int begin, int pivotBegin, int pivotEnd, int end, - int depth); + int iteration); //--------------------------------------------------------------- template __device__ void singleBlockQuickSort(ArrayView arr, ArrayView aux, - const CMP &Cmp, int _depth, + const CMP &Cmp, int _iteration, Value *sharedMem, int memSize, int maxBitonicSize) { if (arr.getSize() <= maxBitonicSize) { - auto &src = (_depth & 1) == 0 ? arr : aux; + auto &src = (_iteration & 1) == 0 ? arr : aux; if (useShared && arr.getSize() <= memSize) externSort(src, arr, Cmp, sharedMem); else { externSort(src, Cmp); //extern sort without shared memory only works in-place, need to copy into from aux - if ((_depth & 1) != 0) + if ((_iteration & 1) != 0) for (int i = threadIdx.x; i < arr.getSize(); i += blockDim.x) arr[i] = src[i]; } @@ -61,7 +61,7 @@ __device__ void singleBlockQuickSort(ArrayView arr, static __shared__ int stackTop; static __shared__ int stackArrBegin[stackSize], stackArrEnd[stackSize], stackDepth[stackSize]; - static __shared__ int begin, end, depth; + static __shared__ int begin, end, iteration; static __shared__ int pivotBegin, pivotEnd; Value *piv = sharedMem; sharedMem += 1; @@ -71,7 +71,7 @@ __device__ void singleBlockQuickSort(ArrayView arr, stackTop = 0; stackArrBegin[stackTop] = 0; stackArrEnd[stackTop] = arr.getSize(); - stackDepth[stackTop] = _depth; + stackDepth[stackTop] = _iteration; stackTop++; } __syncthreads(); @@ -83,13 +83,13 @@ __device__ void singleBlockQuickSort(ArrayView arr, { begin = stackArrBegin[stackTop - 1]; end = stackArrEnd[stackTop - 1]; - depth = stackDepth[stackTop - 1]; + iteration = stackDepth[stackTop - 1]; stackTop--; 
} __syncthreads(); int size = end - begin; - auto &src = (depth & 1) == 0 ? arr : aux; + auto &src = (iteration & 1) == 0 ? arr : aux; //small enough for for bitonic if (size <= maxBitonicSize) @@ -100,7 +100,7 @@ __device__ void singleBlockQuickSort(ArrayView arr, { externSort(src.getView(begin, end), Cmp); //extern sort without shared memory only works in-place, need to copy into from aux - if ((depth & 1) != 0) + if ((iteration & 1) != 0) for (int i = threadIdx.x; i < src.getSize(); i += blockDim.x) arr[begin + i] = src[i]; } @@ -134,7 +134,7 @@ __device__ void singleBlockQuickSort(ArrayView arr, * move elements, either use shared mem for coalesced access or without shared mem if data is too big * */ - auto &dst = (depth & 1) == 0 ? aux : arr; + auto &dst = (iteration & 1) == 0 ? aux : arr; if (useShared && size <= memSize) { @@ -172,7 +172,7 @@ __device__ void singleBlockQuickSort(ArrayView arr, stackPush(stackArrBegin, stackArrEnd, stackDepth, stackTop, begin, begin + pivotBegin, begin + pivotEnd, end, - depth); + iteration); } __syncthreads(); //sync to update stackTop } //ends while loop @@ -185,7 +185,7 @@ __device__ void stackPush(int stackArrBegin[], int stackArrEnd[], int stackDepth[], int &stackTop, int begin, int pivotBegin, int pivotEnd, int end, - int depth) + int iteration) { int sizeL = pivotBegin - begin, sizeR = end - pivotEnd; @@ -197,7 +197,7 @@ __device__ void stackPush(int stackArrBegin[], int stackArrEnd[], { stackArrBegin[stackTop] = begin; stackArrEnd[stackTop] = pivotBegin; - stackDepth[stackTop] = depth + 1; + stackDepth[stackTop] = iteration + 1; stackTop++; } @@ -207,7 +207,7 @@ __device__ void stackPush(int stackArrBegin[], int stackArrEnd[], stackArrBegin[stackTop] = pivotEnd; stackArrEnd[stackTop] = end; - stackDepth[stackTop] = depth + 1; + stackDepth[stackTop] = iteration + 1; stackTop++; } } @@ -217,7 +217,7 @@ __device__ void stackPush(int stackArrBegin[], int stackArrEnd[], { stackArrBegin[stackTop] = pivotEnd; 
stackArrEnd[stackTop] = end; - stackDepth[stackTop] = depth + 1; + stackDepth[stackTop] = iteration + 1; stackTop++; } @@ -227,7 +227,7 @@ __device__ void stackPush(int stackArrBegin[], int stackArrEnd[], stackArrBegin[stackTop] = begin; stackArrEnd[stackTop] = pivotBegin; - stackDepth[stackTop] = depth + 1; + stackDepth[stackTop] = iteration + 1; stackTop++; } } diff --git a/GPUSort/src/quicksort/quicksort_kernel.cuh b/GPUSort/src/quicksort/quicksort_kernel.cuh index a764eb161..824b58199 100644 --- a/GPUSort/src/quicksort/quicksort_kernel.cuh +++ b/GPUSort/src/quicksort/quicksort_kernel.cuh @@ -12,7 +12,7 @@ using namespace TNL::Containers; //----------------------------------------------------------- -__device__ void writeNewTask(int begin, int end, int depth, int maxElemFor2ndPhase, +__device__ void writeNewTask(int begin, int end, int iteration, int maxElemFor2ndPhase, ArrayView newTasks, int *newTasksCnt, ArrayView secondPhaseTasks, int *secondPhaseTasksCnt); @@ -67,8 +67,8 @@ __global__ void cudaQuickSort1stPhase(ArrayView arr, Array Value *sharedMem = piv + 1; TASK &myTask = tasks[taskMapping[blockIdx.x]]; - auto &src = (myTask.depth & 1) == 0 ? arr : aux; - auto &dst = (myTask.depth & 1) == 0 ? aux : arr; + auto &src = (myTask.iteration & 1) == 0 ? arr : aux; + auto &dst = (myTask.iteration & 1) == 0 ? 
aux : arr; if (threadIdx.x == 0) *piv = src[myTask.pivotIdx]; @@ -95,7 +95,7 @@ __global__ void cudaWritePivot(ArrayView arr, ArrayView arr, ArrayView 0) { - writeNewTask(leftBegin, leftEnd, myTask.depth, + writeNewTask(leftBegin, leftEnd, myTask.iteration, maxElemFor2ndPhase, newTasks, newTasksCnt, secondPhaseTasks, secondPhaseTasksCnt); @@ -126,7 +126,7 @@ __global__ void cudaWritePivot(ArrayView arr, ArrayView 0) { writeNewTask(rightBegin, rightEnd, - myTask.depth, maxElemFor2ndPhase, + myTask.iteration, maxElemFor2ndPhase, newTasks, newTasksCnt, secondPhaseTasks, secondPhaseTasksCnt); } @@ -134,7 +134,7 @@ __global__ void cudaWritePivot(ArrayView arr, ArrayView newTasks, int *newTasksCnt, ArrayView secondPhaseTasks, int *secondPhaseTasksCnt) { @@ -152,13 +152,13 @@ __device__ void writeNewTask(int begin, int end, int depth, int maxElemFor2ndPha { int idx = atomicAdd(secondPhaseTasksCnt, 1); if (idx < secondPhaseTasks.getSize()) - secondPhaseTasks[idx] = TASK(begin, end, depth + 1); + secondPhaseTasks[idx] = TASK(begin, end, iteration + 1); else { //printf("ran out of memory, trying backup\n"); int idx = atomicAdd(newTasksCnt, 1); if (idx < newTasks.getSize()) - newTasks[idx] = TASK(begin, end, depth + 1); + newTasks[idx] = TASK(begin, end, iteration + 1); else printf("ran out of memory for second phase task, there isnt even space in newTask list\nPart of array may stay unsorted!!!\n"); } @@ -167,13 +167,13 @@ __device__ void writeNewTask(int begin, int end, int depth, int maxElemFor2ndPha { int idx = atomicAdd(newTasksCnt, 1); if (idx < newTasks.getSize()) - newTasks[idx] = TASK(begin, end, depth + 1); + newTasks[idx] = TASK(begin, end, iteration + 1); else { //printf("ran out of memory, trying backup\n"); int idx = atomicAdd(secondPhaseTasksCnt, 1); if (idx < secondPhaseTasks.getSize()) - secondPhaseTasks[idx] = TASK(begin, end, depth + 1); + secondPhaseTasks[idx] = TASK(begin, end, iteration + 1); else printf("ran out of memory for newtask, there isnt even 
space in second phase task list\nPart of array may stay unsorted!!!\n"); } @@ -203,11 +203,11 @@ __global__ void cudaQuickSort2ndPhase(ArrayView arr, Array if (elemInShared == 0) { - singleBlockQuickSort(arrView, auxView, Cmp, myTask.depth, sharedMem, 0, maxBitonicSize); + singleBlockQuickSort(arrView, auxView, Cmp, myTask.iteration, sharedMem, 0, maxBitonicSize); } else { - singleBlockQuickSort(arrView, auxView, Cmp, myTask.depth, sharedMem, elemInShared, maxBitonicSize); + singleBlockQuickSort(arrView, auxView, Cmp, myTask.iteration, sharedMem, elemInShared, maxBitonicSize); } } @@ -238,11 +238,11 @@ __global__ void cudaQuickSort2ndPhase(ArrayView arr, Array if (elemInShared <= 0) { - singleBlockQuickSort(arrView, auxView, Cmp, myTask.depth, sharedMem, 0, maxBitonicSize); + singleBlockQuickSort(arrView, auxView, Cmp, myTask.iteration, sharedMem, 0, maxBitonicSize); } else { - singleBlockQuickSort(arrView, auxView, Cmp, myTask.depth, sharedMem, elemInShared, maxBitonicSize); + singleBlockQuickSort(arrView, auxView, Cmp, myTask.iteration, sharedMem, elemInShared, maxBitonicSize); } } diff --git a/GPUSort/src/quicksort/task.h b/GPUSort/src/quicksort/task.h index 49a514c31..e75758431 100644 --- a/GPUSort/src/quicksort/task.h +++ b/GPUSort/src/quicksort/task.h @@ -7,15 +7,15 @@ struct TASK //----------------------------------------------- //helper variables for blocks working on this task - int depth; + int iteration; int pivotIdx; int dstBegin, dstEnd; int firstBlock, blockCount;//for workers read only values __cuda_callable__ - TASK(int begin, int end, int depth) + TASK(int begin, int end, int iteration) : partitionBegin(begin), partitionEnd(end), - depth(depth), pivotIdx(-1), + iteration(iteration), pivotIdx(-1), dstBegin(-151561), dstEnd(-151561), firstBlock(-100), blockCount(-100) {} @@ -42,7 +42,7 @@ std::ostream& operator<<(std::ostream & out, const TASK & task) { out << "[ "; out << task.partitionBegin << " - " << task.partitionEnd; - out << " | " << "depth: " 
<< task.depth; + out << " | " << "iteration: " << task.iteration; out << " | " << "pivotIdx: " << task.pivotIdx; return out << " ] "; } \ No newline at end of file -- GitLab From 8806e06872487e0ea2e3e127dcb73ef8c4eec3bc Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Sat, 8 May 2021 23:48:02 +0200 Subject: [PATCH 226/258] add other gpu sorts and measuring --- GPUSort/GPUSort/README.MD | 14 + .../{ => GPUSort}/benchmark/benchmarker.cpp | 0 .../benchmark/bitonic_benchmark/Makefile | 0 .../benchmark/bitonic_benchmark/benchmark.cu | 0 .../{ => GPUSort}/benchmark/generators.cpp | 0 GPUSort/{ => GPUSort}/benchmark/measure.cpp | 0 GPUSort/{ => GPUSort}/benchmark/measure.cu | 0 GPUSort/{ => GPUSort}/benchmark/measure.h | 0 .../benchmark/quicksort_benchmark/Makefile | 32 + .../quicksort_benchmark/benchmark.cu | 0 .../quicksort_dynamic_benchmark/Makefile | 0 .../quicksort_dynamic_benchmark/benchmark.cu | 0 .../src/bitonicSort/bitonicSort.h | 0 .../src/bitonicSort/sample/Makefile | 0 .../src/bitonicSort/sample/main.cu | 0 .../src/quicksort/cudaPartition.cuh | 0 .../{ => GPUSort}/src/quicksort/quicksort.cuh | 0 .../src/quicksort/quicksort_1Block.cuh | 0 .../src/quicksort/quicksort_kernel.cuh | 0 .../src/quicksort/sample/Makefile | 0 .../src/quicksort/sample/main.cu | 0 GPUSort/{ => GPUSort}/src/quicksort/task.h | 0 .../src/quicksort_dynamic/helper.cuh | 0 .../src/quicksort_dynamic/quicksort.cu | 0 .../src/quicksort_dynamic/quicksort.cuh | 0 .../src/quicksort_dynamic/sample/Makefile | 0 .../src/quicksort_dynamic/sample/main.cu | 0 .../src/quicksort_dynamic/task.h | 0 GPUSort/{ => GPUSort}/src/util/algorithm.h | 0 GPUSort/{ => GPUSort}/src/util/config.mk | 0 GPUSort/{ => GPUSort}/src/util/reduction.cuh | 0 GPUSort/{ => GPUSort}/src/util/timer.h | 0 .../tests/bitonic_tests/Makefile | 0 .../tests/bitonic_tests/unitTests.cu | 0 .../tests/quicksort_dynamic_tests/Makefile | 0 .../tests/quicksort_dynamic_tests/README.md | 1 + .../quicksort_dynamic_tests/unitTests.cu | 0 
.../tests/quicksort_unitTests/Makefile | 0 .../tests/quicksort_unitTests/unitTests.cu | 0 GPUSort/README.md | 17 + GPUSort/measuring/README.md | 9 + .../TNL_implementation/bitonic/Makefile | 27 + .../TNL_implementation/bitonic/main.cu | 1 + .../bitonic/sameDir/.Makefile | 24 + .../bitonic/sameDir/main.cu | 118 ++ .../TNL_implementation/cdpquicksort/.Makefile | 32 + .../cdpquicksort/benchmark.cu | 4 + .../TNL_implementation/quicksort/Makefile | 27 + .../TNL_implementation/quicksort/benchmark.cu | 1 + GPUSort/measuring/cederman_quicksort/Makefile | 23 + GPUSort/measuring/cederman_quicksort/main.cu | 12 + .../measuring/cuda_example/bitonic/Makefile | 27 + .../measuring/cuda_example/bitonic/main.cu | 37 + .../cdpAdvancedQuicksort/Makefile | 32 + .../cdpAdvancedQuicksort/benchmark.cu | 13 + .../cuda_example/cdpsimplequicksort/.Makefile | 32 + .../cdpsimplequicksort/benchmark.cu | 15 + GPUSort/measuring/davors/quicksort/Makefile | 27 + .../measuring/davors/quicksort/benchmark.cu | 20 + GPUSort/measuring/manca_quicksort/Makefile | 27 + GPUSort/measuring/manca_quicksort/main.cu | 15 + GPUSort/measuring/nickjillings/Makefile | 27 + GPUSort/measuring/nickjillings/main.cu | 12 + GPUSort/measuring/script.sh | 15 + GPUSort/measuring/std_sort/Makefile | 12 + GPUSort/measuring/std_sort/main.cpp | 12 + .../thrust}/Makefile | 4 +- GPUSort/measuring/thrust/main.cu | 12 + GPUSort/measuring/util/config.mk | 49 + .../otherGPUsorts/cederman/cederman_qsort.cu | 1092 ++++++++++++++ .../manca_quicksort.cu | 1317 +++++++++++++++++ 71 files changed, 3137 insertions(+), 2 deletions(-) create mode 100644 GPUSort/GPUSort/README.MD rename GPUSort/{ => GPUSort}/benchmark/benchmarker.cpp (100%) rename GPUSort/{ => GPUSort}/benchmark/bitonic_benchmark/Makefile (100%) rename GPUSort/{ => GPUSort}/benchmark/bitonic_benchmark/benchmark.cu (100%) rename GPUSort/{ => GPUSort}/benchmark/generators.cpp (100%) rename GPUSort/{ => GPUSort}/benchmark/measure.cpp (100%) rename GPUSort/{ => 
GPUSort}/benchmark/measure.cu (100%) rename GPUSort/{ => GPUSort}/benchmark/measure.h (100%) create mode 100644 GPUSort/GPUSort/benchmark/quicksort_benchmark/Makefile rename GPUSort/{ => GPUSort}/benchmark/quicksort_benchmark/benchmark.cu (100%) rename GPUSort/{ => GPUSort}/benchmark/quicksort_dynamic_benchmark/Makefile (100%) rename GPUSort/{ => GPUSort}/benchmark/quicksort_dynamic_benchmark/benchmark.cu (100%) rename GPUSort/{ => GPUSort}/src/bitonicSort/bitonicSort.h (100%) rename GPUSort/{ => GPUSort}/src/bitonicSort/sample/Makefile (100%) rename GPUSort/{ => GPUSort}/src/bitonicSort/sample/main.cu (100%) rename GPUSort/{ => GPUSort}/src/quicksort/cudaPartition.cuh (100%) rename GPUSort/{ => GPUSort}/src/quicksort/quicksort.cuh (100%) rename GPUSort/{ => GPUSort}/src/quicksort/quicksort_1Block.cuh (100%) rename GPUSort/{ => GPUSort}/src/quicksort/quicksort_kernel.cuh (100%) rename GPUSort/{ => GPUSort}/src/quicksort/sample/Makefile (100%) rename GPUSort/{ => GPUSort}/src/quicksort/sample/main.cu (100%) rename GPUSort/{ => GPUSort}/src/quicksort/task.h (100%) rename GPUSort/{ => GPUSort}/src/quicksort_dynamic/helper.cuh (100%) rename GPUSort/{ => GPUSort}/src/quicksort_dynamic/quicksort.cu (100%) rename GPUSort/{ => GPUSort}/src/quicksort_dynamic/quicksort.cuh (100%) rename GPUSort/{ => GPUSort}/src/quicksort_dynamic/sample/Makefile (100%) rename GPUSort/{ => GPUSort}/src/quicksort_dynamic/sample/main.cu (100%) rename GPUSort/{ => GPUSort}/src/quicksort_dynamic/task.h (100%) rename GPUSort/{ => GPUSort}/src/util/algorithm.h (100%) rename GPUSort/{ => GPUSort}/src/util/config.mk (100%) rename GPUSort/{ => GPUSort}/src/util/reduction.cuh (100%) rename GPUSort/{ => GPUSort}/src/util/timer.h (100%) rename GPUSort/{ => GPUSort}/tests/bitonic_tests/Makefile (100%) rename GPUSort/{ => GPUSort}/tests/bitonic_tests/unitTests.cu (100%) rename GPUSort/{ => GPUSort}/tests/quicksort_dynamic_tests/Makefile (100%) create mode 100644 
GPUSort/GPUSort/tests/quicksort_dynamic_tests/README.md rename GPUSort/{ => GPUSort}/tests/quicksort_dynamic_tests/unitTests.cu (100%) rename GPUSort/{ => GPUSort}/tests/quicksort_unitTests/Makefile (100%) rename GPUSort/{ => GPUSort}/tests/quicksort_unitTests/unitTests.cu (100%) create mode 100644 GPUSort/README.md create mode 100644 GPUSort/measuring/README.md create mode 100644 GPUSort/measuring/TNL_implementation/bitonic/Makefile create mode 100644 GPUSort/measuring/TNL_implementation/bitonic/main.cu create mode 100644 GPUSort/measuring/TNL_implementation/bitonic/sameDir/.Makefile create mode 100644 GPUSort/measuring/TNL_implementation/bitonic/sameDir/main.cu create mode 100644 GPUSort/measuring/TNL_implementation/cdpquicksort/.Makefile create mode 100644 GPUSort/measuring/TNL_implementation/cdpquicksort/benchmark.cu create mode 100644 GPUSort/measuring/TNL_implementation/quicksort/Makefile create mode 100644 GPUSort/measuring/TNL_implementation/quicksort/benchmark.cu create mode 100644 GPUSort/measuring/cederman_quicksort/Makefile create mode 100644 GPUSort/measuring/cederman_quicksort/main.cu create mode 100644 GPUSort/measuring/cuda_example/bitonic/Makefile create mode 100644 GPUSort/measuring/cuda_example/bitonic/main.cu create mode 100644 GPUSort/measuring/cuda_example/cdpAdvancedQuicksort/Makefile create mode 100644 GPUSort/measuring/cuda_example/cdpAdvancedQuicksort/benchmark.cu create mode 100644 GPUSort/measuring/cuda_example/cdpsimplequicksort/.Makefile create mode 100644 GPUSort/measuring/cuda_example/cdpsimplequicksort/benchmark.cu create mode 100644 GPUSort/measuring/davors/quicksort/Makefile create mode 100644 GPUSort/measuring/davors/quicksort/benchmark.cu create mode 100644 GPUSort/measuring/manca_quicksort/Makefile create mode 100644 GPUSort/measuring/manca_quicksort/main.cu create mode 100644 GPUSort/measuring/nickjillings/Makefile create mode 100644 GPUSort/measuring/nickjillings/main.cu create mode 100644 GPUSort/measuring/script.sh create 
mode 100644 GPUSort/measuring/std_sort/Makefile create mode 100644 GPUSort/measuring/std_sort/main.cpp rename GPUSort/{benchmark/quicksort_benchmark => measuring/thrust}/Makefile (90%) create mode 100644 GPUSort/measuring/thrust/main.cu create mode 100644 GPUSort/measuring/util/config.mk create mode 100644 GPUSort/otherGPUsorts/cederman/cederman_qsort.cu create mode 100644 GPUSort/otherGPUsorts/manca_quicksort_extracted/manca_quicksort.cu diff --git a/GPUSort/GPUSort/README.MD b/GPUSort/GPUSort/README.MD new file mode 100644 index 000000000..40e8b2f8e --- /dev/null +++ b/GPUSort/GPUSort/README.MD @@ -0,0 +1,14 @@ +## Code implemented by Nguyen Xuan Thang for bachelor thesis + +* benchmark + * folder containing benchmarking scripts + * the main function is in ``benchmarker.cpp`` + * for each implemented algorithm, there is a folder with a benchmarker and a Makefile, to test out the algorithm, run ``make run``, to clean up ``make clean`` +* src + * folder containing the implementation of Bitonic sort, Quick sort and CDP Quick sort + * inside each folder there is a ``sample`` folder + * to test out the algorithm, simply run ``make run`` +* tests + * folder containing unit tests for each algorithm + * inside each folder there is a tester and a Makefile + * to test out the implementation, run ``make run`` \ No newline at end of file diff --git a/GPUSort/benchmark/benchmarker.cpp b/GPUSort/GPUSort/benchmark/benchmarker.cpp similarity index 100% rename from GPUSort/benchmark/benchmarker.cpp rename to GPUSort/GPUSort/benchmark/benchmarker.cpp diff --git a/GPUSort/benchmark/bitonic_benchmark/Makefile b/GPUSort/GPUSort/benchmark/bitonic_benchmark/Makefile similarity index 100% rename from GPUSort/benchmark/bitonic_benchmark/Makefile rename to GPUSort/GPUSort/benchmark/bitonic_benchmark/Makefile diff --git a/GPUSort/benchmark/bitonic_benchmark/benchmark.cu b/GPUSort/GPUSort/benchmark/bitonic_benchmark/benchmark.cu similarity index 100% rename from 
GPUSort/benchmark/bitonic_benchmark/benchmark.cu rename to GPUSort/GPUSort/benchmark/bitonic_benchmark/benchmark.cu diff --git a/GPUSort/benchmark/generators.cpp b/GPUSort/GPUSort/benchmark/generators.cpp similarity index 100% rename from GPUSort/benchmark/generators.cpp rename to GPUSort/GPUSort/benchmark/generators.cpp diff --git a/GPUSort/benchmark/measure.cpp b/GPUSort/GPUSort/benchmark/measure.cpp similarity index 100% rename from GPUSort/benchmark/measure.cpp rename to GPUSort/GPUSort/benchmark/measure.cpp diff --git a/GPUSort/benchmark/measure.cu b/GPUSort/GPUSort/benchmark/measure.cu similarity index 100% rename from GPUSort/benchmark/measure.cu rename to GPUSort/GPUSort/benchmark/measure.cu diff --git a/GPUSort/benchmark/measure.h b/GPUSort/GPUSort/benchmark/measure.h similarity index 100% rename from GPUSort/benchmark/measure.h rename to GPUSort/GPUSort/benchmark/measure.h diff --git a/GPUSort/GPUSort/benchmark/quicksort_benchmark/Makefile b/GPUSort/GPUSort/benchmark/quicksort_benchmark/Makefile new file mode 100644 index 000000000..14d2022df --- /dev/null +++ b/GPUSort/GPUSort/benchmark/quicksort_benchmark/Makefile @@ -0,0 +1,32 @@ +include ../../src/util/config.mk + +CUDA_SOURCES := $(wildcard *.cu) +CUDA_TARGETS := $(CUDA_SOURCES:%.cu=%) + +## targets definitions follow +.PHONY: all host cuda +all: cuda +cuda: $(CUDA_TARGETS) + +run: cuda + ./$(CUDA_TARGETS) + +measure: cuda + ./$(CUDA_TARGETS) ../quicksort.csv + +.PHONY: clean +clean: + rm -f *.d *.o *.cuo $(CUDA_TARGETS) + +# use .cuo instead of .cu.o to avoid problems with the implicit rules: https://stackoverflow.com/q/62967939 +# (and use the host compiler for linking CUDA, nvcc does not understand that .cuo is an object file) +$(CUDA_TARGETS): % : %.cuo + $(CXX) $(CUDA_LDFLAGS) -o $@ $< $(CUDA_LDLIBS) + +$(CUDA_SOURCES:%.cu=%.cuo): %.cuo : %.cu + $(CUDA_CXX) $(CUDA_CXXFLAGS) -c -o $@ $< + +debug: + $(CUDA_CXX) -DCHECK_RESULT_SORT $(CUDA_CXXFLAGS) -c -o benchmark.cuo benchmark.cu + $(CXX) 
$(CUDA_LDFLAGS) -o benchmark benchmark.cuo $(CUDA_LDLIBS) + ./benchmark \ No newline at end of file diff --git a/GPUSort/benchmark/quicksort_benchmark/benchmark.cu b/GPUSort/GPUSort/benchmark/quicksort_benchmark/benchmark.cu similarity index 100% rename from GPUSort/benchmark/quicksort_benchmark/benchmark.cu rename to GPUSort/GPUSort/benchmark/quicksort_benchmark/benchmark.cu diff --git a/GPUSort/benchmark/quicksort_dynamic_benchmark/Makefile b/GPUSort/GPUSort/benchmark/quicksort_dynamic_benchmark/Makefile similarity index 100% rename from GPUSort/benchmark/quicksort_dynamic_benchmark/Makefile rename to GPUSort/GPUSort/benchmark/quicksort_dynamic_benchmark/Makefile diff --git a/GPUSort/benchmark/quicksort_dynamic_benchmark/benchmark.cu b/GPUSort/GPUSort/benchmark/quicksort_dynamic_benchmark/benchmark.cu similarity index 100% rename from GPUSort/benchmark/quicksort_dynamic_benchmark/benchmark.cu rename to GPUSort/GPUSort/benchmark/quicksort_dynamic_benchmark/benchmark.cu diff --git a/GPUSort/src/bitonicSort/bitonicSort.h b/GPUSort/GPUSort/src/bitonicSort/bitonicSort.h similarity index 100% rename from GPUSort/src/bitonicSort/bitonicSort.h rename to GPUSort/GPUSort/src/bitonicSort/bitonicSort.h diff --git a/GPUSort/src/bitonicSort/sample/Makefile b/GPUSort/GPUSort/src/bitonicSort/sample/Makefile similarity index 100% rename from GPUSort/src/bitonicSort/sample/Makefile rename to GPUSort/GPUSort/src/bitonicSort/sample/Makefile diff --git a/GPUSort/src/bitonicSort/sample/main.cu b/GPUSort/GPUSort/src/bitonicSort/sample/main.cu similarity index 100% rename from GPUSort/src/bitonicSort/sample/main.cu rename to GPUSort/GPUSort/src/bitonicSort/sample/main.cu diff --git a/GPUSort/src/quicksort/cudaPartition.cuh b/GPUSort/GPUSort/src/quicksort/cudaPartition.cuh similarity index 100% rename from GPUSort/src/quicksort/cudaPartition.cuh rename to GPUSort/GPUSort/src/quicksort/cudaPartition.cuh diff --git a/GPUSort/src/quicksort/quicksort.cuh 
b/GPUSort/GPUSort/src/quicksort/quicksort.cuh similarity index 100% rename from GPUSort/src/quicksort/quicksort.cuh rename to GPUSort/GPUSort/src/quicksort/quicksort.cuh diff --git a/GPUSort/src/quicksort/quicksort_1Block.cuh b/GPUSort/GPUSort/src/quicksort/quicksort_1Block.cuh similarity index 100% rename from GPUSort/src/quicksort/quicksort_1Block.cuh rename to GPUSort/GPUSort/src/quicksort/quicksort_1Block.cuh diff --git a/GPUSort/src/quicksort/quicksort_kernel.cuh b/GPUSort/GPUSort/src/quicksort/quicksort_kernel.cuh similarity index 100% rename from GPUSort/src/quicksort/quicksort_kernel.cuh rename to GPUSort/GPUSort/src/quicksort/quicksort_kernel.cuh diff --git a/GPUSort/src/quicksort/sample/Makefile b/GPUSort/GPUSort/src/quicksort/sample/Makefile similarity index 100% rename from GPUSort/src/quicksort/sample/Makefile rename to GPUSort/GPUSort/src/quicksort/sample/Makefile diff --git a/GPUSort/src/quicksort/sample/main.cu b/GPUSort/GPUSort/src/quicksort/sample/main.cu similarity index 100% rename from GPUSort/src/quicksort/sample/main.cu rename to GPUSort/GPUSort/src/quicksort/sample/main.cu diff --git a/GPUSort/src/quicksort/task.h b/GPUSort/GPUSort/src/quicksort/task.h similarity index 100% rename from GPUSort/src/quicksort/task.h rename to GPUSort/GPUSort/src/quicksort/task.h diff --git a/GPUSort/src/quicksort_dynamic/helper.cuh b/GPUSort/GPUSort/src/quicksort_dynamic/helper.cuh similarity index 100% rename from GPUSort/src/quicksort_dynamic/helper.cuh rename to GPUSort/GPUSort/src/quicksort_dynamic/helper.cuh diff --git a/GPUSort/src/quicksort_dynamic/quicksort.cu b/GPUSort/GPUSort/src/quicksort_dynamic/quicksort.cu similarity index 100% rename from GPUSort/src/quicksort_dynamic/quicksort.cu rename to GPUSort/GPUSort/src/quicksort_dynamic/quicksort.cu diff --git a/GPUSort/src/quicksort_dynamic/quicksort.cuh b/GPUSort/GPUSort/src/quicksort_dynamic/quicksort.cuh similarity index 100% rename from GPUSort/src/quicksort_dynamic/quicksort.cuh rename to 
GPUSort/GPUSort/src/quicksort_dynamic/quicksort.cuh diff --git a/GPUSort/src/quicksort_dynamic/sample/Makefile b/GPUSort/GPUSort/src/quicksort_dynamic/sample/Makefile similarity index 100% rename from GPUSort/src/quicksort_dynamic/sample/Makefile rename to GPUSort/GPUSort/src/quicksort_dynamic/sample/Makefile diff --git a/GPUSort/src/quicksort_dynamic/sample/main.cu b/GPUSort/GPUSort/src/quicksort_dynamic/sample/main.cu similarity index 100% rename from GPUSort/src/quicksort_dynamic/sample/main.cu rename to GPUSort/GPUSort/src/quicksort_dynamic/sample/main.cu diff --git a/GPUSort/src/quicksort_dynamic/task.h b/GPUSort/GPUSort/src/quicksort_dynamic/task.h similarity index 100% rename from GPUSort/src/quicksort_dynamic/task.h rename to GPUSort/GPUSort/src/quicksort_dynamic/task.h diff --git a/GPUSort/src/util/algorithm.h b/GPUSort/GPUSort/src/util/algorithm.h similarity index 100% rename from GPUSort/src/util/algorithm.h rename to GPUSort/GPUSort/src/util/algorithm.h diff --git a/GPUSort/src/util/config.mk b/GPUSort/GPUSort/src/util/config.mk similarity index 100% rename from GPUSort/src/util/config.mk rename to GPUSort/GPUSort/src/util/config.mk diff --git a/GPUSort/src/util/reduction.cuh b/GPUSort/GPUSort/src/util/reduction.cuh similarity index 100% rename from GPUSort/src/util/reduction.cuh rename to GPUSort/GPUSort/src/util/reduction.cuh diff --git a/GPUSort/src/util/timer.h b/GPUSort/GPUSort/src/util/timer.h similarity index 100% rename from GPUSort/src/util/timer.h rename to GPUSort/GPUSort/src/util/timer.h diff --git a/GPUSort/tests/bitonic_tests/Makefile b/GPUSort/GPUSort/tests/bitonic_tests/Makefile similarity index 100% rename from GPUSort/tests/bitonic_tests/Makefile rename to GPUSort/GPUSort/tests/bitonic_tests/Makefile diff --git a/GPUSort/tests/bitonic_tests/unitTests.cu b/GPUSort/GPUSort/tests/bitonic_tests/unitTests.cu similarity index 100% rename from GPUSort/tests/bitonic_tests/unitTests.cu rename to GPUSort/GPUSort/tests/bitonic_tests/unitTests.cu 
diff --git a/GPUSort/tests/quicksort_dynamic_tests/Makefile b/GPUSort/GPUSort/tests/quicksort_dynamic_tests/Makefile similarity index 100% rename from GPUSort/tests/quicksort_dynamic_tests/Makefile rename to GPUSort/GPUSort/tests/quicksort_dynamic_tests/Makefile diff --git a/GPUSort/GPUSort/tests/quicksort_dynamic_tests/README.md b/GPUSort/GPUSort/tests/quicksort_dynamic_tests/README.md new file mode 100644 index 000000000..85a1ddbe5 --- /dev/null +++ b/GPUSort/GPUSort/tests/quicksort_dynamic_tests/README.md @@ -0,0 +1 @@ +the implementation of CDP Quick sort is broken and some tests can not be passed \ No newline at end of file diff --git a/GPUSort/tests/quicksort_dynamic_tests/unitTests.cu b/GPUSort/GPUSort/tests/quicksort_dynamic_tests/unitTests.cu similarity index 100% rename from GPUSort/tests/quicksort_dynamic_tests/unitTests.cu rename to GPUSort/GPUSort/tests/quicksort_dynamic_tests/unitTests.cu diff --git a/GPUSort/tests/quicksort_unitTests/Makefile b/GPUSort/GPUSort/tests/quicksort_unitTests/Makefile similarity index 100% rename from GPUSort/tests/quicksort_unitTests/Makefile rename to GPUSort/GPUSort/tests/quicksort_unitTests/Makefile diff --git a/GPUSort/tests/quicksort_unitTests/unitTests.cu b/GPUSort/GPUSort/tests/quicksort_unitTests/unitTests.cu similarity index 100% rename from GPUSort/tests/quicksort_unitTests/unitTests.cu rename to GPUSort/GPUSort/tests/quicksort_unitTests/unitTests.cu diff --git a/GPUSort/README.md b/GPUSort/README.md new file mode 100644 index 000000000..2117c0478 --- /dev/null +++ b/GPUSort/README.md @@ -0,0 +1,17 @@ +## repository for bachelor thesis on Development of parallel sorting algorithms for GPU + + +# directory structure +* measuring + * scripts and codes used to make comparison between different algorithms +* otherGPUsorts + * code of other sorting algorithms +* GPUSort + * implementation Bitonic sort and Quick sort for the thesis + + +sidenote: + +warnings during compilation such as the one below are emitted by the 
TNL library and is an expected behaviour + +/home//.local/include/TNL/Containers/ArrayView.h(155): warning: __host__ annotation is ignored on a function("ArrayView") that is explicitly defaulted on its first declaration \ No newline at end of file diff --git a/GPUSort/measuring/README.md b/GPUSort/measuring/README.md new file mode 100644 index 000000000..c9a6ff84d --- /dev/null +++ b/GPUSort/measuring/README.md @@ -0,0 +1,9 @@ +## measuring folder + +* *.ipynb are python jupyter notebook used to process measured data +* ``script.sh`` is a bash scrip that will start all measurements and save the results into ``results`` folder +* ``results`` is a folder to store all .csv files generated after measurement +* each of the folder has a Makefile to start measuring + * to measure an algorithm manually, go into the folder, call ``make`` and execute the binary + * ``./a.out`` will print the results on the standard output + * ``./a.out ../results/my_results.csv`` will save the time measured into the given file location \ No newline at end of file diff --git a/GPUSort/measuring/TNL_implementation/bitonic/Makefile b/GPUSort/measuring/TNL_implementation/bitonic/Makefile new file mode 100644 index 000000000..a8fc1a3eb --- /dev/null +++ b/GPUSort/measuring/TNL_implementation/bitonic/Makefile @@ -0,0 +1,27 @@ +include ../../util/config.mk + +CUDA_SOURCES := $(wildcard *.cu) +CUDA_TARGETS := $(CUDA_SOURCES:%.cu=%) + +## targets definitions follow +.PHONY: all host cuda +all: cuda +cuda: $(CUDA_TARGETS) + +run: cuda + ./$(CUDA_TARGETS) + +measure: cuda + ./$(CUDA_TARGETS) ../../results/TNL_bitonicsort.csv + +.PHONY: clean +clean: + rm -f *.d *.o *.cuo $(CUDA_TARGETS) + +# use .cuo instead of .cu.o to avoid problems with the implicit rules: https://stackoverflow.com/q/62967939 +# (and use the host compiler for linking CUDA, nvcc does not understand that .cuo is an object file) +$(CUDA_TARGETS): % : %.cuo + $(CXX) $(CUDA_LDFLAGS) -o $@ $< $(CUDA_LDLIBS) + +$(CUDA_SOURCES:%.cu=%.cuo): 
%.cuo : %.cu + $(CUDA_CXX) $(CUDA_CPPFLAGS) $(CUDA_CXXFLAGS) -c -o $@ $< diff --git a/GPUSort/measuring/TNL_implementation/bitonic/main.cu b/GPUSort/measuring/TNL_implementation/bitonic/main.cu new file mode 100644 index 000000000..51df65df1 --- /dev/null +++ b/GPUSort/measuring/TNL_implementation/bitonic/main.cu @@ -0,0 +1 @@ +#include "../../../GPUSort/benchmark/bitonic_benchmark/benchmark.cu" \ No newline at end of file diff --git a/GPUSort/measuring/TNL_implementation/bitonic/sameDir/.Makefile b/GPUSort/measuring/TNL_implementation/bitonic/sameDir/.Makefile new file mode 100644 index 000000000..23593937b --- /dev/null +++ b/GPUSort/measuring/TNL_implementation/bitonic/sameDir/.Makefile @@ -0,0 +1,24 @@ +include ../../util/config.mk + +CUDA_SOURCES := $(wildcard *.cu) +CUDA_TARGETS := $(CUDA_SOURCES:%.cu=%) + +## targets definitions follow +.PHONY: all host cuda +all: cuda +cuda: $(CUDA_TARGETS) + +run: cuda + ./$(CUDA_TARGETS) + +.PHONY: clean +clean: + rm -f *.d *.o *.cuo $(CUDA_TARGETS) + +# use .cuo instead of .cu.o to avoid problems with the implicit rules: https://stackoverflow.com/q/62967939 +# (and use the host compiler for linking CUDA, nvcc does not understand that .cuo is an object file) +$(CUDA_TARGETS): % : %.cuo + $(CXX) $(CUDA_LDFLAGS) -o $@ $< $(CUDA_LDLIBS) + +$(CUDA_SOURCES:%.cu=%.cuo): %.cuo : %.cu + $(CUDA_CXX) $(CUDA_CPPFLAGS) $(CUDA_CXXFLAGS) -c -o $@ $< diff --git a/GPUSort/measuring/TNL_implementation/bitonic/sameDir/main.cu b/GPUSort/measuring/TNL_implementation/bitonic/sameDir/main.cu new file mode 100644 index 000000000..412bf0cf3 --- /dev/null +++ b/GPUSort/measuring/TNL_implementation/bitonic/sameDir/main.cu @@ -0,0 +1,118 @@ +#include "../../../GPUSort/bitonicGPU/bitonicSort.h" +#include +#include "../../util/timer.h" +#include "../../util/algorithm.h" + +#include +#include +#include +#include +#include +#include +#include + + +class NOT_SORTED_PROPERLY{}; + +using namespace std; +int main() +{ + ofstream out("TNL_sameDir.csv"); + 
out << "implementation,size,sorted,almost_sorted,decreasing,random" << endl; + + for(int pow = 3; pow <= 23 ; pow++) + { + int size =(1<< pow); + std::set sizes{size, size+1, size-1}; + for(int i = 0; i < 3; i++) + sizes.insert(size + (std::rand() % size)); + + for(auto x : sizes) + { + cout << "checking size =" << x << endl; + + out << "TNL," << x; + std::vector vec(x); + for(int i = 0; i < x ; ++i) + vec[i] = i; + TNL::Containers::Array arr; + + //sorted sequence + { + arr = vec; + auto view = arr.getView(); + { + TIMER t([&](double res){out << "," << res;}); + bitonicSort(arr.getView()); + } + + if(!is_sorted(arr.getView())) + { + cerr << "sorted seq" << endl; + throw NOT_SORTED_PROPERLY(); + } + } + + //almost sorted sequence + { + for(int i = 0; i < 3; i++) + { + int s = std::rand() % (x - 3); + std::swap(vec[s], vec[s + 1]); + } + + auto view = arr.getView(); + { + TIMER t([&](double res){out << "," << res;}); + bitonicSort(arr.getView()); + } + + if(!is_sorted(arr.getView())) + { + cerr << "almost sorted seq" << endl; + throw NOT_SORTED_PROPERLY(); + } + } + + //decreasing sequence + { + for(size_t i = 0; i < x; i++) + vec[i] = -i; + + auto view = arr.getView(); + { + TIMER t([&](double res){out << "," << res;}); + bitonicSort(arr.getView()); + } + + if(!is_sorted(arr.getView())) + { + cerr << "dec seq" << endl; + throw NOT_SORTED_PROPERLY(); + } + } + + //random sequence + { + std::random_shuffle(vec.begin(), vec.end()); + + auto view = arr.getView(); + { + TIMER t([&](double res){out << "," << res;}); + bitonicSort(arr.getView()); + } + + if(!is_sorted(arr.getView())) + { + cerr << "random seq" << endl; + throw NOT_SORTED_PROPERLY(); + } + } + + out << endl; + } + + } + + return 0; +} \ No newline at end of file diff --git a/GPUSort/measuring/TNL_implementation/cdpquicksort/.Makefile b/GPUSort/measuring/TNL_implementation/cdpquicksort/.Makefile new file mode 100644 index 000000000..080c69609 --- /dev/null +++ 
b/GPUSort/measuring/TNL_implementation/cdpquicksort/.Makefile @@ -0,0 +1,32 @@ +include ../../util/config.mk + +CUDA_SOURCES := $(wildcard *.cu) +CUDA_TARGETS := $(CUDA_SOURCES:%.cu=%) + +EXTRA_ARCH := -gencode arch=compute_52,code=sm_52 +DEVICE_CODE := -dc + +CUDA_LDLIBS += -lcudadevrt + +## targets definitions follow +.PHONY: all host cuda +all: cuda +cuda: $(CUDA_TARGETS) + +run: cuda + ./$(CUDA_TARGETS) + +measure: cuda + ./$(CUDA_TARGETS) ../../results/TNL_cdpQuicksort.csv + +.PHONY: clean +clean: + rm -f *.d *.o *.cuo $(CUDA_TARGETS) + +# use .cuo instead of .cu.o to avoid problems with the implicit rules: https://stackoverflow.com/q/62967939 +# (and use the host compiler for linking CUDA, nvcc does not understand that .cuo is an object file) +$(CUDA_TARGETS): % : %.o + $(CUDA_CXX) $(CUDA_LDFLAGS) -o $@ $< $(CUDA_LDLIBS) + +$(CUDA_SOURCES:%.cu=%.o): %.o : %.cu + $(CUDA_CXX) $(CUDA_CPPFLAGS) $(CUDA_CXXFLAGS) $(EXTRA_ARCH) $(DEVICE_CODE) -I/home/xuant/NVIDIA_CUDA-11.1_Samples/common/inc -c -o $@ $< diff --git a/GPUSort/measuring/TNL_implementation/cdpquicksort/benchmark.cu b/GPUSort/measuring/TNL_implementation/cdpquicksort/benchmark.cu new file mode 100644 index 000000000..332eae267 --- /dev/null +++ b/GPUSort/measuring/TNL_implementation/cdpquicksort/benchmark.cu @@ -0,0 +1,4 @@ +#include "../../../GPUSort/src/quicksort_dynamic/quicksort.cu" +#define SORTERFUNCTION quicksort +//--------------------------- +#include "../../../GPUSort/benchmark/benchmarker.cpp" \ No newline at end of file diff --git a/GPUSort/measuring/TNL_implementation/quicksort/Makefile b/GPUSort/measuring/TNL_implementation/quicksort/Makefile new file mode 100644 index 000000000..5c9a6e863 --- /dev/null +++ b/GPUSort/measuring/TNL_implementation/quicksort/Makefile @@ -0,0 +1,27 @@ +include ../../util/config.mk + +CUDA_SOURCES := $(wildcard *.cu) +CUDA_TARGETS := $(CUDA_SOURCES:%.cu=%) + +## targets definitions follow +.PHONY: all host cuda +all: cuda +cuda: $(CUDA_TARGETS) + +run: cuda + 
./$(CUDA_TARGETS) + +measure: cuda + ./$(CUDA_TARGETS) ../../results/TNL_quicksort.csv + +.PHONY: clean +clean: + rm -f *.d *.o *.cuo $(CUDA_TARGETS) + +# use .cuo instead of .cu.o to avoid problems with the implicit rules: https://stackoverflow.com/q/62967939 +# (and use the host compiler for linking CUDA, nvcc does not understand that .cuo is an object file) +$(CUDA_TARGETS): % : %.cuo + $(CXX) $(CUDA_LDFLAGS) -o $@ $< $(CUDA_LDLIBS) + +$(CUDA_SOURCES:%.cu=%.cuo): %.cuo : %.cu + $(CUDA_CXX) $(CUDA_CPPFLAGS) $(CUDA_CXXFLAGS) -c -o $@ $< diff --git a/GPUSort/measuring/TNL_implementation/quicksort/benchmark.cu b/GPUSort/measuring/TNL_implementation/quicksort/benchmark.cu new file mode 100644 index 000000000..0ed1c8400 --- /dev/null +++ b/GPUSort/measuring/TNL_implementation/quicksort/benchmark.cu @@ -0,0 +1 @@ +#include "../../../GPUSort/benchmark/quicksort_benchmark/benchmark.cu" \ No newline at end of file diff --git a/GPUSort/measuring/cederman_quicksort/Makefile b/GPUSort/measuring/cederman_quicksort/Makefile new file mode 100644 index 000000000..5872137b3 --- /dev/null +++ b/GPUSort/measuring/cederman_quicksort/Makefile @@ -0,0 +1,23 @@ +include ../util/config.mk + +CUDA_SOURCES := $(wildcard *.cu) +CUDA_TARGETS := $(CUDA_SOURCES:%.cu=%) + +## targets definitions follow +.PHONY: all host cuda +all: cuda +cuda: $(CUDA_TARGETS) + +run: cuda + ./$(CUDA_TARGETS) + +measure: cuda + ./$(CUDA_TARGETS) ../results/cederman_quicksort.csv + +.PHONY: clean +clean: + rm -f *.d *.o *.cuo $(CUDA_TARGETS) + +$(CUDA_TARGETS): $(CUDA_TARGETS).cu + nvcc $(CUDA_TARGETS).cu -o $(CUDA_TARGETS) + diff --git a/GPUSort/measuring/cederman_quicksort/main.cu b/GPUSort/measuring/cederman_quicksort/main.cu new file mode 100644 index 000000000..efa8c2540 --- /dev/null +++ b/GPUSort/measuring/cederman_quicksort/main.cu @@ -0,0 +1,12 @@ +#include "../../otherGPUsorts/cederman/cederman_qsort.cu" +#include + +void sorter(std::vector & vec) +{ + gpuqsort((unsigned int *)vec.data(), vec.size()); 
+} + +//------------------------------------ + +#include "../../GPUSort/benchmark/benchmarker.cpp" +#include "../../GPUSort/benchmark/measure.cpp" diff --git a/GPUSort/measuring/cuda_example/bitonic/Makefile b/GPUSort/measuring/cuda_example/bitonic/Makefile new file mode 100644 index 000000000..316d49a6e --- /dev/null +++ b/GPUSort/measuring/cuda_example/bitonic/Makefile @@ -0,0 +1,27 @@ +include ../../util/config.mk + +CUDA_SOURCES := $(wildcard *.cu) +CUDA_TARGETS := $(CUDA_SOURCES:%.cu=%) + +## targets definitions follow +.PHONY: all host cuda +all: cuda +cuda: $(CUDA_TARGETS) + +run: cuda + ./$(CUDA_TARGETS) + +measure: cuda + ./$(CUDA_TARGETS) ../../results/nvidia_bitonic.csv + +.PHONY: clean +clean: + rm -f *.d *.o *.cuo $(CUDA_TARGETS) + +# use .cuo instead of .cu.o to avoid problems with the implicit rules: https://stackoverflow.com/q/62967939 +# (and use the host compiler for linking CUDA, nvcc does not understand that .cuo is an object file) +$(CUDA_TARGETS): % : %.cuo + $(CXX) $(CUDA_LDFLAGS) -o $@ $< $(CUDA_LDLIBS) + +$(CUDA_SOURCES:%.cu=%.cuo): %.cuo : %.cu + $(CUDA_CXX) $(CUDA_CPPFLAGS) $(CUDA_CXXFLAGS) -I../../../otherGPUsorts/cudaExamples/inc -c -o $@ $< diff --git a/GPUSort/measuring/cuda_example/bitonic/main.cu b/GPUSort/measuring/cuda_example/bitonic/main.cu new file mode 100644 index 000000000..043dc4fe7 --- /dev/null +++ b/GPUSort/measuring/cuda_example/bitonic/main.cu @@ -0,0 +1,37 @@ +#include "../../../otherGPUsorts/cudaExamples/sortingNetworks/bitonicSort.cu" +#include "../../../GPUSort/src/util/timer.h" +#include "../../../GPUSort/src/util/algorithm.h" +#include +#include +#include +using namespace std; +using namespace TNL; +using namespace TNL::Containers; +//--------------------- + +double measure(const std::vector&vec, int tries, int & wrongAnsCnt) +{ + vector resAcc; + + Array arr(vec.size()); + Array arr2(vec.size()); + for(int i = 0; i < tries; i++) + { + arr = vec; + arr2 = vec; + { + TIMER t([&](double 
res){resAcc.push_back(res);}); + bitonicSort((unsigned *)arr.getData(), (unsigned *)arr2.getData(), + (unsigned *)arr.getData(), (unsigned *)arr2.getData(), + 1, arr.getSize(), 1); + cudaDeviceSynchronize(); + } + + if(!is_sorted(arr.getView())) + wrongAnsCnt++; + } + + return accumulate(resAcc.begin(), resAcc.end(), 0.0) / resAcc.size(); +} + +#include "../../../GPUSort/benchmark/benchmarker.cpp" \ No newline at end of file diff --git a/GPUSort/measuring/cuda_example/cdpAdvancedQuicksort/Makefile b/GPUSort/measuring/cuda_example/cdpAdvancedQuicksort/Makefile new file mode 100644 index 000000000..578bfe530 --- /dev/null +++ b/GPUSort/measuring/cuda_example/cdpAdvancedQuicksort/Makefile @@ -0,0 +1,32 @@ +include ../../util/config.mk + +CUDA_SOURCES := $(wildcard *.cu) +CUDA_TARGETS := $(CUDA_SOURCES:%.cu=%) + +EXTRA_ARCH := -gencode arch=compute_52,code=sm_52 +DEVICE_CODE := -dc + +CUDA_LDLIBS += -lcudadevrt + +## targets definitions follow +.PHONY: all host cuda +all: cuda +cuda: $(CUDA_TARGETS) + +run: cuda + ./$(CUDA_TARGETS) + +measure: cuda + ./$(CUDA_TARGETS) ../../results/nvidia_cdpAdvanced.csv + +.PHONY: clean +clean: + rm -f *.d *.o *.cuo $(CUDA_TARGETS) + +# use .cuo instead of .cu.o to avoid problems with the implicit rules: https://stackoverflow.com/q/62967939 +# (and use the host compiler for linking CUDA, nvcc does not understand that .cuo is an object file) +$(CUDA_TARGETS): % : %.o + $(CUDA_CXX) $(CUDA_LDFLAGS) -o $@ $< $(CUDA_LDLIBS) + +$(CUDA_SOURCES:%.cu=%.o): %.o : %.cu + $(CUDA_CXX) $(CUDA_CPPFLAGS) $(CUDA_CXXFLAGS) $(EXTRA_ARCH) $(DEVICE_CODE) -I../../../otherGPUsorts/cudaExamples/inc -c -o $@ $< diff --git a/GPUSort/measuring/cuda_example/cdpAdvancedQuicksort/benchmark.cu b/GPUSort/measuring/cuda_example/cdpAdvancedQuicksort/benchmark.cu new file mode 100644 index 000000000..ec9f54058 --- /dev/null +++ b/GPUSort/measuring/cuda_example/cdpAdvancedQuicksort/benchmark.cu @@ -0,0 +1,13 @@ +#include 
"../../../otherGPUsorts/cudaExamples/cdpAdvancedQuicksort/cdpAdvancedQuicksort.cu" +#include "../../../otherGPUsorts/cudaExamples/cdpAdvancedQuicksort/cdpBitonicSort.cu" +#include + +//--------------------------- +void sorter(TNL::Containers::ArrayView view) +{ + TNL::Containers::Array aux(view.getSize()); + run_quicksort_cdp((unsigned int *)view.getData(), (unsigned int *)aux.getData(), view.getSize(), NULL); +} + +#include "../../../GPUSort/benchmark/benchmarker.cpp" +#include "../../../GPUSort/benchmark/measure.cu" \ No newline at end of file diff --git a/GPUSort/measuring/cuda_example/cdpsimplequicksort/.Makefile b/GPUSort/measuring/cuda_example/cdpsimplequicksort/.Makefile new file mode 100644 index 000000000..948720e0a --- /dev/null +++ b/GPUSort/measuring/cuda_example/cdpsimplequicksort/.Makefile @@ -0,0 +1,32 @@ +include ../../util/config.mk + +CUDA_SOURCES := $(wildcard *.cu) +CUDA_TARGETS := $(CUDA_SOURCES:%.cu=%) + +EXTRA_ARCH := -gencode arch=compute_52,code=sm_52 +DEVICE_CODE := -dc + +CUDA_LDLIBS += -lcudadevrt + +## targets definitions follow +.PHONY: all host cuda +all: cuda +cuda: $(CUDA_TARGETS) + +run: cuda + ./$(CUDA_TARGETS) + +measure: cuda + ./$(CUDA_TARGETS) ../../results/cdpSimple.csv + +.PHONY: clean +clean: + rm -f *.d *.o *.cuo $(CUDA_TARGETS) + +# use .cuo instead of .cu.o to avoid problems with the implicit rules: https://stackoverflow.com/q/62967939 +# (and use the host compiler for linking CUDA, nvcc does not understand that .cuo is an object file) +$(CUDA_TARGETS): % : %.o + $(CUDA_CXX) $(CUDA_LDFLAGS) -o $@ $< $(CUDA_LDLIBS) + +$(CUDA_SOURCES:%.cu=%.o): %.o : %.cu + $(CUDA_CXX) $(CUDA_CPPFLAGS) $(CUDA_CXXFLAGS) $(EXTRA_ARCH) $(DEVICE_CODE) -I../../../otherGPUsorts/cudaExamples/inc -c -o $@ $< diff --git a/GPUSort/measuring/cuda_example/cdpsimplequicksort/benchmark.cu b/GPUSort/measuring/cuda_example/cdpsimplequicksort/benchmark.cu new file mode 100644 index 000000000..258d96d26 --- /dev/null +++ 
b/GPUSort/measuring/cuda_example/cdpsimplequicksort/benchmark.cu @@ -0,0 +1,15 @@ +#include "../../../otherGPUsorts/cudaExamples/cdpSimpleQuicksort/cdpSimpleQuicksort.cu" +#include + +#define SORTERFUNCTION nvidia_quick + +#define HIGH_POW 20 +//--------------------------- + +void nvidia_quick(TNL::Containers::ArrayView view) +{ + run_qsort((unsigned int *)view.getData(), view.getSize()); + cudaDeviceSynchronize(); +} + +#include "../../../GPUSort/benchmark/benchmarker.cpp" \ No newline at end of file diff --git a/GPUSort/measuring/davors/quicksort/Makefile b/GPUSort/measuring/davors/quicksort/Makefile new file mode 100644 index 000000000..82d5c87df --- /dev/null +++ b/GPUSort/measuring/davors/quicksort/Makefile @@ -0,0 +1,27 @@ +include ../../util/config.mk + +CUDA_SOURCES := $(wildcard *.cu) +CUDA_TARGETS := $(CUDA_SOURCES:%.cu=%) + +## targets definitions follow +.PHONY: all host cuda +all: cuda +cuda: $(CUDA_TARGETS) + +run: cuda + ./$(CUDA_TARGETS) + +measure: cuda + ./$(CUDA_TARGETS) ../../results/davors_bitonic.csv + +.PHONY: clean +clean: + rm -f *.d *.o *.cuo $(CUDA_TARGETS) + +# use .cuo instead of .cu.o to avoid problems with the implicit rules: https://stackoverflow.com/q/62967939 +# (and use the host compiler for linking CUDA, nvcc does not understand that .cuo is an object file) +$(CUDA_TARGETS): % : %.cuo + $(CXX) $(CUDA_LDFLAGS) -o $@ $< $(CUDA_LDLIBS) + +$(CUDA_SOURCES:%.cu=%.cuo): %.cuo : %.cu + $(CUDA_CXX) $(CUDA_CPPFLAGS) $(CUDA_CXXFLAGS) -c -o $@ $< diff --git a/GPUSort/measuring/davors/quicksort/benchmark.cu b/GPUSort/measuring/davors/quicksort/benchmark.cu new file mode 100644 index 000000000..9154dfb8d --- /dev/null +++ b/GPUSort/measuring/davors/quicksort/benchmark.cu @@ -0,0 +1,20 @@ +#include +#include "../../../otherGPUsorts/davors/BitonicSort/Sort/parallel.h" +//------------------------ + +#define LOW_POW 19 +#define HIGH_POW 20 + +void sorter(TNL::Containers::ArrayView view) +{ + auto sorter = new BitonicSortParallel(); + 
sorter->sort((data_t*)view.getData(), (uint_t)view.getSize(), ORDER_ASC); + cudaDeviceSynchronize(); + delete sorter; + return; +} + +//------------------------ + +#include "../../../GPUSort/benchmark/benchmarker.cpp" +#include "../../../GPUSort/benchmark/measure.cu" diff --git a/GPUSort/measuring/manca_quicksort/Makefile b/GPUSort/measuring/manca_quicksort/Makefile new file mode 100644 index 000000000..2303b4304 --- /dev/null +++ b/GPUSort/measuring/manca_quicksort/Makefile @@ -0,0 +1,27 @@ +include ../util/config.mk + +CUDA_SOURCES := $(wildcard *.cu) +CUDA_TARGETS := $(CUDA_SOURCES:%.cu=%) + +## targets definitions follow +.PHONY: all host cuda +all: cuda +cuda: $(CUDA_TARGETS) + +run: cuda + ./$(CUDA_TARGETS) + +measure: cuda + ./$(CUDA_TARGETS) ../results/manca_quicksort.csv + +.PHONY: clean +clean: + rm -f *.d *.o *.cuo $(CUDA_TARGETS) + +# use .cuo instead of .cu.o to avoid problems with the implicit rules: https://stackoverflow.com/q/62967939 +# (and use the host compiler for linking CUDA, nvcc does not understand that .cuo is an object file) +$(CUDA_TARGETS): % : %.cuo + $(CXX) $(CUDA_LDFLAGS) -o $@ $< $(CUDA_LDLIBS) + +$(CUDA_SOURCES:%.cu=%.cuo): %.cuo : %.cu + $(CUDA_CXX) $(CUDA_CPPFLAGS) $(CUDA_CXXFLAGS) -I../../otherGPUsorts/manca_quicksort/libraries/include/ -c -o $@ $< diff --git a/GPUSort/measuring/manca_quicksort/main.cu b/GPUSort/measuring/manca_quicksort/main.cu new file mode 100644 index 000000000..448f9cac0 --- /dev/null +++ b/GPUSort/measuring/manca_quicksort/main.cu @@ -0,0 +1,15 @@ +#include +#include "../../otherGPUsorts/manca_quicksort_extracted/manca_quicksort.cu" +//------------------------ + +void sorter(TNL::Containers::ArrayView view) +{ + double timer = 0; + CUDA_Quicksort((unsigned *)view.getData(), (unsigned *)view.getData(), view.getSize(), 256, 0, &timer); + return; +} + +//------------------------ + +#include "../../GPUSort/benchmark/benchmarker.cpp" +#include "../../GPUSort/benchmark/measure.cu" diff --git 
a/GPUSort/measuring/nickjillings/Makefile b/GPUSort/measuring/nickjillings/Makefile new file mode 100644 index 000000000..5e685427a --- /dev/null +++ b/GPUSort/measuring/nickjillings/Makefile @@ -0,0 +1,27 @@ +include ../util/config.mk + +CUDA_SOURCES := $(wildcard *.cu) +CUDA_TARGETS := $(CUDA_SOURCES:%.cu=%) + +## targets definitions follow +.PHONY: all host cuda +all: cuda +cuda: $(CUDA_TARGETS) + +run: cuda + ./$(CUDA_TARGETS) + +measure: cuda + ./$(CUDA_TARGETS) ../results/nickjillings.csv + +.PHONY: clean +clean: + rm -f *.d *.o *.cuo $(CUDA_TARGETS) + +# use .cuo instead of .cu.o to avoid problems with the implicit rules: https://stackoverflow.com/q/62967939 +# (and use the host compiler for linking CUDA, nvcc does not understand that .cuo is an object file) +$(CUDA_TARGETS): % : %.cuo + $(CXX) $(CUDA_LDFLAGS) -o $@ $< $(CUDA_LDLIBS) + +$(CUDA_SOURCES:%.cu=%.cuo): %.cuo : %.cu + $(CUDA_CXX) $(CUDA_CPPFLAGS) $(CUDA_CXXFLAGS) -c -o $@ $< diff --git a/GPUSort/measuring/nickjillings/main.cu b/GPUSort/measuring/nickjillings/main.cu new file mode 100644 index 000000000..75b93ea62 --- /dev/null +++ b/GPUSort/measuring/nickjillings/main.cu @@ -0,0 +1,12 @@ + +#include "../../otherGPUsorts/nickjillings/BitonicSortCUDA.cu" +#include + + +void sorter(TNL::Containers::ArrayView view) +{ + BitonicSort::BitonicSortCUDA((unsigned int *)view.getData(), view.getSize()); +} +//--------------------------- +#include "../../GPUSort/benchmark/benchmarker.cpp" +#include "../../GPUSort/benchmark/measure.cu" \ No newline at end of file diff --git a/GPUSort/measuring/script.sh b/GPUSort/measuring/script.sh new file mode 100644 index 000000000..2064d0f97 --- /dev/null +++ b/GPUSort/measuring/script.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +for i in $(find . -type f -name 'Makefile' | sed -r 's|/[^/]+$||' |sort |uniq) +do + echo going into $i + cd "$i" + echo starting... + make clean + make + make measure + echo done measuring + make clean + echo going out... 
+ cd - +done diff --git a/GPUSort/measuring/std_sort/Makefile b/GPUSort/measuring/std_sort/Makefile new file mode 100644 index 000000000..f7ef2b3b3 --- /dev/null +++ b/GPUSort/measuring/std_sort/Makefile @@ -0,0 +1,12 @@ +main: main.cpp + g++ -Wall -pedantic -std=c++14 -O3 main.cpp -o main + +measure: main + ./main ../results/std_sort.csv + +run: main + ./main + +.PHONY: main +clean: + rm -f main diff --git a/GPUSort/measuring/std_sort/main.cpp b/GPUSort/measuring/std_sort/main.cpp new file mode 100644 index 000000000..8ce12d0ac --- /dev/null +++ b/GPUSort/measuring/std_sort/main.cpp @@ -0,0 +1,12 @@ +#include +#include + +#define TRIES 5 + +void sorter(std::vector&vec) +{ + std::sort(vec.begin(), vec.end()); +} +//--------------------------- +#include "../../GPUSort/benchmark/benchmarker.cpp" +#include "../../GPUSort/benchmark/measure.cpp" diff --git a/GPUSort/benchmark/quicksort_benchmark/Makefile b/GPUSort/measuring/thrust/Makefile similarity index 90% rename from GPUSort/benchmark/quicksort_benchmark/Makefile rename to GPUSort/measuring/thrust/Makefile index 3444b640f..1c21a5ac7 100644 --- a/GPUSort/benchmark/quicksort_benchmark/Makefile +++ b/GPUSort/measuring/thrust/Makefile @@ -1,4 +1,4 @@ -include ../../src/util/config.mk +include ../util/config.mk CUDA_SOURCES := $(wildcard *.cu) CUDA_TARGETS := $(CUDA_SOURCES:%.cu=%) @@ -12,7 +12,7 @@ run: cuda ./$(CUDA_TARGETS) measure: cuda - ./$(CUDA_TARGETS) ../quicksort.csv + ./$(CUDA_TARGETS) ../results/thrust.csv .PHONY: clean clean: diff --git a/GPUSort/measuring/thrust/main.cu b/GPUSort/measuring/thrust/main.cu new file mode 100644 index 000000000..e28f1b6bb --- /dev/null +++ b/GPUSort/measuring/thrust/main.cu @@ -0,0 +1,12 @@ +#include +#include +#include + +void sorter(TNL::Containers::ArrayView view) +{ + thrust::sort(thrust::device, view.getData(), view.getData() + view.getSize()); + cudaDeviceSynchronize(); +} +//--------------------------- +#include "../../GPUSort/benchmark/benchmarker.cpp" +#include 
"../../GPUSort/benchmark/measure.cu" \ No newline at end of file diff --git a/GPUSort/measuring/util/config.mk b/GPUSort/measuring/util/config.mk new file mode 100644 index 000000000..3715986f7 --- /dev/null +++ b/GPUSort/measuring/util/config.mk @@ -0,0 +1,49 @@ +# configure the include path(s) according to your TNL installation +TNL_INCLUDE_DIRS := -I ~/.local/include + +WITH_OPENMP := no +WITH_DEBUG := no + +# If TNL is installed on your system, the CUDA architecture can be detected +# automatically by tnl-cuda-arch. This is done if CUDA_ARCH is set to "auto". +# Otherwise, CUDA_ARCH has to be set manually to the desired CUDA architecture +# number, e.g. 60, 61, etc. +CUDA_ARCH := auto + +# compilers +CXX := g++ +CUDA_CXX := nvcc + +# host compiler flags +CXXFLAGS := -std=c++14 $(TNL_INCLUDE_DIRS) +ifeq ($(WITH_DEBUG),yes) + CXXFLAGS += -O0 -g +else + CXXFLAGS += -O3 -DNDEBUG +endif + +# CUDA compiler flags +CUDA_CXXFLAGS := -std=c++14 --expt-relaxed-constexpr --expt-extended-lambda $(TNL_INCLUDE_DIRS) +CUDA_CXXFLAGS += -DHAVE_CUDA +ifeq ($(CUDA_ARCH),auto) + CUDA_CXXFLAGS += $(shell tnl-cuda-arch) +else + CUDA_CXXFLAGS += -gencode arch=compute_$(CUDA_ARCH),code=sm_$(CUDA_ARCH) +endif + +# determine path to the CUDA toolkit installation +# (autodetection is attempted, set it manually if it fails) +CUDA_PATH ?= $(abspath $(dir $(shell command -v nvcc))/..) 
+#$(info Detected CUDA_PATH: $(CUDA_PATH)) + +# flags for linking CUDA with the host compiler +CUDA_LDFLAGS := -L $(CUDA_PATH)/lib64 +CUDA_LDLIBS := -lcudart -ldl -lrt + +# enable OpenMP +ifeq ($(WITH_OPENMP),yes) + CXXFLAGS += -fopenmp -DHAVE_OPENMP + LDLIBS += -lgomp + CUDA_CXXFLAGS += -Xcompiler -fopenmp -DHAVE_OPENMP + CUDA_LDLIBS += -lgomp +endif diff --git a/GPUSort/otherGPUsorts/cederman/cederman_qsort.cu b/GPUSort/otherGPUsorts/cederman/cederman_qsort.cu new file mode 100644 index 000000000..fd877fdb0 --- /dev/null +++ b/GPUSort/otherGPUsorts/cederman/cederman_qsort.cu @@ -0,0 +1,1092 @@ + + +#ifndef PQSORTH +#define PQSORTH + +#ifdef _MSC_VER +#ifdef BUILDING_DLL +#define DLLEXPORT __declspec(dllexport) +#else +#define DLLEXPORT /*__declspec(dllimport)*/ +#endif +#else +#ifdef HAVE_GCCVISIBILITYPATCH +#define DLLEXPORT __attribute__((visibility("default"))) +#else +#define DLLEXPORT +#endif +#endif + +#define MAXTHREADS 256 +#define MAXBLOCKS 2048 + +/** +* The main sort function +* @param data Data to be sorted +* @param size The length of the data +* @returns 0 if successful. For non-zero values, use getErrorStr() for more information about why it failed. +*/ +int gpuqsort(unsigned int *data, unsigned int size, unsigned int blockscount = 0, unsigned int threads = 0, unsigned int sbsize = 0, unsigned int phase = 0); + + +// Keep tracks of the data blocks in phase one +template +struct BlockSize +{ + unsigned int beg; + unsigned int end; + unsigned int orgbeg; + unsigned int orgend; + element rmaxpiv; + element lmaxpiv; + element rminpiv; + element lminpiv; + + bool altered; + bool flip; + element pivot; +}; + +// Holds parameters to the kernel in phase one +template +struct Params +{ + unsigned int from; + unsigned int end; + element pivot; + unsigned int ptr; + bool last; +}; + +// Used to perform a cumulative sum between blocks. +// Unnecessary for cards with atomic operations. 
+// Will be removed when these becomes more common +template +struct Length +{ + element maxpiv[MAXBLOCKS]; + element minpiv[MAXBLOCKS]; + + unsigned int left[MAXBLOCKS]; + unsigned int right[MAXBLOCKS]; +}; + +// Since we have divided up the kernel in to three +// we need to remember the result of the cumulative sum +// Unnecessary for cards with atomic operations. +// Will be removed when these becomes more common +struct Hist +{ + unsigned int left[(MAXTHREADS)*MAXBLOCKS]; + unsigned int right[(MAXTHREADS)*MAXBLOCKS]; +}; + +struct LQSortParams +{ + unsigned int beg; + unsigned int end; + bool flip; + unsigned int sbsize; +}; + +template +class GPUQSort +{ + element *ddata; + element *ddata2; + struct Params *params; + struct Params *dparams; + + LQSortParams *lqparams; + LQSortParams *dlqparams; + + Hist *dhists; + Length *dlength; + Length *length; + BlockSize *workset; + + float TK, TM, MK, MM, SM, SK; + + int err; + bool init; + + bool errCheck(int e); + +public: + GPUQSort(); + ~GPUQSort(); + + int sort(element *data, unsigned int size, unsigned int blockscount = 0, unsigned int threads = 0, unsigned int sbsize = 0, unsigned int phase = 0); + const char *getErrorStr(); +}; + +#endif + +#undef THREADS + +#define THREADS blockDim.x + +extern __shared__ unsigned int sarray[]; + +#ifdef HASATOMICS +__device__ unsigned int ohtotal = 0; +#endif + +/** +* Swaps the location of two unsigned ints +* @param a This unsigned int will swap place with unsigned int b +* @param b This unsigned int will swap place with unsigned int a +*/ +//template +__device__ inline void swap(unsigned int &a, unsigned int &b) +{ + unsigned int tmp = a; + a = b; + b = tmp; +} + +/** +* Perform a bitonic sort +* @param values The unsigned ints to be sorted +* @param target Where to place the sorted unsigned int when done +* @param size The number of unsigned ints +*/ +//template +__device__ inline void bitonicSort(unsigned int *fromvalues, unsigned int *tovalues, unsigned int from, unsigned 
int size) +{ + unsigned int *shared = (unsigned int *)sarray; + + unsigned int coal = (from & 0xf); + size = size + coal; + from = from - coal; + + int sb = 2 << (int)(__log2f(size)); + + // Buffer data to be sorted in the shared memory + for (int i = threadIdx.x; i < size; i += THREADS) + { + shared[i] = fromvalues[i + from]; + } + + for (int i = threadIdx.x; i < coal; i += THREADS) + shared[i] = 0; + + // Pad the data + for (int i = threadIdx.x + size; i < sb; i += THREADS) + shared[i] = 0xffffffff; + + __syncthreads(); + + // Parallel bitonic sort. + for (int k = 2; k <= sb; k *= 2) + { + // Bitonic merge: + for (int j = k / 2; j > 0; j /= 2) + { + for (int tid = threadIdx.x; tid < sb; tid += THREADS) + { + unsigned int ixj = tid ^ j; + + if (ixj > tid) + { + if ((tid & k) == 0) + { + if (shared[tid] > shared[ixj]) + { + swap(shared[tid], shared[ixj]); + } + } + else + { + if (shared[tid] < shared[ixj]) + { + swap(shared[tid], shared[ixj]); + } + } + } + } + + __syncthreads(); + } + } + __syncthreads(); + + // Write back the sorted data to its correct position + for (int i = threadIdx.x; i < size; i += THREADS) + if (i >= coal) + tovalues[i + from] = shared[i]; + __syncthreads(); +} + +/** +* Perform a cumulative count on two arrays +* @param lblock Array one +* @param rblock Array two +*/ +__device__ inline void cumcount(unsigned int *lblock, unsigned int *rblock) +{ + int tx = threadIdx.x; + + int offset = 1; + + __syncthreads(); + + for (int d = THREADS >> 1; d > 0; d >>= 1) // build sum in place up the tree + { + __syncthreads(); + + if (tx < d) + { + int ai = offset * (2 * tx + 1) - 1; + int bi = offset * (2 * tx + 2) - 1; + lblock[bi] += lblock[ai]; + rblock[bi] += rblock[ai]; + } + offset *= 2; + } + __syncthreads(); + if (tx == 0) + { + lblock[THREADS] = lblock[THREADS - 1]; + rblock[THREADS] = rblock[THREADS - 1]; + lblock[THREADS - 1] = 0; + rblock[THREADS - 1] = 0; + } // clear the last unsigned int */ + __syncthreads(); + + for (int d = 1; d < 
THREADS; d *= 2) // traverse down tree & build scan + { + offset >>= 1; + __syncthreads(); + + if (tx < d) + { + int ai = offset * (2 * tx + 1) - 1; + int bi = offset * (2 * tx + 2) - 1; + + int t = lblock[ai]; + lblock[ai] = lblock[bi]; + lblock[bi] += t; + + t = rblock[ai]; + rblock[ai] = rblock[bi]; + rblock[bi] += t; + } + } +} + +/** +* Part One - Counts the number of unsigned ints larger or smaller than the pivot. It then +* performs a cumulative sum so that each thread knows where to write +* @param data unsigned ints to be counted +* @param params Specifies which data each thread block is responsible for +* @param hist The cumulative sum for each thread is stored here +* @param lengths The total sum for each thread block is stored here +*/ +//template +__global__ void part1(unsigned int *data, Params *params, struct Hist *hist, Length *lengths) +{ + const int tx = threadIdx.x; + + unsigned int *lblock = (unsigned int *)sarray; + unsigned int *rblock = (unsigned int *)(&lblock[(blockDim.x + 1)]); + unsigned int *minpiv = (unsigned int *)(&rblock[(blockDim.x + 1)]); + unsigned int *maxpiv = (unsigned int *)(&minpiv[blockDim.x]); + + // Where should we read? + unsigned int start = params[blockIdx.x].from; + unsigned int end = params[blockIdx.x].end; + unsigned int pivot = params[blockIdx.x].pivot; + + // Stores the max and min value of the data. Used to decide a new pivot + minpiv[tx] = data[start + tx]; + maxpiv[tx] = data[start + tx]; + + __syncthreads(); + int ll = 0; + int lr = 0; + + __syncthreads(); + + int coal = (start & 0xf); + start = start - coal; + + // Go through the data + if (tx + start < end) + { + unsigned int d = data[tx + start]; + + if (!(tx < coal)) + { + + // Counting unsigned ints smaller... 
+ if (d < pivot) + ll++; + else + // or larger than the pivot + if (d > pivot) + lr++; + + // Store the max and min unsigned int + minpiv[tx] = min(minpiv[tx], d); + maxpiv[tx] = max(maxpiv[tx], d); + } + } + + // Go through the data + for (unsigned int i = tx + start + THREADS; i < end; i += THREADS) + { + unsigned int d = data[i]; + + // Counting unsigned ints smaller... + if (d < pivot) + ll++; + else + // or larger than the pivot + if (d > pivot) + lr++; + + // Store the max and min unsigned int + minpiv[tx] = min(minpiv[tx], d); + maxpiv[tx] = max(maxpiv[tx], d); + } + + lblock[tx] = ll; + rblock[tx] = lr; + + __syncthreads(); + + // Perform a cumulative sum + cumcount((unsigned int *)lblock, (unsigned int *)rblock); + + if (tx == 0) + { + // Decide on max and min unsigned int + for (int i = 0; i < THREADS; i++) + { + minpiv[0] = min(minpiv[0], minpiv[i]); + maxpiv[0] = max(maxpiv[0], maxpiv[i]); + } + } + __syncthreads(); + + // Store each threads part of the cumulative count + hist->left[blockIdx.x * (THREADS) + threadIdx.x] = lblock[threadIdx.x + 1]; + hist->right[blockIdx.x * (THREADS) + threadIdx.x] = rblock[threadIdx.x + 1]; + + // Store the total sum + lengths->left[blockIdx.x] = lblock[THREADS]; + lengths->right[blockIdx.x] = rblock[THREADS]; + + // Store the max and min unsigned int + lengths->minpiv[blockIdx.x] = minpiv[0]; + lengths->maxpiv[blockIdx.x] = maxpiv[0]; +} + +/** +* Part Two - Move unsigned ints to their correct position in the auxillary array +* @param data unsigned ints to be moved +* @param data2 Destination for unsigned ints +* @param params Specifies which data each thread block is responsible for +* @param hist The cumulative sum for each thread is stored here +* @param lengths The total sum for each thread block is stored here +*/ +//template +__global__ void part2(unsigned int *data, unsigned int *data2, struct Params *params, struct Hist *hist, Length *lengths) +{ + const int tx = threadIdx.x; + const int bx = blockIdx.x; + + // 
Each thread uses the cumulative sum to know where to write + unsigned int x = lengths->left[bx] + hist->left[bx * (THREADS) + tx] - 1; // - 1; + unsigned int y = lengths->right[bx] - hist->right[bx * (THREADS) + tx]; + + // Where should we read? + unsigned int start = params[bx].from; + unsigned int end = params[bx].end; + unsigned int pivot = params[bx].pivot; + + __syncthreads(); + + int coal = (start & 0xf); + start = start - coal; + + // Go through all the assigned data + if (tx + start < end) + { + // Reading unsigned ints... + unsigned int d = data[tx + start]; + + if (!(tx < coal)) + { + + // and writing them to auxillary array + if (d < pivot) + { + if (x > 0) + data2[x--] = d; + else + data2[x] = d; + } + else if (d > pivot) + data2[y++] = d; + } + } + + __syncthreads(); + + // Go through all the assigned data + for (unsigned int i = start + tx + THREADS; i < end; i += THREADS) + { + // Reading unsigned ints... + unsigned int d = data[i]; + + // and writing them to auxillary array + if (d < pivot) + { + if (x > 0) + data2[x--] = d; + else + data2[x] = d; + } + else if (d > pivot) + data2[y++] = d; + } + + return; +} + +/** +* Part Three - Write the pivot value +* @param data Destination for pivot +* @param params Specifies which data each thread block is responsible for +* @param hist The cumulative sum for each thread is stored here +* @param lengths The total sum for each thread block is stored here +*/ +//template +__global__ void part3(unsigned int *data, struct Params *params, struct Hist *hist, Length *lengths) +{ + const int tx = threadIdx.x; + const int bx = blockIdx.x; + + // If we are the "last" thread block that is assigned to the same data sequence + // we write the pivot between the left and right block + if (params[bx].last) + { + // Get destination position + unsigned int x = lengths->left[bx] + hist->left[bx * THREADS + THREADS - 1] + tx; + unsigned int y = lengths->right[bx] - hist->right[bx * THREADS + THREADS - 1]; + unsigned int pivot = 
params[bx].pivot; + + // Write the pivot values + for (; x < y; x += THREADS) + data[x] = pivot; + } +} + +/** +* The local quicksort - sorts a block of data with no inter-block synchronization +* @param adata Contains some of the blocks to be sorted and also acts as the final +* destination for sorted data +* @param adata2 Contains some of the blocks to be sorted +* @param bs List of blocks to be sorted and a pointer telling if a specific block is +* in \a adata or \a adata2 +*/ +//template +__global__ void lqsort(unsigned int *adata, unsigned int *adata2, struct LQSortParams *bs, unsigned int phase) +{ + __shared__ unsigned int lphase; + lphase = phase; + + // Shorthand for the threadid + int tx = threadIdx.x; + + // Stack pointer + __shared__ int bi; + + // Stack unsigned ints + __shared__ unsigned int beg[32]; + __shared__ unsigned int end[32]; + __shared__ bool flip[32]; + + unsigned int *lblock = (unsigned int *)sarray; + unsigned int *rblock = (unsigned int *)(&lblock[(blockDim.x + 1)]); + + // The current pivot + __shared__ unsigned int pivot; + + // The sequence to be sorted + __shared__ unsigned int from; + __shared__ unsigned int to; + + // Since we switch between the primary and the auxillary buffer, + // these variables are required to keep track on which role + // a buffer currently has + __shared__ unsigned int *data; + __shared__ unsigned int *data2; + __shared__ unsigned int sbsize; + + __shared__ unsigned int bx; + if (threadIdx.x == 0) +#ifdef HASATOMICS + bx = atomicInc(&ohtotal, 50000); +#else + bx = blockIdx.x; +#endif + + __syncthreads(); + + while (bx < gridDim.x) + { + + // Thread 0 is in charge of the stack operations + if (tx == 0) + { + // We push our first block on the stack + // This is the block given by the bs parameter + beg[0] = bs[bx].beg; + end[0] = bs[bx].end; + flip[0] = bs[bx].flip; + sbsize = bs[bx].sbsize; + + bi = 0; + } + + __syncthreads(); + + // If we were given an empty block there is no need to continue + if (end[0] == 
beg[0]) + return; + + // While there are items left on the stack to sort + while (bi >= 0) + { + __syncthreads(); + // Thread 0 pops a fresh sequence from the stack + if (tx == 0) + { + from = beg[bi]; + to = end[bi]; + + // Check which buffer the sequence is in + if (!flip[bi]) + { + data = adata2; + data2 = adata; + } + else + { + data = adata; + data2 = adata2; + } + } + + __syncthreads(); + + // If the sequence is smaller than SBSIZE we sort it using + // an alternative sort. Otherwise each thread would sort just one + // or two unsigned ints and that wouldn't be efficient + if ((to - from) < (sbsize - 16)) + { + // Sort it using bitonic sort. This could be changed to some other + // sorting method. Store the result in the final destination buffer + if ((to - from >= 1) && (lphase != 2)) + bitonicSort(data, adata, from, to - from); + __syncthreads(); + + // Decrement the stack pointer + if (tx == 0) + bi--; + __syncthreads(); + // and continue with the next sequence + continue; + } + + if (tx == 0) + { + // Create a new pivot for the sequence + // Try to optimize this for your input distribution + // if you have some information about it + unsigned int mip = min(min(data[from], data[to - 1]), data[(from + to) / 2]); + unsigned int map = max(max(data[from], data[to - 1]), data[(from + to) / 2]); + pivot = min(max(mip / 2 + map / 2, mip), map); + } + + unsigned int ll = 0; + unsigned int lr = 0; + + __syncthreads(); + + unsigned int coal = (from)&0xf; + + if (tx + from - coal < to) + { + unsigned int d = data[tx + from - coal]; + + if (!(tx < coal)) + { + // Counting unsigned ints that have a higher value than the pivot + if (d < pivot) + ll++; + else + // or a lower + if (d > pivot) + lr++; + } + } + + // Go through the current sequence + for (int i = from + tx + THREADS - coal; i < to; i += THREADS) + { + unsigned int d = data[i]; + + // Counting unsigned ints that have a higher value than the pivot + if (d < pivot) + ll++; + else + // or a lower + if (d > 
pivot) + lr++; + } + + // Store the result in a shared array so that we can calculate a + // cumulative sum + lblock[tx] = ll; + rblock[tx] = lr; + + __syncthreads(); + + // Calculate the cumulative sum + cumcount((unsigned int *)lblock, (unsigned int *)rblock); + + __syncthreads(); + + // Let thread 0 add the new resulting subsequences to the stack + if (tx == 0) + { + // The sequences are in the other buffer now + flip[bi + 1] = !flip[bi]; + flip[bi] = !flip[bi]; + + // We need to place the smallest object on top of the stack + // to ensure that we don't run out of stack space + if (lblock[THREADS] < rblock[THREADS]) + { + beg[bi + 1] = beg[bi]; + beg[bi] = to - rblock[THREADS]; + end[bi + 1] = from + lblock[THREADS]; + } + else + { + end[bi + 1] = end[bi]; + end[bi] = from + lblock[THREADS]; + beg[bi + 1] = to - rblock[THREADS]; + } + // Increment the stack pointer + bi++; + } + + __syncthreads(); + + unsigned int x = from + lblock[tx + 1] - 1; + unsigned int y = to - rblock[tx + 1]; + + coal = from & 0xf; + + if (tx + from - coal < to) + { + unsigned int d = data[tx + from - coal]; + + if (!(tx < coal)) + { + if (d < pivot) + { + if (x > 0) + data2[x--] = d; + else + data2[x] = d; + } + else if (d > pivot) + data2[y++] = d; + } + } + + // Go through the data once again + // writing it to its correct position + for (unsigned int i = from + tx + THREADS - coal; i < to; i += THREADS) + { + unsigned int d = data[i]; + + if (d < pivot) + { + if (x > 0) + data2[x--] = d; + else + data2[x] = d; + } + else if (d > pivot) + data2[y++] = d; + } + + __syncthreads(); + + // As a final step, write the pivot value between the right and left + // subsequence. 
Write it to the final destination since this pivot + // is always correctly sorted + for (unsigned int i = from + lblock[THREADS] + tx; i < to - rblock[THREADS]; i += THREADS) + { + adata[i] = pivot; + } + + __syncthreads(); + } +#ifdef HASATOMICS + if (threadIdx.x == 0) + bx = atomicInc(&ohtotal, 50000); + __syncthreads(); +#else + break; +#endif + } + + __syncthreads(); +} + +#include +#include +#include + +#undef THREADS +#define THREADS threads + +/** +* The main sort function +* @param data Data to be sorted +* @param size The length of the data +* @param blockscount Number of thread blocks; rejected if larger than MAXBLOCKS. If this, \a threads or \a sbsize is 0, all three are derived from \a size +* @param threads Number of threads per block; rejected if larger than MAXTHREADS +* @param sbsize Forwarded to lqsort, which hands sequences shorter than (sbsize - 16) to bitonicSort +* @param phase 1 skips the lqsort launch; 2 skips the bitonicSort calls inside lqsort +* @returns 0 if successful. For non-zero values, use getErrorStr() for more information about why it failed. +*/ +template +int GPUQSort::sort(element *data, unsigned int size, unsigned int blockscount, unsigned int threads, unsigned int sbsize, unsigned int phase) +{ + if (!init) + return 1; + + // Nothing to sort; this also keeps the starting-pivot selection from reading data[size - 1] + if (size == 0) + return 0; + + if (!threads || !blockscount || !sbsize) + { + threads = 1 << (int)round(log(size * TK + TM) / log(2.0)); + blockscount = 1 << (int)round(log(size * MK + MM) / log(2.0)); + sbsize = 1 << (int)round(log(size * SK + SM) / log(2.0)); + } + +#ifdef HASATOMICS + unsigned int *doh; + unsigned int oh; + + // Pass the device symbol itself: looking a symbol up by its string name was removed in CUDA 5.0 + cudaGetSymbolAddress((void **)&doh, ohtotal); + oh = 0; + cudaMemcpy(doh, &oh, sizeof(oh), cudaMemcpyHostToDevice); +#endif + + if (threads > MAXTHREADS) + return 1; + + if (blockscount > MAXBLOCKS) + return 1; + + // Copy the data to the graphics card and create an auxiliary array + ddata2 = 0; + ddata = 0; + if (!errCheck(cudaMalloc((void **)&ddata2, (size) * sizeof(element)))) + return 1; + if (!errCheck(cudaMalloc((void **)&ddata, (size) * sizeof(element)))) + return 1; + if (!errCheck(cudaMemcpy(ddata, data, size * sizeof(element), cudaMemcpyHostToDevice))) + return 1; + + // We start with a set containing only the sequence to be sorted + // This will grow as we partition the data + workset[0].beg = 0; + workset[0].end = size; + workset[0].orgbeg = 0; + workset[0].orgend = size; + workset[0].altered = false; + workset[0].flip = false; 
+ + // Get a starting pivot + workset[0].pivot = (min(min(data[0], data[size / 2]), data[size - 1]) + max(max(data[0], data[size / 2]), data[size - 1])) / 2; + unsigned int worksize = 1; + + unsigned int blocks = blockscount / 2; + unsigned totsize = size; + unsigned int maxlength = (size / blocks) / 4; + + unsigned int iterations = 0; + bool flip = true; + + // Partition the sequences until we have enough + while (worksize < blocks) + { + unsigned int ws = totsize / blocks; + unsigned int paramsize = 0; + + // Go through the sequences we have and divide them into sections + // and assign thread blocks according to their size + for (unsigned int i = 0; i < worksize; i++) + { + if ((workset[i].end - workset[i].beg) < maxlength) + continue; + + // Larger sequences gets more thread blocks assigned to them + unsigned int blocksassigned = max((workset[i].end - workset[i].beg) / ws, 1); + for (unsigned int q = 0; q < blocksassigned; q++) + { + params[paramsize].from = workset[i].beg + ws * q; + params[paramsize].end = params[paramsize].from + ws; + params[paramsize].pivot = workset[i].pivot; + params[paramsize].ptr = i; + params[paramsize].last = false; + paramsize++; + } + params[paramsize - 1].last = true; + params[paramsize - 1].end = workset[i].end; + + workset[i].lmaxpiv = 0; + workset[i].lminpiv = 0xffffffff; + workset[i].rmaxpiv = 0; + workset[i].rminpiv = 0xffffffff; + } + + if (paramsize == 0) + break; + + // Copy the block assignment to the GPU + if (!errCheck(cudaMemcpy(dparams, params, paramsize * sizeof(Params), cudaMemcpyHostToDevice))) + return 1; + + // Do the cumulative sum + if (flip) + part1<<>>(ddata, dparams, dhists, dlength); + else + part1<<>>(ddata2, dparams, dhists, dlength); + if (!errCheck((cudaMemcpy(length, dlength, sizeof(Length), cudaMemcpyDeviceToHost)))) + return 1; + + // Do the block cumulative sum. Done on the CPU since not all cards have support for + // atomic operations yet. 
+ for (unsigned int i = 0; i < paramsize; i++) + { + unsigned int l = length->left[i]; + unsigned int r = length->right[i]; + + length->left[i] = workset[params[i].ptr].beg; + length->right[i] = workset[params[i].ptr].end; + + workset[params[i].ptr].beg += l; + workset[params[i].ptr].end -= r; + workset[params[i].ptr].altered = true; + + workset[params[i].ptr].rmaxpiv = max(length->maxpiv[i], workset[params[i].ptr].rmaxpiv); + workset[params[i].ptr].lminpiv = min(length->minpiv[i], workset[params[i].ptr].lminpiv); + + workset[params[i].ptr].lmaxpiv = min(workset[params[i].ptr].pivot, workset[params[i].ptr].rmaxpiv); + workset[params[i].ptr].rminpiv = max(workset[params[i].ptr].pivot, workset[params[i].ptr].lminpiv); + } + + // Copy the result of the block cumulative sum to the GPU + if (!errCheck((cudaMemcpy(dlength, length, sizeof(Length), cudaMemcpyHostToDevice)))) + return 1; + + // Move the elements to their correct position + if (flip) + part2<<>>(ddata, ddata2, dparams, dhists, dlength); + else + part2<<>>(ddata2, ddata, dparams, dhists, dlength); + + // Fill in the pivot value between the left and right blocks + part3<<>>(ddata, dparams, dhists, dlength); + + flip = !flip; + + // Add the sequences resulting from the partitioning + // to set + unsigned int oldworksize = worksize; + totsize = 0; + for (unsigned int i = 0; i < oldworksize; i++) + { + if (workset[i].altered) + { + if (workset[i].beg - workset[i].orgbeg >= maxlength) + totsize += workset[i].beg - workset[i].orgbeg; + if (workset[i].orgend - workset[i].end >= maxlength) + totsize += workset[i].orgend - workset[i].end; + + workset[worksize].beg = workset[worksize].orgbeg = workset[i].orgbeg; + workset[worksize].end = workset[worksize].orgend = workset[i].beg; + workset[worksize].flip = flip; + workset[worksize].altered = false; + workset[worksize].pivot = (workset[i].lminpiv / 2 + workset[i].lmaxpiv / 2); + + worksize++; + + workset[i].orgbeg = workset[i].beg = workset[i].end; + workset[i].end = 
workset[i].orgend; + workset[i].flip = flip; + workset[i].pivot = (workset[i].rminpiv / 2 + workset[i].rmaxpiv / 2); + workset[i].altered = false; + } + } + iterations++; + } + + // Due to the poor scheduler on some graphics card + // we need to sort the order in which the blocks + // are sorted to avoid poor scheduling decisions + unsigned int sortblocks[MAXBLOCKS * 2]; + for (int i = 0; i < worksize; i++) + sortblocks[i] = ((workset[i].end - workset[i].beg) << (int)round(log((float)(MAXBLOCKS * 4.0f)) / log(2.0f))) + i; + std::sort(&sortblocks[0], &sortblocks[worksize]); + + if (worksize != 0) + { + // Copy the block assignments to the GPU + for (int i = 0; i < worksize; i++) + { + unsigned int q = (worksize - 1) - (sortblocks[i] & (MAXBLOCKS * 4 - 1)); + + lqparams[i].beg = workset[q].beg; + lqparams[i].end = workset[q].end; + lqparams[i].flip = workset[q].flip; + lqparams[i].sbsize = sbsize; + } + + if (!errCheck((cudaMemcpy(dlqparams, lqparams, worksize * sizeof(LQSortParams), cudaMemcpyHostToDevice)))) + return 1; + + // Run the local quicksort, the one that doesn't need inter-block synchronization + if (phase != 1) + lqsort<<>>(ddata, ddata2, dlqparams, phase); + } + + err = cudaDeviceSynchronize(); + // Free the data + if (err != cudaSuccess) + { + cudaFree(ddata); + cudaFree(ddata2); + return 1; + } + + // Copy the result back to the CPU + if (!errCheck((cudaMemcpy(data, ddata, size * sizeof(element), cudaMemcpyDeviceToHost)))) + return 1; + + cudaFree(ddata); + cudaFree(ddata2); + + return 0; +} + +template +bool GPUQSort::errCheck(int e) +{ + if (e == cudaSuccess) + return true; + + err = e; + cudaFree(ddata); + cudaFree(ddata2); + return false; +} + +template +GPUQSort::GPUQSort() : init(false), workset(0), params(0), length(0), lqparams(0), dlqparams(0), + dhists(0), dlength(0), dparams(0) +{ + cudaDeviceProp deviceProp; + cudaGetDeviceProperties(&deviceProp, 0); + if (!strcmp(deviceProp.name, "GeForce 8800 GTX")) + { + TK = 1.17125033316e-005f; + TM = 
52.855721393f; + MK = 3.7480010661e-005f; + MM = 476.338308458f; + SK = 4.68500133262e-005f; + SM = 211.422885572f; + } + else if (!strcmp(deviceProp.name, "GeForce 8600 GTS")) + { + TK = 0.0f; + TM = 64.0f; + MK = 0.0000951623403898f; + MM = 476.338308458f; + SK = 0.0000321583081317f; + SM = 202.666666667f; + } + else + { + TK = 0; + TM = 128; + MK = 0; + MM = 512; + SK = 0; + SM = 512; + } + + // Allocate the host-side and device-side work buffers. If any allocation fails we return + // with init still false, so sort() refuses to run; the destructor releases whatever was allocated. + if (cudaMallocHost((void **)&workset, MAXBLOCKS * 2 * sizeof(BlockSize)) != cudaSuccess) + return; + if (cudaMallocHost((void **)&params, MAXBLOCKS * sizeof(Params)) != cudaSuccess) + return; + if (cudaMallocHost((void **)&length, sizeof(Length)) != cudaSuccess) + return; + if (cudaMallocHost((void **)&lqparams, MAXBLOCKS * sizeof(LQSortParams)) != cudaSuccess) + return; + if (cudaMalloc((void **)&dlqparams, MAXBLOCKS * sizeof(LQSortParams)) != cudaSuccess) + return; + if (cudaMalloc((void **)&dhists, sizeof(Hist)) != cudaSuccess) + return; + if (cudaMalloc((void **)&dlength, sizeof(Length)) != cudaSuccess) + return; + if (cudaMalloc((void **)&dparams, MAXBLOCKS * sizeof(Params)) != cudaSuccess) + return; + + init = true; +} + +/** +* Returns the latest error message +* @returns the latest error message +*/ +template +const char *GPUQSort::getErrorStr() +{ + return cudaGetErrorString((cudaError_t)err); +} + +/** +* Releases the host and device buffers allocated by the constructor +*/ +template +GPUQSort::~GPUQSort() +{ + cudaFreeHost(workset); + cudaFreeHost(params); + cudaFreeHost(length); + cudaFreeHost(lqparams); + cudaFree(dparams); + cudaFree(dlqparams); + cudaFree(dhists); + cudaFree(dlength); +} + +/** +* Convenience wrapper: creates a GPUQSort, sorts \a data with it and destroys it again +* @returns 0 if the sort succeeded, 1 otherwise +*/ +int gpuqsort(unsigned int *data, unsigned int size, unsigned int blockscount, unsigned int threads, unsigned int sbsize, unsigned int phase) +{ + GPUQSort *s = new GPUQSort(); + + // Single exit path so the sorter is released exactly once + const int status = (s->sort(data, size, blockscount, threads, sbsize, phase) != 0) ? 1 : 0; + delete s; + return status; +} diff --git a/GPUSort/otherGPUsorts/manca_quicksort_extracted/manca_quicksort.cu 
b/GPUSort/otherGPUsorts/manca_quicksort_extracted/manca_quicksort.cu new file mode 100644 index 000000000..77bdff82c --- /dev/null +++ b/GPUSort/otherGPUsorts/manca_quicksort_extracted/manca_quicksort.cu @@ -0,0 +1,1317 @@ +//defines the shared memory size +#define SHARED_LIMIT 1024 + +#define GIGA 1073741824 +/* + * division of the vector to be sorted in buckets + * the attributes of the object Block are the parameters of each bucket + */ +template +struct Block +{ + + unsigned int begin; + unsigned int end; + + unsigned int nextbegin; + unsigned int nextend; + + Type pivot; + + //max of the bucket items + Type maxPiv; + //min of the bucket items + Type minPiv; + //done indicates that a bucket has been analyzed + short done; + short select; +}; + +template +struct Partition +{ + + unsigned int ibucket; + unsigned int from; + unsigned int end; + Type pivot; +}; + +void CUDA_Quicksort(unsigned int *inData, unsigned int *outData, unsigned int dataSize, unsigned int threads, int Device, double *timer); + +void CUDA_Quicksort_64(double *inData, double *outData, unsigned int size, unsigned int threads, int Device, double *timer); + +typedef unsigned int Type; + +void test_bitonicSort(unsigned int *h_InputKey, unsigned int N, double *timer); +void test_MergeSort(unsigned int *h_SrcKey, unsigned int N, double *timer); +void test_thrustSort(Type *h_data, unsigned int N, double *timer); + +typedef unsigned int uint; + +size_t scanInclusiveShort( + uint *d_Dst, + uint *d_Src, + uint batchSize, + uint arrayLength); + +size_t scanInclusiveLarge( + uint *d_Dst, + uint *d_Src, + uint batchSize, + uint arrayLength); + +template +inline __device__ void warpScanInclusive2(Type &idata, Type &idata2, volatile Type *s_Data, volatile Type *s_Data2, uint size) +{ + + //volatile uint* s_Data2; + //s_Data2 = s_Data + blockDim.x*2; + + uint pos = 2 * threadIdx.x - (threadIdx.x & (size - 1)); + s_Data[pos] = 0; + s_Data2[pos] = 0; + pos += size; + s_Data[pos] = idata; + s_Data2[pos] = 
idata2; + + for (uint offset = 1; offset < size; offset <<= 1) + { + s_Data[pos] += s_Data[pos - offset]; + s_Data2[pos] += s_Data2[pos - offset]; + } + + idata = s_Data[pos]; + idata2 = s_Data2[pos]; +} + +template +inline __device__ void warpScanExclusive2(Type &idata, Type &idata2, volatile Type *s_Data, volatile Type *s_Data2, uint size) +{ + + //volatile uint* s_Data2; + //s_Data2 = s_Data + blockDim.x*2; + + uint pos = 2 * threadIdx.x - (threadIdx.x & (size - 1)); + s_Data[pos] = 0; + s_Data2[pos] = 0; + pos += size; + s_Data[pos] = idata; + s_Data2[pos] = idata2; + + for (uint offset = 1; offset < size; offset <<= 1) + { + s_Data[pos] += s_Data[pos - offset]; + s_Data2[pos] += s_Data2[pos - offset]; + } + + idata = s_Data[pos] - idata; + idata2 = s_Data2[pos] - idata2; +} + +#define LOG2_WARP_SIZE 5U +#define WARP_SIZE (1U << LOG2_WARP_SIZE) + +template +inline __device__ void scan1Inclusive2(Type &idata, Type &idata2, volatile Type *s_Data, uint size) +{ + + volatile Type *s_Data2; + s_Data2 = s_Data + blockDim.x * 2; + + if (size > WARP_SIZE) + { + + //Bottom-level inclusive warp scan + warpScanInclusive2(idata, idata2, s_Data, s_Data2, WARP_SIZE); + + //Save top Types of each warp for exclusive warp scan + //sync to wait for warp scans to complete (because s_Data is being overwritten) + __syncthreads(); + if ((threadIdx.x & (WARP_SIZE - 1)) == (WARP_SIZE - 1)) + { + s_Data[threadIdx.x >> LOG2_WARP_SIZE] = idata; + s_Data2[threadIdx.x >> LOG2_WARP_SIZE] = idata2; + } + + //wait for warp scans to complete + __syncthreads(); + if (threadIdx.x < (blockDim.x / WARP_SIZE)) + { + //grab top warp Types + Type val = s_Data[threadIdx.x]; + Type val2 = s_Data2[threadIdx.x]; + //calculate exclsive scan and write back to shared memory + warpScanExclusive2(val, val2, s_Data, s_Data2, size >> LOG2_WARP_SIZE); + s_Data[threadIdx.x] = val; + s_Data2[threadIdx.x] = val2; + } + + //return updated warp scans with exclusive scan results + __syncthreads(); + idata += 
s_Data[threadIdx.x >> LOG2_WARP_SIZE]; + idata2 += s_Data2[threadIdx.x >> LOG2_WARP_SIZE]; + } + else + warpScanInclusive2(idata, idata2, s_Data, s_Data2, size); +} + +template +inline __device__ void warpCompareInclusive(Type &idata, Type &idata2, volatile Type *s_Data, uint size) +{ + + volatile Type *s_Data2; + s_Data2 = s_Data + blockDim.x * 2; + uint pos = 2 * threadIdx.x - (threadIdx.x & (size - 1)); + s_Data[pos] = 0; + s_Data2[pos] = 0; + pos += size; + s_Data[pos] = idata; + s_Data2[pos] = idata2; + + for (uint offset = 1; offset < size; offset <<= 1) + { + s_Data[pos] = max(s_Data[pos], s_Data[pos - offset]); + s_Data2[pos] = min(s_Data2[pos], s_Data2[pos - offset]); + } + + idata = s_Data[pos]; + idata2 = s_Data2[pos]; +} + +template +inline __device__ void compareInclusive(Type &idata, Type &idata2, volatile Type *s_Data, uint size) +{ + + volatile Type *s_Data2; + s_Data2 = s_Data + blockDim.x * 2; + //Bottom-level inclusive warp scan + warpCompareInclusive(idata, idata2, s_Data, WARP_SIZE); + + //Save top Types of each warp for exclusive warp scan + //sync to wait for warp scans to complete (because s_Data is being overwritten) + __syncthreads(); + if ((threadIdx.x & (WARP_SIZE - 1)) == (WARP_SIZE - 1)) + { + s_Data[threadIdx.x >> LOG2_WARP_SIZE] = idata; + s_Data2[threadIdx.x >> LOG2_WARP_SIZE] = idata2; + } + + //wait for warp scans to complete + __syncthreads(); + if (threadIdx.x < (blockDim.x / WARP_SIZE)) + { + //grab top warp Types + Type val = s_Data[threadIdx.x]; + Type val2 = s_Data2[threadIdx.x]; + //calculate exclsive scan and write back to shared memory + warpCompareInclusive(val, val2, s_Data, size >> LOG2_WARP_SIZE); + s_Data[threadIdx.x] = val; + s_Data2[threadIdx.x] = val2; + } + + //return updated warp scans with exclusive scan results + __syncthreads(); + idata = max(idata, s_Data[threadIdx.x >> LOG2_WARP_SIZE]); + idata2 = min(idata2, s_Data2[threadIdx.x >> LOG2_WARP_SIZE]); +} + +#include +#include +#include + +//All three kernels 
run THREADBLOCK_SIZE threads per workgroup +//Must be a power of two +#define THREADBLOCK_SIZE 256 + +//////////////////////////////////////////////////////////////////////////////// +// Basic scan codelets +//////////////////////////////////////////////////////////////////////////////// +#if (0) +//Naive inclusive scan: O(N * log2(N)) operations +//Allocate 2 * 'size' local memory, initialize the first half +//with 'size' zeros avoiding if(pos >= offset) condition evaluation +//and saving instructions +inline __device__ uint scan1Inclusive(uint idata, volatile uint *s_Data, uint size) +{ + uint pos = 2 * threadIdx.x - (threadIdx.x & (size - 1)); + s_Data[pos] = 0; + pos += size; + s_Data[pos] = idata; + + for (uint offset = 1; offset < size; offset <<= 1) + { + __syncthreads(); + uint t = s_Data[pos] + s_Data[pos - offset]; + __syncthreads(); + s_Data[pos] = t; + } + + return s_Data[pos]; +} + +inline __device__ uint scan1Exclusive(uint idata, volatile uint *s_Data, uint size) +{ + return scan1Inclusive(idata, s_Data, size) - idata; +} + +#else +#define LOG2_WARP_SIZE 5U +#define WARP_SIZE (1U << LOG2_WARP_SIZE) + +//Almost the same as naive scan1Inclusive, but doesn't need __syncthreads() +//assuming size <= WARP_SIZE +//NOTE(review): there is no barrier between one lane's read of s_Data[pos - offset] and its +//neighbour's write, so this presumably relies on the lanes of a warp running in lockstep - +//confirm that this holds on the target architecture +inline __device__ uint warpScanInclusive(uint idata, volatile uint *s_Data, uint size) +{ + uint pos = 2 * threadIdx.x - (threadIdx.x & (size - 1)); + s_Data[pos] = 0; + pos += size; + s_Data[pos] = idata; + + for (uint offset = 1; offset < size; offset <<= 1) + s_Data[pos] += s_Data[pos - offset]; + + return s_Data[pos]; +} + +//Exclusive variant: the inclusive result minus this thread's own input +inline __device__ uint warpScanExclusive(uint idata, volatile uint *s_Data, uint size) +{ + return warpScanInclusive(idata, s_Data, size) - idata; +} + +inline __device__ uint scan1Inclusive(uint idata, volatile uint *s_Data, uint size) +{ + if (size > WARP_SIZE) + { + //Bottom-level inclusive warp scan + uint warpResult = warpScanInclusive(idata, s_Data, WARP_SIZE); + + //Save top elements of each warp for exclusive warp scan + //sync 
to wait for warp scans to complete (because s_Data is being overwritten) + __syncthreads(); + if ((threadIdx.x & (WARP_SIZE - 1)) == (WARP_SIZE - 1)) + s_Data[threadIdx.x >> LOG2_WARP_SIZE] = warpResult; + + //wait for warp scans to complete + __syncthreads(); + if (threadIdx.x < (THREADBLOCK_SIZE / WARP_SIZE)) + { + //grab top warp elements + uint val = s_Data[threadIdx.x]; + //calculate exclsive scan and write back to shared memory + s_Data[threadIdx.x] = warpScanExclusive(val, s_Data, size >> LOG2_WARP_SIZE); + } + + //return updated warp scans with exclusive scan results + __syncthreads(); + return warpResult + s_Data[threadIdx.x >> LOG2_WARP_SIZE]; + } + else + { + return warpScanInclusive(idata, s_Data, size); + } +} + +inline __device__ uint scan1Exclusive(uint idata, volatile uint *s_Data, uint size) +{ + return scan1Inclusive(idata, s_Data, size) - idata; +} + +#endif + +inline __device__ uint4 scan4Inclusive(uint4 idata4, volatile uint *s_Data, uint size) +{ + //Level-0 exclusive scan + idata4.y += idata4.x; + idata4.z += idata4.y; + idata4.w += idata4.z; + + //Level-1 exclusive scan + uint oval = scan1Exclusive(idata4.w, s_Data, size / 4); + + idata4.x += oval; + idata4.y += oval; + idata4.z += oval; + idata4.w += oval; + + return idata4; +} + +//Exclusive vector scan: the array to be scanned is stored +//in local thread memory scope as uint4 +inline __device__ uint4 scan4Exclusive(uint4 idata4, volatile uint *s_Data, uint size) +{ + uint4 odata4 = scan4Inclusive(idata4, s_Data, size); + odata4.x -= idata4.x; + odata4.y -= idata4.y; + odata4.z -= idata4.z; + odata4.w -= idata4.w; + return odata4; +} + +//////////////////////////////////////////////////////////////////////////////// +// Scan kernels +//////////////////////////////////////////////////////////////////////////////// +__global__ void scanExclusiveShared( + uint4 *d_Dst, + uint4 *d_Src, + uint size) +{ + __shared__ uint s_Data[2 * THREADBLOCK_SIZE]; + + uint pos = blockIdx.x * blockDim.x + 
threadIdx.x; + + //Load data + uint4 idata4 = d_Src[pos]; + + //Calculate exclusive scan + uint4 odata4 = scan4Exclusive(idata4, s_Data, size); + + //Write back + d_Dst[pos] = odata4; +} + +//Exclusive scan of top elements of bottom-level scans (4 * THREADBLOCK_SIZE) +__global__ void scanExclusiveShared2( + uint *d_Buf, + uint *d_Dst, + uint *d_Src, + uint N, + uint arrayLength) +{ + __shared__ uint s_Data[2 * THREADBLOCK_SIZE]; + + //Skip loads and stores for inactive threads of last threadblock (pos >= N) + uint pos = blockIdx.x * blockDim.x + threadIdx.x; + + //Load top elements + //Convert results of bottom-level scan back to inclusive + uint idata = 0; + if (pos < N) + idata = + d_Dst[(4 * THREADBLOCK_SIZE) - 1 + (4 * THREADBLOCK_SIZE) * pos] + + d_Src[(4 * THREADBLOCK_SIZE) - 1 + (4 * THREADBLOCK_SIZE) * pos]; + + //Compute + uint odata = scan1Exclusive(idata, s_Data, arrayLength); + + //Avoid out-of-bound access + if (pos < N) + d_Buf[pos] = odata; +} + +//Final step of large-array scan: combine basic inclusive scan with exclusive scan of top elements of input arrays +__global__ void uniformUpdate( + uint4 *d_Data, + uint *d_Buffer) +{ + __shared__ uint buf; + uint pos = blockIdx.x * blockDim.x + threadIdx.x; + + if (threadIdx.x == 0) + buf = d_Buffer[blockIdx.x]; + __syncthreads(); + + uint4 data4 = d_Data[pos]; + data4.x += buf; + data4.y += buf; + data4.z += buf; + data4.w += buf; + d_Data[pos] = data4; +} + +//////////////////////////////////////////////////////////////////////////////// +// Interface function +//////////////////////////////////////////////////////////////////////////////// +//Derived as 32768 (max power-of-two gridDim.x) * 4 * THREADBLOCK_SIZE +//Due to scanExclusiveShared<<<>>>() 1D block addressing +const uint MAX_BATCH_ELEMENTS = 64 * 1048576; +const uint MIN_SHORT_ARRAY_SIZE = 4; +const uint MAX_SHORT_ARRAY_SIZE = 4 * THREADBLOCK_SIZE; +const uint MIN_LARGE_ARRAY_SIZE = 8 * THREADBLOCK_SIZE; +const uint MAX_LARGE_ARRAY_SIZE = 4 * 
THREADBLOCK_SIZE * THREADBLOCK_SIZE; + +//Internal exclusive scan buffer +static uint *d_Buf; + +void initScan(void) +{ + checkCudaErrors(cudaMalloc((void **)&d_Buf, (MAX_BATCH_ELEMENTS / (4 * THREADBLOCK_SIZE)) * sizeof(uint))); +} + +void closeScan(void) +{ + checkCudaErrors(cudaFree(d_Buf)); +} + +static uint factorRadix2(uint &log2L, uint L) +{ + if (!L) + { + log2L = 0; + return 0; + } + else + { + for (log2L = 0; (L & 1) == 0; L >>= 1, log2L++) + ; + return L; + } +} + +static uint iDivUp(uint dividend, uint divisor) +{ + return ((dividend % divisor) == 0) ? (dividend / divisor) : (dividend / divisor + 1); +} + +size_t scanExclusiveShort( + uint *d_Dst, + uint *d_Src, + uint batchSize, + uint arrayLength) +{ + //Check power-of-two factorization + uint log2L; + uint factorizationRemainder = factorRadix2(log2L, arrayLength); + assert(factorizationRemainder == 1); + + //Check supported size range + assert((arrayLength >= MIN_SHORT_ARRAY_SIZE) && (arrayLength <= MAX_SHORT_ARRAY_SIZE)); + + //Check total batch size limit + assert((batchSize * arrayLength) <= MAX_BATCH_ELEMENTS); + + //Check all threadblocks to be fully packed with data + assert((batchSize * arrayLength) % (4 * THREADBLOCK_SIZE) == 0); + + scanExclusiveShared<<<(batchSize * arrayLength) / (4 * THREADBLOCK_SIZE), THREADBLOCK_SIZE>>>( + (uint4 *)d_Dst, + (uint4 *)d_Src, + arrayLength); + getLastCudaError("scanExclusiveShared() execution FAILED\n"); + + return THREADBLOCK_SIZE; +} + +size_t scanExclusiveLarge( + uint *d_Dst, + uint *d_Src, + uint batchSize, + uint arrayLength) +{ + //Check power-of-two factorization + uint log2L; + uint factorizationRemainder = factorRadix2(log2L, arrayLength); + assert(factorizationRemainder == 1); + + //Check supported size range + assert((arrayLength >= MIN_LARGE_ARRAY_SIZE) && (arrayLength <= MAX_LARGE_ARRAY_SIZE)); + + //Check total batch size limit + assert((batchSize * arrayLength) <= MAX_BATCH_ELEMENTS); + + scanExclusiveShared<<<(batchSize * arrayLength) / (4 
* THREADBLOCK_SIZE), THREADBLOCK_SIZE>>>( + (uint4 *)d_Dst, + (uint4 *)d_Src, + 4 * THREADBLOCK_SIZE); + getLastCudaError("scanExclusiveShared() execution FAILED\n"); + + //Not all threadblocks need to be packed with input data: + //inactive threads of highest threadblock just don't do global reads and writes + const uint blockCount2 = iDivUp((batchSize * arrayLength) / (4 * THREADBLOCK_SIZE), THREADBLOCK_SIZE); + scanExclusiveShared2<<>>( + (uint *)d_Buf, + (uint *)d_Dst, + (uint *)d_Src, + (batchSize * arrayLength) / (4 * THREADBLOCK_SIZE), + arrayLength / (4 * THREADBLOCK_SIZE)); + getLastCudaError("scanExclusiveShared2() execution FAILED\n"); + + uniformUpdate<<<(batchSize * arrayLength) / (4 * THREADBLOCK_SIZE), THREADBLOCK_SIZE>>>( + (uint4 *)d_Dst, + (uint *)d_Buf); + getLastCudaError("uniformUpdate() execution FAILED\n"); + + return THREADBLOCK_SIZE; +} + +__global__ void scanInclusiveShared2( + uint *d_Buf, + uint *d_Dst, + uint N, + uint arrayLength) +{ + __shared__ uint s_Data[2 * THREADBLOCK_SIZE]; + + //Skip loads and stores for inactive threads of last threadblock (pos >= N) + uint pos = blockIdx.x * blockDim.x + threadIdx.x; + + //Load top elements + //Convert results of bottom-level scan back to inclusive + uint idata = 0; + if (pos < N) + idata = d_Dst[(4 * THREADBLOCK_SIZE) - 1 + (4 * THREADBLOCK_SIZE) * pos]; + + //Compute + uint odata = scan1Exclusive(idata, s_Data, arrayLength); + + //Avoid out-of-bound access + if (pos < N) + d_Buf[pos] = odata; +} + +__global__ void scanInclusiveShared( + uint4 *d_Dst, + uint4 *d_Src, + uint size) +{ + __shared__ uint s_Data[2 * THREADBLOCK_SIZE]; + + uint pos = blockIdx.x * blockDim.x + threadIdx.x; + + // if(pos= MIN_SHORT_ARRAY_SIZE) && (arrayLength <= MAX_SHORT_ARRAY_SIZE) ); + + //Check total batch size limit + // assert( (batchSize * arrayLength) <= MAX_BATCH_ELEMENTS ); + + //Check all threadblocks to be fully packed with data + //assert( (batchSize * arrayLength) % (4 * THREADBLOCK_SIZE) == 0 ); + int 
blocks = (batchSize * arrayLength + 4 * THREADBLOCK_SIZE - 1) / (4 * THREADBLOCK_SIZE); + scanInclusiveShared<<>>( + (uint4 *)d_Dst, + (uint4 *)d_Src, + arrayLength); + getLastCudaError("scanExclusiveShared() execution FAILED\n"); + + return THREADBLOCK_SIZE; +} + +size_t scanInclusiveLarge( + uint *d_Dst, + uint *d_Src, + uint batchSize, + uint arrayLength) +{ + //Check power-of-two factorization + uint log2L; + uint factorizationRemainder = factorRadix2(log2L, arrayLength); + assert(factorizationRemainder == 1); + + //Check supported size range + //assert( (arrayLength >= MIN_LARGE_ARRAY_SIZE) && (arrayLength <= MAX_LARGE_ARRAY_SIZE) ); + + //Check total batch size limit + //assert( (batchSize * arrayLength) <= MAX_BATCH_ELEMENTS ); + + scanInclusiveShared<<<(batchSize * arrayLength) / (4 * THREADBLOCK_SIZE), THREADBLOCK_SIZE>>>( + (uint4 *)d_Dst, + (uint4 *)d_Src, + 4 * THREADBLOCK_SIZE); + getLastCudaError("scanExclusiveShared() execution FAILED\n"); + + //Not all threadblocks need to be packed with input data: + //inactive threads of highest threadblock just don't do global reads and writes + const uint blockCount2 = iDivUp((batchSize * arrayLength) / (4 * THREADBLOCK_SIZE), THREADBLOCK_SIZE); + scanInclusiveShared2<<>>( + (uint *)d_Buf, + (uint *)d_Dst, + (batchSize * arrayLength) / (4 * THREADBLOCK_SIZE), + arrayLength / (4 * THREADBLOCK_SIZE)); + getLastCudaError("scanExclusiveShared2() execution FAILED\n"); + + uniformUpdate<<<(batchSize * arrayLength) / (4 * THREADBLOCK_SIZE), THREADBLOCK_SIZE>>>( + (uint4 *)d_Dst, + (uint *)d_Buf); + getLastCudaError("uniformUpdate() execution FAILED\n"); + + return THREADBLOCK_SIZE; +} + + +#include +#include +#include +#include + +extern __shared__ uint sMemory[]; + +__device__ inline double atomicMax(double *address, double val) +{ + unsigned long long int *address_as_ull = (unsigned long long int *)address; + unsigned long long int assumed; + unsigned long long int old = *address_as_ull; + + assumed = old; + old = 
atomicCAS(address_as_ull, + assumed, + __double_as_longlong(max(val, __longlong_as_double(assumed)))); + + while (assumed != old) + { + assumed = old; + old = atomicCAS(address_as_ull, + assumed, + __double_as_longlong(max(val, __longlong_as_double(assumed)))); + } + return __longlong_as_double(old); +} + +__device__ inline double atomicMin(double *address, double val) +{ + unsigned long long int *address_as_ull = (unsigned long long int *)address; + unsigned long long int old = *address_as_ull, assumed; + + assumed = old; + old = atomicCAS(address_as_ull, + assumed, + __double_as_longlong(min(val, __longlong_as_double(assumed)))); + while (assumed != old) + { + assumed = old; + old = atomicCAS(address_as_ull, + assumed, + __double_as_longlong(min(val, __longlong_as_double(assumed)))); + } + return __longlong_as_double(old); +} + +template +__device__ inline void Comparator( + + Type &valA, + Type &valB, + uint dir) +{ + Type t; + if ((valA > valB) == dir) + { + t = valA; + valA = valB; + valB = t; + } +} + +static __device__ __forceinline__ unsigned int __qsflo(unsigned int word) +{ + unsigned int ret; + asm volatile("bfind.u32 %0, %1;" + : "=r"(ret) + : "r"(word)); + return ret; +} + +template +__global__ void globalBitonicSort(Type *indata, Type *outdata, Block *bucket, bool inputSelect) +{ + __shared__ uint shared[1024]; + + Type *data; + + Block cord = bucket[blockIdx.x]; + + uint size = cord.end - cord.begin; + bool select = !(cord.select); + + if (cord.end - cord.begin > 1024 || cord.end - cord.begin == 0) + return; + + unsigned int bitonicSize = 1 << (__qsflo(size - 1U) + 1); + + if (select) + data = indata; + else + data = outdata; + + //__syncthreads(); + + for (int i = threadIdx.x; i < size; i += blockDim.x) + shared[i] = data[i + cord.begin]; + + for (int i = threadIdx.x + size; i < bitonicSize; i += blockDim.x) + shared[i] = 0xffffffff; + + __syncthreads(); + + for (uint size = 2; size < bitonicSize; size <<= 1) + { + //Bitonic merge + uint ddd = 1 ^ 
((threadIdx.x & (size / 2)) != 0); + for (uint stride = size / 2; stride > 0; stride >>= 1) + { + __syncthreads(); + uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1)); + //if(pos 0; stride >>= 1) + { + __syncthreads(); + uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1)); + // if(pos +__global__ void quick(Type *indata, Type *buffer, Partition *partition, Block *bucket) +{ + + __shared__ Type sh_out[1024]; + + __shared__ uint start1, end1; + __shared__ uint left, right; + + int tix = threadIdx.x; + + uint start = partition[blockIdx.x].from; + uint end = partition[blockIdx.x].end; + Type pivot = partition[blockIdx.x].pivot; + uint nseq = partition[blockIdx.x].ibucket; + + uint lo = 0; + uint hi = 0; + + Type lmin = 0xffffffff; + Type rmax = 0; + + Type d; + + // start read on 1° tile and store the coordinates of the items that must + // be moved on the left or on the right of the pivot + + if (tix + start < end) + { + d = indata[tix + start]; + + //count items smaller or bigger than the pivot + // if d= pivot) * lo; + // if d>pivot then lr++ else lr + hi = (d <= pivot) * (hi) + (d > pivot) * (hi + 1); + + lmin = d; + rmax = d; + } + + //read and store the coordinates on next tiles for each block + for (uint i = tix + start + blockDim.x; i < end; i += blockDim.x) + { + Type d = indata[i]; + + //count items smaller or bigger than the pivot + lo = (d < pivot) * (lo + 1) + (d >= pivot) * lo; + hi = (d <= pivot) * (hi) + (d > pivot) * (hi + 1); + + //compute max and min of tile items + lmin = min(lmin, d); + rmax = max(rmax, d); + } + + //compute max and min of every partition + + compareInclusive(rmax, lmin, (Type *)sh_out, blockDim.x); + + __syncthreads(); + + if (tix == blockDim.x - 1) + { + //compute absolute max and min for the bucket + atomicMax(&bucket[nseq].maxPiv, rmax); + atomicMin(&bucket[nseq].minPiv, lmin); + } + + __syncthreads(); + + /* + * calculate the coordinates of its assigned item to each thread, + * which are necessary to known in 
which subsequences the item must be copied + * + */ + scan1Inclusive2(lo, hi, (uint *)sh_out, blockDim.x); + lo = lo - 1; + hi = SHARED_LIMIT - hi; + + if (tix == blockDim.x - 1) + { + left = lo + 1; + right = SHARED_LIMIT - hi; + + start1 = atomicAdd(&bucket[nseq].nextbegin, left); + end1 = atomicSub(&bucket[nseq].nextend, right); + } + + __syncthreads(); + + //thread blocks write on the shared memory the items smaller and bigger than the first tile's pivot + if (tix + start < end) + { + //items smaller than pivot + if (d < pivot) + { + sh_out[lo] = d; + lo--; + } + + //items bigger than pivot + if (d > pivot) + { + sh_out[hi] = d; + hi++; + } + } + + //thread blocks write on the shared memory the items smaller and bigger than next tiles' pivot + for (uint i = start + tix + blockDim.x; i < end; i += blockDim.x) + { + + Type d = indata[i]; + //items smaller than the pivot + if (d < pivot) + { + sh_out[lo] = d; + lo--; + } + + //items bigger than the pivot + if (d > pivot) + { + sh_out[hi] = d; + hi++; + } + } + + __syncthreads(); + + //items smaller and bigger than the pivot already sorted in the shared memory are coalesced written on the global memory + //partial results of each thread block stored on the shared memory are merged together in two subsequences within the global memory + //coalesced writing of next tiles on the global memory + for (uint i = tix; i < SHARED_LIMIT; i += blockDim.x) + { + if (i < left) + buffer[start1 + i] = sh_out[i]; + + if (i >= SHARED_LIMIT - right) + buffer[end1 + i - SHARED_LIMIT] = sh_out[i]; + } +} + +//this function assigns the attributes to each partition of each bucket +//a thread block is assigned to a specific partition +template +__global__ void partitionAssign(struct Block *bucket, uint *npartitions, struct Partition *partition) +{ + int tx = threadIdx.x; + int bx = blockIdx.x; + + uint beg = bucket[bx].nextbegin; + uint end = bucket[bx].nextend; + Type pivot = bucket[bx].pivot; + + uint from; + uint to; + + if (bx > 0) + 
{ + from = npartitions[bx - 1]; + to = npartitions[bx]; + } + else + { + from = 0; + to = npartitions[bx]; + } + + uint i = tx + from; + + if (i < to) + { + uint begin = beg + SHARED_LIMIT * tx; + partition[i].from = begin; + partition[i].end = begin + SHARED_LIMIT; + partition[i].pivot = pivot; + partition[i].ibucket = bx; + } + + for (uint i = tx + from + blockDim.x; i < to; i += blockDim.x) + { + uint begin = beg + SHARED_LIMIT * (i - from); + partition[i].from = begin; + partition[i].end = begin + SHARED_LIMIT; + partition[i].pivot = pivot; + partition[i].ibucket = bx; + } + __syncthreads(); + if (tx == 0 && to - from > 0) + partition[to - 1].end = end; +} + +//this function enters the pivot value in the central bucket's items +template +__global__ void insertPivot(Type *data, struct Block *bucket, int nbucket) +{ + + Type pivot = bucket[blockIdx.x].pivot; + uint start = bucket[blockIdx.x].nextbegin; + uint end = bucket[blockIdx.x].nextend; + bool is_altered = bucket[blockIdx.x].done; + + if (is_altered && blockIdx.x < nbucket) + for (uint j = start + threadIdx.x; j < end; j += blockDim.x) + data[j] = pivot; +} + +//this function assigns the new attributes of each bucket +template +__global__ void bucketAssign(Block *bucket, uint *npartitions, int nbucket, int select) +{ + + uint i = blockIdx.x * blockDim.x + threadIdx.x; + + if (i < nbucket) + { + bool is_altered = bucket[i].done; + if (is_altered) + { + //read on i node + uint orgbeg = bucket[i].begin; + uint from = bucket[i].nextbegin; + uint orgend = bucket[i].end; + uint end = bucket[i].nextend; + Type pivot = bucket[i].pivot; + Type minPiv = bucket[i].minPiv; + Type maxPiv = bucket[i].maxPiv; + + //compare each bucket's max and min to the pivot + Type lmaxpiv = min(pivot, maxPiv); + Type rminpiv = max(pivot, minPiv); + + //write on i+nbucket node + bucket[i + nbucket].begin = orgbeg; + bucket[i + nbucket].nextbegin = orgbeg; + bucket[i + nbucket].nextend = from; + bucket[i + nbucket].end = from; + 
bucket[i + nbucket].pivot = (minPiv + lmaxpiv) / 2; + + //if(select) + // bucket[i+nbucket].done = (from-orgbeg)>1024;// && (minPiv!=maxPiv); + //else + bucket[i + nbucket].done = (from - orgbeg) > 1024 && (minPiv != maxPiv); + bucket[i + nbucket].select = select; + bucket[i + nbucket].minPiv = 0xffffffffffffffff; + bucket[i + nbucket].maxPiv = 0; + //bucket[i+nbucket].finish=false; + + //calculate the number of partitions (npartitions) necessary to the i+nbucket bucket + if (!bucket[i + nbucket].done) + npartitions[i + nbucket] = 0; + else + npartitions[i + nbucket] = (from - orgbeg + SHARED_LIMIT - 1) / SHARED_LIMIT; + + //write on i node + bucket[i].begin = end; + bucket[i].nextbegin = end; + bucket[i].nextend = orgend; + bucket[i].pivot = (rminpiv + maxPiv) / 2 + 1; + + //if(select) + //bucket[i].done = (orgend-end)>1024;// && (minPiv!=maxPiv); + // else + bucket[i].done = (orgend - end) > 1024 && (minPiv != maxPiv); + bucket[i].select = select; + bucket[i].minPiv = 0xffffffffffffffff; + bucket[i].maxPiv = 0; + //bucket[i].finish=false; + + //calculate the number of partitions (npartitions) necessary to the i-bucket bucket + if (!bucket[i].done) + npartitions[i] = 0; + else + npartitions[i] = (orgend - end + SHARED_LIMIT - 1) / SHARED_LIMIT; + } + } +} + +template +__global__ void init(Type *data, Block *bucket, uint *npartitions, int size, int nblocks) +{ + uint i = blockIdx.x * blockDim.x + threadIdx.x; + + if (i < nblocks) + { + bucket[i].nextbegin = 0; + bucket[i].begin = 0; + + bucket[i].nextend = 0 + size * (i == 0); + bucket[i].end = 0 + size * (i == 0); + npartitions[i] = 0; + bucket[i].done = false + i == 0; + bucket[i].select = false; + bucket[i].maxPiv = 0x0; + bucket[i].minPiv = 0xffffffffffffffff; + //bucket[i].pivot = 0+ (i==0)*((min(min(data[0],data[size/2]),data[size-1]) + max(max(data[0],data[size/2]),data[size-1]))/2); + bucket[i].pivot = data[size / 2]; + } +} + +template +void sort(Type *ddata, Type *outputData, uint size, uint threadCount, 
int device, double *wallClock) +{ + + cudaSetDevice(device); + + cudaGetLastError(); + //cudaDeviceReset(); + + cudaDeviceProp deviceProp; + cudaGetDeviceProperties(&deviceProp, device); + + StopWatchInterface *htimer = NULL; + Type *dbuffer; + + Block *dbucket; + struct Partition *partition; + uint *npartitions1, *npartitions2; + + uint *cudaBlocks = (uint *)malloc(4); + + uint blocks = (size + SHARED_LIMIT - 1) / SHARED_LIMIT; + uint nblock = 10 * blocks; + int partition_max = 262144; + + unsigned long long int total = partition_max * sizeof(Block) + nblock * sizeof(Partition) + 2 * partition_max * sizeof(uint) + 3 * (size) * sizeof(Type); + + //Allocating and initializing CUDA arrays + sdkCreateTimer(&htimer); + checkCudaErrors(cudaMalloc((void **)&dbucket, partition_max * sizeof(Block))); + checkCudaErrors(cudaMalloc((void **)&partition, nblock * sizeof(Partition))); //nblock + + checkCudaErrors(cudaMalloc((void **)&npartitions1, partition_max * sizeof(uint))); + checkCudaErrors(cudaMalloc((void **)&npartitions2, partition_max * sizeof(uint))); + + checkCudaErrors(cudaMalloc((void **)&dbuffer, (size) * sizeof(Type))); + + initScan(); + + //setting GPU Cache + cudaFuncSetCacheConfig(init, cudaFuncCachePreferL1); + cudaFuncSetCacheConfig(insertPivot, cudaFuncCachePreferL1); + cudaFuncSetCacheConfig(bucketAssign, cudaFuncCachePreferL1); + cudaFuncSetCacheConfig(partitionAssign, cudaFuncCachePreferL1); + cudaFuncSetCacheConfig(quick, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(globalBitonicSort, cudaFuncCachePreferShared); + + checkCudaErrors(cudaDeviceSynchronize()); + sdkResetTimer(&htimer); + sdkStartTimer(&htimer); + + //initializing bucket array: initial attributes for each bucket + init<<<(nblock + 255) / 256, 256>>>(ddata, dbucket, npartitions1, size, partition_max); + + uint nbucket = 1; + uint numIterations = 0; + bool inputSelect = true; + + *cudaBlocks = blocks; + checkCudaErrors(cudaDeviceSynchronize()); + getLastCudaError("init() execution 
FAILED\n"); + checkCudaErrors(cudaMemcpy(&npartitions2[0], cudaBlocks, sizeof(uint), cudaMemcpyHostToDevice)); + + // beginning of the first phase + // this phase goes on until the size of the buckets is comparable to the SHARED_LIMIT size + while (1) + { + + /* + * --------------------- Pre-processing: Partitioning --------------------- + * + * buckets are further divided in partitions based on their size + * the number of partitions needed for each subsequence is determined by the number of elements which can be + * processed by each thread block. + * + * the number of partitions (npartitions) for each block will depend on the shared memory size (SHARED_LIMIT) + * + */ + + if (numIterations > 0) + { //1024 is the shared memory limit of scanInclusiveShort() + if (nbucket <= 1024) + scanInclusiveShort(npartitions2, npartitions1, 1, nbucket); + else + scanInclusiveLarge(npartitions2, npartitions1, 1, nbucket); + + checkCudaErrors(cudaMemcpy(cudaBlocks, &npartitions2[nbucket - 1], sizeof(uint), cudaMemcpyDeviceToHost)); + } + + if (*cudaBlocks == 0) + break; + + /* + * --------------------- step 1 --------------------- + * + * A thread block is assigned to each different partition + * each partition is assigned coordinates, pivot and .... 
+ */ + + partitionAssign<<>>(dbucket, npartitions2, partition); + cudaDeviceSynchronize(); + getLastCudaError("partitionAssign() execution FAILED\n"); + + /* + --------------------- step 2a --------------------- + + in this function each thread block creates two subsequences + to divide the items in the partition whose value is lower than + the pivot value, from the items whose value is higher than the pivot value + */ + + if (inputSelect) + quick<<<*cudaBlocks, threadCount>>>(ddata, dbuffer, partition, dbucket); + else + quick<<<*cudaBlocks, threadCount>>>(dbuffer, ddata, partition, dbucket); + cudaDeviceSynchronize(); + getLastCudaError("quick() execution FAILED\n"); + + //step 2b: this function enters the pivot value in the central bucket's items + insertPivot<<>>(ddata, dbucket, nbucket); + + //step 3: parameters are assigned, linked to the two new buckets created in step 2 + bucketAssign<<<(nbucket + 255) / 256, 256>>>(dbucket, npartitions1, nbucket, inputSelect); + cudaDeviceSynchronize(); + getLastCudaError("insertPivot() or bucketAssign() execution FAILED\n"); + + nbucket *= 2; + + inputSelect = !inputSelect; + numIterations++; + if (nbucket > deviceProp.maxGridSize[0]) + break; + //if(numIterations==18) break; + } + + /* + * start second phase: + * now the size of the buckets is such that they can be entirely processed by a thread block + * + */ + + if (nbucket > deviceProp.maxGridSize[0]) + fprintf(stderr, "ERROR: CUDA-Quicksort can't terminate sorting as the block threads needed to finish it are more than the Maximum x-dimension of FERMI GPU thread blocks. 
Please use Kepler GPUs as the Maximum x-dimension of their thread blocks is much higher\n"); + else + globalBitonicSort<<>>(ddata, dbuffer, dbucket, inputSelect); + + checkCudaErrors(cudaDeviceSynchronize()); + getLastCudaError("globalBitonicSort() execution FAILED\n"); + + sdkStopTimer(&htimer); + *wallClock = sdkGetTimerValue(&htimer); + + // release resources + checkCudaErrors(cudaFree(dbuffer)); + checkCudaErrors(cudaFree(dbucket)); + checkCudaErrors(cudaFree(npartitions2)); + checkCudaErrors(cudaFree(npartitions1)); + free(cudaBlocks); + + closeScan(); + return; +} + +void CUDA_Quicksort(uint* inputData, uint* outputData, uint dataSize, uint threadCount, int Device, double* wallClock) +{ + + cudaDeviceProp deviceProp; + cudaGetDeviceProperties(&deviceProp, Device); + + if(deviceProp.major<2) + { + fprintf(stderr, "Error: the GPU device %d has a Compute Capability of %d.%d, while a Compute Capability of 2.x is required to run the code\n", + Device, deviceProp.major, deviceProp.minor); + + int deviceCount; + cudaGetDeviceCount(&deviceCount); + + fprintf(stderr, " the Host system has the following GPU devices:\n"); + + for (int device = 0; device < deviceCount; device++) { + + fprintf(stderr, "\t the GPU device %d is a %s, with Compute Capability %d.%d\n", + device, deviceProp.name, deviceProp.major, deviceProp.minor); + } + + return; + } + + sort(inputData,outputData, dataSize,threadCount,Device, wallClock); +} + +void CUDA_Quicksort_64(double* inputData,double* outputData, uint dataSize, uint threadCount, int Device, double* wallClock) +{ + + cudaDeviceProp deviceProp; + cudaGetDeviceProperties(&deviceProp, Device); + + if(deviceProp.major<2) + { + fprintf(stderr, "Error: the GPU device %d has a Compute Capability of %d.%d, while a Compute Capability of 2.x is required to run the code\n", + Device, deviceProp.major, deviceProp.minor); + + int deviceCount; + cudaGetDeviceCount(&deviceCount); + + fprintf(stderr, " the Host system has the following GPU 
devices:\n"); + + for (int device = 0; device < deviceCount; device++) { + + fprintf(stderr, "\t the GPU device %d is a %s, with Compute Capability %d.%d\n", + device, deviceProp.name, deviceProp.major, deviceProp.minor); + } + + return; + } + + sort(inputData,outputData, dataSize,threadCount,Device,wallClock); + +} -- GitLab From 2383aa3e1f3a00f265392b5dbe79547bc7224480 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Mon, 10 May 2021 20:16:21 +0200 Subject: [PATCH 227/258] refactor out block Bitonic sort into own file --- GPUSort/GPUSort/src/bitonicSort/bitonicSort.h | 199 +----------------- .../src/bitonicSort/blockBitonicSort.cuh | 100 +++++++++ GPUSort/GPUSort/src/bitonicSort/helpers.h | 36 ++++ 3 files changed, 140 insertions(+), 195 deletions(-) create mode 100644 GPUSort/GPUSort/src/bitonicSort/blockBitonicSort.cuh create mode 100644 GPUSort/GPUSort/src/bitonicSort/helpers.h diff --git a/GPUSort/GPUSort/src/bitonicSort/bitonicSort.h b/GPUSort/GPUSort/src/bitonicSort/bitonicSort.h index 0f14f7eb1..72a1c78f8 100644 --- a/GPUSort/GPUSort/src/bitonicSort/bitonicSort.h +++ b/GPUSort/GPUSort/src/bitonicSort/bitonicSort.h @@ -1,41 +1,7 @@ #pragma once #include - -//--------------------------------------------- - -// Inline PTX call to return index of highest non-zero bit in a word -static __device__ __forceinline__ unsigned int __btflo(unsigned int word) -{ - unsigned int ret; - asm volatile("bfind.u32 %0, %1;" - : "=r"(ret) - : "r"(word)); - return ret; -} - -__device__ int closestPow2_ptx(int bitonicLen) -{ - return 1 << (__btflo((unsigned)bitonicLen - 1U) + 1); -} - -__host__ __device__ int closestPow2(int x) -{ - if (x == 0) - return 0; - - int ret = 1; - while (ret < x) - ret <<= 1; - - return ret; -} - -template -__cuda_callable__ void cmpSwap(Value &a, Value &b, bool ascending, const CMP &Cmp) -{ - if (ascending == Cmp(b, a)) - TNL::swap(a, b); -} +#include "blockBitonicSort.cuh" +#include "helpers.h" //--------------------------------------------- @@ 
-67,6 +33,7 @@ __global__ void bitonicMergeGlobal(TNL::Containers::ArrayView -__global__ void bitonicMerge(TNL::Containers::ArrayView arr, - CMP Cmp, - int monotonicSeqLen, int bitonicLen) -{ - //1st index and last index of subarray that this threadBlock should merge - int myBlockStart = blockIdx.x * (2 * blockDim.x); - int myBlockEnd = TNL::min(arr.getSize(), myBlockStart + (2 * blockDim.x)); - - auto src = arr.getView(myBlockStart, myBlockEnd); - - //calculate the direction of swapping - int i = blockIdx.x * blockDim.x + threadIdx.x; - int part = i / (bitonicLen / 2); - int partsInSeq = monotonicSeqLen / bitonicLen; - int monotonicSeqIdx = part / partsInSeq; - - bool ascending = (monotonicSeqIdx & 1) != 0; - //special case for parts with no "partner" - if ((monotonicSeqIdx + 1) * monotonicSeqLen >= arr.getSize()) - ascending = true; - //------------------------------------------ - - //do bitonic merge - for (; bitonicLen > 1; bitonicLen /= 2) - { - //calculates which 2 indexes will be compared and swap - int part = threadIdx.x / (bitonicLen / 2); - int s = part * bitonicLen + (threadIdx.x & ((bitonicLen / 2) - 1)); - int e = s + bitonicLen / 2; - - if (e < myBlockEnd - myBlockStart) //not touching virtual padding - cmpSwap(src[s], src[e], ascending, Cmp); - __syncthreads(); - } -} - -//--------------------------------------------- - -/** - * IMPORTANT: all threads in block have to call this function to work properly - * the size of src isn't limited, but for optimal efficiency, no more than 8*blockDim.x should be used - * Description: sorts src and writes into dst within a block - * works independently from other concurrent blocks - * @param sharedMem sharedMem pointer has to be able to store all of src elements - * */ -template -__device__ void bitonicSort_Block(TNL::Containers::ArrayView src, - TNL::Containers::ArrayView dst, - Value *sharedMem, const CMP &Cmp) -{ - //copy from globalMem into sharedMem - for (int i = threadIdx.x; i < src.getSize(); i += 
blockDim.x) - sharedMem[i] = src[i]; - __syncthreads(); - - //------------------------------------------ - //bitonic activity - { - int paddedSize = closestPow2_ptx(src.getSize()); - - for (int monotonicSeqLen = 2; monotonicSeqLen <= paddedSize; monotonicSeqLen *= 2) - { - for (int bitonicLen = monotonicSeqLen; bitonicLen > 1; bitonicLen /= 2) - { - for (int i = threadIdx.x;; i += blockDim.x) //simulates other blocks in case src.size > blockDim.x*2 - { - //calculates which 2 indexes will be compared and swap - int part = i / (bitonicLen / 2); - int s = part * bitonicLen + (i & ((bitonicLen / 2) - 1)); - int e = s + bitonicLen / 2; - - if (e >= src.getSize()) //touching virtual padding, the order dont swap - break; - - //calculate the direction of swapping - int monotonicSeqIdx = i / (monotonicSeqLen / 2); - bool ascending = (monotonicSeqIdx & 1) != 0; - if ((monotonicSeqIdx + 1) * monotonicSeqLen >= src.getSize()) //special case for parts with no "partner" - ascending = true; - - cmpSwap(sharedMem[s], sharedMem[e], ascending, Cmp); - } - - __syncthreads(); //only 1 synchronization needed - } - } - } - - //------------------------------------------ - //writeback to global memory - for (int i = threadIdx.x; i < dst.getSize(); i += blockDim.x) - dst[i] = sharedMem[i]; -} - -/** - * IMPORTANT: all threads in block have to call this function to work properly - * IMPORTANT: unlike the counterpart with shared memory, this function only works in-place - * the size of src isn't limited, but for optimal efficiency, no more than 8*blockDim.x should be used - * Description: sorts src in place using bitonic sort - * works independently from other concurrent blocks - * this version doesnt use shared memory and is prefered for Value with big size - * */ -template -__device__ void bitonicSort_Block(TNL::Containers::ArrayView src, - const CMP &Cmp) -{ - int paddedSize = closestPow2_ptx(src.getSize()); - - for (int monotonicSeqLen = 2; monotonicSeqLen <= paddedSize; monotonicSeqLen 
*= 2) - { - for (int bitonicLen = monotonicSeqLen; bitonicLen > 1; bitonicLen /= 2) - { - for (int i = threadIdx.x;; i += blockDim.x) //simulates other blocks in case src.size > blockDim.x*2 - { - //calculates which 2 indexes will be compared and swap - int part = i / (bitonicLen / 2); - int s = part * bitonicLen + (i & ((bitonicLen / 2) - 1)); - int e = s + bitonicLen / 2; - - if (e >= src.getSize()) - break; - - //calculate the direction of swapping - int monotonicSeqIdx = i / (monotonicSeqLen / 2); - bool ascending = (monotonicSeqIdx & 1) != 0; - if ((monotonicSeqIdx + 1) * monotonicSeqLen >= src.getSize()) //special case for parts with no "partner" - ascending = true; - - cmpSwap(src[s], src[e], ascending, Cmp); - } - __syncthreads(); - } - } -} /** * entrypoint for bitonicSort_Block @@ -293,26 +119,9 @@ __global__ void bitoniSort1stStepSharedMemory(TNL::Containers::ArrayView -__global__ void bitoniSort1stStep(TNL::Containers::ArrayView arr, CMP Cmp) -{ - int myBlockStart = blockIdx.x * (2 * blockDim.x); - int myBlockEnd = TNL::min(arr.getSize(), myBlockStart + (2 * blockDim.x)); - - if (blockIdx.x % 2 || blockIdx.x + 1 == gridDim.x) - bitonicSort_Block(arr.getView(myBlockStart, myBlockEnd), Cmp); - else - bitonicSort_Block(arr.getView(myBlockStart, myBlockEnd), - [&] __cuda_callable__(const Value &a, const Value &b) { return Cmp(b, a); }); -} - //--------------------------------------------- //--------------------------------------------- + template void bitonicSortWithShared(TNL::Containers::ArrayView view, const CMP &Cmp, int gridDim, int blockDim, int sharedMemLen, int sharedMemSize) diff --git a/GPUSort/GPUSort/src/bitonicSort/blockBitonicSort.cuh b/GPUSort/GPUSort/src/bitonicSort/blockBitonicSort.cuh new file mode 100644 index 000000000..26d962a27 --- /dev/null +++ b/GPUSort/GPUSort/src/bitonicSort/blockBitonicSort.cuh @@ -0,0 +1,100 @@ +#pragma once +#include "helpers.h" +#include + +/** + * IMPORTANT: all threads in block have to call this function to 
work properly + * the size of src isn't limited, but for optimal efficiency, no more than 8*blockDim.x should be used + * Description: sorts src and writes into dst within a block + * works independently from other concurrent blocks + * @param sharedMem sharedMem pointer has to be able to store all of src elements + * */ +template +__device__ void bitonicSort_Block(TNL::Containers::ArrayView src, + TNL::Containers::ArrayView dst, + Value *sharedMem, const CMP &Cmp) +{ + //copy from globalMem into sharedMem + for (int i = threadIdx.x; i < src.getSize(); i += blockDim.x) + sharedMem[i] = src[i]; + __syncthreads(); + + //------------------------------------------ + //bitonic activity + { + int paddedSize = closestPow2_ptx(src.getSize()); + + for (int monotonicSeqLen = 2; monotonicSeqLen <= paddedSize; monotonicSeqLen *= 2) + { + for (int bitonicLen = monotonicSeqLen; bitonicLen > 1; bitonicLen /= 2) + { + for (int i = threadIdx.x;; i += blockDim.x) //simulates other blocks in case src.size > blockDim.x*2 + { + //calculates which 2 indexes will be compared and swap + int part = i / (bitonicLen / 2); + int s = part * bitonicLen + (i & ((bitonicLen / 2) - 1)); + int e = s + bitonicLen / 2; + + if (e >= src.getSize()) //touching virtual padding, the order dont swap + break; + + //calculate the direction of swapping + int monotonicSeqIdx = i / (monotonicSeqLen / 2); + bool ascending = (monotonicSeqIdx & 1) != 0; + if ((monotonicSeqIdx + 1) * monotonicSeqLen >= src.getSize()) //special case for parts with no "partner" + ascending = true; + + cmpSwap(sharedMem[s], sharedMem[e], ascending, Cmp); + } + + __syncthreads(); //only 1 synchronization needed + } + } + } + + //------------------------------------------ + //writeback to global memory + for (int i = threadIdx.x; i < dst.getSize(); i += blockDim.x) + dst[i] = sharedMem[i]; +} + +/** + * IMPORTANT: all threads in block have to call this function to work properly + * IMPORTANT: unlike the counterpart with shared memory, 
this function only works in-place + * the size of src isn't limited, but for optimal efficiency, no more than 8*blockDim.x should be used + * Description: sorts src in place using bitonic sort + * works independently from other concurrent blocks + * this version doesnt use shared memory and is prefered for Value with big size + * */ +template +__device__ void bitonicSort_Block(TNL::Containers::ArrayView src, + const CMP &Cmp) +{ + int paddedSize = closestPow2_ptx(src.getSize()); + + for (int monotonicSeqLen = 2; monotonicSeqLen <= paddedSize; monotonicSeqLen *= 2) + { + for (int bitonicLen = monotonicSeqLen; bitonicLen > 1; bitonicLen /= 2) + { + for (int i = threadIdx.x;; i += blockDim.x) //simulates other blocks in case src.size > blockDim.x*2 + { + //calculates which 2 indexes will be compared and swap + int part = i / (bitonicLen / 2); + int s = part * bitonicLen + (i & ((bitonicLen / 2) - 1)); + int e = s + bitonicLen / 2; + + if (e >= src.getSize()) + break; + + //calculate the direction of swapping + int monotonicSeqIdx = i / (monotonicSeqLen / 2); + bool ascending = (monotonicSeqIdx & 1) != 0; + if ((monotonicSeqIdx + 1) * monotonicSeqLen >= src.getSize()) //special case for parts with no "partner" + ascending = true; + + cmpSwap(src[s], src[e], ascending, Cmp); + } + __syncthreads(); + } + } +} \ No newline at end of file diff --git a/GPUSort/GPUSort/src/bitonicSort/helpers.h b/GPUSort/GPUSort/src/bitonicSort/helpers.h new file mode 100644 index 000000000..44629206a --- /dev/null +++ b/GPUSort/GPUSort/src/bitonicSort/helpers.h @@ -0,0 +1,36 @@ +#pragma once +#include + +// Inline PTX call to return index of highest non-zero bit in a word +static __device__ __forceinline__ unsigned int __btflo(unsigned int word) +{ + unsigned int ret; + asm volatile("bfind.u32 %0, %1;" + : "=r"(ret) + : "r"(word)); + return ret; +} + +__device__ int closestPow2_ptx(int bitonicLen) +{ + return 1 << (__btflo((unsigned)bitonicLen - 1U) + 1); +} + +__host__ __device__ int 
closestPow2(int x) +{ + if (x == 0) + return 0; + + int ret = 1; + while (ret < x) + ret <<= 1; + + return ret; +} + +template +__cuda_callable__ void cmpSwap(Value &a, Value &b, bool ascending, const CMP &Cmp) +{ + if (ascending == Cmp(b, a)) + TNL::swap(a, b); +} \ No newline at end of file -- GitLab From 8406ae82115119c8986fab89621882f78794bf30 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Mon, 10 May 2021 20:26:21 +0200 Subject: [PATCH 228/258] roll back 1st step --- GPUSort/GPUSort/src/bitonicSort/bitonicSort.h | 48 ++++++++++++++++--- 1 file changed, 41 insertions(+), 7 deletions(-) diff --git a/GPUSort/GPUSort/src/bitonicSort/bitonicSort.h b/GPUSort/GPUSort/src/bitonicSort/bitonicSort.h index 72a1c78f8..0ca7e5e0d 100644 --- a/GPUSort/GPUSort/src/bitonicSort/bitonicSort.h +++ b/GPUSort/GPUSort/src/bitonicSort/bitonicSort.h @@ -108,15 +108,49 @@ template __global__ void bitoniSort1stStepSharedMemory(TNL::Containers::ArrayView arr, CMP Cmp) { extern __shared__ int externMem[]; - int sharedMemLen = 2 * blockDim.x; + + Value * sharedMem = (Value *)externMem; + int sharedMemLen = 2*blockDim.x; + int myBlockStart = blockIdx.x * sharedMemLen; - int myBlockEnd = TNL::min(arr.getSize(), myBlockStart + sharedMemLen); + int myBlockEnd = TNL::min(arr.getSize(), myBlockStart+sharedMemLen); - if (blockIdx.x % 2 || blockIdx.x + 1 == gridDim.x) - bitonicSort_Block(arr.getView(myBlockStart, myBlockEnd), arr.getView(myBlockStart, myBlockEnd), (Value *)externMem, Cmp); - else - bitonicSort_Block(arr.getView(myBlockStart, myBlockEnd), arr.getView(myBlockStart, myBlockEnd), (Value *)externMem, - [&] __cuda_callable__(const Value &a, const Value &b) { return Cmp(b, a); }); + //copy from globalMem into sharedMem + for (int i = threadIdx.x; myBlockStart + i < myBlockEnd; i += blockDim.x) + sharedMem[i] = arr[myBlockStart + i]; + __syncthreads(); + + //------------------------------------------ + //bitonic activity + { + int i = blockIdx.x * blockDim.x + threadIdx.x; + int 
paddedSize = closestPow2(myBlockEnd - myBlockStart); + + for (int monotonicSeqLen = 2; monotonicSeqLen <= paddedSize; monotonicSeqLen *= 2) + { + //calculate the direction of swapping + int monotonicSeqIdx = i / (monotonicSeqLen/2); + bool ascending = (monotonicSeqIdx & 1) != 0; + if ((monotonicSeqIdx + 1) * monotonicSeqLen >= arr.getSize()) //special case for parts with no "partner" + ascending = true; + + for (int len = monotonicSeqLen; len > 1; len /= 2) + { + //calculates which 2 indexes will be compared and swap + int part = threadIdx.x / (len / 2); + int s = part * len + (threadIdx.x & ((len / 2) - 1)); + int e = s + len / 2; + + if(e < myBlockEnd - myBlockStart) //touching virtual padding + cmpSwap(sharedMem[s], sharedMem[e], ascending, Cmp); + __syncthreads(); + } + } + } + + //writeback to global memory + for (int i = threadIdx.x; myBlockStart + i < myBlockEnd; i += blockDim.x) + arr[myBlockStart + i] = sharedMem[i]; } //--------------------------------------------- -- GitLab From af0b281ac77171bf56d20579f595862ad03576a0 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Mon, 10 May 2021 20:27:59 +0200 Subject: [PATCH 229/258] ignore python files --- GPUSort/.gitignore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/GPUSort/.gitignore b/GPUSort/.gitignore index 07c569033..cb874b4e4 100644 --- a/GPUSort/.gitignore +++ b/GPUSort/.gitignore @@ -2,4 +2,5 @@ backup *.csv *.o -*.cuo \ No newline at end of file +*.cuo +*.ipynb \ No newline at end of file -- GitLab From 266e80c5efe532909083dcc54b1307458e4bd618 Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen Date: Wed, 12 May 2021 21:42:32 +0000 Subject: [PATCH 230/258] Add installation comments --- GPUSort/GPUSort/README.MD | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/GPUSort/GPUSort/README.MD b/GPUSort/GPUSort/README.MD index 40e8b2f8e..f41427f52 100644 --- a/GPUSort/GPUSort/README.MD +++ b/GPUSort/GPUSort/README.MD @@ -1,5 +1,7 @@ ## Code implemented by 
Nguyen Xuan Thang for bachelor thesis +Needs to have CUDA and TNL installed. + * benchmark * folder containing benchmarking scripts * the main function is in ``benchmarker.cpp`` @@ -11,4 +13,10 @@ * tests * folder containing unit tests for each algorithm * inside each folder there is a tester and a Makefile - * to test out the implementation, run ``make run`` \ No newline at end of file + * to test out the implementation, run ``make run`` + * needs gTests installed + + + +* To install TNL, read https://mmg-gitlab.fjfi.cvut.cz/doc/tnl/#installation +* To install CUDA, https://developer.nvidia.com/cuda-downloads -- GitLab From 8afc91a45893dfb9e4ae3d37928407b69d586e5b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Mon, 12 Jul 2021 19:52:17 +0200 Subject: [PATCH 231/258] Moving and refactoring source code of quicksort and bitonic sort. --- GPUSort/GPUSort/benchmark/benchmarker.cpp | 92 -- GPUSort/GPUSort/benchmark/measure.cpp | 31 - GPUSort/GPUSort/benchmark/measure.cu | 39 - GPUSort/GPUSort/benchmark/measure.h | 6 - src/Benchmarks/CMakeLists.txt | 1 + src/Benchmarks/Sorting/CMakeLists.txt | 8 + src/Benchmarks/Sorting/Measurer.h | 90 ++ .../Benchmarks/Sorting/Quicksorter.h | 0 .../ReferenceAlgorithms/cederman_qsort.h | 0 .../ReferenceAlgorithms/helper_timer.h | 495 +++++++++ .../ReferenceAlgorithms/helpers/exception.h | 151 +++ .../ReferenceAlgorithms/helpers/helper_cuda.h | 966 ++++++++++++++++++ .../helpers/helper_string.h | 421 ++++++++ .../helpers/helper_timer.h | 495 +++++++++ .../ReferenceAlgorithms/helpers/scan_common.h | 61 ++ .../ReferenceAlgorithms/manca_quicksort.h | 12 +- .../Sorting/bitonicsortBenchmark.cu | 0 .../Benchmarks/Sorting/generators.h | 0 .../util => src/Benchmarks/Sorting}/timer.h | 0 src/Benchmarks/Sorting/tnl-benchmark-sort.cpp | 1 + src/Benchmarks/Sorting/tnl-benchmark-sort.cu | 1 + src/Benchmarks/Sorting/tnl-benchmark-sort.h | 113 ++ src/TNL/Algorithms/Sort.h | 30 + .../Algorithms/detail/Sorting}/algorithm.h | 
2 +- .../Algorithms/detail/Sorting}/bitonicSort.h | 14 +- .../detail/Sorting}/blockBitonicSort.cuh | 2 +- .../detail/Sorting}/cudaPartition.cuh | 4 +- .../TNL/Algorithms/detail/Sorting}/helpers.h | 0 .../Algorithms/detail/Sorting}/quicksort.cuh | 14 +- .../detail/Sorting}/quicksort_1Block.cuh | 6 +- .../detail/Sorting}/quicksort_kernel.cuh | 8 +- .../Algorithms/detail/Sorting}/reduction.cuh | 0 .../TNL/Algorithms/detail/Sorting}/task.h | 0 src/UnitTests/Algorithms/CMakeLists.txt | 1 + .../Algorithms/Sorting/BitonicSortTest.cpp | 1 + .../Algorithms/Sorting/BitonicSortTest.cu | 1 + .../Algorithms/Sorting/BitonicSortTest.h | 4 +- .../Algorithms/Sorting/CMakeLists.txt | 27 + .../Algorithms/Sorting/QuicksortTest.cpp | 1 + .../Algorithms/Sorting/QuicksortTest.cu | 1 + .../Algorithms/Sorting/QuicksortTest.h | 4 +- 41 files changed, 2907 insertions(+), 196 deletions(-) delete mode 100644 GPUSort/GPUSort/benchmark/benchmarker.cpp delete mode 100644 GPUSort/GPUSort/benchmark/measure.cpp delete mode 100644 GPUSort/GPUSort/benchmark/measure.cu delete mode 100644 GPUSort/GPUSort/benchmark/measure.h create mode 100644 src/Benchmarks/Sorting/CMakeLists.txt create mode 100644 src/Benchmarks/Sorting/Measurer.h rename GPUSort/GPUSort/benchmark/quicksort_benchmark/benchmark.cu => src/Benchmarks/Sorting/Quicksorter.h (100%) rename GPUSort/otherGPUsorts/cederman/cederman_qsort.cu => src/Benchmarks/Sorting/ReferenceAlgorithms/cederman_qsort.h (100%) create mode 100644 src/Benchmarks/Sorting/ReferenceAlgorithms/helper_timer.h create mode 100644 src/Benchmarks/Sorting/ReferenceAlgorithms/helpers/exception.h create mode 100644 src/Benchmarks/Sorting/ReferenceAlgorithms/helpers/helper_cuda.h create mode 100644 src/Benchmarks/Sorting/ReferenceAlgorithms/helpers/helper_string.h create mode 100644 src/Benchmarks/Sorting/ReferenceAlgorithms/helpers/helper_timer.h create mode 100644 src/Benchmarks/Sorting/ReferenceAlgorithms/helpers/scan_common.h rename 
GPUSort/otherGPUsorts/manca_quicksort_extracted/manca_quicksort.cu => src/Benchmarks/Sorting/ReferenceAlgorithms/manca_quicksort.h (99%) rename GPUSort/GPUSort/benchmark/bitonic_benchmark/benchmark.cu => src/Benchmarks/Sorting/bitonicsortBenchmark.cu (100%) rename GPUSort/GPUSort/benchmark/generators.cpp => src/Benchmarks/Sorting/generators.h (100%) rename {GPUSort/GPUSort/src/util => src/Benchmarks/Sorting}/timer.h (100%) create mode 100644 src/Benchmarks/Sorting/tnl-benchmark-sort.cpp create mode 120000 src/Benchmarks/Sorting/tnl-benchmark-sort.cu create mode 100644 src/Benchmarks/Sorting/tnl-benchmark-sort.h create mode 100644 src/TNL/Algorithms/Sort.h rename {GPUSort/GPUSort/src/util => src/TNL/Algorithms/detail/Sorting}/algorithm.h (85%) rename {GPUSort/GPUSort/src/bitonicSort => src/TNL/Algorithms/detail/Sorting}/bitonicSort.h (97%) rename {GPUSort/GPUSort/src/bitonicSort => src/TNL/Algorithms/detail/Sorting}/blockBitonicSort.cuh (98%) rename {GPUSort/GPUSort/src/quicksort => src/TNL/Algorithms/detail/Sorting}/cudaPartition.cuh (98%) rename {GPUSort/GPUSort/src/bitonicSort => src/TNL/Algorithms/detail/Sorting}/helpers.h (100%) rename {GPUSort/GPUSort/src/quicksort => src/TNL/Algorithms/detail/Sorting}/quicksort.cuh (97%) rename {GPUSort/GPUSort/src/quicksort => src/TNL/Algorithms/detail/Sorting}/quicksort_1Block.cuh (97%) rename {GPUSort/GPUSort/src/quicksort => src/TNL/Algorithms/detail/Sorting}/quicksort_kernel.cuh (97%) rename {GPUSort/GPUSort/src/util => src/TNL/Algorithms/detail/Sorting}/reduction.cuh (100%) rename {GPUSort/GPUSort/src/quicksort => src/TNL/Algorithms/detail/Sorting}/task.h (100%) create mode 100644 src/UnitTests/Algorithms/Sorting/BitonicSortTest.cpp create mode 120000 src/UnitTests/Algorithms/Sorting/BitonicSortTest.cu rename GPUSort/GPUSort/tests/bitonic_tests/unitTests.cu => src/UnitTests/Algorithms/Sorting/BitonicSortTest.h (99%) create mode 100644 src/UnitTests/Algorithms/Sorting/CMakeLists.txt create mode 100644 
src/UnitTests/Algorithms/Sorting/QuicksortTest.cpp create mode 120000 src/UnitTests/Algorithms/Sorting/QuicksortTest.cu rename GPUSort/GPUSort/tests/quicksort_unitTests/unitTests.cu => src/UnitTests/Algorithms/Sorting/QuicksortTest.h (98%) diff --git a/GPUSort/GPUSort/benchmark/benchmarker.cpp b/GPUSort/GPUSort/benchmark/benchmarker.cpp deleted file mode 100644 index 55e2c618c..000000000 --- a/GPUSort/GPUSort/benchmark/benchmarker.cpp +++ /dev/null @@ -1,92 +0,0 @@ -#include -#include -#include -#include -using namespace std; - -#include "generators.cpp" -#include "measure.h" - -#ifndef LOW_POW - #define LOW_POW 10 -#endif - -#ifndef HIGH_POW - #define HIGH_POW 25 -#endif - -#ifndef TRIES - #define TRIES 20 -#endif - -//------------------------------------------------------------ - -void start(ostream & out, string delim) -{ - out << "size" << delim; - out << "random" << delim; - out << "shuffle" << delim; - out << "sorted" << delim; - out << "almost" << delim; - out << "decreas" << delim; - out << "gauss" << delim; - out << "bucket" << delim; - out << "stagger" << delim; - out << "zero_entropy"; - out << endl; - - int wrongAnsCnt = 0; - - for(int pow = LOW_POW; pow <= HIGH_POW; pow++) - { - int size =(1<< pow); - vector vec(size); - - out << "2^" << pow << delim << flush; - out << fixed << setprecision(3); - - out << measure(generateRandom(size), TRIES, wrongAnsCnt); - out << delim << flush; - - out << measure(generateShuffle(size), TRIES, wrongAnsCnt); - out << delim << flush; - - out << measure(generateSorted(size), TRIES, wrongAnsCnt); - out << delim << flush; - - out << measure(generateAlmostSorted(size), TRIES, wrongAnsCnt); - out << delim << flush; - - out << measure(generateDecreasing(size), TRIES, wrongAnsCnt); - out << delim << flush; - - out << measure(generateGaussian(size), TRIES, wrongAnsCnt) ; - out << delim << flush; - - out << measure(generateBucket(size), TRIES, wrongAnsCnt); - out << delim << flush; - - out << measure(generateStaggered(size), 
TRIES, wrongAnsCnt); - out << delim << flush; - - out << measure(generateZero_entropy(size), TRIES, wrongAnsCnt); - out << endl; - } - - if(wrongAnsCnt > 0) - std::cerr << wrongAnsCnt << "tries were sorted incorrectly" << std::endl; -} - -int main(int argc, char *argv[]) -{ - if(argc == 1) - { - start(cout, "\t"); - } - else - { - std::ofstream out(argv[1]); - start(out, ","); - } - return 0; -} \ No newline at end of file diff --git a/GPUSort/GPUSort/benchmark/measure.cpp b/GPUSort/GPUSort/benchmark/measure.cpp deleted file mode 100644 index efc91af70..000000000 --- a/GPUSort/GPUSort/benchmark/measure.cpp +++ /dev/null @@ -1,31 +0,0 @@ -#pragma once - -#include "measure.h" -#include "../src/util/timer.h" - -//-------------------------------------------------------- - -template -void sorter(std::vector&vec); - -//-------------------------------------------------------- - -template -double measure(const std::vector&vec, int tries, int & wrongAnsCnt) -{ - vector resAcc; - - for(int i = 0; i < tries; i++) - { - vector tmp = vec; - { - TIMER t([&](double res){resAcc.push_back(res);}); - sorter(tmp); - } - - if(!std::is_sorted(tmp.begin(), tmp.end())) - wrongAnsCnt++; - } - - return accumulate(resAcc.begin(), resAcc.end(), 0.0) / resAcc.size(); -} \ No newline at end of file diff --git a/GPUSort/GPUSort/benchmark/measure.cu b/GPUSort/GPUSort/benchmark/measure.cu deleted file mode 100644 index 50033aadc..000000000 --- a/GPUSort/GPUSort/benchmark/measure.cu +++ /dev/null @@ -1,39 +0,0 @@ -#pragma once - -#include - -#include "measure.h" -#include "../src/util/timer.h" - -#include -#include "../src/util/algorithm.h" -using namespace TNL; -using namespace TNL::Containers; - -//-------------------------------------------------------- - -template -void sorter(ArrayView arr); - -//-------------------------------------------------------- - -template -double measure(const std::vector&vec, int tries, int & wrongAnsCnt) -{ - vector resAcc; - - for(int i = 0; i < tries; i++) - { - 
Array arr(vec); - auto view = arr.getView(); - { - TIMER t([&](double res){resAcc.push_back(res);}); - sorter(view); - } - - if(!is_sorted(view)) - wrongAnsCnt++; - } - - return accumulate(resAcc.begin(), resAcc.end(), 0.0) / resAcc.size(); -} \ No newline at end of file diff --git a/GPUSort/GPUSort/benchmark/measure.h b/GPUSort/GPUSort/benchmark/measure.h deleted file mode 100644 index 3ebd633a1..000000000 --- a/GPUSort/GPUSort/benchmark/measure.h +++ /dev/null @@ -1,6 +0,0 @@ -#pragma once - -#include - -template -double measure(const std::vector&vec, int tries, int & wrongAnsCnt); \ No newline at end of file diff --git a/src/Benchmarks/CMakeLists.txt b/src/Benchmarks/CMakeLists.txt index 6f3185329..4e1961b3c 100644 --- a/src/Benchmarks/CMakeLists.txt +++ b/src/Benchmarks/CMakeLists.txt @@ -5,6 +5,7 @@ add_subdirectory( SpMV ) add_subdirectory( DistSpMV ) add_subdirectory( LinearSolvers ) add_subdirectory( ODESolvers ) +add_subdirectory( Sorting ) add_subdirectory( Traversers ) set( headers diff --git a/src/Benchmarks/Sorting/CMakeLists.txt b/src/Benchmarks/Sorting/CMakeLists.txt new file mode 100644 index 000000000..cb1454c09 --- /dev/null +++ b/src/Benchmarks/Sorting/CMakeLists.txt @@ -0,0 +1,8 @@ +if( BUILD_CUDA ) + CUDA_ADD_EXECUTABLE( tnl-benchmark-sort tnl-benchmark-sort.cu ) + TARGET_LINK_LIBRARIES( tnl-benchmark-sort ${CUDA_cusparse_LIBRARY} ${CUDA_cudadevrt_LIBRARY} ) +else() + ADD_EXECUTABLE( tnl-benchmark-sort tnl-benchmark-sort.cpp ) +endif() + +install( TARGETS tnl-benchmark-sort RUNTIME DESTINATION bin ) diff --git a/src/Benchmarks/Sorting/Measurer.h b/src/Benchmarks/Sorting/Measurer.h new file mode 100644 index 000000000..569344e62 --- /dev/null +++ b/src/Benchmarks/Sorting/Measurer.h @@ -0,0 +1,90 @@ +#pragma once + +#include +#include +#include +#include "ReferenceAlgorithms/manca_quicksort.h" +#include "ReferenceAlgorithms/cederman_qsort.h" +#include "timer.h" + +using namespace TNL; + +struct QuicksortSorter +{ + template< typename Array > + 
static void sort( Array& array ) { Algorithms::detail::quicksort( array ); }; +}; + +struct BitonicSortSorter +{ + template< typename Array > + static void sort( Array& array ) { Algorithms::detail::bitonicSort( array ); }; +}; + +struct STLSorter +{ + template< typename Value > + static void sort( std::vector< Value >& vec ) { std::sort( vec.begin(), vec.end() ); }; +}; + +struct MancaQuicksortSorter +{ + static void sort( Containers::ArrayView< int, Devices::Cuda >& array ) + { + double timer; + CUDA_Quicksort( ( unsigned * ) array.getData(), (unsigned * ) array.getData(), array.getSize(), 256, 0, &timer ); + //return; + } +}; + +struct CedermanQuicksortSorter +{ + static void sort( Containers::ArrayView< int, Devices::Cuda >& array ) + { + gpuqsort( ( unsigned int * ) array.getData(), ( unsigned int ) array.getSize() ); + } +}; + +template< typename Sorter > +struct Measurer +{ + template< typename Value > + static double measure( const std::vector&vec, int tries, int & wrongAnsCnt ) + { + vector resAcc; + + for(int i = 0; i < tries; i++) + { + Containers::Array arr(vec); + auto view = arr.getView(); + { + TIMER t([&](double res){resAcc.push_back(res);}); + Sorter::sort(view); + } + + if(!is_sorted(view)) + wrongAnsCnt++; + } + return accumulate(resAcc.begin(), resAcc.end(), 0.0) / resAcc.size(); + } +}; + +template<> +struct Measurer< STLSorter > +{ + template< typename Value > + static double measure( const std::vector&vec, int tries, int & wrongAnsCnt ) + { + vector resAcc; + + for(int i = 0; i < tries; i++) + { + std::vector< Value > vec2 = vec; + { + TIMER t([&](double res){resAcc.push_back(res);}); + STLSorter::sort( vec2 ); + } + } + return accumulate(resAcc.begin(), resAcc.end(), 0.0) / resAcc.size(); + } +}; diff --git a/GPUSort/GPUSort/benchmark/quicksort_benchmark/benchmark.cu b/src/Benchmarks/Sorting/Quicksorter.h similarity index 100% rename from GPUSort/GPUSort/benchmark/quicksort_benchmark/benchmark.cu rename to 
src/Benchmarks/Sorting/Quicksorter.h diff --git a/GPUSort/otherGPUsorts/cederman/cederman_qsort.cu b/src/Benchmarks/Sorting/ReferenceAlgorithms/cederman_qsort.h similarity index 100% rename from GPUSort/otherGPUsorts/cederman/cederman_qsort.cu rename to src/Benchmarks/Sorting/ReferenceAlgorithms/cederman_qsort.h diff --git a/src/Benchmarks/Sorting/ReferenceAlgorithms/helper_timer.h b/src/Benchmarks/Sorting/ReferenceAlgorithms/helper_timer.h new file mode 100644 index 000000000..c1e411650 --- /dev/null +++ b/src/Benchmarks/Sorting/ReferenceAlgorithms/helper_timer.h @@ -0,0 +1,495 @@ +/** + * Copyright 1993-2012 NVIDIA Corporation. All rights reserved. + * + * Please refer to the NVIDIA end user license agreement (EULA) associated + * with this source code for terms and conditions that govern your use of + * this software. Any use, reproduction, disclosure, or distribution of + * this software and related documentation outside the terms of the EULA + * is strictly prohibited. + * + */ + +// Helper Timing Functions +#ifndef HELPER_TIMER_H +#define HELPER_TIMER_H + +// includes, system +#include + +// includes, project +#include "helpers/exception.h" + +// Definition of the StopWatch Interface, this is used if we don't want to use the CUT functions +// But rather in a self contained class interface +class StopWatchInterface +{ + public: + StopWatchInterface() {}; + virtual ~StopWatchInterface() {}; + + public: + //! Start time measurement + virtual void start() = 0; + + //! Stop time measurement + virtual void stop() = 0; + + //! Reset time counters to zero + virtual void reset() = 0; + + //! Time in msec. after start. If the stop watch is still running (i.e. there + //! was no call to stop()) then the elapsed time is returned, otherwise the + //! time between the last start() and stop call is returned + virtual float getTime() = 0; + + //! Mean time to date based on the number of times the stopwatch has been + //! 
_stopped_ (ie finished sessions) and the current total time + virtual float getAverageTime() = 0; +}; + + +////////////////////////////////////////////////////////////////// +// Begin Stopwatch timer class definitions for all OS platforms // +////////////////////////////////////////////////////////////////// +#ifdef _WIN32 +// includes, system +#define WINDOWS_LEAN_AND_MEAN +#include +#undef min +#undef max + +//! Windows specific implementation of StopWatch +class StopWatchWin : public StopWatchInterface +{ + public: + //! Constructor, default + StopWatchWin() : + start_time(), end_time(), + diff_time(0.0f), total_time(0.0f), + running(false), clock_sessions(0), freq(0), freq_set(false) + { + if (! freq_set) + { + // helper variable + LARGE_INTEGER temp; + + // get the tick frequency from the OS + QueryPerformanceFrequency((LARGE_INTEGER *) &temp); + + // convert to type in which it is needed + freq = ((double) temp.QuadPart) / 1000.0; + + // rememeber query + freq_set = true; + } + }; + + // Destructor + ~StopWatchWin() { }; + + public: + //! Start time measurement + inline void start(); + + //! Stop time measurement + inline void stop(); + + //! Reset time counters to zero + inline void reset(); + + //! Time in msec. after start. If the stop watch is still running (i.e. there + //! was no call to stop()) then the elapsed time is returned, otherwise the + //! time between the last start() and stop call is returned + inline float getTime(); + + //! Mean time to date based on the number of times the stopwatch has been + //! _stopped_ (ie finished sessions) and the current total time + inline float getAverageTime(); + + private: + // member variables + + //! Start of measurement + LARGE_INTEGER start_time; + //! End of measurement + LARGE_INTEGER end_time; + + //! Time difference between the last start and stop + float diff_time; + + //! TOTAL time difference between starts and stops + float total_time; + + //! 
flag if the stop watch is running + bool running; + + //! Number of times clock has been started + //! and stopped to allow averaging + int clock_sessions; + + //! tick frequency + double freq; + + //! flag if the frequency has been set + bool freq_set; +}; + +// functions, inlined + +//////////////////////////////////////////////////////////////////////////////// +//! Start time measurement +//////////////////////////////////////////////////////////////////////////////// +inline void +StopWatchWin::start() +{ + QueryPerformanceCounter((LARGE_INTEGER *) &start_time); + running = true; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Stop time measurement and increment add to the current diff_time summation +//! variable. Also increment the number of times this clock has been run. +//////////////////////////////////////////////////////////////////////////////// +inline void +StopWatchWin::stop() +{ + QueryPerformanceCounter((LARGE_INTEGER *) &end_time); + diff_time = (float) + (((double) end_time.QuadPart - (double) start_time.QuadPart) / freq); + + total_time += diff_time; + clock_sessions++; + running = false; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Reset the timer to 0. Does not change the timer running state but does +//! recapture this point in time as the current start time if it is running. +//////////////////////////////////////////////////////////////////////////////// +inline void +StopWatchWin::reset() +{ + diff_time = 0; + total_time = 0; + clock_sessions = 0; + + if (running) + { + QueryPerformanceCounter((LARGE_INTEGER *) &start_time); + } +} + + +//////////////////////////////////////////////////////////////////////////////// +//! Time in msec. after start. If the stop watch is still running (i.e. there +//! was no call to stop()) then the elapsed time is returned added to the +//! 
current diff_time sum, otherwise the current summed time difference alone +//! is returned. +//////////////////////////////////////////////////////////////////////////////// +inline float +StopWatchWin::getTime() +{ + // Return the TOTAL time to date + float retval = total_time; + + if (running) + { + LARGE_INTEGER temp; + QueryPerformanceCounter((LARGE_INTEGER *) &temp); + retval += (float) + (((double)(temp.QuadPart - start_time.QuadPart)) / freq); + } + + return retval; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Time in msec. for a single run based on the total number of COMPLETED runs +//! and the total time. +//////////////////////////////////////////////////////////////////////////////// +inline float +StopWatchWin::getAverageTime() +{ + return (clock_sessions > 0) ? (total_time/clock_sessions) : 0.0f; +} +#else +// Declarations for Stopwatch on Linux and Mac OSX +// includes, system +#include +#include + +//! Windows specific implementation of StopWatch +class StopWatchLinux : public StopWatchInterface +{ + public: + //! Constructor, default + StopWatchLinux() : + start_time(), diff_time(0.0), total_time(0.0), + running(false), clock_sessions(0) + { }; + + // Destructor + virtual ~StopWatchLinux() + { }; + + public: + //! Start time measurement + inline void start(); + + //! Stop time measurement + inline void stop(); + + //! Reset time counters to zero + inline void reset(); + + //! Time in msec. after start. If the stop watch is still running (i.e. there + //! was no call to stop()) then the elapsed time is returned, otherwise the + //! time between the last start() and stop call is returned + inline float getTime(); + + //! Mean time to date based on the number of times the stopwatch has been + //! _stopped_ (ie finished sessions) and the current total time + inline float getAverageTime(); + + private: + + // helper functions + + //! 
Get difference between start time and current time + inline float getDiffTime(); + + private: + + // member variables + + //! Start of measurement + struct timeval start_time; + + //! Time difference between the last start and stop + float diff_time; + + //! TOTAL time difference between starts and stops + float total_time; + + //! flag if the stop watch is running + bool running; + + //! Number of times clock has been started + //! and stopped to allow averaging + int clock_sessions; +}; + +// functions, inlined + +//////////////////////////////////////////////////////////////////////////////// +//! Start time measurement +//////////////////////////////////////////////////////////////////////////////// +inline void +StopWatchLinux::start() +{ + gettimeofday(&start_time, 0); + running = true; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Stop time measurement and increment add to the current diff_time summation +//! variable. Also increment the number of times this clock has been run. +//////////////////////////////////////////////////////////////////////////////// +inline void +StopWatchLinux::stop() +{ + diff_time = getDiffTime(); + total_time += diff_time; + running = false; + clock_sessions++; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Reset the timer to 0. Does not change the timer running state but does +//! recapture this point in time as the current start time if it is running. +//////////////////////////////////////////////////////////////////////////////// +inline void +StopWatchLinux::reset() +{ + diff_time = 0; + total_time = 0; + clock_sessions = 0; + + if (running) + { + gettimeofday(&start_time, 0); + } +} + +//////////////////////////////////////////////////////////////////////////////// +//! Time in msec. after start. If the stop watch is still running (i.e. there +//! was no call to stop()) then the elapsed time is returned added to the +//! 
current diff_time sum, otherwise the current summed time difference alone +//! is returned. +//////////////////////////////////////////////////////////////////////////////// +inline float +StopWatchLinux::getTime() +{ + // Return the TOTAL time to date + float retval = total_time; + + if (running) + { + retval += getDiffTime(); + } + + return retval; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Time in msec. for a single run based on the total number of COMPLETED runs +//! and the total time. +//////////////////////////////////////////////////////////////////////////////// +inline float +StopWatchLinux::getAverageTime() +{ + return (clock_sessions > 0) ? (total_time/clock_sessions) : 0.0f; +} +//////////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////////// +inline float +StopWatchLinux::getDiffTime() +{ + struct timeval t_time; + gettimeofday(&t_time, 0); + + // time difference in milli-seconds + return (float)(1000.0 * (t_time.tv_sec - start_time.tv_sec) + + (0.001 * (t_time.tv_usec - start_time.tv_usec))); +} +#endif // _WIN32 + +//////////////////////////////////////////////////////////////////////////////// +//! Timer functionality exported + +//////////////////////////////////////////////////////////////////////////////// +//! Create a new timer +//! @return true if a time has been created, otherwise false +//! @param name of the new timer, 0 if the creation failed +//////////////////////////////////////////////////////////////////////////////// +inline bool +sdkCreateTimer(StopWatchInterface **timer_interface) +{ + //printf("sdkCreateTimer called object %08x\n", (void *)*timer_interface); +#ifdef _WIN32 + *timer_interface = (StopWatchInterface *)new StopWatchWin(); +#else + *timer_interface = (StopWatchInterface *)new StopWatchLinux(); +#endif + return (*timer_interface != NULL) ? 
true : false; +} + + +//////////////////////////////////////////////////////////////////////////////// +//! Delete a timer +//! @return true if a time has been deleted, otherwise false +//! @param name of the timer to delete +//////////////////////////////////////////////////////////////////////////////// +inline bool +sdkDeleteTimer(StopWatchInterface **timer_interface) +{ + //printf("sdkDeleteTimer called object %08x\n", (void *)*timer_interface); + if (*timer_interface) + { + delete *timer_interface; + *timer_interface = NULL; + } + + return true; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Start the time with name \a name +//! @param name name of the timer to start +//////////////////////////////////////////////////////////////////////////////// +inline bool +sdkStartTimer(StopWatchInterface **timer_interface) +{ + //printf("sdkStartTimer called object %08x\n", (void *)*timer_interface); + if (*timer_interface) + { + (*timer_interface)->start(); + } + + return true; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Stop the time with name \a name. Does not reset. +//! @param name name of the timer to stop +//////////////////////////////////////////////////////////////////////////////// +inline bool +sdkStopTimer(StopWatchInterface **timer_interface) +{ + // printf("sdkStopTimer called object %08x\n", (void *)*timer_interface); + if (*timer_interface) + { + (*timer_interface)->stop(); + } + + return true; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Resets the timer's counter. +//! @param name name of the timer to reset. 
+//////////////////////////////////////////////////////////////////////////////// +inline bool +sdkResetTimer(StopWatchInterface **timer_interface) +{ + // printf("sdkResetTimer called object %08x\n", (void *)*timer_interface); + if (*timer_interface) + { + (*timer_interface)->reset(); + } + + return true; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Return the average time for timer execution as the total time +//! for the timer dividied by the number of completed (stopped) runs the timer +//! has made. +//! Excludes the current running time if the timer is currently running. +//! @param name name of the timer to return the time of +//////////////////////////////////////////////////////////////////////////////// +inline float +sdkGetAverageTimerValue(StopWatchInterface **timer_interface) +{ + // printf("sdkGetAverageTimerValue called object %08x\n", (void *)*timer_interface); + if (*timer_interface) + { + return (*timer_interface)->getAverageTime(); + } + else + { + return 0.0f; + } +} + +//////////////////////////////////////////////////////////////////////////////// +//! Total execution time for the timer over all runs since the last reset +//! or timer creation. +//! @param name name of the timer to obtain the value of. +//////////////////////////////////////////////////////////////////////////////// +inline float +sdkGetTimerValue(StopWatchInterface **timer_interface) +{ + // printf("sdkGetTimerValue called object %08x\n", (void *)*timer_interface); + if (*timer_interface) + { + return (*timer_interface)->getTime(); + } + else + { + return 0.0f; + } +} + +#endif // HELPER_TIMER_H diff --git a/src/Benchmarks/Sorting/ReferenceAlgorithms/helpers/exception.h b/src/Benchmarks/Sorting/ReferenceAlgorithms/helpers/exception.h new file mode 100644 index 000000000..ff12dbb5a --- /dev/null +++ b/src/Benchmarks/Sorting/ReferenceAlgorithms/helpers/exception.h @@ -0,0 +1,151 @@ +/* +* Copyright 1993-2012 NVIDIA Corporation. 
All rights reserved. +* +* Please refer to the NVIDIA end user license agreement (EULA) associated +* with this source code for terms and conditions that govern your use of +* this software. Any use, reproduction, disclosure, or distribution of +* this software and related documentation outside the terms of the EULA +* is strictly prohibited. +* +*/ + +/* CUda UTility Library */ +#ifndef _EXCEPTION_H_ +#define _EXCEPTION_H_ + +// includes, system +#include +#include +#include +#include + +//! Exception wrapper. +//! @param Std_Exception Exception out of namespace std for easy typing. +template +class Exception : public Std_Exception +{ + public: + + //! @brief Static construction interface + //! @return Alwayss throws ( Located_Exception) + //! @param file file in which the Exception occurs + //! @param line line in which the Exception occurs + //! @param detailed details on the code fragment causing the Exception + static void throw_it(const char *file, + const int line, + const char *detailed = "-"); + + //! Static construction interface + //! @return Alwayss throws ( Located_Exception) + //! @param file file in which the Exception occurs + //! @param line line in which the Exception occurs + //! @param detailed details on the code fragment causing the Exception + static void throw_it(const char *file, + const int line, + const std::string &detailed); + + //! Destructor + virtual ~Exception() throw(); + + private: + + //! Constructor, default (private) + Exception(); + + //! Constructor, standard + //! @param str string returned by what() + Exception(const std::string &str); + +}; + +//////////////////////////////////////////////////////////////////////////////// +//! Exception handler function for arbitrary exceptions +//! 
@param ex exception to handle +//////////////////////////////////////////////////////////////////////////////// +template +inline void +handleException(const Exception_Typ &ex) +{ + std::cerr << ex.what() << std::endl; + + exit(EXIT_FAILURE); +} + +//! Convenience macros + +//! Exception caused by dynamic program behavior, e.g. file does not exist +#define RUNTIME_EXCEPTION( msg) \ + Exception::throw_it( __FILE__, __LINE__, msg) + +//! Logic exception in program, e.g. an assert failed +#define LOGIC_EXCEPTION( msg) \ + Exception::throw_it( __FILE__, __LINE__, msg) + +//! Out of range exception +#define RANGE_EXCEPTION( msg) \ + Exception::throw_it( __FILE__, __LINE__, msg) + +//////////////////////////////////////////////////////////////////////////////// +//! Implementation + +// includes, system +#include + +//////////////////////////////////////////////////////////////////////////////// +//! Static construction interface. +//! @param Exception causing code fragment (file and line) and detailed infos. +//////////////////////////////////////////////////////////////////////////////// +/*static*/ template +void +Exception:: +throw_it(const char *file, const int line, const char *detailed) +{ + std::stringstream s; + + // Quiet heavy-weight but exceptions are not for + // performance / release versions + s << "Exception in file '" << file << "' in line " << line << "\n" + << "Detailed description: " << detailed << "\n"; + + throw Exception(s.str()); +} + +//////////////////////////////////////////////////////////////////////////////// +//! Static construction interface. +//! @param Exception causing code fragment (file and line) and detailed infos. +//////////////////////////////////////////////////////////////////////////////// +/*static*/ template +void +Exception:: +throw_it(const char *file, const int line, const std::string &msg) +{ + throw_it(file, line, msg.c_str()); +} + +//////////////////////////////////////////////////////////////////////////////// +//! 
Constructor, default (private). +//////////////////////////////////////////////////////////////////////////////// +template +Exception::Exception() : + Exception("Unknown Exception.\n") +{ } + +//////////////////////////////////////////////////////////////////////////////// +//! Constructor, standard (private). +//! String returned by what(). +//////////////////////////////////////////////////////////////////////////////// +template +Exception::Exception(const std::string &s) : + Std_Exception(s) +{ } + +//////////////////////////////////////////////////////////////////////////////// +//! Destructor +//////////////////////////////////////////////////////////////////////////////// +template +Exception::~Exception() throw() { } + +// functions, exported + +#endif // #ifndef _EXCEPTION_H_ + diff --git a/src/Benchmarks/Sorting/ReferenceAlgorithms/helpers/helper_cuda.h b/src/Benchmarks/Sorting/ReferenceAlgorithms/helpers/helper_cuda.h new file mode 100644 index 000000000..cddfe76a3 --- /dev/null +++ b/src/Benchmarks/Sorting/ReferenceAlgorithms/helpers/helper_cuda.h @@ -0,0 +1,966 @@ +/** + * Copyright 1993-2012 NVIDIA Corporation. All rights reserved. + * + * Please refer to the NVIDIA end user license agreement (EULA) associated + * with this source code for terms and conditions that govern your use of + * this software. Any use, reproduction, disclosure, or distribution of + * this software and related documentation outside the terms of the EULA + * is strictly prohibited. 
+ * + */ + +//////////////////////////////////////////////////////////////////////////////// +// These are CUDA Helper functions for initialization and error checking + +#ifndef HELPER_CUDA_H +#define HELPER_CUDA_H + +#pragma once + +#include +#include +#include + +#include "helper_string.h" + +//#include +//#include +//#include + +// Note, it is required that your SDK sample to include the proper header files, please +// refer the CUDA examples for examples of the needed CUDA headers, which may change depending +// on which CUDA functions are used. + +// CUDA Runtime error messages +#ifdef __DRIVER_TYPES_H__ +static const char *_cudaGetErrorEnum(cudaError_t error) +{ + switch (error) + { + case cudaSuccess: + return "cudaSuccess"; + + case cudaErrorMissingConfiguration: + return "cudaErrorMissingConfiguration"; + + case cudaErrorMemoryAllocation: + return "cudaErrorMemoryAllocation"; + + case cudaErrorInitializationError: + return "cudaErrorInitializationError"; + + case cudaErrorLaunchFailure: + return "cudaErrorLaunchFailure"; + + case cudaErrorPriorLaunchFailure: + return "cudaErrorPriorLaunchFailure"; + + case cudaErrorLaunchTimeout: + return "cudaErrorLaunchTimeout"; + + case cudaErrorLaunchOutOfResources: + return "cudaErrorLaunchOutOfResources"; + + case cudaErrorInvalidDeviceFunction: + return "cudaErrorInvalidDeviceFunction"; + + case cudaErrorInvalidConfiguration: + return "cudaErrorInvalidConfiguration"; + + case cudaErrorInvalidDevice: + return "cudaErrorInvalidDevice"; + + case cudaErrorInvalidValue: + return "cudaErrorInvalidValue"; + + case cudaErrorInvalidPitchValue: + return "cudaErrorInvalidPitchValue"; + + case cudaErrorInvalidSymbol: + return "cudaErrorInvalidSymbol"; + + case cudaErrorMapBufferObjectFailed: + return "cudaErrorMapBufferObjectFailed"; + + case cudaErrorUnmapBufferObjectFailed: + return "cudaErrorUnmapBufferObjectFailed"; + + case cudaErrorInvalidHostPointer: + return "cudaErrorInvalidHostPointer"; + + case 
cudaErrorInvalidDevicePointer: + return "cudaErrorInvalidDevicePointer"; + + case cudaErrorInvalidTexture: + return "cudaErrorInvalidTexture"; + + case cudaErrorInvalidTextureBinding: + return "cudaErrorInvalidTextureBinding"; + + case cudaErrorInvalidChannelDescriptor: + return "cudaErrorInvalidChannelDescriptor"; + + case cudaErrorInvalidMemcpyDirection: + return "cudaErrorInvalidMemcpyDirection"; + + case cudaErrorAddressOfConstant: + return "cudaErrorAddressOfConstant"; + + case cudaErrorTextureFetchFailed: + return "cudaErrorTextureFetchFailed"; + + case cudaErrorTextureNotBound: + return "cudaErrorTextureNotBound"; + + case cudaErrorSynchronizationError: + return "cudaErrorSynchronizationError"; + + case cudaErrorInvalidFilterSetting: + return "cudaErrorInvalidFilterSetting"; + + case cudaErrorInvalidNormSetting: + return "cudaErrorInvalidNormSetting"; + + case cudaErrorMixedDeviceExecution: + return "cudaErrorMixedDeviceExecution"; + + case cudaErrorCudartUnloading: + return "cudaErrorCudartUnloading"; + + case cudaErrorUnknown: + return "cudaErrorUnknown"; + + case cudaErrorNotYetImplemented: + return "cudaErrorNotYetImplemented"; + + case cudaErrorMemoryValueTooLarge: + return "cudaErrorMemoryValueTooLarge"; + + case cudaErrorInvalidResourceHandle: + return "cudaErrorInvalidResourceHandle"; + + case cudaErrorNotReady: + return "cudaErrorNotReady"; + + case cudaErrorInsufficientDriver: + return "cudaErrorInsufficientDriver"; + + case cudaErrorSetOnActiveProcess: + return "cudaErrorSetOnActiveProcess"; + + case cudaErrorInvalidSurface: + return "cudaErrorInvalidSurface"; + + case cudaErrorNoDevice: + return "cudaErrorNoDevice"; + + case cudaErrorECCUncorrectable: + return "cudaErrorECCUncorrectable"; + + case cudaErrorSharedObjectSymbolNotFound: + return "cudaErrorSharedObjectSymbolNotFound"; + + case cudaErrorSharedObjectInitFailed: + return "cudaErrorSharedObjectInitFailed"; + + case cudaErrorUnsupportedLimit: + return "cudaErrorUnsupportedLimit"; + + case 
cudaErrorDuplicateVariableName: + return "cudaErrorDuplicateVariableName"; + + case cudaErrorDuplicateTextureName: + return "cudaErrorDuplicateTextureName"; + + case cudaErrorDuplicateSurfaceName: + return "cudaErrorDuplicateSurfaceName"; + + case cudaErrorDevicesUnavailable: + return "cudaErrorDevicesUnavailable"; + + case cudaErrorInvalidKernelImage: + return "cudaErrorInvalidKernelImage"; + + case cudaErrorNoKernelImageForDevice: + return "cudaErrorNoKernelImageForDevice"; + + case cudaErrorIncompatibleDriverContext: + return "cudaErrorIncompatibleDriverContext"; + + case cudaErrorPeerAccessAlreadyEnabled: + return "cudaErrorPeerAccessAlreadyEnabled"; + + case cudaErrorPeerAccessNotEnabled: + return "cudaErrorPeerAccessNotEnabled"; + + case cudaErrorDeviceAlreadyInUse: + return "cudaErrorDeviceAlreadyInUse"; + + case cudaErrorProfilerDisabled: + return "cudaErrorProfilerDisabled"; + + case cudaErrorProfilerNotInitialized: + return "cudaErrorProfilerNotInitialized"; + + case cudaErrorProfilerAlreadyStarted: + return "cudaErrorProfilerAlreadyStarted"; + + case cudaErrorProfilerAlreadyStopped: + return "cudaErrorProfilerAlreadyStopped"; + +#if __CUDA_API_VERSION >= 0x4000 + + case cudaErrorAssert: + return "cudaErrorAssert"; + + case cudaErrorTooManyPeers: + return "cudaErrorTooManyPeers"; + + case cudaErrorHostMemoryAlreadyRegistered: + return "cudaErrorHostMemoryAlreadyRegistered"; + + case cudaErrorHostMemoryNotRegistered: + return "cudaErrorHostMemoryNotRegistered"; +#endif + + case cudaErrorStartupFailure: + return "cudaErrorStartupFailure"; + + case cudaErrorApiFailureBase: + return "cudaErrorApiFailureBase"; + } + + return ""; +} +#endif + +#ifdef __cuda_cuda_h__ +// CUDA Driver API errors +static const char *_cudaGetErrorEnum(CUresult error) +{ + switch (error) + { + case CUDA_SUCCESS: + return "CUDA_SUCCESS"; + + case CUDA_ERROR_INVALID_VALUE: + return "CUDA_ERROR_INVALID_VALUE"; + + case CUDA_ERROR_OUT_OF_MEMORY: + return "CUDA_ERROR_OUT_OF_MEMORY"; + + 
case CUDA_ERROR_NOT_INITIALIZED: + return "CUDA_ERROR_NOT_INITIALIZED"; + + case CUDA_ERROR_DEINITIALIZED: + return "CUDA_ERROR_DEINITIALIZED"; + + case CUDA_ERROR_PROFILER_DISABLED: + return "CUDA_ERROR_PROFILER_DISABLED"; + + case CUDA_ERROR_PROFILER_NOT_INITIALIZED: + return "CUDA_ERROR_PROFILER_NOT_INITIALIZED"; + + case CUDA_ERROR_PROFILER_ALREADY_STARTED: + return "CUDA_ERROR_PROFILER_ALREADY_STARTED"; + + case CUDA_ERROR_PROFILER_ALREADY_STOPPED: + return "CUDA_ERROR_PROFILER_ALREADY_STOPPED"; + + case CUDA_ERROR_NO_DEVICE: + return "CUDA_ERROR_NO_DEVICE"; + + case CUDA_ERROR_INVALID_DEVICE: + return "CUDA_ERROR_INVALID_DEVICE"; + + case CUDA_ERROR_INVALID_IMAGE: + return "CUDA_ERROR_INVALID_IMAGE"; + + case CUDA_ERROR_INVALID_CONTEXT: + return "CUDA_ERROR_INVALID_CONTEXT"; + + case CUDA_ERROR_CONTEXT_ALREADY_CURRENT: + return "CUDA_ERROR_CONTEXT_ALREADY_CURRENT"; + + case CUDA_ERROR_MAP_FAILED: + return "CUDA_ERROR_MAP_FAILED"; + + case CUDA_ERROR_UNMAP_FAILED: + return "CUDA_ERROR_UNMAP_FAILED"; + + case CUDA_ERROR_ARRAY_IS_MAPPED: + return "CUDA_ERROR_ARRAY_IS_MAPPED"; + + case CUDA_ERROR_ALREADY_MAPPED: + return "CUDA_ERROR_ALREADY_MAPPED"; + + case CUDA_ERROR_NO_BINARY_FOR_GPU: + return "CUDA_ERROR_NO_BINARY_FOR_GPU"; + + case CUDA_ERROR_ALREADY_ACQUIRED: + return "CUDA_ERROR_ALREADY_ACQUIRED"; + + case CUDA_ERROR_NOT_MAPPED: + return "CUDA_ERROR_NOT_MAPPED"; + + case CUDA_ERROR_NOT_MAPPED_AS_ARRAY: + return "CUDA_ERROR_NOT_MAPPED_AS_ARRAY"; + + case CUDA_ERROR_NOT_MAPPED_AS_POINTER: + return "CUDA_ERROR_NOT_MAPPED_AS_POINTER"; + + case CUDA_ERROR_ECC_UNCORRECTABLE: + return "CUDA_ERROR_ECC_UNCORRECTABLE"; + + case CUDA_ERROR_UNSUPPORTED_LIMIT: + return "CUDA_ERROR_UNSUPPORTED_LIMIT"; + + case CUDA_ERROR_CONTEXT_ALREADY_IN_USE: + return "CUDA_ERROR_CONTEXT_ALREADY_IN_USE"; + + case CUDA_ERROR_INVALID_SOURCE: + return "CUDA_ERROR_INVALID_SOURCE"; + + case CUDA_ERROR_FILE_NOT_FOUND: + return "CUDA_ERROR_FILE_NOT_FOUND"; + + case 
CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND: + return "CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND"; + + case CUDA_ERROR_SHARED_OBJECT_INIT_FAILED: + return "CUDA_ERROR_SHARED_OBJECT_INIT_FAILED"; + + case CUDA_ERROR_OPERATING_SYSTEM: + return "CUDA_ERROR_OPERATING_SYSTEM"; + + case CUDA_ERROR_INVALID_HANDLE: + return "CUDA_ERROR_INVALID_HANDLE"; + + case CUDA_ERROR_NOT_FOUND: + return "CUDA_ERROR_NOT_FOUND"; + + case CUDA_ERROR_NOT_READY: + return "CUDA_ERROR_NOT_READY"; + + case CUDA_ERROR_LAUNCH_FAILED: + return "CUDA_ERROR_LAUNCH_FAILED"; + + case CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: + return "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES"; + + case CUDA_ERROR_LAUNCH_TIMEOUT: + return "CUDA_ERROR_LAUNCH_TIMEOUT"; + + case CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING: + return "CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING"; + + case CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED: + return "CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED"; + + case CUDA_ERROR_PEER_ACCESS_NOT_ENABLED: + return "CUDA_ERROR_PEER_ACCESS_NOT_ENABLED"; + + case CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE: + return "CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE"; + + case CUDA_ERROR_CONTEXT_IS_DESTROYED: + return "CUDA_ERROR_CONTEXT_IS_DESTROYED"; + + case CUDA_ERROR_ASSERT: + return "CUDA_ERROR_ASSERT"; + + case CUDA_ERROR_TOO_MANY_PEERS: + return "CUDA_ERROR_TOO_MANY_PEERS"; + + case CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED: + return "CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED"; + + case CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED: + return "CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED"; + + case CUDA_ERROR_UNKNOWN: + return "CUDA_ERROR_UNKNOWN"; + } + + return ""; +} +#endif + +#ifdef CUBLAS_API_H_ +// cuBLAS API errors +static const char *_cudaGetErrorEnum(cublasStatus_t error) +{ + switch (error) + { + case CUBLAS_STATUS_SUCCESS: + return "CUBLAS_STATUS_SUCCESS"; + + case CUBLAS_STATUS_NOT_INITIALIZED: + return "CUBLAS_STATUS_NOT_INITIALIZED"; + + case CUBLAS_STATUS_ALLOC_FAILED: + return "CUBLAS_STATUS_ALLOC_FAILED"; + + case CUBLAS_STATUS_INVALID_VALUE: 
+ return "CUBLAS_STATUS_INVALID_VALUE"; + + case CUBLAS_STATUS_ARCH_MISMATCH: + return "CUBLAS_STATUS_ARCH_MISMATCH"; + + case CUBLAS_STATUS_MAPPING_ERROR: + return "CUBLAS_STATUS_MAPPING_ERROR"; + + case CUBLAS_STATUS_EXECUTION_FAILED: + return "CUBLAS_STATUS_EXECUTION_FAILED"; + + case CUBLAS_STATUS_INTERNAL_ERROR: + return "CUBLAS_STATUS_INTERNAL_ERROR"; + } + + return ""; +} +#endif + +#ifdef _CUFFT_H_ +// cuFFT API errors +static const char *_cudaGetErrorEnum(cufftResult error) +{ + switch (error) + { + case CUFFT_SUCCESS: + return "CUFFT_SUCCESS"; + + case CUFFT_INVALID_PLAN: + return "CUFFT_INVALID_PLAN"; + + case CUFFT_ALLOC_FAILED: + return "CUFFT_ALLOC_FAILED"; + + case CUFFT_INVALID_TYPE: + return "CUFFT_INVALID_TYPE"; + + case CUFFT_INVALID_VALUE: + return "CUFFT_INVALID_VALUE"; + + case CUFFT_INTERNAL_ERROR: + return "CUFFT_INTERNAL_ERROR"; + + case CUFFT_EXEC_FAILED: + return "CUFFT_EXEC_FAILED"; + + case CUFFT_SETUP_FAILED: + return "CUFFT_SETUP_FAILED"; + + case CUFFT_INVALID_SIZE: + return "CUFFT_INVALID_SIZE"; + + case CUFFT_UNALIGNED_DATA: + return "CUFFT_UNALIGNED_DATA"; + } + + return ""; +} +#endif + + +#ifdef CUSPARSEAPI +// cuSPARSE API errors +static const char *_cudaGetErrorEnum(cusparseStatus_t error) +{ + switch (error) + { + case CUSPARSE_STATUS_SUCCESS: + return "CUSPARSE_STATUS_SUCCESS"; + + case CUSPARSE_STATUS_NOT_INITIALIZED: + return "CUSPARSE_STATUS_NOT_INITIALIZED"; + + case CUSPARSE_STATUS_ALLOC_FAILED: + return "CUSPARSE_STATUS_ALLOC_FAILED"; + + case CUSPARSE_STATUS_INVALID_VALUE: + return "CUSPARSE_STATUS_INVALID_VALUE"; + + case CUSPARSE_STATUS_ARCH_MISMATCH: + return "CUSPARSE_STATUS_ARCH_MISMATCH"; + + case CUSPARSE_STATUS_MAPPING_ERROR: + return "CUSPARSE_STATUS_MAPPING_ERROR"; + + case CUSPARSE_STATUS_EXECUTION_FAILED: + return "CUSPARSE_STATUS_EXECUTION_FAILED"; + + case CUSPARSE_STATUS_INTERNAL_ERROR: + return "CUSPARSE_STATUS_INTERNAL_ERROR"; + + case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED: + return 
"CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED"; + } + + return ""; +} +#endif + +#ifdef CURAND_H_ +// cuRAND API errors +static const char *_cudaGetErrorEnum(curandStatus_t error) +{ + switch (error) + { + case CURAND_STATUS_SUCCESS: + return "CURAND_STATUS_SUCCESS"; + + case CURAND_STATUS_VERSION_MISMATCH: + return "CURAND_STATUS_VERSION_MISMATCH"; + + case CURAND_STATUS_NOT_INITIALIZED: + return "CURAND_STATUS_NOT_INITIALIZED"; + + case CURAND_STATUS_ALLOCATION_FAILED: + return "CURAND_STATUS_ALLOCATION_FAILED"; + + case CURAND_STATUS_TYPE_ERROR: + return "CURAND_STATUS_TYPE_ERROR"; + + case CURAND_STATUS_OUT_OF_RANGE: + return "CURAND_STATUS_OUT_OF_RANGE"; + + case CURAND_STATUS_LENGTH_NOT_MULTIPLE: + return "CURAND_STATUS_LENGTH_NOT_MULTIPLE"; + + case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED: + return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED"; + + case CURAND_STATUS_LAUNCH_FAILURE: + return "CURAND_STATUS_LAUNCH_FAILURE"; + + case CURAND_STATUS_PREEXISTING_FAILURE: + return "CURAND_STATUS_PREEXISTING_FAILURE"; + + case CURAND_STATUS_INITIALIZATION_FAILED: + return "CURAND_STATUS_INITIALIZATION_FAILED"; + + case CURAND_STATUS_ARCH_MISMATCH: + return "CURAND_STATUS_ARCH_MISMATCH"; + + case CURAND_STATUS_INTERNAL_ERROR: + return "CURAND_STATUS_INTERNAL_ERROR"; + } + + return ""; +} +#endif + +#ifdef NV_NPPIDEFS_H +// NPP API errors +static const char *_cudaGetErrorEnum(NppStatus error) +{ + switch (error) + { + case NPP_NOT_SUPPORTED_MODE_ERROR: + return "NPP_NOT_SUPPORTED_MODE_ERROR"; + + case NPP_ROUND_MODE_NOT_SUPPORTED_ERROR: + return "NPP_ROUND_MODE_NOT_SUPPORTED_ERROR"; + + case NPP_RESIZE_NO_OPERATION_ERROR: + return "NPP_RESIZE_NO_OPERATION_ERROR"; + + case NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY: + return "NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY"; + + case NPP_BAD_ARG_ERROR: + return "NPP_BAD_ARG_ERROR"; + + case NPP_LUT_NUMBER_OF_LEVELS_ERROR: + return "NPP_LUT_NUMBER_OF_LEVELS_ERROR"; + + case NPP_TEXTURE_BIND_ERROR: + return "NPP_TEXTURE_BIND_ERROR"; + + case 
NPP_COEFF_ERROR: + return "NPP_COEFF_ERROR"; + + case NPP_RECT_ERROR: + return "NPP_RECT_ERROR"; + + case NPP_QUAD_ERROR: + return "NPP_QUAD_ERROR"; + + case NPP_WRONG_INTERSECTION_ROI_ERROR: + return "NPP_WRONG_INTERSECTION_ROI_ERROR"; + + case NPP_NOT_EVEN_STEP_ERROR: + return "NPP_NOT_EVEN_STEP_ERROR"; + + case NPP_INTERPOLATION_ERROR: + return "NPP_INTERPOLATION_ERROR"; + + case NPP_RESIZE_FACTOR_ERROR: + return "NPP_RESIZE_FACTOR_ERROR"; + + case NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR: + return "NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR"; + + case NPP_MEMFREE_ERR: + return "NPP_MEMFREE_ERR"; + + case NPP_MEMSET_ERR: + return "NPP_MEMSET_ERR"; + + case NPP_MEMCPY_ERROR: + return "NPP_MEMCPY_ERROR"; + + case NPP_MEM_ALLOC_ERR: + return "NPP_MEM_ALLOC_ERR"; + + case NPP_HISTO_NUMBER_OF_LEVELS_ERROR: + return "NPP_HISTO_NUMBER_OF_LEVELS_ERROR"; + + case NPP_MIRROR_FLIP_ERR: + return "NPP_MIRROR_FLIP_ERR"; + + case NPP_INVALID_INPUT: + return "NPP_INVALID_INPUT"; + + case NPP_ALIGNMENT_ERROR: + return "NPP_ALIGNMENT_ERROR"; + + case NPP_STEP_ERROR: + return "NPP_STEP_ERROR"; + + case NPP_SIZE_ERROR: + return "NPP_SIZE_ERROR"; + + case NPP_POINTER_ERROR: + return "NPP_POINTER_ERROR"; + + case NPP_NULL_POINTER_ERROR: + return "NPP_NULL_POINTER_ERROR"; + + case NPP_CUDA_KERNEL_EXECUTION_ERROR: + return "NPP_CUDA_KERNEL_EXECUTION_ERROR"; + + case NPP_NOT_IMPLEMENTED_ERROR: + return "NPP_NOT_IMPLEMENTED_ERROR"; + + case NPP_ERROR: + return "NPP_ERROR"; + + case NPP_SUCCESS: + return "NPP_SUCCESS"; + + case NPP_WARNING: + return "NPP_WARNING"; + + case NPP_WRONG_INTERSECTION_QUAD_WARNING: + return "NPP_WRONG_INTERSECTION_QUAD_WARNING"; + + case NPP_MISALIGNED_DST_ROI_WARNING: + return "NPP_MISALIGNED_DST_ROI_WARNING"; + + case NPP_AFFINE_QUAD_INCORRECT_WARNING: + return "NPP_AFFINE_QUAD_INCORRECT_WARNING"; + + case NPP_DOUBLE_SIZE_WARNING: + return "NPP_DOUBLE_SIZE_WARNING"; + + case NPP_ODD_ROI_WARNING: + return "NPP_ODD_ROI_WARNING"; + + case 
NPP_WRONG_INTERSECTION_ROI_WARNING: + return "NPP_WRONG_INTERSECTION_ROI_WARNING"; + } + + return ""; +} +#endif + +template< typename T > +bool check(T result, char const *const func, const char *const file, int const line) +{ + if (result) + { + fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n", + file, line, static_cast(result), _cudaGetErrorEnum(result), func); + /* + std::stringstream ss; + std::string msg("CUDA error at "); + msg += file; + msg += ":"; + ss << line; + msg += ss.str(); + msg += " code="; + ss << static_cast(result); + msg += ss.str(); + msg += " ("; + msg += _cudaGetErrorEnum(result); + msg += ") \""; + msg += func; + msg += "\""; + //throw msg; + std::cerr << msg <<"\n"; + */ + return true; + } + else + { + return false; + } +} + +#ifdef __DRIVER_TYPES_H__ +// This will output the proper CUDA error strings in the event that a CUDA host call returns an error +#define checkCudaErrors(val) check ( (val), #val, __FILE__, __LINE__ ) + +// This will output the proper error string when calling cudaGetLastError +#define getLastCudaError(msg) __getLastCudaError (msg, __FILE__, __LINE__) + +inline void __getLastCudaError(const char *errorMessage, const char *file, const int line) +{ + cudaError_t err = cudaGetLastError(); + + if (cudaSuccess != err) + { + fprintf(stderr, "%s(%i) : getLastCudaError() CUDA error : %s : (%d) %s.\n", + file, line, errorMessage, (int)err, cudaGetErrorString(err)); + exit(EXIT_FAILURE); + } +} +#endif + +#ifndef MAX +#define MAX(a,b) (a > b ? 
a : b) +#endif + +// Beginning of GPU Architecture definitions +inline int _ConvertSMVer2Cores(int major, int minor) +{ + // Defines for GPU Architecture types (using the SM version to determine the # of cores per SM + typedef struct + { + int SM; // 0xMm (hexidecimal notation), M = SM Major version, and m = SM minor version + int Cores; + } sSMtoCores; + + sSMtoCores nGpuArchCoresPerSM[] = + { + { 0x10, 8 }, // Tesla Generation (SM 1.0) G80 class + { 0x11, 8 }, // Tesla Generation (SM 1.1) G8x class + { 0x12, 8 }, // Tesla Generation (SM 1.2) G9x class + { 0x13, 8 }, // Tesla Generation (SM 1.3) GT200 class + { 0x20, 32 }, // Fermi Generation (SM 2.0) GF100 class + { 0x21, 48 }, // Fermi Generation (SM 2.1) GF10x class + { 0x30, 192}, // Kepler Generation (SM 3.0) GK10x class + { 0x35, 192}, // Kepler Generation (SM 3.5) GK11x class + { -1, -1 } + }; + + int index = 0; + + while (nGpuArchCoresPerSM[index].SM != -1) + { + if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor)) + { + return nGpuArchCoresPerSM[index].Cores; + } + + index++; + } + + // If we don't find the values, we default use the previous one to run properly + printf("MapSMtoCores for SM %d.%d is undefined. Default to use %d Cores/SM\n", major, minor, nGpuArchCoresPerSM[7].Cores); + return nGpuArchCoresPerSM[7].Cores; +} +// end of GPU Architecture definitions + +#ifdef __CUDA_RUNTIME_H__ +// General GPU Device CUDA Initialization +inline int gpuDeviceInit(int devID) +{ + int deviceCount; + checkCudaErrors(cudaGetDeviceCount(&deviceCount)); + + if (deviceCount == 0) + { + fprintf(stderr, "gpuDeviceInit() CUDA error: no devices supporting CUDA.\n"); + exit(EXIT_FAILURE); + } + + if (devID < 0) + { + devID = 0; + } + + if (devID > deviceCount-1) + { + fprintf(stderr, "\n"); + fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n", deviceCount); + fprintf(stderr, ">> gpuDeviceInit (-device=%d) is not a valid GPU device. 
<<\n", devID); + fprintf(stderr, "\n"); + return -devID; + } + + cudaDeviceProp deviceProp; + checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID)); + + if (deviceProp.computeMode == cudaComputeModeProhibited) + { + fprintf(stderr, "Error: device is running in , no threads can use ::cudaSetDevice().\n"); + return -1; + } + + if (deviceProp.major < 1) + { + fprintf(stderr, "gpuDeviceInit(): GPU device does not support CUDA.\n"); + exit(EXIT_FAILURE); + } + + checkCudaErrors(cudaSetDevice(devID)); + printf("gpuDeviceInit() CUDA Device [%d]: \"%s\n", devID, deviceProp.name); + + return devID; +} + +// This function returns the best GPU (with maximum GFLOPS) +inline int gpuGetMaxGflopsDeviceId() +{ + int current_device = 0, sm_per_multiproc = 0; + int max_compute_perf = 0, max_perf_device = 0; + int device_count = 0, best_SM_arch = 0; + cudaDeviceProp deviceProp; + cudaGetDeviceCount(&device_count); + + // Find the best major SM Architecture GPU device + while (current_device < device_count) + { + cudaGetDeviceProperties(&deviceProp, current_device); + + // If this GPU is not running on Compute Mode prohibited, then we can add it to the list + if (deviceProp.computeMode != cudaComputeModeProhibited) + { + if (deviceProp.major > 0 && deviceProp.major < 9999) + { + best_SM_arch = MAX(best_SM_arch, deviceProp.major); + } + } + + current_device++; + } + + // Find the best CUDA capable GPU device + current_device = 0; + + while (current_device < device_count) + { + cudaGetDeviceProperties(&deviceProp, current_device); + + // If this GPU is not running on Compute Mode prohibited, then we can add it to the list + if (deviceProp.computeMode != cudaComputeModeProhibited) + { + if (deviceProp.major == 9999 && deviceProp.minor == 9999) + { + sm_per_multiproc = 1; + } + else + { + sm_per_multiproc = _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor); + } + + int compute_perf = deviceProp.multiProcessorCount * sm_per_multiproc * deviceProp.clockRate; + + if 
(compute_perf > max_compute_perf) + { + // If we find GPU with SM major > 2, search only these + if (best_SM_arch > 2) + { + // If our device==dest_SM_arch, choose this, or else pass + if (deviceProp.major == best_SM_arch) + { + max_compute_perf = compute_perf; + max_perf_device = current_device; + } + } + else + { + max_compute_perf = compute_perf; + max_perf_device = current_device; + } + } + } + + ++current_device; + } + + return max_perf_device; +} + + +// Initialization code to find the best CUDA Device +inline int findCudaDevice(int argc, const char **argv) +{ + cudaDeviceProp deviceProp; + int devID = 0; + + // If the command-line has a device number specified, use it + if (checkCmdLineFlag(argc, argv, "device")) + { + devID = getCmdLineArgumentInt(argc, argv, "device="); + + if (devID < 0) + { + printf("Invalid command line parameter\n "); + exit(EXIT_FAILURE); + } + else + { + devID = gpuDeviceInit(devID); + + if (devID < 0) + { + printf("exiting...\n"); + exit(EXIT_FAILURE); + } + } + } + else + { + // Otherwise pick the device with highest Gflops/s + devID = gpuGetMaxGflopsDeviceId(); + checkCudaErrors(cudaSetDevice(devID)); + checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID)); + printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, deviceProp.name, deviceProp.major, deviceProp.minor); + } + + return devID; +} + +// General check for CUDA GPU SM Capabilities +inline bool checkCudaCapabilities(int major_version, int minor_version) +{ + cudaDeviceProp deviceProp; + deviceProp.major = 0; + deviceProp.minor = 0; + int dev; + + checkCudaErrors(cudaGetDevice(&dev)); + checkCudaErrors(cudaGetDeviceProperties(&deviceProp, dev)); + + if ((deviceProp.major > major_version) || + (deviceProp.major == major_version && deviceProp.minor >= minor_version)) + { + printf("> Device %d: <%16s >, Compute SM %d.%d detected\n", dev, deviceProp.name, deviceProp.major, deviceProp.minor); + return true; + } + else + { + printf("No GPU device was found 
that can support CUDA compute capability %d.%d.\n", major_version, minor_version); + return false; + } +} +#endif + +// end of CUDA Helper Functions + + +#endif diff --git a/src/Benchmarks/Sorting/ReferenceAlgorithms/helpers/helper_string.h b/src/Benchmarks/Sorting/ReferenceAlgorithms/helpers/helper_string.h new file mode 100644 index 000000000..62b7156bc --- /dev/null +++ b/src/Benchmarks/Sorting/ReferenceAlgorithms/helpers/helper_string.h @@ -0,0 +1,421 @@ +/** + * Copyright 1993-2012 NVIDIA Corporation. All rights reserved. + * + * Please refer to the NVIDIA end user license agreement (EULA) associated + * with this source code for terms and conditions that govern your use of + * this software. Any use, reproduction, disclosure, or distribution of + * this software and related documentation outside the terms of the EULA + * is strictly prohibited. + * + */ + +// These are helper functions for the SDK samples (string parsing, timers, etc) +#ifndef STRING_HELPER_H +#define STRING_HELPER_H + +#include +#include +#include +#include + +#ifdef _WIN32 +#ifndef STRCASECMP +#define STRCASECMP _stricmp +#endif +#ifndef STRNCASECMP +#define STRNCASECMP _strnicmp +#endif +#ifndef STRCPY +#define STRCPY(sFilePath, nLength, sPath) strcpy_s(sFilePath, nLength, sPath) +#endif + +#ifndef FOPEN +#define FOPEN(fHandle,filename,mode) fopen_s(&fHandle, filename, mode) +#endif +#ifndef FOPEN_FAIL +#define FOPEN_FAIL(result) (result != 0) +#endif +#ifndef SSCANF +#define SSCANF sscanf_s +#endif + +#else +#include +#include + +#ifndef STRCASECMP +#define STRCASECMP strcasecmp +#endif +#ifndef STRNCASECMP +#define STRNCASECMP strncasecmp +#endif +#ifndef STRCPY +#define STRCPY(sFilePath, nLength, sPath) strcpy(sFilePath, sPath) +#endif + +#ifndef FOPEN +#define FOPEN(fHandle,filename,mode) (fHandle = fopen(filename, mode)) +#endif +#ifndef FOPEN_FAIL +#define FOPEN_FAIL(result) (result == NULL) +#endif +#ifndef SSCANF +#define SSCANF sscanf +#endif +#endif + +// CUDA Utility Helper 
Functions +inline int stringRemoveDelimiter(char delimiter, const char *string) +{ + int string_start = 0; + + while (string[string_start] == delimiter) + { + string_start++; + } + + if (string_start >= (int)strlen(string)-1) + { + return 0; + } + + return string_start; +} + +inline int getFileExtension(char *filename, char **extension) +{ + int string_length = (int)strlen(filename); + + while (filename[string_length--] != '.') { + if (string_length == 0) + break; + } + if (string_length > 0) string_length += 2; + + if (string_length == 0) + *extension = NULL; + else + *extension = &filename[string_length]; + + return string_length; +} + + +inline int checkCmdLineFlag(const int argc, const char **argv, const char *string_ref) +{ + bool bFound = false; + + if (argc >= 1) + { + for (int i=1; i < argc; i++) + { + int string_start = stringRemoveDelimiter('-', argv[i]); + const char *string_argv = &argv[i][string_start]; + + const char *equal_pos = strchr(string_argv, '='); + int argv_length = (int)(equal_pos == 0 ? strlen(string_argv) : equal_pos - string_argv); + + int length = (int)strlen(string_ref); + + if (length == argv_length && !STRNCASECMP(string_argv, string_ref, length)) + { + + bFound = true; + continue; + } + } + } + + return (int)bFound; +} + +inline int getCmdLineArgumentInt(const int argc, const char **argv, const char *string_ref) +{ + bool bFound = false; + int value = -1; + + if (argc >= 1) + { + for (int i=1; i < argc; i++) + { + int string_start = stringRemoveDelimiter('-', argv[i]); + const char *string_argv = &argv[i][string_start]; + int length = (int)strlen(string_ref); + + if (!STRNCASECMP(string_argv, string_ref, length)) + { + if (length+1 <= (int)strlen(string_argv)) + { + int auto_inc = (string_argv[length] == '=') ? 
1 : 0; + value = atoi(&string_argv[length + auto_inc]); + } + else + { + value = 0; + } + + bFound = true; + continue; + } + } + } + + if (bFound) + { + return value; + } + else + { + return 0; + } +} + +inline float getCmdLineArgumentFloat(const int argc, const char **argv, const char *string_ref) +{ + bool bFound = false; + float value = -1; + + if (argc >= 1) + { + for (int i=1; i < argc; i++) + { + int string_start = stringRemoveDelimiter('-', argv[i]); + const char *string_argv = &argv[i][string_start]; + int length = (int)strlen(string_ref); + + if (!STRNCASECMP(string_argv, string_ref, length)) + { + if (length+1 <= (int)strlen(string_argv)) + { + int auto_inc = (string_argv[length] == '=') ? 1 : 0; + value = (float)atof(&string_argv[length + auto_inc]); + } + else + { + value = 0.f; + } + + bFound = true; + continue; + } + } + } + + if (bFound) + { + return value; + } + else + { + return 0; + } +} + +inline bool getCmdLineArgumentString(const int argc, const char **argv, + const char *string_ref, char **string_retval) +{ + bool bFound = false; + + if (argc >= 1) + { + for (int i=1; i < argc; i++) + { + int string_start = stringRemoveDelimiter('-', argv[i]); + char *string_argv = (char *)&argv[i][string_start]; + int length = (int)strlen(string_ref); + + if (!STRNCASECMP(string_argv, string_ref, length)) + { + *string_retval = &string_argv[length+1]; + bFound = true; + continue; + } + } + } + + if (!bFound) + { + *string_retval = NULL; + } + + return bFound; +} + +////////////////////////////////////////////////////////////////////////////// +//! Find the path for a file assuming that +//! files are found in the searchPath. +//! +//! @return the path if succeeded, otherwise 0 +//! @param filename name of the file +//! 
@param executable_path optional absolute path of the executable +////////////////////////////////////////////////////////////////////////////// +inline char *sdkFindFilePath(const char *filename, const char *executable_path) +{ + // defines a variable that is replaced with the name of the executable + + // Typical relative search paths to locate needed companion files (e.g. sample input data, or JIT source files) + // The origin for the relative search may be the .exe file, a .bat file launching an .exe, a browser .exe launching the .exe or .bat, etc + const char *searchPath[] = + { + "./", // same dir + "./common/", // "/common/" subdir + "./common/data/", // "/common/data/" subdir + "./data/", // "/data/" subdir + "./src/", // "/src/" subdir + "./src//data/", // "/src//data/" subdir + "./inc/", // "/inc/" subdir + "./0_Simple/", // "/0_Simple/" subdir + "./1_Utilities/", // "/1_Utilities/" subdir + "./2_Graphics/", // "/2_Graphics/" subdir + "./3_Imaging/", // "/3_Imaging/" subdir + "./4_Financial/", // "/4_Financial/" subdir + "./5_Simulations/", // "/5_Simulations/" subdir + "./6_Advanced/", // "/6_Advanced/" subdir + "./7_CUDALibraries/", // "/7_CUDALibraries/" subdir + + "../", // up 1 in tree + "../common/", // up 1 in tree, "/common/" subdir + "../common/data/", // up 1 in tree, "/common/data/" subdir + "../data/", // up 1 in tree, "/data/" subdir + "../src/", // up 1 in tree, "/src/" subdir + "../inc/", // up 1 in tree, "/inc/" subdir + "../C/src//", // up 1 in tree, "/C/src//" subdir + "../C/src//data/", // up 1 in tree, "/C/src//data/" subdir + "../C/src//src/", // up 1 in tree, "/C/src//src/" subdir + "../C/src//inc/", // up 1 in tree, "/C/src//inc/" subdir + "../C/", // up 1 in tree + "../C/common/", // up 1 in tree, "/common/" subdir + "../C/common/data/", // up 1 in tree, "/common/data/" subdir + "../C/data/", // up 1 in tree, "/data/" subdir + "../C/src/", // up 1 in tree, "/src/" subdir + "../C/inc/", // up 1 in tree, "/inc/" subdir + 
"../C/0_Simple//data/", // up 1 in tree, "/0_Simple//" subdir + "../C/1_Utilities//data/", // up 1 in tree, "/1_Utilities//" subdir + "../C/2_Graphics//data/", // up 1 in tree, "/2_Graphics//" subdir + "../C/3_Imaging//data/", // up 1 in tree, "/3_Imaging//" subdir + "../C/4_Financial//data/", // up 1 in tree, "/4_Financial//" subdir + "../C/5_Simulations//data/", // up 1 in tree, "/5_Simulations//" subdir + "../C/6_Advanced//data/", // up 1 in tree, "/6_Advanced//" subdir + "../C/7_CUDALibraries//data/", // up 1 in tree, "/7_CUDALibraries//" subdir + + "../0_Simple//data/", // up 1 in tree, "/0_Simple//" subdir + "../1_Utilities//data/", // up 1 in tree, "/1_Utilities//" subdir + "../2_Graphics//data/", // up 1 in tree, "/2_Graphics//" subdir + "../3_Imaging//data/", // up 1 in tree, "/3_Imaging//" subdir + "../4_Financial//data/", // up 1 in tree, "/4_Financial//" subdir + "../5_Simulations//data/", // up 1 in tree, "/5_Simulations//" subdir + "../6_Advanced//data/", // up 1 in tree, "/6_Advanced//" subdir + "../7_CUDALibraries//data/", // up 1 in tree, "/7_CUDALibraries//" subdir + "../../", // up 2 in tree + "../../common/", // up 2 in tree, "/common/" subdir + "../../common/data/", // up 2 in tree, "/common/data/" subdir + "../../data/", // up 2 in tree, "/data/" subdir + "../../src/", // up 2 in tree, "/src/" subdir + "../../inc/", // up 2 in tree, "/inc/" subdir + "../../sandbox//data/", // up 2 in tree, "/sandbox//" subdir + "../../0_Simple//data/", // up 2 in tree, "/0_Simple//" subdir + "../../1_Utilities//data/", // up 2 in tree, "/1_Utilities//" subdir + "../../2_Graphics//data/", // up 2 in tree, "/2_Graphics//" subdir + "../../3_Imaging//data/", // up 2 in tree, "/3_Imaging//" subdir + "../../4_Financial//data/", // up 2 in tree, "/4_Financial//" subdir + "../../5_Simulations//data/", // up 2 in tree, "/5_Simulations//" subdir + "../../6_Advanced//data/", // up 2 in tree, "/6_Advanced//" subdir + "../../7_CUDALibraries//data/", // up 2 in tree, 
"/7_CUDALibraries//" subdir + "../../../", // up 3 in tree + "../../../src//", // up 3 in tree, "/src//" subdir + "../../../src//data/", // up 3 in tree, "/src//data/" subdir + "../../../src//src/", // up 3 in tree, "/src//src/" subdir + "../../../src//inc/", // up 3 in tree, "/src//inc/" subdir + "../../../sandbox//", // up 3 in tree, "/sandbox//" subdir + "../../../sandbox//data/", // up 3 in tree, "/sandbox//data/" subdir + "../../../sandbox//src/", // up 3 in tree, "/sandbox//src/" subdir + "../../../sandbox//inc/", // up 3 in tree, "/sandbox//inc/" subdir + "../../../0_Simple//data/", // up 3 in tree, "/0_Simple//" subdir + "../../../1_Utilities//data/", // up 3 in tree, "/1_Utilities//" subdir + "../../../2_Graphics//data/", // up 3 in tree, "/2_Graphics//" subdir + "../../../3_Imaging//data/", // up 3 in tree, "/3_Imaging//" subdir + "../../../4_Financial//data/", // up 3 in tree, "/4_Financial//" subdir + "../../../5_Simulations//data/",// up 3 in tree, "/5_Simulations//" subdir + "../../../6_Advanced//data/", // up 3 in tree, "/6_Advanced//" subdir + "../../../7_CUDALibraries//data/", // up 3 in tree, "/7_CUDALibraries//" subdir + "../../../common/", // up 3 in tree, "../../../common/" subdir + "../../../common/data/", // up 3 in tree, "../../../common/data/" subdir + "../../../data/", // up 3 in tree, "../../../data/" subdir + }; + + // Extract the executable name + std::string executable_name; + + if (executable_path != 0) + { + executable_name = std::string(executable_path); + +#ifdef _WIN32 + // Windows path delimiter + size_t delimiter_pos = executable_name.find_last_of('\\'); + executable_name.erase(0, delimiter_pos + 1); + + if (executable_name.rfind(".exe") != std::string::npos) + { + // we strip .exe, only if the .exe is found + executable_name.resize(executable_name.size() - 4); + } + +#else + // Linux & OSX path delimiter + size_t delimiter_pos = executable_name.find_last_of('/'); + executable_name.erase(0,delimiter_pos+1); +#endif + } + + // 
Loop over all search paths and return the first hit + for (unsigned int i = 0; i < sizeof(searchPath)/sizeof(char *); ++i) + { + std::string path(searchPath[i]); + size_t executable_name_pos = path.find(""); + + // If there is executable_name variable in the searchPath + // replace it with the value + if (executable_name_pos != std::string::npos) + { + if (executable_path != 0) + { + path.replace(executable_name_pos, strlen(""), executable_name); + } + else + { + // Skip this path entry if no executable argument is given + continue; + } + } + +#ifdef _DEBUG + printf("sdkFindFilePath <%s> in %s\n", filename, path.c_str()); +#endif + + // Test if the file exists + path.append(filename); + FILE *fp; + FOPEN(fp, path.c_str(), "rb"); + + if (fp != NULL) + { + fclose(fp); + // File found + // returning an allocated array here for backwards compatibility reasons + char *file_path = (char *) malloc(path.length() + 1); + STRCPY(file_path, path.length() + 1, path.c_str()); + return file_path; + } + + if (fp) + { + fclose(fp); + } + } + + // File not found + return 0; +} + +#endif diff --git a/src/Benchmarks/Sorting/ReferenceAlgorithms/helpers/helper_timer.h b/src/Benchmarks/Sorting/ReferenceAlgorithms/helpers/helper_timer.h new file mode 100644 index 000000000..3cb4fece4 --- /dev/null +++ b/src/Benchmarks/Sorting/ReferenceAlgorithms/helpers/helper_timer.h @@ -0,0 +1,495 @@ +/** + * Copyright 1993-2012 NVIDIA Corporation. All rights reserved. + * + * Please refer to the NVIDIA end user license agreement (EULA) associated + * with this source code for terms and conditions that govern your use of + * this software. Any use, reproduction, disclosure, or distribution of + * this software and related documentation outside the terms of the EULA + * is strictly prohibited. 
+ * + */ + +// Helper Timing Functions +#ifndef HELPER_TIMER_H +#define HELPER_TIMER_H + +// includes, system +#include + +// includes, project +#include "exception.h" + +// Definition of the StopWatch Interface, this is used if we don't want to use the CUT functions +// But rather in a self contained class interface +class StopWatchInterface +{ + public: + StopWatchInterface() {}; + virtual ~StopWatchInterface() {}; + + public: + //! Start time measurement + virtual void start() = 0; + + //! Stop time measurement + virtual void stop() = 0; + + //! Reset time counters to zero + virtual void reset() = 0; + + //! Time in msec. after start. If the stop watch is still running (i.e. there + //! was no call to stop()) then the elapsed time is returned, otherwise the + //! time between the last start() and stop call is returned + virtual float getTime() = 0; + + //! Mean time to date based on the number of times the stopwatch has been + //! _stopped_ (ie finished sessions) and the current total time + virtual float getAverageTime() = 0; +}; + + +////////////////////////////////////////////////////////////////// +// Begin Stopwatch timer class definitions for all OS platforms // +////////////////////////////////////////////////////////////////// +#ifdef _WIN32 +// includes, system +#define WINDOWS_LEAN_AND_MEAN +#include +#undef min +#undef max + +//! Windows specific implementation of StopWatch +class StopWatchWin : public StopWatchInterface +{ + public: + //! Constructor, default + StopWatchWin() : + start_time(), end_time(), + diff_time(0.0f), total_time(0.0f), + running(false), clock_sessions(0), freq(0), freq_set(false) + { + if (! 
freq_set) + { + // helper variable + LARGE_INTEGER temp; + + // get the tick frequency from the OS + QueryPerformanceFrequency((LARGE_INTEGER *) &temp); + + // convert to type in which it is needed + freq = ((double) temp.QuadPart) / 1000.0; + + // rememeber query + freq_set = true; + } + }; + + // Destructor + ~StopWatchWin() { }; + + public: + //! Start time measurement + inline void start(); + + //! Stop time measurement + inline void stop(); + + //! Reset time counters to zero + inline void reset(); + + //! Time in msec. after start. If the stop watch is still running (i.e. there + //! was no call to stop()) then the elapsed time is returned, otherwise the + //! time between the last start() and stop call is returned + inline float getTime(); + + //! Mean time to date based on the number of times the stopwatch has been + //! _stopped_ (ie finished sessions) and the current total time + inline float getAverageTime(); + + private: + // member variables + + //! Start of measurement + LARGE_INTEGER start_time; + //! End of measurement + LARGE_INTEGER end_time; + + //! Time difference between the last start and stop + float diff_time; + + //! TOTAL time difference between starts and stops + float total_time; + + //! flag if the stop watch is running + bool running; + + //! Number of times clock has been started + //! and stopped to allow averaging + int clock_sessions; + + //! tick frequency + double freq; + + //! flag if the frequency has been set + bool freq_set; +}; + +// functions, inlined + +//////////////////////////////////////////////////////////////////////////////// +//! Start time measurement +//////////////////////////////////////////////////////////////////////////////// +inline void +StopWatchWin::start() +{ + QueryPerformanceCounter((LARGE_INTEGER *) &start_time); + running = true; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Stop time measurement and increment add to the current diff_time summation +//! 
variable. Also increment the number of times this clock has been run. +//////////////////////////////////////////////////////////////////////////////// +inline void +StopWatchWin::stop() +{ + QueryPerformanceCounter((LARGE_INTEGER *) &end_time); + diff_time = (float) + (((double) end_time.QuadPart - (double) start_time.QuadPart) / freq); + + total_time += diff_time; + clock_sessions++; + running = false; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Reset the timer to 0. Does not change the timer running state but does +//! recapture this point in time as the current start time if it is running. +//////////////////////////////////////////////////////////////////////////////// +inline void +StopWatchWin::reset() +{ + diff_time = 0; + total_time = 0; + clock_sessions = 0; + + if (running) + { + QueryPerformanceCounter((LARGE_INTEGER *) &start_time); + } +} + + +//////////////////////////////////////////////////////////////////////////////// +//! Time in msec. after start. If the stop watch is still running (i.e. there +//! was no call to stop()) then the elapsed time is returned added to the +//! current diff_time sum, otherwise the current summed time difference alone +//! is returned. +//////////////////////////////////////////////////////////////////////////////// +inline float +StopWatchWin::getTime() +{ + // Return the TOTAL time to date + float retval = total_time; + + if (running) + { + LARGE_INTEGER temp; + QueryPerformanceCounter((LARGE_INTEGER *) &temp); + retval += (float) + (((double)(temp.QuadPart - start_time.QuadPart)) / freq); + } + + return retval; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Time in msec. for a single run based on the total number of COMPLETED runs +//! and the total time. +//////////////////////////////////////////////////////////////////////////////// +inline float +StopWatchWin::getAverageTime() +{ + return (clock_sessions > 0) ? 
(total_time/clock_sessions) : 0.0f; +} +#else +// Declarations for Stopwatch on Linux and Mac OSX +// includes, system +#include +#include + +//! Windows specific implementation of StopWatch +class StopWatchLinux : public StopWatchInterface +{ + public: + //! Constructor, default + StopWatchLinux() : + start_time(), diff_time(0.0), total_time(0.0), + running(false), clock_sessions(0) + { }; + + // Destructor + virtual ~StopWatchLinux() + { }; + + public: + //! Start time measurement + inline void start(); + + //! Stop time measurement + inline void stop(); + + //! Reset time counters to zero + inline void reset(); + + //! Time in msec. after start. If the stop watch is still running (i.e. there + //! was no call to stop()) then the elapsed time is returned, otherwise the + //! time between the last start() and stop call is returned + inline float getTime(); + + //! Mean time to date based on the number of times the stopwatch has been + //! _stopped_ (ie finished sessions) and the current total time + inline float getAverageTime(); + + private: + + // helper functions + + //! Get difference between start time and current time + inline float getDiffTime(); + + private: + + // member variables + + //! Start of measurement + struct timeval start_time; + + //! Time difference between the last start and stop + float diff_time; + + //! TOTAL time difference between starts and stops + float total_time; + + //! flag if the stop watch is running + bool running; + + //! Number of times clock has been started + //! and stopped to allow averaging + int clock_sessions; +}; + +// functions, inlined + +//////////////////////////////////////////////////////////////////////////////// +//! Start time measurement +//////////////////////////////////////////////////////////////////////////////// +inline void +StopWatchLinux::start() +{ + gettimeofday(&start_time, 0); + running = true; +} + +//////////////////////////////////////////////////////////////////////////////// +//! 
Stop time measurement and increment add to the current diff_time summation +//! variable. Also increment the number of times this clock has been run. +//////////////////////////////////////////////////////////////////////////////// +inline void +StopWatchLinux::stop() +{ + diff_time = getDiffTime(); + total_time += diff_time; + running = false; + clock_sessions++; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Reset the timer to 0. Does not change the timer running state but does +//! recapture this point in time as the current start time if it is running. +//////////////////////////////////////////////////////////////////////////////// +inline void +StopWatchLinux::reset() +{ + diff_time = 0; + total_time = 0; + clock_sessions = 0; + + if (running) + { + gettimeofday(&start_time, 0); + } +} + +//////////////////////////////////////////////////////////////////////////////// +//! Time in msec. after start. If the stop watch is still running (i.e. there +//! was no call to stop()) then the elapsed time is returned added to the +//! current diff_time sum, otherwise the current summed time difference alone +//! is returned. +//////////////////////////////////////////////////////////////////////////////// +inline float +StopWatchLinux::getTime() +{ + // Return the TOTAL time to date + float retval = total_time; + + if (running) + { + retval += getDiffTime(); + } + + return retval; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Time in msec. for a single run based on the total number of COMPLETED runs +//! and the total time. +//////////////////////////////////////////////////////////////////////////////// +inline float +StopWatchLinux::getAverageTime() +{ + return (clock_sessions > 0) ? 
(total_time/clock_sessions) : 0.0f; +} +//////////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////////// +inline float +StopWatchLinux::getDiffTime() +{ + struct timeval t_time; + gettimeofday(&t_time, 0); + + // time difference in milli-seconds + return (float)(1000.0 * (t_time.tv_sec - start_time.tv_sec) + + (0.001 * (t_time.tv_usec - start_time.tv_usec))); +} +#endif // _WIN32 + +//////////////////////////////////////////////////////////////////////////////// +//! Timer functionality exported + +//////////////////////////////////////////////////////////////////////////////// +//! Create a new timer +//! @return true if a time has been created, otherwise false +//! @param name of the new timer, 0 if the creation failed +//////////////////////////////////////////////////////////////////////////////// +inline bool +sdkCreateTimer(StopWatchInterface **timer_interface) +{ + //printf("sdkCreateTimer called object %08x\n", (void *)*timer_interface); +#ifdef _WIN32 + *timer_interface = (StopWatchInterface *)new StopWatchWin(); +#else + *timer_interface = (StopWatchInterface *)new StopWatchLinux(); +#endif + return (*timer_interface != NULL) ? true : false; +} + + +//////////////////////////////////////////////////////////////////////////////// +//! Delete a timer +//! @return true if a time has been deleted, otherwise false +//! @param name of the timer to delete +//////////////////////////////////////////////////////////////////////////////// +inline bool +sdkDeleteTimer(StopWatchInterface **timer_interface) +{ + //printf("sdkDeleteTimer called object %08x\n", (void *)*timer_interface); + if (*timer_interface) + { + delete *timer_interface; + *timer_interface = NULL; + } + + return true; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Start the time with name \a name +//! 
@param name name of the timer to start +//////////////////////////////////////////////////////////////////////////////// +inline bool +sdkStartTimer(StopWatchInterface **timer_interface) +{ + //printf("sdkStartTimer called object %08x\n", (void *)*timer_interface); + if (*timer_interface) + { + (*timer_interface)->start(); + } + + return true; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Stop the time with name \a name. Does not reset. +//! @param name name of the timer to stop +//////////////////////////////////////////////////////////////////////////////// +inline bool +sdkStopTimer(StopWatchInterface **timer_interface) +{ + // printf("sdkStopTimer called object %08x\n", (void *)*timer_interface); + if (*timer_interface) + { + (*timer_interface)->stop(); + } + + return true; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Resets the timer's counter. +//! @param name name of the timer to reset. +//////////////////////////////////////////////////////////////////////////////// +inline bool +sdkResetTimer(StopWatchInterface **timer_interface) +{ + // printf("sdkResetTimer called object %08x\n", (void *)*timer_interface); + if (*timer_interface) + { + (*timer_interface)->reset(); + } + + return true; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Return the average time for timer execution as the total time +//! for the timer dividied by the number of completed (stopped) runs the timer +//! has made. +//! Excludes the current running time if the timer is currently running. +//! 
@param name name of the timer to return the time of +//////////////////////////////////////////////////////////////////////////////// +inline float +sdkGetAverageTimerValue(StopWatchInterface **timer_interface) +{ + // printf("sdkGetAverageTimerValue called object %08x\n", (void *)*timer_interface); + if (*timer_interface) + { + return (*timer_interface)->getAverageTime(); + } + else + { + return 0.0f; + } +} + +//////////////////////////////////////////////////////////////////////////////// +//! Total execution time for the timer over all runs since the last reset +//! or timer creation. +//! @param name name of the timer to obtain the value of. +//////////////////////////////////////////////////////////////////////////////// +inline float +sdkGetTimerValue(StopWatchInterface **timer_interface) +{ + // printf("sdkGetTimerValue called object %08x\n", (void *)*timer_interface); + if (*timer_interface) + { + return (*timer_interface)->getTime(); + } + else + { + return 0.0f; + } +} + +#endif // HELPER_TIMER_H diff --git a/src/Benchmarks/Sorting/ReferenceAlgorithms/helpers/scan_common.h b/src/Benchmarks/Sorting/ReferenceAlgorithms/helpers/scan_common.h new file mode 100644 index 000000000..80b93d574 --- /dev/null +++ b/src/Benchmarks/Sorting/ReferenceAlgorithms/helpers/scan_common.h @@ -0,0 +1,61 @@ +/* + * Copyright 1993-2012 NVIDIA Corporation. All rights reserved. + * + * Please refer to the NVIDIA end user license agreement (EULA) associated + * with this source code for terms and conditions that govern your use of + * this software. Any use, reproduction, disclosure, or distribution of + * this software and related documentation outside the terms of the EULA + * is strictly prohibited. 
+ * + */ + +#ifndef SCAN_COMMON_H +#define SCAN_COMMON_H + +#include + +//////////////////////////////////////////////////////////////////////////////// +// Shortcut typename +//////////////////////////////////////////////////////////////////////////////// +typedef unsigned int uint; + +//////////////////////////////////////////////////////////////////////////////// +// Implementation limits +//////////////////////////////////////////////////////////////////////////////// +extern "C" const uint MAX_BATCH_ELEMENTS; +extern "C" const uint MIN_SHORT_ARRAY_SIZE; +extern "C" const uint MAX_SHORT_ARRAY_SIZE; +extern "C" const uint MIN_LARGE_ARRAY_SIZE; +extern "C" const uint MAX_LARGE_ARRAY_SIZE; + +//////////////////////////////////////////////////////////////////////////////// +// CUDA scan +//////////////////////////////////////////////////////////////////////////////// +extern "C" void initScan(void); +extern "C" void closeScan(void); + +extern "C" size_t scanExclusiveShort( + uint *d_Dst, + uint *d_Src, + uint batchSize, + uint arrayLength +); + +extern "C" size_t scanExclusiveLarge( + uint *d_Dst, + uint *d_Src, + uint batchSize, + uint arrayLength +); + +//////////////////////////////////////////////////////////////////////////////// +// Reference CPU scan +//////////////////////////////////////////////////////////////////////////////// +extern "C" void scanExclusiveHost( + uint *dst, + uint *src, + uint batchSize, + uint arrayLength +); + +#endif diff --git a/GPUSort/otherGPUsorts/manca_quicksort_extracted/manca_quicksort.cu b/src/Benchmarks/Sorting/ReferenceAlgorithms/manca_quicksort.h similarity index 99% rename from GPUSort/otherGPUsorts/manca_quicksort_extracted/manca_quicksort.cu rename to src/Benchmarks/Sorting/ReferenceAlgorithms/manca_quicksort.h index 77bdff82c..bc7fe0d09 100644 --- a/GPUSort/otherGPUsorts/manca_quicksort_extracted/manca_quicksort.cu +++ b/src/Benchmarks/Sorting/ReferenceAlgorithms/manca_quicksort.h @@ -217,8 +217,8 @@ inline __device__ 
void compareInclusive(Type &idata, Type &idata2, volatile Type } #include -#include -#include +#include "helpers/helper_cuda.h" +#include "helpers/scan_common.h" //All three kernels run 512 threads per workgroup //Must be a power of two @@ -654,9 +654,9 @@ size_t scanInclusiveLarge( #include -#include -#include -#include +#include "helpers/helper_cuda.h" +#include "helpers/helper_timer.h" +#include "helpers/scan_common.h" extern __shared__ uint sMemory[]; @@ -1121,7 +1121,7 @@ void sort(Type *ddata, Type *outputData, uint size, uint threadCount, int device uint nblock = 10 * blocks; int partition_max = 262144; - unsigned long long int total = partition_max * sizeof(Block) + nblock * sizeof(Partition) + 2 * partition_max * sizeof(uint) + 3 * (size) * sizeof(Type); + //unsigned long long int total = partition_max * sizeof(Block) + nblock * sizeof(Partition) + 2 * partition_max * sizeof(uint) + 3 * (size) * sizeof(Type); //Allocating and initializing CUDA arrays sdkCreateTimer(&htimer); diff --git a/GPUSort/GPUSort/benchmark/bitonic_benchmark/benchmark.cu b/src/Benchmarks/Sorting/bitonicsortBenchmark.cu similarity index 100% rename from GPUSort/GPUSort/benchmark/bitonic_benchmark/benchmark.cu rename to src/Benchmarks/Sorting/bitonicsortBenchmark.cu diff --git a/GPUSort/GPUSort/benchmark/generators.cpp b/src/Benchmarks/Sorting/generators.h similarity index 100% rename from GPUSort/GPUSort/benchmark/generators.cpp rename to src/Benchmarks/Sorting/generators.h diff --git a/GPUSort/GPUSort/src/util/timer.h b/src/Benchmarks/Sorting/timer.h similarity index 100% rename from GPUSort/GPUSort/src/util/timer.h rename to src/Benchmarks/Sorting/timer.h diff --git a/src/Benchmarks/Sorting/tnl-benchmark-sort.cpp b/src/Benchmarks/Sorting/tnl-benchmark-sort.cpp new file mode 100644 index 000000000..489a9497a --- /dev/null +++ b/src/Benchmarks/Sorting/tnl-benchmark-sort.cpp @@ -0,0 +1 @@ +#include "tnl-benchmark-sort.h" diff --git a/src/Benchmarks/Sorting/tnl-benchmark-sort.cu 
b/src/Benchmarks/Sorting/tnl-benchmark-sort.cu new file mode 120000 index 000000000..26b452a61 --- /dev/null +++ b/src/Benchmarks/Sorting/tnl-benchmark-sort.cu @@ -0,0 +1 @@ +tnl-benchmark-sort.cpp \ No newline at end of file diff --git a/src/Benchmarks/Sorting/tnl-benchmark-sort.h b/src/Benchmarks/Sorting/tnl-benchmark-sort.h new file mode 100644 index 000000000..1432a25f4 --- /dev/null +++ b/src/Benchmarks/Sorting/tnl-benchmark-sort.h @@ -0,0 +1,113 @@ +#include +#include +#include +#include +#include +using namespace std; + +#include "generators.h" +#include "Measurer.h" + +#ifndef LOW_POW + #define LOW_POW 10 +#endif + +#ifndef HIGH_POW + #define HIGH_POW 25 +#endif + +#ifndef TRIES + #define TRIES 20 +#endif + +using namespace TNL; + + +template< typename Sorter > +void start(ostream & out, string delim) +{ + out << "size" << delim; + out << "random" << delim; + out << "shuffle" << delim; + out << "sorted" << delim; + out << "almost" << delim; + out << "decreas" << delim; + out << "gauss" << delim; + out << "bucket" << delim; + out << "stagger" << delim; + out << "zero_entropy"; + out << endl; + + int wrongAnsCnt = 0; + + for(int pow = LOW_POW; pow <= HIGH_POW; pow++) + { + int size =(1<< pow); + vector vec(size); + + out << "2^" << pow << delim << flush; + out << fixed << setprecision(3); + + out << Measurer< Sorter >::measure( generateRandom(size), TRIES, wrongAnsCnt); + out << delim << flush; + + out << Measurer< Sorter >::measure( generateShuffle(size), TRIES, wrongAnsCnt); + out << delim << flush; + + out << Measurer< Sorter >::measure( generateSorted(size), TRIES, wrongAnsCnt); + out << delim << flush; + + out << Measurer< Sorter >::measure( generateAlmostSorted(size), TRIES, wrongAnsCnt); + out << delim << flush; + + out << Measurer< Sorter >::measure( generateDecreasing(size), TRIES, wrongAnsCnt); + out << delim << flush; + + out << Measurer< Sorter >::measure( generateGaussian(size), TRIES, wrongAnsCnt) ; + out << delim << flush; + + out << Measurer< 
Sorter >::measure( generateBucket(size), TRIES, wrongAnsCnt); + out << delim << flush; + + out << Measurer< Sorter >::measure( generateStaggered(size), TRIES, wrongAnsCnt); + out << delim << flush; + + out << Measurer< Sorter >::measure( generateZero_entropy(size), TRIES, wrongAnsCnt); + out << endl; + } + + if(wrongAnsCnt > 0) + std::cerr << wrongAnsCnt << "tries were sorted incorrectly" << std::endl; +} + +int main(int argc, char *argv[]) +{ + if(argc == 1) + { + std::cout << "STL sort on CPU ... " << std::endl; + start< STLSorter >( cout, "\t" ); + std::cout << "Quicksort on GPU ... " << std::endl; + start< QuicksortSorter >(cout, "\t"); + std::cout << "Bitonic sort on GPU ... " << std::endl; + start< BitonicSortSorter >( cout, "\t" ); + std::cout << "Manca quicksort on GPU ... " << std::endl; + start< MancaQuicksortSorter >( cout, "\t" ); + std::cout << "Cederman quicksort on GPU ... " << std::endl; + start< CedermanQuicksortSorter >( cout, "\t" ); + } + else + { + std::ofstream out(argv[1]); + std::cout << "STL sort on CPU ... " << std::endl; + start< STLSorter >( out, "," ); + std::cout << "Quicksort on GPU ... " << std::endl; + start< QuicksortSorter >(out, ","); + std::cout << "Bitonic sort on GPU ... " << std::endl; + start< BitonicSortSorter >(out, ","); + std::cout << "Manca quicksort on GPU ... " << std::endl; + start< MancaQuicksortSorter >( out, "," ); + std::cout << "Cederman quicksort on GPU ... " << std::endl; + start< CedermanQuicksortSorter >( out, "," ); + } + return 0; +} diff --git a/src/TNL/Algorithms/Sort.h b/src/TNL/Algorithms/Sort.h new file mode 100644 index 000000000..0600977cf --- /dev/null +++ b/src/TNL/Algorithms/Sort.h @@ -0,0 +1,30 @@ +/*************************************************************************** + Sort.h - description + ------------------- + begin : Jul 12, 2021 + copyright : (C) 2021 by Tomas Oberhuber et al. 
+ email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Tomas Oberhuber, Xuan Thang Nguyen + +#pragma once + +#include // std::pair, std::forward + +#include +#include +#include +#include +#include +#include + +namespace TNL { + namespace Algorithms { + + + + } // namespace Algorithms +} // namespace TNL \ No newline at end of file diff --git a/GPUSort/GPUSort/src/util/algorithm.h b/src/TNL/Algorithms/detail/Sorting/algorithm.h similarity index 85% rename from GPUSort/GPUSort/src/util/algorithm.h rename to src/TNL/Algorithms/detail/Sorting/algorithm.h index ac3cac57c..58282afbd 100644 --- a/GPUSort/GPUSort/src/util/algorithm.h +++ b/src/TNL/Algorithms/detail/Sorting/algorithm.h @@ -10,7 +10,7 @@ bool is_sorted(TNL::Containers::ArrayView arr, const auto fetch = [=] __cuda_callable__(int i) { return !Cmp(arr[i], arr[i - 1]); }; auto reduction = [] __cuda_callable__(bool a, bool b) { return a && b; }; - return TNL::Algorithms::Reduction::reduce(1, arr.getSize(), fetch, reduction, true); + return TNL::Algorithms::reduce(1, arr.getSize(), fetch, reduction, true); } template diff --git a/GPUSort/GPUSort/src/bitonicSort/bitonicSort.h b/src/TNL/Algorithms/detail/Sorting/bitonicSort.h similarity index 97% rename from GPUSort/GPUSort/src/bitonicSort/bitonicSort.h rename to src/TNL/Algorithms/detail/Sorting/bitonicSort.h index 0ca7e5e0d..f7d24b4fe 100644 --- a/GPUSort/GPUSort/src/bitonicSort/bitonicSort.h +++ b/src/TNL/Algorithms/detail/Sorting/bitonicSort.h @@ -1,9 +1,11 @@ #pragma once #include -#include "blockBitonicSort.cuh" -#include "helpers.h" +#include +#include -//--------------------------------------------- +namespace TNL { + namespace Algorithms { + namespace detail { /** * this kernel simulates 1 exchange @@ -355,4 +357,8 @@ void bitonicSort(int begin, int end, FETCH Fetch, const CMP &Cmp, SWAP Swap) } } cudaDeviceSynchronize(); -} \ 
No newline at end of file +} + + } // namespace detail + } // namespace Algorithms +} // namespace TNL \ No newline at end of file diff --git a/GPUSort/GPUSort/src/bitonicSort/blockBitonicSort.cuh b/src/TNL/Algorithms/detail/Sorting/blockBitonicSort.cuh similarity index 98% rename from GPUSort/GPUSort/src/bitonicSort/blockBitonicSort.cuh rename to src/TNL/Algorithms/detail/Sorting/blockBitonicSort.cuh index 26d962a27..bc03f7b94 100644 --- a/GPUSort/GPUSort/src/bitonicSort/blockBitonicSort.cuh +++ b/src/TNL/Algorithms/detail/Sorting/blockBitonicSort.cuh @@ -1,5 +1,5 @@ #pragma once -#include "helpers.h" +#include #include /** diff --git a/GPUSort/GPUSort/src/quicksort/cudaPartition.cuh b/src/TNL/Algorithms/detail/Sorting/cudaPartition.cuh similarity index 98% rename from GPUSort/GPUSort/src/quicksort/cudaPartition.cuh rename to src/TNL/Algorithms/detail/Sorting/cudaPartition.cuh index d220dfcdb..f9a292c00 100644 --- a/GPUSort/GPUSort/src/quicksort/cudaPartition.cuh +++ b/src/TNL/Algorithms/detail/Sorting/cudaPartition.cuh @@ -1,8 +1,8 @@ #pragma once #include -#include "../util/reduction.cuh" -#include "task.h" +#include +#include using namespace TNL; using namespace TNL::Containers; diff --git a/GPUSort/GPUSort/src/bitonicSort/helpers.h b/src/TNL/Algorithms/detail/Sorting/helpers.h similarity index 100% rename from GPUSort/GPUSort/src/bitonicSort/helpers.h rename to src/TNL/Algorithms/detail/Sorting/helpers.h diff --git a/GPUSort/GPUSort/src/quicksort/quicksort.cuh b/src/TNL/Algorithms/detail/Sorting/quicksort.cuh similarity index 97% rename from GPUSort/GPUSort/src/quicksort/quicksort.cuh rename to src/TNL/Algorithms/detail/Sorting/quicksort.cuh index e19aef9a5..fb4e8709a 100644 --- a/GPUSort/GPUSort/src/quicksort/quicksort.cuh +++ b/src/TNL/Algorithms/detail/Sorting/quicksort.cuh @@ -3,8 +3,8 @@ #include #include #include -#include "task.h" -#include "quicksort_kernel.cuh" +#include +#include #include #define deb(x) std::cout << #x << " = " << x << std::endl; @@ 
-17,6 +17,10 @@ using namespace TNL; using namespace TNL::Containers; +namespace TNL { + namespace Algorithms { + namespace detail { + template class QUICKSORT { @@ -312,7 +316,7 @@ int QUICKSORT::getSetsNeeded(int elemPerBlock) const return size / elemPerBlock + (size % elemPerBlock != 0); }; auto reduction = [] __cuda_callable__(int a, int b) { return a + b; }; - return Algorithms::Reduction::reduce(0, host_1stPhaseTasksAmount, fetch, reduction, 0); + return Algorithms::reduce(0, host_1stPhaseTasksAmount, fetch, reduction, 0); } template @@ -437,3 +441,7 @@ void quicksort(ArrayView arr) { quicksort(arr, [] __cuda_callable__(const Value &a, const Value &b) { return a < b; }); } + + } // namespace detail + } // namespace Algorithms +}// namespace TNL \ No newline at end of file diff --git a/GPUSort/GPUSort/src/quicksort/quicksort_1Block.cuh b/src/TNL/Algorithms/detail/Sorting/quicksort_1Block.cuh similarity index 97% rename from GPUSort/GPUSort/src/quicksort/quicksort_1Block.cuh rename to src/TNL/Algorithms/detail/Sorting/quicksort_1Block.cuh index b0a310cdf..50314c21c 100644 --- a/GPUSort/GPUSort/src/quicksort/quicksort_1Block.cuh +++ b/src/TNL/Algorithms/detail/Sorting/quicksort_1Block.cuh @@ -2,9 +2,9 @@ #include #include "cassert" -#include "../bitonicSort/bitonicSort.h" -#include "../util/reduction.cuh" -#include "cudaPartition.cuh" +#include +#include +#include using namespace TNL; using namespace TNL::Containers; diff --git a/GPUSort/GPUSort/src/quicksort/quicksort_kernel.cuh b/src/TNL/Algorithms/detail/Sorting/quicksort_kernel.cuh similarity index 97% rename from GPUSort/GPUSort/src/quicksort/quicksort_kernel.cuh rename to src/TNL/Algorithms/detail/Sorting/quicksort_kernel.cuh index 824b58199..344d1e8f1 100644 --- a/GPUSort/GPUSort/src/quicksort/quicksort_kernel.cuh +++ b/src/TNL/Algorithms/detail/Sorting/quicksort_kernel.cuh @@ -2,10 +2,10 @@ #include #include -#include "../util/reduction.cuh" -#include "task.h" -#include "cudaPartition.cuh" -#include 
"quicksort_1Block.cuh" +#include +#include +#include +#include using namespace TNL; using namespace TNL::Containers; diff --git a/GPUSort/GPUSort/src/util/reduction.cuh b/src/TNL/Algorithms/detail/Sorting/reduction.cuh similarity index 100% rename from GPUSort/GPUSort/src/util/reduction.cuh rename to src/TNL/Algorithms/detail/Sorting/reduction.cuh diff --git a/GPUSort/GPUSort/src/quicksort/task.h b/src/TNL/Algorithms/detail/Sorting/task.h similarity index 100% rename from GPUSort/GPUSort/src/quicksort/task.h rename to src/TNL/Algorithms/detail/Sorting/task.h diff --git a/src/UnitTests/Algorithms/CMakeLists.txt b/src/UnitTests/Algorithms/CMakeLists.txt index 14a7d43ab..31028036b 100644 --- a/src/UnitTests/Algorithms/CMakeLists.txt +++ b/src/UnitTests/Algorithms/CMakeLists.txt @@ -1,4 +1,5 @@ ADD_SUBDIRECTORY( Segments ) +ADD_SUBDIRECTORY( Sorting ) set( COMMON_TESTS MemoryOperationsTest diff --git a/src/UnitTests/Algorithms/Sorting/BitonicSortTest.cpp b/src/UnitTests/Algorithms/Sorting/BitonicSortTest.cpp new file mode 100644 index 000000000..6d48cf991 --- /dev/null +++ b/src/UnitTests/Algorithms/Sorting/BitonicSortTest.cpp @@ -0,0 +1 @@ +#include "BitonicSortTest.h" \ No newline at end of file diff --git a/src/UnitTests/Algorithms/Sorting/BitonicSortTest.cu b/src/UnitTests/Algorithms/Sorting/BitonicSortTest.cu new file mode 120000 index 000000000..dfdfdf06d --- /dev/null +++ b/src/UnitTests/Algorithms/Sorting/BitonicSortTest.cu @@ -0,0 +1 @@ +BitonicSortTest.cpp \ No newline at end of file diff --git a/GPUSort/GPUSort/tests/bitonic_tests/unitTests.cu b/src/UnitTests/Algorithms/Sorting/BitonicSortTest.h similarity index 99% rename from GPUSort/GPUSort/tests/bitonic_tests/unitTests.cu rename to src/UnitTests/Algorithms/Sorting/BitonicSortTest.h index 75cd5a0af..d1f4e8764 100644 --- a/GPUSort/GPUSort/tests/bitonic_tests/unitTests.cu +++ b/src/UnitTests/Algorithms/Sorting/BitonicSortTest.h @@ -7,8 +7,8 @@ #include #include -#include 
"../../src/bitonicSort/bitonicSort.h" -#include "../../src/util/algorithm.h" +#include +#include //---------------------------------------------------------------------------------- diff --git a/src/UnitTests/Algorithms/Sorting/CMakeLists.txt b/src/UnitTests/Algorithms/Sorting/CMakeLists.txt new file mode 100644 index 000000000..5dfcff323 --- /dev/null +++ b/src/UnitTests/Algorithms/Sorting/CMakeLists.txt @@ -0,0 +1,27 @@ +set( COMMON_TESTS + BitonicSortTest + QuicksortTest +) + +set( CPP_TESTS ) +set( CUDA_TESTS ) +if( BUILD_CUDA ) + set( CUDA_TESTS ${CUDA_TESTS} ${COMMON_TESTS} ) +else() + set( CPP_TESTS ${CPP_TESTS} ${COMMON_TESTS} ) +endif() + +foreach( target IN ITEMS ${CPP_TESTS} ) + add_executable( ${target} ${target}.cpp ) + target_compile_options( ${target} PRIVATE ${CXX_TESTS_FLAGS} ) + target_link_libraries( ${target} ${GTEST_BOTH_LIBRARIES} ) + add_test( ${target} ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${target}${CMAKE_EXECUTABLE_SUFFIX} ) +endforeach() + +if( BUILD_CUDA ) + foreach( target IN ITEMS ${CUDA_TESTS} ) + cuda_add_executable( ${target} ${target}.cu OPTIONS ${CXX_TESTS_FLAGS} ) + target_link_libraries( ${target} ${GTEST_BOTH_LIBRARIES} ) + add_test( ${target} ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${target}${CMAKE_EXECUTABLE_SUFFIX} ) + endforeach() +endif() diff --git a/src/UnitTests/Algorithms/Sorting/QuicksortTest.cpp b/src/UnitTests/Algorithms/Sorting/QuicksortTest.cpp new file mode 100644 index 000000000..04be4c54a --- /dev/null +++ b/src/UnitTests/Algorithms/Sorting/QuicksortTest.cpp @@ -0,0 +1 @@ +#include "QuicksortTest.h" \ No newline at end of file diff --git a/src/UnitTests/Algorithms/Sorting/QuicksortTest.cu b/src/UnitTests/Algorithms/Sorting/QuicksortTest.cu new file mode 120000 index 000000000..d6099b8e1 --- /dev/null +++ b/src/UnitTests/Algorithms/Sorting/QuicksortTest.cu @@ -0,0 +1 @@ +QuicksortTest.cpp \ No newline at end of file diff --git a/GPUSort/GPUSort/tests/quicksort_unitTests/unitTests.cu 
b/src/UnitTests/Algorithms/Sorting/QuicksortTest.h similarity index 98% rename from GPUSort/GPUSort/tests/quicksort_unitTests/unitTests.cu rename to src/UnitTests/Algorithms/Sorting/QuicksortTest.h index abcdfc3c0..b7ac83acc 100644 --- a/GPUSort/GPUSort/tests/quicksort_unitTests/unitTests.cu +++ b/src/UnitTests/Algorithms/Sorting/QuicksortTest.h @@ -8,8 +8,8 @@ #include #include -#include "../../src/quicksort/quicksort.cuh" -#include "../../src/util/algorithm.h" +#include +#include //---------------------------------------------------------------------------------- -- GitLab From 1a5effaa65759f96057c29a5701e1cbd9163bb4a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Tue, 13 Jul 2021 12:23:22 +0200 Subject: [PATCH 232/258] Refactoring unit tests on sorting and fixing tests compilation without CUDA. --- .../Algorithms/detail/Sorting/bitonicSort.h | 22 ++-- .../detail/Sorting/blockBitonicSort.cuh | 6 +- .../detail/Sorting/cudaPartition.cuh | 6 +- src/TNL/Algorithms/detail/Sorting/helpers.h | 6 +- .../Algorithms/detail/Sorting/quicksort.cuh | 27 +++-- .../detail/Sorting/quicksort_1Block.cuh | 6 +- .../detail/Sorting/quicksort_kernel.cuh | 4 +- .../Algorithms/detail/Sorting/reduction.cuh | 12 +- .../Algorithms/Sorting/BitonicSortTest.h | 105 +++++++++--------- .../Algorithms/Sorting/QuicksortTest.h | 47 ++++---- 10 files changed, 139 insertions(+), 102 deletions(-) diff --git a/src/TNL/Algorithms/detail/Sorting/bitonicSort.h b/src/TNL/Algorithms/detail/Sorting/bitonicSort.h index f7d24b4fe..a4f9dd435 100644 --- a/src/TNL/Algorithms/detail/Sorting/bitonicSort.h +++ b/src/TNL/Algorithms/detail/Sorting/bitonicSort.h @@ -7,8 +7,10 @@ namespace TNL { namespace Algorithms { namespace detail { +#ifdef HAVE_CUDA + /** - * this kernel simulates 1 exchange + * this kernel simulates 1 exchange * splits input arr that is bitonic into 2 bitonic sequences */ template @@ -42,7 +44,7 @@ __global__ void bitonicMergeGlobal(TNL::Containers::ArrayView @@ 
-110,7 +112,7 @@ template __global__ void bitoniSort1stStepSharedMemory(TNL::Containers::ArrayView arr, CMP Cmp) { extern __shared__ int externMem[]; - + Value * sharedMem = (Value *)externMem; int sharedMemLen = 2*blockDim.x; @@ -154,14 +156,14 @@ __global__ void bitoniSort1stStepSharedMemory(TNL::Containers::ArrayView void bitonicSortWithShared(TNL::Containers::ArrayView view, const CMP &Cmp, int gridDim, int blockDim, int sharedMemLen, int sharedMemSize) { +#ifdef HAVE_CUDA int paddedSize = closestPow2(view.getSize()); bitoniSort1stStepSharedMemory<<>>(view, Cmp); @@ -188,6 +190,7 @@ void bitonicSortWithShared(TNL::Containers::ArrayView } } cudaDeviceSynchronize(); +#endif } //--------------------------------------------- @@ -198,6 +201,7 @@ void bitonicSort(TNL::Containers::ArrayView view, int gridDim, int blockDim) { +#ifdef HAVE_CUDA int paddedSize = closestPow2(view.getSize()); for (int monotonicSeqLen = 2; monotonicSeqLen <= paddedSize; monotonicSeqLen *= 2) @@ -208,12 +212,14 @@ void bitonicSort(TNL::Containers::ArrayView view, } } cudaDeviceSynchronize(); +#endif } //--------------------------------------------- template void bitonicSort(TNL::Containers::ArrayView src, int begin, int end, const CMP &Cmp) { +#ifdef HAVE_CUDA auto view = src.getView(begin, end); int threadsNeeded = view.getSize() / 2 + (view.getSize() % 2 != 0); @@ -245,6 +251,7 @@ void bitonicSort(TNL::Containers::ArrayView src, int int gridDim = threadsNeeded / maxThreadsPerBlock + (threadsNeeded % maxThreadsPerBlock != 0); bitonicSort(view, Cmp, gridDim, maxThreadsPerBlock); } +#endif } //--------------------------------------------- @@ -300,6 +307,7 @@ void bitonicSort(std::vector &vec) //--------------------------------------------- //--------------------------------------------- +#ifdef HAVE_CUDA template __global__ void bitonicMergeGlobal(int size, FETCH Fetch, CMP Cmp, SWAP Swap, int monotonicSeqLen, int bitonicLen) @@ -358,7 +366,7 @@ void bitonicSort(int begin, int end, FETCH 
Fetch, const CMP &Cmp, SWAP Swap) } cudaDeviceSynchronize(); } - +#endif } // namespace detail } // namespace Algorithms -} // namespace TNL \ No newline at end of file +} // namespace TNL diff --git a/src/TNL/Algorithms/detail/Sorting/blockBitonicSort.cuh b/src/TNL/Algorithms/detail/Sorting/blockBitonicSort.cuh index bc03f7b94..931f154d2 100644 --- a/src/TNL/Algorithms/detail/Sorting/blockBitonicSort.cuh +++ b/src/TNL/Algorithms/detail/Sorting/blockBitonicSort.cuh @@ -2,6 +2,8 @@ #include #include +#ifdef HAVE_CUDA + /** * IMPORTANT: all threads in block have to call this function to work properly * the size of src isn't limited, but for optimal efficiency, no more than 8*blockDim.x should be used @@ -97,4 +99,6 @@ __device__ void bitonicSort_Block(TNL::Containers::ArrayView #include +#ifdef HAVE_CUDA + using namespace TNL; using namespace TNL::Containers; @@ -203,4 +205,6 @@ __device__ void cudaPartition(ArrayView src, int destBigger = biggerStart + biggerPrefSumInc - bigger; copyData(srcView, dst, Cmp, destSmaller, destBigger, pivot); } -} \ No newline at end of file +} + +#endif \ No newline at end of file diff --git a/src/TNL/Algorithms/detail/Sorting/helpers.h b/src/TNL/Algorithms/detail/Sorting/helpers.h index 44629206a..928314f2c 100644 --- a/src/TNL/Algorithms/detail/Sorting/helpers.h +++ b/src/TNL/Algorithms/detail/Sorting/helpers.h @@ -1,6 +1,8 @@ #pragma once #include +#ifdef HAVE_CUDA + // Inline PTX call to return index of highest non-zero bit in a word static __device__ __forceinline__ unsigned int __btflo(unsigned int word) { @@ -33,4 +35,6 @@ __cuda_callable__ void cmpSwap(Value &a, Value &b, bool ascending, const CMP &Cm { if (ascending == Cmp(b, a)) TNL::swap(a, b); -} \ No newline at end of file +} + +#endif \ No newline at end of file diff --git a/src/TNL/Algorithms/detail/Sorting/quicksort.cuh b/src/TNL/Algorithms/detail/Sorting/quicksort.cuh index fb4e8709a..e8c2e86e7 100644 --- a/src/TNL/Algorithms/detail/Sorting/quicksort.cuh +++ 
b/src/TNL/Algorithms/detail/Sorting/quicksort.cuh @@ -96,7 +96,7 @@ public: int getSetsNeeded(int elemPerBlock) const; /** - * returns the optimal amount of elements per thread needed for phase + * returns the optimal amount of elements per thread needed for phase * */ int getElemPerBlock() const; @@ -132,6 +132,7 @@ template template void QUICKSORT::sort(const CMP &Cmp) { +#ifdef HAVE_CUDA firstPhase(Cmp); int total2ndPhase = host_1stPhaseTasksAmount + host_2ndPhaseTasksAmount; @@ -157,8 +158,7 @@ void QUICKSORT::sort(const CMP &Cmp) out << iteration << std::endl; } #endif - - return; +#endif } //--------------------------------------------------------------------------------------------- @@ -167,6 +167,7 @@ template template void QUICKSORT::firstPhase(const CMP &Cmp) { +#ifdef HAVE_CUDA while (host_1stPhaseTasksAmount > 0) { if (host_1stPhaseTasksAmount >= maxTasks) @@ -220,7 +221,7 @@ void QUICKSORT::firstPhase(const CMP &Cmp) /** * check if partition procedure can use shared memory for coalesced write after reordering - * + * * move elements smaller than pivot to the left and bigger to the right * note: pivot isnt inserted in the middle yet * */ @@ -243,7 +244,7 @@ void QUICKSORT::firstPhase(const CMP &Cmp) /** * fill in the gap between smaller and bigger with elements == pivot * after writing also create new tasks, each task generates at max 2 tasks - * + * * tasks smaller than desired_2ndPhasElemPerBlock go into 2nd phase * bigger need more blocks to partition and are written into newTask * with iteration %2, rotate between the 2 tasks array to save from copying @@ -261,6 +262,7 @@ void QUICKSORT::firstPhase(const CMP &Cmp) processNewTasks(); iteration++; } +#endif } //---------------------------------------------------------------------- @@ -269,6 +271,7 @@ template template void QUICKSORT::secondPhase(const CMP &Cmp) { +#ifdef HAVE_CUDA int total2ndPhase = host_1stPhaseTasksAmount + host_2ndPhaseTasksAmount; const int stackSize = 32; auto &leftoverTasks = 
iteration % 2 == 0 ? cuda_tasks : cuda_newTasks; @@ -302,6 +305,7 @@ void QUICKSORT::secondPhase(const CMP &Cmp) cudaQuickSort2ndPhase <<>>(arr, aux, Cmp, tasks2, elemInShared, desired_2ndPhasElemPerBlock); } +#endif } //---------------------------------------------------------------------- @@ -342,7 +346,7 @@ template template int QUICKSORT::initTasks(int elemPerBlock, const CMP &Cmp) { - +#ifdef HAVE_CUDA auto &src = iteration % 2 == 0 ? arr : aux; auto &tasks = iteration % 2 == 0 ? cuda_tasks : cuda_newTasks; @@ -352,7 +356,7 @@ int QUICKSORT::initTasks(int elemPerBlock, const CMP &Cmp) cudaCalcBlocksNeeded<<>>(tasks.getView(0, host_1stPhaseTasksAmount), elemPerBlock, cuda_reductionTaskInitMem.getView(0, host_1stPhaseTasksAmount)); //cuda_reductionTaskInitMem[i] == how many blocks task i needs - + auto reduce = [] __cuda_callable__(const int &a, const int &b) { return a + b; }; Algorithms::Scan:: @@ -375,6 +379,9 @@ int QUICKSORT::initTasks(int elemPerBlock, const CMP &Cmp) cuda_newTasksAmount.setElement(0, 0); //resets new element counter return blocksNeeded; +#else + return -1; +#endif } template @@ -391,6 +398,7 @@ void QUICKSORT::processNewTasks() template void quicksort(ArrayView arr, const CMP &Cmp) { +#ifdef HAVE_CUDA cudaDeviceProp deviceProp; cudaGetDeviceProperties(&deviceProp, 0); @@ -406,7 +414,7 @@ void quicksort(ArrayView arr, const CMP &Cmp) * the goal is to use shared memory as often as possible * each thread in a block will process n elements, n==multiplier * + 1 reserved for pivot (statically allocating Value type throws weird error, hence it needs to be dynamic) - * + * * blockDim*multiplier*sizeof(Value) + 1*sizeof(Value) <= maxSharable * */ int elemPerBlock = (maxSharable - sizeof(Value)) / sizeof(Value); //try to use up all of shared memory to store elements @@ -434,6 +442,7 @@ void quicksort(ArrayView arr, const CMP &Cmp) QUICKSORT sorter(arr, maxBlocks, blockDim, multiplier * blockDim, maxSharable); sorter.sort(Cmp); +#endif } template @@ 
-444,4 +453,4 @@ void quicksort(ArrayView arr) } // namespace detail } // namespace Algorithms -}// namespace TNL \ No newline at end of file +}// namespace TNL diff --git a/src/TNL/Algorithms/detail/Sorting/quicksort_1Block.cuh b/src/TNL/Algorithms/detail/Sorting/quicksort_1Block.cuh index 50314c21c..5582deca7 100644 --- a/src/TNL/Algorithms/detail/Sorting/quicksort_1Block.cuh +++ b/src/TNL/Algorithms/detail/Sorting/quicksort_1Block.cuh @@ -9,6 +9,8 @@ using namespace TNL; using namespace TNL::Containers; +#ifdef HAVE_CUDA + template __device__ void externSort(ArrayView src, ArrayView dst, @@ -231,4 +233,6 @@ __device__ void stackPush(int stackArrBegin[], int stackArrEnd[], stackTop++; } } -} \ No newline at end of file +} + +#endif \ No newline at end of file diff --git a/src/TNL/Algorithms/detail/Sorting/quicksort_kernel.cuh b/src/TNL/Algorithms/detail/Sorting/quicksort_kernel.cuh index 344d1e8f1..527642f1a 100644 --- a/src/TNL/Algorithms/detail/Sorting/quicksort_kernel.cuh +++ b/src/TNL/Algorithms/detail/Sorting/quicksort_kernel.cuh @@ -10,7 +10,7 @@ using namespace TNL; using namespace TNL::Containers; -//----------------------------------------------------------- +#ifdef HAVE_CUDA __device__ void writeNewTask(int begin, int end, int iteration, int maxElemFor2ndPhase, ArrayView newTasks, int *newTasksCnt, @@ -246,4 +246,4 @@ __global__ void cudaQuickSort2ndPhase(ArrayView arr, Array } } -//----------------------------------------------------------- \ No newline at end of file +#endif \ No newline at end of file diff --git a/src/TNL/Algorithms/detail/Sorting/reduction.cuh b/src/TNL/Algorithms/detail/Sorting/reduction.cuh index 47a934001..49b2ef7d6 100644 --- a/src/TNL/Algorithms/detail/Sorting/reduction.cuh +++ b/src/TNL/Algorithms/detail/Sorting/reduction.cuh @@ -1,8 +1,12 @@ #pragma once + +#ifdef HAVE_CUDA + /** * https://developer.nvidia.com/blog/faster-parallel-reductions-kepler/ * */ + __device__ int warpReduceSum(int initVal) { const unsigned int 
maskConstant = 0xffffffff; //not used @@ -69,7 +73,7 @@ __device__ int blockInclusivePrefixSum(int value) if (wid == 0) shared[lane] = warpInclusivePrefixSum(tmp2) - tmp2; __syncthreads(); - + tmp += shared[wid]; return tmp; } @@ -97,7 +101,7 @@ __device__ int blockCmpReduce(int val, const Operator & Cmp) if (lane == 0) shared[wid] = val; - __syncthreads(); + __syncthreads(); val = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : shared[0]; @@ -106,7 +110,9 @@ __device__ int blockCmpReduce(int val, const Operator & Cmp) if(threadIdx.x == 0) shared[0] = val; - __syncthreads(); + __syncthreads(); return shared[0]; } + +#endif \ No newline at end of file diff --git a/src/UnitTests/Algorithms/Sorting/BitonicSortTest.h b/src/UnitTests/Algorithms/Sorting/BitonicSortTest.h index d1f4e8764..2e069e5bf 100644 --- a/src/UnitTests/Algorithms/Sorting/BitonicSortTest.h +++ b/src/UnitTests/Algorithms/Sorting/BitonicSortTest.h @@ -1,4 +1,3 @@ -#include "gtest/gtest.h" #include #include #include @@ -10,7 +9,13 @@ #include #include -//---------------------------------------------------------------------------------- +#if defined HAVE_GTEST && defined HAVE_CUDA +#include + + +using namespace TNL; +using namespace TNL::Algorithms; +using namespace TNL::Algorithms::detail; TEST(permutations, allPermutationSize_2_to_7) { @@ -27,8 +32,8 @@ TEST(permutations, allPermutationSize_2_to_7) bitonicSort(view); - ASSERT_TRUE(is_sorted(view)) << "failed " << i << std::endl; - } + EXPECT_TRUE(is_sorted(view)) << "failed " << i << std::endl; + } while (std::next_permutation(orig.begin(), orig.end())); } } @@ -52,7 +57,7 @@ TEST(permutations, allPermutationSize_8) bitonicSort(view); - ASSERT_TRUE(is_sorted(view)) << "result " << view << std::endl; + EXPECT_TRUE(is_sorted(view)) << "result " << view << std::endl; } while (std::next_permutation(orig.begin(), orig.end())); } @@ -76,20 +81,18 @@ TEST(permutations, somePermutationSize9) bitonicSort(view); - ASSERT_TRUE(is_sorted(view)) << "result " 
<< view << std::endl; + EXPECT_TRUE(is_sorted(view)) << "result " << view << std::endl; } while (std::next_permutation(orig.begin(), orig.end())); } -//----------------------------------------------------------------------- - TEST(selectedSize, size15) { TNL::Containers::Array cudaArr{5, 9, 4, 8, 6, 1, 2, 3, 4, 8, 1, 6, 9, 4, 9}; auto view = cudaArr.getView(); - ASSERT_EQ(15, view.getSize()) << "size not 15" << std::endl; + EXPECT_EQ(15, view.getSize()) << "size not 15" << std::endl; bitonicSort(view); - ASSERT_TRUE(is_sorted(view)) << "result " << view << std::endl; + EXPECT_TRUE(is_sorted(view)) << "result " << view << std::endl; } TEST(multiblock, 32768_decreasingNegative) @@ -97,12 +100,12 @@ TEST(multiblock, 32768_decreasingNegative) std::vector arr(1<<15); for (size_t i = 0; i < arr.size(); i++) arr[i] = -i; - + TNL::Containers::Array cudaArr(arr); auto view = cudaArr.getView(); bitonicSort(view); - ASSERT_TRUE(is_sorted(view)) << "result " << view << std::endl; + EXPECT_TRUE(is_sorted(view)) << "result " << view << std::endl; } TEST(randomGenerated, smallArray_randomVal) @@ -118,7 +121,7 @@ TEST(randomGenerated, smallArray_randomVal) auto view = cudaArr.getView(); bitonicSort(view); - ASSERT_TRUE(is_sorted(view)); + EXPECT_TRUE(is_sorted(view)); } } @@ -133,7 +136,7 @@ TEST(randomGenerated, bigArray_all0) auto view = cudaArr.getView(); bitonicSort(view); - ASSERT_TRUE(is_sorted(view)); + EXPECT_TRUE(is_sorted(view)); } } @@ -142,7 +145,7 @@ TEST(nonIntegerType, float_notPow2) TNL::Containers::Array cudaArr{5.0, 9.4, 4.6, 8.9, 6.2, 1.15184, 2.23}; auto view = cudaArr.getView(); bitonicSort(view); - ASSERT_TRUE(is_sorted(view)) << "result " << view << std::endl; + EXPECT_TRUE(is_sorted(view)) << "result " << view << std::endl; } TEST(nonIntegerType, double_notPow2) @@ -150,7 +153,7 @@ TEST(nonIntegerType, double_notPow2) TNL::Containers::Array cudaArr{5.0, 9.4, 4.6, 8.9, 6.2, 1.15184, 2.23}; auto view = cudaArr.getView(); bitonicSort(view); - 
ASSERT_TRUE(is_sorted(view)) << "result " << view << std::endl; + EXPECT_TRUE(is_sorted(view)) << "result " << view << std::endl; } @@ -168,7 +171,7 @@ TEST(nonIntegerType, struct) TNL::Containers::Array cudaArr{TMPSTRUCT(5), TMPSTRUCT(6), TMPSTRUCT(9), TMPSTRUCT(1)}; auto view = cudaArr.getView(); bitonicSort(view); - ASSERT_TRUE(is_sorted(view)); + EXPECT_TRUE(is_sorted(view)); } struct TMPSTRUCT_64b{ @@ -190,7 +193,7 @@ TEST(nonIntegerType, struct_64b) TNL::Containers::Array cudaArr(vec); auto view = cudaArr.getView(); bitonicSort(view); - ASSERT_TRUE(is_sorted(view)); + EXPECT_TRUE(is_sorted(view)); } struct TMPSTRUCT_128b{ @@ -212,7 +215,7 @@ TEST(nonIntegerType, struct_128b) TNL::Containers::Array cudaArr(vec); auto view = cudaArr.getView(); bitonicSort(view); - ASSERT_TRUE(is_sorted(view)); + EXPECT_TRUE(is_sorted(view)); } //error bypassing @@ -229,13 +232,13 @@ TEST(sortWithFunction, descending) auto view = cudaArr.getView(); descendingSort(view); - ASSERT_FALSE(is_sorted(view)) << "result " << view << std::endl; - - ASSERT_TRUE(view.getElement(0) == 9); - ASSERT_TRUE(view.getElement(1) == 6); - ASSERT_TRUE(view.getElement(2) == 4); - ASSERT_TRUE(view.getElement(3) == 3); - ASSERT_TRUE(view.getElement(4) == 2); + EXPECT_FALSE(is_sorted(view)) << "result " << view << std::endl; + + EXPECT_TRUE(view.getElement(0) == 9); + EXPECT_TRUE(view.getElement(1) == 6); + EXPECT_TRUE(view.getElement(2) == 4); + EXPECT_TRUE(view.getElement(3) == 3); + EXPECT_TRUE(view.getElement(4) == 2); } TEST(sortstdVector, stdvector) @@ -246,7 +249,7 @@ TEST(sortstdVector, stdvector) bitonicSort(arr); - ASSERT_TRUE(std::is_sorted(arr.begin(), arr.end())); + EXPECT_TRUE(std::is_sorted(arr.begin(), arr.end())); } TEST(sortRange, secondHalf) @@ -258,9 +261,9 @@ TEST(sortRange, secondHalf) bitonicSort(arr, s, 19); - ASSERT_TRUE(std::is_sorted(arr.begin() + s, arr.end())); - ASSERT_TRUE(arr[0] == -1); - ASSERT_TRUE(arr[s-1] == -1); + EXPECT_TRUE(std::is_sorted(arr.begin() + s, 
arr.end())); + EXPECT_TRUE(arr[0] == -1); + EXPECT_TRUE(arr[s-1] == -1); } TEST(sortRange, middle) @@ -276,11 +279,11 @@ TEST(sortRange, middle) bitonicSort(arr, s, e); - ASSERT_TRUE(std::is_sorted(arr.begin() + s, arr.begin() + e)); - ASSERT_TRUE(arr[0] == -1); - ASSERT_TRUE(arr.back() == -1); - ASSERT_TRUE(arr[s-1] == -1); - ASSERT_TRUE(arr[e] == -1); + EXPECT_TRUE(std::is_sorted(arr.begin() + s, arr.begin() + e)); + EXPECT_TRUE(arr[0] == -1); + EXPECT_TRUE(arr.back() == -1); + EXPECT_TRUE(arr[s-1] == -1); + EXPECT_TRUE(arr[e] == -1); } TEST(sortRange, middleMultiBlock) @@ -297,15 +300,15 @@ TEST(sortRange, middleMultiBlock) bitonicSort(arr, s, e); - ASSERT_TRUE(std::is_sorted(arr.begin() + s, arr.begin() + e)); + EXPECT_TRUE(std::is_sorted(arr.begin() + s, arr.begin() + e)); - ASSERT_TRUE(arr[0] == -1); - ASSERT_TRUE(arr[std::rand() % s] == -1); - ASSERT_TRUE(arr[s-1] == -1); + EXPECT_TRUE(arr[0] == -1); + EXPECT_TRUE(arr[std::rand() % s] == -1); + EXPECT_TRUE(arr[s-1] == -1); - ASSERT_TRUE(arr[e] == -1); - ASSERT_TRUE(arr[e + (std::rand() % (size - e))] == -1); - ASSERT_TRUE(arr.back() == -1); + EXPECT_TRUE(arr[e] == -1); + EXPECT_TRUE(arr[e + (std::rand() % (size - e))] == -1); + EXPECT_TRUE(arr.back() == -1); } template @@ -334,7 +337,7 @@ TEST(fetchAndSwap, oneBlockSort) TNL::Containers::Array cudaArr(orig); auto view = cudaArr.getView(); fetchAndSwapSorter(view); - ASSERT_TRUE(is_sorted(view)) << "result " << view << std::endl; + EXPECT_TRUE(is_sorted(view)) << "result " << view << std::endl; } while (std::next_permutation(orig.begin(), orig.end())); } @@ -350,7 +353,7 @@ TEST(fetchAndSwap, typeDouble) TNL::Containers::Array cudaArr(orig); auto view = cudaArr.getView(); fetchAndSwapSorter(view); - ASSERT_TRUE(is_sorted(view)) << "result " << view << std::endl; + EXPECT_TRUE(is_sorted(view)) << "result " << view << std::endl; } while (std::next_permutation(orig.begin(), orig.end())); } @@ -371,21 +374,15 @@ TEST(fetchAndSwap, sortMiddle) int from = 3, to = 
8; fetchAndSwap_sortMiddle(view, from, to); - ASSERT_TRUE(is_sorted(view.getView(3, 8))) << "result " << view << std::endl; + EXPECT_TRUE(is_sorted(view.getView(3, 8))) << "result " << view << std::endl; for(size_t i = 0; i < orig.size(); i++) { if(i < from || i >= to) - ASSERT_TRUE(view.getElement(i) == orig[i]); + EXPECT_TRUE(view.getElement(i) == orig[i]); } } +#endif -//---------------------------------------------------------------------------------- - -int main(int argc, char **argv) -{ - testing::InitGoogleTest(&argc, argv); - - return RUN_ALL_TESTS(); -} \ No newline at end of file +#include "../../main.h" diff --git a/src/UnitTests/Algorithms/Sorting/QuicksortTest.h b/src/UnitTests/Algorithms/Sorting/QuicksortTest.h index b7ac83acc..a53e7f369 100644 --- a/src/UnitTests/Algorithms/Sorting/QuicksortTest.h +++ b/src/UnitTests/Algorithms/Sorting/QuicksortTest.h @@ -1,25 +1,31 @@ -#include "gtest/gtest.h" #include #include #include #include -#include -#include + #include #include #include #include -//---------------------------------------------------------------------------------- +#if defined HAVE_CUDA_&& defined HAVE_GTEST +#include +#include + +#include + +using namespace TNL; +using namespace TNL::Algorithms; +using namespace TNL::Algorithms::detail; TEST(selectedSize, size15) { TNL::Containers::Array cudaArr{5, 9, 4, 8, 6, 1, 2, 3, 4, 8, 1, 6, 9, 4, 9}; auto view = cudaArr.getView(); - ASSERT_EQ(15, view.getSize()) << "size not 15" << std::endl; + EXPECT_EQ(15, view.getSize()) << "size not 15" << std::endl; quicksort(view); - ASSERT_TRUE(is_sorted(view)) << "result " << view << std::endl; + EXPECT_TRUE(is_sorted(view)) << "result " << view << std::endl; } TEST(multiblock, 32768_decreasingNegative) @@ -27,12 +33,12 @@ TEST(multiblock, 32768_decreasingNegative) std::vector arr(1<<15); for (size_t i = 0; i < arr.size(); i++) arr[i] = -i; - + TNL::Containers::Array cudaArr(arr); auto view = cudaArr.getView(); quicksort(view); - ASSERT_TRUE(is_sorted(view)) << 
"result " << view << std::endl; + EXPECT_TRUE(is_sorted(view)) << "result " << view << std::endl; } TEST(randomGenerated, smallArray_randomVal) @@ -48,7 +54,7 @@ TEST(randomGenerated, smallArray_randomVal) auto view = cudaArr.getView(); quicksort(view); - ASSERT_TRUE(is_sorted(view)); + EXPECT_TRUE(is_sorted(view)); } } @@ -65,7 +71,7 @@ TEST(randomGenerated, bigArray_randomVal) auto view = cudaArr.getView(); quicksort(view); - ASSERT_TRUE(is_sorted(view)); + EXPECT_TRUE(is_sorted(view)); } } @@ -83,7 +89,7 @@ TEST(noLostElement, smallArray) std::sort(arr.begin(), arr.end()); TNL::Containers::Array cudaArr2(arr); - ASSERT_TRUE(view == cudaArr2.getView()); + EXPECT_TRUE(view == cudaArr2.getView()); } TEST(noLostElement, midSizedArray) @@ -100,7 +106,7 @@ TEST(noLostElement, midSizedArray) std::sort(arr.begin(), arr.end()); TNL::Containers::Array cudaArr2(arr); - ASSERT_TRUE(view == cudaArr2.getView()); + EXPECT_TRUE(view == cudaArr2.getView()); } TEST(noLostElement, bigSizedArray) @@ -119,7 +125,7 @@ TEST(noLostElement, bigSizedArray) TNL::Containers::Array cudaArr2(arr); thrust::sort(thrust::device, cudaArr2.getData(), cudaArr2.getData() + cudaArr2.getSize()); - ASSERT_TRUE(view == cudaArr2.getView()); + EXPECT_TRUE(view == cudaArr2.getView()); } TEST(types, type_double) @@ -138,7 +144,7 @@ TEST(types, type_double) TNL::Containers::Array cudaArr2(arr); thrust::sort(thrust::device, cudaArr2.getData(), cudaArr2.getData() + cudaArr2.getSize()); - ASSERT_TRUE(view == cudaArr2.getView()); + EXPECT_TRUE(view == cudaArr2.getView()); } struct TMPSTRUCT_xyz{ @@ -163,7 +169,7 @@ TEST(types, struct_3D_points) //thrust::sort(thrust::device, cudaArr.getData(), cudaArr.getData() + cudaArr.getSize()); //std::cout << view << std::endl; quicksort(view); - ASSERT_TRUE(is_sorted(view)); + EXPECT_TRUE(is_sorted(view)); } struct TMPSTRUCT_64b{ @@ -188,14 +194,9 @@ TEST(types, struct_64b) //thrust::sort(thrust::device, cudaArr.getData(), cudaArr.getData() + cudaArr.getSize()); 
//std::cout << view << std::endl; quicksort(view); - ASSERT_TRUE(is_sorted(view)); + EXPECT_TRUE(is_sorted(view)); } -//---------------------------------------------------------------------------------- - -int main(int argc, char **argv) -{ - testing::InitGoogleTest(&argc, argv); +#endif - return RUN_ALL_TESTS(); -} \ No newline at end of file +#include "../../main.h" \ No newline at end of file -- GitLab From 7c4403c94d7fffd531e2c8a236d541070135a735 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Tue, 13 Jul 2021 12:33:05 +0200 Subject: [PATCH 233/258] Fixing tnl-benchmark-sort to build without CUDA. --- src/Benchmarks/Sorting/Measurer.h | 7 +++++++ src/Benchmarks/Sorting/tnl-benchmark-sort.h | 4 ++++ 2 files changed, 11 insertions(+) diff --git a/src/Benchmarks/Sorting/Measurer.h b/src/Benchmarks/Sorting/Measurer.h index 569344e62..025d3fd9b 100644 --- a/src/Benchmarks/Sorting/Measurer.h +++ b/src/Benchmarks/Sorting/Measurer.h @@ -3,8 +3,12 @@ #include #include #include + +#ifdef HAVE_CUDA #include "ReferenceAlgorithms/manca_quicksort.h" #include "ReferenceAlgorithms/cederman_qsort.h" +#endif + #include "timer.h" using namespace TNL; @@ -27,6 +31,7 @@ struct STLSorter static void sort( std::vector< Value >& vec ) { std::sort( vec.begin(), vec.end() ); }; }; +#ifdef HAVE_CUDA struct MancaQuicksortSorter { static void sort( Containers::ArrayView< int, Devices::Cuda >& array ) @@ -44,6 +49,8 @@ struct CedermanQuicksortSorter gpuqsort( ( unsigned int * ) array.getData(), ( unsigned int ) array.getSize() ); } }; +#endif + template< typename Sorter > struct Measurer diff --git a/src/Benchmarks/Sorting/tnl-benchmark-sort.h b/src/Benchmarks/Sorting/tnl-benchmark-sort.h index 1432a25f4..aa2885f4f 100644 --- a/src/Benchmarks/Sorting/tnl-benchmark-sort.h +++ b/src/Benchmarks/Sorting/tnl-benchmark-sort.h @@ -90,10 +90,12 @@ int main(int argc, char *argv[]) start< QuicksortSorter >(cout, "\t"); std::cout << "Bitonic sort on GPU ... 
" << std::endl; start< BitonicSortSorter >( cout, "\t" ); +#ifdef HAVE_CUDA std::cout << "Manca quicksort on GPU ... " << std::endl; start< MancaQuicksortSorter >( cout, "\t" ); std::cout << "Cederman quicksort on GPU ... " << std::endl; start< CedermanQuicksortSorter >( cout, "\t" ); +#endif } else { @@ -104,10 +106,12 @@ int main(int argc, char *argv[]) start< QuicksortSorter >(out, ","); std::cout << "Bitonic sort on GPU ... " << std::endl; start< BitonicSortSorter >(out, ","); +#ifdef HAVE_CUDA std::cout << "Manca quicksort on GPU ... " << std::endl; start< MancaQuicksortSorter >( out, "," ); std::cout << "Cederman quicksort on GPU ... " << std::endl; start< CedermanQuicksortSorter >( out, "," ); +#endif } return 0; } -- GitLab From 25f4953678aee7cf90d5ebd9aa638fc2103be0f3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Tue, 13 Jul 2021 12:37:07 +0200 Subject: [PATCH 234/258] Renaming .cuh headers to .h headers. --- src/TNL/Algorithms/Sort.h | 4 ++-- src/TNL/Algorithms/detail/Sorting/bitonicSort.h | 2 +- .../Sorting/{blockBitonicSort.cuh => blockBitonicSort.h} | 0 .../detail/Sorting/{cudaPartition.cuh => cudaPartition.h} | 4 ++-- .../detail/Sorting/{quicksort.cuh => quicksort.h} | 2 +- .../Sorting/{quicksort_1Block.cuh => quicksort_1Block.h} | 6 +++--- .../Sorting/{quicksort_kernel.cuh => quicksort_kernel.h} | 8 ++++---- .../detail/Sorting/{reduction.cuh => reduction.h} | 0 8 files changed, 13 insertions(+), 13 deletions(-) rename src/TNL/Algorithms/detail/Sorting/{blockBitonicSort.cuh => blockBitonicSort.h} (100%) rename src/TNL/Algorithms/detail/Sorting/{cudaPartition.cuh => cudaPartition.h} (99%) rename src/TNL/Algorithms/detail/Sorting/{quicksort.cuh => quicksort.h} (99%) rename src/TNL/Algorithms/detail/Sorting/{quicksort_1Block.cuh => quicksort_1Block.h} (98%) rename src/TNL/Algorithms/detail/Sorting/{quicksort_kernel.cuh => quicksort_kernel.h} (98%) rename src/TNL/Algorithms/detail/Sorting/{reduction.cuh => reduction.h} 
(100%) diff --git a/src/TNL/Algorithms/Sort.h b/src/TNL/Algorithms/Sort.h index 0600977cf..fb09fa448 100644 --- a/src/TNL/Algorithms/Sort.h +++ b/src/TNL/Algorithms/Sort.h @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include namespace TNL { @@ -27,4 +27,4 @@ namespace TNL { } // namespace Algorithms -} // namespace TNL \ No newline at end of file +} // namespace TNL diff --git a/src/TNL/Algorithms/detail/Sorting/bitonicSort.h b/src/TNL/Algorithms/detail/Sorting/bitonicSort.h index a4f9dd435..8ccf104d2 100644 --- a/src/TNL/Algorithms/detail/Sorting/bitonicSort.h +++ b/src/TNL/Algorithms/detail/Sorting/bitonicSort.h @@ -1,6 +1,6 @@ #pragma once #include -#include +#include #include namespace TNL { diff --git a/src/TNL/Algorithms/detail/Sorting/blockBitonicSort.cuh b/src/TNL/Algorithms/detail/Sorting/blockBitonicSort.h similarity index 100% rename from src/TNL/Algorithms/detail/Sorting/blockBitonicSort.cuh rename to src/TNL/Algorithms/detail/Sorting/blockBitonicSort.h diff --git a/src/TNL/Algorithms/detail/Sorting/cudaPartition.cuh b/src/TNL/Algorithms/detail/Sorting/cudaPartition.h similarity index 99% rename from src/TNL/Algorithms/detail/Sorting/cudaPartition.cuh rename to src/TNL/Algorithms/detail/Sorting/cudaPartition.h index ac94bc85b..e155d8c5d 100644 --- a/src/TNL/Algorithms/detail/Sorting/cudaPartition.cuh +++ b/src/TNL/Algorithms/detail/Sorting/cudaPartition.h @@ -1,7 +1,7 @@ #pragma once #include -#include +#include #include #ifdef HAVE_CUDA @@ -207,4 +207,4 @@ __device__ void cudaPartition(ArrayView src, } } -#endif \ No newline at end of file +#endif diff --git a/src/TNL/Algorithms/detail/Sorting/quicksort.cuh b/src/TNL/Algorithms/detail/Sorting/quicksort.h similarity index 99% rename from src/TNL/Algorithms/detail/Sorting/quicksort.cuh rename to src/TNL/Algorithms/detail/Sorting/quicksort.h index e8c2e86e7..beb564691 100644 --- a/src/TNL/Algorithms/detail/Sorting/quicksort.cuh +++ b/src/TNL/Algorithms/detail/Sorting/quicksort.h @@ 
-4,7 +4,7 @@ #include #include #include -#include +#include #include #define deb(x) std::cout << #x << " = " << x << std::endl; diff --git a/src/TNL/Algorithms/detail/Sorting/quicksort_1Block.cuh b/src/TNL/Algorithms/detail/Sorting/quicksort_1Block.h similarity index 98% rename from src/TNL/Algorithms/detail/Sorting/quicksort_1Block.cuh rename to src/TNL/Algorithms/detail/Sorting/quicksort_1Block.h index 5582deca7..88be0b803 100644 --- a/src/TNL/Algorithms/detail/Sorting/quicksort_1Block.cuh +++ b/src/TNL/Algorithms/detail/Sorting/quicksort_1Block.h @@ -3,8 +3,8 @@ #include #include "cassert" #include -#include -#include +#include +#include using namespace TNL; using namespace TNL::Containers; @@ -235,4 +235,4 @@ __device__ void stackPush(int stackArrBegin[], int stackArrEnd[], } } -#endif \ No newline at end of file +#endif diff --git a/src/TNL/Algorithms/detail/Sorting/quicksort_kernel.cuh b/src/TNL/Algorithms/detail/Sorting/quicksort_kernel.h similarity index 98% rename from src/TNL/Algorithms/detail/Sorting/quicksort_kernel.cuh rename to src/TNL/Algorithms/detail/Sorting/quicksort_kernel.h index 527642f1a..4f9d82d07 100644 --- a/src/TNL/Algorithms/detail/Sorting/quicksort_kernel.cuh +++ b/src/TNL/Algorithms/detail/Sorting/quicksort_kernel.h @@ -2,10 +2,10 @@ #include #include -#include +#include #include -#include -#include +#include +#include using namespace TNL; using namespace TNL::Containers; @@ -246,4 +246,4 @@ __global__ void cudaQuickSort2ndPhase(ArrayView arr, Array } } -#endif \ No newline at end of file +#endif diff --git a/src/TNL/Algorithms/detail/Sorting/reduction.cuh b/src/TNL/Algorithms/detail/Sorting/reduction.h similarity index 100% rename from src/TNL/Algorithms/detail/Sorting/reduction.cuh rename to src/TNL/Algorithms/detail/Sorting/reduction.h -- GitLab From 89ea8638c5d727e60c0063d3a3defd5ba145a18d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Tue, 13 Jul 2021 17:34:04 +0200 Subject: [PATCH 235/258] Refactoring 
function is_sorted to Algorithms::isSorted. --- src/Benchmarks/Sorting/Measurer.h | 3 ++- src/TNL/Algorithms/Sort.h | 20 +++++++++++++++++++ src/TNL/Algorithms/detail/Sorting/algorithm.h | 20 ------------------- .../Algorithms/Sorting/QuicksortTest.h | 4 ++-- 4 files changed, 24 insertions(+), 23 deletions(-) delete mode 100644 src/TNL/Algorithms/detail/Sorting/algorithm.h diff --git a/src/Benchmarks/Sorting/Measurer.h b/src/Benchmarks/Sorting/Measurer.h index 025d3fd9b..d41f70133 100644 --- a/src/Benchmarks/Sorting/Measurer.h +++ b/src/Benchmarks/Sorting/Measurer.h @@ -3,6 +3,7 @@ #include #include #include +#include #ifdef HAVE_CUDA #include "ReferenceAlgorithms/manca_quicksort.h" @@ -69,7 +70,7 @@ struct Measurer Sorter::sort(view); } - if(!is_sorted(view)) + if( ! Algorithms::isSorted( view ) ) wrongAnsCnt++; } return accumulate(resAcc.begin(), resAcc.end(), 0.0) / resAcc.size(); diff --git a/src/TNL/Algorithms/Sort.h b/src/TNL/Algorithms/Sort.h index fb09fa448..cd61707b0 100644 --- a/src/TNL/Algorithms/Sort.h +++ b/src/TNL/Algorithms/Sort.h @@ -25,6 +25,26 @@ namespace TNL { namespace Algorithms { +template +bool isSorted( const Array& arr, const Function& cmp ) +{ + using Device = typename Array::DeviceType; + if (arr.getSize() <= 1) + return true; + + auto view = arr.getConstView(); + auto fetch = [=] __cuda_callable__(int i) { return ! 
cmp( view[ i ], view[ i - 1 ] ); }; + auto reduction = [] __cuda_callable__(bool a, bool b) { return a && b; }; + return TNL::Algorithms::reduce< Device >( 1, arr.getSize(), fetch, reduction, true ); +} + +template< typename Array > +bool isSorted( const Array& arr) +{ + using Value = typename Array::ValueType; + return isSorted( arr, [] __cuda_callable__( const Value& a, const Value& b ) { return a < b; }); +} + } // namespace Algorithms } // namespace TNL diff --git a/src/TNL/Algorithms/detail/Sorting/algorithm.h b/src/TNL/Algorithms/detail/Sorting/algorithm.h deleted file mode 100644 index 58282afbd..000000000 --- a/src/TNL/Algorithms/detail/Sorting/algorithm.h +++ /dev/null @@ -1,20 +0,0 @@ -#pragma once -#include -#include - -template -bool is_sorted(TNL::Containers::ArrayView arr, const Function &Cmp) -{ - if (arr.getSize() <= 1) - return true; - - auto fetch = [=] __cuda_callable__(int i) { return !Cmp(arr[i], arr[i - 1]); }; - auto reduction = [] __cuda_callable__(bool a, bool b) { return a && b; }; - return TNL::Algorithms::reduce(1, arr.getSize(), fetch, reduction, true); -} - -template -bool is_sorted(TNL::Containers::ArrayView arr) -{ - return is_sorted(arr, [] __cuda_callable__(const Value &a, const Value &b) { return a < b; }); -} diff --git a/src/UnitTests/Algorithms/Sorting/QuicksortTest.h b/src/UnitTests/Algorithms/Sorting/QuicksortTest.h index a53e7f369..7a34c296a 100644 --- a/src/UnitTests/Algorithms/Sorting/QuicksortTest.h +++ b/src/UnitTests/Algorithms/Sorting/QuicksortTest.h @@ -6,7 +6,7 @@ #include #include -#include +#include #include #if defined HAVE_CUDA_&& defined HAVE_GTEST @@ -199,4 +199,4 @@ TEST(types, struct_64b) #endif -#include "../../main.h" \ No newline at end of file +#include "../../main.h" -- GitLab From a4a562a7a7eb7f1ab16ae416fa982227d8b979ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Tue, 13 Jul 2021 17:50:42 +0200 Subject: [PATCH 236/258] Moving TNL/Algorithms/details/Sorting into 
TNL/Algorithms/Sorting. --- src/TNL/Algorithms/Sort.h | 5 +-- .../{detail => }/Sorting/bitonicSort.h | 4 +- .../{detail => }/Sorting/blockBitonicSort.h | 4 +- .../{detail => }/Sorting/cudaPartition.h | 4 +- .../Algorithms/{detail => }/Sorting/helpers.h | 0 .../{detail => }/Sorting/quicksort.h | 4 +- .../{detail => }/Sorting/quicksort_1Block.h | 6 +-- .../{detail => }/Sorting/quicksort_kernel.h | 8 ++-- .../{detail => }/Sorting/reduction.h | 0 .../Algorithms/{detail => }/Sorting/task.h | 0 .../Algorithms/Sorting/BitonicSortTest.h | 44 +++++++++---------- .../Algorithms/Sorting/QuicksortTest.h | 16 +++---- 12 files changed, 47 insertions(+), 48 deletions(-) rename src/TNL/Algorithms/{detail => }/Sorting/bitonicSort.h (99%) rename src/TNL/Algorithms/{detail => }/Sorting/blockBitonicSort.h (98%) rename src/TNL/Algorithms/{detail => }/Sorting/cudaPartition.h (98%) rename src/TNL/Algorithms/{detail => }/Sorting/helpers.h (100%) rename src/TNL/Algorithms/{detail => }/Sorting/quicksort.h (99%) rename src/TNL/Algorithms/{detail => }/Sorting/quicksort_1Block.h (97%) rename src/TNL/Algorithms/{detail => }/Sorting/quicksort_kernel.h (97%) rename src/TNL/Algorithms/{detail => }/Sorting/reduction.h (100%) rename src/TNL/Algorithms/{detail => }/Sorting/task.h (100%) diff --git a/src/TNL/Algorithms/Sort.h b/src/TNL/Algorithms/Sort.h index cd61707b0..f0724165a 100644 --- a/src/TNL/Algorithms/Sort.h +++ b/src/TNL/Algorithms/Sort.h @@ -17,9 +17,8 @@ #include #include #include -#include -#include -#include +#include +#include namespace TNL { namespace Algorithms { diff --git a/src/TNL/Algorithms/detail/Sorting/bitonicSort.h b/src/TNL/Algorithms/Sorting/bitonicSort.h similarity index 99% rename from src/TNL/Algorithms/detail/Sorting/bitonicSort.h rename to src/TNL/Algorithms/Sorting/bitonicSort.h index 8ccf104d2..461ac187b 100644 --- a/src/TNL/Algorithms/detail/Sorting/bitonicSort.h +++ b/src/TNL/Algorithms/Sorting/bitonicSort.h @@ -1,7 +1,7 @@ #pragma once #include -#include -#include 
+#include +#include namespace TNL { namespace Algorithms { diff --git a/src/TNL/Algorithms/detail/Sorting/blockBitonicSort.h b/src/TNL/Algorithms/Sorting/blockBitonicSort.h similarity index 98% rename from src/TNL/Algorithms/detail/Sorting/blockBitonicSort.h rename to src/TNL/Algorithms/Sorting/blockBitonicSort.h index 931f154d2..5167db973 100644 --- a/src/TNL/Algorithms/detail/Sorting/blockBitonicSort.h +++ b/src/TNL/Algorithms/Sorting/blockBitonicSort.h @@ -1,5 +1,5 @@ #pragma once -#include +#include #include #ifdef HAVE_CUDA @@ -101,4 +101,4 @@ __device__ void bitonicSort_Block(TNL::Containers::ArrayView -#include -#include +#include +#include #ifdef HAVE_CUDA diff --git a/src/TNL/Algorithms/detail/Sorting/helpers.h b/src/TNL/Algorithms/Sorting/helpers.h similarity index 100% rename from src/TNL/Algorithms/detail/Sorting/helpers.h rename to src/TNL/Algorithms/Sorting/helpers.h diff --git a/src/TNL/Algorithms/detail/Sorting/quicksort.h b/src/TNL/Algorithms/Sorting/quicksort.h similarity index 99% rename from src/TNL/Algorithms/detail/Sorting/quicksort.h rename to src/TNL/Algorithms/Sorting/quicksort.h index beb564691..545137464 100644 --- a/src/TNL/Algorithms/detail/Sorting/quicksort.h +++ b/src/TNL/Algorithms/Sorting/quicksort.h @@ -3,8 +3,8 @@ #include #include #include -#include -#include +#include +#include #include #define deb(x) std::cout << #x << " = " << x << std::endl; diff --git a/src/TNL/Algorithms/detail/Sorting/quicksort_1Block.h b/src/TNL/Algorithms/Sorting/quicksort_1Block.h similarity index 97% rename from src/TNL/Algorithms/detail/Sorting/quicksort_1Block.h rename to src/TNL/Algorithms/Sorting/quicksort_1Block.h index 88be0b803..dce65bb1f 100644 --- a/src/TNL/Algorithms/detail/Sorting/quicksort_1Block.h +++ b/src/TNL/Algorithms/Sorting/quicksort_1Block.h @@ -2,9 +2,9 @@ #include #include "cassert" -#include -#include -#include +#include +#include +#include using namespace TNL; using namespace TNL::Containers; diff --git 
a/src/TNL/Algorithms/detail/Sorting/quicksort_kernel.h b/src/TNL/Algorithms/Sorting/quicksort_kernel.h similarity index 97% rename from src/TNL/Algorithms/detail/Sorting/quicksort_kernel.h rename to src/TNL/Algorithms/Sorting/quicksort_kernel.h index 4f9d82d07..6da60041d 100644 --- a/src/TNL/Algorithms/detail/Sorting/quicksort_kernel.h +++ b/src/TNL/Algorithms/Sorting/quicksort_kernel.h @@ -2,10 +2,10 @@ #include #include -#include -#include -#include -#include +#include +#include +#include +#include using namespace TNL; using namespace TNL::Containers; diff --git a/src/TNL/Algorithms/detail/Sorting/reduction.h b/src/TNL/Algorithms/Sorting/reduction.h similarity index 100% rename from src/TNL/Algorithms/detail/Sorting/reduction.h rename to src/TNL/Algorithms/Sorting/reduction.h diff --git a/src/TNL/Algorithms/detail/Sorting/task.h b/src/TNL/Algorithms/Sorting/task.h similarity index 100% rename from src/TNL/Algorithms/detail/Sorting/task.h rename to src/TNL/Algorithms/Sorting/task.h diff --git a/src/UnitTests/Algorithms/Sorting/BitonicSortTest.h b/src/UnitTests/Algorithms/Sorting/BitonicSortTest.h index 2e069e5bf..e7878199e 100644 --- a/src/UnitTests/Algorithms/Sorting/BitonicSortTest.h +++ b/src/UnitTests/Algorithms/Sorting/BitonicSortTest.h @@ -6,8 +6,8 @@ #include #include -#include -#include +#include +#include #if defined HAVE_GTEST && defined HAVE_CUDA #include @@ -32,7 +32,7 @@ TEST(permutations, allPermutationSize_2_to_7) bitonicSort(view); - EXPECT_TRUE(is_sorted(view)) << "failed " << i << std::endl; + EXPECT_TRUE( Algorithms::isSorted( view ) ) << "failed " << i << std::endl; } while (std::next_permutation(orig.begin(), orig.end())); } @@ -57,7 +57,7 @@ TEST(permutations, allPermutationSize_8) bitonicSort(view); - EXPECT_TRUE(is_sorted(view)) << "result " << view << std::endl; + EXPECT_TRUE(Algorithms::isSorted(view)) << "result " << view << std::endl; } while (std::next_permutation(orig.begin(), orig.end())); } @@ -81,7 +81,7 @@ TEST(permutations, 
somePermutationSize9) bitonicSort(view); - EXPECT_TRUE(is_sorted(view)) << "result " << view << std::endl; + EXPECT_TRUE(Algorithms::isSorted(view)) << "result " << view << std::endl; } while (std::next_permutation(orig.begin(), orig.end())); } @@ -92,7 +92,7 @@ TEST(selectedSize, size15) auto view = cudaArr.getView(); EXPECT_EQ(15, view.getSize()) << "size not 15" << std::endl; bitonicSort(view); - EXPECT_TRUE(is_sorted(view)) << "result " << view << std::endl; + EXPECT_TRUE(Algorithms::isSorted(view)) << "result " << view << std::endl; } TEST(multiblock, 32768_decreasingNegative) @@ -105,7 +105,7 @@ TEST(multiblock, 32768_decreasingNegative) auto view = cudaArr.getView(); bitonicSort(view); - EXPECT_TRUE(is_sorted(view)) << "result " << view << std::endl; + EXPECT_TRUE(Algorithms::isSorted(view)) << "result " << view << std::endl; } TEST(randomGenerated, smallArray_randomVal) @@ -121,7 +121,7 @@ TEST(randomGenerated, smallArray_randomVal) auto view = cudaArr.getView(); bitonicSort(view); - EXPECT_TRUE(is_sorted(view)); + EXPECT_TRUE(Algorithms::isSorted(view)); } } @@ -136,7 +136,7 @@ TEST(randomGenerated, bigArray_all0) auto view = cudaArr.getView(); bitonicSort(view); - EXPECT_TRUE(is_sorted(view)); + EXPECT_TRUE(Algorithms::isSorted(view)); } } @@ -145,7 +145,7 @@ TEST(nonIntegerType, float_notPow2) TNL::Containers::Array cudaArr{5.0, 9.4, 4.6, 8.9, 6.2, 1.15184, 2.23}; auto view = cudaArr.getView(); bitonicSort(view); - EXPECT_TRUE(is_sorted(view)) << "result " << view << std::endl; + EXPECT_TRUE(Algorithms::isSorted(view)) << "result " << view << std::endl; } TEST(nonIntegerType, double_notPow2) @@ -153,7 +153,7 @@ TEST(nonIntegerType, double_notPow2) TNL::Containers::Array cudaArr{5.0, 9.4, 4.6, 8.9, 6.2, 1.15184, 2.23}; auto view = cudaArr.getView(); bitonicSort(view); - EXPECT_TRUE(is_sorted(view)) << "result " << view << std::endl; + EXPECT_TRUE(Algorithms::isSorted(view)) << "result " << view << std::endl; } @@ -171,7 +171,7 @@ TEST(nonIntegerType, 
struct) TNL::Containers::Array cudaArr{TMPSTRUCT(5), TMPSTRUCT(6), TMPSTRUCT(9), TMPSTRUCT(1)}; auto view = cudaArr.getView(); bitonicSort(view); - EXPECT_TRUE(is_sorted(view)); + EXPECT_TRUE(Algorithms::isSorted(view)); } struct TMPSTRUCT_64b{ @@ -193,7 +193,7 @@ TEST(nonIntegerType, struct_64b) TNL::Containers::Array cudaArr(vec); auto view = cudaArr.getView(); bitonicSort(view); - EXPECT_TRUE(is_sorted(view)); + EXPECT_TRUE(Algorithms::isSorted(view)); } struct TMPSTRUCT_128b{ @@ -215,7 +215,7 @@ TEST(nonIntegerType, struct_128b) TNL::Containers::Array cudaArr(vec); auto view = cudaArr.getView(); bitonicSort(view); - EXPECT_TRUE(is_sorted(view)); + EXPECT_TRUE(Algorithms::isSorted(view)); } //error bypassing @@ -232,7 +232,7 @@ TEST(sortWithFunction, descending) auto view = cudaArr.getView(); descendingSort(view); - EXPECT_FALSE(is_sorted(view)) << "result " << view << std::endl; + EXPECT_FALSE(Algorithms::isSorted(view)) << "result " << view << std::endl; EXPECT_TRUE(view.getElement(0) == 9); EXPECT_TRUE(view.getElement(1) == 6); @@ -249,7 +249,7 @@ TEST(sortstdVector, stdvector) bitonicSort(arr); - EXPECT_TRUE(std::is_sorted(arr.begin(), arr.end())); + EXPECT_TRUE(std::Algorithms::isSorted(arr.begin(), arr.end())); } TEST(sortRange, secondHalf) @@ -261,7 +261,7 @@ TEST(sortRange, secondHalf) bitonicSort(arr, s, 19); - EXPECT_TRUE(std::is_sorted(arr.begin() + s, arr.end())); + EXPECT_TRUE(std::Algorithms::isSorted(arr.begin() + s, arr.end())); EXPECT_TRUE(arr[0] == -1); EXPECT_TRUE(arr[s-1] == -1); } @@ -279,7 +279,7 @@ TEST(sortRange, middle) bitonicSort(arr, s, e); - EXPECT_TRUE(std::is_sorted(arr.begin() + s, arr.begin() + e)); + EXPECT_TRUE(std::Algorithms::isSorted(arr.begin() + s, arr.begin() + e)); EXPECT_TRUE(arr[0] == -1); EXPECT_TRUE(arr.back() == -1); EXPECT_TRUE(arr[s-1] == -1); @@ -300,7 +300,7 @@ TEST(sortRange, middleMultiBlock) bitonicSort(arr, s, e); - EXPECT_TRUE(std::is_sorted(arr.begin() + s, arr.begin() + e)); + 
EXPECT_TRUE(std::Algorithms::isSorted(arr.begin() + s, arr.begin() + e)); EXPECT_TRUE(arr[0] == -1); EXPECT_TRUE(arr[std::rand() % s] == -1); @@ -337,7 +337,7 @@ TEST(fetchAndSwap, oneBlockSort) TNL::Containers::Array cudaArr(orig); auto view = cudaArr.getView(); fetchAndSwapSorter(view); - EXPECT_TRUE(is_sorted(view)) << "result " << view << std::endl; + EXPECT_TRUE(Algorithms::isSorted(view)) << "result " << view << std::endl; } while (std::next_permutation(orig.begin(), orig.end())); } @@ -353,7 +353,7 @@ TEST(fetchAndSwap, typeDouble) TNL::Containers::Array cudaArr(orig); auto view = cudaArr.getView(); fetchAndSwapSorter(view); - EXPECT_TRUE(is_sorted(view)) << "result " << view << std::endl; + EXPECT_TRUE(Algorithms::isSorted(view)) << "result " << view << std::endl; } while (std::next_permutation(orig.begin(), orig.end())); } @@ -374,7 +374,7 @@ TEST(fetchAndSwap, sortMiddle) int from = 3, to = 8; fetchAndSwap_sortMiddle(view, from, to); - EXPECT_TRUE(is_sorted(view.getView(3, 8))) << "result " << view << std::endl; + EXPECT_TRUE(Algorithms::isSorted(view.getView(3, 8))) << "result " << view << std::endl; for(size_t i = 0; i < orig.size(); i++) { diff --git a/src/UnitTests/Algorithms/Sorting/QuicksortTest.h b/src/UnitTests/Algorithms/Sorting/QuicksortTest.h index 7a34c296a..f8a6fe6da 100644 --- a/src/UnitTests/Algorithms/Sorting/QuicksortTest.h +++ b/src/UnitTests/Algorithms/Sorting/QuicksortTest.h @@ -6,8 +6,8 @@ #include #include -#include -#include +#include +#include #if defined HAVE_CUDA_&& defined HAVE_GTEST #include @@ -25,7 +25,7 @@ TEST(selectedSize, size15) auto view = cudaArr.getView(); EXPECT_EQ(15, view.getSize()) << "size not 15" << std::endl; quicksort(view); - EXPECT_TRUE(is_sorted(view)) << "result " << view << std::endl; + EXPECT_TRUE(Algorithms::isSorted(view)) << "result " << view << std::endl; } TEST(multiblock, 32768_decreasingNegative) @@ -38,7 +38,7 @@ TEST(multiblock, 32768_decreasingNegative) auto view = cudaArr.getView(); 
quicksort(view); - EXPECT_TRUE(is_sorted(view)) << "result " << view << std::endl; + EXPECT_TRUE(Algorithms::isSorted(view)) << "result " << view << std::endl; } TEST(randomGenerated, smallArray_randomVal) @@ -54,7 +54,7 @@ TEST(randomGenerated, smallArray_randomVal) auto view = cudaArr.getView(); quicksort(view); - EXPECT_TRUE(is_sorted(view)); + EXPECT_TRUE(Algorithms::isSorted(view)); } } @@ -71,7 +71,7 @@ TEST(randomGenerated, bigArray_randomVal) auto view = cudaArr.getView(); quicksort(view); - EXPECT_TRUE(is_sorted(view)); + EXPECT_TRUE(Algorithms::isSorted(view)); } } @@ -169,7 +169,7 @@ TEST(types, struct_3D_points) //thrust::sort(thrust::device, cudaArr.getData(), cudaArr.getData() + cudaArr.getSize()); //std::cout << view << std::endl; quicksort(view); - EXPECT_TRUE(is_sorted(view)); + EXPECT_TRUE(Algorithms::isSorted(view)); } struct TMPSTRUCT_64b{ @@ -194,7 +194,7 @@ TEST(types, struct_64b) //thrust::sort(thrust::device, cudaArr.getData(), cudaArr.getData() + cudaArr.getSize()); //std::cout << view << std::endl; quicksort(view); - EXPECT_TRUE(is_sorted(view)); + EXPECT_TRUE(Algorithms::isSorted(view)); } #endif -- GitLab From 3f12f3df3f918e04061e4b6be9085d4c67e53703 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Wed, 14 Jul 2021 10:24:35 +0200 Subject: [PATCH 237/258] Refactoring quicksort. 
--- src/Benchmarks/Sorting/Measurer.h | 4 +- src/TNL/Algorithms/Sort.h | 2 +- src/TNL/Algorithms/Sorting/Quicksort.h | 43 +++ src/TNL/Algorithms/Sorting/bitonicSort.h | 9 +- src/TNL/Algorithms/Sorting/blockBitonicSort.h | 2 +- .../Algorithms/Sorting/detail/Quicksorter.h | 111 ++++++ .../{quicksort.h => detail/Quicksorter.hpp} | 346 ++++++++---------- .../Sorting/{ => detail}/cudaPartition.h | 16 +- .../Algorithms/Sorting/{ => detail}/helpers.h | 12 + .../Sorting/{ => detail}/quicksort_1Block.h | 16 +- .../Sorting/{ => detail}/quicksort_kernel.h | 30 +- .../Sorting/{ => detail}/reduction.h | 12 + .../Algorithms/Sorting/{ => detail}/task.h | 12 + .../Algorithms/Sorting/BitonicSortTest.h | 20 +- .../Algorithms/Sorting/QuicksortTest.h | 235 ++++++------ 15 files changed, 535 insertions(+), 335 deletions(-) create mode 100644 src/TNL/Algorithms/Sorting/Quicksort.h create mode 100644 src/TNL/Algorithms/Sorting/detail/Quicksorter.h rename src/TNL/Algorithms/Sorting/{quicksort.h => detail/Quicksorter.hpp} (63%) rename src/TNL/Algorithms/Sorting/{ => detail}/cudaPartition.h (91%) rename src/TNL/Algorithms/Sorting/{ => detail}/helpers.h (61%) rename src/TNL/Algorithms/Sorting/{ => detail}/quicksort_1Block.h (92%) rename src/TNL/Algorithms/Sorting/{ => detail}/quicksort_kernel.h (91%) rename src/TNL/Algorithms/Sorting/{ => detail}/reduction.h (84%) rename src/TNL/Algorithms/Sorting/{ => detail}/task.h (72%) diff --git a/src/Benchmarks/Sorting/Measurer.h b/src/Benchmarks/Sorting/Measurer.h index d41f70133..dee607410 100644 --- a/src/Benchmarks/Sorting/Measurer.h +++ b/src/Benchmarks/Sorting/Measurer.h @@ -17,7 +17,9 @@ using namespace TNL; struct QuicksortSorter { template< typename Array > - static void sort( Array& array ) { Algorithms::detail::quicksort( array ); }; + static void sort( Array& array ) { + Algorithms::Sorting::Quicksort::sort( array ); + }; }; struct BitonicSortSorter diff --git a/src/TNL/Algorithms/Sort.h b/src/TNL/Algorithms/Sort.h index 
f0724165a..629e6a846 100644 --- a/src/TNL/Algorithms/Sort.h +++ b/src/TNL/Algorithms/Sort.h @@ -18,7 +18,7 @@ #include #include #include -#include +#include namespace TNL { namespace Algorithms { diff --git a/src/TNL/Algorithms/Sorting/Quicksort.h b/src/TNL/Algorithms/Sorting/Quicksort.h new file mode 100644 index 000000000..4df0bf471 --- /dev/null +++ b/src/TNL/Algorithms/Sorting/Quicksort.h @@ -0,0 +1,43 @@ +/*************************************************************************** + Quicksort.h - description + ------------------- + begin : Jul 14, 2021 + copyright : (C) 2021 by Tomas Oberhuber et al. + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Xuan Thang Nguyen, Tomas Oberhuber + +#pragma once + +#include + +namespace TNL { + namespace Algorithms { + namespace Sorting { + +struct Quicksort +{ + template< typename Array > + void static sort( Array& array ) + { + Quicksorter< typename Array::ValueType, typename Array::DeviceType > qs; + qs.sort( array ); + } + + template< typename Array, typename Compare > + void static sort( Array& array, const Compare& compare ) + { + Quicksorter< typename Array::ValueType, typename Array::DeviceType > qs; + qs.sort( array, compare ); + } + +}; + + } // namespace Sorting + } // namespace Algorithms +} //namespace TNL + + diff --git a/src/TNL/Algorithms/Sorting/bitonicSort.h b/src/TNL/Algorithms/Sorting/bitonicSort.h index 461ac187b..1d42bed0c 100644 --- a/src/TNL/Algorithms/Sorting/bitonicSort.h +++ b/src/TNL/Algorithms/Sorting/bitonicSort.h @@ -1,7 +1,7 @@ #pragma once #include #include -#include +#include namespace TNL { namespace Algorithms { @@ -304,6 +304,13 @@ void bitonicSort(std::vector &vec) bitonicSort(vec, [] __cuda_callable__(const Value &a, const Value &b) { return a < b; }); } +template +void bitonicSort( TNL::Containers::Array< Value, TNL::Devices::Host > &vec) +{ + 
bitonicSort(vec, [] __cuda_callable__(const Value &a, const Value &b) { return a < b; }); +} + + //--------------------------------------------- //--------------------------------------------- diff --git a/src/TNL/Algorithms/Sorting/blockBitonicSort.h b/src/TNL/Algorithms/Sorting/blockBitonicSort.h index 5167db973..413d74456 100644 --- a/src/TNL/Algorithms/Sorting/blockBitonicSort.h +++ b/src/TNL/Algorithms/Sorting/blockBitonicSort.h @@ -1,5 +1,5 @@ #pragma once -#include +#include #include #ifdef HAVE_CUDA diff --git a/src/TNL/Algorithms/Sorting/detail/Quicksorter.h b/src/TNL/Algorithms/Sorting/detail/Quicksorter.h new file mode 100644 index 000000000..3f891fa59 --- /dev/null +++ b/src/TNL/Algorithms/Sorting/detail/Quicksorter.h @@ -0,0 +1,111 @@ +/*************************************************************************** + Quicksorter.h - description + ------------------- + begin : Jul 13, 2021 + copyright : (C) 2021 by Tomas Oberhuber et al. + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Xuan Thang Nguyen, Tomas Oberhuber + +#pragma once + +#include +#include +#include +#include + +namespace TNL { + namespace Algorithms { + namespace Sorting { + +template< typename Value, typename Device > +class Quicksorter; + +template< typename Value > +class Quicksorter< Value, Devices::Cuda > +{ + public: + + using ValueType = Value; + using DeviceType = Devices::Cuda; + + + template< typename Array, typename Compare > + void sort( Array& arr, const Compare& cmp ); + + template< typename Array > + void sort( Array& arr ); + + protected: + + void init( Containers::ArrayView arr, int gridDim, int blockDim, int desiredElemPerBlock, int maxSharable); + + template< typename CMP > + void performSort( const CMP &Cmp ); + + + /** + * returns how many blocks are needed to start sort phase 1 if @param elemPerBlock were to be used + * */ + int 
getSetsNeeded(int elemPerBlock) const; + + /** + * returns the optimal amount of elements per thread needed for phase + * */ + int getElemPerBlock() const; + + /** + * returns the amount of blocks needed to start phase 1 while also initializing all tasks + * */ + template< typename CMP > + int initTasks(int elemPerBlock, const CMP &Cmp); + + /** + * does the 1st phase of Quicksort until out of task memory or each task is small enough + * for correctness, secondphase method needs to be called to sort each subsequences + * */ + template + void firstPhase(const CMP &Cmp); + + /** + * update necessary variables after 1 phase1 sort + * */ + void processNewTasks(); + + /** + * sorts all leftover tasks + * */ + template + void secondPhase( const CMP &Cmp) ; + + int maxBlocks, threadsPerBlock, desiredElemPerBlock, maxSharable; //kernel config + + Containers::Array auxMem; + Containers::ArrayView arr, aux; + + int desired_2ndPhasElemPerBlock; + const int g_maxTasks = 1 << 14; + int maxTasks; + + + Containers::Array cuda_tasks, cuda_newTasks, cuda_2ndPhaseTasks; //1 set of 2 rotating tasks and 2nd phase + Containers::Array cuda_newTasksAmount, cuda_2ndPhaseTasksAmount; //is in reality 1 integer each + + Containers::Array cuda_blockToTaskMapping; + Containers::Vector cuda_reductionTaskInitMem; + + int host_1stPhaseTasksAmount = 0, host_2ndPhaseTasksAmount = 0; + int iteration = 0; + + template< typename T > + friend int getSetsNeededFunction(int elemPerBlock, const Quicksorter< T, Devices::Cuda >& quicksort ); +}; + + } // namespace Sorting + } // namespace Algorithms +}// namespace TNL + +#include diff --git a/src/TNL/Algorithms/Sorting/quicksort.h b/src/TNL/Algorithms/Sorting/detail/Quicksorter.hpp similarity index 63% rename from src/TNL/Algorithms/Sorting/quicksort.h rename to src/TNL/Algorithms/Sorting/detail/Quicksorter.hpp index 545137464..7c6a33ef9 100644 --- a/src/TNL/Algorithms/Sorting/quicksort.h +++ b/src/TNL/Algorithms/Sorting/detail/Quicksorter.hpp @@ -1,136 
+1,133 @@ -#pragma once +/*************************************************************************** + Quicksorter.h - description + ------------------- + begin : Jul 13, 2021 + copyright : (C) 2021 by Tomas Oberhuber et al. + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ -#include -#include -#include -#include -#include +/* See Copyright Notice in tnl/Copyright */ -#include -#define deb(x) std::cout << #x << " = " << x << std::endl; +// Implemented by: Xuan Thang Nguyen, Tomas Oberhuber -#ifdef CHECK_RESULT_SORT -#include "../util/algorithm.h" -#include -#endif +#pragma once -using namespace TNL; -using namespace TNL::Containers; +#include +#include +#include +#include +#include namespace TNL { namespace Algorithms { - namespace detail { - -template -class QUICKSORT -{ - int maxBlocks, threadsPerBlock, desiredElemPerBlock, maxSharable; //kernel config - - //-------------------------------------- - - Array auxMem; - ArrayView arr, aux; + namespace Sorting { - //-------------------------------------- - int desired_2ndPhasElemPerBlock; - const int g_maxTasks = 1 << 14; - int maxTasks; - - //-------------------------------------- - - //cuda side task initialization and storing - Array cuda_tasks, cuda_newTasks, cuda_2ndPhaseTasks; //1 set of 2 rotating tasks and 2nd phase - Array cuda_newTasksAmount, cuda_2ndPhaseTasksAmount; //is in reality 1 integer each - - Array cuda_blockToTaskMapping; - Vector cuda_reductionTaskInitMem; - - //-------------------------------------- - - int host_1stPhaseTasksAmount = 0, host_2ndPhaseTasksAmount = 0; - int iteration = 0; - - //-------------------------------------------------------------------------------------- - //-------------------------------------------------------------------------------------- -public: - QUICKSORT(ArrayView arr, int gridDim, int blockDim, int desiredElemPerBlock, int maxSharable) - : maxBlocks(gridDim), threadsPerBlock(blockDim), - 
desiredElemPerBlock(desiredElemPerBlock), maxSharable(maxSharable), +template< typename Value > + template< typename Array, typename Compare > +void +Quicksorter< Value, Devices::Cuda >:: +sort( Array& arr, const Compare& cmp ) +{ +#ifdef HAVE_CUDA + cudaDeviceProp deviceProp; + cudaGetDeviceProperties(&deviceProp, 0); - arr(arr.getView()), auxMem(arr.getSize()), aux(auxMem.getView()), + /** + * for every block there is a bit of shared memory reserved, the actual value can slightly differ + * */ + int sharedReserve = sizeof(int) * (16 + 3 * 32); + int maxSharable = deviceProp.sharedMemPerBlock - sharedReserve; - desired_2ndPhasElemPerBlock(desiredElemPerBlock), - maxTasks(min(arr.getSize(), g_maxTasks)), + int blockDim = 512; //best case - cuda_tasks(maxTasks), cuda_newTasks(maxTasks), cuda_2ndPhaseTasks(maxTasks), - cuda_newTasksAmount(1), cuda_2ndPhaseTasksAmount(1), + /** + * the goal is to use shared memory as often as possible + * each thread in a block will process n elements, n==multiplier + * + 1 reserved for pivot (statically allocating Value type throws weird error, hence it needs to be dynamic) + * + * blockDim*multiplier*sizeof(Value) + 1*sizeof(Value) <= maxSharable + * */ + int elemPerBlock = (maxSharable - sizeof(Value)) / sizeof(Value); //try to use up all of shared memory to store elements + const int maxBlocks = (1 << 20); + const int maxMultiplier = 8; + int multiplier = min(elemPerBlock / blockDim, maxMultiplier); - cuda_blockToTaskMapping(maxBlocks), - cuda_reductionTaskInitMem(maxTasks) + if (multiplier <= 0) //a block cant store 512 elements, sorting some really big data { - if (arr.getSize() > desired_2ndPhasElemPerBlock) - { - cuda_tasks.setElement(0, TASK(0, arr.getSize(), 0)); - host_1stPhaseTasksAmount = 1; - } - else + blockDim = 256; //try to fit 256 elements + multiplier = min(elemPerBlock / blockDim, maxMultiplier); + + if (multiplier <= 0) { - cuda_2ndPhaseTasks.setElement(0, TASK(0, arr.getSize(), 0)); - host_2ndPhaseTasksAmount = 
1; - } + //worst case scenario, shared memory cant be utilized at all because of the sheer size of Value + //sort has to be done with the use of global memory alone - cuda_2ndPhaseTasksAmount = 0; - TNL_CHECK_CUDA_DEVICE; + this->init(arr, maxBlocks, 512, 0, 0); + this->performSort( cmp ); + return; + } } - //-------------------------------------------------------------------------------------- - - template - void sort(const CMP &Cmp); - //-------------------------------------------------------------------------------------- + TNL_ASSERT_LE( blockDim * multiplier * sizeof(Value), maxSharable,"" ); - /** - * returns how many blocks are needed to start sort phase 1 if @param elemPerBlock were to be used - * */ - int getSetsNeeded(int elemPerBlock) const; - - /** - * returns the optimal amount of elements per thread needed for phase - * */ - int getElemPerBlock() const; - - /** - * returns the amount of blocks needed to start phase 1 while also initializing all tasks - * */ - template - int initTasks(int elemPerBlock, const CMP &Cmp); - - /** - * does the 1st phase of quicksort until out of task memory or each task is small enough - * for correctness, secondphase method needs to be called to sort each subsequences - * */ - template - void firstPhase(const CMP &Cmp); + this->init(arr, maxBlocks, blockDim, multiplier * blockDim, maxSharable); + this->performSort( cmp ); +#endif +} - /** - * update necessary variables after 1 phase1 sort - * */ - void processNewTasks(); +template< typename Value > + template< typename Array > +void +Quicksorter< Value, Devices::Cuda >:: +sort( Array& arr ) +{ + this->sort(arr, [] __cuda_callable__( const Value& a, const Value& b ) { return a < b; } ); +} - /** - * sorts all leftover tasks - * */ - template - void secondPhase(const CMP &Cmp); -}; +template< typename Value > +void +Quicksorter< Value, Devices::Cuda >:: +init( ArrayView arr, int gridDim, int blockDim, int desiredElemPerBlock, int maxSharable) +{ + this->maxBlocks = gridDim; 
+ this->threadsPerBlock = blockDim; + this->desiredElemPerBlock = desiredElemPerBlock; + this->maxSharable = maxSharable; + this->arr.bind( arr ); + this->auxMem.setSize( arr.getSize() ); + this->aux.bind( auxMem.getView() ); + this->desired_2ndPhasElemPerBlock = desiredElemPerBlock; + this->maxTasks = min( arr.getSize(), g_maxTasks ); + this->cuda_tasks.setSize(maxTasks); + this->cuda_newTasks.setSize(maxTasks); + this->cuda_2ndPhaseTasks.setSize(maxTasks); + this->cuda_newTasksAmount.setSize(1); + this->cuda_2ndPhaseTasksAmount.setSize(1); + this->cuda_blockToTaskMapping.setSize(maxBlocks); + this->cuda_reductionTaskInitMem.setSize(maxTasks); + + if (arr.getSize() > desired_2ndPhasElemPerBlock) + { + cuda_tasks.setElement(0, TASK(0, arr.getSize(), 0)); + host_1stPhaseTasksAmount = 1; + } + else + { + cuda_2ndPhaseTasks.setElement(0, TASK(0, arr.getSize(), 0)); + host_2ndPhaseTasksAmount = 1; + } + + cuda_2ndPhaseTasksAmount = 0; + TNL_CHECK_CUDA_DEVICE; +} -//--------------------------------------------------------------------------------------------- -//--------------------------------------------------------------------------------------------- -template -template -void QUICKSORT::sort(const CMP &Cmp) +template< typename Value > + template< typename CMP > +void +Quicksorter< Value, Devices::Cuda >:: +performSort( const CMP &Cmp ) { #ifdef HAVE_CUDA firstPhase(Cmp); @@ -161,11 +158,11 @@ void QUICKSORT::sort(const CMP &Cmp) #endif } -//--------------------------------------------------------------------------------------------- - -template -template -void QUICKSORT::firstPhase(const CMP &Cmp) +template< typename Value > + template< typename CMP > +void +Quicksorter< Value, Devices::Cuda >:: +firstPhase( const CMP &Cmp ) { #ifdef HAVE_CUDA while (host_1stPhaseTasksAmount > 0) @@ -265,11 +262,11 @@ void QUICKSORT::firstPhase(const CMP &Cmp) #endif } -//---------------------------------------------------------------------- - -template -template -void 
QUICKSORT::secondPhase(const CMP &Cmp) +template< typename Value > + template< typename CMP > +void +Quicksorter< Value, Devices::Cuda >:: +secondPhase(const CMP &Cmp) { #ifdef HAVE_CUDA int total2ndPhase = host_1stPhaseTasksAmount + host_2ndPhaseTasksAmount; @@ -308,23 +305,39 @@ void QUICKSORT::secondPhase(const CMP &Cmp) #endif } -//---------------------------------------------------------------------- +template< typename Value > +int getSetsNeededFunction(int elemPerBlock, const Quicksorter< Value, Devices::Cuda >& quicksort ) +{ + auto view = quicksort.iteration % 2 == 0 ? quicksort.cuda_tasks.getConstView() : quicksort.cuda_newTasks.getConstView(); + auto fetch = [=] __cuda_callable__(int i) -> int { + const auto &task = view[i]; + int size = task.partitionEnd - task.partitionBegin; + return size / elemPerBlock + (size % elemPerBlock != 0); + }; + auto reduction = [] __cuda_callable__(int a, int b) { return a + b; }; + return Algorithms::reduce( 0, quicksort.host_1stPhaseTasksAmount, fetch, reduction, 0 ); +} -template -int QUICKSORT::getSetsNeeded(int elemPerBlock) const +template< typename Value > +int +Quicksorter< Value, Devices::Cuda >:: +getSetsNeeded(int elemPerBlock) const { - auto view = iteration % 2 == 0 ? cuda_tasks.getConstView() : cuda_newTasks.getConstView(); + /*auto view = iteration % 2 == 0 ? 
cuda_tasks.getConstView() : cuda_newTasks.getConstView(); auto fetch = [=] __cuda_callable__(int i) { const auto &task = view[i]; int size = task.partitionEnd - task.partitionBegin; return size / elemPerBlock + (size % elemPerBlock != 0); }; auto reduction = [] __cuda_callable__(int a, int b) { return a + b; }; - return Algorithms::reduce(0, host_1stPhaseTasksAmount, fetch, reduction, 0); + return Algorithms::reduce(0, host_1stPhaseTasksAmount, fetch, reduction, 0);*/ + return getSetsNeededFunction< Value >( elemPerBlock, *this ); } -template -int QUICKSORT::getElemPerBlock() const +template< typename Value > +int +Quicksorter< Value, Devices::Cuda >:: +getElemPerBlock() const { return desiredElemPerBlock; @@ -342,9 +355,11 @@ int QUICKSORT::getElemPerBlock() const return setsNeeded * threadsPerBlock; } -template -template -int QUICKSORT::initTasks(int elemPerBlock, const CMP &Cmp) +template< typename Value > + template< typename CMP > +int +Quicksorter< Value, Devices::Cuda >:: +initTasks(int elemPerBlock, const CMP &Cmp) { #ifdef HAVE_CUDA auto &src = iteration % 2 == 0 ? 
arr : aux; @@ -357,10 +372,10 @@ int QUICKSORT::initTasks(int elemPerBlock, const CMP &Cmp) cuda_reductionTaskInitMem.getView(0, host_1stPhaseTasksAmount)); //cuda_reductionTaskInitMem[i] == how many blocks task i needs - auto reduce = [] __cuda_callable__(const int &a, const int &b) { return a + b; }; + //auto reduce = [] __cuda_callable__(const int &a, const int &b) { return a + b; }; Algorithms::Scan:: - perform(cuda_reductionTaskInitMem, 0, cuda_reductionTaskInitMem.getSize(), reduce, 0); + perform(cuda_reductionTaskInitMem, 0, cuda_reductionTaskInitMem.getSize(), TNL::Plus{}, 0); //cuda_reductionTaskInitMem[i] == how many blocks task [0..i] need int blocksNeeded = cuda_reductionTaskInitMem.getElement(host_1stPhaseTasksAmount - 1); @@ -385,72 +400,15 @@ int QUICKSORT::initTasks(int elemPerBlock, const CMP &Cmp) } template -void QUICKSORT::processNewTasks() +void +Quicksorter< Value, Devices::Cuda >:: +processNewTasks() { host_1stPhaseTasksAmount = min(cuda_newTasksAmount.getElement(0), maxTasks); host_2ndPhaseTasksAmount = min(cuda_2ndPhaseTasksAmount.getElement(0), maxTasks); } -//----------------------------------------------------------- -//----------------------------------------------------------- -//----------------------------------------------------------- - -template -void quicksort(ArrayView arr, const CMP &Cmp) -{ -#ifdef HAVE_CUDA - cudaDeviceProp deviceProp; - cudaGetDeviceProperties(&deviceProp, 0); - - /** - * for every block there is a bit of shared memory reserved, the actual value can slightly differ - * */ - int sharedReserve = sizeof(int) * (16 + 3 * 32); - int maxSharable = deviceProp.sharedMemPerBlock - sharedReserve; - - int blockDim = 512; //best case - - /** - * the goal is to use shared memory as often as possible - * each thread in a block will process n elements, n==multiplier - * + 1 reserved for pivot (statically allocating Value type throws weird error, hence it needs to be dynamic) - * - * blockDim*multiplier*sizeof(Value) + 
1*sizeof(Value) <= maxSharable - * */ - int elemPerBlock = (maxSharable - sizeof(Value)) / sizeof(Value); //try to use up all of shared memory to store elements - const int maxBlocks = (1 << 20); - const int maxMultiplier = 8; - int multiplier = min(elemPerBlock / blockDim, maxMultiplier); - - if (multiplier <= 0) //a block cant store 512 elements, sorting some really big data - { - blockDim = 256; //try to fit 256 elements - multiplier = min(elemPerBlock / blockDim, maxMultiplier); - - if (multiplier <= 0) - { - //worst case scenario, shared memory cant be utilized at all because of the sheer size of Value - //sort has to be done with the use of global memory alone - - QUICKSORT sorter(arr, maxBlocks, 512, 0, 0); - sorter.sort(Cmp); - return; - } - } - - assert(blockDim * multiplier * sizeof(Value) <= maxSharable); - - QUICKSORT sorter(arr, maxBlocks, blockDim, multiplier * blockDim, maxSharable); - sorter.sort(Cmp); -#endif -} - -template -void quicksort(ArrayView arr) -{ - quicksort(arr, [] __cuda_callable__(const Value &a, const Value &b) { return a < b; }); -} - } // namespace detail + } // namespace Sorting } // namespace Algorithms -}// namespace TNL +} // namespace TNL diff --git a/src/TNL/Algorithms/Sorting/cudaPartition.h b/src/TNL/Algorithms/Sorting/detail/cudaPartition.h similarity index 91% rename from src/TNL/Algorithms/Sorting/cudaPartition.h rename to src/TNL/Algorithms/Sorting/detail/cudaPartition.h index b01c2c509..c7a54dd84 100644 --- a/src/TNL/Algorithms/Sorting/cudaPartition.h +++ b/src/TNL/Algorithms/Sorting/detail/cudaPartition.h @@ -1,8 +1,20 @@ +/*************************************************************************** + cudaPartition.h - description + ------------------- + begin : Jul 13, 2021 + copyright : (C) 2021 by Tomas Oberhuber et al. 
+ email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Xuan Thang Nguyen + #pragma once #include -#include -#include +#include +#include #ifdef HAVE_CUDA diff --git a/src/TNL/Algorithms/Sorting/helpers.h b/src/TNL/Algorithms/Sorting/detail/helpers.h similarity index 61% rename from src/TNL/Algorithms/Sorting/helpers.h rename to src/TNL/Algorithms/Sorting/detail/helpers.h index 928314f2c..bf5f6f9d0 100644 --- a/src/TNL/Algorithms/Sorting/helpers.h +++ b/src/TNL/Algorithms/Sorting/detail/helpers.h @@ -1,3 +1,15 @@ +/*************************************************************************** + helpers.h - description + ------------------- + begin : Jul 13, 2021 + copyright : (C) 2021 by Tomas Oberhuber et al. + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Xuan Thang Nguyen + #pragma once #include diff --git a/src/TNL/Algorithms/Sorting/quicksort_1Block.h b/src/TNL/Algorithms/Sorting/detail/quicksort_1Block.h similarity index 92% rename from src/TNL/Algorithms/Sorting/quicksort_1Block.h rename to src/TNL/Algorithms/Sorting/detail/quicksort_1Block.h index dce65bb1f..48cc9cd4d 100644 --- a/src/TNL/Algorithms/Sorting/quicksort_1Block.h +++ b/src/TNL/Algorithms/Sorting/detail/quicksort_1Block.h @@ -1,10 +1,22 @@ +/*************************************************************************** + quicksort_1Block.h - description + ------------------- + begin : Jul 13, 2021 + copyright : (C) 2021 by Tomas Oberhuber et al. 
+ email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Xuan Thang Nguyen + #pragma once #include #include "cassert" #include -#include -#include +#include +#include using namespace TNL; using namespace TNL::Containers; diff --git a/src/TNL/Algorithms/Sorting/quicksort_kernel.h b/src/TNL/Algorithms/Sorting/detail/quicksort_kernel.h similarity index 91% rename from src/TNL/Algorithms/Sorting/quicksort_kernel.h rename to src/TNL/Algorithms/Sorting/detail/quicksort_kernel.h index 6da60041d..882316ac2 100644 --- a/src/TNL/Algorithms/Sorting/quicksort_kernel.h +++ b/src/TNL/Algorithms/Sorting/detail/quicksort_kernel.h @@ -1,14 +1,28 @@ +/*************************************************************************** + quicksort_kernel.h - description + ------------------- + begin : Jul 13, 2021 + copyright : (C) 2021 by Tomas Oberhuber et al. + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Xuan Thang Nguyen + #pragma once #include #include -#include -#include -#include -#include +#include +#include +#include +#include -using namespace TNL; -using namespace TNL::Containers; + +namespace TNL { + namespace Algorithms { + namespace Sorting { #ifdef HAVE_CUDA @@ -247,3 +261,7 @@ __global__ void cudaQuickSort2ndPhase(ArrayView arr, Array } #endif + + } // namespace Sorting + } // namespace Algorithms +} // namespace TNL diff --git a/src/TNL/Algorithms/Sorting/reduction.h b/src/TNL/Algorithms/Sorting/detail/reduction.h similarity index 84% rename from src/TNL/Algorithms/Sorting/reduction.h rename to src/TNL/Algorithms/Sorting/detail/reduction.h index 49b2ef7d6..e1406ec46 100644 --- a/src/TNL/Algorithms/Sorting/reduction.h +++ b/src/TNL/Algorithms/Sorting/detail/reduction.h @@ -1,3 +1,15 @@ 
+/*************************************************************************** + reduction.h - description + ------------------- + begin : Jul 13, 2021 + copyright : (C) 2021 by Tomas Oberhuber et al. + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Xuan Thang Nguyen + #pragma once #ifdef HAVE_CUDA diff --git a/src/TNL/Algorithms/Sorting/task.h b/src/TNL/Algorithms/Sorting/detail/task.h similarity index 72% rename from src/TNL/Algorithms/Sorting/task.h rename to src/TNL/Algorithms/Sorting/detail/task.h index e75758431..bafc5e64b 100644 --- a/src/TNL/Algorithms/Sorting/task.h +++ b/src/TNL/Algorithms/Sorting/detail/task.h @@ -1,3 +1,15 @@ +/*************************************************************************** + TASK.h - description + ------------------- + begin : Jul 13, 2021 + copyright : (C) 2021 by Tomas Oberhuber et al. + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Xuan Thang Nguyen + #pragma once struct TASK diff --git a/src/UnitTests/Algorithms/Sorting/BitonicSortTest.h b/src/UnitTests/Algorithms/Sorting/BitonicSortTest.h index e7878199e..4aa1d5db4 100644 --- a/src/UnitTests/Algorithms/Sorting/BitonicSortTest.h +++ b/src/UnitTests/Algorithms/Sorting/BitonicSortTest.h @@ -241,18 +241,18 @@ TEST(sortWithFunction, descending) EXPECT_TRUE(view.getElement(4) == 2); } -TEST(sortstdVector, stdvector) +/*TEST(sortHostArray, hostArray) { - std::vector arr(84561); - for(size_t i = 0; i < arr.size(); i++) + TNL::Containers::Array< int > arr( 84561 ); + for( size_t i = 0; i < arr.getSize(); i++ ) arr[i] = -i; bitonicSort(arr); - EXPECT_TRUE(std::Algorithms::isSorted(arr.begin(), arr.end())); -} + EXPECT_TRUE( TNL::Algorithms::isSorted(arr) ); +}*/ -TEST(sortRange, secondHalf) 
+/*TEST(sortRange, secondHalf) { std::vector arr(19); int s = 19/2; @@ -261,7 +261,7 @@ TEST(sortRange, secondHalf) bitonicSort(arr, s, 19); - EXPECT_TRUE(std::Algorithms::isSorted(arr.begin() + s, arr.end())); + EXPECT_TRUE(TNL::Algorithms::isSorted(arr.begin() + s, arr.end())); EXPECT_TRUE(arr[0] == -1); EXPECT_TRUE(arr[s-1] == -1); } @@ -279,7 +279,7 @@ TEST(sortRange, middle) bitonicSort(arr, s, e); - EXPECT_TRUE(std::Algorithms::isSorted(arr.begin() + s, arr.begin() + e)); + EXPECT_TRUE(TNL::Algorithms::isSorted(arr.begin() + s, arr.begin() + e)); EXPECT_TRUE(arr[0] == -1); EXPECT_TRUE(arr.back() == -1); EXPECT_TRUE(arr[s-1] == -1); @@ -300,7 +300,7 @@ TEST(sortRange, middleMultiBlock) bitonicSort(arr, s, e); - EXPECT_TRUE(std::Algorithms::isSorted(arr.begin() + s, arr.begin() + e)); + EXPECT_TRUE(TNL::Algorithms::isSorted(arr.begin() + s, arr.begin() + e)); EXPECT_TRUE(arr[0] == -1); EXPECT_TRUE(arr[std::rand() % s] == -1); @@ -309,7 +309,7 @@ TEST(sortRange, middleMultiBlock) EXPECT_TRUE(arr[e] == -1); EXPECT_TRUE(arr[e + (std::rand() % (size - e))] == -1); EXPECT_TRUE(arr.back() == -1); -} +}*/ template void fetchAndSwapSorter(TNL::Containers::ArrayView view) diff --git a/src/UnitTests/Algorithms/Sorting/QuicksortTest.h b/src/UnitTests/Algorithms/Sorting/QuicksortTest.h index f8a6fe6da..7a1333fbc 100644 --- a/src/UnitTests/Algorithms/Sorting/QuicksortTest.h +++ b/src/UnitTests/Algorithms/Sorting/QuicksortTest.h @@ -7,9 +7,9 @@ #include #include #include -#include +#include -#if defined HAVE_CUDA_&& defined HAVE_GTEST +#if defined HAVE_CUDA && defined HAVE_GTEST #include #include @@ -17,184 +17,185 @@ using namespace TNL; using namespace TNL::Algorithms; -using namespace TNL::Algorithms::detail; +using namespace TNL::Algorithms::Sorting; TEST(selectedSize, size15) { - TNL::Containers::Array cudaArr{5, 9, 4, 8, 6, 1, 2, 3, 4, 8, 1, 6, 9, 4, 9}; - auto view = cudaArr.getView(); - EXPECT_EQ(15, view.getSize()) << "size not 15" << std::endl; - quicksort(view); - 
EXPECT_TRUE(Algorithms::isSorted(view)) << "result " << view << std::endl; + TNL::Containers::Array cudaArr{5, 9, 4, 8, 6, 1, 2, 3, 4, 8, 1, 6, 9, 4, 9}; + auto view = cudaArr.getView(); + EXPECT_EQ(15, view.getSize()) << "size not 15" << std::endl; + Quicksort::sort( view ); + EXPECT_TRUE(Algorithms::isSorted(view)) << "result " << view << std::endl; } TEST(multiblock, 32768_decreasingNegative) { - std::vector arr(1<<15); - for (size_t i = 0; i < arr.size(); i++) - arr[i] = -i; + std::vector arr(1<<15); + for (size_t i = 0; i < arr.size(); i++) + arr[i] = -i; - TNL::Containers::Array cudaArr(arr); - auto view = cudaArr.getView(); + TNL::Containers::Array cudaArr(arr); + auto view = cudaArr.getView(); + Quicksort::sort( view ); - quicksort(view); - EXPECT_TRUE(Algorithms::isSorted(view)) << "result " << view << std::endl; + EXPECT_TRUE(Algorithms::isSorted(view)) << "result " << view << std::endl; } TEST(randomGenerated, smallArray_randomVal) { - std::srand(2006); - for(int i = 0; i < 100; i++) - { - std::vector arr(std::rand()%(1<<10)); - for(auto & x : arr) - x = std::rand(); - - TNL::Containers::Array cudaArr(arr); - - auto view = cudaArr.getView(); - quicksort(view); - EXPECT_TRUE(Algorithms::isSorted(view)); + std::srand(2006); + for(int i = 0; i < 100; i++) + { + std::vector arr(std::rand()%(1<<10)); + for(auto & x : arr) + x = std::rand(); + + TNL::Containers::Array cudaArr(arr); + auto view = cudaArr.getView(); + Quicksort::sort( view ); + + EXPECT_TRUE(Algorithms::isSorted(view)); } } - TEST(randomGenerated, bigArray_randomVal) { - std::srand(304); - for(int i = 0; i < 50; i++) - { - int size = (1<<20) + (std::rand()% (1<<19)); - std::vector arr(size); - for(auto & x : arr) x = std::rand(); - TNL::Containers::Array cudaArr(arr); - - auto view = cudaArr.getView(); - quicksort(view); - EXPECT_TRUE(Algorithms::isSorted(view)); + std::srand(304); + for(int i = 0; i < 50; i++) + { + int size = (1<<20) + (std::rand()% (1<<19)); + std::vector arr(size); + 
for(auto & x : arr) x = std::rand(); + TNL::Containers::Array cudaArr(arr); + + auto view = cudaArr.getView(); + Quicksort::sort( view ); + EXPECT_TRUE(Algorithms::isSorted(view)); } } TEST(noLostElement, smallArray) { - std::srand(9151); + std::srand(9151); - int size = (1<<7); - std::vector arr(size); - for(auto & x : arr) x = std::rand(); + int size = (1<<7); + std::vector arr(size); + for(auto & x : arr) x = std::rand(); - TNL::Containers::Array cudaArr(arr); - auto view = cudaArr.getView(); - quicksort(view); + TNL::Containers::Array cudaArr(arr); + auto view = cudaArr.getView(); + Quicksort::sort( view ); - std::sort(arr.begin(), arr.end()); - TNL::Containers::Array cudaArr2(arr); - EXPECT_TRUE(view == cudaArr2.getView()); + std::sort(arr.begin(), arr.end()); + TNL::Containers::Array cudaArr2(arr); + EXPECT_TRUE(view == cudaArr2.getView()); } TEST(noLostElement, midSizedArray) { - std::srand(91503); + std::srand(91503); - int size = (1<<15); - std::vector arr(size); - for(auto & x : arr) x = std::rand(); + int size = (1<<15); + std::vector arr(size); + for(auto & x : arr) x = std::rand(); - TNL::Containers::Array cudaArr(arr); - auto view = cudaArr.getView(); - quicksort(view); + TNL::Containers::Array cudaArr(arr); + auto view = cudaArr.getView(); + Quicksort::sort( view ); - std::sort(arr.begin(), arr.end()); - TNL::Containers::Array cudaArr2(arr); - EXPECT_TRUE(view == cudaArr2.getView()); + std::sort(arr.begin(), arr.end()); + TNL::Containers::Array cudaArr2(arr); + EXPECT_TRUE(view == cudaArr2.getView()); } TEST(noLostElement, bigSizedArray) { - std::srand(15611); + std::srand(15611); - int size = (1<<22); - std::vector arr(size); - for(auto & x : arr) x = std::rand(); - for(int i = 0; i < 10000; i++) - arr[std::rand() % arr.size()] = (1<<10); + int size = (1<<22); + std::vector arr(size); + for(auto & x : arr) x = std::rand(); + for(int i = 0; i < 10000; i++) + arr[std::rand() % arr.size()] = (1<<10); - TNL::Containers::Array cudaArr(arr); - auto view = 
cudaArr.getView(); - quicksort(view); + TNL::Containers::Array cudaArr(arr); + auto view = cudaArr.getView(); + Quicksort::sort( view ); - TNL::Containers::Array cudaArr2(arr); - thrust::sort(thrust::device, cudaArr2.getData(), cudaArr2.getData() + cudaArr2.getSize()); - EXPECT_TRUE(view == cudaArr2.getView()); + TNL::Containers::Array cudaArr2(arr); + thrust::sort(thrust::device, cudaArr2.getData(), cudaArr2.getData() + cudaArr2.getSize()); + EXPECT_TRUE(view == cudaArr2.getView()); } TEST(types, type_double) { - std::srand(8451); + std::srand(8451); - int size = (1<<16); - std::vector arr(size); - for(auto & x : arr) x = std::rand(); - for(int i = 0; i < 10000; i++) - arr[std::rand() % arr.size()] = (1<<10); + int size = (1<<16); + std::vector arr(size); + for(auto & x : arr) x = std::rand(); + for(int i = 0; i < 10000; i++) + arr[std::rand() % arr.size()] = (1<<10); - TNL::Containers::Array cudaArr(arr); - auto view = cudaArr.getView(); - quicksort(view); + TNL::Containers::Array cudaArr(arr); + auto view = cudaArr.getView(); + Quicksort::sort( view ); - TNL::Containers::Array cudaArr2(arr); - thrust::sort(thrust::device, cudaArr2.getData(), cudaArr2.getData() + cudaArr2.getSize()); - EXPECT_TRUE(view == cudaArr2.getView()); + TNL::Containers::Array cudaArr2(arr); + thrust::sort(thrust::device, cudaArr2.getData(), cudaArr2.getData() + cudaArr2.getSize()); + EXPECT_TRUE(view == cudaArr2.getView()); } struct TMPSTRUCT_xyz{ - double x, y, z; - __cuda_callable__ TMPSTRUCT_xyz(): x(0){} - __cuda_callable__ TMPSTRUCT_xyz(int first){x = first;}; - __cuda_callable__ bool operator <(const TMPSTRUCT_xyz& other) const { return x< other.x;} - __cuda_callable__ TMPSTRUCT_xyz& operator =(const TMPSTRUCT_xyz& other) {x = other.x; return *this;} + double x, y, z; + __cuda_callable__ TMPSTRUCT_xyz(): x(0){} + __cuda_callable__ TMPSTRUCT_xyz(int first){x = first;}; + __cuda_callable__ bool operator <(const TMPSTRUCT_xyz& other) const { return x< other.x;} + __cuda_callable__ 
TMPSTRUCT_xyz& operator =(const TMPSTRUCT_xyz& other) {x = other.x; return *this;} }; std::ostream & operator<<(std::ostream & out, const TMPSTRUCT_xyz & data){return out << data.x;} TEST(types, struct_3D_points) { - std::srand(46151); - - int size = (1<<18); - std::vector arr(size); - for(auto & x : arr) x = TMPSTRUCT_xyz(std::rand()); - - TNL::Containers::Array cudaArr(arr); - auto view = cudaArr.getView(); - //thrust::sort(thrust::device, cudaArr.getData(), cudaArr.getData() + cudaArr.getSize()); - //std::cout << view << std::endl; - quicksort(view); - EXPECT_TRUE(Algorithms::isSorted(view)); + std::srand(46151); + + int size = (1<<18); + std::vector arr(size); + for(auto & x : arr) x = TMPSTRUCT_xyz(std::rand()); + + TNL::Containers::Array cudaArr(arr); + auto view = cudaArr.getView(); + //thrust::sort(thrust::device, cudaArr.getData(), cudaArr.getData() + cudaArr.getSize()); + //std::cout << view << std::endl; + Quicksort::sort( view ); + + EXPECT_TRUE(Algorithms::isSorted(view)); } struct TMPSTRUCT_64b{ - uint8_t m_Data[64]; - __cuda_callable__ TMPSTRUCT_64b() {m_Data[0] = 0;} - __cuda_callable__ TMPSTRUCT_64b(int first){m_Data[0] = first;}; - __cuda_callable__ bool operator <(const TMPSTRUCT_64b& other) const { return m_Data[0]< other.m_Data[0];} - __cuda_callable__ TMPSTRUCT_64b& operator =(const TMPSTRUCT_64b& other) {m_Data[0] = other.m_Data[0]; return *this;} + uint8_t m_Data[64]; + __cuda_callable__ TMPSTRUCT_64b() {m_Data[0] = 0;} + __cuda_callable__ TMPSTRUCT_64b(int first){m_Data[0] = first;}; + __cuda_callable__ bool operator <(const TMPSTRUCT_64b& other) const { return m_Data[0]< other.m_Data[0];} + __cuda_callable__ TMPSTRUCT_64b& operator =(const TMPSTRUCT_64b& other) {m_Data[0] = other.m_Data[0]; return *this;} }; std::ostream & operator<<(std::ostream & out, const TMPSTRUCT_64b & data){return out << (unsigned) data.m_Data[0];} TEST(types, struct_64b) { - std::srand(96); - - int size = (1<<18); - std::vector arr(size); - for(auto & x : arr) x = 
TMPSTRUCT_64b(std::rand() % 512); - - TNL::Containers::Array cudaArr(arr); - auto view = cudaArr.getView(); - //thrust::sort(thrust::device, cudaArr.getData(), cudaArr.getData() + cudaArr.getSize()); - //std::cout << view << std::endl; - quicksort(view); - EXPECT_TRUE(Algorithms::isSorted(view)); + std::srand(96); + + int size = (1<<18); + std::vector arr(size); + for(auto & x : arr) x = TMPSTRUCT_64b(std::rand() % 512); + + TNL::Containers::Array cudaArr(arr); + auto view = cudaArr.getView(); + //thrust::sort(thrust::device, cudaArr.getData(), cudaArr.getData() + cudaArr.getSize()); + //std::cout << view << std::endl; + Quicksort::sort( view ); + + EXPECT_TRUE(Algorithms::isSorted(view)); } #endif -- GitLab From b855b919f8bef5f987607824c3f2dfdba062ec06 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Wed, 14 Jul 2021 20:29:49 +0200 Subject: [PATCH 238/258] Refactoring bitonic sort. --- src/Benchmarks/Sorting/Measurer.h | 2 +- src/TNL/Algorithms/Sort.h | 2 +- src/TNL/Algorithms/Sorting/BitonicSort.h | 41 +++++++++++++++++++ .../Sorting/{ => detail}/bitonicSort.h | 4 +- .../Sorting/{ => detail}/blockBitonicSort.h | 0 .../Sorting/detail/quicksort_1Block.h | 2 +- .../Algorithms/Sorting/BitonicSortTest.h | 28 ++++++------- 7 files changed, 60 insertions(+), 19 deletions(-) create mode 100644 src/TNL/Algorithms/Sorting/BitonicSort.h rename src/TNL/Algorithms/Sorting/{ => detail}/bitonicSort.h (99%) rename src/TNL/Algorithms/Sorting/{ => detail}/blockBitonicSort.h (100%) diff --git a/src/Benchmarks/Sorting/Measurer.h b/src/Benchmarks/Sorting/Measurer.h index dee607410..aff1485cc 100644 --- a/src/Benchmarks/Sorting/Measurer.h +++ b/src/Benchmarks/Sorting/Measurer.h @@ -25,7 +25,7 @@ struct QuicksortSorter struct BitonicSortSorter { template< typename Array > - static void sort( Array& array ) { Algorithms::detail::bitonicSort( array ); }; + static void sort( Array& array ) { Algorithms::Sorting::BitonicSort::sort( array ); }; }; struct 
STLSorter diff --git a/src/TNL/Algorithms/Sort.h b/src/TNL/Algorithms/Sort.h index 629e6a846..498ae45fe 100644 --- a/src/TNL/Algorithms/Sort.h +++ b/src/TNL/Algorithms/Sort.h @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include namespace TNL { diff --git a/src/TNL/Algorithms/Sorting/BitonicSort.h b/src/TNL/Algorithms/Sorting/BitonicSort.h new file mode 100644 index 000000000..17471f69f --- /dev/null +++ b/src/TNL/Algorithms/Sorting/BitonicSort.h @@ -0,0 +1,41 @@ +/*************************************************************************** + BitonicSort.h - description + ------------------- + begin : Jul 14, 2021 + copyright : (C) 2021 by Tomas Oberhuber et al. + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Xuan Thang Nguyen, Tomas Oberhuber + +#pragma once + +#include + +namespace TNL { + namespace Algorithms { + namespace Sorting { + +struct BitonicSort +{ + template< typename Array > + void static sort( Array& array ) + { + bitonicSort( array ); + } + + template< typename Array, typename Compare > + void static sort( Array& array, const Compare& compare ) + { + bitonicSort( array, compare ); + } + +}; + + } // namespace Sorting + } // namespace Algorithms +} //namespace TNL + + diff --git a/src/TNL/Algorithms/Sorting/bitonicSort.h b/src/TNL/Algorithms/Sorting/detail/bitonicSort.h similarity index 99% rename from src/TNL/Algorithms/Sorting/bitonicSort.h rename to src/TNL/Algorithms/Sorting/detail/bitonicSort.h index 1d42bed0c..1b888251a 100644 --- a/src/TNL/Algorithms/Sorting/bitonicSort.h +++ b/src/TNL/Algorithms/Sorting/detail/bitonicSort.h @@ -1,11 +1,11 @@ #pragma once #include -#include +#include #include namespace TNL { namespace Algorithms { - namespace detail { + namespace Sorting { #ifdef HAVE_CUDA diff --git a/src/TNL/Algorithms/Sorting/blockBitonicSort.h 
b/src/TNL/Algorithms/Sorting/detail/blockBitonicSort.h similarity index 100% rename from src/TNL/Algorithms/Sorting/blockBitonicSort.h rename to src/TNL/Algorithms/Sorting/detail/blockBitonicSort.h diff --git a/src/TNL/Algorithms/Sorting/detail/quicksort_1Block.h b/src/TNL/Algorithms/Sorting/detail/quicksort_1Block.h index 48cc9cd4d..e5196d7a5 100644 --- a/src/TNL/Algorithms/Sorting/detail/quicksort_1Block.h +++ b/src/TNL/Algorithms/Sorting/detail/quicksort_1Block.h @@ -14,7 +14,7 @@ #include #include "cassert" -#include +#include #include #include diff --git a/src/UnitTests/Algorithms/Sorting/BitonicSortTest.h b/src/UnitTests/Algorithms/Sorting/BitonicSortTest.h index 4aa1d5db4..e8fad07a0 100644 --- a/src/UnitTests/Algorithms/Sorting/BitonicSortTest.h +++ b/src/UnitTests/Algorithms/Sorting/BitonicSortTest.h @@ -6,7 +6,7 @@ #include #include -#include +#include #include #if defined HAVE_GTEST && defined HAVE_CUDA @@ -15,7 +15,7 @@ using namespace TNL; using namespace TNL::Algorithms; -using namespace TNL::Algorithms::detail; +using namespace TNL::Algorithms::Sorting; TEST(permutations, allPermutationSize_2_to_7) { @@ -30,7 +30,7 @@ TEST(permutations, allPermutationSize_2_to_7) TNL::Containers::Array cudaArr(orig); auto view = cudaArr.getView(); - bitonicSort(view); + BitonicSort::sort(view); EXPECT_TRUE( Algorithms::isSorted( view ) ) << "failed " << i << std::endl; } @@ -55,7 +55,7 @@ TEST(permutations, allPermutationSize_8) TNL::Containers::Array cudaArr(orig); auto view = cudaArr.getView(); - bitonicSort(view); + BitonicSort::sort(view); EXPECT_TRUE(Algorithms::isSorted(view)) << "result " << view << std::endl; } @@ -79,7 +79,7 @@ TEST(permutations, somePermutationSize9) TNL::Containers::Array cudaArr(orig); auto view = cudaArr.getView(); - bitonicSort(view); + BitonicSort::sort(view); EXPECT_TRUE(Algorithms::isSorted(view)) << "result " << view << std::endl; } @@ -91,7 +91,7 @@ TEST(selectedSize, size15) TNL::Containers::Array cudaArr{5, 9, 4, 8, 6, 1, 2, 3, 4, 
8, 1, 6, 9, 4, 9}; auto view = cudaArr.getView(); EXPECT_EQ(15, view.getSize()) << "size not 15" << std::endl; - bitonicSort(view); + BitonicSort::sort(view); EXPECT_TRUE(Algorithms::isSorted(view)) << "result " << view << std::endl; } @@ -104,7 +104,7 @@ TEST(multiblock, 32768_decreasingNegative) TNL::Containers::Array cudaArr(arr); auto view = cudaArr.getView(); - bitonicSort(view); + BitonicSort::sort(view); EXPECT_TRUE(Algorithms::isSorted(view)) << "result " << view << std::endl; } @@ -120,7 +120,7 @@ TEST(randomGenerated, smallArray_randomVal) TNL::Containers::Array cudaArr(arr); auto view = cudaArr.getView(); - bitonicSort(view); + BitonicSort::sort(view); EXPECT_TRUE(Algorithms::isSorted(view)); } } @@ -135,7 +135,7 @@ TEST(randomGenerated, bigArray_all0) TNL::Containers::Array cudaArr(size); auto view = cudaArr.getView(); - bitonicSort(view); + BitonicSort::sort(view); EXPECT_TRUE(Algorithms::isSorted(view)); } } @@ -144,7 +144,7 @@ TEST(nonIntegerType, float_notPow2) { TNL::Containers::Array cudaArr{5.0, 9.4, 4.6, 8.9, 6.2, 1.15184, 2.23}; auto view = cudaArr.getView(); - bitonicSort(view); + BitonicSort::sort(view); EXPECT_TRUE(Algorithms::isSorted(view)) << "result " << view << std::endl; } @@ -152,7 +152,7 @@ TEST(nonIntegerType, double_notPow2) { TNL::Containers::Array cudaArr{5.0, 9.4, 4.6, 8.9, 6.2, 1.15184, 2.23}; auto view = cudaArr.getView(); - bitonicSort(view); + BitonicSort::sort(view); EXPECT_TRUE(Algorithms::isSorted(view)) << "result " << view << std::endl; } @@ -170,7 +170,7 @@ TEST(nonIntegerType, struct) { TNL::Containers::Array cudaArr{TMPSTRUCT(5), TMPSTRUCT(6), TMPSTRUCT(9), TMPSTRUCT(1)}; auto view = cudaArr.getView(); - bitonicSort(view); + BitonicSort::sort(view); EXPECT_TRUE(Algorithms::isSorted(view)); } @@ -192,7 +192,7 @@ TEST(nonIntegerType, struct_64b) TNL::Containers::Array cudaArr(vec); auto view = cudaArr.getView(); - bitonicSort(view); + BitonicSort::sort(view); EXPECT_TRUE(Algorithms::isSorted(view)); } @@ -214,7 +214,7 
@@ TEST(nonIntegerType, struct_128b) TNL::Containers::Array cudaArr(vec); auto view = cudaArr.getView(); - bitonicSort(view); + BitonicSort::sort(view); EXPECT_TRUE(Algorithms::isSorted(view)); } -- GitLab From 165d5675a93e82c695cb0e4bf2797c9043bfee81 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Wed, 14 Jul 2021 20:55:50 +0200 Subject: [PATCH 239/258] Making wrapper for Manca and Cederman quicksort compatible with TNL sorter. --- src/Benchmarks/Sorting/Measurer.h | 54 +++++-------------- .../{cederman_qsort.h => CedermanQuicksort.h} | 8 +++ .../{manca_quicksort.h => MancaQuicksort.h} | 10 ++++ src/Benchmarks/Sorting/tnl-benchmark-sort.h | 23 ++++---- 4 files changed, 42 insertions(+), 53 deletions(-) rename src/Benchmarks/Sorting/ReferenceAlgorithms/{cederman_qsort.h => CedermanQuicksort.h} (99%) rename src/Benchmarks/Sorting/ReferenceAlgorithms/{manca_quicksort.h => MancaQuicksort.h} (99%) diff --git a/src/Benchmarks/Sorting/Measurer.h b/src/Benchmarks/Sorting/Measurer.h index aff1485cc..7f7990f48 100644 --- a/src/Benchmarks/Sorting/Measurer.h +++ b/src/Benchmarks/Sorting/Measurer.h @@ -3,56 +3,24 @@ #include #include #include -#include +#include +#include +#include #ifdef HAVE_CUDA -#include "ReferenceAlgorithms/manca_quicksort.h" -#include "ReferenceAlgorithms/cederman_qsort.h" +#include "ReferenceAlgorithms/MancaQuicksort.h" +#include "ReferenceAlgorithms/CedermanQuicksort.h" #endif #include "timer.h" using namespace TNL; -struct QuicksortSorter -{ - template< typename Array > - static void sort( Array& array ) { - Algorithms::Sorting::Quicksort::sort( array ); - }; -}; - -struct BitonicSortSorter -{ - template< typename Array > - static void sort( Array& array ) { Algorithms::Sorting::BitonicSort::sort( array ); }; -}; - -struct STLSorter +/*struct STLSorter { template< typename Value > static void sort( std::vector< Value >& vec ) { std::sort( vec.begin(), vec.end() ); }; -}; - -#ifdef HAVE_CUDA -struct MancaQuicksortSorter -{ 
- static void sort( Containers::ArrayView< int, Devices::Cuda >& array ) - { - double timer; - CUDA_Quicksort( ( unsigned * ) array.getData(), (unsigned * ) array.getData(), array.getSize(), 256, 0, &timer ); - //return; - } -}; - -struct CedermanQuicksortSorter -{ - static void sort( Containers::ArrayView< int, Devices::Cuda >& array ) - { - gpuqsort( ( unsigned int * ) array.getData(), ( unsigned int ) array.getSize() ); - } -}; -#endif +};*/ template< typename Sorter > @@ -80,7 +48,7 @@ struct Measurer }; template<> -struct Measurer< STLSorter > +struct Measurer< Algorithms::Sorting::STLSort > { template< typename Value > static double measure( const std::vector&vec, int tries, int & wrongAnsCnt ) @@ -89,10 +57,12 @@ struct Measurer< STLSorter > for(int i = 0; i < tries; i++) { - std::vector< Value > vec2 = vec; + Containers::Array arr(vec); + auto view = arr.getView(); + //std::vector< Value > vec2 = vec; { TIMER t([&](double res){resAcc.push_back(res);}); - STLSorter::sort( vec2 ); + Algorithms::Sorting::STLSort::sort( view ); } } return accumulate(resAcc.begin(), resAcc.end(), 0.0) / resAcc.size(); diff --git a/src/Benchmarks/Sorting/ReferenceAlgorithms/cederman_qsort.h b/src/Benchmarks/Sorting/ReferenceAlgorithms/CedermanQuicksort.h similarity index 99% rename from src/Benchmarks/Sorting/ReferenceAlgorithms/cederman_qsort.h rename to src/Benchmarks/Sorting/ReferenceAlgorithms/CedermanQuicksort.h index fd877fdb0..5e64d773d 100644 --- a/src/Benchmarks/Sorting/ReferenceAlgorithms/cederman_qsort.h +++ b/src/Benchmarks/Sorting/ReferenceAlgorithms/CedermanQuicksort.h @@ -1090,3 +1090,11 @@ int gpuqsort(unsigned int *data, unsigned int size, unsigned int blockscount, un return 0; } } + +struct CedermanQuicksort +{ + static void sort( Containers::ArrayView< int, Devices::Cuda >& array ) + { + gpuqsort( ( unsigned int * ) array.getData(), ( unsigned int ) array.getSize() ); + } +}; diff --git a/src/Benchmarks/Sorting/ReferenceAlgorithms/manca_quicksort.h 
b/src/Benchmarks/Sorting/ReferenceAlgorithms/MancaQuicksort.h similarity index 99% rename from src/Benchmarks/Sorting/ReferenceAlgorithms/manca_quicksort.h rename to src/Benchmarks/Sorting/ReferenceAlgorithms/MancaQuicksort.h index bc7fe0d09..9b32a599a 100644 --- a/src/Benchmarks/Sorting/ReferenceAlgorithms/manca_quicksort.h +++ b/src/Benchmarks/Sorting/ReferenceAlgorithms/MancaQuicksort.h @@ -1315,3 +1315,13 @@ void CUDA_Quicksort_64(double* inputData,double* outputData, uint dataSize, uint sort(inputData,outputData, dataSize,threadCount,Device,wallClock); } + +struct MancaQuicksort +{ + static void sort( Containers::ArrayView< int, Devices::Cuda >& array ) + { + double timer; + CUDA_Quicksort( ( unsigned * ) array.getData(), (unsigned * ) array.getData(), array.getSize(), 256, 0, &timer ); + //return; + } +}; diff --git a/src/Benchmarks/Sorting/tnl-benchmark-sort.h b/src/Benchmarks/Sorting/tnl-benchmark-sort.h index aa2885f4f..bb5d15dbc 100644 --- a/src/Benchmarks/Sorting/tnl-benchmark-sort.h +++ b/src/Benchmarks/Sorting/tnl-benchmark-sort.h @@ -21,7 +21,8 @@ using namespace std; #endif using namespace TNL; - +using namespace TNL::Algorithms; +using namespace TNL::Algorithms::Sorting; template< typename Sorter > void start(ostream & out, string delim) @@ -85,32 +86,32 @@ int main(int argc, char *argv[]) if(argc == 1) { std::cout << "STL sort on CPU ... " << std::endl; - start< STLSorter >( cout, "\t" ); + start< STLSort >( cout, "\t" ); std::cout << "Quicksort on GPU ... " << std::endl; - start< QuicksortSorter >(cout, "\t"); + start< Quicksort >(cout, "\t"); std::cout << "Bitonic sort on GPU ... " << std::endl; - start< BitonicSortSorter >( cout, "\t" ); + start< BitonicSort >( cout, "\t" ); #ifdef HAVE_CUDA std::cout << "Manca quicksort on GPU ... " << std::endl; - start< MancaQuicksortSorter >( cout, "\t" ); + start< MancaQuicksort >( cout, "\t" ); std::cout << "Cederman quicksort on GPU ... 
" << std::endl; - start< CedermanQuicksortSorter >( cout, "\t" ); + start< CedermanQuicksort >( cout, "\t" ); #endif } else { std::ofstream out(argv[1]); std::cout << "STL sort on CPU ... " << std::endl; - start< STLSorter >( out, "," ); + start< STLSort >( out, "," ); std::cout << "Quicksort on GPU ... " << std::endl; - start< QuicksortSorter >(out, ","); + start< Quicksort >(out, ","); std::cout << "Bitonic sort on GPU ... " << std::endl; - start< BitonicSortSorter >(out, ","); + start< BitonicSort >(out, ","); #ifdef HAVE_CUDA std::cout << "Manca quicksort on GPU ... " << std::endl; - start< MancaQuicksortSorter >( out, "," ); + start< MancaQuicksort >( out, "," ); std::cout << "Cederman quicksort on GPU ... " << std::endl; - start< CedermanQuicksortSorter >( out, "," ); + start< CedermanQuicksort >( out, "," ); #endif } return 0; -- GitLab From d6882717e20cb6e9b4687ffcc22dc23862ce58f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Wed, 14 Jul 2021 20:56:12 +0200 Subject: [PATCH 240/258] Added wrapper for STL sort on CPU. --- src/TNL/Algorithms/Sorting/STLSort.h | 40 ++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 src/TNL/Algorithms/Sorting/STLSort.h diff --git a/src/TNL/Algorithms/Sorting/STLSort.h b/src/TNL/Algorithms/Sorting/STLSort.h new file mode 100644 index 000000000..3fc69f324 --- /dev/null +++ b/src/TNL/Algorithms/Sorting/STLSort.h @@ -0,0 +1,40 @@ +/*************************************************************************** + STLSort.h - description + ------------------- + begin : Jul 14, 2021 + copyright : (C) 2021 by Tomas Oberhuber et al. 
+ email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Tomas Oberhuber + +#pragma once + +#include + +namespace TNL { + namespace Algorithms { + namespace Sorting { + +struct STLSort +{ + template< typename Array > + void static sort( Array& array ) + { + std::sort( array.getData(), array.getData() + array.getSize() ); + } + + template< typename Array, typename Compare > + void static sort( Array& array, const Compare& compare ) + { + std::sort( array.getData(), array.getData() + array.getSize(), compare ); + } +}; + + } // namespace Sorting + } // namespace Algorithms +} //namespace TNL + + -- GitLab From 7460350bf0db14ac171db989e5e35557175c620f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Wed, 14 Jul 2021 20:59:44 +0200 Subject: [PATCH 241/258] Added inplaceSort to BitonicSort. --- src/TNL/Algorithms/Sorting/BitonicSort.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/TNL/Algorithms/Sorting/BitonicSort.h b/src/TNL/Algorithms/Sorting/BitonicSort.h index 17471f69f..aefa966a0 100644 --- a/src/TNL/Algorithms/Sorting/BitonicSort.h +++ b/src/TNL/Algorithms/Sorting/BitonicSort.h @@ -32,6 +32,11 @@ struct BitonicSort bitonicSort( array, compare ); } + template< typename Index, typename Fetch, typename Compare, typename Swap > + void static inplaceSort( const Index begin, const Index end, const Fetch& fetch, const Compare& compare, const Swap& swap ) + { + bitonicSort( begin, end, fetch, compare, swap ); + } }; } // namespace Sorting -- GitLab From 40d35192557953cd97b9ebdc2ccb20be17bad47c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Thu, 15 Jul 2021 08:16:31 +0200 Subject: [PATCH 242/258] Added default sorter selector. 
--- src/TNL/Algorithms/Sort.h | 34 +++++++++++--- src/TNL/Algorithms/Sorting/DefaultSorter.h | 54 ++++++++++++++++++++++ 2 files changed, 81 insertions(+), 7 deletions(-) create mode 100644 src/TNL/Algorithms/Sorting/DefaultSorter.h diff --git a/src/TNL/Algorithms/Sort.h b/src/TNL/Algorithms/Sort.h index 498ae45fe..6b924215e 100644 --- a/src/TNL/Algorithms/Sort.h +++ b/src/TNL/Algorithms/Sort.h @@ -12,18 +12,38 @@ #pragma once -#include // std::pair, std::forward - -#include -#include -#include -#include -#include +#include namespace TNL { namespace Algorithms { +template< typename Array, + typename Sorter = typename Sorting::DefaultSorter< typename Array::DeviceType >::SorterType > +void sort( Array& array ) +{ + Sorter::sort( array ); +} + +template< typename Array, + typename Compare, + typename Sorter = typename Sorting::DefaultSorter< typename Array::DeviceType >::SorterType > +void sort( Array& array, const Compare& compare ) +{ + Sorter::sort( array, compare ); +} + +template< typename Device, + typename Index, + typename Fetch, + typename Compare, + typename Swap, + typename Sorter = typename Sorting::DefaultInplaceSorter< Device >::SorterType > +void inplaceSort( const Index begin, const Index end, const Fetch& fetch, const Compare& compare, const Swap& swap ) +{ + Sorter::inplaceSort( begin, end, fetch, compare, swap ); +} + template bool isSorted( const Array& arr, const Function& cmp ) { diff --git a/src/TNL/Algorithms/Sorting/DefaultSorter.h b/src/TNL/Algorithms/Sorting/DefaultSorter.h new file mode 100644 index 000000000..13863df54 --- /dev/null +++ b/src/TNL/Algorithms/Sorting/DefaultSorter.h @@ -0,0 +1,54 @@ +/*************************************************************************** + DefaultSorter.h - description + ------------------- + begin : Jul 14, 2021 + copyright : (C) 2021 by Tomas Oberhuber et al. 
+ email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Tomas Oberhuber + +#pragma once + +#include // std::pair, std::forward + +#include +#include +#include +#include +#include +#include + +namespace TNL { + namespace Algorithms { + namespace Sorting { + +template< typename Device > +struct DefaultSorter; + +template<> +struct DefaultSorter< Devices::Host > +{ + using SorterType = Algorithms::Sorting::STLSort; +}; + +template<> +struct DefaultSorter< Devices::Cuda > +{ + using SorterType = Algorithms::Sorting::Quicksort; +}; + +template< typename Device > +struct DefaultInplaceSorter; + +template<> +struct DefaultInplaceSorter< Devices::Cuda > +{ + using SorterType = Algorithms::Sorting::BitonicSort; +}; + + } // namespace Sorting + } // namespace Algorithms +} // namespace TNL -- GitLab From e57c9cbb8802289def941d8b27fdba6a1a79f40c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Thu, 15 Jul 2021 18:19:40 +0200 Subject: [PATCH 243/258] Fixing build of tnl-benchmark-sort with CI flags. --- src/Benchmarks/Sorting/CMakeLists.txt | 3 ++- src/Benchmarks/Sorting/ReferenceAlgorithms/MancaQuicksort.h | 6 +++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/Benchmarks/Sorting/CMakeLists.txt b/src/Benchmarks/Sorting/CMakeLists.txt index cb1454c09..c7d5de635 100644 --- a/src/Benchmarks/Sorting/CMakeLists.txt +++ b/src/Benchmarks/Sorting/CMakeLists.txt @@ -1,5 +1,6 @@ if( BUILD_CUDA ) - CUDA_ADD_EXECUTABLE( tnl-benchmark-sort tnl-benchmark-sort.cu ) + CUDA_ADD_EXECUTABLE( tnl-benchmark-sort tnl-benchmark-sort.cu OPTIONS -Xcompiler -Wno-error=switch,-Wno-error=sign-compare) + # Source code of reference algorithms contains warning which turn into errers with CI/CD compiler flags. Therefore we use -Wno-error to turn it off. 
TARGET_LINK_LIBRARIES( tnl-benchmark-sort ${CUDA_cusparse_LIBRARY} ${CUDA_cudadevrt_LIBRARY} ) else() ADD_EXECUTABLE( tnl-benchmark-sort tnl-benchmark-sort.cpp ) diff --git a/src/Benchmarks/Sorting/ReferenceAlgorithms/MancaQuicksort.h b/src/Benchmarks/Sorting/ReferenceAlgorithms/MancaQuicksort.h index 9b32a599a..44c7fb332 100644 --- a/src/Benchmarks/Sorting/ReferenceAlgorithms/MancaQuicksort.h +++ b/src/Benchmarks/Sorting/ReferenceAlgorithms/MancaQuicksort.h @@ -1040,7 +1040,7 @@ __global__ void bucketAssign(Block *bucket, uint *npartitions, int nbucket //else bucket[i + nbucket].done = (from - orgbeg) > 1024 && (minPiv != maxPiv); bucket[i + nbucket].select = select; - bucket[i + nbucket].minPiv = 0xffffffffffffffff; + bucket[i + nbucket].minPiv = ( Type ) 0xffffffffffffffff; bucket[i + nbucket].maxPiv = 0; //bucket[i+nbucket].finish=false; @@ -1061,7 +1061,7 @@ __global__ void bucketAssign(Block *bucket, uint *npartitions, int nbucket // else bucket[i].done = (orgend - end) > 1024 && (minPiv != maxPiv); bucket[i].select = select; - bucket[i].minPiv = 0xffffffffffffffff; + bucket[i].minPiv = ( Type ) 0xffffffffffffffff; bucket[i].maxPiv = 0; //bucket[i].finish=false; @@ -1090,7 +1090,7 @@ __global__ void init(Type *data, Block *bucket, uint *npartitions, int siz bucket[i].done = false + i == 0; bucket[i].select = false; bucket[i].maxPiv = 0x0; - bucket[i].minPiv = 0xffffffffffffffff; + bucket[i].minPiv = ( Type ) 0xffffffffffffffff; //bucket[i].pivot = 0+ (i==0)*((min(min(data[0],data[size/2]),data[size-1]) + max(max(data[0],data[size/2]),data[size-1]))/2); bucket[i].pivot = data[size / 2]; } -- GitLab From a272c837aca036d355b2eb2039adb57a6ec848d7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Thu, 15 Jul 2021 19:31:15 +0200 Subject: [PATCH 244/258] Replacing source code files from CUDA samples. 
--- CMakeLists.txt | 5 + build | 12 + src/Benchmarks/Sorting/CMakeLists.txt | 2 +- .../ReferenceAlgorithms/MancaQuicksort.h | 10 +- .../ReferenceAlgorithms/helpers/exception.h | 151 --- .../ReferenceAlgorithms/helpers/helper_cuda.h | 966 ------------------ .../helpers/helper_string.h | 421 -------- .../helpers/helper_timer.h | 495 --------- .../ReferenceAlgorithms/helpers/scan_common.h | 61 -- 9 files changed, 23 insertions(+), 2100 deletions(-) delete mode 100644 src/Benchmarks/Sorting/ReferenceAlgorithms/helpers/exception.h delete mode 100644 src/Benchmarks/Sorting/ReferenceAlgorithms/helpers/helper_cuda.h delete mode 100644 src/Benchmarks/Sorting/ReferenceAlgorithms/helpers/helper_string.h delete mode 100644 src/Benchmarks/Sorting/ReferenceAlgorithms/helpers/helper_timer.h delete mode 100644 src/Benchmarks/Sorting/ReferenceAlgorithms/helpers/scan_common.h diff --git a/CMakeLists.txt b/CMakeLists.txt index d7e824a12..949444d32 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -32,6 +32,7 @@ option(BUILD_TESTS "Build tests" OFF) option(BUILD_MATRIX_TESTS "Build tests for matrix formats" OFF) option(BUILD_PYTHON "Compile the Python bindings" OFF) option(BUILD_DOC "Build examples included in the documentation" OFF) +set(CUDA_SAMPLES_PATH "none" CACHE STRING "Path to CUDA Samples - it is used only for some benchmarking.") # install paths relative to the cmake's prefix set( TNL_TARGET_INCLUDE_DIRECTORY "include/TNL" ) @@ -253,6 +254,9 @@ if( ${WITH_CUDA} ) endif() set( CMAKE_EXECUTABLE_SUFFIX "${executable_suffix_backup}" ) endif() + if( NOT CUDA_SAMPLES_DIR STREQUAL "none" ) + set( CUDA_SAMPLES_FLAGS "-I${CUDA_SAMPLES_DIR}/common/inc") + endif() endif() @@ -407,6 +411,7 @@ message( " CMAKE_SHARED_LINKER_FLAGS = ${CMAKE_SHARED_LINKER_FLAGS}" ) message( " CMAKE_SHARED_LINKER_FLAGS_DEBUG = ${CMAKE_SHARED_LINKER_FLAGS_DEBUG}" ) message( " CMAKE_SHARED_LINKER_FLAGS_RELEASE = ${CMAKE_SHARED_LINKER_FLAGS_RELEASE}" ) message( " CUDA_NVCC_FLAGS = ${CUDA_NVCC_FLAGS}" ) 
+message( " CUDA_SAMPLES_FLAGS = ${CUDA_SAMPLES_FLAGS}" ) message( " GMP_LIBRARIES = ${GMP_LIBRARIES}" ) if( MPI_CXX_FOUND AND ${WITH_MPI} ) message( " MPI_CXX_COMPILE_OPTIONS = ${MPI_CXX_COMPILE_OPTIONS}" ) diff --git a/build b/build index 09da8de5c..22468ccdb 100755 --- a/build +++ b/build @@ -39,6 +39,13 @@ BUILD_TESTS="no" BUILD_MATRIX_TESTS="no" BUILD_DOC="no" +# external dependencies +CUDA_SAMPLES_DIR=none + +if [[ x"$CUDA_SAMPLES_PATH" != "x" ]]; then + CUDA_SAMPLES_DIR=${CUDA_SAMPLES_PATH} +fi + function print_usage() { cat << EOF @@ -85,6 +92,9 @@ Options for the 'tests' and 'matrix-tests' targets: --tests-jobs=NUM Number of processes to be used for the unit tests. It is $TEST_JOBS by default. --with-coverage=yes/no Enables code coverage reports for unit tests (lcov is required). '$WITH_COVERAGE' by default. --with-system-gtest=yes/no Use GTest installed in the local system and do not download the latest version. '$WITH_SYSTEM_GTEST' by default. + +External dependencies: + --cuda-samples-dir=PATH CUDA samples are used by some reference algorithms used in benchmarks. '$CUDA_SAMPLES_PATH` by default. EOF } @@ -120,6 +130,7 @@ for option in "$@"; do --with-coverage=* ) WITH_COVERAGE="${option#*=}" ;; --with-ci-flags=* ) WITH_CI_FLAGS="${option#*=}" ;; --with-system-gtest=* ) WITH_SYSTEM_GTEST="${option#*=}" ;; + --cuda-samples-path=* ) CUDA_SAMPLES_DIR="${option#*=}" ;; -* ) echo "Unknown option $option. Use --help for more information." >&2 exit 1 @@ -216,6 +227,7 @@ cmake_command=( -DBUILD_MATRIX_TESTS=${BUILD_MATRIX_TESTS} -DBUILD_PYTHON=${BUILD_PYTHON} -DBUILD_DOC=${BUILD_DOC} + -DCUDA_SAMPLES_DIR=${CUDA_SAMPLES_DIR} ) # Skip running cmake if it was already run and the cmake command is the same. 
diff --git a/src/Benchmarks/Sorting/CMakeLists.txt b/src/Benchmarks/Sorting/CMakeLists.txt index c7d5de635..cbd22efc0 100644 --- a/src/Benchmarks/Sorting/CMakeLists.txt +++ b/src/Benchmarks/Sorting/CMakeLists.txt @@ -1,5 +1,5 @@ if( BUILD_CUDA ) - CUDA_ADD_EXECUTABLE( tnl-benchmark-sort tnl-benchmark-sort.cu OPTIONS -Xcompiler -Wno-error=switch,-Wno-error=sign-compare) + CUDA_ADD_EXECUTABLE( tnl-benchmark-sort tnl-benchmark-sort.cu OPTIONS -Xcompiler -Wno-error=switch,-Wno-error=sign-compare ${CUDA_SAMPLES_FLAGS} ) # Source code of reference algorithms contains warning which turn into errers with CI/CD compiler flags. Therefore we use -Wno-error to turn it off. TARGET_LINK_LIBRARIES( tnl-benchmark-sort ${CUDA_cusparse_LIBRARY} ${CUDA_cudadevrt_LIBRARY} ) else() diff --git a/src/Benchmarks/Sorting/ReferenceAlgorithms/MancaQuicksort.h b/src/Benchmarks/Sorting/ReferenceAlgorithms/MancaQuicksort.h index 44c7fb332..64b1d0c7a 100644 --- a/src/Benchmarks/Sorting/ReferenceAlgorithms/MancaQuicksort.h +++ b/src/Benchmarks/Sorting/ReferenceAlgorithms/MancaQuicksort.h @@ -217,8 +217,8 @@ inline __device__ void compareInclusive(Type &idata, Type &idata2, volatile Type } #include -#include "helpers/helper_cuda.h" -#include "helpers/scan_common.h" +#include +#include <../../6_Advanced/scan/scan_common.h> //All three kernels run 512 threads per workgroup //Must be a power of two @@ -654,9 +654,9 @@ size_t scanInclusiveLarge( #include -#include "helpers/helper_cuda.h" -#include "helpers/helper_timer.h" -#include "helpers/scan_common.h" +#include +#include +#include <../../6_Advanced/scan/scan_common.h> extern __shared__ uint sMemory[]; diff --git a/src/Benchmarks/Sorting/ReferenceAlgorithms/helpers/exception.h b/src/Benchmarks/Sorting/ReferenceAlgorithms/helpers/exception.h deleted file mode 100644 index ff12dbb5a..000000000 --- a/src/Benchmarks/Sorting/ReferenceAlgorithms/helpers/exception.h +++ /dev/null @@ -1,151 +0,0 @@ -/* -* Copyright 1993-2012 NVIDIA Corporation. 
All rights reserved. -* -* Please refer to the NVIDIA end user license agreement (EULA) associated -* with this source code for terms and conditions that govern your use of -* this software. Any use, reproduction, disclosure, or distribution of -* this software and related documentation outside the terms of the EULA -* is strictly prohibited. -* -*/ - -/* CUda UTility Library */ -#ifndef _EXCEPTION_H_ -#define _EXCEPTION_H_ - -// includes, system -#include -#include -#include -#include - -//! Exception wrapper. -//! @param Std_Exception Exception out of namespace std for easy typing. -template -class Exception : public Std_Exception -{ - public: - - //! @brief Static construction interface - //! @return Alwayss throws ( Located_Exception) - //! @param file file in which the Exception occurs - //! @param line line in which the Exception occurs - //! @param detailed details on the code fragment causing the Exception - static void throw_it(const char *file, - const int line, - const char *detailed = "-"); - - //! Static construction interface - //! @return Alwayss throws ( Located_Exception) - //! @param file file in which the Exception occurs - //! @param line line in which the Exception occurs - //! @param detailed details on the code fragment causing the Exception - static void throw_it(const char *file, - const int line, - const std::string &detailed); - - //! Destructor - virtual ~Exception() throw(); - - private: - - //! Constructor, default (private) - Exception(); - - //! Constructor, standard - //! @param str string returned by what() - Exception(const std::string &str); - -}; - -//////////////////////////////////////////////////////////////////////////////// -//! Exception handler function for arbitrary exceptions -//! 
@param ex exception to handle -//////////////////////////////////////////////////////////////////////////////// -template -inline void -handleException(const Exception_Typ &ex) -{ - std::cerr << ex.what() << std::endl; - - exit(EXIT_FAILURE); -} - -//! Convenience macros - -//! Exception caused by dynamic program behavior, e.g. file does not exist -#define RUNTIME_EXCEPTION( msg) \ - Exception::throw_it( __FILE__, __LINE__, msg) - -//! Logic exception in program, e.g. an assert failed -#define LOGIC_EXCEPTION( msg) \ - Exception::throw_it( __FILE__, __LINE__, msg) - -//! Out of range exception -#define RANGE_EXCEPTION( msg) \ - Exception::throw_it( __FILE__, __LINE__, msg) - -//////////////////////////////////////////////////////////////////////////////// -//! Implementation - -// includes, system -#include - -//////////////////////////////////////////////////////////////////////////////// -//! Static construction interface. -//! @param Exception causing code fragment (file and line) and detailed infos. -//////////////////////////////////////////////////////////////////////////////// -/*static*/ template -void -Exception:: -throw_it(const char *file, const int line, const char *detailed) -{ - std::stringstream s; - - // Quiet heavy-weight but exceptions are not for - // performance / release versions - s << "Exception in file '" << file << "' in line " << line << "\n" - << "Detailed description: " << detailed << "\n"; - - throw Exception(s.str()); -} - -//////////////////////////////////////////////////////////////////////////////// -//! Static construction interface. -//! @param Exception causing code fragment (file and line) and detailed infos. -//////////////////////////////////////////////////////////////////////////////// -/*static*/ template -void -Exception:: -throw_it(const char *file, const int line, const std::string &msg) -{ - throw_it(file, line, msg.c_str()); -} - -//////////////////////////////////////////////////////////////////////////////// -//! 
Constructor, default (private). -//////////////////////////////////////////////////////////////////////////////// -template -Exception::Exception() : - Exception("Unknown Exception.\n") -{ } - -//////////////////////////////////////////////////////////////////////////////// -//! Constructor, standard (private). -//! String returned by what(). -//////////////////////////////////////////////////////////////////////////////// -template -Exception::Exception(const std::string &s) : - Std_Exception(s) -{ } - -//////////////////////////////////////////////////////////////////////////////// -//! Destructor -//////////////////////////////////////////////////////////////////////////////// -template -Exception::~Exception() throw() { } - -// functions, exported - -#endif // #ifndef _EXCEPTION_H_ - diff --git a/src/Benchmarks/Sorting/ReferenceAlgorithms/helpers/helper_cuda.h b/src/Benchmarks/Sorting/ReferenceAlgorithms/helpers/helper_cuda.h deleted file mode 100644 index cddfe76a3..000000000 --- a/src/Benchmarks/Sorting/ReferenceAlgorithms/helpers/helper_cuda.h +++ /dev/null @@ -1,966 +0,0 @@ -/** - * Copyright 1993-2012 NVIDIA Corporation. All rights reserved. - * - * Please refer to the NVIDIA end user license agreement (EULA) associated - * with this source code for terms and conditions that govern your use of - * this software. Any use, reproduction, disclosure, or distribution of - * this software and related documentation outside the terms of the EULA - * is strictly prohibited. 
- * - */ - -//////////////////////////////////////////////////////////////////////////////// -// These are CUDA Helper functions for initialization and error checking - -#ifndef HELPER_CUDA_H -#define HELPER_CUDA_H - -#pragma once - -#include -#include -#include - -#include "helper_string.h" - -//#include -//#include -//#include - -// Note, it is required that your SDK sample to include the proper header files, please -// refer the CUDA examples for examples of the needed CUDA headers, which may change depending -// on which CUDA functions are used. - -// CUDA Runtime error messages -#ifdef __DRIVER_TYPES_H__ -static const char *_cudaGetErrorEnum(cudaError_t error) -{ - switch (error) - { - case cudaSuccess: - return "cudaSuccess"; - - case cudaErrorMissingConfiguration: - return "cudaErrorMissingConfiguration"; - - case cudaErrorMemoryAllocation: - return "cudaErrorMemoryAllocation"; - - case cudaErrorInitializationError: - return "cudaErrorInitializationError"; - - case cudaErrorLaunchFailure: - return "cudaErrorLaunchFailure"; - - case cudaErrorPriorLaunchFailure: - return "cudaErrorPriorLaunchFailure"; - - case cudaErrorLaunchTimeout: - return "cudaErrorLaunchTimeout"; - - case cudaErrorLaunchOutOfResources: - return "cudaErrorLaunchOutOfResources"; - - case cudaErrorInvalidDeviceFunction: - return "cudaErrorInvalidDeviceFunction"; - - case cudaErrorInvalidConfiguration: - return "cudaErrorInvalidConfiguration"; - - case cudaErrorInvalidDevice: - return "cudaErrorInvalidDevice"; - - case cudaErrorInvalidValue: - return "cudaErrorInvalidValue"; - - case cudaErrorInvalidPitchValue: - return "cudaErrorInvalidPitchValue"; - - case cudaErrorInvalidSymbol: - return "cudaErrorInvalidSymbol"; - - case cudaErrorMapBufferObjectFailed: - return "cudaErrorMapBufferObjectFailed"; - - case cudaErrorUnmapBufferObjectFailed: - return "cudaErrorUnmapBufferObjectFailed"; - - case cudaErrorInvalidHostPointer: - return "cudaErrorInvalidHostPointer"; - - case 
cudaErrorInvalidDevicePointer: - return "cudaErrorInvalidDevicePointer"; - - case cudaErrorInvalidTexture: - return "cudaErrorInvalidTexture"; - - case cudaErrorInvalidTextureBinding: - return "cudaErrorInvalidTextureBinding"; - - case cudaErrorInvalidChannelDescriptor: - return "cudaErrorInvalidChannelDescriptor"; - - case cudaErrorInvalidMemcpyDirection: - return "cudaErrorInvalidMemcpyDirection"; - - case cudaErrorAddressOfConstant: - return "cudaErrorAddressOfConstant"; - - case cudaErrorTextureFetchFailed: - return "cudaErrorTextureFetchFailed"; - - case cudaErrorTextureNotBound: - return "cudaErrorTextureNotBound"; - - case cudaErrorSynchronizationError: - return "cudaErrorSynchronizationError"; - - case cudaErrorInvalidFilterSetting: - return "cudaErrorInvalidFilterSetting"; - - case cudaErrorInvalidNormSetting: - return "cudaErrorInvalidNormSetting"; - - case cudaErrorMixedDeviceExecution: - return "cudaErrorMixedDeviceExecution"; - - case cudaErrorCudartUnloading: - return "cudaErrorCudartUnloading"; - - case cudaErrorUnknown: - return "cudaErrorUnknown"; - - case cudaErrorNotYetImplemented: - return "cudaErrorNotYetImplemented"; - - case cudaErrorMemoryValueTooLarge: - return "cudaErrorMemoryValueTooLarge"; - - case cudaErrorInvalidResourceHandle: - return "cudaErrorInvalidResourceHandle"; - - case cudaErrorNotReady: - return "cudaErrorNotReady"; - - case cudaErrorInsufficientDriver: - return "cudaErrorInsufficientDriver"; - - case cudaErrorSetOnActiveProcess: - return "cudaErrorSetOnActiveProcess"; - - case cudaErrorInvalidSurface: - return "cudaErrorInvalidSurface"; - - case cudaErrorNoDevice: - return "cudaErrorNoDevice"; - - case cudaErrorECCUncorrectable: - return "cudaErrorECCUncorrectable"; - - case cudaErrorSharedObjectSymbolNotFound: - return "cudaErrorSharedObjectSymbolNotFound"; - - case cudaErrorSharedObjectInitFailed: - return "cudaErrorSharedObjectInitFailed"; - - case cudaErrorUnsupportedLimit: - return "cudaErrorUnsupportedLimit"; - - case 
cudaErrorDuplicateVariableName: - return "cudaErrorDuplicateVariableName"; - - case cudaErrorDuplicateTextureName: - return "cudaErrorDuplicateTextureName"; - - case cudaErrorDuplicateSurfaceName: - return "cudaErrorDuplicateSurfaceName"; - - case cudaErrorDevicesUnavailable: - return "cudaErrorDevicesUnavailable"; - - case cudaErrorInvalidKernelImage: - return "cudaErrorInvalidKernelImage"; - - case cudaErrorNoKernelImageForDevice: - return "cudaErrorNoKernelImageForDevice"; - - case cudaErrorIncompatibleDriverContext: - return "cudaErrorIncompatibleDriverContext"; - - case cudaErrorPeerAccessAlreadyEnabled: - return "cudaErrorPeerAccessAlreadyEnabled"; - - case cudaErrorPeerAccessNotEnabled: - return "cudaErrorPeerAccessNotEnabled"; - - case cudaErrorDeviceAlreadyInUse: - return "cudaErrorDeviceAlreadyInUse"; - - case cudaErrorProfilerDisabled: - return "cudaErrorProfilerDisabled"; - - case cudaErrorProfilerNotInitialized: - return "cudaErrorProfilerNotInitialized"; - - case cudaErrorProfilerAlreadyStarted: - return "cudaErrorProfilerAlreadyStarted"; - - case cudaErrorProfilerAlreadyStopped: - return "cudaErrorProfilerAlreadyStopped"; - -#if __CUDA_API_VERSION >= 0x4000 - - case cudaErrorAssert: - return "cudaErrorAssert"; - - case cudaErrorTooManyPeers: - return "cudaErrorTooManyPeers"; - - case cudaErrorHostMemoryAlreadyRegistered: - return "cudaErrorHostMemoryAlreadyRegistered"; - - case cudaErrorHostMemoryNotRegistered: - return "cudaErrorHostMemoryNotRegistered"; -#endif - - case cudaErrorStartupFailure: - return "cudaErrorStartupFailure"; - - case cudaErrorApiFailureBase: - return "cudaErrorApiFailureBase"; - } - - return ""; -} -#endif - -#ifdef __cuda_cuda_h__ -// CUDA Driver API errors -static const char *_cudaGetErrorEnum(CUresult error) -{ - switch (error) - { - case CUDA_SUCCESS: - return "CUDA_SUCCESS"; - - case CUDA_ERROR_INVALID_VALUE: - return "CUDA_ERROR_INVALID_VALUE"; - - case CUDA_ERROR_OUT_OF_MEMORY: - return "CUDA_ERROR_OUT_OF_MEMORY"; - - 
case CUDA_ERROR_NOT_INITIALIZED: - return "CUDA_ERROR_NOT_INITIALIZED"; - - case CUDA_ERROR_DEINITIALIZED: - return "CUDA_ERROR_DEINITIALIZED"; - - case CUDA_ERROR_PROFILER_DISABLED: - return "CUDA_ERROR_PROFILER_DISABLED"; - - case CUDA_ERROR_PROFILER_NOT_INITIALIZED: - return "CUDA_ERROR_PROFILER_NOT_INITIALIZED"; - - case CUDA_ERROR_PROFILER_ALREADY_STARTED: - return "CUDA_ERROR_PROFILER_ALREADY_STARTED"; - - case CUDA_ERROR_PROFILER_ALREADY_STOPPED: - return "CUDA_ERROR_PROFILER_ALREADY_STOPPED"; - - case CUDA_ERROR_NO_DEVICE: - return "CUDA_ERROR_NO_DEVICE"; - - case CUDA_ERROR_INVALID_DEVICE: - return "CUDA_ERROR_INVALID_DEVICE"; - - case CUDA_ERROR_INVALID_IMAGE: - return "CUDA_ERROR_INVALID_IMAGE"; - - case CUDA_ERROR_INVALID_CONTEXT: - return "CUDA_ERROR_INVALID_CONTEXT"; - - case CUDA_ERROR_CONTEXT_ALREADY_CURRENT: - return "CUDA_ERROR_CONTEXT_ALREADY_CURRENT"; - - case CUDA_ERROR_MAP_FAILED: - return "CUDA_ERROR_MAP_FAILED"; - - case CUDA_ERROR_UNMAP_FAILED: - return "CUDA_ERROR_UNMAP_FAILED"; - - case CUDA_ERROR_ARRAY_IS_MAPPED: - return "CUDA_ERROR_ARRAY_IS_MAPPED"; - - case CUDA_ERROR_ALREADY_MAPPED: - return "CUDA_ERROR_ALREADY_MAPPED"; - - case CUDA_ERROR_NO_BINARY_FOR_GPU: - return "CUDA_ERROR_NO_BINARY_FOR_GPU"; - - case CUDA_ERROR_ALREADY_ACQUIRED: - return "CUDA_ERROR_ALREADY_ACQUIRED"; - - case CUDA_ERROR_NOT_MAPPED: - return "CUDA_ERROR_NOT_MAPPED"; - - case CUDA_ERROR_NOT_MAPPED_AS_ARRAY: - return "CUDA_ERROR_NOT_MAPPED_AS_ARRAY"; - - case CUDA_ERROR_NOT_MAPPED_AS_POINTER: - return "CUDA_ERROR_NOT_MAPPED_AS_POINTER"; - - case CUDA_ERROR_ECC_UNCORRECTABLE: - return "CUDA_ERROR_ECC_UNCORRECTABLE"; - - case CUDA_ERROR_UNSUPPORTED_LIMIT: - return "CUDA_ERROR_UNSUPPORTED_LIMIT"; - - case CUDA_ERROR_CONTEXT_ALREADY_IN_USE: - return "CUDA_ERROR_CONTEXT_ALREADY_IN_USE"; - - case CUDA_ERROR_INVALID_SOURCE: - return "CUDA_ERROR_INVALID_SOURCE"; - - case CUDA_ERROR_FILE_NOT_FOUND: - return "CUDA_ERROR_FILE_NOT_FOUND"; - - case 
CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND: - return "CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND"; - - case CUDA_ERROR_SHARED_OBJECT_INIT_FAILED: - return "CUDA_ERROR_SHARED_OBJECT_INIT_FAILED"; - - case CUDA_ERROR_OPERATING_SYSTEM: - return "CUDA_ERROR_OPERATING_SYSTEM"; - - case CUDA_ERROR_INVALID_HANDLE: - return "CUDA_ERROR_INVALID_HANDLE"; - - case CUDA_ERROR_NOT_FOUND: - return "CUDA_ERROR_NOT_FOUND"; - - case CUDA_ERROR_NOT_READY: - return "CUDA_ERROR_NOT_READY"; - - case CUDA_ERROR_LAUNCH_FAILED: - return "CUDA_ERROR_LAUNCH_FAILED"; - - case CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: - return "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES"; - - case CUDA_ERROR_LAUNCH_TIMEOUT: - return "CUDA_ERROR_LAUNCH_TIMEOUT"; - - case CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING: - return "CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING"; - - case CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED: - return "CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED"; - - case CUDA_ERROR_PEER_ACCESS_NOT_ENABLED: - return "CUDA_ERROR_PEER_ACCESS_NOT_ENABLED"; - - case CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE: - return "CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE"; - - case CUDA_ERROR_CONTEXT_IS_DESTROYED: - return "CUDA_ERROR_CONTEXT_IS_DESTROYED"; - - case CUDA_ERROR_ASSERT: - return "CUDA_ERROR_ASSERT"; - - case CUDA_ERROR_TOO_MANY_PEERS: - return "CUDA_ERROR_TOO_MANY_PEERS"; - - case CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED: - return "CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED"; - - case CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED: - return "CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED"; - - case CUDA_ERROR_UNKNOWN: - return "CUDA_ERROR_UNKNOWN"; - } - - return ""; -} -#endif - -#ifdef CUBLAS_API_H_ -// cuBLAS API errors -static const char *_cudaGetErrorEnum(cublasStatus_t error) -{ - switch (error) - { - case CUBLAS_STATUS_SUCCESS: - return "CUBLAS_STATUS_SUCCESS"; - - case CUBLAS_STATUS_NOT_INITIALIZED: - return "CUBLAS_STATUS_NOT_INITIALIZED"; - - case CUBLAS_STATUS_ALLOC_FAILED: - return "CUBLAS_STATUS_ALLOC_FAILED"; - - case CUBLAS_STATUS_INVALID_VALUE: 
- return "CUBLAS_STATUS_INVALID_VALUE"; - - case CUBLAS_STATUS_ARCH_MISMATCH: - return "CUBLAS_STATUS_ARCH_MISMATCH"; - - case CUBLAS_STATUS_MAPPING_ERROR: - return "CUBLAS_STATUS_MAPPING_ERROR"; - - case CUBLAS_STATUS_EXECUTION_FAILED: - return "CUBLAS_STATUS_EXECUTION_FAILED"; - - case CUBLAS_STATUS_INTERNAL_ERROR: - return "CUBLAS_STATUS_INTERNAL_ERROR"; - } - - return ""; -} -#endif - -#ifdef _CUFFT_H_ -// cuFFT API errors -static const char *_cudaGetErrorEnum(cufftResult error) -{ - switch (error) - { - case CUFFT_SUCCESS: - return "CUFFT_SUCCESS"; - - case CUFFT_INVALID_PLAN: - return "CUFFT_INVALID_PLAN"; - - case CUFFT_ALLOC_FAILED: - return "CUFFT_ALLOC_FAILED"; - - case CUFFT_INVALID_TYPE: - return "CUFFT_INVALID_TYPE"; - - case CUFFT_INVALID_VALUE: - return "CUFFT_INVALID_VALUE"; - - case CUFFT_INTERNAL_ERROR: - return "CUFFT_INTERNAL_ERROR"; - - case CUFFT_EXEC_FAILED: - return "CUFFT_EXEC_FAILED"; - - case CUFFT_SETUP_FAILED: - return "CUFFT_SETUP_FAILED"; - - case CUFFT_INVALID_SIZE: - return "CUFFT_INVALID_SIZE"; - - case CUFFT_UNALIGNED_DATA: - return "CUFFT_UNALIGNED_DATA"; - } - - return ""; -} -#endif - - -#ifdef CUSPARSEAPI -// cuSPARSE API errors -static const char *_cudaGetErrorEnum(cusparseStatus_t error) -{ - switch (error) - { - case CUSPARSE_STATUS_SUCCESS: - return "CUSPARSE_STATUS_SUCCESS"; - - case CUSPARSE_STATUS_NOT_INITIALIZED: - return "CUSPARSE_STATUS_NOT_INITIALIZED"; - - case CUSPARSE_STATUS_ALLOC_FAILED: - return "CUSPARSE_STATUS_ALLOC_FAILED"; - - case CUSPARSE_STATUS_INVALID_VALUE: - return "CUSPARSE_STATUS_INVALID_VALUE"; - - case CUSPARSE_STATUS_ARCH_MISMATCH: - return "CUSPARSE_STATUS_ARCH_MISMATCH"; - - case CUSPARSE_STATUS_MAPPING_ERROR: - return "CUSPARSE_STATUS_MAPPING_ERROR"; - - case CUSPARSE_STATUS_EXECUTION_FAILED: - return "CUSPARSE_STATUS_EXECUTION_FAILED"; - - case CUSPARSE_STATUS_INTERNAL_ERROR: - return "CUSPARSE_STATUS_INTERNAL_ERROR"; - - case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED: - return 
"CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED"; - } - - return ""; -} -#endif - -#ifdef CURAND_H_ -// cuRAND API errors -static const char *_cudaGetErrorEnum(curandStatus_t error) -{ - switch (error) - { - case CURAND_STATUS_SUCCESS: - return "CURAND_STATUS_SUCCESS"; - - case CURAND_STATUS_VERSION_MISMATCH: - return "CURAND_STATUS_VERSION_MISMATCH"; - - case CURAND_STATUS_NOT_INITIALIZED: - return "CURAND_STATUS_NOT_INITIALIZED"; - - case CURAND_STATUS_ALLOCATION_FAILED: - return "CURAND_STATUS_ALLOCATION_FAILED"; - - case CURAND_STATUS_TYPE_ERROR: - return "CURAND_STATUS_TYPE_ERROR"; - - case CURAND_STATUS_OUT_OF_RANGE: - return "CURAND_STATUS_OUT_OF_RANGE"; - - case CURAND_STATUS_LENGTH_NOT_MULTIPLE: - return "CURAND_STATUS_LENGTH_NOT_MULTIPLE"; - - case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED: - return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED"; - - case CURAND_STATUS_LAUNCH_FAILURE: - return "CURAND_STATUS_LAUNCH_FAILURE"; - - case CURAND_STATUS_PREEXISTING_FAILURE: - return "CURAND_STATUS_PREEXISTING_FAILURE"; - - case CURAND_STATUS_INITIALIZATION_FAILED: - return "CURAND_STATUS_INITIALIZATION_FAILED"; - - case CURAND_STATUS_ARCH_MISMATCH: - return "CURAND_STATUS_ARCH_MISMATCH"; - - case CURAND_STATUS_INTERNAL_ERROR: - return "CURAND_STATUS_INTERNAL_ERROR"; - } - - return ""; -} -#endif - -#ifdef NV_NPPIDEFS_H -// NPP API errors -static const char *_cudaGetErrorEnum(NppStatus error) -{ - switch (error) - { - case NPP_NOT_SUPPORTED_MODE_ERROR: - return "NPP_NOT_SUPPORTED_MODE_ERROR"; - - case NPP_ROUND_MODE_NOT_SUPPORTED_ERROR: - return "NPP_ROUND_MODE_NOT_SUPPORTED_ERROR"; - - case NPP_RESIZE_NO_OPERATION_ERROR: - return "NPP_RESIZE_NO_OPERATION_ERROR"; - - case NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY: - return "NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY"; - - case NPP_BAD_ARG_ERROR: - return "NPP_BAD_ARG_ERROR"; - - case NPP_LUT_NUMBER_OF_LEVELS_ERROR: - return "NPP_LUT_NUMBER_OF_LEVELS_ERROR"; - - case NPP_TEXTURE_BIND_ERROR: - return "NPP_TEXTURE_BIND_ERROR"; - - case 
NPP_COEFF_ERROR: - return "NPP_COEFF_ERROR"; - - case NPP_RECT_ERROR: - return "NPP_RECT_ERROR"; - - case NPP_QUAD_ERROR: - return "NPP_QUAD_ERROR"; - - case NPP_WRONG_INTERSECTION_ROI_ERROR: - return "NPP_WRONG_INTERSECTION_ROI_ERROR"; - - case NPP_NOT_EVEN_STEP_ERROR: - return "NPP_NOT_EVEN_STEP_ERROR"; - - case NPP_INTERPOLATION_ERROR: - return "NPP_INTERPOLATION_ERROR"; - - case NPP_RESIZE_FACTOR_ERROR: - return "NPP_RESIZE_FACTOR_ERROR"; - - case NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR: - return "NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR"; - - case NPP_MEMFREE_ERR: - return "NPP_MEMFREE_ERR"; - - case NPP_MEMSET_ERR: - return "NPP_MEMSET_ERR"; - - case NPP_MEMCPY_ERROR: - return "NPP_MEMCPY_ERROR"; - - case NPP_MEM_ALLOC_ERR: - return "NPP_MEM_ALLOC_ERR"; - - case NPP_HISTO_NUMBER_OF_LEVELS_ERROR: - return "NPP_HISTO_NUMBER_OF_LEVELS_ERROR"; - - case NPP_MIRROR_FLIP_ERR: - return "NPP_MIRROR_FLIP_ERR"; - - case NPP_INVALID_INPUT: - return "NPP_INVALID_INPUT"; - - case NPP_ALIGNMENT_ERROR: - return "NPP_ALIGNMENT_ERROR"; - - case NPP_STEP_ERROR: - return "NPP_STEP_ERROR"; - - case NPP_SIZE_ERROR: - return "NPP_SIZE_ERROR"; - - case NPP_POINTER_ERROR: - return "NPP_POINTER_ERROR"; - - case NPP_NULL_POINTER_ERROR: - return "NPP_NULL_POINTER_ERROR"; - - case NPP_CUDA_KERNEL_EXECUTION_ERROR: - return "NPP_CUDA_KERNEL_EXECUTION_ERROR"; - - case NPP_NOT_IMPLEMENTED_ERROR: - return "NPP_NOT_IMPLEMENTED_ERROR"; - - case NPP_ERROR: - return "NPP_ERROR"; - - case NPP_SUCCESS: - return "NPP_SUCCESS"; - - case NPP_WARNING: - return "NPP_WARNING"; - - case NPP_WRONG_INTERSECTION_QUAD_WARNING: - return "NPP_WRONG_INTERSECTION_QUAD_WARNING"; - - case NPP_MISALIGNED_DST_ROI_WARNING: - return "NPP_MISALIGNED_DST_ROI_WARNING"; - - case NPP_AFFINE_QUAD_INCORRECT_WARNING: - return "NPP_AFFINE_QUAD_INCORRECT_WARNING"; - - case NPP_DOUBLE_SIZE_WARNING: - return "NPP_DOUBLE_SIZE_WARNING"; - - case NPP_ODD_ROI_WARNING: - return "NPP_ODD_ROI_WARNING"; - - case 
NPP_WRONG_INTERSECTION_ROI_WARNING: - return "NPP_WRONG_INTERSECTION_ROI_WARNING"; - } - - return ""; -} -#endif - -template< typename T > -bool check(T result, char const *const func, const char *const file, int const line) -{ - if (result) - { - fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n", - file, line, static_cast(result), _cudaGetErrorEnum(result), func); - /* - std::stringstream ss; - std::string msg("CUDA error at "); - msg += file; - msg += ":"; - ss << line; - msg += ss.str(); - msg += " code="; - ss << static_cast(result); - msg += ss.str(); - msg += " ("; - msg += _cudaGetErrorEnum(result); - msg += ") \""; - msg += func; - msg += "\""; - //throw msg; - std::cerr << msg <<"\n"; - */ - return true; - } - else - { - return false; - } -} - -#ifdef __DRIVER_TYPES_H__ -// This will output the proper CUDA error strings in the event that a CUDA host call returns an error -#define checkCudaErrors(val) check ( (val), #val, __FILE__, __LINE__ ) - -// This will output the proper error string when calling cudaGetLastError -#define getLastCudaError(msg) __getLastCudaError (msg, __FILE__, __LINE__) - -inline void __getLastCudaError(const char *errorMessage, const char *file, const int line) -{ - cudaError_t err = cudaGetLastError(); - - if (cudaSuccess != err) - { - fprintf(stderr, "%s(%i) : getLastCudaError() CUDA error : %s : (%d) %s.\n", - file, line, errorMessage, (int)err, cudaGetErrorString(err)); - exit(EXIT_FAILURE); - } -} -#endif - -#ifndef MAX -#define MAX(a,b) (a > b ? 
a : b) -#endif - -// Beginning of GPU Architecture definitions -inline int _ConvertSMVer2Cores(int major, int minor) -{ - // Defines for GPU Architecture types (using the SM version to determine the # of cores per SM - typedef struct - { - int SM; // 0xMm (hexidecimal notation), M = SM Major version, and m = SM minor version - int Cores; - } sSMtoCores; - - sSMtoCores nGpuArchCoresPerSM[] = - { - { 0x10, 8 }, // Tesla Generation (SM 1.0) G80 class - { 0x11, 8 }, // Tesla Generation (SM 1.1) G8x class - { 0x12, 8 }, // Tesla Generation (SM 1.2) G9x class - { 0x13, 8 }, // Tesla Generation (SM 1.3) GT200 class - { 0x20, 32 }, // Fermi Generation (SM 2.0) GF100 class - { 0x21, 48 }, // Fermi Generation (SM 2.1) GF10x class - { 0x30, 192}, // Kepler Generation (SM 3.0) GK10x class - { 0x35, 192}, // Kepler Generation (SM 3.5) GK11x class - { -1, -1 } - }; - - int index = 0; - - while (nGpuArchCoresPerSM[index].SM != -1) - { - if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor)) - { - return nGpuArchCoresPerSM[index].Cores; - } - - index++; - } - - // If we don't find the values, we default use the previous one to run properly - printf("MapSMtoCores for SM %d.%d is undefined. Default to use %d Cores/SM\n", major, minor, nGpuArchCoresPerSM[7].Cores); - return nGpuArchCoresPerSM[7].Cores; -} -// end of GPU Architecture definitions - -#ifdef __CUDA_RUNTIME_H__ -// General GPU Device CUDA Initialization -inline int gpuDeviceInit(int devID) -{ - int deviceCount; - checkCudaErrors(cudaGetDeviceCount(&deviceCount)); - - if (deviceCount == 0) - { - fprintf(stderr, "gpuDeviceInit() CUDA error: no devices supporting CUDA.\n"); - exit(EXIT_FAILURE); - } - - if (devID < 0) - { - devID = 0; - } - - if (devID > deviceCount-1) - { - fprintf(stderr, "\n"); - fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n", deviceCount); - fprintf(stderr, ">> gpuDeviceInit (-device=%d) is not a valid GPU device. 
<<\n", devID); - fprintf(stderr, "\n"); - return -devID; - } - - cudaDeviceProp deviceProp; - checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID)); - - if (deviceProp.computeMode == cudaComputeModeProhibited) - { - fprintf(stderr, "Error: device is running in , no threads can use ::cudaSetDevice().\n"); - return -1; - } - - if (deviceProp.major < 1) - { - fprintf(stderr, "gpuDeviceInit(): GPU device does not support CUDA.\n"); - exit(EXIT_FAILURE); - } - - checkCudaErrors(cudaSetDevice(devID)); - printf("gpuDeviceInit() CUDA Device [%d]: \"%s\n", devID, deviceProp.name); - - return devID; -} - -// This function returns the best GPU (with maximum GFLOPS) -inline int gpuGetMaxGflopsDeviceId() -{ - int current_device = 0, sm_per_multiproc = 0; - int max_compute_perf = 0, max_perf_device = 0; - int device_count = 0, best_SM_arch = 0; - cudaDeviceProp deviceProp; - cudaGetDeviceCount(&device_count); - - // Find the best major SM Architecture GPU device - while (current_device < device_count) - { - cudaGetDeviceProperties(&deviceProp, current_device); - - // If this GPU is not running on Compute Mode prohibited, then we can add it to the list - if (deviceProp.computeMode != cudaComputeModeProhibited) - { - if (deviceProp.major > 0 && deviceProp.major < 9999) - { - best_SM_arch = MAX(best_SM_arch, deviceProp.major); - } - } - - current_device++; - } - - // Find the best CUDA capable GPU device - current_device = 0; - - while (current_device < device_count) - { - cudaGetDeviceProperties(&deviceProp, current_device); - - // If this GPU is not running on Compute Mode prohibited, then we can add it to the list - if (deviceProp.computeMode != cudaComputeModeProhibited) - { - if (deviceProp.major == 9999 && deviceProp.minor == 9999) - { - sm_per_multiproc = 1; - } - else - { - sm_per_multiproc = _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor); - } - - int compute_perf = deviceProp.multiProcessorCount * sm_per_multiproc * deviceProp.clockRate; - - if 
(compute_perf > max_compute_perf) - { - // If we find GPU with SM major > 2, search only these - if (best_SM_arch > 2) - { - // If our device==dest_SM_arch, choose this, or else pass - if (deviceProp.major == best_SM_arch) - { - max_compute_perf = compute_perf; - max_perf_device = current_device; - } - } - else - { - max_compute_perf = compute_perf; - max_perf_device = current_device; - } - } - } - - ++current_device; - } - - return max_perf_device; -} - - -// Initialization code to find the best CUDA Device -inline int findCudaDevice(int argc, const char **argv) -{ - cudaDeviceProp deviceProp; - int devID = 0; - - // If the command-line has a device number specified, use it - if (checkCmdLineFlag(argc, argv, "device")) - { - devID = getCmdLineArgumentInt(argc, argv, "device="); - - if (devID < 0) - { - printf("Invalid command line parameter\n "); - exit(EXIT_FAILURE); - } - else - { - devID = gpuDeviceInit(devID); - - if (devID < 0) - { - printf("exiting...\n"); - exit(EXIT_FAILURE); - } - } - } - else - { - // Otherwise pick the device with highest Gflops/s - devID = gpuGetMaxGflopsDeviceId(); - checkCudaErrors(cudaSetDevice(devID)); - checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID)); - printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, deviceProp.name, deviceProp.major, deviceProp.minor); - } - - return devID; -} - -// General check for CUDA GPU SM Capabilities -inline bool checkCudaCapabilities(int major_version, int minor_version) -{ - cudaDeviceProp deviceProp; - deviceProp.major = 0; - deviceProp.minor = 0; - int dev; - - checkCudaErrors(cudaGetDevice(&dev)); - checkCudaErrors(cudaGetDeviceProperties(&deviceProp, dev)); - - if ((deviceProp.major > major_version) || - (deviceProp.major == major_version && deviceProp.minor >= minor_version)) - { - printf("> Device %d: <%16s >, Compute SM %d.%d detected\n", dev, deviceProp.name, deviceProp.major, deviceProp.minor); - return true; - } - else - { - printf("No GPU device was found 
that can support CUDA compute capability %d.%d.\n", major_version, minor_version); - return false; - } -} -#endif - -// end of CUDA Helper Functions - - -#endif diff --git a/src/Benchmarks/Sorting/ReferenceAlgorithms/helpers/helper_string.h b/src/Benchmarks/Sorting/ReferenceAlgorithms/helpers/helper_string.h deleted file mode 100644 index 62b7156bc..000000000 --- a/src/Benchmarks/Sorting/ReferenceAlgorithms/helpers/helper_string.h +++ /dev/null @@ -1,421 +0,0 @@ -/** - * Copyright 1993-2012 NVIDIA Corporation. All rights reserved. - * - * Please refer to the NVIDIA end user license agreement (EULA) associated - * with this source code for terms and conditions that govern your use of - * this software. Any use, reproduction, disclosure, or distribution of - * this software and related documentation outside the terms of the EULA - * is strictly prohibited. - * - */ - -// These are helper functions for the SDK samples (string parsing, timers, etc) -#ifndef STRING_HELPER_H -#define STRING_HELPER_H - -#include -#include -#include -#include - -#ifdef _WIN32 -#ifndef STRCASECMP -#define STRCASECMP _stricmp -#endif -#ifndef STRNCASECMP -#define STRNCASECMP _strnicmp -#endif -#ifndef STRCPY -#define STRCPY(sFilePath, nLength, sPath) strcpy_s(sFilePath, nLength, sPath) -#endif - -#ifndef FOPEN -#define FOPEN(fHandle,filename,mode) fopen_s(&fHandle, filename, mode) -#endif -#ifndef FOPEN_FAIL -#define FOPEN_FAIL(result) (result != 0) -#endif -#ifndef SSCANF -#define SSCANF sscanf_s -#endif - -#else -#include -#include - -#ifndef STRCASECMP -#define STRCASECMP strcasecmp -#endif -#ifndef STRNCASECMP -#define STRNCASECMP strncasecmp -#endif -#ifndef STRCPY -#define STRCPY(sFilePath, nLength, sPath) strcpy(sFilePath, sPath) -#endif - -#ifndef FOPEN -#define FOPEN(fHandle,filename,mode) (fHandle = fopen(filename, mode)) -#endif -#ifndef FOPEN_FAIL -#define FOPEN_FAIL(result) (result == NULL) -#endif -#ifndef SSCANF -#define SSCANF sscanf -#endif -#endif - -// CUDA Utility Helper 
Functions -inline int stringRemoveDelimiter(char delimiter, const char *string) -{ - int string_start = 0; - - while (string[string_start] == delimiter) - { - string_start++; - } - - if (string_start >= (int)strlen(string)-1) - { - return 0; - } - - return string_start; -} - -inline int getFileExtension(char *filename, char **extension) -{ - int string_length = (int)strlen(filename); - - while (filename[string_length--] != '.') { - if (string_length == 0) - break; - } - if (string_length > 0) string_length += 2; - - if (string_length == 0) - *extension = NULL; - else - *extension = &filename[string_length]; - - return string_length; -} - - -inline int checkCmdLineFlag(const int argc, const char **argv, const char *string_ref) -{ - bool bFound = false; - - if (argc >= 1) - { - for (int i=1; i < argc; i++) - { - int string_start = stringRemoveDelimiter('-', argv[i]); - const char *string_argv = &argv[i][string_start]; - - const char *equal_pos = strchr(string_argv, '='); - int argv_length = (int)(equal_pos == 0 ? strlen(string_argv) : equal_pos - string_argv); - - int length = (int)strlen(string_ref); - - if (length == argv_length && !STRNCASECMP(string_argv, string_ref, length)) - { - - bFound = true; - continue; - } - } - } - - return (int)bFound; -} - -inline int getCmdLineArgumentInt(const int argc, const char **argv, const char *string_ref) -{ - bool bFound = false; - int value = -1; - - if (argc >= 1) - { - for (int i=1; i < argc; i++) - { - int string_start = stringRemoveDelimiter('-', argv[i]); - const char *string_argv = &argv[i][string_start]; - int length = (int)strlen(string_ref); - - if (!STRNCASECMP(string_argv, string_ref, length)) - { - if (length+1 <= (int)strlen(string_argv)) - { - int auto_inc = (string_argv[length] == '=') ? 
1 : 0; - value = atoi(&string_argv[length + auto_inc]); - } - else - { - value = 0; - } - - bFound = true; - continue; - } - } - } - - if (bFound) - { - return value; - } - else - { - return 0; - } -} - -inline float getCmdLineArgumentFloat(const int argc, const char **argv, const char *string_ref) -{ - bool bFound = false; - float value = -1; - - if (argc >= 1) - { - for (int i=1; i < argc; i++) - { - int string_start = stringRemoveDelimiter('-', argv[i]); - const char *string_argv = &argv[i][string_start]; - int length = (int)strlen(string_ref); - - if (!STRNCASECMP(string_argv, string_ref, length)) - { - if (length+1 <= (int)strlen(string_argv)) - { - int auto_inc = (string_argv[length] == '=') ? 1 : 0; - value = (float)atof(&string_argv[length + auto_inc]); - } - else - { - value = 0.f; - } - - bFound = true; - continue; - } - } - } - - if (bFound) - { - return value; - } - else - { - return 0; - } -} - -inline bool getCmdLineArgumentString(const int argc, const char **argv, - const char *string_ref, char **string_retval) -{ - bool bFound = false; - - if (argc >= 1) - { - for (int i=1; i < argc; i++) - { - int string_start = stringRemoveDelimiter('-', argv[i]); - char *string_argv = (char *)&argv[i][string_start]; - int length = (int)strlen(string_ref); - - if (!STRNCASECMP(string_argv, string_ref, length)) - { - *string_retval = &string_argv[length+1]; - bFound = true; - continue; - } - } - } - - if (!bFound) - { - *string_retval = NULL; - } - - return bFound; -} - -////////////////////////////////////////////////////////////////////////////// -//! Find the path for a file assuming that -//! files are found in the searchPath. -//! -//! @return the path if succeeded, otherwise 0 -//! @param filename name of the file -//! 
@param executable_path optional absolute path of the executable -////////////////////////////////////////////////////////////////////////////// -inline char *sdkFindFilePath(const char *filename, const char *executable_path) -{ - // defines a variable that is replaced with the name of the executable - - // Typical relative search paths to locate needed companion files (e.g. sample input data, or JIT source files) - // The origin for the relative search may be the .exe file, a .bat file launching an .exe, a browser .exe launching the .exe or .bat, etc - const char *searchPath[] = - { - "./", // same dir - "./common/", // "/common/" subdir - "./common/data/", // "/common/data/" subdir - "./data/", // "/data/" subdir - "./src/", // "/src/" subdir - "./src//data/", // "/src//data/" subdir - "./inc/", // "/inc/" subdir - "./0_Simple/", // "/0_Simple/" subdir - "./1_Utilities/", // "/1_Utilities/" subdir - "./2_Graphics/", // "/2_Graphics/" subdir - "./3_Imaging/", // "/3_Imaging/" subdir - "./4_Financial/", // "/4_Financial/" subdir - "./5_Simulations/", // "/5_Simulations/" subdir - "./6_Advanced/", // "/6_Advanced/" subdir - "./7_CUDALibraries/", // "/7_CUDALibraries/" subdir - - "../", // up 1 in tree - "../common/", // up 1 in tree, "/common/" subdir - "../common/data/", // up 1 in tree, "/common/data/" subdir - "../data/", // up 1 in tree, "/data/" subdir - "../src/", // up 1 in tree, "/src/" subdir - "../inc/", // up 1 in tree, "/inc/" subdir - "../C/src//", // up 1 in tree, "/C/src//" subdir - "../C/src//data/", // up 1 in tree, "/C/src//data/" subdir - "../C/src//src/", // up 1 in tree, "/C/src//src/" subdir - "../C/src//inc/", // up 1 in tree, "/C/src//inc/" subdir - "../C/", // up 1 in tree - "../C/common/", // up 1 in tree, "/common/" subdir - "../C/common/data/", // up 1 in tree, "/common/data/" subdir - "../C/data/", // up 1 in tree, "/data/" subdir - "../C/src/", // up 1 in tree, "/src/" subdir - "../C/inc/", // up 1 in tree, "/inc/" subdir - 
"../C/0_Simple//data/", // up 1 in tree, "/0_Simple//" subdir - "../C/1_Utilities//data/", // up 1 in tree, "/1_Utilities//" subdir - "../C/2_Graphics//data/", // up 1 in tree, "/2_Graphics//" subdir - "../C/3_Imaging//data/", // up 1 in tree, "/3_Imaging//" subdir - "../C/4_Financial//data/", // up 1 in tree, "/4_Financial//" subdir - "../C/5_Simulations//data/", // up 1 in tree, "/5_Simulations//" subdir - "../C/6_Advanced//data/", // up 1 in tree, "/6_Advanced//" subdir - "../C/7_CUDALibraries//data/", // up 1 in tree, "/7_CUDALibraries//" subdir - - "../0_Simple//data/", // up 1 in tree, "/0_Simple//" subdir - "../1_Utilities//data/", // up 1 in tree, "/1_Utilities//" subdir - "../2_Graphics//data/", // up 1 in tree, "/2_Graphics//" subdir - "../3_Imaging//data/", // up 1 in tree, "/3_Imaging//" subdir - "../4_Financial//data/", // up 1 in tree, "/4_Financial//" subdir - "../5_Simulations//data/", // up 1 in tree, "/5_Simulations//" subdir - "../6_Advanced//data/", // up 1 in tree, "/6_Advanced//" subdir - "../7_CUDALibraries//data/", // up 1 in tree, "/7_CUDALibraries//" subdir - "../../", // up 2 in tree - "../../common/", // up 2 in tree, "/common/" subdir - "../../common/data/", // up 2 in tree, "/common/data/" subdir - "../../data/", // up 2 in tree, "/data/" subdir - "../../src/", // up 2 in tree, "/src/" subdir - "../../inc/", // up 2 in tree, "/inc/" subdir - "../../sandbox//data/", // up 2 in tree, "/sandbox//" subdir - "../../0_Simple//data/", // up 2 in tree, "/0_Simple//" subdir - "../../1_Utilities//data/", // up 2 in tree, "/1_Utilities//" subdir - "../../2_Graphics//data/", // up 2 in tree, "/2_Graphics//" subdir - "../../3_Imaging//data/", // up 2 in tree, "/3_Imaging//" subdir - "../../4_Financial//data/", // up 2 in tree, "/4_Financial//" subdir - "../../5_Simulations//data/", // up 2 in tree, "/5_Simulations//" subdir - "../../6_Advanced//data/", // up 2 in tree, "/6_Advanced//" subdir - "../../7_CUDALibraries//data/", // up 2 in tree, 
"/7_CUDALibraries//" subdir - "../../../", // up 3 in tree - "../../../src//", // up 3 in tree, "/src//" subdir - "../../../src//data/", // up 3 in tree, "/src//data/" subdir - "../../../src//src/", // up 3 in tree, "/src//src/" subdir - "../../../src//inc/", // up 3 in tree, "/src//inc/" subdir - "../../../sandbox//", // up 3 in tree, "/sandbox//" subdir - "../../../sandbox//data/", // up 3 in tree, "/sandbox//data/" subdir - "../../../sandbox//src/", // up 3 in tree, "/sandbox//src/" subdir - "../../../sandbox//inc/", // up 3 in tree, "/sandbox//inc/" subdir - "../../../0_Simple//data/", // up 3 in tree, "/0_Simple//" subdir - "../../../1_Utilities//data/", // up 3 in tree, "/1_Utilities//" subdir - "../../../2_Graphics//data/", // up 3 in tree, "/2_Graphics//" subdir - "../../../3_Imaging//data/", // up 3 in tree, "/3_Imaging//" subdir - "../../../4_Financial//data/", // up 3 in tree, "/4_Financial//" subdir - "../../../5_Simulations//data/",// up 3 in tree, "/5_Simulations//" subdir - "../../../6_Advanced//data/", // up 3 in tree, "/6_Advanced//" subdir - "../../../7_CUDALibraries//data/", // up 3 in tree, "/7_CUDALibraries//" subdir - "../../../common/", // up 3 in tree, "../../../common/" subdir - "../../../common/data/", // up 3 in tree, "../../../common/data/" subdir - "../../../data/", // up 3 in tree, "../../../data/" subdir - }; - - // Extract the executable name - std::string executable_name; - - if (executable_path != 0) - { - executable_name = std::string(executable_path); - -#ifdef _WIN32 - // Windows path delimiter - size_t delimiter_pos = executable_name.find_last_of('\\'); - executable_name.erase(0, delimiter_pos + 1); - - if (executable_name.rfind(".exe") != std::string::npos) - { - // we strip .exe, only if the .exe is found - executable_name.resize(executable_name.size() - 4); - } - -#else - // Linux & OSX path delimiter - size_t delimiter_pos = executable_name.find_last_of('/'); - executable_name.erase(0,delimiter_pos+1); -#endif - } - - // 
Loop over all search paths and return the first hit - for (unsigned int i = 0; i < sizeof(searchPath)/sizeof(char *); ++i) - { - std::string path(searchPath[i]); - size_t executable_name_pos = path.find(""); - - // If there is executable_name variable in the searchPath - // replace it with the value - if (executable_name_pos != std::string::npos) - { - if (executable_path != 0) - { - path.replace(executable_name_pos, strlen(""), executable_name); - } - else - { - // Skip this path entry if no executable argument is given - continue; - } - } - -#ifdef _DEBUG - printf("sdkFindFilePath <%s> in %s\n", filename, path.c_str()); -#endif - - // Test if the file exists - path.append(filename); - FILE *fp; - FOPEN(fp, path.c_str(), "rb"); - - if (fp != NULL) - { - fclose(fp); - // File found - // returning an allocated array here for backwards compatibility reasons - char *file_path = (char *) malloc(path.length() + 1); - STRCPY(file_path, path.length() + 1, path.c_str()); - return file_path; - } - - if (fp) - { - fclose(fp); - } - } - - // File not found - return 0; -} - -#endif diff --git a/src/Benchmarks/Sorting/ReferenceAlgorithms/helpers/helper_timer.h b/src/Benchmarks/Sorting/ReferenceAlgorithms/helpers/helper_timer.h deleted file mode 100644 index 3cb4fece4..000000000 --- a/src/Benchmarks/Sorting/ReferenceAlgorithms/helpers/helper_timer.h +++ /dev/null @@ -1,495 +0,0 @@ -/** - * Copyright 1993-2012 NVIDIA Corporation. All rights reserved. - * - * Please refer to the NVIDIA end user license agreement (EULA) associated - * with this source code for terms and conditions that govern your use of - * this software. Any use, reproduction, disclosure, or distribution of - * this software and related documentation outside the terms of the EULA - * is strictly prohibited. 
- * - */ - -// Helper Timing Functions -#ifndef HELPER_TIMER_H -#define HELPER_TIMER_H - -// includes, system -#include - -// includes, project -#include "exception.h" - -// Definition of the StopWatch Interface, this is used if we don't want to use the CUT functions -// But rather in a self contained class interface -class StopWatchInterface -{ - public: - StopWatchInterface() {}; - virtual ~StopWatchInterface() {}; - - public: - //! Start time measurement - virtual void start() = 0; - - //! Stop time measurement - virtual void stop() = 0; - - //! Reset time counters to zero - virtual void reset() = 0; - - //! Time in msec. after start. If the stop watch is still running (i.e. there - //! was no call to stop()) then the elapsed time is returned, otherwise the - //! time between the last start() and stop call is returned - virtual float getTime() = 0; - - //! Mean time to date based on the number of times the stopwatch has been - //! _stopped_ (ie finished sessions) and the current total time - virtual float getAverageTime() = 0; -}; - - -////////////////////////////////////////////////////////////////// -// Begin Stopwatch timer class definitions for all OS platforms // -////////////////////////////////////////////////////////////////// -#ifdef _WIN32 -// includes, system -#define WINDOWS_LEAN_AND_MEAN -#include -#undef min -#undef max - -//! Windows specific implementation of StopWatch -class StopWatchWin : public StopWatchInterface -{ - public: - //! Constructor, default - StopWatchWin() : - start_time(), end_time(), - diff_time(0.0f), total_time(0.0f), - running(false), clock_sessions(0), freq(0), freq_set(false) - { - if (! 
freq_set) - { - // helper variable - LARGE_INTEGER temp; - - // get the tick frequency from the OS - QueryPerformanceFrequency((LARGE_INTEGER *) &temp); - - // convert to type in which it is needed - freq = ((double) temp.QuadPart) / 1000.0; - - // rememeber query - freq_set = true; - } - }; - - // Destructor - ~StopWatchWin() { }; - - public: - //! Start time measurement - inline void start(); - - //! Stop time measurement - inline void stop(); - - //! Reset time counters to zero - inline void reset(); - - //! Time in msec. after start. If the stop watch is still running (i.e. there - //! was no call to stop()) then the elapsed time is returned, otherwise the - //! time between the last start() and stop call is returned - inline float getTime(); - - //! Mean time to date based on the number of times the stopwatch has been - //! _stopped_ (ie finished sessions) and the current total time - inline float getAverageTime(); - - private: - // member variables - - //! Start of measurement - LARGE_INTEGER start_time; - //! End of measurement - LARGE_INTEGER end_time; - - //! Time difference between the last start and stop - float diff_time; - - //! TOTAL time difference between starts and stops - float total_time; - - //! flag if the stop watch is running - bool running; - - //! Number of times clock has been started - //! and stopped to allow averaging - int clock_sessions; - - //! tick frequency - double freq; - - //! flag if the frequency has been set - bool freq_set; -}; - -// functions, inlined - -//////////////////////////////////////////////////////////////////////////////// -//! Start time measurement -//////////////////////////////////////////////////////////////////////////////// -inline void -StopWatchWin::start() -{ - QueryPerformanceCounter((LARGE_INTEGER *) &start_time); - running = true; -} - -//////////////////////////////////////////////////////////////////////////////// -//! Stop time measurement and increment add to the current diff_time summation -//! 
variable. Also increment the number of times this clock has been run. -//////////////////////////////////////////////////////////////////////////////// -inline void -StopWatchWin::stop() -{ - QueryPerformanceCounter((LARGE_INTEGER *) &end_time); - diff_time = (float) - (((double) end_time.QuadPart - (double) start_time.QuadPart) / freq); - - total_time += diff_time; - clock_sessions++; - running = false; -} - -//////////////////////////////////////////////////////////////////////////////// -//! Reset the timer to 0. Does not change the timer running state but does -//! recapture this point in time as the current start time if it is running. -//////////////////////////////////////////////////////////////////////////////// -inline void -StopWatchWin::reset() -{ - diff_time = 0; - total_time = 0; - clock_sessions = 0; - - if (running) - { - QueryPerformanceCounter((LARGE_INTEGER *) &start_time); - } -} - - -//////////////////////////////////////////////////////////////////////////////// -//! Time in msec. after start. If the stop watch is still running (i.e. there -//! was no call to stop()) then the elapsed time is returned added to the -//! current diff_time sum, otherwise the current summed time difference alone -//! is returned. -//////////////////////////////////////////////////////////////////////////////// -inline float -StopWatchWin::getTime() -{ - // Return the TOTAL time to date - float retval = total_time; - - if (running) - { - LARGE_INTEGER temp; - QueryPerformanceCounter((LARGE_INTEGER *) &temp); - retval += (float) - (((double)(temp.QuadPart - start_time.QuadPart)) / freq); - } - - return retval; -} - -//////////////////////////////////////////////////////////////////////////////// -//! Time in msec. for a single run based on the total number of COMPLETED runs -//! and the total time. -//////////////////////////////////////////////////////////////////////////////// -inline float -StopWatchWin::getAverageTime() -{ - return (clock_sessions > 0) ? 
(total_time/clock_sessions) : 0.0f; -} -#else -// Declarations for Stopwatch on Linux and Mac OSX -// includes, system -#include -#include - -//! Windows specific implementation of StopWatch -class StopWatchLinux : public StopWatchInterface -{ - public: - //! Constructor, default - StopWatchLinux() : - start_time(), diff_time(0.0), total_time(0.0), - running(false), clock_sessions(0) - { }; - - // Destructor - virtual ~StopWatchLinux() - { }; - - public: - //! Start time measurement - inline void start(); - - //! Stop time measurement - inline void stop(); - - //! Reset time counters to zero - inline void reset(); - - //! Time in msec. after start. If the stop watch is still running (i.e. there - //! was no call to stop()) then the elapsed time is returned, otherwise the - //! time between the last start() and stop call is returned - inline float getTime(); - - //! Mean time to date based on the number of times the stopwatch has been - //! _stopped_ (ie finished sessions) and the current total time - inline float getAverageTime(); - - private: - - // helper functions - - //! Get difference between start time and current time - inline float getDiffTime(); - - private: - - // member variables - - //! Start of measurement - struct timeval start_time; - - //! Time difference between the last start and stop - float diff_time; - - //! TOTAL time difference between starts and stops - float total_time; - - //! flag if the stop watch is running - bool running; - - //! Number of times clock has been started - //! and stopped to allow averaging - int clock_sessions; -}; - -// functions, inlined - -//////////////////////////////////////////////////////////////////////////////// -//! Start time measurement -//////////////////////////////////////////////////////////////////////////////// -inline void -StopWatchLinux::start() -{ - gettimeofday(&start_time, 0); - running = true; -} - -//////////////////////////////////////////////////////////////////////////////// -//! 
Stop time measurement and increment add to the current diff_time summation -//! variable. Also increment the number of times this clock has been run. -//////////////////////////////////////////////////////////////////////////////// -inline void -StopWatchLinux::stop() -{ - diff_time = getDiffTime(); - total_time += diff_time; - running = false; - clock_sessions++; -} - -//////////////////////////////////////////////////////////////////////////////// -//! Reset the timer to 0. Does not change the timer running state but does -//! recapture this point in time as the current start time if it is running. -//////////////////////////////////////////////////////////////////////////////// -inline void -StopWatchLinux::reset() -{ - diff_time = 0; - total_time = 0; - clock_sessions = 0; - - if (running) - { - gettimeofday(&start_time, 0); - } -} - -//////////////////////////////////////////////////////////////////////////////// -//! Time in msec. after start. If the stop watch is still running (i.e. there -//! was no call to stop()) then the elapsed time is returned added to the -//! current diff_time sum, otherwise the current summed time difference alone -//! is returned. -//////////////////////////////////////////////////////////////////////////////// -inline float -StopWatchLinux::getTime() -{ - // Return the TOTAL time to date - float retval = total_time; - - if (running) - { - retval += getDiffTime(); - } - - return retval; -} - -//////////////////////////////////////////////////////////////////////////////// -//! Time in msec. for a single run based on the total number of COMPLETED runs -//! and the total time. -//////////////////////////////////////////////////////////////////////////////// -inline float -StopWatchLinux::getAverageTime() -{ - return (clock_sessions > 0) ? 
(total_time/clock_sessions) : 0.0f; -} -//////////////////////////////////////////////////////////////////////////////// - -//////////////////////////////////////////////////////////////////////////////// -inline float -StopWatchLinux::getDiffTime() -{ - struct timeval t_time; - gettimeofday(&t_time, 0); - - // time difference in milli-seconds - return (float)(1000.0 * (t_time.tv_sec - start_time.tv_sec) - + (0.001 * (t_time.tv_usec - start_time.tv_usec))); -} -#endif // _WIN32 - -//////////////////////////////////////////////////////////////////////////////// -//! Timer functionality exported - -//////////////////////////////////////////////////////////////////////////////// -//! Create a new timer -//! @return true if a time has been created, otherwise false -//! @param name of the new timer, 0 if the creation failed -//////////////////////////////////////////////////////////////////////////////// -inline bool -sdkCreateTimer(StopWatchInterface **timer_interface) -{ - //printf("sdkCreateTimer called object %08x\n", (void *)*timer_interface); -#ifdef _WIN32 - *timer_interface = (StopWatchInterface *)new StopWatchWin(); -#else - *timer_interface = (StopWatchInterface *)new StopWatchLinux(); -#endif - return (*timer_interface != NULL) ? true : false; -} - - -//////////////////////////////////////////////////////////////////////////////// -//! Delete a timer -//! @return true if a time has been deleted, otherwise false -//! @param name of the timer to delete -//////////////////////////////////////////////////////////////////////////////// -inline bool -sdkDeleteTimer(StopWatchInterface **timer_interface) -{ - //printf("sdkDeleteTimer called object %08x\n", (void *)*timer_interface); - if (*timer_interface) - { - delete *timer_interface; - *timer_interface = NULL; - } - - return true; -} - -//////////////////////////////////////////////////////////////////////////////// -//! Start the time with name \a name -//! 
@param name name of the timer to start -//////////////////////////////////////////////////////////////////////////////// -inline bool -sdkStartTimer(StopWatchInterface **timer_interface) -{ - //printf("sdkStartTimer called object %08x\n", (void *)*timer_interface); - if (*timer_interface) - { - (*timer_interface)->start(); - } - - return true; -} - -//////////////////////////////////////////////////////////////////////////////// -//! Stop the time with name \a name. Does not reset. -//! @param name name of the timer to stop -//////////////////////////////////////////////////////////////////////////////// -inline bool -sdkStopTimer(StopWatchInterface **timer_interface) -{ - // printf("sdkStopTimer called object %08x\n", (void *)*timer_interface); - if (*timer_interface) - { - (*timer_interface)->stop(); - } - - return true; -} - -//////////////////////////////////////////////////////////////////////////////// -//! Resets the timer's counter. -//! @param name name of the timer to reset. -//////////////////////////////////////////////////////////////////////////////// -inline bool -sdkResetTimer(StopWatchInterface **timer_interface) -{ - // printf("sdkResetTimer called object %08x\n", (void *)*timer_interface); - if (*timer_interface) - { - (*timer_interface)->reset(); - } - - return true; -} - -//////////////////////////////////////////////////////////////////////////////// -//! Return the average time for timer execution as the total time -//! for the timer dividied by the number of completed (stopped) runs the timer -//! has made. -//! Excludes the current running time if the timer is currently running. -//! 
@param name name of the timer to return the time of -//////////////////////////////////////////////////////////////////////////////// -inline float -sdkGetAverageTimerValue(StopWatchInterface **timer_interface) -{ - // printf("sdkGetAverageTimerValue called object %08x\n", (void *)*timer_interface); - if (*timer_interface) - { - return (*timer_interface)->getAverageTime(); - } - else - { - return 0.0f; - } -} - -//////////////////////////////////////////////////////////////////////////////// -//! Total execution time for the timer over all runs since the last reset -//! or timer creation. -//! @param name name of the timer to obtain the value of. -//////////////////////////////////////////////////////////////////////////////// -inline float -sdkGetTimerValue(StopWatchInterface **timer_interface) -{ - // printf("sdkGetTimerValue called object %08x\n", (void *)*timer_interface); - if (*timer_interface) - { - return (*timer_interface)->getTime(); - } - else - { - return 0.0f; - } -} - -#endif // HELPER_TIMER_H diff --git a/src/Benchmarks/Sorting/ReferenceAlgorithms/helpers/scan_common.h b/src/Benchmarks/Sorting/ReferenceAlgorithms/helpers/scan_common.h deleted file mode 100644 index 80b93d574..000000000 --- a/src/Benchmarks/Sorting/ReferenceAlgorithms/helpers/scan_common.h +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Copyright 1993-2012 NVIDIA Corporation. All rights reserved. - * - * Please refer to the NVIDIA end user license agreement (EULA) associated - * with this source code for terms and conditions that govern your use of - * this software. Any use, reproduction, disclosure, or distribution of - * this software and related documentation outside the terms of the EULA - * is strictly prohibited. 
- * - */ - -#ifndef SCAN_COMMON_H -#define SCAN_COMMON_H - -#include - -//////////////////////////////////////////////////////////////////////////////// -// Shortcut typename -//////////////////////////////////////////////////////////////////////////////// -typedef unsigned int uint; - -//////////////////////////////////////////////////////////////////////////////// -// Implementation limits -//////////////////////////////////////////////////////////////////////////////// -extern "C" const uint MAX_BATCH_ELEMENTS; -extern "C" const uint MIN_SHORT_ARRAY_SIZE; -extern "C" const uint MAX_SHORT_ARRAY_SIZE; -extern "C" const uint MIN_LARGE_ARRAY_SIZE; -extern "C" const uint MAX_LARGE_ARRAY_SIZE; - -//////////////////////////////////////////////////////////////////////////////// -// CUDA scan -//////////////////////////////////////////////////////////////////////////////// -extern "C" void initScan(void); -extern "C" void closeScan(void); - -extern "C" size_t scanExclusiveShort( - uint *d_Dst, - uint *d_Src, - uint batchSize, - uint arrayLength -); - -extern "C" size_t scanExclusiveLarge( - uint *d_Dst, - uint *d_Src, - uint batchSize, - uint arrayLength -); - -//////////////////////////////////////////////////////////////////////////////// -// Reference CPU scan -//////////////////////////////////////////////////////////////////////////////// -extern "C" void scanExclusiveHost( - uint *dst, - uint *src, - uint batchSize, - uint arrayLength -); - -#endif -- GitLab From 97df27df81656f69cc7cce6c77968f4232d537f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Thu, 15 Jul 2021 21:08:22 +0200 Subject: [PATCH 245/258] Fixing comparison warnings. 
--- src/TNL/Algorithms/Sorting/detail/bitonicSort.h | 2 +- src/UnitTests/Algorithms/Sorting/BitonicSortTest.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/TNL/Algorithms/Sorting/detail/bitonicSort.h b/src/TNL/Algorithms/Sorting/detail/bitonicSort.h index 1b888251a..c88dac726 100644 --- a/src/TNL/Algorithms/Sorting/detail/bitonicSort.h +++ b/src/TNL/Algorithms/Sorting/detail/bitonicSort.h @@ -230,7 +230,7 @@ void bitonicSort(TNL::Containers::ArrayView src, int const int maxThreadsPerBlock = 512; int sharedMemLen = maxThreadsPerBlock * 2; - int sharedMemSize = sharedMemLen * sizeof(Value); + size_t sharedMemSize = sharedMemLen * sizeof(Value); if (sharedMemSize <= deviceProp.sharedMemPerBlock) { diff --git a/src/UnitTests/Algorithms/Sorting/BitonicSortTest.h b/src/UnitTests/Algorithms/Sorting/BitonicSortTest.h index e8fad07a0..9c8116646 100644 --- a/src/UnitTests/Algorithms/Sorting/BitonicSortTest.h +++ b/src/UnitTests/Algorithms/Sorting/BitonicSortTest.h @@ -371,7 +371,7 @@ TEST(fetchAndSwap, sortMiddle) std::vector orig{5, 9, 4, 54, 21, 6, 7, 9, 0, 9, 42, 4}; TNL::Containers::Array cudaArr(orig); auto view = cudaArr.getView(); - int from = 3, to = 8; + size_t from = 3, to = 8; fetchAndSwap_sortMiddle(view, from, to); EXPECT_TRUE(Algorithms::isSorted(view.getView(3, 8))) << "result " << view << std::endl; -- GitLab From 95b5e99d5ac4b4c095d68a5b9a5284db38a2ccb3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Thu, 15 Jul 2021 21:08:57 +0200 Subject: [PATCH 246/258] Fixing build of tnl-benchmark-sort using CUDA samples. 
--- CMakeLists.txt | 2 +- build | 4 ++-- src/Benchmarks/Sorting/Measurer.h | 9 ++------- src/Benchmarks/Sorting/tnl-benchmark-sort.h | 4 ++++ src/TNL/Algorithms/Sorting/detail/Quicksorter.hpp | 2 +- 5 files changed, 10 insertions(+), 11 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 949444d32..0159f9f82 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -255,7 +255,7 @@ if( ${WITH_CUDA} ) set( CMAKE_EXECUTABLE_SUFFIX "${executable_suffix_backup}" ) endif() if( NOT CUDA_SAMPLES_DIR STREQUAL "none" ) - set( CUDA_SAMPLES_FLAGS "-I${CUDA_SAMPLES_DIR}/common/inc") + set( CUDA_SAMPLES_FLAGS "-I${CUDA_SAMPLES_DIR}/common/inc -DHAVE_CUDA_SAMPLES") endif() endif() diff --git a/build b/build index 22468ccdb..f8adef1a4 100755 --- a/build +++ b/build @@ -94,7 +94,7 @@ Options for the 'tests' and 'matrix-tests' targets: --with-system-gtest=yes/no Use GTest installed in the local system and do not download the latest version. '$WITH_SYSTEM_GTEST' by default. External dependencies: - --cuda-samples-dir=PATH CUDA samples are used by some reference algorithms used in benchmarks. '$CUDA_SAMPLES_PATH` by default. + --cuda-samples-path=PATH CUDA samples are used by some reference algorithms used in benchmarks. '$CUDA_SAMPLES_PATH' by default. EOF } @@ -130,7 +130,7 @@ for option in "$@"; do --with-coverage=* ) WITH_COVERAGE="${option#*=}" ;; --with-ci-flags=* ) WITH_CI_FLAGS="${option#*=}" ;; --with-system-gtest=* ) WITH_SYSTEM_GTEST="${option#*=}" ;; - --cuda-samples-path=* ) CUDA_SAMPLES_DIR="${option#*=}" ;; + --cuda-samples-path=* ) CUDA_SAMPLES_DIR="${option#*=}" ;; -* ) echo "Unknown option $option. Use --help for more information." 
>&2 exit 1 diff --git a/src/Benchmarks/Sorting/Measurer.h b/src/Benchmarks/Sorting/Measurer.h index 7f7990f48..596f63d88 100644 --- a/src/Benchmarks/Sorting/Measurer.h +++ b/src/Benchmarks/Sorting/Measurer.h @@ -8,7 +8,9 @@ #include #ifdef HAVE_CUDA +#ifdef HAVE_CUDA_SAMPLES #include "ReferenceAlgorithms/MancaQuicksort.h" +#endif #include "ReferenceAlgorithms/CedermanQuicksort.h" #endif @@ -16,13 +18,6 @@ using namespace TNL; -/*struct STLSorter -{ - template< typename Value > - static void sort( std::vector< Value >& vec ) { std::sort( vec.begin(), vec.end() ); }; -};*/ - - template< typename Sorter > struct Measurer { diff --git a/src/Benchmarks/Sorting/tnl-benchmark-sort.h b/src/Benchmarks/Sorting/tnl-benchmark-sort.h index bb5d15dbc..60fad1cd9 100644 --- a/src/Benchmarks/Sorting/tnl-benchmark-sort.h +++ b/src/Benchmarks/Sorting/tnl-benchmark-sort.h @@ -92,8 +92,10 @@ int main(int argc, char *argv[]) std::cout << "Bitonic sort on GPU ... " << std::endl; start< BitonicSort >( cout, "\t" ); #ifdef HAVE_CUDA +#ifdef HAVE_CUDA_SAMPLES std::cout << "Manca quicksort on GPU ... " << std::endl; start< MancaQuicksort >( cout, "\t" ); +#endif std::cout << "Cederman quicksort on GPU ... " << std::endl; start< CedermanQuicksort >( cout, "\t" ); #endif @@ -108,8 +110,10 @@ int main(int argc, char *argv[]) std::cout << "Bitonic sort on GPU ... " << std::endl; start< BitonicSort >(out, ","); #ifdef HAVE_CUDA +#ifdef HAVE_CUDA_SAMPLES std::cout << "Manca quicksort on GPU ... " << std::endl; start< MancaQuicksort >( out, "," ); +#endif std::cout << "Cederman quicksort on GPU ... 
" << std::endl; start< CedermanQuicksort >( out, "," ); #endif diff --git a/src/TNL/Algorithms/Sorting/detail/Quicksorter.hpp b/src/TNL/Algorithms/Sorting/detail/Quicksorter.hpp index 7c6a33ef9..f20ef00ec 100644 --- a/src/TNL/Algorithms/Sorting/detail/Quicksorter.hpp +++ b/src/TNL/Algorithms/Sorting/detail/Quicksorter.hpp @@ -69,7 +69,7 @@ sort( Array& arr, const Compare& cmp ) } } - TNL_ASSERT_LE( blockDim * multiplier * sizeof(Value), maxSharable,"" ); + TNL_ASSERT_LE( ( int ) ( blockDim * multiplier * sizeof(Value) ), maxSharable,"" ); this->init(arr, maxBlocks, blockDim, multiplier * blockDim, maxSharable); this->performSort( cmp ); -- GitLab From 70e4706825af986beb437835964846dde5bb67a9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Mon, 19 Jul 2021 10:09:08 +0200 Subject: [PATCH 247/258] Removing Fetch lambda function from inplace bitonic sort. --- .../Algorithms/Sorting/detail/bitonicSort.h | 18 +++++++++--------- .../Algorithms/Sorting/BitonicSortTest.h | 14 +++++++------- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/src/TNL/Algorithms/Sorting/detail/bitonicSort.h b/src/TNL/Algorithms/Sorting/detail/bitonicSort.h index c88dac726..001909120 100644 --- a/src/TNL/Algorithms/Sorting/detail/bitonicSort.h +++ b/src/TNL/Algorithms/Sorting/detail/bitonicSort.h @@ -315,8 +315,8 @@ void bitonicSort( TNL::Containers::Array< Value, TNL::Devices::Host > &vec) //--------------------------------------------- #ifdef HAVE_CUDA -template -__global__ void bitonicMergeGlobal(int size, FETCH Fetch, CMP Cmp, SWAP Swap, +template< typename CMP, typename SWAP> +__global__ void bitonicMergeGlobal(int size, CMP Cmp, SWAP Swap, int monotonicSeqLen, int bitonicLen) { int i = blockIdx.x * blockDim.x + threadIdx.x; @@ -336,12 +336,12 @@ __global__ void bitonicMergeGlobal(int size, FETCH Fetch, CMP Cmp, SWAP Swap, if ((monotonicSeqIdx + 1) * monotonicSeqLen >= size) //special case for part with no "partner" to be merged with in next phase 
ascending = true; - if (ascending == Cmp(Fetch(e), Fetch(s))) + if( ascending == Cmp(e, s) ) Swap(s, e); } -template -void bitonicSort(int begin, int end, FETCH Fetch, const CMP &Cmp, SWAP Swap) +template< typename CMP, typename SWAP > +void bitonicSort(int begin, int end, const CMP &Cmp, SWAP Swap) { int size = end - begin; int paddedSize = closestPow2(size); @@ -352,9 +352,9 @@ void bitonicSort(int begin, int end, FETCH Fetch, const CMP &Cmp, SWAP Swap) int threadsPerBlock = maxThreadsPerBlock; int blocks = threadsNeeded / threadsPerBlock + (threadsNeeded % threadsPerBlock != 0); - auto fetchWithOffset = - [=] __cuda_callable__(int i) { - return Fetch(i + begin); + auto compareWithOffset = + [=] __cuda_callable__(int i, int j) { + return Cmp(i + begin, j + begin); }; auto swapWithOffset = @@ -367,7 +367,7 @@ void bitonicSort(int begin, int end, FETCH Fetch, const CMP &Cmp, SWAP Swap) for (int bitonicLen = monotonicSeqLen; bitonicLen > 1; bitonicLen /= 2) { bitonicMergeGlobal<<>>( - size, fetchWithOffset, Cmp, swapWithOffset, + size, compareWithOffset, swapWithOffset, monotonicSeqLen, bitonicLen); } } diff --git a/src/UnitTests/Algorithms/Sorting/BitonicSortTest.h b/src/UnitTests/Algorithms/Sorting/BitonicSortTest.h index 9c8116646..95925c472 100644 --- a/src/UnitTests/Algorithms/Sorting/BitonicSortTest.h +++ b/src/UnitTests/Algorithms/Sorting/BitonicSortTest.h @@ -314,10 +314,10 @@ TEST(sortRange, middleMultiBlock) template void fetchAndSwapSorter(TNL::Containers::ArrayView view) { - auto Fetch = [=]__cuda_callable__(int i){return view[i];}; - auto Cmp = [=]__cuda_callable__(const TYPE & a, const TYPE & b){return a < b;}; + //auto Fetch = [=]__cuda_callable__(int i){return view[i];}; + auto Cmp = [=]__cuda_callable__(const int i, const int j ){return view[ i ] < view[ j ];}; auto Swap = [=] __cuda_callable__ (int i, int j) mutable {TNL::swap(view[i], view[j]);}; - bitonicSort(0, view.getSize(), Fetch, Cmp, Swap); + bitonicSort(0, view.getSize(), Cmp, Swap); } 
TEST(fetchAndSwap, oneBlockSort) @@ -360,10 +360,10 @@ TEST(fetchAndSwap, typeDouble) void fetchAndSwap_sortMiddle(TNL::Containers::ArrayView view, int from, int to) { - auto Fetch = [=]__cuda_callable__(int i){return view[i];}; - auto Cmp = [=]__cuda_callable__(const int & a, const int & b){return a < b;}; - auto Swap = [=] __cuda_callable__ (int i, int j) mutable {TNL::swap(view[i], view[j]);}; - bitonicSort(from, to, Fetch, Cmp, Swap); + //auto Fetch = [=]__cuda_callable__(int i){return view[i];}; + auto Cmp = [=]__cuda_callable__(const int i, const int j ){ return view[ i ] < view[ j ]; }; + auto Swap = [=] __cuda_callable__ (int i, int j) mutable { TNL::swap(view[i], view[j]); }; + bitonicSort(from, to, Cmp, Swap); } TEST(fetchAndSwap, sortMiddle) -- GitLab From 9a2f8f66185edad52b95e692f068d879988b6546 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Mon, 19 Jul 2021 10:47:02 +0200 Subject: [PATCH 248/258] Changing path to CUDA samples. --- CMakeLists.txt | 2 +- .../Sorting/ReferenceAlgorithms/MancaQuicksort.h | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 0159f9f82..ecc556346 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -255,7 +255,7 @@ if( ${WITH_CUDA} ) set( CMAKE_EXECUTABLE_SUFFIX "${executable_suffix_backup}" ) endif() if( NOT CUDA_SAMPLES_DIR STREQUAL "none" ) - set( CUDA_SAMPLES_FLAGS "-I${CUDA_SAMPLES_DIR}/common/inc -DHAVE_CUDA_SAMPLES") + set( CUDA_SAMPLES_FLAGS "-I${CUDA_SAMPLES_DIR} -DHAVE_CUDA_SAMPLES") endif() endif() diff --git a/src/Benchmarks/Sorting/ReferenceAlgorithms/MancaQuicksort.h b/src/Benchmarks/Sorting/ReferenceAlgorithms/MancaQuicksort.h index 64b1d0c7a..51f49250b 100644 --- a/src/Benchmarks/Sorting/ReferenceAlgorithms/MancaQuicksort.h +++ b/src/Benchmarks/Sorting/ReferenceAlgorithms/MancaQuicksort.h @@ -217,8 +217,8 @@ inline __device__ void compareInclusive(Type &idata, Type &idata2, volatile Type } #include -#include -#include 
<../../6_Advanced/scan/scan_common.h> +#include +#include <6_Advanced/scan/scan_common.h> //All three kernels run 512 threads per workgroup //Must be a power of two @@ -654,9 +654,9 @@ size_t scanInclusiveLarge( #include -#include -#include -#include <../../6_Advanced/scan/scan_common.h> +#include +#include +#include <6_Advanced/scan/scan_common.h> extern __shared__ uint sMemory[]; -- GitLab From 198c0c31c8695f90309db9c1f51b0bed4c5c951c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Mon, 19 Jul 2021 10:49:09 +0200 Subject: [PATCH 249/258] Added reference algorithms - Nvidia bitonic sort and thrust radix sort. --- src/Benchmarks/Sorting/Measurer.h | 2 + .../ReferenceAlgorithms/NvidiaBitonicSort.h | 16 +++++ .../ReferenceAlgorithms/ThrustRadixsort.h | 13 ++++ src/Benchmarks/Sorting/tnl-benchmark-sort.h | 61 +++++++++++-------- 4 files changed, 66 insertions(+), 26 deletions(-) create mode 100644 src/Benchmarks/Sorting/ReferenceAlgorithms/NvidiaBitonicSort.h create mode 100644 src/Benchmarks/Sorting/ReferenceAlgorithms/ThrustRadixsort.h diff --git a/src/Benchmarks/Sorting/Measurer.h b/src/Benchmarks/Sorting/Measurer.h index 596f63d88..7abf16604 100644 --- a/src/Benchmarks/Sorting/Measurer.h +++ b/src/Benchmarks/Sorting/Measurer.h @@ -12,6 +12,8 @@ #include "ReferenceAlgorithms/MancaQuicksort.h" #endif #include "ReferenceAlgorithms/CedermanQuicksort.h" +#include "ReferenceAlgorithms/ThrustRadixsort.h" +#include "ReferenceAlgorithms/NvidiaBitonicSort.h" #endif #include "timer.h" diff --git a/src/Benchmarks/Sorting/ReferenceAlgorithms/NvidiaBitonicSort.h b/src/Benchmarks/Sorting/ReferenceAlgorithms/NvidiaBitonicSort.h new file mode 100644 index 000000000..08a70731b --- /dev/null +++ b/src/Benchmarks/Sorting/ReferenceAlgorithms/NvidiaBitonicSort.h @@ -0,0 +1,16 @@ +#include <6_Advanced/sortingNetworks/bitonicSort.cu> +#include + + +struct NvidiaBitonicSort +{ + static void sort( Containers::ArrayView< int, Devices::Cuda >& view ) + { + Array 
arr; + arr = view; + bitonicSort((unsigned *)view.getData(), (unsigned *)arr.getData(), + (unsigned *)view.getData(), (unsigned *)arr.getData(), + 1, arr.getSize(), 1); + cudaDeviceSynchronize(); + } +}; diff --git a/src/Benchmarks/Sorting/ReferenceAlgorithms/ThrustRadixsort.h b/src/Benchmarks/Sorting/ReferenceAlgorithms/ThrustRadixsort.h new file mode 100644 index 000000000..b2aafc8ad --- /dev/null +++ b/src/Benchmarks/Sorting/ReferenceAlgorithms/ThrustRadixsort.h @@ -0,0 +1,13 @@ +#include +#include +#include + + +struct ThrustRadixsort +{ + static void sort( Containers::ArrayView< int, Devices::Cuda >& view ) + { + thrust::sort(thrust::device, view.getData(), view.getData() + view.getSize()); + cudaDeviceSynchronize(); + } +}; diff --git a/src/Benchmarks/Sorting/tnl-benchmark-sort.h b/src/Benchmarks/Sorting/tnl-benchmark-sort.h index 60fad1cd9..a58669912 100644 --- a/src/Benchmarks/Sorting/tnl-benchmark-sort.h +++ b/src/Benchmarks/Sorting/tnl-benchmark-sort.h @@ -83,40 +83,49 @@ void start(ostream & out, string delim) int main(int argc, char *argv[]) { - if(argc == 1) - { - std::cout << "STL sort on CPU ... " << std::endl; - start< STLSort >( cout, "\t" ); - std::cout << "Quicksort on GPU ... " << std::endl; - start< Quicksort >(cout, "\t"); - std::cout << "Bitonic sort on GPU ... " << std::endl; - start< BitonicSort >( cout, "\t" ); + if(argc == 1) + { #ifdef HAVE_CUDA + std::cout << "Quicksort on GPU ... " << std::endl; + start< Quicksort >(cout, "\t"); + std::cout << "Bitonic sort on GPU ... " << std::endl; + start< BitonicSort >( cout, "\t" ); #ifdef HAVE_CUDA_SAMPLES - std::cout << "Manca quicksort on GPU ... " << std::endl; - start< MancaQuicksort >( cout, "\t" ); + std::cout << "Manca quicksort on GPU ... " << std::endl; + start< MancaQuicksort >( cout, "\t" ); + std::cout << "Nvidia bitonic sort on GPU ... " << std::endl; + start< NvidiaBitonicSort >( cout, "\t" ); #endif - std::cout << "Cederman quicksort on GPU ... 
" << std::endl; - start< CedermanQuicksort >( cout, "\t" ); + std::cout << "Cederman quicksort on GPU ... " << std::endl; + start< CedermanQuicksort >( cout, "\t" ); + std::cout << "Thrust radixsort on GPU ... " << std::endl; + start< ThrustRadixsort >( cout, "\t" ); #endif - } - else - { - std::ofstream out(argv[1]); - std::cout << "STL sort on CPU ... " << std::endl; - start< STLSort >( out, "," ); - std::cout << "Quicksort on GPU ... " << std::endl; - start< Quicksort >(out, ","); - std::cout << "Bitonic sort on GPU ... " << std::endl; - start< BitonicSort >(out, ","); + std::cout << "STL sort on CPU ... " << std::endl; + start< STLSort >( cout, "\t" ); + } + else + { + std::ofstream out(argv[1]); #ifdef HAVE_CUDA + std::cout << "Quicksort on GPU ... " << std::endl; + start< Quicksort >(out, ","); + std::cout << "Bitonic sort on GPU ... " << std::endl; + start< BitonicSort >(out, ","); + #ifdef HAVE_CUDA_SAMPLES - std::cout << "Manca quicksort on GPU ... " << std::endl; - start< MancaQuicksort >( out, "," ); + std::cout << "Manca quicksort on GPU ... " << std::endl; + start< MancaQuicksort >( out, "," ); + std::cout << "Nvidia bitonic sort on GPU ... " << std::endl; + start< NvidiaBitonicSort >( out, "," ); #endif - std::cout << "Cederman quicksort on GPU ... " << std::endl; - start< CedermanQuicksort >( out, "," ); + std::cout << "Cederman quicksort on GPU ... " << std::endl; + start< CedermanQuicksort >( out, "," ); + std::cout << "Thrust radixsort on GPU ... " << std::endl; + start< ThrustRadixsort >( out, "," ); #endif + std::cout << "STL sort on CPU ... " << std::endl; + start< STLSort >( out, "," ); } return 0; } -- GitLab From be107229e4eb9764f785dbda7afab3b856380426 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Mon, 19 Jul 2021 12:04:46 +0200 Subject: [PATCH 250/258] Removing folder with original implementation of quicksort and bitonic sort. 
--- GPUSort/.gitignore | 6 - GPUSort/GPUSort/README.MD | 22 -- .../benchmark/bitonic_benchmark/Makefile | 27 -- .../benchmark/quicksort_benchmark/Makefile | 32 --- .../quicksort_dynamic_benchmark/Makefile | 35 --- .../quicksort_dynamic_benchmark/benchmark.cu | 10 - .../GPUSort/src/bitonicSort/sample/Makefile | 24 -- .../GPUSort/src/bitonicSort/sample/main.cu | 40 --- GPUSort/GPUSort/src/quicksort/sample/Makefile | 24 -- GPUSort/GPUSort/src/quicksort/sample/main.cu | 22 -- .../GPUSort/src/quicksort_dynamic/helper.cuh | 86 ------ .../src/quicksort_dynamic/quicksort.cu | 264 ------------------ .../src/quicksort_dynamic/quicksort.cuh | 16 -- .../src/quicksort_dynamic/sample/Makefile | 33 --- .../src/quicksort_dynamic/sample/main.cu | 23 -- GPUSort/GPUSort/src/quicksort_dynamic/task.h | 13 - GPUSort/GPUSort/src/util/config.mk | 53 ---- GPUSort/GPUSort/tests/bitonic_tests/Makefile | 26 -- .../tests/quicksort_dynamic_tests/Makefile | 35 --- .../tests/quicksort_dynamic_tests/README.md | 1 - .../quicksort_dynamic_tests/unitTests.cu | 77 ----- .../tests/quicksort_unitTests/Makefile | 26 -- GPUSort/README.md | 17 -- GPUSort/measuring/README.md | 9 - .../TNL_implementation/bitonic/Makefile | 27 -- .../TNL_implementation/bitonic/main.cu | 1 - .../bitonic/sameDir/.Makefile | 24 -- .../bitonic/sameDir/main.cu | 118 -------- .../TNL_implementation/cdpquicksort/.Makefile | 32 --- .../cdpquicksort/benchmark.cu | 4 - .../TNL_implementation/quicksort/Makefile | 27 -- .../TNL_implementation/quicksort/benchmark.cu | 1 - GPUSort/measuring/cederman_quicksort/Makefile | 23 -- GPUSort/measuring/cederman_quicksort/main.cu | 12 - .../measuring/cuda_example/bitonic/Makefile | 27 -- .../measuring/cuda_example/bitonic/main.cu | 37 --- .../cdpAdvancedQuicksort/Makefile | 32 --- .../cdpAdvancedQuicksort/benchmark.cu | 13 - .../cuda_example/cdpsimplequicksort/.Makefile | 32 --- .../cdpsimplequicksort/benchmark.cu | 15 - GPUSort/measuring/davors/quicksort/Makefile | 27 -- 
.../measuring/davors/quicksort/benchmark.cu | 20 -- GPUSort/measuring/manca_quicksort/Makefile | 27 -- GPUSort/measuring/manca_quicksort/main.cu | 15 - GPUSort/measuring/nickjillings/Makefile | 27 -- GPUSort/measuring/nickjillings/main.cu | 12 - GPUSort/measuring/script.sh | 15 - GPUSort/measuring/std_sort/Makefile | 12 - GPUSort/measuring/std_sort/main.cpp | 12 - GPUSort/measuring/thrust/Makefile | 27 -- GPUSort/measuring/thrust/main.cu | 12 - GPUSort/measuring/util/config.mk | 49 ---- 52 files changed, 1601 deletions(-) delete mode 100644 GPUSort/.gitignore delete mode 100644 GPUSort/GPUSort/README.MD delete mode 100644 GPUSort/GPUSort/benchmark/bitonic_benchmark/Makefile delete mode 100644 GPUSort/GPUSort/benchmark/quicksort_benchmark/Makefile delete mode 100644 GPUSort/GPUSort/benchmark/quicksort_dynamic_benchmark/Makefile delete mode 100644 GPUSort/GPUSort/benchmark/quicksort_dynamic_benchmark/benchmark.cu delete mode 100644 GPUSort/GPUSort/src/bitonicSort/sample/Makefile delete mode 100644 GPUSort/GPUSort/src/bitonicSort/sample/main.cu delete mode 100644 GPUSort/GPUSort/src/quicksort/sample/Makefile delete mode 100644 GPUSort/GPUSort/src/quicksort/sample/main.cu delete mode 100644 GPUSort/GPUSort/src/quicksort_dynamic/helper.cuh delete mode 100644 GPUSort/GPUSort/src/quicksort_dynamic/quicksort.cu delete mode 100644 GPUSort/GPUSort/src/quicksort_dynamic/quicksort.cuh delete mode 100644 GPUSort/GPUSort/src/quicksort_dynamic/sample/Makefile delete mode 100644 GPUSort/GPUSort/src/quicksort_dynamic/sample/main.cu delete mode 100644 GPUSort/GPUSort/src/quicksort_dynamic/task.h delete mode 100644 GPUSort/GPUSort/src/util/config.mk delete mode 100644 GPUSort/GPUSort/tests/bitonic_tests/Makefile delete mode 100644 GPUSort/GPUSort/tests/quicksort_dynamic_tests/Makefile delete mode 100644 GPUSort/GPUSort/tests/quicksort_dynamic_tests/README.md delete mode 100644 GPUSort/GPUSort/tests/quicksort_dynamic_tests/unitTests.cu delete mode 100644 
GPUSort/GPUSort/tests/quicksort_unitTests/Makefile delete mode 100644 GPUSort/README.md delete mode 100644 GPUSort/measuring/README.md delete mode 100644 GPUSort/measuring/TNL_implementation/bitonic/Makefile delete mode 100644 GPUSort/measuring/TNL_implementation/bitonic/main.cu delete mode 100644 GPUSort/measuring/TNL_implementation/bitonic/sameDir/.Makefile delete mode 100644 GPUSort/measuring/TNL_implementation/bitonic/sameDir/main.cu delete mode 100644 GPUSort/measuring/TNL_implementation/cdpquicksort/.Makefile delete mode 100644 GPUSort/measuring/TNL_implementation/cdpquicksort/benchmark.cu delete mode 100644 GPUSort/measuring/TNL_implementation/quicksort/Makefile delete mode 100644 GPUSort/measuring/TNL_implementation/quicksort/benchmark.cu delete mode 100644 GPUSort/measuring/cederman_quicksort/Makefile delete mode 100644 GPUSort/measuring/cederman_quicksort/main.cu delete mode 100644 GPUSort/measuring/cuda_example/bitonic/Makefile delete mode 100644 GPUSort/measuring/cuda_example/bitonic/main.cu delete mode 100644 GPUSort/measuring/cuda_example/cdpAdvancedQuicksort/Makefile delete mode 100644 GPUSort/measuring/cuda_example/cdpAdvancedQuicksort/benchmark.cu delete mode 100644 GPUSort/measuring/cuda_example/cdpsimplequicksort/.Makefile delete mode 100644 GPUSort/measuring/cuda_example/cdpsimplequicksort/benchmark.cu delete mode 100644 GPUSort/measuring/davors/quicksort/Makefile delete mode 100644 GPUSort/measuring/davors/quicksort/benchmark.cu delete mode 100644 GPUSort/measuring/manca_quicksort/Makefile delete mode 100644 GPUSort/measuring/manca_quicksort/main.cu delete mode 100644 GPUSort/measuring/nickjillings/Makefile delete mode 100644 GPUSort/measuring/nickjillings/main.cu delete mode 100644 GPUSort/measuring/script.sh delete mode 100644 GPUSort/measuring/std_sort/Makefile delete mode 100644 GPUSort/measuring/std_sort/main.cpp delete mode 100644 GPUSort/measuring/thrust/Makefile delete mode 100644 GPUSort/measuring/thrust/main.cu delete mode 100644 
GPUSort/measuring/util/config.mk diff --git a/GPUSort/.gitignore b/GPUSort/.gitignore deleted file mode 100644 index cb874b4e4..000000000 --- a/GPUSort/.gitignore +++ /dev/null @@ -1,6 +0,0 @@ -.vscode -backup -*.csv -*.o -*.cuo -*.ipynb \ No newline at end of file diff --git a/GPUSort/GPUSort/README.MD b/GPUSort/GPUSort/README.MD deleted file mode 100644 index f41427f52..000000000 --- a/GPUSort/GPUSort/README.MD +++ /dev/null @@ -1,22 +0,0 @@ -## Code implemented by Nguyen Xuan Thang for bachelor thesis - -Needs to have CUDA and TNL installed. - -* benchmark - * folder containing benchmarking scripts - * the main function is in ``benchmarker.cpp`` - * for each implemented algorithm, there is a folder with a benchmarker and a Makefile, to test out the algorithm, run ``make run``, to clean up ``make clean`` -* src - * folder containing the implementation of Bitonic sort, Quick sort and CDP Quick sort - * inside each folder there is a ``sample`` folder - * to test out the algorithm, simply run ``make run`` -* tests - * folder containing unit tests for each algorithm - * inside each folder there is a tester and a Makefile - * to test out the implementation, run ``make run`` - * needs gTests installed - - - -* To install TNL, read https://mmg-gitlab.fjfi.cvut.cz/doc/tnl/#installation -* To install CUDA, https://developer.nvidia.com/cuda-downloads diff --git a/GPUSort/GPUSort/benchmark/bitonic_benchmark/Makefile b/GPUSort/GPUSort/benchmark/bitonic_benchmark/Makefile deleted file mode 100644 index 8e4060e61..000000000 --- a/GPUSort/GPUSort/benchmark/bitonic_benchmark/Makefile +++ /dev/null @@ -1,27 +0,0 @@ -include ../../src/util/config.mk - -CUDA_SOURCES := $(wildcard *.cu) -CUDA_TARGETS := $(CUDA_SOURCES:%.cu=%) - -## targets definitions follow -.PHONY: all host cuda -all: cuda -cuda: $(CUDA_TARGETS) - -run: cuda - ./$(CUDA_TARGETS) - -measure: cuda - ./$(CUDA_TARGETS) ../bitonic.csv - -.PHONY: clean -clean: - rm -f *.d *.o *.cuo $(CUDA_TARGETS) - -# use .cuo instead 
of .cu.o to avoid problems with the implicit rules: https://stackoverflow.com/q/62967939 -# (and use the host compiler for linking CUDA, nvcc does not understand that .cuo is an object file) -$(CUDA_TARGETS): % : %.cuo - $(CXX) $(CUDA_LDFLAGS) -o $@ $< $(CUDA_LDLIBS) - -$(CUDA_SOURCES:%.cu=%.cuo): %.cuo : %.cu - $(CUDA_CXX) $(CUDA_CPPFLAGS) $(CUDA_CXXFLAGS) -c -o $@ $< diff --git a/GPUSort/GPUSort/benchmark/quicksort_benchmark/Makefile b/GPUSort/GPUSort/benchmark/quicksort_benchmark/Makefile deleted file mode 100644 index 14d2022df..000000000 --- a/GPUSort/GPUSort/benchmark/quicksort_benchmark/Makefile +++ /dev/null @@ -1,32 +0,0 @@ -include ../../src/util/config.mk - -CUDA_SOURCES := $(wildcard *.cu) -CUDA_TARGETS := $(CUDA_SOURCES:%.cu=%) - -## targets definitions follow -.PHONY: all host cuda -all: cuda -cuda: $(CUDA_TARGETS) - -run: cuda - ./$(CUDA_TARGETS) - -measure: cuda - ./$(CUDA_TARGETS) ../quicksort.csv - -.PHONY: clean -clean: - rm -f *.d *.o *.cuo $(CUDA_TARGETS) - -# use .cuo instead of .cu.o to avoid problems with the implicit rules: https://stackoverflow.com/q/62967939 -# (and use the host compiler for linking CUDA, nvcc does not understand that .cuo is an object file) -$(CUDA_TARGETS): % : %.cuo - $(CXX) $(CUDA_LDFLAGS) -o $@ $< $(CUDA_LDLIBS) - -$(CUDA_SOURCES:%.cu=%.cuo): %.cuo : %.cu - $(CUDA_CXX) $(CUDA_CXXFLAGS) -c -o $@ $< - -debug: - $(CUDA_CXX) -DCHECK_RESULT_SORT $(CUDA_CXXFLAGS) -c -o benchmark.cuo benchmark.cu - $(CXX) $(CUDA_LDFLAGS) -o benchmark benchmark.cuo $(CUDA_LDLIBS) - ./benchmark \ No newline at end of file diff --git a/GPUSort/GPUSort/benchmark/quicksort_dynamic_benchmark/Makefile b/GPUSort/GPUSort/benchmark/quicksort_dynamic_benchmark/Makefile deleted file mode 100644 index 2578516b3..000000000 --- a/GPUSort/GPUSort/benchmark/quicksort_dynamic_benchmark/Makefile +++ /dev/null @@ -1,35 +0,0 @@ -include ../../src/util/config.mk - -TARGET := benchmark -EXTRA_ARCH := -gencode arch=compute_52,code=sm_52 -DEVICE_CODE := -dc - 
-CUDA_LDLIBS += -lcudadevrt - -SRC_FOLDER := ../../src/quicksort_dynamic - -## targets definitions follow -.PHONY: cuda -all: cuda - -cuda: $(TARGET) - -.PHONY: cuda -run: cuda - ./$(TARGET) - -.PHONY: clean -clean: - rm -f *.d *.o *.cuo $(TARGET) - -$(TARGET): quicksort.o quicksort_link.o $(TARGET).o - $(CXX) $(TNL_INCLUDE_DIRS) $(CUDA_LDFLAGS) -o $@ $^ $(CUDA_LDLIBS) - -$(TARGET).o: $(TARGET).cu ../benchmarker.cpp - $(CUDA_CXX) $(CUDA_CXXFLAGS) -c -o $@ $< - -quicksort.o: $(SRC_FOLDER)/quicksort.cu - $(CUDA_CXX) $(CUDA_CXXFLAGS) $(EXTRA_ARCH) $(DEVICE_CODE) -c -o $@ $< - -quicksort_link.o: quicksort.o - $(CUDA_CXX) $(CUDA_LDFLAGS) -dlink -o $@ $< $(CUDA_LDLIBS) diff --git a/GPUSort/GPUSort/benchmark/quicksort_dynamic_benchmark/benchmark.cu b/GPUSort/GPUSort/benchmark/quicksort_dynamic_benchmark/benchmark.cu deleted file mode 100644 index 5bbf2d5af..000000000 --- a/GPUSort/GPUSort/benchmark/quicksort_dynamic_benchmark/benchmark.cu +++ /dev/null @@ -1,10 +0,0 @@ -#include "../../src/quicksort_dynamic/quicksort.cuh" - -#include "../benchmarker.cpp" -#include "../measure.cu" - -template -void sorter(ArrayView arr) -{ - quicksort(arr); -} diff --git a/GPUSort/GPUSort/src/bitonicSort/sample/Makefile b/GPUSort/GPUSort/src/bitonicSort/sample/Makefile deleted file mode 100644 index 23593937b..000000000 --- a/GPUSort/GPUSort/src/bitonicSort/sample/Makefile +++ /dev/null @@ -1,24 +0,0 @@ -include ../../util/config.mk - -CUDA_SOURCES := $(wildcard *.cu) -CUDA_TARGETS := $(CUDA_SOURCES:%.cu=%) - -## targets definitions follow -.PHONY: all host cuda -all: cuda -cuda: $(CUDA_TARGETS) - -run: cuda - ./$(CUDA_TARGETS) - -.PHONY: clean -clean: - rm -f *.d *.o *.cuo $(CUDA_TARGETS) - -# use .cuo instead of .cu.o to avoid problems with the implicit rules: https://stackoverflow.com/q/62967939 -# (and use the host compiler for linking CUDA, nvcc does not understand that .cuo is an object file) -$(CUDA_TARGETS): % : %.cuo - $(CXX) $(CUDA_LDFLAGS) -o $@ $< $(CUDA_LDLIBS) - 
-$(CUDA_SOURCES:%.cu=%.cuo): %.cuo : %.cu - $(CUDA_CXX) $(CUDA_CPPFLAGS) $(CUDA_CXXFLAGS) -c -o $@ $< diff --git a/GPUSort/GPUSort/src/bitonicSort/sample/main.cu b/GPUSort/GPUSort/src/bitonicSort/sample/main.cu deleted file mode 100644 index 21ad72b28..000000000 --- a/GPUSort/GPUSort/src/bitonicSort/sample/main.cu +++ /dev/null @@ -1,40 +0,0 @@ -#include -#include - -#include "../bitonicSort.h" -//-------------------------------------------------- -std::ostream& operator<< (std::ostream&out, std::vector &arr) -{ - for (auto x : arr) - std::cout << x << " "; - return std::cout << std::endl; -} - -#define deb(x) std::cout << #x << " = " << x << std::endl; -//-------------------------------------------------- - -int main( int argc, char* argv[] ) -{ - if(argc <= 1) - { - std::cout << "missing argument: N=array size to be tested on" << std::endl; - return 1; - } - - std::vector a(std::atoi(argv[1])); - for(int i = 0; i < a.size(); i++) - a[i] = std::rand() % a.size(); - - - TNL::Containers::Array Arr(a); - - auto view = Arr.getView(); - - - std::cout << "unsorted: " << view << std::endl; - bitonicSort(a); - - std::cout << "sorted: " << view << std::endl; - - return 0; -} \ No newline at end of file diff --git a/GPUSort/GPUSort/src/quicksort/sample/Makefile b/GPUSort/GPUSort/src/quicksort/sample/Makefile deleted file mode 100644 index 474eb7141..000000000 --- a/GPUSort/GPUSort/src/quicksort/sample/Makefile +++ /dev/null @@ -1,24 +0,0 @@ -include ../../util/config.mk - -CUDA_SOURCES := $(wildcard *.cu) -CUDA_TARGETS := $(CUDA_SOURCES:%.cu=%) - -## targets definitions follow -.PHONY: all host cuda -all: cuda -cuda: $(CUDA_TARGETS) - -run: cuda - ./$(CUDA_TARGETS) - -.PHONY: clean -clean: - rm -f *.d *.o *.cuo $(CUDA_TARGETS) - -# use .cuo instead of .cu.o to avoid problems with the implicit rules: https://stackoverflow.com/q/62967939 -# (and use the host compiler for linking CUDA, nvcc does not understand that .cuo is an object file) -$(CUDA_TARGETS): % : %.o - 
$(CUDA_CXX) $(CUDA_LDFLAGS) -o $@ $< $(CUDA_LDLIBS) -lcudadevrt - -$(CUDA_SOURCES:%.cu=%.o): %.o : %.cu - $(CUDA_CXX) $(CUDA_CPPFLAGS) $(CUDA_CXXFLAGS) -gencode arch=compute_52,code=sm_52 -dc -c -o $@ $< \ No newline at end of file diff --git a/GPUSort/GPUSort/src/quicksort/sample/main.cu b/GPUSort/GPUSort/src/quicksort/sample/main.cu deleted file mode 100644 index 38f1f2404..000000000 --- a/GPUSort/GPUSort/src/quicksort/sample/main.cu +++ /dev/null @@ -1,22 +0,0 @@ -#include -#include "../quicksort.cuh" - -#include -#include -#include -using namespace std; - -int main() -{ - vector vec(19); - iota(vec.begin(), vec.end(), 0); - random_shuffle(vec.begin(), vec.end()); - - TNL::Containers::Array arr(vec); - auto view = arr.getView(); - cout << view << endl; - quicksort(view); - cout << view << endl; - - return 0; -} \ No newline at end of file diff --git a/GPUSort/GPUSort/src/quicksort_dynamic/helper.cuh b/GPUSort/GPUSort/src/quicksort_dynamic/helper.cuh deleted file mode 100644 index 41cd87f4a..000000000 --- a/GPUSort/GPUSort/src/quicksort_dynamic/helper.cuh +++ /dev/null @@ -1,86 +0,0 @@ -#pragma once - -#include - -template -__device__ void countElem(TNL::Containers::ArrayView src, - int &smaller, int &bigger, - const Value &pivot) -{ - for (int i = threadIdx.x; i < src.getSize(); i += blockDim.x) - { - int data = src[i]; - if (data < pivot) - smaller++; - else if (data > pivot) - bigger++; - } -} - -template -__device__ void copyData(TNL::Containers::ArrayView src, - TNL::Containers::ArrayView dst, - int smallerStart, int biggerStart, - const Value &pivot) - -{ - for (int i = threadIdx.x; i < src.getSize(); i += blockDim.x) - { - int data = src[i]; - if (data < pivot) - dst[smallerStart++] = data; - else if (data > pivot) - dst[biggerStart++] = data; - } -} - -__device__ void calcBlocksNeeded(int totalBlocks, int elemLeft, int elemRight, int &blocksLeft, int &blocksRight) -{ - int minElemPerBlock = blockDim.x*2; - blocksLeft = elemLeft / minElemPerBlock + 
(elemLeft% minElemPerBlock != 0); - blocksRight = elemRight / minElemPerBlock + (elemRight% minElemPerBlock != 0); - - - int totalSets = blocksLeft + blocksRight; - if(totalSets<= totalBlocks) - return; - - int multiplier = 1.*totalBlocks/ totalSets + 1; - minElemPerBlock *= multiplier; - - blocksLeft = elemLeft / minElemPerBlock + (elemLeft% minElemPerBlock != 0); - blocksRight = elemRight / minElemPerBlock + (elemRight% minElemPerBlock != 0); - -} - -template -__device__ Value pickPivot(TNL::Containers::ArrayView src, const Function & Cmp) -{ - //return src[0]; - //return src[src.getSize()-1]; - - if(src.getSize() ==1) - return src[0]; - - Value a = src[0], b = src[src.getSize()/2], c = src[src.getSize() - 1]; - - if(Cmp(a, b)) // ..a..b.. - { - if(Cmp(b, c))// ..a..b..c - return b; - else if(Cmp(c, a))//..c..a..b.. - return a; - else //..a..c..b.. - return c; - } - else //..b..a.. - { - if(Cmp(a, c))//..b..a..c - return a; - else if(Cmp(c, b))//..c..b..a.. - return b; - else //..b..c..a.. 
- return c; - } - -} \ No newline at end of file diff --git a/GPUSort/GPUSort/src/quicksort_dynamic/quicksort.cu b/GPUSort/GPUSort/src/quicksort_dynamic/quicksort.cu deleted file mode 100644 index ca50dd624..000000000 --- a/GPUSort/GPUSort/src/quicksort_dynamic/quicksort.cu +++ /dev/null @@ -1,264 +0,0 @@ -#include "quicksort.cuh" - -#include -#include "../util/reduction.cuh" -#include "task.h" -#include "../bitonicSort/bitonicSort.h" -#include "helper.cuh" -#include -#include -#include - -#define deb(x) std::cout << #x << " = " << x << std::endl; - -using namespace TNL; -using namespace TNL::Containers; - -template -__global__ void cudaPartition(ArrayView src, ArrayView dst, int pivot, TASK *task, const Function &Cmp) -{ - static __shared__ int smallerStart, biggerStart; - - int elemPerBlock = ceil(((double)src.getSize()) / gridDim.x); - int myBegin = blockIdx.x * elemPerBlock; - int myEnd = TNL::min(src.getSize(), myBegin + elemPerBlock); - - int smaller = 0, bigger = 0; - countElem(src.getView(myBegin, myEnd), smaller, bigger, pivot); - - int smallerInclusiveSum = blockInclusivePrefixSum(smaller); - int biggerInclusiveSum = blockInclusivePrefixSum(bigger); - - if (threadIdx.x == blockDim.x - 1) //last thread in block has sum of all values - { - smallerStart = atomicAdd(&(task->begin), smallerInclusiveSum); - biggerStart = atomicAdd(&(task->end), -biggerInclusiveSum) - biggerInclusiveSum; - } - __syncthreads(); - - int destSmaller = smallerStart + (smallerInclusiveSum - smaller); - int destBigger = biggerStart + (biggerInclusiveSum - bigger); - copyData(src.getView(myBegin, myEnd), dst, destSmaller, destBigger, pivot); -} - -template -__device__ void multiBlockQuickSort(ArrayView arr, ArrayView aux, const Function &Cmp, int depth, int availblocks) -{ - static __shared__ int pivot; - static __shared__ int leftEnd, rightBegin; - - if (threadIdx.x == 0) - { - pivot = pickPivot(depth % 2 == 0 ? 
arr : aux, Cmp); - - TASK *task = (TASK *)malloc(sizeof(TASK)); - *task = TASK(0, arr.getSize()); - - if (depth % 2 == 0) - cudaPartition<<>>(arr, aux, pivot, task, Cmp); - else - cudaPartition<<>>(aux, arr, pivot, task, Cmp); - cudaDeviceSynchronize(); - - leftEnd = task->begin, rightBegin = task->end; - free(task); - } - __syncthreads(); - - for (int i = leftEnd + threadIdx.x; i < rightBegin; i += blockDim.x) - arr[i] = pivot; - - if (threadIdx.x == 0) - { - int blocksLeft = 0, blocksRight = 0; - calcBlocksNeeded(availblocks, leftEnd - 0, arr.getSize() - rightBegin, blocksLeft, blocksRight); - - if(leftEnd > 0) - { - cudaStream_t s; - cudaStreamCreateWithFlags(&s, cudaStreamNonBlocking); - - cudaQuickSort<<<1, blockDim.x, 0, s>>>(arr.getView(0, leftEnd), aux.getView(0, leftEnd), Cmp, blocksLeft, depth + 1); - - cudaStreamDestroy(s); - } - if(arr.getSize() - rightBegin > 0) - { - cudaStream_t s; - cudaStreamCreateWithFlags(&s, cudaStreamNonBlocking); - - cudaQuickSort<<<1, blockDim.x, 0, s>>>(arr.getView(rightBegin, arr.getSize()), aux.getView(rightBegin, aux.getSize()), Cmp, blocksRight, depth + 1); - cudaStreamDestroy(s); - } - } -} - -//------------------------------------------------------------------------- - -template -__device__ void externSort(ArrayView src, ArrayView dst, const Function &Cmp) -{ - static __shared__ int sharedMem[externMemSize]; - bitonicSort_Block(src, dst, sharedMem, Cmp); -} - -template -__device__ void stackPush(int stackArrBegin[], int stackArrEnd[], - int stackDepth[], int &stackTop, - int begin, int pivotBegin, - int pivotEnd, int end, - int depth) -{ - int sizeL = pivotBegin - begin, sizeR = end - pivotEnd; - - //push the bigger one 1st and then smaller one 2nd - //in next iteration, the smaller part will be handled 1st - if (sizeL > sizeR) - { - if (sizeL > 0) //left from pivot are smaller elems - { - stackArrBegin[stackTop] = begin; - stackArrEnd[stackTop] = pivotBegin; - stackDepth[stackTop] = depth + 1; - (stackTop)++; - } - - 
if (sizeR > 0) //right from pivot until end are elem greater than pivot - { - assert(stackTop < stackSize && "Local quicksort stack overflow."); - - stackArrBegin[stackTop] = pivotEnd; - stackArrEnd[stackTop] = end; - stackDepth[stackTop] = depth + 1; - (stackTop)++; - } - } - else - { - if (sizeR > 0) //right from pivot until end are elem greater than pivot - { - stackArrBegin[stackTop] = pivotEnd; - stackArrEnd[stackTop] = end; - stackDepth[stackTop] = depth + 1; - (stackTop)++; - } - - if (sizeL > 0) //left from pivot are smaller elems - { - assert(stackTop < stackSize && "Local quicksort stack overflow."); - - stackArrBegin[stackTop] = begin; - stackArrEnd[stackTop] = pivotBegin; - stackDepth[stackTop] = depth + 1; - (stackTop)++; - } - } -} - -template -__device__ void singleBlockQuickSort(ArrayView arr, ArrayView aux, const Function &Cmp, int _depth) -{ - static __shared__ int stackTop; - static __shared__ int stackArrBegin[stackSize], stackArrEnd[stackSize], stackDepth[stackSize]; - static __shared__ int begin, end, depth, pivotBegin, pivotEnd; - static __shared__ int pivot; - - if (threadIdx.x == 0) - { - stackTop = 0; - stackArrBegin[stackTop] = 0; - stackArrEnd[stackTop] = arr.getSize(); - stackDepth[stackTop] = _depth; - stackTop++; - } - __syncthreads(); - - while (stackTop > 0) - { - if (threadIdx.x == 0) - { - begin = stackArrBegin[stackTop - 1]; - end = stackArrEnd[stackTop - 1]; - depth = stackDepth[stackTop - 1]; - stackTop--; - pivot = pickPivot(depth % 2 == 0 ? arr.getView(begin, end) : aux.getView(begin, end), - Cmp); - } - __syncthreads(); - - int size = end - begin; - auto src = depth % 2 == 0 ? arr.getView(begin, end) : aux.getView(begin, end); - auto dst = depth % 2 == 0 ? 
aux.getView(begin, end) : arr.getView(begin, end); - - if (size <= blockDim.x * 2) - { - externSort(src, arr.getView(begin, end), Cmp); - continue; - } - - int smaller = 0, bigger = 0; - countElem(src, smaller, bigger, pivot); - - int smallerOffset = blockInclusivePrefixSum(smaller); - int biggerOffset = blockInclusivePrefixSum(bigger); - - if (threadIdx.x == blockDim.x - 1) - { - pivotBegin = smallerOffset; - pivotEnd = size - biggerOffset; - } - __syncthreads(); - - int destSmaller = 0 + smallerOffset - smaller; - int destBigger = pivotEnd + (biggerOffset - bigger); - - copyData(src, dst, destSmaller, destBigger, pivot); - __syncthreads(); - - for (int i = pivotBegin + threadIdx.x; i < pivotEnd; i += blockDim.x) - src[i] = dst[i] = pivot; - - if (threadIdx.x == 0) - { - stackPush(stackArrBegin, stackArrEnd, stackDepth, stackTop, - begin, begin + pivotBegin, - begin + pivotEnd, end, - depth); - } - __syncthreads(); - } //ends while loop -} - -//------------------------------------------------------------------------- - -template -__global__ void cudaQuickSort(ArrayView arr, ArrayView aux, - const Function &Cmp, int availBlocks, int depth) -{ - if (availBlocks == 0 || arr.getSize() <= blockDim.x * 2 || depth >= 4) //todo: determine max depth - singleBlockQuickSort(arr, aux, Cmp, depth); - else - multiBlockQuickSort(arr, aux, Cmp, depth, availBlocks); -} - -//----------------------------------------------------------- - -template -void quicksort(ArrayView arr, const Function &Cmp) -{ - TNL::Containers::Array aux(arr.getSize()); - - const int threadsPerBlock = 512, maxBlocks = 1 << 15; //32k - const int minElemPerBlock = threadsPerBlock * 2; - int sets = arr.getSize() / minElemPerBlock + (arr.getSize() % minElemPerBlock != 0); - - int blocks = min(sets, maxBlocks); - cudaDeviceSetLimit(cudaLimitDevRuntimeSyncDepth, 10); - cudaQuickSort<<<1, threadsPerBlock>>>(arr, aux.getView(), Cmp, blocks, 0); - cudaDeviceSynchronize(); -} - -void 
quicksort(TNL::Containers::ArrayView arr) -{ - quicksort(arr, [] __cuda_callable__(int a, int b) { return a < b; }); -} diff --git a/GPUSort/GPUSort/src/quicksort_dynamic/quicksort.cuh b/GPUSort/GPUSort/src/quicksort_dynamic/quicksort.cuh deleted file mode 100644 index d6f563a4d..000000000 --- a/GPUSort/GPUSort/src/quicksort_dynamic/quicksort.cuh +++ /dev/null @@ -1,16 +0,0 @@ -#pragma once - -#include -#include "task.h" - -using namespace TNL; -using namespace TNL::Containers; - -template -__global__ void cudaQuickSort(ArrayView arr, ArrayView aux, - const Function &Cmp, int availBlocks, int depth); - -template -void quicksort(ArrayView arr, const Function & Cmp); - -void quicksort(ArrayViewarr); \ No newline at end of file diff --git a/GPUSort/GPUSort/src/quicksort_dynamic/sample/Makefile b/GPUSort/GPUSort/src/quicksort_dynamic/sample/Makefile deleted file mode 100644 index 62d89d388..000000000 --- a/GPUSort/GPUSort/src/quicksort_dynamic/sample/Makefile +++ /dev/null @@ -1,33 +0,0 @@ -include ../../util/config.mk - -TARGET := main -EXTRA_ARCH := -gencode arch=compute_52,code=sm_52 -DEVICE_CODE := -dc - -CUDA_LDLIBS += -lcudadevrt - -SRC_FOLDER := .. 
- -## targets definitions follow -.PHONY: all host cuda -all: cuda -cuda: $(TARGET) - -run: cuda - ./$(TARGET) - -.PHONY: clean -clean: - rm -f *.d *.o *.cuo $(TARGET) - -$(TARGET): quicksort.o quicksort_link.o $(TARGET).o - $(CXX) $(TNL_INCLUDE_DIRS) $(CUDA_LDFLAGS) -o $@ $^ $(CUDA_LDLIBS) - -$(TARGET).o: $(TARGET).cu - $(CUDA_CXX) $(CUDA_CXXFLAGS) -c -o $@ $< - -quicksort.o: $(SRC_FOLDER)/quicksort.cu - $(CUDA_CXX) $(CUDA_CXXFLAGS) $(EXTRA_ARCH) $(DEVICE_CODE) -c -o $@ $< - -quicksort_link.o: quicksort.o - $(CUDA_CXX) $(CUDA_LDFLAGS) -dlink -o $@ $< $(CUDA_LDLIBS) diff --git a/GPUSort/GPUSort/src/quicksort_dynamic/sample/main.cu b/GPUSort/GPUSort/src/quicksort_dynamic/sample/main.cu deleted file mode 100644 index 9ba19cb38..000000000 --- a/GPUSort/GPUSort/src/quicksort_dynamic/sample/main.cu +++ /dev/null @@ -1,23 +0,0 @@ -#include -#include "../quicksort.cuh" -#include "../../util/algorithm.h" - -#include -#include -#include -using namespace std; - -int main() -{ - vector vec(19); - iota(vec.begin(), vec.end(), 0); - random_shuffle(vec.begin(), vec.end()); - - TNL::Containers::Array arr(vec); - auto view = arr.getView(); - cout << view << endl; - quicksort(view); - cout << view << endl; - - return 0; -} \ No newline at end of file diff --git a/GPUSort/GPUSort/src/quicksort_dynamic/task.h b/GPUSort/GPUSort/src/quicksort_dynamic/task.h deleted file mode 100644 index b6c897e01..000000000 --- a/GPUSort/GPUSort/src/quicksort_dynamic/task.h +++ /dev/null @@ -1,13 +0,0 @@ -#pragma once - -struct TASK -{ - int begin, end; - - __cuda_callable__ - TASK(int _begin, int _end) - : begin(_begin), end(_end){} - - __cuda_callable__ - TASK(){}; -}; \ No newline at end of file diff --git a/GPUSort/GPUSort/src/util/config.mk b/GPUSort/GPUSort/src/util/config.mk deleted file mode 100644 index e7db43570..000000000 --- a/GPUSort/GPUSort/src/util/config.mk +++ /dev/null @@ -1,53 +0,0 @@ -# configure the include path(s) according to your TNL installation -TNL_INCLUDE_DIRS := -I 
~/.local/include - -WITH_OPENMP := no -WITH_DEBUG := no - -# If TNL is installed on your system, the CUDA architecture can be detected -# automatically by tnl-cuda-arch. This is done if CUDA_ARCH is set to "auto". -# Otherwise, CUDA_ARCH has to be set manually to the desired CUDA architecture -# number, e.g. 60, 61, etc. -CUDA_ARCH := auto - -# compilers -CXX := g++ -CUDA_CXX := nvcc - -# host compiler flags -CXXFLAGS := -std=c++14 $(TNL_INCLUDE_DIRS) -ifeq ($(WITH_DEBUG),yes) - CXXFLAGS += -O0 -g -else - CXXFLAGS += -O3 -DNDEBUG -endif - -# CUDA compiler flags -CUDA_CXXFLAGS := -std=c++14 --expt-relaxed-constexpr --expt-extended-lambda $(TNL_INCLUDE_DIRS) -CUDA_CXXFLAGS += -DHAVE_CUDA -ifeq ($(WITH_DEBUG), no) - CUDA_CXXFLAGS += -O3 -DNDEBUG -endif - -ifeq ($(CUDA_ARCH),auto) - CUDA_CXXFLAGS += $(shell tnl-cuda-arch) -else - CUDA_CXXFLAGS += -gencode arch=compute_$(CUDA_ARCH),code=sm_$(CUDA_ARCH) -endif - -# determine path to the CUDA toolkit installation -# (autodetection is attempted, set it manually if it fails) -CUDA_PATH ?= $(abspath $(dir $(shell command -v nvcc))/..) 
-#$(info Detected CUDA_PATH: $(CUDA_PATH)) - -# flags for linking CUDA with the host compiler -CUDA_LDFLAGS := -L $(CUDA_PATH)/lib64 -CUDA_LDLIBS := -lcudart -ldl -lrt - -# enable OpenMP -ifeq ($(WITH_OPENMP),yes) - CXXFLAGS += -fopenmp -DHAVE_OPENMP - LDLIBS += -lgomp - CUDA_CXXFLAGS += -Xcompiler -fopenmp -DHAVE_OPENMP - CUDA_LDLIBS += -lgomp -endif diff --git a/GPUSort/GPUSort/tests/bitonic_tests/Makefile b/GPUSort/GPUSort/tests/bitonic_tests/Makefile deleted file mode 100644 index a5dbfa2ab..000000000 --- a/GPUSort/GPUSort/tests/bitonic_tests/Makefile +++ /dev/null @@ -1,26 +0,0 @@ -include ../../src/util/config.mk - -CUDA_SOURCES := $(wildcard *.cu) -CUDA_TARGETS := $(CUDA_SOURCES:%.cu=%) - -GTEST := -lgtest -pthread - -## targets definitions follow -.PHONY: all host cuda -all: cuda -cuda: $(CUDA_TARGETS) - -run: cuda - ./$(CUDA_TARGETS) - -.PHONY: clean -clean: - rm -f *.d *.o *.cuo $(CUDA_TARGETS) - -# use .cuo instead of .cu.o to avoid problems with the implicit rules: https://stackoverflow.com/q/62967939 -# (and use the host compiler for linking CUDA, nvcc does not understand that .cuo is an object file) -$(CUDA_TARGETS): % : %.cuo - $(CXX) $(CUDA_LDFLAGS) -o $@ $< $(CUDA_LDLIBS) $(GTEST) - -$(CUDA_SOURCES:%.cu=%.cuo): %.cuo : %.cu - $(CUDA_CXX) $(CUDA_CPPFLAGS) $(CUDA_CXXFLAGS) -c -o $@ $< diff --git a/GPUSort/GPUSort/tests/quicksort_dynamic_tests/Makefile b/GPUSort/GPUSort/tests/quicksort_dynamic_tests/Makefile deleted file mode 100644 index 62fd87b3f..000000000 --- a/GPUSort/GPUSort/tests/quicksort_dynamic_tests/Makefile +++ /dev/null @@ -1,35 +0,0 @@ -include ../../src/util/config.mk - -TARGET := unitTests -GTEST := -lgtest -pthread -EXTRA_ARCH := -gencode arch=compute_52,code=sm_52 -DEVICE_CODE := -dc - -CUDA_LDLIBS += -lcudadevrt - -SRC_FOLDER := ../../src/quicksort_dynamic - -## targets definitions follow -.PHONY: cuda -all: cuda - -cuda: $(TARGET) - -run: cuda - ./$(TARGET) - -.PHONY: clean -clean: - rm -f *.d *.o *.cuo $(TARGET) - -$(TARGET): 
quicksort.o quicksort_link.o $(TARGET).o - $(CXX) $(TNL_INCLUDE_DIRS) $(CUDA_LDFLAGS) -o $@ $^ $(CUDA_LDLIBS) $(GTEST) - -$(TARGET).o: $(TARGET).cu - $(CUDA_CXX) $(CUDA_CXXFLAGS) -c -o $@ $< - -quicksort.o: $(SRC_FOLDER)/quicksort.cu - $(CUDA_CXX) $(CUDA_CXXFLAGS) $(EXTRA_ARCH) $(DEVICE_CODE) -c -o $@ $< - -quicksort_link.o: quicksort.o - $(CUDA_CXX) $(CUDA_LDFLAGS) -dlink -o $@ $< $(CUDA_LDLIBS) diff --git a/GPUSort/GPUSort/tests/quicksort_dynamic_tests/README.md b/GPUSort/GPUSort/tests/quicksort_dynamic_tests/README.md deleted file mode 100644 index 85a1ddbe5..000000000 --- a/GPUSort/GPUSort/tests/quicksort_dynamic_tests/README.md +++ /dev/null @@ -1 +0,0 @@ -the implementation of CDP Quick sort is broken and some tests can not be passed \ No newline at end of file diff --git a/GPUSort/GPUSort/tests/quicksort_dynamic_tests/unitTests.cu b/GPUSort/GPUSort/tests/quicksort_dynamic_tests/unitTests.cu deleted file mode 100644 index 9c59a031a..000000000 --- a/GPUSort/GPUSort/tests/quicksort_dynamic_tests/unitTests.cu +++ /dev/null @@ -1,77 +0,0 @@ -#include "gtest/gtest.h" -#include -#include -#include -#include - -#include -#include -#include "../../src/quicksort_dynamic/quicksort.cuh" -#include "../../src/util/algorithm.h" - -//---------------------------------------------------------------------------------- - -TEST(selectedSize, size15) -{ - TNL::Containers::Array cudaArr{5, 9, 4, 8, 6, 1, 2, 3, 4, 8, 1, 6, 9, 4, 9}; - auto view = cudaArr.getView(); - ASSERT_EQ(15, view.getSize()) << "size not 15" << std::endl; - quicksort(view); - ASSERT_TRUE(is_sorted(view)) << "result " << view << std::endl; -} - -TEST(multiblock, 32768_decreasingNegative) -{ - std::vector arr(1<<15); - for (size_t i = 0; i < arr.size(); i++) - arr[i] = -i; - - TNL::Containers::Array cudaArr(arr); - auto view = cudaArr.getView(); - - quicksort(view); - ASSERT_TRUE(is_sorted(view)) << "result " << view << std::endl; -} - -TEST(randomGenerated, smallArray_randomVal) -{ - std::srand(2006); - for(int 
i = 0; i < 100; i++) - { - std::vector arr(std::rand()%(1<<10)); - for(auto & x : arr) - x = std::rand(); - - TNL::Containers::Array cudaArr(arr); - - auto view = cudaArr.getView(); - quicksort(view); - ASSERT_TRUE(is_sorted(view)); - } -} - - -TEST(randomGenerated, bigArray_randomVal) -{ - std::srand(304); - for(int i = 0; i < 50; i++) - { - int size = (1<<20) + (std::rand()% (1<<19)); - std::vector arr(size); - for(auto & x : arr) x = std::rand(); - TNL::Containers::Array cudaArr(arr); - - auto view = cudaArr.getView(); - quicksort(view); - ASSERT_TRUE(is_sorted(view)); - } -} - -//---------------------------------------------------------------------------------- - -int main(int argc, char **argv) -{ - testing::InitGoogleTest(&argc, argv); - - return RUN_ALL_TESTS(); -} \ No newline at end of file diff --git a/GPUSort/GPUSort/tests/quicksort_unitTests/Makefile b/GPUSort/GPUSort/tests/quicksort_unitTests/Makefile deleted file mode 100644 index a5dbfa2ab..000000000 --- a/GPUSort/GPUSort/tests/quicksort_unitTests/Makefile +++ /dev/null @@ -1,26 +0,0 @@ -include ../../src/util/config.mk - -CUDA_SOURCES := $(wildcard *.cu) -CUDA_TARGETS := $(CUDA_SOURCES:%.cu=%) - -GTEST := -lgtest -pthread - -## targets definitions follow -.PHONY: all host cuda -all: cuda -cuda: $(CUDA_TARGETS) - -run: cuda - ./$(CUDA_TARGETS) - -.PHONY: clean -clean: - rm -f *.d *.o *.cuo $(CUDA_TARGETS) - -# use .cuo instead of .cu.o to avoid problems with the implicit rules: https://stackoverflow.com/q/62967939 -# (and use the host compiler for linking CUDA, nvcc does not understand that .cuo is an object file) -$(CUDA_TARGETS): % : %.cuo - $(CXX) $(CUDA_LDFLAGS) -o $@ $< $(CUDA_LDLIBS) $(GTEST) - -$(CUDA_SOURCES:%.cu=%.cuo): %.cuo : %.cu - $(CUDA_CXX) $(CUDA_CPPFLAGS) $(CUDA_CXXFLAGS) -c -o $@ $< diff --git a/GPUSort/README.md b/GPUSort/README.md deleted file mode 100644 index 2117c0478..000000000 --- a/GPUSort/README.md +++ /dev/null @@ -1,17 +0,0 @@ -## repository for bachelor thesis on 
Development of parallel sorting algorithms for GPU - - -# directory structure -* measuring - * scripts and codes used to make comparison between different algorithms -* otherGPUsorts - * code of other sorting algorithms -* GPUSort - * implementation Bitonic sort and Quick sort for the thesis - - -sidenote: - -warnings during compilation such as the one below are emitted by the TNL library and is an expected behaviour - -/home//.local/include/TNL/Containers/ArrayView.h(155): warning: __host__ annotation is ignored on a function("ArrayView") that is explicitly defaulted on its first declaration \ No newline at end of file diff --git a/GPUSort/measuring/README.md b/GPUSort/measuring/README.md deleted file mode 100644 index c9a6ff84d..000000000 --- a/GPUSort/measuring/README.md +++ /dev/null @@ -1,9 +0,0 @@ -## measuring folder - -* *.ipynb are python jupyter notebook used to process measured data -* ``script.sh`` is a bash scrip that will start all measurements and save the results into ``results`` folder -* ``results`` is a folder to store all .csv files generated after measurement -* each of the folder has a Makefile to start measuring - * to measure an algorithm manually, go into the folder, call ``make`` and execute the binary - * ``./a.out`` will print the results on the standard output - * ``./a.out ../results/my_results.csv`` will save the time measured into the given file location \ No newline at end of file diff --git a/GPUSort/measuring/TNL_implementation/bitonic/Makefile b/GPUSort/measuring/TNL_implementation/bitonic/Makefile deleted file mode 100644 index a8fc1a3eb..000000000 --- a/GPUSort/measuring/TNL_implementation/bitonic/Makefile +++ /dev/null @@ -1,27 +0,0 @@ -include ../../util/config.mk - -CUDA_SOURCES := $(wildcard *.cu) -CUDA_TARGETS := $(CUDA_SOURCES:%.cu=%) - -## targets definitions follow -.PHONY: all host cuda -all: cuda -cuda: $(CUDA_TARGETS) - -run: cuda - ./$(CUDA_TARGETS) - -measure: cuda - ./$(CUDA_TARGETS) 
../../results/TNL_bitonicsort.csv - -.PHONY: clean -clean: - rm -f *.d *.o *.cuo $(CUDA_TARGETS) - -# use .cuo instead of .cu.o to avoid problems with the implicit rules: https://stackoverflow.com/q/62967939 -# (and use the host compiler for linking CUDA, nvcc does not understand that .cuo is an object file) -$(CUDA_TARGETS): % : %.cuo - $(CXX) $(CUDA_LDFLAGS) -o $@ $< $(CUDA_LDLIBS) - -$(CUDA_SOURCES:%.cu=%.cuo): %.cuo : %.cu - $(CUDA_CXX) $(CUDA_CPPFLAGS) $(CUDA_CXXFLAGS) -c -o $@ $< diff --git a/GPUSort/measuring/TNL_implementation/bitonic/main.cu b/GPUSort/measuring/TNL_implementation/bitonic/main.cu deleted file mode 100644 index 51df65df1..000000000 --- a/GPUSort/measuring/TNL_implementation/bitonic/main.cu +++ /dev/null @@ -1 +0,0 @@ -#include "../../../GPUSort/benchmark/bitonic_benchmark/benchmark.cu" \ No newline at end of file diff --git a/GPUSort/measuring/TNL_implementation/bitonic/sameDir/.Makefile b/GPUSort/measuring/TNL_implementation/bitonic/sameDir/.Makefile deleted file mode 100644 index 23593937b..000000000 --- a/GPUSort/measuring/TNL_implementation/bitonic/sameDir/.Makefile +++ /dev/null @@ -1,24 +0,0 @@ -include ../../util/config.mk - -CUDA_SOURCES := $(wildcard *.cu) -CUDA_TARGETS := $(CUDA_SOURCES:%.cu=%) - -## targets definitions follow -.PHONY: all host cuda -all: cuda -cuda: $(CUDA_TARGETS) - -run: cuda - ./$(CUDA_TARGETS) - -.PHONY: clean -clean: - rm -f *.d *.o *.cuo $(CUDA_TARGETS) - -# use .cuo instead of .cu.o to avoid problems with the implicit rules: https://stackoverflow.com/q/62967939 -# (and use the host compiler for linking CUDA, nvcc does not understand that .cuo is an object file) -$(CUDA_TARGETS): % : %.cuo - $(CXX) $(CUDA_LDFLAGS) -o $@ $< $(CUDA_LDLIBS) - -$(CUDA_SOURCES:%.cu=%.cuo): %.cuo : %.cu - $(CUDA_CXX) $(CUDA_CPPFLAGS) $(CUDA_CXXFLAGS) -c -o $@ $< diff --git a/GPUSort/measuring/TNL_implementation/bitonic/sameDir/main.cu b/GPUSort/measuring/TNL_implementation/bitonic/sameDir/main.cu deleted file mode 100644 index 
412bf0cf3..000000000 --- a/GPUSort/measuring/TNL_implementation/bitonic/sameDir/main.cu +++ /dev/null @@ -1,118 +0,0 @@ -#include "../../../GPUSort/bitonicGPU/bitonicSort.h" -#include -#include "../../util/timer.h" -#include "../../util/algorithm.h" - -#include -#include -#include -#include -#include -#include -#include - - -class NOT_SORTED_PROPERLY{}; - -using namespace std; -int main() -{ - ofstream out("TNL_sameDir.csv"); - out << "implementation,size,sorted,almost_sorted,decreasing,random" << endl; - - for(int pow = 3; pow <= 23 ; pow++) - { - int size =(1<< pow); - std::set sizes{size, size+1, size-1}; - for(int i = 0; i < 3; i++) - sizes.insert(size + (std::rand() % size)); - - for(auto x : sizes) - { - cout << "checking size =" << x << endl; - - out << "TNL," << x; - std::vector vec(x); - for(int i = 0; i < x ; ++i) - vec[i] = i; - TNL::Containers::Array arr; - - //sorted sequence - { - arr = vec; - auto view = arr.getView(); - { - TIMER t([&](double res){out << "," << res;}); - bitonicSort(arr.getView()); - } - - if(!is_sorted(arr.getView())) - { - cerr << "sorted seq" << endl; - throw NOT_SORTED_PROPERLY(); - } - } - - //almost sorted sequence - { - for(int i = 0; i < 3; i++) - { - int s = std::rand() % (x - 3); - std::swap(vec[s], vec[s + 1]); - } - - auto view = arr.getView(); - { - TIMER t([&](double res){out << "," << res;}); - bitonicSort(arr.getView()); - } - - if(!is_sorted(arr.getView())) - { - cerr << "almost sorted seq" << endl; - throw NOT_SORTED_PROPERLY(); - } - } - - //decreasing sequence - { - for(size_t i = 0; i < x; i++) - vec[i] = -i; - - auto view = arr.getView(); - { - TIMER t([&](double res){out << "," << res;}); - bitonicSort(arr.getView()); - } - - if(!is_sorted(arr.getView())) - { - cerr << "dec seq" << endl; - throw NOT_SORTED_PROPERLY(); - } - } - - //random sequence - { - std::random_shuffle(vec.begin(), vec.end()); - - auto view = arr.getView(); - { - TIMER t([&](double res){out << "," << res;}); - bitonicSort(arr.getView()); - 
} - - if(!is_sorted(arr.getView())) - { - cerr << "random seq" << endl; - throw NOT_SORTED_PROPERLY(); - } - } - - out << endl; - } - - } - - return 0; -} \ No newline at end of file diff --git a/GPUSort/measuring/TNL_implementation/cdpquicksort/.Makefile b/GPUSort/measuring/TNL_implementation/cdpquicksort/.Makefile deleted file mode 100644 index 080c69609..000000000 --- a/GPUSort/measuring/TNL_implementation/cdpquicksort/.Makefile +++ /dev/null @@ -1,32 +0,0 @@ -include ../../util/config.mk - -CUDA_SOURCES := $(wildcard *.cu) -CUDA_TARGETS := $(CUDA_SOURCES:%.cu=%) - -EXTRA_ARCH := -gencode arch=compute_52,code=sm_52 -DEVICE_CODE := -dc - -CUDA_LDLIBS += -lcudadevrt - -## targets definitions follow -.PHONY: all host cuda -all: cuda -cuda: $(CUDA_TARGETS) - -run: cuda - ./$(CUDA_TARGETS) - -measure: cuda - ./$(CUDA_TARGETS) ../../results/TNL_cdpQuicksort.csv - -.PHONY: clean -clean: - rm -f *.d *.o *.cuo $(CUDA_TARGETS) - -# use .cuo instead of .cu.o to avoid problems with the implicit rules: https://stackoverflow.com/q/62967939 -# (and use the host compiler for linking CUDA, nvcc does not understand that .cuo is an object file) -$(CUDA_TARGETS): % : %.o - $(CUDA_CXX) $(CUDA_LDFLAGS) -o $@ $< $(CUDA_LDLIBS) - -$(CUDA_SOURCES:%.cu=%.o): %.o : %.cu - $(CUDA_CXX) $(CUDA_CPPFLAGS) $(CUDA_CXXFLAGS) $(EXTRA_ARCH) $(DEVICE_CODE) -I/home/xuant/NVIDIA_CUDA-11.1_Samples/common/inc -c -o $@ $< diff --git a/GPUSort/measuring/TNL_implementation/cdpquicksort/benchmark.cu b/GPUSort/measuring/TNL_implementation/cdpquicksort/benchmark.cu deleted file mode 100644 index 332eae267..000000000 --- a/GPUSort/measuring/TNL_implementation/cdpquicksort/benchmark.cu +++ /dev/null @@ -1,4 +0,0 @@ -#include "../../../GPUSort/src/quicksort_dynamic/quicksort.cu" -#define SORTERFUNCTION quicksort -//--------------------------- -#include "../../../GPUSort/benchmark/benchmarker.cpp" \ No newline at end of file diff --git a/GPUSort/measuring/TNL_implementation/quicksort/Makefile 
b/GPUSort/measuring/TNL_implementation/quicksort/Makefile deleted file mode 100644 index 5c9a6e863..000000000 --- a/GPUSort/measuring/TNL_implementation/quicksort/Makefile +++ /dev/null @@ -1,27 +0,0 @@ -include ../../util/config.mk - -CUDA_SOURCES := $(wildcard *.cu) -CUDA_TARGETS := $(CUDA_SOURCES:%.cu=%) - -## targets definitions follow -.PHONY: all host cuda -all: cuda -cuda: $(CUDA_TARGETS) - -run: cuda - ./$(CUDA_TARGETS) - -measure: cuda - ./$(CUDA_TARGETS) ../../results/TNL_quicksort.csv - -.PHONY: clean -clean: - rm -f *.d *.o *.cuo $(CUDA_TARGETS) - -# use .cuo instead of .cu.o to avoid problems with the implicit rules: https://stackoverflow.com/q/62967939 -# (and use the host compiler for linking CUDA, nvcc does not understand that .cuo is an object file) -$(CUDA_TARGETS): % : %.cuo - $(CXX) $(CUDA_LDFLAGS) -o $@ $< $(CUDA_LDLIBS) - -$(CUDA_SOURCES:%.cu=%.cuo): %.cuo : %.cu - $(CUDA_CXX) $(CUDA_CPPFLAGS) $(CUDA_CXXFLAGS) -c -o $@ $< diff --git a/GPUSort/measuring/TNL_implementation/quicksort/benchmark.cu b/GPUSort/measuring/TNL_implementation/quicksort/benchmark.cu deleted file mode 100644 index 0ed1c8400..000000000 --- a/GPUSort/measuring/TNL_implementation/quicksort/benchmark.cu +++ /dev/null @@ -1 +0,0 @@ -#include "../../../GPUSort/benchmark/quicksort_benchmark/benchmark.cu" \ No newline at end of file diff --git a/GPUSort/measuring/cederman_quicksort/Makefile b/GPUSort/measuring/cederman_quicksort/Makefile deleted file mode 100644 index 5872137b3..000000000 --- a/GPUSort/measuring/cederman_quicksort/Makefile +++ /dev/null @@ -1,23 +0,0 @@ -include ../util/config.mk - -CUDA_SOURCES := $(wildcard *.cu) -CUDA_TARGETS := $(CUDA_SOURCES:%.cu=%) - -## targets definitions follow -.PHONY: all host cuda -all: cuda -cuda: $(CUDA_TARGETS) - -run: cuda - ./$(CUDA_TARGETS) - -measure: cuda - ./$(CUDA_TARGETS) ../results/cederman_quicksort.csv - -.PHONY: clean -clean: - rm -f *.d *.o *.cuo $(CUDA_TARGETS) - -$(CUDA_TARGETS): $(CUDA_TARGETS).cu - nvcc 
$(CUDA_TARGETS).cu -o $(CUDA_TARGETS) - diff --git a/GPUSort/measuring/cederman_quicksort/main.cu b/GPUSort/measuring/cederman_quicksort/main.cu deleted file mode 100644 index efa8c2540..000000000 --- a/GPUSort/measuring/cederman_quicksort/main.cu +++ /dev/null @@ -1,12 +0,0 @@ -#include "../../otherGPUsorts/cederman/cederman_qsort.cu" -#include - -void sorter(std::vector & vec) -{ - gpuqsort((unsigned int *)vec.data(), vec.size()); -} - -//------------------------------------ - -#include "../../GPUSort/benchmark/benchmarker.cpp" -#include "../../GPUSort/benchmark/measure.cpp" diff --git a/GPUSort/measuring/cuda_example/bitonic/Makefile b/GPUSort/measuring/cuda_example/bitonic/Makefile deleted file mode 100644 index 316d49a6e..000000000 --- a/GPUSort/measuring/cuda_example/bitonic/Makefile +++ /dev/null @@ -1,27 +0,0 @@ -include ../../util/config.mk - -CUDA_SOURCES := $(wildcard *.cu) -CUDA_TARGETS := $(CUDA_SOURCES:%.cu=%) - -## targets definitions follow -.PHONY: all host cuda -all: cuda -cuda: $(CUDA_TARGETS) - -run: cuda - ./$(CUDA_TARGETS) - -measure: cuda - ./$(CUDA_TARGETS) ../../results/nvidia_bitonic.csv - -.PHONY: clean -clean: - rm -f *.d *.o *.cuo $(CUDA_TARGETS) - -# use .cuo instead of .cu.o to avoid problems with the implicit rules: https://stackoverflow.com/q/62967939 -# (and use the host compiler for linking CUDA, nvcc does not understand that .cuo is an object file) -$(CUDA_TARGETS): % : %.cuo - $(CXX) $(CUDA_LDFLAGS) -o $@ $< $(CUDA_LDLIBS) - -$(CUDA_SOURCES:%.cu=%.cuo): %.cuo : %.cu - $(CUDA_CXX) $(CUDA_CPPFLAGS) $(CUDA_CXXFLAGS) -I../../../otherGPUsorts/cudaExamples/inc -c -o $@ $< diff --git a/GPUSort/measuring/cuda_example/bitonic/main.cu b/GPUSort/measuring/cuda_example/bitonic/main.cu deleted file mode 100644 index 043dc4fe7..000000000 --- a/GPUSort/measuring/cuda_example/bitonic/main.cu +++ /dev/null @@ -1,37 +0,0 @@ -#include "../../../otherGPUsorts/cudaExamples/sortingNetworks/bitonicSort.cu" -#include "../../../GPUSort/src/util/timer.h" 
-#include "../../../GPUSort/src/util/algorithm.h" -#include -#include -#include -using namespace std; -using namespace TNL; -using namespace TNL::Containers; -//--------------------- - -double measure(const std::vector&vec, int tries, int & wrongAnsCnt) -{ - vector resAcc; - - Array arr(vec.size()); - Array arr2(vec.size()); - for(int i = 0; i < tries; i++) - { - arr = vec; - arr2 = vec; - { - TIMER t([&](double res){resAcc.push_back(res);}); - bitonicSort((unsigned *)arr.getData(), (unsigned *)arr2.getData(), - (unsigned *)arr.getData(), (unsigned *)arr2.getData(), - 1, arr.getSize(), 1); - cudaDeviceSynchronize(); - } - - if(!is_sorted(arr.getView())) - wrongAnsCnt++; - } - - return accumulate(resAcc.begin(), resAcc.end(), 0.0) / resAcc.size(); -} - -#include "../../../GPUSort/benchmark/benchmarker.cpp" \ No newline at end of file diff --git a/GPUSort/measuring/cuda_example/cdpAdvancedQuicksort/Makefile b/GPUSort/measuring/cuda_example/cdpAdvancedQuicksort/Makefile deleted file mode 100644 index 578bfe530..000000000 --- a/GPUSort/measuring/cuda_example/cdpAdvancedQuicksort/Makefile +++ /dev/null @@ -1,32 +0,0 @@ -include ../../util/config.mk - -CUDA_SOURCES := $(wildcard *.cu) -CUDA_TARGETS := $(CUDA_SOURCES:%.cu=%) - -EXTRA_ARCH := -gencode arch=compute_52,code=sm_52 -DEVICE_CODE := -dc - -CUDA_LDLIBS += -lcudadevrt - -## targets definitions follow -.PHONY: all host cuda -all: cuda -cuda: $(CUDA_TARGETS) - -run: cuda - ./$(CUDA_TARGETS) - -measure: cuda - ./$(CUDA_TARGETS) ../../results/nvidia_cdpAdvanced.csv - -.PHONY: clean -clean: - rm -f *.d *.o *.cuo $(CUDA_TARGETS) - -# use .cuo instead of .cu.o to avoid problems with the implicit rules: https://stackoverflow.com/q/62967939 -# (and use the host compiler for linking CUDA, nvcc does not understand that .cuo is an object file) -$(CUDA_TARGETS): % : %.o - $(CUDA_CXX) $(CUDA_LDFLAGS) -o $@ $< $(CUDA_LDLIBS) - -$(CUDA_SOURCES:%.cu=%.o): %.o : %.cu - $(CUDA_CXX) $(CUDA_CPPFLAGS) $(CUDA_CXXFLAGS) $(EXTRA_ARCH) 
$(DEVICE_CODE) -I../../../otherGPUsorts/cudaExamples/inc -c -o $@ $< diff --git a/GPUSort/measuring/cuda_example/cdpAdvancedQuicksort/benchmark.cu b/GPUSort/measuring/cuda_example/cdpAdvancedQuicksort/benchmark.cu deleted file mode 100644 index ec9f54058..000000000 --- a/GPUSort/measuring/cuda_example/cdpAdvancedQuicksort/benchmark.cu +++ /dev/null @@ -1,13 +0,0 @@ -#include "../../../otherGPUsorts/cudaExamples/cdpAdvancedQuicksort/cdpAdvancedQuicksort.cu" -#include "../../../otherGPUsorts/cudaExamples/cdpAdvancedQuicksort/cdpBitonicSort.cu" -#include - -//--------------------------- -void sorter(TNL::Containers::ArrayView view) -{ - TNL::Containers::Array aux(view.getSize()); - run_quicksort_cdp((unsigned int *)view.getData(), (unsigned int *)aux.getData(), view.getSize(), NULL); -} - -#include "../../../GPUSort/benchmark/benchmarker.cpp" -#include "../../../GPUSort/benchmark/measure.cu" \ No newline at end of file diff --git a/GPUSort/measuring/cuda_example/cdpsimplequicksort/.Makefile b/GPUSort/measuring/cuda_example/cdpsimplequicksort/.Makefile deleted file mode 100644 index 948720e0a..000000000 --- a/GPUSort/measuring/cuda_example/cdpsimplequicksort/.Makefile +++ /dev/null @@ -1,32 +0,0 @@ -include ../../util/config.mk - -CUDA_SOURCES := $(wildcard *.cu) -CUDA_TARGETS := $(CUDA_SOURCES:%.cu=%) - -EXTRA_ARCH := -gencode arch=compute_52,code=sm_52 -DEVICE_CODE := -dc - -CUDA_LDLIBS += -lcudadevrt - -## targets definitions follow -.PHONY: all host cuda -all: cuda -cuda: $(CUDA_TARGETS) - -run: cuda - ./$(CUDA_TARGETS) - -measure: cuda - ./$(CUDA_TARGETS) ../../results/cdpSimple.csv - -.PHONY: clean -clean: - rm -f *.d *.o *.cuo $(CUDA_TARGETS) - -# use .cuo instead of .cu.o to avoid problems with the implicit rules: https://stackoverflow.com/q/62967939 -# (and use the host compiler for linking CUDA, nvcc does not understand that .cuo is an object file) -$(CUDA_TARGETS): % : %.o - $(CUDA_CXX) $(CUDA_LDFLAGS) -o $@ $< $(CUDA_LDLIBS) - -$(CUDA_SOURCES:%.cu=%.o): %.o 
: %.cu - $(CUDA_CXX) $(CUDA_CPPFLAGS) $(CUDA_CXXFLAGS) $(EXTRA_ARCH) $(DEVICE_CODE) -I../../../otherGPUsorts/cudaExamples/inc -c -o $@ $< diff --git a/GPUSort/measuring/cuda_example/cdpsimplequicksort/benchmark.cu b/GPUSort/measuring/cuda_example/cdpsimplequicksort/benchmark.cu deleted file mode 100644 index 258d96d26..000000000 --- a/GPUSort/measuring/cuda_example/cdpsimplequicksort/benchmark.cu +++ /dev/null @@ -1,15 +0,0 @@ -#include "../../../otherGPUsorts/cudaExamples/cdpSimpleQuicksort/cdpSimpleQuicksort.cu" -#include - -#define SORTERFUNCTION nvidia_quick - -#define HIGH_POW 20 -//--------------------------- - -void nvidia_quick(TNL::Containers::ArrayView view) -{ - run_qsort((unsigned int *)view.getData(), view.getSize()); - cudaDeviceSynchronize(); -} - -#include "../../../GPUSort/benchmark/benchmarker.cpp" \ No newline at end of file diff --git a/GPUSort/measuring/davors/quicksort/Makefile b/GPUSort/measuring/davors/quicksort/Makefile deleted file mode 100644 index 82d5c87df..000000000 --- a/GPUSort/measuring/davors/quicksort/Makefile +++ /dev/null @@ -1,27 +0,0 @@ -include ../../util/config.mk - -CUDA_SOURCES := $(wildcard *.cu) -CUDA_TARGETS := $(CUDA_SOURCES:%.cu=%) - -## targets definitions follow -.PHONY: all host cuda -all: cuda -cuda: $(CUDA_TARGETS) - -run: cuda - ./$(CUDA_TARGETS) - -measure: cuda - ./$(CUDA_TARGETS) ../../results/davors_bitonic.csv - -.PHONY: clean -clean: - rm -f *.d *.o *.cuo $(CUDA_TARGETS) - -# use .cuo instead of .cu.o to avoid problems with the implicit rules: https://stackoverflow.com/q/62967939 -# (and use the host compiler for linking CUDA, nvcc does not understand that .cuo is an object file) -$(CUDA_TARGETS): % : %.cuo - $(CXX) $(CUDA_LDFLAGS) -o $@ $< $(CUDA_LDLIBS) - -$(CUDA_SOURCES:%.cu=%.cuo): %.cuo : %.cu - $(CUDA_CXX) $(CUDA_CPPFLAGS) $(CUDA_CXXFLAGS) -c -o $@ $< diff --git a/GPUSort/measuring/davors/quicksort/benchmark.cu b/GPUSort/measuring/davors/quicksort/benchmark.cu deleted file mode 100644 index 
9154dfb8d..000000000 --- a/GPUSort/measuring/davors/quicksort/benchmark.cu +++ /dev/null @@ -1,20 +0,0 @@ -#include -#include "../../../otherGPUsorts/davors/BitonicSort/Sort/parallel.h" -//------------------------ - -#define LOW_POW 19 -#define HIGH_POW 20 - -void sorter(TNL::Containers::ArrayView view) -{ - auto sorter = new BitonicSortParallel(); - sorter->sort((data_t*)view.getData(), (uint_t)view.getSize(), ORDER_ASC); - cudaDeviceSynchronize(); - delete sorter; - return; -} - -//------------------------ - -#include "../../../GPUSort/benchmark/benchmarker.cpp" -#include "../../../GPUSort/benchmark/measure.cu" diff --git a/GPUSort/measuring/manca_quicksort/Makefile b/GPUSort/measuring/manca_quicksort/Makefile deleted file mode 100644 index 2303b4304..000000000 --- a/GPUSort/measuring/manca_quicksort/Makefile +++ /dev/null @@ -1,27 +0,0 @@ -include ../util/config.mk - -CUDA_SOURCES := $(wildcard *.cu) -CUDA_TARGETS := $(CUDA_SOURCES:%.cu=%) - -## targets definitions follow -.PHONY: all host cuda -all: cuda -cuda: $(CUDA_TARGETS) - -run: cuda - ./$(CUDA_TARGETS) - -measure: cuda - ./$(CUDA_TARGETS) ../results/manca_quicksort.csv - -.PHONY: clean -clean: - rm -f *.d *.o *.cuo $(CUDA_TARGETS) - -# use .cuo instead of .cu.o to avoid problems with the implicit rules: https://stackoverflow.com/q/62967939 -# (and use the host compiler for linking CUDA, nvcc does not understand that .cuo is an object file) -$(CUDA_TARGETS): % : %.cuo - $(CXX) $(CUDA_LDFLAGS) -o $@ $< $(CUDA_LDLIBS) - -$(CUDA_SOURCES:%.cu=%.cuo): %.cuo : %.cu - $(CUDA_CXX) $(CUDA_CPPFLAGS) $(CUDA_CXXFLAGS) -I../../otherGPUsorts/manca_quicksort/libraries/include/ -c -o $@ $< diff --git a/GPUSort/measuring/manca_quicksort/main.cu b/GPUSort/measuring/manca_quicksort/main.cu deleted file mode 100644 index 448f9cac0..000000000 --- a/GPUSort/measuring/manca_quicksort/main.cu +++ /dev/null @@ -1,15 +0,0 @@ -#include -#include "../../otherGPUsorts/manca_quicksort_extracted/manca_quicksort.cu" 
-//------------------------ - -void sorter(TNL::Containers::ArrayView view) -{ - double timer = 0; - CUDA_Quicksort((unsigned *)view.getData(), (unsigned *)view.getData(), view.getSize(), 256, 0, &timer); - return; -} - -//------------------------ - -#include "../../GPUSort/benchmark/benchmarker.cpp" -#include "../../GPUSort/benchmark/measure.cu" diff --git a/GPUSort/measuring/nickjillings/Makefile b/GPUSort/measuring/nickjillings/Makefile deleted file mode 100644 index 5e685427a..000000000 --- a/GPUSort/measuring/nickjillings/Makefile +++ /dev/null @@ -1,27 +0,0 @@ -include ../util/config.mk - -CUDA_SOURCES := $(wildcard *.cu) -CUDA_TARGETS := $(CUDA_SOURCES:%.cu=%) - -## targets definitions follow -.PHONY: all host cuda -all: cuda -cuda: $(CUDA_TARGETS) - -run: cuda - ./$(CUDA_TARGETS) - -measure: cuda - ./$(CUDA_TARGETS) ../results/nickjillings.csv - -.PHONY: clean -clean: - rm -f *.d *.o *.cuo $(CUDA_TARGETS) - -# use .cuo instead of .cu.o to avoid problems with the implicit rules: https://stackoverflow.com/q/62967939 -# (and use the host compiler for linking CUDA, nvcc does not understand that .cuo is an object file) -$(CUDA_TARGETS): % : %.cuo - $(CXX) $(CUDA_LDFLAGS) -o $@ $< $(CUDA_LDLIBS) - -$(CUDA_SOURCES:%.cu=%.cuo): %.cuo : %.cu - $(CUDA_CXX) $(CUDA_CPPFLAGS) $(CUDA_CXXFLAGS) -c -o $@ $< diff --git a/GPUSort/measuring/nickjillings/main.cu b/GPUSort/measuring/nickjillings/main.cu deleted file mode 100644 index 75b93ea62..000000000 --- a/GPUSort/measuring/nickjillings/main.cu +++ /dev/null @@ -1,12 +0,0 @@ - -#include "../../otherGPUsorts/nickjillings/BitonicSortCUDA.cu" -#include - - -void sorter(TNL::Containers::ArrayView view) -{ - BitonicSort::BitonicSortCUDA((unsigned int *)view.getData(), view.getSize()); -} -//--------------------------- -#include "../../GPUSort/benchmark/benchmarker.cpp" -#include "../../GPUSort/benchmark/measure.cu" \ No newline at end of file diff --git a/GPUSort/measuring/script.sh b/GPUSort/measuring/script.sh deleted file 
mode 100644 index 2064d0f97..000000000 --- a/GPUSort/measuring/script.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin/bash - -for i in $(find . -type f -name 'Makefile' | sed -r 's|/[^/]+$||' |sort |uniq) -do - echo going into $i - cd "$i" - echo starting... - make clean - make - make measure - echo done measuring - make clean - echo going out... - cd - -done diff --git a/GPUSort/measuring/std_sort/Makefile b/GPUSort/measuring/std_sort/Makefile deleted file mode 100644 index f7ef2b3b3..000000000 --- a/GPUSort/measuring/std_sort/Makefile +++ /dev/null @@ -1,12 +0,0 @@ -main: main.cpp - g++ -Wall -pedantic -std=c++14 -O3 main.cpp -o main - -measure: main - ./main ../results/std_sort.csv - -run: main - ./main - -.PHONY: main -clean: - rm -f main diff --git a/GPUSort/measuring/std_sort/main.cpp b/GPUSort/measuring/std_sort/main.cpp deleted file mode 100644 index 8ce12d0ac..000000000 --- a/GPUSort/measuring/std_sort/main.cpp +++ /dev/null @@ -1,12 +0,0 @@ -#include -#include - -#define TRIES 5 - -void sorter(std::vector&vec) -{ - std::sort(vec.begin(), vec.end()); -} -//--------------------------- -#include "../../GPUSort/benchmark/benchmarker.cpp" -#include "../../GPUSort/benchmark/measure.cpp" diff --git a/GPUSort/measuring/thrust/Makefile b/GPUSort/measuring/thrust/Makefile deleted file mode 100644 index 1c21a5ac7..000000000 --- a/GPUSort/measuring/thrust/Makefile +++ /dev/null @@ -1,27 +0,0 @@ -include ../util/config.mk - -CUDA_SOURCES := $(wildcard *.cu) -CUDA_TARGETS := $(CUDA_SOURCES:%.cu=%) - -## targets definitions follow -.PHONY: all host cuda -all: cuda -cuda: $(CUDA_TARGETS) - -run: cuda - ./$(CUDA_TARGETS) - -measure: cuda - ./$(CUDA_TARGETS) ../results/thrust.csv - -.PHONY: clean -clean: - rm -f *.d *.o *.cuo $(CUDA_TARGETS) - -# use .cuo instead of .cu.o to avoid problems with the implicit rules: https://stackoverflow.com/q/62967939 -# (and use the host compiler for linking CUDA, nvcc does not understand that .cuo is an object file) -$(CUDA_TARGETS): % : %.cuo 
- $(CXX) $(CUDA_LDFLAGS) -o $@ $< $(CUDA_LDLIBS) - -$(CUDA_SOURCES:%.cu=%.cuo): %.cuo : %.cu - $(CUDA_CXX) $(CUDA_CPPFLAGS) $(CUDA_CXXFLAGS) -c -o $@ $< diff --git a/GPUSort/measuring/thrust/main.cu b/GPUSort/measuring/thrust/main.cu deleted file mode 100644 index e28f1b6bb..000000000 --- a/GPUSort/measuring/thrust/main.cu +++ /dev/null @@ -1,12 +0,0 @@ -#include -#include -#include - -void sorter(TNL::Containers::ArrayView view) -{ - thrust::sort(thrust::device, view.getData(), view.getData() + view.getSize()); - cudaDeviceSynchronize(); -} -//--------------------------- -#include "../../GPUSort/benchmark/benchmarker.cpp" -#include "../../GPUSort/benchmark/measure.cu" \ No newline at end of file diff --git a/GPUSort/measuring/util/config.mk b/GPUSort/measuring/util/config.mk deleted file mode 100644 index 3715986f7..000000000 --- a/GPUSort/measuring/util/config.mk +++ /dev/null @@ -1,49 +0,0 @@ -# configure the include path(s) according to your TNL installation -TNL_INCLUDE_DIRS := -I ~/.local/include - -WITH_OPENMP := no -WITH_DEBUG := no - -# If TNL is installed on your system, the CUDA architecture can be detected -# automatically by tnl-cuda-arch. This is done if CUDA_ARCH is set to "auto". -# Otherwise, CUDA_ARCH has to be set manually to the desired CUDA architecture -# number, e.g. 60, 61, etc. 
-CUDA_ARCH := auto - -# compilers -CXX := g++ -CUDA_CXX := nvcc - -# host compiler flags -CXXFLAGS := -std=c++14 $(TNL_INCLUDE_DIRS) -ifeq ($(WITH_DEBUG),yes) - CXXFLAGS += -O0 -g -else - CXXFLAGS += -O3 -DNDEBUG -endif - -# CUDA compiler flags -CUDA_CXXFLAGS := -std=c++14 --expt-relaxed-constexpr --expt-extended-lambda $(TNL_INCLUDE_DIRS) -CUDA_CXXFLAGS += -DHAVE_CUDA -ifeq ($(CUDA_ARCH),auto) - CUDA_CXXFLAGS += $(shell tnl-cuda-arch) -else - CUDA_CXXFLAGS += -gencode arch=compute_$(CUDA_ARCH),code=sm_$(CUDA_ARCH) -endif - -# determine path to the CUDA toolkit installation -# (autodetection is attempted, set it manually if it fails) -CUDA_PATH ?= $(abspath $(dir $(shell command -v nvcc))/..) -#$(info Detected CUDA_PATH: $(CUDA_PATH)) - -# flags for linking CUDA with the host compiler -CUDA_LDFLAGS := -L $(CUDA_PATH)/lib64 -CUDA_LDLIBS := -lcudart -ldl -lrt - -# enable OpenMP -ifeq ($(WITH_OPENMP),yes) - CXXFLAGS += -fopenmp -DHAVE_OPENMP - LDLIBS += -lgomp - CUDA_CXXFLAGS += -Xcompiler -fopenmp -DHAVE_OPENMP - CUDA_LDLIBS += -lgomp -endif -- GitLab From 3c2b8bcc2a2e0e89a7152fac4d1d881c2b71d64f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Mon, 19 Jul 2021 12:49:48 +0200 Subject: [PATCH 251/258] Fixing namespaces in sorting source files. Fixing header including in Nvidia bitonic sort wrapper. Fixing namespaces definition. 
--- src/Benchmarks/Sorting/Measurer.h | 2 +- .../ReferenceAlgorithms/CedermanQuicksort.h | 2 +- .../ReferenceAlgorithms/MancaQuicksort.h | 2 +- .../ReferenceAlgorithms/NvidiaBitonicSort.h | 10 +++- .../ReferenceAlgorithms/ThrustRadixsort.h | 3 + src/TNL/Algorithms/Sort.h | 5 +- .../Algorithms/Sorting/detail/Quicksorter.hpp | 2 +- .../Algorithms/Sorting/detail/bitonicSort.h | 2 +- .../Sorting/detail/blockBitonicSort.h | 7 +++ .../Algorithms/Sorting/detail/cudaPartition.h | 57 ++++++++++--------- src/TNL/Algorithms/Sorting/detail/helpers.h | 9 ++- .../Sorting/detail/quicksort_1Block.h | 29 ++++++---- .../Sorting/detail/quicksort_kernel.h | 42 +++++++------- src/TNL/Algorithms/Sorting/detail/reduction.h | 14 ++++- src/TNL/Algorithms/Sorting/detail/task.h | 10 +++- 15 files changed, 123 insertions(+), 73 deletions(-) diff --git a/src/Benchmarks/Sorting/Measurer.h b/src/Benchmarks/Sorting/Measurer.h index 7abf16604..92a578b63 100644 --- a/src/Benchmarks/Sorting/Measurer.h +++ b/src/Benchmarks/Sorting/Measurer.h @@ -10,10 +10,10 @@ #ifdef HAVE_CUDA #ifdef HAVE_CUDA_SAMPLES #include "ReferenceAlgorithms/MancaQuicksort.h" +#include "ReferenceAlgorithms/NvidiaBitonicSort.h" #endif #include "ReferenceAlgorithms/CedermanQuicksort.h" #include "ReferenceAlgorithms/ThrustRadixsort.h" -#include "ReferenceAlgorithms/NvidiaBitonicSort.h" #endif #include "timer.h" diff --git a/src/Benchmarks/Sorting/ReferenceAlgorithms/CedermanQuicksort.h b/src/Benchmarks/Sorting/ReferenceAlgorithms/CedermanQuicksort.h index 5e64d773d..0ae220842 100644 --- a/src/Benchmarks/Sorting/ReferenceAlgorithms/CedermanQuicksort.h +++ b/src/Benchmarks/Sorting/ReferenceAlgorithms/CedermanQuicksort.h @@ -1093,7 +1093,7 @@ int gpuqsort(unsigned int *data, unsigned int size, unsigned int blockscount, un struct CedermanQuicksort { - static void sort( Containers::ArrayView< int, Devices::Cuda >& array ) + static void sort( TNL::Containers::ArrayView< int, TNL::Devices::Cuda >& array ) { gpuqsort( ( unsigned int * ) 
array.getData(), ( unsigned int ) array.getSize() ); } diff --git a/src/Benchmarks/Sorting/ReferenceAlgorithms/MancaQuicksort.h b/src/Benchmarks/Sorting/ReferenceAlgorithms/MancaQuicksort.h index 51f49250b..e7f4d20fb 100644 --- a/src/Benchmarks/Sorting/ReferenceAlgorithms/MancaQuicksort.h +++ b/src/Benchmarks/Sorting/ReferenceAlgorithms/MancaQuicksort.h @@ -1318,7 +1318,7 @@ void CUDA_Quicksort_64(double* inputData,double* outputData, uint dataSize, uint struct MancaQuicksort { - static void sort( Containers::ArrayView< int, Devices::Cuda >& array ) + static void sort( TNL::Containers::ArrayView< int, TNL::Devices::Cuda >& array ) { double timer; CUDA_Quicksort( ( unsigned * ) array.getData(), (unsigned * ) array.getData(), array.getSize(), 256, 0, &timer ); diff --git a/src/Benchmarks/Sorting/ReferenceAlgorithms/NvidiaBitonicSort.h b/src/Benchmarks/Sorting/ReferenceAlgorithms/NvidiaBitonicSort.h index 08a70731b..834571287 100644 --- a/src/Benchmarks/Sorting/ReferenceAlgorithms/NvidiaBitonicSort.h +++ b/src/Benchmarks/Sorting/ReferenceAlgorithms/NvidiaBitonicSort.h @@ -1,16 +1,24 @@ + +#ifdef HAVE_CUDA_SAMPLES #include <6_Advanced/sortingNetworks/bitonicSort.cu> +#endif #include +namespace TNL { struct NvidiaBitonicSort { static void sort( Containers::ArrayView< int, Devices::Cuda >& view ) { - Array arr; +#ifdef HAVE_CUDA_SAMPLES + Containers::Array arr; arr = view; bitonicSort((unsigned *)view.getData(), (unsigned *)arr.getData(), (unsigned *)view.getData(), (unsigned *)arr.getData(), 1, arr.getSize(), 1); cudaDeviceSynchronize(); +#endif } }; + +} // namespace TNL diff --git a/src/Benchmarks/Sorting/ReferenceAlgorithms/ThrustRadixsort.h b/src/Benchmarks/Sorting/ReferenceAlgorithms/ThrustRadixsort.h index b2aafc8ad..02f03a023 100644 --- a/src/Benchmarks/Sorting/ReferenceAlgorithms/ThrustRadixsort.h +++ b/src/Benchmarks/Sorting/ReferenceAlgorithms/ThrustRadixsort.h @@ -2,6 +2,7 @@ #include #include +namespace TNL { struct ThrustRadixsort { @@ -11,3 +12,5 @@ struct 
ThrustRadixsort cudaDeviceSynchronize(); } }; + +} // namespace TNL diff --git a/src/TNL/Algorithms/Sort.h b/src/TNL/Algorithms/Sort.h index 6b924215e..0da7b4120 100644 --- a/src/TNL/Algorithms/Sort.h +++ b/src/TNL/Algorithms/Sort.h @@ -35,13 +35,12 @@ void sort( Array& array, const Compare& compare ) template< typename Device, typename Index, - typename Fetch, typename Compare, typename Swap, typename Sorter = typename Sorting::DefaultInplaceSorter< Device >::SorterType > -void inplaceSort( const Index begin, const Index end, const Fetch& fetch, const Compare& compare, const Swap& swap ) +void sort( const Index begin, const Index end, const Compare& compare, const Swap& swap ) { - Sorter::inplaceSort( begin, end, fetch, compare, swap ); + Sorter::inplaceSort( begin, end, compare, swap ); } template diff --git a/src/TNL/Algorithms/Sorting/detail/Quicksorter.hpp b/src/TNL/Algorithms/Sorting/detail/Quicksorter.hpp index f20ef00ec..faa294ab3 100644 --- a/src/TNL/Algorithms/Sorting/detail/Quicksorter.hpp +++ b/src/TNL/Algorithms/Sorting/detail/Quicksorter.hpp @@ -88,7 +88,7 @@ sort( Array& arr ) template< typename Value > void Quicksorter< Value, Devices::Cuda >:: -init( ArrayView arr, int gridDim, int blockDim, int desiredElemPerBlock, int maxSharable) +init( Containers::ArrayView arr, int gridDim, int blockDim, int desiredElemPerBlock, int maxSharable) { this->maxBlocks = gridDim; this->threadsPerBlock = blockDim; diff --git a/src/TNL/Algorithms/Sorting/detail/bitonicSort.h b/src/TNL/Algorithms/Sorting/detail/bitonicSort.h index 001909120..8ccd0569c 100644 --- a/src/TNL/Algorithms/Sorting/detail/bitonicSort.h +++ b/src/TNL/Algorithms/Sorting/detail/bitonicSort.h @@ -374,6 +374,6 @@ void bitonicSort(int begin, int end, const CMP &Cmp, SWAP Swap) cudaDeviceSynchronize(); } #endif - } // namespace detail + } // namespace Sorting } // namespace Algorithms } // namespace TNL diff --git a/src/TNL/Algorithms/Sorting/detail/blockBitonicSort.h 
b/src/TNL/Algorithms/Sorting/detail/blockBitonicSort.h index 413d74456..f0732dcb0 100644 --- a/src/TNL/Algorithms/Sorting/detail/blockBitonicSort.h +++ b/src/TNL/Algorithms/Sorting/detail/blockBitonicSort.h @@ -2,6 +2,10 @@ #include #include +namespace TNL { + namespace Algorithms { + namespace Sorting { + #ifdef HAVE_CUDA /** @@ -102,3 +106,6 @@ __device__ void bitonicSort_Block(TNL::Containers::ArrayView #include -#ifdef HAVE_CUDA +namespace TNL { + namespace Algorithms { + namespace Sorting { -using namespace TNL; -using namespace TNL::Containers; +#ifdef HAVE_CUDA template __device__ Value pickPivot(TNL::Containers::ArrayView src, const CMP &Cmp) @@ -86,10 +87,10 @@ __device__ int pickPivotIdx(TNL::Containers::ArrayView src, const //----------------------------------------------------------- template -__device__ void countElem(ArrayView arr, - const CMP &Cmp, - int &smaller, int &bigger, - const Value &pivot) +__device__ void countElem( Containers::ArrayView arr, + const CMP &Cmp, + int &smaller, int &bigger, + const Value &pivot) { for (int i = threadIdx.x; i < arr.getSize(); i += blockDim.x) { @@ -104,14 +105,14 @@ __device__ void countElem(ArrayView arr, //----------------------------------------------------------- template -__device__ void copyDataShared(ArrayView src, - ArrayView dst, - const CMP &Cmp, - Value *sharedMem, - int smallerStart, int biggerStart, - int smallerTotal, int biggerTotal, - int smallerOffset, int biggerOffset, //exclusive prefix sum of elements - const Value &pivot) +__device__ void copyDataShared( Containers::ArrayView src, + Containers::ArrayView dst, + const CMP &Cmp, + Value *sharedMem, + int smallerStart, int biggerStart, + int smallerTotal, int biggerTotal, + int smallerOffset, int biggerOffset, //exclusive prefix sum of elements + const Value &pivot) { for (int i = threadIdx.x; i < src.getSize(); i += blockDim.x) @@ -134,11 +135,11 @@ __device__ void copyDataShared(ArrayView src, } template -__device__ void copyData(ArrayView 
src, - ArrayView dst, - const CMP &Cmp, - int smallerStart, int biggerStart, - const Value &pivot) +__device__ void copyData( Containers::ArrayView src, + Containers::ArrayView dst, + const CMP &Cmp, + int smallerStart, int biggerStart, + const Value &pivot) { for (int i = threadIdx.x; i < src.getSize(); i += blockDim.x) { @@ -165,12 +166,12 @@ __device__ void copyData(ArrayView src, //---------------------------------------------------------------------------------- template -__device__ void cudaPartition(ArrayView src, - ArrayView dst, - const CMP &Cmp, - Value *sharedMem, - const Value &pivot, - int elemPerBlock, TASK &task) +__device__ void cudaPartition( Containers::ArrayView src, + Containers::ArrayView dst, + const CMP &Cmp, + Value *sharedMem, + const Value &pivot, + int elemPerBlock, TASK &task) { static __shared__ int smallerStart, biggerStart; @@ -220,3 +221,7 @@ __device__ void cudaPartition(ArrayView src, } #endif + + } // namespace Sorting + } // namespace Algorithms +} // namespace TNL diff --git a/src/TNL/Algorithms/Sorting/detail/helpers.h b/src/TNL/Algorithms/Sorting/detail/helpers.h index bf5f6f9d0..2d7dbbcc7 100644 --- a/src/TNL/Algorithms/Sorting/detail/helpers.h +++ b/src/TNL/Algorithms/Sorting/detail/helpers.h @@ -13,6 +13,10 @@ #pragma once #include +namespace TNL { + namespace Algorithms { + namespace Sorting { + #ifdef HAVE_CUDA // Inline PTX call to return index of highest non-zero bit in a word @@ -49,4 +53,7 @@ __cuda_callable__ void cmpSwap(Value &a, Value &b, bool ascending, const CMP &Cm TNL::swap(a, b); } -#endif \ No newline at end of file +#endif + } //namespace Sorting + } //namespace Algorithms +} // namespace TNL \ No newline at end of file diff --git a/src/TNL/Algorithms/Sorting/detail/quicksort_1Block.h b/src/TNL/Algorithms/Sorting/detail/quicksort_1Block.h index e5196d7a5..efca29f24 100644 --- a/src/TNL/Algorithms/Sorting/detail/quicksort_1Block.h +++ b/src/TNL/Algorithms/Sorting/detail/quicksort_1Block.h @@ -18,22 +18,23 @@ 
#include #include -using namespace TNL; -using namespace TNL::Containers; +namespace TNL { + namespace Algorithms { + namespace Sorting { #ifdef HAVE_CUDA template -__device__ void externSort(ArrayView src, - ArrayView dst, - const CMP &Cmp, Value *sharedMem) +__device__ void externSort( Containers::ArrayView src, + Containers::ArrayView dst, + const CMP &Cmp, Value *sharedMem) { bitonicSort_Block(src, dst, sharedMem, Cmp); } template -__device__ void externSort(ArrayView src, - const CMP &Cmp) +__device__ void externSort( Containers::ArrayView src, + const CMP &Cmp) { bitonicSort_Block(src, Cmp); } @@ -50,11 +51,11 @@ __device__ void stackPush(int stackArrBegin[], int stackArrEnd[], //--------------------------------------------------------------- template -__device__ void singleBlockQuickSort(ArrayView arr, - ArrayView aux, - const CMP &Cmp, int _iteration, - Value *sharedMem, int memSize, - int maxBitonicSize) +__device__ void singleBlockQuickSort( Containers::ArrayView arr, + Containers::ArrayView aux, + const CMP &Cmp, int _iteration, + Value *sharedMem, int memSize, + int maxBitonicSize) { if (arr.getSize() <= maxBitonicSize) { @@ -248,3 +249,7 @@ __device__ void stackPush(int stackArrBegin[], int stackArrEnd[], } #endif + + } // namespace Sorting + } // namespace Algorithms +} // namespace TNL diff --git a/src/TNL/Algorithms/Sorting/detail/quicksort_kernel.h b/src/TNL/Algorithms/Sorting/detail/quicksort_kernel.h index 882316ac2..8d26d0637 100644 --- a/src/TNL/Algorithms/Sorting/detail/quicksort_kernel.h +++ b/src/TNL/Algorithms/Sorting/detail/quicksort_kernel.h @@ -27,13 +27,13 @@ namespace TNL { #ifdef HAVE_CUDA __device__ void writeNewTask(int begin, int end, int iteration, int maxElemFor2ndPhase, - ArrayView newTasks, int *newTasksCnt, - ArrayView secondPhaseTasks, int *secondPhaseTasksCnt); + Containers::ArrayView newTasks, int *newTasksCnt, + Containers::ArrayView secondPhaseTasks, int *secondPhaseTasksCnt); 
//----------------------------------------------------------- -__global__ void cudaCalcBlocksNeeded(ArrayView cuda_tasks, int elemPerBlock, - VectorView blocksNeeded) +__global__ void cudaCalcBlocksNeeded(Containers::ArrayView cuda_tasks, int elemPerBlock, + Containers::VectorView blocksNeeded) { int i = blockIdx.x * blockDim.x + threadIdx.x; if (i >= cuda_tasks.getSize()) @@ -47,10 +47,10 @@ __global__ void cudaCalcBlocksNeeded(ArrayView cuda_tasks, //----------------------------------------------------------- template -__global__ void cudaInitTask(ArrayView cuda_tasks, - ArrayView cuda_blockToTaskMapping, - VectorView cuda_reductionTaskInitMem, - ArrayView src, CMP Cmp) +__global__ void cudaInitTask(Containers::ArrayView cuda_tasks, + Containers::ArrayView cuda_blockToTaskMapping, + Containers::VectorView cuda_reductionTaskInitMem, + Containers::ArrayView src, CMP Cmp) { if (blockIdx.x >= cuda_tasks.getSize()) return; @@ -71,10 +71,10 @@ __global__ void cudaInitTask(ArrayView cuda_tasks, //---------------------------------------------------- template -__global__ void cudaQuickSort1stPhase(ArrayView arr, ArrayView aux, +__global__ void cudaQuickSort1stPhase(Containers::ArrayView arr, Containers::ArrayView aux, const CMP &Cmp, int elemPerBlock, - ArrayView tasks, - ArrayView taskMapping) + Containers::ArrayView tasks, + Containers::ArrayView taskMapping) { extern __shared__ int externMem[]; Value *piv = (Value *)externMem; @@ -99,9 +99,9 @@ __global__ void cudaQuickSort1stPhase(ArrayView arr, Array //---------------------------------------------------- template -__global__ void cudaWritePivot(ArrayView arr, ArrayView aux, int maxElemFor2ndPhase, - ArrayView tasks, ArrayView newTasks, int *newTasksCnt, - ArrayView secondPhaseTasks, int *secondPhaseTasksCnt) +__global__ void cudaWritePivot(Containers::ArrayView arr, Containers::ArrayView aux, int maxElemFor2ndPhase, + Containers::ArrayView tasks, Containers::ArrayView newTasks, int *newTasksCnt, + 
Containers::ArrayView secondPhaseTasks, int *secondPhaseTasksCnt) { extern __shared__ int externMem[]; Value *piv = (Value *)externMem; @@ -149,8 +149,8 @@ __global__ void cudaWritePivot(ArrayView arr, ArrayView newTasks, int *newTasksCnt, - ArrayView secondPhaseTasks, int *secondPhaseTasksCnt) + Containers::ArrayView newTasks, int *newTasksCnt, + Containers::ArrayView secondPhaseTasks, int *secondPhaseTasksCnt) { int size = end - begin; if (size < 0) @@ -197,9 +197,9 @@ __device__ void writeNewTask(int begin, int end, int iteration, int maxElemFor2n //----------------------------------------------------------- template -__global__ void cudaQuickSort2ndPhase(ArrayView arr, ArrayView aux, +__global__ void cudaQuickSort2ndPhase(Containers::ArrayView arr, Containers::ArrayView aux, CMP Cmp, - ArrayView secondPhaseTasks, + Containers::ArrayView secondPhaseTasks, int elemInShared, int maxBitonicSize) { extern __shared__ int externMem[]; @@ -226,10 +226,10 @@ __global__ void cudaQuickSort2ndPhase(ArrayView arr, Array } template -__global__ void cudaQuickSort2ndPhase(ArrayView arr, ArrayView aux, +__global__ void cudaQuickSort2ndPhase(Containers::ArrayView arr, Containers::ArrayView aux, CMP Cmp, - ArrayView secondPhaseTasks1, - ArrayView secondPhaseTasks2, + Containers::ArrayView secondPhaseTasks1, + Containers::ArrayView secondPhaseTasks2, int elemInShared, int maxBitonicSize) { extern __shared__ int externMem[]; diff --git a/src/TNL/Algorithms/Sorting/detail/reduction.h b/src/TNL/Algorithms/Sorting/detail/reduction.h index e1406ec46..e2bf14809 100644 --- a/src/TNL/Algorithms/Sorting/detail/reduction.h +++ b/src/TNL/Algorithms/Sorting/detail/reduction.h @@ -12,6 +12,10 @@ #pragma once +namespace TNL { + namespace Algorithms { + namespace Sorting { + #ifdef HAVE_CUDA /** @@ -38,7 +42,7 @@ __device__ int blockReduceSum(int val) if (lane == 0) shared[wid] = val; - __syncthreads(); + __syncthreads(); val = (threadIdx.x < blockDim.x / warpSize) ? 
shared[lane] : 0; @@ -47,7 +51,7 @@ __device__ int blockReduceSum(int val) if(threadIdx.x == 0) shared[0] = val; - __syncthreads(); + __syncthreads(); return shared[0]; } @@ -127,4 +131,8 @@ __device__ int blockCmpReduce(int val, const Operator & Cmp) return shared[0]; } -#endif \ No newline at end of file +#endif + + } // namespace Sorting + } // namespace Algorithms +} // namespace TNL \ No newline at end of file diff --git a/src/TNL/Algorithms/Sorting/detail/task.h b/src/TNL/Algorithms/Sorting/detail/task.h index bafc5e64b..bf2130b2e 100644 --- a/src/TNL/Algorithms/Sorting/detail/task.h +++ b/src/TNL/Algorithms/Sorting/detail/task.h @@ -12,6 +12,10 @@ #pragma once +namespace TNL { + namespace Algorithms { + namespace Sorting { + struct TASK { //start and end position of array to read and write from @@ -57,4 +61,8 @@ std::ostream& operator<<(std::ostream & out, const TASK & task) out << " | " << "iteration: " << task.iteration; out << " | " << "pivotIdx: " << task.pivotIdx; return out << " ] "; -} \ No newline at end of file +} + + } // namespace Sorting + } // namespace Algorithms +} // namespace TNL \ No newline at end of file -- GitLab From 4c564a62208efd25b9af212823de96569d29b6db Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Mon, 19 Jul 2021 16:22:29 +0200 Subject: [PATCH 252/258] Writting documentation on sorting. 
--- .../Examples/Algorithms/CMakeLists.txt | 17 +- .../Examples/Algorithms/SortingExample.cpp | 55 ++++++ .../Examples/Algorithms/SortingExample.cu | 1 + .../Examples/Algorithms/SortingExample2.cpp | 55 ++++++ .../Examples/Algorithms/SortingExample2.cu | 1 + .../Examples/Algorithms/SortingExample3.cpp | 66 ++++++++ .../Examples/Algorithms/SortingExample3.cu | 1 + .../Tutorials/Sorting/tutorial_Sorting.md | 43 +++++ Documentation/Tutorials/index.md | 7 +- src/Benchmarks/Sorting/Measurer.h | 2 +- src/Benchmarks/Sorting/generators.h | 4 +- src/TNL/Algorithms/Sort.h | 158 +++++++++++++++++- src/TNL/Algorithms/Sorting/BitonicSort.h | 9 +- .../Algorithms/Sorting/BitonicSortTest.h | 40 ++--- .../Algorithms/Sorting/QuicksortTest.h | 12 +- 15 files changed, 428 insertions(+), 43 deletions(-) create mode 100644 Documentation/Examples/Algorithms/SortingExample.cpp create mode 120000 Documentation/Examples/Algorithms/SortingExample.cu create mode 100644 Documentation/Examples/Algorithms/SortingExample2.cpp create mode 120000 Documentation/Examples/Algorithms/SortingExample2.cu create mode 100644 Documentation/Examples/Algorithms/SortingExample3.cpp create mode 120000 Documentation/Examples/Algorithms/SortingExample3.cu create mode 100644 Documentation/Tutorials/Sorting/tutorial_Sorting.md diff --git a/Documentation/Examples/Algorithms/CMakeLists.txt b/Documentation/Examples/Algorithms/CMakeLists.txt index 294006c08..51d78e29b 100644 --- a/Documentation/Examples/Algorithms/CMakeLists.txt +++ b/Documentation/Examples/Algorithms/CMakeLists.txt @@ -1,9 +1,21 @@ IF( BUILD_CUDA ) CUDA_ADD_EXECUTABLE(ParallelForExampleCuda ParallelForExample.cu) ADD_CUSTOM_COMMAND( COMMAND ParallelForExampleCuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/ParallelForExample.out OUTPUT ParallelForExample.out ) + CUDA_ADD_EXECUTABLE( SortingExampleCuda SortingExample.cu) + ADD_CUSTOM_COMMAND( COMMAND SortingExampleCuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SortingExample.out OUTPUT 
SortingExample.out ) + CUDA_ADD_EXECUTABLE( SortingExample2Cuda SortingExample2.cu) + ADD_CUSTOM_COMMAND( COMMAND SortingExample2Cuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SortingExample2.out OUTPUT SortingExample2.out ) + CUDA_ADD_EXECUTABLE( SortingExample3Cuda SortingExample3.cu) + ADD_CUSTOM_COMMAND( COMMAND SortingExample3Cuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SortingExample3.out OUTPUT SortingExample3.out ) ELSE() ADD_EXECUTABLE(ParallelForExample ParallelForExample.cpp) - ADD_CUSTOM_COMMAND( COMMAND ParallelForExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/ParallelForExample.out OUTPUT ParallelForExample.out ) + ADD_CUSTOM_COMMAND( COMMAND ParallelForExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SortingExample.out OUTPUT SortingExample.out ) + ADD_EXECUTABLE( SortingExample SortingExample.cpp) + ADD_CUSTOM_COMMAND( COMMAND SortingExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SortingExample.out OUTPUT SortingExample.out ) + ADD_EXECUTABLE( SortingExample2 SortingExample2.cpp) + ADD_CUSTOM_COMMAND( COMMAND SortingExample2 > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SortingExample2.out OUTPUT SortingExample2.out ) + ADD_EXECUTABLE( SortingExample3 SortingExample3.cpp) + ADD_CUSTOM_COMMAND( COMMAND SortingExample3 > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SortingExample3.out OUTPUT SortingExample3.out ) ENDIF() ADD_EXECUTABLE(staticForExample staticForExample.cpp) @@ -13,6 +25,9 @@ ADD_EXECUTABLE(unrolledForExample unrolledForExample.cpp) ADD_CUSTOM_COMMAND( COMMAND unrolledForExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/unrolledForExample.out OUTPUT unrolledForExample.out ) ADD_CUSTOM_TARGET( RunAlgorithmsExamples ALL DEPENDS + SortingExample.out + SortingExample2.out + SortingExample3.out ParallelForExample.out unrolledForExample.out staticForExample.out diff --git a/Documentation/Examples/Algorithms/SortingExample.cpp b/Documentation/Examples/Algorithms/SortingExample.cpp new file mode 100644 index 
000000000..fc84b6066 --- /dev/null +++ b/Documentation/Examples/Algorithms/SortingExample.cpp @@ -0,0 +1,55 @@ +#include +#include +#include + +using namespace TNL; +using namespace TNL::Containers; +using namespace TNL::Algorithms; + +template< typename ArrayT > +void sort( ArrayT& array ) +{ + const int size = 10; + + /**** + * Fill the array with random integers. + */ + Array< int > aux_array( size ); + srand( size + 2021 ); + aux_array.forAllElements( [=] __cuda_callable__ ( int i, int& value ) { value = std::rand() % (2*size); } ); + array = aux_array; + + std::cout << "Random array: " << array << std::endl; + + /**** + * Sort the array in ascending order. + */ + ascendingSort( array ); + std::cout << "Array sorted in ascending order:" << array << std::endl; + + /*** + * Sort the array in descending order. + */ + descendingSort( array ); + std::cout << "Array sorted in descending order:" << array << std::endl; +} + +int main( int argc, char* argv[] ) +{ + /*** + * Firstly, test the sorting on CPU. + */ + std::cout << "Sorting on CPU ... " << std::endl; + Array< int, Devices::Host > host_array; + sort( host_array ); + +#ifdef HAVE_CUDA + /*** + * And then also on GPU. + */ + std::cout << "Sorting on GPU ... 
" << std::endl; + Array< int, Devices::Cuda > cuda_array; + sort( cuda_array ); +#endif + return EXIT_SUCCESS; +} diff --git a/Documentation/Examples/Algorithms/SortingExample.cu b/Documentation/Examples/Algorithms/SortingExample.cu new file mode 120000 index 000000000..ce89e14f3 --- /dev/null +++ b/Documentation/Examples/Algorithms/SortingExample.cu @@ -0,0 +1 @@ +SortingExample.cpp \ No newline at end of file diff --git a/Documentation/Examples/Algorithms/SortingExample2.cpp b/Documentation/Examples/Algorithms/SortingExample2.cpp new file mode 100644 index 000000000..a45a48ef6 --- /dev/null +++ b/Documentation/Examples/Algorithms/SortingExample2.cpp @@ -0,0 +1,55 @@ +#include +#include +#include + +using namespace TNL; +using namespace TNL::Containers; +using namespace TNL::Algorithms; + +template< typename ArrayT > +void sort( ArrayT& array ) +{ + const int size = 10; + + /**** + * Fill the array with random integers. + */ + Array< int > aux_array( size ); + srand( size + 2021 ); + aux_array.forAllElements( [=] __cuda_callable__ ( int i, int& value ) { value = std::rand() % (2*size); } ); + array = aux_array; + + std::cout << "Random array: " << array << std::endl; + + /**** + * Sort the array in ascending order. + */ + sort( array, [] __cuda_callable__ ( int a, int b ) { return a < b; } ); + std::cout << "Array sorted in ascending order:" << array << std::endl; + + /*** + * Sort the array in descending order. + */ + sort( array, [] __cuda_callable__ ( int a, int b ) { return a > b; } ); + std::cout << "Array sorted in descending order:" << array << std::endl; +} + +int main( int argc, char* argv[] ) +{ + /*** + * Firstly, test the sorting on CPU. + */ + std::cout << "Sorting on CPU ... " << std::endl; + Array< int, Devices::Host > host_array; + sort( host_array ); + +#ifdef HAVE_CUDA + /*** + * And then also on GPU. + */ + std::cout << "Sorting on GPU ... 
" << std::endl; + Array< int, Devices::Cuda > cuda_array; + sort( cuda_array ); +#endif + return EXIT_SUCCESS; +} diff --git a/Documentation/Examples/Algorithms/SortingExample2.cu b/Documentation/Examples/Algorithms/SortingExample2.cu new file mode 120000 index 000000000..892bbc232 --- /dev/null +++ b/Documentation/Examples/Algorithms/SortingExample2.cu @@ -0,0 +1 @@ +SortingExample2.cpp \ No newline at end of file diff --git a/Documentation/Examples/Algorithms/SortingExample3.cpp b/Documentation/Examples/Algorithms/SortingExample3.cpp new file mode 100644 index 000000000..063cc9fc4 --- /dev/null +++ b/Documentation/Examples/Algorithms/SortingExample3.cpp @@ -0,0 +1,66 @@ +#include +#include +#include + +using namespace TNL; +using namespace TNL::Containers; +using namespace TNL::Algorithms; + +template< typename ArrayT > +void sort( ArrayT& array ) +{ + const int size = 10; + + /**** + * Fill the array with random integers. + */ + Array< int > aux_array( size ); + srand( size + 2021 ); + aux_array.forAllElements( [=] __cuda_callable__ ( int i, int& value ) { value = std::rand() % (2*size); } ); + array = aux_array; + + /*** + * Prepare second array holding elements positions. + */ + ArrayT index( size ); + index.forAllElements( [] __cuda_callable__ ( int idx, int& value ) { value = idx; } ); + std::cout << "Random array: " << array << std::endl; + std::cout << "Index array: " << index << std::endl; + + /*** + * Sort the array `array` and apply the same permutation on the array `identity`. 
+ */ + auto array_view = array.getView(); + auto index_view = index.getView(); + sort< typename ArrayT::DeviceType, // device on which the sorting will be performed + typename ArrayT::IndexType >( // type used for indexing + 0, size, // range of indexes + [=] __cuda_callable__ ( int i, int j ) -> bool { // comparison lambda function + return array_view[ i ] < array_view[ j ]; }, + [=] __cuda_callable__ ( int i, int j ) mutable { // lambda function for swapping of elements + TNL::swap( array_view[ i ], array_view[ j ] ); + TNL::swap( index_view[ i ], index_view[ j ] ); } ); + std::cout << "Sorted array: " << array << std::endl; + std::cout << "Index: " << index << std::endl; +} + +int main( int argc, char* argv[] ) +{ + /*** + * Firstly, test the sorting on CPU. + */ + // Currently this does not work on CPU. + //std::cout << "Sorting on CPU ... " << std::endl; + //Array< int, Devices::Host > host_array; + //sort( host_array ); + +#ifdef HAVE_CUDA + /*** + * And then also on GPU. + */ + std::cout << "Sorting on GPU ... " << std::endl; + Array< int, Devices::Cuda > cuda_array; + sort( cuda_array ); +#endif + return EXIT_SUCCESS; +} diff --git a/Documentation/Examples/Algorithms/SortingExample3.cu b/Documentation/Examples/Algorithms/SortingExample3.cu new file mode 120000 index 000000000..a35af15c6 --- /dev/null +++ b/Documentation/Examples/Algorithms/SortingExample3.cu @@ -0,0 +1 @@ +SortingExample3.cpp \ No newline at end of file diff --git a/Documentation/Tutorials/Sorting/tutorial_Sorting.md b/Documentation/Tutorials/Sorting/tutorial_Sorting.md new file mode 100644 index 000000000..4832077cb --- /dev/null +++ b/Documentation/Tutorials/Sorting/tutorial_Sorting.md @@ -0,0 +1,43 @@ +\page tutorial_Sorting Sorting tutorial + +[TOC] + +## Introduction + +TNL offers several different parallel algorithms for sorting of arrays (or vectors) and also sorting based on user-defined swapping. The latter is more general but also less efficient. 
+ +### Sorting of arrays and Vectors + +The sorting of arrays and vectors is accessible via the following functions: + +* \ref TNL::Algorithms::ascendingSort for sorting elements of array in ascending order, +* \ref TNL::Algorithms::descendingSort for sorting elements of array in descending order, +* \ref TNL::Algorithms::sort for sorting with user defined ordering. + +The following example demonstrates the use of ascending and descending sort. See + +\includelineno SortingExample.cpp + +Here we create an array with a random sequence of integers (lines 17-20) and then we sort the array in ascending order (line 27) and descending order (line 33). The result looks as follows: + +\include SortingExample.out + + +How to achieve the same result with user defined ordering is demonstrated by the following example: + +\includelineno SortingExample2.cpp + +The result looks as follows: + +\include SortingExample2.out + +The same way, one can sort also \ref TNL::Containers::ArrayView, \ref TNL::Containers::Vector and \ref TNL::Containers::VectorView. + +### Sorting with user-defined swapping + + +\includelineno SortingExample3.cpp + +In this example, we fill array `array` with random numbers and array `index` with numbers equal to position of an element in the array. We want to sort the array `array` and permute the `index` array the same way. See the lines 35-42. Here we call function `sort` which does not accept any array-like data structure but only range of indexes and two lambda functions. The first one defines ordering of the elements (lines 38-39) by comparing elements of array `array`. The second lambda function is responsible for elements swapping (lines 40-42). Note that we swap not only the elements of array `array` but also the elements of the `index` array.
The result looks as follows: + +\include SortingExample3.out diff --git a/Documentation/Tutorials/index.md b/Documentation/Tutorials/index.md index e35e674c8..739d609ac 100644 --- a/Documentation/Tutorials/index.md +++ b/Documentation/Tutorials/index.md @@ -7,6 +7,7 @@ 3. [Vectors](tutorial_Vectors.html) 4. [Flexible parallel reduction and scan](tutorial_ReductionAndScan.html) 5. [For loops](tutorial_ForLoops.html) -6. [Cross-device pointers](tutorial_Pointers.html) -7. [Matrices](tutorial_Matrices.html) -8. [Unstructured meshes](tutorial_Meshes.html) +6. [Sorting](tutorial_Sorting.html) +7. [Cross-device pointers](tutorial_Pointers.html) +8. [Matrices](tutorial_Matrices.html) +9. [Unstructured meshes](tutorial_Meshes.html) diff --git a/src/Benchmarks/Sorting/Measurer.h b/src/Benchmarks/Sorting/Measurer.h index 92a578b63..e8c18389f 100644 --- a/src/Benchmarks/Sorting/Measurer.h +++ b/src/Benchmarks/Sorting/Measurer.h @@ -37,7 +37,7 @@ struct Measurer Sorter::sort(view); } - if( ! Algorithms::isSorted( view ) ) + if( ! Algorithms::isAscending( view ) ) wrongAnsCnt++; } return accumulate(resAcc.begin(), resAcc.end(), 0.0) / resAcc.size(); diff --git a/src/Benchmarks/Sorting/generators.h b/src/Benchmarks/Sorting/generators.h index e117a5abe..1615c90ab 100644 --- a/src/Benchmarks/Sorting/generators.h +++ b/src/Benchmarks/Sorting/generators.h @@ -56,7 +56,7 @@ vector generateDecreasing(int size) for(int i = 0; i < size; i++) vec[i] = size - i; - + return vec; } @@ -74,7 +74,7 @@ vector generateGaussian(int size) for (int i = 0; i < size; ++i) { int value = 0; - for (int j = 0; j < 4; ++j) + for (int j = 0; j < 4; ++j) value += rand()%16384; vec[i] = value /4; diff --git a/src/TNL/Algorithms/Sort.h b/src/TNL/Algorithms/Sort.h index 0da7b4120..d80c50cc5 100644 --- a/src/TNL/Algorithms/Sort.h +++ b/src/TNL/Algorithms/Sort.h @@ -17,14 +17,85 @@ namespace TNL { namespace Algorithms { +/** + * \brief Function for sorting elements of array or vector in ascending order. 
+ * + * \tparam Array is a type of container to be sorted. It can be, for example, \ref TNL::Containers::Array, \ref TNL::Containers::ArrayView, + * \ref TNL::Containers::Vector, \ref TNL::Containers::VectorView. + * \tparam Sorter is an algorithm for sorting. It can be, \ref TNL::Algorithms::Sorting::STLSort for sorting on host and \ref TNL::Algorithms::Sorting::Quicksort + * or \ref TNL::Algorithms::Sorting::BitonicSort for sorting on CUDA GPU. + * + * \param array is an instance of array/array view/vector/vector view for sorting. + * + * \par Example + * + * \includelineno SortingExample.cpp + * + * \par Output + * + * \include SortingExample.out + * + */ +template< typename Array, + typename Sorter = typename Sorting::DefaultSorter< typename Array::DeviceType >::SorterType > +void ascendingSort( Array& array ) +{ + using ValueType = typename Array::ValueType; + Sorter::sort( array, [] __cuda_callable__ ( const ValueType& a, const ValueType& b ) { return a < b; } ); +} +/** + * \brief Function for sorting elements of array or vector in descending order. + * + * \tparam Array is a type of container to be sorted. It can be, for example, \ref TNL::Containers::Array, \ref TNL::Containers::ArrayView, + * \ref TNL::Containers::Vector, \ref TNL::Containers::VectorView. + * \tparam Sorter is an algorithm for sorting. It can be, \ref TNL::Algorithms::Sorting::STLSort for sorting on host and \ref TNL::Algorithms::Sorting::Quicksort + * or \ref TNL::Algorithms::Sorting::BitonicSort for sorting on CUDA GPU. + * + * \param array is an instance of array/array view/vector/vector view for sorting. 
+ * + * \par Example + * + * \includelineno SortingExample.cpp + * + * \par Output + * + * \include SortingExample.out + * + */ template< typename Array, typename Sorter = typename Sorting::DefaultSorter< typename Array::DeviceType >::SorterType > -void sort( Array& array ) +void descendingSort( Array& array ) { - Sorter::sort( array ); + using ValueType = typename Array::ValueType; + Sorter::sort( array, [] __cuda_callable__ ( const ValueType& a, const ValueType& b ) { return a < b; } ); } +/** + * \brief Function for sorting elements of array or vector based on a user defined comparison lambda function. + * + * \tparam Array is a type of container to be sorted. It can be, for example, \ref TNL::Containers::Array, \ref TNL::Containers::ArrayView, + * \ref TNL::Containers::Vector, \ref TNL::Containers::VectorView. + * \tparam Compare is a lambda function for comparing of two elements. It returns true if the first argument should be ordered before the second. The + * lambda function is supposed to be defined as follows (`ValueType` is type of the array elements): + * ``` + * auto compare = [] __cuda_callable__ ( const ValueType& a , const ValueType& b ) -> bool { return .... }; + * ``` + * \tparam Sorter is an algorithm for sorting. It can be, \ref TNL::Algorithms::Sorting::STLSort for sorting on host and \ref TNL::Algorithms::Sorting::Quicksort + * or \ref TNL::Algorithms::Sorting::BitonicSort for sorting on CUDA GPU. + * + * \param array is an instance of array/array view/vector/vector view for sorting. + * \param compare is an instance of the lambda function for comparison of two elements. 
+ * + * \par Example + * + * \includelineno SortingExample2.cpp + * + * \par Output + * + * \include SortingExample2.out + * + */ template< typename Array, typename Compare, typename Sorter = typename Sorting::DefaultSorter< typename Array::DeviceType >::SorterType > @@ -33,6 +104,35 @@ void sort( Array& array, const Compare& compare ) Sorter::sort( array, compare ); } +/** + * \brief Function for general sorting based on lambda functions for comparison and swapping of two elements. + * + * \tparam Device is device on which the sorting algorithms should be executed. + * \tparam Index is type used for indexing of the sorted data. + * \tparam Compare is a lambda function for comparing of two elements. It returns true if the first argument should be ordered before the second - both are given + * by indices representing their positions. The lambda function is supposed to be defined as follows: + * ``` + * auto compare = [=] __cuda_callable__ ( const Index& a , const Index& b ) -> bool { return .... }; + * ``` + * \tparam Swap is a lambda function for swapping of two elements which are ordered the wrong way. Both elements are represented by indices as well. It is supposed to + * be defined as: + * ``` + * auto swap = [=] __cuda_callable__ ( const Index& a , const Index& b ) mutable { swap( ....); }; + * ``` + * \tparam Sorter is an algorithm for sorting. It can be \ref TNL::Algorithms::Sorting::BitonicSort for sorting on CUDA GPU. Currently there is no algorithm for CPU :(. + * + * \param array is an instance of array/array view/vector/vector view for sorting. + * \param compare is an instance of the lambda function for comparison of two elements.
+ * + * \par Example + * + * \includelineno SortingExample3.cpp + * + * \par Output + * + * \include SortingExample3.out + * + */ template< typename Device, typename Index, typename Compare, @@ -40,29 +140,73 @@ template< typename Device, typename Sorter = typename Sorting::DefaultInplaceSorter< Device >::SorterType > void sort( const Index begin, const Index end, const Compare& compare, const Swap& swap ) { - Sorter::inplaceSort( begin, end, compare, swap ); + Sorter::template inplaceSort< Device, Index >( begin, end, compare, swap ); } -template -bool isSorted( const Array& arr, const Function& cmp ) +/** + * \brief Function returning true if the array elements are sorted according to the lambda function `compare`. + * + * \tparam Array is the type of array/vector. It can be, for example, \ref TNL::Containers::Array, \ref TNL::Containers::ArrayView, + * \ref TNL::Containers::Vector, \ref TNL::Containers::VectorView. + * \tparam Compare is a lambda function for comparing of two elements. It returns true if the first argument should be ordered before the second. The + * lambda function is supposed to be defined as follows (`ValueType` is type of the array elements): + * ``` + * auto compare = [] __cuda_callable__ ( const ValueType& a , const ValueType& b ) -> bool { return .... }; + * ``` + * \param arr is an instance of tested array. + * \param compare is an instance of the lambda function for elements comparison. + * + * \return true if the array is sorted according to `compare`. + * \return false if the array is NOT sorted according to `compare`. + */ +template< typename Array, typename Compare > +bool isSorted( const Array& arr, const Compare& compare ) { using Device = typename Array::DeviceType; if (arr.getSize() <= 1) return true; auto view = arr.getConstView(); - auto fetch = [=] __cuda_callable__(int i) { return ! cmp( view[ i ], view[ i - 1 ] ); }; + auto fetch = [=] __cuda_callable__(int i) { return !
compare( view[ i ], view[ i - 1 ] ); }; auto reduction = [] __cuda_callable__(bool a, bool b) { return a && b; }; return TNL::Algorithms::reduce< Device >( 1, arr.getSize(), fetch, reduction, true ); } +/** + * \brief Functions returning true if the array elements are sorted in ascending order. + * + * \tparam Array is the type of array/vector. It can be, for example, \ref TNL::Containers::Array, \ref TNL::Containers::ArrayView, + * \ref TNL::Containers::Vector, \ref TNL::Containers::VectorView. + * + * \param arr is an instance of tested array. + * + * \return true if the array is sorted in ascending order. + * \return false if the array is NOT sorted in ascending order. + */ template< typename Array > -bool isSorted( const Array& arr) +bool isAscending( const Array& arr ) { using Value = typename Array::ValueType; return isSorted( arr, [] __cuda_callable__( const Value& a, const Value& b ) { return a < b; }); } +/** + * \brief Functions returning true if the array elements are sorted in descending order. + * + * \tparam Array is the type of array/vector. It can be, for example, \ref TNL::Containers::Array, \ref TNL::Containers::ArrayView, + * \ref TNL::Containers::Vector, \ref TNL::Containers::VectorView. + * + * \param arr is an instance of tested array. + * + * \return true if the array is sorted in descending order. + * \return false if the array is NOT sorted in descending order. 
+ */ +template< typename Array > +bool isDescending( const Array& arr) +{ + using Value = typename Array::ValueType; + return isSorted( arr, [] __cuda_callable__( const Value& a, const Value& b ) { return a < b; }); +} } // namespace Algorithms } // namespace TNL diff --git a/src/TNL/Algorithms/Sorting/BitonicSort.h b/src/TNL/Algorithms/Sorting/BitonicSort.h index aefa966a0..3c2d6e89d 100644 --- a/src/TNL/Algorithms/Sorting/BitonicSort.h +++ b/src/TNL/Algorithms/Sorting/BitonicSort.h @@ -32,10 +32,13 @@ struct BitonicSort bitonicSort( array, compare ); } - template< typename Index, typename Fetch, typename Compare, typename Swap > - void static inplaceSort( const Index begin, const Index end, const Fetch& fetch, const Compare& compare, const Swap& swap ) + template< typename Device, typename Index, typename Compare, typename Swap > + void static inplaceSort( const Index begin, const Index end, const Compare& compare, const Swap& swap ) { - bitonicSort( begin, end, fetch, compare, swap ); + if( std::is_same< Device, Devices::Cuda >::value ) + bitonicSort( begin, end, compare, swap ); + else + TNL_ASSERT( false, std::cerr << "inplace bitonic sort for CPU is not implemented" << std::endl ); } }; diff --git a/src/UnitTests/Algorithms/Sorting/BitonicSortTest.h b/src/UnitTests/Algorithms/Sorting/BitonicSortTest.h index 95925c472..f6c061eed 100644 --- a/src/UnitTests/Algorithms/Sorting/BitonicSortTest.h +++ b/src/UnitTests/Algorithms/Sorting/BitonicSortTest.h @@ -32,7 +32,7 @@ TEST(permutations, allPermutationSize_2_to_7) BitonicSort::sort(view); - EXPECT_TRUE( Algorithms::isSorted( view ) ) << "failed " << i << std::endl; + EXPECT_TRUE( Algorithms::isAscending( view ) ) << "failed " << i << std::endl; } while (std::next_permutation(orig.begin(), orig.end())); } @@ -57,7 +57,7 @@ TEST(permutations, allPermutationSize_8) BitonicSort::sort(view); - EXPECT_TRUE(Algorithms::isSorted(view)) << "result " << view << std::endl; + EXPECT_TRUE(Algorithms::isAscending(view)) << 
"result " << view << std::endl; } while (std::next_permutation(orig.begin(), orig.end())); } @@ -81,7 +81,7 @@ TEST(permutations, somePermutationSize9) BitonicSort::sort(view); - EXPECT_TRUE(Algorithms::isSorted(view)) << "result " << view << std::endl; + EXPECT_TRUE(Algorithms::isAscending(view)) << "result " << view << std::endl; } while (std::next_permutation(orig.begin(), orig.end())); } @@ -92,7 +92,7 @@ TEST(selectedSize, size15) auto view = cudaArr.getView(); EXPECT_EQ(15, view.getSize()) << "size not 15" << std::endl; BitonicSort::sort(view); - EXPECT_TRUE(Algorithms::isSorted(view)) << "result " << view << std::endl; + EXPECT_TRUE(Algorithms::isAscending(view)) << "result " << view << std::endl; } TEST(multiblock, 32768_decreasingNegative) @@ -105,7 +105,7 @@ TEST(multiblock, 32768_decreasingNegative) auto view = cudaArr.getView(); BitonicSort::sort(view); - EXPECT_TRUE(Algorithms::isSorted(view)) << "result " << view << std::endl; + EXPECT_TRUE(Algorithms::isAscending(view)) << "result " << view << std::endl; } TEST(randomGenerated, smallArray_randomVal) @@ -121,7 +121,7 @@ TEST(randomGenerated, smallArray_randomVal) auto view = cudaArr.getView(); BitonicSort::sort(view); - EXPECT_TRUE(Algorithms::isSorted(view)); + EXPECT_TRUE(Algorithms::isAscending(view)); } } @@ -136,7 +136,7 @@ TEST(randomGenerated, bigArray_all0) auto view = cudaArr.getView(); BitonicSort::sort(view); - EXPECT_TRUE(Algorithms::isSorted(view)); + EXPECT_TRUE(Algorithms::isAscending(view)); } } @@ -145,7 +145,7 @@ TEST(nonIntegerType, float_notPow2) TNL::Containers::Array cudaArr{5.0, 9.4, 4.6, 8.9, 6.2, 1.15184, 2.23}; auto view = cudaArr.getView(); BitonicSort::sort(view); - EXPECT_TRUE(Algorithms::isSorted(view)) << "result " << view << std::endl; + EXPECT_TRUE(Algorithms::isAscending(view)) << "result " << view << std::endl; } TEST(nonIntegerType, double_notPow2) @@ -153,7 +153,7 @@ TEST(nonIntegerType, double_notPow2) TNL::Containers::Array cudaArr{5.0, 9.4, 4.6, 8.9, 6.2, 
1.15184, 2.23}; auto view = cudaArr.getView(); BitonicSort::sort(view); - EXPECT_TRUE(Algorithms::isSorted(view)) << "result " << view << std::endl; + EXPECT_TRUE(Algorithms::isAscending(view)) << "result " << view << std::endl; } @@ -171,7 +171,7 @@ TEST(nonIntegerType, struct) TNL::Containers::Array cudaArr{TMPSTRUCT(5), TMPSTRUCT(6), TMPSTRUCT(9), TMPSTRUCT(1)}; auto view = cudaArr.getView(); BitonicSort::sort(view); - EXPECT_TRUE(Algorithms::isSorted(view)); + EXPECT_TRUE(Algorithms::isAscending(view)); } struct TMPSTRUCT_64b{ @@ -193,7 +193,7 @@ TEST(nonIntegerType, struct_64b) TNL::Containers::Array cudaArr(vec); auto view = cudaArr.getView(); BitonicSort::sort(view); - EXPECT_TRUE(Algorithms::isSorted(view)); + EXPECT_TRUE(Algorithms::isAscending(view)); } struct TMPSTRUCT_128b{ @@ -215,7 +215,7 @@ TEST(nonIntegerType, struct_128b) TNL::Containers::Array cudaArr(vec); auto view = cudaArr.getView(); BitonicSort::sort(view); - EXPECT_TRUE(Algorithms::isSorted(view)); + EXPECT_TRUE(Algorithms::isAscending(view)); } //error bypassing @@ -232,7 +232,7 @@ TEST(sortWithFunction, descending) auto view = cudaArr.getView(); descendingSort(view); - EXPECT_FALSE(Algorithms::isSorted(view)) << "result " << view << std::endl; + EXPECT_FALSE(Algorithms::isAscending(view)) << "result " << view << std::endl; EXPECT_TRUE(view.getElement(0) == 9); EXPECT_TRUE(view.getElement(1) == 6); @@ -249,7 +249,7 @@ TEST(sortWithFunction, descending) bitonicSort(arr); - EXPECT_TRUE( TNL::Algorithms::isSorted(arr) ); + EXPECT_TRUE( TNL::Algorithms::isAscending(arr) ); }*/ /*TEST(sortRange, secondHalf) @@ -261,7 +261,7 @@ TEST(sortWithFunction, descending) bitonicSort(arr, s, 19); - EXPECT_TRUE(TNL::Algorithms::isSorted(arr.begin() + s, arr.end())); + EXPECT_TRUE(TNL::Algorithms::isAscending(arr.begin() + s, arr.end())); EXPECT_TRUE(arr[0] == -1); EXPECT_TRUE(arr[s-1] == -1); } @@ -279,7 +279,7 @@ TEST(sortRange, middle) bitonicSort(arr, s, e); - 
EXPECT_TRUE(TNL::Algorithms::isSorted(arr.begin() + s, arr.begin() + e)); + EXPECT_TRUE(TNL::Algorithms::isAscending(arr.begin() + s, arr.begin() + e)); EXPECT_TRUE(arr[0] == -1); EXPECT_TRUE(arr.back() == -1); EXPECT_TRUE(arr[s-1] == -1); @@ -300,7 +300,7 @@ TEST(sortRange, middleMultiBlock) bitonicSort(arr, s, e); - EXPECT_TRUE(TNL::Algorithms::isSorted(arr.begin() + s, arr.begin() + e)); + EXPECT_TRUE(TNL::Algorithms::isAscending(arr.begin() + s, arr.begin() + e)); EXPECT_TRUE(arr[0] == -1); EXPECT_TRUE(arr[std::rand() % s] == -1); @@ -337,7 +337,7 @@ TEST(fetchAndSwap, oneBlockSort) TNL::Containers::Array cudaArr(orig); auto view = cudaArr.getView(); fetchAndSwapSorter(view); - EXPECT_TRUE(Algorithms::isSorted(view)) << "result " << view << std::endl; + EXPECT_TRUE(Algorithms::isAscending(view)) << "result " << view << std::endl; } while (std::next_permutation(orig.begin(), orig.end())); } @@ -353,7 +353,7 @@ TEST(fetchAndSwap, typeDouble) TNL::Containers::Array cudaArr(orig); auto view = cudaArr.getView(); fetchAndSwapSorter(view); - EXPECT_TRUE(Algorithms::isSorted(view)) << "result " << view << std::endl; + EXPECT_TRUE(Algorithms::isAscending(view)) << "result " << view << std::endl; } while (std::next_permutation(orig.begin(), orig.end())); } @@ -374,7 +374,7 @@ TEST(fetchAndSwap, sortMiddle) size_t from = 3, to = 8; fetchAndSwap_sortMiddle(view, from, to); - EXPECT_TRUE(Algorithms::isSorted(view.getView(3, 8))) << "result " << view << std::endl; + EXPECT_TRUE(Algorithms::isAscending(view.getView(3, 8))) << "result " << view << std::endl; for(size_t i = 0; i < orig.size(); i++) { diff --git a/src/UnitTests/Algorithms/Sorting/QuicksortTest.h b/src/UnitTests/Algorithms/Sorting/QuicksortTest.h index 7a1333fbc..26895c823 100644 --- a/src/UnitTests/Algorithms/Sorting/QuicksortTest.h +++ b/src/UnitTests/Algorithms/Sorting/QuicksortTest.h @@ -25,7 +25,7 @@ TEST(selectedSize, size15) auto view = cudaArr.getView(); EXPECT_EQ(15, view.getSize()) << "size not 15" << 
std::endl; Quicksort::sort( view ); - EXPECT_TRUE(Algorithms::isSorted(view)) << "result " << view << std::endl; + EXPECT_TRUE(Algorithms::isAscending(view)) << "result " << view << std::endl; } TEST(multiblock, 32768_decreasingNegative) @@ -38,7 +38,7 @@ TEST(multiblock, 32768_decreasingNegative) auto view = cudaArr.getView(); Quicksort::sort( view ); - EXPECT_TRUE(Algorithms::isSorted(view)) << "result " << view << std::endl; + EXPECT_TRUE(Algorithms::isAscending(view)) << "result " << view << std::endl; } TEST(randomGenerated, smallArray_randomVal) @@ -54,7 +54,7 @@ TEST(randomGenerated, smallArray_randomVal) auto view = cudaArr.getView(); Quicksort::sort( view ); - EXPECT_TRUE(Algorithms::isSorted(view)); + EXPECT_TRUE(Algorithms::isAscending(view)); } } @@ -70,7 +70,7 @@ TEST(randomGenerated, bigArray_randomVal) auto view = cudaArr.getView(); Quicksort::sort( view ); - EXPECT_TRUE(Algorithms::isSorted(view)); + EXPECT_TRUE(Algorithms::isAscending(view)); } } @@ -169,7 +169,7 @@ TEST(types, struct_3D_points) //std::cout << view << std::endl; Quicksort::sort( view ); - EXPECT_TRUE(Algorithms::isSorted(view)); + EXPECT_TRUE(Algorithms::isAscending(view)); } struct TMPSTRUCT_64b{ @@ -195,7 +195,7 @@ TEST(types, struct_64b) //std::cout << view << std::endl; Quicksort::sort( view ); - EXPECT_TRUE(Algorithms::isSorted(view)); + EXPECT_TRUE(Algorithms::isAscending(view)); } #endif -- GitLab From de7beccf230e529470154f1a461b5eafb8e396ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Mon, 26 Jul 2021 10:31:00 +0200 Subject: [PATCH 253/258] Added bubble sort - to have CPU sorter with interface for lambda functions. 
--- .../Examples/Algorithms/CMakeLists.txt | 2 +- src/TNL/Algorithms/Sorting/BubbleSort.h | 57 ++++++++++++ src/TNL/Algorithms/Sorting/DefaultSorter.h | 7 ++ .../Algorithms/Sorting/BubbleSortTest.cpp | 1 + .../Algorithms/Sorting/BubbleSortTest.h | 93 +++++++++++++++++++ .../Algorithms/Sorting/BubbleSortTest.su | 1 + .../Algorithms/Sorting/CMakeLists.txt | 1 + 7 files changed, 161 insertions(+), 1 deletion(-) create mode 100644 src/TNL/Algorithms/Sorting/BubbleSort.h create mode 100644 src/UnitTests/Algorithms/Sorting/BubbleSortTest.cpp create mode 100644 src/UnitTests/Algorithms/Sorting/BubbleSortTest.h create mode 120000 src/UnitTests/Algorithms/Sorting/BubbleSortTest.su diff --git a/Documentation/Examples/Algorithms/CMakeLists.txt b/Documentation/Examples/Algorithms/CMakeLists.txt index 51d78e29b..5ffb91b16 100644 --- a/Documentation/Examples/Algorithms/CMakeLists.txt +++ b/Documentation/Examples/Algorithms/CMakeLists.txt @@ -9,7 +9,7 @@ IF( BUILD_CUDA ) ADD_CUSTOM_COMMAND( COMMAND SortingExample3Cuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SortingExample3.out OUTPUT SortingExample3.out ) ELSE() ADD_EXECUTABLE(ParallelForExample ParallelForExample.cpp) - ADD_CUSTOM_COMMAND( COMMAND ParallelForExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SortingExample.out OUTPUT SortingExample.out ) + ADD_CUSTOM_COMMAND( COMMAND ParallelForExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/ParallelForExample.out OUTPUT ParallelForExample.out ) ADD_EXECUTABLE( SortingExample SortingExample.cpp) ADD_CUSTOM_COMMAND( COMMAND SortingExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SortingExample.out OUTPUT SortingExample.out ) ADD_EXECUTABLE( SortingExample2 SortingExample2.cpp) diff --git a/src/TNL/Algorithms/Sorting/BubbleSort.h b/src/TNL/Algorithms/Sorting/BubbleSort.h new file mode 100644 index 000000000..29e58fe07 --- /dev/null +++ b/src/TNL/Algorithms/Sorting/BubbleSort.h @@ -0,0 +1,57 @@ 
+/*************************************************************************** + BubbleSort.h - description + ------------------- + begin : Jul 26, 2021 + copyright : (C) 2021 by Tomas Oberhuber et al. + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Tomas Oberhuber + +#pragma once + +#include + +namespace TNL { + namespace Algorithms { + namespace Sorting { + +struct BubbleSort +{ + template< typename Device, typename Index, typename Compare, typename Swap > + void static inplaceSort( const Index begin, const Index end, const Compare& compare, const Swap& swap ) + { + if( std::is_same< Device, Devices::Host >::value ) + { + Index left( begin ), right( end ); + while( left < right ) + { + int lastChange; + for( int j = left; j < right - 1; j++ ) + if( ! compare( j, j+1 ) ) + { + swap( j, j+1 ); + lastChange = j; + } + right = lastChange; + for( int j = right - 1; j >= left; j-- ) + if( ! 
compare( j, j+1 ) ) + { + swap( j, j+1 ); + lastChange = j; + } + left = lastChange + 1; + } + } + else + TNL_ASSERT( false, std::cerr << "inplace bubble sort is implemented only for CPU" << std::endl ); + } +}; + + } // namespace Sorting + } // namespace Algorithms +} //namespace TNL + + diff --git a/src/TNL/Algorithms/Sorting/DefaultSorter.h b/src/TNL/Algorithms/Sorting/DefaultSorter.h index 13863df54..4b9da10c7 100644 --- a/src/TNL/Algorithms/Sorting/DefaultSorter.h +++ b/src/TNL/Algorithms/Sorting/DefaultSorter.h @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -43,6 +44,12 @@ struct DefaultSorter< Devices::Cuda > template< typename Device > struct DefaultInplaceSorter; +template<> +struct DefaultInplaceSorter< Devices::Host > +{ + using SorterType = Algorithms::Sorting::BubbleSort; +}; + template<> struct DefaultInplaceSorter< Devices::Cuda > { diff --git a/src/UnitTests/Algorithms/Sorting/BubbleSortTest.cpp b/src/UnitTests/Algorithms/Sorting/BubbleSortTest.cpp new file mode 100644 index 000000000..2a2bc2de9 --- /dev/null +++ b/src/UnitTests/Algorithms/Sorting/BubbleSortTest.cpp @@ -0,0 +1 @@ +#include "BubbleSortTest.h" \ No newline at end of file diff --git a/src/UnitTests/Algorithms/Sorting/BubbleSortTest.h b/src/UnitTests/Algorithms/Sorting/BubbleSortTest.h new file mode 100644 index 000000000..cadc73b34 --- /dev/null +++ b/src/UnitTests/Algorithms/Sorting/BubbleSortTest.h @@ -0,0 +1,93 @@ +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#if defined HAVE_GTEST && defined HAVE_CUDA +#include + + +using namespace TNL; +using namespace TNL::Algorithms; +using namespace TNL::Algorithms::Sorting; + +template +void fetchAndSwapSorter( TNL::Containers::ArrayView< TYPE, TNL::Devices::Host > view) +{ + auto Cmp = [=]__cuda_callable__(const int i, const int j ){return view[ i ] < view[ j ];}; + auto Swap = [=] __cuda_callable__ (int i, int j) mutable {TNL::swap(view[i], view[j]);}; + 
bubbleSort(0, view.getSize(), Cmp, Swap); +} + +TEST(fetchAndSwap, oneBlockSort) +{ + int size = 9; + const int stride = 227; + int i = 0; + + std::vector orig(size); + std::iota(orig.begin(), orig.end(), 0); + + do + { + if ((i++) % stride != 0) + continue; + + TNL::Containers::Array arr(orig); + auto view = arr.getView(); + fetchAndSwapSorter(view); + EXPECT_TRUE(Algorithms::isAscending(view)) << "result " << view << std::endl; + } + while (std::next_permutation(orig.begin(), orig.end())); +} + +TEST(fetchAndSwap, typeDouble) +{ + int size = 5; + std::vector orig(size); + std::iota(orig.begin(), orig.end(), 0); + + do + { + TNL::Containers::Array arr(orig); + auto view = arr.getView(); + fetchAndSwapSorter(view); + EXPECT_TRUE(Algorithms::isAscending(view)) << "result " << view << std::endl; + } + while (std::next_permutation(orig.begin(), orig.end())); +} + +void fetchAndSwap_sortMiddle(TNL::Containers::ArrayView view, int from, int to) +{ + //auto Fetch = [=]__cuda_callable__(int i){return view[i];}; + auto Cmp = [=]__cuda_callable__(const int i, const int j ){ return view[ i ] < view[ j ]; }; + auto Swap = [=] __cuda_callable__ (int i, int j) mutable { TNL::swap(view[i], view[j]); }; + bubbleSort(from, to, Cmp, Swap); +} + +TEST(fetchAndSwap, sortMiddle) +{ + std::vector orig{5, 9, 4, 54, 21, 6, 7, 9, 0, 9, 42, 4}; + TNL::Containers::Array arr(orig); + auto view = arr.getView(); + size_t from = 3, to = 8; + + fetchAndSwap_sortMiddle(view, from, to); + EXPECT_TRUE(Algorithms::isAscending(view.getView(3, 8))) << "result " << view << std::endl; + + for(size_t i = 0; i < orig.size(); i++) + { + if(i < from || i >= to) + EXPECT_TRUE(view.getElement(i) == orig[i]); + } +} + +#endif + +#include "../../main.h" diff --git a/src/UnitTests/Algorithms/Sorting/BubbleSortTest.su b/src/UnitTests/Algorithms/Sorting/BubbleSortTest.su new file mode 120000 index 000000000..83313710f --- /dev/null +++ b/src/UnitTests/Algorithms/Sorting/BubbleSortTest.su @@ -0,0 +1 @@ 
+BubbleSortTest.cpp \ No newline at end of file diff --git a/src/UnitTests/Algorithms/Sorting/CMakeLists.txt b/src/UnitTests/Algorithms/Sorting/CMakeLists.txt index 5dfcff323..56fca2216 100644 --- a/src/UnitTests/Algorithms/Sorting/CMakeLists.txt +++ b/src/UnitTests/Algorithms/Sorting/CMakeLists.txt @@ -1,5 +1,6 @@ set( COMMON_TESTS BitonicSortTest + BubbleSortTest QuicksortTest ) -- GitLab From b17d31faf2292ddda68107998a796b0bf91c9c45 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Mon, 26 Jul 2021 10:33:51 +0200 Subject: [PATCH 254/258] Renaming file Sort.h to sort.h. --- Documentation/Examples/Algorithms/SortingExample.cpp | 2 +- Documentation/Examples/Algorithms/SortingExample2.cpp | 2 +- Documentation/Examples/Algorithms/SortingExample3.cpp | 2 +- src/Benchmarks/Sorting/tnl-benchmark-sort.h | 2 +- src/TNL/Algorithms/{Sort.h => sort.h} | 2 +- src/UnitTests/Algorithms/Sorting/BitonicSortTest.h | 2 +- src/UnitTests/Algorithms/Sorting/BubbleSortTest.h | 2 +- src/UnitTests/Algorithms/Sorting/QuicksortTest.h | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) rename src/TNL/Algorithms/{Sort.h => sort.h} (99%) diff --git a/Documentation/Examples/Algorithms/SortingExample.cpp b/Documentation/Examples/Algorithms/SortingExample.cpp index fc84b6066..3d11787a1 100644 --- a/Documentation/Examples/Algorithms/SortingExample.cpp +++ b/Documentation/Examples/Algorithms/SortingExample.cpp @@ -1,6 +1,6 @@ #include #include -#include +#include using namespace TNL; using namespace TNL::Containers; diff --git a/Documentation/Examples/Algorithms/SortingExample2.cpp b/Documentation/Examples/Algorithms/SortingExample2.cpp index a45a48ef6..ec24ec5dc 100644 --- a/Documentation/Examples/Algorithms/SortingExample2.cpp +++ b/Documentation/Examples/Algorithms/SortingExample2.cpp @@ -1,6 +1,6 @@ #include #include -#include +#include using namespace TNL; using namespace TNL::Containers; diff --git a/Documentation/Examples/Algorithms/SortingExample3.cpp 
b/Documentation/Examples/Algorithms/SortingExample3.cpp index 063cc9fc4..3d9f554df 100644 --- a/Documentation/Examples/Algorithms/SortingExample3.cpp +++ b/Documentation/Examples/Algorithms/SortingExample3.cpp @@ -1,6 +1,6 @@ #include #include -#include +#include using namespace TNL; using namespace TNL::Containers; diff --git a/src/Benchmarks/Sorting/tnl-benchmark-sort.h b/src/Benchmarks/Sorting/tnl-benchmark-sort.h index a58669912..68b500a09 100644 --- a/src/Benchmarks/Sorting/tnl-benchmark-sort.h +++ b/src/Benchmarks/Sorting/tnl-benchmark-sort.h @@ -2,7 +2,7 @@ #include #include #include -#include +#include using namespace std; #include "generators.h" diff --git a/src/TNL/Algorithms/Sort.h b/src/TNL/Algorithms/sort.h similarity index 99% rename from src/TNL/Algorithms/Sort.h rename to src/TNL/Algorithms/sort.h index d80c50cc5..79279a856 100644 --- a/src/TNL/Algorithms/Sort.h +++ b/src/TNL/Algorithms/sort.h @@ -1,5 +1,5 @@ /*************************************************************************** - Sort.h - description + sort.h - description ------------------- begin : Jul 12, 2021 copyright : (C) 2021 by Tomas Oberhuber et al. 
diff --git a/src/UnitTests/Algorithms/Sorting/BitonicSortTest.h b/src/UnitTests/Algorithms/Sorting/BitonicSortTest.h index f6c061eed..5838d8f6c 100644 --- a/src/UnitTests/Algorithms/Sorting/BitonicSortTest.h +++ b/src/UnitTests/Algorithms/Sorting/BitonicSortTest.h @@ -7,7 +7,7 @@ #include #include #include -#include +#include #if defined HAVE_GTEST && defined HAVE_CUDA #include diff --git a/src/UnitTests/Algorithms/Sorting/BubbleSortTest.h b/src/UnitTests/Algorithms/Sorting/BubbleSortTest.h index cadc73b34..70196080e 100644 --- a/src/UnitTests/Algorithms/Sorting/BubbleSortTest.h +++ b/src/UnitTests/Algorithms/Sorting/BubbleSortTest.h @@ -7,7 +7,7 @@ #include #include #include -#include +#include #if defined HAVE_GTEST && defined HAVE_CUDA #include diff --git a/src/UnitTests/Algorithms/Sorting/QuicksortTest.h b/src/UnitTests/Algorithms/Sorting/QuicksortTest.h index 26895c823..b84bcb78a 100644 --- a/src/UnitTests/Algorithms/Sorting/QuicksortTest.h +++ b/src/UnitTests/Algorithms/Sorting/QuicksortTest.h @@ -6,7 +6,7 @@ #include #include -#include +#include #include #if defined HAVE_CUDA && defined HAVE_GTEST -- GitLab From 1e1505dde6e168f9651fbdb61b7047a268a98e2d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Mon, 26 Jul 2021 11:07:53 +0200 Subject: [PATCH 255/258] Fixing bubble sort. --- .../Examples/Algorithms/SortingExample3.cpp | 7 ++-- src/TNL/Algorithms/Sorting/BubbleSort.h | 39 +++++++++++-------- src/TNL/Algorithms/sort.h | 2 +- .../Algorithms/Sorting/BubbleSortTest.h | 10 +++-- 4 files changed, 33 insertions(+), 25 deletions(-) diff --git a/Documentation/Examples/Algorithms/SortingExample3.cpp b/Documentation/Examples/Algorithms/SortingExample3.cpp index 3d9f554df..4d6c33ce8 100644 --- a/Documentation/Examples/Algorithms/SortingExample3.cpp +++ b/Documentation/Examples/Algorithms/SortingExample3.cpp @@ -49,10 +49,9 @@ int main( int argc, char* argv[] ) /*** * Firstly, test the sorting on CPU. 
*/ - // Currently this does not work on CPU. - //std::cout << "Sorting on CPU ... " << std::endl; - //Array< int, Devices::Host > host_array; - //sort( host_array ); + std::cout << "Sorting on CPU ... " << std::endl; + Array< int, Devices::Host > host_array; + sort( host_array ); #ifdef HAVE_CUDA /*** diff --git a/src/TNL/Algorithms/Sorting/BubbleSort.h b/src/TNL/Algorithms/Sorting/BubbleSort.h index 29e58fe07..d101929a3 100644 --- a/src/TNL/Algorithms/Sorting/BubbleSort.h +++ b/src/TNL/Algorithms/Sorting/BubbleSort.h @@ -13,6 +13,7 @@ #pragma once #include +#include namespace TNL { namespace Algorithms { @@ -21,28 +22,34 @@ namespace TNL { struct BubbleSort { template< typename Device, typename Index, typename Compare, typename Swap > - void static inplaceSort( const Index begin, const Index end, const Compare& compare, const Swap& swap ) + void static inplaceSort( const Index begin, const Index end, Compare& compare, Swap& swap ) { if( std::is_same< Device, Devices::Host >::value ) { - Index left( begin ), right( end ); + Index left( begin ), right( end -1 ); while( left < right ) { - int lastChange; + //int lastChange( end -1 ); for( int j = left; j < right - 1; j++ ) - if( ! compare( j, j+1 ) ) - { - swap( j, j+1 ); - lastChange = j; - } - right = lastChange; - for( int j = right - 1; j >= left; j-- ) - if( ! compare( j, j+1 ) ) - { - swap( j, j+1 ); - lastChange = j; - } - left = lastChange + 1; + { + TNL_ASSERT_LT( j+1, end, "" ); + if( ! compare( j, j+1 ) ) + { + swap( j, j+1 ); + //lastChange = j; + } + } + right--; //lastChange; + for( int j = right; j >= left; j-- ) + { + TNL_ASSERT_LT( j+1, end, "" ); + if( ! 
compare( j, j+1 ) ) + { + swap( j, j+1 ); + //lastChange = j; + } + } + left++; //lastChange; } } else diff --git a/src/TNL/Algorithms/sort.h b/src/TNL/Algorithms/sort.h index 79279a856..49ccb6891 100644 --- a/src/TNL/Algorithms/sort.h +++ b/src/TNL/Algorithms/sort.h @@ -138,7 +138,7 @@ template< typename Device, typename Compare, typename Swap, typename Sorter = typename Sorting::DefaultInplaceSorter< Device >::SorterType > -void sort( const Index begin, const Index end, const Compare& compare, const Swap& swap ) +void sort( const Index begin, const Index end, Compare&& compare, Swap&& swap ) { Sorter::template inplaceSort< Device, Index >( begin, end, compare, swap ); } diff --git a/src/UnitTests/Algorithms/Sorting/BubbleSortTest.h b/src/UnitTests/Algorithms/Sorting/BubbleSortTest.h index 70196080e..5e527794e 100644 --- a/src/UnitTests/Algorithms/Sorting/BubbleSortTest.h +++ b/src/UnitTests/Algorithms/Sorting/BubbleSortTest.h @@ -9,7 +9,7 @@ #include #include -#if defined HAVE_GTEST && defined HAVE_CUDA +#if defined HAVE_GTEST #include @@ -22,7 +22,7 @@ void fetchAndSwapSorter( TNL::Containers::ArrayView< TYPE, TNL::Devices::Host > { auto Cmp = [=]__cuda_callable__(const int i, const int j ){return view[ i ] < view[ j ];}; auto Swap = [=] __cuda_callable__ (int i, int j) mutable {TNL::swap(view[i], view[j]);}; - bubbleSort(0, view.getSize(), Cmp, Swap); + BubbleSort::inplaceSort< TNL::Devices::Host >(0, view.getSize(), Cmp, Swap); } TEST(fetchAndSwap, oneBlockSort) @@ -68,7 +68,7 @@ void fetchAndSwap_sortMiddle(TNL::Containers::ArrayView //auto Fetch = [=]__cuda_callable__(int i){return view[i];}; auto Cmp = [=]__cuda_callable__(const int i, const int j ){ return view[ i ] < view[ j ]; }; auto Swap = [=] __cuda_callable__ (int i, int j) mutable { TNL::swap(view[i], view[j]); }; - bubbleSort(from, to, Cmp, Swap); + BubbleSort::inplaceSort< TNL::Devices::Host >(from, to, Cmp, Swap); } TEST(fetchAndSwap, sortMiddle) @@ -83,8 +83,10 @@ TEST(fetchAndSwap, sortMiddle) 
for(size_t i = 0; i < orig.size(); i++) { - if(i < from || i >= to) + if( i < from || i >= to ) + { EXPECT_TRUE(view.getElement(i) == orig[i]); + } } } -- GitLab From edc7392b311f4134eb7919eb7aa7920c23104f28 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Mon, 26 Jul 2021 12:24:15 +0200 Subject: [PATCH 256/258] Changing parameters of sort functions to make overriding of the default sorter easier. --- src/TNL/Algorithms/sort.h | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/src/TNL/Algorithms/sort.h b/src/TNL/Algorithms/sort.h index 49ccb6891..f65201333 100644 --- a/src/TNL/Algorithms/sort.h +++ b/src/TNL/Algorithms/sort.h @@ -26,6 +26,7 @@ namespace TNL { * or \ref TNL::Algorithms::Sorting::BitonicSort for sorting on CUDA GPU. * * \param array is an instance of array/array view/vector/vector view for sorting. + * \param sorter is an instance of sorter. * * \par Example * @@ -38,10 +39,10 @@ namespace TNL { */ template< typename Array, typename Sorter = typename Sorting::DefaultSorter< typename Array::DeviceType >::SorterType > -void ascendingSort( Array& array ) +void ascendingSort( Array& array, const Sorter& sorter = Sorter{} ) { using ValueType = typename Array::ValueType; - Sorter::sort( array, [] __cuda_callable__ ( const ValueType& a, const ValueType& b ) { return a < b; } ); + sorter.sort( array, [] __cuda_callable__ ( const ValueType& a, const ValueType& b ) { return a < b; } ); } /** @@ -53,6 +54,7 @@ void ascendingSort( Array& array ) * or \ref TNL::Algorithms::Sorting::BitonicSort for sorting on CUDA GPU. * * \param array is an instance of array/array view/vector/vector view for sorting. + * \param sorter is an instance of sorter. 
* * \par Example * @@ -65,10 +67,10 @@ void ascendingSort( Array& array ) */ template< typename Array, typename Sorter = typename Sorting::DefaultSorter< typename Array::DeviceType >::SorterType > -void descendingSort( Array& array ) +void descendingSort( Array& array, const Sorter& sorter = Sorter{} ) { using ValueType = typename Array::ValueType; - Sorter::sort( array, [] __cuda_callable__ ( const ValueType& a, const ValueType& b ) { return a < b; } ); + sorter.sort( array, [] __cuda_callable__ ( const ValueType& a, const ValueType& b ) { return a < b; } ); } /** @@ -86,6 +88,7 @@ void descendingSort( Array& array ) * * \param array is an instance of array/array view/vector/vector view for sorting. * \param compare is an instance of the lambda function for comparison of two elements. + * \param sorter is an instance of sorter. * * \par Example * @@ -99,9 +102,9 @@ void descendingSort( Array& array ) template< typename Array, typename Compare, typename Sorter = typename Sorting::DefaultSorter< typename Array::DeviceType >::SorterType > -void sort( Array& array, const Compare& compare ) +void sort( Array& array, const Compare& compare, const Sorter& sorter = Sorter{} ) { - Sorter::sort( array, compare ); + sorter.sort( array, compare ); } /** @@ -123,6 +126,7 @@ void sort( Array& array, const Compare& compare ) * * \param array is an instance of array/array view/vector/vector view for sorting. * \param compare is an instance of the lambda function for comparison of two elements. + * \param sorter is an instance of sorter. 
* * \par Example * @@ -138,9 +142,9 @@ template< typename Device, typename Compare, typename Swap, typename Sorter = typename Sorting::DefaultInplaceSorter< Device >::SorterType > -void sort( const Index begin, const Index end, Compare&& compare, Swap&& swap ) +void sort( const Index begin, const Index end, Compare&& compare, Swap&& swap, const Sorter& sorter = Sorter{} ) { - Sorter::template inplaceSort< Device, Index >( begin, end, compare, swap ); + sorter.template inplaceSort< Device, Index >( begin, end, compare, swap ); } /** @@ -155,6 +159,7 @@ void sort( const Index begin, const Index end, Compare&& compare, Swap&& swap ) * ``` * \param arr is an instance of tested array. * \param compare is an instance of the lambda function for elements comparison. + * \param sorter is an instance of sorter. * * \return true if the array is sorted in ascending order. * \return false if the array is NOT sorted in ascending order. -- GitLab From 02987e4c02c0ac5cf1d51760ae3b4555233cac60 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Mon, 26 Jul 2021 12:26:09 +0200 Subject: [PATCH 257/258] Fixing isDescending function. --- src/TNL/Algorithms/sort.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/TNL/Algorithms/sort.h b/src/TNL/Algorithms/sort.h index f65201333..131561760 100644 --- a/src/TNL/Algorithms/sort.h +++ b/src/TNL/Algorithms/sort.h @@ -210,7 +210,7 @@ template< typename Array > bool isDescending( const Array& arr) { using Value = typename Array::ValueType; - return isSorted( arr, [] __cuda_callable__( const Value& a, const Value& b ) { return a < b; }); + return isSorted( arr, [] __cuda_callable__( const Value& a, const Value& b ) { return a > b; }); } } // namespace Algorithms -- GitLab From f691a84f32e29c30b2f12db4cf0c3f93d64f0830 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Mon, 26 Jul 2021 13:53:43 +0200 Subject: [PATCH 258/258] Additional fixes of BubblerSort. 
--- src/TNL/Algorithms/Sorting/BubbleSort.h | 6 ++++-- src/TNL/Algorithms/Sorting/DefaultSorter.h | 12 ++++++++++++ .../Sorting/{BubbleSortTest.su => BubbleSortTest.cu} | 0 3 files changed, 16 insertions(+), 2 deletions(-) rename src/UnitTests/Algorithms/Sorting/{BubbleSortTest.su => BubbleSortTest.cu} (100%) diff --git a/src/TNL/Algorithms/Sorting/BubbleSort.h b/src/TNL/Algorithms/Sorting/BubbleSort.h index d101929a3..cb809b737 100644 --- a/src/TNL/Algorithms/Sorting/BubbleSort.h +++ b/src/TNL/Algorithms/Sorting/BubbleSort.h @@ -14,6 +14,7 @@ #include #include +#include namespace TNL { namespace Algorithms { @@ -24,7 +25,8 @@ struct BubbleSort template< typename Device, typename Index, typename Compare, typename Swap > void static inplaceSort( const Index begin, const Index end, Compare& compare, Swap& swap ) { - if( std::is_same< Device, Devices::Host >::value ) + if( std::is_same< Device, Devices::Host >::value || + std::is_same< Device, Devices::Sequential >::value ) { Index left( begin ), right( end -1 ); while( left < right ) @@ -53,7 +55,7 @@ struct BubbleSort } } else - TNL_ASSERT( false, std::cerr << "inplace bubble sort is implemented only for CPU" << std::endl ); + throw Exceptions::NotImplementedError(); } }; diff --git a/src/TNL/Algorithms/Sorting/DefaultSorter.h b/src/TNL/Algorithms/Sorting/DefaultSorter.h index 4b9da10c7..546ea0d19 100644 --- a/src/TNL/Algorithms/Sorting/DefaultSorter.h +++ b/src/TNL/Algorithms/Sorting/DefaultSorter.h @@ -35,6 +35,12 @@ struct DefaultSorter< Devices::Host > using SorterType = Algorithms::Sorting::STLSort; }; +template<> +struct DefaultSorter< Devices::Sequential > +{ + using SorterType = Algorithms::Sorting::STLSort; +}; + template<> struct DefaultSorter< Devices::Cuda > { @@ -50,6 +56,12 @@ struct DefaultInplaceSorter< Devices::Host > using SorterType = Algorithms::Sorting::BubbleSort; }; +template<> +struct DefaultInplaceSorter< Devices::Sequential > +{ + using SorterType = Algorithms::Sorting::BubbleSort; +}; + 
template<> struct DefaultInplaceSorter< Devices::Cuda > { diff --git a/src/UnitTests/Algorithms/Sorting/BubbleSortTest.su b/src/UnitTests/Algorithms/Sorting/BubbleSortTest.cu similarity index 100% rename from src/UnitTests/Algorithms/Sorting/BubbleSortTest.su rename to src/UnitTests/Algorithms/Sorting/BubbleSortTest.cu -- GitLab