Merge branch 'master' into same_dir_optimization (047ab2da) · Commits · TNL / GPUSort

bitonicGPU/benchmark/benchmark.cu

+56 −96

Original line number	Diff line number	Diff line
		#include <string>
		#include <chrono>
		#include <iostream>
		#include <algorithm>
		#include <numeric>
		#include <iomanip>

		#include <TNL/Containers/Array.h>

		#include "../bitonicSort.h"
		#include "../../util/timer.h"

		using namespace TNL;
		using namespace TNL::Containers;
		using namespace std;

		typedef Devices::Cuda Device;

		template <class T>
		std::ostream& operator<< (std::ostream&out, std::vector<T> &arr)
		int main()
		{
		for (auto x : arr)
		std::cout << x << " ";
		return out;
		}

		struct TIMER
		srand(2021);
		for(int pow = 10; pow <= 20; pow++)
		{
		std::string s;
		std::chrono::steady_clock::time_point begin;
		double result = 0;
		bool stopped = false;
		int size =(1<< pow);

		vector<int> vec(size);
		iota(vec.begin(), vec.end(), 0);

		TIMER(const std::string &name = "")
		: s(name), begin(std::chrono::steady_clock::now()) {}
		Array<int, Device> arr;
		vector<double> resAcc;

		double stop()
		//sorted sequence
		{
		auto end = std::chrono::steady_clock::now();
		result = (std::chrono::duration_cast<std::chrono::microseconds >(end - begin).count() / 1000.);
		stopped = true;
		return result;
		}
		arr = vec;
		auto view = arr.getView();

		void printTime()
		{
		if(!stopped)
		stop();
		std::cout << ("Measured " + s + ": ") << result << " ms" << std::endl;
		TIMER t([&](double res){resAcc.push_back(res);});
		bitonicSort(view);
		}
		}

		~TIMER()
		//almost sorted sequence
		{
		if(!stopped)
		for(int i = 0; i < 3; i++)
		{
		stop();
		printTime();
		int s = rand() % (size - 3);
		std::swap(vec[s], vec[s + 1]);
		}
		}
		};


		void test1()
		{
		int size = 1<<10;
		TNL::Containers::Array<int, Device> cudaArr(size);
		cudaArr.evaluate([=] __cuda_callable__ (int i) {return i;});
		auto view = cudaArr.getView();
		arr = vec;
		auto view = arr.getView();

		{
		TIMER t("sorted sequences");
		TIMER t([&](double res){resAcc.push_back(res);});
		bitonicSort(view);
		}
		}

		void randomShuffles()
		{
		int iterations = 100;
		std::cout << iterations << " random permutations" << std::endl;
		for(int p = 13; p <= 19; ++p)
		{
		int size = 1<<p;
		std::vector<int> orig(size);
		std::iota(orig.begin(), orig.end(), 0);
		std::vector<double> results;

		for (int i = 0; i < iterations; i++)
		//decreasing sequence
		{
		std::random_shuffle(orig.begin(), orig.end());
		for(size_t i = 0; i < size; i++)
		vec[i] = -i;

		TNL::Containers::Array<int, Device> cudaArr(orig);
		auto view = cudaArr.getView();
		std::vector<int> tmp(orig.begin(), orig.end());
		arr = vec;
		auto view = arr.getView();

		{
		TIMER t("random permutation");

		//std::sort(tmp.begin(), tmp.end());
		TIMER t([&](double res){resAcc.push_back(res);});
		bitonicSort(view);

		results.push_back(t.stop());
		//t.printTime();
		}

		}
		std::cout << "average time for arrSize = 2^" << p << ": " << std::accumulate(results.begin(), results.end(), 0.)/results.size() << " ms" << std::endl;

		}
		}

		void allPermutations(std::vector<int> orig)
		//random sequence
		{
		std::vector<double> results;
		while (std::next_permutation(orig.begin(), orig.end()))
		{
		TNL::Containers::Array<int, Device> cudaArr(orig);
		auto view = cudaArr.getView();
		random_shuffle(vec.begin(), vec.end());

		arr = vec;
		auto view = arr.getView();

		{
		TIMER t("random permutation");
		TIMER t([&](double res){resAcc.push_back(res);});
		bitonicSort(view);
		results.push_back(t.stop());
		//t.printTime();
		}
		}
		std::cout << "average time: " << std::accumulate(results.begin(), results.end(), 0.)/results.size() << " ms" << std::endl;
		}


		int main()
		{
		randomShuffles();
		cout << "2^" << pow << " = ";
		cout << fixed;
		cout << setprecision(3);
		cout << (accumulate(resAcc.begin(), resAcc.end(), 0.0) / resAcc.size()) << " ms" << endl;
		}

		return 0;
		}
		No newline at end of file

bitonicGPU/unitTests/unitTests.cu

+27 −56

Original line number	Diff line number	Diff line
		@@ -7,27 +7,19 @@
		#include <TNL/Containers/Array.h>
		#include <TNL/Algorithms/MemoryOperations.h>
		#include "../bitonicSort.h"

		template <typename Value>
		bool is_sorted(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> arr)
		{
		std::vector<Value> tmp(arr.getSize());
		TNL::Algorithms::MultiDeviceMemoryOperations<TNL::Devices::Host, TNL::Devices::Cuda >::copy(tmp.data(), arr.getData(), arr.getSize());

		return std::is_sorted(tmp.begin(), tmp.end());
		}
		#include "../../util/algorithm.h"

		//----------------------------------------------------------------------------------

		TEST(permutations, allPermutationSize_3_to_7)
		TEST(permutations, allPermutationSize_1_to_8)
		{
		for(int i = 3; i<=7; i++ )
		for(int i = 2; i<=8; i++ )
		{
		int size = i;
		std::vector<int> orig(size);
		std::iota(orig.begin(), orig.end(), 0);

		while (std::next_permutation(orig.begin(), orig.end()))
		do
		{
		TNL::Containers::Array<int, TNL::Devices::Cuda> cudaArr(orig);
		auto view = cudaArr.getView();
		@@ -36,29 +28,7 @@ TEST(permutations, allPermutationSize_3_to_7)

		ASSERT_TRUE(is_sorted(view)) << "failed " << i << std::endl;
		}
		}
		}

		TEST(permutations, somePermutationSize8)
		{
		int size = 8;
		const int stride = 23;
		int i = 0;

		std::vector<int> orig(size);
		std::iota(orig.begin(), orig.end(), 0);

		while (std::next_permutation(orig.begin(), orig.end()))
		{
		if ((i++) % stride != 0)
		continue;

		TNL::Containers::Array<int, TNL::Devices::Cuda> cudaArr(orig);
		auto view = cudaArr.getView();

		bitonicSort(view);

		ASSERT_TRUE(is_sorted(view)) << "result " << view << std::endl;
		while (std::next_permutation(orig.begin(), orig.end()));
		}
		}

		@@ -71,7 +41,7 @@ TEST(permutations, somePermutationSize9)
		std::vector<int> orig(size);
		std::iota(orig.begin(), orig.end(), 0);

		while (std::next_permutation(orig.begin(), orig.end()))
		do
		{
		if ((i++) % stride != 0)
		continue;
		@@ -83,35 +53,43 @@ TEST(permutations, somePermutationSize9)

		ASSERT_TRUE(is_sorted(view)) << "result " << view << std::endl;
		}
		while (std::next_permutation(orig.begin(), orig.end()));
		}

		//-----------------------------------------------------------------------

		TEST(selectedSize, size15)
		{
		TNL::Containers::Array<int, TNL::Devices::Cuda> cudaArr{5, 9, 4, 8, 6, 1, 2, 3, 4, 8, 1, 6, 9, 4, 9};
		auto view = cudaArr.getView();
		ASSERT_EQ(15, view.getSize());
		ASSERT_EQ(15, view.getSize()) << "size not 15" << std::endl;
		bitonicSort(view);
		ASSERT_TRUE(is_sorted(view)) << "result " << view << std::endl;
		}

		TEST(multiblock, 32768_decreasingNegative)
		{
		TNL::Containers::Array<int, TNL::Devices::Cuda> cudaArr(1 << 15);
		for (int i = 0; i < cudaArr.getSize(); i++)
		cudaArr.setElement(i, -i);
		std::vector<int> arr(1<<15);
		for (size_t i = 0; i < arr.size(); i++)
		arr[i] = -i;

		TNL::Containers::Array<int, TNL::Devices::Cuda> cudaArr(arr);
		auto view = cudaArr.getView();

		bitonicSort(view);
		ASSERT_TRUE(is_sorted(view)) << "result " << view << std::endl;
		}

		TEST(randomGenerated, smallArray_randomVal)
		{
		std::srand(2006);
		for(int i = 0; i < 100; i++)
		{
		TNL::Containers::Array<int, TNL::Devices::Cuda> cudaArr(std::rand()%(1<<10));
		for (int j = 0; j < cudaArr.getSize(); j++)
		cudaArr.setElement(j, std::rand());
		std::vector<int> arr(std::rand()%(1<<10));
		for(auto & x : arr)
		x = std::rand();

		TNL::Containers::Array<int, TNL::Devices::Cuda> cudaArr(arr);

		auto view = cudaArr.getView();
		bitonicSort(view);
		@@ -121,6 +99,7 @@ TEST(randomGenerated, smallArray_randomVal)

		TEST(randomGenerated, bigArray_all0)
		{
		std::srand(304);
		for(int i = 0; i < 50; i++)
		{
		int size = (1<<20) + (std::rand()% (1<<19));
		@@ -149,31 +128,23 @@ TEST(nonIntegerType, double_notPow2)
		ASSERT_TRUE(is_sorted(view)) << "result " << view << std::endl;
		}

		/*

		struct TMPSTRUCT{
		uint8_t m_data[6];
		TMPSTRUCT(){m_data[0] = 0;}
		TMPSTRUCT(int first){m_data[0] = first;};
		bool operator <(const TMPSTRUCT& other) const { return m_data[0] < other.m_data[0];}

		bool operator ==(const TMPSTRUCT& other) const {return !(this < other) && !(other < this); }

		bool operator >=(const TMPSTRUCT& other) const {return !(*this < other); }
		bool operator >(const TMPSTRUCT& other) const {return !(*this <= other); }
		bool operator <=(const TMPSTRUCT& other) const {return (this < other) \|\| (other == this); }

		std::ostream& operator << (std::ostream & out) { return out << "{ " << m_data[0] << " }";}
		bool operator <=(const TMPSTRUCT& other) const { return m_data[0] <= other.m_data[0];}
		};

		TEST(nonIntegerType, struct)
		{

		TNL::Containers::Array<TMPSTRUCT, TNL::Devices::Cuda> cudaArr{TMPSTRUCT(5), TMPSTRUCT(6), TMPSTRUCT(9), TMPSTRUCT(1)};
		auto view = cudaArr.getView();
		bitonicSort(view);
		ASSERT_TRUE(is_sorted(view)) << "result " << view << std::endl;
		ASSERT_TRUE(is_sorted(view));
		}
		*/


		//error bypassing
		//https://mmg-gitlab.fjfi.cvut.cz/gitlab/tnl/tnl-dev/blob/fbc34f6a97c13ec865ef7969b9704533222ed408/src/UnitTests/Containers/VectorTest-8.h

util/algorithm.h

0 → 100644

+19 −0

Original line number	Diff line number	Diff line
		#pragma once
		#include <TNL/Containers/Array.h>
		#include <TNL/Algorithms/Reduction.h>

		// Tests whether every pair of neighbouring elements of `arr` satisfies `Cmp`,
		// using a TNL reduction on the Cuda device.
		//
		// arr - device array view to inspect
		// Cmp - binary predicate, evaluated as Cmp(arr[i - 1], arr[i]) for each
		//       index i in [1, arr.getSize())
		//
		// Returns true when the view holds at most one element, otherwise the
		// logical AND of the predicate over all neighbouring pairs.
		template <typename Value, typename Function>
		bool is_sorted(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> arr, const Function &Cmp)
		{
			const auto count = arr.getSize();
			if (count < 2)
				return true; // no neighbouring pair exists, nothing to compare

			// Per-index check: is element `pos` correctly ordered after its predecessor?
			auto pairInOrder = [=] __cuda_callable__(int pos) { return Cmp(arr[pos - 1], arr[pos]); };
			// Combine the per-pair verdicts; `true` is the neutral element of AND.
			auto bothHold = [] __cuda_callable__(bool lhs, bool rhs) { return lhs && rhs; };

			// Indices start at 1 so that `pos - 1` never falls below the first element.
			return TNL::Algorithms::Reduction<TNL::Devices::Cuda>::reduce(1, count, bothHold, pairInOrder, true);
		}

		// Convenience overload: tests whether `arr` is in non-decreasing order,
		// i.e. arr[i - 1] <= arr[i] for every neighbouring pair. Delegates to the
		// comparator-taking is_sorted overload.
		template <typename Value>
		bool is_sorted(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> arr)
		{
			// Equal neighbours are accepted, so duplicates do not count as disorder.
			auto nonDecreasing = [] __cuda_callable__(const Value &lhs, const Value &rhs) { return lhs <= rhs; };
			return is_sorted(arr, nonDecreasing);
		}

util/timer.h

0 → 100644

+22 −0

Original line number	Diff line number	Diff line
		#pragma once

		#include <string>
		#include <chrono>
		#include <functional>
		#include <iostream>

		// Scope-based stopwatch.
		//
		// Construction records the current time; destruction computes the elapsed
		// wall-clock time in milliseconds (measured with microsecond granularity)
		// and hands it to the callback `f`. The default callback prints the value
		// to std::cout followed by a newline.
		//
		// Notes:
		//  - The implicit copy operations are kept, so every copy of a TIMER
		//    reports independently when it is destroyed.
		//  - The callback runs inside the destructor; an exception escaping from it
		//    terminates the program, so callbacks should not throw.
		//  - NOTE(review): only host time is measured. If the timed code launches
		//    asynchronous device work, confirm it synchronizes before this object
		//    goes out of scope.
		struct TIMER
		{
			// Receiver of the measured duration in milliseconds; may be empty.
			std::function<void(double)> f;
			// Time point captured when the timer was constructed.
			std::chrono::high_resolution_clock::time_point begin;

			// func - callback invoked once, from the destructor, with the elapsed
			//        milliseconds. Passing an empty std::function disables reporting.
			TIMER(std::function<void(double)> func = [](double res){std::cout << res << std::endl;})
			: f(func), begin(std::chrono::high_resolution_clock::now()) {}

			~TIMER()
			{
				auto end = std::chrono::high_resolution_clock::now();
				// Whole microseconds divided by 1000 -> milliseconds with 3 decimals.
				double result = (std::chrono::duration_cast<std::chrono::microseconds >(end - begin).count() / 1000.);
				// Calling an empty std::function throws std::bad_function_call, which
				// would escape this (implicitly noexcept) destructor and abort.
				if (f)
					f(result);
			}
		};
		No newline at end of file