diff --git a/CMakeLists.txt b/CMakeLists.txt index d7e824a125acbb789af1cdd74811906746c1ea78..ecc55634635c3104ed6bf65d6b30bce20277de2c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -32,6 +32,7 @@ option(BUILD_TESTS "Build tests" OFF) option(BUILD_MATRIX_TESTS "Build tests for matrix formats" OFF) option(BUILD_PYTHON "Compile the Python bindings" OFF) option(BUILD_DOC "Build examples included in the documentation" OFF) +set(CUDA_SAMPLES_PATH "none" CACHE STRING "Path to CUDA Samples - it is used only for some benchmarking.") # install paths relative to the cmake's prefix set( TNL_TARGET_INCLUDE_DIRECTORY "include/TNL" ) @@ -253,6 +254,9 @@ if( ${WITH_CUDA} ) endif() set( CMAKE_EXECUTABLE_SUFFIX "${executable_suffix_backup}" ) endif() + if( NOT CUDA_SAMPLES_DIR STREQUAL "none" ) + set( CUDA_SAMPLES_FLAGS "-I${CUDA_SAMPLES_DIR} -DHAVE_CUDA_SAMPLES") + endif() endif() @@ -407,6 +411,7 @@ message( " CMAKE_SHARED_LINKER_FLAGS = ${CMAKE_SHARED_LINKER_FLAGS}" ) message( " CMAKE_SHARED_LINKER_FLAGS_DEBUG = ${CMAKE_SHARED_LINKER_FLAGS_DEBUG}" ) message( " CMAKE_SHARED_LINKER_FLAGS_RELEASE = ${CMAKE_SHARED_LINKER_FLAGS_RELEASE}" ) message( " CUDA_NVCC_FLAGS = ${CUDA_NVCC_FLAGS}" ) +message( " CUDA_SAMPLES_FLAGS = ${CUDA_SAMPLES_FLAGS}" ) message( " GMP_LIBRARIES = ${GMP_LIBRARIES}" ) if( MPI_CXX_FOUND AND ${WITH_MPI} ) message( " MPI_CXX_COMPILE_OPTIONS = ${MPI_CXX_COMPILE_OPTIONS}" ) diff --git a/Documentation/Examples/Algorithms/CMakeLists.txt b/Documentation/Examples/Algorithms/CMakeLists.txt index 294006c088765aea26ae189199f54fc49bb19ecb..5ffb91b16855d23531ad5c06dba8a66cdefd190c 100644 --- a/Documentation/Examples/Algorithms/CMakeLists.txt +++ b/Documentation/Examples/Algorithms/CMakeLists.txt @@ -1,9 +1,21 @@ IF( BUILD_CUDA ) CUDA_ADD_EXECUTABLE(ParallelForExampleCuda ParallelForExample.cu) ADD_CUSTOM_COMMAND( COMMAND ParallelForExampleCuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/ParallelForExample.out OUTPUT ParallelForExample.out ) + CUDA_ADD_EXECUTABLE( SortingExampleCuda SortingExample.cu) + ADD_CUSTOM_COMMAND( COMMAND SortingExampleCuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SortingExample.out OUTPUT SortingExample.out ) + CUDA_ADD_EXECUTABLE( SortingExample2Cuda SortingExample2.cu) + ADD_CUSTOM_COMMAND( COMMAND SortingExample2Cuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SortingExample2.out OUTPUT SortingExample2.out ) + CUDA_ADD_EXECUTABLE( SortingExample3Cuda SortingExample3.cu) + ADD_CUSTOM_COMMAND( COMMAND SortingExample3Cuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SortingExample3.out OUTPUT SortingExample3.out ) ELSE() ADD_EXECUTABLE(ParallelForExample ParallelForExample.cpp) ADD_CUSTOM_COMMAND( COMMAND ParallelForExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/ParallelForExample.out OUTPUT ParallelForExample.out ) + ADD_EXECUTABLE( SortingExample SortingExample.cpp) + ADD_CUSTOM_COMMAND( COMMAND SortingExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SortingExample.out OUTPUT SortingExample.out ) + ADD_EXECUTABLE( SortingExample2 SortingExample2.cpp) + ADD_CUSTOM_COMMAND( COMMAND SortingExample2 > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SortingExample2.out OUTPUT SortingExample2.out ) + ADD_EXECUTABLE( SortingExample3 SortingExample3.cpp) + ADD_CUSTOM_COMMAND( COMMAND SortingExample3 > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SortingExample3.out OUTPUT SortingExample3.out ) ENDIF() ADD_EXECUTABLE(staticForExample staticForExample.cpp) @@ -13,6 +25,9 @@ ADD_EXECUTABLE(unrolledForExample unrolledForExample.cpp) ADD_CUSTOM_COMMAND( COMMAND unrolledForExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/unrolledForExample.out OUTPUT unrolledForExample.out ) ADD_CUSTOM_TARGET( RunAlgorithmsExamples ALL DEPENDS + SortingExample.out + SortingExample2.out + SortingExample3.out ParallelForExample.out unrolledForExample.out staticForExample.out diff --git a/Documentation/Examples/Algorithms/SortingExample.cpp b/Documentation/Examples/Algorithms/SortingExample.cpp new file mode 100644 index 0000000000000000000000000000000000000000..3d11787a1db16e3a77e05370e70d83059bdaa8eb --- /dev/null +++ b/Documentation/Examples/Algorithms/SortingExample.cpp @@ -0,0 +1,55 @@ +#include +#include +#include + +using namespace TNL; +using namespace TNL::Containers; +using namespace TNL::Algorithms; + +template< typename ArrayT > +void sort( ArrayT& array ) +{ + const int size = 10; + + /**** + * Fill the array with random integers. + */ + Array< int > aux_array( size ); + srand( size + 2021 ); + aux_array.forAllElements( [=] __cuda_callable__ ( int i, int& value ) { value = std::rand() % (2*size); } ); + array = aux_array; + + std::cout << "Random array: " << array << std::endl; + + /**** + * Sort the array in ascending order. + */ + ascendingSort( array ); + std::cout << "Array sorted in ascending order:" << array << std::endl; + + /*** + * Sort the array in descending order. + */ + descendingSort( array ); + std::cout << "Array sorted in descending order:" << array << std::endl; +} + +int main( int argc, char* argv[] ) +{ + /*** + * Firstly, test the sorting on CPU. + */ + std::cout << "Sorting on CPU ... " << std::endl; + Array< int, Devices::Host > host_array; + sort( host_array ); + +#ifdef HAVE_CUDA + /*** + * And then also on GPU. + */ + std::cout << "Sorting on GPU ... " << std::endl; + Array< int, Devices::Cuda > cuda_array; + sort( cuda_array ); +#endif + return EXIT_SUCCESS; +} diff --git a/Documentation/Examples/Algorithms/SortingExample.cu b/Documentation/Examples/Algorithms/SortingExample.cu new file mode 120000 index 0000000000000000000000000000000000000000..ce89e14f316dc18748ab7aa07550be5ea461e704 --- /dev/null +++ b/Documentation/Examples/Algorithms/SortingExample.cu @@ -0,0 +1 @@ +SortingExample.cpp \ No newline at end of file diff --git a/Documentation/Examples/Algorithms/SortingExample2.cpp b/Documentation/Examples/Algorithms/SortingExample2.cpp new file mode 100644 index 0000000000000000000000000000000000000000..ec24ec5dcd819dff9662c881250636ff2c62088d --- /dev/null +++ b/Documentation/Examples/Algorithms/SortingExample2.cpp @@ -0,0 +1,55 @@ +#include +#include +#include + +using namespace TNL; +using namespace TNL::Containers; +using namespace TNL::Algorithms; + +template< typename ArrayT > +void sort( ArrayT& array ) +{ + const int size = 10; + + /**** + * Fill the array with random integers. + */ + Array< int > aux_array( size ); + srand( size + 2021 ); + aux_array.forAllElements( [=] __cuda_callable__ ( int i, int& value ) { value = std::rand() % (2*size); } ); + array = aux_array; + + std::cout << "Random array: " << array << std::endl; + + /**** + * Sort the array in ascending order. + */ + sort( array, [] __cuda_callable__ ( int a, int b ) { return a < b; } ); + std::cout << "Array sorted in ascending order:" << array << std::endl; + + /*** + * Sort the array in descending order. + */ + sort( array, [] __cuda_callable__ ( int a, int b ) { return a > b; } ); + std::cout << "Array sorted in descending order:" << array << std::endl; +} + +int main( int argc, char* argv[] ) +{ + /*** + * Firstly, test the sorting on CPU. + */ + std::cout << "Sorting on CPU ... " << std::endl; + Array< int, Devices::Host > host_array; + sort( host_array ); + +#ifdef HAVE_CUDA + /*** + * And then also on GPU. + */ + std::cout << "Sorting on GPU ... " << std::endl; + Array< int, Devices::Cuda > cuda_array; + sort( cuda_array ); +#endif + return EXIT_SUCCESS; +} diff --git a/Documentation/Examples/Algorithms/SortingExample2.cu b/Documentation/Examples/Algorithms/SortingExample2.cu new file mode 120000 index 0000000000000000000000000000000000000000..892bbc2328682c49bf3a491189525f79a08fde22 --- /dev/null +++ b/Documentation/Examples/Algorithms/SortingExample2.cu @@ -0,0 +1 @@ +SortingExample2.cpp \ No newline at end of file diff --git a/Documentation/Examples/Algorithms/SortingExample3.cpp b/Documentation/Examples/Algorithms/SortingExample3.cpp new file mode 100644 index 0000000000000000000000000000000000000000..4d6c33ce80033e9d71d85ffafbb9ac596617cf38 --- /dev/null +++ b/Documentation/Examples/Algorithms/SortingExample3.cpp @@ -0,0 +1,65 @@ +#include +#include +#include + +using namespace TNL; +using namespace TNL::Containers; +using namespace TNL::Algorithms; + +template< typename ArrayT > +void sort( ArrayT& array ) +{ + const int size = 10; + + /**** + * Fill the array with random integers. + */ + Array< int > aux_array( size ); + srand( size + 2021 ); + aux_array.forAllElements( [=] __cuda_callable__ ( int i, int& value ) { value = std::rand() % (2*size); } ); + array = aux_array; + + /*** + * Prepare second array holding elements positions. + */ + ArrayT index( size ); + index.forAllElements( [] __cuda_callable__ ( int idx, int& value ) { value = idx; } ); + std::cout << "Random array: " << array << std::endl; + std::cout << "Index array: " << index << std::endl; + + /*** + * Sort the array `array` and apply the same permutation on the array `identity`. + */ + auto array_view = array.getView(); + auto index_view = index.getView(); + sort< typename ArrayT::DeviceType, // device on which the sorting will be performed + typename ArrayT::IndexType >( // type used for indexing + 0, size, // range of indexes + [=] __cuda_callable__ ( int i, int j ) -> bool { // comparison lambda function + return array_view[ i ] < array_view[ j ]; }, + [=] __cuda_callable__ ( int i, int j ) mutable { // lambda function for swapping of elements + TNL::swap( array_view[ i ], array_view[ j ] ); + TNL::swap( index_view[ i ], index_view[ j ] ); } ); + std::cout << "Sorted array: " << array << std::endl; + std::cout << "Index: " << index << std::endl; +} + +int main( int argc, char* argv[] ) +{ + /*** + * Firstly, test the sorting on CPU. + */ + std::cout << "Sorting on CPU ... " << std::endl; + Array< int, Devices::Host > host_array; + sort( host_array ); + +#ifdef HAVE_CUDA + /*** + * And then also on GPU. + */ + std::cout << "Sorting on GPU ... " << std::endl; + Array< int, Devices::Cuda > cuda_array; + sort( cuda_array ); +#endif + return EXIT_SUCCESS; +} diff --git a/Documentation/Examples/Algorithms/SortingExample3.cu b/Documentation/Examples/Algorithms/SortingExample3.cu new file mode 120000 index 0000000000000000000000000000000000000000..a35af15c62abe6a82dcbd3745dbd360311b8db2d --- /dev/null +++ b/Documentation/Examples/Algorithms/SortingExample3.cu @@ -0,0 +1 @@ +SortingExample3.cpp \ No newline at end of file diff --git a/Documentation/Tutorials/Sorting/tutorial_Sorting.md b/Documentation/Tutorials/Sorting/tutorial_Sorting.md new file mode 100644 index 0000000000000000000000000000000000000000..4832077cb6b2541925423e6d58ff9345e242751a --- /dev/null +++ b/Documentation/Tutorials/Sorting/tutorial_Sorting.md @@ -0,0 +1,43 @@ +\page tutorial_Sorting Sorting tutorial + +[TOC] + +## Introduction + +TNL offers several different parallel algorithms for sorting of arrays (or vectors) and also sorting based on user defined swapping. The later is more general but also less efficient. + +### Sorting of arrays and Vectors + +The sorting of arrays and vectors is accessible via the following functions: + +* \ref TNL::Algorithms::ascendingSort for sorting elements of array in ascending order, +* \ref TNL::Algorithms::descendingSort for sorting elements of array in descending order, +* \ref TNL::Algorithms::sort for sorting with user defined ordering. + +The following example demonstrates the use of ascending and descending sort. See + +\includelineno SortingExample.cpp + +Here we create array with random sequence of integers (lines 17-20) and then we sort the array in ascending order (line 27) and descending order (line 33). The result looks as follows: + +\include SortingExample.out + + +How to achieve the same result with user defined ordering is demonstrated by the following example: + +\includelineno SortingExample2.cpp + +The result looks as follows: + +\include SortingExample2.out + +The same way, one can sort also \ref TNL::Containers::ArrayView, \ref TNL::Containers::Vector and \ref TNL::Containers::VectorView. + +### Sorting with user define swapping + + +\includelineno SortingExample3.cpp + +In this example, we fill array `array` with random numbers and array `index` with numbers equal to position of an element in the array. We want to sort the array `array` and permute the `index` array the same way. See the lines 34-38. Here we call function `sort` which does not accept any array-like data structure but only range of indexes and two lambda functions. The first one defines ordering of the elements (line 35) by comparing elements of array `array`. The second lambda function is responsible for elements swapping (lines 36-38 ). Note that we do not swap only elements of array `array` but also `index` array. The result looks as follows: + +\include SortingExample3.out diff --git a/Documentation/Tutorials/index.md b/Documentation/Tutorials/index.md index e35e674c8fb877aa86fbedae4aeae94d1efa974b..739d609acc543148ddb30aa21ee231a4527e02ee 100644 --- a/Documentation/Tutorials/index.md +++ b/Documentation/Tutorials/index.md @@ -7,6 +7,7 @@ 3. [Vectors](tutorial_Vectors.html) 4. [Flexible parallel reduction and scan](tutorial_ReductionAndScan.html) 5. [For loops](tutorial_ForLoops.html) -6. [Cross-device pointers](tutorial_Pointers.html) -7. [Matrices](tutorial_Matrices.html) -8. [Unstructured meshes](tutorial_Meshes.html) +6. [Sorting](tutorial_Sorting.html) +7. [Cross-device pointers](tutorial_Pointers.html) +8. [Matrices](tutorial_Matrices.html) +9. [Unstructured meshes](tutorial_Meshes.html) diff --git a/build b/build index 09da8de5c7c28dab27750931b68217ff79d41d6c..f8adef1a4059159e55a1107500b2bffc58f9d33c 100755 --- a/build +++ b/build @@ -39,6 +39,13 @@ BUILD_TESTS="no" BUILD_MATRIX_TESTS="no" BUILD_DOC="no" +# external dependencies +CUDA_SAMPLES_DIR=none + +if [[ x"$CUDA_SAMPLES_PATH" != "x" ]]; then + CUDA_SAMPLES_DIR=${CUDA_SAMPLES_PATH} +fi + function print_usage() { cat << EOF @@ -85,6 +92,9 @@ Options for the 'tests' and 'matrix-tests' targets: --tests-jobs=NUM Number of processes to be used for the unit tests. It is $TEST_JOBS by default. --with-coverage=yes/no Enables code coverage reports for unit tests (lcov is required). '$WITH_COVERAGE' by default. --with-system-gtest=yes/no Use GTest installed in the local system and do not download the latest version. '$WITH_SYSTEM_GTEST' by default. + +External dependencies: + --cuda-samples-path=PATH CUDA samples are used by some reference algorithms used in benchmarks. '$CUDA_SAMPLES_PATH' by default. EOF } @@ -120,6 +130,7 @@ for option in "$@"; do --with-coverage=* ) WITH_COVERAGE="${option#*=}" ;; --with-ci-flags=* ) WITH_CI_FLAGS="${option#*=}" ;; --with-system-gtest=* ) WITH_SYSTEM_GTEST="${option#*=}" ;; + --cuda-samples-path=* ) CUDA_SAMPLES_DIR="${option#*=}" ;; -* ) echo "Unknown option $option. Use --help for more information." >&2 exit 1 @@ -216,6 +227,7 @@ cmake_command=( -DBUILD_MATRIX_TESTS=${BUILD_MATRIX_TESTS} -DBUILD_PYTHON=${BUILD_PYTHON} -DBUILD_DOC=${BUILD_DOC} + -DCUDA_SAMPLES_DIR=${CUDA_SAMPLES_DIR} ) # Skip running cmake if it was already run and the cmake command is the same. diff --git a/src/Benchmarks/CMakeLists.txt b/src/Benchmarks/CMakeLists.txt index 6f3185329c8dab36b4c07bdb949b33a45595607a..4e1961b3c1b39fad7cb794fd030cfe33e2f8dd0a 100644 --- a/src/Benchmarks/CMakeLists.txt +++ b/src/Benchmarks/CMakeLists.txt @@ -5,6 +5,7 @@ add_subdirectory( SpMV ) add_subdirectory( DistSpMV ) add_subdirectory( LinearSolvers ) add_subdirectory( ODESolvers ) +add_subdirectory( Sorting ) add_subdirectory( Traversers ) set( headers diff --git a/src/Benchmarks/Sorting/CMakeLists.txt b/src/Benchmarks/Sorting/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..cbd22efc080ac15725c979a9cc24428b539f055b --- /dev/null +++ b/src/Benchmarks/Sorting/CMakeLists.txt @@ -0,0 +1,9 @@ +if( BUILD_CUDA ) + CUDA_ADD_EXECUTABLE( tnl-benchmark-sort tnl-benchmark-sort.cu OPTIONS -Xcompiler -Wno-error=switch,-Wno-error=sign-compare ${CUDA_SAMPLES_FLAGS} ) + # Source code of reference algorithms contains warning which turn into errers with CI/CD compiler flags. Therefore we use -Wno-error to turn it off. + TARGET_LINK_LIBRARIES( tnl-benchmark-sort ${CUDA_cusparse_LIBRARY} ${CUDA_cudadevrt_LIBRARY} ) +else() + ADD_EXECUTABLE( tnl-benchmark-sort tnl-benchmark-sort.cpp ) +endif() + +install( TARGETS tnl-benchmark-sort RUNTIME DESTINATION bin ) diff --git a/src/Benchmarks/Sorting/Measurer.h b/src/Benchmarks/Sorting/Measurer.h new file mode 100644 index 0000000000000000000000000000000000000000..e8c18389f5b5157edf3b176490a400099076cb2c --- /dev/null +++ b/src/Benchmarks/Sorting/Measurer.h @@ -0,0 +1,67 @@ +#pragma once + +#include +#include +#include +#include +#include +#include + +#ifdef HAVE_CUDA +#ifdef HAVE_CUDA_SAMPLES +#include "ReferenceAlgorithms/MancaQuicksort.h" +#include "ReferenceAlgorithms/NvidiaBitonicSort.h" +#endif +#include "ReferenceAlgorithms/CedermanQuicksort.h" +#include "ReferenceAlgorithms/ThrustRadixsort.h" +#endif + +#include "timer.h" + +using namespace TNL; + +template< typename Sorter > +struct Measurer +{ + template< typename Value > + static double measure( const std::vector&vec, int tries, int & wrongAnsCnt ) + { + vector resAcc; + + for(int i = 0; i < tries; i++) + { + Containers::Array arr(vec); + auto view = arr.getView(); + { + TIMER t([&](double res){resAcc.push_back(res);}); + Sorter::sort(view); + } + + if( ! Algorithms::isAscending( view ) ) + wrongAnsCnt++; + } + return accumulate(resAcc.begin(), resAcc.end(), 0.0) / resAcc.size(); + } +}; + +template<> +struct Measurer< Algorithms::Sorting::STLSort > +{ + template< typename Value > + static double measure( const std::vector&vec, int tries, int & wrongAnsCnt ) + { + vector resAcc; + + for(int i = 0; i < tries; i++) + { + Containers::Array arr(vec); + auto view = arr.getView(); + //std::vector< Value > vec2 = vec; + { + TIMER t([&](double res){resAcc.push_back(res);}); + Algorithms::Sorting::STLSort::sort( view ); + } + } + return accumulate(resAcc.begin(), resAcc.end(), 0.0) / resAcc.size(); + } +}; diff --git a/src/Benchmarks/Sorting/Quicksorter.h b/src/Benchmarks/Sorting/Quicksorter.h new file mode 100644 index 0000000000000000000000000000000000000000..d361a1ee2a66a9c8ef98d22842dbd34844399123 --- /dev/null +++ b/src/Benchmarks/Sorting/Quicksorter.h @@ -0,0 +1,10 @@ +#include "../../src/quicksort/quicksort.cuh" + +#include "../benchmarker.cpp" +#include "../measure.cu" + +template +void sorter(ArrayView arr) +{ + quicksort(arr); +} \ No newline at end of file diff --git a/src/Benchmarks/Sorting/ReferenceAlgorithms/CedermanQuicksort.h b/src/Benchmarks/Sorting/ReferenceAlgorithms/CedermanQuicksort.h new file mode 100644 index 0000000000000000000000000000000000000000..0ae220842d4399ab0bcf76060bc51675e12e3f1d --- /dev/null +++ b/src/Benchmarks/Sorting/ReferenceAlgorithms/CedermanQuicksort.h @@ -0,0 +1,1100 @@ + + +#ifndef PQSORTH +#define PQSORTH + +#ifdef _MSC_VER +#ifdef BUILDING_DLL +#define DLLEXPORT __declspec(dllexport) +#else +#define DLLEXPORT /*__declspec(dllimport)*/ +#endif +#else +#ifdef HAVE_GCCVISIBILITYPATCH +#define DLLEXPORT __attribute__((visibility("default"))) +#else +#define DLLEXPORT +#endif +#endif + +#define MAXTHREADS 256 +#define MAXBLOCKS 2048 + +/** +* The main sort function +* @param data Data to be sorted +* @param size The length of the data +* @returns 0 if successful. For non-zero values, use getErrorStr() for more information about why it failed. +*/ +int gpuqsort(unsigned int *data, unsigned int size, unsigned int blockscount = 0, unsigned int threads = 0, unsigned int sbsize = 0, unsigned int phase = 0); + + +// Keep tracks of the data blocks in phase one +template +struct BlockSize +{ + unsigned int beg; + unsigned int end; + unsigned int orgbeg; + unsigned int orgend; + element rmaxpiv; + element lmaxpiv; + element rminpiv; + element lminpiv; + + bool altered; + bool flip; + element pivot; +}; + +// Holds parameters to the kernel in phase one +template +struct Params +{ + unsigned int from; + unsigned int end; + element pivot; + unsigned int ptr; + bool last; +}; + +// Used to perform a cumulative sum between blocks. +// Unnecessary for cards with atomic operations. +// Will be removed when these becomes more common +template +struct Length +{ + element maxpiv[MAXBLOCKS]; + element minpiv[MAXBLOCKS]; + + unsigned int left[MAXBLOCKS]; + unsigned int right[MAXBLOCKS]; +}; + +// Since we have divided up the kernel in to three +// we need to remember the result of the cumulative sum +// Unnecessary for cards with atomic operations. +// Will be removed when these becomes more common +struct Hist +{ + unsigned int left[(MAXTHREADS)*MAXBLOCKS]; + unsigned int right[(MAXTHREADS)*MAXBLOCKS]; +}; + +struct LQSortParams +{ + unsigned int beg; + unsigned int end; + bool flip; + unsigned int sbsize; +}; + +template +class GPUQSort +{ + element *ddata; + element *ddata2; + struct Params *params; + struct Params *dparams; + + LQSortParams *lqparams; + LQSortParams *dlqparams; + + Hist *dhists; + Length *dlength; + Length *length; + BlockSize *workset; + + float TK, TM, MK, MM, SM, SK; + + int err; + bool init; + + bool errCheck(int e); + +public: + GPUQSort(); + ~GPUQSort(); + + int sort(element *data, unsigned int size, unsigned int blockscount = 0, unsigned int threads = 0, unsigned int sbsize = 0, unsigned int phase = 0); + const char *getErrorStr(); +}; + +#endif + +#undef THREADS + +#define THREADS blockDim.x + +extern __shared__ unsigned int sarray[]; + +#ifdef HASATOMICS +__device__ unsigned int ohtotal = 0; +#endif + +/** +* Swaps the location of two unsigned ints +* @param a This unsigned int will swap place with unsigned int b +* @param b This unsigned int will swap place with unsigned int a +*/ +//template +__device__ inline void swap(unsigned int &a, unsigned int &b) +{ + unsigned int tmp = a; + a = b; + b = tmp; +} + +/** +* Perform a bitonic sort +* @param values The unsigned ints to be sorted +* @param target Where to place the sorted unsigned int when done +* @param size The number of unsigned ints +*/ +//template +__device__ inline void bitonicSort(unsigned int *fromvalues, unsigned int *tovalues, unsigned int from, unsigned int size) +{ + unsigned int *shared = (unsigned int *)sarray; + + unsigned int coal = (from & 0xf); + size = size + coal; + from = from - coal; + + int sb = 2 << (int)(__log2f(size)); + + // Buffer data to be sorted in the shared memory + for (int i = threadIdx.x; i < size; i += THREADS) + { + shared[i] = fromvalues[i + from]; + } + + for (int i = threadIdx.x; i < coal; i += THREADS) + shared[i] = 0; + + // Pad the data + for (int i = threadIdx.x + size; i < sb; i += THREADS) + shared[i] = 0xffffffff; + + __syncthreads(); + + // Parallel bitonic sort. + for (int k = 2; k <= sb; k *= 2) + { + // Bitonic merge: + for (int j = k / 2; j > 0; j /= 2) + { + for (int tid = threadIdx.x; tid < sb; tid += THREADS) + { + unsigned int ixj = tid ^ j; + + if (ixj > tid) + { + if ((tid & k) == 0) + { + if (shared[tid] > shared[ixj]) + { + swap(shared[tid], shared[ixj]); + } + } + else + { + if (shared[tid] < shared[ixj]) + { + swap(shared[tid], shared[ixj]); + } + } + } + } + + __syncthreads(); + } + } + __syncthreads(); + + // Write back the sorted data to its correct position + for (int i = threadIdx.x; i < size; i += THREADS) + if (i >= coal) + tovalues[i + from] = shared[i]; + __syncthreads(); +} + +/** +* Perform a cumulative count on two arrays +* @param lblock Array one +* @param rblock Array two +*/ +__device__ inline void cumcount(unsigned int *lblock, unsigned int *rblock) +{ + int tx = threadIdx.x; + + int offset = 1; + + __syncthreads(); + + for (int d = THREADS >> 1; d > 0; d >>= 1) // build sum in place up the tree + { + __syncthreads(); + + if (tx < d) + { + int ai = offset * (2 * tx + 1) - 1; + int bi = offset * (2 * tx + 2) - 1; + lblock[bi] += lblock[ai]; + rblock[bi] += rblock[ai]; + } + offset *= 2; + } + __syncthreads(); + if (tx == 0) + { + lblock[THREADS] = lblock[THREADS - 1]; + rblock[THREADS] = rblock[THREADS - 1]; + lblock[THREADS - 1] = 0; + rblock[THREADS - 1] = 0; + } // clear the last unsigned int */ + __syncthreads(); + + for (int d = 1; d < THREADS; d *= 2) // traverse down tree & build scan + { + offset >>= 1; + __syncthreads(); + + if (tx < d) + { + int ai = offset * (2 * tx + 1) - 1; + int bi = offset * (2 * tx + 2) - 1; + + int t = lblock[ai]; + lblock[ai] = lblock[bi]; + lblock[bi] += t; + + t = rblock[ai]; + rblock[ai] = rblock[bi]; + rblock[bi] += t; + } + } +} + +/** +* Part One - Counts the number of unsigned ints larger or smaller than the pivot. It then +* performs a cumulative sum so that each thread knows where to write +* @param data unsigned ints to be counted +* @param params Specifies which data each thread block is responsible for +* @param hist The cumulative sum for each thread is stored here +* @param lengths The total sum for each thread block is stored here +*/ +//template +__global__ void part1(unsigned int *data, Params *params, struct Hist *hist, Length *lengths) +{ + const int tx = threadIdx.x; + + unsigned int *lblock = (unsigned int *)sarray; + unsigned int *rblock = (unsigned int *)(&lblock[(blockDim.x + 1)]); + unsigned int *minpiv = (unsigned int *)(&rblock[(blockDim.x + 1)]); + unsigned int *maxpiv = (unsigned int *)(&minpiv[blockDim.x]); + + // Where should we read? + unsigned int start = params[blockIdx.x].from; + unsigned int end = params[blockIdx.x].end; + unsigned int pivot = params[blockIdx.x].pivot; + + // Stores the max and min value of the data. Used to decide a new pivot + minpiv[tx] = data[start + tx]; + maxpiv[tx] = data[start + tx]; + + __syncthreads(); + int ll = 0; + int lr = 0; + + __syncthreads(); + + int coal = (start & 0xf); + start = start - coal; + + // Go through the data + if (tx + start < end) + { + unsigned int d = data[tx + start]; + + if (!(tx < coal)) + { + + // Counting unsigned ints smaller... + if (d < pivot) + ll++; + else + // or larger than the pivot + if (d > pivot) + lr++; + + // Store the max and min unsigned int + minpiv[tx] = min(minpiv[tx], d); + maxpiv[tx] = max(maxpiv[tx], d); + } + } + + // Go through the data + for (unsigned int i = tx + start + THREADS; i < end; i += THREADS) + { + unsigned int d = data[i]; + + // Counting unsigned ints smaller... + if (d < pivot) + ll++; + else + // or larger than the pivot + if (d > pivot) + lr++; + + // Store the max and min unsigned int + minpiv[tx] = min(minpiv[tx], d); + maxpiv[tx] = max(maxpiv[tx], d); + } + + lblock[tx] = ll; + rblock[tx] = lr; + + __syncthreads(); + + // Perform a cumulative sum + cumcount((unsigned int *)lblock, (unsigned int *)rblock); + + if (tx == 0) + { + // Decide on max and min unsigned int + for (int i = 0; i < THREADS; i++) + { + minpiv[0] = min(minpiv[0], minpiv[i]); + maxpiv[0] = max(maxpiv[0], maxpiv[i]); + } + } + __syncthreads(); + + // Store each threads part of the cumulative count + hist->left[blockIdx.x * (THREADS) + threadIdx.x] = lblock[threadIdx.x + 1]; + hist->right[blockIdx.x * (THREADS) + threadIdx.x] = rblock[threadIdx.x + 1]; + + // Store the total sum + lengths->left[blockIdx.x] = lblock[THREADS]; + lengths->right[blockIdx.x] = rblock[THREADS]; + + // Store the max and min unsigned int + lengths->minpiv[blockIdx.x] = minpiv[0]; + lengths->maxpiv[blockIdx.x] = maxpiv[0]; +} + +/** +* Part Two - Move unsigned ints to their correct position in the auxillary array +* @param data unsigned ints to be moved +* @param data2 Destination for unsigned ints +* @param params Specifies which data each thread block is responsible for +* @param hist The cumulative sum for each thread is stored here +* @param lengths The total sum for each thread block is stored here +*/ +//template +__global__ void part2(unsigned int *data, unsigned int *data2, struct Params *params, struct Hist *hist, Length *lengths) +{ + const int tx = threadIdx.x; + const int bx = blockIdx.x; + + // Each thread uses the cumulative sum to know where to write + unsigned int x = lengths->left[bx] + hist->left[bx * (THREADS) + tx] - 1; // - 1; + unsigned int y = lengths->right[bx] - hist->right[bx * (THREADS) + tx]; + + // Where should we read? + unsigned int start = params[bx].from; + unsigned int end = params[bx].end; + unsigned int pivot = params[bx].pivot; + + __syncthreads(); + + int coal = (start & 0xf); + start = start - coal; + + // Go through all the assigned data + if (tx + start < end) + { + // Reading unsigned ints... + unsigned int d = data[tx + start]; + + if (!(tx < coal)) + { + + // and writing them to auxillary array + if (d < pivot) + { + if (x > 0) + data2[x--] = d; + else + data2[x] = d; + } + else if (d > pivot) + data2[y++] = d; + } + } + + __syncthreads(); + + // Go through all the assigned data + for (unsigned int i = start + tx + THREADS; i < end; i += THREADS) + { + // Reading unsigned ints... + unsigned int d = data[i]; + + // and writing them to auxillary array + if (d < pivot) + { + if (x > 0) + data2[x--] = d; + else + data2[x] = d; + } + else if (d > pivot) + data2[y++] = d; + } + + return; +} + +/** +* Part Three - Write the pivot value +* @param data Destination for pivot +* @param params Specifies which data each thread block is responsible for +* @param hist The cumulative sum for each thread is stored here +* @param lengths The total sum for each thread block is stored here +*/ +//template +__global__ void part3(unsigned int *data, struct Params *params, struct Hist *hist, Length *lengths) +{ + const int tx = threadIdx.x; + const int bx = blockIdx.x; + + // If we are the "last" thread block that is assigned to the same data sequence + // we write the pivot between the left and right block + if (params[bx].last) + { + // Get destination position + unsigned int x = lengths->left[bx] + hist->left[bx * THREADS + THREADS - 1] + tx; + unsigned int y = lengths->right[bx] - hist->right[bx * THREADS + THREADS - 1]; + unsigned int pivot = params[bx].pivot; + + // Write the pivot values + for (; x < y; x += THREADS) + data[x] = pivot; + } +} + +/** +* The local quicksort - sorts a block of data with no inter-block synchronization +* @param adata Contains some of the blocks to be sorted and also acts as the final +* destination for sorted data +* @param adata2 Contains some of the blocks to be sorted +* @param bs List of blocks to be sorted and a pointer telling if a specific block is +* in \a adata or \a adata2 +*/ +//template +__global__ void lqsort(unsigned int *adata, unsigned int *adata2, struct LQSortParams *bs, unsigned int phase) +{ + __shared__ unsigned int lphase; + lphase = phase; + + // Shorthand for the threadid + int tx = threadIdx.x; + + // Stack pointer + __shared__ int bi; + + // Stack unsigned ints + __shared__ unsigned int beg[32]; + __shared__ unsigned int end[32]; + __shared__ bool flip[32]; + + unsigned int *lblock = (unsigned int *)sarray; + unsigned int *rblock = (unsigned int *)(&lblock[(blockDim.x + 1)]); + + // The current pivot + __shared__ unsigned int pivot; + + // The sequence to be sorted + __shared__ unsigned int from; + __shared__ unsigned int to; + + // Since we switch between the primary and the auxillary buffer, + // these variables are required to keep track on which role + // a buffer currently has + __shared__ unsigned int *data; + __shared__ unsigned int *data2; + __shared__ unsigned int sbsize; + + __shared__ unsigned int bx; + if (threadIdx.x == 0) +#ifdef HASATOMICS + bx = atomicInc(&ohtotal, 50000); +#else + bx = blockIdx.x; +#endif + + __syncthreads(); + + while (bx < gridDim.x) + { + + // Thread 0 is in charge of the stack operations + if (tx == 0) + { + // We push our first block on the stack + // This is the block given by the bs parameter + beg[0] = bs[bx].beg; + end[0] = bs[bx].end; + flip[0] = bs[bx].flip; + sbsize = bs[bx].sbsize; + + bi = 0; + } + + __syncthreads(); + + // If we were given an empty block there is no need to continue + if (end[0] == beg[0]) + return; + + // While there are items left on the stack to sort + while (bi >= 0) + { + __syncthreads(); + // Thread 0 pops a fresh sequence from the stack + if (tx == 0) + { + from = beg[bi]; + to = end[bi]; + + // Check which buffer the sequence is in + if (!flip[bi]) + { + data = adata2; + data2 = adata; + } + else + { + data = adata; + data2 = adata2; + } + } + + __syncthreads(); + + // If the sequence is smaller than SBSIZE we sort it using + // an alternative sort. Otherwise each thread would sort just one + // or two unsigned ints and that wouldn't be efficient + if ((to - from) < (sbsize - 16)) + { + // Sort it using bitonic sort. This could be changed to some other + // sorting method. Store the result in the final destination buffer + if ((to - from >= 1) && (lphase != 2)) + bitonicSort(data, adata, from, to - from); + __syncthreads(); + + // Decrement the stack pointer + if (tx == 0) + bi--; + __syncthreads(); + // and continue with the next sequence + continue; + } + + if (tx == 0) + { + // Create a new pivot for the sequence + // Try to optimize this for your input distribution + // if you have some information about it + unsigned int mip = min(min(data[from], data[to - 1]), data[(from + to) / 2]); + unsigned int map = max(max(data[from], data[to - 1]), data[(from + to) / 2]); + pivot = min(max(mip / 2 + map / 2, mip), map); + } + + unsigned int ll = 0; + unsigned int lr = 0; + + __syncthreads(); + + unsigned int coal = (from)&0xf; + + if (tx + from - coal < to) + { + unsigned int d = data[tx + from - coal]; + + if (!(tx < coal)) + { + // Counting unsigned ints that have a higher value than the pivot + if (d < pivot) + ll++; + else + // or a lower + if (d > pivot) + lr++; + } + } + + // Go through the current sequence + for (int i = from + tx + THREADS - coal; i < to; i += THREADS) + { + unsigned int d = data[i]; + + // Counting unsigned ints that have a higher value than the pivot + if (d < pivot) + ll++; + else + // or a lower + if (d > pivot) + lr++; + } + + // Store the result in a shared array so that we can calculate a + // cumulative sum + lblock[tx] = ll; + rblock[tx] = lr; + + __syncthreads(); + + // Calculate the cumulative sum + cumcount((unsigned int *)lblock, (unsigned int *)rblock); + + __syncthreads(); + + // Let thread 0 add the new resulting subsequences to the stack + if (tx == 0) + { + // The sequences are in the other buffer now + flip[bi + 1] = !flip[bi]; + flip[bi] = !flip[bi]; + + // We need to place the smallest object on top of the stack + // to ensure that we don't run out of stack space + if (lblock[THREADS] < rblock[THREADS]) + { + beg[bi + 1] = beg[bi]; + beg[bi] = to - rblock[THREADS]; + end[bi + 1] = from + lblock[THREADS]; + } + else + { + end[bi + 1] = end[bi]; + end[bi] = from + lblock[THREADS]; + beg[bi + 1] = to - rblock[THREADS]; + } + // Increment the stack pointer + bi++; + } + + __syncthreads(); + + unsigned int x = from + lblock[tx + 1] - 1; + unsigned int y = to - rblock[tx + 1]; + + coal = from & 0xf; + + if (tx + from - coal < to) + { + unsigned int d = data[tx + from - coal]; + + if (!(tx < coal)) + { + if (d < pivot) + { + if (x > 0) + data2[x--] = d; + else + data2[x] = d; + } + else if (d > pivot) + data2[y++] = d; + } + } + + // Go through the data once again + // writing it to its correct position + for (unsigned int i = from + tx + THREADS - coal; i < to; i += THREADS) + { + unsigned int d = data[i]; + + if (d < pivot) + { + if (x > 0) + data2[x--] = d; + else + data2[x] = d; + } + else if (d > pivot) + data2[y++] = d; + } + + __syncthreads(); + + // As a final step, write the pivot value between the right and left + // subsequence. Write it to the final destination since this pivot + // is always correctly sorted + for (unsigned int i = from + lblock[THREADS] + tx; i < to - rblock[THREADS]; i += THREADS) + { + adata[i] = pivot; + } + + __syncthreads(); + } +#ifdef HASATOMICS + if (threadIdx.x == 0) + bx = atomicInc(&ohtotal, 50000); + __syncthreads(); +#else + break; +#endif + } + + __syncthreads(); +} + +#include +#include +#include + +#undef THREADS +#define THREADS threads + +/** +* The main sort function +* @param data Data to be sorted +* @param size The length of the data +* @returns 0 if successful. For non-zero values, use getErrorStr() for more information about why it failed. +*/ +template +int GPUQSort::sort(element *data, unsigned int size, unsigned int blockscount, unsigned int threads, unsigned int sbsize, unsigned int phase) +{ + if (!init) + return 1; + + if (!threads || !blockscount || !sbsize) + { + threads = 1 << (int)round(log(size * TK + TM) / log(2.0)); + blockscount = 1 << (int)round(log(size * MK + MM) / log(2.0)); + sbsize = 1 << (int)round(log(size * SK + SM) / log(2.0)); + } + +#ifdef HASATOMICS + unsigned int *doh; + unsigned int oh; + + cudaGetSymbolAddress((void **)&doh, "ohtotal"); + oh = 0; + cudaMemcpy(doh, &oh, 4, cudaMemcpyHostToDevice); +#endif + + if (threads > MAXTHREADS) + return 1; + + if (blockscount > MAXBLOCKS) + return 1; + + // Copy the data to the graphics card and create an auxiallary array + ddata2 = 0; + ddata = 0; + if (!errCheck(cudaMalloc((void **)&ddata2, (size) * sizeof(element)))) + return 1; + if (!errCheck(cudaMalloc((void **)&ddata, (size) * sizeof(element)))) + return 1; + if (!errCheck(cudaMemcpy(ddata, data, size * sizeof(element), cudaMemcpyHostToDevice))) + return 1; + + // We start with a set containg only the sequence to be sorted + // This will grow as we partition the data + workset[0].beg = 0; + workset[0].end = size; + workset[0].orgbeg = 0; + workset[0].orgend = size; + workset[0].altered = false; + workset[0].flip = false; + + // Get a starting pivot + workset[0].pivot = (min(min(data[0], data[size / 2]), data[size - 1]) + max(max(data[0], data[size / 2]), data[size - 1])) / 2; + unsigned int worksize = 1; + + unsigned int blocks = blockscount / 2; + unsigned totsize = size; + unsigned int maxlength = (size / blocks) / 4; + + unsigned int iterations = 0; + bool flip = true; + + // Partition the sequences until we have enough + while (worksize < blocks) + { + unsigned int ws = totsize / blocks; + unsigned int paramsize = 0; + + // Go through the sequences we have and divide them into sections + // and assign thread blocks according to their size + for (unsigned int i = 0; i < worksize; i++) + { + if ((workset[i].end - workset[i].beg) < maxlength) + continue; + + // Larger sequences gets more thread blocks assigned to them + unsigned int blocksassigned = max((workset[i].end - workset[i].beg) / ws, 1); + for (unsigned int q = 0; q < blocksassigned; q++) + { + params[paramsize].from = workset[i].beg + ws * q; + params[paramsize].end = params[paramsize].from + ws; + params[paramsize].pivot = workset[i].pivot; + params[paramsize].ptr = i; + params[paramsize].last = false; + paramsize++; + } + params[paramsize - 1].last = true; + params[paramsize - 1].end = workset[i].end; + + workset[i].lmaxpiv = 0; + workset[i].lminpiv = 0xffffffff; + workset[i].rmaxpiv = 0; + workset[i].rminpiv = 0xffffffff; + } + + if (paramsize == 0) + break; + + // Copy the block assignment to the GPU + if (!errCheck(cudaMemcpy(dparams, params, paramsize * sizeof(Params), cudaMemcpyHostToDevice))) + return 1; + + // Do the cumulative sum + if (flip) + part1<<>>(ddata, dparams, dhists, dlength); + else + part1<<>>(ddata2, dparams, dhists, dlength); + if (!errCheck((cudaMemcpy(length, dlength, sizeof(Length), cudaMemcpyDeviceToHost)))) + return 1; + + // Do the block cumulative sum. Done on the CPU since not all cards have support for + // atomic operations yet. + for (unsigned int i = 0; i < paramsize; i++) + { + unsigned int l = length->left[i]; + unsigned int r = length->right[i]; + + length->left[i] = workset[params[i].ptr].beg; + length->right[i] = workset[params[i].ptr].end; + + workset[params[i].ptr].beg += l; + workset[params[i].ptr].end -= r; + workset[params[i].ptr].altered = true; + + workset[params[i].ptr].rmaxpiv = max(length->maxpiv[i], workset[params[i].ptr].rmaxpiv); + workset[params[i].ptr].lminpiv = min(length->minpiv[i], workset[params[i].ptr].lminpiv); + + workset[params[i].ptr].lmaxpiv = min(workset[params[i].ptr].pivot, workset[params[i].ptr].rmaxpiv); + workset[params[i].ptr].rminpiv = max(workset[params[i].ptr].pivot, workset[params[i].ptr].lminpiv); + } + + // Copy the result of the block cumulative sum to the GPU + if (!errCheck((cudaMemcpy(dlength, length, sizeof(Length), cudaMemcpyHostToDevice)))) + return 1; + + // Move the elements to their correct position + if (flip) + part2<<>>(ddata, ddata2, dparams, dhists, dlength); + else + part2<<>>(ddata2, ddata, dparams, dhists, dlength); + + // Fill in the pivot value between the left and right blocks + part3<<>>(ddata, dparams, dhists, dlength); + + flip = !flip; + + // Add the sequences resulting from the partitioning + // to set + unsigned int oldworksize = worksize; + totsize = 0; + for (unsigned int i = 0; i < oldworksize; i++) + { + if (workset[i].altered) + { + if (workset[i].beg - workset[i].orgbeg >= maxlength) + totsize += workset[i].beg - workset[i].orgbeg; + if (workset[i].orgend - workset[i].end >= maxlength) + totsize += workset[i].orgend - workset[i].end; + + workset[worksize].beg = workset[worksize].orgbeg = workset[i].orgbeg; + workset[worksize].end = workset[worksize].orgend = workset[i].beg; + workset[worksize].flip = flip; + workset[worksize].altered = false; + workset[worksize].pivot = (workset[i].lminpiv / 2 + workset[i].lmaxpiv / 2); + + worksize++; + + workset[i].orgbeg = workset[i].beg = workset[i].end; + workset[i].end = workset[i].orgend; + workset[i].flip = flip; + workset[i].pivot = (workset[i].rminpiv / 2 + workset[i].rmaxpiv / 2); + workset[i].altered = false; + } + } + iterations++; + } + + // Due to the poor scheduler on some graphics card + // we need to sort the order in which the blocks + // are sorted to avoid poor scheduling decisions + unsigned int sortblocks[MAXBLOCKS * 2]; + for (int i = 0; i < worksize; i++) + sortblocks[i] = ((workset[i].end - workset[i].beg) << (int)round(log((float)(MAXBLOCKS * 4.0f)) / log(2.0f))) + i; + std::sort(&sortblocks[0], &sortblocks[worksize]); + + if (worksize != 0) + { + // Copy the block assignments to the GPU + for (int i = 0; i < worksize; i++) + { + unsigned int q = (worksize - 1) - (sortblocks[i] & (MAXBLOCKS * 4 - 1)); + + lqparams[i].beg = workset[q].beg; + lqparams[i].end = workset[q].end; + lqparams[i].flip = workset[q].flip; + lqparams[i].sbsize = sbsize; + } + + if (!errCheck((cudaMemcpy(dlqparams, lqparams, worksize * sizeof(LQSortParams), cudaMemcpyHostToDevice)))) + return 1; + + // Run the local quicksort, the one that doesn't need inter-block synchronization + if (phase != 1) + lqsort<<>>(ddata, ddata2, dlqparams, phase); + } + + err = cudaDeviceSynchronize(); + // Free the data + if (err != cudaSuccess) + { + cudaFree(ddata); + cudaFree(ddata2); + return 1; + } + + // Copy the result back to the CPU + if (!errCheck((cudaMemcpy(data, ddata, size * sizeof(element), cudaMemcpyDeviceToHost)))) + return 1; + + cudaFree(ddata); + cudaFree(ddata2); + + return 0; +} + +template +bool GPUQSort::errCheck(int e) +{ + if (e == cudaSuccess) + return true; + + err = e; + cudaFree(ddata); + cudaFree(ddata2); + return false; +} + +template +GPUQSort::GPUQSort() : init(false), workset(0), params(0), length(0), lqparams(0), dlqparams(0), + dhists(0), dlength(0), dparams(0) +{ + cudaDeviceProp deviceProp; + cudaGetDeviceProperties(&deviceProp, 0); + if (!strcmp(deviceProp.name, "GeForce 8800 GTX")) + { + TK = 1.17125033316e-005f; + TM = 52.855721393f; + MK = 3.7480010661e-005f; + MM = 476.338308458f; + SK = 4.68500133262e-005f; + SM = 211.422885572f; + } + else if (!strcmp(deviceProp.name, "GeForce 8600 GTS")) + { + TK = 0.0f; + TM = 64.0f; + MK = 0.0000951623403898f; + MM = 476.338308458f; + SK = 0.0000321583081317f; + SM = 202.666666667f; + } + else + { + TK = 0; + TM = 128; + MK = 0; + MM = 512; + SK = 0; + SM = 512; + } + + if (cudaMallocHost((void **)&workset, MAXBLOCKS * 2 * sizeof(BlockSize)) != cudaSuccess) + return; + if (cudaMallocHost((void **)¶ms, MAXBLOCKS * sizeof(Params)) != cudaSuccess) + return; + if (cudaMallocHost((void **)&length, sizeof(Length)) != cudaSuccess) + return; + if (cudaMallocHost((void **)&lqparams, MAXBLOCKS * sizeof(LQSortParams)) != cudaSuccess) + return; + if (cudaMalloc((void **)&dlqparams, MAXBLOCKS * sizeof(LQSortParams)) != cudaSuccess) + return; + if (cudaMalloc((void **)&dhists, sizeof(Hist)) != cudaSuccess) + return; + if (cudaMalloc((void **)&dlength, sizeof(Length)) != cudaSuccess) + return; + if (cudaMalloc((void **)&dparams, MAXBLOCKS * sizeof(Params)) != cudaSuccess) + return; + + init = true; +} + +/** +* Returns the latest error message +* @returns the latest error message +*/ +template +const char *GPUQSort::getErrorStr() +{ + return cudaGetErrorString((cudaError_t)err); +} + +template +GPUQSort::~GPUQSort() +{ + cudaFreeHost(workset); + cudaFreeHost(params); + cudaFreeHost(length); + cudaFreeHost(lqparams); + cudaFree(dparams); + cudaFree(dlqparams); + cudaFree(dhists); + cudaFree(dlength); +} + +int gpuqsort(unsigned int *data, unsigned int size, unsigned int blockscount, unsigned int threads, unsigned int sbsize, unsigned int phase) +{ + GPUQSort *s = new GPUQSort(); + + if (s->sort(data, size, blockscount, threads, sbsize, phase) != 0) + { + delete s; + return 1; + } + else + { + delete s; + return 0; + } +} + +struct CedermanQuicksort +{ + static void sort( TNL::Containers::ArrayView< int, TNL::Devices::Cuda >& array ) + { + gpuqsort( ( unsigned int * ) array.getData(), ( unsigned int ) array.getSize() ); + } +}; diff --git a/src/Benchmarks/Sorting/ReferenceAlgorithms/MancaQuicksort.h b/src/Benchmarks/Sorting/ReferenceAlgorithms/MancaQuicksort.h new file mode 100644 index 0000000000000000000000000000000000000000..e7f4d20fb0bfb2a31a6bf42fd4342098edb89dda --- /dev/null +++ b/src/Benchmarks/Sorting/ReferenceAlgorithms/MancaQuicksort.h @@ -0,0 +1,1327 @@ +//defines the shared memory size +#define SHARED_LIMIT 1024 + +#define GIGA 1073741824 +/* + * division of the vector to be sorted in buckets + * the attributes of the object Block are the parameters of each bucket + */ +template +struct Block +{ + + unsigned int begin; + unsigned int end; + + unsigned int nextbegin; + unsigned int nextend; + + Type pivot; + + //max of the bucket items + Type maxPiv; + //min of the bucket items + Type minPiv; + //done indicates that a bucket has been analyzed + short done; + short select; +}; + +template +struct Partition +{ + + unsigned int ibucket; + unsigned int from; + unsigned int end; + Type pivot; +}; + +void CUDA_Quicksort(unsigned int *inData, unsigned int *outData, unsigned int dataSize, unsigned int threads, int Device, double *timer); + +void CUDA_Quicksort_64(double *inData, double *outData, unsigned int size, unsigned int threads, int Device, double *timer); + +typedef unsigned int Type; + +void test_bitonicSort(unsigned int *h_InputKey, unsigned int N, double *timer); +void test_MergeSort(unsigned int *h_SrcKey, unsigned int N, double *timer); +void test_thrustSort(Type *h_data, unsigned int N, double *timer); + +typedef unsigned int uint; + +size_t scanInclusiveShort( + uint *d_Dst, + uint *d_Src, + uint batchSize, + uint arrayLength); + +size_t scanInclusiveLarge( + uint *d_Dst, + uint *d_Src, + uint batchSize, + uint arrayLength); + +template +inline __device__ void warpScanInclusive2(Type &idata, Type &idata2, volatile Type *s_Data, volatile Type *s_Data2, uint size) +{ + + //volatile uint* s_Data2; + //s_Data2 = s_Data + blockDim.x*2; + + uint pos = 2 * threadIdx.x - (threadIdx.x & (size - 1)); + s_Data[pos] = 0; + s_Data2[pos] = 0; + pos += size; + s_Data[pos] = idata; + s_Data2[pos] = idata2; + + for (uint offset = 1; offset < size; offset <<= 1) + { + s_Data[pos] += s_Data[pos - offset]; + s_Data2[pos] += s_Data2[pos - offset]; + } + + idata = s_Data[pos]; + idata2 = s_Data2[pos]; +} + +template +inline __device__ void warpScanExclusive2(Type &idata, Type &idata2, volatile Type *s_Data, volatile Type *s_Data2, uint size) +{ + + //volatile uint* s_Data2; + //s_Data2 = s_Data + blockDim.x*2; + + uint pos = 2 * threadIdx.x - (threadIdx.x & (size - 1)); + s_Data[pos] = 0; + s_Data2[pos] = 0; + pos += size; + s_Data[pos] = idata; + s_Data2[pos] = idata2; + + for (uint offset = 1; offset < size; offset <<= 1) + { + s_Data[pos] += s_Data[pos - offset]; + s_Data2[pos] += s_Data2[pos - offset]; + } + + idata = s_Data[pos] - idata; + idata2 = s_Data2[pos] - idata2; +} + +#define LOG2_WARP_SIZE 5U +#define WARP_SIZE (1U << LOG2_WARP_SIZE) + +template +inline __device__ void scan1Inclusive2(Type &idata, Type &idata2, volatile Type *s_Data, uint size) +{ + + volatile Type *s_Data2; + s_Data2 = s_Data + blockDim.x * 2; + + if (size > WARP_SIZE) + { + + //Bottom-level inclusive warp scan + warpScanInclusive2(idata, idata2, s_Data, s_Data2, WARP_SIZE); + + //Save top Types of each warp for exclusive warp scan + //sync to wait for warp scans to complete (because s_Data is being overwritten) + __syncthreads(); + if ((threadIdx.x & (WARP_SIZE - 1)) == (WARP_SIZE - 1)) + { + s_Data[threadIdx.x >> LOG2_WARP_SIZE] = idata; + s_Data2[threadIdx.x >> LOG2_WARP_SIZE] = idata2; + } + + //wait for warp scans to complete + __syncthreads(); + if (threadIdx.x < (blockDim.x / WARP_SIZE)) + { + //grab top warp Types + Type val = s_Data[threadIdx.x]; + Type val2 = s_Data2[threadIdx.x]; + //calculate exclsive scan and write back to shared memory + warpScanExclusive2(val, val2, s_Data, s_Data2, size >> LOG2_WARP_SIZE); + s_Data[threadIdx.x] = val; + s_Data2[threadIdx.x] = val2; + } + + //return updated warp scans with exclusive scan results + __syncthreads(); + idata += s_Data[threadIdx.x >> LOG2_WARP_SIZE]; + idata2 += s_Data2[threadIdx.x >> LOG2_WARP_SIZE]; + } + else + warpScanInclusive2(idata, idata2, s_Data, s_Data2, size); +} + +template +inline __device__ void warpCompareInclusive(Type &idata, Type &idata2, volatile Type *s_Data, uint size) +{ + + volatile Type *s_Data2; + s_Data2 = s_Data + blockDim.x * 2; + uint pos = 2 * threadIdx.x - (threadIdx.x & (size - 1)); + s_Data[pos] = 0; + s_Data2[pos] = 0; + pos += size; + s_Data[pos] = idata; + s_Data2[pos] = idata2; + + for (uint offset = 1; offset < size; offset <<= 1) + { + s_Data[pos] = max(s_Data[pos], s_Data[pos - offset]); + s_Data2[pos] = min(s_Data2[pos], s_Data2[pos - offset]); + } + + idata = s_Data[pos]; + idata2 = s_Data2[pos]; +} + +template +inline __device__ void compareInclusive(Type &idata, Type &idata2, volatile Type *s_Data, uint size) +{ + + volatile Type *s_Data2; + s_Data2 = s_Data + blockDim.x * 2; + //Bottom-level inclusive warp scan + warpCompareInclusive(idata, idata2, s_Data, WARP_SIZE); + + //Save top Types of each warp for exclusive warp scan + //sync to wait for warp scans to complete (because s_Data is being overwritten) + __syncthreads(); + if ((threadIdx.x & (WARP_SIZE - 1)) == (WARP_SIZE - 1)) + { + s_Data[threadIdx.x >> LOG2_WARP_SIZE] = idata; + s_Data2[threadIdx.x >> LOG2_WARP_SIZE] = idata2; + } + + //wait for warp scans to complete + __syncthreads(); + if (threadIdx.x < (blockDim.x / WARP_SIZE)) + { + //grab top warp Types + Type val = s_Data[threadIdx.x]; + Type val2 = s_Data2[threadIdx.x]; + //calculate exclsive scan and write back to shared memory + warpCompareInclusive(val, val2, s_Data, size >> LOG2_WARP_SIZE); + s_Data[threadIdx.x] = val; + s_Data2[threadIdx.x] = val2; + } + + //return updated warp scans with exclusive scan results + __syncthreads(); + idata = max(idata, s_Data[threadIdx.x >> LOG2_WARP_SIZE]); + idata2 = min(idata2, s_Data2[threadIdx.x >> LOG2_WARP_SIZE]); +} + +#include +#include +#include <6_Advanced/scan/scan_common.h> + +//All three kernels run 512 threads per workgroup +//Must be a power of two +#define THREADBLOCK_SIZE 256 + +//////////////////////////////////////////////////////////////////////////////// +// Basic ccan codelets +//////////////////////////////////////////////////////////////////////////////// +#if (0) +//Naive inclusive scan: O(N * log2(N)) operations +//Allocate 2 * 'size' local memory, initialize the first half +//with 'size' zeros avoiding if(pos >= offset) condition evaluation +//and saving instructions +inline __device__ uint scan1Inclusive(uint idata, volatile uint *s_Data, uint size) +{ + uint pos = 2 * threadIdx.x - (threadIdx.x & (size - 1)); + s_Data[pos] = 0; + pos += size; + s_Data[pos] = idata; + + for (uint offset = 1; offset < size; offset <<= 1) + { + __syncthreads(); + uint t = s_Data[pos] + s_Data[pos - offset]; + __syncthreads(); + s_Data[pos] = t; + } + + return s_Data[pos]; +} + +inline __device__ uint scan1Exclusive(uint idata, volatile uint *s_Data, uint size) +{ + return scan1Inclusive(idata, s_Data, size) - idata; +} + +#else +#define LOG2_WARP_SIZE 5U +#define WARP_SIZE (1U << LOG2_WARP_SIZE) + +//Almost the same as naive scan1Inclusive, but doesn't need __syncthreads() +//assuming size <= WARP_SIZE +inline __device__ uint warpScanInclusive(uint idata, volatile uint *s_Data, uint size) +{ + uint pos = 2 * threadIdx.x - (threadIdx.x & (size - 1)); + s_Data[pos] = 0; + pos += size; + s_Data[pos] = idata; + + for (uint offset = 1; offset < size; offset <<= 1) + s_Data[pos] += s_Data[pos - offset]; + + return s_Data[pos]; +} + +inline __device__ uint warpScanExclusive(uint idata, volatile uint *s_Data, uint size) +{ + return warpScanInclusive(idata, s_Data, size) - idata; +} + +inline __device__ uint scan1Inclusive(uint idata, volatile uint *s_Data, uint size) +{ + if (size > WARP_SIZE) + { + //Bottom-level inclusive warp scan + uint warpResult = warpScanInclusive(idata, s_Data, WARP_SIZE); + + //Save top elements of each warp for exclusive warp scan + //sync to wait for warp scans to complete (because s_Data is being overwritten) + __syncthreads(); + if ((threadIdx.x & (WARP_SIZE - 1)) == (WARP_SIZE - 1)) + s_Data[threadIdx.x >> LOG2_WARP_SIZE] = warpResult; + + //wait for warp scans to complete + __syncthreads(); + if (threadIdx.x < (THREADBLOCK_SIZE / WARP_SIZE)) + { + //grab top warp elements + uint val = s_Data[threadIdx.x]; + //calculate exclsive scan and write back to shared memory + s_Data[threadIdx.x] = warpScanExclusive(val, s_Data, size >> LOG2_WARP_SIZE); + } + + //return updated warp scans with exclusive scan results + __syncthreads(); + return warpResult + s_Data[threadIdx.x >> LOG2_WARP_SIZE]; + } + else + { + return warpScanInclusive(idata, s_Data, size); + } +} + +inline __device__ uint scan1Exclusive(uint idata, volatile uint *s_Data, uint size) +{ + return scan1Inclusive(idata, s_Data, size) - idata; +} + +#endif + +inline __device__ uint4 scan4Inclusive(uint4 idata4, volatile uint *s_Data, uint size) +{ + //Level-0 exclusive scan + idata4.y += idata4.x; + idata4.z += idata4.y; + idata4.w += idata4.z; + + //Level-1 exclusive scan + uint oval = scan1Exclusive(idata4.w, s_Data, size / 4); + + idata4.x += oval; + idata4.y += oval; + idata4.z += oval; + idata4.w += oval; + + return idata4; +} + +//Exclusive vector scan: the array to be scanned is stored +//in local thread memory scope as uint4 +inline __device__ uint4 scan4Exclusive(uint4 idata4, volatile uint *s_Data, uint size) +{ + uint4 odata4 = scan4Inclusive(idata4, s_Data, size); + odata4.x -= idata4.x; + odata4.y -= idata4.y; + odata4.z -= idata4.z; + odata4.w -= idata4.w; + return odata4; +} + +//////////////////////////////////////////////////////////////////////////////// +// Scan kernels +//////////////////////////////////////////////////////////////////////////////// +__global__ void scanExclusiveShared( + uint4 *d_Dst, + uint4 *d_Src, + uint size) +{ + __shared__ uint s_Data[2 * THREADBLOCK_SIZE]; + + uint pos = blockIdx.x * blockDim.x + threadIdx.x; + + //Load data + uint4 idata4 = d_Src[pos]; + + //Calculate exclusive scan + uint4 odata4 = scan4Exclusive(idata4, s_Data, size); + + //Write back + d_Dst[pos] = odata4; +} + +//Exclusive scan of top elements of bottom-level scans (4 * THREADBLOCK_SIZE) +__global__ void scanExclusiveShared2( + uint *d_Buf, + uint *d_Dst, + uint *d_Src, + uint N, + uint arrayLength) +{ + __shared__ uint s_Data[2 * THREADBLOCK_SIZE]; + + //Skip loads and stores for inactive threads of last threadblock (pos >= N) + uint pos = blockIdx.x * blockDim.x + threadIdx.x; + + //Load top elements + //Convert results of bottom-level scan back to inclusive + uint idata = 0; + if (pos < N) + idata = + d_Dst[(4 * THREADBLOCK_SIZE) - 1 + (4 * THREADBLOCK_SIZE) * pos] + + d_Src[(4 * THREADBLOCK_SIZE) - 1 + (4 * THREADBLOCK_SIZE) * pos]; + + //Compute + uint odata = scan1Exclusive(idata, s_Data, arrayLength); + + //Avoid out-of-bound access + if (pos < N) + d_Buf[pos] = odata; +} + +//Final step of large-array scan: combine basic inclusive scan with exclusive scan of top elements of input arrays +__global__ void uniformUpdate( + uint4 *d_Data, + uint *d_Buffer) +{ + __shared__ uint buf; + uint pos = blockIdx.x * blockDim.x + threadIdx.x; + + if (threadIdx.x == 0) + buf = d_Buffer[blockIdx.x]; + __syncthreads(); + + uint4 data4 = d_Data[pos]; + data4.x += buf; + data4.y += buf; + data4.z += buf; + data4.w += buf; + d_Data[pos] = data4; +} + +//////////////////////////////////////////////////////////////////////////////// +// Interface function +//////////////////////////////////////////////////////////////////////////////// +//Derived as 32768 (max power-of-two gridDim.x) * 4 * THREADBLOCK_SIZE +//Due to scanExclusiveShared<<<>>>() 1D block addressing +const uint MAX_BATCH_ELEMENTS = 64 * 1048576; +const uint MIN_SHORT_ARRAY_SIZE = 4; +const uint MAX_SHORT_ARRAY_SIZE = 4 * THREADBLOCK_SIZE; +const uint MIN_LARGE_ARRAY_SIZE = 8 * THREADBLOCK_SIZE; +const uint MAX_LARGE_ARRAY_SIZE = 4 * THREADBLOCK_SIZE * THREADBLOCK_SIZE; + +//Internal exclusive scan buffer +static uint *d_Buf; + +void initScan(void) +{ + checkCudaErrors(cudaMalloc((void **)&d_Buf, (MAX_BATCH_ELEMENTS / (4 * THREADBLOCK_SIZE)) * sizeof(uint))); +} + +void closeScan(void) +{ + checkCudaErrors(cudaFree(d_Buf)); +} + +static uint factorRadix2(uint &log2L, uint L) +{ + if (!L) + { + log2L = 0; + return 0; + } + else + { + for (log2L = 0; (L & 1) == 0; L >>= 1, log2L++) + ; + return L; + } +} + +static uint iDivUp(uint dividend, uint divisor) +{ + return ((dividend % divisor) == 0) ? (dividend / divisor) : (dividend / divisor + 1); +} + +size_t scanExclusiveShort( + uint *d_Dst, + uint *d_Src, + uint batchSize, + uint arrayLength) +{ + //Check power-of-two factorization + uint log2L; + uint factorizationRemainder = factorRadix2(log2L, arrayLength); + assert(factorizationRemainder == 1); + + //Check supported size range + assert((arrayLength >= MIN_SHORT_ARRAY_SIZE) && (arrayLength <= MAX_SHORT_ARRAY_SIZE)); + + //Check total batch size limit + assert((batchSize * arrayLength) <= MAX_BATCH_ELEMENTS); + + //Check all threadblocks to be fully packed with data + assert((batchSize * arrayLength) % (4 * THREADBLOCK_SIZE) == 0); + + scanExclusiveShared<<<(batchSize * arrayLength) / (4 * THREADBLOCK_SIZE), THREADBLOCK_SIZE>>>( + (uint4 *)d_Dst, + (uint4 *)d_Src, + arrayLength); + getLastCudaError("scanExclusiveShared() execution FAILED\n"); + + return THREADBLOCK_SIZE; +} + +size_t scanExclusiveLarge( + uint *d_Dst, + uint *d_Src, + uint batchSize, + uint arrayLength) +{ + //Check power-of-two factorization + uint log2L; + uint factorizationRemainder = factorRadix2(log2L, arrayLength); + assert(factorizationRemainder == 1); + + //Check supported size range + assert((arrayLength >= MIN_LARGE_ARRAY_SIZE) && (arrayLength <= MAX_LARGE_ARRAY_SIZE)); + + //Check total batch size limit + assert((batchSize * arrayLength) <= MAX_BATCH_ELEMENTS); + + scanExclusiveShared<<<(batchSize * arrayLength) / (4 * THREADBLOCK_SIZE), THREADBLOCK_SIZE>>>( + (uint4 *)d_Dst, + (uint4 *)d_Src, + 4 * THREADBLOCK_SIZE); + getLastCudaError("scanExclusiveShared() execution FAILED\n"); + + //Not all threadblocks need to be packed with input data: + //inactive threads of highest threadblock just don't do global reads and writes + const uint blockCount2 = iDivUp((batchSize * arrayLength) / (4 * THREADBLOCK_SIZE), THREADBLOCK_SIZE); + scanExclusiveShared2<<>>( + (uint *)d_Buf, + (uint *)d_Dst, + (uint *)d_Src, + (batchSize * arrayLength) / (4 * THREADBLOCK_SIZE), + arrayLength / (4 * THREADBLOCK_SIZE)); + getLastCudaError("scanExclusiveShared2() execution FAILED\n"); + + uniformUpdate<<<(batchSize * arrayLength) / (4 * THREADBLOCK_SIZE), THREADBLOCK_SIZE>>>( + (uint4 *)d_Dst, + (uint *)d_Buf); + getLastCudaError("uniformUpdate() execution FAILED\n"); + + return THREADBLOCK_SIZE; +} + +__global__ void scanInclusiveShared2( + uint *d_Buf, + uint *d_Dst, + uint N, + uint arrayLength) +{ + __shared__ uint s_Data[2 * THREADBLOCK_SIZE]; + + //Skip loads and stores for inactive threads of last threadblock (pos >= N) + uint pos = blockIdx.x * blockDim.x + threadIdx.x; + + //Load top elements + //Convert results of bottom-level scan back to inclusive + uint idata = 0; + if (pos < N) + idata = d_Dst[(4 * THREADBLOCK_SIZE) - 1 + (4 * THREADBLOCK_SIZE) * pos]; + + //Compute + uint odata = scan1Exclusive(idata, s_Data, arrayLength); + + //Avoid out-of-bound access + if (pos < N) + d_Buf[pos] = odata; +} + +__global__ void scanInclusiveShared( + uint4 *d_Dst, + uint4 *d_Src, + uint size) +{ + __shared__ uint s_Data[2 * THREADBLOCK_SIZE]; + + uint pos = blockIdx.x * blockDim.x + threadIdx.x; + + // if(pos= MIN_SHORT_ARRAY_SIZE) && (arrayLength <= MAX_SHORT_ARRAY_SIZE) ); + + //Check total batch size limit + // assert( (batchSize * arrayLength) <= MAX_BATCH_ELEMENTS ); + + //Check all threadblocks to be fully packed with data + //assert( (batchSize * arrayLength) % (4 * THREADBLOCK_SIZE) == 0 ); + int blocks = (batchSize * arrayLength + 4 * THREADBLOCK_SIZE - 1) / (4 * THREADBLOCK_SIZE); + scanInclusiveShared<<>>( + (uint4 *)d_Dst, + (uint4 *)d_Src, + arrayLength); + getLastCudaError("scanExclusiveShared() execution FAILED\n"); + + return THREADBLOCK_SIZE; +} + +size_t scanInclusiveLarge( + uint *d_Dst, + uint *d_Src, + uint batchSize, + uint arrayLength) +{ + //Check power-of-two factorization + uint log2L; + uint factorizationRemainder = factorRadix2(log2L, arrayLength); + assert(factorizationRemainder == 1); + + //Check supported size range + //assert( (arrayLength >= MIN_LARGE_ARRAY_SIZE) && (arrayLength <= MAX_LARGE_ARRAY_SIZE) ); + + //Check total batch size limit + //assert( (batchSize * arrayLength) <= MAX_BATCH_ELEMENTS ); + + scanInclusiveShared<<<(batchSize * arrayLength) / (4 * THREADBLOCK_SIZE), THREADBLOCK_SIZE>>>( + (uint4 *)d_Dst, + (uint4 *)d_Src, + 4 * THREADBLOCK_SIZE); + getLastCudaError("scanExclusiveShared() execution FAILED\n"); + + //Not all threadblocks need to be packed with input data: + //inactive threads of highest threadblock just don't do global reads and writes + const uint blockCount2 = iDivUp((batchSize * arrayLength) / (4 * THREADBLOCK_SIZE), THREADBLOCK_SIZE); + scanInclusiveShared2<<>>( + (uint *)d_Buf, + (uint *)d_Dst, + (batchSize * arrayLength) / (4 * THREADBLOCK_SIZE), + arrayLength / (4 * THREADBLOCK_SIZE)); + getLastCudaError("scanExclusiveShared2() execution FAILED\n"); + + uniformUpdate<<<(batchSize * arrayLength) / (4 * THREADBLOCK_SIZE), THREADBLOCK_SIZE>>>( + (uint4 *)d_Dst, + (uint *)d_Buf); + getLastCudaError("uniformUpdate() execution FAILED\n"); + + return THREADBLOCK_SIZE; +} + + +#include +#include +#include +#include <6_Advanced/scan/scan_common.h> + +extern __shared__ uint sMemory[]; + +__device__ inline double atomicMax(double *address, double val) +{ + unsigned long long int *address_as_ull = (unsigned long long int *)address; + unsigned long long int assumed; + unsigned long long int old = *address_as_ull; + + assumed = old; + old = atomicCAS(address_as_ull, + assumed, + __double_as_longlong(max(val, __longlong_as_double(assumed)))); + + while (assumed != old) + { + assumed = old; + old = atomicCAS(address_as_ull, + assumed, + __double_as_longlong(max(val, __longlong_as_double(assumed)))); + } + return __longlong_as_double(old); +} + +__device__ inline double atomicMin(double *address, double val) +{ + unsigned long long int *address_as_ull = (unsigned long long int *)address; + unsigned long long int old = *address_as_ull, assumed; + + assumed = old; + old = atomicCAS(address_as_ull, + assumed, + __double_as_longlong(min(val, __longlong_as_double(assumed)))); + while (assumed != old) + { + assumed = old; + old = atomicCAS(address_as_ull, + assumed, + __double_as_longlong(min(val, __longlong_as_double(assumed)))); + } + return __longlong_as_double(old); +} + +template +__device__ inline void Comparator( + + Type &valA, + Type &valB, + uint dir) +{ + Type t; + if ((valA > valB) == dir) + { + t = valA; + valA = valB; + valB = t; + } +} + +static __device__ __forceinline__ unsigned int __qsflo(unsigned int word) +{ + unsigned int ret; + asm volatile("bfind.u32 %0, %1;" + : "=r"(ret) + : "r"(word)); + return ret; +} + +template +__global__ void globalBitonicSort(Type *indata, Type *outdata, Block *bucket, bool inputSelect) +{ + __shared__ uint shared[1024]; + + Type *data; + + Block cord = bucket[blockIdx.x]; + + uint size = cord.end - cord.begin; + bool select = !(cord.select); + + if (cord.end - cord.begin > 1024 || cord.end - cord.begin == 0) + return; + + unsigned int bitonicSize = 1 << (__qsflo(size - 1U) + 1); + + if (select) + data = indata; + else + data = outdata; + + //__syncthreads(); + + for (int i = threadIdx.x; i < size; i += blockDim.x) + shared[i] = data[i + cord.begin]; + + for (int i = threadIdx.x + size; i < bitonicSize; i += blockDim.x) + shared[i] = 0xffffffff; + + __syncthreads(); + + for (uint size = 2; size < bitonicSize; size <<= 1) + { + //Bitonic merge + uint ddd = 1 ^ ((threadIdx.x & (size / 2)) != 0); + for (uint stride = size / 2; stride > 0; stride >>= 1) + { + __syncthreads(); + uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1)); + //if(pos 0; stride >>= 1) + { + __syncthreads(); + uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1)); + // if(pos +__global__ void quick(Type *indata, Type *buffer, Partition *partition, Block *bucket) +{ + + __shared__ Type sh_out[1024]; + + __shared__ uint start1, end1; + __shared__ uint left, right; + + int tix = threadIdx.x; + + uint start = partition[blockIdx.x].from; + uint end = partition[blockIdx.x].end; + Type pivot = partition[blockIdx.x].pivot; + uint nseq = partition[blockIdx.x].ibucket; + + uint lo = 0; + uint hi = 0; + + Type lmin = 0xffffffff; + Type rmax = 0; + + Type d; + + // start read on 1° tile and store the coordinates of the items that must + // be moved on the left or on the right of the pivot + + if (tix + start < end) + { + d = indata[tix + start]; + + //count items smaller or bigger than the pivot + // if d= pivot) * lo; + // if d>pivot then lr++ else lr + hi = (d <= pivot) * (hi) + (d > pivot) * (hi + 1); + + lmin = d; + rmax = d; + } + + //read and store the coordinates on next tiles for each block + for (uint i = tix + start + blockDim.x; i < end; i += blockDim.x) + { + Type d = indata[i]; + + //count items smaller or bigger than the pivot + lo = (d < pivot) * (lo + 1) + (d >= pivot) * lo; + hi = (d <= pivot) * (hi) + (d > pivot) * (hi + 1); + + //compute max and min of tile items + lmin = min(lmin, d); + rmax = max(rmax, d); + } + + //compute max and min of every partition + + compareInclusive(rmax, lmin, (Type *)sh_out, blockDim.x); + + __syncthreads(); + + if (tix == blockDim.x - 1) + { + //compute absolute max and min for the bucket + atomicMax(&bucket[nseq].maxPiv, rmax); + atomicMin(&bucket[nseq].minPiv, lmin); + } + + __syncthreads(); + + /* + * calculate the coordinates of its assigned item to each thread, + * which are necessary to known in which subsequences the item must be copied + * + */ + scan1Inclusive2(lo, hi, (uint *)sh_out, blockDim.x); + lo = lo - 1; + hi = SHARED_LIMIT - hi; + + if (tix == blockDim.x - 1) + { + left = lo + 1; + right = SHARED_LIMIT - hi; + + start1 = atomicAdd(&bucket[nseq].nextbegin, left); + end1 = atomicSub(&bucket[nseq].nextend, right); + } + + __syncthreads(); + + //thread blocks write on the shared memory the items smaller and bigger than the first tile's pivot + if (tix + start < end) + { + //items smaller than pivot + if (d < pivot) + { + sh_out[lo] = d; + lo--; + } + + //items bigger than pivot + if (d > pivot) + { + sh_out[hi] = d; + hi++; + } + } + + //thread blocks write on the shared memory the items smaller and bigger than next tiles' pivot + for (uint i = start + tix + blockDim.x; i < end; i += blockDim.x) + { + + Type d = indata[i]; + //items smaller than the pivot + if (d < pivot) + { + sh_out[lo] = d; + lo--; + } + + //items bigger than the pivot + if (d > pivot) + { + sh_out[hi] = d; + hi++; + } + } + + __syncthreads(); + + //items smaller and bigger than the pivot already sorted in the shared memory are coalesced written on the global memory + //partial results of each thread block stored on the shared memory are merged together in two subsequences within the global memory + //coalesced writing of next tiles on the global memory + for (uint i = tix; i < SHARED_LIMIT; i += blockDim.x) + { + if (i < left) + buffer[start1 + i] = sh_out[i]; + + if (i >= SHARED_LIMIT - right) + buffer[end1 + i - SHARED_LIMIT] = sh_out[i]; + } +} + +//this function assigns the attributes to each partition of each bucket +//a thread block is assigned to a specific partition +template +__global__ void partitionAssign(struct Block *bucket, uint *npartitions, struct Partition *partition) +{ + int tx = threadIdx.x; + int bx = blockIdx.x; + + uint beg = bucket[bx].nextbegin; + uint end = bucket[bx].nextend; + Type pivot = bucket[bx].pivot; + + uint from; + uint to; + + if (bx > 0) + { + from = npartitions[bx - 1]; + to = npartitions[bx]; + } + else + { + from = 0; + to = npartitions[bx]; + } + + uint i = tx + from; + + if (i < to) + { + uint begin = beg + SHARED_LIMIT * tx; + partition[i].from = begin; + partition[i].end = begin + SHARED_LIMIT; + partition[i].pivot = pivot; + partition[i].ibucket = bx; + } + + for (uint i = tx + from + blockDim.x; i < to; i += blockDim.x) + { + uint begin = beg + SHARED_LIMIT * (i - from); + partition[i].from = begin; + partition[i].end = begin + SHARED_LIMIT; + partition[i].pivot = pivot; + partition[i].ibucket = bx; + } + __syncthreads(); + if (tx == 0 && to - from > 0) + partition[to - 1].end = end; +} + +//this function enters the pivot value in the central bucket's items +template +__global__ void insertPivot(Type *data, struct Block *bucket, int nbucket) +{ + + Type pivot = bucket[blockIdx.x].pivot; + uint start = bucket[blockIdx.x].nextbegin; + uint end = bucket[blockIdx.x].nextend; + bool is_altered = bucket[blockIdx.x].done; + + if (is_altered && blockIdx.x < nbucket) + for (uint j = start + threadIdx.x; j < end; j += blockDim.x) + data[j] = pivot; +} + +//this function assigns the new attributes of each bucket +template +__global__ void bucketAssign(Block *bucket, uint *npartitions, int nbucket, int select) +{ + + uint i = blockIdx.x * blockDim.x + threadIdx.x; + + if (i < nbucket) + { + bool is_altered = bucket[i].done; + if (is_altered) + { + //read on i node + uint orgbeg = bucket[i].begin; + uint from = bucket[i].nextbegin; + uint orgend = bucket[i].end; + uint end = bucket[i].nextend; + Type pivot = bucket[i].pivot; + Type minPiv = bucket[i].minPiv; + Type maxPiv = bucket[i].maxPiv; + + //compare each bucket's max and min to the pivot + Type lmaxpiv = min(pivot, maxPiv); + Type rminpiv = max(pivot, minPiv); + + //write on i+nbucket node + bucket[i + nbucket].begin = orgbeg; + bucket[i + nbucket].nextbegin = orgbeg; + bucket[i + nbucket].nextend = from; + bucket[i + nbucket].end = from; + bucket[i + nbucket].pivot = (minPiv + lmaxpiv) / 2; + + //if(select) + // bucket[i+nbucket].done = (from-orgbeg)>1024;// && (minPiv!=maxPiv); + //else + bucket[i + nbucket].done = (from - orgbeg) > 1024 && (minPiv != maxPiv); + bucket[i + nbucket].select = select; + bucket[i + nbucket].minPiv = ( Type ) 0xffffffffffffffff; + bucket[i + nbucket].maxPiv = 0; + //bucket[i+nbucket].finish=false; + + //calculate the number of partitions (npartitions) necessary to the i+nbucket bucket + if (!bucket[i + nbucket].done) + npartitions[i + nbucket] = 0; + else + npartitions[i + nbucket] = (from - orgbeg + SHARED_LIMIT - 1) / SHARED_LIMIT; + + //write on i node + bucket[i].begin = end; + bucket[i].nextbegin = end; + bucket[i].nextend = orgend; + bucket[i].pivot = (rminpiv + maxPiv) / 2 + 1; + + //if(select) + //bucket[i].done = (orgend-end)>1024;// && (minPiv!=maxPiv); + // else + bucket[i].done = (orgend - end) > 1024 && (minPiv != maxPiv); + bucket[i].select = select; + bucket[i].minPiv = ( Type ) 0xffffffffffffffff; + bucket[i].maxPiv = 0; + //bucket[i].finish=false; + + //calculate the number of partitions (npartitions) necessary to the i-bucket bucket + if (!bucket[i].done) + npartitions[i] = 0; + else + npartitions[i] = (orgend - end + SHARED_LIMIT - 1) / SHARED_LIMIT; + } + } +} + +template +__global__ void init(Type *data, Block *bucket, uint *npartitions, int size, int nblocks) +{ + uint i = blockIdx.x * blockDim.x + threadIdx.x; + + if (i < nblocks) + { + bucket[i].nextbegin = 0; + bucket[i].begin = 0; + + bucket[i].nextend = 0 + size * (i == 0); + bucket[i].end = 0 + size * (i == 0); + npartitions[i] = 0; + bucket[i].done = false + i == 0; + bucket[i].select = false; + bucket[i].maxPiv = 0x0; + bucket[i].minPiv = ( Type ) 0xffffffffffffffff; + //bucket[i].pivot = 0+ (i==0)*((min(min(data[0],data[size/2]),data[size-1]) + max(max(data[0],data[size/2]),data[size-1]))/2); + bucket[i].pivot = data[size / 2]; + } +} + +template +void sort(Type *ddata, Type *outputData, uint size, uint threadCount, int device, double *wallClock) +{ + + cudaSetDevice(device); + + cudaGetLastError(); + //cudaDeviceReset(); + + cudaDeviceProp deviceProp; + cudaGetDeviceProperties(&deviceProp, device); + + StopWatchInterface *htimer = NULL; + Type *dbuffer; + + Block *dbucket; + struct Partition *partition; + uint *npartitions1, *npartitions2; + + uint *cudaBlocks = (uint *)malloc(4); + + uint blocks = (size + SHARED_LIMIT - 1) / SHARED_LIMIT; + uint nblock = 10 * blocks; + int partition_max = 262144; + + //unsigned long long int total = partition_max * sizeof(Block) + nblock * sizeof(Partition) + 2 * partition_max * sizeof(uint) + 3 * (size) * sizeof(Type); + + //Allocating and initializing CUDA arrays + sdkCreateTimer(&htimer); + checkCudaErrors(cudaMalloc((void **)&dbucket, partition_max * sizeof(Block))); + checkCudaErrors(cudaMalloc((void **)&partition, nblock * sizeof(Partition))); //nblock + + checkCudaErrors(cudaMalloc((void **)&npartitions1, partition_max * sizeof(uint))); + checkCudaErrors(cudaMalloc((void **)&npartitions2, partition_max * sizeof(uint))); + + checkCudaErrors(cudaMalloc((void **)&dbuffer, (size) * sizeof(Type))); + + initScan(); + + //setting GPU Cache + cudaFuncSetCacheConfig(init, cudaFuncCachePreferL1); + cudaFuncSetCacheConfig(insertPivot, cudaFuncCachePreferL1); + cudaFuncSetCacheConfig(bucketAssign, cudaFuncCachePreferL1); + cudaFuncSetCacheConfig(partitionAssign, cudaFuncCachePreferL1); + cudaFuncSetCacheConfig(quick, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(globalBitonicSort, cudaFuncCachePreferShared); + + checkCudaErrors(cudaDeviceSynchronize()); + sdkResetTimer(&htimer); + sdkStartTimer(&htimer); + + //initializing bucket array: initial attributes for each bucket + init<<<(nblock + 255) / 256, 256>>>(ddata, dbucket, npartitions1, size, partition_max); + + uint nbucket = 1; + uint numIterations = 0; + bool inputSelect = true; + + *cudaBlocks = blocks; + checkCudaErrors(cudaDeviceSynchronize()); + getLastCudaError("init() execution FAILED\n"); + checkCudaErrors(cudaMemcpy(&npartitions2[0], cudaBlocks, sizeof(uint), cudaMemcpyHostToDevice)); + + // beginning of the first phase + // this phase goes on until the size of the buckets is comparable to the SHARED_LIMIT size + while (1) + { + + /* + * --------------------- Pre-processing: Partitioning --------------------- + * + * buckets are further divided in partitions based on their size + * the number of partitions needed for each subsequence is determined by the number of elements which can be + * processed by each thread block. + * + * the number of partitions (npartitions) for each block will depend on the shared memory size (SHARED_LIMIT) + * + */ + + if (numIterations > 0) + { //1024 is the shared memory limit of scanInclusiveShort() + if (nbucket <= 1024) + scanInclusiveShort(npartitions2, npartitions1, 1, nbucket); + else + scanInclusiveLarge(npartitions2, npartitions1, 1, nbucket); + + checkCudaErrors(cudaMemcpy(cudaBlocks, &npartitions2[nbucket - 1], sizeof(uint), cudaMemcpyDeviceToHost)); + } + + if (*cudaBlocks == 0) + break; + + /* + * --------------------- step 1 --------------------- + * + * A thread block is assigned to each different partition + * each partition is assigned coordinates, pivot and .... + */ + + partitionAssign<<>>(dbucket, npartitions2, partition); + cudaDeviceSynchronize(); + getLastCudaError("partitionAssign() execution FAILED\n"); + + /* + --------------------- step 2a --------------------- + + in this function each thread block creates two subsequences + to divide the items in the partition whose value is lower than + the pivot value, from the items whose value is higher than the pivot value + */ + + if (inputSelect) + quick<<<*cudaBlocks, threadCount>>>(ddata, dbuffer, partition, dbucket); + else + quick<<<*cudaBlocks, threadCount>>>(dbuffer, ddata, partition, dbucket); + cudaDeviceSynchronize(); + getLastCudaError("quick() execution FAILED\n"); + + //step 2b: this function enters the pivot value in the central bucket's items + insertPivot<<>>(ddata, dbucket, nbucket); + + //step 3: parameters are assigned, linked to the two new buckets created in step 2 + bucketAssign<<<(nbucket + 255) / 256, 256>>>(dbucket, npartitions1, nbucket, inputSelect); + cudaDeviceSynchronize(); + getLastCudaError("insertPivot() or bucketAssign() execution FAILED\n"); + + nbucket *= 2; + + inputSelect = !inputSelect; + numIterations++; + if (nbucket > deviceProp.maxGridSize[0]) + break; + //if(numIterations==18) break; + } + + /* + * start second phase: + * now the size of the buckets is such that they can be entirely processed by a thread block + * + */ + + if (nbucket > deviceProp.maxGridSize[0]) + fprintf(stderr, "ERROR: CUDA-Quicksort can't terminate sorting as the block threads needed to finish it are more than the Maximum x-dimension of FERMI GPU thread blocks. Please use Kepler GPUs as the Maximum x-dimension of their thread blocks is much higher\n"); + else + globalBitonicSort<<>>(ddata, dbuffer, dbucket, inputSelect); + + checkCudaErrors(cudaDeviceSynchronize()); + getLastCudaError("globalBitonicSort() execution FAILED\n"); + + sdkStopTimer(&htimer); + *wallClock = sdkGetTimerValue(&htimer); + + // release resources + checkCudaErrors(cudaFree(dbuffer)); + checkCudaErrors(cudaFree(dbucket)); + checkCudaErrors(cudaFree(npartitions2)); + checkCudaErrors(cudaFree(npartitions1)); + free(cudaBlocks); + + closeScan(); + return; +} + +void CUDA_Quicksort(uint* inputData, uint* outputData, uint dataSize, uint threadCount, int Device, double* wallClock) +{ + + cudaDeviceProp deviceProp; + cudaGetDeviceProperties(&deviceProp, Device); + + if(deviceProp.major<2) + { + fprintf(stderr, "Error: the GPU device %d has a Compute Capability of %d.%d, while a Compute Capability of 2.x is required to run the code\n", + Device, deviceProp.major, deviceProp.minor); + + int deviceCount; + cudaGetDeviceCount(&deviceCount); + + fprintf(stderr, " the Host system has the following GPU devices:\n"); + + for (int device = 0; device < deviceCount; device++) { + + fprintf(stderr, "\t the GPU device %d is a %s, with Compute Capability %d.%d\n", + device, deviceProp.name, deviceProp.major, deviceProp.minor); + } + + return; + } + + sort(inputData,outputData, dataSize,threadCount,Device, wallClock); +} + +void CUDA_Quicksort_64(double* inputData,double* outputData, uint dataSize, uint threadCount, int Device, double* wallClock) +{ + + cudaDeviceProp deviceProp; + cudaGetDeviceProperties(&deviceProp, Device); + + if(deviceProp.major<2) + { + fprintf(stderr, "Error: the GPU device %d has a Compute Capability of %d.%d, while a Compute Capability of 2.x is required to run the code\n", + Device, deviceProp.major, deviceProp.minor); + + int deviceCount; + cudaGetDeviceCount(&deviceCount); + + fprintf(stderr, " the Host system has the following GPU devices:\n"); + + for (int device = 0; device < deviceCount; device++) { + + fprintf(stderr, "\t the GPU device %d is a %s, with Compute Capability %d.%d\n", + device, deviceProp.name, deviceProp.major, deviceProp.minor); + } + + return; + } + + sort(inputData,outputData, dataSize,threadCount,Device,wallClock); + +} + +struct MancaQuicksort +{ + static void sort( TNL::Containers::ArrayView< int, TNL::Devices::Cuda >& array ) + { + double timer; + CUDA_Quicksort( ( unsigned * ) array.getData(), (unsigned * ) array.getData(), array.getSize(), 256, 0, &timer ); + //return; + } +}; diff --git a/src/Benchmarks/Sorting/ReferenceAlgorithms/NvidiaBitonicSort.h b/src/Benchmarks/Sorting/ReferenceAlgorithms/NvidiaBitonicSort.h new file mode 100644 index 0000000000000000000000000000000000000000..8345712879881c1dd3860f1b03f74b60ff647691 --- /dev/null +++ b/src/Benchmarks/Sorting/ReferenceAlgorithms/NvidiaBitonicSort.h @@ -0,0 +1,24 @@ + +#ifdef HAVE_CUDA_SAMPLES +#include <6_Advanced/sortingNetworks/bitonicSort.cu> +#endif +#include + +namespace TNL { + +struct NvidiaBitonicSort +{ + static void sort( Containers::ArrayView< int, Devices::Cuda >& view ) + { +#ifdef HAVE_CUDA_SAMPLES + Containers::Array arr; + arr = view; + bitonicSort((unsigned *)view.getData(), (unsigned *)arr.getData(), + (unsigned *)view.getData(), (unsigned *)arr.getData(), + 1, arr.getSize(), 1); + cudaDeviceSynchronize(); +#endif + } +}; + +} // namespace TNL diff --git a/src/Benchmarks/Sorting/ReferenceAlgorithms/ThrustRadixsort.h b/src/Benchmarks/Sorting/ReferenceAlgorithms/ThrustRadixsort.h new file mode 100644 index 0000000000000000000000000000000000000000..02f03a023f55d97c4570c52d9bae7c238146ef59 --- /dev/null +++ b/src/Benchmarks/Sorting/ReferenceAlgorithms/ThrustRadixsort.h @@ -0,0 +1,16 @@ +#include +#include +#include + +namespace TNL { + +struct ThrustRadixsort +{ + static void sort( Containers::ArrayView< int, Devices::Cuda >& view ) + { + thrust::sort(thrust::device, view.getData(), view.getData() + view.getSize()); + cudaDeviceSynchronize(); + } +}; + +} // namespace TNL diff --git a/src/Benchmarks/Sorting/ReferenceAlgorithms/helper_timer.h b/src/Benchmarks/Sorting/ReferenceAlgorithms/helper_timer.h new file mode 100644 index 0000000000000000000000000000000000000000..c1e411650a72d470dc8b18565b4f229cf425ae97 --- /dev/null +++ b/src/Benchmarks/Sorting/ReferenceAlgorithms/helper_timer.h @@ -0,0 +1,495 @@ +/** + * Copyright 1993-2012 NVIDIA Corporation. All rights reserved. + * + * Please refer to the NVIDIA end user license agreement (EULA) associated + * with this source code for terms and conditions that govern your use of + * this software. Any use, reproduction, disclosure, or distribution of + * this software and related documentation outside the terms of the EULA + * is strictly prohibited. + * + */ + +// Helper Timing Functions +#ifndef HELPER_TIMER_H +#define HELPER_TIMER_H + +// includes, system +#include + +// includes, project +#include "helpers/exception.h" + +// Definition of the StopWatch Interface, this is used if we don't want to use the CUT functions +// But rather in a self contained class interface +class StopWatchInterface +{ + public: + StopWatchInterface() {}; + virtual ~StopWatchInterface() {}; + + public: + //! Start time measurement + virtual void start() = 0; + + //! Stop time measurement + virtual void stop() = 0; + + //! Reset time counters to zero + virtual void reset() = 0; + + //! Time in msec. after start. If the stop watch is still running (i.e. there + //! was no call to stop()) then the elapsed time is returned, otherwise the + //! time between the last start() and stop call is returned + virtual float getTime() = 0; + + //! Mean time to date based on the number of times the stopwatch has been + //! _stopped_ (ie finished sessions) and the current total time + virtual float getAverageTime() = 0; +}; + + +////////////////////////////////////////////////////////////////// +// Begin Stopwatch timer class definitions for all OS platforms // +////////////////////////////////////////////////////////////////// +#ifdef _WIN32 +// includes, system +#define WINDOWS_LEAN_AND_MEAN +#include +#undef min +#undef max + +//! Windows specific implementation of StopWatch +class StopWatchWin : public StopWatchInterface +{ + public: + //! Constructor, default + StopWatchWin() : + start_time(), end_time(), + diff_time(0.0f), total_time(0.0f), + running(false), clock_sessions(0), freq(0), freq_set(false) + { + if (! freq_set) + { + // helper variable + LARGE_INTEGER temp; + + // get the tick frequency from the OS + QueryPerformanceFrequency((LARGE_INTEGER *) &temp); + + // convert to type in which it is needed + freq = ((double) temp.QuadPart) / 1000.0; + + // rememeber query + freq_set = true; + } + }; + + // Destructor + ~StopWatchWin() { }; + + public: + //! Start time measurement + inline void start(); + + //! Stop time measurement + inline void stop(); + + //! Reset time counters to zero + inline void reset(); + + //! Time in msec. after start. If the stop watch is still running (i.e. there + //! was no call to stop()) then the elapsed time is returned, otherwise the + //! time between the last start() and stop call is returned + inline float getTime(); + + //! Mean time to date based on the number of times the stopwatch has been + //! _stopped_ (ie finished sessions) and the current total time + inline float getAverageTime(); + + private: + // member variables + + //! Start of measurement + LARGE_INTEGER start_time; + //! End of measurement + LARGE_INTEGER end_time; + + //! Time difference between the last start and stop + float diff_time; + + //! TOTAL time difference between starts and stops + float total_time; + + //! flag if the stop watch is running + bool running; + + //! Number of times clock has been started + //! and stopped to allow averaging + int clock_sessions; + + //! tick frequency + double freq; + + //! flag if the frequency has been set + bool freq_set; +}; + +// functions, inlined + +//////////////////////////////////////////////////////////////////////////////// +//! Start time measurement +//////////////////////////////////////////////////////////////////////////////// +inline void +StopWatchWin::start() +{ + QueryPerformanceCounter((LARGE_INTEGER *) &start_time); + running = true; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Stop time measurement and increment add to the current diff_time summation +//! variable. Also increment the number of times this clock has been run. +//////////////////////////////////////////////////////////////////////////////// +inline void +StopWatchWin::stop() +{ + QueryPerformanceCounter((LARGE_INTEGER *) &end_time); + diff_time = (float) + (((double) end_time.QuadPart - (double) start_time.QuadPart) / freq); + + total_time += diff_time; + clock_sessions++; + running = false; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Reset the timer to 0. Does not change the timer running state but does +//! recapture this point in time as the current start time if it is running. +//////////////////////////////////////////////////////////////////////////////// +inline void +StopWatchWin::reset() +{ + diff_time = 0; + total_time = 0; + clock_sessions = 0; + + if (running) + { + QueryPerformanceCounter((LARGE_INTEGER *) &start_time); + } +} + + +//////////////////////////////////////////////////////////////////////////////// +//! Time in msec. after start. If the stop watch is still running (i.e. there +//! was no call to stop()) then the elapsed time is returned added to the +//! current diff_time sum, otherwise the current summed time difference alone +//! is returned. +//////////////////////////////////////////////////////////////////////////////// +inline float +StopWatchWin::getTime() +{ + // Return the TOTAL time to date + float retval = total_time; + + if (running) + { + LARGE_INTEGER temp; + QueryPerformanceCounter((LARGE_INTEGER *) &temp); + retval += (float) + (((double)(temp.QuadPart - start_time.QuadPart)) / freq); + } + + return retval; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Time in msec. for a single run based on the total number of COMPLETED runs +//! and the total time. +//////////////////////////////////////////////////////////////////////////////// +inline float +StopWatchWin::getAverageTime() +{ + return (clock_sessions > 0) ? (total_time/clock_sessions) : 0.0f; +} +#else +// Declarations for Stopwatch on Linux and Mac OSX +// includes, system +#include +#include + +//! Windows specific implementation of StopWatch +class StopWatchLinux : public StopWatchInterface +{ + public: + //! Constructor, default + StopWatchLinux() : + start_time(), diff_time(0.0), total_time(0.0), + running(false), clock_sessions(0) + { }; + + // Destructor + virtual ~StopWatchLinux() + { }; + + public: + //! Start time measurement + inline void start(); + + //! Stop time measurement + inline void stop(); + + //! Reset time counters to zero + inline void reset(); + + //! Time in msec. after start. If the stop watch is still running (i.e. there + //! was no call to stop()) then the elapsed time is returned, otherwise the + //! time between the last start() and stop call is returned + inline float getTime(); + + //! Mean time to date based on the number of times the stopwatch has been + //! _stopped_ (ie finished sessions) and the current total time + inline float getAverageTime(); + + private: + + // helper functions + + //! Get difference between start time and current time + inline float getDiffTime(); + + private: + + // member variables + + //! Start of measurement + struct timeval start_time; + + //! Time difference between the last start and stop + float diff_time; + + //! TOTAL time difference between starts and stops + float total_time; + + //! flag if the stop watch is running + bool running; + + //! Number of times clock has been started + //! and stopped to allow averaging + int clock_sessions; +}; + +// functions, inlined + +//////////////////////////////////////////////////////////////////////////////// +//! Start time measurement +//////////////////////////////////////////////////////////////////////////////// +inline void +StopWatchLinux::start() +{ + gettimeofday(&start_time, 0); + running = true; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Stop time measurement and increment add to the current diff_time summation +//! variable. Also increment the number of times this clock has been run. +//////////////////////////////////////////////////////////////////////////////// +inline void +StopWatchLinux::stop() +{ + diff_time = getDiffTime(); + total_time += diff_time; + running = false; + clock_sessions++; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Reset the timer to 0. Does not change the timer running state but does +//! recapture this point in time as the current start time if it is running. +//////////////////////////////////////////////////////////////////////////////// +inline void +StopWatchLinux::reset() +{ + diff_time = 0; + total_time = 0; + clock_sessions = 0; + + if (running) + { + gettimeofday(&start_time, 0); + } +} + +//////////////////////////////////////////////////////////////////////////////// +//! Time in msec. after start. If the stop watch is still running (i.e. there +//! was no call to stop()) then the elapsed time is returned added to the +//! current diff_time sum, otherwise the current summed time difference alone +//! is returned. +//////////////////////////////////////////////////////////////////////////////// +inline float +StopWatchLinux::getTime() +{ + // Return the TOTAL time to date + float retval = total_time; + + if (running) + { + retval += getDiffTime(); + } + + return retval; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Time in msec. for a single run based on the total number of COMPLETED runs +//! and the total time. +//////////////////////////////////////////////////////////////////////////////// +inline float +StopWatchLinux::getAverageTime() +{ + return (clock_sessions > 0) ? (total_time/clock_sessions) : 0.0f; +} +//////////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////////// +inline float +StopWatchLinux::getDiffTime() +{ + struct timeval t_time; + gettimeofday(&t_time, 0); + + // time difference in milli-seconds + return (float)(1000.0 * (t_time.tv_sec - start_time.tv_sec) + + (0.001 * (t_time.tv_usec - start_time.tv_usec))); +} +#endif // _WIN32 + +//////////////////////////////////////////////////////////////////////////////// +//! Timer functionality exported + +//////////////////////////////////////////////////////////////////////////////// +//! Create a new timer +//! @return true if a time has been created, otherwise false +//! @param name of the new timer, 0 if the creation failed +//////////////////////////////////////////////////////////////////////////////// +inline bool +sdkCreateTimer(StopWatchInterface **timer_interface) +{ + //printf("sdkCreateTimer called object %08x\n", (void *)*timer_interface); +#ifdef _WIN32 + *timer_interface = (StopWatchInterface *)new StopWatchWin(); +#else + *timer_interface = (StopWatchInterface *)new StopWatchLinux(); +#endif + return (*timer_interface != NULL) ? true : false; +} + + +//////////////////////////////////////////////////////////////////////////////// +//! Delete a timer +//! @return true if a time has been deleted, otherwise false +//! @param name of the timer to delete +//////////////////////////////////////////////////////////////////////////////// +inline bool +sdkDeleteTimer(StopWatchInterface **timer_interface) +{ + //printf("sdkDeleteTimer called object %08x\n", (void *)*timer_interface); + if (*timer_interface) + { + delete *timer_interface; + *timer_interface = NULL; + } + + return true; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Start the time with name \a name +//! @param name name of the timer to start +//////////////////////////////////////////////////////////////////////////////// +inline bool +sdkStartTimer(StopWatchInterface **timer_interface) +{ + //printf("sdkStartTimer called object %08x\n", (void *)*timer_interface); + if (*timer_interface) + { + (*timer_interface)->start(); + } + + return true; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Stop the time with name \a name. Does not reset. +//! @param name name of the timer to stop +//////////////////////////////////////////////////////////////////////////////// +inline bool +sdkStopTimer(StopWatchInterface **timer_interface) +{ + // printf("sdkStopTimer called object %08x\n", (void *)*timer_interface); + if (*timer_interface) + { + (*timer_interface)->stop(); + } + + return true; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Resets the timer's counter. +//! @param name name of the timer to reset. +//////////////////////////////////////////////////////////////////////////////// +inline bool +sdkResetTimer(StopWatchInterface **timer_interface) +{ + // printf("sdkResetTimer called object %08x\n", (void *)*timer_interface); + if (*timer_interface) + { + (*timer_interface)->reset(); + } + + return true; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Return the average time for timer execution as the total time +//! for the timer dividied by the number of completed (stopped) runs the timer +//! has made. +//! Excludes the current running time if the timer is currently running. +//! @param name name of the timer to return the time of +//////////////////////////////////////////////////////////////////////////////// +inline float +sdkGetAverageTimerValue(StopWatchInterface **timer_interface) +{ + // printf("sdkGetAverageTimerValue called object %08x\n", (void *)*timer_interface); + if (*timer_interface) + { + return (*timer_interface)->getAverageTime(); + } + else + { + return 0.0f; + } +} + +//////////////////////////////////////////////////////////////////////////////// +//! Total execution time for the timer over all runs since the last reset +//! or timer creation. +//! @param name name of the timer to obtain the value of. +//////////////////////////////////////////////////////////////////////////////// +inline float +sdkGetTimerValue(StopWatchInterface **timer_interface) +{ + // printf("sdkGetTimerValue called object %08x\n", (void *)*timer_interface); + if (*timer_interface) + { + return (*timer_interface)->getTime(); + } + else + { + return 0.0f; + } +} + +#endif // HELPER_TIMER_H diff --git a/src/Benchmarks/Sorting/bitonicsortBenchmark.cu b/src/Benchmarks/Sorting/bitonicsortBenchmark.cu new file mode 100644 index 0000000000000000000000000000000000000000..88c612fe8a3442b7ac81b4e22a3d6425c734c962 --- /dev/null +++ b/src/Benchmarks/Sorting/bitonicsortBenchmark.cu @@ -0,0 +1,10 @@ +#include "../../src/bitonicSort/bitonicSort.h" + +#include "../benchmarker.cpp" +#include "../measure.cu" + +template +void sorter(ArrayView arr) +{ + bitonicSort(arr); +} \ No newline at end of file diff --git a/src/Benchmarks/Sorting/generators.h b/src/Benchmarks/Sorting/generators.h new file mode 100644 index 0000000000000000000000000000000000000000..1615c90abda0f586df79570a529069f273fa4b0d --- /dev/null +++ b/src/Benchmarks/Sorting/generators.h @@ -0,0 +1,148 @@ +#pragma once +#include +#include +#include +#include +using namespace std; + +vector generateSorted(int size) +{ + vector vec(size); + + iota(vec.begin(), vec.end(), 0); + + return vec; +} + +vector generateRandom(int size) +{ + vector vec(size); + + srand(size + 2021); + generate(vec.begin(), vec.end(), [=](){return std::rand() % (2*size);}); + + return vec; +} + +vector generateShuffle(int size) +{ + vector vec(size); + + iota(vec.begin(), vec.end(), 0); + srand(size); + random_shuffle(vec.begin(), vec.end()); + + return vec; +} + +vector generateAlmostSorted(int size) +{ + vector vec(size); + + iota(vec.begin(), vec.end(), 0); + srand(9451); + for(int i = 0; i < 3; i++) //swaps 3 times in array + { + int s = rand() % (size - 3); + std::swap(vec[s], vec[s + 1]); + } + + return vec; +} + +vector generateDecreasing(int size) +{ + vector vec(size); + + for(int i = 0; i < size; i++) + vec[i] = size - i; + + return vec; +} + +vector generateZero_entropy(int size) +{ + vector vec(size, 515); + return vec; +} + +vector generateGaussian(int size) +{ + vector vec(size); + srand(size + 2000); + + for (int i = 0; i < size; ++i) + { + int value = 0; + for (int j = 0; j < 4; ++j) + value += rand()%16384; + + vec[i] = value /4; + } + + return vec; +} + +vector generateBucket(int size) +{ + vector vec(size); + + srand (size + 94215); + double tmp = ((double)size)*3000000; //(RAND_MAX)/p; --> ((double)N)*30000; + double tmp2 = sqrt(tmp); + + int p= (size+tmp2-1)/tmp2; + + const int VALUE = 8192/p; //(RAND_MAX)/p; + + int i=0; int x=0; + //the array of size N is split into 'p' buckets + while(i < p) + { + for (int z = 0; z < p; ++z) + for (int j = 0; j < size/(p*p); ++j) + { + //every bucket has N/(p*p) items and the range is [min : VALUE-1 ] + int min = VALUE*z; + + vec[x]= min + ( rand() % (VALUE-1) ) ; + x++; + } + i++; + } + + return vec; +} + +vector generateStaggered(int size) +{ + vector vec(size); + + srand (size + 815618); + int tmp=4096; //(RAND_MAX)/p; --> size=2048 + int p= (size+tmp-1)/tmp; + + const int VALUE = (1<<30)/p; //(RAND_MAX)/p; + + int i=1; int x=0; + //the array of size N is split into 'p' buckets + while(i <= p) + { + //every bucket has N/(p) items + for (int j = 0; j < size/(p); ++j) + { + int min; + + if(i<=(p/2)) + min = (2*i -1)*VALUE; + + else + min = (2*i-p-1)*VALUE; + + vec[x++]= min + ( rand() % (VALUE - 1) ); + } + i++; + } + + return vec; +} \ No newline at end of file diff --git a/src/Benchmarks/Sorting/timer.h b/src/Benchmarks/Sorting/timer.h new file mode 100644 index 0000000000000000000000000000000000000000..880eb03e79d82a50acd4148028e5498b09272ed8 --- /dev/null +++ b/src/Benchmarks/Sorting/timer.h @@ -0,0 +1,22 @@ +#pragma once + +#include +#include +#include +#include + +struct TIMER +{ + std::function f; + std::chrono::high_resolution_clock::time_point begin; + + TIMER(std::function func = [](double res){std::cout << res << std::endl;}) + : f(func), begin(std::chrono::high_resolution_clock::now()) {} + + ~TIMER() + { + auto end = std::chrono::high_resolution_clock::now(); + double result = (std::chrono::duration_cast(end - begin).count() / 1000.); + f(result); + } +}; \ No newline at end of file diff --git a/src/Benchmarks/Sorting/tnl-benchmark-sort.cpp b/src/Benchmarks/Sorting/tnl-benchmark-sort.cpp new file mode 100644 index 0000000000000000000000000000000000000000..489a9497aa0a4dcbd92dbd6a28507b0e6b78a582 --- /dev/null +++ b/src/Benchmarks/Sorting/tnl-benchmark-sort.cpp @@ -0,0 +1 @@ +#include "tnl-benchmark-sort.h" diff --git a/src/Benchmarks/Sorting/tnl-benchmark-sort.cu b/src/Benchmarks/Sorting/tnl-benchmark-sort.cu new file mode 120000 index 0000000000000000000000000000000000000000..26b452a6150a3c5413db2046a58f6e190cd9f1d4 --- /dev/null +++ b/src/Benchmarks/Sorting/tnl-benchmark-sort.cu @@ -0,0 +1 @@ +tnl-benchmark-sort.cpp \ No newline at end of file diff --git a/src/Benchmarks/Sorting/tnl-benchmark-sort.h b/src/Benchmarks/Sorting/tnl-benchmark-sort.h new file mode 100644 index 0000000000000000000000000000000000000000..68b500a0970dbeb4e4b49090f1f33b0f440fc79a --- /dev/null +++ b/src/Benchmarks/Sorting/tnl-benchmark-sort.h @@ -0,0 +1,131 @@ +#include +#include +#include +#include +#include +using namespace std; + +#include "generators.h" +#include "Measurer.h" + +#ifndef LOW_POW + #define LOW_POW 10 +#endif + +#ifndef HIGH_POW + #define HIGH_POW 25 +#endif + +#ifndef TRIES + #define TRIES 20 +#endif + +using namespace TNL; +using namespace TNL::Algorithms; +using namespace TNL::Algorithms::Sorting; + +template< typename Sorter > +void start(ostream & out, string delim) +{ + out << "size" << delim; + out << "random" << delim; + out << "shuffle" << delim; + out << "sorted" << delim; + out << "almost" << delim; + out << "decreas" << delim; + out << "gauss" << delim; + out << "bucket" << delim; + out << "stagger" << delim; + out << "zero_entropy"; + out << endl; + + int wrongAnsCnt = 0; + + for(int pow = LOW_POW; pow <= HIGH_POW; pow++) + { + int size =(1<< pow); + vector vec(size); + + out << "2^" << pow << delim << flush; + out << fixed << setprecision(3); + + out << Measurer< Sorter >::measure( generateRandom(size), TRIES, wrongAnsCnt); + out << delim << flush; + + out << Measurer< Sorter >::measure( generateShuffle(size), TRIES, wrongAnsCnt); + out << delim << flush; + + out << Measurer< Sorter >::measure( generateSorted(size), TRIES, wrongAnsCnt); + out << delim << flush; + + out << Measurer< Sorter >::measure( generateAlmostSorted(size), TRIES, wrongAnsCnt); + out << delim << flush; + + out << Measurer< Sorter >::measure( generateDecreasing(size), TRIES, wrongAnsCnt); + out << delim << flush; + + out << Measurer< Sorter >::measure( generateGaussian(size), TRIES, wrongAnsCnt) ; + out << delim << flush; + + out << Measurer< Sorter >::measure( generateBucket(size), TRIES, wrongAnsCnt); + out << delim << flush; + + out << Measurer< Sorter >::measure( generateStaggered(size), TRIES, wrongAnsCnt); + out << delim << flush; + + out << Measurer< Sorter >::measure( generateZero_entropy(size), TRIES, wrongAnsCnt); + out << endl; + } + + if(wrongAnsCnt > 0) + std::cerr << wrongAnsCnt << "tries were sorted incorrectly" << std::endl; +} + +int main(int argc, char *argv[]) +{ + if(argc == 1) + { +#ifdef HAVE_CUDA + std::cout << "Quicksort on GPU ... " << std::endl; + start< Quicksort >(cout, "\t"); + std::cout << "Bitonic sort on GPU ... " << std::endl; + start< BitonicSort >( cout, "\t" ); +#ifdef HAVE_CUDA_SAMPLES + std::cout << "Manca quicksort on GPU ... " << std::endl; + start< MancaQuicksort >( cout, "\t" ); + std::cout << "Nvidia bitonic sort on GPU ... " << std::endl; + start< NvidiaBitonicSort >( cout, "\t" ); +#endif + std::cout << "Cederman quicksort on GPU ... " << std::endl; + start< CedermanQuicksort >( cout, "\t" ); + std::cout << "Thrust radixsort on GPU ... " << std::endl; + start< ThrustRadixsort >( cout, "\t" ); +#endif + std::cout << "STL sort on CPU ... " << std::endl; + start< STLSort >( cout, "\t" ); + } + else + { + std::ofstream out(argv[1]); +#ifdef HAVE_CUDA + std::cout << "Quicksort on GPU ... " << std::endl; + start< Quicksort >(out, ","); + std::cout << "Bitonic sort on GPU ... " << std::endl; + start< BitonicSort >(out, ","); + +#ifdef HAVE_CUDA_SAMPLES + std::cout << "Manca quicksort on GPU ... " << std::endl; + start< MancaQuicksort >( out, "," ); + std::cout << "Nvidia bitonic sort on GPU ... " << std::endl; + start< NvidiaBitonicSort >( out, "," ); +#endif + std::cout << "Cederman quicksort on GPU ... " << std::endl; + start< CedermanQuicksort >( out, "," ); + std::cout << "Thrust radixsort on GPU ... " << std::endl; + start< ThrustRadixsort >( out, "," ); +#endif + std::cout << "STL sort on CPU ... " << std::endl; + start< STLSort >( out, "," ); + } + return 0; +} diff --git a/src/TNL/Algorithms/Sorting/BitonicSort.h b/src/TNL/Algorithms/Sorting/BitonicSort.h new file mode 100644 index 0000000000000000000000000000000000000000..3c2d6e89d103df4ba706c9a31e9f9e6440bef38d --- /dev/null +++ b/src/TNL/Algorithms/Sorting/BitonicSort.h @@ -0,0 +1,49 @@ +/*************************************************************************** + BitonicSort.h - description + ------------------- + begin : Jul 14, 2021 + copyright : (C) 2021 by Tomas Oberhuber et al. + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Xuan Thang Nguyen, Tomas Oberhuber + +#pragma once + +#include + +namespace TNL { + namespace Algorithms { + namespace Sorting { + +struct BitonicSort +{ + template< typename Array > + void static sort( Array& array ) + { + bitonicSort( array ); + } + + template< typename Array, typename Compare > + void static sort( Array& array, const Compare& compare ) + { + bitonicSort( array, compare ); + } + + template< typename Device, typename Index, typename Compare, typename Swap > + void static inplaceSort( const Index begin, const Index end, const Compare& compare, const Swap& swap ) + { + if( std::is_same< Device, Devices::Cuda >::value ) + bitonicSort( begin, end, compare, swap ); + else + TNL_ASSERT( false, std::cerr << "inplace bitonic sort for CPU is not implemented" << std::endl ); + } +}; + + } // namespace Sorting + } // namespace Algorithms +} //namespace TNL + + diff --git a/src/TNL/Algorithms/Sorting/BubbleSort.h b/src/TNL/Algorithms/Sorting/BubbleSort.h new file mode 100644 index 0000000000000000000000000000000000000000..cb809b737c4ea2e48b1bc8f42a02587ea78a7303 --- /dev/null +++ b/src/TNL/Algorithms/Sorting/BubbleSort.h @@ -0,0 +1,66 @@ +/*************************************************************************** + BubbleSort.h - description + ------------------- + begin : Jul 26, 2021 + copyright : (C) 2021 by Tomas Oberhuber et al. + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Tomas Oberhuber + +#pragma once + +#include +#include +#include + +namespace TNL { + namespace Algorithms { + namespace Sorting { + +struct BubbleSort +{ + template< typename Device, typename Index, typename Compare, typename Swap > + void static inplaceSort( const Index begin, const Index end, Compare& compare, Swap& swap ) + { + if( std::is_same< Device, Devices::Host >::value || + std::is_same< Device, Devices::Sequential >::value ) + { + Index left( begin ), right( end -1 ); + while( left < right ) + { + //int lastChange( end -1 ); + for( int j = left; j < right - 1; j++ ) + { + TNL_ASSERT_LT( j+1, end, "" ); + if( ! compare( j, j+1 ) ) + { + swap( j, j+1 ); + //lastChange = j; + } + } + right--; //lastChange; + for( int j = right; j >= left; j-- ) + { + TNL_ASSERT_LT( j+1, end, "" ); + if( ! compare( j, j+1 ) ) + { + swap( j, j+1 ); + //lastChange = j; + } + } + left++; //lastChange; + } + } + else + throw Exceptions::NotImplementedError(); + } +}; + + } // namespace Sorting + } // namespace Algorithms +} //namespace TNL + + diff --git a/src/TNL/Algorithms/Sorting/DefaultSorter.h b/src/TNL/Algorithms/Sorting/DefaultSorter.h new file mode 100644 index 0000000000000000000000000000000000000000..546ea0d19e9d39f8fa4424b51dbb4ed2fd445abf --- /dev/null +++ b/src/TNL/Algorithms/Sorting/DefaultSorter.h @@ -0,0 +1,73 @@ +/*************************************************************************** + DefaultSorter.h - description + ------------------- + begin : Jul 14, 2021 + copyright : (C) 2021 by Tomas Oberhuber et al. + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Tomas Oberhuber + +#pragma once + +#include // std::pair, std::forward + +#include +#include +#include +#include +#include +#include +#include + +namespace TNL { + namespace Algorithms { + namespace Sorting { + +template< typename Device > +struct DefaultSorter; + +template<> +struct DefaultSorter< Devices::Host > +{ + using SorterType = Algorithms::Sorting::STLSort; +}; + +template<> +struct DefaultSorter< Devices::Sequential > +{ + using SorterType = Algorithms::Sorting::STLSort; +}; + +template<> +struct DefaultSorter< Devices::Cuda > +{ + using SorterType = Algorithms::Sorting::Quicksort; +}; + +template< typename Device > +struct DefaultInplaceSorter; + +template<> +struct DefaultInplaceSorter< Devices::Host > +{ + using SorterType = Algorithms::Sorting::BubbleSort; +}; + +template<> +struct DefaultInplaceSorter< Devices::Sequential > +{ + using SorterType = Algorithms::Sorting::BubbleSort; +}; + +template<> +struct DefaultInplaceSorter< Devices::Cuda > +{ + using SorterType = Algorithms::Sorting::BitonicSort; +}; + + } // namespace Sorting + } // namespace Algorithms +} // namespace TNL diff --git a/src/TNL/Algorithms/Sorting/Quicksort.h b/src/TNL/Algorithms/Sorting/Quicksort.h new file mode 100644 index 0000000000000000000000000000000000000000..4df0bf471ee48afbeda99f32dde01d7b79f15b1b --- /dev/null +++ b/src/TNL/Algorithms/Sorting/Quicksort.h @@ -0,0 +1,43 @@ +/*************************************************************************** + Quicksort.h - description + ------------------- + begin : Jul 14, 2021 + copyright : (C) 2021 by Tomas Oberhuber et al. + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Xuan Thang Nguyen, Tomas Oberhuber + +#pragma once + +#include + +namespace TNL { + namespace Algorithms { + namespace Sorting { + +struct Quicksort +{ + template< typename Array > + void static sort( Array& array ) + { + Quicksorter< typename Array::ValueType, typename Array::DeviceType > qs; + qs.sort( array ); + } + + template< typename Array, typename Compare > + void static sort( Array& array, const Compare& compare ) + { + Quicksorter< typename Array::ValueType, typename Array::DeviceType > qs; + qs.sort( array, compare ); + } + +}; + + } // namespace Sorting + } // namespace Algorithms +} //namespace TNL + + diff --git a/src/TNL/Algorithms/Sorting/STLSort.h b/src/TNL/Algorithms/Sorting/STLSort.h new file mode 100644 index 0000000000000000000000000000000000000000..3fc69f324f7e3acce9188e80e28df24cb3fef9f1 --- /dev/null +++ b/src/TNL/Algorithms/Sorting/STLSort.h @@ -0,0 +1,40 @@ +/*************************************************************************** + STLSort.h - description + ------------------- + begin : Jul 14, 2021 + copyright : (C) 2021 by Tomas Oberhuber et al. + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Tomas Oberhuber + +#pragma once + +#include + +namespace TNL { + namespace Algorithms { + namespace Sorting { + +struct STLSort +{ + template< typename Array > + void static sort( Array& array ) + { + std::sort( array.getData(), array.getData() + array.getSize() ); + } + + template< typename Array, typename Compare > + void static sort( Array& array, const Compare& compare ) + { + std::sort( array.getData(), array.getData() + array.getSize(), compare ); + } +}; + + } // namespace Sorting + } // namespace Algorithms +} //namespace TNL + + diff --git a/src/TNL/Algorithms/Sorting/detail/Quicksorter.h b/src/TNL/Algorithms/Sorting/detail/Quicksorter.h new file mode 100644 index 0000000000000000000000000000000000000000..3f891fa590c18eeec33841e264bfbacadbb1782e --- /dev/null +++ b/src/TNL/Algorithms/Sorting/detail/Quicksorter.h @@ -0,0 +1,111 @@ +/*************************************************************************** + Quicksorter.h - description + ------------------- + begin : Jul 13, 2021 + copyright : (C) 2021 by Tomas Oberhuber et al. + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Xuan Thang Nguyen, Tomas Oberhuber + +#pragma once + +#include +#include +#include +#include + +namespace TNL { + namespace Algorithms { + namespace Sorting { + +template< typename Value, typename Device > +class Quicksorter; + +template< typename Value > +class Quicksorter< Value, Devices::Cuda > +{ + public: + + using ValueType = Value; + using DeviceType = Devices::Cuda; + + + template< typename Array, typename Compare > + void sort( Array& arr, const Compare& cmp ); + + template< typename Array > + void sort( Array& arr ); + + protected: + + void init( Containers::ArrayView arr, int gridDim, int blockDim, int desiredElemPerBlock, int maxSharable); + + template< typename CMP > + void performSort( const CMP &Cmp ); + + + /** + * returns how many blocks are needed to start sort phase 1 if @param elemPerBlock were to be used + * */ + int getSetsNeeded(int elemPerBlock) const; + + /** + * returns the optimal amount of elements per thread needed for phase + * */ + int getElemPerBlock() const; + + /** + * returns the amount of blocks needed to start phase 1 while also initializing all tasks + * */ + template< typename CMP > + int initTasks(int elemPerBlock, const CMP &Cmp); + + /** + * does the 1st phase of Quicksort until out of task memory or each task is small enough + * for correctness, secondphase method needs to be called to sort each subsequences + * */ + template + void firstPhase(const CMP &Cmp); + + /** + * update necessary variables after 1 phase1 sort + * */ + void processNewTasks(); + + /** + * sorts all leftover tasks + * */ + template + void secondPhase( const CMP &Cmp) ; + + int maxBlocks, threadsPerBlock, desiredElemPerBlock, maxSharable; //kernel config + + Containers::Array auxMem; + Containers::ArrayView arr, aux; + + int desired_2ndPhasElemPerBlock; + const int g_maxTasks = 1 << 14; + int maxTasks; + + + Containers::Array cuda_tasks, cuda_newTasks, cuda_2ndPhaseTasks; //1 set of 2 rotating tasks and 2nd phase + Containers::Array cuda_newTasksAmount, cuda_2ndPhaseTasksAmount; //is in reality 1 integer each + + Containers::Array cuda_blockToTaskMapping; + Containers::Vector cuda_reductionTaskInitMem; + + int host_1stPhaseTasksAmount = 0, host_2ndPhaseTasksAmount = 0; + int iteration = 0; + + template< typename T > + friend int getSetsNeededFunction(int elemPerBlock, const Quicksorter< T, Devices::Cuda >& quicksort ); +}; + + } // namespace Sorting + } // namespace Algorithms +}// namespace TNL + +#include diff --git a/src/TNL/Algorithms/Sorting/detail/Quicksorter.hpp b/src/TNL/Algorithms/Sorting/detail/Quicksorter.hpp new file mode 100644 index 0000000000000000000000000000000000000000..faa294ab3799f216dca5a9165c7e5f2020c2939b --- /dev/null +++ b/src/TNL/Algorithms/Sorting/detail/Quicksorter.hpp @@ -0,0 +1,414 @@ +/*************************************************************************** + Quicksorter.h - description + ------------------- + begin : Jul 13, 2021 + copyright : (C) 2021 by Tomas Oberhuber et al. + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Xuan Thang Nguyen, Tomas Oberhuber + +#pragma once + +#include +#include +#include +#include +#include + +namespace TNL { + namespace Algorithms { + namespace Sorting { + + +template< typename Value > + template< typename Array, typename Compare > +void +Quicksorter< Value, Devices::Cuda >:: +sort( Array& arr, const Compare& cmp ) +{ +#ifdef HAVE_CUDA + cudaDeviceProp deviceProp; + cudaGetDeviceProperties(&deviceProp, 0); + + /** + * for every block there is a bit of shared memory reserved, the actual value can slightly differ + * */ + int sharedReserve = sizeof(int) * (16 + 3 * 32); + int maxSharable = deviceProp.sharedMemPerBlock - sharedReserve; + + int blockDim = 512; //best case + + /** + * the goal is to use shared memory as often as possible + * each thread in a block will process n elements, n==multiplier + * + 1 reserved for pivot (statically allocating Value type throws weird error, hence it needs to be dynamic) + * + * blockDim*multiplier*sizeof(Value) + 1*sizeof(Value) <= maxSharable + * */ + int elemPerBlock = (maxSharable - sizeof(Value)) / sizeof(Value); //try to use up all of shared memory to store elements + const int maxBlocks = (1 << 20); + const int maxMultiplier = 8; + int multiplier = min(elemPerBlock / blockDim, maxMultiplier); + + if (multiplier <= 0) //a block cant store 512 elements, sorting some really big data + { + blockDim = 256; //try to fit 256 elements + multiplier = min(elemPerBlock / blockDim, maxMultiplier); + + if (multiplier <= 0) + { + //worst case scenario, shared memory cant be utilized at all because of the sheer size of Value + //sort has to be done with the use of global memory alone + + this->init(arr, maxBlocks, 512, 0, 0); + this->performSort( cmp ); + return; + } + } + + TNL_ASSERT_LE( ( int ) ( blockDim * multiplier * sizeof(Value) ), maxSharable,"" ); + + this->init(arr, maxBlocks, blockDim, multiplier * blockDim, maxSharable); + this->performSort( cmp ); +#endif +} + +template< typename Value > + template< typename Array > +void +Quicksorter< Value, Devices::Cuda >:: +sort( Array& arr ) +{ + this->sort(arr, [] __cuda_callable__( const Value& a, const Value& b ) { return a < b; } ); +} + +template< typename Value > +void +Quicksorter< Value, Devices::Cuda >:: +init( Containers::ArrayView arr, int gridDim, int blockDim, int desiredElemPerBlock, int maxSharable) +{ + this->maxBlocks = gridDim; + this->threadsPerBlock = blockDim; + this->desiredElemPerBlock = desiredElemPerBlock; + this->maxSharable = maxSharable; + this->arr.bind( arr ); + this->auxMem.setSize( arr.getSize() ); + this->aux.bind( auxMem.getView() ); + this->desired_2ndPhasElemPerBlock = desiredElemPerBlock; + this->maxTasks = min( arr.getSize(), g_maxTasks ); + this->cuda_tasks.setSize(maxTasks); + this->cuda_newTasks.setSize(maxTasks); + this->cuda_2ndPhaseTasks.setSize(maxTasks); + this->cuda_newTasksAmount.setSize(1); + this->cuda_2ndPhaseTasksAmount.setSize(1); + this->cuda_blockToTaskMapping.setSize(maxBlocks); + this->cuda_reductionTaskInitMem.setSize(maxTasks); + + if (arr.getSize() > desired_2ndPhasElemPerBlock) + { + cuda_tasks.setElement(0, TASK(0, arr.getSize(), 0)); + host_1stPhaseTasksAmount = 1; + } + else + { + cuda_2ndPhaseTasks.setElement(0, TASK(0, arr.getSize(), 0)); + host_2ndPhaseTasksAmount = 1; + } + + cuda_2ndPhaseTasksAmount = 0; + TNL_CHECK_CUDA_DEVICE; +} + + +template< typename Value > + template< typename CMP > +void +Quicksorter< Value, Devices::Cuda >:: +performSort( const CMP &Cmp ) +{ +#ifdef HAVE_CUDA + firstPhase(Cmp); + + int total2ndPhase = host_1stPhaseTasksAmount + host_2ndPhaseTasksAmount; + if (total2ndPhase > 0) + secondPhase(Cmp); + + cudaDeviceSynchronize(); + TNL_CHECK_CUDA_DEVICE; + +#ifdef CHECK_RESULT_SORT + if (!is_sorted(arr)) + { + std::ofstream out("error.txt"); + out << arr << std::endl; + out << aux << std::endl; + out << cuda_tasks << std::endl; + out << cuda_newTasks << std::endl; + out << cuda_2ndPhaseTasks << std::endl; + + out << cuda_newTasksAmount << std::endl; + out << cuda_2ndPhaseTasksAmount << std::endl; + + out << iteration << std::endl; + } +#endif +#endif +} + +template< typename Value > + template< typename CMP > +void +Quicksorter< Value, Devices::Cuda >:: +firstPhase( const CMP &Cmp ) +{ +#ifdef HAVE_CUDA + while (host_1stPhaseTasksAmount > 0) + { + if (host_1stPhaseTasksAmount >= maxTasks) + break; + + if (host_2ndPhaseTasksAmount >= maxTasks) //2nd phase occupies enoughs tasks to warrant premature 2nd phase sort + { + int tmp = host_1stPhaseTasksAmount; + host_1stPhaseTasksAmount = 0; + secondPhase(Cmp); + cuda_2ndPhaseTasksAmount = host_2ndPhaseTasksAmount = 0; + host_1stPhaseTasksAmount = tmp; + } + + //just in case newly created tasks wouldnt fit + //bite the bullet and sort with single blocks + if (host_1stPhaseTasksAmount * 2 >= maxTasks + (maxTasks - host_2ndPhaseTasksAmount)) + { + if (host_2ndPhaseTasksAmount >= 0.75 * maxTasks) //2nd phase occupies enoughs tasks to warrant premature 2nd phase sort + { + int tmp = host_1stPhaseTasksAmount; + host_1stPhaseTasksAmount = 0; + secondPhase(Cmp); + cuda_2ndPhaseTasksAmount = host_2ndPhaseTasksAmount = 0; + host_1stPhaseTasksAmount = tmp; + } + else + break; + } + + //--------------------------------------------------------------- + + int elemPerBlock = getElemPerBlock(); + + /** + * initializes tasks so that each block knows which task to work on and which part of array to split + * also sets pivot needed for partitioning, this is why Cmp is needed + * */ + int blocksCnt = initTasks(elemPerBlock, Cmp); + TNL_CHECK_CUDA_DEVICE; + + //not enough or too many blocks needed, switch to 2nd phase + if (blocksCnt <= 1 || blocksCnt > cuda_blockToTaskMapping.getSize()) + break; + + //----------------------------------------------- + //do the partitioning + + auto &task = iteration % 2 == 0 ? cuda_tasks : cuda_newTasks; + int externMemByteSize = elemPerBlock * sizeof(Value) + sizeof(Value); //elems + 1 for pivot + + /** + * check if partition procedure can use shared memory for coalesced write after reordering + * + * move elements smaller than pivot to the left and bigger to the right + * note: pivot isnt inserted in the middle yet + * */ + if (externMemByteSize <= maxSharable) + { + cudaQuickSort1stPhase + <<>>( + arr, aux, Cmp, elemPerBlock, + task, cuda_blockToTaskMapping); + } + else + { + cudaQuickSort1stPhase + <<>>( + arr, aux, Cmp, elemPerBlock, + task, cuda_blockToTaskMapping); + } + TNL_CHECK_CUDA_DEVICE; + + /** + * fill in the gap between smaller and bigger with elements == pivot + * after writing also create new tasks, each task generates at max 2 tasks + * + * tasks smaller than desired_2ndPhasElemPerBlock go into 2nd phase + * bigger need more blocks to partition and are written into newTask + * with iteration %2, rotate between the 2 tasks array to save from copying + * */ + auto &newTask = iteration % 2 == 0 ? cuda_newTasks : cuda_tasks; + cudaWritePivot + <<>>( + arr, aux, desired_2ndPhasElemPerBlock, + task, newTask, cuda_newTasksAmount.getData(), + cuda_2ndPhaseTasks, cuda_2ndPhaseTasksAmount.getData()); + TNL_CHECK_CUDA_DEVICE; + + //---------------------------------------- + + processNewTasks(); + iteration++; + } +#endif +} + +template< typename Value > + template< typename CMP > +void +Quicksorter< Value, Devices::Cuda >:: +secondPhase(const CMP &Cmp) +{ +#ifdef HAVE_CUDA + int total2ndPhase = host_1stPhaseTasksAmount + host_2ndPhaseTasksAmount; + const int stackSize = 32; + auto &leftoverTasks = iteration % 2 == 0 ? cuda_tasks : cuda_newTasks; + + int elemInShared = desiredElemPerBlock; + int externSharedByteSize = elemInShared * sizeof(Value) + sizeof(Value); //reserve space for storing elements + 1 pivot + if (externSharedByteSize > maxSharable) + { + externSharedByteSize = sizeof(Value); + elemInShared = 0; + } + + if (host_1stPhaseTasksAmount > 0 && host_2ndPhaseTasksAmount > 0) + { + auto tasks = leftoverTasks.getView(0, host_1stPhaseTasksAmount); + auto tasks2 = cuda_2ndPhaseTasks.getView(0, host_2ndPhaseTasksAmount); + + cudaQuickSort2ndPhase + <<>>(arr, aux, Cmp, tasks, tasks2, elemInShared, desired_2ndPhasElemPerBlock); + } + else if (host_1stPhaseTasksAmount > 0) + { + auto tasks = leftoverTasks.getView(0, host_1stPhaseTasksAmount); + cudaQuickSort2ndPhase + <<>>(arr, aux, Cmp, tasks, elemInShared, desired_2ndPhasElemPerBlock); + } + else + { + auto tasks2 = cuda_2ndPhaseTasks.getView(0, host_2ndPhaseTasksAmount); + + cudaQuickSort2ndPhase + <<>>(arr, aux, Cmp, tasks2, elemInShared, desired_2ndPhasElemPerBlock); + } +#endif +} + +template< typename Value > +int getSetsNeededFunction(int elemPerBlock, const Quicksorter< Value, Devices::Cuda >& quicksort ) +{ + auto view = quicksort.iteration % 2 == 0 ? quicksort.cuda_tasks.getConstView() : quicksort.cuda_newTasks.getConstView(); + auto fetch = [=] __cuda_callable__(int i) -> int { + const auto &task = view[i]; + int size = task.partitionEnd - task.partitionBegin; + return size / elemPerBlock + (size % elemPerBlock != 0); + }; + auto reduction = [] __cuda_callable__(int a, int b) { return a + b; }; + return Algorithms::reduce( 0, quicksort.host_1stPhaseTasksAmount, fetch, reduction, 0 ); +} + +template< typename Value > +int +Quicksorter< Value, Devices::Cuda >:: +getSetsNeeded(int elemPerBlock) const +{ + /*auto view = iteration % 2 == 0 ? cuda_tasks.getConstView() : cuda_newTasks.getConstView(); + auto fetch = [=] __cuda_callable__(int i) { + const auto &task = view[i]; + int size = task.partitionEnd - task.partitionBegin; + return size / elemPerBlock + (size % elemPerBlock != 0); + }; + auto reduction = [] __cuda_callable__(int a, int b) { return a + b; }; + return Algorithms::reduce(0, host_1stPhaseTasksAmount, fetch, reduction, 0);*/ + return getSetsNeededFunction< Value >( elemPerBlock, *this ); +} + +template< typename Value > +int +Quicksorter< Value, Devices::Cuda >:: +getElemPerBlock() const +{ + return desiredElemPerBlock; + + int setsNeeded = getSetsNeeded(desiredElemPerBlock); + + if (setsNeeded <= maxBlocks) + return desiredElemPerBlock; + + //want multiplier*minElemPerBLock <= x*threadPerBlock + //find smallest x so that this inequality holds + double multiplier = 1. * setsNeeded / maxBlocks; + int elemPerBlock = multiplier * desiredElemPerBlock; + setsNeeded = elemPerBlock / threadsPerBlock + (elemPerBlock % threadsPerBlock != 0); + + return setsNeeded * threadsPerBlock; +} + +template< typename Value > + template< typename CMP > +int +Quicksorter< Value, Devices::Cuda >:: +initTasks(int elemPerBlock, const CMP &Cmp) +{ +#ifdef HAVE_CUDA + auto &src = iteration % 2 == 0 ? arr : aux; + auto &tasks = iteration % 2 == 0 ? cuda_tasks : cuda_newTasks; + + //-------------------------------------------------------- + int blocks = host_1stPhaseTasksAmount / threadsPerBlock + (host_1stPhaseTasksAmount % threadsPerBlock != 0); + + cudaCalcBlocksNeeded<<>>(tasks.getView(0, host_1stPhaseTasksAmount), elemPerBlock, + cuda_reductionTaskInitMem.getView(0, host_1stPhaseTasksAmount)); + //cuda_reductionTaskInitMem[i] == how many blocks task i needs + + //auto reduce = [] __cuda_callable__(const int &a, const int &b) { return a + b; }; + + Algorithms::Scan:: + perform(cuda_reductionTaskInitMem, 0, cuda_reductionTaskInitMem.getSize(), TNL::Plus{}, 0); + //cuda_reductionTaskInitMem[i] == how many blocks task [0..i] need + + int blocksNeeded = cuda_reductionTaskInitMem.getElement(host_1stPhaseTasksAmount - 1); + + //need too many blocks, give back control + if (blocksNeeded > cuda_blockToTaskMapping.getSize()) + return blocksNeeded; + + //-------------------------------------------------------- + + cudaInitTask<<>>( + tasks.getView(0, host_1stPhaseTasksAmount), //task to read from + cuda_blockToTaskMapping.getView(0, blocksNeeded), //maps block to a certain task + cuda_reductionTaskInitMem.getView(0, host_1stPhaseTasksAmount), //has how many each task need blocks precalculated + src, Cmp); //used to pick pivot + + cuda_newTasksAmount.setElement(0, 0); //resets new element counter + return blocksNeeded; +#else + return -1; +#endif +} + +template +void +Quicksorter< Value, Devices::Cuda >:: +processNewTasks() +{ + host_1stPhaseTasksAmount = min(cuda_newTasksAmount.getElement(0), maxTasks); + host_2ndPhaseTasksAmount = min(cuda_2ndPhaseTasksAmount.getElement(0), maxTasks); +} + + + } // namespace Sorting + } // namespace Algorithms +} // namespace TNL diff --git a/src/TNL/Algorithms/Sorting/detail/bitonicSort.h b/src/TNL/Algorithms/Sorting/detail/bitonicSort.h new file mode 100644 index 0000000000000000000000000000000000000000..8ccd0569c2e0e14dbb6d12ccd656459ae2171c46 --- /dev/null +++ b/src/TNL/Algorithms/Sorting/detail/bitonicSort.h @@ -0,0 +1,379 @@ +#pragma once +#include +#include +#include + +namespace TNL { + namespace Algorithms { + namespace Sorting { + +#ifdef HAVE_CUDA + +/** + * this kernel simulates 1 exchange + * splits input arr that is bitonic into 2 bitonic sequences + */ +template +__global__ void bitonicMergeGlobal(TNL::Containers::ArrayView arr, + CMP Cmp, + int monotonicSeqLen, int bitonicLen) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + + int part = i / (bitonicLen / 2); //computes which sorting block this thread belongs to + + //the index of 2 elements that should be compared and swapped + int s = part * bitonicLen + (i & ((bitonicLen / 2) - 1)); + int e = s + bitonicLen / 2; + if (e >= arr.getSize()) //arr[e] is virtual padding and will not be exchanged with + return; + + int partsInSeq = monotonicSeqLen / bitonicLen; + //calculate the direction of swapping + int monotonicSeqIdx = part / partsInSeq; + bool ascending = (monotonicSeqIdx & 1) != 0; + if ((monotonicSeqIdx + 1) * monotonicSeqLen >= arr.getSize()) //special case for part with no "partner" to be merged with in next phase + ascending = true; + + cmpSwap(arr[s], arr[e], ascending, Cmp); +} + +//--------------------------------------------- +//--------------------------------------------- + +/** + * simulates many layers of merge + * turns input that is a bitonic sequence into 1 monotonic sequence + * + * this version uses shared memory to do the operations + * */ +template +__global__ void bitonicMergeSharedMemory(TNL::Containers::ArrayView arr, + CMP Cmp, + int monotonicSeqLen, int bitonicLen) +{ + extern __shared__ int externMem[]; + Value *sharedMem = (Value *)externMem; + + int sharedMemLen = 2 * blockDim.x; + + //1st index and last index of subarray that this threadBlock should merge + int myBlockStart = blockIdx.x * sharedMemLen; + int myBlockEnd = TNL::min(arr.getSize(), myBlockStart + sharedMemLen); + + //copy from globalMem into sharedMem + for (int i = threadIdx.x; myBlockStart + i < myBlockEnd; i += blockDim.x) + sharedMem[i] = arr[myBlockStart + i]; + __syncthreads(); + + //------------------------------------------ + //bitonic activity + { + //calculate the direction of swapping + int i = blockIdx.x * blockDim.x + threadIdx.x; + int part = i / (bitonicLen / 2); + int partsInSeq = monotonicSeqLen / bitonicLen; + int monotonicSeqIdx = part / partsInSeq; + + bool ascending = (monotonicSeqIdx & 1) != 0; + //special case for parts with no "partner" + if ((monotonicSeqIdx + 1) * monotonicSeqLen >= arr.getSize()) + ascending = true; + //------------------------------------------ + + //do bitonic merge + for (; bitonicLen > 1; bitonicLen /= 2) + { + //calculates which 2 indexes will be compared and swap + int part = threadIdx.x / (bitonicLen / 2); + int s = part * bitonicLen + (threadIdx.x & ((bitonicLen / 2) - 1)); + int e = s + bitonicLen / 2; + + if (e < myBlockEnd - myBlockStart) //not touching virtual padding + cmpSwap(sharedMem[s], sharedMem[e], ascending, Cmp); + __syncthreads(); + } + } + + //------------------------------------------ + + //writeback to global memory + for (int i = threadIdx.x; myBlockStart + i < myBlockEnd; i += blockDim.x) + arr[myBlockStart + i] = sharedMem[i]; +} + + +/** + * entrypoint for bitonicSort_Block + * sorts @param arr in alternating order to create bitonic sequences + * sharedMem has to be able to store at least blockDim.x*2 elements + * */ +template +__global__ void bitoniSort1stStepSharedMemory(TNL::Containers::ArrayView arr, CMP Cmp) +{ + extern __shared__ int externMem[]; + + Value * sharedMem = (Value *)externMem; + int sharedMemLen = 2*blockDim.x; + + int myBlockStart = blockIdx.x * sharedMemLen; + int myBlockEnd = TNL::min(arr.getSize(), myBlockStart+sharedMemLen); + + //copy from globalMem into sharedMem + for (int i = threadIdx.x; myBlockStart + i < myBlockEnd; i += blockDim.x) + sharedMem[i] = arr[myBlockStart + i]; + __syncthreads(); + + //------------------------------------------ + //bitonic activity + { + int i = blockIdx.x * blockDim.x + threadIdx.x; + int paddedSize = closestPow2(myBlockEnd - myBlockStart); + + for (int monotonicSeqLen = 2; monotonicSeqLen <= paddedSize; monotonicSeqLen *= 2) + { + //calculate the direction of swapping + int monotonicSeqIdx = i / (monotonicSeqLen/2); + bool ascending = (monotonicSeqIdx & 1) != 0; + if ((monotonicSeqIdx + 1) * monotonicSeqLen >= arr.getSize()) //special case for parts with no "partner" + ascending = true; + + for (int len = monotonicSeqLen; len > 1; len /= 2) + { + //calculates which 2 indexes will be compared and swap + int part = threadIdx.x / (len / 2); + int s = part * len + (threadIdx.x & ((len / 2) - 1)); + int e = s + len / 2; + + if(e < myBlockEnd - myBlockStart) //touching virtual padding + cmpSwap(sharedMem[s], sharedMem[e], ascending, Cmp); + __syncthreads(); + } + } + } + + //writeback to global memory + for (int i = threadIdx.x; myBlockStart + i < myBlockEnd; i += blockDim.x) + arr[myBlockStart + i] = sharedMem[i]; +} +#endif + + +template +void bitonicSortWithShared(TNL::Containers::ArrayView view, const CMP &Cmp, + int gridDim, int blockDim, int sharedMemLen, int sharedMemSize) +{ +#ifdef HAVE_CUDA + int paddedSize = closestPow2(view.getSize()); + + bitoniSort1stStepSharedMemory<<>>(view, Cmp); + //now alternating monotonic sequences with bitonicLenght of sharedMemLen + + // \/ has bitonicLength of 2 * sharedMemLen + for (int monotonicSeqLen = 2 * sharedMemLen; monotonicSeqLen <= paddedSize; monotonicSeqLen *= 2) + { + for (int bitonicLen = monotonicSeqLen; bitonicLen > 1; bitonicLen /= 2) + { + if (bitonicLen > sharedMemLen) + { + bitonicMergeGlobal<<>>( + view, Cmp, monotonicSeqLen, bitonicLen); + } + else + { + bitonicMergeSharedMemory<<>>( + view, Cmp, monotonicSeqLen, bitonicLen); + + //simulates sorts until bitonicLen == 2 already, no need to continue this loop + break; + } + } + } + cudaDeviceSynchronize(); +#endif +} + +//--------------------------------------------- + +template +void bitonicSort(TNL::Containers::ArrayView view, + const CMP &Cmp, + int gridDim, int blockDim) + +{ +#ifdef HAVE_CUDA + int paddedSize = closestPow2(view.getSize()); + + for (int monotonicSeqLen = 2; monotonicSeqLen <= paddedSize; monotonicSeqLen *= 2) + { + for (int bitonicLen = monotonicSeqLen; bitonicLen > 1; bitonicLen /= 2) + { + bitonicMergeGlobal<<>>(view, Cmp, monotonicSeqLen, bitonicLen); + } + } + cudaDeviceSynchronize(); +#endif +} + +//--------------------------------------------- +template +void bitonicSort(TNL::Containers::ArrayView src, int begin, int end, const CMP &Cmp) +{ +#ifdef HAVE_CUDA + auto view = src.getView(begin, end); + + int threadsNeeded = view.getSize() / 2 + (view.getSize() % 2 != 0); + + cudaDeviceProp deviceProp; + cudaGetDeviceProperties(&deviceProp, 0); + + const int maxThreadsPerBlock = 512; + + int sharedMemLen = maxThreadsPerBlock * 2; + size_t sharedMemSize = sharedMemLen * sizeof(Value); + + if (sharedMemSize <= deviceProp.sharedMemPerBlock) + { + int blockDim = maxThreadsPerBlock; + int gridDim = threadsNeeded / blockDim + (threadsNeeded % blockDim != 0); + bitonicSortWithShared(view, Cmp, gridDim, blockDim, sharedMemLen, sharedMemSize); + } + else if (sharedMemSize / 2 <= deviceProp.sharedMemPerBlock) + { + int blockDim = maxThreadsPerBlock / 2; //256 + int gridDim = threadsNeeded / blockDim + (threadsNeeded % blockDim != 0); + sharedMemSize /= 2; + sharedMemLen /= 2; + bitonicSortWithShared(view, Cmp, gridDim, blockDim, sharedMemLen, sharedMemSize); + } + else + { + int gridDim = threadsNeeded / maxThreadsPerBlock + (threadsNeeded % maxThreadsPerBlock != 0); + bitonicSort(view, Cmp, gridDim, maxThreadsPerBlock); + } +#endif +} + +//--------------------------------------------- + +template +void bitonicSort(TNL::Containers::ArrayView arr, int begin, int end) +{ + bitonicSort(arr, begin, end, [] __cuda_callable__(const Value &a, const Value &b) { return a < b; }); +} + +template +void bitonicSort(TNL::Containers::ArrayView arr, const CMP &Cmp) +{ + bitonicSort(arr, 0, arr.getSize(), Cmp); +} + +template +void bitonicSort(TNL::Containers::ArrayView arr) +{ + bitonicSort(arr, [] __cuda_callable__(const Value &a, const Value &b) { return a < b; }); +} + +//--------------------------------------------- +template +void bitonicSort(std::vector &vec, int begin, int end, const CMP &Cmp) +{ + TNL::Containers::Array Arr(vec); + auto view = Arr.getView(); + bitonicSort(view, begin, end, Cmp); + + TNL::Algorithms::MultiDeviceMemoryOperations:: + copy(vec.data(), view.getData(), view.getSize()); +} + +template +void bitonicSort(std::vector &vec, int begin, int end) +{ + bitonicSort(vec, begin, end, [] __cuda_callable__(const Value &a, const Value &b) { return a < b; }); +} + +template +void bitonicSort(std::vector &vec, const CMP &Cmp) +{ + bitonicSort(vec, 0, vec.size(), Cmp); +} + +template +void bitonicSort(std::vector &vec) +{ + bitonicSort(vec, [] __cuda_callable__(const Value &a, const Value &b) { return a < b; }); +} + +template +void bitonicSort( TNL::Containers::Array< Value, TNL::Devices::Host > &vec) +{ + bitonicSort(vec, [] __cuda_callable__(const Value &a, const Value &b) { return a < b; }); +} + + +//--------------------------------------------- +//--------------------------------------------- + +#ifdef HAVE_CUDA +template< typename CMP, typename SWAP> +__global__ void bitonicMergeGlobal(int size, CMP Cmp, SWAP Swap, + int monotonicSeqLen, int bitonicLen) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + + int part = i / (bitonicLen / 2); //computes which sorting block this thread belongs to + + //the index of 2 elements that should be compared and swapped + int s = part * bitonicLen + (i & ((bitonicLen / 2) - 1)); + int e = s + bitonicLen / 2; + if (e >= size) //arr[e] is virtual padding and will not be exchanged with + return; + + //calculate the direction of swapping + int partsInSeq = monotonicSeqLen / bitonicLen; + int monotonicSeqIdx = part / partsInSeq; + bool ascending = (monotonicSeqIdx & 1) != 0; + if ((monotonicSeqIdx + 1) * monotonicSeqLen >= size) //special case for part with no "partner" to be merged with in next phase + ascending = true; + + if( ascending == Cmp(e, s) ) + Swap(s, e); +} + +template< typename CMP, typename SWAP > +void bitonicSort(int begin, int end, const CMP &Cmp, SWAP Swap) +{ + int size = end - begin; + int paddedSize = closestPow2(size); + + int threadsNeeded = size / 2 + (size % 2 != 0); + + const int maxThreadsPerBlock = 512; + int threadsPerBlock = maxThreadsPerBlock; + int blocks = threadsNeeded / threadsPerBlock + (threadsNeeded % threadsPerBlock != 0); + + auto compareWithOffset = + [=] __cuda_callable__(int i, int j) { + return Cmp(i + begin, j + begin); + }; + + auto swapWithOffset = + [=] __cuda_callable__(int i, int j) mutable { + Swap(i + begin, j + begin); + }; + + for (int monotonicSeqLen = 2; monotonicSeqLen <= paddedSize; monotonicSeqLen *= 2) + { + for (int bitonicLen = monotonicSeqLen; bitonicLen > 1; bitonicLen /= 2) + { + bitonicMergeGlobal<<>>( + size, compareWithOffset, swapWithOffset, + monotonicSeqLen, bitonicLen); + } + } + cudaDeviceSynchronize(); +} +#endif + } // namespace Sorting + } // namespace Algorithms +} // namespace TNL diff --git a/src/TNL/Algorithms/Sorting/detail/blockBitonicSort.h b/src/TNL/Algorithms/Sorting/detail/blockBitonicSort.h new file mode 100644 index 0000000000000000000000000000000000000000..f0732dcb0d51fc7063e7060272c0f267fbed3d20 --- /dev/null +++ b/src/TNL/Algorithms/Sorting/detail/blockBitonicSort.h @@ -0,0 +1,111 @@ +#pragma once +#include +#include + +namespace TNL { + namespace Algorithms { + namespace Sorting { + +#ifdef HAVE_CUDA + +/** + * IMPORTANT: all threads in block have to call this function to work properly + * the size of src isn't limited, but for optimal efficiency, no more than 8*blockDim.x should be used + * Description: sorts src and writes into dst within a block + * works independently from other concurrent blocks + * @param sharedMem sharedMem pointer has to be able to store all of src elements + * */ +template +__device__ void bitonicSort_Block(TNL::Containers::ArrayView src, + TNL::Containers::ArrayView dst, + Value *sharedMem, const CMP &Cmp) +{ + //copy from globalMem into sharedMem + for (int i = threadIdx.x; i < src.getSize(); i += blockDim.x) + sharedMem[i] = src[i]; + __syncthreads(); + + //------------------------------------------ + //bitonic activity + { + int paddedSize = closestPow2_ptx(src.getSize()); + + for (int monotonicSeqLen = 2; monotonicSeqLen <= paddedSize; monotonicSeqLen *= 2) + { + for (int bitonicLen = monotonicSeqLen; bitonicLen > 1; bitonicLen /= 2) + { + for (int i = threadIdx.x;; i += blockDim.x) //simulates other blocks in case src.size > blockDim.x*2 + { + //calculates which 2 indexes will be compared and swap + int part = i / (bitonicLen / 2); + int s = part * bitonicLen + (i & ((bitonicLen / 2) - 1)); + int e = s + bitonicLen / 2; + + if (e >= src.getSize()) //touching virtual padding, the order dont swap + break; + + //calculate the direction of swapping + int monotonicSeqIdx = i / (monotonicSeqLen / 2); + bool ascending = (monotonicSeqIdx & 1) != 0; + if ((monotonicSeqIdx + 1) * monotonicSeqLen >= src.getSize()) //special case for parts with no "partner" + ascending = true; + + cmpSwap(sharedMem[s], sharedMem[e], ascending, Cmp); + } + + __syncthreads(); //only 1 synchronization needed + } + } + } + + //------------------------------------------ + //writeback to global memory + for (int i = threadIdx.x; i < dst.getSize(); i += blockDim.x) + dst[i] = sharedMem[i]; +} + +/** + * IMPORTANT: all threads in block have to call this function to work properly + * IMPORTANT: unlike the counterpart with shared memory, this function only works in-place + * the size of src isn't limited, but for optimal efficiency, no more than 8*blockDim.x should be used + * Description: sorts src in place using bitonic sort + * works independently from other concurrent blocks + * this version doesnt use shared memory and is prefered for Value with big size + * */ +template +__device__ void bitonicSort_Block(TNL::Containers::ArrayView src, + const CMP &Cmp) +{ + int paddedSize = closestPow2_ptx(src.getSize()); + + for (int monotonicSeqLen = 2; monotonicSeqLen <= paddedSize; monotonicSeqLen *= 2) + { + for (int bitonicLen = monotonicSeqLen; bitonicLen > 1; bitonicLen /= 2) + { + for (int i = threadIdx.x;; i += blockDim.x) //simulates other blocks in case src.size > blockDim.x*2 + { + //calculates which 2 indexes will be compared and swap + int part = i / (bitonicLen / 2); + int s = part * bitonicLen + (i & ((bitonicLen / 2) - 1)); + int e = s + bitonicLen / 2; + + if (e >= src.getSize()) + break; + + //calculate the direction of swapping + int monotonicSeqIdx = i / (monotonicSeqLen / 2); + bool ascending = (monotonicSeqIdx & 1) != 0; + if ((monotonicSeqIdx + 1) * monotonicSeqLen >= src.getSize()) //special case for parts with no "partner" + ascending = true; + + cmpSwap(src[s], src[e], ascending, Cmp); + } + __syncthreads(); + } + } +} + +#endif + } // namespace Sorting + } // namespace Algorithms +} // namespace TNL diff --git a/src/TNL/Algorithms/Sorting/detail/cudaPartition.h b/src/TNL/Algorithms/Sorting/detail/cudaPartition.h new file mode 100644 index 0000000000000000000000000000000000000000..a6afaa20a33a0c687a6814bcfee92546e38086b7 --- /dev/null +++ b/src/TNL/Algorithms/Sorting/detail/cudaPartition.h @@ -0,0 +1,227 @@ +/*************************************************************************** + cudaPartition.h - description + ------------------- + begin : Jul 13, 2021 + copyright : (C) 2021 by Tomas Oberhuber et al. + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Xuan Thang Nguyen + +#pragma once + +#include +#include +#include + +namespace TNL { + namespace Algorithms { + namespace Sorting { + +#ifdef HAVE_CUDA + +template +__device__ Value pickPivot(TNL::Containers::ArrayView src, const CMP &Cmp) +{ + //return src[0]; + //return src[src.getSize()-1]; + + if (src.getSize() == 1) + return src[0]; + + const Value &a = src[0], &b = src[src.getSize() / 2], &c = src[src.getSize() - 1]; + + if (Cmp(a, b)) // ..a..b.. + { + if (Cmp(b, c)) // ..a..b..c + return b; + else if (Cmp(c, a)) //..c..a..b.. + return a; + else //..a..c..b.. + return c; + } + else //..b..a.. + { + if (Cmp(a, c)) //..b..a..c + return a; + else if (Cmp(c, b)) //..c..b..a.. + return b; + else //..b..c..a.. + return c; + } +} + +template +__device__ int pickPivotIdx(TNL::Containers::ArrayView src, const CMP &Cmp) +{ + //return 0; + //return src.getSize()-1; + + if (src.getSize() <= 1) + return 0; + + const Value &a = src[0], &b = src[src.getSize() / 2], &c = src[src.getSize() - 1]; + + if (Cmp(a, b)) // ..a..b.. + { + if (Cmp(b, c)) // ..a..b..c + return src.getSize() / 2; + else if (Cmp(c, a)) //..c..a..b.. + return 0; + else //..a..c..b.. + return src.getSize() - 1; + } + else //..b..a.. + { + if (Cmp(a, c)) //..b..a..c + return 0; + else if (Cmp(c, b)) //..c..b..a.. + return src.getSize() / 2; + else //..b..c..a.. + return src.getSize() - 1; + } +} + +//----------------------------------------------------------- + +template +__device__ void countElem( Containers::ArrayView arr, + const CMP &Cmp, + int &smaller, int &bigger, + const Value &pivot) +{ + for (int i = threadIdx.x; i < arr.getSize(); i += blockDim.x) + { + const Value &data = arr[i]; + if (Cmp(data, pivot)) + smaller++; + else if (Cmp(pivot, data)) + bigger++; + } +} + +//----------------------------------------------------------- + +template +__device__ void copyDataShared( Containers::ArrayView src, + Containers::ArrayView dst, + const CMP &Cmp, + Value *sharedMem, + int smallerStart, int biggerStart, + int smallerTotal, int biggerTotal, + int smallerOffset, int biggerOffset, //exclusive prefix sum of elements + const Value &pivot) +{ + + for (int i = threadIdx.x; i < src.getSize(); i += blockDim.x) + { + const Value &data = src[i]; + if (Cmp(data, pivot)) + sharedMem[smallerOffset++] = data; + else if (Cmp(pivot, data)) + sharedMem[smallerTotal + biggerOffset++] = data; + } + __syncthreads(); + + for (int i = threadIdx.x; i < smallerTotal + biggerTotal; i += blockDim.x) + { + if (i < smallerTotal) + dst[smallerStart + i] = sharedMem[i]; + else + dst[biggerStart + i - smallerTotal] = sharedMem[i]; + } +} + +template +__device__ void copyData( Containers::ArrayView src, + Containers::ArrayView dst, + const CMP &Cmp, + int smallerStart, int biggerStart, + const Value &pivot) +{ + for (int i = threadIdx.x; i < src.getSize(); i += blockDim.x) + { + const Value &data = src[i]; + if (Cmp(data, pivot)) + { + /* + if(smallerStart >= dst.getSize() || smallerStart < 0) + printf("failed smaller: b:%d t:%d: tried to write into [%d]/%d\n", blockDim.x, threadIdx.x, smallerStart, dst.getSize()); + */ + dst[smallerStart++] = data; + } + else if (Cmp(pivot, data)) + { + /* + if(biggerStart >= dst.getSize() || biggerStart < 0) + printf("failed bigger: b:%d t:%d: tried to write into [%d]/%d\n", blockDim.x, threadIdx.x, biggerStart, dst.getSize()); + */ + dst[biggerStart++] = data; + } + } +} + +//---------------------------------------------------------------------------------- + +template +__device__ void cudaPartition( Containers::ArrayView src, + Containers::ArrayView dst, + const CMP &Cmp, + Value *sharedMem, + const Value &pivot, + int elemPerBlock, TASK &task) +{ + static __shared__ int smallerStart, biggerStart; + + int myBegin = elemPerBlock * (blockIdx.x - task.firstBlock); + int myEnd = TNL::min(myBegin + elemPerBlock, src.getSize()); + + auto srcView = src.getView(myBegin, myEnd); + + //------------------------------------------------------------------------- + + int smaller = 0, bigger = 0; + countElem(srcView, Cmp, smaller, bigger, pivot); + + int smallerPrefSumInc = blockInclusivePrefixSum(smaller); + int biggerPrefSumInc = blockInclusivePrefixSum(bigger); + + if (threadIdx.x == blockDim.x - 1) //last thread in block has sum of all values + { + smallerStart = atomicAdd(&(task.dstBegin), smallerPrefSumInc); + biggerStart = atomicAdd(&(task.dstEnd), -biggerPrefSumInc) - biggerPrefSumInc; + } + __syncthreads(); + + //----------------------------------------------------------- + if (useShared) + { + static __shared__ int smallerTotal, biggerTotal; + if (threadIdx.x == blockDim.x - 1) + { + smallerTotal = smallerPrefSumInc; + biggerTotal = biggerPrefSumInc; + } + __syncthreads(); + + copyDataShared(srcView, dst, Cmp, sharedMem, + smallerStart, biggerStart, + smallerTotal, biggerTotal, + smallerPrefSumInc - smaller, biggerPrefSumInc - bigger, //exclusive prefix sum of elements + pivot); + } + else + { + int destSmaller = smallerStart + smallerPrefSumInc - smaller; + int destBigger = biggerStart + biggerPrefSumInc - bigger; + copyData(srcView, dst, Cmp, destSmaller, destBigger, pivot); + } +} + +#endif + + } // namespace Sorting + } // namespace Algorithms +} // namespace TNL diff --git a/src/TNL/Algorithms/Sorting/detail/helpers.h b/src/TNL/Algorithms/Sorting/detail/helpers.h new file mode 100644 index 0000000000000000000000000000000000000000..2d7dbbcc7b04c0c922927f215d6f51bf185315b0 --- /dev/null +++ b/src/TNL/Algorithms/Sorting/detail/helpers.h @@ -0,0 +1,59 @@ +/*************************************************************************** + helpers.h - description + ------------------- + begin : Jul 13, 2021 + copyright : (C) 2021 by Tomas Oberhuber et al. + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Xuan Thang Nguyen + +#pragma once +#include + +namespace TNL { + namespace Algorithms { + namespace Sorting { + +#ifdef HAVE_CUDA + +// Inline PTX call to return index of highest non-zero bit in a word +static __device__ __forceinline__ unsigned int __btflo(unsigned int word) +{ + unsigned int ret; + asm volatile("bfind.u32 %0, %1;" + : "=r"(ret) + : "r"(word)); + return ret; +} + +__device__ int closestPow2_ptx(int bitonicLen) +{ + return 1 << (__btflo((unsigned)bitonicLen - 1U) + 1); +} + +__host__ __device__ int closestPow2(int x) +{ + if (x == 0) + return 0; + + int ret = 1; + while (ret < x) + ret <<= 1; + + return ret; +} + +template +__cuda_callable__ void cmpSwap(Value &a, Value &b, bool ascending, const CMP &Cmp) +{ + if (ascending == Cmp(b, a)) + TNL::swap(a, b); +} + +#endif + } //namespace Sorting + } //namespace Algorithms +} // namespace TNL \ No newline at end of file diff --git a/src/TNL/Algorithms/Sorting/detail/quicksort_1Block.h b/src/TNL/Algorithms/Sorting/detail/quicksort_1Block.h new file mode 100644 index 0000000000000000000000000000000000000000..efca29f2429fd54b852384793d67538b4ab0356d --- /dev/null +++ b/src/TNL/Algorithms/Sorting/detail/quicksort_1Block.h @@ -0,0 +1,255 @@ +/*************************************************************************** + quicksort_1Block.h - description + ------------------- + begin : Jul 13, 2021 + copyright : (C) 2021 by Tomas Oberhuber et al. + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Xuan Thang Nguyen + +#pragma once + +#include +#include "cassert" +#include +#include +#include + +namespace TNL { + namespace Algorithms { + namespace Sorting { + +#ifdef HAVE_CUDA + +template +__device__ void externSort( Containers::ArrayView src, + Containers::ArrayView dst, + const CMP &Cmp, Value *sharedMem) +{ + bitonicSort_Block(src, dst, sharedMem, Cmp); +} + +template +__device__ void externSort( Containers::ArrayView src, + const CMP &Cmp) +{ + bitonicSort_Block(src, Cmp); +} + +//--------------------------------------------------------------- + +template +__device__ void stackPush(int stackArrBegin[], int stackArrEnd[], + int stackDepth[], int &stackTop, + int begin, int pivotBegin, + int pivotEnd, int end, + int iteration); + +//--------------------------------------------------------------- + +template +__device__ void singleBlockQuickSort( Containers::ArrayView arr, + Containers::ArrayView aux, + const CMP &Cmp, int _iteration, + Value *sharedMem, int memSize, + int maxBitonicSize) +{ + if (arr.getSize() <= maxBitonicSize) + { + auto &src = (_iteration & 1) == 0 ? arr : aux; + if (useShared && arr.getSize() <= memSize) + externSort(src, arr, Cmp, sharedMem); + else + { + externSort(src, Cmp); + //extern sort without shared memory only works in-place, need to copy into from aux + if ((_iteration & 1) != 0) + for (int i = threadIdx.x; i < arr.getSize(); i += blockDim.x) + arr[i] = src[i]; + } + + return; + } + + static __shared__ int stackTop; + static __shared__ int stackArrBegin[stackSize], stackArrEnd[stackSize], stackDepth[stackSize]; + static __shared__ int begin, end, iteration; + static __shared__ int pivotBegin, pivotEnd; + Value *piv = sharedMem; + sharedMem += 1; + + if (threadIdx.x == 0) + { + stackTop = 0; + stackArrBegin[stackTop] = 0; + stackArrEnd[stackTop] = arr.getSize(); + stackDepth[stackTop] = _iteration; + stackTop++; + } + __syncthreads(); + + while (stackTop > 0) + { + //pick up partition to break up + if (threadIdx.x == 0) + { + begin = stackArrBegin[stackTop - 1]; + end = stackArrEnd[stackTop - 1]; + iteration = stackDepth[stackTop - 1]; + stackTop--; + } + __syncthreads(); + + int size = end - begin; + auto &src = (iteration & 1) == 0 ? arr : aux; + + //small enough for for bitonic + if (size <= maxBitonicSize) + { + if (useShared && size <= memSize) + externSort(src.getView(begin, end), arr.getView(begin, end), Cmp, sharedMem); + else + { + externSort(src.getView(begin, end), Cmp); + //extern sort without shared memory only works in-place, need to copy into from aux + if ((iteration & 1) != 0) + for (int i = threadIdx.x; i < src.getSize(); i += blockDim.x) + arr[begin + i] = src[i]; + } + __syncthreads(); + continue; + } + + //------------------------------------------------------ + + if (threadIdx.x == 0) + *piv = pickPivot(src.getView(begin, end), Cmp); + __syncthreads(); + Value &pivot = *piv; + + int smaller = 0, bigger = 0; + countElem(src.getView(begin, end), Cmp, smaller, bigger, pivot); + + //synchronization is in this function already + int smallerPrefSumInc = blockInclusivePrefixSum(smaller); + int biggerPrefSumInc = blockInclusivePrefixSum(bigger); + + if (threadIdx.x == blockDim.x - 1) //has sum of all smaller and greater elements than pivot in src + { + pivotBegin = 0 + smallerPrefSumInc; + pivotEnd = size - biggerPrefSumInc; + } + __syncthreads(); + + //-------------------------------------------------------------- + /** + * move elements, either use shared mem for coalesced access or without shared mem if data is too big + * */ + + auto &dst = (iteration & 1) == 0 ? aux : arr; + + if (useShared && size <= memSize) + { + static __shared__ int smallerTotal, biggerTotal; + if (threadIdx.x == blockDim.x - 1) + { + smallerTotal = smallerPrefSumInc; + biggerTotal = biggerPrefSumInc; + } + __syncthreads(); + + copyDataShared(src.getView(begin, end), dst.getView(begin, end), + Cmp, sharedMem, + 0, pivotEnd, + smallerTotal, biggerTotal, + smallerPrefSumInc - smaller, biggerPrefSumInc - bigger, //exclusive prefix sum of elements + pivot); + } + else + { + int destSmaller = 0 + (smallerPrefSumInc - smaller); + int destBigger = pivotEnd + (biggerPrefSumInc - bigger); + + copyData(src.getView(begin, end), dst.getView(begin, end), Cmp, destSmaller, destBigger, pivot); + } + + __syncthreads(); + + for (int i = pivotBegin + threadIdx.x; i < pivotEnd; i += blockDim.x) + arr[begin + i] = pivot; + + //creates new tasks + if (threadIdx.x == 0) + { + stackPush(stackArrBegin, stackArrEnd, stackDepth, stackTop, + begin, begin + pivotBegin, + begin + pivotEnd, end, + iteration); + } + __syncthreads(); //sync to update stackTop + } //ends while loop +} + +//-------------------------------------------------------------- + +template +__device__ void stackPush(int stackArrBegin[], int stackArrEnd[], + int stackDepth[], int &stackTop, + int begin, int pivotBegin, + int pivotEnd, int end, + int iteration) +{ + int sizeL = pivotBegin - begin, sizeR = end - pivotEnd; + + //push the bigger one 1st and then smaller one 2nd + //in next iteration, the smaller part will be handled 1st + if (sizeL > sizeR) + { + if (sizeL > 0) //left from pivot are smaller elems + { + stackArrBegin[stackTop] = begin; + stackArrEnd[stackTop] = pivotBegin; + stackDepth[stackTop] = iteration + 1; + stackTop++; + } + + if (sizeR > 0) //right from pivot until end are elem greater than pivot + { + assert(stackTop < stackSize && "Local quicksort stack overflow."); + + stackArrBegin[stackTop] = pivotEnd; + stackArrEnd[stackTop] = end; + stackDepth[stackTop] = iteration + 1; + stackTop++; + } + } + else + { + if (sizeR > 0) //right from pivot until end are elem greater than pivot + { + stackArrBegin[stackTop] = pivotEnd; + stackArrEnd[stackTop] = end; + stackDepth[stackTop] = iteration + 1; + stackTop++; + } + + if (sizeL > 0) //left from pivot are smaller elems + { + assert(stackTop < stackSize && "Local quicksort stack overflow."); + + stackArrBegin[stackTop] = begin; + stackArrEnd[stackTop] = pivotBegin; + stackDepth[stackTop] = iteration + 1; + stackTop++; + } + } +} + +#endif + + } // namespace Sorting + } // namespace Algorithms +} // namespace TNL diff --git a/src/TNL/Algorithms/Sorting/detail/quicksort_kernel.h b/src/TNL/Algorithms/Sorting/detail/quicksort_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..8d26d0637bd55c12751639b36d288e890debf00e --- /dev/null +++ b/src/TNL/Algorithms/Sorting/detail/quicksort_kernel.h @@ -0,0 +1,267 @@ +/*************************************************************************** + quicksort_kernel.h - description + ------------------- + begin : Jul 13, 2021 + copyright : (C) 2021 by Tomas Oberhuber et al. + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Xuan Thang Nguyen + +#pragma once + +#include +#include +#include +#include +#include +#include + + +namespace TNL { + namespace Algorithms { + namespace Sorting { + +#ifdef HAVE_CUDA + +__device__ void writeNewTask(int begin, int end, int iteration, int maxElemFor2ndPhase, + Containers::ArrayView newTasks, int *newTasksCnt, + Containers::ArrayView secondPhaseTasks, int *secondPhaseTasksCnt); + +//----------------------------------------------------------- + +__global__ void cudaCalcBlocksNeeded(Containers::ArrayView cuda_tasks, int elemPerBlock, + Containers::VectorView blocksNeeded) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= cuda_tasks.getSize()) + return; + + TASK &task = cuda_tasks[i]; + int size = task.partitionEnd - task.partitionBegin; + blocksNeeded[i] = size / elemPerBlock + (size % elemPerBlock != 0); +} + +//----------------------------------------------------------- + +template +__global__ void cudaInitTask(Containers::ArrayView cuda_tasks, + Containers::ArrayView cuda_blockToTaskMapping, + Containers::VectorView cuda_reductionTaskInitMem, + Containers::ArrayView src, CMP Cmp) +{ + if (blockIdx.x >= cuda_tasks.getSize()) + return; + + int start = blockIdx.x == 0 ? 0 : cuda_reductionTaskInitMem[blockIdx.x - 1]; + int end = cuda_reductionTaskInitMem[blockIdx.x]; + for (int i = start + threadIdx.x; i < end; i += blockDim.x) + cuda_blockToTaskMapping[i] = blockIdx.x; + + if (threadIdx.x == 0) + { + TASK &task = cuda_tasks[blockIdx.x]; + int pivotIdx = task.partitionBegin + pickPivotIdx(src.getView(task.partitionBegin, task.partitionEnd), Cmp); + task.initTask(start, end - start, pivotIdx); + } +} + +//---------------------------------------------------- + +template +__global__ void cudaQuickSort1stPhase(Containers::ArrayView arr, Containers::ArrayView aux, + const CMP &Cmp, int elemPerBlock, + Containers::ArrayView tasks, + Containers::ArrayView taskMapping) +{ + extern __shared__ int externMem[]; + Value *piv = (Value *)externMem; + Value *sharedMem = piv + 1; + + TASK &myTask = tasks[taskMapping[blockIdx.x]]; + auto &src = (myTask.iteration & 1) == 0 ? arr : aux; + auto &dst = (myTask.iteration & 1) == 0 ? aux : arr; + + if (threadIdx.x == 0) + *piv = src[myTask.pivotIdx]; + __syncthreads(); + Value &pivot = *piv; + + cudaPartition( + src.getView(myTask.partitionBegin, myTask.partitionEnd), + dst.getView(myTask.partitionBegin, myTask.partitionEnd), + Cmp, sharedMem, pivot, + elemPerBlock, myTask); +} + +//---------------------------------------------------- + +template +__global__ void cudaWritePivot(Containers::ArrayView arr, Containers::ArrayView aux, int maxElemFor2ndPhase, + Containers::ArrayView tasks, Containers::ArrayView newTasks, int *newTasksCnt, + Containers::ArrayView secondPhaseTasks, int *secondPhaseTasksCnt) +{ + extern __shared__ int externMem[]; + Value *piv = (Value *)externMem; + + TASK &myTask = tasks[blockIdx.x]; + + if (threadIdx.x == 0) + *piv = (myTask.iteration & 1) == 0 ? arr[myTask.pivotIdx] : aux[myTask.pivotIdx]; + __syncthreads(); + Value &pivot = *piv; + + int leftBegin = myTask.partitionBegin, leftEnd = myTask.partitionBegin + myTask.dstBegin; + int rightBegin = myTask.partitionBegin + myTask.dstEnd, rightEnd = myTask.partitionEnd; + + for (int i = leftEnd + threadIdx.x; i < rightBegin; i += blockDim.x) + { + /* + #ifdef DEBUG + aux[i] = -1; + #endif + */ + arr[i] = pivot; + } + + if (threadIdx.x != 0) + return; + + if (leftEnd - leftBegin > 0) + { + writeNewTask(leftBegin, leftEnd, myTask.iteration, + maxElemFor2ndPhase, + newTasks, newTasksCnt, + secondPhaseTasks, secondPhaseTasksCnt); + } + + if (rightEnd - rightBegin > 0) + { + writeNewTask(rightBegin, rightEnd, + myTask.iteration, maxElemFor2ndPhase, + newTasks, newTasksCnt, + secondPhaseTasks, secondPhaseTasksCnt); + } +} + +//----------------------------------------------------------- + +__device__ void writeNewTask(int begin, int end, int iteration, int maxElemFor2ndPhase, + Containers::ArrayView newTasks, int *newTasksCnt, + Containers::ArrayView secondPhaseTasks, int *secondPhaseTasksCnt) +{ + int size = end - begin; + if (size < 0) + { + printf("negative size, something went really wrong\n"); + return; + } + + if (size == 0) + return; + + if (size <= maxElemFor2ndPhase) + { + int idx = atomicAdd(secondPhaseTasksCnt, 1); + if (idx < secondPhaseTasks.getSize()) + secondPhaseTasks[idx] = TASK(begin, end, iteration + 1); + else + { + //printf("ran out of memory, trying backup\n"); + int idx = atomicAdd(newTasksCnt, 1); + if (idx < newTasks.getSize()) + newTasks[idx] = TASK(begin, end, iteration + 1); + else + printf("ran out of memory for second phase task, there isnt even space in newTask list\nPart of array may stay unsorted!!!\n"); + } + } + else + { + int idx = atomicAdd(newTasksCnt, 1); + if (idx < newTasks.getSize()) + newTasks[idx] = TASK(begin, end, iteration + 1); + else + { + //printf("ran out of memory, trying backup\n"); + int idx = atomicAdd(secondPhaseTasksCnt, 1); + if (idx < secondPhaseTasks.getSize()) + secondPhaseTasks[idx] = TASK(begin, end, iteration + 1); + else + printf("ran out of memory for newtask, there isnt even space in second phase task list\nPart of array may stay unsorted!!!\n"); + } + } +} + +//----------------------------------------------------------- + +template +__global__ void cudaQuickSort2ndPhase(Containers::ArrayView arr, Containers::ArrayView aux, + CMP Cmp, + Containers::ArrayView secondPhaseTasks, + int elemInShared, int maxBitonicSize) +{ + extern __shared__ int externMem[]; + Value *sharedMem = (Value *)externMem; + + TASK &myTask = secondPhaseTasks[blockIdx.x]; + if (myTask.partitionEnd - myTask.partitionBegin <= 0) + { + //printf("empty task???\n"); + return; + } + + auto arrView = arr.getView(myTask.partitionBegin, myTask.partitionEnd); + auto auxView = aux.getView(myTask.partitionBegin, myTask.partitionEnd); + + if (elemInShared == 0) + { + singleBlockQuickSort(arrView, auxView, Cmp, myTask.iteration, sharedMem, 0, maxBitonicSize); + } + else + { + singleBlockQuickSort(arrView, auxView, Cmp, myTask.iteration, sharedMem, elemInShared, maxBitonicSize); + } +} + +template +__global__ void cudaQuickSort2ndPhase(Containers::ArrayView arr, Containers::ArrayView aux, + CMP Cmp, + Containers::ArrayView secondPhaseTasks1, + Containers::ArrayView secondPhaseTasks2, + int elemInShared, int maxBitonicSize) +{ + extern __shared__ int externMem[]; + Value *sharedMem = (Value *)externMem; + + TASK myTask; + if (blockIdx.x < secondPhaseTasks1.getSize()) + myTask = secondPhaseTasks1[blockIdx.x]; + else + myTask = secondPhaseTasks2[blockIdx.x - secondPhaseTasks1.getSize()]; + + if (myTask.partitionEnd - myTask.partitionBegin <= 0) + { + //printf("empty task???\n"); + return; + } + + auto arrView = arr.getView(myTask.partitionBegin, myTask.partitionEnd); + auto auxView = aux.getView(myTask.partitionBegin, myTask.partitionEnd); + + if (elemInShared <= 0) + { + singleBlockQuickSort(arrView, auxView, Cmp, myTask.iteration, sharedMem, 0, maxBitonicSize); + } + else + { + singleBlockQuickSort(arrView, auxView, Cmp, myTask.iteration, sharedMem, elemInShared, maxBitonicSize); + } +} + +#endif + + } // namespace Sorting + } // namespace Algorithms +} // namespace TNL diff --git a/src/TNL/Algorithms/Sorting/detail/reduction.h b/src/TNL/Algorithms/Sorting/detail/reduction.h new file mode 100644 index 0000000000000000000000000000000000000000..e2bf148099e830bae96264ab4cda5faa5b964181 --- /dev/null +++ b/src/TNL/Algorithms/Sorting/detail/reduction.h @@ -0,0 +1,138 @@ +/*************************************************************************** + reduction.h - description + ------------------- + begin : Jul 13, 2021 + copyright : (C) 2021 by Tomas Oberhuber et al. + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Xuan Thang Nguyen + +#pragma once + +namespace TNL { + namespace Algorithms { + namespace Sorting { + +#ifdef HAVE_CUDA + +/** + * https://developer.nvidia.com/blog/faster-parallel-reductions-kepler/ + * */ + + +__device__ int warpReduceSum(int initVal) +{ + const unsigned int maskConstant = 0xffffffff; //not used + for (unsigned int mask = warpSize / 2; mask > 0; mask >>= 1) + initVal += __shfl_xor_sync(maskConstant, initVal, mask); + + return initVal; +} + +__device__ int blockReduceSum(int val) +{ + static __shared__ int shared[32]; + int lane = threadIdx.x & (warpSize - 1); + int wid = threadIdx.x / warpSize; + + val = warpReduceSum(val); + + if (lane == 0) + shared[wid] = val; + __syncthreads(); + + val = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0; + + if (wid == 0) + val = warpReduceSum(val); + + if(threadIdx.x == 0) + shared[0] = val; + __syncthreads(); + + return shared[0]; +} + +//------------------------------------------------------------------------------- + +__device__ int warpInclusivePrefixSum(int value) +{ + int laneId = threadIdx.x & (32-1); + + #pragma unroll + for (int i = 1; i*2 <= 32; i *= 2)//32 here is warp size + { + int n = __shfl_up_sync(0xffffffff, value, i); + if ((laneId & (warpSize - 1)) >= i) + value += n; + } + + return value; +} + +__device__ int blockInclusivePrefixSum(int value) +{ + static __shared__ int shared[32]; + int lane = threadIdx.x & (warpSize - 1); + int wid = threadIdx.x / warpSize; + + int tmp = warpInclusivePrefixSum(value); + + if (lane == warpSize-1) + shared[wid] = tmp; + __syncthreads(); + + int tmp2 = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0; + if (wid == 0) + shared[lane] = warpInclusivePrefixSum(tmp2) - tmp2; + __syncthreads(); + + tmp += shared[wid]; + return tmp; +} + +//-------------------------------------------------------------------- + +template +__device__ int warpCmpReduce(int initVal, const Operator & Cmp) +{ + const unsigned int maskConstant = 0xffffffff; //not used + for (unsigned int mask = warpSize / 2; mask > 0; mask >>= 1) + initVal = Cmp(initVal, __shfl_xor_sync(maskConstant, initVal, mask)); + + return initVal; +} + +template +__device__ int blockCmpReduce(int val, const Operator & Cmp) +{ + static __shared__ int shared[32]; + int lane = threadIdx.x & (warpSize - 1); + int wid = threadIdx.x / warpSize; + + val = warpCmpReduce(val, Cmp); + + if (lane == 0) + shared[wid] = val; + __syncthreads(); + + val = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : shared[0]; + + if (wid == 0) + val = warpCmpReduce(val, Cmp); + + if(threadIdx.x == 0) + shared[0] = val; + __syncthreads(); + + return shared[0]; +} + +#endif + + } // namespace Sorting + } // namespace Algorithms +} // namespace TNL \ No newline at end of file diff --git a/src/TNL/Algorithms/Sorting/detail/task.h b/src/TNL/Algorithms/Sorting/detail/task.h new file mode 100644 index 0000000000000000000000000000000000000000..bf2130b2eac068877487321d3a1501a0a91e71fd --- /dev/null +++ b/src/TNL/Algorithms/Sorting/detail/task.h @@ -0,0 +1,68 @@ +/*************************************************************************** + TASK.h - description + ------------------- + begin : Jul 13, 2021 + copyright : (C) 2021 by Tomas Oberhuber et al. + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Xuan Thang Nguyen + +#pragma once + +namespace TNL { + namespace Algorithms { + namespace Sorting { + +struct TASK +{ + //start and end position of array to read and write from + int partitionBegin, partitionEnd; + //----------------------------------------------- + //helper variables for blocks working on this task + + int iteration; + int pivotIdx; + int dstBegin, dstEnd; + int firstBlock, blockCount;//for workers read only values + + __cuda_callable__ + TASK(int begin, int end, int iteration) + : partitionBegin(begin), partitionEnd(end), + iteration(iteration), pivotIdx(-1), + dstBegin(-151561), dstEnd(-151561), + firstBlock(-100), blockCount(-100) + {} + + __cuda_callable__ + void initTask(int firstBlock, int blocks, int pivotIdx) + { + dstBegin= 0; dstEnd = partitionEnd - partitionBegin; + this->firstBlock = firstBlock; + blockCount = blocks; + this->pivotIdx = pivotIdx; + } + + __cuda_callable__ + int getSize() const + { + return partitionEnd - partitionBegin; + } + + TASK() = default; +}; + +std::ostream& operator<<(std::ostream & out, const TASK & task) +{ + out << "[ "; + out << task.partitionBegin << " - " << task.partitionEnd; + out << " | " << "iteration: " << task.iteration; + out << " | " << "pivotIdx: " << task.pivotIdx; + return out << " ] "; +} + + } // namespace Sorting + } // namespace Algorithms +} // namespace TNL \ No newline at end of file diff --git a/src/TNL/Algorithms/sort.h b/src/TNL/Algorithms/sort.h new file mode 100644 index 0000000000000000000000000000000000000000..131561760e3c655f4fd690187948d96e69297ea3 --- /dev/null +++ b/src/TNL/Algorithms/sort.h @@ -0,0 +1,217 @@ +/*************************************************************************** + sort.h - description + ------------------- + begin : Jul 12, 2021 + copyright : (C) 2021 by Tomas Oberhuber et al. + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Tomas Oberhuber, Xuan Thang Nguyen + +#pragma once + +#include + +namespace TNL { + namespace Algorithms { + +/** + * \brief Function for sorting elements of array or vector in ascending order. + * + * \tparam Array is a type of container to be sorted. It can be, for example, \ref TNL::Containers::Array, \ref TNL::Containers::ArrayView, + * \ref TNL::Containers::Vector, \ref TNL::Containers::VectorView. + * \tparam Sorter is an algorithm for sorting. It can be, \ref TNL::Algorithms::Sorting::STLSort for sorting on host and \ref TNL::Algorithms::Sorting::Quicksort + * or \ref TNL::Algorithms::Sorting::BitonicSort for sorting on CUDA GPU. + * + * \param array is an instance of array/array view/vector/vector view for sorting. + * \param sorter is an instance of sorter. + * + * \par Example + * + * \includelineno SortingExample.cpp + * + * \par Output + * + * \include SortingExample.out + * + */ +template< typename Array, + typename Sorter = typename Sorting::DefaultSorter< typename Array::DeviceType >::SorterType > +void ascendingSort( Array& array, const Sorter& sorter = Sorter{} ) +{ + using ValueType = typename Array::ValueType; + sorter.sort( array, [] __cuda_callable__ ( const ValueType& a, const ValueType& b ) { return a < b; } ); +} + +/** + * \brief Function for sorting elements of array or vector in descending order. + * + * \tparam Array is a type of container to be sorted. It can be, for example, \ref TNL::Containers::Array, \ref TNL::Containers::ArrayView, + * \ref TNL::Containers::Vector, \ref TNL::Containers::VectorView. + * \tparam Sorter is an algorithm for sorting. It can be, \ref TNL::Algorithms::Sorting::STLSort for sorting on host and \ref TNL::Algorithms::Sorting::Quicksort + * or \ref TNL::Algorithms::Sorting::BitonicSort for sorting on CUDA GPU. + * + * \param array is an instance of array/array view/vector/vector view for sorting. + * \param sorter is an instance of sorter. + * + * \par Example + * + * \includelineno SortingExample.cpp + * + * \par Output + * + * \include SortingExample.out + * + */ +template< typename Array, + typename Sorter = typename Sorting::DefaultSorter< typename Array::DeviceType >::SorterType > +void descendingSort( Array& array, const Sorter& sorter = Sorter{} ) +{ + using ValueType = typename Array::ValueType; + sorter.sort( array, [] __cuda_callable__ ( const ValueType& a, const ValueType& b ) { return a < b; } ); +} + +/** + * \brief Function for sorting elements of array or vector based on a user defined comparison lambda function. + * + * \tparam Array is a type of container to be sorted. It can be, for example, \ref TNL::Containers::Array, \ref TNL::Containers::ArrayView, + * \ref TNL::Containers::Vector, \ref TNL::Containers::VectorView. + * \tparam Compare is a lambda function for comparing of two elements. It returns true if the first argument should be ordered before the second. The + * lambda function is supposed to be defined as follows (`ValueType` is type of the array elements): + * ``` + * auto compare = [] __cuda_callable__ ( const ValueType& a , const ValueType& b ) -> bool { return .... }; + * ``` + * \tparam Sorter is an algorithm for sorting. It can be, \ref TNL::Algorithms::Sorting::STLSort for sorting on host and \ref TNL::Algorithms::Sorting::Quicksort + * or \ref TNL::Algorithms::Sorting::BitonicSort for sorting on CUDA GPU. + * + * \param array is an instance of array/array view/vector/vector view for sorting. + * \param compare is an instance of the lambda function for comparison of two elements. + * \param sorter is an instance of sorter. + * + * \par Example + * + * \includelineno SortingExample2.cpp + * + * \par Output + * + * \include SortingExample2.out + * + */ +template< typename Array, + typename Compare, + typename Sorter = typename Sorting::DefaultSorter< typename Array::DeviceType >::SorterType > +void sort( Array& array, const Compare& compare, const Sorter& sorter = Sorter{} ) +{ + sorter.sort( array, compare ); +} + +/** + * \brief Function for general sorting based on lambda functions for comparison and swaping of two elements.. + * + * \tparam Device is device on which the sorting algorithms should be executed. + * \tparam Index is type used for indexing of the sorted data. + * \tparam Compare is a lambda function for comparing of two elements. It returns true if the first argument should be ordered before the second - both are given + * by indices representing their positions. The lambda function is supposed to be defined as follows: + * ``` + * auto compare = [=] __cuda_callable__ ( const Index& a , const Index& b ) -> bool { return .... }; + * ``` + * \tparam Swap is a lambda function for swaping of two elements which are ordered wrong way. Both elements are represented by indeces as well. It supposed to + * be defined as: + * ``` + * auto swap = [=] __cuda_callable__ ( const Index& a , const Index& b ) mutable { swap( ....); }; + * ``` + * \tparam Sorter is an algorithm for sorting. It can be \ref TNL::Algorithms::Sorting::BitonicSort for sorting on CUDA GPU. Currently there is no algorithm for CPU :(. + * + * \param array is an instance of array/array view/vector/vector view for sorting. + * \param compare is an instance of the lambda function for comparison of two elements. + * \param sorter is an instance of sorter. + * + * \par Example + * + * \includelineno SortingExample3.cpp + * + * \par Output + * + * \include SortingExample3.out + * + */ +template< typename Device, + typename Index, + typename Compare, + typename Swap, + typename Sorter = typename Sorting::DefaultInplaceSorter< Device >::SorterType > +void sort( const Index begin, const Index end, Compare&& compare, Swap&& swap, const Sorter& sorter = Sorter{} ) +{ + sorter.template inplaceSort< Device, Index >( begin, end, compare, swap ); +} + +/** + * \brief Functions returning true if the array elements are sorted according to the lmabda function `comparison`. + * + * \tparam Array is the type of array/vector. It can be, for example, \ref TNL::Containers::Array, \ref TNL::Containers::ArrayView, + * \ref TNL::Containers::Vector, \ref TNL::Containers::VectorView. + * \tparam Compare is a lambda function for comparing of two elements. It returns true if the first argument should be ordered before the second - both are given + * by indices representing their positions. The lambda function is supposed to be defined as follows: + * ``` + * auto compare = [=] __cuda_callable__ ( const Index& a , const Index& b ) -> bool { return .... }; + * ``` + * \param arr is an instance of tested array. + * \param compare is an instance of the lambda function for elements comparison. + * \param sorter is an instance of sorter. + * + * \return true if the array is sorted in ascending order. + * \return false if the array is NOT sorted in ascending order. + */ +template< typename Array, typename Compare > +bool isSorted( const Array& arr, const Compare& compare ) +{ + using Device = typename Array::DeviceType; + if (arr.getSize() <= 1) + return true; + + auto view = arr.getConstView(); + auto fetch = [=] __cuda_callable__(int i) { return ! compare( view[ i ], view[ i - 1 ] ); }; + auto reduction = [] __cuda_callable__(bool a, bool b) { return a && b; }; + return TNL::Algorithms::reduce< Device >( 1, arr.getSize(), fetch, reduction, true ); +} + +/** + * \brief Functions returning true if the array elements are sorted in ascending order. + * + * \tparam Array is the type of array/vector. It can be, for example, \ref TNL::Containers::Array, \ref TNL::Containers::ArrayView, + * \ref TNL::Containers::Vector, \ref TNL::Containers::VectorView. + * + * \param arr is an instance of tested array. + * + * \return true if the array is sorted in ascending order. + * \return false if the array is NOT sorted in ascending order. + */ +template< typename Array > +bool isAscending( const Array& arr ) +{ + using Value = typename Array::ValueType; + return isSorted( arr, [] __cuda_callable__( const Value& a, const Value& b ) { return a < b; }); +} + +/** + * \brief Functions returning true if the array elements are sorted in descending order. + * + * \tparam Array is the type of array/vector. It can be, for example, \ref TNL::Containers::Array, \ref TNL::Containers::ArrayView, + * \ref TNL::Containers::Vector, \ref TNL::Containers::VectorView. + * + * \param arr is an instance of tested array. + * + * \return true if the array is sorted in descending order. + * \return false if the array is NOT sorted in descending order. + */ +template< typename Array > +bool isDescending( const Array& arr) +{ + using Value = typename Array::ValueType; + return isSorted( arr, [] __cuda_callable__( const Value& a, const Value& b ) { return a > b; }); +} + + } // namespace Algorithms +} // namespace TNL diff --git a/src/UnitTests/Algorithms/CMakeLists.txt b/src/UnitTests/Algorithms/CMakeLists.txt index 14a7d43ab35a0ea9d5dc26fa8e571cef97dfbe1b..31028036b233748f5a5d57fa4784354265b52156 100644 --- a/src/UnitTests/Algorithms/CMakeLists.txt +++ b/src/UnitTests/Algorithms/CMakeLists.txt @@ -1,4 +1,5 @@ ADD_SUBDIRECTORY( Segments ) +ADD_SUBDIRECTORY( Sorting ) set( COMMON_TESTS MemoryOperationsTest diff --git a/src/UnitTests/Algorithms/Sorting/BitonicSortTest.cpp b/src/UnitTests/Algorithms/Sorting/BitonicSortTest.cpp new file mode 100644 index 0000000000000000000000000000000000000000..6d48cf991585240295ca5eb23e5cd56259d9c3fc --- /dev/null +++ b/src/UnitTests/Algorithms/Sorting/BitonicSortTest.cpp @@ -0,0 +1 @@ +#include "BitonicSortTest.h" \ No newline at end of file diff --git a/src/UnitTests/Algorithms/Sorting/BitonicSortTest.cu b/src/UnitTests/Algorithms/Sorting/BitonicSortTest.cu new file mode 120000 index 0000000000000000000000000000000000000000..dfdfdf06da67288f999f4dab4cb72ab133a57a73 --- /dev/null +++ b/src/UnitTests/Algorithms/Sorting/BitonicSortTest.cu @@ -0,0 +1 @@ +BitonicSortTest.cpp \ No newline at end of file diff --git a/src/UnitTests/Algorithms/Sorting/BitonicSortTest.h b/src/UnitTests/Algorithms/Sorting/BitonicSortTest.h new file mode 100644 index 0000000000000000000000000000000000000000..5838d8f6c84e8a9deed70a539f27b12260252a50 --- /dev/null +++ b/src/UnitTests/Algorithms/Sorting/BitonicSortTest.h @@ -0,0 +1,388 @@ +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#if defined HAVE_GTEST && defined HAVE_CUDA +#include + + +using namespace TNL; +using namespace TNL::Algorithms; +using namespace TNL::Algorithms::Sorting; + +TEST(permutations, allPermutationSize_2_to_7) +{ + for(int i = 2; i<=7; i++ ) + { + int size = i; + std::vector orig(size); + std::iota(orig.begin(), orig.end(), 0); + + do + { + TNL::Containers::Array cudaArr(orig); + auto view = cudaArr.getView(); + + BitonicSort::sort(view); + + EXPECT_TRUE( Algorithms::isAscending( view ) ) << "failed " << i << std::endl; + } + while (std::next_permutation(orig.begin(), orig.end())); + } +} + +TEST(permutations, allPermutationSize_8) +{ + int size = 9; + const int stride = 151; + int i = 0; + + std::vector orig(size); + std::iota(orig.begin(), orig.end(), 0); + + do + { + if ((i++) % stride != 0) + continue; + + TNL::Containers::Array cudaArr(orig); + auto view = cudaArr.getView(); + + BitonicSort::sort(view); + + EXPECT_TRUE(Algorithms::isAscending(view)) << "result " << view << std::endl; + } + while (std::next_permutation(orig.begin(), orig.end())); +} + +TEST(permutations, somePermutationSize9) +{ + int size = 9; + const int stride = 227; + int i = 0; + + std::vector orig(size); + std::iota(orig.begin(), orig.end(), 0); + + do + { + if ((i++) % stride != 0) + continue; + + TNL::Containers::Array cudaArr(orig); + auto view = cudaArr.getView(); + + BitonicSort::sort(view); + + EXPECT_TRUE(Algorithms::isAscending(view)) << "result " << view << std::endl; + } + while (std::next_permutation(orig.begin(), orig.end())); +} + +TEST(selectedSize, size15) +{ + TNL::Containers::Array cudaArr{5, 9, 4, 8, 6, 1, 2, 3, 4, 8, 1, 6, 9, 4, 9}; + auto view = cudaArr.getView(); + EXPECT_EQ(15, view.getSize()) << "size not 15" << std::endl; + BitonicSort::sort(view); + EXPECT_TRUE(Algorithms::isAscending(view)) << "result " << view << std::endl; +} + +TEST(multiblock, 32768_decreasingNegative) +{ + std::vector arr(1<<15); + for (size_t i = 0; i < arr.size(); i++) + arr[i] = -i; + + TNL::Containers::Array cudaArr(arr); + auto view = cudaArr.getView(); + + BitonicSort::sort(view); + EXPECT_TRUE(Algorithms::isAscending(view)) << "result " << view << std::endl; +} + +TEST(randomGenerated, smallArray_randomVal) +{ + std::srand(2006); + for(int i = 0; i < 100; i++) + { + std::vector arr(std::rand()%(1<<10)); + for(auto & x : arr) + x = std::rand(); + + TNL::Containers::Array cudaArr(arr); + + auto view = cudaArr.getView(); + BitonicSort::sort(view); + EXPECT_TRUE(Algorithms::isAscending(view)); + } +} + +TEST(randomGenerated, bigArray_all0) +{ + std::srand(304); + for(int i = 0; i < 50; i++) + { + int size = (1<<20) + (std::rand()% (1<<19)); + + TNL::Containers::Array cudaArr(size); + + auto view = cudaArr.getView(); + BitonicSort::sort(view); + EXPECT_TRUE(Algorithms::isAscending(view)); + } +} + +TEST(nonIntegerType, float_notPow2) +{ + TNL::Containers::Array cudaArr{5.0, 9.4, 4.6, 8.9, 6.2, 1.15184, 2.23}; + auto view = cudaArr.getView(); + BitonicSort::sort(view); + EXPECT_TRUE(Algorithms::isAscending(view)) << "result " << view << std::endl; +} + +TEST(nonIntegerType, double_notPow2) +{ + TNL::Containers::Array cudaArr{5.0, 9.4, 4.6, 8.9, 6.2, 1.15184, 2.23}; + auto view = cudaArr.getView(); + BitonicSort::sort(view); + EXPECT_TRUE(Algorithms::isAscending(view)) << "result " << view << std::endl; +} + + +struct TMPSTRUCT{ + uint8_t m_data[6]; + __cuda_callable__ TMPSTRUCT(){m_data[0] = 0;} + __cuda_callable__ TMPSTRUCT(int first){m_data[0] = first;}; + __cuda_callable__ bool operator <(const TMPSTRUCT& other) const { return m_data[0] < other.m_data[0];} + __cuda_callable__ TMPSTRUCT& operator =(const TMPSTRUCT& other) {m_data[0] = other.m_data[0]; return *this;} + +}; + +TEST(nonIntegerType, struct) +{ + TNL::Containers::Array cudaArr{TMPSTRUCT(5), TMPSTRUCT(6), TMPSTRUCT(9), TMPSTRUCT(1)}; + auto view = cudaArr.getView(); + BitonicSort::sort(view); + EXPECT_TRUE(Algorithms::isAscending(view)); +} + +struct TMPSTRUCT_64b{ + uint8_t m_data[64]; + __cuda_callable__ TMPSTRUCT_64b(){m_data[0] = 0;} + __cuda_callable__ TMPSTRUCT_64b(int first){m_data[0] = first;}; + __cuda_callable__ bool operator <(const TMPSTRUCT_64b& other) const { return m_data[0] < other.m_data[0];} + __cuda_callable__ TMPSTRUCT_64b& operator =(const TMPSTRUCT_64b& other) {m_data[0] = other.m_data[0]; return *this;} +}; + +TEST(nonIntegerType, struct_64b) +{ + std::srand(61513); + int size = std::rand() % (1<<15); + std::vector vec(size); + for(auto & x : vec) + x = TMPSTRUCT_64b(std::rand()); + + TNL::Containers::Array cudaArr(vec); + auto view = cudaArr.getView(); + BitonicSort::sort(view); + EXPECT_TRUE(Algorithms::isAscending(view)); +} + +struct TMPSTRUCT_128b{ + uint8_t m_data[128]; + __cuda_callable__ TMPSTRUCT_128b(){m_data[0] = 0;} + __cuda_callable__ TMPSTRUCT_128b(int first){m_data[0] = first;}; + __cuda_callable__ bool operator <(const TMPSTRUCT_128b& other) const { return m_data[0] < other.m_data[0];} + __cuda_callable__ TMPSTRUCT_128b& operator =(const TMPSTRUCT_128b& other) {m_data[0] = other.m_data[0]; return *this;} +}; + +TEST(nonIntegerType, struct_128b) +{ + std::srand(98451); + int size = std::rand() % (1<<14); + std::vector vec(size); + for(auto & x : vec) + x = TMPSTRUCT_128b(std::rand()); + + TNL::Containers::Array cudaArr(vec); + auto view = cudaArr.getView(); + BitonicSort::sort(view); + EXPECT_TRUE(Algorithms::isAscending(view)); +} + +//error bypassing +//https://mmg-gitlab.fjfi.cvut.cz/gitlab/tnl/tnl-dev/blob/fbc34f6a97c13ec865ef7969b9704533222ed408/src/UnitTests/Containers/VectorTest-8.h +void descendingSort(TNL::Containers::ArrayView view) +{ + auto cmpDescending = [] __cuda_callable__ (int a, int b) {return a > b;}; + bitonicSort(view, cmpDescending); +} + +TEST(sortWithFunction, descending) +{ + TNL::Containers::Array cudaArr{6, 9, 4, 2, 3}; + auto view = cudaArr.getView(); + descendingSort(view); + + EXPECT_FALSE(Algorithms::isAscending(view)) << "result " << view << std::endl; + + EXPECT_TRUE(view.getElement(0) == 9); + EXPECT_TRUE(view.getElement(1) == 6); + EXPECT_TRUE(view.getElement(2) == 4); + EXPECT_TRUE(view.getElement(3) == 3); + EXPECT_TRUE(view.getElement(4) == 2); +} + +/*TEST(sortHostArray, hostArray) +{ + TNL::Containers::Array< int > arr( 84561 ); + for( size_t i = 0; i < arr.getSize(); i++ ) + arr[i] = -i; + + bitonicSort(arr); + + EXPECT_TRUE( TNL::Algorithms::isAscending(arr) ); +}*/ + +/*TEST(sortRange, secondHalf) +{ + std::vector arr(19); + int s = 19/2; + for(size_t i = 0; i < s; i++) arr[i] = -1; + for(size_t i = s; i < 19; i++) arr[i] = -i; + + bitonicSort(arr, s, 19); + + EXPECT_TRUE(TNL::Algorithms::isAscending(arr.begin() + s, arr.end())); + EXPECT_TRUE(arr[0] == -1); + EXPECT_TRUE(arr[s-1] == -1); +} + +TEST(sortRange, middle) +{ + std::srand(8705); + + std::vector arr(20); + int s = 5, e = 15; + for(size_t i = 0; i < s; i++) arr[i] = -1; + for(size_t i = e; i < 20; i++) arr[i] = -1; + + for(size_t i = s; i < e; i++) arr[i] = std::rand(); + + bitonicSort(arr, s, e); + + EXPECT_TRUE(TNL::Algorithms::isAscending(arr.begin() + s, arr.begin() + e)); + EXPECT_TRUE(arr[0] == -1); + EXPECT_TRUE(arr.back() == -1); + EXPECT_TRUE(arr[s-1] == -1); + EXPECT_TRUE(arr[e] == -1); +} + +TEST(sortRange, middleMultiBlock) +{ + std::srand(4513); + int size = 1<<20; + int s = 2000, e = size - 1512; + + std::vector arr(size); + for(size_t i = 0; i < s; i++) arr[i] = -1; + for(size_t i = e; i < size; i++) arr[i] = -1; + + for(size_t i = s; i < e; i++) arr[i] = std::rand(); + + bitonicSort(arr, s, e); + + EXPECT_TRUE(TNL::Algorithms::isAscending(arr.begin() + s, arr.begin() + e)); + + EXPECT_TRUE(arr[0] == -1); + EXPECT_TRUE(arr[std::rand() % s] == -1); + EXPECT_TRUE(arr[s-1] == -1); + + EXPECT_TRUE(arr[e] == -1); + EXPECT_TRUE(arr[e + (std::rand() % (size - e))] == -1); + EXPECT_TRUE(arr.back() == -1); +}*/ + +template +void fetchAndSwapSorter(TNL::Containers::ArrayView view) +{ + //auto Fetch = [=]__cuda_callable__(int i){return view[i];}; + auto Cmp = [=]__cuda_callable__(const int i, const int j ){return view[ i ] < view[ j ];}; + auto Swap = [=] __cuda_callable__ (int i, int j) mutable {TNL::swap(view[i], view[j]);}; + bitonicSort(0, view.getSize(), Cmp, Swap); +} + +TEST(fetchAndSwap, oneBlockSort) +{ + int size = 9; + const int stride = 227; + int i = 0; + + std::vector orig(size); + std::iota(orig.begin(), orig.end(), 0); + + do + { + if ((i++) % stride != 0) + continue; + + TNL::Containers::Array cudaArr(orig); + auto view = cudaArr.getView(); + fetchAndSwapSorter(view); + EXPECT_TRUE(Algorithms::isAscending(view)) << "result " << view << std::endl; + } + while (std::next_permutation(orig.begin(), orig.end())); +} + +TEST(fetchAndSwap, typeDouble) +{ + int size = 5; + std::vector orig(size); + std::iota(orig.begin(), orig.end(), 0); + + do + { + TNL::Containers::Array cudaArr(orig); + auto view = cudaArr.getView(); + fetchAndSwapSorter(view); + EXPECT_TRUE(Algorithms::isAscending(view)) << "result " << view << std::endl; + } + while (std::next_permutation(orig.begin(), orig.end())); +} + +void fetchAndSwap_sortMiddle(TNL::Containers::ArrayView view, int from, int to) +{ + //auto Fetch = [=]__cuda_callable__(int i){return view[i];}; + auto Cmp = [=]__cuda_callable__(const int i, const int j ){ return view[ i ] < view[ j ]; }; + auto Swap = [=] __cuda_callable__ (int i, int j) mutable { TNL::swap(view[i], view[j]); }; + bitonicSort(from, to, Cmp, Swap); +} + +TEST(fetchAndSwap, sortMiddle) +{ + std::vector orig{5, 9, 4, 54, 21, 6, 7, 9, 0, 9, 42, 4}; + TNL::Containers::Array cudaArr(orig); + auto view = cudaArr.getView(); + size_t from = 3, to = 8; + + fetchAndSwap_sortMiddle(view, from, to); + EXPECT_TRUE(Algorithms::isAscending(view.getView(3, 8))) << "result " << view << std::endl; + + for(size_t i = 0; i < orig.size(); i++) + { + if(i < from || i >= to) + EXPECT_TRUE(view.getElement(i) == orig[i]); + } +} + +#endif + +#include "../../main.h" diff --git a/src/UnitTests/Algorithms/Sorting/BubbleSortTest.cpp b/src/UnitTests/Algorithms/Sorting/BubbleSortTest.cpp new file mode 100644 index 0000000000000000000000000000000000000000..2a2bc2de9733435d3ff01f862591c48eab050336 --- /dev/null +++ b/src/UnitTests/Algorithms/Sorting/BubbleSortTest.cpp @@ -0,0 +1 @@ +#include "BubbleSortTest.h" \ No newline at end of file diff --git a/src/UnitTests/Algorithms/Sorting/BubbleSortTest.cu b/src/UnitTests/Algorithms/Sorting/BubbleSortTest.cu new file mode 120000 index 0000000000000000000000000000000000000000..83313710f089e2c88bc68831abab3bec6ddb26e5 --- /dev/null +++ b/src/UnitTests/Algorithms/Sorting/BubbleSortTest.cu @@ -0,0 +1 @@ +BubbleSortTest.cpp \ No newline at end of file diff --git a/src/UnitTests/Algorithms/Sorting/BubbleSortTest.h b/src/UnitTests/Algorithms/Sorting/BubbleSortTest.h new file mode 100644 index 0000000000000000000000000000000000000000..5e527794e85db043fdcaa02af4d057c0bf37cf5d --- /dev/null +++ b/src/UnitTests/Algorithms/Sorting/BubbleSortTest.h @@ -0,0 +1,95 @@ +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#if defined HAVE_GTEST +#include + + +using namespace TNL; +using namespace TNL::Algorithms; +using namespace TNL::Algorithms::Sorting; + +template +void fetchAndSwapSorter( TNL::Containers::ArrayView< TYPE, TNL::Devices::Host > view) +{ + auto Cmp = [=]__cuda_callable__(const int i, const int j ){return view[ i ] < view[ j ];}; + auto Swap = [=] __cuda_callable__ (int i, int j) mutable {TNL::swap(view[i], view[j]);}; + BubbleSort::inplaceSort< TNL::Devices::Host >(0, view.getSize(), Cmp, Swap); +} + +TEST(fetchAndSwap, oneBlockSort) +{ + int size = 9; + const int stride = 227; + int i = 0; + + std::vector orig(size); + std::iota(orig.begin(), orig.end(), 0); + + do + { + if ((i++) % stride != 0) + continue; + + TNL::Containers::Array arr(orig); + auto view = arr.getView(); + fetchAndSwapSorter(view); + EXPECT_TRUE(Algorithms::isAscending(view)) << "result " << view << std::endl; + } + while (std::next_permutation(orig.begin(), orig.end())); +} + +TEST(fetchAndSwap, typeDouble) +{ + int size = 5; + std::vector orig(size); + std::iota(orig.begin(), orig.end(), 0); + + do + { + TNL::Containers::Array arr(orig); + auto view = arr.getView(); + fetchAndSwapSorter(view); + EXPECT_TRUE(Algorithms::isAscending(view)) << "result " << view << std::endl; + } + while (std::next_permutation(orig.begin(), orig.end())); +} + +void fetchAndSwap_sortMiddle(TNL::Containers::ArrayView view, int from, int to) +{ + //auto Fetch = [=]__cuda_callable__(int i){return view[i];}; + auto Cmp = [=]__cuda_callable__(const int i, const int j ){ return view[ i ] < view[ j ]; }; + auto Swap = [=] __cuda_callable__ (int i, int j) mutable { TNL::swap(view[i], view[j]); }; + BubbleSort::inplaceSort< TNL::Devices::Host >(from, to, Cmp, Swap); +} + +TEST(fetchAndSwap, sortMiddle) +{ + std::vector orig{5, 9, 4, 54, 21, 6, 7, 9, 0, 9, 42, 4}; + TNL::Containers::Array arr(orig); + auto view = arr.getView(); + size_t from = 3, to = 8; + + fetchAndSwap_sortMiddle(view, from, to); + EXPECT_TRUE(Algorithms::isAscending(view.getView(3, 8))) << "result " << view << std::endl; + + for(size_t i = 0; i < orig.size(); i++) + { + if( i < from || i >= to ) + { + EXPECT_TRUE(view.getElement(i) == orig[i]); + } + } +} + +#endif + +#include "../../main.h" diff --git a/src/UnitTests/Algorithms/Sorting/CMakeLists.txt b/src/UnitTests/Algorithms/Sorting/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..56fca2216ea760113c2d4cdf0e39e6a5bcf4454e --- /dev/null +++ b/src/UnitTests/Algorithms/Sorting/CMakeLists.txt @@ -0,0 +1,28 @@ +set( COMMON_TESTS + BitonicSortTest + BubbleSortTest + QuicksortTest +) + +set( CPP_TESTS ) +set( CUDA_TESTS ) +if( BUILD_CUDA ) + set( CUDA_TESTS ${CUDA_TESTS} ${COMMON_TESTS} ) +else() + set( CPP_TESTS ${CPP_TESTS} ${COMMON_TESTS} ) +endif() + +foreach( target IN ITEMS ${CPP_TESTS} ) + add_executable( ${target} ${target}.cpp ) + target_compile_options( ${target} PRIVATE ${CXX_TESTS_FLAGS} ) + target_link_libraries( ${target} ${GTEST_BOTH_LIBRARIES} ) + add_test( ${target} ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${target}${CMAKE_EXECUTABLE_SUFFIX} ) +endforeach() + +if( BUILD_CUDA ) + foreach( target IN ITEMS ${CUDA_TESTS} ) + cuda_add_executable( ${target} ${target}.cu OPTIONS ${CXX_TESTS_FLAGS} ) + target_link_libraries( ${target} ${GTEST_BOTH_LIBRARIES} ) + add_test( ${target} ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${target}${CMAKE_EXECUTABLE_SUFFIX} ) + endforeach() +endif() diff --git a/src/UnitTests/Algorithms/Sorting/QuicksortTest.cpp b/src/UnitTests/Algorithms/Sorting/QuicksortTest.cpp new file mode 100644 index 0000000000000000000000000000000000000000..04be4c54ac81ee606bb0556011149c95c9dc5938 --- /dev/null +++ b/src/UnitTests/Algorithms/Sorting/QuicksortTest.cpp @@ -0,0 +1 @@ +#include "QuicksortTest.h" \ No newline at end of file diff --git a/src/UnitTests/Algorithms/Sorting/QuicksortTest.cu b/src/UnitTests/Algorithms/Sorting/QuicksortTest.cu new file mode 120000 index 0000000000000000000000000000000000000000..d6099b8e1a3267e1f28a9f9b219bc54cfee8c522 --- /dev/null +++ b/src/UnitTests/Algorithms/Sorting/QuicksortTest.cu @@ -0,0 +1 @@ +QuicksortTest.cpp \ No newline at end of file diff --git a/src/UnitTests/Algorithms/Sorting/QuicksortTest.h b/src/UnitTests/Algorithms/Sorting/QuicksortTest.h new file mode 100644 index 0000000000000000000000000000000000000000..b84bcb78a59792438abf7f16c4a44c7a5566dc8e --- /dev/null +++ b/src/UnitTests/Algorithms/Sorting/QuicksortTest.h @@ -0,0 +1,203 @@ +#include +#include +#include +#include + + +#include +#include +#include +#include + +#if defined HAVE_CUDA && defined HAVE_GTEST +#include +#include + +#include + +using namespace TNL; +using namespace TNL::Algorithms; +using namespace TNL::Algorithms::Sorting; + +TEST(selectedSize, size15) +{ + TNL::Containers::Array cudaArr{5, 9, 4, 8, 6, 1, 2, 3, 4, 8, 1, 6, 9, 4, 9}; + auto view = cudaArr.getView(); + EXPECT_EQ(15, view.getSize()) << "size not 15" << std::endl; + Quicksort::sort( view ); + EXPECT_TRUE(Algorithms::isAscending(view)) << "result " << view << std::endl; +} + +TEST(multiblock, 32768_decreasingNegative) +{ + std::vector arr(1<<15); + for (size_t i = 0; i < arr.size(); i++) + arr[i] = -i; + + TNL::Containers::Array cudaArr(arr); + auto view = cudaArr.getView(); + Quicksort::sort( view ); + + EXPECT_TRUE(Algorithms::isAscending(view)) << "result " << view << std::endl; +} + +TEST(randomGenerated, smallArray_randomVal) +{ + std::srand(2006); + for(int i = 0; i < 100; i++) + { + std::vector arr(std::rand()%(1<<10)); + for(auto & x : arr) + x = std::rand(); + + TNL::Containers::Array cudaArr(arr); + auto view = cudaArr.getView(); + Quicksort::sort( view ); + + EXPECT_TRUE(Algorithms::isAscending(view)); + } +} + +TEST(randomGenerated, bigArray_randomVal) +{ + std::srand(304); + for(int i = 0; i < 50; i++) + { + int size = (1<<20) + (std::rand()% (1<<19)); + std::vector arr(size); + for(auto & x : arr) x = std::rand(); + TNL::Containers::Array cudaArr(arr); + + auto view = cudaArr.getView(); + Quicksort::sort( view ); + EXPECT_TRUE(Algorithms::isAscending(view)); + } +} + +TEST(noLostElement, smallArray) +{ + std::srand(9151); + + int size = (1<<7); + std::vector arr(size); + for(auto & x : arr) x = std::rand(); + + TNL::Containers::Array cudaArr(arr); + auto view = cudaArr.getView(); + Quicksort::sort( view ); + + std::sort(arr.begin(), arr.end()); + TNL::Containers::Array cudaArr2(arr); + EXPECT_TRUE(view == cudaArr2.getView()); +} + +TEST(noLostElement, midSizedArray) +{ + std::srand(91503); + + int size = (1<<15); + std::vector arr(size); + for(auto & x : arr) x = std::rand(); + + TNL::Containers::Array cudaArr(arr); + auto view = cudaArr.getView(); + Quicksort::sort( view ); + + std::sort(arr.begin(), arr.end()); + TNL::Containers::Array cudaArr2(arr); + EXPECT_TRUE(view == cudaArr2.getView()); +} + +TEST(noLostElement, bigSizedArray) +{ + std::srand(15611); + + int size = (1<<22); + std::vector arr(size); + for(auto & x : arr) x = std::rand(); + for(int i = 0; i < 10000; i++) + arr[std::rand() % arr.size()] = (1<<10); + + TNL::Containers::Array cudaArr(arr); + auto view = cudaArr.getView(); + Quicksort::sort( view ); + + TNL::Containers::Array cudaArr2(arr); + thrust::sort(thrust::device, cudaArr2.getData(), cudaArr2.getData() + cudaArr2.getSize()); + EXPECT_TRUE(view == cudaArr2.getView()); +} + +TEST(types, type_double) +{ + std::srand(8451); + + int size = (1<<16); + std::vector arr(size); + for(auto & x : arr) x = std::rand(); + for(int i = 0; i < 10000; i++) + arr[std::rand() % arr.size()] = (1<<10); + + TNL::Containers::Array cudaArr(arr); + auto view = cudaArr.getView(); + Quicksort::sort( view ); + + TNL::Containers::Array cudaArr2(arr); + thrust::sort(thrust::device, cudaArr2.getData(), cudaArr2.getData() + cudaArr2.getSize()); + EXPECT_TRUE(view == cudaArr2.getView()); +} + +struct TMPSTRUCT_xyz{ + double x, y, z; + __cuda_callable__ TMPSTRUCT_xyz(): x(0){} + __cuda_callable__ TMPSTRUCT_xyz(int first){x = first;}; + __cuda_callable__ bool operator <(const TMPSTRUCT_xyz& other) const { return x< other.x;} + __cuda_callable__ TMPSTRUCT_xyz& operator =(const TMPSTRUCT_xyz& other) {x = other.x; return *this;} +}; +std::ostream & operator<<(std::ostream & out, const TMPSTRUCT_xyz & data){return out << data.x;} + +TEST(types, struct_3D_points) +{ + std::srand(46151); + + int size = (1<<18); + std::vector arr(size); + for(auto & x : arr) x = TMPSTRUCT_xyz(std::rand()); + + TNL::Containers::Array cudaArr(arr); + auto view = cudaArr.getView(); + //thrust::sort(thrust::device, cudaArr.getData(), cudaArr.getData() + cudaArr.getSize()); + //std::cout << view << std::endl; + Quicksort::sort( view ); + + EXPECT_TRUE(Algorithms::isAscending(view)); +} + +struct TMPSTRUCT_64b{ + uint8_t m_Data[64]; + __cuda_callable__ TMPSTRUCT_64b() {m_Data[0] = 0;} + __cuda_callable__ TMPSTRUCT_64b(int first){m_Data[0] = first;}; + __cuda_callable__ bool operator <(const TMPSTRUCT_64b& other) const { return m_Data[0]< other.m_Data[0];} + __cuda_callable__ TMPSTRUCT_64b& operator =(const TMPSTRUCT_64b& other) {m_Data[0] = other.m_Data[0]; return *this;} +}; +std::ostream & operator<<(std::ostream & out, const TMPSTRUCT_64b & data){return out << (unsigned) data.m_Data[0];} + +TEST(types, struct_64b) +{ + std::srand(96); + + int size = (1<<18); + std::vector arr(size); + for(auto & x : arr) x = TMPSTRUCT_64b(std::rand() % 512); + + TNL::Containers::Array cudaArr(arr); + auto view = cudaArr.getView(); + //thrust::sort(thrust::device, cudaArr.getData(), cudaArr.getData() + cudaArr.getSize()); + //std::cout << view << std::endl; + Quicksort::sort( view ); + + EXPECT_TRUE(Algorithms::isAscending(view)); +} + +#endif + +#include "../../main.h"