diff --git a/CMakeLists.txt b/CMakeLists.txt
index d7e824a125acbb789af1cdd74811906746c1ea78..ecc55634635c3104ed6bf65d6b30bce20277de2c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -32,6 +32,7 @@ option(BUILD_TESTS "Build tests" OFF)
 option(BUILD_MATRIX_TESTS "Build tests for matrix formats" OFF)
 option(BUILD_PYTHON "Compile the Python bindings" OFF)
 option(BUILD_DOC "Build examples included in the documentation" OFF)
+set(CUDA_SAMPLES_PATH "none" CACHE STRING "Path to CUDA Samples - it is used only for some benchmarking.")
 
 # install paths relative to the cmake's prefix
 set( TNL_TARGET_INCLUDE_DIRECTORY "include/TNL" )
@@ -253,6 +254,9 @@ if( ${WITH_CUDA} )
         endif()
         set( CMAKE_EXECUTABLE_SUFFIX "${executable_suffix_backup}" )
     endif()
+    if( NOT CUDA_SAMPLES_DIR STREQUAL "none" )
+      set( CUDA_SAMPLES_FLAGS "-I${CUDA_SAMPLES_DIR} -DHAVE_CUDA_SAMPLES")
+    endif()
 endif()
 
 
@@ -407,6 +411,7 @@ message( "   CMAKE_SHARED_LINKER_FLAGS = ${CMAKE_SHARED_LINKER_FLAGS}" )
 message( "   CMAKE_SHARED_LINKER_FLAGS_DEBUG = ${CMAKE_SHARED_LINKER_FLAGS_DEBUG}" )
 message( "   CMAKE_SHARED_LINKER_FLAGS_RELEASE = ${CMAKE_SHARED_LINKER_FLAGS_RELEASE}" )
 message( "   CUDA_NVCC_FLAGS = ${CUDA_NVCC_FLAGS}" )
+message( "   CUDA_SAMPLES_FLAGS = ${CUDA_SAMPLES_FLAGS}" )
 message( "   GMP_LIBRARIES = ${GMP_LIBRARIES}" )
 if( MPI_CXX_FOUND AND ${WITH_MPI} )
    message( "   MPI_CXX_COMPILE_OPTIONS = ${MPI_CXX_COMPILE_OPTIONS}" )
diff --git a/Documentation/Examples/Algorithms/CMakeLists.txt b/Documentation/Examples/Algorithms/CMakeLists.txt
index 294006c088765aea26ae189199f54fc49bb19ecb..5ffb91b16855d23531ad5c06dba8a66cdefd190c 100644
--- a/Documentation/Examples/Algorithms/CMakeLists.txt
+++ b/Documentation/Examples/Algorithms/CMakeLists.txt
@@ -1,9 +1,21 @@
 IF( BUILD_CUDA )
    CUDA_ADD_EXECUTABLE(ParallelForExampleCuda ParallelForExample.cu)
    ADD_CUSTOM_COMMAND( COMMAND ParallelForExampleCuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/ParallelForExample.out OUTPUT ParallelForExample.out )
+   CUDA_ADD_EXECUTABLE( SortingExampleCuda SortingExample.cu)
+   ADD_CUSTOM_COMMAND( COMMAND SortingExampleCuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SortingExample.out OUTPUT SortingExample.out )
+   CUDA_ADD_EXECUTABLE( SortingExample2Cuda SortingExample2.cu)
+   ADD_CUSTOM_COMMAND( COMMAND SortingExample2Cuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SortingExample2.out OUTPUT SortingExample2.out )
+   CUDA_ADD_EXECUTABLE( SortingExample3Cuda SortingExample3.cu)
+   ADD_CUSTOM_COMMAND( COMMAND SortingExample3Cuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SortingExample3.out OUTPUT SortingExample3.out )
 ELSE()
    ADD_EXECUTABLE(ParallelForExample ParallelForExample.cpp)
    ADD_CUSTOM_COMMAND( COMMAND ParallelForExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/ParallelForExample.out OUTPUT ParallelForExample.out )
+   ADD_EXECUTABLE( SortingExample SortingExample.cpp)
+   ADD_CUSTOM_COMMAND( COMMAND SortingExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SortingExample.out OUTPUT SortingExample.out )
+   ADD_EXECUTABLE( SortingExample2 SortingExample2.cpp)
+   ADD_CUSTOM_COMMAND( COMMAND SortingExample2 > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SortingExample2.out OUTPUT SortingExample2.out )
+   ADD_EXECUTABLE( SortingExample3 SortingExample3.cpp)
+   ADD_CUSTOM_COMMAND( COMMAND SortingExample3 > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SortingExample3.out OUTPUT SortingExample3.out )
 ENDIF()
 
 ADD_EXECUTABLE(staticForExample staticForExample.cpp)
@@ -13,6 +25,9 @@ ADD_EXECUTABLE(unrolledForExample unrolledForExample.cpp)
 ADD_CUSTOM_COMMAND( COMMAND unrolledForExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/unrolledForExample.out OUTPUT unrolledForExample.out )
 
 ADD_CUSTOM_TARGET( RunAlgorithmsExamples ALL DEPENDS
+   SortingExample.out
+   SortingExample2.out
+   SortingExample3.out
    ParallelForExample.out
    unrolledForExample.out
    staticForExample.out
diff --git a/Documentation/Examples/Algorithms/SortingExample.cpp b/Documentation/Examples/Algorithms/SortingExample.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..3d11787a1db16e3a77e05370e70d83059bdaa8eb
--- /dev/null
+++ b/Documentation/Examples/Algorithms/SortingExample.cpp
@@ -0,0 +1,55 @@
+#include <iostream>
+#include <TNL/Containers/Array.h>
+#include <TNL/Algorithms/sort.h>
+
+using namespace TNL;
+using namespace TNL::Containers;
+using namespace TNL::Algorithms;
+
+template< typename ArrayT >
+void sort( ArrayT& array )
+{
+   const int size = 10;
+
+   /****
+    * Fill the array with random integers.
+    */
+   Array< int > aux_array( size );
+   srand( size + 2021 );
+   aux_array.forAllElements( [=] __cuda_callable__ ( int i, int& value ) { value = std::rand() % (2*size); } );
+   array = aux_array;
+
+   std::cout << "Random array: " << array << std::endl;
+
+   /****
+    * Sort the array in ascending order.
+    */
+   ascendingSort( array );
+   std::cout << "Array sorted in ascending order:" << array << std::endl;
+
+   /***
+    * Sort the array in descending order.
+    */
+   descendingSort( array );
+   std::cout << "Array sorted in descending order:" << array << std::endl;
+}
+
+int main( int argc, char* argv[] )
+{
+   /***
+    * Firstly, test the sorting on CPU.
+    */
+   std::cout << "Sorting on CPU ... " << std::endl;
+   Array< int, Devices::Host > host_array;
+   sort( host_array );
+
+#ifdef HAVE_CUDA
+   /***
+    * And then also on GPU.
+    */
+   std::cout << "Sorting on GPU ... " << std::endl;
+   Array< int, Devices::Cuda > cuda_array;
+   sort( cuda_array );
+#endif
+   return EXIT_SUCCESS;
+}
diff --git a/Documentation/Examples/Algorithms/SortingExample.cu b/Documentation/Examples/Algorithms/SortingExample.cu
new file mode 120000
index 0000000000000000000000000000000000000000..ce89e14f316dc18748ab7aa07550be5ea461e704
--- /dev/null
+++ b/Documentation/Examples/Algorithms/SortingExample.cu
@@ -0,0 +1 @@
+SortingExample.cpp
\ No newline at end of file
diff --git a/Documentation/Examples/Algorithms/SortingExample2.cpp b/Documentation/Examples/Algorithms/SortingExample2.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..ec24ec5dcd819dff9662c881250636ff2c62088d
--- /dev/null
+++ b/Documentation/Examples/Algorithms/SortingExample2.cpp
@@ -0,0 +1,55 @@
+#include <iostream>
+#include <TNL/Containers/Array.h>
+#include <TNL/Algorithms/sort.h>
+
+using namespace TNL;
+using namespace TNL::Containers;
+using namespace TNL::Algorithms;
+
+template< typename ArrayT >
+void sort( ArrayT& array )
+{
+   const int size = 10;
+
+  /****
+   * Fill the array with random integers.
+   */
+   Array< int > aux_array( size );
+   srand( size + 2021 );
+   aux_array.forAllElements( [=] __cuda_callable__ ( int i, int& value ) { value = std::rand() % (2*size); } );
+   array = aux_array;
+
+   std::cout << "Random array: " << array << std::endl;
+
+   /****
+    * Sort the array in ascending order.
+    */
+   sort( array, [] __cuda_callable__ ( int a, int b ) { return a < b; } );
+   std::cout << "Array sorted in ascending order:" << array << std::endl;
+
+   /***
+    * Sort the array in descending order.
+    */
+   sort( array, [] __cuda_callable__ ( int a, int b ) { return a > b; } );
+   std::cout << "Array sorted in descending order:" << array << std::endl;
+}
+
+int main( int argc, char* argv[] )
+{
+   /***
+    * Firstly, test the sorting on CPU.
+    */
+   std::cout << "Sorting on CPU ... " << std::endl;
+   Array< int, Devices::Host > host_array;
+   sort( host_array );
+
+#ifdef HAVE_CUDA
+   /***
+    * And then also on GPU.
+    */
+   std::cout << "Sorting on GPU ... " << std::endl;
+   Array< int, Devices::Cuda > cuda_array;
+   sort( cuda_array );
+#endif
+   return EXIT_SUCCESS;
+}
diff --git a/Documentation/Examples/Algorithms/SortingExample2.cu b/Documentation/Examples/Algorithms/SortingExample2.cu
new file mode 120000
index 0000000000000000000000000000000000000000..892bbc2328682c49bf3a491189525f79a08fde22
--- /dev/null
+++ b/Documentation/Examples/Algorithms/SortingExample2.cu
@@ -0,0 +1 @@
+SortingExample2.cpp
\ No newline at end of file
diff --git a/Documentation/Examples/Algorithms/SortingExample3.cpp b/Documentation/Examples/Algorithms/SortingExample3.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..4d6c33ce80033e9d71d85ffafbb9ac596617cf38
--- /dev/null
+++ b/Documentation/Examples/Algorithms/SortingExample3.cpp
@@ -0,0 +1,65 @@
+#include <iostream>
+#include <TNL/Containers/Array.h>
+#include <TNL/Algorithms/sort.h>
+
+using namespace TNL;
+using namespace TNL::Containers;
+using namespace TNL::Algorithms;
+
+template< typename ArrayT >
+void sort( ArrayT& array )
+{
+   const int size = 10;
+
+   /****
+    * Fill the array with random integers.
+    */
+   Array< int > aux_array( size );
+   srand( size + 2021 );
+   aux_array.forAllElements( [=] __cuda_callable__ ( int i, int& value ) { value = std::rand() % (2*size); } );
+   array = aux_array;
+
+   /***
+    * Prepare second array holding elements positions.
+    */
+   ArrayT index( size );
+   index.forAllElements( [] __cuda_callable__ ( int idx, int& value  ) { value = idx; } );
+   std::cout << "Random array:     " << array << std::endl;
+   std::cout << "Index array:      " << index << std::endl;
+
+   /***
+    * Sort the array `array` and apply the same permutation on the array `identity`.
+    */
+   auto array_view = array.getView();
+   auto index_view = index.getView();
+   sort< typename ArrayT::DeviceType,                             // device on which the sorting will be performed
+         typename ArrayT::IndexType >(                            // type used for indexing
+         0, size,                                                 // range of indexes
+         [=] __cuda_callable__ ( int i, int j ) -> bool {         // comparison lambda function
+            return array_view[ i ] < array_view[ j ]; },
+         [=] __cuda_callable__ ( int i, int j ) mutable {         // lambda function for swapping of elements
+            TNL::swap( array_view[ i ], array_view[ j ] );
+            TNL::swap( index_view[ i ], index_view[ j ] ); } );
+   std::cout << "Sorted array:      " << array << std::endl;
+   std::cout << "Index:             " << index << std::endl;
+}
+
+int main( int argc, char* argv[] )
+{
+   /***
+    * Firstly, test the sorting on CPU.
+    */
+   std::cout << "Sorting on CPU ... " << std::endl;
+   Array< int, Devices::Host > host_array;
+   sort( host_array );
+
+#ifdef HAVE_CUDA
+   /***
+    * And then also on GPU.
+    */
+   std::cout << "Sorting on GPU ... " << std::endl;
+   Array< int, Devices::Cuda > cuda_array;
+   sort( cuda_array );
+#endif
+   return EXIT_SUCCESS;
+}
diff --git a/Documentation/Examples/Algorithms/SortingExample3.cu b/Documentation/Examples/Algorithms/SortingExample3.cu
new file mode 120000
index 0000000000000000000000000000000000000000..a35af15c62abe6a82dcbd3745dbd360311b8db2d
--- /dev/null
+++ b/Documentation/Examples/Algorithms/SortingExample3.cu
@@ -0,0 +1 @@
+SortingExample3.cpp
\ No newline at end of file
diff --git a/Documentation/Tutorials/Sorting/tutorial_Sorting.md b/Documentation/Tutorials/Sorting/tutorial_Sorting.md
new file mode 100644
index 0000000000000000000000000000000000000000..4832077cb6b2541925423e6d58ff9345e242751a
--- /dev/null
+++ b/Documentation/Tutorials/Sorting/tutorial_Sorting.md
@@ -0,0 +1,43 @@
+\page tutorial_Sorting Sorting tutorial
+
+[TOC]
+
+## Introduction
+
+TNL offers several different parallel algorithms for sorting of arrays (or vectors) and also sorting based on user defined swapping. The later is more general but also less efficient.
+
+### Sorting of arrays and Vectors
+
+The sorting of arrays and vectors is accessible via the following functions:
+
+* \ref TNL::Algorithms::ascendingSort for sorting elements of array in ascending order,
+* \ref TNL::Algorithms::descendingSort for sorting elements of array in descending order,
+* \ref TNL::Algorithms::sort for sorting with user defined ordering.
+
+The following example demonstrates the use of ascending and descending sort. See
+
+\includelineno SortingExample.cpp
+
+Here we create array with random sequence of integers (lines 17-20) and then we sort the array in ascending order (line 27) and descending order (line 33). The result looks as follows:
+
+\include SortingExample.out
+
+
+How to achieve the same result with user defined ordering is demonstrated by the following example:
+
+\includelineno SortingExample2.cpp
+
+The result looks as follows:
+
+\include SortingExample2.out
+
+The same way, one can sort also \ref TNL::Containers::ArrayView, \ref TNL::Containers::Vector and \ref TNL::Containers::VectorView.
+
+### Sorting with user define swapping
+
+
+\includelineno SortingExample3.cpp
+
+In this example, we fill array `array` with random numbers and array `index` with numbers equal to position of an element in the array. We want to sort the array `array` and permute the `index` array the same way. See the lines 34-38. Here we call function `sort` which does not accept any array-like data structure but only range of indexes and two lambda functions. The first one defines ordering of the elements (line 35) by comparing elements of array `array`. The second lambda function is responsible for elements swapping (lines 36-38 ). Note that we do not swap only elements of array `array` but also `index` array. The result looks as follows:
+
+\include SortingExample3.out
diff --git a/Documentation/Tutorials/index.md b/Documentation/Tutorials/index.md
index e35e674c8fb877aa86fbedae4aeae94d1efa974b..739d609acc543148ddb30aa21ee231a4527e02ee 100644
--- a/Documentation/Tutorials/index.md
+++ b/Documentation/Tutorials/index.md
@@ -7,6 +7,7 @@
 3. [Vectors](tutorial_Vectors.html)
 4. [Flexible parallel reduction and scan](tutorial_ReductionAndScan.html)
 5. [For loops](tutorial_ForLoops.html)
-6. [Cross-device pointers](tutorial_Pointers.html)
-7. [Matrices](tutorial_Matrices.html)
-8. [Unstructured meshes](tutorial_Meshes.html)
+6. [Sorting](tutorial_Sorting.html)
+7. [Cross-device pointers](tutorial_Pointers.html)
+8. [Matrices](tutorial_Matrices.html)
+9. [Unstructured meshes](tutorial_Meshes.html)
diff --git a/build b/build
index 09da8de5c7c28dab27750931b68217ff79d41d6c..f8adef1a4059159e55a1107500b2bffc58f9d33c 100755
--- a/build
+++ b/build
@@ -39,6 +39,13 @@ BUILD_TESTS="no"
 BUILD_MATRIX_TESTS="no"
 BUILD_DOC="no"
 
+# external dependencies
+CUDA_SAMPLES_DIR=none
+
+if [[ x"$CUDA_SAMPLES_PATH" != "x" ]]; then
+   CUDA_SAMPLES_DIR=${CUDA_SAMPLES_PATH}
+fi
+
 function print_usage()
 {
     cat << EOF
@@ -85,6 +92,9 @@ Options for the 'tests' and 'matrix-tests' targets:
     --tests-jobs=NUM                      Number of processes to be used for the unit tests. It is $TEST_JOBS by default.
     --with-coverage=yes/no                Enables code coverage reports for unit tests (lcov is required). '$WITH_COVERAGE' by default.
     --with-system-gtest=yes/no            Use GTest installed in the local system and do not download the latest version. '$WITH_SYSTEM_GTEST' by default.
+
+External dependencies:
+   --cuda-samples-path=PATH                CUDA samples are used by some reference algorithms used in benchmarks. '$CUDA_SAMPLES_PATH' by default.
 EOF
 }
 
@@ -120,6 +130,7 @@ for option in "$@"; do
         --with-coverage=*     ) WITH_COVERAGE="${option#*=}" ;;
         --with-ci-flags=*     ) WITH_CI_FLAGS="${option#*=}" ;;
         --with-system-gtest=* ) WITH_SYSTEM_GTEST="${option#*=}" ;;
+        --cuda-samples-path=* ) CUDA_SAMPLES_DIR="${option#*=}" ;;
         -*                    )
             echo "Unknown option $option. Use --help for more information." >&2
             exit 1
@@ -216,6 +227,7 @@ cmake_command=(
          -DBUILD_MATRIX_TESTS=${BUILD_MATRIX_TESTS}
          -DBUILD_PYTHON=${BUILD_PYTHON}
          -DBUILD_DOC=${BUILD_DOC}
+         -DCUDA_SAMPLES_DIR=${CUDA_SAMPLES_DIR}
 )
 
 # Skip running cmake if it was already run and the cmake command is the same.
diff --git a/src/Benchmarks/CMakeLists.txt b/src/Benchmarks/CMakeLists.txt
index 6f3185329c8dab36b4c07bdb949b33a45595607a..4e1961b3c1b39fad7cb794fd030cfe33e2f8dd0a 100644
--- a/src/Benchmarks/CMakeLists.txt
+++ b/src/Benchmarks/CMakeLists.txt
@@ -5,6 +5,7 @@ add_subdirectory( SpMV )
 add_subdirectory( DistSpMV )
 add_subdirectory( LinearSolvers )
 add_subdirectory( ODESolvers )
+add_subdirectory( Sorting )
 add_subdirectory( Traversers )
 
 set( headers
diff --git a/src/Benchmarks/Sorting/CMakeLists.txt b/src/Benchmarks/Sorting/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..cbd22efc080ac15725c979a9cc24428b539f055b
--- /dev/null
+++ b/src/Benchmarks/Sorting/CMakeLists.txt
@@ -0,0 +1,9 @@
+if( BUILD_CUDA )
+    CUDA_ADD_EXECUTABLE( tnl-benchmark-sort tnl-benchmark-sort.cu OPTIONS -Xcompiler -Wno-error=switch,-Wno-error=sign-compare ${CUDA_SAMPLES_FLAGS} )
+    # Source code of reference algorithms contains warning which turn into errers with CI/CD compiler flags. Therefore we use -Wno-error to turn it off.
+    TARGET_LINK_LIBRARIES( tnl-benchmark-sort ${CUDA_cusparse_LIBRARY} ${CUDA_cudadevrt_LIBRARY} )
+else()
+    ADD_EXECUTABLE( tnl-benchmark-sort tnl-benchmark-sort.cpp )
+endif()
+
+install( TARGETS tnl-benchmark-sort RUNTIME DESTINATION bin )
diff --git a/src/Benchmarks/Sorting/Measurer.h b/src/Benchmarks/Sorting/Measurer.h
new file mode 100644
index 0000000000000000000000000000000000000000..e8c18389f5b5157edf3b176490a400099076cb2c
--- /dev/null
+++ b/src/Benchmarks/Sorting/Measurer.h
@@ -0,0 +1,67 @@
+#pragma once
+
+#include <vector>
+#include <TNL/Devices/Cuda.h>
+#include <TNL/Containers/Array.h>
+#include <TNL/Algorithms/Sorting/Quicksort.h>
+#include <TNL/Algorithms/Sorting/BitonicSort.h>
+#include <TNL/Algorithms/Sorting/STLSort.h>
+
+#ifdef HAVE_CUDA
+#ifdef HAVE_CUDA_SAMPLES
+#include "ReferenceAlgorithms/MancaQuicksort.h"
+#include "ReferenceAlgorithms/NvidiaBitonicSort.h"
+#endif
+#include "ReferenceAlgorithms/CedermanQuicksort.h"
+#include "ReferenceAlgorithms/ThrustRadixsort.h"
+#endif
+
+#include "timer.h"
+
+using namespace TNL;
+
+template< typename Sorter >
+struct Measurer
+{
+   template< typename Value >
+   static double measure( const std::vector<Value>&vec, int tries, int & wrongAnsCnt )
+   {
+      vector<double> resAcc;
+
+      for(int i = 0; i < tries; i++)
+      {
+         Containers::Array<Value, Devices::Cuda > arr(vec);
+         auto view = arr.getView();
+         {
+               TIMER t([&](double res){resAcc.push_back(res);});
+               Sorter::sort(view);
+         }
+
+         if( ! Algorithms::isAscending( view ) )
+               wrongAnsCnt++;
+      }
+      return accumulate(resAcc.begin(), resAcc.end(), 0.0) / resAcc.size();
+   }
+};
+
+template<>
+struct Measurer< Algorithms::Sorting::STLSort >
+{
+   template< typename Value >
+   static double measure( const std::vector<Value>&vec, int tries, int & wrongAnsCnt )
+   {
+      vector<double> resAcc;
+
+      for(int i = 0; i < tries; i++)
+      {
+         Containers::Array<Value, Devices::Host > arr(vec);
+         auto view = arr.getView();
+         //std::vector< Value > vec2 = vec;
+         {
+               TIMER t([&](double res){resAcc.push_back(res);});
+               Algorithms::Sorting::STLSort::sort( view );
+         }
+      }
+      return accumulate(resAcc.begin(), resAcc.end(), 0.0) / resAcc.size();
+   }
+};
diff --git a/src/Benchmarks/Sorting/Quicksorter.h b/src/Benchmarks/Sorting/Quicksorter.h
new file mode 100644
index 0000000000000000000000000000000000000000..d361a1ee2a66a9c8ef98d22842dbd34844399123
--- /dev/null
+++ b/src/Benchmarks/Sorting/Quicksorter.h
@@ -0,0 +1,10 @@
+#include "../../src/quicksort/quicksort.cuh"
+
+#include "../benchmarker.cpp"
+#include "../measure.cu"
+
+template<typename Value>
+void sorter(ArrayView<Value, Devices::Cuda> arr)
+{
+    quicksort(arr);
+}
\ No newline at end of file
diff --git a/src/Benchmarks/Sorting/ReferenceAlgorithms/CedermanQuicksort.h b/src/Benchmarks/Sorting/ReferenceAlgorithms/CedermanQuicksort.h
new file mode 100644
index 0000000000000000000000000000000000000000..0ae220842d4399ab0bcf76060bc51675e12e3f1d
--- /dev/null
+++ b/src/Benchmarks/Sorting/ReferenceAlgorithms/CedermanQuicksort.h
@@ -0,0 +1,1100 @@
+
+
+#ifndef PQSORTH
+#define PQSORTH
+
+#ifdef _MSC_VER
+#ifdef BUILDING_DLL
+#define DLLEXPORT __declspec(dllexport)
+#else
+#define DLLEXPORT /*__declspec(dllimport)*/
+#endif
+#else
+#ifdef HAVE_GCCVISIBILITYPATCH
+#define DLLEXPORT __attribute__((visibility("default")))
+#else
+#define DLLEXPORT
+#endif
+#endif
+
+#define MAXTHREADS 256
+#define MAXBLOCKS 2048
+
+/**
+* The main sort function
+* @param data		Data to be sorted
+* @param size		The length of the data
+* @returns 0 if successful. For non-zero values, use getErrorStr() for more information about why it failed.
+*/
+int gpuqsort(unsigned int *data, unsigned int size, unsigned int blockscount = 0, unsigned int threads = 0, unsigned int sbsize = 0, unsigned int phase = 0);
+
+
+// Keep tracks of the data blocks in phase one
+template <typename element>
+struct BlockSize
+{
+	unsigned int beg;
+	unsigned int end;
+	unsigned int orgbeg;
+	unsigned int orgend;
+	element rmaxpiv;
+	element lmaxpiv;
+	element rminpiv;
+	element lminpiv;
+
+	bool altered;
+	bool flip;
+	element pivot;
+};
+
+// Holds parameters to the kernel in phase one
+template <typename element>
+struct Params
+{
+	unsigned int from;
+	unsigned int end;
+	element pivot;
+	unsigned int ptr;
+	bool last;
+};
+
+// Used to perform a cumulative sum between blocks.
+// Unnecessary for cards with atomic operations.
+// Will be removed when these becomes more common
+template <typename element>
+struct Length
+{
+	element maxpiv[MAXBLOCKS];
+	element minpiv[MAXBLOCKS];
+
+	unsigned int left[MAXBLOCKS];
+	unsigned int right[MAXBLOCKS];
+};
+
+// Since we have divided up the kernel in to three
+// we need to remember the result of the cumulative sum
+// Unnecessary for cards with atomic operations.
+// Will be removed when these becomes more common
+struct Hist
+{
+	unsigned int left[(MAXTHREADS)*MAXBLOCKS];
+	unsigned int right[(MAXTHREADS)*MAXBLOCKS];
+};
+
+struct LQSortParams
+{
+	unsigned int beg;
+	unsigned int end;
+	bool flip;
+	unsigned int sbsize;
+};
+
+template <typename element>
+class GPUQSort
+{
+	element *ddata;
+	element *ddata2;
+	struct Params<element> *params;
+	struct Params<element> *dparams;
+
+	LQSortParams *lqparams;
+	LQSortParams *dlqparams;
+
+	Hist *dhists;
+	Length<element> *dlength;
+	Length<element> *length;
+	BlockSize<element> *workset;
+
+	float TK, TM, MK, MM, SM, SK;
+
+	int err;
+	bool init;
+
+	bool errCheck(int e);
+
+public:
+	GPUQSort();
+	~GPUQSort();
+
+	int sort(element *data, unsigned int size, unsigned int blockscount = 0, unsigned int threads = 0, unsigned int sbsize = 0, unsigned int phase = 0);
+	const char *getErrorStr();
+};
+
+#endif
+
+#undef THREADS
+
+#define THREADS blockDim.x
+
+extern __shared__ unsigned int sarray[];
+
+#ifdef HASATOMICS
+__device__ unsigned int ohtotal = 0;
+#endif
+
+/**
+* Swaps the location of two unsigned ints
+* @param a This unsigned int will swap place with unsigned int b
+* @param b This unsigned int will swap place with unsigned int a
+*/
+//template <typename unsigned int>
+__device__ inline void swap(unsigned int &a, unsigned int &b)
+{
+	unsigned int tmp = a;
+	a = b;
+	b = tmp;
+}
+
+/**
+* Perform a bitonic sort
+* @param values The unsigned ints to be sorted
+* @param target Where to place the sorted unsigned int when done
+* @param size The number of unsigned ints
+*/
+//template <typename unsigned int>
+__device__ inline void bitonicSort(unsigned int *fromvalues, unsigned int *tovalues, unsigned int from, unsigned int size)
+{
+	unsigned int *shared = (unsigned int *)sarray;
+
+	unsigned int coal = (from & 0xf);
+	size = size + coal;
+	from = from - coal;
+
+	int sb = 2 << (int)(__log2f(size));
+
+	// Buffer data to be sorted in the shared memory
+	for (int i = threadIdx.x; i < size; i += THREADS)
+	{
+		shared[i] = fromvalues[i + from];
+	}
+
+	for (int i = threadIdx.x; i < coal; i += THREADS)
+		shared[i] = 0;
+
+	// Pad the data
+	for (int i = threadIdx.x + size; i < sb; i += THREADS)
+		shared[i] = 0xffffffff;
+
+	__syncthreads();
+
+	// Parallel bitonic sort.
+	for (int k = 2; k <= sb; k *= 2)
+	{
+		// Bitonic merge:
+		for (int j = k / 2; j > 0; j /= 2)
+		{
+			for (int tid = threadIdx.x; tid < sb; tid += THREADS)
+			{
+				unsigned int ixj = tid ^ j;
+
+				if (ixj > tid)
+				{
+					if ((tid & k) == 0)
+					{
+						if (shared[tid] > shared[ixj])
+						{
+							swap(shared[tid], shared[ixj]);
+						}
+					}
+					else
+					{
+						if (shared[tid] < shared[ixj])
+						{
+							swap(shared[tid], shared[ixj]);
+						}
+					}
+				}
+			}
+
+			__syncthreads();
+		}
+	}
+	__syncthreads();
+
+	// Write back the sorted data to its correct position
+	for (int i = threadIdx.x; i < size; i += THREADS)
+		if (i >= coal)
+			tovalues[i + from] = shared[i];
+	__syncthreads();
+}
+
+/**
+* Perform a cumulative count on two arrays
+* @param lblock Array one
+* @param rblock Array two
+*/
+__device__ inline void cumcount(unsigned int *lblock, unsigned int *rblock)
+{
+	int tx = threadIdx.x;
+
+	int offset = 1;
+
+	__syncthreads();
+
+	for (int d = THREADS >> 1; d > 0; d >>= 1) // build sum in place up the tree
+	{
+		__syncthreads();
+
+		if (tx < d)
+		{
+			int ai = offset * (2 * tx + 1) - 1;
+			int bi = offset * (2 * tx + 2) - 1;
+			lblock[bi] += lblock[ai];
+			rblock[bi] += rblock[ai];
+		}
+		offset *= 2;
+	}
+	__syncthreads();
+	if (tx == 0)
+	{
+		lblock[THREADS] = lblock[THREADS - 1];
+		rblock[THREADS] = rblock[THREADS - 1];
+		lblock[THREADS - 1] = 0;
+		rblock[THREADS - 1] = 0;
+	} // clear the last unsigned int */
+	__syncthreads();
+
+	for (int d = 1; d < THREADS; d *= 2) // traverse down tree & build scan
+	{
+		offset >>= 1;
+		__syncthreads();
+
+		if (tx < d)
+		{
+			int ai = offset * (2 * tx + 1) - 1;
+			int bi = offset * (2 * tx + 2) - 1;
+
+			int t = lblock[ai];
+			lblock[ai] = lblock[bi];
+			lblock[bi] += t;
+
+			t = rblock[ai];
+			rblock[ai] = rblock[bi];
+			rblock[bi] += t;
+		}
+	}
+}
+
+/**
+* Part One - Counts the number of unsigned ints larger or smaller than the pivot. It then
+* performs a cumulative sum so that each thread knows where to write
+* @param data   unsigned ints to be counted
+* @param params Specifies which data each thread block is responsible for
+* @param hist   The cumulative sum for each thread is stored here
+* @param lengths The total sum for each thread block is stored here
+*/
+//template <typename unsigned int>
+__global__ void part1(unsigned int *data, Params<unsigned int> *params, struct Hist *hist, Length<unsigned int> *lengths)
+{
+	const int tx = threadIdx.x;
+
+	unsigned int *lblock = (unsigned int *)sarray;
+	unsigned int *rblock = (unsigned int *)(&lblock[(blockDim.x + 1)]);
+	unsigned int *minpiv = (unsigned int *)(&rblock[(blockDim.x + 1)]);
+	unsigned int *maxpiv = (unsigned int *)(&minpiv[blockDim.x]);
+
+	// Where should we read?
+	unsigned int start = params[blockIdx.x].from;
+	unsigned int end = params[blockIdx.x].end;
+	unsigned int pivot = params[blockIdx.x].pivot;
+
+	// Stores the max and min value of the data. Used to decide a new pivot
+	minpiv[tx] = data[start + tx];
+	maxpiv[tx] = data[start + tx];
+
+	__syncthreads();
+	int ll = 0;
+	int lr = 0;
+
+	__syncthreads();
+
+	int coal = (start & 0xf);
+	start = start - coal;
+
+	// Go through the data
+	if (tx + start < end)
+	{
+		unsigned int d = data[tx + start];
+
+		if (!(tx < coal))
+		{
+
+			// Counting unsigned ints smaller...
+			if (d < pivot)
+				ll++;
+			else
+				// or larger than the pivot
+				if (d > pivot)
+				lr++;
+
+			// Store the max and min unsigned int
+			minpiv[tx] = min(minpiv[tx], d);
+			maxpiv[tx] = max(maxpiv[tx], d);
+		}
+	}
+
+	// Go through the data
+	for (unsigned int i = tx + start + THREADS; i < end; i += THREADS)
+	{
+		unsigned int d = data[i];
+
+		// Counting unsigned ints smaller...
+		if (d < pivot)
+			ll++;
+		else
+			// or larger than the pivot
+			if (d > pivot)
+			lr++;
+
+		// Store the max and min unsigned int
+		minpiv[tx] = min(minpiv[tx], d);
+		maxpiv[tx] = max(maxpiv[tx], d);
+	}
+
+	lblock[tx] = ll;
+	rblock[tx] = lr;
+
+	__syncthreads();
+
+	// Perform a cumulative sum
+	cumcount((unsigned int *)lblock, (unsigned int *)rblock);
+
+	if (tx == 0)
+	{
+		// Decide on max and min unsigned int
+		for (int i = 0; i < THREADS; i++)
+		{
+			minpiv[0] = min(minpiv[0], minpiv[i]);
+			maxpiv[0] = max(maxpiv[0], maxpiv[i]);
+		}
+	}
+	__syncthreads();
+
+	// Store each threads part of the cumulative count
+	hist->left[blockIdx.x * (THREADS) + threadIdx.x] = lblock[threadIdx.x + 1];
+	hist->right[blockIdx.x * (THREADS) + threadIdx.x] = rblock[threadIdx.x + 1];
+
+	// Store the total sum
+	lengths->left[blockIdx.x] = lblock[THREADS];
+	lengths->right[blockIdx.x] = rblock[THREADS];
+
+	// Store the max and min unsigned int
+	lengths->minpiv[blockIdx.x] = minpiv[0];
+	lengths->maxpiv[blockIdx.x] = maxpiv[0];
+}
+
+/**
+* Part Two - Move unsigned ints to their correct position in the auxillary array
+* @param data   unsigned ints to be moved
+* @param data2  Destination for unsigned ints
+* @param params Specifies which data each thread block is responsible for
+* @param hist   The cumulative sum for each thread is stored here
+* @param lengths The total sum for each thread block is stored here
+*/
+//template <typename unsigned int>
+__global__ void part2(unsigned int *data, unsigned int *data2, struct Params<unsigned int> *params, struct Hist *hist, Length<unsigned int> *lengths)
+{
+	const int tx = threadIdx.x;
+	const int bx = blockIdx.x;
+
+	// Each thread uses the cumulative sum to know where to write
+	unsigned int x = lengths->left[bx] + hist->left[bx * (THREADS) + tx] - 1; // - 1;
+	unsigned int y = lengths->right[bx] - hist->right[bx * (THREADS) + tx];
+
+	// Where should we read?
+	unsigned int start = params[bx].from;
+	unsigned int end = params[bx].end;
+	unsigned int pivot = params[bx].pivot;
+
+	__syncthreads();
+
+	int coal = (start & 0xf);
+	start = start - coal;
+
+	// Go through all the assigned data
+	if (tx + start < end)
+	{
+		// Reading unsigned ints...
+		unsigned int d = data[tx + start];
+
+		if (!(tx < coal))
+		{
+
+			// and writing them to auxillary array
+			if (d < pivot)
+			{
+				if (x > 0)
+					data2[x--] = d;
+				else
+					data2[x] = d;
+			}
+			else if (d > pivot)
+				data2[y++] = d;
+		}
+	}
+
+	__syncthreads();
+
+	// Go through all the assigned data
+	for (unsigned int i = start + tx + THREADS; i < end; i += THREADS)
+	{
+		// Reading unsigned ints...
+		unsigned int d = data[i];
+
+		// and writing them to auxillary array
+		if (d < pivot)
+		{
+			if (x > 0)
+				data2[x--] = d;
+			else
+				data2[x] = d;
+		}
+		else if (d > pivot)
+			data2[y++] = d;
+	}
+
+	return;
+}
+
+/**
+* Part Three - Write the pivot value
+* @param data   Destination for pivot
+* @param params Specifies which data each thread block is responsible for
+* @param hist   The cumulative sum for each thread is stored here
+* @param lengths The total sum for each thread block is stored here
+*/
+//template <typename unsigned int>
+__global__ void part3(unsigned int *data, struct Params<unsigned int> *params, struct Hist *hist, Length<unsigned int> *lengths)
+{
+	const int tx = threadIdx.x;
+	const int bx = blockIdx.x;
+
+	// If we are the "last" thread block that is assigned to the same data sequence
+	// we write the pivot between the left and right block
+	if (params[bx].last)
+	{
+		// Get destination position
+		unsigned int x = lengths->left[bx] + hist->left[bx * THREADS + THREADS - 1] + tx;
+		unsigned int y = lengths->right[bx] - hist->right[bx * THREADS + THREADS - 1];
+		unsigned int pivot = params[bx].pivot;
+
+		// Write the pivot values
+		for (; x < y; x += THREADS)
+			data[x] = pivot;
+	}
+}
+
+/**
+* The local quicksort - sorts a block of data with no inter-block synchronization
+* @param adata  Contains some of the blocks to be sorted and also acts as the final
+*               destination for sorted data
+* @param adata2 Contains some of the blocks to be sorted
+* @param bs     List of blocks to be sorted and a pointer telling if a specific block is
+*               in \a adata or \a adata2
+*/
+//template <typename unsigned int>
+__global__ void lqsort(unsigned int *adata, unsigned int *adata2, struct LQSortParams *bs, unsigned int phase)
+{
+	__shared__ unsigned int lphase;
+	lphase = phase;
+
+	// Shorthand for the threadid
+	int tx = threadIdx.x;
+
+	// Stack pointer
+	__shared__ int bi;
+
+	// Stack unsigned ints
+	__shared__ unsigned int beg[32];
+	__shared__ unsigned int end[32];
+	__shared__ bool flip[32];
+
+	unsigned int *lblock = (unsigned int *)sarray;
+	unsigned int *rblock = (unsigned int *)(&lblock[(blockDim.x + 1)]);
+
+	// The current pivot
+	__shared__ unsigned int pivot;
+
+	// The sequence to be sorted
+	__shared__ unsigned int from;
+	__shared__ unsigned int to;
+
+	// Since we switch between the primary and the auxillary buffer,
+	// these variables are required to keep track on which role
+	// a buffer currently has
+	__shared__ unsigned int *data;
+	__shared__ unsigned int *data2;
+	__shared__ unsigned int sbsize;
+
+	__shared__ unsigned int bx;
+	if (threadIdx.x == 0)
+#ifdef HASATOMICS
+		bx = atomicInc(&ohtotal, 50000);
+#else
+		bx = blockIdx.x;
+#endif
+
+	__syncthreads();
+
+	while (bx < gridDim.x)
+	{
+
+		// Thread 0 is in charge of the stack operations
+		if (tx == 0)
+		{
+			// We push our first block on the stack
+			// This is the block given by the bs parameter
+			beg[0] = bs[bx].beg;
+			end[0] = bs[bx].end;
+			flip[0] = bs[bx].flip;
+			sbsize = bs[bx].sbsize;
+
+			bi = 0;
+		}
+
+		__syncthreads();
+
+		// If we were given an empty block there is no need to continue
+		if (end[0] == beg[0])
+			return;
+
+		// While there are items left on the stack to sort
+		while (bi >= 0)
+		{
+			__syncthreads();
+			// Thread 0 pops a fresh sequence from the stack
+			if (tx == 0)
+			{
+				from = beg[bi];
+				to = end[bi];
+
+				// Check which buffer the sequence is in
+				if (!flip[bi])
+				{
+					data = adata2;
+					data2 = adata;
+				}
+				else
+				{
+					data = adata;
+					data2 = adata2;
+				}
+			}
+
+			__syncthreads();
+
+			// If the sequence is smaller than SBSIZE we sort it using
+			// an alternative sort. Otherwise each thread would sort just one
+			// or two unsigned ints and that wouldn't be efficient
+			if ((to - from) < (sbsize - 16))
+			{
+				// Sort it using bitonic sort. This could be changed to some other
+				// sorting method. Store the result in the final destination buffer
+				if ((to - from >= 1) && (lphase != 2))
+					bitonicSort(data, adata, from, to - from);
+				__syncthreads();
+
+				// Decrement the stack pointer
+				if (tx == 0)
+					bi--;
+				__syncthreads();
+				// and continue with the next sequence
+				continue;
+			}
+
+			if (tx == 0)
+			{
+				// Create a new pivot for the sequence
+				// Try to optimize this for your input distribution
+				// if you have some information about it
+				unsigned int mip = min(min(data[from], data[to - 1]), data[(from + to) / 2]);
+				unsigned int map = max(max(data[from], data[to - 1]), data[(from + to) / 2]);
+				pivot = min(max(mip / 2 + map / 2, mip), map);
+			}
+
+			unsigned int ll = 0;
+			unsigned int lr = 0;
+
+			__syncthreads();
+
+			unsigned int coal = (from)&0xf;
+
+			if (tx + from - coal < to)
+			{
+				unsigned int d = data[tx + from - coal];
+
+				if (!(tx < coal))
+				{
+					// Counting unsigned ints that have a higher value than the pivot
+					if (d < pivot)
+						ll++;
+					else
+						// or a lower
+						if (d > pivot)
+						lr++;
+				}
+			}
+
+			// Go through the current sequence
+			for (int i = from + tx + THREADS - coal; i < to; i += THREADS)
+			{
+				unsigned int d = data[i];
+
+				// Counting unsigned ints that have a higher value than the pivot
+				if (d < pivot)
+					ll++;
+				else
+					// or a lower
+					if (d > pivot)
+					lr++;
+			}
+
+			// Store the result in a shared array so that we can calculate a
+			// cumulative sum
+			lblock[tx] = ll;
+			rblock[tx] = lr;
+
+			__syncthreads();
+
+			// Calculate the cumulative sum
+			cumcount((unsigned int *)lblock, (unsigned int *)rblock);
+
+			__syncthreads();
+
+			// Let thread 0 add the new resulting subsequences to the stack
+			if (tx == 0)
+			{
+				// The sequences are in the other buffer now
+				flip[bi + 1] = !flip[bi];
+				flip[bi] = !flip[bi];
+
+				// We need to place the smallest object on top of the stack
+				// to ensure that we don't run out of stack space
+				if (lblock[THREADS] < rblock[THREADS])
+				{
+					beg[bi + 1] = beg[bi];
+					beg[bi] = to - rblock[THREADS];
+					end[bi + 1] = from + lblock[THREADS];
+				}
+				else
+				{
+					end[bi + 1] = end[bi];
+					end[bi] = from + lblock[THREADS];
+					beg[bi + 1] = to - rblock[THREADS];
+				}
+				// Increment the stack pointer
+				bi++;
+			}
+
+			__syncthreads();
+
+			unsigned int x = from + lblock[tx + 1] - 1;
+			unsigned int y = to - rblock[tx + 1];
+
+			coal = from & 0xf;
+
+			if (tx + from - coal < to)
+			{
+				unsigned int d = data[tx + from - coal];
+
+				if (!(tx < coal))
+				{
+					if (d < pivot)
+					{
+						if (x > 0)
+							data2[x--] = d;
+						else
+							data2[x] = d;
+					}
+					else if (d > pivot)
+						data2[y++] = d;
+				}
+			}
+
+			// Go through the data once again
+			// writing it to its correct position
+			for (unsigned int i = from + tx + THREADS - coal; i < to; i += THREADS)
+			{
+				unsigned int d = data[i];
+
+				if (d < pivot)
+				{
+					if (x > 0)
+						data2[x--] = d;
+					else
+						data2[x] = d;
+				}
+				else if (d > pivot)
+					data2[y++] = d;
+			}
+
+			__syncthreads();
+
+			// As a final step, write the pivot value between the right and left
+			// subsequence. Write it to the final destination since this pivot
+			// is always correctly sorted
+			for (unsigned int i = from + lblock[THREADS] + tx; i < to - rblock[THREADS]; i += THREADS)
+			{
+				adata[i] = pivot;
+			}
+
+			__syncthreads();
+		}
+#ifdef HASATOMICS
+		if (threadIdx.x == 0)
+			bx = atomicInc(&ohtotal, 50000);
+		__syncthreads();
+#else
+		break;
+#endif
+	}
+
+	__syncthreads();
+}
+
+#include <stdio.h>
+#include <algorithm>
+#include <string>
+
+#undef THREADS
+#define THREADS threads
+
+/**
+* The main sort function
+* @param data		Data to be sorted
+* @param size		The length of the data
+* @returns 0 if successful. For non-zero values, use getErrorStr() for more information about why it failed.
+*/
+template <typename element>
+int GPUQSort<element>::sort(element *data, unsigned int size, unsigned int blockscount, unsigned int threads, unsigned int sbsize, unsigned int phase)
+{
+	if (!init)
+		return 1;
+
+	if (!threads || !blockscount || !sbsize)
+	{
+		threads = 1 << (int)round(log(size * TK + TM) / log(2.0));
+		blockscount = 1 << (int)round(log(size * MK + MM) / log(2.0));
+		sbsize = 1 << (int)round(log(size * SK + SM) / log(2.0));
+	}
+
+#ifdef HASATOMICS
+	unsigned int *doh;
+	unsigned int oh;
+
+	cudaGetSymbolAddress((void **)&doh, "ohtotal");
+	oh = 0;
+	cudaMemcpy(doh, &oh, 4, cudaMemcpyHostToDevice);
+#endif
+
+	if (threads > MAXTHREADS)
+		return 1;
+
+	if (blockscount > MAXBLOCKS)
+		return 1;
+
+	// Copy the data to the graphics card and create an auxiallary array
+	ddata2 = 0;
+	ddata = 0;
+	if (!errCheck(cudaMalloc((void **)&ddata2, (size) * sizeof(element))))
+		return 1;
+	if (!errCheck(cudaMalloc((void **)&ddata, (size) * sizeof(element))))
+		return 1;
+	if (!errCheck(cudaMemcpy(ddata, data, size * sizeof(element), cudaMemcpyHostToDevice)))
+		return 1;
+
+	// We start with a set containg only the sequence to be sorted
+	// This will grow as we partition the data
+	workset[0].beg = 0;
+	workset[0].end = size;
+	workset[0].orgbeg = 0;
+	workset[0].orgend = size;
+	workset[0].altered = false;
+	workset[0].flip = false;
+
+	// Get a starting pivot
+	workset[0].pivot = (min(min(data[0], data[size / 2]), data[size - 1]) + max(max(data[0], data[size / 2]), data[size - 1])) / 2;
+	unsigned int worksize = 1;
+
+	unsigned int blocks = blockscount / 2;
+	unsigned totsize = size;
+	unsigned int maxlength = (size / blocks) / 4;
+
+	unsigned int iterations = 0;
+	bool flip = true;
+
+	// Partition the sequences until we have enough
+	while (worksize < blocks)
+	{
+		unsigned int ws = totsize / blocks;
+		unsigned int paramsize = 0;
+
+		// Go through the sequences we have and divide them into sections
+		// and assign thread blocks according to their size
+		for (unsigned int i = 0; i < worksize; i++)
+		{
+			if ((workset[i].end - workset[i].beg) < maxlength)
+				continue;
+
+			// Larger sequences gets more thread blocks assigned to them
+			unsigned int blocksassigned = max((workset[i].end - workset[i].beg) / ws, 1);
+			for (unsigned int q = 0; q < blocksassigned; q++)
+			{
+				params[paramsize].from = workset[i].beg + ws * q;
+				params[paramsize].end = params[paramsize].from + ws;
+				params[paramsize].pivot = workset[i].pivot;
+				params[paramsize].ptr = i;
+				params[paramsize].last = false;
+				paramsize++;
+			}
+			params[paramsize - 1].last = true;
+			params[paramsize - 1].end = workset[i].end;
+
+			workset[i].lmaxpiv = 0;
+			workset[i].lminpiv = 0xffffffff;
+			workset[i].rmaxpiv = 0;
+			workset[i].rminpiv = 0xffffffff;
+		}
+
+		if (paramsize == 0)
+			break;
+
+		// Copy the block assignment to the GPU
+		if (!errCheck(cudaMemcpy(dparams, params, paramsize * sizeof(Params<element>), cudaMemcpyHostToDevice)))
+			return 1;
+
+		// Do the cumulative sum
+		if (flip)
+			part1<<<paramsize, THREADS, (THREADS + 1) * 2 * 4 + THREADS * 2 * 4>>>(ddata, dparams, dhists, dlength);
+		else
+			part1<<<paramsize, THREADS, (THREADS + 1) * 2 * 4 + THREADS * 2 * 4>>>(ddata2, dparams, dhists, dlength);
+		if (!errCheck((cudaMemcpy(length, dlength, sizeof(Length<element>), cudaMemcpyDeviceToHost))))
+			return 1;
+
+		// Do the block cumulative sum. Done on the CPU since not all cards have support for
+		// atomic operations yet.
+		for (unsigned int i = 0; i < paramsize; i++)
+		{
+			unsigned int l = length->left[i];
+			unsigned int r = length->right[i];
+
+			length->left[i] = workset[params[i].ptr].beg;
+			length->right[i] = workset[params[i].ptr].end;
+
+			workset[params[i].ptr].beg += l;
+			workset[params[i].ptr].end -= r;
+			workset[params[i].ptr].altered = true;
+
+			workset[params[i].ptr].rmaxpiv = max(length->maxpiv[i], workset[params[i].ptr].rmaxpiv);
+			workset[params[i].ptr].lminpiv = min(length->minpiv[i], workset[params[i].ptr].lminpiv);
+
+			workset[params[i].ptr].lmaxpiv = min(workset[params[i].ptr].pivot, workset[params[i].ptr].rmaxpiv);
+			workset[params[i].ptr].rminpiv = max(workset[params[i].ptr].pivot, workset[params[i].ptr].lminpiv);
+		}
+
+		// Copy the result of the block cumulative sum to the GPU
+		if (!errCheck((cudaMemcpy(dlength, length, sizeof(Length<element>), cudaMemcpyHostToDevice))))
+			return 1;
+
+		// Move the elements to their correct position
+		if (flip)
+			part2<<<paramsize, THREADS>>>(ddata, ddata2, dparams, dhists, dlength);
+		else
+			part2<<<paramsize, THREADS>>>(ddata2, ddata, dparams, dhists, dlength);
+
+		// Fill in the pivot value between the left and right blocks
+		part3<<<paramsize, THREADS>>>(ddata, dparams, dhists, dlength);
+
+		flip = !flip;
+
+		// Add the sequences resulting from the partitioning
+		// to set
+		unsigned int oldworksize = worksize;
+		totsize = 0;
+		for (unsigned int i = 0; i < oldworksize; i++)
+		{
+			if (workset[i].altered)
+			{
+				if (workset[i].beg - workset[i].orgbeg >= maxlength)
+					totsize += workset[i].beg - workset[i].orgbeg;
+				if (workset[i].orgend - workset[i].end >= maxlength)
+					totsize += workset[i].orgend - workset[i].end;
+
+				workset[worksize].beg = workset[worksize].orgbeg = workset[i].orgbeg;
+				workset[worksize].end = workset[worksize].orgend = workset[i].beg;
+				workset[worksize].flip = flip;
+				workset[worksize].altered = false;
+				workset[worksize].pivot = (workset[i].lminpiv / 2 + workset[i].lmaxpiv / 2);
+
+				worksize++;
+
+				workset[i].orgbeg = workset[i].beg = workset[i].end;
+				workset[i].end = workset[i].orgend;
+				workset[i].flip = flip;
+				workset[i].pivot = (workset[i].rminpiv / 2 + workset[i].rmaxpiv / 2);
+				workset[i].altered = false;
+			}
+		}
+		iterations++;
+	}
+
+	// Due to the poor scheduler on some graphics card
+	// we need to sort the order in which the blocks
+	// are sorted to avoid poor scheduling decisions
+	unsigned int sortblocks[MAXBLOCKS * 2];
+	for (int i = 0; i < worksize; i++)
+		sortblocks[i] = ((workset[i].end - workset[i].beg) << (int)round(log((float)(MAXBLOCKS * 4.0f)) / log(2.0f))) + i;
+	std::sort(&sortblocks[0], &sortblocks[worksize]);
+
+	if (worksize != 0)
+	{
+		// Copy the block assignments to the GPU
+		for (int i = 0; i < worksize; i++)
+		{
+			unsigned int q = (worksize - 1) - (sortblocks[i] & (MAXBLOCKS * 4 - 1));
+
+			lqparams[i].beg = workset[q].beg;
+			lqparams[i].end = workset[q].end;
+			lqparams[i].flip = workset[q].flip;
+			lqparams[i].sbsize = sbsize;
+		}
+
+		if (!errCheck((cudaMemcpy(dlqparams, lqparams, worksize * sizeof(LQSortParams), cudaMemcpyHostToDevice))))
+			return 1;
+
+		// Run the local quicksort, the one that doesn't need inter-block synchronization
+		if (phase != 1)
+			lqsort<<<worksize, THREADS, max((THREADS + 1) * 2 * 4, sbsize * 4)>>>(ddata, ddata2, dlqparams, phase);
+	}
+
+	err = cudaDeviceSynchronize();
+	// Free the data
+	if (err != cudaSuccess)
+	{
+		cudaFree(ddata);
+		cudaFree(ddata2);
+		return 1;
+	}
+
+	// Copy the result back to the CPU
+	if (!errCheck((cudaMemcpy(data, ddata, size * sizeof(element), cudaMemcpyDeviceToHost))))
+		return 1;
+
+	cudaFree(ddata);
+	cudaFree(ddata2);
+
+	return 0;
+}
+
+template <typename element>
+bool GPUQSort<element>::errCheck(int e)
+{
+	if (e == cudaSuccess)
+		return true;
+
+	err = e;
+	cudaFree(ddata);
+	cudaFree(ddata2);
+	return false;
+}
+
+template <typename element>
+GPUQSort<element>::GPUQSort() : init(false), workset(0), params(0), length(0), lqparams(0), dlqparams(0),
+								dhists(0), dlength(0), dparams(0)
+{
+	cudaDeviceProp deviceProp;
+	cudaGetDeviceProperties(&deviceProp, 0);
+	if (!strcmp(deviceProp.name, "GeForce 8800 GTX"))
+	{
+		TK = 1.17125033316e-005f;
+		TM = 52.855721393f;
+		MK = 3.7480010661e-005f;
+		MM = 476.338308458f;
+		SK = 4.68500133262e-005f;
+		SM = 211.422885572f;
+	}
+	else if (!strcmp(deviceProp.name, "GeForce 8600 GTS"))
+	{
+		TK = 0.0f;
+		TM = 64.0f;
+		MK = 0.0000951623403898f;
+		MM = 476.338308458f;
+		SK = 0.0000321583081317f;
+		SM = 202.666666667f;
+	}
+	else
+	{
+		TK = 0;
+		TM = 128;
+		MK = 0;
+		MM = 512;
+		SK = 0;
+		SM = 512;
+	}
+
+	if (cudaMallocHost((void **)&workset, MAXBLOCKS * 2 * sizeof(BlockSize<element>)) != cudaSuccess)
+		return;
+	if (cudaMallocHost((void **)&params, MAXBLOCKS * sizeof(Params<element>)) != cudaSuccess)
+		return;
+	if (cudaMallocHost((void **)&length, sizeof(Length<element>)) != cudaSuccess)
+		return;
+	if (cudaMallocHost((void **)&lqparams, MAXBLOCKS * sizeof(LQSortParams)) != cudaSuccess)
+		return;
+	if (cudaMalloc((void **)&dlqparams, MAXBLOCKS * sizeof(LQSortParams)) != cudaSuccess)
+		return;
+	if (cudaMalloc((void **)&dhists, sizeof(Hist)) != cudaSuccess)
+		return;
+	if (cudaMalloc((void **)&dlength, sizeof(Length<element>)) != cudaSuccess)
+		return;
+	if (cudaMalloc((void **)&dparams, MAXBLOCKS * sizeof(Params<element>)) != cudaSuccess)
+		return;
+
+	init = true;
+}
+
+/**
+* Returns the latest error message
+* @returns the latest error message
+*/
+template <typename element>
+const char *GPUQSort<element>::getErrorStr()
+{
+	return cudaGetErrorString((cudaError_t)err);
+}
+
+template <typename element>
+GPUQSort<element>::~GPUQSort()
+{
+	cudaFreeHost(workset);
+	cudaFreeHost(params);
+	cudaFreeHost(length);
+	cudaFreeHost(lqparams);
+	cudaFree(dparams);
+	cudaFree(dlqparams);
+	cudaFree(dhists);
+	cudaFree(dlength);
+}
+
+int gpuqsort(unsigned int *data, unsigned int size, unsigned int blockscount, unsigned int threads, unsigned int sbsize, unsigned int phase)
+{
+	GPUQSort<unsigned int> *s = new GPUQSort<unsigned int>();
+
+	if (s->sort(data, size, blockscount, threads, sbsize, phase) != 0)
+	{
+		delete s;
+		return 1;
+	}
+	else
+	{
+		delete s;
+		return 0;
+	}
+}
+
+struct CedermanQuicksort
+{
+   static void sort( TNL::Containers::ArrayView< int, TNL::Devices::Cuda >& array )
+   {
+      gpuqsort( ( unsigned int * ) array.getData(), ( unsigned int ) array.getSize() );
+   }
+};
diff --git a/src/Benchmarks/Sorting/ReferenceAlgorithms/MancaQuicksort.h b/src/Benchmarks/Sorting/ReferenceAlgorithms/MancaQuicksort.h
new file mode 100644
index 0000000000000000000000000000000000000000..e7f4d20fb0bfb2a31a6bf42fd4342098edb89dda
--- /dev/null
+++ b/src/Benchmarks/Sorting/ReferenceAlgorithms/MancaQuicksort.h
@@ -0,0 +1,1327 @@
+//defines the shared memory size
+#define SHARED_LIMIT 1024
+
+#define GIGA 1073741824
+/*
+ * division of the vector to be sorted in buckets
+ * the attributes of the object Block are the parameters of each bucket
+ */
+template <typename Type>
+struct Block
+{
+
+    unsigned int begin;
+    unsigned int end;
+
+    unsigned int nextbegin;
+    unsigned int nextend;
+
+    Type pivot;
+
+    //max of the bucket items
+    Type maxPiv;
+    //min of the bucket items
+    Type minPiv;
+    //done indicates that a bucket has been analyzed
+    short done;
+    short select;
+};
+
+template <typename Type>
+struct Partition
+{
+
+    unsigned int ibucket;
+    unsigned int from;
+    unsigned int end;
+    Type pivot;
+};
+
+void CUDA_Quicksort(unsigned int *inData, unsigned int *outData, unsigned int dataSize, unsigned int threads, int Device, double *timer);
+
+void CUDA_Quicksort_64(double *inData, double *outData, unsigned int size, unsigned int threads, int Device, double *timer);
+
+typedef unsigned int Type;
+
+void test_bitonicSort(unsigned int *h_InputKey, unsigned int N, double *timer);
+void test_MergeSort(unsigned int *h_SrcKey, unsigned int N, double *timer);
+void test_thrustSort(Type *h_data, unsigned int N, double *timer);
+
+typedef unsigned int uint;
+
+size_t scanInclusiveShort(
+    uint *d_Dst,
+    uint *d_Src,
+    uint batchSize,
+    uint arrayLength);
+
+size_t scanInclusiveLarge(
+    uint *d_Dst,
+    uint *d_Src,
+    uint batchSize,
+    uint arrayLength);
+
+template <typename Type>
+inline __device__ void warpScanInclusive2(Type &idata, Type &idata2, volatile Type *s_Data, volatile Type *s_Data2, uint size)
+{
+
+    //volatile uint* s_Data2;
+    //s_Data2 = s_Data + blockDim.x*2;
+
+    uint pos = 2 * threadIdx.x - (threadIdx.x & (size - 1));
+    s_Data[pos] = 0;
+    s_Data2[pos] = 0;
+    pos += size;
+    s_Data[pos] = idata;
+    s_Data2[pos] = idata2;
+
+    for (uint offset = 1; offset < size; offset <<= 1)
+    {
+        s_Data[pos] += s_Data[pos - offset];
+        s_Data2[pos] += s_Data2[pos - offset];
+    }
+
+    idata = s_Data[pos];
+    idata2 = s_Data2[pos];
+}
+
+template <typename Type>
+inline __device__ void warpScanExclusive2(Type &idata, Type &idata2, volatile Type *s_Data, volatile Type *s_Data2, uint size)
+{
+
+    //volatile uint* s_Data2;
+    //s_Data2 = s_Data + blockDim.x*2;
+
+    uint pos = 2 * threadIdx.x - (threadIdx.x & (size - 1));
+    s_Data[pos] = 0;
+    s_Data2[pos] = 0;
+    pos += size;
+    s_Data[pos] = idata;
+    s_Data2[pos] = idata2;
+
+    for (uint offset = 1; offset < size; offset <<= 1)
+    {
+        s_Data[pos] += s_Data[pos - offset];
+        s_Data2[pos] += s_Data2[pos - offset];
+    }
+
+    idata = s_Data[pos] - idata;
+    idata2 = s_Data2[pos] - idata2;
+}
+
+#define LOG2_WARP_SIZE 5U
+#define WARP_SIZE (1U << LOG2_WARP_SIZE)
+
+template <typename Type>
+inline __device__ void scan1Inclusive2(Type &idata, Type &idata2, volatile Type *s_Data, uint size)
+{
+
+    volatile Type *s_Data2;
+    s_Data2 = s_Data + blockDim.x * 2;
+
+    if (size > WARP_SIZE)
+    {
+
+        //Bottom-level inclusive warp scan
+        warpScanInclusive2(idata, idata2, s_Data, s_Data2, WARP_SIZE);
+
+        //Save top Types of each warp for exclusive warp scan
+        //sync to wait for warp scans to complete (because s_Data is being overwritten)
+        __syncthreads();
+        if ((threadIdx.x & (WARP_SIZE - 1)) == (WARP_SIZE - 1))
+        {
+            s_Data[threadIdx.x >> LOG2_WARP_SIZE] = idata;
+            s_Data2[threadIdx.x >> LOG2_WARP_SIZE] = idata2;
+        }
+
+        //wait for warp scans to complete
+        __syncthreads();
+        if (threadIdx.x < (blockDim.x / WARP_SIZE))
+        {
+            //grab top warp Types
+            Type val = s_Data[threadIdx.x];
+            Type val2 = s_Data2[threadIdx.x];
+            //calculate exclsive scan and write back to shared memory
+            warpScanExclusive2(val, val2, s_Data, s_Data2, size >> LOG2_WARP_SIZE);
+            s_Data[threadIdx.x] = val;
+            s_Data2[threadIdx.x] = val2;
+        }
+
+        //return updated warp scans with exclusive scan results
+        __syncthreads();
+        idata += s_Data[threadIdx.x >> LOG2_WARP_SIZE];
+        idata2 += s_Data2[threadIdx.x >> LOG2_WARP_SIZE];
+    }
+    else
+        warpScanInclusive2(idata, idata2, s_Data, s_Data2, size);
+}
+
+template <typename Type>
+inline __device__ void warpCompareInclusive(Type &idata, Type &idata2, volatile Type *s_Data, uint size)
+{
+
+    volatile Type *s_Data2;
+    s_Data2 = s_Data + blockDim.x * 2;
+    uint pos = 2 * threadIdx.x - (threadIdx.x & (size - 1));
+    s_Data[pos] = 0;
+    s_Data2[pos] = 0;
+    pos += size;
+    s_Data[pos] = idata;
+    s_Data2[pos] = idata2;
+
+    for (uint offset = 1; offset < size; offset <<= 1)
+    {
+        s_Data[pos] = max(s_Data[pos], s_Data[pos - offset]);
+        s_Data2[pos] = min(s_Data2[pos], s_Data2[pos - offset]);
+    }
+
+    idata = s_Data[pos];
+    idata2 = s_Data2[pos];
+}
+
+template <typename Type>
+inline __device__ void compareInclusive(Type &idata, Type &idata2, volatile Type *s_Data, uint size)
+{
+
+    volatile Type *s_Data2;
+    s_Data2 = s_Data + blockDim.x * 2;
+    //Bottom-level inclusive warp scan
+    warpCompareInclusive(idata, idata2, s_Data, WARP_SIZE);
+
+    //Save top Types of each warp for exclusive warp scan
+    //sync to wait for warp scans to complete (because s_Data is being overwritten)
+    __syncthreads();
+    if ((threadIdx.x & (WARP_SIZE - 1)) == (WARP_SIZE - 1))
+    {
+        s_Data[threadIdx.x >> LOG2_WARP_SIZE] = idata;
+        s_Data2[threadIdx.x >> LOG2_WARP_SIZE] = idata2;
+    }
+
+    //wait for warp scans to complete
+    __syncthreads();
+    if (threadIdx.x < (blockDim.x / WARP_SIZE))
+    {
+        //grab top warp Types
+        Type val = s_Data[threadIdx.x];
+        Type val2 = s_Data2[threadIdx.x];
+        //calculate exclsive scan and write back to shared memory
+        warpCompareInclusive(val, val2, s_Data, size >> LOG2_WARP_SIZE);
+        s_Data[threadIdx.x] = val;
+        s_Data2[threadIdx.x] = val2;
+    }
+
+    //return updated warp scans with exclusive scan results
+    __syncthreads();
+    idata = max(idata, s_Data[threadIdx.x >> LOG2_WARP_SIZE]);
+    idata2 = min(idata2, s_Data2[threadIdx.x >> LOG2_WARP_SIZE]);
+}
+
+#include <assert.h>
+#include <common/inc/helper_cuda.h>
+#include <6_Advanced/scan/scan_common.h>
+
+//All three kernels run 512 threads per workgroup
+//Must be a power of two
+#define THREADBLOCK_SIZE 256
+
+////////////////////////////////////////////////////////////////////////////////
+// Basic ccan codelets
+////////////////////////////////////////////////////////////////////////////////
+#if (0)
+//Naive inclusive scan: O(N * log2(N)) operations
+//Allocate 2 * 'size' local memory, initialize the first half
+//with 'size' zeros avoiding if(pos >= offset) condition evaluation
+//and saving instructions
+inline __device__ uint scan1Inclusive(uint idata, volatile uint *s_Data, uint size)
+{
+    uint pos = 2 * threadIdx.x - (threadIdx.x & (size - 1));
+    s_Data[pos] = 0;
+    pos += size;
+    s_Data[pos] = idata;
+
+    for (uint offset = 1; offset < size; offset <<= 1)
+    {
+        __syncthreads();
+        uint t = s_Data[pos] + s_Data[pos - offset];
+        __syncthreads();
+        s_Data[pos] = t;
+    }
+
+    return s_Data[pos];
+}
+
+inline __device__ uint scan1Exclusive(uint idata, volatile uint *s_Data, uint size)
+{
+    return scan1Inclusive(idata, s_Data, size) - idata;
+}
+
+#else
+#define LOG2_WARP_SIZE 5U
+#define WARP_SIZE (1U << LOG2_WARP_SIZE)
+
+//Almost the same as naive scan1Inclusive, but doesn't need __syncthreads()
+//assuming size <= WARP_SIZE
+inline __device__ uint warpScanInclusive(uint idata, volatile uint *s_Data, uint size)
+{
+    uint pos = 2 * threadIdx.x - (threadIdx.x & (size - 1));
+    s_Data[pos] = 0;
+    pos += size;
+    s_Data[pos] = idata;
+
+    for (uint offset = 1; offset < size; offset <<= 1)
+        s_Data[pos] += s_Data[pos - offset];
+
+    return s_Data[pos];
+}
+
+inline __device__ uint warpScanExclusive(uint idata, volatile uint *s_Data, uint size)
+{
+    return warpScanInclusive(idata, s_Data, size) - idata;
+}
+
+inline __device__ uint scan1Inclusive(uint idata, volatile uint *s_Data, uint size)
+{
+    if (size > WARP_SIZE)
+    {
+        //Bottom-level inclusive warp scan
+        uint warpResult = warpScanInclusive(idata, s_Data, WARP_SIZE);
+
+        //Save top elements of each warp for exclusive warp scan
+        //sync to wait for warp scans to complete (because s_Data is being overwritten)
+        __syncthreads();
+        if ((threadIdx.x & (WARP_SIZE - 1)) == (WARP_SIZE - 1))
+            s_Data[threadIdx.x >> LOG2_WARP_SIZE] = warpResult;
+
+        //wait for warp scans to complete
+        __syncthreads();
+        if (threadIdx.x < (THREADBLOCK_SIZE / WARP_SIZE))
+        {
+            //grab top warp elements
+            uint val = s_Data[threadIdx.x];
+            //calculate exclsive scan and write back to shared memory
+            s_Data[threadIdx.x] = warpScanExclusive(val, s_Data, size >> LOG2_WARP_SIZE);
+        }
+
+        //return updated warp scans with exclusive scan results
+        __syncthreads();
+        return warpResult + s_Data[threadIdx.x >> LOG2_WARP_SIZE];
+    }
+    else
+    {
+        return warpScanInclusive(idata, s_Data, size);
+    }
+}
+
+inline __device__ uint scan1Exclusive(uint idata, volatile uint *s_Data, uint size)
+{
+    return scan1Inclusive(idata, s_Data, size) - idata;
+}
+
+#endif
+
+inline __device__ uint4 scan4Inclusive(uint4 idata4, volatile uint *s_Data, uint size)
+{
+    //Level-0 exclusive scan
+    idata4.y += idata4.x;
+    idata4.z += idata4.y;
+    idata4.w += idata4.z;
+
+    //Level-1 exclusive scan
+    uint oval = scan1Exclusive(idata4.w, s_Data, size / 4);
+
+    idata4.x += oval;
+    idata4.y += oval;
+    idata4.z += oval;
+    idata4.w += oval;
+
+    return idata4;
+}
+
+//Exclusive vector scan: the array to be scanned is stored
+//in local thread memory scope as uint4
+inline __device__ uint4 scan4Exclusive(uint4 idata4, volatile uint *s_Data, uint size)
+{
+    uint4 odata4 = scan4Inclusive(idata4, s_Data, size);
+    odata4.x -= idata4.x;
+    odata4.y -= idata4.y;
+    odata4.z -= idata4.z;
+    odata4.w -= idata4.w;
+    return odata4;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Scan kernels
+////////////////////////////////////////////////////////////////////////////////
+__global__ void scanExclusiveShared(
+    uint4 *d_Dst,
+    uint4 *d_Src,
+    uint size)
+{
+    __shared__ uint s_Data[2 * THREADBLOCK_SIZE];
+
+    uint pos = blockIdx.x * blockDim.x + threadIdx.x;
+
+    //Load data
+    uint4 idata4 = d_Src[pos];
+
+    //Calculate exclusive scan
+    uint4 odata4 = scan4Exclusive(idata4, s_Data, size);
+
+    //Write back
+    d_Dst[pos] = odata4;
+}
+
+//Exclusive scan of top elements of bottom-level scans (4 * THREADBLOCK_SIZE)
+__global__ void scanExclusiveShared2(
+    uint *d_Buf,
+    uint *d_Dst,
+    uint *d_Src,
+    uint N,
+    uint arrayLength)
+{
+    __shared__ uint s_Data[2 * THREADBLOCK_SIZE];
+
+    //Skip loads and stores for inactive threads of last threadblock (pos >= N)
+    uint pos = blockIdx.x * blockDim.x + threadIdx.x;
+
+    //Load top elements
+    //Convert results of bottom-level scan back to inclusive
+    uint idata = 0;
+    if (pos < N)
+        idata =
+            d_Dst[(4 * THREADBLOCK_SIZE) - 1 + (4 * THREADBLOCK_SIZE) * pos] +
+            d_Src[(4 * THREADBLOCK_SIZE) - 1 + (4 * THREADBLOCK_SIZE) * pos];
+
+    //Compute
+    uint odata = scan1Exclusive(idata, s_Data, arrayLength);
+
+    //Avoid out-of-bound access
+    if (pos < N)
+        d_Buf[pos] = odata;
+}
+
+//Final step of large-array scan: combine basic inclusive scan with exclusive scan of top elements of input arrays
+__global__ void uniformUpdate(
+    uint4 *d_Data,
+    uint *d_Buffer)
+{
+    __shared__ uint buf;
+    uint pos = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (threadIdx.x == 0)
+        buf = d_Buffer[blockIdx.x];
+    __syncthreads();
+
+    uint4 data4 = d_Data[pos];
+    data4.x += buf;
+    data4.y += buf;
+    data4.z += buf;
+    data4.w += buf;
+    d_Data[pos] = data4;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Interface function
+////////////////////////////////////////////////////////////////////////////////
+//Derived as 32768 (max power-of-two gridDim.x) * 4 * THREADBLOCK_SIZE
+//Due to scanExclusiveShared<<<>>>() 1D block addressing
+const uint MAX_BATCH_ELEMENTS = 64 * 1048576;
+const uint MIN_SHORT_ARRAY_SIZE = 4;
+const uint MAX_SHORT_ARRAY_SIZE = 4 * THREADBLOCK_SIZE;
+const uint MIN_LARGE_ARRAY_SIZE = 8 * THREADBLOCK_SIZE;
+const uint MAX_LARGE_ARRAY_SIZE = 4 * THREADBLOCK_SIZE * THREADBLOCK_SIZE;
+
+//Internal exclusive scan buffer
+static uint *d_Buf;
+
+void initScan(void)
+{
+    checkCudaErrors(cudaMalloc((void **)&d_Buf, (MAX_BATCH_ELEMENTS / (4 * THREADBLOCK_SIZE)) * sizeof(uint)));
+}
+
+void closeScan(void)
+{
+    checkCudaErrors(cudaFree(d_Buf));
+}
+
+static uint factorRadix2(uint &log2L, uint L)
+{
+    if (!L)
+    {
+        log2L = 0;
+        return 0;
+    }
+    else
+    {
+        for (log2L = 0; (L & 1) == 0; L >>= 1, log2L++)
+            ;
+        return L;
+    }
+}
+
+static uint iDivUp(uint dividend, uint divisor)
+{
+    return ((dividend % divisor) == 0) ? (dividend / divisor) : (dividend / divisor + 1);
+}
+
+size_t scanExclusiveShort(
+    uint *d_Dst,
+    uint *d_Src,
+    uint batchSize,
+    uint arrayLength)
+{
+    //Check power-of-two factorization
+    uint log2L;
+    uint factorizationRemainder = factorRadix2(log2L, arrayLength);
+    assert(factorizationRemainder == 1);
+
+    //Check supported size range
+    assert((arrayLength >= MIN_SHORT_ARRAY_SIZE) && (arrayLength <= MAX_SHORT_ARRAY_SIZE));
+
+    //Check total batch size limit
+    assert((batchSize * arrayLength) <= MAX_BATCH_ELEMENTS);
+
+    //Check all threadblocks to be fully packed with data
+    assert((batchSize * arrayLength) % (4 * THREADBLOCK_SIZE) == 0);
+
+    scanExclusiveShared<<<(batchSize * arrayLength) / (4 * THREADBLOCK_SIZE), THREADBLOCK_SIZE>>>(
+        (uint4 *)d_Dst,
+        (uint4 *)d_Src,
+        arrayLength);
+    getLastCudaError("scanExclusiveShared() execution FAILED\n");
+
+    return THREADBLOCK_SIZE;
+}
+
+size_t scanExclusiveLarge(
+    uint *d_Dst,
+    uint *d_Src,
+    uint batchSize,
+    uint arrayLength)
+{
+    //Check power-of-two factorization
+    uint log2L;
+    uint factorizationRemainder = factorRadix2(log2L, arrayLength);
+    assert(factorizationRemainder == 1);
+
+    //Check supported size range
+    assert((arrayLength >= MIN_LARGE_ARRAY_SIZE) && (arrayLength <= MAX_LARGE_ARRAY_SIZE));
+
+    //Check total batch size limit
+    assert((batchSize * arrayLength) <= MAX_BATCH_ELEMENTS);
+
+    scanExclusiveShared<<<(batchSize * arrayLength) / (4 * THREADBLOCK_SIZE), THREADBLOCK_SIZE>>>(
+        (uint4 *)d_Dst,
+        (uint4 *)d_Src,
+        4 * THREADBLOCK_SIZE);
+    getLastCudaError("scanExclusiveShared() execution FAILED\n");
+
+    //Not all threadblocks need to be packed with input data:
+    //inactive threads of highest threadblock just don't do global reads and writes
+    const uint blockCount2 = iDivUp((batchSize * arrayLength) / (4 * THREADBLOCK_SIZE), THREADBLOCK_SIZE);
+    scanExclusiveShared2<<<blockCount2, THREADBLOCK_SIZE>>>(
+        (uint *)d_Buf,
+        (uint *)d_Dst,
+        (uint *)d_Src,
+        (batchSize * arrayLength) / (4 * THREADBLOCK_SIZE),
+        arrayLength / (4 * THREADBLOCK_SIZE));
+    getLastCudaError("scanExclusiveShared2() execution FAILED\n");
+
+    uniformUpdate<<<(batchSize * arrayLength) / (4 * THREADBLOCK_SIZE), THREADBLOCK_SIZE>>>(
+        (uint4 *)d_Dst,
+        (uint *)d_Buf);
+    getLastCudaError("uniformUpdate() execution FAILED\n");
+
+    return THREADBLOCK_SIZE;
+}
+
+__global__ void scanInclusiveShared2(
+    uint *d_Buf,
+    uint *d_Dst,
+    uint N,
+    uint arrayLength)
+{
+    __shared__ uint s_Data[2 * THREADBLOCK_SIZE];
+
+    //Skip loads and stores for inactive threads of last threadblock (pos >= N)
+    uint pos = blockIdx.x * blockDim.x + threadIdx.x;
+
+    //Load top elements
+    //Convert results of bottom-level scan back to inclusive
+    uint idata = 0;
+    if (pos < N)
+        idata = d_Dst[(4 * THREADBLOCK_SIZE) - 1 + (4 * THREADBLOCK_SIZE) * pos];
+
+    //Compute
+    uint odata = scan1Exclusive(idata, s_Data, arrayLength);
+
+    //Avoid out-of-bound access
+    if (pos < N)
+        d_Buf[pos] = odata;
+}
+
+__global__ void scanInclusiveShared(
+    uint4 *d_Dst,
+    uint4 *d_Src,
+    uint size)
+{
+    __shared__ uint s_Data[2 * THREADBLOCK_SIZE];
+
+    uint pos = blockIdx.x * blockDim.x + threadIdx.x;
+
+    //  if(pos<warpSize*(size/warpSize+1))
+    {
+        //Load data
+        uint4 idata4 = d_Src[pos];
+
+        //Calculate exclusive scan
+        uint4 odata4 = scan4Inclusive(idata4, s_Data, size);
+
+        //Write back
+        d_Dst[pos] = odata4;
+    }
+}
+
+size_t scanInclusiveShort(
+    uint *d_Dst,
+    uint *d_Src,
+    uint batchSize,
+    uint arrayLength)
+{
+    //Check power-of-two factorization
+    uint log2L;
+    uint factorizationRemainder = factorRadix2(log2L, arrayLength);
+    assert(factorizationRemainder == 1);
+
+    //Check supported size range
+    //  assert( (arrayLength >= MIN_SHORT_ARRAY_SIZE) && (arrayLength <= MAX_SHORT_ARRAY_SIZE) );
+
+    //Check total batch size limit
+    // assert( (batchSize * arrayLength) <= MAX_BATCH_ELEMENTS );
+
+    //Check all threadblocks to be fully packed with data
+    //assert( (batchSize * arrayLength) % (4 * THREADBLOCK_SIZE) == 0 );
+    int blocks = (batchSize * arrayLength + 4 * THREADBLOCK_SIZE - 1) / (4 * THREADBLOCK_SIZE);
+    scanInclusiveShared<<<blocks, THREADBLOCK_SIZE>>>(
+        (uint4 *)d_Dst,
+        (uint4 *)d_Src,
+        arrayLength);
+    getLastCudaError("scanExclusiveShared() execution FAILED\n");
+
+    return THREADBLOCK_SIZE;
+}
+
+size_t scanInclusiveLarge(
+    uint *d_Dst,
+    uint *d_Src,
+    uint batchSize,
+    uint arrayLength)
+{
+    //Check power-of-two factorization
+    uint log2L;
+    uint factorizationRemainder = factorRadix2(log2L, arrayLength);
+    assert(factorizationRemainder == 1);
+
+    //Check supported size range
+    //assert( (arrayLength >= MIN_LARGE_ARRAY_SIZE) && (arrayLength <= MAX_LARGE_ARRAY_SIZE) );
+
+    //Check total batch size limit
+    //assert( (batchSize * arrayLength) <= MAX_BATCH_ELEMENTS );
+
+    scanInclusiveShared<<<(batchSize * arrayLength) / (4 * THREADBLOCK_SIZE), THREADBLOCK_SIZE>>>(
+        (uint4 *)d_Dst,
+        (uint4 *)d_Src,
+        4 * THREADBLOCK_SIZE);
+    getLastCudaError("scanExclusiveShared() execution FAILED\n");
+
+    //Not all threadblocks need to be packed with input data:
+    //inactive threads of highest threadblock just don't do global reads and writes
+    const uint blockCount2 = iDivUp((batchSize * arrayLength) / (4 * THREADBLOCK_SIZE), THREADBLOCK_SIZE);
+    scanInclusiveShared2<<<blockCount2, THREADBLOCK_SIZE>>>(
+        (uint *)d_Buf,
+        (uint *)d_Dst,
+        (batchSize * arrayLength) / (4 * THREADBLOCK_SIZE),
+        arrayLength / (4 * THREADBLOCK_SIZE));
+    getLastCudaError("scanExclusiveShared2() execution FAILED\n");
+
+    uniformUpdate<<<(batchSize * arrayLength) / (4 * THREADBLOCK_SIZE), THREADBLOCK_SIZE>>>(
+        (uint4 *)d_Dst,
+        (uint *)d_Buf);
+    getLastCudaError("uniformUpdate() execution FAILED\n");
+
+    return THREADBLOCK_SIZE;
+}
+
+
+#include <thrust/scan.h>
+#include <common/inc/helper_cuda.h>
+#include <common/inc/helper_timer.h>
+#include <6_Advanced/scan/scan_common.h>
+
+extern __shared__ uint sMemory[];
+
+__device__ inline double atomicMax(double *address, double val)
+{
+    unsigned long long int *address_as_ull = (unsigned long long int *)address;
+    unsigned long long int assumed;
+    unsigned long long int old = *address_as_ull;
+
+    assumed = old;
+    old = atomicCAS(address_as_ull,
+                    assumed,
+                    __double_as_longlong(max(val, __longlong_as_double(assumed))));
+
+    while (assumed != old)
+    {
+        assumed = old;
+        old = atomicCAS(address_as_ull,
+                        assumed,
+                        __double_as_longlong(max(val, __longlong_as_double(assumed))));
+    }
+    return __longlong_as_double(old);
+}
+
+__device__ inline double atomicMin(double *address, double val)
+{
+    unsigned long long int *address_as_ull = (unsigned long long int *)address;
+    unsigned long long int old = *address_as_ull, assumed;
+
+    assumed = old;
+    old = atomicCAS(address_as_ull,
+                    assumed,
+                    __double_as_longlong(min(val, __longlong_as_double(assumed))));
+    while (assumed != old)
+    {
+        assumed = old;
+        old = atomicCAS(address_as_ull,
+                        assumed,
+                        __double_as_longlong(min(val, __longlong_as_double(assumed))));
+    }
+    return __longlong_as_double(old);
+}
+
+template <typename Type>
+__device__ inline void Comparator(
+
+    Type &valA,
+    Type &valB,
+    uint dir)
+{
+    Type t;
+    if ((valA > valB) == dir)
+    {
+        t = valA;
+        valA = valB;
+        valB = t;
+    }
+}
+
+static __device__ __forceinline__ unsigned int __qsflo(unsigned int word)
+{
+    unsigned int ret;
+    asm volatile("bfind.u32 %0, %1;"
+                 : "=r"(ret)
+                 : "r"(word));
+    return ret;
+}
+
+template <typename Type>
+__global__ void globalBitonicSort(Type *indata, Type *outdata, Block<Type> *bucket, bool inputSelect)
+{
+    __shared__ uint shared[1024];
+
+    Type *data;
+
+    Block<Type> cord = bucket[blockIdx.x];
+
+    uint size = cord.end - cord.begin;
+    bool select = !(cord.select);
+
+    if (cord.end - cord.begin > 1024 || cord.end - cord.begin == 0)
+        return;
+
+    unsigned int bitonicSize = 1 << (__qsflo(size - 1U) + 1);
+
+    if (select)
+        data = indata;
+    else
+        data = outdata;
+
+    //__syncthreads();
+
+    for (int i = threadIdx.x; i < size; i += blockDim.x)
+        shared[i] = data[i + cord.begin];
+
+    for (int i = threadIdx.x + size; i < bitonicSize; i += blockDim.x)
+        shared[i] = 0xffffffff;
+
+    __syncthreads();
+
+    for (uint size = 2; size < bitonicSize; size <<= 1)
+    {
+        //Bitonic merge
+        uint ddd = 1 ^ ((threadIdx.x & (size / 2)) != 0);
+        for (uint stride = size / 2; stride > 0; stride >>= 1)
+        {
+            __syncthreads();
+            uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));
+            //if(pos <bitonicSize){
+            Comparator(
+                shared[pos + 0],
+                shared[pos + stride],
+                ddd);
+            // }
+        }
+    }
+
+    //ddd == dir for the last bitonic merge step
+
+    for (uint stride = bitonicSize / 2; stride > 0; stride >>= 1)
+    {
+        __syncthreads();
+        uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));
+        // if(pos <bitonicSize){
+        Comparator(
+            shared[pos + 0],
+            shared[pos + stride],
+            1);
+        // }
+    }
+
+    __syncthreads();
+
+    // Write back the sorted data to its correct position
+    for (int i = threadIdx.x; i < size; i += blockDim.x)
+        indata[i + cord.begin] = shared[i];
+}
+
+template <typename Type>
+__global__ void quick(Type *indata, Type *buffer, Partition<Type> *partition, Block<Type> *bucket)
+{
+
+    __shared__ Type sh_out[1024];
+
+    __shared__ uint start1, end1;
+    __shared__ uint left, right;
+
+    int tix = threadIdx.x;
+
+    uint start = partition[blockIdx.x].from;
+    uint end = partition[blockIdx.x].end;
+    Type pivot = partition[blockIdx.x].pivot;
+    uint nseq = partition[blockIdx.x].ibucket;
+
+    uint lo = 0;
+    uint hi = 0;
+
+    Type lmin = 0xffffffff;
+    Type rmax = 0;
+
+    Type d;
+
+    // start read on 1° tile and store the coordinates of the items that must
+    // be moved on the left or on the right of the pivot
+
+    if (tix + start < end)
+    {
+        d = indata[tix + start];
+
+        //count items smaller or bigger than the pivot
+        // if d<pivot then ll++ else ll
+        lo = (d < pivot) * (lo + 1) + (d >= pivot) * lo;
+        // if d>pivot then lr++ else lr
+        hi = (d <= pivot) * (hi) + (d > pivot) * (hi + 1);
+
+        lmin = d;
+        rmax = d;
+    }
+
+    //read and store the coordinates on next tiles for each block
+    for (uint i = tix + start + blockDim.x; i < end; i += blockDim.x)
+    {
+        Type d = indata[i];
+
+        //count items smaller or bigger than the pivot
+        lo = (d < pivot) * (lo + 1) + (d >= pivot) * lo;
+        hi = (d <= pivot) * (hi) + (d > pivot) * (hi + 1);
+
+        //compute max and min of tile items
+        lmin = min(lmin, d);
+        rmax = max(rmax, d);
+    }
+
+    //compute max and min of every partition
+
+    compareInclusive(rmax, lmin, (Type *)sh_out, blockDim.x);
+
+    __syncthreads();
+
+    if (tix == blockDim.x - 1)
+    {
+        //compute absolute max and min for the bucket
+        atomicMax(&bucket[nseq].maxPiv, rmax);
+        atomicMin(&bucket[nseq].minPiv, lmin);
+    }
+
+    __syncthreads();
+
+    /*
+	 * calculate the coordinates of its assigned item to each thread,
+	 * which are necessary to known in which subsequences the item must be copied
+	 *
+	 */
+    scan1Inclusive2(lo, hi, (uint *)sh_out, blockDim.x);
+    lo = lo - 1;
+    hi = SHARED_LIMIT - hi;
+
+    if (tix == blockDim.x - 1)
+    {
+        left = lo + 1;
+        right = SHARED_LIMIT - hi;
+
+        start1 = atomicAdd(&bucket[nseq].nextbegin, left);
+        end1 = atomicSub(&bucket[nseq].nextend, right);
+    }
+
+    __syncthreads();
+
+    //thread blocks write on the shared memory the items smaller and bigger than the first tile's pivot
+    if (tix + start < end)
+    {
+        //items smaller than pivot
+        if (d < pivot)
+        {
+            sh_out[lo] = d;
+            lo--;
+        }
+
+        //items bigger than pivot
+        if (d > pivot)
+        {
+            sh_out[hi] = d;
+            hi++;
+        }
+    }
+
+    //thread blocks write on the shared memory the items smaller and bigger than next tiles' pivot
+    for (uint i = start + tix + blockDim.x; i < end; i += blockDim.x)
+    {
+
+        Type d = indata[i];
+        //items smaller than the pivot
+        if (d < pivot)
+        {
+            sh_out[lo] = d;
+            lo--;
+        }
+
+        //items bigger than the pivot
+        if (d > pivot)
+        {
+            sh_out[hi] = d;
+            hi++;
+        }
+    }
+
+    __syncthreads();
+
+    //items smaller and bigger than the pivot already sorted in the shared memory are coalesced written on the global memory
+    //partial results of each thread block stored on the shared memory are merged together in two subsequences within the global memory
+    //coalesced writing of next tiles on the global memory
+    for (uint i = tix; i < SHARED_LIMIT; i += blockDim.x)
+    {
+        if (i < left)
+            buffer[start1 + i] = sh_out[i];
+
+        if (i >= SHARED_LIMIT - right)
+            buffer[end1 + i - SHARED_LIMIT] = sh_out[i];
+    }
+}
+
+//this function assigns the attributes to each partition of each bucket
+//a thread block is assigned to a specific partition
+template <typename Type>
+__global__ void partitionAssign(struct Block<Type> *bucket, uint *npartitions, struct Partition<Type> *partition)
+{
+    int tx = threadIdx.x;
+    int bx = blockIdx.x;
+
+    uint beg = bucket[bx].nextbegin;
+    uint end = bucket[bx].nextend;
+    Type pivot = bucket[bx].pivot;
+
+    uint from;
+    uint to;
+
+    if (bx > 0)
+    {
+        from = npartitions[bx - 1];
+        to = npartitions[bx];
+    }
+    else
+    {
+        from = 0;
+        to = npartitions[bx];
+    }
+
+    uint i = tx + from;
+
+    if (i < to)
+    {
+        uint begin = beg + SHARED_LIMIT * tx;
+        partition[i].from = begin;
+        partition[i].end = begin + SHARED_LIMIT;
+        partition[i].pivot = pivot;
+        partition[i].ibucket = bx;
+    }
+
+    for (uint i = tx + from + blockDim.x; i < to; i += blockDim.x)
+    {
+        uint begin = beg + SHARED_LIMIT * (i - from);
+        partition[i].from = begin;
+        partition[i].end = begin + SHARED_LIMIT;
+        partition[i].pivot = pivot;
+        partition[i].ibucket = bx;
+    }
+    __syncthreads();
+    if (tx == 0 && to - from > 0)
+        partition[to - 1].end = end;
+}
+
+//this function enters the pivot value in the central bucket's items
+template <typename Type>
+__global__ void insertPivot(Type *data, struct Block<Type> *bucket, int nbucket)
+{
+
+    Type pivot = bucket[blockIdx.x].pivot;
+    uint start = bucket[blockIdx.x].nextbegin;
+    uint end = bucket[blockIdx.x].nextend;
+    bool is_altered = bucket[blockIdx.x].done;
+
+    if (is_altered && blockIdx.x < nbucket)
+        for (uint j = start + threadIdx.x; j < end; j += blockDim.x)
+            data[j] = pivot;
+}
+
+//this function assigns the new attributes of each bucket
+template <typename Type>
+__global__ void bucketAssign(Block<Type> *bucket, uint *npartitions, int nbucket, int select)
+{
+
+    uint i = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (i < nbucket)
+    {
+        bool is_altered = bucket[i].done;
+        if (is_altered)
+        {
+            //read on i node
+            uint orgbeg = bucket[i].begin;
+            uint from = bucket[i].nextbegin;
+            uint orgend = bucket[i].end;
+            uint end = bucket[i].nextend;
+            Type pivot = bucket[i].pivot;
+            Type minPiv = bucket[i].minPiv;
+            Type maxPiv = bucket[i].maxPiv;
+
+            //compare each bucket's max and min to the pivot
+            Type lmaxpiv = min(pivot, maxPiv);
+            Type rminpiv = max(pivot, minPiv);
+
+            //write on i+nbucket node
+            bucket[i + nbucket].begin = orgbeg;
+            bucket[i + nbucket].nextbegin = orgbeg;
+            bucket[i + nbucket].nextend = from;
+            bucket[i + nbucket].end = from;
+            bucket[i + nbucket].pivot = (minPiv + lmaxpiv) / 2;
+
+            //if(select)
+            //	bucket[i+nbucket].done   = (from-orgbeg)>1024;// && (minPiv!=maxPiv);
+            //else
+            bucket[i + nbucket].done = (from - orgbeg) > 1024 && (minPiv != maxPiv);
+            bucket[i + nbucket].select = select;
+            bucket[i + nbucket].minPiv = ( Type ) 0xffffffffffffffff;
+            bucket[i + nbucket].maxPiv = 0;
+            //bucket[i+nbucket].finish=false;
+
+            //calculate the number of partitions (npartitions) necessary to the i+nbucket bucket
+            if (!bucket[i + nbucket].done)
+                npartitions[i + nbucket] = 0;
+            else
+                npartitions[i + nbucket] = (from - orgbeg + SHARED_LIMIT - 1) / SHARED_LIMIT;
+
+            //write on i node
+            bucket[i].begin = end;
+            bucket[i].nextbegin = end;
+            bucket[i].nextend = orgend;
+            bucket[i].pivot = (rminpiv + maxPiv) / 2 + 1;
+
+            //if(select)
+            //bucket[i].done   = (orgend-end)>1024;// && (minPiv!=maxPiv);
+            //	else
+            bucket[i].done = (orgend - end) > 1024 && (minPiv != maxPiv);
+            bucket[i].select = select;
+            bucket[i].minPiv = ( Type ) 0xffffffffffffffff;
+            bucket[i].maxPiv = 0;
+            //bucket[i].finish=false;
+
+            //calculate the number of partitions (npartitions) necessary to the i-bucket bucket
+            if (!bucket[i].done)
+                npartitions[i] = 0;
+            else
+                npartitions[i] = (orgend - end + SHARED_LIMIT - 1) / SHARED_LIMIT;
+        }
+    }
+}
+
+template <typename Type>
+__global__ void init(Type *data, Block<Type> *bucket, uint *npartitions, int size, int nblocks)
+{
+    uint i = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (i < nblocks)
+    {
+        bucket[i].nextbegin = 0;
+        bucket[i].begin = 0;
+
+        bucket[i].nextend = 0 + size * (i == 0);
+        bucket[i].end = 0 + size * (i == 0);
+        npartitions[i] = 0;
+        bucket[i].done = false + i == 0;
+        bucket[i].select = false;
+        bucket[i].maxPiv = 0x0;
+        bucket[i].minPiv = ( Type ) 0xffffffffffffffff;
+        //bucket[i].pivot  = 0+ (i==0)*((min(min(data[0],data[size/2]),data[size-1]) + max(max(data[0],data[size/2]),data[size-1]))/2);
+        bucket[i].pivot = data[size / 2];
+    }
+}
+
+template <typename Type>
+void sort(Type *ddata, Type *outputData, uint size, uint threadCount, int device, double *wallClock)
+{
+
+    cudaSetDevice(device);
+
+    cudaGetLastError();
+    //cudaDeviceReset();
+
+    cudaDeviceProp deviceProp;
+    cudaGetDeviceProperties(&deviceProp, device);
+
+    StopWatchInterface *htimer = NULL;
+    Type *dbuffer;
+
+    Block<Type> *dbucket;
+    struct Partition<Type> *partition;
+    uint *npartitions1, *npartitions2;
+
+    uint *cudaBlocks = (uint *)malloc(4);
+
+    uint blocks = (size + SHARED_LIMIT - 1) / SHARED_LIMIT;
+    uint nblock = 10 * blocks;
+    int partition_max = 262144;
+
+    //unsigned long long int total = partition_max * sizeof(Block<Type>) + nblock * sizeof(Partition<Type>) + 2 * partition_max * sizeof(uint) + 3 * (size) * sizeof(Type);
+
+    //Allocating and initializing CUDA arrays
+    sdkCreateTimer(&htimer);
+    checkCudaErrors(cudaMalloc((void **)&dbucket, partition_max * sizeof(Block<Type>)));
+    checkCudaErrors(cudaMalloc((void **)&partition, nblock * sizeof(Partition<Type>))); //nblock
+
+    checkCudaErrors(cudaMalloc((void **)&npartitions1, partition_max * sizeof(uint)));
+    checkCudaErrors(cudaMalloc((void **)&npartitions2, partition_max * sizeof(uint)));
+
+    checkCudaErrors(cudaMalloc((void **)&dbuffer, (size) * sizeof(Type)));
+
+    initScan();
+
+    //setting GPU Cache
+    cudaFuncSetCacheConfig(init<Type>, cudaFuncCachePreferL1);
+    cudaFuncSetCacheConfig(insertPivot<Type>, cudaFuncCachePreferL1);
+    cudaFuncSetCacheConfig(bucketAssign<Type>, cudaFuncCachePreferL1);
+    cudaFuncSetCacheConfig(partitionAssign<Type>, cudaFuncCachePreferL1);
+    cudaFuncSetCacheConfig(quick<Type>, cudaFuncCachePreferShared);
+    cudaFuncSetCacheConfig(globalBitonicSort<Type>, cudaFuncCachePreferShared);
+
+    checkCudaErrors(cudaDeviceSynchronize());
+    sdkResetTimer(&htimer);
+    sdkStartTimer(&htimer);
+
+    //initializing bucket array: initial attributes for each bucket
+    init<Type><<<(nblock + 255) / 256, 256>>>(ddata, dbucket, npartitions1, size, partition_max);
+
+    uint nbucket = 1;
+    uint numIterations = 0;
+    bool inputSelect = true;
+
+    *cudaBlocks = blocks;
+    checkCudaErrors(cudaDeviceSynchronize());
+    getLastCudaError("init() execution FAILED\n");
+    checkCudaErrors(cudaMemcpy(&npartitions2[0], cudaBlocks, sizeof(uint), cudaMemcpyHostToDevice));
+
+    // beginning of the first phase
+    // this phase goes on until the size of the buckets is comparable to the SHARED_LIMIT size
+    while (1)
+    {
+
+        /*
+		 *       	---------------------    Pre-processing: Partitioning    ---------------------
+		 *
+		 * buckets are further divided in partitions based on their size
+		 * the number of partitions needed for each subsequence is determined by the number of elements which can be
+		 * processed by each thread block.
+		 *
+		 * the number of partitions (npartitions) for each block will depend on the shared memory size (SHARED_LIMIT)
+		 *
+		 */
+
+        if (numIterations > 0)
+        { //1024 is the shared memory limit of scanInclusiveShort()
+            if (nbucket <= 1024)
+                scanInclusiveShort(npartitions2, npartitions1, 1, nbucket);
+            else
+                scanInclusiveLarge(npartitions2, npartitions1, 1, nbucket);
+
+            checkCudaErrors(cudaMemcpy(cudaBlocks, &npartitions2[nbucket - 1], sizeof(uint), cudaMemcpyDeviceToHost));
+        }
+
+        if (*cudaBlocks == 0)
+            break;
+
+        /*
+		 *  ---------------------     step 1    ---------------------
+		 *
+		 * 	A thread block is assigned to each different partition
+		 * 	each partition is assigned coordinates, pivot and ....
+		 */
+
+        partitionAssign<Type><<<nbucket, 1024>>>(dbucket, npartitions2, partition);
+        cudaDeviceSynchronize();
+        getLastCudaError("partitionAssign() execution FAILED\n");
+
+        /*
+		 	 ---------------------    step 2a    ---------------------
+
+		 	 in this function each thread block creates two subsequences
+		 	 to divide the items in the partition whose value is lower than
+			 the pivot value, from the items whose value is higher than the pivot value
+		 */
+
+        if (inputSelect)
+            quick<Type><<<*cudaBlocks, threadCount>>>(ddata, dbuffer, partition, dbucket);
+        else
+            quick<Type><<<*cudaBlocks, threadCount>>>(dbuffer, ddata, partition, dbucket);
+        cudaDeviceSynchronize();
+        getLastCudaError("quick() execution FAILED\n");
+
+        //step 2b: this function enters the pivot value in the central bucket's items
+        insertPivot<Type><<<nbucket, 512>>>(ddata, dbucket, nbucket);
+
+        //step 3: parameters are assigned, linked to the two new buckets created in step 2
+        bucketAssign<Type><<<(nbucket + 255) / 256, 256>>>(dbucket, npartitions1, nbucket, inputSelect);
+        cudaDeviceSynchronize();
+        getLastCudaError("insertPivot() or bucketAssign() execution FAILED\n");
+
+        nbucket *= 2;
+
+        inputSelect = !inputSelect;
+        numIterations++;
+        if (nbucket > deviceProp.maxGridSize[0])
+            break;
+        //if(numIterations==18) break;
+    }
+
+    /*
+	 * start second phase:
+	 * now the size of the buckets is such that they can be entirely processed by a thread block
+	 *
+	 */
+
+    if (nbucket > deviceProp.maxGridSize[0])
+        fprintf(stderr, "ERROR: CUDA-Quicksort can't terminate sorting as the block threads needed to finish it are more than the Maximum x-dimension of FERMI GPU thread blocks. Please use Kepler GPUs as the Maximum x-dimension of their thread blocks is much higher\n");
+    else
+        globalBitonicSort<Type><<<nbucket, 512, 0>>>(ddata, dbuffer, dbucket, inputSelect);
+
+    checkCudaErrors(cudaDeviceSynchronize());
+    getLastCudaError("globalBitonicSort() execution FAILED\n");
+
+    sdkStopTimer(&htimer);
+    *wallClock = sdkGetTimerValue(&htimer);
+
+    // release resources
+    checkCudaErrors(cudaFree(dbuffer));
+    checkCudaErrors(cudaFree(dbucket));
+    checkCudaErrors(cudaFree(npartitions2));
+    checkCudaErrors(cudaFree(npartitions1));
+    free(cudaBlocks);
+
+    closeScan();
+    return;
+}
+
+void CUDA_Quicksort(uint* inputData, uint* outputData, uint dataSize, uint threadCount, int Device, double* wallClock)
+{
+
+	cudaDeviceProp deviceProp;
+	cudaGetDeviceProperties(&deviceProp, Device);
+
+	if(deviceProp.major<2)
+	{
+		fprintf(stderr, "Error: the GPU device %d has a Compute Capability of %d.%d, while a Compute Capability of 2.x is required to run the code\n",
+				Device, deviceProp.major, deviceProp.minor);
+
+		int deviceCount;
+		cudaGetDeviceCount(&deviceCount);
+
+		fprintf(stderr, "       the Host system has the following GPU devices:\n");
+
+		for (int device = 0; device < deviceCount; device++) {
+
+				fprintf(stderr, "\t  the GPU device %d is a %s, with Compute Capability %d.%d\n",
+						device, deviceProp.name, deviceProp.major, deviceProp.minor);
+		}
+
+		return;
+	}
+
+	sort<uint>(inputData,outputData, dataSize,threadCount,Device, wallClock);
+}
+
+void CUDA_Quicksort_64(double* inputData,double* outputData, uint dataSize, uint threadCount, int Device, double* wallClock)
+{
+
+	cudaDeviceProp deviceProp;
+	cudaGetDeviceProperties(&deviceProp, Device);
+
+	if(deviceProp.major<2)
+	{
+		fprintf(stderr, "Error: the GPU device %d has a Compute Capability of %d.%d, while a Compute Capability of 2.x is required to run the code\n",
+				Device, deviceProp.major, deviceProp.minor);
+
+		int deviceCount;
+		cudaGetDeviceCount(&deviceCount);
+
+		fprintf(stderr, "       the Host system has the following GPU devices:\n");
+
+		for (int device = 0; device < deviceCount; device++) {
+
+				fprintf(stderr, "\t  the GPU device %d is a %s, with Compute Capability %d.%d\n",
+						device, deviceProp.name, deviceProp.major, deviceProp.minor);
+		}
+
+		return;
+	}
+
+	sort<double>(inputData,outputData, dataSize,threadCount,Device,wallClock);
+
+}
+
+struct MancaQuicksort
+{
+   static void sort( TNL::Containers::ArrayView< int, TNL::Devices::Cuda >& array )
+   {
+      double timer;
+      CUDA_Quicksort( ( unsigned * ) array.getData(),  (unsigned * ) array.getData(), array.getSize(), 256, 0, &timer );
+      //return;
+   }
+};
diff --git a/src/Benchmarks/Sorting/ReferenceAlgorithms/NvidiaBitonicSort.h b/src/Benchmarks/Sorting/ReferenceAlgorithms/NvidiaBitonicSort.h
new file mode 100644
index 0000000000000000000000000000000000000000..8345712879881c1dd3860f1b03f74b60ff647691
--- /dev/null
+++ b/src/Benchmarks/Sorting/ReferenceAlgorithms/NvidiaBitonicSort.h
@@ -0,0 +1,24 @@
+
+#ifdef HAVE_CUDA_SAMPLES
+#include <6_Advanced/sortingNetworks/bitonicSort.cu>
+#endif
+#include <TNL/Containers/Array.h>
+
+namespace TNL {
+
+struct NvidiaBitonicSort
+{
+   static void sort( Containers::ArrayView< int, Devices::Cuda >& view )
+   {
+#ifdef HAVE_CUDA_SAMPLES
+      Containers::Array<int, Devices::Cuda> arr;
+      arr = view;
+      bitonicSort((unsigned *)view.getData(), (unsigned *)arr.getData(),
+                  (unsigned *)view.getData(), (unsigned *)arr.getData(),
+                  1, arr.getSize(), 1);
+      cudaDeviceSynchronize();
+#endif
+   }
+};
+
+} // namespace TNL
diff --git a/src/Benchmarks/Sorting/ReferenceAlgorithms/ThrustRadixsort.h b/src/Benchmarks/Sorting/ReferenceAlgorithms/ThrustRadixsort.h
new file mode 100644
index 0000000000000000000000000000000000000000..02f03a023f55d97c4570c52d9bae7c238146ef59
--- /dev/null
+++ b/src/Benchmarks/Sorting/ReferenceAlgorithms/ThrustRadixsort.h
@@ -0,0 +1,16 @@
+#include <thrust/sort.h>
+#include <thrust/execution_policy.h>
+#include <TNL/Containers/Array.h>
+
+namespace TNL {
+
+struct ThrustRadixsort
+{
+   static void sort( Containers::ArrayView< int, Devices::Cuda >& view )
+   {
+      thrust::sort(thrust::device, view.getData(), view.getData() + view.getSize());
+      cudaDeviceSynchronize();
+   }
+};
+
+} // namespace TNL
diff --git a/src/Benchmarks/Sorting/ReferenceAlgorithms/helper_timer.h b/src/Benchmarks/Sorting/ReferenceAlgorithms/helper_timer.h
new file mode 100644
index 0000000000000000000000000000000000000000..c1e411650a72d470dc8b18565b4f229cf425ae97
--- /dev/null
+++ b/src/Benchmarks/Sorting/ReferenceAlgorithms/helper_timer.h
@@ -0,0 +1,495 @@
+/**
+ * Copyright 1993-2012 NVIDIA Corporation.  All rights reserved.
+ *
+ * Please refer to the NVIDIA end user license agreement (EULA) associated
+ * with this source code for terms and conditions that govern your use of
+ * this software. Any use, reproduction, disclosure, or distribution of
+ * this software and related documentation outside the terms of the EULA
+ * is strictly prohibited.
+ *
+ */
+
+// Helper Timing Functions
+#ifndef HELPER_TIMER_H
+#define HELPER_TIMER_H
+
+// includes, system
+#include <vector>
+
+// includes, project
+#include "helpers/exception.h"
+
+// Definition of the StopWatch Interface, this is used if we don't want to use the CUT functions
+// But rather in a self contained class interface
+class StopWatchInterface
+{
+    public:
+        StopWatchInterface() {};
+        virtual ~StopWatchInterface() {};
+
+    public:
+        //! Start time measurement
+        virtual void start() = 0;
+
+        //! Stop time measurement
+        virtual void stop() = 0;
+
+        //! Reset time counters to zero
+        virtual void reset() = 0;
+
+        //! Time in msec. after start. If the stop watch is still running (i.e. there
+        //! was no call to stop()) then the elapsed time is returned, otherwise the
+        //! time between the last start() and stop call is returned
+        virtual float getTime() = 0;
+
+        //! Mean time to date based on the number of times the stopwatch has been
+        //! _stopped_ (ie finished sessions) and the current total time
+        virtual float getAverageTime() = 0;
+};
+
+
+//////////////////////////////////////////////////////////////////
+// Begin Stopwatch timer class definitions for all OS platforms //
+//////////////////////////////////////////////////////////////////
+#ifdef _WIN32
+// includes, system
+#define WINDOWS_LEAN_AND_MEAN
+#include <windows.h>
+#undef min
+#undef max
+
+//! Windows specific implementation of StopWatch
+class StopWatchWin : public StopWatchInterface
+{
+    public:
+        //! Constructor, default
+        StopWatchWin() :
+            start_time(),     end_time(),
+            diff_time(0.0f),  total_time(0.0f),
+            running(false), clock_sessions(0), freq(0), freq_set(false)
+        {
+            if (! freq_set)
+            {
+                // helper variable
+                LARGE_INTEGER temp;
+
+                // get the tick frequency from the OS
+                QueryPerformanceFrequency((LARGE_INTEGER *) &temp);
+
+                // convert to type in which it is needed
+                freq = ((double) temp.QuadPart) / 1000.0;
+
+                // rememeber query
+                freq_set = true;
+            }
+        };
+
+        // Destructor
+        ~StopWatchWin() { };
+
+    public:
+        //! Start time measurement
+        inline void start();
+
+        //! Stop time measurement
+        inline void stop();
+
+        //! Reset time counters to zero
+        inline void reset();
+
+        //! Time in msec. after start. If the stop watch is still running (i.e. there
+        //! was no call to stop()) then the elapsed time is returned, otherwise the
+        //! time between the last start() and stop call is returned
+        inline float getTime();
+
+        //! Mean time to date based on the number of times the stopwatch has been
+        //! _stopped_ (ie finished sessions) and the current total time
+        inline float getAverageTime();
+
+    private:
+        // member variables
+
+        //! Start of measurement
+        LARGE_INTEGER  start_time;
+        //! End of measurement
+        LARGE_INTEGER  end_time;
+
+        //! Time difference between the last start and stop
+        float  diff_time;
+
+        //! TOTAL time difference between starts and stops
+        float  total_time;
+
+        //! flag if the stop watch is running
+        bool running;
+
+        //! Number of times clock has been started
+        //! and stopped to allow averaging
+        int clock_sessions;
+
+        //! tick frequency
+        double  freq;
+
+        //! flag if the frequency has been set
+        bool  freq_set;
+};
+
+// functions, inlined
+
+////////////////////////////////////////////////////////////////////////////////
+//! Start time measurement
+////////////////////////////////////////////////////////////////////////////////
+inline void
+StopWatchWin::start()
+{
+    QueryPerformanceCounter((LARGE_INTEGER *) &start_time);
+    running = true;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Stop time measurement and increment add to the current diff_time summation
+//! variable. Also increment the number of times this clock has been run.
+////////////////////////////////////////////////////////////////////////////////
+inline void
+StopWatchWin::stop()
+{
+    QueryPerformanceCounter((LARGE_INTEGER *) &end_time);
+    diff_time = (float)
+                (((double) end_time.QuadPart - (double) start_time.QuadPart) / freq);
+
+    total_time += diff_time;
+    clock_sessions++;
+    running = false;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Reset the timer to 0. Does not change the timer running state but does
+//! recapture this point in time as the current start time if it is running.
+////////////////////////////////////////////////////////////////////////////////
+inline void
+StopWatchWin::reset()
+{
+    diff_time = 0;
+    total_time = 0;
+    clock_sessions = 0;
+
+    if (running)
+    {
+        QueryPerformanceCounter((LARGE_INTEGER *) &start_time);
+    }
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
+//! Time in msec. after start. If the stop watch is still running (i.e. there
+//! was no call to stop()) then the elapsed time is returned added to the
+//! current diff_time sum, otherwise the current summed time difference alone
+//! is returned.
+////////////////////////////////////////////////////////////////////////////////
+inline float
+StopWatchWin::getTime()
+{
+    // Return the TOTAL time to date
+    float retval = total_time;
+
+    if (running)
+    {
+        LARGE_INTEGER temp;
+        QueryPerformanceCounter((LARGE_INTEGER *) &temp);
+        retval += (float)
+                  (((double)(temp.QuadPart - start_time.QuadPart)) / freq);
+    }
+
+    return retval;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Time in msec. for a single run based on the total number of COMPLETED runs
+//! and the total time.
+////////////////////////////////////////////////////////////////////////////////
+inline float
+StopWatchWin::getAverageTime()
+{
+    return (clock_sessions > 0) ? (total_time/clock_sessions) : 0.0f;
+}
+#else
+// Declarations for Stopwatch on Linux and Mac OSX
+// includes, system
+#include <ctime>
+#include <sys/time.h>
+
+//! Windows specific implementation of StopWatch
+class StopWatchLinux : public StopWatchInterface
+{
+    public:
+        //! Constructor, default
+        StopWatchLinux() :
+            start_time(), diff_time(0.0), total_time(0.0),
+            running(false), clock_sessions(0)
+        { };
+
+        // Destructor
+        virtual ~StopWatchLinux()
+        { };
+
+    public:
+        //! Start time measurement
+        inline void start();
+
+        //! Stop time measurement
+        inline void stop();
+
+        //! Reset time counters to zero
+        inline void reset();
+
+        //! Time in msec. after start. If the stop watch is still running (i.e. there
+        //! was no call to stop()) then the elapsed time is returned, otherwise the
+        //! time between the last start() and stop call is returned
+        inline float getTime();
+
+        //! Mean time to date based on the number of times the stopwatch has been
+        //! _stopped_ (ie finished sessions) and the current total time
+        inline float getAverageTime();
+
+    private:
+
+        // helper functions
+
+        //! Get difference between start time and current time
+        inline float getDiffTime();
+
+    private:
+
+        // member variables
+
+        //! Start of measurement
+        struct timeval  start_time;
+
+        //! Time difference between the last start and stop
+        float  diff_time;
+
+        //! TOTAL time difference between starts and stops
+        float  total_time;
+
+        //! flag if the stop watch is running
+        bool running;
+
+        //! Number of times clock has been started
+        //! and stopped to allow averaging
+        int clock_sessions;
+};
+
+// functions, inlined
+
+////////////////////////////////////////////////////////////////////////////////
+//! Start time measurement
+////////////////////////////////////////////////////////////////////////////////
+inline void
+StopWatchLinux::start()
+{
+    gettimeofday(&start_time, 0);
+    running = true;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Stop time measurement and increment add to the current diff_time summation
+//! variable. Also increment the number of times this clock has been run.
+////////////////////////////////////////////////////////////////////////////////
+inline void
+StopWatchLinux::stop()
+{
+    diff_time = getDiffTime();
+    total_time += diff_time;
+    running = false;
+    clock_sessions++;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Reset the timer to 0. Does not change the timer running state but does
+//! recapture this point in time as the current start time if it is running.
+////////////////////////////////////////////////////////////////////////////////
+inline void
+StopWatchLinux::reset()
+{
+    diff_time = 0;
+    total_time = 0;
+    clock_sessions = 0;
+
+    if (running)
+    {
+        gettimeofday(&start_time, 0);
+    }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Time in msec. after start. If the stop watch is still running (i.e. there
+//! was no call to stop()) then the elapsed time is returned added to the
+//! current diff_time sum, otherwise the current summed time difference alone
+//! is returned.
+////////////////////////////////////////////////////////////////////////////////
+inline float
+StopWatchLinux::getTime()
+{
+    // Return the TOTAL time to date
+    float retval = total_time;
+
+    if (running)
+    {
+        retval += getDiffTime();
+    }
+
+    return retval;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Time in msec. for a single run based on the total number of COMPLETED runs
+//! and the total time.
+////////////////////////////////////////////////////////////////////////////////
+inline float
+StopWatchLinux::getAverageTime()
+{
+    return (clock_sessions > 0) ? (total_time/clock_sessions) : 0.0f;
+}
+////////////////////////////////////////////////////////////////////////////////
+
+////////////////////////////////////////////////////////////////////////////////
+inline float
+StopWatchLinux::getDiffTime()
+{
+    struct timeval t_time;
+    gettimeofday(&t_time, 0);
+
+    // time difference in milli-seconds
+    return (float)(1000.0 * (t_time.tv_sec - start_time.tv_sec)
+                   + (0.001 * (t_time.tv_usec - start_time.tv_usec)));
+}
+#endif // _WIN32
+
+////////////////////////////////////////////////////////////////////////////////
+//! Timer functionality exported
+
+////////////////////////////////////////////////////////////////////////////////
+//! Create a new timer
+//! @return true if a time has been created, otherwise false
+//! @param  name of the new timer, 0 if the creation failed
+////////////////////////////////////////////////////////////////////////////////
+inline bool
+sdkCreateTimer(StopWatchInterface **timer_interface)
+{
+    //printf("sdkCreateTimer called object %08x\n", (void *)*timer_interface);
+#ifdef _WIN32
+    *timer_interface = (StopWatchInterface *)new StopWatchWin();
+#else
+    *timer_interface = (StopWatchInterface *)new StopWatchLinux();
+#endif
+    return (*timer_interface != NULL) ? true : false;
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
+//! Delete a timer
+//! @return true if a time has been deleted, otherwise false
+//! @param  name of the timer to delete
+////////////////////////////////////////////////////////////////////////////////
+inline bool
+sdkDeleteTimer(StopWatchInterface **timer_interface)
+{
+    //printf("sdkDeleteTimer called object %08x\n", (void *)*timer_interface);
+    if (*timer_interface)
+    {
+        delete *timer_interface;
+        *timer_interface = NULL;
+    }
+
+    return true;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Start the time with name \a name
+//! @param name  name of the timer to start
+////////////////////////////////////////////////////////////////////////////////
+inline bool
+sdkStartTimer(StopWatchInterface **timer_interface)
+{
+    //printf("sdkStartTimer called object %08x\n", (void *)*timer_interface);
+    if (*timer_interface)
+    {
+        (*timer_interface)->start();
+    }
+
+    return true;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Stop the time with name \a name. Does not reset.
+//! @param name  name of the timer to stop
+////////////////////////////////////////////////////////////////////////////////
+inline bool
+sdkStopTimer(StopWatchInterface **timer_interface)
+{
+    // printf("sdkStopTimer called object %08x\n", (void *)*timer_interface);
+    if (*timer_interface)
+    {
+        (*timer_interface)->stop();
+    }
+
+    return true;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Resets the timer's counter.
+//! @param name  name of the timer to reset.
+////////////////////////////////////////////////////////////////////////////////
+inline bool
+sdkResetTimer(StopWatchInterface **timer_interface)
+{
+    // printf("sdkResetTimer called object %08x\n", (void *)*timer_interface);
+    if (*timer_interface)
+    {
+        (*timer_interface)->reset();
+    }
+
+    return true;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Return the average time for timer execution as the total time
+//! for the timer dividied by the number of completed (stopped) runs the timer
+//! has made.
+//! Excludes the current running time if the timer is currently running.
+//! @param name  name of the timer to return the time of
+////////////////////////////////////////////////////////////////////////////////
+inline float
+sdkGetAverageTimerValue(StopWatchInterface **timer_interface)
+{
+    //  printf("sdkGetAverageTimerValue called object %08x\n", (void *)*timer_interface);
+    if (*timer_interface)
+    {
+        return (*timer_interface)->getAverageTime();
+    }
+    else
+    {
+        return 0.0f;
+    }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Total execution time for the timer over all runs since the last reset
+//! or timer creation.
+//! @param name  name of the timer to obtain the value of.
+////////////////////////////////////////////////////////////////////////////////
+inline float
+sdkGetTimerValue(StopWatchInterface **timer_interface)
+{
+    // printf("sdkGetTimerValue called object %08x\n", (void *)*timer_interface);
+    if (*timer_interface)
+    {
+        return (*timer_interface)->getTime();
+    }
+    else
+    {
+        return 0.0f;
+    }
+}
+
+#endif // HELPER_TIMER_H
diff --git a/src/Benchmarks/Sorting/bitonicsortBenchmark.cu b/src/Benchmarks/Sorting/bitonicsortBenchmark.cu
new file mode 100644
index 0000000000000000000000000000000000000000..88c612fe8a3442b7ac81b4e22a3d6425c734c962
--- /dev/null
+++ b/src/Benchmarks/Sorting/bitonicsortBenchmark.cu
@@ -0,0 +1,10 @@
+#include "../../src/bitonicSort/bitonicSort.h"
+
+#include "../benchmarker.cpp"
+#include "../measure.cu"
+
+template<typename Value>
+void sorter(ArrayView<Value, Devices::Cuda> arr)
+{
+    bitonicSort(arr);
+}
\ No newline at end of file
diff --git a/src/Benchmarks/Sorting/generators.h b/src/Benchmarks/Sorting/generators.h
new file mode 100644
index 0000000000000000000000000000000000000000..1615c90abda0f586df79570a529069f273fa4b0d
--- /dev/null
+++ b/src/Benchmarks/Sorting/generators.h
@@ -0,0 +1,148 @@
+#pragma once
+#include <numeric>
+#include <vector>
+#include <algorithm>
+#include <cmath>
+using namespace std;
+
+vector<int> generateSorted(int size)
+{
+    vector<int> vec(size);
+
+    iota(vec.begin(), vec.end(), 0);
+
+    return vec;
+}
+
+vector<int> generateRandom(int size)
+{
+    vector<int> vec(size);
+
+    srand(size + 2021);
+    generate(vec.begin(), vec.end(), [=](){return std::rand() % (2*size);});
+
+    return vec;
+}
+
+vector<int> generateShuffle(int size)
+{
+    vector<int> vec(size);
+
+    iota(vec.begin(), vec.end(), 0);
+    srand(size);
+    random_shuffle(vec.begin(), vec.end());
+
+    return vec;
+}
+
+vector<int> generateAlmostSorted(int size)
+{
+    vector<int> vec(size);
+
+    iota(vec.begin(), vec.end(), 0);
+    srand(9451);
+    for(int i = 0; i < 3; i++) //swaps 3 times in array
+    {
+        int s = rand() % (size - 3);
+        std::swap(vec[s], vec[s + 1]);
+    }
+
+    return vec;
+}
+
+vector<int> generateDecreasing(int size)
+{
+    vector<int> vec(size);
+
+    for(int i = 0; i < size; i++)
+        vec[i] = size - i;
+
+    return vec;
+}
+
+vector<int> generateZero_entropy(int size)
+{
+    vector<int> vec(size, 515);
+    return vec;
+}
+
+vector<int> generateGaussian(int size)
+{
+    vector<int> vec(size);
+	srand(size + 2000);
+
+	for (int i = 0; i < size; ++i)
+    {
+		int value = 0;
+		for (int j = 0; j < 4; ++j)
+			value += rand()%16384;
+
+		vec[i] = value /4;
+	}
+
+    return vec;
+}
+
+vector<int> generateBucket(int size)
+{
+    vector<int> vec(size);
+
+	srand (size + 94215);
+    double tmp = ((double)size)*3000000; //(RAND_MAX)/p; --> ((double)N)*30000;
+	double tmp2 = sqrt(tmp);
+
+	int p= (size+tmp2-1)/tmp2;
+
+	const int VALUE = 8192/p; //(RAND_MAX)/p;
+
+	int i=0; int x=0;
+	//the array of size N is split into 'p' buckets
+	while(i < p)
+	{
+		for (int z = 0; z < p; ++z)
+			for (int j = 0; j < size/(p*p); ++j)
+			{
+				//every bucket has N/(p*p) items and the range is [min : VALUE-1 ]
+				int min = VALUE*z;
+
+				vec[x]= min + ( rand() %  (VALUE-1) ) ;
+				x++;
+			}
+		i++;
+	}
+
+    return vec;
+}
+
+vector<int> generateStaggered(int size)
+{
+    vector<int> vec(size);
+
+	srand (size + 815618);
+	int tmp=4096; //(RAND_MAX)/p; --> size=2048
+	int p= (size+tmp-1)/tmp;
+
+	const int VALUE = (1<<30)/p; //(RAND_MAX)/p;
+
+	int i=1; int x=0;
+	//the array of size N is split into 'p' buckets
+	while(i <= p)
+	{
+		//every bucket has N/(p) items
+		for (int j = 0; j < size/(p); ++j)
+		{
+			int min;
+
+			if(i<=(p/2))
+				min = (2*i -1)*VALUE;
+
+			else
+				min = (2*i-p-1)*VALUE;
+
+			vec[x++]= min + ( rand() % (VALUE - 1) );
+		}
+		i++;
+	}
+
+    return vec;
+}
\ No newline at end of file
diff --git a/src/Benchmarks/Sorting/timer.h b/src/Benchmarks/Sorting/timer.h
new file mode 100644
index 0000000000000000000000000000000000000000..880eb03e79d82a50acd4148028e5498b09272ed8
--- /dev/null
+++ b/src/Benchmarks/Sorting/timer.h
@@ -0,0 +1,22 @@
+#pragma once
+
+#include <string>
+#include <chrono>
+#include <functional>
+#include <iostream>
+
+struct TIMER
+{
+    std::function<void(double)> f;
+    std::chrono::high_resolution_clock::time_point begin;
+
+    TIMER(std::function<void(double)> func = [](double res){std::cout << res << std::endl;})
+        : f(func), begin(std::chrono::high_resolution_clock::now()) {}
+
+    ~TIMER()
+    {
+        auto end = std::chrono::high_resolution_clock::now();
+        double result = (std::chrono::duration_cast<std::chrono::microseconds >(end - begin).count() / 1000.);
+        f(result);
+    }
+};
\ No newline at end of file
diff --git a/src/Benchmarks/Sorting/tnl-benchmark-sort.cpp b/src/Benchmarks/Sorting/tnl-benchmark-sort.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..489a9497aa0a4dcbd92dbd6a28507b0e6b78a582
--- /dev/null
+++ b/src/Benchmarks/Sorting/tnl-benchmark-sort.cpp
@@ -0,0 +1 @@
+#include "tnl-benchmark-sort.h"
diff --git a/src/Benchmarks/Sorting/tnl-benchmark-sort.cu b/src/Benchmarks/Sorting/tnl-benchmark-sort.cu
new file mode 120000
index 0000000000000000000000000000000000000000..26b452a6150a3c5413db2046a58f6e190cd9f1d4
--- /dev/null
+++ b/src/Benchmarks/Sorting/tnl-benchmark-sort.cu
@@ -0,0 +1 @@
+tnl-benchmark-sort.cpp
\ No newline at end of file
diff --git a/src/Benchmarks/Sorting/tnl-benchmark-sort.h b/src/Benchmarks/Sorting/tnl-benchmark-sort.h
new file mode 100644
index 0000000000000000000000000000000000000000..68b500a0970dbeb4e4b49090f1f33b0f440fc79a
--- /dev/null
+++ b/src/Benchmarks/Sorting/tnl-benchmark-sort.h
@@ -0,0 +1,131 @@
+#include <iostream>
+#include <fstream>
+#include <iomanip>
+#include <vector>
+#include <TNL/Algorithms/sort.h>
+using namespace std;
+
+#include "generators.h"
+#include "Measurer.h"
+
+#ifndef LOW_POW
+    #define LOW_POW 10
+#endif
+
+#ifndef HIGH_POW
+    #define HIGH_POW 25
+#endif
+
+#ifndef TRIES
+    #define TRIES 20
+#endif
+
+using namespace TNL;
+using namespace TNL::Algorithms;
+using namespace TNL::Algorithms::Sorting;
+
+template< typename Sorter >
+void start(ostream & out, string delim)
+{
+    out << "size" << delim;
+    out << "random" << delim;
+    out << "shuffle" << delim;
+    out << "sorted" << delim;
+    out << "almost" << delim;
+    out << "decreas" << delim;
+    out << "gauss" << delim;
+    out << "bucket" << delim;
+    out << "stagger" << delim;
+    out << "zero_entropy";
+    out << endl;
+
+    int wrongAnsCnt = 0;
+
+    for(int pow = LOW_POW; pow <= HIGH_POW; pow++)
+    {
+        int size =(1<< pow);
+        vector<int> vec(size);
+
+        out << "2^" << pow << delim << flush;
+        out << fixed << setprecision(3);
+
+        out << Measurer< Sorter >::measure( generateRandom(size), TRIES, wrongAnsCnt);
+        out << delim << flush;
+
+        out << Measurer< Sorter >::measure( generateShuffle(size), TRIES, wrongAnsCnt);
+        out << delim << flush;
+
+        out << Measurer< Sorter >::measure( generateSorted(size), TRIES, wrongAnsCnt);
+        out << delim << flush;
+
+        out << Measurer< Sorter >::measure( generateAlmostSorted(size), TRIES, wrongAnsCnt);
+        out << delim << flush;
+
+        out << Measurer< Sorter >::measure( generateDecreasing(size), TRIES, wrongAnsCnt);
+        out << delim << flush;
+
+        out << Measurer< Sorter >::measure( generateGaussian(size), TRIES, wrongAnsCnt) ;
+        out << delim << flush;
+
+        out << Measurer< Sorter >::measure( generateBucket(size), TRIES, wrongAnsCnt);
+        out << delim << flush;
+
+        out << Measurer< Sorter >::measure( generateStaggered(size), TRIES, wrongAnsCnt);
+        out << delim << flush;
+
+        out << Measurer< Sorter >::measure( generateZero_entropy(size), TRIES, wrongAnsCnt);
+        out << endl;
+    }
+
+    if(wrongAnsCnt > 0)
+        std::cerr << wrongAnsCnt << "tries were sorted incorrectly" << std::endl;
+}
+
+int main(int argc, char *argv[])
+{
+   if(argc == 1)
+   {
+#ifdef HAVE_CUDA
+      std::cout << "Quicksort on GPU ... " << std::endl;
+      start< Quicksort >(cout, "\t");
+      std::cout << "Bitonic sort on GPU ... " << std::endl;
+      start< BitonicSort >( cout, "\t" );
+#ifdef HAVE_CUDA_SAMPLES
+      std::cout << "Manca quicksort on GPU ... " << std::endl;
+      start< MancaQuicksort >( cout, "\t" );
+      std::cout << "Nvidia bitonic sort on GPU ... " << std::endl;
+      start< NvidiaBitonicSort >( cout, "\t" );
+#endif
+      std::cout << "Cederman quicksort on GPU ... " << std::endl;
+      start< CedermanQuicksort >( cout, "\t" );
+      std::cout << "Thrust radixsort on GPU ... " << std::endl;
+      start< ThrustRadixsort >( cout, "\t" );
+#endif
+      std::cout << "STL sort on CPU ... " << std::endl;
+      start< STLSort >( cout, "\t" );
+   }
+   else
+   {
+      std::ofstream out(argv[1]);
+#ifdef HAVE_CUDA
+      std::cout << "Quicksort on GPU ... " << std::endl;
+      start< Quicksort >(out, ",");
+      std::cout << "Bitonic sort on GPU ... " << std::endl;
+      start< BitonicSort >(out, ",");
+
+#ifdef HAVE_CUDA_SAMPLES
+      std::cout << "Manca quicksort on GPU ... " << std::endl;
+      start< MancaQuicksort >( out, "," );
+      std::cout << "Nvidia bitonic sort on GPU ... " << std::endl;
+      start< NvidiaBitonicSort >( out, "," );
+#endif
+      std::cout << "Cederman quicksort on GPU ... " << std::endl;
+      start< CedermanQuicksort >( out, "," );
+      std::cout << "Thrust radixsort on GPU ... " << std::endl;
+      start< ThrustRadixsort >( out, "," );
+#endif
+      std::cout << "STL sort on CPU ... " << std::endl;
+      start< STLSort >( out, "," );
+    }
+    return 0;
+}
diff --git a/src/TNL/Algorithms/Sorting/BitonicSort.h b/src/TNL/Algorithms/Sorting/BitonicSort.h
new file mode 100644
index 0000000000000000000000000000000000000000..3c2d6e89d103df4ba706c9a31e9f9e6440bef38d
--- /dev/null
+++ b/src/TNL/Algorithms/Sorting/BitonicSort.h
@@ -0,0 +1,49 @@
+/***************************************************************************
+                          BitonicSort.h  -  description
+                             -------------------
+    begin                : Jul 14, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Xuan Thang Nguyen, Tomas Oberhuber
+
+#pragma once
+
+#include <TNL/Algorithms/Sorting/detail/bitonicSort.h>
+
+namespace TNL {
+   namespace Algorithms {
+      namespace Sorting {
+
+struct BitonicSort
+{
+   template< typename Array >
+   void static sort( Array& array )
+   {
+      bitonicSort( array );
+   }
+
+   template< typename Array, typename Compare >
+   void static sort( Array& array, const Compare& compare )
+   {
+      bitonicSort( array, compare );
+   }
+
+   template< typename Device, typename Index, typename Compare, typename Swap >
+   void static inplaceSort( const Index begin, const Index end, const Compare& compare, const Swap& swap )
+   {
+      if( std::is_same< Device, Devices::Cuda >::value )
+         bitonicSort( begin, end, compare, swap );
+      else
+         TNL_ASSERT( false, std::cerr <<  "inplace bitonic sort for CPU is not implemented" << std::endl );
+   }
+};
+
+      } // namespace Sorting
+   } // namespace Algorithms
+} //namespace TNL
+
+
diff --git a/src/TNL/Algorithms/Sorting/BubbleSort.h b/src/TNL/Algorithms/Sorting/BubbleSort.h
new file mode 100644
index 0000000000000000000000000000000000000000..cb809b737c4ea2e48b1bc8f42a02587ea78a7303
--- /dev/null
+++ b/src/TNL/Algorithms/Sorting/BubbleSort.h
@@ -0,0 +1,66 @@
+/***************************************************************************
+                          BubbleSort.h  -  description
+                             -------------------
+    begin                : Jul 26, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Tomas Oberhuber
+
+#pragma once
+
+#include <algorithm>
+#include <TNL/Assert.h>
+#include <TNL/Exceptions/NotImplementedError.h>
+
+namespace TNL {
+   namespace Algorithms {
+      namespace Sorting {
+
+struct BubbleSort
+{
+   template< typename Device, typename Index, typename Compare, typename Swap >
+   void static inplaceSort( const Index begin, const Index end, Compare& compare, Swap& swap )
+   {
+      if( std::is_same< Device, Devices::Host >::value ||
+          std::is_same< Device, Devices::Sequential >::value )
+      {
+         Index left( begin ), right( end -1 );
+         while( left < right )
+         {
+            //int lastChange( end -1 );
+            for( int j = left; j < right - 1; j++ )
+            {
+               TNL_ASSERT_LT( j+1, end, "" );
+               if( ! compare( j, j+1 ) )
+               {
+                  swap( j, j+1 );
+                  //lastChange = j;
+               }
+            }
+            right--; //lastChange;
+            for( int j = right; j >= left; j-- )
+            {
+               TNL_ASSERT_LT( j+1, end, "" );
+               if( ! compare( j, j+1 ) )
+               {
+                  swap( j, j+1 );
+                  //lastChange = j;
+               }
+            }
+            left++; //lastChange;
+         }
+      }
+      else
+         throw Exceptions::NotImplementedError();
+   }
+};
+
+      } // namespace Sorting
+   } // namespace Algorithms
+} //namespace TNL
+
+
diff --git a/src/TNL/Algorithms/Sorting/DefaultSorter.h b/src/TNL/Algorithms/Sorting/DefaultSorter.h
new file mode 100644
index 0000000000000000000000000000000000000000..546ea0d19e9d39f8fa4424b51dbb4ed2fd445abf
--- /dev/null
+++ b/src/TNL/Algorithms/Sorting/DefaultSorter.h
@@ -0,0 +1,73 @@
+/***************************************************************************
+                          DefaultSorter.h  -  description
+                             -------------------
+    begin                : Jul 14, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Tomas Oberhuber
+
+#pragma once
+
+#include <utility>  // std::pair, std::forward
+
+#include <TNL/Devices/Sequential.h>
+#include <TNL/Devices/Host.h>
+#include <TNL/Devices/Cuda.h>
+#include <TNL/Algorithms/Sorting/BitonicSort.h>
+#include <TNL/Algorithms/Sorting/BubbleSort.h>
+#include <TNL/Algorithms/Sorting/Quicksort.h>
+#include <TNL/Algorithms/Sorting/STLSort.h>
+
+namespace TNL {
+   namespace Algorithms {
+      namespace Sorting {
+
+template< typename Device >
+struct DefaultSorter;
+
+template<>
+struct DefaultSorter< Devices::Host >
+{
+   using SorterType = Algorithms::Sorting::STLSort;
+};
+
+template<>
+struct DefaultSorter< Devices::Sequential >
+{
+   using SorterType = Algorithms::Sorting::STLSort;
+};
+
+template<>
+struct DefaultSorter< Devices::Cuda >
+{
+   using SorterType = Algorithms::Sorting::Quicksort;
+};
+
+template< typename Device >
+struct DefaultInplaceSorter;
+
+template<>
+struct DefaultInplaceSorter< Devices::Host >
+{
+   using SorterType = Algorithms::Sorting::BubbleSort;
+};
+
+template<>
+struct DefaultInplaceSorter< Devices::Sequential >
+{
+   using SorterType = Algorithms::Sorting::BubbleSort;
+};
+
+template<>
+struct DefaultInplaceSorter< Devices::Cuda >
+{
+   using SorterType = Algorithms::Sorting::BitonicSort;
+};
+
+      } // namespace Sorting
+   } // namespace Algorithms
+} // namespace TNL
diff --git a/src/TNL/Algorithms/Sorting/Quicksort.h b/src/TNL/Algorithms/Sorting/Quicksort.h
new file mode 100644
index 0000000000000000000000000000000000000000..4df0bf471ee48afbeda99f32dde01d7b79f15b1b
--- /dev/null
+++ b/src/TNL/Algorithms/Sorting/Quicksort.h
@@ -0,0 +1,43 @@
+/***************************************************************************
+                          Quicksort.h  -  description
+                             -------------------
+    begin                : Jul 14, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Xuan Thang Nguyen, Tomas Oberhuber
+
+#pragma once
+
+#include <TNL/Algorithms/Sorting/detail/Quicksorter.h>
+
+namespace TNL {
+   namespace Algorithms {
+      namespace Sorting {
+
+struct Quicksort
+{
+   template< typename Array >
+   void static sort( Array& array )
+   {
+      Quicksorter< typename Array::ValueType, typename Array::DeviceType > qs;
+      qs.sort( array );
+   }
+
+   template< typename Array, typename Compare >
+   void static sort( Array& array, const Compare& compare )
+   {
+      Quicksorter< typename Array::ValueType, typename Array::DeviceType > qs;
+      qs.sort( array, compare );
+   }
+
+};
+
+      } // namespace Sorting
+   } // namespace Algorithms
+} //namespace TNL
+
+
diff --git a/src/TNL/Algorithms/Sorting/STLSort.h b/src/TNL/Algorithms/Sorting/STLSort.h
new file mode 100644
index 0000000000000000000000000000000000000000..3fc69f324f7e3acce9188e80e28df24cb3fef9f1
--- /dev/null
+++ b/src/TNL/Algorithms/Sorting/STLSort.h
@@ -0,0 +1,40 @@
+/***************************************************************************
+                          STLSort.h  -  description
+                             -------------------
+    begin                : Jul 14, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Tomas Oberhuber
+
+#pragma once
+
+#include <algorithm>
+
+namespace TNL {
+   namespace Algorithms {
+      namespace Sorting {
+
+struct STLSort
+{
+   template< typename Array >
+   void static sort( Array& array )
+   {
+      std::sort( array.getData(), array.getData() + array.getSize() );
+   }
+
+   template< typename Array, typename Compare >
+   void static sort( Array& array, const Compare& compare )
+   {
+      std::sort( array.getData(), array.getData() + array.getSize(), compare );
+   }
+};
+
+      } // namespace Sorting
+   } // namespace Algorithms
+} //namespace TNL
+
+
diff --git a/src/TNL/Algorithms/Sorting/detail/Quicksorter.h b/src/TNL/Algorithms/Sorting/detail/Quicksorter.h
new file mode 100644
index 0000000000000000000000000000000000000000..3f891fa590c18eeec33841e264bfbacadbb1782e
--- /dev/null
+++ b/src/TNL/Algorithms/Sorting/detail/Quicksorter.h
@@ -0,0 +1,111 @@
+/***************************************************************************
+                          Quicksorter.h  -  description
+                             -------------------
+    begin                : Jul 13, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Xuan Thang Nguyen, Tomas Oberhuber
+
+#pragma once
+
+#include <TNL/Containers/Array.h>
+#include <TNL/Containers/Vector.h>
+#include <TNL/Algorithms/Scan.h>
+#include <TNL/Algorithms//Sorting/detail/task.h>
+
+namespace TNL {
+   namespace Algorithms {
+      namespace Sorting {
+
+template< typename Value, typename Device >
+class Quicksorter;
+
+template< typename Value >
+class Quicksorter< Value, Devices::Cuda >
+{
+   public:
+
+      using ValueType = Value;
+      using DeviceType = Devices::Cuda;
+
+
+      template< typename Array, typename Compare >
+      void sort( Array& arr, const Compare& cmp );
+
+      template< typename Array >
+      void sort( Array& arr );
+
+   protected:
+
+      void init( Containers::ArrayView<Value, Devices::Cuda> arr, int gridDim, int blockDim, int desiredElemPerBlock, int maxSharable);
+
+      template< typename CMP >
+         void performSort( const CMP &Cmp );
+
+
+      /**
+       * returns how many blocks are needed to start sort phase 1 if @param elemPerBlock were to be used
+       * */
+      int getSetsNeeded(int elemPerBlock) const;
+
+      /**
+       * returns the optimal amount of elements per thread needed for phase
+       * */
+      int getElemPerBlock() const;
+
+      /**
+       * returns the amount of blocks needed to start phase 1 while also initializing all tasks
+       * */
+      template< typename CMP >
+         int initTasks(int elemPerBlock, const CMP &Cmp);
+
+      /**
+       * does the 1st phase of Quicksort until out of task memory or each task is small enough
+       * for correctness, secondphase method needs to be called to sort each subsequences
+       * */
+      template <typename CMP>
+         void firstPhase(const CMP &Cmp);
+
+      /**
+       * update necessary variables after 1 phase1 sort
+       * */
+      void processNewTasks();
+
+      /**
+       * sorts all leftover tasks
+       * */
+      template <typename CMP>
+         void secondPhase( const CMP &Cmp) ;
+
+      int maxBlocks, threadsPerBlock, desiredElemPerBlock, maxSharable; //kernel config
+
+      Containers::Array<Value, Devices::Cuda> auxMem;
+      Containers::ArrayView<Value, Devices::Cuda> arr, aux;
+
+      int desired_2ndPhasElemPerBlock;
+      const int g_maxTasks = 1 << 14;
+      int maxTasks;
+
+
+      Containers::Array<TASK, Devices::Cuda> cuda_tasks, cuda_newTasks, cuda_2ndPhaseTasks; //1 set of 2 rotating tasks and 2nd phase
+      Containers::Array<int, Devices::Cuda> cuda_newTasksAmount, cuda_2ndPhaseTasksAmount;  //is in reality 1 integer each
+
+      Containers::Array<int, Devices::Cuda> cuda_blockToTaskMapping;
+      Containers::Vector<int, Devices::Cuda> cuda_reductionTaskInitMem;
+
+      int host_1stPhaseTasksAmount = 0, host_2ndPhaseTasksAmount = 0;
+      int iteration = 0;
+
+      template< typename T >
+      friend int getSetsNeededFunction(int elemPerBlock, const Quicksorter< T, Devices::Cuda >& quicksort );
+};
+
+        } // namespace Sorting
+    } // namespace Algorithms
+}// namespace TNL
+
+#include <TNL/Algorithms/Sorting/detail/Quicksorter.hpp>
diff --git a/src/TNL/Algorithms/Sorting/detail/Quicksorter.hpp b/src/TNL/Algorithms/Sorting/detail/Quicksorter.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..faa294ab3799f216dca5a9165c7e5f2020c2939b
--- /dev/null
+++ b/src/TNL/Algorithms/Sorting/detail/Quicksorter.hpp
@@ -0,0 +1,414 @@
+/***************************************************************************
+                          Quicksorter.h  -  description
+                             -------------------
+    begin                : Jul 13, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Xuan Thang Nguyen, Tomas Oberhuber
+
+#pragma once
+
+#include <TNL/Functional.h>
+#include <TNL/Algorithms/Sorting/detail/task.h>
+#include <TNL/Algorithms/Sorting/detail/quicksort_kernel.h>
+#include <TNL/Algorithms/Sorting/detail/quicksort_1Block.h>
+#include <TNL/Algorithms/Sorting/detail/Quicksorter.h>
+
+namespace TNL {
+    namespace Algorithms {
+        namespace Sorting {
+
+
+template< typename Value >
+   template< typename Array, typename Compare >
+void
+Quicksorter< Value, Devices::Cuda >::
+sort( Array& arr, const Compare& cmp )
+{
+#ifdef HAVE_CUDA
+    cudaDeviceProp deviceProp;
+    cudaGetDeviceProperties(&deviceProp, 0);
+
+    /**
+     * for every block there is a bit of shared memory reserved, the actual value can slightly differ
+     * */
+    int sharedReserve = sizeof(int) * (16 + 3 * 32);
+    int maxSharable = deviceProp.sharedMemPerBlock - sharedReserve;
+
+    int blockDim = 512; //best case
+
+    /**
+     * the goal is to use shared memory as often as possible
+     * each thread in a block will process n elements, n==multiplier
+     * + 1 reserved for pivot (statically allocating Value type throws weird error, hence it needs to be dynamic)
+     *
+     * blockDim*multiplier*sizeof(Value) + 1*sizeof(Value) <= maxSharable
+     * */
+    int elemPerBlock = (maxSharable - sizeof(Value)) / sizeof(Value); //try to use up all of shared memory to store elements
+    const int maxBlocks = (1 << 20);
+    const int maxMultiplier = 8;
+    int multiplier = min(elemPerBlock / blockDim, maxMultiplier);
+
+    if (multiplier <= 0) //a block cant store 512 elements, sorting some really big data
+    {
+        blockDim = 256; //try to fit 256 elements
+        multiplier = min(elemPerBlock / blockDim, maxMultiplier);
+
+        if (multiplier <= 0)
+        {
+            //worst case scenario, shared memory cant be utilized at all because of the sheer size of Value
+            //sort has to be done with the use of global memory alone
+
+            this->init(arr, maxBlocks, 512, 0, 0);
+            this->performSort( cmp );
+            return;
+        }
+    }
+
+    TNL_ASSERT_LE( ( int ) ( blockDim * multiplier * sizeof(Value) ), maxSharable,"" );
+
+    this->init(arr, maxBlocks, blockDim, multiplier * blockDim, maxSharable);
+    this->performSort( cmp );
+#endif
+}
+
+template< typename Value >
+   template< typename Array >
+void
+Quicksorter< Value, Devices::Cuda >::
+sort( Array& arr )
+{
+   this->sort(arr, [] __cuda_callable__( const Value& a, const Value& b ) { return a < b; } );
+}
+
+template< typename Value >
+void
+Quicksorter< Value, Devices::Cuda >::
+init( Containers::ArrayView<Value, Devices::Cuda> arr, int gridDim, int blockDim, int desiredElemPerBlock, int maxSharable)
+{
+   this->maxBlocks = gridDim;
+   this->threadsPerBlock = blockDim;
+   this->desiredElemPerBlock = desiredElemPerBlock;
+   this->maxSharable = maxSharable;
+   this->arr.bind( arr );
+   this->auxMem.setSize( arr.getSize() );
+   this->aux.bind( auxMem.getView() );
+   this->desired_2ndPhasElemPerBlock = desiredElemPerBlock;
+   this->maxTasks = min( arr.getSize(), g_maxTasks );
+   this->cuda_tasks.setSize(maxTasks);
+   this->cuda_newTasks.setSize(maxTasks);
+   this->cuda_2ndPhaseTasks.setSize(maxTasks);
+   this->cuda_newTasksAmount.setSize(1);
+   this->cuda_2ndPhaseTasksAmount.setSize(1);
+   this->cuda_blockToTaskMapping.setSize(maxBlocks);
+   this->cuda_reductionTaskInitMem.setSize(maxTasks);
+
+   if (arr.getSize() > desired_2ndPhasElemPerBlock)
+   {
+      cuda_tasks.setElement(0, TASK(0, arr.getSize(), 0));
+      host_1stPhaseTasksAmount = 1;
+   }
+   else
+   {
+      cuda_2ndPhaseTasks.setElement(0, TASK(0, arr.getSize(), 0));
+      host_2ndPhaseTasksAmount = 1;
+   }
+
+   cuda_2ndPhaseTasksAmount = 0;
+   TNL_CHECK_CUDA_DEVICE;
+}
+
+
+template< typename Value >
+   template< typename CMP >
+void
+Quicksorter< Value, Devices::Cuda >::
+performSort( const CMP &Cmp )
+{
+#ifdef HAVE_CUDA
+    firstPhase(Cmp);
+
+    int total2ndPhase = host_1stPhaseTasksAmount + host_2ndPhaseTasksAmount;
+    if (total2ndPhase > 0)
+        secondPhase(Cmp);
+
+    cudaDeviceSynchronize();
+    TNL_CHECK_CUDA_DEVICE;
+
+#ifdef CHECK_RESULT_SORT
+    if (!is_sorted(arr))
+    {
+        std::ofstream out("error.txt");
+        out << arr << std::endl;
+        out << aux << std::endl;
+        out << cuda_tasks << std::endl;
+        out << cuda_newTasks << std::endl;
+        out << cuda_2ndPhaseTasks << std::endl;
+
+        out << cuda_newTasksAmount << std::endl;
+        out << cuda_2ndPhaseTasksAmount << std::endl;
+
+        out << iteration << std::endl;
+    }
+#endif
+#endif
+}
+
+template< typename Value >
+   template< typename CMP >
+void
+Quicksorter< Value, Devices::Cuda >::
+firstPhase( const CMP &Cmp )
+{
+#ifdef HAVE_CUDA
+    while (host_1stPhaseTasksAmount > 0)
+    {
+        if (host_1stPhaseTasksAmount >= maxTasks)
+            break;
+
+        if (host_2ndPhaseTasksAmount >= maxTasks) //2nd phase occupies enoughs tasks to warrant premature 2nd phase sort
+        {
+            int tmp = host_1stPhaseTasksAmount;
+            host_1stPhaseTasksAmount = 0;
+            secondPhase(Cmp);
+            cuda_2ndPhaseTasksAmount = host_2ndPhaseTasksAmount = 0;
+            host_1stPhaseTasksAmount = tmp;
+        }
+
+        //just in case newly created tasks wouldnt fit
+        //bite the bullet and sort with single blocks
+        if (host_1stPhaseTasksAmount * 2 >= maxTasks + (maxTasks - host_2ndPhaseTasksAmount))
+        {
+            if (host_2ndPhaseTasksAmount >= 0.75 * maxTasks) //2nd phase occupies enoughs tasks to warrant premature 2nd phase sort
+            {
+                int tmp = host_1stPhaseTasksAmount;
+                host_1stPhaseTasksAmount = 0;
+                secondPhase(Cmp);
+                cuda_2ndPhaseTasksAmount = host_2ndPhaseTasksAmount = 0;
+                host_1stPhaseTasksAmount = tmp;
+            }
+            else
+                break;
+        }
+
+        //---------------------------------------------------------------
+
+        int elemPerBlock = getElemPerBlock();
+
+        /**
+         * initializes tasks so that each block knows which task to work on and which part of array to split
+         * also sets pivot needed for partitioning, this is why Cmp is needed
+         * */
+        int blocksCnt = initTasks(elemPerBlock, Cmp);
+        TNL_CHECK_CUDA_DEVICE;
+
+        //not enough or too many blocks needed, switch to 2nd phase
+        if (blocksCnt <= 1 || blocksCnt > cuda_blockToTaskMapping.getSize())
+            break;
+
+        //-----------------------------------------------
+        //do the partitioning
+
+        auto &task = iteration % 2 == 0 ? cuda_tasks : cuda_newTasks;
+        int externMemByteSize = elemPerBlock * sizeof(Value) + sizeof(Value); //elems + 1 for pivot
+
+        /**
+         * check if partition procedure can use shared memory for coalesced write after reordering
+         *
+         * move elements smaller than pivot to the left and bigger to the right
+         * note: pivot isnt inserted in the middle yet
+         * */
+        if (externMemByteSize <= maxSharable)
+        {
+            cudaQuickSort1stPhase<Value, CMP, true>
+                <<<blocksCnt, threadsPerBlock, externMemByteSize>>>(
+                    arr, aux, Cmp, elemPerBlock,
+                    task, cuda_blockToTaskMapping);
+        }
+        else
+        {
+            cudaQuickSort1stPhase<Value, CMP, false>
+                <<<blocksCnt, threadsPerBlock, sizeof(Value)>>>(
+                    arr, aux, Cmp, elemPerBlock,
+                    task, cuda_blockToTaskMapping);
+        }
+        TNL_CHECK_CUDA_DEVICE;
+
+        /**
+         * fill in the gap between smaller and bigger with elements == pivot
+         * after writing also create new tasks, each task generates at max 2 tasks
+         *
+         * tasks smaller than desired_2ndPhasElemPerBlock go into 2nd phase
+         * bigger need more blocks to partition and are written into newTask
+         * with iteration %2, rotate between the 2 tasks array to save from copying
+         * */
+        auto &newTask = iteration % 2 == 0 ? cuda_newTasks : cuda_tasks;
+        cudaWritePivot<Value>
+            <<<host_1stPhaseTasksAmount, threadsPerBlock, sizeof(Value)>>>(
+                arr, aux, desired_2ndPhasElemPerBlock,
+                task, newTask, cuda_newTasksAmount.getData(),
+                cuda_2ndPhaseTasks, cuda_2ndPhaseTasksAmount.getData());
+        TNL_CHECK_CUDA_DEVICE;
+
+        //----------------------------------------
+
+        processNewTasks();
+        iteration++;
+    }
+#endif
+}
+
+template< typename Value >
+   template< typename CMP >
+void
+Quicksorter< Value, Devices::Cuda >::
+secondPhase(const CMP &Cmp)
+{
+#ifdef HAVE_CUDA
+    int total2ndPhase = host_1stPhaseTasksAmount + host_2ndPhaseTasksAmount;
+    const int stackSize = 32;
+    auto &leftoverTasks = iteration % 2 == 0 ? cuda_tasks : cuda_newTasks;
+
+    int elemInShared = desiredElemPerBlock;
+    int externSharedByteSize = elemInShared * sizeof(Value) + sizeof(Value); //reserve space for storing elements + 1 pivot
+    if (externSharedByteSize > maxSharable)
+    {
+        externSharedByteSize = sizeof(Value);
+        elemInShared = 0;
+    }
+
+    if (host_1stPhaseTasksAmount > 0 && host_2ndPhaseTasksAmount > 0)
+    {
+        auto tasks = leftoverTasks.getView(0, host_1stPhaseTasksAmount);
+        auto tasks2 = cuda_2ndPhaseTasks.getView(0, host_2ndPhaseTasksAmount);
+
+        cudaQuickSort2ndPhase<Value, CMP, stackSize>
+            <<<total2ndPhase, threadsPerBlock, externSharedByteSize>>>(arr, aux, Cmp, tasks, tasks2, elemInShared, desired_2ndPhasElemPerBlock);
+    }
+    else if (host_1stPhaseTasksAmount > 0)
+    {
+        auto tasks = leftoverTasks.getView(0, host_1stPhaseTasksAmount);
+        cudaQuickSort2ndPhase<Value, CMP, stackSize>
+            <<<total2ndPhase, threadsPerBlock, externSharedByteSize>>>(arr, aux, Cmp, tasks, elemInShared, desired_2ndPhasElemPerBlock);
+    }
+    else
+    {
+        auto tasks2 = cuda_2ndPhaseTasks.getView(0, host_2ndPhaseTasksAmount);
+
+        cudaQuickSort2ndPhase<Value, CMP, stackSize>
+            <<<total2ndPhase, threadsPerBlock, externSharedByteSize>>>(arr, aux, Cmp, tasks2, elemInShared, desired_2ndPhasElemPerBlock);
+    }
+#endif
+}
+
+template< typename Value >
+int getSetsNeededFunction(int elemPerBlock, const Quicksorter< Value, Devices::Cuda >& quicksort )
+{
+    auto view = quicksort.iteration % 2 == 0 ? quicksort.cuda_tasks.getConstView() : quicksort.cuda_newTasks.getConstView();
+    auto fetch = [=] __cuda_callable__(int i) -> int {
+        const auto &task = view[i];
+        int size = task.partitionEnd - task.partitionBegin;
+        return size / elemPerBlock + (size % elemPerBlock != 0);
+    };
+    auto reduction = [] __cuda_callable__(int a, int b) { return a + b; };
+    return Algorithms::reduce<Devices::Cuda>( 0, quicksort.host_1stPhaseTasksAmount, fetch, reduction, 0 );
+}
+
+template< typename Value >
+int
+Quicksorter< Value, Devices::Cuda >::
+getSetsNeeded(int elemPerBlock) const
+{
+    /*auto view = iteration % 2 == 0 ? cuda_tasks.getConstView() : cuda_newTasks.getConstView();
+    auto fetch = [=] __cuda_callable__(int i) {
+        const auto &task = view[i];
+        int size = task.partitionEnd - task.partitionBegin;
+        return size / elemPerBlock + (size % elemPerBlock != 0);
+    };
+    auto reduction = [] __cuda_callable__(int a, int b) { return a + b; };
+    return Algorithms::reduce<Devices::Cuda>(0, host_1stPhaseTasksAmount, fetch, reduction, 0);*/
+    return getSetsNeededFunction< Value >( elemPerBlock, *this );
+}
+
+template< typename Value >
+int
+Quicksorter< Value, Devices::Cuda >::
+getElemPerBlock() const
+{
+    return desiredElemPerBlock;
+
+    int setsNeeded = getSetsNeeded(desiredElemPerBlock);
+
+    if (setsNeeded <= maxBlocks)
+        return desiredElemPerBlock;
+
+    //want multiplier*minElemPerBLock <= x*threadPerBlock
+    //find smallest x so that this inequality holds
+    double multiplier = 1. * setsNeeded / maxBlocks;
+    int elemPerBlock = multiplier * desiredElemPerBlock;
+    setsNeeded = elemPerBlock / threadsPerBlock + (elemPerBlock % threadsPerBlock != 0);
+
+    return setsNeeded * threadsPerBlock;
+}
+
+template< typename Value >
+   template< typename CMP >
+int
+Quicksorter< Value, Devices::Cuda >::
+initTasks(int elemPerBlock, const CMP &Cmp)
+{
+#ifdef HAVE_CUDA
+    auto &src = iteration % 2 == 0 ? arr : aux;
+    auto &tasks = iteration % 2 == 0 ? cuda_tasks : cuda_newTasks;
+
+    //--------------------------------------------------------
+    int blocks = host_1stPhaseTasksAmount / threadsPerBlock + (host_1stPhaseTasksAmount % threadsPerBlock != 0);
+
+    cudaCalcBlocksNeeded<<<blocks, threadsPerBlock>>>(tasks.getView(0, host_1stPhaseTasksAmount), elemPerBlock,
+                                                      cuda_reductionTaskInitMem.getView(0, host_1stPhaseTasksAmount));
+    //cuda_reductionTaskInitMem[i] == how many blocks task i needs
+
+    //auto reduce = [] __cuda_callable__(const int &a, const int &b) { return a + b; };
+
+    Algorithms::Scan<Devices::Cuda, Algorithms::ScanType::Inclusive >::
+        perform(cuda_reductionTaskInitMem, 0, cuda_reductionTaskInitMem.getSize(), TNL::Plus{}, 0);
+    //cuda_reductionTaskInitMem[i] == how many blocks task [0..i] need
+
+    int blocksNeeded = cuda_reductionTaskInitMem.getElement(host_1stPhaseTasksAmount - 1);
+
+    //need too many blocks, give back control
+    if (blocksNeeded > cuda_blockToTaskMapping.getSize())
+        return blocksNeeded;
+
+    //--------------------------------------------------------
+
+    cudaInitTask<<<host_1stPhaseTasksAmount, threadsPerBlock>>>(
+        tasks.getView(0, host_1stPhaseTasksAmount),                     //task to read from
+        cuda_blockToTaskMapping.getView(0, blocksNeeded),               //maps block to a certain task
+        cuda_reductionTaskInitMem.getView(0, host_1stPhaseTasksAmount), //has how many each task need blocks precalculated
+        src, Cmp);                                                      //used to pick pivot
+
+    cuda_newTasksAmount.setElement(0, 0); //resets new element counter
+    return blocksNeeded;
+#else
+    return -1;
+#endif
+}
+
+template <typename Value>
+void
+Quicksorter< Value, Devices::Cuda >::
+processNewTasks()
+{
+    host_1stPhaseTasksAmount = min(cuda_newTasksAmount.getElement(0), maxTasks);
+    host_2ndPhaseTasksAmount = min(cuda_2ndPhaseTasksAmount.getElement(0), maxTasks);
+}
+
+
+        } // namespace Sorting
+    } // namespace Algorithms
+} // namespace TNL
diff --git a/src/TNL/Algorithms/Sorting/detail/bitonicSort.h b/src/TNL/Algorithms/Sorting/detail/bitonicSort.h
new file mode 100644
index 0000000000000000000000000000000000000000..8ccd0569c2e0e14dbb6d12ccd656459ae2171c46
--- /dev/null
+++ b/src/TNL/Algorithms/Sorting/detail/bitonicSort.h
@@ -0,0 +1,379 @@
+#pragma once
+#include <TNL/Containers/Array.h>
+#include <TNL/Algorithms/Sorting/detail/blockBitonicSort.h>
+#include <TNL/Algorithms/Sorting/detail/helpers.h>
+
+namespace TNL {
+    namespace Algorithms {
+        namespace Sorting {
+
+#ifdef HAVE_CUDA
+
+/**
+ * this kernel simulates 1 exchange
+ * splits input arr that is bitonic into 2 bitonic sequences
+ */
+template <typename Value, typename CMP>
+__global__ void bitonicMergeGlobal(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> arr,
+                                   CMP Cmp,
+                                   int monotonicSeqLen, int bitonicLen)
+{
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+
+    int part = i / (bitonicLen / 2); //computes which sorting block this thread belongs to
+
+    //the index of 2 elements that should be compared and swapped
+    int s = part * bitonicLen + (i & ((bitonicLen / 2) - 1));
+    int e = s + bitonicLen / 2;
+    if (e >= arr.getSize()) //arr[e] is virtual padding and will not be exchanged with
+        return;
+
+    int partsInSeq = monotonicSeqLen / bitonicLen;
+    //calculate the direction of swapping
+    int monotonicSeqIdx = part / partsInSeq;
+    bool ascending = (monotonicSeqIdx & 1) != 0;
+    if ((monotonicSeqIdx + 1) * monotonicSeqLen >= arr.getSize()) //special case for part with no "partner" to be merged with in next phase
+        ascending = true;
+
+    cmpSwap(arr[s], arr[e], ascending, Cmp);
+}
+
+//---------------------------------------------
+//---------------------------------------------
+
+/**
+ * simulates many layers of merge
+ * turns input that is a bitonic sequence into 1 monotonic sequence
+ *
+ * this version uses shared memory to do the operations
+ * */
+template <typename Value, typename CMP>
+__global__ void bitonicMergeSharedMemory(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> arr,
+                                         CMP Cmp,
+                                         int monotonicSeqLen, int bitonicLen)
+{
+    extern __shared__ int externMem[];
+    Value *sharedMem = (Value *)externMem;
+
+    int sharedMemLen = 2 * blockDim.x;
+
+    //1st index and last index of subarray that this threadBlock should merge
+    int myBlockStart = blockIdx.x * sharedMemLen;
+    int myBlockEnd = TNL::min(arr.getSize(), myBlockStart + sharedMemLen);
+
+    //copy from globalMem into sharedMem
+    for (int i = threadIdx.x; myBlockStart + i < myBlockEnd; i += blockDim.x)
+        sharedMem[i] = arr[myBlockStart + i];
+    __syncthreads();
+
+    //------------------------------------------
+    //bitonic activity
+    {
+        //calculate the direction of swapping
+        int i = blockIdx.x * blockDim.x + threadIdx.x;
+        int part = i / (bitonicLen / 2);
+        int partsInSeq = monotonicSeqLen / bitonicLen;
+        int monotonicSeqIdx = part / partsInSeq;
+
+        bool ascending = (monotonicSeqIdx & 1) != 0;
+        //special case for parts with no "partner"
+        if ((monotonicSeqIdx + 1) * monotonicSeqLen >= arr.getSize())
+            ascending = true;
+        //------------------------------------------
+
+        //do bitonic merge
+        for (; bitonicLen > 1; bitonicLen /= 2)
+        {
+            //calculates which 2 indexes will be compared and swap
+            int part = threadIdx.x / (bitonicLen / 2);
+            int s = part * bitonicLen + (threadIdx.x & ((bitonicLen / 2) - 1));
+            int e = s + bitonicLen / 2;
+
+            if (e < myBlockEnd - myBlockStart) //not touching virtual padding
+                cmpSwap(sharedMem[s], sharedMem[e], ascending, Cmp);
+            __syncthreads();
+        }
+    }
+
+    //------------------------------------------
+
+    //writeback to global memory
+    for (int i = threadIdx.x; myBlockStart + i < myBlockEnd; i += blockDim.x)
+        arr[myBlockStart + i] = sharedMem[i];
+}
+
+
+/**
+ * entrypoint for bitonicSort_Block
+ * sorts @param arr in alternating order to create bitonic sequences
+ * sharedMem has to be able to store at least blockDim.x*2 elements
+ * */
+template <typename Value, typename CMP>
+__global__ void bitoniSort1stStepSharedMemory(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> arr, CMP Cmp)
+{
+    extern __shared__ int externMem[];
+
+    Value * sharedMem = (Value *)externMem;
+    int sharedMemLen = 2*blockDim.x;
+
+    int myBlockStart = blockIdx.x * sharedMemLen;
+    int myBlockEnd = TNL::min(arr.getSize(), myBlockStart+sharedMemLen);
+
+    //copy from globalMem into sharedMem
+    for (int i = threadIdx.x; myBlockStart + i < myBlockEnd; i += blockDim.x)
+        sharedMem[i] = arr[myBlockStart + i];
+    __syncthreads();
+
+    //------------------------------------------
+    //bitonic activity
+    {
+        int i = blockIdx.x * blockDim.x + threadIdx.x;
+        int paddedSize = closestPow2(myBlockEnd - myBlockStart);
+
+        for (int monotonicSeqLen = 2; monotonicSeqLen <= paddedSize; monotonicSeqLen *= 2)
+        {
+            //calculate the direction of swapping
+            int monotonicSeqIdx = i / (monotonicSeqLen/2);
+            bool ascending = (monotonicSeqIdx & 1) != 0;
+            if ((monotonicSeqIdx + 1) * monotonicSeqLen >= arr.getSize()) //special case for parts with no "partner"
+                ascending = true;
+
+            for (int len = monotonicSeqLen; len > 1; len /= 2)
+            {
+                //calculates which 2 indexes will be compared and swap
+                int part = threadIdx.x / (len / 2);
+                int s = part * len + (threadIdx.x & ((len / 2) - 1));
+                int e = s + len / 2;
+
+                if(e < myBlockEnd - myBlockStart) //touching virtual padding
+                    cmpSwap(sharedMem[s], sharedMem[e], ascending, Cmp);
+                __syncthreads();
+            }
+        }
+    }
+
+    //writeback to global memory
+    for (int i = threadIdx.x; myBlockStart + i < myBlockEnd; i += blockDim.x)
+        arr[myBlockStart + i] = sharedMem[i];
+}
+#endif
+
+
+template <typename Value, typename CMP>
+void bitonicSortWithShared(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> view, const CMP &Cmp,
+                           int gridDim, int blockDim, int sharedMemLen, int sharedMemSize)
+{
+#ifdef HAVE_CUDA
+    int paddedSize = closestPow2(view.getSize());
+
+    bitoniSort1stStepSharedMemory<<<gridDim, blockDim, sharedMemSize>>>(view, Cmp);
+    //now alternating monotonic sequences with bitonicLenght of sharedMemLen
+
+    // \/ has bitonicLength of 2 * sharedMemLen
+    for (int monotonicSeqLen = 2 * sharedMemLen; monotonicSeqLen <= paddedSize; monotonicSeqLen *= 2)
+    {
+        for (int bitonicLen = monotonicSeqLen; bitonicLen > 1; bitonicLen /= 2)
+        {
+            if (bitonicLen > sharedMemLen)
+            {
+                bitonicMergeGlobal<<<gridDim, blockDim>>>(
+                    view, Cmp, monotonicSeqLen, bitonicLen);
+            }
+            else
+            {
+                bitonicMergeSharedMemory<<<gridDim, blockDim, sharedMemSize>>>(
+                    view, Cmp, monotonicSeqLen, bitonicLen);
+
+                //simulates sorts until bitonicLen == 2 already, no need to continue this loop
+                break;
+            }
+        }
+    }
+    cudaDeviceSynchronize();
+#endif
+}
+
+//---------------------------------------------
+
+template <typename Value, typename CMP>
+void bitonicSort(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> view,
+                 const CMP &Cmp,
+                 int gridDim, int blockDim)
+
+{
+#ifdef HAVE_CUDA
+    int paddedSize = closestPow2(view.getSize());
+
+    for (int monotonicSeqLen = 2; monotonicSeqLen <= paddedSize; monotonicSeqLen *= 2)
+    {
+        for (int bitonicLen = monotonicSeqLen; bitonicLen > 1; bitonicLen /= 2)
+        {
+            bitonicMergeGlobal<<<gridDim, blockDim>>>(view, Cmp, monotonicSeqLen, bitonicLen);
+        }
+    }
+    cudaDeviceSynchronize();
+#endif
+}
+
+//---------------------------------------------
+template <typename Value, typename CMP>
+void bitonicSort(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> src, int begin, int end, const CMP &Cmp)
+{
+#ifdef HAVE_CUDA
+    auto view = src.getView(begin, end);
+
+    int threadsNeeded = view.getSize() / 2 + (view.getSize() % 2 != 0);
+
+    cudaDeviceProp deviceProp;
+    cudaGetDeviceProperties(&deviceProp, 0);
+
+    const int maxThreadsPerBlock = 512;
+
+    int sharedMemLen = maxThreadsPerBlock * 2;
+    size_t sharedMemSize = sharedMemLen * sizeof(Value);
+
+    if (sharedMemSize <= deviceProp.sharedMemPerBlock)
+    {
+        int blockDim = maxThreadsPerBlock;
+        int gridDim = threadsNeeded / blockDim + (threadsNeeded % blockDim != 0);
+        bitonicSortWithShared(view, Cmp, gridDim, blockDim, sharedMemLen, sharedMemSize);
+    }
+    else if (sharedMemSize / 2 <= deviceProp.sharedMemPerBlock)
+    {
+        int blockDim = maxThreadsPerBlock / 2; //256
+        int gridDim = threadsNeeded / blockDim + (threadsNeeded % blockDim != 0);
+        sharedMemSize /= 2;
+        sharedMemLen /= 2;
+        bitonicSortWithShared(view, Cmp, gridDim, blockDim, sharedMemLen, sharedMemSize);
+    }
+    else
+    {
+        int gridDim = threadsNeeded / maxThreadsPerBlock + (threadsNeeded % maxThreadsPerBlock != 0);
+        bitonicSort(view, Cmp, gridDim, maxThreadsPerBlock);
+    }
+#endif
+}
+
+//---------------------------------------------
+
+template <typename Value, typename CMP>
+void bitonicSort(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> arr, int begin, int end)
+{
+    bitonicSort(arr, begin, end, [] __cuda_callable__(const Value &a, const Value &b) { return a < b; });
+}
+
+template <typename Value, typename CMP>
+void bitonicSort(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> arr, const CMP &Cmp)
+{
+    bitonicSort(arr, 0, arr.getSize(), Cmp);
+}
+
+template <typename Value>
+void bitonicSort(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> arr)
+{
+    bitonicSort(arr, [] __cuda_callable__(const Value &a, const Value &b) { return a < b; });
+}
+
+//---------------------------------------------
+template <typename Value, typename CMP>
+void bitonicSort(std::vector<Value> &vec, int begin, int end, const CMP &Cmp)
+{
+    TNL::Containers::Array<Value, TNL::Devices::Cuda> Arr(vec);
+    auto view = Arr.getView();
+    bitonicSort(view, begin, end, Cmp);
+
+    TNL::Algorithms::MultiDeviceMemoryOperations<TNL::Devices::Host, TNL::Devices::Cuda>::
+        copy(vec.data(), view.getData(), view.getSize());
+}
+
+template <typename Value>
+void bitonicSort(std::vector<Value> &vec, int begin, int end)
+{
+    bitonicSort(vec, begin, end, [] __cuda_callable__(const Value &a, const Value &b) { return a < b; });
+}
+
+template <typename Value, typename CMP>
+void bitonicSort(std::vector<Value> &vec, const CMP &Cmp)
+{
+    bitonicSort(vec, 0, vec.size(), Cmp);
+}
+
+template <typename Value>
+void bitonicSort(std::vector<Value> &vec)
+{
+    bitonicSort(vec, [] __cuda_callable__(const Value &a, const Value &b) { return a < b; });
+}
+
+template <typename Value>
+void bitonicSort( TNL::Containers::Array< Value, TNL::Devices::Host > &vec)
+{
+    bitonicSort(vec, [] __cuda_callable__(const Value &a, const Value &b) { return a < b; });
+}
+
+
+//---------------------------------------------
+//---------------------------------------------
+
+#ifdef HAVE_CUDA
+template< typename CMP, typename SWAP>
+__global__ void bitonicMergeGlobal(int size, CMP Cmp, SWAP Swap,
+                                   int monotonicSeqLen, int bitonicLen)
+{
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+
+    int part = i / (bitonicLen / 2); //computes which sorting block this thread belongs to
+
+    //the index of 2 elements that should be compared and swapped
+    int s = part * bitonicLen + (i & ((bitonicLen / 2) - 1));
+    int e = s + bitonicLen / 2;
+    if (e >= size) //arr[e] is virtual padding and will not be exchanged with
+        return;
+
+    //calculate the direction of swapping
+    int partsInSeq = monotonicSeqLen / bitonicLen;
+    int monotonicSeqIdx = part / partsInSeq;
+    bool ascending = (monotonicSeqIdx & 1) != 0;
+    if ((monotonicSeqIdx + 1) * monotonicSeqLen >= size) //special case for part with no "partner" to be merged with in next phase
+        ascending = true;
+
+    if( ascending == Cmp(e, s) )
+        Swap(s, e);
+}
+
+template< typename CMP, typename SWAP >
+void bitonicSort(int begin, int end, const CMP &Cmp, SWAP Swap)
+{
+    int size = end - begin;
+    int paddedSize = closestPow2(size);
+
+    int threadsNeeded = size / 2 + (size % 2 != 0);
+
+    const int maxThreadsPerBlock = 512;
+    int threadsPerBlock = maxThreadsPerBlock;
+    int blocks = threadsNeeded / threadsPerBlock + (threadsNeeded % threadsPerBlock != 0);
+
+    auto compareWithOffset =
+        [=] __cuda_callable__(int i, int j) {
+            return Cmp(i + begin, j + begin);
+        };
+
+    auto swapWithOffset =
+        [=] __cuda_callable__(int i, int j) mutable {
+            Swap(i + begin, j + begin);
+        };
+
+    for (int monotonicSeqLen = 2; monotonicSeqLen <= paddedSize; monotonicSeqLen *= 2)
+    {
+        for (int bitonicLen = monotonicSeqLen; bitonicLen > 1; bitonicLen /= 2)
+        {
+            bitonicMergeGlobal<<<blocks, threadsPerBlock>>>(
+                size, compareWithOffset, swapWithOffset,
+                monotonicSeqLen, bitonicLen);
+        }
+    }
+    cudaDeviceSynchronize();
+}
+#endif
+        } // namespace Sorting
+    } // namespace Algorithms
+} // namespace TNL
diff --git a/src/TNL/Algorithms/Sorting/detail/blockBitonicSort.h b/src/TNL/Algorithms/Sorting/detail/blockBitonicSort.h
new file mode 100644
index 0000000000000000000000000000000000000000..f0732dcb0d51fc7063e7060272c0f267fbed3d20
--- /dev/null
+++ b/src/TNL/Algorithms/Sorting/detail/blockBitonicSort.h
@@ -0,0 +1,111 @@
+#pragma once
+#include <TNL/Algorithms/Sorting/detail/helpers.h>
+#include <TNL/Containers/Array.h>
+
+namespace TNL {
+    namespace Algorithms {
+        namespace Sorting {
+
+#ifdef HAVE_CUDA
+
+/**
+ * IMPORTANT: all threads in block have to call this function to work properly
+ * the size of src isn't limited, but for optimal efficiency, no more than 8*blockDim.x should be used
+ * Description: sorts src and writes into dst within a block
+ * works independently from other concurrent blocks
+ * @param sharedMem sharedMem pointer has to be able to store all of src elements
+ * */
+template <typename Value, typename CMP>
+__device__ void bitonicSort_Block(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> src,
+                                  TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> dst,
+                                  Value *sharedMem, const CMP &Cmp)
+{
+    //copy from globalMem into sharedMem
+    for (int i = threadIdx.x; i < src.getSize(); i += blockDim.x)
+        sharedMem[i] = src[i];
+    __syncthreads();
+
+    //------------------------------------------
+    //bitonic activity
+    {
+        int paddedSize = closestPow2_ptx(src.getSize());
+
+        for (int monotonicSeqLen = 2; monotonicSeqLen <= paddedSize; monotonicSeqLen *= 2)
+        {
+            for (int bitonicLen = monotonicSeqLen; bitonicLen > 1; bitonicLen /= 2)
+            {
+                for (int i = threadIdx.x;; i += blockDim.x) //simulates other blocks in case src.size > blockDim.x*2
+                {
+                    //calculates which 2 indexes will be compared and swap
+                    int part = i / (bitonicLen / 2);
+                    int s = part * bitonicLen + (i & ((bitonicLen / 2) - 1));
+                    int e = s + bitonicLen / 2;
+
+                    if (e >= src.getSize()) //touching virtual padding, the order dont swap
+                        break;
+
+                    //calculate the direction of swapping
+                    int monotonicSeqIdx = i / (monotonicSeqLen / 2);
+                    bool ascending = (monotonicSeqIdx & 1) != 0;
+                    if ((monotonicSeqIdx + 1) * monotonicSeqLen >= src.getSize()) //special case for parts with no "partner"
+                        ascending = true;
+
+                    cmpSwap(sharedMem[s], sharedMem[e], ascending, Cmp);
+                }
+
+                __syncthreads(); //only 1 synchronization needed
+            }
+        }
+    }
+
+    //------------------------------------------
+    //writeback to global memory
+    for (int i = threadIdx.x; i < dst.getSize(); i += blockDim.x)
+        dst[i] = sharedMem[i];
+}
+
+/**
+ * IMPORTANT: all threads in block have to call this function to work properly
+ * IMPORTANT: unlike the counterpart with shared memory, this function only works in-place
+ * the size of src isn't limited, but for optimal efficiency, no more than 8*blockDim.x should be used
+ * Description: sorts src in place using bitonic sort
+ * works independently from other concurrent blocks
+ * this version doesnt use shared memory and is prefered for Value with big size
+ * */
+template <typename Value, typename CMP>
+__device__ void bitonicSort_Block(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> src,
+                                  const CMP &Cmp)
+{
+    int paddedSize = closestPow2_ptx(src.getSize());
+
+    for (int monotonicSeqLen = 2; monotonicSeqLen <= paddedSize; monotonicSeqLen *= 2)
+    {
+        for (int bitonicLen = monotonicSeqLen; bitonicLen > 1; bitonicLen /= 2)
+        {
+            for (int i = threadIdx.x;; i += blockDim.x) //simulates other blocks in case src.size > blockDim.x*2
+            {
+                //calculates which 2 indexes will be compared and swap
+                int part = i / (bitonicLen / 2);
+                int s = part * bitonicLen + (i & ((bitonicLen / 2) - 1));
+                int e = s + bitonicLen / 2;
+
+                if (e >= src.getSize())
+                    break;
+
+                //calculate the direction of swapping
+                int monotonicSeqIdx = i / (monotonicSeqLen / 2);
+                bool ascending = (monotonicSeqIdx & 1) != 0;
+                if ((monotonicSeqIdx + 1) * monotonicSeqLen >= src.getSize()) //special case for parts with no "partner"
+                    ascending = true;
+
+                cmpSwap(src[s], src[e], ascending, Cmp);
+            }
+            __syncthreads();
+        }
+    }
+}
+
+#endif
+        } // namespace Sorting
+    } // namespace Algorithms
+} // namespace TNL
diff --git a/src/TNL/Algorithms/Sorting/detail/cudaPartition.h b/src/TNL/Algorithms/Sorting/detail/cudaPartition.h
new file mode 100644
index 0000000000000000000000000000000000000000..a6afaa20a33a0c687a6814bcfee92546e38086b7
--- /dev/null
+++ b/src/TNL/Algorithms/Sorting/detail/cudaPartition.h
@@ -0,0 +1,227 @@
+/***************************************************************************
+                          cudaPartition.h  -  description
+                             -------------------
+    begin                : Jul 13, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Xuan Thang Nguyen
+
+#pragma once
+
+#include <TNL/Containers/Array.h>
+#include <TNL/Algorithms/Sorting/detail/reduction.h>
+#include <TNL/Algorithms/Sorting/detail/task.h>
+
+namespace TNL {
+   namespace Algorithms {
+      namespace Sorting {
+
+#ifdef HAVE_CUDA
+
+template <typename Value, typename Device, typename CMP>
+__device__ Value pickPivot(TNL::Containers::ArrayView<Value, Device> src, const CMP &Cmp)
+{
+    //return src[0];
+    //return src[src.getSize()-1];
+
+    if (src.getSize() == 1)
+        return src[0];
+
+    const Value &a = src[0], &b = src[src.getSize() / 2], &c = src[src.getSize() - 1];
+
+    if (Cmp(a, b)) // ..a..b..
+    {
+        if (Cmp(b, c)) // ..a..b..c
+            return b;
+        else if (Cmp(c, a)) //..c..a..b..
+            return a;
+        else //..a..c..b..
+            return c;
+    }
+    else //..b..a..
+    {
+        if (Cmp(a, c)) //..b..a..c
+            return a;
+        else if (Cmp(c, b)) //..c..b..a..
+            return b;
+        else //..b..c..a..
+            return c;
+    }
+}
+
+template <typename Value, typename Device, typename CMP>
+__device__ int pickPivotIdx(TNL::Containers::ArrayView<Value, Device> src, const CMP &Cmp)
+{
+    //return 0;
+    //return src.getSize()-1;
+
+    if (src.getSize() <= 1)
+        return 0;
+
+    const Value &a = src[0], &b = src[src.getSize() / 2], &c = src[src.getSize() - 1];
+
+    if (Cmp(a, b)) // ..a..b..
+    {
+        if (Cmp(b, c)) // ..a..b..c
+            return src.getSize() / 2;
+        else if (Cmp(c, a)) //..c..a..b..
+            return 0;
+        else //..a..c..b..
+            return src.getSize() - 1;
+    }
+    else //..b..a..
+    {
+        if (Cmp(a, c)) //..b..a..c
+            return 0;
+        else if (Cmp(c, b)) //..c..b..a..
+            return src.getSize() / 2;
+        else //..b..c..a..
+            return src.getSize() - 1;
+    }
+}
+
+//-----------------------------------------------------------
+
+template <typename Value, typename CMP>
+__device__ void countElem( Containers::ArrayView<Value, Devices::Cuda> arr,
+                           const CMP &Cmp,
+                           int &smaller, int &bigger,
+                           const Value &pivot)
+{
+    for (int i = threadIdx.x; i < arr.getSize(); i += blockDim.x)
+    {
+        const Value &data = arr[i];
+        if (Cmp(data, pivot))
+            smaller++;
+        else if (Cmp(pivot, data))
+            bigger++;
+    }
+}
+
+//-----------------------------------------------------------
+
+template <typename Value, typename CMP>
+__device__ void copyDataShared( Containers::ArrayView<Value, Devices::Cuda> src,
+                                Containers::ArrayView<Value, Devices::Cuda> dst,
+                                const CMP &Cmp,
+                                Value *sharedMem,
+                                int smallerStart, int biggerStart,
+                                int smallerTotal, int biggerTotal,
+                                int smallerOffset, int biggerOffset, //exclusive prefix sum of elements
+                                const Value &pivot)
+{
+
+    for (int i = threadIdx.x; i < src.getSize(); i += blockDim.x)
+    {
+        const Value &data = src[i];
+        if (Cmp(data, pivot))
+            sharedMem[smallerOffset++] = data;
+        else if (Cmp(pivot, data))
+            sharedMem[smallerTotal + biggerOffset++] = data;
+    }
+    __syncthreads();
+
+    for (int i = threadIdx.x; i < smallerTotal + biggerTotal; i += blockDim.x)
+    {
+        if (i < smallerTotal)
+            dst[smallerStart + i] = sharedMem[i];
+        else
+            dst[biggerStart + i - smallerTotal] = sharedMem[i];
+    }
+}
+
+template <typename Value, typename CMP>
+__device__ void copyData( Containers::ArrayView<Value, Devices::Cuda> src,
+                          Containers::ArrayView<Value, Devices::Cuda> dst,
+                          const CMP &Cmp,
+                          int smallerStart, int biggerStart,
+                          const Value &pivot)
+{
+    for (int i = threadIdx.x; i < src.getSize(); i += blockDim.x)
+    {
+        const Value &data = src[i];
+        if (Cmp(data, pivot))
+        {
+            /*
+            if(smallerStart >= dst.getSize() || smallerStart < 0)
+                printf("failed smaller: b:%d t:%d: tried to write into [%d]/%d\n", blockDim.x, threadIdx.x, smallerStart, dst.getSize());
+            */
+            dst[smallerStart++] = data;
+        }
+        else if (Cmp(pivot, data))
+        {
+            /*
+            if(biggerStart >= dst.getSize() || biggerStart < 0)
+                printf("failed bigger: b:%d t:%d: tried to write into [%d]/%d\n", blockDim.x, threadIdx.x, biggerStart, dst.getSize());
+            */
+            dst[biggerStart++] = data;
+        }
+    }
+}
+
+//----------------------------------------------------------------------------------
+
+template <typename Value, typename CMP, bool useShared>
+__device__ void cudaPartition( Containers::ArrayView<Value, Devices::Cuda> src,
+                               Containers::ArrayView<Value, Devices::Cuda> dst,
+                               const CMP &Cmp,
+                               Value *sharedMem,
+                               const Value &pivot,
+                               int elemPerBlock, TASK &task)
+{
+    static __shared__ int smallerStart, biggerStart;
+
+    int myBegin = elemPerBlock * (blockIdx.x - task.firstBlock);
+    int myEnd = TNL::min(myBegin + elemPerBlock, src.getSize());
+
+    auto srcView = src.getView(myBegin, myEnd);
+
+    //-------------------------------------------------------------------------
+
+    int smaller = 0, bigger = 0;
+    countElem(srcView, Cmp, smaller, bigger, pivot);
+
+    int smallerPrefSumInc = blockInclusivePrefixSum(smaller);
+    int biggerPrefSumInc = blockInclusivePrefixSum(bigger);
+
+    if (threadIdx.x == blockDim.x - 1) //last thread in block has sum of all values
+    {
+        smallerStart = atomicAdd(&(task.dstBegin), smallerPrefSumInc);
+        biggerStart = atomicAdd(&(task.dstEnd), -biggerPrefSumInc) - biggerPrefSumInc;
+    }
+    __syncthreads();
+
+    //-----------------------------------------------------------
+    if (useShared)
+    {
+        static __shared__ int smallerTotal, biggerTotal;
+        if (threadIdx.x == blockDim.x - 1)
+        {
+            smallerTotal = smallerPrefSumInc;
+            biggerTotal = biggerPrefSumInc;
+        }
+        __syncthreads();
+
+        copyDataShared(srcView, dst, Cmp, sharedMem,
+                       smallerStart, biggerStart,
+                       smallerTotal, biggerTotal,
+                       smallerPrefSumInc - smaller, biggerPrefSumInc - bigger, //exclusive prefix sum of elements
+                       pivot);
+    }
+    else
+    {
+        int destSmaller = smallerStart + smallerPrefSumInc - smaller;
+        int destBigger = biggerStart + biggerPrefSumInc - bigger;
+        copyData(srcView, dst, Cmp, destSmaller, destBigger, pivot);
+    }
+}
+
+#endif
+
+      } // namespace Sorting
+   } // namespace Algorithms
+} // namespace TNL
diff --git a/src/TNL/Algorithms/Sorting/detail/helpers.h b/src/TNL/Algorithms/Sorting/detail/helpers.h
new file mode 100644
index 0000000000000000000000000000000000000000..2d7dbbcc7b04c0c922927f215d6f51bf185315b0
--- /dev/null
+++ b/src/TNL/Algorithms/Sorting/detail/helpers.h
@@ -0,0 +1,59 @@
+/***************************************************************************
+                          helpers.h  -  description
+                             -------------------
+    begin                : Jul 13, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Xuan Thang Nguyen
+
+#pragma once
+#include <TNL/Math.h>
+
+namespace TNL {
+    namespace Algorithms {
+        namespace Sorting {
+
+#ifdef HAVE_CUDA
+
+// Inline PTX call to return index of highest non-zero bit in a word
+static __device__ __forceinline__ unsigned int __btflo(unsigned int word)
+{
+    unsigned int ret;
+    asm volatile("bfind.u32 %0, %1;"
+                 : "=r"(ret)
+                 : "r"(word));
+    return ret;
+}
+
+__device__ int closestPow2_ptx(int bitonicLen)
+{
+    return 1 << (__btflo((unsigned)bitonicLen - 1U) + 1);
+}
+
+__host__ __device__ int closestPow2(int x)
+{
+    if (x == 0)
+        return 0;
+
+    int ret = 1;
+    while (ret < x)
+        ret <<= 1;
+
+    return ret;
+}
+
+template <typename Value, typename CMP>
+__cuda_callable__ void cmpSwap(Value &a, Value &b, bool ascending, const CMP &Cmp)
+{
+    if (ascending == Cmp(b, a))
+        TNL::swap(a, b);
+}
+
+#endif
+        } //namespace Sorting
+    } //namespace Algorithms
+} // namespace TNL
\ No newline at end of file
diff --git a/src/TNL/Algorithms/Sorting/detail/quicksort_1Block.h b/src/TNL/Algorithms/Sorting/detail/quicksort_1Block.h
new file mode 100644
index 0000000000000000000000000000000000000000..efca29f2429fd54b852384793d67538b4ab0356d
--- /dev/null
+++ b/src/TNL/Algorithms/Sorting/detail/quicksort_1Block.h
@@ -0,0 +1,255 @@
+/***************************************************************************
+                          quicksort_1Block.h  -  description
+                             -------------------
+    begin                : Jul 13, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Xuan Thang Nguyen
+
+#pragma once
+
+#include <TNL/Containers/Array.h>
+#include "cassert"
+#include <TNL/Algorithms/Sorting/detail/bitonicSort.h>
+#include <TNL/Algorithms/Sorting/detail/reduction.h>
+#include <TNL/Algorithms/Sorting/detail/cudaPartition.h>
+
+namespace TNL {
+    namespace Algorithms {
+        namespace Sorting {
+
+#ifdef HAVE_CUDA
+
+template <typename Value, typename CMP>
+__device__ void externSort( Containers::ArrayView<Value, TNL::Devices::Cuda> src,
+                            Containers::ArrayView<Value, TNL::Devices::Cuda> dst,
+                            const CMP &Cmp, Value *sharedMem)
+{
+    bitonicSort_Block(src, dst, sharedMem, Cmp);
+}
+
+template <typename Value, typename CMP>
+__device__ void externSort( Containers::ArrayView<Value, TNL::Devices::Cuda> src,
+                            const CMP &Cmp)
+{
+    bitonicSort_Block(src, Cmp);
+}
+
+//---------------------------------------------------------------
+
+template <int stackSize>
+__device__ void stackPush(int stackArrBegin[], int stackArrEnd[],
+                          int stackDepth[], int &stackTop,
+                          int begin, int pivotBegin,
+                          int pivotEnd, int end,
+                          int iteration);
+
+//---------------------------------------------------------------
+
+template <typename Value, typename CMP, int stackSize, bool useShared>
+__device__ void singleBlockQuickSort( Containers::ArrayView<Value, TNL::Devices::Cuda> arr,
+                                      Containers::ArrayView<Value, TNL::Devices::Cuda> aux,
+                                      const CMP &Cmp, int _iteration,
+                                      Value *sharedMem, int memSize,
+                                      int maxBitonicSize)
+{
+    if (arr.getSize() <= maxBitonicSize)
+    {
+        auto &src = (_iteration & 1) == 0 ? arr : aux;
+        if (useShared && arr.getSize() <= memSize)
+            externSort<Value, CMP>(src, arr, Cmp, sharedMem);
+        else
+        {
+            externSort<Value, CMP>(src, Cmp);
+            //extern sort without shared memory only works in-place, need to copy into from aux
+            if ((_iteration & 1) != 0)
+                for (int i = threadIdx.x; i < arr.getSize(); i += blockDim.x)
+                    arr[i] = src[i];
+        }
+
+        return;
+    }
+
+    static __shared__ int stackTop;
+    static __shared__ int stackArrBegin[stackSize], stackArrEnd[stackSize], stackDepth[stackSize];
+    static __shared__ int begin, end, iteration;
+    static __shared__ int pivotBegin, pivotEnd;
+    Value *piv = sharedMem;
+    sharedMem += 1;
+
+    if (threadIdx.x == 0)
+    {
+        stackTop = 0;
+        stackArrBegin[stackTop] = 0;
+        stackArrEnd[stackTop] = arr.getSize();
+        stackDepth[stackTop] = _iteration;
+        stackTop++;
+    }
+    __syncthreads();
+
+    while (stackTop > 0)
+    {
+        //pick up partition to break up
+        if (threadIdx.x == 0)
+        {
+            begin = stackArrBegin[stackTop - 1];
+            end = stackArrEnd[stackTop - 1];
+            iteration = stackDepth[stackTop - 1];
+            stackTop--;
+        }
+        __syncthreads();
+
+        int size = end - begin;
+        auto &src = (iteration & 1) == 0 ? arr : aux;
+
+        //small enough for for bitonic
+        if (size <= maxBitonicSize)
+        {
+            if (useShared && size <= memSize)
+                externSort<Value, CMP>(src.getView(begin, end), arr.getView(begin, end), Cmp, sharedMem);
+            else
+            {
+                externSort<Value, CMP>(src.getView(begin, end), Cmp);
+                //extern sort without shared memory only works in-place, need to copy into from aux
+                if ((iteration & 1) != 0)
+                    for (int i = threadIdx.x; i < src.getSize(); i += blockDim.x)
+                        arr[begin + i] = src[i];
+            }
+            __syncthreads();
+            continue;
+        }
+
+        //------------------------------------------------------
+
+        if (threadIdx.x == 0)
+            *piv = pickPivot(src.getView(begin, end), Cmp);
+        __syncthreads();
+        Value &pivot = *piv;
+
+        int smaller = 0, bigger = 0;
+        countElem(src.getView(begin, end), Cmp, smaller, bigger, pivot);
+
+        //synchronization is in this function already
+        int smallerPrefSumInc = blockInclusivePrefixSum(smaller);
+        int biggerPrefSumInc = blockInclusivePrefixSum(bigger);
+
+        if (threadIdx.x == blockDim.x - 1) //has sum of all smaller and greater elements than pivot in src
+        {
+            pivotBegin = 0 + smallerPrefSumInc;
+            pivotEnd = size - biggerPrefSumInc;
+        }
+        __syncthreads();
+
+        //--------------------------------------------------------------
+        /**
+         * move elements, either use shared mem for coalesced access or without shared mem if data is too big
+         * */
+
+        auto &dst = (iteration & 1) == 0 ? aux : arr;
+
+        if (useShared && size <= memSize)
+        {
+            static __shared__ int smallerTotal, biggerTotal;
+            if (threadIdx.x == blockDim.x - 1)
+            {
+                smallerTotal = smallerPrefSumInc;
+                biggerTotal = biggerPrefSumInc;
+            }
+            __syncthreads();
+
+            copyDataShared(src.getView(begin, end), dst.getView(begin, end),
+                           Cmp, sharedMem,
+                           0, pivotEnd,
+                           smallerTotal, biggerTotal,
+                           smallerPrefSumInc - smaller, biggerPrefSumInc - bigger, //exclusive prefix sum of elements
+                           pivot);
+        }
+        else
+        {
+            int destSmaller = 0 + (smallerPrefSumInc - smaller);
+            int destBigger = pivotEnd + (biggerPrefSumInc - bigger);
+
+            copyData(src.getView(begin, end), dst.getView(begin, end), Cmp, destSmaller, destBigger, pivot);
+        }
+
+        __syncthreads();
+
+        for (int i = pivotBegin + threadIdx.x; i < pivotEnd; i += blockDim.x)
+            arr[begin + i] = pivot;
+
+        //creates new tasks
+        if (threadIdx.x == 0)
+        {
+            stackPush<stackSize>(stackArrBegin, stackArrEnd, stackDepth, stackTop,
+                                 begin, begin + pivotBegin,
+                                 begin + pivotEnd, end,
+                                 iteration);
+        }
+        __syncthreads(); //sync to update stackTop
+    }                    //ends while loop
+}
+
+//--------------------------------------------------------------
+
+template <int stackSize>
+__device__ void stackPush(int stackArrBegin[], int stackArrEnd[],
+                          int stackDepth[], int &stackTop,
+                          int begin, int pivotBegin,
+                          int pivotEnd, int end,
+                          int iteration)
+{
+    int sizeL = pivotBegin - begin, sizeR = end - pivotEnd;
+
+    //push the bigger one 1st and then smaller one 2nd
+    //in next iteration, the smaller part will be handled 1st
+    if (sizeL > sizeR)
+    {
+        if (sizeL > 0) //left from pivot are smaller elems
+        {
+            stackArrBegin[stackTop] = begin;
+            stackArrEnd[stackTop] = pivotBegin;
+            stackDepth[stackTop] = iteration + 1;
+            stackTop++;
+        }
+
+        if (sizeR > 0) //right from pivot until end are elem greater than pivot
+        {
+            assert(stackTop < stackSize && "Local quicksort stack overflow.");
+
+            stackArrBegin[stackTop] = pivotEnd;
+            stackArrEnd[stackTop] = end;
+            stackDepth[stackTop] = iteration + 1;
+            stackTop++;
+        }
+    }
+    else
+    {
+        if (sizeR > 0) //right from pivot until end are elem greater than pivot
+        {
+            stackArrBegin[stackTop] = pivotEnd;
+            stackArrEnd[stackTop] = end;
+            stackDepth[stackTop] = iteration + 1;
+            stackTop++;
+        }
+
+        if (sizeL > 0) //left from pivot are smaller elems
+        {
+            assert(stackTop < stackSize && "Local quicksort stack overflow.");
+
+            stackArrBegin[stackTop] = begin;
+            stackArrEnd[stackTop] = pivotBegin;
+            stackDepth[stackTop] = iteration + 1;
+            stackTop++;
+        }
+    }
+}
+
+#endif
+
+        } // namespace Sorting
+    } // namespace Algorithms
+} // namespace TNL
diff --git a/src/TNL/Algorithms/Sorting/detail/quicksort_kernel.h b/src/TNL/Algorithms/Sorting/detail/quicksort_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..8d26d0637bd55c12751639b36d288e890debf00e
--- /dev/null
+++ b/src/TNL/Algorithms/Sorting/detail/quicksort_kernel.h
@@ -0,0 +1,267 @@
+/***************************************************************************
+                          quicksort_kernel.h  -  description
+                             -------------------
+    begin                : Jul 13, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Xuan Thang Nguyen
+
+#pragma once
+
+#include <TNL/Containers/Array.h>
+#include <TNL/Containers/Vector.h>
+#include <TNL/Algorithms/Sorting/detail/reduction.h>
+#include <TNL/Algorithms/Sorting/detail/task.h>
+#include <TNL/Algorithms/Sorting/detail/cudaPartition.h>
+#include <TNL/Algorithms/Sorting/detail/quicksort_1Block.h>
+
+
+namespace TNL {
+   namespace Algorithms {
+      namespace Sorting {
+
+#ifdef HAVE_CUDA
+
+__device__ void writeNewTask(int begin, int end, int iteration, int maxElemFor2ndPhase,
+                             Containers::ArrayView<TASK, Devices::Cuda> newTasks, int *newTasksCnt,
+                             Containers::ArrayView<TASK, Devices::Cuda> secondPhaseTasks, int *secondPhaseTasksCnt);
+
+//-----------------------------------------------------------
+
+__global__ void cudaCalcBlocksNeeded(Containers::ArrayView<TASK, Devices::Cuda> cuda_tasks, int elemPerBlock,
+                                     Containers::VectorView<int, Devices::Cuda> blocksNeeded)
+{
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i >= cuda_tasks.getSize())
+        return;
+
+    TASK &task = cuda_tasks[i];
+    int size = task.partitionEnd - task.partitionBegin;
+    blocksNeeded[i] = size / elemPerBlock + (size % elemPerBlock != 0);
+}
+
+//-----------------------------------------------------------
+
+template <typename Value, typename CMP>
+__global__ void cudaInitTask(Containers::ArrayView<TASK, Devices::Cuda> cuda_tasks,
+                             Containers::ArrayView<int, Devices::Cuda> cuda_blockToTaskMapping,
+                             Containers::VectorView<int, Devices::Cuda> cuda_reductionTaskInitMem,
+                             Containers::ArrayView<Value, Devices::Cuda> src, CMP Cmp)
+{
+    if (blockIdx.x >= cuda_tasks.getSize())
+        return;
+
+    int start = blockIdx.x == 0 ? 0 : cuda_reductionTaskInitMem[blockIdx.x - 1];
+    int end = cuda_reductionTaskInitMem[blockIdx.x];
+    for (int i = start + threadIdx.x; i < end; i += blockDim.x)
+        cuda_blockToTaskMapping[i] = blockIdx.x;
+
+    if (threadIdx.x == 0)
+    {
+        TASK &task = cuda_tasks[blockIdx.x];
+        int pivotIdx = task.partitionBegin + pickPivotIdx(src.getView(task.partitionBegin, task.partitionEnd), Cmp);
+        task.initTask(start, end - start, pivotIdx);
+    }
+}
+
+//----------------------------------------------------
+
+template <typename Value, typename CMP, bool useShared>
+__global__ void cudaQuickSort1stPhase(Containers::ArrayView<Value, Devices::Cuda> arr, Containers::ArrayView<Value, Devices::Cuda> aux,
+                                      const CMP &Cmp, int elemPerBlock,
+                                      Containers::ArrayView<TASK, Devices::Cuda> tasks,
+                                      Containers::ArrayView<int, Devices::Cuda> taskMapping)
+{
+    extern __shared__ int externMem[];
+    Value *piv = (Value *)externMem;
+    Value *sharedMem = piv + 1;
+
+    TASK &myTask = tasks[taskMapping[blockIdx.x]];
+    auto &src = (myTask.iteration & 1) == 0 ? arr : aux;
+    auto &dst = (myTask.iteration & 1) == 0 ? aux : arr;
+
+    if (threadIdx.x == 0)
+        *piv = src[myTask.pivotIdx];
+    __syncthreads();
+    Value &pivot = *piv;
+
+    cudaPartition<Value, CMP, useShared>(
+        src.getView(myTask.partitionBegin, myTask.partitionEnd),
+        dst.getView(myTask.partitionBegin, myTask.partitionEnd),
+        Cmp, sharedMem, pivot,
+        elemPerBlock, myTask);
+}
+
+//----------------------------------------------------
+
+template <typename Value>
+__global__ void cudaWritePivot(Containers::ArrayView<Value, Devices::Cuda> arr, Containers::ArrayView<Value, Devices::Cuda> aux, int maxElemFor2ndPhase,
+                               Containers::ArrayView<TASK, Devices::Cuda> tasks, Containers::ArrayView<TASK, Devices::Cuda> newTasks, int *newTasksCnt,
+                               Containers::ArrayView<TASK, Devices::Cuda> secondPhaseTasks, int *secondPhaseTasksCnt)
+{
+    extern __shared__ int externMem[];
+    Value *piv = (Value *)externMem;
+
+    TASK &myTask = tasks[blockIdx.x];
+
+    if (threadIdx.x == 0)
+        *piv = (myTask.iteration & 1) == 0 ? arr[myTask.pivotIdx] : aux[myTask.pivotIdx];
+    __syncthreads();
+    Value &pivot = *piv;
+
+    int leftBegin = myTask.partitionBegin, leftEnd = myTask.partitionBegin + myTask.dstBegin;
+    int rightBegin = myTask.partitionBegin + myTask.dstEnd, rightEnd = myTask.partitionEnd;
+
+    for (int i = leftEnd + threadIdx.x; i < rightBegin; i += blockDim.x)
+    {
+        /*
+        #ifdef DEBUG
+        aux[i] = -1;
+        #endif
+        */
+        arr[i] = pivot;
+    }
+
+    if (threadIdx.x != 0)
+        return;
+
+    if (leftEnd - leftBegin > 0)
+    {
+        writeNewTask(leftBegin, leftEnd, myTask.iteration,
+                     maxElemFor2ndPhase,
+                     newTasks, newTasksCnt,
+                     secondPhaseTasks, secondPhaseTasksCnt);
+    }
+
+    if (rightEnd - rightBegin > 0)
+    {
+        writeNewTask(rightBegin, rightEnd,
+                     myTask.iteration, maxElemFor2ndPhase,
+                     newTasks, newTasksCnt,
+                     secondPhaseTasks, secondPhaseTasksCnt);
+    }
+}
+
+//-----------------------------------------------------------
+
+__device__ void writeNewTask(int begin, int end, int iteration, int maxElemFor2ndPhase,
+                             Containers::ArrayView<TASK, Devices::Cuda> newTasks, int *newTasksCnt,
+                             Containers::ArrayView<TASK, Devices::Cuda> secondPhaseTasks, int *secondPhaseTasksCnt)
+{
+    int size = end - begin;
+    if (size < 0)
+    {
+        printf("negative size, something went really wrong\n");
+        return;
+    }
+
+    if (size == 0)
+        return;
+
+    if (size <= maxElemFor2ndPhase)
+    {
+        int idx = atomicAdd(secondPhaseTasksCnt, 1);
+        if (idx < secondPhaseTasks.getSize())
+            secondPhaseTasks[idx] = TASK(begin, end, iteration + 1);
+        else
+        {
+            //printf("ran out of memory, trying backup\n");
+            int idx = atomicAdd(newTasksCnt, 1);
+            if (idx < newTasks.getSize())
+                newTasks[idx] = TASK(begin, end, iteration + 1);
+            else
+                printf("ran out of memory for second phase task, there isnt even space in newTask list\nPart of array may stay unsorted!!!\n");
+        }
+    }
+    else
+    {
+        int idx = atomicAdd(newTasksCnt, 1);
+        if (idx < newTasks.getSize())
+            newTasks[idx] = TASK(begin, end, iteration + 1);
+        else
+        {
+            //printf("ran out of memory, trying backup\n");
+            int idx = atomicAdd(secondPhaseTasksCnt, 1);
+            if (idx < secondPhaseTasks.getSize())
+                secondPhaseTasks[idx] = TASK(begin, end, iteration + 1);
+            else
+                printf("ran out of memory for newtask, there isnt even space in second phase task list\nPart of array may stay unsorted!!!\n");
+        }
+    }
+}
+
+//-----------------------------------------------------------
+
+template <typename Value, typename CMP, int stackSize>
+__global__ void cudaQuickSort2ndPhase(Containers::ArrayView<Value, Devices::Cuda> arr, Containers::ArrayView<Value, Devices::Cuda> aux,
+                                      CMP Cmp,
+                                      Containers::ArrayView<TASK, Devices::Cuda> secondPhaseTasks,
+                                      int elemInShared, int maxBitonicSize)
+{
+    extern __shared__ int externMem[];
+    Value *sharedMem = (Value *)externMem;
+
+    TASK &myTask = secondPhaseTasks[blockIdx.x];
+    if (myTask.partitionEnd - myTask.partitionBegin <= 0)
+    {
+        //printf("empty task???\n");
+        return;
+    }
+
+    auto arrView = arr.getView(myTask.partitionBegin, myTask.partitionEnd);
+    auto auxView = aux.getView(myTask.partitionBegin, myTask.partitionEnd);
+
+    if (elemInShared == 0)
+    {
+        singleBlockQuickSort<Value, CMP, stackSize, false>(arrView, auxView, Cmp, myTask.iteration, sharedMem, 0, maxBitonicSize);
+    }
+    else
+    {
+        singleBlockQuickSort<Value, CMP, stackSize, true>(arrView, auxView, Cmp, myTask.iteration, sharedMem, elemInShared, maxBitonicSize);
+    }
+}
+
+template <typename Value, typename CMP, int stackSize>
+__global__ void cudaQuickSort2ndPhase(Containers::ArrayView<Value, Devices::Cuda> arr, Containers::ArrayView<Value, Devices::Cuda> aux,
+                                      CMP Cmp,
+                                      Containers::ArrayView<TASK, Devices::Cuda> secondPhaseTasks1,
+                                      Containers::ArrayView<TASK, Devices::Cuda> secondPhaseTasks2,
+                                      int elemInShared, int maxBitonicSize)
+{
+    extern __shared__ int externMem[];
+    Value *sharedMem = (Value *)externMem;
+
+    TASK myTask;
+    if (blockIdx.x < secondPhaseTasks1.getSize())
+        myTask = secondPhaseTasks1[blockIdx.x];
+    else
+        myTask = secondPhaseTasks2[blockIdx.x - secondPhaseTasks1.getSize()];
+
+    if (myTask.partitionEnd - myTask.partitionBegin <= 0)
+    {
+        //printf("empty task???\n");
+        return;
+    }
+
+    auto arrView = arr.getView(myTask.partitionBegin, myTask.partitionEnd);
+    auto auxView = aux.getView(myTask.partitionBegin, myTask.partitionEnd);
+
+    if (elemInShared <= 0)
+    {
+        singleBlockQuickSort<Value, CMP, stackSize, false>(arrView, auxView, Cmp, myTask.iteration, sharedMem, 0, maxBitonicSize);
+    }
+    else
+    {
+        singleBlockQuickSort<Value, CMP, stackSize, true>(arrView, auxView, Cmp, myTask.iteration, sharedMem, elemInShared, maxBitonicSize);
+    }
+}
+
+#endif
+
+      } // namespace Sorting
+   } // namespace Algorithms
+} // namespace TNL
diff --git a/src/TNL/Algorithms/Sorting/detail/reduction.h b/src/TNL/Algorithms/Sorting/detail/reduction.h
new file mode 100644
index 0000000000000000000000000000000000000000..e2bf148099e830bae96264ab4cda5faa5b964181
--- /dev/null
+++ b/src/TNL/Algorithms/Sorting/detail/reduction.h
@@ -0,0 +1,138 @@
+/***************************************************************************
+                          reduction.h  -  description
+                             -------------------
+    begin                : Jul 13, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Xuan Thang Nguyen
+
+#pragma once
+
+namespace TNL {
+    namespace Algorithms {
+        namespace Sorting {
+
+#ifdef HAVE_CUDA
+
+/**
+ * https://developer.nvidia.com/blog/faster-parallel-reductions-kepler/
+ * */
+
+
+__device__ int warpReduceSum(int initVal)
+{
+    const unsigned int maskConstant = 0xffffffff; //not used
+    for (unsigned int mask = warpSize / 2; mask > 0; mask >>= 1)
+        initVal += __shfl_xor_sync(maskConstant, initVal, mask);
+
+    return initVal;
+}
+
+__device__ int blockReduceSum(int val)
+{
+    static __shared__ int shared[32];
+    int lane = threadIdx.x & (warpSize - 1);
+    int wid = threadIdx.x / warpSize;
+
+    val = warpReduceSum(val);
+
+    if (lane == 0)
+        shared[wid] = val;
+    __syncthreads();
+
+    val = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
+
+    if (wid == 0)
+        val = warpReduceSum(val);
+
+    if(threadIdx.x == 0)
+        shared[0] = val;
+    __syncthreads();
+
+    return shared[0];
+}
+
+//-------------------------------------------------------------------------------
+
+__device__ int warpInclusivePrefixSum(int value)
+{
+    int laneId = threadIdx.x & (32-1);
+
+    #pragma unroll
+    for (int i = 1; i*2 <= 32; i *= 2)//32 here is warp size
+    {
+        int n = __shfl_up_sync(0xffffffff, value, i);
+        if ((laneId & (warpSize - 1)) >= i)
+            value += n;
+    }
+
+    return value;
+}
+
+__device__ int blockInclusivePrefixSum(int value)
+{
+    static __shared__ int shared[32];
+    int lane = threadIdx.x & (warpSize - 1);
+    int wid = threadIdx.x / warpSize;
+
+    int tmp = warpInclusivePrefixSum(value);
+
+    if (lane == warpSize-1)
+        shared[wid] = tmp;
+    __syncthreads();
+
+    int tmp2 = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
+    if (wid == 0)
+        shared[lane] = warpInclusivePrefixSum(tmp2) - tmp2;
+    __syncthreads();
+
+    tmp += shared[wid];
+    return tmp;
+}
+
+//--------------------------------------------------------------------
+
+template<typename Operator>
+__device__ int warpCmpReduce(int initVal, const Operator & Cmp)
+{
+    const unsigned int maskConstant = 0xffffffff; //not used
+    for (unsigned int mask = warpSize / 2; mask > 0; mask >>= 1)
+        initVal = Cmp(initVal, __shfl_xor_sync(maskConstant, initVal, mask));
+
+    return initVal;
+}
+
+template<typename Operator>
+__device__ int blockCmpReduce(int val, const Operator & Cmp)
+{
+    static __shared__ int shared[32];
+    int lane = threadIdx.x & (warpSize - 1);
+    int wid = threadIdx.x / warpSize;
+
+    val = warpCmpReduce(val, Cmp);
+
+    if (lane == 0)
+        shared[wid] = val;
+    __syncthreads();
+
+    val = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : shared[0];
+
+    if (wid == 0)
+        val = warpCmpReduce(val, Cmp);
+
+    if(threadIdx.x == 0)
+        shared[0] = val;
+    __syncthreads();
+
+    return shared[0];
+}
+
+#endif
+
+        } // namespace Sorting
+    } // namespace Algorithms
+} // namespace TNL
\ No newline at end of file
diff --git a/src/TNL/Algorithms/Sorting/detail/task.h b/src/TNL/Algorithms/Sorting/detail/task.h
new file mode 100644
index 0000000000000000000000000000000000000000..bf2130b2eac068877487321d3a1501a0a91e71fd
--- /dev/null
+++ b/src/TNL/Algorithms/Sorting/detail/task.h
@@ -0,0 +1,68 @@
+/***************************************************************************
+                          TASK.h  -  description
+                             -------------------
+    begin                : Jul 13, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Xuan Thang Nguyen
+
+#pragma once
+
+namespace TNL {
+    namespace Algorithms {
+        namespace Sorting {
+
+struct TASK
+{
+    //start and end position of array to read and write from
+    int partitionBegin, partitionEnd;
+    //-----------------------------------------------
+    //helper variables for blocks working on this task
+
+    int iteration;
+    int pivotIdx;
+    int dstBegin, dstEnd;
+    int firstBlock, blockCount;//for workers read only values
+
+    __cuda_callable__
+    TASK(int begin, int end, int iteration)
+        : partitionBegin(begin), partitionEnd(end),
+        iteration(iteration), pivotIdx(-1),
+        dstBegin(-151561), dstEnd(-151561),
+        firstBlock(-100), blockCount(-100)
+        {}
+
+    __cuda_callable__
+    void initTask(int firstBlock, int blocks, int pivotIdx)
+    {
+        dstBegin= 0; dstEnd = partitionEnd - partitionBegin;
+        this->firstBlock = firstBlock;
+        blockCount = blocks;
+        this->pivotIdx = pivotIdx;
+    }
+
+    __cuda_callable__
+    int getSize() const
+    {
+        return partitionEnd - partitionBegin;
+    }
+
+    TASK() = default;
+};
+
+std::ostream& operator<<(std::ostream & out, const TASK & task)
+{
+    out << "[ ";
+    out << task.partitionBegin << " - " << task.partitionEnd;
+    out << " | " << "iteration: " << task.iteration;
+    out << " | " << "pivotIdx: " << task.pivotIdx;
+    return out << " ] ";
+}
+
+        } // namespace Sorting
+    } // namespace Algorithms
+} // namespace TNL
\ No newline at end of file
diff --git a/src/TNL/Algorithms/sort.h b/src/TNL/Algorithms/sort.h
new file mode 100644
index 0000000000000000000000000000000000000000..131561760e3c655f4fd690187948d96e69297ea3
--- /dev/null
+++ b/src/TNL/Algorithms/sort.h
@@ -0,0 +1,217 @@
+/***************************************************************************
+                          sort.h  -  description
+                             -------------------
+    begin                : Jul 12, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Tomas Oberhuber, Xuan Thang Nguyen
+
+#pragma once
+
+#include <TNL/Algorithms/Sorting/DefaultSorter.h>
+
+namespace TNL {
+   namespace Algorithms {
+
+/**
+ * \brief Function for sorting elements of array or vector in ascending order.
+ *
+ * \tparam Array is a type of container to be sorted. It can be, for example, \ref TNL::Containers::Array, \ref TNL::Containers::ArrayView,
+ *    \ref TNL::Containers::Vector, \ref TNL::Containers::VectorView.
+ * \tparam Sorter is an algorithm for sorting. It can be, \ref TNL::Algorithms::Sorting::STLSort for sorting on host and \ref TNL::Algorithms::Sorting::Quicksort
+ *    or \ref TNL::Algorithms::Sorting::BitonicSort for sorting on CUDA GPU.
+ *
+ * \param array is an instance of array/array view/vector/vector view for sorting.
+ * \param sorter is an instance of sorter.
+ *
+ * \par Example
+ *
+ * \includelineno SortingExample.cpp
+ *
+ * \par Output
+ *
+ * \include SortingExample.out
+ *
+ */
+template< typename Array,
+          typename Sorter = typename Sorting::DefaultSorter< typename Array::DeviceType >::SorterType >
+void ascendingSort( Array& array, const Sorter& sorter = Sorter{} )
+{
+   using ValueType = typename Array::ValueType;
+   sorter.sort( array, [] __cuda_callable__ ( const ValueType& a, const ValueType& b ) { return a < b; } );
+}
+
+/**
+ * \brief Function for sorting elements of array or vector in descending order.
+ *
+ * \tparam Array is a type of container to be sorted. It can be, for example, \ref TNL::Containers::Array, \ref TNL::Containers::ArrayView,
+ *    \ref TNL::Containers::Vector, \ref TNL::Containers::VectorView.
+ * \tparam Sorter is an algorithm for sorting. It can be, \ref TNL::Algorithms::Sorting::STLSort for sorting on host and \ref TNL::Algorithms::Sorting::Quicksort
+ *    or \ref TNL::Algorithms::Sorting::BitonicSort for sorting on CUDA GPU.
+ *
+ * \param array is an instance of array/array view/vector/vector view for sorting.
+ * \param sorter is an instance of sorter.
+ *
+ * \par Example
+ *
+ * \includelineno SortingExample.cpp
+ *
+ * \par Output
+ *
+ * \include SortingExample.out
+ *
+ */
+template< typename Array,
+          typename Sorter = typename Sorting::DefaultSorter< typename Array::DeviceType >::SorterType >
+void descendingSort( Array& array, const Sorter& sorter = Sorter{} )
+{
+   using ValueType = typename Array::ValueType;
+   sorter.sort( array, [] __cuda_callable__ ( const ValueType& a, const ValueType& b ) { return a < b; } );
+}
+
+/**
+ * \brief Function for sorting elements of array or vector based on a user defined comparison lambda function.
+ *
+ * \tparam Array is a type of container to be sorted. It can be, for example, \ref TNL::Containers::Array, \ref TNL::Containers::ArrayView,
+ *    \ref TNL::Containers::Vector, \ref TNL::Containers::VectorView.
+ * \tparam Compare is a lambda function for comparing of two elements. It returns true if the first argument should be ordered before the second. The
+ *    lambda function is supposed to be defined as follows (`ValueType` is type of the array elements):
+ *  ```
+ *  auto compare = [] __cuda_callable__ ( const ValueType& a , const ValueType& b ) -> bool { return .... };
+ *  ```
+ * \tparam Sorter is an algorithm for sorting. It can be, \ref TNL::Algorithms::Sorting::STLSort for sorting on host and \ref TNL::Algorithms::Sorting::Quicksort
+ *    or \ref TNL::Algorithms::Sorting::BitonicSort for sorting on CUDA GPU.
+ *
+ * \param array is an instance of array/array view/vector/vector view for sorting.
+ * \param compare is an instance of the lambda function for comparison of two elements.
+ * \param sorter is an instance of sorter.
+ *
+ * \par Example
+ *
+ * \includelineno SortingExample2.cpp
+ *
+ * \par Output
+ *
+ * \include SortingExample2.out
+ *
+ */
+template< typename Array,
+          typename Compare,
+          typename Sorter = typename Sorting::DefaultSorter< typename Array::DeviceType >::SorterType >
+void sort( Array& array, const Compare& compare, const Sorter& sorter = Sorter{} )
+{
+   sorter.sort( array, compare );
+}
+
+/**
+ * \brief Function for general sorting based on lambda functions for comparison and swaping of two elements..
+ *
+ * \tparam Device is device on which the sorting algorithms should be executed.
+ * \tparam Index is type used for indexing of the sorted data.
+ * \tparam Compare is a lambda function for comparing of two elements. It returns true if the first argument should be ordered before the second - both are given
+ *  by indices representing their positions. The lambda function is supposed to be defined as follows:
+ *  ```
+ *  auto compare = [=] __cuda_callable__ ( const Index& a , const Index& b ) -> bool { return .... };
+ *  ```
+ * \tparam Swap is a lambda function for swaping of two elements which are ordered wrong way. Both elements are represented by indeces as well.  It supposed to
+ *  be defined as:
+ * ```
+ * auto swap = [=] __cuda_callable__ (  const Index& a , const Index& b ) mutable { swap( ....); };
+ * ```
+ * \tparam Sorter is an algorithm for sorting. It can be \ref TNL::Algorithms::Sorting::BitonicSort for sorting on CUDA GPU. Currently there is no algorithm for CPU :(.
+ *
+ * \param array is an instance of array/array view/vector/vector view for sorting.
+ * \param compare is an instance of the lambda function for comparison of two elements.
+ * \param sorter is an instance of sorter.
+ *
+ * \par Example
+ *
+ * \includelineno SortingExample3.cpp
+ *
+ * \par Output
+ *
+ * \include SortingExample3.out
+ *
+ */
+template< typename Device,
+          typename Index,
+          typename Compare,
+          typename Swap,
+          typename Sorter = typename Sorting::DefaultInplaceSorter< Device >::SorterType >
+void sort( const Index begin, const Index end, Compare&& compare, Swap&& swap, const Sorter& sorter = Sorter{} )
+{
+   sorter.template inplaceSort< Device, Index >( begin, end, compare, swap );
+}
+
+/**
+ * \brief Functions returning true if the array elements are sorted according to the lmabda function `comparison`.
+ *
+ * \tparam Array is the type of array/vector. It can be, for example,  \ref TNL::Containers::Array, \ref TNL::Containers::ArrayView,
+ *    \ref TNL::Containers::Vector, \ref TNL::Containers::VectorView.
+ * \tparam Compare is a lambda function for comparing of two elements. It returns true if the first argument should be ordered before the second - both are given
+ *  by indices representing their positions. The lambda function is supposed to be defined as follows:
+ *  ```
+ *  auto compare = [=] __cuda_callable__ ( const Index& a , const Index& b ) -> bool { return .... };
+ *  ```
+ * \param arr is an instance of tested array.
+ * \param compare is an instance of the lambda function for elements comparison.
+ * \param sorter is an instance of sorter.
+ *
+ * \return true if the array is sorted in ascending order.
+ * \return false  if the array is NOT sorted in ascending order.
+ */
+template< typename Array, typename Compare >
+bool isSorted( const Array& arr, const Compare& compare )
+{
+   using Device = typename Array::DeviceType;
+   if (arr.getSize() <= 1)
+      return true;
+
+   auto view = arr.getConstView();
+   auto fetch = [=] __cuda_callable__(int i) { return ! compare( view[ i ], view[ i - 1 ] ); };
+   auto reduction = [] __cuda_callable__(bool a, bool b) { return a && b; };
+   return TNL::Algorithms::reduce< Device >( 1, arr.getSize(), fetch, reduction, true );
+}
+
+/**
+ * \brief Functions returning true if the array elements are sorted in ascending order.
+ *
+ * \tparam Array is the type of array/vector. It can be, for example,  \ref TNL::Containers::Array, \ref TNL::Containers::ArrayView,
+ *    \ref TNL::Containers::Vector, \ref TNL::Containers::VectorView.
+ *
+ * \param arr is an instance of tested array.
+ *
+ * \return true if the array is sorted in ascending order.
+ * \return false  if the array is NOT sorted in ascending order.
+ */
+template< typename Array >
+bool isAscending( const Array& arr )
+{
+   using Value = typename Array::ValueType;
+   return isSorted( arr, [] __cuda_callable__( const Value& a, const Value& b ) { return a < b; });
+}
+
+/**
+ * \brief Functions returning true if the array elements are sorted in descending order.
+ *
+ * \tparam Array is the type of array/vector. It can be, for example,  \ref TNL::Containers::Array, \ref TNL::Containers::ArrayView,
+ *    \ref TNL::Containers::Vector, \ref TNL::Containers::VectorView.
+ *
+ * \param arr is an instance of tested array.
+ *
+ * \return true if the array is sorted in descending order.
+ * \return false  if the array is NOT sorted in descending order.
+ */
+template< typename Array >
+bool isDescending( const Array& arr)
+{
+   using Value = typename Array::ValueType;
+   return isSorted( arr, [] __cuda_callable__( const Value& a, const Value& b ) { return a > b; });
+}
+
+   } // namespace Algorithms
+} // namespace TNL
diff --git a/src/UnitTests/Algorithms/CMakeLists.txt b/src/UnitTests/Algorithms/CMakeLists.txt
index 14a7d43ab35a0ea9d5dc26fa8e571cef97dfbe1b..31028036b233748f5a5d57fa4784354265b52156 100644
--- a/src/UnitTests/Algorithms/CMakeLists.txt
+++ b/src/UnitTests/Algorithms/CMakeLists.txt
@@ -1,4 +1,5 @@
 ADD_SUBDIRECTORY( Segments )
+ADD_SUBDIRECTORY( Sorting )
 
 set( COMMON_TESTS
          MemoryOperationsTest
diff --git a/src/UnitTests/Algorithms/Sorting/BitonicSortTest.cpp b/src/UnitTests/Algorithms/Sorting/BitonicSortTest.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..6d48cf991585240295ca5eb23e5cd56259d9c3fc
--- /dev/null
+++ b/src/UnitTests/Algorithms/Sorting/BitonicSortTest.cpp
@@ -0,0 +1 @@
+#include "BitonicSortTest.h"
\ No newline at end of file
diff --git a/src/UnitTests/Algorithms/Sorting/BitonicSortTest.cu b/src/UnitTests/Algorithms/Sorting/BitonicSortTest.cu
new file mode 120000
index 0000000000000000000000000000000000000000..dfdfdf06da67288f999f4dab4cb72ab133a57a73
--- /dev/null
+++ b/src/UnitTests/Algorithms/Sorting/BitonicSortTest.cu
@@ -0,0 +1 @@
+BitonicSortTest.cpp
\ No newline at end of file
diff --git a/src/UnitTests/Algorithms/Sorting/BitonicSortTest.h b/src/UnitTests/Algorithms/Sorting/BitonicSortTest.h
new file mode 100644
index 0000000000000000000000000000000000000000..5838d8f6c84e8a9deed70a539f27b12260252a50
--- /dev/null
+++ b/src/UnitTests/Algorithms/Sorting/BitonicSortTest.h
@@ -0,0 +1,388 @@
+#include <vector>
+#include <algorithm>
+#include <numeric>
+#include <random>
+#include <iostream>
+
+#include <TNL/Containers/Array.h>
+#include <TNL/Algorithms/MemoryOperations.h>
+#include <TNL/Algorithms/Sorting/BitonicSort.h>
+#include <TNL/Algorithms/sort.h>
+
+#if defined HAVE_GTEST && defined HAVE_CUDA
+#include <gtest/gtest.h>
+
+
+using namespace TNL;
+using namespace TNL::Algorithms;
+using namespace TNL::Algorithms::Sorting;
+
+TEST(permutations, allPermutationSize_2_to_7)
+{
+    for(int i = 2; i<=7; i++ )
+    {
+        int size = i;
+        std::vector<int> orig(size);
+        std::iota(orig.begin(), orig.end(), 0);
+
+        do
+        {
+            TNL::Containers::Array<int, TNL::Devices::Cuda> cudaArr(orig);
+            auto view = cudaArr.getView();
+
+            BitonicSort::sort(view);
+
+            EXPECT_TRUE( Algorithms::isAscending( view ) ) << "failed " << i << std::endl;
+        }
+        while (std::next_permutation(orig.begin(), orig.end()));
+    }
+}
+
+TEST(permutations, allPermutationSize_8)
+{
+    int size = 9;
+    const int stride = 151;
+    int i = 0;
+
+    std::vector<int> orig(size);
+    std::iota(orig.begin(), orig.end(), 0);
+
+    do
+    {
+        if ((i++) % stride != 0)
+            continue;
+
+        TNL::Containers::Array<int, TNL::Devices::Cuda> cudaArr(orig);
+        auto view = cudaArr.getView();
+
+        BitonicSort::sort(view);
+
+        EXPECT_TRUE(Algorithms::isAscending(view)) << "result " << view << std::endl;
+    }
+    while (std::next_permutation(orig.begin(), orig.end()));
+}
+
+TEST(permutations, somePermutationSize9)
+{
+    int size = 9;
+    const int stride = 227;
+    int i = 0;
+
+    std::vector<int> orig(size);
+    std::iota(orig.begin(), orig.end(), 0);
+
+    do
+    {
+        if ((i++) % stride != 0)
+            continue;
+
+        TNL::Containers::Array<int, TNL::Devices::Cuda> cudaArr(orig);
+        auto view = cudaArr.getView();
+
+        BitonicSort::sort(view);
+
+        EXPECT_TRUE(Algorithms::isAscending(view)) << "result " << view << std::endl;
+    }
+    while (std::next_permutation(orig.begin(), orig.end()));
+}
+
+TEST(selectedSize, size15)
+{
+    TNL::Containers::Array<int, TNL::Devices::Cuda> cudaArr{5, 9, 4, 8, 6, 1, 2, 3, 4, 8, 1, 6, 9, 4, 9};
+    auto view = cudaArr.getView();
+    EXPECT_EQ(15, view.getSize()) << "size not 15" << std::endl;
+    BitonicSort::sort(view);
+    EXPECT_TRUE(Algorithms::isAscending(view)) << "result " << view << std::endl;
+}
+
+TEST(multiblock, 32768_decreasingNegative)
+{
+    std::vector<int> arr(1<<15);
+    for (size_t i = 0; i < arr.size(); i++)
+        arr[i] = -i;
+
+    TNL::Containers::Array<int, TNL::Devices::Cuda> cudaArr(arr);
+    auto view = cudaArr.getView();
+
+    BitonicSort::sort(view);
+    EXPECT_TRUE(Algorithms::isAscending(view)) << "result " << view << std::endl;
+}
+
+TEST(randomGenerated, smallArray_randomVal)
+{
+    std::srand(2006);
+    for(int i = 0; i < 100; i++)
+    {
+        std::vector<int> arr(std::rand()%(1<<10));
+        for(auto & x : arr)
+            x = std::rand();
+
+        TNL::Containers::Array<int, TNL::Devices::Cuda> cudaArr(arr);
+
+        auto view = cudaArr.getView();
+        BitonicSort::sort(view);
+        EXPECT_TRUE(Algorithms::isAscending(view));
+    }
+}
+
+TEST(randomGenerated, bigArray_all0)
+{
+    std::srand(304);
+    for(int i = 0; i < 50; i++)
+    {
+        int size = (1<<20) + (std::rand()% (1<<19));
+
+        TNL::Containers::Array<int, TNL::Devices::Cuda> cudaArr(size);
+
+        auto view = cudaArr.getView();
+        BitonicSort::sort(view);
+        EXPECT_TRUE(Algorithms::isAscending(view));
+    }
+}
+
+TEST(nonIntegerType, float_notPow2)
+{
+    TNL::Containers::Array<float, TNL::Devices::Cuda> cudaArr{5.0, 9.4, 4.6, 8.9, 6.2, 1.15184, 2.23};
+    auto view = cudaArr.getView();
+    BitonicSort::sort(view);
+    EXPECT_TRUE(Algorithms::isAscending(view)) << "result " << view << std::endl;
+}
+
+TEST(nonIntegerType, double_notPow2)
+{
+    TNL::Containers::Array<double, TNL::Devices::Cuda> cudaArr{5.0, 9.4, 4.6, 8.9, 6.2, 1.15184, 2.23};
+    auto view = cudaArr.getView();
+    BitonicSort::sort(view);
+    EXPECT_TRUE(Algorithms::isAscending(view)) << "result " << view << std::endl;
+}
+
+
+struct TMPSTRUCT{
+    uint8_t m_data[6];
+    __cuda_callable__ TMPSTRUCT(){m_data[0] = 0;}
+    __cuda_callable__ TMPSTRUCT(int first){m_data[0] = first;};
+    __cuda_callable__ bool operator <(const TMPSTRUCT& other) const { return m_data[0] < other.m_data[0];}
+    __cuda_callable__ TMPSTRUCT& operator =(const TMPSTRUCT& other) {m_data[0] = other.m_data[0]; return *this;}
+
+};
+
+TEST(nonIntegerType, struct)
+{
+    TNL::Containers::Array<TMPSTRUCT, TNL::Devices::Cuda> cudaArr{TMPSTRUCT(5), TMPSTRUCT(6), TMPSTRUCT(9), TMPSTRUCT(1)};
+    auto view = cudaArr.getView();
+    BitonicSort::sort(view);
+    EXPECT_TRUE(Algorithms::isAscending(view));
+}
+
+struct TMPSTRUCT_64b{
+    uint8_t m_data[64];
+    __cuda_callable__ TMPSTRUCT_64b(){m_data[0] = 0;}
+    __cuda_callable__ TMPSTRUCT_64b(int first){m_data[0] = first;};
+    __cuda_callable__ bool operator <(const TMPSTRUCT_64b& other) const { return m_data[0] < other.m_data[0];}
+    __cuda_callable__ TMPSTRUCT_64b& operator =(const TMPSTRUCT_64b& other) {m_data[0] = other.m_data[0]; return *this;}
+};
+
+TEST(nonIntegerType, struct_64b)
+{
+    std::srand(61513);
+    int size = std::rand() % (1<<15);
+    std::vector<TMPSTRUCT_64b> vec(size);
+    for(auto & x : vec)
+        x = TMPSTRUCT_64b(std::rand());
+
+    TNL::Containers::Array<TMPSTRUCT_64b, TNL::Devices::Cuda> cudaArr(vec);
+    auto view = cudaArr.getView();
+    BitonicSort::sort(view);
+    EXPECT_TRUE(Algorithms::isAscending(view));
+}
+
+struct TMPSTRUCT_128b{
+    uint8_t m_data[128];
+    __cuda_callable__ TMPSTRUCT_128b(){m_data[0] = 0;}
+    __cuda_callable__ TMPSTRUCT_128b(int first){m_data[0] = first;};
+    __cuda_callable__ bool operator <(const TMPSTRUCT_128b& other) const { return m_data[0] < other.m_data[0];}
+    __cuda_callable__ TMPSTRUCT_128b& operator =(const TMPSTRUCT_128b& other) {m_data[0] = other.m_data[0]; return *this;}
+};
+
+TEST(nonIntegerType, struct_128b)
+{
+    std::srand(98451);
+    int size = std::rand() % (1<<14);
+    std::vector<TMPSTRUCT_128b> vec(size);
+    for(auto & x : vec)
+        x = TMPSTRUCT_128b(std::rand());
+
+    TNL::Containers::Array<TMPSTRUCT_128b, TNL::Devices::Cuda> cudaArr(vec);
+    auto view = cudaArr.getView();
+    BitonicSort::sort(view);
+    EXPECT_TRUE(Algorithms::isAscending(view));
+}
+
+//error bypassing
+//https://mmg-gitlab.fjfi.cvut.cz/gitlab/tnl/tnl-dev/blob/fbc34f6a97c13ec865ef7969b9704533222ed408/src/UnitTests/Containers/VectorTest-8.h
+void descendingSort(TNL::Containers::ArrayView<int, TNL::Devices::Cuda> view)
+{
+    auto cmpDescending = [] __cuda_callable__ (int a, int b) {return a > b;};
+    bitonicSort(view, cmpDescending);
+}
+
+TEST(sortWithFunction, descending)
+{
+    TNL::Containers::Array<int, TNL::Devices::Cuda> cudaArr{6, 9, 4, 2, 3};
+    auto view = cudaArr.getView();
+    descendingSort(view);
+
+    EXPECT_FALSE(Algorithms::isAscending(view)) << "result " << view << std::endl;
+
+    EXPECT_TRUE(view.getElement(0) == 9);
+    EXPECT_TRUE(view.getElement(1) == 6);
+    EXPECT_TRUE(view.getElement(2) == 4);
+    EXPECT_TRUE(view.getElement(3) == 3);
+    EXPECT_TRUE(view.getElement(4) == 2);
+}
+
+/*TEST(sortHostArray, hostArray)
+{
+    TNL::Containers::Array< int > arr( 84561 );
+    for( size_t i = 0; i < arr.getSize(); i++ )
+        arr[i] = -i;
+
+    bitonicSort(arr);
+
+    EXPECT_TRUE( TNL::Algorithms::isAscending(arr) );
+}*/
+
+/*TEST(sortRange, secondHalf)
+{
+    std::vector<int> arr(19);
+    int s = 19/2;
+    for(size_t i = 0; i < s; i++) arr[i] = -1;
+    for(size_t i = s; i < 19; i++) arr[i] = -i;
+
+    bitonicSort(arr, s, 19);
+
+    EXPECT_TRUE(TNL::Algorithms::isAscending(arr.begin() + s, arr.end()));
+    EXPECT_TRUE(arr[0] == -1);
+    EXPECT_TRUE(arr[s-1] == -1);
+}
+
+TEST(sortRange, middle)
+{
+    std::srand(8705);
+
+    std::vector<int> arr(20);
+    int s = 5, e = 15;
+    for(size_t i = 0; i < s; i++) arr[i] = -1;
+    for(size_t i = e; i < 20; i++) arr[i] = -1;
+
+    for(size_t i = s; i < e; i++) arr[i] = std::rand();
+
+    bitonicSort(arr, s, e);
+
+    EXPECT_TRUE(TNL::Algorithms::isAscending(arr.begin() + s, arr.begin() + e));
+    EXPECT_TRUE(arr[0] == -1);
+    EXPECT_TRUE(arr.back() == -1);
+    EXPECT_TRUE(arr[s-1] == -1);
+    EXPECT_TRUE(arr[e] == -1);
+}
+
+TEST(sortRange, middleMultiBlock)
+{
+    std::srand(4513);
+    int size = 1<<20;
+    int s = 2000, e = size - 1512;
+
+    std::vector<int> arr(size);
+    for(size_t i = 0; i < s; i++) arr[i] = -1;
+    for(size_t i = e; i < size; i++) arr[i] = -1;
+
+    for(size_t i = s; i < e; i++) arr[i] = std::rand();
+
+    bitonicSort(arr, s, e);
+
+    EXPECT_TRUE(TNL::Algorithms::isAscending(arr.begin() + s, arr.begin() + e));
+
+    EXPECT_TRUE(arr[0] == -1);
+    EXPECT_TRUE(arr[std::rand() % s] == -1);
+    EXPECT_TRUE(arr[s-1] == -1);
+
+    EXPECT_TRUE(arr[e] == -1);
+    EXPECT_TRUE(arr[e + (std::rand() % (size - e))] == -1);
+    EXPECT_TRUE(arr.back() == -1);
+}*/
+
+template<typename TYPE>
+void fetchAndSwapSorter(TNL::Containers::ArrayView<TYPE, TNL::Devices::Cuda> view)
+{
+    //auto Fetch = [=]__cuda_callable__(int i){return view[i];};
+    auto Cmp = [=]__cuda_callable__(const int i, const int j ){return view[ i ]  < view[ j ];};
+    auto Swap = [=] __cuda_callable__ (int i, int j) mutable {TNL::swap(view[i], view[j]);};
+    bitonicSort(0, view.getSize(), Cmp, Swap);
+}
+
+TEST(fetchAndSwap, oneBlockSort)
+{
+    int size = 9;
+    const int stride = 227;
+    int i = 0;
+
+    std::vector<int> orig(size);
+    std::iota(orig.begin(), orig.end(), 0);
+
+    do
+    {
+        if ((i++) % stride != 0)
+            continue;
+
+        TNL::Containers::Array<int, TNL::Devices::Cuda> cudaArr(orig);
+        auto view = cudaArr.getView();
+        fetchAndSwapSorter(view);
+        EXPECT_TRUE(Algorithms::isAscending(view)) << "result " << view << std::endl;
+    }
+    while (std::next_permutation(orig.begin(), orig.end()));
+}
+
+TEST(fetchAndSwap, typeDouble)
+{
+    int size = 5;
+    std::vector<double> orig(size);
+    std::iota(orig.begin(), orig.end(), 0);
+
+    do
+    {
+        TNL::Containers::Array<double, TNL::Devices::Cuda> cudaArr(orig);
+        auto view = cudaArr.getView();
+        fetchAndSwapSorter(view);
+        EXPECT_TRUE(Algorithms::isAscending(view)) << "result " << view << std::endl;
+    }
+    while (std::next_permutation(orig.begin(), orig.end()));
+}
+
+void fetchAndSwap_sortMiddle(TNL::Containers::ArrayView<int, TNL::Devices::Cuda> view, int from, int to)
+{
+    //auto Fetch = [=]__cuda_callable__(int i){return view[i];};
+    auto Cmp = [=]__cuda_callable__(const int i, const int j ){ return view[ i ] < view[ j ]; };
+    auto Swap = [=] __cuda_callable__ (int i, int j) mutable { TNL::swap(view[i], view[j]); };
+    bitonicSort(from, to, Cmp, Swap);
+}
+
+TEST(fetchAndSwap, sortMiddle)
+{
+    std::vector<int> orig{5, 9, 4, 54, 21, 6, 7, 9, 0, 9, 42, 4};
+    TNL::Containers::Array<int, TNL::Devices::Cuda> cudaArr(orig);
+    auto view = cudaArr.getView();
+    size_t from = 3, to = 8;
+
+    fetchAndSwap_sortMiddle(view, from, to);
+    EXPECT_TRUE(Algorithms::isAscending(view.getView(3, 8))) << "result " << view << std::endl;
+
+    for(size_t i = 0; i < orig.size(); i++)
+    {
+        if(i < from || i >= to)
+            EXPECT_TRUE(view.getElement(i) == orig[i]);
+    }
+}
+
+#endif
+
+#include "../../main.h"
diff --git a/src/UnitTests/Algorithms/Sorting/BubbleSortTest.cpp b/src/UnitTests/Algorithms/Sorting/BubbleSortTest.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..2a2bc2de9733435d3ff01f862591c48eab050336
--- /dev/null
+++ b/src/UnitTests/Algorithms/Sorting/BubbleSortTest.cpp
@@ -0,0 +1 @@
+#include "BubbleSortTest.h"
\ No newline at end of file
diff --git a/src/UnitTests/Algorithms/Sorting/BubbleSortTest.cu b/src/UnitTests/Algorithms/Sorting/BubbleSortTest.cu
new file mode 120000
index 0000000000000000000000000000000000000000..83313710f089e2c88bc68831abab3bec6ddb26e5
--- /dev/null
+++ b/src/UnitTests/Algorithms/Sorting/BubbleSortTest.cu
@@ -0,0 +1 @@
+BubbleSortTest.cpp
\ No newline at end of file
diff --git a/src/UnitTests/Algorithms/Sorting/BubbleSortTest.h b/src/UnitTests/Algorithms/Sorting/BubbleSortTest.h
new file mode 100644
index 0000000000000000000000000000000000000000..5e527794e85db043fdcaa02af4d057c0bf37cf5d
--- /dev/null
+++ b/src/UnitTests/Algorithms/Sorting/BubbleSortTest.h
@@ -0,0 +1,95 @@
+#include <vector>
+#include <algorithm>
+#include <numeric>
+#include <random>
+#include <iostream>
+
+#include <TNL/Containers/Array.h>
+#include <TNL/Algorithms/MemoryOperations.h>
+#include <TNL/Algorithms/Sorting/BubbleSort.h>
+#include <TNL/Algorithms/sort.h>
+
+#if defined HAVE_GTEST
+#include <gtest/gtest.h>
+
+
+using namespace TNL;
+using namespace TNL::Algorithms;
+using namespace TNL::Algorithms::Sorting;
+
+template<typename TYPE>
+void fetchAndSwapSorter( TNL::Containers::ArrayView< TYPE, TNL::Devices::Host > view)
+{
+    auto Cmp = [=]__cuda_callable__(const int i, const int j ){return view[ i ]  < view[ j ];};
+    auto Swap = [=] __cuda_callable__ (int i, int j) mutable {TNL::swap(view[i], view[j]);};
+    BubbleSort::inplaceSort< TNL::Devices::Host >(0, view.getSize(), Cmp, Swap);
+}
+
+TEST(fetchAndSwap, oneBlockSort)
+{
+    int size = 9;
+    const int stride = 227;
+    int i = 0;
+
+    std::vector<int> orig(size);
+    std::iota(orig.begin(), orig.end(), 0);
+
+    do
+    {
+        if ((i++) % stride != 0)
+            continue;
+
+        TNL::Containers::Array<int, TNL::Devices::Host> arr(orig);
+        auto view = arr.getView();
+        fetchAndSwapSorter(view);
+        EXPECT_TRUE(Algorithms::isAscending(view)) << "result " << view << std::endl;
+    }
+    while (std::next_permutation(orig.begin(), orig.end()));
+}
+
+TEST(fetchAndSwap, typeDouble)
+{
+    int size = 5;
+    std::vector<double> orig(size);
+    std::iota(orig.begin(), orig.end(), 0);
+
+    do
+    {
+        TNL::Containers::Array<double, TNL::Devices::Host> arr(orig);
+        auto view = arr.getView();
+        fetchAndSwapSorter(view);
+        EXPECT_TRUE(Algorithms::isAscending(view)) << "result " << view << std::endl;
+    }
+    while (std::next_permutation(orig.begin(), orig.end()));
+}
+
+void fetchAndSwap_sortMiddle(TNL::Containers::ArrayView<int, TNL::Devices::Host> view, int from, int to)
+{
+    //auto Fetch = [=]__cuda_callable__(int i){return view[i];};
+    auto Cmp = [=]__cuda_callable__(const int i, const int j ){ return view[ i ] < view[ j ]; };
+    auto Swap = [=] __cuda_callable__ (int i, int j) mutable { TNL::swap(view[i], view[j]); };
+    BubbleSort::inplaceSort< TNL::Devices::Host >(from, to, Cmp, Swap);
+}
+
+TEST(fetchAndSwap, sortMiddle)
+{
+    std::vector<int> orig{5, 9, 4, 54, 21, 6, 7, 9, 0, 9, 42, 4};
+    TNL::Containers::Array<int, TNL::Devices::Host> arr(orig);
+    auto view = arr.getView();
+    size_t from = 3, to = 8;
+
+    fetchAndSwap_sortMiddle(view, from, to);
+    EXPECT_TRUE(Algorithms::isAscending(view.getView(3, 8))) << "result " << view << std::endl;
+
+    for(size_t i = 0; i < orig.size(); i++)
+    {
+        if( i < from || i >= to )
+        {
+            EXPECT_TRUE(view.getElement(i) == orig[i]);
+        }
+    }
+}
+
+#endif
+
+#include "../../main.h"
diff --git a/src/UnitTests/Algorithms/Sorting/CMakeLists.txt b/src/UnitTests/Algorithms/Sorting/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..56fca2216ea760113c2d4cdf0e39e6a5bcf4454e
--- /dev/null
+++ b/src/UnitTests/Algorithms/Sorting/CMakeLists.txt
@@ -0,0 +1,28 @@
+set( COMMON_TESTS
+         BitonicSortTest
+         BubbleSortTest
+         QuicksortTest
+)
+
+set( CPP_TESTS )
+set( CUDA_TESTS )
+if( BUILD_CUDA )
+   set( CUDA_TESTS  ${CUDA_TESTS} ${COMMON_TESTS} )
+else()
+   set( CPP_TESTS  ${CPP_TESTS} ${COMMON_TESTS} )
+endif()
+
+foreach( target IN ITEMS ${CPP_TESTS} )
+   add_executable( ${target} ${target}.cpp )
+   target_compile_options( ${target} PRIVATE ${CXX_TESTS_FLAGS} )
+   target_link_libraries( ${target} ${GTEST_BOTH_LIBRARIES} )
+   add_test( ${target} ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${target}${CMAKE_EXECUTABLE_SUFFIX} )
+endforeach()
+
+if( BUILD_CUDA )
+   foreach( target IN ITEMS ${CUDA_TESTS} )
+      cuda_add_executable( ${target} ${target}.cu OPTIONS ${CXX_TESTS_FLAGS} )
+      target_link_libraries( ${target} ${GTEST_BOTH_LIBRARIES} )
+      add_test( ${target} ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${target}${CMAKE_EXECUTABLE_SUFFIX} )
+   endforeach()
+endif()
diff --git a/src/UnitTests/Algorithms/Sorting/QuicksortTest.cpp b/src/UnitTests/Algorithms/Sorting/QuicksortTest.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..04be4c54ac81ee606bb0556011149c95c9dc5938
--- /dev/null
+++ b/src/UnitTests/Algorithms/Sorting/QuicksortTest.cpp
@@ -0,0 +1 @@
+#include "QuicksortTest.h"
\ No newline at end of file
diff --git a/src/UnitTests/Algorithms/Sorting/QuicksortTest.cu b/src/UnitTests/Algorithms/Sorting/QuicksortTest.cu
new file mode 120000
index 0000000000000000000000000000000000000000..d6099b8e1a3267e1f28a9f9b219bc54cfee8c522
--- /dev/null
+++ b/src/UnitTests/Algorithms/Sorting/QuicksortTest.cu
@@ -0,0 +1 @@
+QuicksortTest.cpp
\ No newline at end of file
diff --git a/src/UnitTests/Algorithms/Sorting/QuicksortTest.h b/src/UnitTests/Algorithms/Sorting/QuicksortTest.h
new file mode 100644
index 0000000000000000000000000000000000000000..b84bcb78a59792438abf7f16c4a44c7a5566dc8e
--- /dev/null
+++ b/src/UnitTests/Algorithms/Sorting/QuicksortTest.h
@@ -0,0 +1,203 @@
+#include <vector>
+#include <algorithm>
+#include <numeric>
+#include <random>
+
+
+#include <TNL/Containers/Array.h>
+#include <TNL/Algorithms/MemoryOperations.h>
+#include <TNL/Algorithms/sort.h>
+#include <TNL/Algorithms/Sorting/Quicksort.h>
+
+#if defined HAVE_CUDA && defined HAVE_GTEST
+#include <thrust/sort.h>
+#include <thrust/execution_policy.h>
+
+#include <gtest/gtest.h>
+
+using namespace TNL;
+using namespace TNL::Algorithms;
+using namespace TNL::Algorithms::Sorting;
+
+TEST(selectedSize, size15)
+{
+   TNL::Containers::Array<int, TNL::Devices::Cuda> cudaArr{5, 9, 4, 8, 6, 1, 2, 3, 4, 8, 1, 6, 9, 4, 9};
+   auto view = cudaArr.getView();
+   EXPECT_EQ(15, view.getSize()) << "size not 15" << std::endl;
+   Quicksort::sort( view );
+   EXPECT_TRUE(Algorithms::isAscending(view)) << "result " << view << std::endl;
+}
+
+TEST(multiblock, 32768_decreasingNegative)
+{
+   std::vector<int> arr(1<<15);
+   for (size_t i = 0; i < arr.size(); i++)
+      arr[i] = -i;
+
+   TNL::Containers::Array<int, TNL::Devices::Cuda> cudaArr(arr);
+   auto view = cudaArr.getView();
+   Quicksort::sort( view );
+
+   EXPECT_TRUE(Algorithms::isAscending(view)) << "result " << view << std::endl;
+}
+
+TEST(randomGenerated, smallArray_randomVal)
+{
+   std::srand(2006);
+   for(int i = 0; i < 100; i++)
+   {
+      std::vector<int> arr(std::rand()%(1<<10));
+      for(auto & x : arr)
+         x = std::rand();
+
+      TNL::Containers::Array<int, TNL::Devices::Cuda> cudaArr(arr);
+      auto view = cudaArr.getView();
+      Quicksort::sort( view );
+
+      EXPECT_TRUE(Algorithms::isAscending(view));
+    }
+}
+
+TEST(randomGenerated, bigArray_randomVal)
+{
+   std::srand(304);
+   for(int i = 0; i < 50; i++)
+   {
+      int size = (1<<20) + (std::rand()% (1<<19));
+      std::vector<int> arr(size);
+      for(auto & x : arr) x = std::rand();
+      TNL::Containers::Array<int, TNL::Devices::Cuda> cudaArr(arr);
+
+      auto view = cudaArr.getView();
+      Quicksort::sort( view );
+      EXPECT_TRUE(Algorithms::isAscending(view));
+    }
+}
+
+TEST(noLostElement, smallArray)
+{
+   std::srand(9151);
+
+   int size = (1<<7);
+   std::vector<int> arr(size);
+   for(auto & x : arr) x = std::rand();
+
+   TNL::Containers::Array<int, TNL::Devices::Cuda> cudaArr(arr);
+   auto view = cudaArr.getView();
+   Quicksort::sort( view );
+
+   std::sort(arr.begin(), arr.end());
+   TNL::Containers::Array<int, TNL::Devices::Cuda> cudaArr2(arr);
+   EXPECT_TRUE(view == cudaArr2.getView());
+}
+
+TEST(noLostElement, midSizedArray)
+{
+   std::srand(91503);
+
+   int size = (1<<15);
+   std::vector<int> arr(size);
+   for(auto & x : arr) x = std::rand();
+
+   TNL::Containers::Array<int, TNL::Devices::Cuda> cudaArr(arr);
+   auto view = cudaArr.getView();
+   Quicksort::sort( view );
+
+   std::sort(arr.begin(), arr.end());
+   TNL::Containers::Array<int, TNL::Devices::Cuda> cudaArr2(arr);
+   EXPECT_TRUE(view == cudaArr2.getView());
+}
+
+TEST(noLostElement, bigSizedArray)
+{
+   std::srand(15611);
+
+   int size = (1<<22);
+   std::vector<int> arr(size);
+   for(auto & x : arr) x = std::rand();
+   for(int i = 0; i < 10000; i++)
+      arr[std::rand() % arr.size()] = (1<<10);
+
+   TNL::Containers::Array<int, TNL::Devices::Cuda> cudaArr(arr);
+   auto view = cudaArr.getView();
+   Quicksort::sort( view );
+
+   TNL::Containers::Array<int, TNL::Devices::Cuda> cudaArr2(arr);
+   thrust::sort(thrust::device, cudaArr2.getData(), cudaArr2.getData() + cudaArr2.getSize());
+   EXPECT_TRUE(view == cudaArr2.getView());
+}
+
+TEST(types, type_double)
+{
+   std::srand(8451);
+
+   int size = (1<<16);
+   std::vector<double> arr(size);
+   for(auto & x : arr) x = std::rand();
+   for(int i = 0; i < 10000; i++)
+      arr[std::rand() % arr.size()] = (1<<10);
+
+   TNL::Containers::Array<double, TNL::Devices::Cuda> cudaArr(arr);
+   auto view = cudaArr.getView();
+   Quicksort::sort( view );
+
+   TNL::Containers::Array<double, TNL::Devices::Cuda> cudaArr2(arr);
+   thrust::sort(thrust::device, cudaArr2.getData(), cudaArr2.getData() + cudaArr2.getSize());
+   EXPECT_TRUE(view == cudaArr2.getView());
+}
+
+struct TMPSTRUCT_xyz{
+   double x, y, z;
+   __cuda_callable__ TMPSTRUCT_xyz(): x(0){}
+   __cuda_callable__ TMPSTRUCT_xyz(int first){x = first;};
+   __cuda_callable__ bool operator <(const TMPSTRUCT_xyz& other) const { return x< other.x;}
+   __cuda_callable__ TMPSTRUCT_xyz& operator =(const TMPSTRUCT_xyz& other) {x = other.x; return *this;}
+};
+std::ostream & operator<<(std::ostream & out, const TMPSTRUCT_xyz & data){return out << data.x;}
+
+TEST(types, struct_3D_points)
+{
+   std::srand(46151);
+
+   int size = (1<<18);
+   std::vector<TMPSTRUCT_xyz> arr(size);
+   for(auto & x : arr) x = TMPSTRUCT_xyz(std::rand());
+
+   TNL::Containers::Array<TMPSTRUCT_xyz, TNL::Devices::Cuda> cudaArr(arr);
+   auto view = cudaArr.getView();
+   //thrust::sort(thrust::device, cudaArr.getData(), cudaArr.getData() + cudaArr.getSize());
+   //std::cout << view << std::endl;
+   Quicksort::sort( view );
+
+   EXPECT_TRUE(Algorithms::isAscending(view));
+}
+
+struct TMPSTRUCT_64b{
+   uint8_t m_Data[64];
+   __cuda_callable__ TMPSTRUCT_64b() {m_Data[0] = 0;}
+   __cuda_callable__ TMPSTRUCT_64b(int first){m_Data[0] = first;};
+   __cuda_callable__ bool operator <(const TMPSTRUCT_64b& other) const { return m_Data[0]< other.m_Data[0];}
+   __cuda_callable__ TMPSTRUCT_64b& operator =(const TMPSTRUCT_64b& other) {m_Data[0] = other.m_Data[0]; return *this;}
+};
+std::ostream & operator<<(std::ostream & out, const TMPSTRUCT_64b & data){return out << (unsigned) data.m_Data[0];}
+
+TEST(types, struct_64b)
+{
+   std::srand(96);
+
+   int size = (1<<18);
+   std::vector<TMPSTRUCT_64b> arr(size);
+   for(auto & x : arr) x = TMPSTRUCT_64b(std::rand() % 512);
+
+   TNL::Containers::Array<TMPSTRUCT_64b, TNL::Devices::Cuda> cudaArr(arr);
+   auto view = cudaArr.getView();
+   //thrust::sort(thrust::device, cudaArr.getData(), cudaArr.getData() + cudaArr.getSize());
+   //std::cout << view << std::endl;
+   Quicksort::sort( view );
+
+   EXPECT_TRUE(Algorithms::isAscending(view));
+}
+
+#endif
+
+#include "../../main.h"