Commit 070391be
authored 3 years ago by Xuan Thang Nguyen

calc sharedMem for 1st phase

parent 7e72c87e
Showing 1 changed file with 204 additions and 126 deletions

src/quicksort/quicksort.cuh  (+204 −126)
@@ -19,11 +19,11 @@ using namespace TNL::Containers;
 //-----------------------------------------------------------
 __device__ void writeNewTask(int begin, int end, int depth, int maxElemFor2ndPhase,
                              ArrayView<TASK, Devices::Cuda> newTasks, int *newTasksCnt,
                              ArrayView<TASK, Devices::Cuda> secondPhaseTasks, int *secondPhaseTasksCnt)
 {
     int size = end - begin;
     if (size < 0)
     {
         printf("negative size, something went really wrong\n");
         return;
@@ -68,18 +68,18 @@ __device__ void writeNewTask(int begin, int end, int depth, int maxElemFor2ndPha
 template <typename Value, typename Function>
 __global__ void cudaQuickSort1stPhase_1(ArrayView<Value, Devices::Cuda> arr, ArrayView<Value, Devices::Cuda> aux,
                                         const Function &Cmp, int elemPerBlock,
                                         ArrayView<TASK, Devices::Cuda> tasks, ArrayView<int, Devices::Cuda> taskMapping)
 {
     extern __shared__ int externMem[];
     Value *sharedMem = (Value *)externMem;
     static __shared__ Value pivot;

     TASK &myTask = tasks[taskMapping[blockIdx.x]];
     auto &src = (myTask.depth & 1) == 0 ? arr : aux;
     auto &dst = (myTask.depth & 1) == 0 ? aux : arr;

     if (threadIdx.x == 0)
         pivot = src[myTask.pivotIdx];
@@ -94,15 +94,15 @@ __global__ void cudaQuickSort1stPhase_1(ArrayView<Value, Devices::Cuda> arr, Arr
 template <typename Value, typename Function>
 __global__ void cudaQuickSort1stPhase_2(ArrayView<Value, Devices::Cuda> arr, ArrayView<Value, Devices::Cuda> aux,
                                         const Function &Cmp, int elemPerBlock,
                                         ArrayView<TASK, Devices::Cuda> tasks, ArrayView<int, Devices::Cuda> taskMapping)
 {
     static __shared__ Value pivot;

     TASK &myTask = tasks[taskMapping[blockIdx.x]];
     auto &src = (myTask.depth & 1) == 0 ? arr : aux;
     auto &dst = (myTask.depth & 1) == 0 ? aux : arr;

     if (threadIdx.x == 0)
         pivot = src[myTask.pivotIdx];
@@ -116,7 +116,7 @@ __global__ void cudaQuickSort1stPhase_2(ArrayView<Value, Devices::Cuda> arr, Arr
 //----------------------------------------------------
 template <typename Value>
 __global__ void cudaWritePivot(ArrayView<Value, Devices::Cuda> arr, ArrayView<Value, Devices::Cuda> aux, int maxElemFor2ndPhase,
                                ArrayView<TASK, Devices::Cuda> tasks, ArrayView<TASK, Devices::Cuda> newTasks, int *newTasksCnt,
                                ArrayView<TASK, Devices::Cuda> secondPhaseTasks, int *secondPhaseTasksCnt)
@@ -149,20 +149,20 @@ __global__ void cudaWritePivot(ArrayView<Value, Devices::Cuda> arr, ArrayView<Va
     if (threadIdx.x != 0)
         return;

     if (leftEnd - leftBegin > 0)
     {
         writeNewTask(leftBegin, leftEnd, myTask.depth, maxElemFor2ndPhase,
                      newTasks, newTasksCnt, secondPhaseTasks, secondPhaseTasksCnt);
     }

     if (rightEnd - rightBegin > 0)
     {
         writeNewTask(rightBegin, rightEnd, myTask.depth, maxElemFor2ndPhase,
                      newTasks, newTasksCnt, secondPhaseTasks, secondPhaseTasksCnt);
     }
 }
@@ -174,7 +174,7 @@ __global__ void cudaQuickSort2ndPhase(ArrayView<Value, Devices::Cuda> arr, Array
                                       ArrayView<TASK, Devices::Cuda> secondPhaseTasks)
 {
     TASK &myTask = secondPhaseTasks[blockIdx.x];
     if (myTask.partitionEnd - myTask.partitionBegin <= 0)
         return;

     auto arrView = arr.getView(myTask.partitionBegin, myTask.partitionEnd);
@@ -183,7 +183,6 @@ __global__ void cudaQuickSort2ndPhase(ArrayView<Value, Devices::Cuda> arr, Array
     singleBlockQuickSort<Value, Function, stackSize>(arrView, auxView, Cmp, myTask.depth);
 }

 template <typename Value, typename Function, int stackSize>
 __global__ void cudaQuickSort2ndPhase(ArrayView<Value, Devices::Cuda> arr, ArrayView<Value, Devices::Cuda> aux,
                                       const Function &Cmp,
@@ -191,12 +190,12 @@ __global__ void cudaQuickSort2ndPhase(ArrayView<Value, Devices::Cuda> arr, Array
                                       ArrayView<TASK, Devices::Cuda> secondPhaseTasks2)
 {
     TASK myTask;
     if (blockIdx.x < secondPhaseTasks1.getSize())
         myTask = secondPhaseTasks1[blockIdx.x];
     else
         myTask = secondPhaseTasks2[blockIdx.x - secondPhaseTasks1.getSize()];

     if (myTask.partitionEnd - myTask.partitionBegin <= 0)
         return;

     auto arrView = arr.getView(myTask.partitionBegin, myTask.partitionEnd);
@@ -208,16 +207,16 @@ __global__ void cudaQuickSort2ndPhase(ArrayView<Value, Devices::Cuda> arr, Array
 //-----------------------------------------------------------
 __global__ void cudaCalcBlocksNeeded(ArrayView<TASK, Devices::Cuda> cuda_tasks, int elemPerBlock,
                                      ArrayView<int, Devices::Cuda> blocksNeeded)
 {
     int i = blockIdx.x * blockDim.x + threadIdx.x;
     if (i >= cuda_tasks.getSize())
         return;

     auto task = cuda_tasks[i];
     int size = task.partitionEnd - task.partitionBegin;
     blocksNeeded[i] = size / elemPerBlock + (size % elemPerBlock != 0);
 }
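Aside: the per-task block count above is plain integer ceiling division. A minimal host-side restatement, with made-up numbers for illustration only:

    // blocksNeeded[i] = ceil(size / elemPerBlock), written without floating point
    inline int ceilDiv(int size, int elemPerBlock)
    {
        return size / elemPerBlock + (size % elemPerBlock != 0);
    }
    // e.g. size = 5000, elemPerBlock = 2048  ->  2 + (904 != 0) = 3 blocks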
 template <typename Value, typename Function>
 __global__ void cudaInitTask(ArrayView<TASK, Devices::Cuda> cuda_tasks,

@@ -225,40 +224,42 @@ __global__ void cudaInitTask(ArrayView<TASK, Devices::Cuda> cuda_tasks,
                              ArrayView<int, Devices::Cuda> cuda_reductionTaskInitMem,
                              ArrayView<Value, Devices::Cuda> src, const Function &Cmp)
 {
     if (blockIdx.x >= cuda_tasks.getSize())
         return;

     int start = blockIdx.x == 0 ? 0 : cuda_reductionTaskInitMem[blockIdx.x - 1];
     int end = cuda_reductionTaskInitMem[blockIdx.x];
     for (int i = start + threadIdx.x; i < end; i += blockDim.x)
         cuda_blockToTaskMapping[i] = blockIdx.x;

     if (threadIdx.x == 0)
     {
         TASK &task = cuda_tasks[blockIdx.x];
         int pivotIdx = task.partitionBegin + pickPivotIdx(src.getView(task.partitionBegin, task.partitionEnd), Cmp);
         task.initTask(start, end - start, pivotIdx);
     }
 }

 //-----------------------------------------------------------
 //-----------------------------------------------------------
-const int threadsPerBlock = 512, g_maxBlocks = 1 << 15; //32k
-const int g_maxTasks = 1 << 14;
-const int minElemPerBlock = threadsPerBlock * 10;
-const int maxBitonicSize = threadsPerBlock * 2;
-const int desired_2ndPhasElemPerBlock = maxBitonicSize;

 template <typename Value>
 class QUICKSORT
 {
     ArrayView<Value, Devices::Cuda> arr;
     Array<Value, Devices::Cuda> aux;

-    int maxTasks, maxBlocks;
+    int maxBlocks, threadsPerBlock, desiredElemPerBlock, maxSharable;
+    const int maxBitonicSize = threadsPerBlock * 2;
+    const int desired_2ndPhasElemPerBlock = maxBitonicSize;
+    const int g_maxTasks = 1 << 14;
+    int maxTasks;

     Array<TASK, Devices::Cuda> cuda_tasks, cuda_newTasks, cuda_2ndPhaseTasks;
-    Array<int, Devices::Cuda> cuda_newTasksAmount, cuda_2ndPhaseTasksAmount; //is in reality 1 integer
+    Array<int, Devices::Cuda> cuda_newTasksAmount, cuda_2ndPhaseTasksAmount; //is in reality 1 integer each

-    int tasksAmount; //counter for Host == cuda_newTasksAmount
+    int host_1stPhaseTasksAmount; //counter for Host == cuda_newTasksAmount
     int host_2ndPhaseTasksAmount; // cuda_2ndPhaseTasksAmount

     Array<int, Devices::Cuda> cuda_blockToTaskMapping;
@@ -266,32 +267,40 @@ class QUICKSORT
     int iteration = 0;

     //--------------------------------------------------------------------------------------
-    cudaDeviceProp deviceProp;
-    //--------------------------------------------------------------------------------------

 public:
-    QUICKSORT(ArrayView<Value, Devices::Cuda> _arr)
-        : arr(_arr), aux(arr.getSize()),
+    QUICKSORT(ArrayView<Value, Devices::Cuda> arr, int gridDim, int blockDim, int desiredElemPerBlock, int maxSharable)
+        : arr(arr.getView()), aux(arr.getSize()),
+          maxBlocks(gridDim), threadsPerBlock(blockDim), desiredElemPerBlock(desiredElemPerBlock), maxSharable(maxSharable),
           maxTasks(min(arr.getSize(), g_maxTasks)),
-          maxBlocks(g_maxBlocks),
           cuda_tasks(maxTasks), cuda_newTasks(maxTasks), cuda_2ndPhaseTasks(maxTasks),
           cuda_newTasksAmount(1), cuda_2ndPhaseTasksAmount(1),
           cuda_blockToTaskMapping(maxBlocks), cuda_reductionTaskInitMem(maxTasks)
     {
         cuda_tasks.setElement(0, TASK(0, arr.getSize(), 0));
-        tasksAmount = 1;
+        host_1stPhaseTasksAmount = 1;
         host_2ndPhaseTasksAmount = 0;
         cuda_2ndPhaseTasksAmount = 0;
         iteration = 0;
-        cudaGetDeviceProperties(&deviceProp, 0); //change device
         TNL_CHECK_CUDA_DEVICE;
     }

     template <typename Function>
     void sort(const Function &Cmp);

+    template <typename Function>
+    void firstPhase(const Function &Cmp);
+
+    template <typename Function>
+    void secondPhase(const Function &Cmp);

     int getSetsNeeded(int elemPerBlock) const;
     int getElemPerBlock() const;
@@ -299,41 +308,66 @@ public:
      * returns the amount of blocks needed
      * */
     template <typename Function>
     int initTasks(int elemPerBlock, const Function &Cmp);

     void processNewTasks();
 };

 //---------------------------------------------------------------------------------------------
 template <typename Value>
 template <typename Function>
 void QUICKSORT<Value>::sort(const Function &Cmp)
 {
-    while (tasksAmount > 0)
+    firstPhase(Cmp);
+
+    int total2ndPhase = host_1stPhaseTasksAmount + host_2ndPhaseTasksAmount;
+    if (total2ndPhase > 0)
+        secondPhase(Cmp);
+
+    cudaDeviceSynchronize();
+    TNL_CHECK_CUDA_DEVICE;
+    return;
+}

 //---------------------------------------------------------------------------------------------
+template <typename Value>
+template <typename Function>
+void QUICKSORT<Value>::firstPhase(const Function &Cmp)
+{
+    while (host_1stPhaseTasksAmount > 0)
     {
-        //2ndphase task is now full or tasksAmount is full, as backup during writing, overflowing tasks were written into the other array
-        if (tasksAmount >= maxTasks || host_2ndPhaseTasksAmount >= maxTasks)
+        //2ndphase task is now full or host_1stPhaseTasksAmount is full, as backup during writing, overflowing tasks were written into the other array
+        if (host_1stPhaseTasksAmount >= maxTasks || host_2ndPhaseTasksAmount >= maxTasks)
         {
             break;
         }

         //just in case newly created tasks wouldnt fit
-        if (tasksAmount * 2 >= maxTasks + (maxTasks - host_2ndPhaseTasksAmount))
+        if (host_1stPhaseTasksAmount * 2 >= maxTasks + (maxTasks - host_2ndPhaseTasksAmount))
         {
             break;
         }

         int elemPerBlock = getElemPerBlock();
         int blocksCnt = initTasks(elemPerBlock, Cmp);
         TNL_CHECK_CUDA_DEVICE;

-        if (blocksCnt >= cuda_blockToTaskMapping.getSize())
+        if (blocksCnt >= maxBlocks) //too many blocks needed, switch to 2nd phase
             break;

         //-----------------------------------------------
         //do the partitioning
         auto &task = iteration % 2 == 0 ? cuda_tasks : cuda_newTasks;
         int externMemByteSize = elemPerBlock * sizeof(Value);

         /**
          * check if can partition using shared memory for coalesced read and write
          * 1st phase of partitioning
          * sets of blocks work on a task
          *
          * using the atomicAdd intristic, each block reserves a chunk of memory where to move elements
          * smaller and bigger than pivot move to
          * */
-        if (externMemByteSize <= deviceProp.sharedMemPerBlock)
+        if (externMemByteSize <= maxSharable)
         {
             cudaQuickSort1stPhase_1<Value, Function>
                 <<<blocksCnt, threadsPerBlock, externMemByteSize>>>(
@@ -345,110 +379,124 @@ void QUICKSORT<Value>::sort(const Function &Cmp)
             cudaQuickSort1stPhase_2<Value, Function>
                 <<<blocksCnt, threadsPerBlock>>>(arr, aux, Cmp, elemPerBlock,
                                                  task, cuda_blockToTaskMapping);
         }
         TNL_CHECK_CUDA_DEVICE;

         /**
          * fill in the gap between smaller and bigger with elements == pivot
          * after writing also create new tasks, each task generates at max 2 tasks
          *
          * tasks smaller than desired_2ndPhasElemPerBlock go into 2nd phase
          * bigger need more blocks to partition and are written into newTask
          * with iteration %2, rotate between the 2 tasks array to save from copying
          * */
         auto &newTask = iteration % 2 == 0 ? cuda_newTasks : cuda_tasks;

         cudaWritePivot<Value>
-            <<<tasksAmount, 1024>>>(arr, aux, desired_2ndPhasElemPerBlock, task, newTask,
+            <<<host_1stPhaseTasksAmount, 1024>>>(arr, aux, desired_2ndPhasElemPerBlock, task, newTask,
                 cuda_newTasksAmount.getData(), cuda_2ndPhaseTasks, cuda_2ndPhaseTasksAmount.getData());
         TNL_CHECK_CUDA_DEVICE;

         processNewTasks();
         iteration++;
     }
-    int total2ndPhase = tasksAmount + host_2ndPhaseTasksAmount;
-    if (total2ndPhase > 0)
 }

 //----------------------------------------------------------------------
+template <typename Value>
+template <typename Function>
+void QUICKSORT<Value>::secondPhase(const Function &Cmp)
+{
+    int total2ndPhase = host_1stPhaseTasksAmount + host_2ndPhaseTasksAmount;
     const int stackSize = 32;
+    auto &leftoverTasks = iteration % 2 == 0 ? cuda_tasks : cuda_newTasks;

-    if (tasksAmount > 0 && host_2ndPhaseTasksAmount > 0)
+    if (host_1stPhaseTasksAmount > 0 && host_2ndPhaseTasksAmount > 0)
     {
-        auto tasks = iteration % 2 == 0 ? cuda_tasks.getView(0, tasksAmount) : cuda_newTasks.getView(0, tasksAmount);
+        auto tasks = leftoverTasks.getView(0, host_1stPhaseTasksAmount);
         auto tasks2 = cuda_2ndPhaseTasks.getView(0, host_2ndPhaseTasksAmount);
         cudaQuickSort2ndPhase<Value, Function, stackSize>
             <<<total2ndPhase, threadsPerBlock>>>(arr, aux, Cmp, tasks, tasks2);
     }
-    else if (tasksAmount > 0)
+    else if (host_1stPhaseTasksAmount > 0)
     {
-        auto tasks = iteration % 2 == 0 ? cuda_tasks.getView(0, tasksAmount) : cuda_newTasks.getView(0, tasksAmount);
+        auto tasks = leftoverTasks.getView(0, host_1stPhaseTasksAmount);
         cudaQuickSort2ndPhase<Value, Function, stackSize>
             <<<total2ndPhase, threadsPerBlock>>>(arr, aux, Cmp, tasks);
     }
     else
     {
         auto tasks2 = cuda_2ndPhaseTasks.getView(0, host_2ndPhaseTasksAmount);
         cudaQuickSort2ndPhase<Value, Function, stackSize>
             <<<total2ndPhase, threadsPerBlock>>>(arr, aux, Cmp, tasks2);
     }

     cudaDeviceSynchronize();
     TNL_CHECK_CUDA_DEVICE;
     return;
 }

 //----------------------------------------------------------------------
 template <typename Value>
 int QUICKSORT<Value>::getSetsNeeded(int elemPerBlock) const
 {
     auto view = iteration % 2 == 0 ? cuda_tasks.getConstView() : cuda_newTasks.getConstView();
     auto fetch = [=] __cuda_callable__(int i) {
-        auto &task = view[i];
+        const auto &task = view[i];
         int size = task.partitionEnd - task.partitionBegin;
         return size / elemPerBlock + (size % elemPerBlock != 0);
     };
     auto reduction = [] __cuda_callable__(int a, int b) { return a + b; };
-    return Algorithms::Reduction<Devices::Cuda>::reduce(0, tasksAmount, fetch, reduction, 0);
+    return Algorithms::Reduction<Devices::Cuda>::reduce(0, host_1stPhaseTasksAmount, fetch, reduction, 0);
 }

 template <typename Value>
 int QUICKSORT<Value>::getElemPerBlock() const
 {
-    int setsNeeded = getSetsNeeded(minElemPerBlock);
+    int setsNeeded = getSetsNeeded(desiredElemPerBlock);

     if (setsNeeded <= maxBlocks)
-        return minElemPerBlock;
+        return desiredElemPerBlock;

-    int setsPerBlock = ceil(1. * setsNeeded / maxBlocks);
-    return setsPerBlock * minElemPerBlock;
+    //want multiplier*minElemPerBLock <= x*threadPerBlock
+    //find smallest x so that this inequality holds
+    double multiplier = 1. * setsNeeded / maxBlocks;
+    int elemPerBlock = multiplier * desiredElemPerBlock;
+    setsNeeded = elemPerBlock / threadsPerBlock + (elemPerBlock % threadsPerBlock != 0);
+    return setsNeeded * threadsPerBlock;
 }

 template <typename Value>
 template <typename Function>
 int QUICKSORT<Value>::initTasks(int elemPerBlock, const Function &Cmp)
 {
-    int threads = min(tasksAmount, threadsPerBlock);
-    int blocks = tasksAmount / threads + (tasksAmount % threads != 0);
+    int threads = min(host_1stPhaseTasksAmount, threadsPerBlock);
+    int blocks = host_1stPhaseTasksAmount / threads + (host_1stPhaseTasksAmount % threads != 0);

     auto src = iteration % 2 == 0 ? arr : aux.getView();
     auto &tasks = iteration % 2 == 0 ? cuda_tasks : cuda_newTasks;

     //[i] == how many blocks task i needs
-    cudaCalcBlocksNeeded<<<threads, blocks>>>(tasks.getView(0, tasksAmount), elemPerBlock,
-                                              cuda_reductionTaskInitMem.getView(0, tasksAmount));
+    cudaCalcBlocksNeeded<<<threads, blocks>>>(tasks.getView(0, host_1stPhaseTasksAmount), elemPerBlock,
+                                              cuda_reductionTaskInitMem.getView(0, host_1stPhaseTasksAmount));

     thrust::inclusive_scan(thrust::device,
                            cuda_reductionTaskInitMem.getData(),
-                           cuda_reductionTaskInitMem.getData() + tasksAmount,
+                           cuda_reductionTaskInitMem.getData() + host_1stPhaseTasksAmount,
                            cuda_reductionTaskInitMem.getData());

-    int blocksNeeded = cuda_reductionTaskInitMem.getElement(tasksAmount - 1);
+    int blocksNeeded = cuda_reductionTaskInitMem.getElement(host_1stPhaseTasksAmount - 1);

     //need too many blocks, give back control
     if (blocksNeeded >= cuda_blockToTaskMapping.getSize())
         return blocksNeeded;

-    cudaInitTask<<<tasksAmount, 512>>>(tasks.getView(0, tasksAmount),
-                                       cuda_blockToTaskMapping.getView(0, blocksNeeded),
-                                       cuda_reductionTaskInitMem.getView(0, tasksAmount), src, Cmp);
+    cudaInitTask<<<host_1stPhaseTasksAmount, 512>>>(tasks.getView(0, host_1stPhaseTasksAmount),
+                                                    cuda_blockToTaskMapping.getView(0, blocksNeeded),
+                                                    cuda_reductionTaskInitMem.getView(0, host_1stPhaseTasksAmount), src, Cmp);

     cuda_newTasksAmount.setElement(0, 0);
     return blocksNeeded;
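For orientation: the inclusive scan above turns per-task block counts into end offsets, so task i owns blocks [scan[i-1], scan[i]) with scan[-1] taken as 0, and scan[n-1] is the total grid size that cudaInitTask then uses to fill cuda_blockToTaskMapping. A minimal self-contained illustration of just the scan step, assuming only Thrust (helper name is ours, not the repository's):

    #include <thrust/device_vector.h>
    #include <thrust/scan.h>
    #include <thrust/execution_policy.h>

    int totalBlocksNeeded(thrust::device_vector<int> &blocksPerTask)
    {
        // in-place inclusive prefix sum; element i becomes the end offset of task i
        thrust::inclusive_scan(thrust::device,
                               blocksPerTask.data(), blocksPerTask.data() + blocksPerTask.size(),
                               blocksPerTask.data());
        return blocksPerTask.back(); // total number of partition blocks to launch
    }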
@@ -457,7 +505,7 @@ int QUICKSORT<Value>::initTasks(int elemPerBlock, const Function & Cmp)
 template <typename Value>
 void QUICKSORT<Value>::processNewTasks()
 {
-    tasksAmount = cuda_newTasksAmount.getElement(0);
+    host_1stPhaseTasksAmount = cuda_newTasksAmount.getElement(0);
     host_2ndPhaseTasksAmount = cuda_2ndPhaseTasksAmount.getElement(0);
 }
@@ -468,12 +516,42 @@ void QUICKSORT<Value>::processNewTasks()
 template <typename Value, typename Function>
 void quicksort(ArrayView<Value, Devices::Cuda> arr, const Function &Cmp)
 {
-    QUICKSORT<Value> sorter(arr);
+    const int maxBlocks = (1 << 20);
+
+    cudaDeviceProp deviceProp;
+    cudaGetDeviceProperties(&deviceProp, 0);
+    int sharedReserve = sizeof(Value) + sizeof(int) * 16; //1pivot + 16 other shared vars reserved
+    int maxSharable = deviceProp.sharedMemPerBlock - sharedReserve;
+
+    //blockDim*multiplier*sizeof(Value) <= maxSharable
+    int blockDim = 512; //best case
+    int elemPerBlock = maxSharable / sizeof(Value);
+    const int maxMultiplier = 8;
+    int multiplier = min(elemPerBlock / blockDim, maxMultiplier);
+
+    if (multiplier <= 0)
+    {
+        blockDim = 256;
+        multiplier = min(elemPerBlock / blockDim, maxMultiplier);
+        if (multiplier <= 0)
+        {
+            //worst case scenario, shared memory cant be utilized at all because of the sheer size of Value
+            //sort has to be done with the use of global memory alone
+            QUICKSORT<Value> sorter(arr, maxBlocks, 512, 0, maxSharable);
+            sorter.sort(Cmp);
+            return;
+        }
+    }
+
+    assert(blockDim * multiplier * sizeof(Value) <= maxSharable);
+    QUICKSORT<Value> sorter(arr, maxBlocks, blockDim, multiplier * blockDim, maxSharable);
     sorter.sort(Cmp);
 }

 template <typename Value>
 void quicksort(ArrayView<Value, Devices::Cuda> arr)
 {
     quicksort(arr, [] __cuda_callable__(const Value &a, const Value &b) { return a < b; });
 }