Commit 5e5d94ce, authored 9 years ago by Jakub Klinkovský
Refactoring benchmarks using more lambda functions
parent 8bb94c12
Changes: 1 changed file
tests/benchmarks/tnl-cuda-benchmarks.h: 141 additions, 102 deletions
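For orientation before the diff: the commit replaces the generic helper time_void_function( loops, f, args... ), which took a callable plus its arguments, with benchmarkSingle/benchmarkCuda, which take capturing lambdas for the compute, check and reset steps of every measurement. Below is a minimal, self-contained sketch of that new calling pattern; the TNL types (tnlTimerRT, HostVector, ...) are replaced by standard-library stand-ins, so the chrono-based timing and the small vector workload are illustrative assumptions, not code from the commit.

   #include <algorithm>
   #include <chrono>
   #include <cstddef>
   #include <iostream>
   #include <vector>

   struct BenchmarkError {};

   // simplified stand-in for the benchmarkSingle() added by the commit: time compute(),
   // verify the result with check() and restore the initial state with reset() every loop
   template< typename ComputeFunction, typename CheckFunction, typename ResetFunction >
   double benchmarkSingle( const int loops, const double datasetSize, // in GB
                           ComputeFunction & compute, CheckFunction & check, ResetFunction & reset )
   {
      std::chrono::duration< double > elapsed( 0.0 );
      for( int i = 0; i < loops; i++ ) {
         const auto t0 = std::chrono::steady_clock::now();
         compute();
         elapsed += std::chrono::steady_clock::now() - t0;
         if( ! check() )
            throw BenchmarkError();
         reset();
      }
      const double time = elapsed.count();
      std::cout << "bandwidth: " << datasetSize / time << " GB/sec, time: " << time << " sec." << std::endl;
      return time;
   }

   int main()
   {
      std::vector< double > x( 1 << 20, 1.0 ), y( x.size(), 1.0 );

      // the pattern introduced by the commit: one capturing lambda per step
      auto compute = [&]() { for( std::size_t i = 0; i < x.size(); i++ ) y[ i ] += x[ i ]; };
      auto check   = [&]() { return y[ 0 ] == 2.0; };
      auto reset   = [&]() { std::fill( y.begin(), y.end(), 1.0 ); };

      const int loops = 10;
      const double oneGB = 1024.0 * 1024.0 * 1024.0;
      const double datasetSize = loops * 3.0 * x.size() * sizeof( double ) / oneGB;
      benchmarkSingle( loops, datasetSize, compute, check, reset );
   }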
...
...
@@ -41,6 +41,8 @@ const double oneGB = 1024.0 * 1024.0 * 1024.0;
// check operations with the timer:
// - reset() clears the timer and starts it again
// - getTime() stops the timer and starts it again !!!
// - data members are not zero-initialized - reset has to be called manually, but it immediately starts the timer
// FIXME: scalarProduct is not const method
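// Illustration (editor's sketch, not a line of this file): with the semantics noted above,
// a measurement loop pairs start()/stop() around the timed region and reads getTime() once:
//    tnlTimerRT timer;
//    timer.reset();   // clears the timer, but also starts it right away
//    timer.stop();
//    for( int i = 0; i < loops; i++ ) {
//       timer.start();
//       compute();
//       timer.stop();
//    }
//    const double time = timer.getTime();   // beware: restarts the timer as a side effect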
template< typename Matrix >
...
...
@@ -107,19 +109,86 @@ void setCudaTestMatrix( Matrix& matrix,
   tnlCuda::freeFromDevice( kernel_matrix );
}
template< typename Function, typename... Args >
double time_void_function( int loops, Function & f, Args & ... args )
// TODO: add data member for error message
struct BenchmarkError {};

auto trueFunc = []() { return true; };
auto voidFunc = [](){};

template< typename ComputeFunction,
          typename CheckFunction,
          typename ResetFunction >
double
benchmarkSingle( const int & loops,
                 const double & datasetSize,  // in GB
                 ComputeFunction & compute,
                 // TODO: check that default argument works here
                 CheckFunction & check = trueFunc,
                 ResetFunction & reset = voidFunc )
{
   tnlTimerRT timer;
   timer.reset();
   for( int i = 0; i < loops; ++i ) {
      timer.start();
      f( args... );
      compute();
      timer.stop();
      if( ! check() )
         throw BenchmarkError();
      reset();
   }
   const double time = timer.getTime();
   const double bandwidth = datasetSize / time;
   cout << "bandwidth: " << bandwidth << " GB/sec, time: " << time << " sec." << endl;
   return time;
}
template< typename ComputeHostFunction,
          typename ComputeCudaFunction,
          typename CheckFunction,
          typename ResetFunction >
void
benchmarkCuda( const int & loops,
               const double & datasetSize,  // in GB
               ComputeHostFunction & computeHost,
               ComputeCudaFunction & computeCuda,
               // TODO: check that default argument works here
               CheckFunction & check = trueFunc,
               ResetFunction & reset = voidFunc )
{
   tnlTimerRT timerHost, timerCuda;
   timerHost.reset();
   timerHost.stop();
   timerCuda.reset();
   timerCuda.stop();
   for( int i = 0; i < loops; ++i ) {
      timerHost.start();
      computeHost();
      timerHost.stop();
      timerCuda.start();
      computeCuda();
      timerCuda.stop();
      if( ! check() )
         throw BenchmarkError();
      reset();
   }
   return timer.getTime();
   const double timeHost = timerHost.getTime();
   const double timeCuda = timerCuda.getTime();
   const double bandwidthHost = datasetSize / timeHost;
   const double bandwidthCuda = datasetSize / timeCuda;
   cout << " CPU: bandwidth: " << bandwidthHost << " GB/sec, time: " << timeHost << " sec." << endl;
   cout << " GPU: bandwidth: " << bandwidthCuda << " GB/sec, time: " << timeCuda << " sec." << endl;
   cout << " CPU/GPU speedup: " << timeHost / timeCuda << endl;
}
template< typename Real,
...
...
@@ -169,43 +238,33 @@ benchmarkSpMV( const int & loops,
      return false;
   }
   double bandwidth( 0.0 ), datasetSize( 0.0 ), timeHost( 0.0 ), timeDevice( 0.0 );
   tnlList< tnlString > parsedType;
   parseObjectType( HostMatrix::getType(), parsedType );
   cout << "Benchmarking SpMV (matrix type: " << parsedType[ 0 ] << ", rows: " << size << ", elements per row: " << elementsPerRow << "):" << endl;
   const int elements = setHostTestMatrix< HostMatrix >( hostMatrix, elementsPerRow );
   setCudaTestMatrix< DeviceMatrix >( deviceMatrix, elementsPerRow );
   datasetSize = loops * elements * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB;
   const double datasetSize = loops * elements * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB;
   hostVector.setValue( 1.0 );
   deviceVector.setValue( 1.0 );
   auto spmvHost = []( const HostMatrix & m, const HostVector & x, HostVector & y ) {
      m.vectorProduct( x, y );
   // check and reset functions
   auto check = [&]() { return hostVector2 == deviceVector2; };
   timeHost = time_void_function( loops, spmvHost, hostMatrix, hostVector, hostVector2 );
   bandwidth = datasetSize / timeHost;
   cout << " CPU: bandwidth: " << bandwidth << " GB/sec, time: " << timeHost << " sec." << endl;
   auto spmvCuda = []( const DeviceMatrix & m, const CudaVector & x, CudaVector & y ) {
      m.vectorProduct( x, y );
   auto reset = [&]() {
      hostVector2.setValue( 0.0 );
      deviceVector2.setValue( 0.0 );
   };
   timeDevice = time_void_function( loops, spmvCuda, deviceMatrix, deviceVector, deviceVector2 );
   bandwidth = datasetSize / timeDevice;
   cout << " GPU: bandwidth: " << bandwidth << " GB/sec, time: " << timeDevice << " sec." << endl;
   cout << " CPU/GPU speedup: " << timeHost / timeDevice << endl;
   //cout << hostVector2 << endl << deviceVector2 << endl;
   if( hostVector2 != deviceVector2 )
   {
      cerr << "Error in Spmv kernel" << endl;
      //for( int i = 0; i < size; i++ )
      //   if( hostVector2.getElement( i ) != deviceVector2.getElement( i ) )
      //      cerr << " " << i;
   }
   // compute functions
   auto spmvHost = [&]() {
      hostMatrix.vectorProduct( hostVector, hostVector2 );
   };
   auto spmvCuda = [&]() {
      deviceMatrix.vectorProduct( deviceVector, deviceVector2 );
   };
   benchmarkCuda( loops, datasetSize, spmvHost, spmvCuda, check, reset );
   return true;
}
...
...
@@ -234,7 +293,6 @@ int main( int argc, char* argv[] )
      elementsPerRow = atoi( argv[ 3 ] );
   double datasetSize = ( double ) ( loops * size ) * sizeof( Real ) / oneGB;
   HostVector hostVector, hostVector2;
...
...
@@ -246,76 +304,70 @@ int main( int argc, char* argv[] )
   if( ! deviceVector2.setLike( deviceVector ) )
      return EXIT_FAILURE;
   hostVector.setValue( 1.0 );
   deviceVector.setValue( 1.0 );
   hostVector2.setValue( 1.0 );
   deviceVector2.setValue( 1.0 );

   Real resultHost, resultDevice;
   // check functions
   auto compare1 = [&]() { return hostVector == deviceVector; };
   auto compare2 = [&]() { return hostVector2 == deviceVector2; };
   auto compare12 = [&]() { return compare1() && compare2(); };
   auto compareScalars = [&]() { return resultHost == resultDevice; };
   // reset functions
   auto reset1 = [&]() {
      hostVector.setValue( 1.0 );
      deviceVector.setValue( 1.0 );
   };
   auto reset2 = [&]() {
      hostVector2.setValue( 1.0 );
      deviceVector2.setValue( 1.0 );
   };
   auto reset12 = [&]() { reset1(); reset2(); };
   double bandwidth( 0.0 );
   Real resultHost, resultDevice, timeHost, timeDevice;

   reset12();
   cout << "Benchmarking CPU-GPU memory bandwidth: ";
   auto copyAssign = []( CudaVector & v1, const HostVector & v2 ) {
      v1 = v2;
   cout << "Benchmarking CPU-GPU memory transfer:" << endl;
   auto copyAssign = [&]() {
      deviceVector = hostVector;
   };
   timeHost = time_void_function( loops, copyAssign, deviceVector, hostVector );
   bandwidth = datasetSize / timeHost;
   cout << bandwidth << " GB/sec." << endl;
   cout << "    ";
   benchmarkSingle( loops, datasetSize, copyAssign, compare1, reset1 );
   cout << "Benchmarking vector addition:" << endl;
   auto addVectorHost = []( HostVector & v1, const HostVector & v2 ) {
      v1.addVector( v2 );
   auto addVectorHost = [&]() {
      hostVector.addVector( hostVector2 );
   };
   timeHost = time_void_function( loops, addVectorHost, hostVector, hostVector2 );
   bandwidth = 3 * datasetSize / timeHost;
   cout << " CPU: bandwidth: " << bandwidth << " GB/sec, time: " << timeHost << " sec." << endl;
   auto addVectorCuda = []( CudaVector & v1, const CudaVector & v2 ) {
      v1.addVector( v2 );
   auto addVectorCuda = [&]() {
      deviceVector.addVector( deviceVector2 );
      // TODO: synchronization should be part of addVector
      cudaThreadSynchronize();
   };
   timeDevice = time_void_function( loops, addVectorCuda, deviceVector, deviceVector2 );
   bandwidth = 3 * datasetSize / timeDevice;
   cout << " GPU: bandwidth: " << bandwidth << " GB/sec, time: " << timeDevice << " sec." << endl;
   cout << " CPU/GPU speedup: " << timeHost / timeDevice << endl;
   hostVector.setValue( 1.0 );
   deviceVector.setValue( 1.0 );
   hostVector2.setValue( 1.0 );
   deviceVector2.setValue( 1.0 );
   benchmarkCuda( loops, 3 * datasetSize, addVectorHost, addVectorCuda, compare1, reset1 );
   cout << "Benchmarking scalar product:" << endl;
// FIXME: scalarProduct is not const method
// auto scalarProductHost = []( const HostVector & v1, const HostVector & v2 ) {
   auto scalarProductHost = []( HostVector & v1, const HostVector & v2 ) {
      return v1.scalarProduct( v2 );
   auto scalarProductHost = [&]() {
      resultHost = hostVector.scalarProduct( hostVector2 );
   };
   timeHost = time_void_function( loops, scalarProductHost, hostVector, hostVector2 );
   bandwidth = 2 * datasetSize / timeHost;
   cout << " CPU: bandwidth: " << bandwidth << " GB/sec, time: " << timeHost << " sec." << endl;
// FIXME: scalarProduct is not const method
// auto scalarProductCuda = []( const CudaVector & v1, const CudaVector & v2 ) {
   auto scalarProductCuda = []( CudaVector & v1, const CudaVector & v2 ) {
      return v1.scalarProduct( v2 );
   auto scalarProductCuda = [&]() {
      resultDevice = deviceVector.scalarProduct( deviceVector2 );
   };
   timeDevice = time_void_function( loops, scalarProductCuda, deviceVector, deviceVector2 );
   bandwidth = 2 * datasetSize / timeDevice;
   cout << " GPU: bandwidth: " << bandwidth << " GB/sec, time: " << timeDevice << " sec." << endl;
   cout << " CPU/GPU speedup: " << timeHost / timeDevice << endl;
// TODO: devise a way to check the result of the timed function
// if( resultHost != resultDevice )
// {
// cerr << "Error. " << resultHost << " != " << resultDevice << endl;
//return EXIT_FAILURE;
// }
   benchmarkCuda( loops, 2 * datasetSize, scalarProductHost, scalarProductCuda, compareScalars, voidFunc );
/* TODO
#ifdef HAVE_CUBLAS
cout << "Benchmarking scalar product on GPU with Cublas: " << endl;
cublasHandle_t handle;
...
...
@@ -333,29 +385,16 @@ int main( int argc, char* argv[] )
bandwidth = 2 * datasetSize / timer.getTime();
cout << "bandwidth: " << bandwidth << " GB/sec, time: " << timer.getTime() << " sec." << endl;
#endif
*/
   cout << "Benchmarking L2 norm: " << endl;
   auto l2normHost = []( const HostVector & v ) {
      return v.lpNorm( 2.0 );
   auto l2normHost = [&]() {
      resultHost = hostVector.lpNorm( 2.0 );
   };
   timeHost = time_void_function( loops, l2normHost, hostVector );
   bandwidth = datasetSize / timeHost;
   cout << " CPU: bandwidth: " << bandwidth << " GB/sec, time: " << timeHost << " sec." << endl;
   auto l2normCuda = []( const CudaVector & v ) {
      return v.lpNorm( 2.0 );
   auto l2normCuda = [&]() {
      resultDevice = deviceVector.lpNorm( 2.0 );
   };
   timeDevice = time_void_function( loops, l2normCuda, deviceVector );
   bandwidth = datasetSize / timeDevice;
   cout << " GPU: bandwidth: " << bandwidth << " GB/sec, time: " << timeDevice << " sec." << endl;
   cout << " CPU/GPU speedup: " << timeHost / timeDevice << endl;
// TODO: devise a way to check the result of the timed function
// if( resultHost != resultDevice )
// {
// cerr << "Error. " << resultHost << " != " << resultDevice << endl;
//return EXIT_FAILURE;
// }
   benchmarkCuda( loops, datasetSize, l2normHost, l2normCuda, compareScalars, voidFunc );
/*
...
...
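A note on the "// TODO: check that default argument works here" that appears in both new function templates: default function arguments do not take part in template argument deduction, so a call that omits check and reset cannot deduce CheckFunction/ResetFunction from "= trueFunc" / "= voidFunc" alone. Below is a hedged sketch of one C++11 way to make the defaults usable (default template arguments tied to the global lambdas); this is a suggestion, not something the commit itself does.

   #include <iostream>

   auto trueFunc = []() { return true; };
   auto voidFunc = [](){};

   template< typename ComputeFunction,
             typename CheckFunction = decltype( trueFunc ),
             typename ResetFunction = decltype( voidFunc ) >
   double
   benchmarkSingle( const int & loops,
                    const double & datasetSize,        // in GB
                    ComputeFunction & compute,
                    CheckFunction & check = trueFunc,  // now deducible even when omitted
                    ResetFunction & reset = voidFunc )
   {
      // trimmed body: run the compute/check/reset cycle without any timing
      double time = 0.0;
      for( int i = 0; i < loops; i++ ) {
         compute();
         if( ! check() )
            return -1.0;
         reset();
      }
      std::cout << "dataset: " << datasetSize << " GB" << std::endl;
      return time;
   }

   int main()
   {
      int n = 0;
      auto compute = [&]() { n++; };
      benchmarkSingle( 10, 0.5, compute );   // check and reset fall back to trueFunc / voidFunc
   }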