Commit e2fe6a72 (TNL / tnl-dev)
Authored 5 years ago by Lukas Cejka; committed 5 years ago by Tomáš Oberhuber.

    Commit for backup purposes of debugging AdELL.

Parent: b4803a17
Merge request: !45 Matrices revision

Showing 1 changed file: src/TNL/Matrices/AdEllpack_impl.h (+342, -113)
@@ -701,6 +701,7 @@ AdEllpack< Real, Device, Index >::operator=( const AdEllpack& matrix )
    return *this;
}

// cross-device copy assignment
template< typename Real,
          typename Device,
@@ -709,13 +710,30 @@ template< typename Real,
AdEllpack< Real, Device, Index >&
AdEllpack< Real, Device, Index >::operator=( const AdEllpack< Real2, Device2, Index2 >& matrix )
{
+   std::cout << "< operator= >" << std::endl;
    static_assert( std::is_same< Device, Devices::Host >::value || std::is_same< Device, Devices::Cuda >::value, "unknown device" );
    static_assert( std::is_same< Device2, Devices::Host >::value || std::is_same< Device2, Devices::Cuda >::value, "unknown device" );
-   TNL_ASSERT_TRUE( false, "Cross-device copy assignment is not yet implemented for AdEllpack." );
+   this->setLike( matrix );
+   this->values = matrix.values;
+   this->columnIndexes = matrix.columnIndexes;
+   std::cout << "After Values" << std::endl;
+   this->offset = matrix.offset;
+   std::cout << "\toffset" << std::endl;
+   this->rowOffset = matrix.rowOffset;
+   std::cout << "\trowOffset" << std::endl;
+   this->localLoad = matrix.localLoad;
+   std::cout << "\tlocalLoad" << std::endl;
+   this->reduceMap = matrix.reduceMap;
+   std::cout << "\treduceMap" << std::endl;
+   this->totalLoad = matrix.totalLoad;
+   std::cout << "\ttotalLoad" << std::endl;
+   this->warpSize = matrix.warpSize;
+   std::cout << "After All" << std::endl;
+   std::cout << "<// operator= >" << std::endl;
    return *this;
}
@@ -1082,16 +1100,17 @@ void AdEllpack< Real, Device, Index >::spmvCuda2( const InVector& inVector,
    reduceMap[ threadIdx.x ] = this->reduceMap[ globalIdx ];
    temp[ threadIdx.x ] = 0.0;
    IndexType i = 0;
    IndexType elementPtr = this->offset[ warpIdx ] + inWarpIdx;
    for( ; i < this->localLoad[ warpIdx ]; i++ )
    {
       if( this->columnIndexes[ elementPtr ] < this->getColumns() )
       {
          temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ];
          elementPtr += this->warpSize;
       }
    }
    if( ( inWarpIdx == 0 ) || ( reduceMap[ threadIdx.x ] > reduceMap[ threadIdx.x - 1 ] ) )
    {
       IndexType elementPtr = threadIdx.x + 1;
@@ -1100,9 +1119,9 @@ void AdEllpack< Real, Device, Index >::spmvCuda2( const InVector& inVector,
              globalIdx < ( ( warpIdx + 1 ) << 5 ) && reduceMap[ elementPtr ] == reduceMap[ threadIdx.x ] )
       {
          temp[ threadIdx.x ] += temp[ elementPtr ];
          elementPtr++;
          globalIdx++;
       }
       outVector[ reduceMap[ threadIdx.x ] ] += temp[ threadIdx.x ];
    }
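The tail of spmvCuda2 reconstructed above is a per-warp segmented reduction: every thread keeps one partial sum in the shared array temp, reduceMap stores the output row each partial sum belongs to, and the first lane of every row segment accumulates the segment and adds it to outVector. The serial sketch below shows the same reduction; the names mirror the kernel, but the function itself is illustrative only and is not part of AdEllpack_impl.h.

#include <vector>
#include <cstddef>

// Serial reference for the warp-level segmented reduction: temp holds one
// warp's partial sums, reduceMap the output row of each partial sum.
void segmentedWarpReduce( const std::vector< double >& temp,
                          const std::vector< int >& reduceMap,
                          std::vector< double >& outVector )
{
   const std::size_t warpSize = temp.size();   // one warp's worth of partial sums
   for( std::size_t lane = 0; lane < warpSize; lane++ )
   {
      // a lane owns a segment if it is lane 0 or its row differs from the previous lane's row
      if( lane == 0 || reduceMap[ lane ] > reduceMap[ lane - 1 ] )
      {
         double sum = temp[ lane ];
         std::size_t next = lane + 1;
         while( next < warpSize && reduceMap[ next ] == reduceMap[ lane ] )
            sum += temp[ next++ ];
         outVector[ reduceMap[ lane ] ] += sum;
      }
   }
}

In the kernel the inner loop is additionally bounded by the warp boundary ( ( warpIdx + 1 ) << 5 ), which a single-warp sketch does not need.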
@@ -1118,9 +1137,11 @@ void AdEllpack< Real, Device, Index >::spmvCuda4( const InVector& inVector,
                                                  OutVector& outVector,
                                                  const int gridIdx ) const
{
-   IndexType globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   printf( "\n< spmvCuda4 > %d", threadIdx.x );
+   IndexType globalIdx = ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
    IndexType warpIdx = globalIdx >> 5;
    IndexType inWarpIdx = globalIdx & ( this->warpSize - 1 );
+   printf( "\nglobalIdx = %d;\twarpIdx = %d;\tinWarpIdx = %d;\t", globalIdx, warpIdx, inWarpIdx );
    if( globalIdx >= this->reduceMap.getSize() )
       return;
@@ -1130,27 +1151,35 @@ void AdEllpack< Real, Device, Index >::spmvCuda4( const InVector& inVector,
    reduceMap[ threadIdx.x ] = this->reduceMap[ globalIdx ];
    temp[ threadIdx.x ] = 0.0;
    IndexType i = 0;
    IndexType elementPtr = this->offset[ warpIdx ] + inWarpIdx;
+   printf( "\nThread: %d check 0", threadIdx.x );
    if( ( this->localLoad[ warpIdx ] & 1 ) == 1 )
    {
       if( this->columnIndexes[ elementPtr ] < this->getColumns() )
       {
          temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ];
          elementPtr += this->warpSize;
          i++;
       }
    }
+   printf( "\nThread: %d check 1", threadIdx.x );
    for( ; i < this->localLoad[ warpIdx ]; i += 2 )
    {
       #pragma unroll
       for( IndexType j = 0; j < 2; j++ )
       {
          if( this->columnIndexes[ elementPtr ] < this->getColumns() )
          {
             temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ];
             elementPtr += this->warpSize;
          }
       }
    }
+   printf( "\nThread: %d check 2", threadIdx.x );
    if( ( inWarpIdx == 0 ) || ( reduceMap[ threadIdx.x ] > reduceMap[ threadIdx.x - 1 ] ) )
    {
       IndexType elementPtr = threadIdx.x + 1;
@@ -1159,12 +1188,13 @@ void AdEllpack< Real, Device, Index >::spmvCuda4( const InVector& inVector,
              globalIdx < ( ( warpIdx + 1 ) << 5 ) && reduceMap[ elementPtr ] == reduceMap[ threadIdx.x ] )
       {
          temp[ threadIdx.x ] += temp[ elementPtr ];
          elementPtr++;
          globalIdx++;
       }
       outVector[ reduceMap[ threadIdx.x ] ] += temp[ threadIdx.x ];
    }
+   printf( "\n<// spmvCuda4 > %d", threadIdx.x );
}

template< typename Real,
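Throughout these kernels the warp index and the lane index are recovered from globalIdx with a shift and a mask. Because the warp size is 32, globalIdx >> 5 is integer division by 32 and globalIdx & ( warpSize - 1 ) is the remainder. The standalone check below, which is not part of the file, illustrates the equivalence.

#include <cassert>

// Plain C++ check of the warp/lane index arithmetic used by the spmvCuda* kernels.
int main()
{
   const int warpSize = 32;
   for( int globalIdx = 0; globalIdx < 1024; globalIdx++ )
   {
      const int warpIdx = globalIdx >> 5;                  // same as globalIdx / 32
      const int inWarpIdx = globalIdx & ( warpSize - 1 );  // same as globalIdx % 32
      assert( warpIdx == globalIdx / warpSize );
      assert( inWarpIdx == globalIdx % warpSize );
   }
   return 0;
}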
@@ -1177,11 +1207,17 @@ void AdEllpack< Real, Device, Index >::spmvCuda8( const InVector& inVector,
                                                  OutVector& outVector,
                                                  const int gridIdx ) const
{
-   IndexType globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   printf( "\n< spmvCuda8 > %d", threadIdx.x );
+   IndexType globalIdx = ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
    IndexType warpIdx = globalIdx >> 5;
    IndexType inWarpIdx = globalIdx & ( this->warpSize - 1 );
+   printf( "\nglobalIdx = %d;\twarpIdx = %d;\tinWarpIdx = %d;\tthis->reduceMap.size() = %d",
+           globalIdx, warpIdx, inWarpIdx, this->reduceMap.getSize() );
    if( globalIdx >= this->reduceMap.getSize() )
-      return;
+   {
+      return;
+   }
+   // Threads 32 - 127 returned (for matrix #3 in test_VectorProduct in SparseMatrixTest.hpp).
+   // They do not execute the rest of this function.
    const int blockSize = 128;
    Real* temp = Cuda::getSharedMemory< Real >();
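Each kernel obtains its scratch buffer from Cuda::getSharedMemory< Real >(), and the host code later passes blockSize.x * sizeof( Real ) as the third kernel launch parameter. Assuming the helper is a thin wrapper over the standard dynamically sized shared memory idiom (an assumption for illustration, not TNL's actual definition), the mechanism looks roughly as follows.

// Sketch of a getSharedMemory-style helper over CUDA's extern __shared__ idiom.
template< typename Element >
__device__ Element* getSharedMemorySketch()
{
   extern __shared__ unsigned char sharedBytes[];   // sized by the kernel launch
   return reinterpret_cast< Element* >( sharedBytes );
}

template< typename Real >
__global__ void exampleKernel()
{
   Real* temp = getSharedMemorySketch< Real >();   // blockDim.x elements available
   temp[ threadIdx.x ] = 0.0;
}

// launch: exampleKernel< double ><<< grid, block, block.x * sizeof( double ) >>>();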
@@ -1189,26 +1225,101 @@ void AdEllpack< Real, Device, Index >::spmvCuda8( const InVector& inVector,
    reduceMap[ threadIdx.x ] = this->reduceMap[ globalIdx ];
    temp[ threadIdx.x ] = 0.0;
    IndexType i = 0;
    IndexType elementPtr = this->offset[ warpIdx ] + inWarpIdx;
-   while( ( this->localLoad[ warpIdx ] & 7 ) != 0 )
-      if( this->columnIndexes[ elementPtr ] < this->getColumns() )
-      {
-         temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ];
-         elementPtr += this->warpSize;
-         i++;
-      }
-   for( ; i < this->localLoad[ warpIdx ]; i += 4 )
-   {
-      #pragma unroll
-      for( IndexType j = 0; j < 4; j++ )
-         if( this->columnIndexes[ elementPtr ] < this->getColumns() )
-         {
-            temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ];
-            elementPtr += this->warpSize;
-         }
-   }
+   printf( "\nThread: %d check 0", threadIdx.x );
+   if( threadIdx.x == 0 )
+   {
+      printf( "\nthis->localLoad.size() = %d", this->localLoad.getSize() );
+      printf( "\tthis->localLoad = " );
+      for( IndexType j = 0; j < this->localLoad.getSize(); j++ )
+      {
+         printf( "%d, ", this->localLoad[ j ] );
+      }
+   }
+   if( threadIdx.x == 0 )
+   {
+      printf( "\nvalues.size() = %d\n", this->values.getSize() );
+      for( IndexType j = 0; j < this->values.getSize(); j++ )
+      {
+         printf( "%d, ", this->values[ j ] );
+      }
+   }
+   if( threadIdx.x == 0 )
+   {
+      printf( "\nColumnIndexes.size() = %d\n", this->columnIndexes.getSize() );
+      for( IndexType j = 0; j < this->columnIndexes.getSize(); j++ )
+      {
+         printf( "%d, ", this->columnIndexes[ j ] );
+      }
+   }
+   // pragma unroll:
+   // https://stackoverflow.com/questions/22278631/what-does-pragma-unroll-do-exactly-does-it-affect-the-number-of-threads
+   // Theory:
+   // This needs to repeat itself, until i is a multiple of 4.
+   // Because of loop unrolling, we want the loop unrolling to unroll for 4 elements at a time.
+   // This while will ensure that elements are added up until they are multiples of 4.
+   // IF correct:
+   // * The loop unroll must be changed in spmvCuda 2 to be the same as in the paper.
+   // * Same for spmvCuda4, spmvCuda8, spmvCuda16 and spmvCuda 32
+   // Store the localLoad into a temporary variable.
+   // If the number of non-zero elements in a warp is not divisible by 4,
+   // i.e. the loop unroll cannot be used, cause it wouldn't compute all
+   // the elements or it would compute elements out of bounds.
+   // The loop unroll begin must be moved until the remaining number of
+   // non-zero elements can be divided by 4.
+   // Assign the result of if localLoad of this warp is divisible by 4. If not, how far is it.
+   IndexType alignUnroll = this->localLoad[ warpIdx ] & 3;
+   while( alignUnroll != 0 && alignUnroll != 4 )
+   {
+      printf( "\nThread: %d\tcheck 0_1\talignUnroll = %d", threadIdx.x, alignUnroll );
+      printf( "\nThread: %d\tcolumnIndex < columns: %d < %d", threadIdx.x, this->columnIndexes[ elementPtr ], this->getColumns() );
+      if( this->columnIndexes[ elementPtr ] < this->getColumns() )
+      {
+         printf( "\nThread: %d\tcheck 0_2", threadIdx.x );
+         temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ];
+         printf( "\nThread: %d\tcheck 0_3", threadIdx.x );
+         elementPtr += this->warpSize;
+         i++;
+         // If alignUnroll is not divisible by 4, if it is one or two off: subtract until it is, else add until it is.
+         // In other words, we're trying to get to the closest multiple of 4 (loop Unroll factor).
+         if( alignUnroll <= 2 )
+            alignUnroll--;
+         else
+            alignUnroll++;
+      }
+      else
+      {
+         break;
+      }
+   }
+   printf( "\nThread: %d check 1", threadIdx.x );
+   for( ; i < this->localLoad[ warpIdx ]; i += 4 )
+   {
+      printf( "\nThread: %d check 1_1", threadIdx.x );
+      #pragma unroll
+      for( IndexType j = 0; j < 4; j++ )
+      {
+         if( this->columnIndexes[ elementPtr ] < this->getColumns() )
+         {
+            printf( "\nThread: %d check 1_2", threadIdx.x );
+            temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ];
+            printf( "\nThread: %d check 1_3", threadIdx.x );
+            elementPtr += this->warpSize;
+         }
+      }
+   }
+   printf( "\nThread: %d check 2", threadIdx.x );
    if( ( inWarpIdx == 0 ) || ( reduceMap[ threadIdx.x ] > reduceMap[ threadIdx.x - 1 ] ) )
    {
       IndexType elementPtr = threadIdx.x + 1;
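The comment block in the hunk above describes the intent of the alignUnroll prologue: peel off elements one at a time until the number of elements remaining for the warp is a multiple of the unroll factor, so that the #pragma unroll loop always processes complete groups. The serial sketch below shows that idea with the factor of 4 used by spmvCuda8; it is illustrative only and uses a simpler, equivalent termination test.

#include <cassert>

// Peel single iterations until the remaining trip count is a multiple of 4,
// then run the loop that is unrolled by 4. Mirrors the alignUnroll concept,
// not the kernel itself.
int processWithUnroll4( int localLoad )
{
   int processed = 0;
   int i = 0;
   // prologue: handle localLoad % 4 leftover elements one at a time
   while( ( ( localLoad - i ) & 3 ) != 0 )
   {
      processed++;
      i++;
   }
   // main loop: the body now always runs a whole group of 4
   for( ; i < localLoad; i += 4 )
      for( int j = 0; j < 4; j++ )
         processed++;
   assert( processed == localLoad );
   return processed;
}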
@@ -1217,12 +1328,13 @@ void AdEllpack< Real, Device, Index >::spmvCuda8( const InVector& inVector,
              globalIdx < ( ( warpIdx + 1 ) << 5 ) && reduceMap[ elementPtr ] == reduceMap[ threadIdx.x ] )
       {
          temp[ threadIdx.x ] += temp[ elementPtr ];
          elementPtr++;
          globalIdx++;
       }
       outVector[ reduceMap[ threadIdx.x ] ] += temp[ threadIdx.x ];
    }
+   printf( "\n<// spmvCuda8 > %d", threadIdx.x );
}

template< typename Real,
@@ -1235,38 +1347,119 @@ void AdEllpack< Real, Device, Index >::spmvCuda16( const InVector& inVector,
                                                   OutVector& outVector,
                                                   const int gridIdx ) const
{
-   IndexType globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+   // TODO:
+   // * Print out the blockID of a thread next to the thread number.
+   // * The issue arises somewhere in the unrolling. So either I missed something in
+   //   the unrolling alignment or idk.
+   printf( "\n< spmvCuda16 > %d (blockID: %d)", threadIdx.x, blockIdx.x );
+   IndexType globalIdx = ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
    IndexType warpIdx = globalIdx >> 5;
    IndexType inWarpIdx = globalIdx & ( this->warpSize - 1 );
    if( globalIdx >= this->reduceMap.getSize() )
       return;
+   __syncthreads();
+   printf( "\nthreadIdx.x = %d\tglobalIdx = %d;\twarpIdx = %d;\tinWarpIdx = %d;\tthis->reduceMap.size() = %d",
+           threadIdx.x, globalIdx, warpIdx, inWarpIdx, this->reduceMap.getSize() );
+   __syncthreads();
    const int blockSize = 128;
    Real* temp = Cuda::getSharedMemory< Real >();
    __shared__ IndexType reduceMap[ blockSize ];
    reduceMap[ threadIdx.x ] = this->reduceMap[ globalIdx ];
    temp[ threadIdx.x ] = 0.0;
    IndexType i = 0;
    IndexType elementPtr = this->offset[ warpIdx ] + inWarpIdx;
-   while( ( this->localLoad[ warpIdx ] & 15 ) != 0 )
-      if( this->columnIndexes[ elementPtr ] < this->getColumns() )
-      {
-         temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ];
-         elementPtr += this->warpSize;
-         i++;
-      }
-   for( ; i < this->localLoad[ warpIdx ]; i += 8 )
-   {
-      #pragma unroll
-      for( IndexType j = 0; j < 8; j++ )
-         if( this->columnIndexes[ elementPtr ] < this->getColumns() )
-         {
-            temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ];
-            elementPtr += this->warpSize;
-         }
-   }
+   __syncthreads();
+   printf( "\nThread: %d check 0", threadIdx.x );
+   __syncthreads();
+   IndexType alignUnroll = this->localLoad[ warpIdx ] & 7;
+   __syncthreads();
+   // If the localLoad of a warp is less than the Unroll factor (8 in this case).
+   // The Unroll cannot be applied to that warp.
+   while( alignUnroll != 0 && alignUnroll != 8 )
+   {
+      printf( "\n[ %d ] Thread: %d\tcheck 0_1\talignUnroll = %d", globalIdx, threadIdx.x, alignUnroll );
+      printf( "\n[ %d ] Thread: %d\tcolumnIndex < columns: %d < %d", globalIdx, threadIdx.x, this->columnIndexes[ elementPtr ], this->getColumns() );
+      if( elementPtr >= this->columnIndexes.getSize() )
+         printf( "\n[ %d ] Thread: %d\t0 FOUND THE FUCKER", globalIdx, threadIdx.x );
+      if( this->columnIndexes[ elementPtr ] < this->getColumns() )
+      {
+         printf( "\n[ %d ] Thread: %d\tcheck 0_2", globalIdx, threadIdx.x );
+         temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ];
+         elementPtr += this->warpSize;
+         i++;
+         if( alignUnroll <= 4 )
+            alignUnroll--;
+         else
+            alignUnroll++;
+      }
+      else
+      {
+         break;
+      }
+      printf( "\n[ %d ] Thread: %d\tcheck 0_3\talignUnroll = %d", globalIdx, threadIdx.x, alignUnroll );
+   }
+   __syncthreads();
+   printf( "\n[ %d ] Thread: %d\tcheck 1_0\twarpIdx = %d\telementPtr = %d\tcolumIndexes.size() = %d", globalIdx, threadIdx.x, warpIdx, elementPtr, this->columnIndexes.getSize() );
+   if( this->localLoad[ warpIdx ] < 8 )
+   {
+      while( i < this->localLoad[ warpIdx ] )
+      {
+         __syncthreads();
+         printf( "\n[ %d ] Thread: %d\tcheck 1_0_1\ti = %d", globalIdx, threadIdx.x, i );
+         if( elementPtr >= this->columnIndexes.getSize() )
+            printf( "\n[ %d ] Thread: %d\t1 FOUND THE FUCKER", globalIdx, threadIdx.x );
+         if( this->columnIndexes[ elementPtr ] < this->getColumns() )
+         {
+            __syncthreads();
+            printf( "\n[ %d ] Thread: %d\tcheck 1_0_2\ti = %d", globalIdx, threadIdx.x, i );
+            temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ];
+            elementPtr += this->warpSize;
+            i++;
+            __syncthreads();
+            printf( "\n[ %d ] Thread: %d\tcheck 1_0_3\ti = %d", globalIdx, threadIdx.x, i );
+         }
+         else
+         {
+            break;
+         }
+         __syncthreads();
+         printf( "\n[ %d ] Thread: %d\tcheck 1_0_4\ti = %d", globalIdx, threadIdx.x, i );
+      }
+   }
+   printf( "\n[ %d ] Thread: %d\tlocalLoad = %d\ti = %d", globalIdx, threadIdx.x, this->localLoad[ warpIdx ], i );
+   __syncthreads();
+   printf( "\n[ %d ] Thread: %d check 1", globalIdx, threadIdx.x );
+   for( ; i < this->localLoad[ warpIdx ]; i += 8 )
+   {
+      printf( "\n[ %d ] Thread: %d check 1_1", globalIdx, threadIdx.x );
+      #pragma unroll
+      for( IndexType j = 0; j < 8; j++ )
+      {
+         if( this->columnIndexes[ elementPtr ] < this->getColumns() )
+         {
+            printf( "\n[ %d ] Thread: %d check 1_2", globalIdx, threadIdx.x );
+            temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ];
+            elementPtr += this->warpSize;
+         }
+      }
+   }
+   __syncthreads();
+   printf( "\n[ %d ] Thread: %d check 2", globalIdx, threadIdx.x );
    if( ( inWarpIdx == 0 ) || ( reduceMap[ threadIdx.x ] > reduceMap[ threadIdx.x - 1 ] ) )
    {
       IndexType elementPtr = threadIdx.x + 1;
@@ -1275,12 +1468,16 @@ void AdEllpack< Real, Device, Index >::spmvCuda16( const InVector& inVector,
              globalIdx < ( ( warpIdx + 1 ) << 5 ) && reduceMap[ elementPtr ] == reduceMap[ threadIdx.x ] )
       {
          temp[ threadIdx.x ] += temp[ elementPtr ];
          elementPtr++;
          globalIdx++;
       }
       outVector[ reduceMap[ threadIdx.x ] ] += temp[ threadIdx.x ];
    }
+   __syncthreads();
+   printf( "\n<// spmvCuda16 > %d (blockID: %d)", threadIdx.x, blockIdx.x );
}

template< typename Real,
@@ -1304,26 +1501,42 @@ void AdEllpack< Real, Device, Index >::spmvCuda32( const InVector& inVector,
    __shared__ IndexType reduceMap[ blockSize ];
    reduceMap[ threadIdx.x ] = this->reduceMap[ globalIdx ];
    temp[ threadIdx.x ] = 0.0;
    IndexType i = 0;
    IndexType elementPtr = this->offset[ warpIdx ] + inWarpIdx;
-   while( ( this->localLoad[ warpIdx ] & 31 ) != 0 )
-      if( this->columnIndexes[ elementPtr ] < this->getColumns() )
-      {
-         temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ];
-         elementPtr += this->warpSize;
-         i++;
-      }
-   for( ; i < this->localLoad[ warpIdx ]; i += 16 )
-   {
-      #pragma unroll
-      for( IndexType j = 0; j < 16; j++ )
-         if( this->columnIndexes[ elementPtr ] < this->getColumns() )
-         {
-            temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ];
-            elementPtr += this->warpSize;
-         }
-   }
+   IndexType alignUnroll = this->localLoad[ warpIdx ] & 15;
+   while( alignUnroll != 0 && alignUnroll != 16 )
+   {
+      if( this->columnIndexes[ elementPtr ] < this->getColumns() )
+      {
+         temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ];
+         elementPtr += this->warpSize;
+         i++;
+         if( alignUnroll <= 8 )
+            alignUnroll--;
+         else
+            alignUnroll++;
+      }
+      else
+      {
+         break;
+      }
+   }
+   for( ; i < this->localLoad[ warpIdx ]; i += 16 )
+   {
+      #pragma unroll
+      for( IndexType j = 0; j < 16; j++ )
+      {
+         if( this->columnIndexes[ elementPtr ] < this->getColumns() )
+         {
+            temp[ threadIdx.x ] += inVector[ this->columnIndexes[ elementPtr ] ] * this->values[ elementPtr ];
+            elementPtr += this->warpSize;
+         }
+      }
+   }
    if( ( inWarpIdx == 0 ) || ( reduceMap[ threadIdx.x ] > reduceMap[ threadIdx.x - 1 ] ) )
    {
       IndexType elementPtr = threadIdx.x + 1;
@@ -1332,9 +1545,9 @@ void AdEllpack< Real, Device, Index >::spmvCuda32( const InVector& inVector,
              globalIdx < ( ( warpIdx + 1 ) << 5 ) && reduceMap[ elementPtr ] == reduceMap[ threadIdx.x ] )
       {
          temp[ threadIdx.x ] += temp[ elementPtr ];
          elementPtr++;
          globalIdx++;
       }
       outVector[ reduceMap[ threadIdx.x ] ] += temp[ threadIdx.x ];
    }
@@ -1424,12 +1637,21 @@ public:
                               const InVector& inVector,
                               OutVector& outVector )
   {
+     std::cout << "matrix.totalLoad = " << matrix.totalLoad << std::endl;
+     std::cout << "matrix.localLoad = " << matrix.localLoad << std::endl;
+     // printf( "\nthis->localLoad.size() = %d", matrix.localLoad.getSize() );
+     // printf( "\tthis->localLoad = " );
+     // for( int j = 0; j < matrix.localLoad.getSize(); j++ )
+     // {
+     //    printf( "%d, ", matrix.localLoad[ j ] );
+     // }
      typedef AdEllpack< Real, Devices::Cuda, Index > Matrix;
      typedef typename Matrix::IndexType IndexType;
-     Matrix* kernel_this = Cuda::passToDevice( matrix );
-     InVector* kernel_inVector = Cuda::passToDevice( inVector );
-     OutVector* kernel_outVector = Cuda::passToDevice( outVector );
-     if( matrix.totalLoad < 2 )
+     Matrix* kernel_this = Devices::Cuda::passToDevice( matrix );
+     InVector* kernel_inVector = Devices::Cuda::passToDevice( inVector );
+     OutVector* kernel_outVector = Devices::Cuda::passToDevice( outVector );
+     TNL_CHECK_CUDA_DEVICE;
+     if( matrix.totalLoad < 2 )   // Doesn't work for RealType int, long??? WORKS NOW FOR SOME REASON?
      {
         dim3 blockSize( 256 ), cudaGridSize( Cuda::getMaxGridSize() );
        IndexType cudaBlocks = roundUpDivision( matrix.reduceMap.getSize(), blockSize.x );
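The host code sizes the launch from the length of reduceMap: one thread per entry, blocks rounded up, and the blocks split into several grids whenever they exceed the maximum grid size, with the last grid truncated. The plain-integer sketch below mirrors that arithmetic with made-up sizes; roundUpDivisionSketch stands in for TNL's roundUpDivision and maxGridSize for Cuda::getMaxGridSize().

#include <cstdio>

// Ceiling division, as the launch setup needs whole blocks and whole grids.
static int roundUpDivisionSketch( int value, int divisor )
{
   return ( value + divisor - 1 ) / divisor;
}

int main()
{
   const int reduceMapSize = 1000000;   // hypothetical matrix.reduceMap.getSize()
   const int blockSize     = 128;
   const int maxGridSize   = 65535;     // hypothetical Cuda::getMaxGridSize()

   const int cudaBlocks = roundUpDivisionSketch( reduceMapSize, blockSize );
   const int cudaGrids  = roundUpDivisionSketch( cudaBlocks, maxGridSize );
   for( int gridIdx = 0; gridIdx < cudaGrids; gridIdx++ )
   {
      int gridSize = maxGridSize;
      if( gridIdx == cudaGrids - 1 )
         gridSize = cudaBlocks % maxGridSize;   // remainder for the last grid
      std::printf( "gridIdx = %d launches %d blocks of %d threads\n",
                   gridIdx, gridSize, blockSize );
   }
   return 0;
}

Note that cudaBlocks % maxGridSize evaluates to 0 when cudaBlocks is an exact multiple of the maximum grid size, which would leave the last grid with zero blocks; that edge case is worth ruling out while chasing the launch problems flagged in the comments above.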
@@ -1452,7 +1674,7 @@ public:
         Cuda::freeFromDevice( kernel_outVector );
         TNL_CHECK_CUDA_DEVICE;
      }
-     else if( matrix.totalLoad < 4 )
+     else if( matrix.totalLoad < 4 )   // WORKS
      {
         dim3 blockSize( 192 ), cudaGridSize( Cuda::getMaxGridSize() );
         IndexType cudaBlocks = roundUpDivision( matrix.reduceMap.getSize(), blockSize.x );
@@ -1475,7 +1697,7 @@ public:
         Cuda::freeFromDevice( kernel_outVector );
         TNL_CHECK_CUDA_DEVICE;
      }
-     else if( matrix.totalLoad < 8 )
+     else if( matrix.totalLoad < 8 )   // Maybe works?
      {
         dim3 blockSize( 128 ), cudaGridSize( Cuda::getMaxGridSize() );
         IndexType cudaBlocks = roundUpDivision( matrix.reduceMap.getSize(), blockSize.x );
@@ -1485,6 +1707,7 @@ public:
            if( gridIdx == cudaGrids - 1 )
               cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize();
            const int sharedMemory = blockSize.x * sizeof( Real );
+           std::cout << "Before kernel" << std::endl;
            AdEllpackVectorProductCuda8< Real, Index, InVector, OutVector >
                                       <<< cudaGridSize, blockSize, sharedMemory >>>
                                       ( kernel_this,
@@ -1492,22 +1715,28 @@ public:
                                         kernel_outVector,
                                         gridIdx );
         }
+        std::cout << "After kernel" << std::endl;
         TNL_CHECK_CUDA_DEVICE;
-        Cuda::freeFromDevice( kernel_this );
-        Cuda::freeFromDevice( kernel_inVector );
-        Cuda::freeFromDevice( kernel_outVector );
+        Devices::Cuda::freeFromDevice( kernel_this );
+        std::cout << "this free" << std::endl;
+        Devices::Cuda::freeFromDevice( kernel_inVector );
+        std::cout << "invector free" << std::endl;
+        Devices::Cuda::freeFromDevice( kernel_outVector );
+        std::cout << "outvector free" << std::endl;
         TNL_CHECK_CUDA_DEVICE;
      }
-     else if( matrix.totalLoad < 16 )
+     else if( matrix.totalLoad < 16 )   // BROKEN
      {
         dim3 blockSize( 128 ), cudaGridSize( Cuda::getMaxGridSize() );
         IndexType cudaBlocks = roundUpDivision( matrix.reduceMap.getSize(), blockSize.x );
-        IndexType cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() );
+        IndexType cudaGrids = roundUpDivision( cudaBlocks, Devices::Cuda::getMaxGridSize() );
+        printf( "gridSize = %d\tcudaBlocks = %d\tcudaGrids = %d\n", cudaGridSize.x, cudaBlocks, cudaGrids );
         for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ )
         {
            if( gridIdx == cudaGrids - 1 )
               cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize();
            const int sharedMemory = blockSize.x * sizeof( Real );
+           printf( "gridSize = %d\tblockSize = %d\tsharedMemory = %d\tgridIdx = %d", cudaGridSize.x, blockSize.x, sharedMemory, gridIdx );
            AdEllpackVectorProductCuda16< Real, Index, InVector, OutVector >
                                        <<< cudaGridSize, blockSize, sharedMemory >>>
                                        ( kernel_this,
@@ -1521,7 +1750,7 @@ public:
         Cuda::freeFromDevice( kernel_outVector );
         TNL_CHECK_CUDA_DEVICE;
      }
-     else
+     else   // BROKEN
      {
         dim3 blockSize( 96 ), cudaGridSize( Cuda::getMaxGridSize() );
         IndexType cudaBlocks = roundUpDivision( matrix.reduceMap.getSize(), blockSize.x );
@@ -1538,10 +1767,10 @@ public:
                                         kernel_outVector,
                                         gridIdx );
         }
-        TNL_CHECK_CUDA_DEVICE;
-        Cuda::freeFromDevice( kernel_this );
-        Cuda::freeFromDevice( kernel_inVector );
-        Cuda::freeFromDevice( kernel_outVector );
+        TNL_CHECK_CUDA_DEVICE;   // FREEZES right here on CHECK CUDA
+        Devices::Cuda::freeFromDevice( kernel_this );
+        Devices::Cuda::freeFromDevice( kernel_inVector );
+        Devices::Cuda::freeFromDevice( kernel_outVector );
         TNL_CHECK_CUDA_DEVICE;
      }
   }
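The note "// FREEZES right here on CHECK CUDA" is typical of an error or hang that only becomes visible once the device is synchronized after a kernel launch. A small checkpoint helper built only from the standard CUDA runtime API, sketched below, can be dropped after each launch and each free to narrow down where the failure really originates; it is independent of TNL's TNL_CHECK_CUDA_DEVICE macro.

#include <cstdio>
#include <cuda_runtime.h>

// Report both launch-time errors and errors raised while the kernel ran.
static void checkCudaHere( const char* where )
{
   cudaError_t err = cudaGetLastError();      // errors from the launch itself
   if( err == cudaSuccess )
      err = cudaDeviceSynchronize();          // errors raised during kernel execution
   if( err != cudaSuccess )
      std::fprintf( stderr, "CUDA error at %s: %s\n", where, cudaGetErrorString( err ) );
}

// usage sketch:
//   AdEllpackVectorProductCuda16<...><<< cudaGridSize, blockSize, sharedMemory >>>( ... );
//   checkCudaHere( "AdEllpackVectorProductCuda16" );

One plausible source of such a hang in the instrumented kernels above is the combination of an early return for out-of-range threads with later __syncthreads() calls: if only part of a block reaches the barrier, the block can deadlock.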