TNL / tnl-dev

Commit 44d1be85, authored 6 years ago by Jakub Klinkovský
Distributed GMRES
parent 67a99cb6

Showing 3 changed files with 163 additions and 151 deletions:
  src/TNL/Solvers/Linear/GMRES.h       +48 −27
  src/TNL/Solvers/Linear/GMRES_impl.h  +104 −124
  src/TNL/Solvers/Linear/Traits.h      +11 −0
src/TNL/Solvers/Linear/GMRES.h  (+48, −27)

@@ -14,8 +14,6 @@
 #include "LinearSolver.h"
-#include <TNL/Containers/Vector.h>

 namespace TNL {
 namespace Solvers {
 namespace Linear {
@@ -25,12 +23,19 @@ class GMRES
 : public LinearSolver< Matrix >
 {
    using Base = LinearSolver< Matrix >;

+   // compatibility shortcuts
+   using Traits = Linear::Traits< Matrix >;
+   using CommunicatorType = typename Traits::CommunicatorType;
+
 public:
    using RealType = typename Base::RealType;
    using DeviceType = typename Base::DeviceType;
    using IndexType = typename Base::IndexType;
+   // distributed vectors/views
    using VectorViewType = typename Base::VectorViewType;
    using ConstVectorViewType = typename Base::ConstVectorViewType;
+   using VectorType = typename Traits::VectorType;

    String getType() const;
@@ -43,11 +48,12 @@ public:
    bool solve( ConstVectorViewType b, VectorViewType x ) override;

 protected:
-   using ConstDeviceView = Containers::VectorView< const RealType, DeviceType, IndexType >;
-   using DeviceView = Containers::VectorView< RealType, DeviceType, IndexType >;
-   using HostView = Containers::VectorView< RealType, Devices::Host, IndexType >;
-   using DeviceVector = Containers::Vector< RealType, DeviceType, IndexType >;
-   using HostVector = Containers::Vector< RealType, Devices::Host, IndexType >;
+   // local vectors/views
+   using ConstDeviceView = typename Traits::ConstLocalVectorViewType;
+   using DeviceView = typename Traits::LocalVectorViewType;
+   using HostView = typename DeviceView::HostType;
+   using DeviceVector = typename Traits::LocalVectorType;
+   using HostVector = typename DeviceVector::HostType;

    enum class Variant { MGS, MGSR, CWY };
@@ -57,36 +63,37 @@ protected:
    void compute_residue( VectorViewType r, ConstVectorViewType x, ConstVectorViewType b );

-   void preconditioned_matvec( DeviceVector& w, ConstDeviceView v );
+   void preconditioned_matvec( VectorViewType w, ConstVectorViewType v );

-   void hauseholder_generate( DeviceVector& Y,
-                              HostVector& T,
-                              const int i,
-                              DeviceVector& w );
+// nvcc allows __cuda_callable__ lambdas only in public methods
+#ifdef __NVCC__
+public:
+#endif
+   void hauseholder_generate( const int i,
+                              VectorViewType y_i,
+                              ConstVectorViewType z );
+#ifdef __NVCC__
+protected:
+#endif

    void hauseholder_apply_trunc( HostView out,
-                                 DeviceVector& Y,
-                                 HostVector& T,
                                  const int i,
-                                 DeviceVector& w );
+                                 VectorViewType y_i,
+                                 ConstVectorViewType z );

-   void hauseholder_cwy( DeviceVector& w,
-                         DeviceVector& Y,
-                         HostVector& T,
+   void hauseholder_cwy( VectorViewType v,
                          const int i );

-   void hauseholder_cwy_transposed( DeviceVector& w,
-                                    DeviceVector& Y,
-                                    HostVector& T,
+   void hauseholder_cwy_transposed( VectorViewType z,
                                     const int i,
-                                    DeviceVector& z );
+                                    ConstVectorViewType w );

    template< typename Vector >
    void update( const int k,
                 const int m,
                 const HostVector& H,
                 const HostVector& s,
-                DeviceVector& v,
+                DeviceVector& V,
                 Vector& x );

    void generatePlaneRotation( RealType& dx,
@@ -101,15 +108,28 @@ protected:
    void apply_givens_rotations( const int i, const int m );

-   void setSize( const IndexType size );
+   void setSize( const VectorViewType& x );
+
+   // Specialized methods to distinguish between normal and distributed matrices
+   // in the implementation.
+   template< typename M >
+   static IndexType getLocalOffset( const M& m )
+   {
+      return 0;
+   }
+
+   template< typename M >
+   static IndexType getLocalOffset( const DistributedContainers::DistributedMatrix< M >& m )
+   {
+      return m.getLocalRowRange().getBegin();
+   }

    // selected GMRES variant
    Variant variant = Variant::CWY;

-   // single vectors
-   DeviceVector r, w, z, _M_tmp;
+   // single vectors (distributed)
+   VectorType r, w, z, _M_tmp;

-   // matrices (in column-major format)
+   // matrices (in column-major format) (local)
    DeviceVector V, Y;
    // (CWY only) duplicate of the upper (m+1)x(m+1) submatrix of Y (it is lower triangular) for fast access
    HostVector YL, T;

@@ -118,6 +138,7 @@ protected:
    IndexType size = 0;
    IndexType ldSize = 0;
+   IndexType localOffset = 0;

    int restarting_min = 10;
    int restarting_max = 10;
    int restarting_step_min = 3;
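The two getLocalOffset overloads above resolve at compile time: the generic template matches an ordinary matrix and reports offset 0, while the overload taking DistributedContainers::DistributedMatrix reports the first global row owned by the calling rank. A minimal, self-contained sketch of this dispatch pattern, using hypothetical stand-in types rather than the actual TNL containers:

   #include <cstddef>
   #include <iostream>

   // Hypothetical stand-ins, not the TNL types.
   struct DenseMatrix
   {
      std::size_t rows;
   };

   template< typename LocalMatrix >
   struct Distributed
   {
      LocalMatrix localMatrix;
      std::size_t localRowBegin;   // first global row owned by this rank
   };

   // Generic overload: a non-distributed matrix always starts at global row 0.
   template< typename M >
   std::size_t getLocalOffset( const M& )
   {
      return 0;
   }

   // Overload for the distributed wrapper: report where the local block begins.
   template< typename M >
   std::size_t getLocalOffset( const Distributed< M >& m )
   {
      return m.localRowBegin;
   }

   int main()
   {
      DenseMatrix a{ 8 };
      Distributed< DenseMatrix > b{ DenseMatrix{ 8 }, 128 };
      std::cout << getLocalOffset( a ) << '\n';   // prints 0
      std::cout << getLocalOffset( b ) << '\n';   // prints 128
   }

Overload resolution prefers the more specialized wrapper overload whenever the distributed type is passed, so no runtime branching is needed.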
src/TNL/Solvers/Linear/GMRES_impl.h  (+104, −124)

@@ -91,7 +91,7 @@ solve( ConstVectorViewType b, VectorViewType x )
                 << ", d_max = " << restarting_step_max << std::endl;
       return false;
    }
-   setSize( this->matrix->getRows() );
+   setSize( x );

    // initialize the norm of the preconditioned right-hand-side
    RealType normb;
@@ -183,13 +183,15 @@ int
 GMRES< Matrix >::
 orthogonalize_MGS( const int m, const RealType normb, const RealType beta )
 {
-   DeviceView vi, vk;
+   // initial binding to _M_tmp sets the correct local range, global size and
+   // communication group for distributed views
+   VectorViewType v_i( _M_tmp ), v_k( _M_tmp );

    /***
     * v_0 = r / | r | = 1.0 / beta * r
     */
-   vi.bind( V.getData(), ldSize );
-   vi.addVector( r, 1.0 / beta, 0.0 );
+   v_i.bind( V.getData(), size );
+   v_i.addVector( r, 1.0 / beta, 0.0 );

    H.setValue( 0.0 );
    s.setValue( 0.0 );
@@ -199,28 +201,28 @@ orthogonalize_MGS( const int m, const RealType normb, const RealType beta )
     * Starting m-loop
     */
    for( int i = 0; i < m && this->nextIteration(); i++ ) {
-      vi.bind( &( V.getData()[ i * ldSize ] ), size );
+      v_i.bind( &V.getData()[ i * ldSize ], size );

       /****
        * Solve w from M w = A v_i
        */
-      preconditioned_matvec( w, vi );
+      preconditioned_matvec( w, v_i );

       for( int k = 0; k <= i; k++ )
          H[ k + i * (m + 1) ] = 0.0;
       const int reorthogonalize = ( variant == Variant::MGSR ) ? 2 : 1;
       for( int l = 0; l < reorthogonalize; l++ )
          for( int k = 0; k <= i; k++ ) {
-            vk.bind( &( V.getData()[ k * ldSize ] ), size );
+            v_k.bind( &V.getData()[ k * ldSize ], size );
             /***
              * H_{k,i} = ( w, v_k )
              */
-            RealType H_k_i = w.scalarProduct( vk );
+            RealType H_k_i = w.scalarProduct( v_k );
             H[ k + i * (m + 1) ] += H_k_i;

             /****
              * w = w - H_{k,i} v_k
              */
-            w.addVector( vk, -H_k_i );
+            w.addVector( v_k, -H_k_i );
          }

       /***
        * H_{i+1,i} = |w|
@@ -231,8 +233,8 @@ orthogonalize_MGS( const int m, const RealType normb, const RealType beta )
       /***
        * v_{i+1} = w / |w|
        */
-      vi.bind( &( V.getData()[ (i + 1) * ldSize ] ), size );
-      vi.addVector( w, 1.0 / normw, 0.0 );
+      v_i.bind( &V.getData()[ (i + 1) * ldSize ], size );
+      v_i.addVector( w, 1.0 / normw, 0.0 );

       /****
        * Applying the Givens rotations G_0, ..., G_i
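For reference, the loop above implements the standard modified Gram–Schmidt recurrence (written here in its textbook form, not copied from the source):

$$
h_{k,i} = \langle w, v_k \rangle, \qquad w \leftarrow w - h_{k,i}\, v_k \quad (k = 0, \dots, i), \qquad
h_{i+1,i} = \lVert w \rVert, \qquad v_{i+1} = \frac{w}{\lVert w \rVert},
$$

where $w = M^{-1} A v_i$ is the preconditioned matrix-vector product and the MGSR variant simply runs the projection loop twice (reorthogonalization).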
@@ -254,7 +256,9 @@ int
 GMRES< Matrix >::
 orthogonalize_CWY( const int m, const RealType normb, const RealType beta )
 {
-   DeviceVector vi, vk;
+   // initial binding to _M_tmp sets the correct local range, global size and
+   // communication group for distributed views
+   VectorViewType v_i( _M_tmp ), y_i( _M_tmp );

    /***
     * z = r / | r | = 1.0 / beta * r
@@ -278,40 +282,41 @@ orthogonalize_CWY( const int m, const RealType normb, const RealType beta )
       /****
        * Generate new Hauseholder transformation from vector z.
        */
-      hauseholder_generate( Y, T, i, z );
+      y_i.bind( &Y.getData()[ i * ldSize ], size );
+      hauseholder_generate( i, y_i, z );

       if( i == 0 ) {
          /****
           * s = e_1^T * P_i * z
           */
-         hauseholder_apply_trunc( s, Y, T, i, z );
+         hauseholder_apply_trunc( s, i, y_i, z );
       }
       else {
          /***
           * H_{i-1} = P_i * z
           */
          HostView h( &H.getData()[ (i - 1) * (m + 1) ], m + 1 );
-         hauseholder_apply_trunc( h, Y, T, i, z );
+         hauseholder_apply_trunc( h, i, y_i, z );
       }

       /***
        * Generate new basis vector v_i, using the compact WY representation:
        * v_i = (I - Y_i * T_i Y_i^T) * e_i
        */
-      vi.bind( &V.getData()[ i * ldSize ], size );
-      hauseholder_cwy( vi, Y, T, i );
+      v_i.bind( &V.getData()[ i * ldSize ], size );
+      hauseholder_cwy( v_i, i );

       if( i < m ) {
          /****
           * Solve w from M w = A v_i
           */
-         preconditioned_matvec( w, vi );
+         preconditioned_matvec( w, v_i );

          /****
           * Apply all previous Hauseholder transformations, using the compact WY representation:
           * z = (I - Y_i * T_i^T * Y_i^T) * w
           */
-         hauseholder_cwy_transposed( z, Y, T, i, w );
+         hauseholder_cwy_transposed( z, i, w );
       }

       /****
@@ -321,7 +326,7 @@ orthogonalize_CWY( const int m, const RealType normb, const RealType beta )
       apply_givens_rotations( i - 1, m );

       this->setResidue( std::fabs( s[ i ] ) / normb );
-      if( ! this->checkNextIteration() )
+      if( i > 0 && ! this->checkNextIteration() )
          return i - 1;
       else
          this->refreshSolverMonitor();
@@ -352,7 +357,7 @@ compute_residue( VectorViewType r, ConstVectorViewType x, ConstVectorViewType b
 template< typename Matrix >
 void
 GMRES< Matrix >::
-preconditioned_matvec( DeviceVector& w, ConstDeviceView v )
+preconditioned_matvec( VectorViewType w, ConstVectorViewType v )
 {
    /****
     * w = M.solve(A * v_i);
@@ -365,76 +370,47 @@ preconditioned_matvec( DeviceVector& w, ConstDeviceView v )
    this->matrix->vectorProduct( v, w );
 }

-#ifdef HAVE_CUDA
-template< typename DestinationElement,
-          typename SourceElement,
-          typename Index >
-__global__ void
-copyTruncatedVectorKernel( DestinationElement* destination,
-                           const SourceElement* source,
-                           const Index from,
-                           const Index size )
-{
-   Index elementIdx = blockIdx.x * blockDim.x + threadIdx.x;
-   const Index gridSize = blockDim.x * gridDim.x;
-
-   while( elementIdx < from ) {
-      destination[ elementIdx ] = (DestinationElement) 0.0;
-      elementIdx += gridSize;
-   }
-   while( elementIdx < size ) {
-      destination[ elementIdx ] = source[ elementIdx ];
-      elementIdx += gridSize;
-   }
-}
-#endif
-
 template< typename Matrix >
 void
 GMRES< Matrix >::
-hauseholder_generate( DeviceVector& Y,
-                      HostVector& T,
-                      const int i,
-                      DeviceVector& z )
+hauseholder_generate( const int i,
+                      VectorViewType y_i,
+                      ConstVectorViewType z )
 {
-   DeviceView y_i( &Y.getData()[ i * ldSize ], size );
-
    // XXX: the upper-right triangle of Y will be full of zeros, which can be exploited for optimization
-   if( std::is_same< DeviceType, Devices::Host >::value ) {
-      for( IndexType j = 0; j < size; j++ ) {
-         if( j < i )
-            y_i[ j ] = 0.0;
-         else
-            y_i[ j ] = z[ j ];
-      }
-   }
-   if( std::is_same< DeviceType, Devices::Cuda >::value ) {
-#ifdef HAVE_CUDA
-      dim3 blockSize( 256 );
-      dim3 gridSize;
-      gridSize.x = min( Devices::Cuda::getMaxGridSize(), Devices::Cuda::getNumberOfBlocks( size, blockSize.x ) );
-
-      copyTruncatedVectorKernel<<< gridSize, blockSize >>>( y_i.getData(), z.getData(), i, size );
-      TNL_CHECK_CUDA_DEVICE;
-#else
-      throw Exceptions::CudaSupportMissing();
-#endif
+   if( localOffset == 0 ) {
+      TNL_ASSERT_LT( i, size, "upper-right triangle of Y is not on rank 0" );
+      auto kernel_truncation = [=] __cuda_callable__ ( IndexType j ) mutable
+      {
+         if( j < i )
+            y_i[ j ] = 0.0;
+         else
+            y_i[ j ] = z[ j ];
+      };
+      ParallelFor< DeviceType >::exec( (IndexType) 0, size, kernel_truncation );
+   }
+   else {
+      ConstDeviceView z_local = Traits::getLocalVectorView( z );
+      DeviceView y_i_local = Traits::getLocalVectorView( y_i );
+      y_i_local = z_local;
    }

    // norm of the TRUNCATED vector z
    const RealType normz = y_i.lpNorm( 2.0 );
-   const RealType y_ii = y_i.getElement( i );
-   if( y_ii > 0.0 )
-      y_i.setElement( i, y_ii + normz );
-   else
-      y_i.setElement( i, y_ii - normz );
-
-   // compute the norm of the y_i vector; equivalent to this calculation by definition:
-   // const RealType norm_yi = y_i.lpNorm( 2.0 );
-   const RealType norm_yi = std::sqrt( 2 * (normz * normz + std::fabs( y_ii ) * normz) );
+   RealType norm_yi = 0;
+   if( localOffset == 0 ) {
+      const RealType y_ii = y_i.getElement( i );
+      if( y_ii > 0.0 )
+         y_i.setElement( i, y_ii + normz );
+      else
+         y_i.setElement( i, y_ii - normz );
+
+      // compute the norm of the y_i vector; equivalent to this calculation by definition:
+      // const RealType norm_yi = y_i.lpNorm( 2.0 );
+      norm_yi = std::sqrt( 2 * (normz * normz + std::fabs( y_ii ) * normz) );
+   }
+   // no-op if the problem is not distributed
+   CommunicatorType::Bcast( &norm_yi, 1, 0, Traits::getCommunicationGroup( *this->matrix ) );

    // XXX: normalization is slower, but more stable
    // y_i *= 1.0 / norm_yi;
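The closed-form expression for norm_yi computed on rank 0 follows directly from how the Householder vector is built. Writing $\nu = \lVert z_{i:} \rVert$ for the norm of the truncated vector (normz) and $\zeta$ for its $i$-th entry (the y_ii read before setElement), the update $y_i[i] \leftarrow \zeta + \operatorname{sign}(\zeta)\,\nu$ gives

$$
\lVert y_i \rVert^2 = \nu^2 - \zeta^2 + \bigl( \zeta + \operatorname{sign}(\zeta)\,\nu \bigr)^2
 = 2\,\bigl( \nu^2 + |\zeta|\,\nu \bigr),
$$

which is exactly std::sqrt( 2 * (normz*normz + std::fabs( y_ii )*normz) ); broadcasting this one scalar from rank 0 avoids a second global reduction in the distributed case.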
@@ -453,8 +429,10 @@ hauseholder_generate( DeviceVector& Y,
                                          size, Y.getData(), ldSize,
-                                         y_i.getData(),
+                                         Traits::getLocalVectorView( y_i ).getData(),
                                          aux );
+   // no-op if the problem is not distributed
+   CommunicatorType::Allreduce( aux, i, MPI_SUM, Traits::getCommunicationGroup( *this->matrix ) );

    // [T_i]_{0..i-1} = - T_{i-1} * t_i * aux
    for( int k = 0; k < i; k++ ) {
@@ -469,49 +447,49 @@ template< typename Matrix >
 void
 GMRES< Matrix >::
 hauseholder_apply_trunc( HostView out,
-                         DeviceVector& Y,
-                         HostVector& T,
                          const int i,
-                         DeviceVector& z )
+                         VectorViewType y_i,
+                         ConstVectorViewType z )
 {
-   DeviceView y_i( &Y.getData()[ i * ldSize ], size );
+   // copy part of y_i to the YL buffer
+   // The upper (m+1)x(m+1) submatrix of Y is duplicated in the YL buffer,
+   // which resides on host and is broadcasted from rank 0 to all processes.
+   HostView YL_i( &YL[ i * (restarting_max + 1) ], restarting_max + 1 );
+   Containers::Algorithms::ArrayOperations< Devices::Host, DeviceType >::copyMemory( YL_i.getData(), Traits::getLocalVectorView( y_i ).getData(), YL_i.getSize() );
+   // no-op if the problem is not distributed
+   CommunicatorType::Bcast( YL_i.getData(), YL_i.getSize(), 0, Traits::getCommunicationGroup( *this->matrix ) );

    // TODO: is aux always 1?
    const RealType aux = T[ i + i * (restarting_max + 1) ] * y_i.scalarProduct( z );
-   if( std::is_same< DeviceType, Devices::Host >::value ) {
-      for( int k = 0; k <= i; k++ )
-         out[ k ] = z[ k ] - y_i[ k ] * aux;
-   }
-   if( std::is_same< DeviceType, Devices::Cuda >::value ) {
-      // copy part of y_i to buffer on host
-      // here we duplicate the upper (m+1)x(m+1) submatrix of Y on host for fast access
-      RealType* host_yi = &YL[ i * (restarting_max + 1) ];
-      RealType host_z[ i + 1 ];
-      Containers::Algorithms::ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory< RealType, RealType, IndexType >( host_yi, y_i.getData(), restarting_max + 1 );
-      Containers::Algorithms::ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory< RealType, RealType, IndexType >( host_z, z.getData(), i + 1 );
-      for( int k = 0; k <= i; k++ )
-         out[ k ] = host_z[ k ] - host_yi[ k ] * aux;
+   if( localOffset == 0 ) {
+      if( std::is_same< DeviceType, Devices::Host >::value ) {
+         for( int k = 0; k <= i; k++ )
+            out[ k ] = z[ k ] - y_i[ k ] * aux;
+      }
+      if( std::is_same< DeviceType, Devices::Cuda >::value ) {
+         RealType host_z[ i + 1 ];
+         Containers::Algorithms::ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory( host_z, Traits::getLocalVectorView( z ).getData(), i + 1 );
+         for( int k = 0; k <= i; k++ )
+            out[ k ] = host_z[ k ] - YL_i[ k ] * aux;
+      }
    }
+   // no-op if the problem is not distributed
+   CommunicatorType::Bcast( out.getData(), i + 1, 0, Traits::getCommunicationGroup( *this->matrix ) );
 }

 template< typename Matrix >
 void
 GMRES< Matrix >::
-hauseholder_cwy( DeviceVector& v,
-                 DeviceVector& Y,
-                 HostVector& T,
+hauseholder_cwy( VectorViewType v,
                  const int i )
 {
    // aux = Y_i^T * e_i
    RealType aux[ i + 1 ];
-   if( std::is_same< DeviceType, Devices::Host >::value ) {
-      for( int k = 0; k <= i; k++ )
-         aux[ k ] = Y[ i + k * ldSize ];
-   }
-   if( std::is_same< DeviceType, Devices::Cuda >::value ) {
-      // the upper (m+1)x(m+1) submatrix of Y is duplicated on host for fast access
-      for( int k = 0; k <= i; k++ )
-         aux[ k ] = YL[ i + k * (restarting_max + 1) ];
-   }
+   // the upper (m+1)x(m+1) submatrix of Y is duplicated on host
+   // (faster access than from the device and it is broadcasted to all processes)
+   for( int k = 0; k <= i; k++ )
+      aux[ k ] = YL[ i + k * (restarting_max + 1) ];

    // aux = T_i * aux
    // Note that T_i is upper triangular, so we can overwrite the aux vector with the result in place
@@ -525,18 +503,17 @@ hauseholder_cwy( DeviceVector& v,
    // v = e_i - Y_i * aux
    MatrixOperations< DeviceType >::gemv( size, i + 1,
                                          -1.0, Y.getData(), ldSize, aux,
-                                         0.0, v.getData() );
-   v.setElement( i, 1.0 + v.getElement( i ) );
+                                         0.0, Traits::getLocalVectorView( v ).getData() );
+   if( localOffset == 0 )
+      v.setElement( i, 1.0 + v.getElement( i ) );
 }

 template< typename Matrix >
 void
 GMRES< Matrix >::
-hauseholder_cwy_transposed( DeviceVector& z,
-                            DeviceVector& Y,
-                            HostVector& T,
+hauseholder_cwy_transposed( VectorViewType z,
                             const int i,
-                            DeviceVector& w )
+                            ConstVectorViewType w )
 {
    // aux = Y_i^T * w
    RealType aux[ i + 1 ];
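Both routines rely on the compact WY representation of a product of Householder reflectors (standard form; the notation matches the comments in the diff):

$$
P_0 P_1 \cdots P_i \;=\; I - Y_i\, T_i\, Y_i^T ,
$$

so hauseholder_cwy evaluates $v_i = (I - Y_i T_i Y_i^T)\, e_i$ in three steps, $a = Y_i^T e_i$, $a \leftarrow T_i a$, $v_i = e_i - Y_i a$, and hauseholder_cwy_transposed applies the transposed operator $z = (I - Y_i T_i^T Y_i^T)\, w$ the same way.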
@@ -547,8 +524,10 @@ hauseholder_cwy_transposed( DeviceVector& z,
                                          size, Y.getData(), ldSize,
-                                         w.getData(),
+                                         Traits::getLocalVectorView( w ).getData(),
                                          aux );
+   // no-op if the problem is not distributed
+   Traits::CommunicatorType::Allreduce( aux, i + 1, MPI_SUM, Traits::getCommunicationGroup( *this->matrix ) );

    // aux = T_i^T * aux
    // Note that T_i^T is lower triangular, so we can overwrite the aux vector with the result in place
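This Allreduce is where the distributed variant departs from the sequential one: with the basis vectors split row-wise across ranks, the small dense product is assembled from per-rank contributions,

$$
Y_i^T w \;=\; \sum_{p} \bigl( Y_i^{(p)} \bigr)^{T} w^{(p)} ,
$$

so each rank runs the local gemv and the MPI_SUM reduction produces the global result on every rank; with the NoDistrCommunicator the call is a no-op, as the comment notes.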
@@ -563,7 +542,7 @@ hauseholder_cwy_transposed( DeviceVector& z,
    z = w;
    MatrixOperations< DeviceType >::gemv( size, i + 1,
                                          -1.0, Y.getData(), ldSize, aux,
-                                         1.0, z.getData() );
+                                         1.0, Traits::getLocalVectorView( z ).getData() );
 }

 template< typename Matrix >
@@ -574,7 +553,7 @@ update( const int k,
         const int m,
         const HostVector& H,
         const HostVector& s,
-        DeviceVector& v,
+        DeviceVector& V,
         Vector& x )
 {
    RealType y[ m + 1 ];
@@ -602,8 +581,8 @@ update( const int k,
    // x = V * y + x
    MatrixOperations< DeviceType >::gemv( size, k + 1,
-                                         1.0, v.getData(), ldSize, y,
-                                         1.0, x.getData() );
+                                         1.0, V.getData(), ldSize, y,
+                                         1.0, Traits::getLocalVectorView( x ).getData() );
 }

 template< typename Matrix >
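For context, this is the standard GMRES solution update (textbook form, not taken from the source): after the Givens rotations have reduced the Hessenberg matrix to an upper triangular $R_k$ and transformed the right-hand side into $s$, the earlier part of update(), unchanged by this commit, obtains $y$ from $R_k\, y = s_{0:k}$ by back substitution, and the gemv above forms

$$
x \;\leftarrow\; x + V_k\, y ,
$$

applied to the local rows of $x$ in the distributed case.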
@@ -673,29 +652,30 @@ apply_givens_rotations( int i, int m )
 template< typename Matrix >
 void
 GMRES< Matrix >::
-setSize( const IndexType size )
+setSize( const VectorViewType& x )
 {
-   this->size = size;
+   this->size = Traits::getLocalVectorView( x ).getSize();
    if( std::is_same< DeviceType, Devices::Cuda >::value )
       // align each column to 256 bytes - optimal for CUDA
       ldSize = roundToMultiple( size, 256 / sizeof( RealType ) );
    else
       // on the host, we add 1 to disrupt the cache false-sharing pattern
      ldSize = roundToMultiple( size, 256 / sizeof( RealType ) ) + 1;
+   localOffset = getLocalOffset( *this->matrix );

    const int m = restarting_max;
-   r.setSize( size );
-   w.setSize( size );
+   r.setLike( x );
+   w.setLike( x );
+   _M_tmp.setLike( x );
    V.setSize( ldSize * (m + 1) );
    cs.setSize( m + 1 );
    sn.setSize( m + 1 );
    H.setSize( (m + 1) * m );
    s.setSize( m + 1 );
-   _M_tmp.setSize( size );

    // CWY-specific storage
    if( variant == Variant::CWY ) {
-      z.setSize( size );
+      z.setLike( x );
       Y.setSize( ldSize * (m + 1) );
       T.setSize( (m + 1) * (m + 1) );
       YL.setSize( (m + 1) * (m + 1) );
src/TNL/Solvers/Linear/Traits.h  (+11, −0)

@@ -12,7 +12,10 @@
 #pragma once

+#include <TNL/Communicators/NoDistrCommunicator.h>
 #include <TNL/Containers/Vector.h>
 #include <TNL/Containers/VectorView.h>
+#include <TNL/DistributedContainers/DistributedVector.h>
+#include <TNL/DistributedContainers/DistributedVectorView.h>
 #include <TNL/DistributedContainers/DistributedMatrix.h>
@@ -23,6 +26,8 @@ namespace Linear {
 template< typename Matrix >
 struct Traits
 {
+   using CommunicatorType = Communicators::NoDistrCommunicator;
+
    using VectorType = Containers::Vector
            < typename Matrix::RealType,
              typename Matrix::DeviceType,
@@ -45,11 +50,15 @@ struct Traits
    static const Matrix& getLocalMatrix( const Matrix& m )
    { return m; }

    static ConstLocalVectorViewType getLocalVectorView( ConstVectorViewType v )
    { return v; }

    static LocalVectorViewType getLocalVectorView( VectorViewType v )
    { return v; }

+   static typename CommunicatorType::CommunicationGroup getCommunicationGroup( const Matrix& m )
+   { return CommunicatorType::AllGroup; }
 };

 template< typename Matrix, typename Communicator >
 struct Traits< DistributedContainers::DistributedMatrix< Matrix, Communicator > >
 {
+   using CommunicatorType = Communicator;
+
    using VectorType = DistributedContainers::DistributedVector
            < typename Matrix::RealType,
              typename Matrix::DeviceType,
@@ -84,6 +93,8 @@ struct Traits< DistributedContainers::DistributedMatrix< Matrix, Communicator >
    { return m.getLocalMatrix(); }

    static ConstLocalVectorViewType getLocalVectorView( ConstVectorViewType v )
    { return v.getLocalVectorView(); }

    static LocalVectorViewType getLocalVectorView( VectorViewType v )
    { return v.getLocalVectorView(); }

+   static typename CommunicatorType::CommunicationGroup getCommunicationGroup( const DistributedContainers::DistributedMatrix< Matrix, Communicator >& m )
+   { return m.getCommunicationGroup(); }
 };

 } // namespace Linear
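A compressed, standalone illustration of the pattern these additions extend (hypothetical types, not the TNL API): a primary Traits template treats a plain vector as its own local view, while the specialization for a distributed wrapper unwraps the local part, so solver code can call Traits::getLocalVectorView( v ) without knowing which case it is handling:

   #include <cstddef>
   #include <iostream>
   #include <vector>

   // Hypothetical distributed vector: a local chunk plus its global offset (not the TNL type).
   template< typename T >
   struct DistributedVector
   {
      std::vector< T > localData;
      std::size_t globalOffset;
   };

   // Primary template: a plain vector is already "local".
   template< typename V >
   struct Traits
   {
      static V& getLocalVectorView( V& v ) { return v; }
   };

   // Specialization: unwrap the local part of the distributed wrapper.
   template< typename T >
   struct Traits< DistributedVector< T > >
   {
      static std::vector< T >& getLocalVectorView( DistributedVector< T >& v ) { return v.localData; }
   };

   // Generic "solver" code that compiles unchanged for both cases.
   template< typename V >
   std::size_t localSize( V& v )
   {
      return Traits< V >::getLocalVectorView( v ).size();
   }

   int main()
   {
      std::vector< double > x( 10 );
      DistributedVector< double > y{ std::vector< double >( 4 ), 100 };
      std::cout << localSize( x ) << ' ' << localSize( y ) << '\n';   // prints "10 4"
   }

The CommunicatorType and getCommunicationGroup additions in this commit play the same role for the MPI side: the primary template supplies the no-op NoDistrCommunicator, while the specialization forwards to the distributed matrix.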