Newer
Older
Jakub Klinkovský
committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
/***************************************************************************
VectorOperations.h - description
-------------------
begin : Nov 8, 2012
copyright : (C) 2012 by Tomas Oberhuber
email : tomas.oberhuber@fjfi.cvut.cz
***************************************************************************/
/* See Copyright Notice in tnl/Copyright */
#pragma once
#include <TNL/Devices/Host.h>
#include <TNL/Devices/Cuda.h>
#include <TNL/ParallelFor.h>
namespace TNL {
namespace Benchmarks {
template< typename Device >
struct VectorOperations;
template<>
struct VectorOperations< Devices::Host >
{
static constexpr int OpenMPVectorOperationsThreshold = 512;
static constexpr int PrefetchDistance = 128;
template< typename Vector1, typename Vector2, typename Scalar1, typename Scalar2 >
static void addVector( Vector1& y,
const Vector2& x,
const Scalar1 alpha,
const Scalar2 thisMultiplicator = 1.0 )
{
typedef typename Vector1::IndexType Index;
TNL_ASSERT_GT( x.getSize(), 0, "Vector size must be positive." );
TNL_ASSERT_EQ( x.getSize(), y.getSize(), "The vector sizes must be the same." );
const Index n = y.getSize();
if( thisMultiplicator == 1.0 )
#ifdef HAVE_OPENMP
#pragma omp parallel for if( TNL::Devices::Host::isOMPEnabled() && n > OpenMPVectorOperationsThreshold )
#endif
for( Index i = 0; i < n; i ++ )
y[ i ] += alpha * x[ i ];
else
#ifdef HAVE_OPENMP
#pragma omp parallel for if( TNL::Devices::Host::isOMPEnabled() && n > OpenMPVectorOperationsThreshold )
#endif
for( Index i = 0; i < n; i ++ )
y[ i ] = thisMultiplicator * y[ i ] + alpha * x[ i ];
}
template< typename Vector1, typename Vector2, typename Vector3, typename Scalar1, typename Scalar2, typename Scalar3 >
static void addVectors( Vector1& v,
const Vector2& v1,
const Scalar1 multiplicator1,
const Vector3& v2,
const Scalar2 multiplicator2,
const Scalar3 thisMultiplicator = 1.0 )
{
typedef typename Vector1::IndexType Index;
TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." );
TNL_ASSERT_EQ( v.getSize(), v1.getSize(), "The vector sizes must be the same." );
TNL_ASSERT_EQ( v.getSize(), v2.getSize(), "The vector sizes must be the same." );
const Index n = v.getSize();
if( thisMultiplicator == 1.0 )
#ifdef HAVE_OPENMP
#pragma omp parallel for if( TNL::Devices::Host::isOMPEnabled() && n > OpenMPVectorOperationsThreshold )
#endif
for( Index i = 0; i < n; i ++ )
v[ i ] += multiplicator1 * v1[ i ] + multiplicator2 * v2[ i ];
else
#ifdef HAVE_OPENMP
#pragma omp parallel for if( TNL::Devices::Host::isOMPEnabled() && n > OpenMPVectorOperationsThreshold )
#endif
for( Index i = 0; i < n; i ++ )
v[ i ] = thisMultiplicator * v[ i ] + multiplicator1 * v1[ i ] + multiplicator2 * v2[ i ];
}
};
template<>
struct VectorOperations< Devices::Cuda >
{
template< typename Vector1, typename Vector2, typename Scalar1, typename Scalar2 >
static void addVector( Vector1& _y,
const Vector2& _x,
const Scalar1 alpha,
const Scalar2 thisMultiplicator = 1.0 )
{
TNL_ASSERT_GT( _x.getSize(), 0, "Vector size must be positive." );
TNL_ASSERT_EQ( _x.getSize(), _y.getSize(), "The vector sizes must be the same." );
using IndexType = typename Vector1::IndexType;
using RealType = typename Vector1::RealType;
RealType* y = _y.getData();
const RealType* x = _x.getData();
auto add1 = [=] __cuda_callable__ ( IndexType i ) { y[ i ] += alpha * x[ i ]; };
auto add2 = [=] __cuda_callable__ ( IndexType i ) { y[ i ] = thisMultiplicator * y[ i ] + alpha * x[ i ]; };
if( thisMultiplicator == 1.0 )
ParallelFor< Devices::Cuda >::exec( (IndexType) 0, _y.getSize(), add1 );
else
ParallelFor< Devices::Cuda >::exec( (IndexType) 0, _y.getSize(), add2 );
}
template< typename Vector1, typename Vector2, typename Vector3, typename Scalar1, typename Scalar2, typename Scalar3 >
static void addVectors( Vector1& _v,
const Vector2& _v1,
const Scalar1 multiplicator1,
const Vector3& _v2,
const Scalar2 multiplicator2,
const Scalar3 thisMultiplicator = 1.0 )
{
TNL_ASSERT_GT( _v.getSize(), 0, "Vector size must be positive." );
TNL_ASSERT_EQ( _v.getSize(), _v1.getSize(), "The vector sizes must be the same." );
TNL_ASSERT_EQ( _v.getSize(), _v2.getSize(), "The vector sizes must be the same." );
using IndexType = typename Vector1::IndexType;
using RealType = typename Vector1::RealType;
RealType* v = _v.getData();
const RealType* v1 = _v1.getData();
const RealType* v2 = _v2.getData();
auto add1 = [=] __cuda_callable__ ( IndexType i ) { v[ i ] += multiplicator1 * v1[ i ] + multiplicator2 * v2[ i ]; };
auto add2 = [=] __cuda_callable__ ( IndexType i ) { v[ i ] = thisMultiplicator * v[ i ] + multiplicator1 * v1[ i ] + multiplicator2 * v2[ i ]; };
if( thisMultiplicator == 1.0 )
ParallelFor< Devices::Cuda >::exec( (IndexType) 0, _v.getSize(), add1 );
else
ParallelFor< Devices::Cuda >::exec( (IndexType) 0, _v.getSize(), add2 );
}
};
} // namespace Benchmarks
} // namespace TNL