Skip to content
Snippets Groups Projects
Commit aa5452fe authored by Jakub Klinkovský
Browse files

Fixed bug in VectorOperationsCuda

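The lambdas add1 and add2 captured the vector x by value, which invoked
the Vector copy-constructor and made a deep copy of the whole vector.
The parameter is now named _x, and the lambdas capture a raw pointer
obtained from _x.getData() instead.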
parent edd5efb5
No related branches found
No related tags found
No related merge requests found
...@@ -80,6 +80,7 @@ vectorScalarMultiplication( Vector& v, ...@@ -80,6 +80,7 @@ vectorScalarMultiplication( Vector& v,
#endif #endif
} }
/*
#ifdef HAVE_CUDA #ifdef HAVE_CUDA
template< typename Real1, typename Real2, typename Index, typename Scalar1, typename Scalar2 > template< typename Real1, typename Real2, typename Index, typename Scalar1, typename Scalar2 >
__global__ void __global__ void
...@@ -105,23 +106,25 @@ vectorAddVectorCudaKernel( Real1* y, ...@@ -105,23 +106,25 @@ vectorAddVectorCudaKernel( Real1* y,
} }
} }
#endif #endif
*/
template< typename Vector1, typename Vector2, typename Scalar1, typename Scalar2 > template< typename Vector1, typename Vector2, typename Scalar1, typename Scalar2 >
void void
VectorOperations< Devices::Cuda >:: VectorOperations< Devices::Cuda >::
addVector( Vector1& _y, addVector( Vector1& _y,
const Vector2& x, const Vector2& _x,
const Scalar1 alpha, const Scalar1 alpha,
const Scalar2 thisMultiplicator ) const Scalar2 thisMultiplicator )
{ {
TNL_ASSERT_GT( x.getSize(), 0, "Vector size must be positive." ); TNL_ASSERT_GT( _x.getSize(), 0, "Vector size must be positive." );
TNL_ASSERT_EQ( x.getSize(), _y.getSize(), "The vector sizes must be the same." ); TNL_ASSERT_EQ( _x.getSize(), _y.getSize(), "The vector sizes must be the same." );
#ifdef HAVE_CUDA #ifdef HAVE_CUDA
using IndexType = typename Vector1::IndexType; using IndexType = typename Vector1::IndexType;
using RealType = typename Vector1::RealType; using RealType = typename Vector1::RealType;
RealType* y = _y.getData(); RealType* y = _y.getData();
const RealType* x = _x.getData();
auto add1 = [=] __cuda_callable__ ( IndexType i ) { y[ i ] += alpha * x[ i ]; }; auto add1 = [=] __cuda_callable__ ( IndexType i ) { y[ i ] += alpha * x[ i ]; };
auto add2 = [=] __cuda_callable__ ( IndexType i ) { y[ i ] = thisMultiplicator * y[ i ] + alpha * x[ i ]; }; auto add2 = [=] __cuda_callable__ ( IndexType i ) { y[ i ] = thisMultiplicator * y[ i ] + alpha * x[ i ]; };
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment