Fixed bug in VectorOperationsCuda

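The lambdas `add1` and `add2` in `addVector` captured the vector `x` by value, which invoked the Vector copy-constructor and made a deep copy of its data. The parameter is now named `_x`, and the lambdas capture the raw pointer obtained from `_x.getData()` instead.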

Fixed bug in VectorOperationsCuda
aa5452fe · Jakub Klinkovský · edd5efb5 · aa5452fe
Commit aa5452fe authored 5 years ago by Jakub Klinkovský
--- a/src/TNL/Containers/Algorithms/VectorOperationsCuda_impl.h
+++ b/src/TNL/Containers/Algorithms/VectorOperationsCuda_impl.h
@@ -80,6 +80,7 @@ vectorScalarMultiplication( Vector& v,
 #endif
 }
+/*
 #ifdef HAVE_CUDA
 template< typename Real1, typename Real2, typename Index, typename Scalar1, typename Scalar2 >
 __global__ void
@@ -105,23 +106,25 @@ vectorAddVectorCudaKernel( Real1* y,
      }
 }
 #endif
+*/
 template< typename Vector1, typename Vector2, typename Scalar1, typename Scalar2 >
 void
 VectorOperations< Devices::Cuda >::
 addVector( Vector1& _y,
-           const Vector2& x,
+           const Vector2& _x,
           const Scalar1 alpha,
           const Scalar2 thisMultiplicator )
 {
-   TNL_ASSERT_GT( x.getSize(), 0, "Vector size must be positive." );
+   TNL_ASSERT_GT( _x.getSize(), 0, "Vector size must be positive." );
-   TNL_ASSERT_EQ( x.getSize(), _y.getSize(), "The vector sizes must be the same." );
+   TNL_ASSERT_EQ( _x.getSize(), _y.getSize(), "The vector sizes must be the same." );
 #ifdef HAVE_CUDA
   using IndexType = typename Vector1::IndexType;
   using RealType = typename Vector1::RealType;
   RealType* y = _y.getData();
+   const RealType* x = _x.getData();
   auto add1 = [=] __cuda_callable__ ( IndexType i ) { y[ i ] += alpha * x[ i ]; };
   auto add2 = [=] __cuda_callable__ ( IndexType i ) { y[ i ] = thisMultiplicator * y[ i ] + alpha * x[ i ]; };