Commit aa5452fe authored by Jakub Klinkovský
Browse files

Fixed bug in VectorOperationsCuda

The lambdas add1 and add2 captured the vector x by value, which invoked the
Vector copy-constructor and made a deep copy. They now capture a raw pointer
to the data of x instead.
parent edd5efb5
Loading
Loading
Loading
Loading
+6 −3
Original line number Diff line number Diff line
@@ -80,6 +80,7 @@ vectorScalarMultiplication( Vector& v,
#endif
}

/*
#ifdef HAVE_CUDA
template< typename Real1, typename Real2, typename Index, typename Scalar1, typename Scalar2 >
__global__ void
@@ -105,23 +106,25 @@ vectorAddVectorCudaKernel( Real1* y,
      }
}
#endif
*/

template< typename Vector1, typename Vector2, typename Scalar1, typename Scalar2 >
void
VectorOperations< Devices::Cuda >::
addVector( Vector1& _y,
           const Vector2& x,
           const Vector2& _x,
           const Scalar1 alpha,
           const Scalar2 thisMultiplicator )
{
   TNL_ASSERT_GT( x.getSize(), 0, "Vector size must be positive." );
   TNL_ASSERT_EQ( x.getSize(), _y.getSize(), "The vector sizes must be the same." );
   TNL_ASSERT_GT( _x.getSize(), 0, "Vector size must be positive." );
   TNL_ASSERT_EQ( _x.getSize(), _y.getSize(), "The vector sizes must be the same." );

#ifdef HAVE_CUDA
   using IndexType = typename Vector1::IndexType;
   using RealType = typename Vector1::RealType;

   RealType* y = _y.getData();
   const RealType* x = _x.getData();
   auto add1 = [=] __cuda_callable__ ( IndexType i ) { y[ i ] += alpha * x[ i ]; };
   auto add2 = [=] __cuda_callable__ ( IndexType i ) { y[ i ] = thisMultiplicator * y[ i ] + alpha * x[ i ]; };