Commit fd4d8429 authored by Jakub Klinkovský's avatar Jakub Klinkovský Committed by Jakub Klinkovský
Browse files

Added templated assignment operators for NDArray and NDArrayView

It works for any value, device and index types, but the permutations of
both arrays must be the same and both arrays have to be contiguous.
parent 560aba80
Loading
Loading
Loading
Loading
+37 −0
Original line number Diff line number Diff line
@@ -22,6 +22,42 @@ template< typename DestinationDevice,
          typename SourceDevice = DestinationDevice >
struct ArrayOperations;

// TODO: establish the concept of a "void device" for static computations in the whole TNL
/**
 * Specialization of ArrayOperations for the "void" device: plain sequential,
 * element-wise operations on raw pointers, executed directly by the caller.
 *
 * Every method is marked __cuda_callable__ (presumably so that it can be used
 * from both host and CUDA device code -- confirm against the macro definition).
 * The methods are only declared here; the definitions are provided out of class.
 */
template<>
struct ArrayOperations< void >
{
   // Writes `value` into the single element that `data` points to.
   template< typename Element >
   __cuda_callable__
   static void setElement( Element* data,
                           const Element& value );

   // Returns a copy of the single element that `data` points to.
   template< typename Element >
   __cuda_callable__
   static Element getElement( const Element* data );

   // Assigns `value` to each of the first `size` elements of `data`.
   template< typename Element, typename Index >
   __cuda_callable__
   static void set( Element* data,
                    const Element& value,
                    const Index size );

   // Copies the first `size` elements of `source` into `destination`, one by one,
   // using the assignment from SourceElement to DestinationElement.
   template< typename DestinationElement,
             typename SourceElement,
             typename Index >
   __cuda_callable__
   static void copy( DestinationElement* destination,
                     const SourceElement* source,
                     const Index size );

   // Compares the first `size` elements of both arrays with operator==.
   // Returns false at the first mismatch, true otherwise (also when no
   // element is compared).
   template< typename Element1,
             typename Element2,
             typename Index >
   __cuda_callable__
   static bool compare( const Element1* destination,
                        const Element2* source,
                        const Index size );
};

template<>
struct ArrayOperations< Devices::Host >
{
@@ -251,6 +287,7 @@ struct ArrayOperations< Devices::Host, Devices::MIC >
} // namespace Containers
} // namespace TNL

#include <TNL/Containers/Algorithms/ArrayOperationsStatic.hpp>
#include <TNL/Containers/Algorithms/ArrayOperationsHost.hpp>
#include <TNL/Containers/Algorithms/ArrayOperationsCuda.hpp>
#include <TNL/Containers/Algorithms/ArrayOperationsMIC.hpp>
+82 −0
Original line number Diff line number Diff line
/***************************************************************************
                          ArrayOperationsStatic.hpp  -  description
                             -------------------
    begin                : Apr 8, 2019
    copyright            : (C) 2019 by Tomas Oberhuber et al.
    email                : tomas.oberhuber@fjfi.cvut.cz
 ***************************************************************************/

/* See Copyright Notice in tnl/Copyright */

#pragma once

#include <TNL/Containers/Algorithms/ArrayOperations.h>

namespace TNL {
namespace Containers {
namespace Algorithms {

// Stores `value` into the one element addressed by `data`.
template< typename Element >
__cuda_callable__
void
ArrayOperations< void >::setElement( Element* data, const Element& value )
{
   data[ 0 ] = value;
}

// Reads the one element addressed by `data` and returns it by value.
template< typename Element >
__cuda_callable__
Element
ArrayOperations< void >::getElement( const Element* data )
{
   return data[ 0 ];
}

// Fills the first `size` elements of `data` with `value`, front to back.
// Nothing is written when `size` is not greater than zero.
template< typename Element, typename Index >
__cuda_callable__
void
ArrayOperations< void >::set( Element* data, const Element& value, const Index size )
{
   Index position = 0;
   while( position < size ) {
      data[ position ] = value;
      position ++;
   }
}

// Element-wise copy of the first `size` entries of `source` into `destination`.
// Each element is transferred by plain assignment, so any conversion between
// SourceElement and DestinationElement is whatever that assignment performs.
template< typename DestinationElement, typename SourceElement, typename Index >
__cuda_callable__
void
ArrayOperations< void >::copy( DestinationElement* destination,
                               const SourceElement* source,
                               const Index size )
{
   Index position = 0;
   while( position < size ) {
      destination[ position ] = source[ position ];
      position ++;
   }
}

// Checks whether the first `size` entries of both arrays are equal according
// to operator==. Stops at the first mismatch; yields true when every compared
// pair matches (including the case when nothing is compared).
template< typename Element1, typename Element2, typename Index >
__cuda_callable__
bool
ArrayOperations< void >::compare( const Element1* destination,
                                  const Element2* source,
                                  const Index size )
{
   Index position = 0;
   while( position < size ) {
      if( ! ( destination[ position ] == source[ position ] ) ) {
         return false;
      }
      position++;
   }
   return true;
}

} // namespace Algorithms
} // namespace Containers
} // namespace TNL
+73 −2
Original line number Diff line number Diff line
@@ -53,6 +53,12 @@ public:

   static_assert( Permutation::size() == SizesHolder::getDimension(), "invalid permutation" );

   // Always reports the storage as contiguous.
   // Provided for compatibility with NDArrayView (which inherits from StrideBase),
   // so that both types expose the same compile-time query.
   static constexpr bool isContiguous()
   {
      return true;
   }

   // all methods from NDArrayView

   NDArrayStorage() = default;
@@ -70,6 +76,21 @@ public:
   NDArrayStorage( NDArrayStorage&& ) = default;
   NDArrayStorage& operator=( NDArrayStorage&& ) = default;

   // Templated copy-assignment from another array-like object.
   // The source must use the same permutation of indices (checked at compile
   // time). The sizes of `other` are taken over first, then the storage is
   // resized to match and finally the data is copied through the views.
   template< typename OtherArray >
   NDArrayStorage& operator=( const OtherArray& other )
   {
      using OtherPermutation = typename OtherArray::PermutationType;
      using OtherSizesHolder = typename OtherArray::SizesHolderType;
      using SizesCopier = __ndarray_impl::SetSizesCopyHelper< SizesHolderType, OtherSizesHolder >;

      static_assert( std::is_same< PermutationType, OtherPermutation >::value,
                     "Arrays must have the same permutation of indices." );

      // 1. adopt the sizes of the source
      SizesCopier::copy( sizes, other.getSizes() );
      // 2. (re)allocate the storage for the new sizes if necessary
      array.setSize( getStorageSize() );
      // 3. transfer the data via the view assignment
      getView() = other.getConstView();

      return *this;
   }

   bool operator==( const NDArrayStorage& other ) const
   {
      // FIXME: uninitialized data due to alignment in NDArray and padding in SlicedNDArray
@@ -82,6 +103,14 @@ public:
      return sizes != other.sizes || array != other.array;
   }

   // Accessor to the underlying data: returns the pointer obtained from the
   // wrapped array, typed as pointer-to-const.
   // (It should not be used for accessing the elements; it is intended only for
   // the implementation of operator= and functions like cudaHostRegister.)
   std::add_const_t< ValueType >* getData() const
   {
      return array.getData();
   }

   static constexpr std::size_t getDimension()
   {
      return SizesHolder::getDimension();
@@ -330,7 +359,18 @@ class NDArray
                                                    PermutationHost,
                                                    PermutationCuda >::type,
                         __ndarray_impl::NDArrayBase< SliceInfo< 0, 0 > > >
{};
{
   using Base = NDArrayStorage< Array< Value, Device, Index >,
                         SizesHolder,
                         typename std::conditional< std::is_same< Device, Devices::Host >::value,
                                                    PermutationHost,
                                                    PermutationCuda >::type,
                         __ndarray_impl::NDArrayBase< SliceInfo< 0, 0 > > >;

public:
   // inherit all assignment operators
   using Base::operator=;
};

template< typename Value,
          typename SizesHolder,
@@ -343,8 +383,17 @@ class StaticNDArray
                         __ndarray_impl::NDArrayBase< SliceInfo< 0, 0 > >,
                         void >
{
   using Base = NDArrayStorage< StaticArray< __ndarray_impl::StaticStorageSizeGetter< SizesHolder >::get(), Value >,
                         SizesHolder,
                         Permutation,
                         __ndarray_impl::NDArrayBase< SliceInfo< 0, 0 > >,
                         void >;
   static_assert( __ndarray_impl::StaticStorageSizeGetter< SizesHolder >::get() > 0,
                  "All dimensions of a static array must to be positive." );

public:
   // inherit all assignment operators
   using Base::operator=;
};

template< typename Value,
@@ -356,7 +405,14 @@ class StaticMatrix
                        SizesHolder< std::size_t, Rows, Columns >,
                        Permutation >
{
   using Base = StaticNDArray< Value,
                        SizesHolder< std::size_t, Rows, Columns >,
                        Permutation >;

public:
   // inherit all assignment operators
   using Base::operator=;

   static constexpr std::size_t getRows()
   {
      return Rows;
@@ -388,7 +444,22 @@ class SlicedNDArray
                                                       SliceInfoHost,
                                                       SliceInfoCuda >::type >
                        >
{};
{
   using Base = NDArrayStorage< Array< Value, Device, Index >,
                         SizesHolder,
                         typename std::conditional< std::is_same< Device, Devices::Host >::value,
                                                    PermutationHost,
                                                    PermutationCuda >::type,
                         __ndarray_impl::SlicedNDArrayBase<
                            typename std::conditional< std::is_same< Device, Devices::Host >::value,
                                                       SliceInfoHost,
                                                       SliceInfoCuda >::type >
                        >;

public:
   // inherit all assignment operators
   using Base::operator=;
};

} // namespace Containers
} // namespace TNL
+29 −59
Original line number Diff line number Diff line
@@ -18,6 +18,7 @@
#include <TNL/Containers/ndarray/Executors.h>
#include <TNL/Containers/ndarray/BoundaryExecutors.h>
#include <TNL/Containers/ndarray/Operations.h>
#include <TNL/Containers/Algorithms/ArrayOperations.h>

namespace TNL {
namespace Containers {
@@ -71,7 +72,24 @@ public:
   {
      TNL_ASSERT_EQ( sizes, other.sizes, "The sizes of the array views must be equal, views are not resizable." );
      if( getStorageSize() > 0 )
         ArrayOpsHelper< Device >::copy( array, other.array, getStorageSize() );
         Algorithms::ArrayOperations< DeviceType >::copy( array, other.array, getStorageSize() );
      return *this;
   }

   // Templated copy-assignment from a view of a possibly different type.
   // Requirements: identical permutation of indices and contiguous layout of
   // both views (compile-time checks), and matching sizes (debug assertion),
   // because a view cannot be resized. The data is copied only when the
   // storage size is positive.
   template< typename OtherView >
   NDArrayView& operator=( const OtherView& other )
   {
      using OtherPermutation = typename OtherView::PermutationType;
      using OtherDevice = typename OtherView::DeviceType;
      using CopyOperations = Algorithms::ArrayOperations< DeviceType, OtherDevice >;

      static_assert( std::is_same< PermutationType, OtherPermutation >::value,
                     "Arrays must have the same permutation of indices." );
      static_assert( NDArrayView::isContiguous() && OtherView::isContiguous(),
                     "Non-contiguous array views cannot be assigned." );
      TNL_ASSERT_TRUE( __ndarray_impl::sizesWeakCompare( getSizes(), other.getSizes() ),
                       "The sizes of the array views must be equal, views are not resizable." );

      const auto storageSize = getStorageSize();
      if( storageSize > 0 ) {
         TNL_ASSERT_TRUE( array, "Attempted to assign to an empty view." );
         CopyOperations::copy( array, other.getData(), storageSize );
      }

      return *this;
   }

@@ -101,7 +119,7 @@ public:
      if( sizes != other.sizes )
         return false;
      // FIXME: uninitialized data due to alignment in NDArray and padding in SlicedNDArray
      return ArrayOpsHelper< Device, Device >::compare( array, other.array, getStorageSize() );
      return Algorithms::ArrayOperations< Device, Device >::compare( array, other.array, getStorageSize() );
   }

   __cuda_callable__
@@ -110,7 +128,7 @@ public:
      if( sizes != other.sizes )
         return true;
      // FIXME: uninitialized data due to alignment in NDArray and padding in SlicedNDArray
      return ! ArrayOpsHelper< Device, Device >::compare( array, other.array, getStorageSize() );
      return ! Algorithms::ArrayOperations< Device, Device >::compare( array, other.array, getStorageSize() );
   }

   static constexpr std::size_t getDimension()
@@ -118,6 +136,14 @@ public:
      return SizesHolder::getDimension();
   }

   // Accessor to the underlying data: returns the raw pointer held by the view,
   // typed as pointer-to-const.
   // (It should not be used for accessing the elements; it is intended only for
   // the implementation of operator= and functions like cudaHostRegister.)
   std::add_const_t< ValueType >* getData() const
   {
      return array;
   }

   const SizesHolderType& getSizes() const
   {
      return sizes;
@@ -285,62 +311,6 @@ public:
protected:
   Value* array = nullptr;
   SizesHolder sizes;

   // TODO: establish the concept of a "void device" for static computations in the whole TNL

   // Generic helper which forwards copy and compare to
   // Algorithms::ArrayOperations for the given pair of devices.
   // The trailing `_unused` parameter has no role in this primary template.
   template< typename DestinationDevice, typename SourceDevice = DestinationDevice, typename _unused = void >
   struct ArrayOpsHelper
   {
      // Forwards to ArrayOperations< DestinationDevice, SourceDevice >::copy.
      template< typename DestinationValue,
                typename SourceValue,
                typename Index >
      static void copy( DestinationValue* destination,
                        const SourceValue* source,
                        const Index size )
      {
         Algorithms::ArrayOperations< DestinationDevice, SourceDevice >::copy( destination, source, size );
      }

      // Forwards to ArrayOperations< DestinationDevice, SourceDevice >::compare
      // and returns its result.
      template< typename Value1,
                typename Value2,
                typename Index >
      static bool compare( const Value1* destination,
                           const Value2* source,
                           const Index size )
      {
         return Algorithms::ArrayOperations< DestinationDevice, SourceDevice >::compare( destination, source, size );
      }
   };

   // Partial specialization for the "void" device on both sides: the
   // operations are carried out by simple sequential loops in place.
   template< typename _unused >
   struct ArrayOpsHelper< void, void, _unused >
   {
      // Assigns the first `size` elements of `source` to `destination`, one by one.
      template< typename DestinationValue, typename SourceValue, typename Index >
      __cuda_callable__
      static void copy( DestinationValue* destination,
                        const SourceValue* source,
                        const Index size )
      {
         Index position = 0;
         while( position < size ) {
            destination[ position ] = source[ position ];
            position ++;
         }
      }

      // Compares the first `size` elements with operator==; returns false at
      // the first mismatch and true when none is found.
      template< typename Value1, typename Value2, typename Index >
      __cuda_callable__
      static bool compare( const Value1* destination,
                           const Value2* source,
                           const Index size )
      {
         Index position = 0;
         while( position < size ) {
            if( ! ( destination[ position ] == source[ position ] ) ) {
               return false;
            }
            position++;
         }
         return true;
      }
   };
};

} // namespace Containers
+68 −22
Original line number Diff line number Diff line
@@ -204,12 +204,13 @@ struct ParallelBoundaryExecutor< Permutation, Device, IndexTag< 3 > >
      static_assert( Begins::getDimension() == Ends::getDimension(),
                     "wrong begins or ends" );

      using Index = typename Ends::IndexType;

      auto kernel = [=] __cuda_callable__ ( Index i2, Index i1, Index i0 )
      {
         call_with_unpermuted_arguments< Permutation >( f, i0, i1, i2 );
      };
      // nvcc does not like nested __cuda_callable__ and normal lambdas...
//      using Index = typename Ends::IndexType;
//      auto kernel = [=] __cuda_callable__ ( Index i2, Index i1, Index i0 )
//      {
//         call_with_unpermuted_arguments< Permutation >( f, i0, i1, i2 );
//      };
      Kernel< Device > kernel;

      const auto begin0 = begins.template getSize< get< 0 >( Permutation{} ) >();
      const auto begin1 = begins.template getSize< get< 1 >( Permutation{} ) >();
@@ -224,13 +225,35 @@ struct ParallelBoundaryExecutor< Permutation, Device, IndexTag< 3 > >
      const auto end1 = ends.template getSize< get< 1 >( Permutation{} ) >();
      const auto end2 = ends.template getSize< get< 2 >( Permutation{} ) >();

      ParallelFor3D< Device >::exec( begin2,     begin1,     begin0,   skipBegin2, end1,       end0,       kernel );
      ParallelFor3D< Device >::exec( skipEnd2,   begin1,     begin0,   end2,       end1,       end0,       kernel );
      ParallelFor3D< Device >::exec( skipBegin2, begin1,     begin0,   skipEnd2,   skipBegin1, end0,       kernel );
      ParallelFor3D< Device >::exec( skipBegin2, skipEnd1,   begin0,   skipEnd2,   end1,       end0,       kernel );
      ParallelFor3D< Device >::exec( skipBegin2, skipBegin1, begin0,   skipEnd2,   skipEnd1,   skipBegin0, kernel );
      ParallelFor3D< Device >::exec( skipBegin2, skipBegin1, skipEnd0, skipEnd2,   skipEnd1,   end0,       kernel );
      ParallelFor3D< Device >::exec( begin2,     begin1,     begin0,   skipBegin2, end1,       end0,       kernel, f );
      ParallelFor3D< Device >::exec( skipEnd2,   begin1,     begin0,   end2,       end1,       end0,       kernel, f );
      ParallelFor3D< Device >::exec( skipBegin2, begin1,     begin0,   skipEnd2,   skipBegin1, end0,       kernel, f );
      ParallelFor3D< Device >::exec( skipBegin2, skipEnd1,   begin0,   skipEnd2,   end1,       end0,       kernel, f );
      ParallelFor3D< Device >::exec( skipBegin2, skipBegin1, begin0,   skipEnd2,   skipEnd1,   skipBegin0, kernel, f );
      ParallelFor3D< Device >::exec( skipBegin2, skipBegin1, skipEnd0, skipEnd2,   skipEnd1,   end0,       kernel, f );
   }

   // Generic functor used in place of a lambda: it receives the indices in the
   // permuted order (i2, i1, i0) together with the user functor `f` and calls
   // `f` through call_with_unpermuted_arguments with the indices passed as
   // (i0, i1, i2). The operator is not marked __cuda_callable__ here.
   // It is const-qualified, like the call operator of the lambda it replaces,
   // so it can also be invoked on a const instance.
   template< typename __Device, typename = void >
   struct Kernel
   {
      template< typename Index, typename Func >
      void operator()( Index i2, Index i1, Index i0, Func f ) const
      {
         call_with_unpermuted_arguments< Permutation >( f, i0, i1, i2 );
      }
   };

   // Specialization for Devices::Cuda which differs from the generic functor
   // only by marking the call operator as __cuda_callable__. Keeping the
   // annotation out of the generic version avoids a large number of nvcc
   // warnings. The `__unused` parameter exists only to make this a partial
   // specialization.
   template< typename __unused >
   struct Kernel< Devices::Cuda, __unused >
   {
      template< typename Index, typename Func >
      __cuda_callable__
      void operator()( Index i2, Index i1, Index i0, Func f ) const
      {
         call_with_unpermuted_arguments< Permutation >( f, i0, i1, i2 );
      }
   };
};

template< typename Permutation,
@@ -251,12 +274,13 @@ struct ParallelBoundaryExecutor< Permutation, Device, IndexTag< 2 > >
      static_assert( Begins::getDimension() == Ends::getDimension(),
                     "wrong begins or ends" );

      using Index = typename Ends::IndexType;

      auto kernel = [=] __cuda_callable__ ( Index i1, Index i0 )
      {
         call_with_unpermuted_arguments< Permutation >( f, i0, i1 );
      };
      // nvcc does not like nested __cuda_callable__ and normal lambdas...
//      using Index = typename Ends::IndexType;
//      auto kernel = [=] __cuda_callable__ ( Index i1, Index i0 )
//      {
//         call_with_unpermuted_arguments< Permutation >( f, i0, i1 );
//      };
      Kernel< Device > kernel;

      const auto begin0 = begins.template getSize< get< 0 >( Permutation{} ) >();
      const auto begin1 = begins.template getSize< get< 1 >( Permutation{} ) >();
@@ -267,11 +291,33 @@ struct ParallelBoundaryExecutor< Permutation, Device, IndexTag< 2 > >
      const auto end0 = ends.template getSize< get< 0 >( Permutation{} ) >();
      const auto end1 = ends.template getSize< get< 1 >( Permutation{} ) >();

      ParallelFor2D< Device >::exec( begin1,     begin0,   skipBegin1, end0,       kernel );
      ParallelFor2D< Device >::exec( skipEnd1,   begin0,   end1,       end0,       kernel );
      ParallelFor2D< Device >::exec( skipBegin1, begin0,   skipEnd1,   skipBegin0, kernel );
      ParallelFor2D< Device >::exec( skipBegin1, skipEnd0, skipEnd1,   end0,       kernel );
      ParallelFor2D< Device >::exec( begin1,     begin0,   skipBegin1, end0,       kernel, f );
      ParallelFor2D< Device >::exec( skipEnd1,   begin0,   end1,       end0,       kernel, f );
      ParallelFor2D< Device >::exec( skipBegin1, begin0,   skipEnd1,   skipBegin0, kernel, f );
      ParallelFor2D< Device >::exec( skipBegin1, skipEnd0, skipEnd1,   end0,       kernel, f );
   }

   // Generic functor used in place of a lambda: it receives the indices in the
   // permuted order (i1, i0) together with the user functor `f` and calls `f`
   // through call_with_unpermuted_arguments with the indices passed as (i0, i1).
   // The operator is not marked __cuda_callable__ here.
   // It is const-qualified, like the call operator of the lambda it replaces,
   // so it can also be invoked on a const instance.
   template< typename __Device, typename = void >
   struct Kernel
   {
      template< typename Index, typename Func >
      void operator()( Index i1, Index i0, Func f ) const
      {
         call_with_unpermuted_arguments< Permutation >( f, i0, i1 );
      }
   };

   // Specialization for Devices::Cuda which differs from the generic functor
   // only by marking the call operator as __cuda_callable__. Keeping the
   // annotation out of the generic version avoids a large number of nvcc
   // warnings. The `__unused` parameter exists only to make this a partial
   // specialization.
   template< typename __unused >
   struct Kernel< Devices::Cuda, __unused >
   {
      template< typename Index, typename Func >
      __cuda_callable__
      void operator()( Index i1, Index i0, Func f ) const
      {
         call_with_unpermuted_arguments< Permutation >( f, i0, i1 );
      }
   };
};

template< typename Permutation,
Loading