Commit 07d933dc authored by Jakub Klinkovský's avatar Jakub Klinkovský Committed by Jakub Klinkovský
Browse files

Basic implementation of the distributed NDArray

parent 6c8c608e
Loading
Loading
Loading
Loading
+291 −0
Original line number Diff line number Diff line
/***************************************************************************
                          DistributedNDArray.h  -  description
                             -------------------
    begin                : Dec 27, 2018
    copyright            : (C) 2018 by Tomas Oberhuber et al.
    email                : tomas.oberhuber@fjfi.cvut.cz
 ***************************************************************************/

/* See Copyright Notice in tnl/Copyright */

// Implemented by: Jakub Klinkovsky

#pragma once

#include <TNL/Communicators/MpiCommunicator.h>
#include <TNL/Containers/NDArray.h>
#include <TNL/Containers/Subrange.h>
#include <TNL/Containers/DistributedNDArrayView.h>

namespace TNL {
namespace Containers {

template< typename NDArray,
          typename Communicator = Communicators::MpiCommunicator,
          typename Overlaps = __ndarray_impl::make_constant_index_sequence< NDArray::getDimension(), 0 > >
class DistributedNDArray
{
   using CommunicationGroup = typename Communicator::CommunicationGroup;
public:
   using ValueType = typename NDArray::ValueType;
   using DeviceType = typename NDArray::DeviceType;
   using IndexType = typename NDArray::IndexType;
   using SizesHolderType = typename NDArray::SizesHolderType;
   using PermutationType = typename NDArray::PermutationType;
   using CommunicatorType = Communicator;
   using LocalBeginsType = __ndarray_impl::LocalBeginsHolder< typename NDArray::SizesHolderType >;
   using LocalRangeType = Subrange< IndexType >;
   using OverlapsType = Overlaps;

   using ViewType = DistributedNDArrayView< typename NDArray::ViewType, Communicator, Overlaps >;
   using ConstViewType = DistributedNDArrayView< typename NDArray::ConstViewType, Communicator, Overlaps >;

   static_assert( Overlaps::size() == NDArray::getDimension(), "invalid overlaps" );

   // all methods from NDArrayView

   DistributedNDArray() = default;

   // The copy-constructor of TNL::Containers::Array makes shallow copy so our
   // copy-constructor cannot be default. Actually, we most likely don't need
   // it anyway, so let's just delete it.
   DistributedNDArray( const DistributedNDArray& ) = delete;

   // Standard copy-semantics with deep copy, just like regular 1D array.
   // Mismatched sizes cause reallocations.
   DistributedNDArray& operator=( const DistributedNDArray& other ) = default;

   // default move-semantics
   DistributedNDArray( DistributedNDArray&& ) = default;
   DistributedNDArray& operator=( DistributedNDArray&& ) = default;

   static constexpr std::size_t getDimension()
   {
      return NDArray::getDimension();
   }

   __cuda_callable__
   CommunicationGroup getCommunicationGroup() const
   {
      return group;
   }

   // Returns the *global* sizes
   __cuda_callable__
   const SizesHolderType& getSizes() const
   {
      return globalSizes;
   }

   // Returns the *global* size
   template< std::size_t level >
   __cuda_callable__
   IndexType getSize() const
   {
      return globalSizes.template getSize< level >();
   }

   __cuda_callable__
   LocalBeginsType getLocalBegins() const
   {
      return localBegins;
   }

   __cuda_callable__
   SizesHolderType getLocalEnds() const
   {
      return localEnds;
   }

   template< std::size_t level >
   __cuda_callable__
   LocalRangeType getLocalRange() const
   {
      return LocalRangeType( localBegins.template getSize< level >(), localEnds.template getSize< level >() );
   }

   // returns the local storage size
   __cuda_callable__
   IndexType getLocalStorageSize() const
   {
      return localArray.getStorageSize();
   }

   template< typename... IndexTypes >
   __cuda_callable__
   ValueType&
   operator()( IndexTypes&&... indices )
   {
      static_assert( sizeof...( indices ) == getDimension(), "got wrong number of indices" );
      __ndarray_impl::assertIndicesInRange( localBegins, localEnds, Overlaps{}, std::forward< IndexTypes >( indices )... );
      return __ndarray_impl::call_with_unshifted_indices< LocalBeginsType, Overlaps >( localBegins, localArray, std::forward< IndexTypes >( indices )... );
   }

   template< typename... IndexTypes >
   __cuda_callable__
   const ValueType&
   operator()( IndexTypes&&... indices ) const
   {
      static_assert( sizeof...( indices ) == getDimension(), "got wrong number of indices" );
      __ndarray_impl::assertIndicesInRange( localBegins, localEnds, Overlaps{}, std::forward< IndexTypes >( indices )... );
      return __ndarray_impl::call_with_unshifted_indices< LocalBeginsType, Overlaps >( localBegins, localArray, std::forward< IndexTypes >( indices )... );
   }

   // bracket operator for 1D arrays
   __cuda_callable__
   ValueType&
   operator[]( IndexType index )
   {
      static_assert( getDimension() == 1, "the access via operator[] is provided only for 1D arrays" );
      __ndarray_impl::assertIndicesInRange( localBegins, localEnds, Overlaps{}, std::forward< IndexType >( index ) );
      return localArray[ __ndarray_impl::get<0>( Overlaps{} ) + index - localBegins.template getSize< 0 >() ];
   }

   __cuda_callable__
   const ValueType&
   operator[]( IndexType index ) const
   {
      static_assert( getDimension() == 1, "the access via operator[] is provided only for 1D arrays" );
      __ndarray_impl::assertIndicesInRange( localBegins, localEnds, Overlaps{}, std::forward< IndexType >( index ) );
      return localArray[ __ndarray_impl::get<0>( Overlaps{} ) + index - localBegins.template getSize< 0 >() ];
   }

   __cuda_callable__
   ViewType getView()
   {
      return ViewType( localArray.getView(), globalSizes, localBegins, localEnds, group );
   }

   __cuda_callable__
   ConstViewType getConstView() const
   {
      return ConstViewType( localArray.getConstView(), globalSizes, localBegins, localEnds, group );
   }

   // TODO: overlaps should be skipped, otherwise it works only after synchronization
   bool operator==( const DistributedNDArray& other ) const
   {
      // we can't run allreduce if the communication groups are different
      if( group != other.getCommunicationGroup() )
         return false;
      const bool localResult =
            globalSizes == other.globalSizes &&
            localBegins == other.localBegins &&
            localEnds == other.localEnds &&
            localArray == other.localArray;
      bool result = true;
      if( group != CommunicatorType::NullGroup )
         CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LAND, group );
      return result;
   }

   bool operator!=( const DistributedNDArray& other ) const
   {
      return ! (*this == other);
   }


   // extra methods

   // Sets the *global* size, but does not allocate storage
   template< typename... IndexTypes >
   void setSizes( IndexTypes&&... sizes )
   {
      static_assert( sizeof...( sizes ) == getDimension(), "got wrong number of sizes" );
      __ndarray_impl::setSizesHelper( globalSizes, std::forward< IndexTypes >( sizes )... );
      // initialize localBegins and localEnds
      localBegins = LocalBeginsType{};
      localEnds = globalSizes;
   }

   template< std::size_t level >
   void setDistribution( IndexType begin, IndexType end, CommunicationGroup group = Communicator::AllGroup )
   {
      static_assert( SizesHolderType::template getStaticSize< level >() == 0, "NDArray cannot be distributed in static dimensions." );
      TNL_ASSERT_GE( begin, 0, "begin must be non-negative" );
      TNL_ASSERT_LE( end, globalSizes.template getSize< level >(), "end must not be greater than global size" );
      TNL_ASSERT_LT( begin, end, "begin must be lesser than end" );
      localBegins.template setSize< level >( begin );
      localEnds.template setSize< level >( end );
      TNL_ASSERT( this->group == Communicator::NullGroup || this->group == group,
                  std::cerr << "different groups cannot be combined for different dimensions" );
      this->group = group;
   }

   // Computes the distributed storage size and allocates the local array
   void allocate()
   {
      SizesHolderType localSizes;
      TemplateStaticFor< std::size_t, 0, SizesHolderType::getDimension(), LocalSizesSetter >::execHost( localSizes, globalSizes, localBegins, localEnds );
      localArray.setSize( localSizes );
   }

   void setLike( const DistributedNDArray& other )
   {
      localArray.setLike( other.localArray );
      group = other.getCommunicationGroup();
      globalSizes = other.getSizes();
      localBegins = other.localBegins;
      localEnds = other.localEnds;
   }

   void reset()
   {
      localArray.reset();
      group = CommunicatorType::NullGroup;
      globalSizes = SizesHolderType{};
      localBegins = LocalBeginsType{};
      localEnds = SizesHolderType{};
   }

   // "safe" accessor - will do slow copy from device
   template< typename... IndexTypes >
   ValueType
   getElement( IndexTypes&&... indices ) const
   {
      static_assert( sizeof...( indices ) == getDimension(), "got wrong number of indices" );
      __ndarray_impl::assertIndicesInRange( localBegins, localEnds, Overlaps{}, std::forward< IndexTypes >( indices )... );
      auto getElement = [this]( auto&&... indices )
      {
         return this->localArray.getElement( std::forward< decltype(indices) >( indices )... );
      };
      return __ndarray_impl::host_call_with_unshifted_indices< LocalBeginsType, Overlaps >( localBegins, getElement, std::forward< IndexTypes >( indices )... );
   }

   void setValue( ValueType value )
   {
      localArray.setValue( value );
   }

protected:
   NDArray localArray;
   CommunicationGroup group = Communicator::NullGroup;
   SizesHolderType globalSizes;
   // static sizes should have different type: localBegin is always 0, localEnd is always the full size
   LocalBeginsType localBegins;
   SizesHolderType localEnds;

private:
   template< std::size_t level >
   struct LocalSizesSetter
   {
      template< typename SizesHolder, typename LocalBegins >
      static void exec( SizesHolder& localSizes, const SizesHolder& globalSizes, const LocalBegins& localBegins, const SizesHolder& localEnds )
      {
         if( SizesHolder::template getStaticSize< level >() != 0 )
            return;

         const auto begin = localBegins.template getSize< level >();
         const auto end = localEnds.template getSize< level >();
         if( begin == end )
            localSizes.template setSize< level >( globalSizes.template getSize< level >() );
         else {
            TNL_ASSERT_GE( end - begin, (decltype(end)) __ndarray_impl::get<level>( Overlaps{} ), "local size is less than the size of overlaps" );
            localSizes.template setSize< level >( end - begin + 2 * __ndarray_impl::get<level>( Overlaps{} ) );
         }
      }
   };
};

} // namespace Containers
} // namespace TNL
+226 −0
Original line number Diff line number Diff line
/***************************************************************************
                          DistributedNDArrayView.h  -  description
                             -------------------
    begin                : Dec 27, 2018
    copyright            : (C) 2018 by Tomas Oberhuber et al.
    email                : tomas.oberhuber@fjfi.cvut.cz
 ***************************************************************************/

/* See Copyright Notice in tnl/Copyright */

// Implemented by: Jakub Klinkovsky

#pragma once

#include <TNL/Communicators/MpiCommunicator.h>
#include <TNL/Containers/NDArrayView.h>
#include <TNL/Containers/Subrange.h>

namespace TNL {
namespace Containers {

template< typename NDArrayView,
          typename Communicator = Communicators::MpiCommunicator,
          typename Overlaps = __ndarray_impl::make_constant_index_sequence< NDArrayView::getDimension(), 0 > >
class DistributedNDArrayView
{
   using CommunicationGroup = typename Communicator::CommunicationGroup;
public:
   using ValueType = typename NDArrayView::ValueType;
   using DeviceType = typename NDArrayView::DeviceType;
   using IndexType = typename NDArrayView::IndexType;
   using SizesHolderType = typename NDArrayView::SizesHolderType;
   using PermutationType = typename NDArrayView::PermutationType;
   using CommunicatorType = Communicator;
   using LocalBeginsType = __ndarray_impl::LocalBeginsHolder< typename NDArrayView::SizesHolderType >;
   using LocalRangeType = Subrange< IndexType >;
   using OverlapsType = Overlaps;

   using ViewType = DistributedNDArrayView< NDArrayView, Communicator, Overlaps >;
   using ConstViewType = DistributedNDArrayView< typename NDArrayView::ConstViewType, Communicator, Overlaps >;

   static_assert( Overlaps::size() == NDArrayView::getDimension(), "invalid overlaps" );

   __cuda_callable__
   DistributedNDArrayView() = default;

   // explicit initialization by local array view, global sizes and local begins and ends
   __cuda_callable__
   DistributedNDArrayView( NDArrayView localView, SizesHolderType globalSizes, LocalBeginsType localBegins, SizesHolderType localEnds, CommunicationGroup group )
   : localView(localView), group(group), globalSizes(globalSizes), localBegins(localBegins), localEnds(localEnds) {}

   // Copy-constructor does shallow copy, so views can be passed-by-value into
   // CUDA kernels and they can be captured-by-value in __cuda_callable__
   // lambda functions.
   __cuda_callable__
   DistributedNDArrayView( const DistributedNDArrayView& ) = default;

   // default move-constructor
   __cuda_callable__
   DistributedNDArrayView( DistributedNDArrayView&& ) = default;

   // Copy-assignment does deep copy, just like regular array, but the sizes
   // must match (i.e. copy-assignment cannot resize).
   __cuda_callable__
   DistributedNDArrayView& operator=( const DistributedNDArrayView& other ) = default;

   // There is no move-assignment operator, so expressions like `a = b.getView()`
   // are resolved as copy-assignment.

   // method for rebinding (reinitialization)
   __cuda_callable__
   void bind( DistributedNDArrayView view )
   {
      localView.bind( view.localView );
      group = view.group;
      globalSizes = view.globalSizes;
      localBegins = view.localBegins;
      localEnds = view.localEnds;
   }

   __cuda_callable__
   void reset()
   {
      localView.reset();
      group = CommunicatorType::NullGroup;
      globalSizes = SizesHolderType{};
      localBegins = LocalBeginsType{};
      localEnds = SizesHolderType{};
   }

   static constexpr std::size_t getDimension()
   {
      return NDArrayView::getDimension();
   }

   __cuda_callable__
   CommunicationGroup getCommunicationGroup() const
   {
      return group;
   }

   // Returns the *global* sizes
   __cuda_callable__
   const SizesHolderType& getSizes() const
   {
      return globalSizes;
   }

   // Returns the *global* size
   template< std::size_t level >
   __cuda_callable__
   IndexType getSize() const
   {
      return globalSizes.template getSize< level >();
   }

   __cuda_callable__
   LocalBeginsType getLocalBegins() const
   {
      return localBegins;
   }

   __cuda_callable__
   SizesHolderType getLocalEnds() const
   {
      return localEnds;
   }

   template< std::size_t level >
   __cuda_callable__
   LocalRangeType getLocalRange() const
   {
      return LocalRangeType( localBegins.template getSize< level >(), localEnds.template getSize< level >() );
   }

   // returns the local storage size
   __cuda_callable__
   IndexType getLocalStorageSize() const
   {
      return localView.getStorageSize();
   }

   template< typename... IndexTypes >
   __cuda_callable__
   ValueType&
   operator()( IndexTypes&&... indices )
   {
      static_assert( sizeof...( indices ) == getDimension(), "got wrong number of indices" );
      __ndarray_impl::assertIndicesInRange( localBegins, localEnds, Overlaps{}, std::forward< IndexTypes >( indices )... );
      return __ndarray_impl::call_with_unshifted_indices< LocalBeginsType, Overlaps >( localBegins, localView, std::forward< IndexTypes >( indices )... );
   }

   template< typename... IndexTypes >
   __cuda_callable__
   const ValueType&
   operator()( IndexTypes&&... indices ) const
   {
      static_assert( sizeof...( indices ) == getDimension(), "got wrong number of indices" );
      __ndarray_impl::assertIndicesInRange( localBegins, localEnds, Overlaps{}, std::forward< IndexTypes >( indices )... );
      return __ndarray_impl::call_with_unshifted_indices< LocalBeginsType, Overlaps >( localBegins, localView, std::forward< IndexTypes >( indices )... );
   }

   // bracket operator for 1D arrays
   __cuda_callable__
   ValueType&
   operator[]( IndexType index )
   {
      static_assert( getDimension() == 1, "the access via operator[] is provided only for 1D arrays" );
      __ndarray_impl::assertIndicesInRange( localBegins, localEnds, Overlaps{}, std::forward< IndexType >( index ) );
      return localView[ __ndarray_impl::get<0>( Overlaps{} ) + index - localBegins.template getSize< 0 >() ];
   }

   __cuda_callable__
   const ValueType&
   operator[]( IndexType index ) const
   {
      static_assert( getDimension() == 1, "the access via operator[] is provided only for 1D arrays" );
      __ndarray_impl::assertIndicesInRange( localBegins, localEnds, Overlaps{}, std::forward< IndexType >( index ) );
      return localView[ __ndarray_impl::get<0>( Overlaps{} ) + index - localBegins.template getSize< 0 >() ];
   }

   __cuda_callable__
   ViewType getView()
   {
      return ViewType( *this );
   }

   __cuda_callable__
   ConstViewType getConstView() const
   {
      return ConstViewType( localView, globalSizes, localBegins, localEnds, group );
   }

   // TODO: overlaps should be skipped, otherwise it works only after synchronization
   bool operator==( const DistributedNDArrayView& other ) const
   {
      // we can't run allreduce if the communication groups are different
      if( group != other.getCommunicationGroup() )
         return false;
      const bool localResult =
            globalSizes == other.globalSizes &&
            localBegins == other.localBegins &&
            localEnds == other.localEnds &&
            localView == other.localView;
      bool result = true;
      if( group != CommunicatorType::NullGroup )
         CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LAND, group );
      return result;
   }

   bool operator!=( const DistributedNDArrayView& other ) const
   {
      return ! (*this == other);
   }

protected:
   NDArrayView localView;
   CommunicationGroup group = Communicator::NullGroup;
   SizesHolderType globalSizes;
   // static sizes should have different type: localBegin is always 0, localEnd is always the full size
   LocalBeginsType localBegins;
   SizesHolderType localEnds;
};

} // namespace Containers
} // namespace TNL
+81 −0
Original line number Diff line number Diff line
@@ -12,8 +12,11 @@

#pragma once

#include <array>

#include <TNL/Assert.h>
#include <TNL/Devices/CudaCallable.h>
#include <TNL/StaticFor.h>

#include <TNL/Containers/ndarray/Meta.h>

@@ -164,6 +167,84 @@ void assertIndicesInBounds( const SizesHolder& sizes, Index&& i, IndexTypes&&...
}


// A variadic bounds-checker for distributed indices with overlaps
template< typename SizesHolder1, typename SizesHolder2, typename Overlaps >
__cuda_callable__
void assertIndicesInRange( const SizesHolder1&, const SizesHolder2&, const Overlaps& )
{}

template< typename SizesHolder1,
          typename SizesHolder2,
          typename Overlaps,
          typename Index,
          typename... IndexTypes >
__cuda_callable__
void assertIndicesInRange( const SizesHolder1& begins, const SizesHolder2& ends, const Overlaps& overlaps, Index&& i, IndexTypes&&... indices )
{
   static_assert( SizesHolder1::getDimension() == SizesHolder2::getDimension(),
                  "Inconsistent begins and ends." );
#ifndef NDEBUG
   // sizes.template getSize<...>() cannot be inside the assert macro, but the variables
   // shouldn't be declared when compiling without assertions
   constexpr std::size_t level = SizesHolder1::getDimension() - sizeof...(indices) - 1;
   const auto begin = begins.template getSize< level >();
   const auto end = ends.template getSize< level >();
   TNL_ASSERT_LE( begin - get<level>( overlaps ), i, "Input error - some index is below the lower bound." );
   TNL_ASSERT_LT( i, end + get<level>( overlaps ), "Input error - some index is above the upper bound." );
#endif
   assertIndicesInRange( begins, ends, overlaps, std::forward< IndexTypes >( indices )... );
}


template< typename SizesHolder,
          typename Overlaps,
          typename Sequence >
struct IndexUnshiftHelper
{};

template< typename SizesHolder,
          typename Overlaps,
          std::size_t... N >
struct IndexUnshiftHelper< SizesHolder, Overlaps, std::index_sequence< N... > >
{
   template< typename Func,
             typename... Indices >
   __cuda_callable__
   static auto apply( const SizesHolder& begins, Func&& f, Indices&&... indices ) -> decltype(auto)
   {
      return f( ( get<N>( Overlaps{} ) + std::forward< Indices >( indices ) - begins.template getSize< N >() )... );
   }

   template< typename Func,
             typename... Indices >
   static auto apply_host( const SizesHolder& begins, Func&& f, Indices&&... indices ) -> decltype(auto)
   {
      return f( ( get<N>( Overlaps{} ) + std::forward< Indices >( indices ) - begins.template getSize< N >() )... );
   }
};

template< typename SizesHolder,
          typename Overlaps = make_constant_index_sequence< SizesHolder::getDimension(), 0 >,
          typename Func,
          typename... Indices >
__cuda_callable__
auto call_with_unshifted_indices( const SizesHolder& begins, Func&& f, Indices&&... indices ) -> decltype(auto)
{
   return IndexUnshiftHelper< SizesHolder, Overlaps, std::make_index_sequence< sizeof...( Indices ) > >
          ::apply( begins, std::forward< Func >( f ), std::forward< Indices >( indices )... );
}

template< typename SizesHolder,
          typename Overlaps = make_constant_index_sequence< SizesHolder::getDimension(), 0 >,
          typename Func,
          typename... Indices >
auto host_call_with_unshifted_indices( const SizesHolder& begins, Func&& f, Indices&&... indices ) -> decltype(auto)
{
   return IndexUnshiftHelper< SizesHolder, Overlaps, std::make_index_sequence< sizeof...( Indices ) > >
          ::apply_host( begins, std::forward< Func >( f ), std::forward< Indices >( indices )... );
}


template< typename Permutation,
          typename Alignment,
          typename SliceInfo,
+36 −0
Original line number Diff line number Diff line
@@ -355,6 +355,42 @@ filter_sequence( std::integer_sequence< Index, vals... > )
   return concat_sequences( FilterSingle< Mask >( std::integer_sequence< Index, vals >{} )... );
}


/*
 * make_constant_integer_sequence, make_constant_index_sequence - helper
 * templates for the generation of constant sequences like
 * std::make_integer_sequence, std::make_index_sequence
 */
template< typename T, typename N, T v > struct gen_const_seq;
template< typename T, typename N, T v > using gen_const_seq_t = typename gen_const_seq< T, N, v >::type;

template< typename T, typename N, T v >
struct gen_const_seq
{
   using type = decltype(concat_sequences(
                     gen_const_seq_t<T, std::integral_constant<T, N::value/2>, v>{},
                     gen_const_seq_t<T, std::integral_constant<T, N::value - N::value/2>, v>{}
                  ));
};

template< typename T, T v >
struct gen_const_seq< T, std::integral_constant<T, 0>, v >
{
   using type = std::integer_sequence<T>;
};

template< typename T, T v >
struct gen_const_seq< T, std::integral_constant<T, 1>, v >
{
   using type = std::integer_sequence<T, v>;
};

template< typename T, T N, T value >
using make_constant_integer_sequence = gen_const_seq_t< T, std::integral_constant<T, N>, value >;

template< std::size_t N, std::size_t value >
using make_constant_index_sequence = gen_const_seq_t< std::size_t, std::integral_constant<std::size_t, N>, value >;

} // namespace __ndarray_impl
} // namespace Containers
} // namespace TNL
+35 −1

File changed.

Preview size limit exceeded, changes collapsed.

Loading