Added templated assignment operators for NDArray and NDArrayView (fd4d8429) · Commits · TNL / tnl-dev

src/TNL/Containers/Algorithms/ArrayOperations.h

+37 −0

Original line number	Diff line number	Diff line
		@@ -22,6 +22,42 @@ template< typename DestinationDevice,
		typename SourceDevice = DestinationDevice >
		struct ArrayOperations;

		// TODO: establish the concept of a "void device" for static computations in the whole TNL
		template<>
		struct ArrayOperations< void >
		{
		   // Writes `value` into the single element that `data` points to.
		   template< typename Element >
		   __cuda_callable__
		   static void setElement( Element* data, const Element& value );

		   // Returns a copy of the single element that `data` points to.
		   template< typename Element >
		   __cuda_callable__
		   static Element getElement( const Element* data );

		   // Assigns `value` to each of the first `size` elements of `data`.
		   template< typename Element, typename Index >
		   __cuda_callable__
		   static void set( Element* data, const Element& value, const Index size );

		   // Assigns the first `size` elements of `source` to `destination`,
		   // element by element (the element types may differ).
		   template< typename DestinationElement, typename SourceElement, typename Index >
		   __cuda_callable__
		   static void copy( DestinationElement* destination, const SourceElement* source, const Index size );

		   // Returns true when the first `size` elements of both arrays compare
		   // equal with operator==, false otherwise.
		   template< typename Element1, typename Element2, typename Index >
		   __cuda_callable__
		   static bool compare( const Element1* destination, const Element2* source, const Index size );
		};

		template<>
		struct ArrayOperations< Devices::Host >
		{
		@@ -251,6 +287,7 @@ struct ArrayOperations< Devices::Host, Devices::MIC >
		} // namespace Containers
		} // namespace TNL

		#include <TNL/Containers/Algorithms/ArrayOperationsStatic.hpp>
		#include <TNL/Containers/Algorithms/ArrayOperationsHost.hpp>
		#include <TNL/Containers/Algorithms/ArrayOperationsCuda.hpp>
		#include <TNL/Containers/Algorithms/ArrayOperationsMIC.hpp>

src/TNL/Containers/Algorithms/ArrayOperationsStatic.hpp

0 → 100644

+82 −0

Original line number	Diff line number	Diff line
		/***************************************************************************
		ArrayOperationsStatic.hpp - description
		-------------------
		begin : Apr 8, 2019
		copyright : (C) 2019 by Tomas Oberhuber et al.
		email : tomas.oberhuber@fjfi.cvut.cz
		***************************************************************************/

		/* See Copyright Notice in tnl/Copyright */

		#pragma once

		#include <TNL/Containers/Algorithms/ArrayOperations.h>

		namespace TNL {
		namespace Containers {
		namespace Algorithms {

		// Static ("void device") setElement: stores `value` into the element
		// that `data` points to.
		template< typename Element >
		__cuda_callable__
		void
		ArrayOperations< void >::
		setElement( Element* data,
		            const Element& value )
		{
		   data[ 0 ] = value;
		}

		// Static ("void device") getElement: returns a copy of the element
		// that `data` points to.
		template< typename Element >
		__cuda_callable__
		Element
		ArrayOperations< void >::
		getElement( const Element* data )
		{
		   return data[ 0 ];
		}

		// Fills the first `size` elements of `data` with `value` using a plain
		// sequential loop. Nothing is written when `size` is not positive.
		template< typename Element, typename Index >
		__cuda_callable__
		void
		ArrayOperations< void >::
		set( Element* data,
		     const Element& value,
		     const Index size )
		{
		   Index position = 0;
		   while( position < size ) {
		      data[ position ] = value;
		      position++;
		   }
		}

		// Copies the first `size` elements from `source` to `destination` in
		// ascending order, assigning element by element (so the element types
		// may differ as long as the assignment compiles).
		template< typename DestinationElement,
		          typename SourceElement,
		          typename Index >
		__cuda_callable__
		void
		ArrayOperations< void >::
		copy( DestinationElement* destination,
		      const SourceElement* source,
		      const Index size )
		{
		   // the pointer parameters are local copies, so they can serve as cursors
		   for( Index copied = 0; copied < size; copied++ )
		      *destination++ = *source++;
		}

		// Compares the first `size` elements of both arrays with operator==.
		// Returns false at the first pair that does not compare equal and true
		// when no such pair exists (including when `size` is not positive).
		template< typename Element1,
		          typename Element2,
		          typename Index >
		__cuda_callable__
		bool
		ArrayOperations< void >::
		compare( const Element1* destination,
		         const Element2* source,
		         const Index size )
		{
		   for( Index position = 0; position < size; position++ ) {
		      if( destination[ position ] == source[ position ] )
		         continue;
		      // first mismatch found, no need to look any further
		      return false;
		   }
		   return true;
		}

		} // namespace Algorithms
		} // namespace Containers
		} // namespace TNL

src/TNL/Containers/NDArray.h

+73 −2

Original line number	Diff line number	Diff line
		@@ -53,6 +53,12 @@ public:

		static_assert( Permutation::size() == SizesHolder::getDimension(), "invalid permutation" );

		// for compatibility with NDArrayView (which inherits from StrideBase)
		// Unconditionally returns true, and being constexpr it can be used in
		// compile-time checks such as a static_assert.
		static constexpr bool isContiguous()
		{
		return true;
		}

		// all methods from NDArrayView

		NDArrayStorage() = default;
		@@ -70,6 +76,21 @@ public:
		NDArrayStorage( NDArrayStorage&& ) = default;
		NDArrayStorage& operator=( NDArrayStorage&& ) = default;

		// Templated copy-assignment from another array-like object.
		// Requires (at compile time) the same permutation of indices. The sizes
		// of `other` are taken over, the storage is resized to match and finally
		// the const view of `other` is assigned to the view of this array.
		template< typename OtherArray >
		NDArrayStorage& operator=( const OtherArray& other )
		{
		   using OtherPermutation = typename OtherArray::PermutationType;
		   using OtherSizesHolder = typename OtherArray::SizesHolderType;
		   using SizesCopier = __ndarray_impl::SetSizesCopyHelper< SizesHolderType, OtherSizesHolder >;

		   static_assert( std::is_same< PermutationType, OtherPermutation >::value,
		                  "Arrays must have the same permutation of indices." );

		   // step 1: adopt the sizes of the source
		   SizesCopier::copy( sizes, other.getSizes() );
		   // step 2: make the storage match the new sizes (may reallocate)
		   array.setSize( getStorageSize() );
		   // step 3: transfer the data by assigning view to view
		   getView() = other.getConstView();
		   return *this;
		}

		bool operator==( const NDArrayStorage& other ) const
		{
		// FIXME: uninitialized data due to alignment in NDArray and padding in SlicedNDArray
		@@ -82,6 +103,14 @@ public:
		return sizes != other.sizes \|\| array != other.array;
		}

		// accessor to the underlying data
		// (should not be used for accessing the elements, intended only for the implementation
		// of operator= and functions like cudaHostRegister)
		// Returns whatever the underlying storage's getData() yields, exposed as
		// a pointer to const elements.
		std::add_const_t< ValueType >* getData() const
		{
		return array.getData();
		}

		static constexpr std::size_t getDimension()
		{
		return SizesHolder::getDimension();
		@@ -330,7 +359,18 @@ class NDArray
		PermutationHost,
		PermutationCuda >::type,
		__ndarray_impl::NDArrayBase< SliceInfo< 0, 0 > > >
		{};
		{
		using Base = NDArrayStorage< Array< Value, Device, Index >,
		SizesHolder,
		typename std::conditional< std::is_same< Device, Devices::Host >::value,
		PermutationHost,
		PermutationCuda >::type,
		__ndarray_impl::NDArrayBase< SliceInfo< 0, 0 > > >;

		public:
		// inherit all assignment operators
		using Base::operator=;
		};

		template< typename Value,
		typename SizesHolder,
		@@ -343,8 +383,17 @@ class StaticNDArray
		__ndarray_impl::NDArrayBase< SliceInfo< 0, 0 > >,
		void >
		{
		using Base = NDArrayStorage< StaticArray< __ndarray_impl::StaticStorageSizeGetter< SizesHolder >::get(), Value >,
		SizesHolder,
		Permutation,
		__ndarray_impl::NDArrayBase< SliceInfo< 0, 0 > >,
		void >;
		static_assert( __ndarray_impl::StaticStorageSizeGetter< SizesHolder >::get() > 0,
		"All dimensions of a static array must to be positive." );

		public:
		// inherit all assignment operators
		using Base::operator=;
		};

		template< typename Value,
		@@ -356,7 +405,14 @@ class StaticMatrix
		SizesHolder< std::size_t, Rows, Columns >,
		Permutation >
		{
		using Base = StaticNDArray< Value,
		SizesHolder< std::size_t, Rows, Columns >,
		Permutation >;

		public:
		// inherit all assignment operators
		using Base::operator=;

		static constexpr std::size_t getRows()
		{
		return Rows;
		@@ -388,7 +444,22 @@ class SlicedNDArray
		SliceInfoHost,
		SliceInfoCuda >::type >
		>
		{};
		{
		using Base = NDArrayStorage< Array< Value, Device, Index >,
		SizesHolder,
		typename std::conditional< std::is_same< Device, Devices::Host >::value,
		PermutationHost,
		PermutationCuda >::type,
		__ndarray_impl::SlicedNDArrayBase<
		typename std::conditional< std::is_same< Device, Devices::Host >::value,
		SliceInfoHost,
		SliceInfoCuda >::type >
		>;

		public:
		// inherit all assignment operators
		using Base::operator=;
		};

		} // namespace Containers
		} // namespace TNL

src/TNL/Containers/NDArrayView.h

+29 −59

Original line number	Diff line number	Diff line
		@@ -18,6 +18,7 @@
		#include <TNL/Containers/ndarray/Executors.h>
		#include <TNL/Containers/ndarray/BoundaryExecutors.h>
		#include <TNL/Containers/ndarray/Operations.h>
		#include <TNL/Containers/Algorithms/ArrayOperations.h>

		namespace TNL {
		namespace Containers {
		@@ -71,7 +72,24 @@ public:
		{
		TNL_ASSERT_EQ( sizes, other.sizes, "The sizes of the array views must be equal, views are not resizable." );
		if( getStorageSize() > 0 )
		ArrayOpsHelper< Device >::copy( array, other.array, getStorageSize() );
		Algorithms::ArrayOperations< DeviceType >::copy( array, other.array, getStorageSize() );
		return *this;
		}

		// Templated copy-assignment from another view or array-like object.
		// Compile-time requirements: identical permutation of indices and both
		// sides contiguous. The sizes are checked by an assertion, because a view
		// cannot be resized. The data is copied with ArrayOperations, using the
		// device of this view as destination and the device of `other` as source.
		template< typename OtherView >
		NDArrayView& operator=( const OtherView& other )
		{
		   using OtherPermutation = typename OtherView::PermutationType;
		   using OtherDevice = typename OtherView::DeviceType;

		   static_assert( std::is_same< PermutationType, OtherPermutation >::value,
		                  "Arrays must have the same permutation of indices." );
		   static_assert( NDArrayView::isContiguous() && OtherView::isContiguous(),
		                  "Non-contiguous array views cannot be assigned." );
		   TNL_ASSERT_TRUE( __ndarray_impl::sizesWeakCompare( getSizes(), other.getSizes() ),
		                    "The sizes of the array views must be equal, views are not resizable." );

		   // nothing to copy when the storage is empty
		   if( ! ( getStorageSize() > 0 ) )
		      return *this;

		   TNL_ASSERT_TRUE( array, "Attempted to assign to an empty view." );
		   Algorithms::ArrayOperations< DeviceType, OtherDevice >::copy( array, other.getData(), getStorageSize() );
		   return *this;
		}

		@@ -101,7 +119,7 @@ public:
		if( sizes != other.sizes )
		return false;
		// FIXME: uninitialized data due to alignment in NDArray and padding in SlicedNDArray
		return ArrayOpsHelper< Device, Device >::compare( array, other.array, getStorageSize() );
		return Algorithms::ArrayOperations< Device, Device >::compare( array, other.array, getStorageSize() );
		}

		__cuda_callable__
		@@ -110,7 +128,7 @@ public:
		if( sizes != other.sizes )
		return true;
		// FIXME: uninitialized data due to alignment in NDArray and padding in SlicedNDArray
		return ! ArrayOpsHelper< Device, Device >::compare( array, other.array, getStorageSize() );
		return ! Algorithms::ArrayOperations< Device, Device >::compare( array, other.array, getStorageSize() );
		}

		static constexpr std::size_t getDimension()
		@@ -118,6 +136,14 @@ public:
		return SizesHolder::getDimension();
		}

		// accessor to the underlying data
		// (should not be used for accessing the elements, intended only for the implementation
		// of operator= and functions like cudaHostRegister)
		// Returns the `array` member of the view, exposed as a pointer to const
		// elements.
		std::add_const_t< ValueType >* getData() const
		{
		return array;
		}

		const SizesHolderType& getSizes() const
		{
		return sizes;
		@@ -285,62 +311,6 @@ public:
		protected:
		Value* array = nullptr;
		SizesHolder sizes;

		// TODO: establish the concept of a "void device" for static computations in the whole TNL

		template< typename DestinationDevice, typename SourceDevice = DestinationDevice, typename _unused = void >
		struct ArrayOpsHelper
		{
		template< typename DestinationValue,
		typename SourceValue,
		typename Index >
		static void copy( DestinationValue* destination,
		const SourceValue* source,
		const Index size )
		{
		Algorithms::ArrayOperations< DestinationDevice, SourceDevice >::copy( destination, source, size );
		}

		template< typename Value1,
		typename Value2,
		typename Index >
		static bool compare( const Value1* destination,
		const Value2* source,
		const Index size )
		{
		return Algorithms::ArrayOperations< DestinationDevice, SourceDevice >::compare( destination, source, size );
		}
		};

		// Specialization for the "void device": the operations are implemented
		// directly as sequential loops marked __cuda_callable__.
		template< typename _unused >
		struct ArrayOpsHelper< void, void, _unused >
		{
		   // Assigns the first `size` elements of `source` to `destination`
		   // in ascending order.
		   template< typename DestinationValue,
		             typename SourceValue,
		             typename Index >
		   __cuda_callable__
		   static void copy( DestinationValue* destination,
		                     const SourceValue* source,
		                     const Index size )
		   {
		      Index position = 0;
		      while( position < size ) {
		         destination[ position ] = source[ position ];
		         position++;
		      }
		   }

		   // Returns false at the first pair of elements that does not compare
		   // equal with operator==, true when there is no such pair.
		   template< typename Value1,
		             typename Value2,
		             typename Index >
		   __cuda_callable__
		   static bool compare( const Value1* destination,
		                        const Value2* source,
		                        const Index size )
		   {
		      for( Index position = 0; position < size; position++ ) {
		         if( destination[ position ] == source[ position ] )
		            continue;
		         return false;
		      }
		      return true;
		   }
		};
		};

		} // namespace Containers

src/TNL/Containers/ndarray/BoundaryExecutors.h

+68 −22

Original line number	Diff line number	Diff line
		@@ -204,12 +204,13 @@ struct ParallelBoundaryExecutor< Permutation, Device, IndexTag< 3 > >
		static_assert( Begins::getDimension() == Ends::getDimension(),
		"wrong begins or ends" );

		using Index = typename Ends::IndexType;

		auto kernel = [=] __cuda_callable__ ( Index i2, Index i1, Index i0 )
		{
		call_with_unpermuted_arguments< Permutation >( f, i0, i1, i2 );
		};
		// nvcc does not like nested __cuda_callable__ and normal lambdas...
		// using Index = typename Ends::IndexType;
		// auto kernel = [=] __cuda_callable__ ( Index i2, Index i1, Index i0 )
		// {
		// call_with_unpermuted_arguments< Permutation >( f, i0, i1, i2 );
		// };
		Kernel< Device > kernel;

		const auto begin0 = begins.template getSize< get< 0 >( Permutation{} ) >();
		const auto begin1 = begins.template getSize< get< 1 >( Permutation{} ) >();
		@@ -224,13 +225,35 @@ struct ParallelBoundaryExecutor< Permutation, Device, IndexTag< 3 > >
		const auto end1 = ends.template getSize< get< 1 >( Permutation{} ) >();
		const auto end2 = ends.template getSize< get< 2 >( Permutation{} ) >();

		ParallelFor3D< Device >::exec( begin2, begin1, begin0, skipBegin2, end1, end0, kernel );
		ParallelFor3D< Device >::exec( skipEnd2, begin1, begin0, end2, end1, end0, kernel );
		ParallelFor3D< Device >::exec( skipBegin2, begin1, begin0, skipEnd2, skipBegin1, end0, kernel );
		ParallelFor3D< Device >::exec( skipBegin2, skipEnd1, begin0, skipEnd2, end1, end0, kernel );
		ParallelFor3D< Device >::exec( skipBegin2, skipBegin1, begin0, skipEnd2, skipEnd1, skipBegin0, kernel );
		ParallelFor3D< Device >::exec( skipBegin2, skipBegin1, skipEnd0, skipEnd2, skipEnd1, end0, kernel );
		ParallelFor3D< Device >::exec( begin2, begin1, begin0, skipBegin2, end1, end0, kernel, f );
		ParallelFor3D< Device >::exec( skipEnd2, begin1, begin0, end2, end1, end0, kernel, f );
		ParallelFor3D< Device >::exec( skipBegin2, begin1, begin0, skipEnd2, skipBegin1, end0, kernel, f );
		ParallelFor3D< Device >::exec( skipBegin2, skipEnd1, begin0, skipEnd2, end1, end0, kernel, f );
		ParallelFor3D< Device >::exec( skipBegin2, skipBegin1, begin0, skipEnd2, skipEnd1, skipBegin0, kernel, f );
		ParallelFor3D< Device >::exec( skipBegin2, skipBegin1, skipEnd0, skipEnd2, skipEnd1, end0, kernel, f );
		}

		// Generic kernel functor, used where a lambda would otherwise be.
		// It receives the loop indices in the order (i2, i1, i0) followed by the
		// user functor f and passes them to
		// call_with_unpermuted_arguments< Permutation > in the order (i0, i1, i2).
		// The template parameter is deliberately not spelled with a leading double
		// underscore: such identifiers are reserved for the implementation.
		template< typename KernelDevice, typename = void >
		struct Kernel
		{
		   template< typename Index, typename Func >
		   void operator()( Index i2, Index i1, Index i0, Func f )
		   {
		      call_with_unpermuted_arguments< Permutation >( f, i0, i1, i2 );
		   }
		};

		// Dummy specialization for Devices::Cuda to avoid a flood of nvcc warnings:
		// it does the same as the generic functor, but its operator() is marked
		// __cuda_callable__.
		// NOTE(review): the extra unused template parameter presumably exists so
		// that this can be a partial specialization at class scope - confirm.
		// The parameter is not called `__unused`, because identifiers with a double
		// underscore are reserved (and `__unused` is a macro on some platforms).
		template< typename Unused >
		struct Kernel< Devices::Cuda, Unused >
		{
		   template< typename Index, typename Func >
		   __cuda_callable__
		   void operator()( Index i2, Index i1, Index i0, Func f )
		   {
		      call_with_unpermuted_arguments< Permutation >( f, i0, i1, i2 );
		   }
		};
		};

		template< typename Permutation,
		@@ -251,12 +274,13 @@ struct ParallelBoundaryExecutor< Permutation, Device, IndexTag< 2 > >
		static_assert( Begins::getDimension() == Ends::getDimension(),
		"wrong begins or ends" );

		using Index = typename Ends::IndexType;

		auto kernel = [=] __cuda_callable__ ( Index i1, Index i0 )
		{
		call_with_unpermuted_arguments< Permutation >( f, i0, i1 );
		};
		// nvcc does not like nested __cuda_callable__ and normal lambdas...
		// using Index = typename Ends::IndexType;
		// auto kernel = [=] __cuda_callable__ ( Index i1, Index i0 )
		// {
		// call_with_unpermuted_arguments< Permutation >( f, i0, i1 );
		// };
		Kernel< Device > kernel;

		const auto begin0 = begins.template getSize< get< 0 >( Permutation{} ) >();
		const auto begin1 = begins.template getSize< get< 1 >( Permutation{} ) >();
		@@ -267,11 +291,33 @@ struct ParallelBoundaryExecutor< Permutation, Device, IndexTag< 2 > >
		const auto end0 = ends.template getSize< get< 0 >( Permutation{} ) >();
		const auto end1 = ends.template getSize< get< 1 >( Permutation{} ) >();

		ParallelFor2D< Device >::exec( begin1, begin0, skipBegin1, end0, kernel );
		ParallelFor2D< Device >::exec( skipEnd1, begin0, end1, end0, kernel );
		ParallelFor2D< Device >::exec( skipBegin1, begin0, skipEnd1, skipBegin0, kernel );
		ParallelFor2D< Device >::exec( skipBegin1, skipEnd0, skipEnd1, end0, kernel );
		ParallelFor2D< Device >::exec( begin1, begin0, skipBegin1, end0, kernel, f );
		ParallelFor2D< Device >::exec( skipEnd1, begin0, end1, end0, kernel, f );
		ParallelFor2D< Device >::exec( skipBegin1, begin0, skipEnd1, skipBegin0, kernel, f );
		ParallelFor2D< Device >::exec( skipBegin1, skipEnd0, skipEnd1, end0, kernel, f );
		}

		// Generic kernel functor, used where a lambda would otherwise be.
		// It receives the loop indices in the order (i1, i0) followed by the user
		// functor f and passes them to
		// call_with_unpermuted_arguments< Permutation > in the order (i0, i1).
		// The template parameter is deliberately not spelled with a leading double
		// underscore: such identifiers are reserved for the implementation.
		template< typename KernelDevice, typename = void >
		struct Kernel
		{
		   template< typename Index, typename Func >
		   void operator()( Index i1, Index i0, Func f )
		   {
		      call_with_unpermuted_arguments< Permutation >( f, i0, i1 );
		   }
		};

		// Dummy specialization for Devices::Cuda to avoid a flood of nvcc warnings:
		// it does the same as the generic functor, but its operator() is marked
		// __cuda_callable__.
		// NOTE(review): the extra unused template parameter presumably exists so
		// that this can be a partial specialization at class scope - confirm.
		// The parameter is not called `__unused`, because identifiers with a double
		// underscore are reserved (and `__unused` is a macro on some platforms).
		template< typename Unused >
		struct Kernel< Devices::Cuda, Unused >
		{
		   template< typename Index, typename Func >
		   __cuda_callable__
		   void operator()( Index i1, Index i0, Func f )
		   {
		      call_with_unpermuted_arguments< Permutation >( f, i0, i1 );
		   }
		};
		};

		template< typename Permutation,