Loading src/TNL/Containers/Algorithms/ArrayOperations.h +37 −0 Original line number Diff line number Diff line Loading @@ -22,6 +22,42 @@ template< typename DestinationDevice, typename SourceDevice = DestinationDevice > struct ArrayOperations; // TODO: establish the concept of a "void device" for static computations in the whole TNL template<> struct ArrayOperations< void > { template< typename Element > __cuda_callable__ static void setElement( Element* data, const Element& value ); template< typename Element > __cuda_callable__ static Element getElement( const Element* data ); template< typename Element, typename Index > __cuda_callable__ static void set( Element* data, const Element& value, const Index size ); template< typename DestinationElement, typename SourceElement, typename Index > __cuda_callable__ static void copy( DestinationElement* destination, const SourceElement* source, const Index size ); template< typename Element1, typename Element2, typename Index > __cuda_callable__ static bool compare( const Element1* destination, const Element2* source, const Index size ); }; template<> struct ArrayOperations< Devices::Host > { Loading Loading @@ -251,6 +287,7 @@ struct ArrayOperations< Devices::Host, Devices::MIC > } // namespace Containers } // namespace TNL #include <TNL/Containers/Algorithms/ArrayOperationsStatic.hpp> #include <TNL/Containers/Algorithms/ArrayOperationsHost.hpp> #include <TNL/Containers/Algorithms/ArrayOperationsCuda.hpp> #include <TNL/Containers/Algorithms/ArrayOperationsMIC.hpp> src/TNL/Containers/Algorithms/ArrayOperationsStatic.hpp 0 → 100644 +82 −0 Original line number Diff line number Diff line /*************************************************************************** ArrayOperationsStatic_impl.h - description ------------------- begin : Apr 8, 2019 copyright : (C) 2019 by Tomas Oberhuber et al. email : tomas.oberhuber@fjfi.cvut.cz ***************************************************************************/ /* See Copyright Notice in tnl/Copyright */ #pragma once #include <TNL/Containers/Algorithms/ArrayOperations.h> namespace TNL { namespace Containers { namespace Algorithms { template< typename Element > __cuda_callable__ void ArrayOperations< void >:: setElement( Element* data, const Element& value ) { *data = value; } template< typename Element > __cuda_callable__ Element ArrayOperations< void >:: getElement( const Element* data ) { return *data; } template< typename Element, typename Index > __cuda_callable__ void ArrayOperations< void >:: set( Element* data, const Element& value, const Index size ) { for( Index i = 0; i < size; i ++ ) data[ i ] = value; } template< typename DestinationElement, typename SourceElement, typename Index > __cuda_callable__ void ArrayOperations< void >:: copy( DestinationElement* destination, const SourceElement* source, const Index size ) { for( Index i = 0; i < size; i ++ ) destination[ i ] = source[ i ]; } template< typename Element1, typename Element2, typename Index > __cuda_callable__ bool ArrayOperations< void >:: compare( const Element1* destination, const Element2* source, const Index size ) { for( Index i = 0; i < size; i++ ) if( ! ( destination[ i ] == source[ i ] ) ) return false; return true; } } // namespace Algorithms } // namespace Containers } // namespace TNL src/TNL/Containers/NDArray.h +73 −2 Original line number Diff line number Diff line Loading @@ -53,6 +53,12 @@ public: static_assert( Permutation::size() == SizesHolder::getDimension(), "invalid permutation" ); // for compatibility with NDArrayView (which inherits from StrideBase) static constexpr bool isContiguous() { return true; } // all methods from NDArrayView NDArrayStorage() = default; Loading @@ -70,6 +76,21 @@ public: NDArrayStorage( NDArrayStorage&& ) = default; NDArrayStorage& operator=( NDArrayStorage&& ) = default; // Templated copy-assignment template< typename OtherArray > NDArrayStorage& operator=( const OtherArray& other ) { static_assert( std::is_same< PermutationType, typename OtherArray::PermutationType >::value, "Arrays must have the same permutation of indices." ); // update sizes __ndarray_impl::SetSizesCopyHelper< SizesHolderType, typename OtherArray::SizesHolderType >::copy( sizes, other.getSizes() ); // (re)allocate storage if necessary array.setSize( getStorageSize() ); // copy data getView() = other.getConstView(); return *this; } bool operator==( const NDArrayStorage& other ) const { // FIXME: uninitialized data due to alignment in NDArray and padding in SlicedNDArray Loading @@ -82,6 +103,14 @@ public: return sizes != other.sizes || array != other.array; } // accessor to the underlying data // (should not be used for accessing the elements, intended only for the implementation // of operator= and functions like cudaHostRegister) std::add_const_t< ValueType >* getData() const { return array.getData(); } static constexpr std::size_t getDimension() { return SizesHolder::getDimension(); Loading Loading @@ -330,7 +359,18 @@ class NDArray PermutationHost, PermutationCuda >::type, __ndarray_impl::NDArrayBase< SliceInfo< 0, 0 > > > {}; { using Base = NDArrayStorage< Array< Value, Device, Index >, SizesHolder, typename std::conditional< std::is_same< Device, Devices::Host >::value, PermutationHost, PermutationCuda >::type, __ndarray_impl::NDArrayBase< SliceInfo< 0, 0 > > >; public: // inherit all assignment operators using Base::operator=; }; template< typename Value, typename SizesHolder, Loading @@ -343,8 +383,17 @@ class StaticNDArray __ndarray_impl::NDArrayBase< SliceInfo< 0, 0 > >, void > { using Base = NDArrayStorage< StaticArray< __ndarray_impl::StaticStorageSizeGetter< SizesHolder >::get(), Value >, SizesHolder, Permutation, __ndarray_impl::NDArrayBase< SliceInfo< 0, 0 > >, void >; static_assert( __ndarray_impl::StaticStorageSizeGetter< SizesHolder >::get() > 0, "All dimensions of a static array must to be positive." ); public: // inherit all assignment operators using Base::operator=; }; template< typename Value, Loading @@ -356,7 +405,14 @@ class StaticMatrix SizesHolder< std::size_t, Rows, Columns >, Permutation > { using Base = StaticNDArray< Value, SizesHolder< std::size_t, Rows, Columns >, Permutation >; public: // inherit all assignment operators using Base::operator=; static constexpr std::size_t getRows() { return Rows; Loading Loading @@ -388,7 +444,22 @@ class SlicedNDArray SliceInfoHost, SliceInfoCuda >::type > > {}; { using Base = NDArrayStorage< Array< Value, Device, Index >, SizesHolder, typename std::conditional< std::is_same< Device, Devices::Host >::value, PermutationHost, PermutationCuda >::type, __ndarray_impl::SlicedNDArrayBase< typename std::conditional< std::is_same< Device, Devices::Host >::value, SliceInfoHost, SliceInfoCuda >::type > >; public: // inherit all assignment operators using Base::operator=; }; } // namespace Containers } // namespace TNL src/TNL/Containers/NDArrayView.h +29 −59 Original line number Diff line number Diff line Loading @@ -18,6 +18,7 @@ #include <TNL/Containers/ndarray/Executors.h> #include <TNL/Containers/ndarray/BoundaryExecutors.h> #include <TNL/Containers/ndarray/Operations.h> #include <TNL/Containers/Algorithms/ArrayOperations.h> namespace TNL { namespace Containers { Loading Loading @@ -71,7 +72,24 @@ public: { TNL_ASSERT_EQ( sizes, other.sizes, "The sizes of the array views must be equal, views are not resizable." ); if( getStorageSize() > 0 ) ArrayOpsHelper< Device >::copy( array, other.array, getStorageSize() ); Algorithms::ArrayOperations< DeviceType >::copy( array, other.array, getStorageSize() ); return *this; } // Templated copy-assignment template< typename OtherView > NDArrayView& operator=( const OtherView& other ) { static_assert( std::is_same< PermutationType, typename OtherView::PermutationType >::value, "Arrays must have the same permutation of indices." ); static_assert( NDArrayView::isContiguous() && OtherView::isContiguous(), "Non-contiguous array views cannot be assigned." ); TNL_ASSERT_TRUE( __ndarray_impl::sizesWeakCompare( getSizes(), other.getSizes() ), "The sizes of the array views must be equal, views are not resizable." ); if( getStorageSize() > 0 ) { TNL_ASSERT_TRUE( array, "Attempted to assign to an empty view." ); Algorithms::ArrayOperations< DeviceType, typename OtherView::DeviceType >::copy( array, other.getData(), getStorageSize() ); } return *this; } Loading Loading @@ -101,7 +119,7 @@ public: if( sizes != other.sizes ) return false; // FIXME: uninitialized data due to alignment in NDArray and padding in SlicedNDArray return ArrayOpsHelper< Device, Device >::compare( array, other.array, getStorageSize() ); return Algorithms::ArrayOperations< Device, Device >::compare( array, other.array, getStorageSize() ); } __cuda_callable__ Loading @@ -110,7 +128,7 @@ public: if( sizes != other.sizes ) return true; // FIXME: uninitialized data due to alignment in NDArray and padding in SlicedNDArray return ! ArrayOpsHelper< Device, Device >::compare( array, other.array, getStorageSize() ); return ! Algorithms::ArrayOperations< Device, Device >::compare( array, other.array, getStorageSize() ); } static constexpr std::size_t getDimension() Loading @@ -118,6 +136,14 @@ public: return SizesHolder::getDimension(); } // accessor to the underlying data // (should not be used for accessing the elements, intended only for the implementation // of operator= and functions like cudaHostRegister) std::add_const_t< ValueType >* getData() const { return array; } const SizesHolderType& getSizes() const { return sizes; Loading Loading @@ -285,62 +311,6 @@ public: protected: Value* array = nullptr; SizesHolder sizes; // TODO: establish the concept of a "void device" for static computations in the whole TNL template< typename DestinationDevice, typename SourceDevice = DestinationDevice, typename _unused = void > struct ArrayOpsHelper { template< typename DestinationValue, typename SourceValue, typename Index > static void copy( DestinationValue* destination, const SourceValue* source, const Index size ) { Algorithms::ArrayOperations< DestinationDevice, SourceDevice >::copy( destination, source, size ); } template< typename Value1, typename Value2, typename Index > static bool compare( const Value1* destination, const Value2* source, const Index size ) { return Algorithms::ArrayOperations< DestinationDevice, SourceDevice >::compare( destination, source, size ); } }; template< typename _unused > struct ArrayOpsHelper< void, void, _unused > { template< typename DestinationValue, typename SourceValue, typename Index > __cuda_callable__ static void copy( DestinationValue* destination, const SourceValue* source, const Index size ) { for( Index i = 0; i < size; i ++ ) destination[ i ] = source[ i ]; } template< typename Value1, typename Value2, typename Index > __cuda_callable__ static bool compare( const Value1* destination, const Value2* source, const Index size ) { for( Index i = 0; i < size; i++ ) if( ! ( destination[ i ] == source[ i ] ) ) return false; return true; } }; }; } // namespace Containers Loading src/TNL/Containers/ndarray/BoundaryExecutors.h +68 −22 Original line number Diff line number Diff line Loading @@ -204,12 +204,13 @@ struct ParallelBoundaryExecutor< Permutation, Device, IndexTag< 3 > > static_assert( Begins::getDimension() == Ends::getDimension(), "wrong begins or ends" ); using Index = typename Ends::IndexType; auto kernel = [=] __cuda_callable__ ( Index i2, Index i1, Index i0 ) { call_with_unpermuted_arguments< Permutation >( f, i0, i1, i2 ); }; // nvcc does not like nested __cuda_callable__ and normal lambdas... // using Index = typename Ends::IndexType; // auto kernel = [=] __cuda_callable__ ( Index i2, Index i1, Index i0 ) // { // call_with_unpermuted_arguments< Permutation >( f, i0, i1, i2 ); // }; Kernel< Device > kernel; const auto begin0 = begins.template getSize< get< 0 >( Permutation{} ) >(); const auto begin1 = begins.template getSize< get< 1 >( Permutation{} ) >(); Loading @@ -224,13 +225,35 @@ struct ParallelBoundaryExecutor< Permutation, Device, IndexTag< 3 > > const auto end1 = ends.template getSize< get< 1 >( Permutation{} ) >(); const auto end2 = ends.template getSize< get< 2 >( Permutation{} ) >(); ParallelFor3D< Device >::exec( begin2, begin1, begin0, skipBegin2, end1, end0, kernel ); ParallelFor3D< Device >::exec( skipEnd2, begin1, begin0, end2, end1, end0, kernel ); ParallelFor3D< Device >::exec( skipBegin2, begin1, begin0, skipEnd2, skipBegin1, end0, kernel ); ParallelFor3D< Device >::exec( skipBegin2, skipEnd1, begin0, skipEnd2, end1, end0, kernel ); ParallelFor3D< Device >::exec( skipBegin2, skipBegin1, begin0, skipEnd2, skipEnd1, skipBegin0, kernel ); ParallelFor3D< Device >::exec( skipBegin2, skipBegin1, skipEnd0, skipEnd2, skipEnd1, end0, kernel ); ParallelFor3D< Device >::exec( begin2, begin1, begin0, skipBegin2, end1, end0, kernel, f ); ParallelFor3D< Device >::exec( skipEnd2, begin1, begin0, end2, end1, end0, kernel, f ); ParallelFor3D< Device >::exec( skipBegin2, begin1, begin0, skipEnd2, skipBegin1, end0, kernel, f ); ParallelFor3D< Device >::exec( skipBegin2, skipEnd1, begin0, skipEnd2, end1, end0, kernel, f ); ParallelFor3D< Device >::exec( skipBegin2, skipBegin1, begin0, skipEnd2, skipEnd1, skipBegin0, kernel, f ); ParallelFor3D< Device >::exec( skipBegin2, skipBegin1, skipEnd0, skipEnd2, skipEnd1, end0, kernel, f ); } template< typename __Device, typename = void > struct Kernel { template< typename Index, typename Func > void operator()( Index i2, Index i1, Index i0, Func f ) { call_with_unpermuted_arguments< Permutation >( f, i0, i1, i2 ); }; }; // dummy specialization to avoid a shitpile of nvcc warnings template< typename __unused > struct Kernel< Devices::Cuda, __unused > { template< typename Index, typename Func > __cuda_callable__ void operator()( Index i2, Index i1, Index i0, Func f ) { call_with_unpermuted_arguments< Permutation >( f, i0, i1, i2 ); }; }; }; template< typename Permutation, Loading @@ -251,12 +274,13 @@ struct ParallelBoundaryExecutor< Permutation, Device, IndexTag< 2 > > static_assert( Begins::getDimension() == Ends::getDimension(), "wrong begins or ends" ); using Index = typename Ends::IndexType; auto kernel = [=] __cuda_callable__ ( Index i1, Index i0 ) { call_with_unpermuted_arguments< Permutation >( f, i0, i1 ); }; // nvcc does not like nested __cuda_callable__ and normal lambdas... // using Index = typename Ends::IndexType; // auto kernel = [=] __cuda_callable__ ( Index i1, Index i0 ) // { // call_with_unpermuted_arguments< Permutation >( f, i0, i1 ); // }; Kernel< Device > kernel; const auto begin0 = begins.template getSize< get< 0 >( Permutation{} ) >(); const auto begin1 = begins.template getSize< get< 1 >( Permutation{} ) >(); Loading @@ -267,11 +291,33 @@ struct ParallelBoundaryExecutor< Permutation, Device, IndexTag< 2 > > const auto end0 = ends.template getSize< get< 0 >( Permutation{} ) >(); const auto end1 = ends.template getSize< get< 1 >( Permutation{} ) >(); ParallelFor2D< Device >::exec( begin1, begin0, skipBegin1, end0, kernel ); ParallelFor2D< Device >::exec( skipEnd1, begin0, end1, end0, kernel ); ParallelFor2D< Device >::exec( skipBegin1, begin0, skipEnd1, skipBegin0, kernel ); ParallelFor2D< Device >::exec( skipBegin1, skipEnd0, skipEnd1, end0, kernel ); ParallelFor2D< Device >::exec( begin1, begin0, skipBegin1, end0, kernel, f ); ParallelFor2D< Device >::exec( skipEnd1, begin0, end1, end0, kernel, f ); ParallelFor2D< Device >::exec( skipBegin1, begin0, skipEnd1, skipBegin0, kernel, f ); ParallelFor2D< Device >::exec( skipBegin1, skipEnd0, skipEnd1, end0, kernel, f ); } template< typename __Device, typename = void > struct Kernel { template< typename Index, typename Func > void operator()( Index i1, Index i0, Func f ) { call_with_unpermuted_arguments< Permutation >( f, i0, i1 ); }; }; // dummy specialization to avoid a shitpile of nvcc warnings template< typename __unused > struct Kernel< Devices::Cuda, __unused > { template< typename Index, typename Func > __cuda_callable__ void operator()( Index i1, Index i0, Func f ) { call_with_unpermuted_arguments< Permutation >( f, i0, i1 ); }; }; }; template< typename Permutation, Loading Loading
src/TNL/Containers/Algorithms/ArrayOperations.h +37 −0 Original line number Diff line number Diff line Loading @@ -22,6 +22,42 @@ template< typename DestinationDevice, typename SourceDevice = DestinationDevice > struct ArrayOperations; // TODO: establish the concept of a "void device" for static computations in the whole TNL template<> struct ArrayOperations< void > { template< typename Element > __cuda_callable__ static void setElement( Element* data, const Element& value ); template< typename Element > __cuda_callable__ static Element getElement( const Element* data ); template< typename Element, typename Index > __cuda_callable__ static void set( Element* data, const Element& value, const Index size ); template< typename DestinationElement, typename SourceElement, typename Index > __cuda_callable__ static void copy( DestinationElement* destination, const SourceElement* source, const Index size ); template< typename Element1, typename Element2, typename Index > __cuda_callable__ static bool compare( const Element1* destination, const Element2* source, const Index size ); }; template<> struct ArrayOperations< Devices::Host > { Loading Loading @@ -251,6 +287,7 @@ struct ArrayOperations< Devices::Host, Devices::MIC > } // namespace Containers } // namespace TNL #include <TNL/Containers/Algorithms/ArrayOperationsStatic.hpp> #include <TNL/Containers/Algorithms/ArrayOperationsHost.hpp> #include <TNL/Containers/Algorithms/ArrayOperationsCuda.hpp> #include <TNL/Containers/Algorithms/ArrayOperationsMIC.hpp>
src/TNL/Containers/Algorithms/ArrayOperationsStatic.hpp 0 → 100644 +82 −0 Original line number Diff line number Diff line /*************************************************************************** ArrayOperationsStatic_impl.h - description ------------------- begin : Apr 8, 2019 copyright : (C) 2019 by Tomas Oberhuber et al. email : tomas.oberhuber@fjfi.cvut.cz ***************************************************************************/ /* See Copyright Notice in tnl/Copyright */ #pragma once #include <TNL/Containers/Algorithms/ArrayOperations.h> namespace TNL { namespace Containers { namespace Algorithms { template< typename Element > __cuda_callable__ void ArrayOperations< void >:: setElement( Element* data, const Element& value ) { *data = value; } template< typename Element > __cuda_callable__ Element ArrayOperations< void >:: getElement( const Element* data ) { return *data; } template< typename Element, typename Index > __cuda_callable__ void ArrayOperations< void >:: set( Element* data, const Element& value, const Index size ) { for( Index i = 0; i < size; i ++ ) data[ i ] = value; } template< typename DestinationElement, typename SourceElement, typename Index > __cuda_callable__ void ArrayOperations< void >:: copy( DestinationElement* destination, const SourceElement* source, const Index size ) { for( Index i = 0; i < size; i ++ ) destination[ i ] = source[ i ]; } template< typename Element1, typename Element2, typename Index > __cuda_callable__ bool ArrayOperations< void >:: compare( const Element1* destination, const Element2* source, const Index size ) { for( Index i = 0; i < size; i++ ) if( ! ( destination[ i ] == source[ i ] ) ) return false; return true; } } // namespace Algorithms } // namespace Containers } // namespace TNL
src/TNL/Containers/NDArray.h +73 −2 Original line number Diff line number Diff line Loading @@ -53,6 +53,12 @@ public: static_assert( Permutation::size() == SizesHolder::getDimension(), "invalid permutation" ); // for compatibility with NDArrayView (which inherits from StrideBase) static constexpr bool isContiguous() { return true; } // all methods from NDArrayView NDArrayStorage() = default; Loading @@ -70,6 +76,21 @@ public: NDArrayStorage( NDArrayStorage&& ) = default; NDArrayStorage& operator=( NDArrayStorage&& ) = default; // Templated copy-assignment template< typename OtherArray > NDArrayStorage& operator=( const OtherArray& other ) { static_assert( std::is_same< PermutationType, typename OtherArray::PermutationType >::value, "Arrays must have the same permutation of indices." ); // update sizes __ndarray_impl::SetSizesCopyHelper< SizesHolderType, typename OtherArray::SizesHolderType >::copy( sizes, other.getSizes() ); // (re)allocate storage if necessary array.setSize( getStorageSize() ); // copy data getView() = other.getConstView(); return *this; } bool operator==( const NDArrayStorage& other ) const { // FIXME: uninitialized data due to alignment in NDArray and padding in SlicedNDArray Loading @@ -82,6 +103,14 @@ public: return sizes != other.sizes || array != other.array; } // accessor to the underlying data // (should not be used for accessing the elements, intended only for the implementation // of operator= and functions like cudaHostRegister) std::add_const_t< ValueType >* getData() const { return array.getData(); } static constexpr std::size_t getDimension() { return SizesHolder::getDimension(); Loading Loading @@ -330,7 +359,18 @@ class NDArray PermutationHost, PermutationCuda >::type, __ndarray_impl::NDArrayBase< SliceInfo< 0, 0 > > > {}; { using Base = NDArrayStorage< Array< Value, Device, Index >, SizesHolder, typename std::conditional< std::is_same< Device, Devices::Host >::value, PermutationHost, PermutationCuda >::type, __ndarray_impl::NDArrayBase< SliceInfo< 0, 0 > > >; public: // inherit all assignment operators using Base::operator=; }; template< typename Value, typename SizesHolder, Loading @@ -343,8 +383,17 @@ class StaticNDArray __ndarray_impl::NDArrayBase< SliceInfo< 0, 0 > >, void > { using Base = NDArrayStorage< StaticArray< __ndarray_impl::StaticStorageSizeGetter< SizesHolder >::get(), Value >, SizesHolder, Permutation, __ndarray_impl::NDArrayBase< SliceInfo< 0, 0 > >, void >; static_assert( __ndarray_impl::StaticStorageSizeGetter< SizesHolder >::get() > 0, "All dimensions of a static array must to be positive." ); public: // inherit all assignment operators using Base::operator=; }; template< typename Value, Loading @@ -356,7 +405,14 @@ class StaticMatrix SizesHolder< std::size_t, Rows, Columns >, Permutation > { using Base = StaticNDArray< Value, SizesHolder< std::size_t, Rows, Columns >, Permutation >; public: // inherit all assignment operators using Base::operator=; static constexpr std::size_t getRows() { return Rows; Loading Loading @@ -388,7 +444,22 @@ class SlicedNDArray SliceInfoHost, SliceInfoCuda >::type > > {}; { using Base = NDArrayStorage< Array< Value, Device, Index >, SizesHolder, typename std::conditional< std::is_same< Device, Devices::Host >::value, PermutationHost, PermutationCuda >::type, __ndarray_impl::SlicedNDArrayBase< typename std::conditional< std::is_same< Device, Devices::Host >::value, SliceInfoHost, SliceInfoCuda >::type > >; public: // inherit all assignment operators using Base::operator=; }; } // namespace Containers } // namespace TNL
src/TNL/Containers/NDArrayView.h +29 −59 Original line number Diff line number Diff line Loading @@ -18,6 +18,7 @@ #include <TNL/Containers/ndarray/Executors.h> #include <TNL/Containers/ndarray/BoundaryExecutors.h> #include <TNL/Containers/ndarray/Operations.h> #include <TNL/Containers/Algorithms/ArrayOperations.h> namespace TNL { namespace Containers { Loading Loading @@ -71,7 +72,24 @@ public: { TNL_ASSERT_EQ( sizes, other.sizes, "The sizes of the array views must be equal, views are not resizable." ); if( getStorageSize() > 0 ) ArrayOpsHelper< Device >::copy( array, other.array, getStorageSize() ); Algorithms::ArrayOperations< DeviceType >::copy( array, other.array, getStorageSize() ); return *this; } // Templated copy-assignment template< typename OtherView > NDArrayView& operator=( const OtherView& other ) { static_assert( std::is_same< PermutationType, typename OtherView::PermutationType >::value, "Arrays must have the same permutation of indices." ); static_assert( NDArrayView::isContiguous() && OtherView::isContiguous(), "Non-contiguous array views cannot be assigned." ); TNL_ASSERT_TRUE( __ndarray_impl::sizesWeakCompare( getSizes(), other.getSizes() ), "The sizes of the array views must be equal, views are not resizable." ); if( getStorageSize() > 0 ) { TNL_ASSERT_TRUE( array, "Attempted to assign to an empty view." ); Algorithms::ArrayOperations< DeviceType, typename OtherView::DeviceType >::copy( array, other.getData(), getStorageSize() ); } return *this; } Loading Loading @@ -101,7 +119,7 @@ public: if( sizes != other.sizes ) return false; // FIXME: uninitialized data due to alignment in NDArray and padding in SlicedNDArray return ArrayOpsHelper< Device, Device >::compare( array, other.array, getStorageSize() ); return Algorithms::ArrayOperations< Device, Device >::compare( array, other.array, getStorageSize() ); } __cuda_callable__ Loading @@ -110,7 +128,7 @@ public: if( sizes != other.sizes ) return true; // FIXME: uninitialized data due to alignment in NDArray and padding in SlicedNDArray return ! ArrayOpsHelper< Device, Device >::compare( array, other.array, getStorageSize() ); return ! Algorithms::ArrayOperations< Device, Device >::compare( array, other.array, getStorageSize() ); } static constexpr std::size_t getDimension() Loading @@ -118,6 +136,14 @@ public: return SizesHolder::getDimension(); } // accessor to the underlying data // (should not be used for accessing the elements, intended only for the implementation // of operator= and functions like cudaHostRegister) std::add_const_t< ValueType >* getData() const { return array; } const SizesHolderType& getSizes() const { return sizes; Loading Loading @@ -285,62 +311,6 @@ public: protected: Value* array = nullptr; SizesHolder sizes; // TODO: establish the concept of a "void device" for static computations in the whole TNL template< typename DestinationDevice, typename SourceDevice = DestinationDevice, typename _unused = void > struct ArrayOpsHelper { template< typename DestinationValue, typename SourceValue, typename Index > static void copy( DestinationValue* destination, const SourceValue* source, const Index size ) { Algorithms::ArrayOperations< DestinationDevice, SourceDevice >::copy( destination, source, size ); } template< typename Value1, typename Value2, typename Index > static bool compare( const Value1* destination, const Value2* source, const Index size ) { return Algorithms::ArrayOperations< DestinationDevice, SourceDevice >::compare( destination, source, size ); } }; template< typename _unused > struct ArrayOpsHelper< void, void, _unused > { template< typename DestinationValue, typename SourceValue, typename Index > __cuda_callable__ static void copy( DestinationValue* destination, const SourceValue* source, const Index size ) { for( Index i = 0; i < size; i ++ ) destination[ i ] = source[ i ]; } template< typename Value1, typename Value2, typename Index > __cuda_callable__ static bool compare( const Value1* destination, const Value2* source, const Index size ) { for( Index i = 0; i < size; i++ ) if( ! ( destination[ i ] == source[ i ] ) ) return false; return true; } }; }; } // namespace Containers Loading
src/TNL/Containers/ndarray/BoundaryExecutors.h +68 −22 Original line number Diff line number Diff line Loading @@ -204,12 +204,13 @@ struct ParallelBoundaryExecutor< Permutation, Device, IndexTag< 3 > > static_assert( Begins::getDimension() == Ends::getDimension(), "wrong begins or ends" ); using Index = typename Ends::IndexType; auto kernel = [=] __cuda_callable__ ( Index i2, Index i1, Index i0 ) { call_with_unpermuted_arguments< Permutation >( f, i0, i1, i2 ); }; // nvcc does not like nested __cuda_callable__ and normal lambdas... // using Index = typename Ends::IndexType; // auto kernel = [=] __cuda_callable__ ( Index i2, Index i1, Index i0 ) // { // call_with_unpermuted_arguments< Permutation >( f, i0, i1, i2 ); // }; Kernel< Device > kernel; const auto begin0 = begins.template getSize< get< 0 >( Permutation{} ) >(); const auto begin1 = begins.template getSize< get< 1 >( Permutation{} ) >(); Loading @@ -224,13 +225,35 @@ struct ParallelBoundaryExecutor< Permutation, Device, IndexTag< 3 > > const auto end1 = ends.template getSize< get< 1 >( Permutation{} ) >(); const auto end2 = ends.template getSize< get< 2 >( Permutation{} ) >(); ParallelFor3D< Device >::exec( begin2, begin1, begin0, skipBegin2, end1, end0, kernel ); ParallelFor3D< Device >::exec( skipEnd2, begin1, begin0, end2, end1, end0, kernel ); ParallelFor3D< Device >::exec( skipBegin2, begin1, begin0, skipEnd2, skipBegin1, end0, kernel ); ParallelFor3D< Device >::exec( skipBegin2, skipEnd1, begin0, skipEnd2, end1, end0, kernel ); ParallelFor3D< Device >::exec( skipBegin2, skipBegin1, begin0, skipEnd2, skipEnd1, skipBegin0, kernel ); ParallelFor3D< Device >::exec( skipBegin2, skipBegin1, skipEnd0, skipEnd2, skipEnd1, end0, kernel ); ParallelFor3D< Device >::exec( begin2, begin1, begin0, skipBegin2, end1, end0, kernel, f ); ParallelFor3D< Device >::exec( skipEnd2, begin1, begin0, end2, end1, end0, kernel, f ); ParallelFor3D< Device >::exec( skipBegin2, begin1, begin0, skipEnd2, skipBegin1, end0, kernel, f ); ParallelFor3D< Device >::exec( skipBegin2, skipEnd1, begin0, skipEnd2, end1, end0, kernel, f ); ParallelFor3D< Device >::exec( skipBegin2, skipBegin1, begin0, skipEnd2, skipEnd1, skipBegin0, kernel, f ); ParallelFor3D< Device >::exec( skipBegin2, skipBegin1, skipEnd0, skipEnd2, skipEnd1, end0, kernel, f ); } template< typename __Device, typename = void > struct Kernel { template< typename Index, typename Func > void operator()( Index i2, Index i1, Index i0, Func f ) { call_with_unpermuted_arguments< Permutation >( f, i0, i1, i2 ); }; }; // dummy specialization to avoid a shitpile of nvcc warnings template< typename __unused > struct Kernel< Devices::Cuda, __unused > { template< typename Index, typename Func > __cuda_callable__ void operator()( Index i2, Index i1, Index i0, Func f ) { call_with_unpermuted_arguments< Permutation >( f, i0, i1, i2 ); }; }; }; template< typename Permutation, Loading @@ -251,12 +274,13 @@ struct ParallelBoundaryExecutor< Permutation, Device, IndexTag< 2 > > static_assert( Begins::getDimension() == Ends::getDimension(), "wrong begins or ends" ); using Index = typename Ends::IndexType; auto kernel = [=] __cuda_callable__ ( Index i1, Index i0 ) { call_with_unpermuted_arguments< Permutation >( f, i0, i1 ); }; // nvcc does not like nested __cuda_callable__ and normal lambdas... // using Index = typename Ends::IndexType; // auto kernel = [=] __cuda_callable__ ( Index i1, Index i0 ) // { // call_with_unpermuted_arguments< Permutation >( f, i0, i1 ); // }; Kernel< Device > kernel; const auto begin0 = begins.template getSize< get< 0 >( Permutation{} ) >(); const auto begin1 = begins.template getSize< get< 1 >( Permutation{} ) >(); Loading @@ -267,11 +291,33 @@ struct ParallelBoundaryExecutor< Permutation, Device, IndexTag< 2 > > const auto end0 = ends.template getSize< get< 0 >( Permutation{} ) >(); const auto end1 = ends.template getSize< get< 1 >( Permutation{} ) >(); ParallelFor2D< Device >::exec( begin1, begin0, skipBegin1, end0, kernel ); ParallelFor2D< Device >::exec( skipEnd1, begin0, end1, end0, kernel ); ParallelFor2D< Device >::exec( skipBegin1, begin0, skipEnd1, skipBegin0, kernel ); ParallelFor2D< Device >::exec( skipBegin1, skipEnd0, skipEnd1, end0, kernel ); ParallelFor2D< Device >::exec( begin1, begin0, skipBegin1, end0, kernel, f ); ParallelFor2D< Device >::exec( skipEnd1, begin0, end1, end0, kernel, f ); ParallelFor2D< Device >::exec( skipBegin1, begin0, skipEnd1, skipBegin0, kernel, f ); ParallelFor2D< Device >::exec( skipBegin1, skipEnd0, skipEnd1, end0, kernel, f ); } template< typename __Device, typename = void > struct Kernel { template< typename Index, typename Func > void operator()( Index i1, Index i0, Func f ) { call_with_unpermuted_arguments< Permutation >( f, i0, i1 ); }; }; // dummy specialization to avoid a shitpile of nvcc warnings template< typename __unused > struct Kernel< Devices::Cuda, __unused > { template< typename Index, typename Func > __cuda_callable__ void operator()( Index i1, Index i0, Func f ) { call_with_unpermuted_arguments< Permutation >( f, i0, i1 ); }; }; }; template< typename Permutation, Loading