diff --git a/src/Benchmarks/CMakeLists.txt b/src/Benchmarks/CMakeLists.txt index b4639d60ade55aba454b79c27c01e04b151980b1..6f3185329c8dab36b4c07bdb949b33a45595607a 100644 --- a/src/Benchmarks/CMakeLists.txt +++ b/src/Benchmarks/CMakeLists.txt @@ -1,5 +1,6 @@ add_subdirectory( HeatEquation ) add_subdirectory( BLAS ) +add_subdirectory( NDArray ) add_subdirectory( SpMV ) add_subdirectory( DistSpMV ) add_subdirectory( LinearSolvers ) diff --git a/src/Benchmarks/NDArray/CMakeLists.txt b/src/Benchmarks/NDArray/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..e6291c5f32c5633a70afb646a2ebcfe0e9d70303 --- /dev/null +++ b/src/Benchmarks/NDArray/CMakeLists.txt @@ -0,0 +1,17 @@ +add_executable( tnl-benchmark-ndarray tnl-benchmark-ndarray.cpp ) +target_compile_options( tnl-benchmark-ndarray PRIVATE ${CXX_TESTS_FLAGS} ) +install( TARGETS tnl-benchmark-ndarray RUNTIME DESTINATION bin ) + +add_executable( tnl-benchmark-ndarray-boundary tnl-benchmark-ndarray-boundary.cpp ) +target_compile_options( tnl-benchmark-ndarray-boundary PRIVATE ${CXX_TESTS_FLAGS} ) +install( TARGETS tnl-benchmark-ndarray-boundary RUNTIME DESTINATION bin ) + +if( BUILD_CUDA ) + cuda_add_executable( tnl-benchmark-ndarray-cuda tnl-benchmark-ndarray-cuda.cu + OPTIONS ${CXX_TESTS_FLAGS} ) + install( TARGETS tnl-benchmark-ndarray-cuda RUNTIME DESTINATION bin ) + + cuda_add_executable( tnl-benchmark-ndarray-boundary-cuda tnl-benchmark-ndarray-boundary-cuda.cu + OPTIONS ${CXX_TESTS_FLAGS} ) + install( TARGETS tnl-benchmark-ndarray-boundary-cuda RUNTIME DESTINATION bin ) +endif() diff --git a/src/Benchmarks/NDArray/tnl-benchmark-ndarray-boundary-cuda.cu b/src/Benchmarks/NDArray/tnl-benchmark-ndarray-boundary-cuda.cu new file mode 100644 index 0000000000000000000000000000000000000000..b5a2622a4b335a054844dda8c46d76344e370fec --- /dev/null +++ b/src/Benchmarks/NDArray/tnl-benchmark-ndarray-boundary-cuda.cu @@ -0,0 +1 @@ +#include "tnl-benchmark-ndarray-boundary.h" diff --git 
a/src/Benchmarks/NDArray/tnl-benchmark-ndarray-boundary.cpp b/src/Benchmarks/NDArray/tnl-benchmark-ndarray-boundary.cpp new file mode 100644 index 0000000000000000000000000000000000000000..b5a2622a4b335a054844dda8c46d76344e370fec --- /dev/null +++ b/src/Benchmarks/NDArray/tnl-benchmark-ndarray-boundary.cpp @@ -0,0 +1 @@ +#include "tnl-benchmark-ndarray-boundary.h" diff --git a/src/Benchmarks/NDArray/tnl-benchmark-ndarray-boundary.h b/src/Benchmarks/NDArray/tnl-benchmark-ndarray-boundary.h new file mode 100644 index 0000000000000000000000000000000000000000..a30a25352438c4dba25f525466faa147d59dfe8a --- /dev/null +++ b/src/Benchmarks/NDArray/tnl-benchmark-ndarray-boundary.h @@ -0,0 +1,466 @@ +/*************************************************************************** + tnl-benchmark-ndarray-boundary.h - description + ------------------- + begin : Feb 9, 2019 + copyright : (C) 2019 by Tomas Oberhuber et al. + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Jakub Klinkovsky + +#pragma once + +#include +#include +#include + +#include + +#include "../Benchmarks.h" + +using namespace TNL; +using namespace TNL::Benchmarks; +using namespace TNL::Containers; +using std::index_sequence; + +using value_type = float; +//using index_type = std::size_t; +using index_type = unsigned; + +template< typename Array > +void expect_eq_chunked( Array& a, Array& b ) +{ + // TODO: use something like EXPECT_EQ + TNL_ASSERT_EQ( a.getSize(), b.getSize(), "array sizes don't match" ); + if( a.getSize() != b.getSize() ) + return; + + using IndexType = typename Array::IndexType; + + const IndexType chunk_size = 4096; + for( IndexType c = 0; c < (IndexType) roundUpDivision( a.getSize(), chunk_size ); c++ ) { + const typename Array::IndexType this_chunk_size = TNL::min( chunk_size, a.getSize() - c * chunk_size ); + Array a_chunk( &a[ c * chunk_size ], 
this_chunk_size ); + Array b_chunk( &b[ c * chunk_size ], this_chunk_size ); + // TODO: use something like EXPECT_EQ + TNL_ASSERT_EQ( a_chunk, b_chunk, "chunks are not equal" ); + } +} + +template< typename Array > +void expect_eq( Array& a, Array& b ) +{ + if( std::is_same< typename Array::DeviceType, TNL::Devices::Cuda >::value ) { + typename Array::HostType a_host, b_host; + a_host = a; + b_host = b; + expect_eq_chunked( a_host, b_host ); + } + else { + expect_eq_chunked( a, b ); + } +} + +template< typename Device > +const char* performer() +{ + if( std::is_same< Device, Devices::Host >::value ) + return "CPU"; + else if( std::is_same< Device, Devices::Cuda >::value ) + return "GPU"; + else + return "unknown"; +} + +void reset() {} + +// NOTE: having the sizes as function parameters keeps the compiler from treating them +// as "compile-time constants" and thus e.g. optimizing the 1D iterations with memcpy + +template< typename Device > +void benchmark_1D( Benchmark& benchmark, index_type size = 500000000 ) +{ + NDArray< value_type, + SizesHolder< index_type, 0 >, + std::make_index_sequence< 1 >, + Device > a, b; + a.setSizes( size ); + b.setSizes( size ); + a.getStorageArray().setValue( -1 ); + b.getStorageArray().setValue( 1 ); + + auto a_view = a.getView(); + auto b_view = b.getView(); + + auto f = [&]() { + a.forBoundary( [=] __cuda_callable__ ( index_type i ) mutable { a_view( i ) = b_view( i ); } ); + a.forInternal( [=] __cuda_callable__ ( index_type i ) mutable { a_view( i ) = b_view( i ); } ); + }; + + const double datasetSize = 2 * size * sizeof(value_type) / oneGB; + benchmark.setOperation( "1D", datasetSize ); + benchmark.time< Device >( reset, performer< Device >(), f ); + + expect_eq( a.getStorageArray(), b.getStorageArray() ); +} + +template< typename Device > +void benchmark_2D( Benchmark& benchmark, index_type size = 22333 ) +{ + NDArray< value_type, + SizesHolder< index_type, 0, 0 >, + std::make_index_sequence< 2 >, + Device > a, b; + 
a.setSizes( size, size ); + b.setSizes( size, size ); + a.getStorageArray().setValue( -1 ); + b.getStorageArray().setValue( 1 ); + + auto a_view = a.getView(); + auto b_view = b.getView(); + + auto f = [&]() { + a.forBoundary( [=] __cuda_callable__ ( index_type i, index_type j ) mutable { a_view( i, j ) = b_view( i, j ); } ); + a.forInternal( [=] __cuda_callable__ ( index_type i, index_type j ) mutable { a_view( i, j ) = b_view( i, j ); } ); + }; + + const double datasetSize = 2 * std::pow( size, 2 ) * sizeof(value_type) / oneGB; + benchmark.setOperation( "2D", datasetSize ); + benchmark.time< Device >( reset, performer< Device >(), f ); + + expect_eq( a.getStorageArray(), b.getStorageArray() ); +} + +template< typename Device > +void benchmark_3D( Benchmark& benchmark, index_type size = 800 ) +{ + NDArray< value_type, + SizesHolder< index_type, 0, 0, 0 >, + std::make_index_sequence< 3 >, + Device > a, b; + a.setSizes( size, size, size ); + b.setSizes( size, size, size ); + a.getStorageArray().setValue( -1 ); + b.getStorageArray().setValue( 1 ); + + auto a_view = a.getView(); + auto b_view = b.getView(); + + auto f = [&]() { + a.forBoundary( [=] __cuda_callable__ ( index_type i, index_type j, index_type k ) mutable { a_view( i, j, k ) = b_view( i, j, k ); } ); + a.forInternal( [=] __cuda_callable__ ( index_type i, index_type j, index_type k ) mutable { a_view( i, j, k ) = b_view( i, j, k ); } ); + }; + + const double datasetSize = 2 * std::pow( size, 3 ) * sizeof(value_type) / oneGB; + benchmark.setOperation( "3D", datasetSize ); + benchmark.time< Device >( reset, performer< Device >(), f ); + + expect_eq( a.getStorageArray(), b.getStorageArray() ); +} + +// TODO: implement general ParallelBoundaryExecutor +//template< typename Device > +//void benchmark_4D( Benchmark& benchmark, index_type size = 150 ) +//{ +// NDArray< value_type, +// SizesHolder< index_type, 0, 0, 0, 0 >, +// std::make_index_sequence< 4 >, +// Device > a, b; +// a.setSizes( size, size, size, 
size ); +// b.setSizes( size, size, size, size ); +// a.getStorageArray().setValue( -1 ); +// b.getStorageArray().setValue( 1 ); +// +// auto a_view = a.getView(); +// auto b_view = b.getView(); +// +// auto f = [&]() { +// a.forBoundary( [=] __cuda_callable__ ( index_type i, index_type j, index_type k, index_type l ) mutable { a_view( i, j, k, l ) = b_view( i, j, k, l ); } ); +// a.forInternal( [=] __cuda_callable__ ( index_type i, index_type j, index_type k, index_type l ) mutable { a_view( i, j, k, l ) = b_view( i, j, k, l ); } ); +// }; +// +// const double datasetSize = 2 * std::pow( size, 4 ) * sizeof(value_type) / oneGB; +// benchmark.setOperation( "4D", datasetSize ); +// benchmark.time< Device >( reset, performer< Device >(), f ); +// +// expect_eq( a.getStorageArray(), b.getStorageArray() ); +//} +// +//template< typename Device > +//void benchmark_5D( Benchmark& benchmark, index_type size = 56 ) +//{ +// NDArray< value_type, +// SizesHolder< index_type, 0, 0, 0, 0, 0 >, +// std::make_index_sequence< 5 >, +// Device > a, b; +// a.setSizes( size, size, size, size, size ); +// b.setSizes( size, size, size, size, size ); +// a.getStorageArray().setValue( -1 ); +// b.getStorageArray().setValue( 1 ); +// +// auto a_view = a.getView(); +// auto b_view = b.getView(); +// +// auto f = [&]() { +// a.forBoundary( [=] __cuda_callable__ ( index_type i, index_type j, index_type k, index_type l, index_type m ) mutable { a_view( i, j, k, l, m ) = b_view( i, j, k, l, m ); } ); +// a.forInternal( [=] __cuda_callable__ ( index_type i, index_type j, index_type k, index_type l, index_type m ) mutable { a_view( i, j, k, l, m ) = b_view( i, j, k, l, m ); } ); +// }; +// +// const double datasetSize = 2 * std::pow( size, 5 ) * sizeof(value_type) / oneGB; +// benchmark.setOperation( "5D", datasetSize ); +// benchmark.time< Device >( reset, performer< Device >(), f ); +// +// expect_eq( a.getStorageArray(), b.getStorageArray() ); +//} +// +//template< typename Device > +//void 
benchmark_6D( Benchmark& benchmark, index_type size = 28 ) +//{ +// NDArray< value_type, +// SizesHolder< index_type, 0, 0, 0, 0, 0, 0 >, +// std::make_index_sequence< 6 >, +// Device > a, b; +// a.setSizes( size, size, size, size, size, size ); +// b.setSizes( size, size, size, size, size, size ); +// a.getStorageArray().setValue( -1 ); +// b.getStorageArray().setValue( 1 ); +// +// auto a_view = a.getView(); +// auto b_view = b.getView(); +// +// auto f = [&]() { +// a.forBoundary( [=] __cuda_callable__ ( index_type i, index_type j, index_type k, index_type l, index_type m, index_type n ) mutable { a_view( i, j, k, l, m, n ) = b_view( i, j, k, l, m, n ); } ); +// a.forInternal( [=] __cuda_callable__ ( index_type i, index_type j, index_type k, index_type l, index_type m, index_type n ) mutable { a_view( i, j, k, l, m, n ) = b_view( i, j, k, l, m, n ); } ); +// }; +// +// const double datasetSize = 2 * std::pow( size, 6 ) * sizeof(value_type) / oneGB; +// benchmark.setOperation( "6D", datasetSize ); +// benchmark.time< Device >( reset, performer< Device >(), f ); +// +// expect_eq( a.getStorageArray(), b.getStorageArray() ); +//} + + +template< typename Device > +void benchmark_2D_perm( Benchmark& benchmark, index_type size = 22333 ) +{ + NDArray< value_type, + SizesHolder< index_type, 0, 0 >, + std::index_sequence< 1, 0 >, + Device > a, b; + a.setSizes( size, size ); + b.setSizes( size, size ); + a.getStorageArray().setValue( -1 ); + b.getStorageArray().setValue( 1 ); + + auto a_view = a.getView(); + auto b_view = b.getView(); + + auto f = [&]() { + a.forBoundary( [=] __cuda_callable__ ( index_type i, index_type j ) mutable { a_view( i, j ) = b_view( i, j ); } ); + a.forInternal( [=] __cuda_callable__ ( index_type i, index_type j ) mutable { a_view( i, j ) = b_view( i, j ); } ); + }; + + const double datasetSize = 2 * std::pow( size, 2 ) * sizeof(value_type) / oneGB; + benchmark.setOperation( "2D permuted", datasetSize ); + benchmark.time< Device >( reset, 
performer< Device >(), f ); + + expect_eq( a.getStorageArray(), b.getStorageArray() ); +} + +template< typename Device > +void benchmark_3D_perm( Benchmark& benchmark, index_type size = 800 ) +{ + NDArray< value_type, + SizesHolder< index_type, 0, 0, 0 >, + std::index_sequence< 2, 1, 0 >, + Device > a, b; + a.setSizes( size, size, size ); + b.setSizes( size, size, size ); + a.getStorageArray().setValue( -1 ); + b.getStorageArray().setValue( 1 ); + + auto a_view = a.getView(); + auto b_view = b.getView(); + + auto f = [&]() { + a.forBoundary( [=] __cuda_callable__ ( index_type i, index_type j, index_type k ) mutable { a_view( i, j, k ) = b_view( i, j, k ); } ); + a.forInternal( [=] __cuda_callable__ ( index_type i, index_type j, index_type k ) mutable { a_view( i, j, k ) = b_view( i, j, k ); } ); + }; + + const double datasetSize = 2 * std::pow( size, 3 ) * sizeof(value_type) / oneGB; + benchmark.setOperation( "3D permuted", datasetSize ); + benchmark.time< Device >( reset, performer< Device >(), f ); + + expect_eq( a.getStorageArray(), b.getStorageArray() ); +} + +// TODO: implement general ParallelBoundaryExecutor +//template< typename Device > +//void benchmark_4D_perm( Benchmark& benchmark, index_type size = 150 ) +//{ +// NDArray< value_type, +// SizesHolder< index_type, 0, 0, 0, 0 >, +// std::index_sequence< 3, 2, 1, 0 >, +// Device > a, b; +// a.setSizes( size, size, size, size ); +// b.setSizes( size, size, size, size ); +// a.getStorageArray().setValue( -1 ); +// b.getStorageArray().setValue( 1 ); +// +// auto a_view = a.getView(); +// auto b_view = b.getView(); +// +// auto f = [&]() { +// a.forBoundary( [=] __cuda_callable__ ( index_type i, index_type j, index_type k, index_type l ) mutable { a_view( i, j, k, l ) = b_view( i, j, k, l ); } ); +// a.forInternal( [=] __cuda_callable__ ( index_type i, index_type j, index_type k, index_type l ) mutable { a_view( i, j, k, l ) = b_view( i, j, k, l ); } ); +// }; +// +// const double datasetSize = 2 * std::pow( 
size, 4 ) * sizeof(value_type) / oneGB; +// benchmark.setOperation( "4D permuted", datasetSize ); +// benchmark.time< Device >( reset, performer< Device >(), f ); +// +// expect_eq( a.getStorageArray(), b.getStorageArray() ); +//} +// +//template< typename Device > +//void benchmark_5D_perm( Benchmark& benchmark, index_type size = 56 ) +//{ +// NDArray< value_type, +// SizesHolder< index_type, 0, 0, 0, 0, 0 >, +// std::index_sequence< 4, 3, 2, 1, 0 >, +// Device > a, b; +// a.setSizes( size, size, size, size, size ); +// b.setSizes( size, size, size, size, size ); +// a.getStorageArray().setValue( -1 ); +// b.getStorageArray().setValue( 1 ); +// +// auto a_view = a.getView(); +// auto b_view = b.getView(); +// +// auto f = [&]() { +// a.forBoundary( [=] __cuda_callable__ ( index_type i, index_type j, index_type k, index_type l, index_type m ) mutable { a_view( i, j, k, l, m ) = b_view( i, j, k, l, m ); } ); +// a.forInternal( [=] __cuda_callable__ ( index_type i, index_type j, index_type k, index_type l, index_type m ) mutable { a_view( i, j, k, l, m ) = b_view( i, j, k, l, m ); } ); +// }; +// +// const double datasetSize = 2 * std::pow( size, 5 ) * sizeof(value_type) / oneGB; +// benchmark.setOperation( "5D permuted", datasetSize ); +// benchmark.time< Device >( reset, performer< Device >(), f ); +// +// expect_eq( a.getStorageArray(), b.getStorageArray() ); +//} +// +//template< typename Device > +//void benchmark_6D_perm( Benchmark& benchmark, index_type size = 28 ) +//{ +// NDArray< value_type, +// SizesHolder< index_type, 0, 0, 0, 0, 0, 0 >, +// std::index_sequence< 5, 4, 3, 2, 1, 0 >, +// Device > a, b; +// a.setSizes( size, size, size, size, size, size ); +// b.setSizes( size, size, size, size, size, size ); +// a.getStorageArray().setValue( -1 ); +// b.getStorageArray().setValue( 1 ); +// +// auto a_view = a.getView(); +// auto b_view = b.getView(); +// +// auto f = [&]() { +// a.forBoundary( [=] __cuda_callable__ ( index_type i, index_type j, index_type 
k, index_type l, index_type m, index_type n ) mutable { a_view( i, j, k, l, m, n ) = b_view( i, j, k, l, m, n ); } ); +// a.forInternal( [=] __cuda_callable__ ( index_type i, index_type j, index_type k, index_type l, index_type m, index_type n ) mutable { a_view( i, j, k, l, m, n ) = b_view( i, j, k, l, m, n ); } ); +// }; +// +// const double datasetSize = 2 * std::pow( size, 6 ) * sizeof(value_type) / oneGB; +// benchmark.setOperation( "6D permuted", datasetSize ); +// benchmark.time< Device >( reset, performer< Device >(), f ); +// +// expect_eq( a.getStorageArray(), b.getStorageArray() ); +//} + +template< typename Device > +void run_benchmarks( Benchmark& benchmark ) +{ + benchmark_1D< Device >( benchmark ); + benchmark_2D< Device >( benchmark ); + benchmark_3D< Device >( benchmark ); +// benchmark_4D< Device >( benchmark ); +// benchmark_5D< Device >( benchmark ); +// benchmark_6D< Device >( benchmark ); + benchmark_2D_perm< Device >( benchmark ); + benchmark_3D_perm< Device >( benchmark ); +// benchmark_4D_perm< Device >( benchmark ); +// benchmark_5D_perm< Device >( benchmark ); +// benchmark_6D_perm< Device >( benchmark ); +} + +void setupConfig( Config::ConfigDescription & config ) +{ + config.addDelimiter( "Benchmark settings:" ); + config.addEntry< String >( "log-file", "Log file name.", "tnl-benchmark-ndarray-boundary.log"); + config.addEntry< String >( "output-mode", "Mode for opening the log file.", "overwrite" ); + config.addEntryEnum( "append" ); + config.addEntryEnum( "overwrite" ); + config.addEntry< int >( "loops", "Number of iterations for every computation.", 10 ); + config.addEntry< int >( "verbose", "Verbose mode.", 1 ); + config.addEntry< String >( "devices", "Run benchmarks on these devices.", "all" ); + config.addEntryEnum( "all" ); + config.addEntryEnum( "host" ); + #ifdef HAVE_CUDA + config.addEntryEnum( "cuda" ); + #endif + + config.addDelimiter( "Device settings:" ); + Devices::Host::configSetup( config ); + 
Devices::Cuda::configSetup( config ); +} + +int main( int argc, char* argv[] ) +{ + Config::ParameterContainer parameters; + Config::ConfigDescription conf_desc; + + setupConfig( conf_desc ); + + if( ! parseCommandLine( argc, argv, conf_desc, parameters ) ) { + conf_desc.printUsage( argv[ 0 ] ); + return EXIT_FAILURE; + } + + if( ! Devices::Host::setup( parameters ) || + ! Devices::Cuda::setup( parameters ) ) + return EXIT_FAILURE; + + const String & logFileName = parameters.getParameter< String >( "log-file" ); + const String & outputMode = parameters.getParameter< String >( "output-mode" ); + const int loops = parameters.getParameter< int >( "loops" ); + const int verbose = parameters.getParameter< int >( "verbose" ); + + // open log file + auto mode = std::ios::out; + if( outputMode == "append" ) + mode |= std::ios::app; + std::ofstream logFile( logFileName.getString(), mode ); + + // init benchmark and common metadata + Benchmark benchmark( loops, verbose ); + + // prepare global metadata + Benchmark::MetadataMap metadata = getHardwareMetadata(); + + const String devices = parameters.getParameter< String >( "devices" ); + if( devices == "all" || devices == "host" ) + run_benchmarks< Devices::Host >( benchmark ); +#ifdef HAVE_CUDA + if( devices == "all" || devices == "cuda" ) + run_benchmarks< Devices::Cuda >( benchmark ); +#endif + + if( ! benchmark.save( logFile ) ) { + std::cerr << "Failed to write the benchmark results to file '" << parameters.getParameter< String >( "log-file" ) << "'." 
<< std::endl; + return EXIT_FAILURE; + } + + return EXIT_SUCCESS; +} diff --git a/src/Benchmarks/NDArray/tnl-benchmark-ndarray-cuda.cu b/src/Benchmarks/NDArray/tnl-benchmark-ndarray-cuda.cu new file mode 100644 index 0000000000000000000000000000000000000000..ccbac3b3841d1122989c658b1d181cd23c80e3ef --- /dev/null +++ b/src/Benchmarks/NDArray/tnl-benchmark-ndarray-cuda.cu @@ -0,0 +1 @@ +#include "tnl-benchmark-ndarray.h" diff --git a/src/Benchmarks/NDArray/tnl-benchmark-ndarray.cpp b/src/Benchmarks/NDArray/tnl-benchmark-ndarray.cpp new file mode 100644 index 0000000000000000000000000000000000000000..ccbac3b3841d1122989c658b1d181cd23c80e3ef --- /dev/null +++ b/src/Benchmarks/NDArray/tnl-benchmark-ndarray.cpp @@ -0,0 +1 @@ +#include "tnl-benchmark-ndarray.h" diff --git a/src/Benchmarks/NDArray/tnl-benchmark-ndarray.h b/src/Benchmarks/NDArray/tnl-benchmark-ndarray.h new file mode 100644 index 0000000000000000000000000000000000000000..0de53ea8815033654194cc9e2eb6f3eaf6356356 --- /dev/null +++ b/src/Benchmarks/NDArray/tnl-benchmark-ndarray.h @@ -0,0 +1,453 @@ +/*************************************************************************** + tnl-benchmark-ndarray.h - description + ------------------- + begin : Dec 24, 2018 + copyright : (C) 2018 by Tomas Oberhuber et al. 
+ email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Jakub Klinkovsky + +#pragma once + +#include +#include +#include + +#include +#include + +#include "../Benchmarks.h" + +using namespace TNL; +using namespace TNL::Benchmarks; +using namespace TNL::Containers; +using std::index_sequence; + +using value_type = float; +//using index_type = std::size_t; +using index_type = unsigned; + +template< typename Array > +void expect_eq_chunked( Array& a, Array& b ) +{ + // TODO: use something like EXPECT_EQ + TNL_ASSERT_EQ( a.getSize(), b.getSize(), "array sizes don't match" ); + if( a.getSize() != b.getSize() ) + return; + + using IndexType = typename Array::IndexType; + + const IndexType chunk_size = 4096; + for( IndexType c = 0; c < (IndexType) roundUpDivision( a.getSize(), chunk_size ); c++ ) { + const typename Array::IndexType this_chunk_size = TNL::min( chunk_size, a.getSize() - c * chunk_size ); + Array a_chunk( &a[ c * chunk_size ], this_chunk_size ); + Array b_chunk( &b[ c * chunk_size ], this_chunk_size ); + // TODO: use something like EXPECT_EQ + TNL_ASSERT_EQ( a_chunk, b_chunk, "chunks are not equal" ); + } +} + +template< typename Array > +void expect_eq( Array& a, Array& b ) +{ + if( std::is_same< typename Array::DeviceType, TNL::Devices::Cuda >::value ) { + typename Array::HostType a_host, b_host; + a_host = a; + b_host = b; + expect_eq_chunked( a_host, b_host ); + } + else { + expect_eq_chunked( a, b ); + } +} + +template< typename Device > +const char* performer() +{ + if( std::is_same< Device, Devices::Host >::value ) + return "CPU"; + else if( std::is_same< Device, Devices::Cuda >::value ) + return "GPU"; + else + return "unknown"; +} + +void reset() {} + +// NOTE: having the sizes as function parameters keeps the compiler from treating them +// as "compile-time constants" and thus e.g. 
optimizing the 1D iterations with memcpy + +template< typename Device > +void benchmark_array( Benchmark& benchmark, index_type size = 500000000 ) +{ + Array< value_type, Device > a, b; + a.setSize( size ); + b.setSize( size ); + a.setValue( -1 ); + b.setValue( 1 ); + + auto kernel = [] __cuda_callable__ + ( int i, + value_type* a, + const value_type* b ) + { + a[ i ] = b[ i ]; + }; + + auto f = [&]() { + TNL::ParallelFor< Device >::exec( 0, (int) size, kernel, a.getData(), b.getData() ); + }; + + // warm-up for all benchmarks + f(); + + const double datasetSize = 2 * size * sizeof(value_type) / oneGB; + benchmark.setOperation( "array", datasetSize ); + benchmark.time< Device >( reset, performer< Device >(), f ); + + expect_eq( a, b ); +} + +template< typename Device > +void benchmark_1D( Benchmark& benchmark, index_type size = 500000000 ) +{ + NDArray< value_type, + SizesHolder< index_type, 0 >, + std::make_index_sequence< 1 >, + Device > a, b; + a.setSizes( size ); + b.setSizes( size ); + a.getStorageArray().setValue( -1 ); + b.getStorageArray().setValue( 1 ); + + auto f = [&]() { + nd_map( a, [] __cuda_callable__ (value_type v1) { return v1; }, b ); + }; + + const double datasetSize = 2 * size * sizeof(value_type) / oneGB; + benchmark.setOperation( "1D", datasetSize ); + benchmark.time< Device >( reset, performer< Device >(), f ); + + expect_eq( a.getStorageArray(), b.getStorageArray() ); +} + +template< typename Device > +void benchmark_2D( Benchmark& benchmark, index_type size = 22333 ) +{ + NDArray< value_type, + SizesHolder< index_type, 0, 0 >, + std::make_index_sequence< 2 >, + Device > a, b; + a.setSizes( size, size ); + b.setSizes( size, size ); + a.getStorageArray().setValue( -1 ); + b.getStorageArray().setValue( 1 ); + + auto f = [&]() { + nd_map( a, [] __cuda_callable__ (value_type v1) { return v1; }, b ); + }; + + const double datasetSize = 2 * std::pow( size, 2 ) * sizeof(value_type) / oneGB; + benchmark.setOperation( "2D", datasetSize ); + 
benchmark.time< Device >( reset, performer< Device >(), f ); + + expect_eq( a.getStorageArray(), b.getStorageArray() ); +} + +template< typename Device > +void benchmark_3D( Benchmark& benchmark, index_type size = 800 ) +{ + NDArray< value_type, + SizesHolder< index_type, 0, 0, 0 >, + std::make_index_sequence< 3 >, + Device > a, b; + a.setSizes( size, size, size ); + b.setSizes( size, size, size ); + a.getStorageArray().setValue( -1 ); + b.getStorageArray().setValue( 1 ); + + auto f = [&]() { + nd_map( a, [] __cuda_callable__ (value_type v1) { return v1; }, b ); + }; + + const double datasetSize = 2 * std::pow( size, 3 ) * sizeof(value_type) / oneGB; + benchmark.setOperation( "3D", datasetSize ); + benchmark.time< Device >( reset, performer< Device >(), f ); + + expect_eq( a.getStorageArray(), b.getStorageArray() ); +} + +template< typename Device > +void benchmark_4D( Benchmark& benchmark, index_type size = 150 ) +{ + NDArray< value_type, + SizesHolder< index_type, 0, 0, 0, 0 >, + std::make_index_sequence< 4 >, + Device > a, b; + a.setSizes( size, size, size, size ); + b.setSizes( size, size, size, size ); + a.getStorageArray().setValue( -1 ); + b.getStorageArray().setValue( 1 ); + + auto f = [&]() { + nd_map( a, [] __cuda_callable__ (value_type v1) { return v1; }, b ); + }; + + const double datasetSize = 2 * std::pow( size, 4 ) * sizeof(value_type) / oneGB; + benchmark.setOperation( "4D", datasetSize ); + benchmark.time< Device >( reset, performer< Device >(), f ); + + expect_eq( a.getStorageArray(), b.getStorageArray() ); +} + +template< typename Device > +void benchmark_5D( Benchmark& benchmark, index_type size = 56 ) +{ + NDArray< value_type, + SizesHolder< index_type, 0, 0, 0, 0, 0 >, + std::make_index_sequence< 5 >, + Device > a, b; + a.setSizes( size, size, size, size, size ); + b.setSizes( size, size, size, size, size ); + a.getStorageArray().setValue( -1 ); + b.getStorageArray().setValue( 1 ); + + auto f = [&]() { + nd_map( a, [] __cuda_callable__ 
(value_type v1) { return v1; }, b ); + }; + + const double datasetSize = 2 * std::pow( size, 5 ) * sizeof(value_type) / oneGB; + benchmark.setOperation( "5D", datasetSize ); + benchmark.time< Device >( reset, performer< Device >(), f ); + + expect_eq( a.getStorageArray(), b.getStorageArray() ); +} + +template< typename Device > +void benchmark_6D( Benchmark& benchmark, index_type size = 28 ) +{ + NDArray< value_type, + SizesHolder< index_type, 0, 0, 0, 0, 0, 0 >, + std::make_index_sequence< 6 >, + Device > a, b; + a.setSizes( size, size, size, size, size, size ); + b.setSizes( size, size, size, size, size, size ); + a.getStorageArray().setValue( -1 ); + b.getStorageArray().setValue( 1 ); + + auto f = [&]() { + nd_map( a, [] __cuda_callable__ (value_type v1) { return v1; }, b ); + }; + + const double datasetSize = 2 * std::pow( size, 6 ) * sizeof(value_type) / oneGB; + benchmark.setOperation( "6D", datasetSize ); + benchmark.time< Device >( reset, performer< Device >(), f ); + + expect_eq( a.getStorageArray(), b.getStorageArray() ); +} + + +template< typename Device > +void benchmark_2D_perm( Benchmark& benchmark, index_type size = 22333 ) +{ + NDArray< value_type, + SizesHolder< index_type, 0, 0 >, + std::index_sequence< 1, 0 >, + Device > a, b; + a.setSizes( size, size ); + b.setSizes( size, size ); + a.getStorageArray().setValue( -1 ); + b.getStorageArray().setValue( 1 ); + + auto f = [&]() { + nd_map( a, [] __cuda_callable__ (value_type v1) { return v1; }, b ); + }; + + const double datasetSize = 2 * std::pow( size, 2 ) * sizeof(value_type) / oneGB; + benchmark.setOperation( "2D permuted", datasetSize ); + benchmark.time< Device >( reset, performer< Device >(), f ); + + expect_eq( a.getStorageArray(), b.getStorageArray() ); +} + +template< typename Device > +void benchmark_3D_perm( Benchmark& benchmark, index_type size = 800 ) +{ + NDArray< value_type, + SizesHolder< index_type, 0, 0, 0 >, + std::index_sequence< 2, 1, 0 >, + Device > a, b; + a.setSizes( size, 
size, size ); + b.setSizes( size, size, size ); + a.getStorageArray().setValue( -1 ); + b.getStorageArray().setValue( 1 ); + + auto f = [&]() { + nd_map( a, [] __cuda_callable__ (value_type v1) { return v1; }, b ); + }; + + const double datasetSize = 2 * std::pow( size, 3 ) * sizeof(value_type) / oneGB; + benchmark.setOperation( "3D permuted", datasetSize ); + benchmark.time< Device >( reset, performer< Device >(), f ); + + expect_eq( a.getStorageArray(), b.getStorageArray() ); +} + +template< typename Device > +void benchmark_4D_perm( Benchmark& benchmark, index_type size = 150 ) +{ + NDArray< value_type, + SizesHolder< index_type, 0, 0, 0, 0 >, + std::index_sequence< 3, 2, 1, 0 >, + Device > a, b; + a.setSizes( size, size, size, size ); + b.setSizes( size, size, size, size ); + a.getStorageArray().setValue( -1 ); + b.getStorageArray().setValue( 1 ); + + auto f = [&]() { + nd_map( a, [] __cuda_callable__ (value_type v1) { return v1; }, b ); + }; + + const double datasetSize = 2 * std::pow( size, 4 ) * sizeof(value_type) / oneGB; + benchmark.setOperation( "4D permuted", datasetSize ); + benchmark.time< Device >( reset, performer< Device >(), f ); + + expect_eq( a.getStorageArray(), b.getStorageArray() ); +} + +template< typename Device > +void benchmark_5D_perm( Benchmark& benchmark, index_type size = 56 ) +{ + NDArray< value_type, + SizesHolder< index_type, 0, 0, 0, 0, 0 >, + std::index_sequence< 4, 3, 2, 1, 0 >, + Device > a, b; + a.setSizes( size, size, size, size, size ); + b.setSizes( size, size, size, size, size ); + a.getStorageArray().setValue( -1 ); + b.getStorageArray().setValue( 1 ); + + auto f = [&]() { + nd_map( a, [] __cuda_callable__ (value_type v1) { return v1; }, b ); + }; + + const double datasetSize = 2 * std::pow( size, 5 ) * sizeof(value_type) / oneGB; + benchmark.setOperation( "5D permuted", datasetSize ); + benchmark.time< Device >( reset, performer< Device >(), f ); + + expect_eq( a.getStorageArray(), b.getStorageArray() ); +} + +template< 
typename Device > +void benchmark_6D_perm( Benchmark& benchmark, index_type size = 28 ) +{ + NDArray< value_type, + SizesHolder< index_type, 0, 0, 0, 0, 0, 0 >, + std::index_sequence< 5, 4, 3, 2, 1, 0 >, + Device > a, b; + a.setSizes( size, size, size, size, size, size ); + b.setSizes( size, size, size, size, size, size ); + a.getStorageArray().setValue( -1 ); + b.getStorageArray().setValue( 1 ); + + auto f = [&]() { + nd_map( a, [] __cuda_callable__ (value_type v1) { return v1; }, b ); + }; + + const double datasetSize = 2 * std::pow( size, 6 ) * sizeof(value_type) / oneGB; + benchmark.setOperation( "6D permuted", datasetSize ); + benchmark.time< Device >( reset, performer< Device >(), f ); + + expect_eq( a.getStorageArray(), b.getStorageArray() ); +} + +template< typename Device > +void run_benchmarks( Benchmark& benchmark ) +{ + benchmark_array< Device >( benchmark ); + benchmark_1D< Device >( benchmark ); + benchmark_2D< Device >( benchmark ); + benchmark_3D< Device >( benchmark ); + benchmark_4D< Device >( benchmark ); + benchmark_5D< Device >( benchmark ); + benchmark_6D< Device >( benchmark ); + benchmark_2D_perm< Device >( benchmark ); + benchmark_3D_perm< Device >( benchmark ); + benchmark_4D_perm< Device >( benchmark ); + benchmark_5D_perm< Device >( benchmark ); + benchmark_6D_perm< Device >( benchmark ); +} + +void setupConfig( Config::ConfigDescription & config ) +{ + config.addDelimiter( "Benchmark settings:" ); + config.addEntry< String >( "log-file", "Log file name.", "tnl-benchmark-ndarray.log"); + config.addEntry< String >( "output-mode", "Mode for opening the log file.", "overwrite" ); + config.addEntryEnum( "append" ); + config.addEntryEnum( "overwrite" ); + config.addEntry< int >( "loops", "Number of iterations for every computation.", 10 ); + config.addEntry< int >( "verbose", "Verbose mode.", 1 ); + config.addEntry< String >( "devices", "Run benchmarks on these devices.", "all" ); + config.addEntryEnum( "all" ); + config.addEntryEnum( "host" 
); + #ifdef HAVE_CUDA + config.addEntryEnum( "cuda" ); + #endif + + config.addDelimiter( "Device settings:" ); + Devices::Host::configSetup( config ); + Devices::Cuda::configSetup( config ); +} + +int main( int argc, char* argv[] ) +{ + Config::ParameterContainer parameters; + Config::ConfigDescription conf_desc; + + setupConfig( conf_desc ); + + if( ! parseCommandLine( argc, argv, conf_desc, parameters ) ) { + conf_desc.printUsage( argv[ 0 ] ); + return EXIT_FAILURE; + } + + if( ! Devices::Host::setup( parameters ) || + ! Devices::Cuda::setup( parameters ) ) + return EXIT_FAILURE; + + const String & logFileName = parameters.getParameter< String >( "log-file" ); + const String & outputMode = parameters.getParameter< String >( "output-mode" ); + const int loops = parameters.getParameter< int >( "loops" ); + const int verbose = parameters.getParameter< int >( "verbose" ); + + // open log file + auto mode = std::ios::out; + if( outputMode == "append" ) + mode |= std::ios::app; + std::ofstream logFile( logFileName.getString(), mode ); + + // init benchmark and common metadata + Benchmark benchmark( loops, verbose ); + + // prepare global metadata + Benchmark::MetadataMap metadata = getHardwareMetadata(); + + const String devices = parameters.getParameter< String >( "devices" ); + if( devices == "all" || devices == "host" ) + run_benchmarks< Devices::Host >( benchmark ); +#ifdef HAVE_CUDA + if( devices == "all" || devices == "cuda" ) + run_benchmarks< Devices::Cuda >( benchmark ); +#endif + + if( ! benchmark.save( logFile ) ) { + std::cerr << "Failed to write the benchmark results to file '" << parameters.getParameter< String >( "log-file" ) << "'." 
<< std::endl; + return EXIT_FAILURE; + } + + return EXIT_SUCCESS; +} diff --git a/src/TNL/Containers/Algorithms/ArrayOperations.h b/src/TNL/Containers/Algorithms/ArrayOperations.h index 7977b6b728f1b9827b569d2520ef322d2f49b432..ca62f5b7ea45254298cb02d0ac909ee2242e72f2 100644 --- a/src/TNL/Containers/Algorithms/ArrayOperations.h +++ b/src/TNL/Containers/Algorithms/ArrayOperations.h @@ -22,6 +22,42 @@ template< typename DestinationDevice, typename SourceDevice = DestinationDevice > struct ArrayOperations; +// TODO: establish the concept of a "void device" for static computations in the whole TNL +template<> +struct ArrayOperations< void > +{ + template< typename Element > + __cuda_callable__ + static void setElement( Element* data, + const Element& value ); + + template< typename Element > + __cuda_callable__ + static Element getElement( const Element* data ); + + template< typename Element, typename Index > + __cuda_callable__ + static void set( Element* data, + const Element& value, + const Index size ); + + template< typename DestinationElement, + typename SourceElement, + typename Index > + __cuda_callable__ + static void copy( DestinationElement* destination, + const SourceElement* source, + const Index size ); + + template< typename Element1, + typename Element2, + typename Index > + __cuda_callable__ + static bool compare( const Element1* destination, + const Element2* source, + const Index size ); +}; + template<> struct ArrayOperations< Devices::Host > { @@ -251,6 +287,7 @@ struct ArrayOperations< Devices::Host, Devices::MIC > } // namespace Containers } // namespace TNL +#include #include #include #include diff --git a/src/TNL/Containers/Algorithms/ArrayOperationsStatic.hpp b/src/TNL/Containers/Algorithms/ArrayOperationsStatic.hpp new file mode 100644 index 0000000000000000000000000000000000000000..8115d25f4c3431e68fb40bf8b18406a4b176ca33 --- /dev/null +++ b/src/TNL/Containers/Algorithms/ArrayOperationsStatic.hpp @@ -0,0 +1,82 @@ 
+/*************************************************************************** + ArrayOperationsStatic_impl.h - description + ------------------- + begin : Apr 8, 2019 + copyright : (C) 2019 by Tomas Oberhuber et al. + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +#pragma once + +#include + +namespace TNL { +namespace Containers { +namespace Algorithms { + +template< typename Element > +__cuda_callable__ +void +ArrayOperations< void >:: +setElement( Element* data, + const Element& value ) +{ + *data = value; +} + +template< typename Element > +__cuda_callable__ +Element +ArrayOperations< void >:: +getElement( const Element* data ) +{ + return *data; +} + +template< typename Element, typename Index > +__cuda_callable__ +void +ArrayOperations< void >:: +set( Element* data, + const Element& value, + const Index size ) +{ + for( Index i = 0; i < size; i ++ ) + data[ i ] = value; +} + +template< typename DestinationElement, + typename SourceElement, + typename Index > +__cuda_callable__ +void +ArrayOperations< void >:: +copy( DestinationElement* destination, + const SourceElement* source, + const Index size ) +{ + for( Index i = 0; i < size; i ++ ) + destination[ i ] = source[ i ]; +} + +template< typename Element1, + typename Element2, + typename Index > +__cuda_callable__ +bool +ArrayOperations< void >:: +compare( const Element1* destination, + const Element2* source, + const Index size ) +{ + for( Index i = 0; i < size; i++ ) + if( ! 
( destination[ i ] == source[ i ] ) ) + return false; + return true; +} + +} // namespace Algorithms +} // namespace Containers +} // namespace TNL diff --git a/src/TNL/Containers/DistributedNDArray.h b/src/TNL/Containers/DistributedNDArray.h new file mode 100644 index 0000000000000000000000000000000000000000..4b123d114faa37e9022d7b5caab6f9c7124c2263 --- /dev/null +++ b/src/TNL/Containers/DistributedNDArray.h @@ -0,0 +1,467 @@ +/*************************************************************************** + DistributedNDArray.h - description + ------------------- + begin : Dec 27, 2018 + copyright : (C) 2018 by Tomas Oberhuber et al. + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Jakub Klinkovsky + +#pragma once + +#include +#include +#include +#include + +namespace TNL { +namespace Containers { + +template< typename NDArray, + typename Communicator = Communicators::MpiCommunicator, + typename Overlaps = __ndarray_impl::make_constant_index_sequence< NDArray::getDimension(), 0 > > +class DistributedNDArray +{ + using CommunicationGroup = typename Communicator::CommunicationGroup; +public: + using ValueType = typename NDArray::ValueType; + using DeviceType = typename NDArray::DeviceType; + using IndexType = typename NDArray::IndexType; + using SizesHolderType = typename NDArray::SizesHolderType; + using PermutationType = typename NDArray::PermutationType; + using CommunicatorType = Communicator; + using LocalBeginsType = __ndarray_impl::LocalBeginsHolder< typename NDArray::SizesHolderType >; + using LocalRangeType = Subrange< IndexType >; + using OverlapsType = Overlaps; + using LocalIndexerType = NDArrayIndexer< SizesHolderType, PermutationType, typename NDArray::NDBaseType, typename NDArray::StridesHolderType, Overlaps >; + + using ViewType = DistributedNDArrayView< typename NDArray::ViewType, Communicator, Overlaps >; + using 
ConstViewType = DistributedNDArrayView< typename NDArray::ConstViewType, Communicator, Overlaps >; + using LocalViewType = typename NDArray::ViewType; + using ConstLocalViewType = typename NDArray::ConstViewType; + + static_assert( Overlaps::size() == NDArray::getDimension(), "invalid overlaps" ); + + // all methods from NDArrayView + + DistributedNDArray() = default; + + // The copy-constructor of TNL::Containers::Array makes shallow copy so our + // copy-constructor cannot be default. Actually, we most likely don't need + // it anyway, so let's just delete it. + DistributedNDArray( const DistributedNDArray& ) = delete; + + // Standard copy-semantics with deep copy, just like regular 1D array. + // Mismatched sizes cause reallocations. + DistributedNDArray& operator=( const DistributedNDArray& other ) = default; + + // default move-semantics + DistributedNDArray( DistributedNDArray&& ) = default; + DistributedNDArray& operator=( DistributedNDArray&& ) = default; + + // Templated copy-assignment + template< typename OtherArray > + DistributedNDArray& operator=( const OtherArray& other ) + { + globalSizes = other.getSizes(); + localBegins = other.getLocalBegins(); + localEnds = other.getLocalEnds(); + group = other.getCommunicationGroup(); + localArray = other.getConstLocalView(); + return *this; + } + + static constexpr std::size_t getDimension() + { + return NDArray::getDimension(); + } + + __cuda_callable__ + CommunicationGroup getCommunicationGroup() const + { + return group; + } + + // Returns the *global* sizes + __cuda_callable__ + const SizesHolderType& getSizes() const + { + return globalSizes; + } + + // Returns the *global* size + template< std::size_t level > + __cuda_callable__ + IndexType getSize() const + { + return globalSizes.template getSize< level >(); + } + + __cuda_callable__ + LocalBeginsType getLocalBegins() const + { + return localBegins; + } + + __cuda_callable__ + SizesHolderType getLocalEnds() const + { + return localEnds; + } + + 
template< std::size_t level > + __cuda_callable__ + LocalRangeType getLocalRange() const + { + return LocalRangeType( localBegins.template getSize< level >(), localEnds.template getSize< level >() ); + } + + // returns the local storage size + __cuda_callable__ + IndexType getLocalStorageSize() const + { + return localArray.getStorageSize(); + } + + LocalIndexerType getLocalIndexer() const + { + return LocalIndexerType( localEnds - localBegins, typename NDArray::StridesHolderType{} ); + } + + LocalViewType getLocalView() + { + return localArray.getView(); + } + + ConstLocalViewType getConstLocalView() const + { + return localArray.getConstView(); + } + + // returns the *local* storage index for given *global* indices + template< typename... IndexTypes > + __cuda_callable__ + IndexType + getStorageIndex( IndexTypes&&... indices ) const + { + static_assert( sizeof...( indices ) == SizesHolderType::getDimension(), "got wrong number of indices" ); + __ndarray_impl::assertIndicesInRange( localBegins, localEnds, Overlaps{}, std::forward< IndexTypes >( indices )... ); + auto getStorageIndex = [this]( auto&&... indices ) + { + return this->localArray.getStorageIndex( std::forward< decltype(indices) >( indices )... ); + }; + return __ndarray_impl::call_with_unshifted_indices< LocalBeginsType, Overlaps >( localBegins, getStorageIndex, std::forward< IndexTypes >( indices )... ); + } + + __cuda_callable__ + ValueType* getData() + { + return localArray.getData(); + } + + __cuda_callable__ + std::add_const_t< ValueType >* getData() const + { + return localArray.getData(); + } + + + template< typename... IndexTypes > + __cuda_callable__ + ValueType& + operator()( IndexTypes&&... indices ) + { + static_assert( sizeof...( indices ) == getDimension(), "got wrong number of indices" ); + __ndarray_impl::assertIndicesInRange( localBegins, localEnds, Overlaps{}, std::forward< IndexTypes >( indices )... 
); + return __ndarray_impl::call_with_unshifted_indices< LocalBeginsType, Overlaps >( localBegins, localArray, std::forward< IndexTypes >( indices )... ); + } + + template< typename... IndexTypes > + __cuda_callable__ + const ValueType& + operator()( IndexTypes&&... indices ) const + { + static_assert( sizeof...( indices ) == getDimension(), "got wrong number of indices" ); + __ndarray_impl::assertIndicesInRange( localBegins, localEnds, Overlaps{}, std::forward< IndexTypes >( indices )... ); + return __ndarray_impl::call_with_unshifted_indices< LocalBeginsType, Overlaps >( localBegins, localArray, std::forward< IndexTypes >( indices )... ); + } + + // bracket operator for 1D arrays + __cuda_callable__ + ValueType& + operator[]( IndexType index ) + { + static_assert( getDimension() == 1, "the access via operator[] is provided only for 1D arrays" ); + __ndarray_impl::assertIndicesInRange( localBegins, localEnds, Overlaps{}, std::forward< IndexType >( index ) ); + return localArray[ __ndarray_impl::get<0>( Overlaps{} ) + index - localBegins.template getSize< 0 >() ]; + } + + __cuda_callable__ + const ValueType& + operator[]( IndexType index ) const + { + static_assert( getDimension() == 1, "the access via operator[] is provided only for 1D arrays" ); + __ndarray_impl::assertIndicesInRange( localBegins, localEnds, Overlaps{}, std::forward< IndexType >( index ) ); + return localArray[ __ndarray_impl::get<0>( Overlaps{} ) + index - localBegins.template getSize< 0 >() ]; + } + + __cuda_callable__ + ViewType getView() + { + return ViewType( localArray.getView(), globalSizes, localBegins, localEnds, group ); + } + + __cuda_callable__ + ConstViewType getConstView() const + { + return ConstViewType( localArray.getConstView(), globalSizes, localBegins, localEnds, group ); + } + + // TODO: overlaps should be skipped, otherwise it works only after synchronization + bool operator==( const DistributedNDArray& other ) const + { + // we can't run allreduce if the communication 
groups are different + if( group != other.getCommunicationGroup() ) + return false; + const bool localResult = + globalSizes == other.globalSizes && + localBegins == other.localBegins && + localEnds == other.localEnds && + localArray == other.localArray; + bool result = true; + if( group != CommunicatorType::NullGroup ) + CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LAND, group ); + return result; + } + + bool operator!=( const DistributedNDArray& other ) const + { + return ! (*this == other); + } + + // iterate over all local elements + template< typename Device2 = DeviceType, typename Func > + void forAll( Func f ) const + { + __ndarray_impl::ExecutorDispatcher< PermutationType, Device2 > dispatch; + dispatch( localBegins, localEnds, f ); + } + + // iterate over local elements which are not neighbours of *global* boundaries + template< typename Device2 = DeviceType, typename Func > + void forInternal( Func f ) const + { + // add static sizes + using Begins = __ndarray_impl::LocalBeginsHolder< SizesHolderType, 1 >; + // add dynamic sizes + Begins begins; + __ndarray_impl::SetSizesAddHelper< 1, Begins, SizesHolderType, Overlaps >::add( begins, SizesHolderType{} ); + __ndarray_impl::SetSizesMaxHelper< Begins, LocalBeginsType >::max( begins, localBegins ); + + // subtract static sizes + using Ends = typename __ndarray_impl::SubtractedSizesHolder< SizesHolderType, 1 >::type; + // subtract dynamic sizes + Ends ends; + __ndarray_impl::SetSizesSubtractHelper< 1, Ends, SizesHolderType, Overlaps >::subtract( ends, globalSizes ); + __ndarray_impl::SetSizesMinHelper< Ends, SizesHolderType >::min( ends, localEnds ); + + __ndarray_impl::ExecutorDispatcher< PermutationType, Device2 > dispatch; + dispatch( begins, ends, f ); + } + + // iterate over local elements inside the given [begins, ends) range specified by global indices + template< typename Device2 = DeviceType, typename Func, typename Begins, typename Ends > + void forInternal( Func f, const Begins& 
begins, const Ends& ends ) const + { + // TODO: assert "localBegins <= begins <= localEnds", "localBegins <= ends <= localEnds" + __ndarray_impl::ExecutorDispatcher< PermutationType, Device2 > dispatch; + dispatch( begins, ends, f ); + } + + // iterate over local elements which are neighbours of *global* boundaries + template< typename Device2 = DeviceType, typename Func > + void forBoundary( Func f ) const + { + // add static sizes + using SkipBegins = __ndarray_impl::LocalBeginsHolder< SizesHolderType, 1 >; + // add dynamic sizes + SkipBegins skipBegins; + __ndarray_impl::SetSizesAddHelper< 1, SkipBegins, SizesHolderType, Overlaps >::add( skipBegins, SizesHolderType{} ); + __ndarray_impl::SetSizesMaxHelper< SkipBegins, LocalBeginsType >::max( skipBegins, localBegins ); + + // subtract static sizes + using SkipEnds = typename __ndarray_impl::SubtractedSizesHolder< SizesHolderType, 1 >::type; + // subtract dynamic sizes + SkipEnds skipEnds; + __ndarray_impl::SetSizesSubtractHelper< 1, SkipEnds, SizesHolderType, Overlaps >::subtract( skipEnds, globalSizes ); + __ndarray_impl::SetSizesMinHelper< SkipEnds, SizesHolderType >::min( skipEnds, localEnds ); + + __ndarray_impl::BoundaryExecutorDispatcher< PermutationType, Device2 > dispatch; + dispatch( localBegins, skipBegins, skipEnds, localEnds, f ); + } + + // iterate over local elements outside the given [skipBegins, skipEnds) range specified by global indices + template< typename Device2 = DeviceType, typename Func, typename SkipBegins, typename SkipEnds > + void forBoundary( Func f, const SkipBegins& skipBegins, const SkipEnds& skipEnds ) const + { + // TODO: assert "localBegins <= skipBegins <= localEnds", "localBegins <= skipEnds <= localEnds" + __ndarray_impl::BoundaryExecutorDispatcher< PermutationType, Device2 > dispatch; + dispatch( localBegins, skipBegins, skipEnds, localEnds, f ); + } + + // iterate over local elements which are not neighbours of overlaps (if all overlaps are 0, it is equivalent to forAll) + 
template< typename Device2 = DeviceType, typename Func > + void forLocalInternal( Func f ) const + { + // add overlaps to dynamic sizes + LocalBeginsType begins; + __ndarray_impl::SetSizesAddOverlapsHelper< LocalBeginsType, SizesHolderType, Overlaps >::add( begins, localBegins ); + + // subtract overlaps from dynamic sizes + SizesHolderType ends; + __ndarray_impl::SetSizesSubtractOverlapsHelper< SizesHolderType, SizesHolderType, Overlaps >::subtract( ends, localEnds ); + + __ndarray_impl::ExecutorDispatcher< PermutationType, Device2 > dispatch; + dispatch( begins, ends, f ); + } + + // iterate over local elements which are neighbours of overlaps (if all overlaps are 0, it has no effect) + template< typename Device2 = DeviceType, typename Func > + void forLocalBoundary( Func f ) const + { + // add overlaps to dynamic sizes + LocalBeginsType skipBegins; + __ndarray_impl::SetSizesAddOverlapsHelper< LocalBeginsType, SizesHolderType, Overlaps >::add( skipBegins, localBegins ); + + // subtract overlaps from dynamic sizes + SizesHolderType skipEnds; + __ndarray_impl::SetSizesSubtractOverlapsHelper< SizesHolderType, SizesHolderType, Overlaps >::subtract( skipEnds, localEnds ); + + __ndarray_impl::BoundaryExecutorDispatcher< PermutationType, Device2 > dispatch; + dispatch( localBegins, skipBegins, skipEnds, localEnds, f ); + } + + // iterate over elements of overlaps (if all overlaps are 0, it has no effect) + template< typename Device2 = DeviceType, typename Func > + void forOverlaps( Func f ) const + { + // subtract overlaps from dynamic sizes + LocalBeginsType begins; + __ndarray_impl::SetSizesSubtractOverlapsHelper< LocalBeginsType, SizesHolderType, Overlaps >::subtract( begins, localBegins ); + + // add overlaps to dynamic sizes + SizesHolderType ends; + __ndarray_impl::SetSizesAddOverlapsHelper< SizesHolderType, SizesHolderType, Overlaps >::add( ends, localEnds ); + + __ndarray_impl::BoundaryExecutorDispatcher< PermutationType, Device2 > dispatch; + dispatch( begins, 
localBegins, localEnds, ends, f ); + } + + + // extra methods + + // Sets the *global* size, but does not allocate storage + template< typename... IndexTypes > + void setSizes( IndexTypes&&... sizes ) + { + static_assert( sizeof...( sizes ) == getDimension(), "got wrong number of sizes" ); + __ndarray_impl::setSizesHelper( globalSizes, std::forward< IndexTypes >( sizes )... ); + // initialize localBegins and localEnds + localBegins = LocalBeginsType{}; + localEnds = globalSizes; + } + + template< std::size_t level > + void setDistribution( IndexType begin, IndexType end, CommunicationGroup group = Communicator::AllGroup ) + { + static_assert( SizesHolderType::template getStaticSize< level >() == 0, "NDArray cannot be distributed in static dimensions." ); + TNL_ASSERT_GE( begin, 0, "begin must be non-negative" ); + TNL_ASSERT_LE( end, globalSizes.template getSize< level >(), "end must not be greater than global size" ); + TNL_ASSERT_LT( begin, end, "begin must be lesser than end" ); + localBegins.template setSize< level >( begin ); + localEnds.template setSize< level >( end ); + TNL_ASSERT( this->group == Communicator::NullGroup || this->group == group, + std::cerr << "different groups cannot be combined for different dimensions" ); + this->group = group; + } + + // Computes the distributed storage size and allocates the local array + void allocate() + { + SizesHolderType localSizes; + TemplateStaticFor< std::size_t, 0, SizesHolderType::getDimension(), LocalSizesSetter >::execHost( localSizes, globalSizes, localBegins, localEnds ); + localArray.setSize( localSizes ); + } + + void setLike( const DistributedNDArray& other ) + { + localArray.setLike( other.localArray ); + group = other.getCommunicationGroup(); + globalSizes = other.getSizes(); + localBegins = other.localBegins; + localEnds = other.localEnds; + } + + void reset() + { + localArray.reset(); + group = CommunicatorType::NullGroup; + globalSizes = SizesHolderType{}; + localBegins = LocalBeginsType{}; + 
localEnds = SizesHolderType{}; + } + + // "safe" accessor - will do slow copy from device + template< typename... IndexTypes > + ValueType + getElement( IndexTypes&&... indices ) const + { + static_assert( sizeof...( indices ) == getDimension(), "got wrong number of indices" ); + __ndarray_impl::assertIndicesInRange( localBegins, localEnds, Overlaps{}, std::forward< IndexTypes >( indices )... ); + auto getElement = [this]( auto&&... indices ) + { + return this->localArray.getElement( std::forward< decltype(indices) >( indices )... ); + }; + return __ndarray_impl::host_call_with_unshifted_indices< LocalBeginsType, Overlaps >( localBegins, getElement, std::forward< IndexTypes >( indices )... ); + } + + void setValue( ValueType value ) + { + localArray.setValue( value ); + } + +protected: + NDArray localArray; + CommunicationGroup group = Communicator::NullGroup; + SizesHolderType globalSizes; + // static sizes should have different type: localBegin is always 0, localEnd is always the full size + LocalBeginsType localBegins; + SizesHolderType localEnds; + +private: + template< std::size_t level > + struct LocalSizesSetter + { + template< typename SizesHolder, typename LocalBegins > + static void exec( SizesHolder& localSizes, const SizesHolder& globalSizes, const LocalBegins& localBegins, const SizesHolder& localEnds ) + { + if( SizesHolder::template getStaticSize< level >() != 0 ) + return; + + const auto begin = localBegins.template getSize< level >(); + const auto end = localEnds.template getSize< level >(); + if( begin == end ) + localSizes.template setSize< level >( globalSizes.template getSize< level >() ); + else { + TNL_ASSERT_GE( end - begin, (decltype(end)) __ndarray_impl::get( Overlaps{} ), "local size is less than the size of overlaps" ); + localSizes.template setSize< level >( end - begin + 2 * __ndarray_impl::get( Overlaps{} ) ); + } + } + }; +}; + +} // namespace Containers +} // namespace TNL diff --git 
a/src/TNL/Containers/DistributedNDArraySynchronizer.h b/src/TNL/Containers/DistributedNDArraySynchronizer.h new file mode 100644 index 0000000000000000000000000000000000000000..e6e41ba3338010779a1b110af90e198cdae617aa --- /dev/null +++ b/src/TNL/Containers/DistributedNDArraySynchronizer.h @@ -0,0 +1,242 @@ +/*************************************************************************** + DistributedNDArraySynchronizer.h - description + ------------------- + begin : Mar 30, 2019 + copyright : (C) 2019 by Tomas Oberhuber et al. + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Jakub Klinkovsky + +#pragma once + +#include + +#include + +namespace TNL { +namespace Containers { + +template< typename DistributedNDArray > +class DistributedNDArraySynchronizer +{ +public: + void synchronize( DistributedNDArray& array ) + { + auto future = synchronizeAsync( array, std::launch::deferred ); + future.wait(); + } + + // This method is not thread-safe - only the thread which created and "owns" the + // instance of this object can call this method. + // Also note that this method must not be called again until the previous + // asynchronous operation has finished. 
+ std::shared_future synchronizeAsync( DistributedNDArray& array, std::launch policy = std::launch::async ) + { + // GOTCHA: https://devblogs.nvidia.com/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs/ + #ifdef HAVE_CUDA + if( std::is_same< typename DistributedNDArray::DeviceType, Devices::Cuda >::value ) + cudaGetDevice(&this->gpu_id); + #endif + + // NOTE: the allocation cannot be done in the worker, otherwise CUDA would crash + // skip allocation on repeated calls - compare only sizes, not the actual data + if( array_view.getCommunicationGroup() != array.getCommunicationGroup() || + array_view.getSizes() != array.getSizes() || + array_view.getLocalBegins() != array.getLocalBegins() || + array_view.getLocalEnds() != array.getLocalEnds() ) + { + array_view.bind( array.getView() ); + + // allocate buffers + TemplateStaticFor< std::size_t, 0, DistributedNDArray::getDimension(), AllocateHelper >::execHost( buffers, array_view ); + } + else { + // only bind to the actual data + array_view.bind( array.getView() ); + } + + auto worker = [this](){ this->worker(); }; + return std::async( policy, worker ); + } + +protected: + using DistributedNDArrayView = typename DistributedNDArray::ViewType; + using Communicator = typename DistributedNDArray::CommunicatorType; + using Buffers = __ndarray_impl::SynchronizerBuffers< DistributedNDArray >; + + DistributedNDArrayView array_view; + Buffers buffers; + int gpu_id = 0; + + void worker() + { + // GOTCHA: https://devblogs.nvidia.com/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs/ + #ifdef HAVE_CUDA + if( std::is_same< typename DistributedNDArray::DeviceType, Devices::Cuda >::value ) + cudaSetDevice(gpu_id); + #endif + + // fill send buffers + TemplateStaticFor< std::size_t, 0, DistributedNDArray::getDimension(), CopyHelper >::execHost( buffers, array_view, true ); + + // issue all send and receive async operations + std::vector< typename Communicator::Request > requests; + const typename 
Communicator::CommunicationGroup group = array_view.getCommunicationGroup(); + TemplateStaticFor< std::size_t, 0, DistributedNDArray::getDimension(), SendHelper >::execHost( buffers, requests, group ); + + // wait until send is done + Communicator::WaitAll( requests.data(), requests.size() ); + + // copy data from receive buffers + TemplateStaticFor< std::size_t, 0, DistributedNDArray::getDimension(), CopyHelper >::execHost( buffers, array_view, false ); + } + + template< std::size_t dim > + struct AllocateHelper + { + static void exec( Buffers& buffers, const DistributedNDArrayView& array_view ) + { + auto& dim_buffers = buffers.template getDimBuffers< dim >(); + + constexpr std::size_t overlap = __ndarray_impl::get< dim >( typename DistributedNDArray::OverlapsType{} ); + // TODO +// constexpr std::size_t overlap = array_view.template getOverlap< dim >(); + if( overlap == 0 ) { + dim_buffers.reset(); + return; + } + + using LocalBegins = typename DistributedNDArray::LocalBeginsType; + using SizesHolder = typename DistributedNDArray::SizesHolderType; + const LocalBegins& localBegins = array_view.getLocalBegins(); + const SizesHolder& localEnds = array_view.getLocalEnds(); + + SizesHolder bufferSize( localEnds ); + bufferSize.template setSize< dim >( overlap ); + + dim_buffers.left_send_buffer.setSize( bufferSize ); + dim_buffers.left_recv_buffer.setSize( bufferSize ); + dim_buffers.right_send_buffer.setSize( bufferSize ); + dim_buffers.right_recv_buffer.setSize( bufferSize ); + + // TODO: check overlap offsets for 2D and 3D distributions (watch out for the corners - maybe use SetSizesSubtractOverlapsHelper?) 
+ + // offsets for left-send + dim_buffers.left_send_offsets = localBegins; + + // offsets for left-receive + dim_buffers.left_recv_offsets = localBegins; + dim_buffers.left_recv_offsets.template setSize< dim >( localBegins.template getSize< dim >() - overlap ); + + // offsets for right-send + dim_buffers.right_send_offsets = localBegins; + dim_buffers.right_send_offsets.template setSize< dim >( localEnds.template getSize< dim >() - overlap ); + + // offsets for right-receive + dim_buffers.right_recv_offsets = localBegins; + dim_buffers.right_recv_offsets.template setSize< dim >( localEnds.template getSize< dim >() ); + + // FIXME: set proper neighbor IDs !!! + const typename Communicator::CommunicationGroup group = array_view.getCommunicationGroup(); + const int rank = Communicator::GetRank(group); + const int nproc = Communicator::GetSize(group); + dim_buffers.left_neighbor = (rank + nproc - 1) % nproc; + dim_buffers.right_neighbor = (rank + 1) % nproc; + } + }; + + template< std::size_t dim > + struct CopyHelper + { + static void exec( Buffers& buffers, DistributedNDArrayView& array_view, bool to_buffer ) + { + const std::size_t overlap = __ndarray_impl::get< dim >( typename DistributedNDArray::OverlapsType{} ); + if( overlap == 0 ) + return; + + auto& dim_buffers = buffers.template getDimBuffers< dim >(); + + // TODO: specify CUDA stream for the copy, otherwise async won't work !!! 
+ CopyKernel< decltype(dim_buffers.left_send_buffer.getView()) > copy_kernel; + copy_kernel.array_view.bind( array_view ); + copy_kernel.to_buffer = to_buffer; + + if( to_buffer ) { + copy_kernel.buffer_view.bind( dim_buffers.left_send_buffer.getView() ); + copy_kernel.array_offsets = dim_buffers.left_send_offsets; + dim_buffers.left_send_buffer.forAll( copy_kernel ); + + copy_kernel.buffer_view.bind( dim_buffers.right_send_buffer.getView() ); + copy_kernel.array_offsets = dim_buffers.right_send_offsets; + dim_buffers.right_send_buffer.forAll( copy_kernel ); + } + else { + copy_kernel.buffer_view.bind( dim_buffers.left_recv_buffer.getView() ); + copy_kernel.array_offsets = dim_buffers.left_recv_offsets; + dim_buffers.left_recv_buffer.forAll( copy_kernel ); + + copy_kernel.buffer_view.bind( dim_buffers.right_recv_buffer.getView() ); + copy_kernel.array_offsets = dim_buffers.right_recv_offsets; + dim_buffers.right_recv_buffer.forAll( copy_kernel ); + } + } + }; + + template< std::size_t dim > + struct SendHelper + { + template< typename Requests, typename Group > + static void exec( Buffers& buffers, Requests& requests, Group group ) + { + const std::size_t overlap = __ndarray_impl::get< dim >( typename DistributedNDArray::OverlapsType{} ); + if( overlap == 0 ) + return; + + auto& dim_buffers = buffers.template getDimBuffers< dim >(); + + requests.push_back( Communicator::ISend( dim_buffers.left_send_buffer.getStorageArray().getData(), + dim_buffers.left_send_buffer.getStorageSize(), + dim_buffers.left_neighbor, 0, group ) ); + requests.push_back( Communicator::IRecv( dim_buffers.left_recv_buffer.getStorageArray().getData(), + dim_buffers.left_recv_buffer.getStorageSize(), + dim_buffers.left_neighbor, 1, group ) ); + requests.push_back( Communicator::ISend( dim_buffers.right_send_buffer.getStorageArray().getData(), + dim_buffers.right_send_buffer.getStorageSize(), + dim_buffers.right_neighbor, 1, group ) ); + requests.push_back( Communicator::IRecv( 
dim_buffers.right_recv_buffer.getStorageArray().getData(), + dim_buffers.right_recv_buffer.getStorageSize(), + dim_buffers.right_neighbor, 0, group ) ); + } + }; + +#ifdef __NVCC__ +public: +#endif + template< typename BufferView > + struct CopyKernel + { + using ArrayView = typename DistributedNDArray::ViewType; + using LocalBegins = typename ArrayView::LocalBeginsType; + + BufferView buffer_view; + ArrayView array_view; + LocalBegins array_offsets; + bool to_buffer; + + template< typename... Indices > + __cuda_callable__ + void operator()( Indices... indices ) + { + if( to_buffer ) + buffer_view( indices... ) = call_with_shifted_indices( array_offsets, array_view, indices... ); + else + call_with_shifted_indices( array_offsets, array_view, indices... ) = buffer_view( indices... ); + } + }; +}; + +} // namespace Containers +} // namespace TNL diff --git a/src/TNL/Containers/DistributedNDArrayView.h b/src/TNL/Containers/DistributedNDArrayView.h new file mode 100644 index 0000000000000000000000000000000000000000..102985e9c15e4ff0d058dc79c04ff14b7ae2194b --- /dev/null +++ b/src/TNL/Containers/DistributedNDArrayView.h @@ -0,0 +1,417 @@ +/*************************************************************************** + DistributedNDArrayView.h - description + ------------------- + begin : Dec 27, 2018 + copyright : (C) 2018 by Tomas Oberhuber et al. 
+ email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Jakub Klinkovsky + +#pragma once + +#include +#include +#include + +namespace TNL { +namespace Containers { + +template< typename NDArrayView, + typename Communicator = Communicators::MpiCommunicator, + typename Overlaps = __ndarray_impl::make_constant_index_sequence< NDArrayView::getDimension(), 0 > > +class DistributedNDArrayView +{ + using CommunicationGroup = typename Communicator::CommunicationGroup; +public: + using ValueType = typename NDArrayView::ValueType; + using DeviceType = typename NDArrayView::DeviceType; + using IndexType = typename NDArrayView::IndexType; + using SizesHolderType = typename NDArrayView::SizesHolderType; + using PermutationType = typename NDArrayView::PermutationType; + using CommunicatorType = Communicator; + using LocalBeginsType = __ndarray_impl::LocalBeginsHolder< typename NDArrayView::SizesHolderType >; + using LocalRangeType = Subrange< IndexType >; + using OverlapsType = Overlaps; + using LocalIndexerType = NDArrayIndexer< SizesHolderType, PermutationType, typename NDArrayView::NDBaseType, typename NDArrayView::StridesHolderType, Overlaps >; + + using ViewType = DistributedNDArrayView< NDArrayView, Communicator, Overlaps >; + using ConstViewType = DistributedNDArrayView< typename NDArrayView::ConstViewType, Communicator, Overlaps >; + using LocalViewType = NDArrayView; + using ConstLocalViewType = typename NDArrayView::ConstViewType; + + static_assert( Overlaps::size() == NDArrayView::getDimension(), "invalid overlaps" ); + + __cuda_callable__ + DistributedNDArrayView() = default; + + // explicit initialization by local array view, global sizes and local begins and ends + __cuda_callable__ + DistributedNDArrayView( NDArrayView localView, SizesHolderType globalSizes, LocalBeginsType localBegins, SizesHolderType localEnds, CommunicationGroup 
group ) + : localView(localView), group(group), globalSizes(globalSizes), localBegins(localBegins), localEnds(localEnds) {} + + // Copy-constructor does shallow copy, so views can be passed-by-value into + // CUDA kernels and they can be captured-by-value in __cuda_callable__ + // lambda functions. + __cuda_callable__ + DistributedNDArrayView( const DistributedNDArrayView& ) = default; + + // default move-constructor + __cuda_callable__ + DistributedNDArrayView( DistributedNDArrayView&& ) = default; + + // Copy-assignment does deep copy, just like regular array, but the sizes + // must match (i.e. copy-assignment cannot resize). + __cuda_callable__ + DistributedNDArrayView& operator=( const DistributedNDArrayView& other ) = default; + + // There is no move-assignment operator, so expressions like `a = b.getView()` + // are resolved as copy-assignment. + + // Templated copy-assignment + template< typename OtherArray > + DistributedNDArrayView& operator=( const OtherArray& other ) + { + globalSizes = other.getSizes(); + localBegins = other.getLocalBegins(); + localEnds = other.getLocalEnds(); + group = other.getCommunicationGroup(); + localView = other.getConstLocalView(); + return *this; + } + + // methods for rebinding (reinitialization) + __cuda_callable__ + void bind( DistributedNDArrayView view ) + { + localView.bind( view.localView ); + group = view.group; + globalSizes = view.globalSizes; + localBegins = view.localBegins; + localEnds = view.localEnds; + } + + // binds to the given raw pointer and changes the indexer + __cuda_callable__ + void bind( ValueType* data, LocalIndexerType indexer ) + { + localView.bind( data, indexer ); + localView.bind( data ); + } + + // binds to the given raw pointer and preserves the current indexer + __cuda_callable__ + void bind( ValueType* data ) + { + localView.bind( data ); + } + + __cuda_callable__ + void reset() + { + localView.reset(); + group = CommunicatorType::NullGroup; + globalSizes = SizesHolderType{}; + localBegins 
= LocalBeginsType{}; + localEnds = SizesHolderType{}; + } + + static constexpr std::size_t getDimension() + { + return NDArrayView::getDimension(); + } + + __cuda_callable__ + CommunicationGroup getCommunicationGroup() const + { + return group; + } + + // Returns the *global* sizes + __cuda_callable__ + const SizesHolderType& getSizes() const + { + return globalSizes; + } + + // Returns the *global* size + template< std::size_t level > + __cuda_callable__ + IndexType getSize() const + { + return globalSizes.template getSize< level >(); + } + + __cuda_callable__ + LocalBeginsType getLocalBegins() const + { + return localBegins; + } + + __cuda_callable__ + SizesHolderType getLocalEnds() const + { + return localEnds; + } + + template< std::size_t level > + __cuda_callable__ + LocalRangeType getLocalRange() const + { + return LocalRangeType( localBegins.template getSize< level >(), localEnds.template getSize< level >() ); + } + + // returns the local storage size + __cuda_callable__ + IndexType getLocalStorageSize() const + { + return localView.getStorageSize(); + } + + LocalIndexerType getLocalIndexer() const + { + return LocalIndexerType( localEnds - localBegins, typename NDArrayView::StridesHolderType{} ); + } + + LocalViewType getLocalView() + { + return localView; + } + + ConstLocalViewType getConstLocalView() const + { + return localView.getConstView(); + } + + // returns the *local* storage index for given *global* indices + template< typename... IndexTypes > + __cuda_callable__ + IndexType + getStorageIndex( IndexTypes&&... indices ) const + { + static_assert( sizeof...( indices ) == SizesHolderType::getDimension(), "got wrong number of indices" ); + __ndarray_impl::assertIndicesInRange( localBegins, localEnds, Overlaps{}, std::forward< IndexTypes >( indices )... ); + auto getStorageIndex = [this]( auto&&... indices ) + { + return this->localView.getStorageIndex( std::forward< decltype(indices) >( indices )... 
); + }; + return __ndarray_impl::call_with_unshifted_indices< LocalBeginsType, Overlaps >( localBegins, getStorageIndex, std::forward< IndexTypes >( indices )... ); + } + + __cuda_callable__ + ValueType* getData() + { + return localView.getData(); + } + + __cuda_callable__ + std::add_const_t< ValueType >* getData() const + { + return localView.getData(); + } + + + template< typename... IndexTypes > + __cuda_callable__ + ValueType& + operator()( IndexTypes&&... indices ) + { + static_assert( sizeof...( indices ) == getDimension(), "got wrong number of indices" ); + __ndarray_impl::assertIndicesInRange( localBegins, localEnds, Overlaps{}, std::forward< IndexTypes >( indices )... ); + return __ndarray_impl::call_with_unshifted_indices< LocalBeginsType, Overlaps >( localBegins, localView, std::forward< IndexTypes >( indices )... ); + } + + template< typename... IndexTypes > + __cuda_callable__ + const ValueType& + operator()( IndexTypes&&... indices ) const + { + static_assert( sizeof...( indices ) == getDimension(), "got wrong number of indices" ); + __ndarray_impl::assertIndicesInRange( localBegins, localEnds, Overlaps{}, std::forward< IndexTypes >( indices )... ); + return __ndarray_impl::call_with_unshifted_indices< LocalBeginsType, Overlaps >( localBegins, localView, std::forward< IndexTypes >( indices )... 
); + } + + // bracket operator for 1D arrays + __cuda_callable__ + ValueType& + operator[]( IndexType index ) + { + static_assert( getDimension() == 1, "the access via operator[] is provided only for 1D arrays" ); + __ndarray_impl::assertIndicesInRange( localBegins, localEnds, Overlaps{}, std::forward< IndexType >( index ) ); + return localView[ __ndarray_impl::get<0>( Overlaps{} ) + index - localBegins.template getSize< 0 >() ]; + } + + __cuda_callable__ + const ValueType& + operator[]( IndexType index ) const + { + static_assert( getDimension() == 1, "the access via operator[] is provided only for 1D arrays" ); + __ndarray_impl::assertIndicesInRange( localBegins, localEnds, Overlaps{}, std::forward< IndexType >( index ) ); + return localView[ __ndarray_impl::get<0>( Overlaps{} ) + index - localBegins.template getSize< 0 >() ]; + } + + __cuda_callable__ + ViewType getView() + { + return ViewType( *this ); + } + + __cuda_callable__ + ConstViewType getConstView() const + { + return ConstViewType( localView, globalSizes, localBegins, localEnds, group ); + } + + // TODO: overlaps should be skipped, otherwise it works only after synchronization + bool operator==( const DistributedNDArrayView& other ) const + { + // we can't run allreduce if the communication groups are different + if( group != other.getCommunicationGroup() ) + return false; + const bool localResult = + globalSizes == other.globalSizes && + localBegins == other.localBegins && + localEnds == other.localEnds && + localView == other.localView; + bool result = true; + if( group != CommunicatorType::NullGroup ) + CommunicatorType::Allreduce( &localResult, &result, 1, MPI_LAND, group ); + return result; + } + + bool operator!=( const DistributedNDArrayView& other ) const + { + return ! 
(*this == other); + } + + // iterate over all local elements + template< typename Device2 = DeviceType, typename Func > + void forAll( Func f ) const + { + __ndarray_impl::ExecutorDispatcher< PermutationType, Device2 > dispatch; + dispatch( localBegins, localEnds, f ); + } + + // iterate over local elements which are not neighbours of *global* boundaries + template< typename Device2 = DeviceType, typename Func > + void forInternal( Func f ) const + { + // add static sizes + using Begins = __ndarray_impl::LocalBeginsHolder< SizesHolderType, 1 >; + // add dynamic sizes + Begins begins; + __ndarray_impl::SetSizesAddHelper< 1, Begins, SizesHolderType, Overlaps >::add( begins, SizesHolderType{} ); + __ndarray_impl::SetSizesMaxHelper< Begins, LocalBeginsType >::max( begins, localBegins ); + + // subtract static sizes + using Ends = typename __ndarray_impl::SubtractedSizesHolder< SizesHolderType, 1 >::type; + // subtract dynamic sizes + Ends ends; + __ndarray_impl::SetSizesSubtractHelper< 1, Ends, SizesHolderType, Overlaps >::subtract( ends, globalSizes ); + __ndarray_impl::SetSizesMinHelper< Ends, SizesHolderType >::min( ends, localEnds ); + + __ndarray_impl::ExecutorDispatcher< PermutationType, Device2 > dispatch; + dispatch( begins, ends, f ); + } + + // iterate over local elements inside the given [begins, ends) range specified by global indices + template< typename Device2 = DeviceType, typename Func, typename Begins, typename Ends > + void forInternal( Func f, const Begins& begins, const Ends& ends ) const + { + // TODO: assert "localBegins <= begins <= localEnds", "localBegins <= ends <= localEnds" + __ndarray_impl::ExecutorDispatcher< PermutationType, Device2 > dispatch; + dispatch( begins, ends, f ); + } + + // iterate over local elements which are neighbours of *global* boundaries + template< typename Device2 = DeviceType, typename Func > + void forBoundary( Func f ) const + { + // add static sizes + using SkipBegins = __ndarray_impl::LocalBeginsHolder< 
SizesHolderType, 1 >; + // add dynamic sizes + SkipBegins skipBegins; + __ndarray_impl::SetSizesAddHelper< 1, SkipBegins, SizesHolderType, Overlaps >::add( skipBegins, SizesHolderType{} ); + __ndarray_impl::SetSizesMaxHelper< SkipBegins, LocalBeginsType >::max( skipBegins, localBegins ); + + // subtract static sizes + using SkipEnds = typename __ndarray_impl::SubtractedSizesHolder< SizesHolderType, 1 >::type; + // subtract dynamic sizes + SkipEnds skipEnds; + __ndarray_impl::SetSizesSubtractHelper< 1, SkipEnds, SizesHolderType, Overlaps >::subtract( skipEnds, globalSizes ); + __ndarray_impl::SetSizesMinHelper< SkipEnds, SizesHolderType >::min( skipEnds, localEnds ); + + __ndarray_impl::BoundaryExecutorDispatcher< PermutationType, Device2 > dispatch; + dispatch( localBegins, skipBegins, skipEnds, localEnds, f ); + } + + // iterate over local elements outside the given [skipBegins, skipEnds) range specified by global indices + template< typename Device2 = DeviceType, typename Func, typename SkipBegins, typename SkipEnds > + void forBoundary( Func f, const SkipBegins& skipBegins, const SkipEnds& skipEnds ) const + { + // TODO: assert "localBegins <= skipBegins <= localEnds", "localBegins <= skipEnds <= localEnds" + __ndarray_impl::BoundaryExecutorDispatcher< PermutationType, Device2 > dispatch; + dispatch( localBegins, skipBegins, skipEnds, localEnds, f ); + } + + // iterate over local elements which are not neighbours of overlaps (if all overlaps are 0, it is equivalent to forAll) + template< typename Device2 = DeviceType, typename Func > + void forLocalInternal( Func f ) const + { + // add overlaps to dynamic sizes + LocalBeginsType begins; + __ndarray_impl::SetSizesAddOverlapsHelper< LocalBeginsType, SizesHolderType, Overlaps >::add( begins, localBegins ); + + // subtract overlaps from dynamic sizes + SizesHolderType ends; + __ndarray_impl::SetSizesSubtractOverlapsHelper< SizesHolderType, SizesHolderType, Overlaps >::subtract( ends, localEnds ); + + 
__ndarray_impl::ExecutorDispatcher< PermutationType, Device2 > dispatch; + dispatch( begins, ends, f ); + } + + // iterate over local elements which are neighbours of overlaps (if all overlaps are 0, it has no effect) + template< typename Device2 = DeviceType, typename Func > + void forLocalBoundary( Func f ) const + { + // add overlaps to dynamic sizes + LocalBeginsType skipBegins; + __ndarray_impl::SetSizesAddOverlapsHelper< LocalBeginsType, SizesHolderType, Overlaps >::add( skipBegins, localBegins ); + + // subtract overlaps from dynamic sizes + SizesHolderType skipEnds; + __ndarray_impl::SetSizesSubtractOverlapsHelper< SizesHolderType, SizesHolderType, Overlaps >::subtract( skipEnds, localEnds ); + + __ndarray_impl::BoundaryExecutorDispatcher< PermutationType, Device2 > dispatch; + dispatch( localBegins, skipBegins, skipEnds, localEnds, f ); + } + + // iterate over elements of overlaps (if all overlaps are 0, it has no effect) + template< typename Device2 = DeviceType, typename Func > + void forOverlaps( Func f ) const + { + // subtract overlaps from dynamic sizes + LocalBeginsType begins; + __ndarray_impl::SetSizesSubtractOverlapsHelper< LocalBeginsType, SizesHolderType, Overlaps >::subtract( begins, localBegins ); + + // add overlaps to dynamic sizes + SizesHolderType ends; + __ndarray_impl::SetSizesAddOverlapsHelper< SizesHolderType, SizesHolderType, Overlaps >::add( ends, localEnds ); + + __ndarray_impl::BoundaryExecutorDispatcher< PermutationType, Device2 > dispatch; + dispatch( begins, localBegins, localEnds, ends, f ); + } + +protected: + NDArrayView localView; + CommunicationGroup group = Communicator::NullGroup; + SizesHolderType globalSizes; + // static sizes should have different type: localBegin is always 0, localEnd is always the full size + LocalBeginsType localBegins; + SizesHolderType localEnds; +}; + +} // namespace Containers +} // namespace TNL diff --git a/src/TNL/Containers/NDArray.h b/src/TNL/Containers/NDArray.h new file mode 100644 index 
0000000000000000000000000000000000000000..8472f4d7151b9896a20c3f20af5d302286969022 --- /dev/null +++ b/src/TNL/Containers/NDArray.h @@ -0,0 +1,422 @@ +/*************************************************************************** + NDArray.h - description + ------------------- + begin : Dec 24, 2018 + copyright : (C) 2018 by Tomas Oberhuber et al. + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Jakub Klinkovsky + +#pragma once + +#include +#include + +#include + +namespace TNL { +namespace Containers { + +template< std::size_t slicedDimension = 0, + std::size_t sliceSize = 0 > +struct SliceInfo +{ + // sliceSize == 0 means no slicing + static constexpr std::size_t getSliceSize( std::size_t dimension ) + { + return (dimension == slicedDimension) ? sliceSize : 0; + } +}; + + + + +template< typename Array, + typename SizesHolder, + typename Permutation, + typename Base, + typename Device = typename Array::DeviceType > +class NDArrayStorage + : public NDArrayIndexer< SizesHolder, Permutation, Base > +{ +public: + using StorageArray = Array; + using ValueType = typename Array::ValueType; + using DeviceType = Device; + using IndexType = typename Array::IndexType; + using SizesHolderType = SizesHolder; + using PermutationType = Permutation; + using IndexerType = NDArrayIndexer< SizesHolder, Permutation, Base >; + using ViewType = NDArrayView< ValueType, DeviceType, SizesHolder, Permutation, Base >; + using ConstViewType = NDArrayView< std::add_const_t< ValueType >, DeviceType, SizesHolder, Permutation, Base >; + + static_assert( Permutation::size() == SizesHolder::getDimension(), "invalid permutation" ); + + // all methods from NDArrayView + + NDArrayStorage() = default; + + // The copy-constructor of TNL::Containers::Array makes shallow copy so our + // copy-constructor cannot be default. 
Actually, we most likely don't need + // it anyway, so let's just delete it. + NDArrayStorage( const NDArrayStorage& ) = delete; + + // Standard copy-semantics with deep copy, just like regular 1D array. + // Mismatched sizes cause reallocations. + NDArrayStorage& operator=( const NDArrayStorage& other ) = default; + + // default move-semantics + NDArrayStorage( NDArrayStorage&& ) = default; + NDArrayStorage& operator=( NDArrayStorage&& ) = default; + + // Templated copy-assignment + template< typename OtherArray > + NDArrayStorage& operator=( const OtherArray& other ) + { + static_assert( std::is_same< PermutationType, typename OtherArray::PermutationType >::value, + "Arrays must have the same permutation of indices." ); + // update sizes + __ndarray_impl::SetSizesCopyHelper< SizesHolderType, typename OtherArray::SizesHolderType >::copy( getSizes(), other.getSizes() ); + // (re)allocate storage if necessary + array.setSize( getStorageSize() ); + // copy data + getView() = other.getConstView(); + return *this; + } + + bool operator==( const NDArrayStorage& other ) const + { + // FIXME: uninitialized data due to alignment in NDArray and padding in SlicedNDArray + return getSizes() == other.getSizes() && array == other.array; + } + + bool operator!=( const NDArrayStorage& other ) const + { + // FIXME: uninitialized data due to alignment in NDArray and padding in SlicedNDArray + return getSizes() != other.getSizes() || array != other.array; + } + + __cuda_callable__ + ValueType* getData() + { + return array.getData(); + } + + __cuda_callable__ + std::add_const_t< ValueType >* getData() const + { + return array.getData(); + } + + // methods from the base class + using IndexerType::getDimension; + using IndexerType::getSizes; + using IndexerType::getSize; + using IndexerType::getStride; + using IndexerType::getStorageSize; + using IndexerType::getStorageIndex; + + __cuda_callable__ + const IndexerType& getIndexer() const + { + return *this; + } + + __cuda_callable__ + 
ViewType getView() + { + return ViewType( array.getData(), getSizes() ); + } + + __cuda_callable__ + ConstViewType getConstView() const + { + return ConstViewType( array.getData(), getSizes() ); + } + + template< std::size_t... Dimensions, typename... IndexTypes > + __cuda_callable__ + auto getSubarrayView( IndexTypes&&... indices ) + { + static_assert( sizeof...( indices ) == getDimension(), "got wrong number of indices" ); + static_assert( 0 < sizeof...(Dimensions) && sizeof...(Dimensions) <= getDimension(), "got wrong number of dimensions" ); + static_assert( __ndarray_impl::all_elements_in_range( 0, Permutation::size(), {Dimensions...} ), + "invalid dimensions" ); +// FIXME: nvcc chokes on the variadic brace-initialization +#ifndef __NVCC__ + static_assert( __ndarray_impl::is_increasing_sequence( {Dimensions...} ), + "specifying permuted dimensions is not supported" ); +#endif + + using Getter = __ndarray_impl::SubarrayGetter< Base, Permutation, Dimensions... >; + using Subpermutation = typename Getter::Subpermutation; + auto& begin = operator()( std::forward< IndexTypes >( indices )... ); + auto subarray_sizes = Getter::filterSizes( getSizes(), std::forward< IndexTypes >( indices )... ); + auto strides = Getter::getStrides( getSizes(), std::forward< IndexTypes >( indices )... ); + static_assert( Subpermutation::size() == sizeof...(Dimensions), "Bug - wrong subpermutation length." ); + static_assert( decltype(subarray_sizes)::getDimension() == sizeof...(Dimensions), "Bug - wrong dimension of the new sizes." ); + static_assert( decltype(strides)::getDimension() == sizeof...(Dimensions), "Bug - wrong dimension of the strides." ); + using SubarrayView = NDArrayView< ValueType, Device, decltype(subarray_sizes), Subpermutation, Base, decltype(strides) >; + return SubarrayView{ &begin, subarray_sizes, strides }; + } + + template< typename... IndexTypes > + __cuda_callable__ + ValueType& + operator()( IndexTypes&&... 
indices ) + { + static_assert( sizeof...( indices ) == getDimension(), "got wrong number of indices" ); + __ndarray_impl::assertIndicesInBounds( getSizes(), std::forward< IndexTypes >( indices )... ); + TNL_ASSERT_LT( getStorageIndex( std::forward< IndexTypes >( indices )... ), getStorageSize(), + "storage index out of bounds - either input error or a bug in the indexer" ); + return array[ getStorageIndex( std::forward< IndexTypes >( indices )... ) ]; + } + + template< typename... IndexTypes > + __cuda_callable__ + const ValueType& + operator()( IndexTypes&&... indices ) const + { + static_assert( sizeof...( indices ) == getDimension(), "got wrong number of indices" ); + __ndarray_impl::assertIndicesInBounds( getSizes(), std::forward< IndexTypes >( indices )... ); + TNL_ASSERT_LT( getStorageIndex( std::forward< IndexTypes >( indices )... ), getStorageSize(), + "storage index out of bounds - either input error or a bug in the indexer" ); + return array[ getStorageIndex( std::forward< IndexTypes >( indices )... 
) ]; + } + + // bracket operator for 1D arrays + __cuda_callable__ + ValueType& + operator[]( IndexType index ) + { + static_assert( getDimension() == 1, "the access via operator[] is provided only for 1D arrays" ); + __ndarray_impl::assertIndicesInBounds( getSizes(), std::forward< IndexType >( index ) ); + return array[ index ]; + } + + __cuda_callable__ + const ValueType& + operator[]( IndexType index ) const + { + static_assert( getDimension() == 1, "the access via operator[] is provided only for 1D arrays" ); + __ndarray_impl::assertIndicesInBounds( getSizes(), std::forward< IndexType >( index ) ); + return array[ index ]; + } + + template< typename Device2 = DeviceType, typename Func > + void forAll( Func f ) const + { + __ndarray_impl::ExecutorDispatcher< PermutationType, Device2 > dispatch; + using Begins = ConstStaticSizesHolder< IndexType, getDimension(), 0 >; + dispatch( Begins{}, getSizes(), f ); + } + + template< typename Device2 = DeviceType, typename Func > + void forInternal( Func f ) const + { + __ndarray_impl::ExecutorDispatcher< PermutationType, Device2 > dispatch; + using Begins = ConstStaticSizesHolder< IndexType, getDimension(), 1 >; + // subtract static sizes + using Ends = typename __ndarray_impl::SubtractedSizesHolder< SizesHolder, 1 >::type; + // subtract dynamic sizes + Ends ends; + __ndarray_impl::SetSizesSubtractHelper< 1, Ends, SizesHolder >::subtract( ends, getSizes() ); + dispatch( Begins{}, ends, f ); + } + + template< typename Device2 = DeviceType, typename Func, typename Begins, typename Ends > + void forInternal( Func f, const Begins& begins, const Ends& ends ) const + { + // TODO: assert "begins <= sizes", "ends <= sizes" + __ndarray_impl::ExecutorDispatcher< PermutationType, Device2 > dispatch; + dispatch( begins, ends, f ); + } + + template< typename Device2 = DeviceType, typename Func > + void forBoundary( Func f ) const + { + using Begins = ConstStaticSizesHolder< IndexType, getDimension(), 0 >; + using SkipBegins = 
ConstStaticSizesHolder< IndexType, getDimension(), 1 >; + // subtract static sizes + using SkipEnds = typename __ndarray_impl::SubtractedSizesHolder< SizesHolder, 1 >::type; + // subtract dynamic sizes + SkipEnds skipEnds; + __ndarray_impl::SetSizesSubtractHelper< 1, SkipEnds, SizesHolder >::subtract( skipEnds, getSizes() ); + + __ndarray_impl::BoundaryExecutorDispatcher< PermutationType, Device2 > dispatch; + dispatch( Begins{}, SkipBegins{}, skipEnds, getSizes(), f ); + } + + template< typename Device2 = DeviceType, typename Func, typename SkipBegins, typename SkipEnds > + void forBoundary( Func f, const SkipBegins& skipBegins, const SkipEnds& skipEnds ) const + { + // TODO: assert "skipBegins <= sizes", "skipEnds <= sizes" + using Begins = ConstStaticSizesHolder< IndexType, getDimension(), 0 >; + __ndarray_impl::BoundaryExecutorDispatcher< PermutationType, Device2 > dispatch; + dispatch( Begins{}, skipBegins, skipEnds, getSizes(), f ); + } + + + // extra methods + + // TODO: rename to setSizes and make sure that overloading with the following method works + void setSize( const SizesHolderType& sizes ) + { + getSizes() = sizes; + array.setSize( getStorageSize() ); + } + + template< typename... IndexTypes > + void setSizes( IndexTypes&&... sizes ) + { + static_assert( sizeof...( sizes ) == getDimension(), "got wrong number of sizes" ); + __ndarray_impl::setSizesHelper( getSizes(), std::forward< IndexTypes >( sizes )... ); + array.setSize( getStorageSize() ); + } + + void setLike( const NDArrayStorage& other ) + { + getSizes() = other.getSizes(); + array.setSize( getStorageSize() ); + } + + void reset() + { + getSizes() = SizesHolder{}; + TNL_ASSERT_EQ( getStorageSize(), 0, "Failed to reset the sizes." ); + array.reset(); + } + + // "safe" accessor - will do slow copy from device + template< typename... IndexTypes > + ValueType + getElement( IndexTypes&&... 
indices ) const + { + static_assert( sizeof...( indices ) == getDimension(), "got wrong number of indices" ); + __ndarray_impl::assertIndicesInBounds( getSizes(), std::forward< IndexTypes >( indices )... ); + TNL_ASSERT_LT( getStorageIndex( std::forward< IndexTypes >( indices )... ), getStorageSize(), + "storage index out of bounds - either input error or a bug in the indexer" ); + return array.getElement( getStorageIndex( std::forward< IndexTypes >( indices )... ) ); + } + + const StorageArray& getStorageArray() const + { + return array; + } + + StorageArray& getStorageArray() + { + return array; + } + + void setValue( ValueType value ) + { + array.setValue( value ); + } + +protected: + StorageArray array; + IndexerType indexer; +}; + +template< typename Value, + typename SizesHolder, + typename Permutation = std::make_index_sequence< SizesHolder::getDimension() >, // identity by default + typename Device = Devices::Host, + typename Index = typename SizesHolder::IndexType > +class NDArray +: public NDArrayStorage< Array< Value, Device, Index >, + SizesHolder, + Permutation, + __ndarray_impl::NDArrayBase< SliceInfo< 0, 0 > > > +{ + using Base = NDArrayStorage< Array< Value, Device, Index >, + SizesHolder, + Permutation, + __ndarray_impl::NDArrayBase< SliceInfo< 0, 0 > > >; + +public: + // inherit all assignment operators + using Base::operator=; +}; + +template< typename Value, + typename SizesHolder, + typename Permutation = std::make_index_sequence< SizesHolder::getDimension() >, // identity by default + typename Index = typename SizesHolder::IndexType > +class StaticNDArray +: public NDArrayStorage< StaticArray< __ndarray_impl::StaticStorageSizeGetter< SizesHolder >::get(), Value >, + SizesHolder, + Permutation, + __ndarray_impl::NDArrayBase< SliceInfo< 0, 0 > >, + void > +{ + using Base = NDArrayStorage< StaticArray< __ndarray_impl::StaticStorageSizeGetter< SizesHolder >::get(), Value >, + SizesHolder, + Permutation, + __ndarray_impl::NDArrayBase< SliceInfo< 0, 
0 > >, + void >; + static_assert( __ndarray_impl::StaticStorageSizeGetter< SizesHolder >::get() > 0, + "All dimensions of a static array must to be positive." ); + +public: + // inherit all assignment operators + using Base::operator=; +}; + +template< typename Value, + std::size_t Rows, + std::size_t Columns, + typename Permutation = std::index_sequence< 0, 1 > > // identity by default +class StaticMatrix +: public StaticNDArray< Value, + SizesHolder< std::size_t, Rows, Columns >, + Permutation > +{ + using Base = StaticNDArray< Value, + SizesHolder< std::size_t, Rows, Columns >, + Permutation >; + +public: + // inherit all assignment operators + using Base::operator=; + + static constexpr std::size_t getRows() + { + return Rows; + } + + __cuda_callable__ + static constexpr std::size_t getColumns() + { + return Columns; + } +}; + +template< typename Value, + typename SizesHolder, + typename Permutation = std::make_index_sequence< SizesHolder::getDimension() >, // identity by default + typename SliceInfo = SliceInfo<>, // no slicing by default + typename Device = Devices::Host, + typename Index = typename SizesHolder::IndexType > +class SlicedNDArray +: public NDArrayStorage< Array< Value, Device, Index >, + SizesHolder, + Permutation, + __ndarray_impl::SlicedNDArrayBase< SliceInfo > > +{ + using Base = NDArrayStorage< Array< Value, Device, Index >, + SizesHolder, + Permutation, + __ndarray_impl::SlicedNDArrayBase< SliceInfo > >; + +public: + // inherit all assignment operators + using Base::operator=; +}; + +} // namespace Containers +} // namespace TNL diff --git a/src/TNL/Containers/NDArrayIndexer.h b/src/TNL/Containers/NDArrayIndexer.h new file mode 100644 index 0000000000000000000000000000000000000000..e3f068e0cacb774db90122c25617e8531ae52787 --- /dev/null +++ b/src/TNL/Containers/NDArrayIndexer.h @@ -0,0 +1,112 @@ +/*************************************************************************** + NDArrayIndexer.h - description + ------------------- + begin : Apr 
14, 2019
 copyright            : (C) 2019 by Tomas Oberhuber et al.
 email                : tomas.oberhuber@fjfi.cvut.cz
 ***************************************************************************/

/* See Copyright Notice in tnl/Copyright */

// Implemented by: Jakub Klinkovsky

#pragma once

// NOTE(review): the angle-bracket targets of these #include directives were
// stripped when this diff was flattened to plain text; the trailing comments
// indicate they provided StorageSizeGetter and DummyStrideBase — restore the
// original header paths from the repository before compiling.
#include
#include // StorageSizeGetter
#include // DummyStrideBase

namespace TNL {
namespace Containers {

// Translates a multi-dimensional index into a flat storage offset.
// The dynamic sizes are stored by value; the strides are inherited from
// StridesHolder (the default DummyStrideBase is presumably empty, so a
// contiguous indexer adds no per-stride storage — TODO confirm in the
// StridesHolder header). The Base policy class implements the actual
// offset computation; Permutation fixes the storage order of the axes and
// Overlaps adds ghost regions per axis (default: none).
template< typename SizesHolder,
          typename Permutation,
          typename Base,
          typename StridesHolder = __ndarray_impl::DummyStrideBase< typename SizesHolder::IndexType, SizesHolder::getDimension() >,
          typename Overlaps = __ndarray_impl::make_constant_index_sequence< SizesHolder::getDimension(), 0 > >
class NDArrayIndexer
: public StridesHolder
{
public:
   using IndexType = typename SizesHolder::IndexType;
   using NDBaseType = Base;
   using SizesHolderType = SizesHolder;
   using StridesHolderType = StridesHolder;
   using PermutationType = Permutation;
   using OverlapsType = Overlaps;

   // All per-dimension metadata must agree on the rank.
   static_assert( StridesHolder::getDimension() == SizesHolder::getDimension(),
                  "Dimension of strides does not match the dimension of sizes." );
   static_assert( Permutation::size() == SizesHolder::getDimension(),
                  "Dimension of permutation does not match the dimension of sizes." );
   static_assert( Overlaps::size() == SizesHolder::getDimension(),
                  "Dimension of overlaps does not match the dimension of sizes." );

   __cuda_callable__
   NDArrayIndexer() = default;

   // explicit initialization by sizes and strides
   __cuda_callable__
   NDArrayIndexer( SizesHolder sizes, StridesHolder strides )
   : StridesHolder(strides), sizes(sizes) {}

   // Rank of the array (number of dimensions).
   static constexpr std::size_t getDimension()
   {
      return SizesHolder::getDimension();
   }

   __cuda_callable__
   const SizesHolderType& getSizes() const
   {
      return sizes;
   }

   // Size of one dimension, selected at compile time.
   template< std::size_t level >
   __cuda_callable__
   IndexType getSize() const
   {
      return sizes.template getSize< level >();
   }

   // method template from base class
   using StridesHolder::getStride;

   // Compile-time overlap (ghost-region width) of the given dimension.
   template< std::size_t level >
   static constexpr std::size_t getOverlap()
   {
      return __ndarray_impl::get< level >( Overlaps{} );
   }

   // returns the product of the aligned sizes (i.e. the number of elements
   // that must be allocated, which may exceed the product of the logical
   // sizes due to alignment and overlaps)
   __cuda_callable__
   IndexType getStorageSize() const
   {
      using Alignment = typename Base::template Alignment< Permutation >;
      return __ndarray_impl::StorageSizeGetter< SizesHolder, Alignment, Overlaps >::get( sizes );
   }

   // Flat offset of the element at the given multi-index; the heavy lifting
   // (permutation, overlaps, strides) is delegated to the Base policy class.
   template< typename... IndexTypes >
   __cuda_callable__
   IndexType
   getStorageIndex( IndexTypes&&... indices ) const
   {
      static_assert( sizeof...( indices ) == SizesHolder::getDimension(), "got wrong number of indices" );
      return Base::template getStorageIndex< Permutation, Overlaps >
             ( sizes,
               static_cast< const StridesHolder& >( *this ),
               std::forward< IndexTypes >( indices )... );
   }

protected:
   // non-const reference accessor cannot be public - only subclasses like NDArrayStorage may modify the sizes
   __cuda_callable__
   SizesHolderType& getSizes()
   {
      return sizes;
   }

   SizesHolder sizes;
};

} // namespace Containers
} // namespace TNL
diff --git a/src/TNL/Containers/NDArrayView.h b/src/TNL/Containers/NDArrayView.h new file mode 100644 index 0000000000000000000000000000000000000000..54a020a64e5b5130400c6e954ae92840cb0fe1df --- /dev/null +++ b/src/TNL/Containers/NDArrayView.h
/***************************************************************************
 NDArrayView.h  -  description
 -------------------
 begin                : Dec 24, 2018
 copyright            : (C) 2018 by Tomas Oberhuber et al.
 email                : tomas.oberhuber@fjfi.cvut.cz
 ***************************************************************************/

/* See Copyright Notice in tnl/Copyright */

// Implemented by: Jakub Klinkovsky

#pragma once

// NOTE(review): include targets were lost in the text mangling — restore the
// original header paths from the repository before compiling.
#include
#include
#include
#include
#include
#include
#include

namespace TNL {
namespace Containers {

// Non-owning view of an N-dimensional array: a raw data pointer plus an
// inherited NDArrayIndexer (sizes + strides). Views have shallow copy
// semantics for construction (so they can be passed by value into CUDA
// kernels) and deep copy semantics for assignment.
template< typename Value,
          typename Device,
          typename SizesHolder,
          typename Permutation,
          typename Base,
          typename StridesHolder = __ndarray_impl::DummyStrideBase< typename SizesHolder::IndexType, SizesHolder::getDimension() > >
class NDArrayView
: public NDArrayIndexer< SizesHolder, Permutation, Base, StridesHolder >
{
public:
   using ValueType = Value;
   using DeviceType = Device;
   using IndexType = typename SizesHolder::IndexType;
   using SizesHolderType = SizesHolder;
   using PermutationType = Permutation;
   using IndexerType = NDArrayIndexer< SizesHolder, Permutation, Base, StridesHolder >;
   using ViewType = NDArrayView< Value, Device, SizesHolder, Permutation, Base, StridesHolder >;
   using ConstViewType = NDArrayView< std::add_const_t< Value >, Device, SizesHolder, Permutation, Base, StridesHolder >;

   static_assert( Permutation::size() == SizesHolder::getDimension(), "invalid
permutation" );

   __cuda_callable__
   NDArrayView() = default;

   // explicit initialization by raw data pointer and sizes and strides
   __cuda_callable__
   NDArrayView( Value* data, SizesHolder sizes, StridesHolder strides = StridesHolder{} )
   : IndexerType(sizes, strides), array(data) {}

   // explicit initialization by raw data pointer and indexer
   __cuda_callable__
   NDArrayView( Value* data, IndexerType indexer )
   : IndexerType(indexer), array(data) {}

   // Copy-constructor does shallow copy, so views can be passed-by-value into
   // CUDA kernels and they can be captured-by-value in __cuda_callable__
   // lambda functions.
   __cuda_callable__
   NDArrayView( const NDArrayView& ) = default;

   // default move-constructor
   __cuda_callable__
   NDArrayView( NDArrayView&& ) = default;

   // Copy-assignment does deep copy, just like regular array, but the sizes
   // must match (i.e. copy-assignment cannot resize).
   __cuda_callable__
   NDArrayView& operator=( const NDArrayView& other )
   {
      TNL_ASSERT_EQ( getSizes(), other.getSizes(), "The sizes of the array views must be equal, views are not resizable." );
      if( getStorageSize() > 0 )
         Algorithms::ArrayOperations< DeviceType >::copy( array, other.array, getStorageSize() );
      return *this;
   }

   // Templated copy-assignment: deep copy from a view that may live on a
   // different device (the cross-device transfer is delegated to
   // ArrayOperations< dest, src >). Only contiguous views with the same
   // permutation can be assigned this way.
   template< typename OtherView >
   NDArrayView& operator=( const OtherView& other )
   {
      static_assert( std::is_same< PermutationType, typename OtherView::PermutationType >::value,
                     "Arrays must have the same permutation of indices." );
      static_assert( NDArrayView::isContiguous() && OtherView::isContiguous(),
                     "Non-contiguous array views cannot be assigned." );
      TNL_ASSERT_TRUE( __ndarray_impl::sizesWeakCompare( getSizes(), other.getSizes() ),
                       "The sizes of the array views must be equal, views are not resizable." );
      if( getStorageSize() > 0 ) {
         TNL_ASSERT_TRUE( array, "Attempted to assign to an empty view." );
         Algorithms::ArrayOperations< DeviceType, typename OtherView::DeviceType >::copy( array, other.getData(), getStorageSize() );
      }
      return *this;
   }

   // There is no move-assignment operator, so expressions like `a = b.getView()`
   // are resolved as copy-assignment.

   // methods for rebinding (reinitialization)
   __cuda_callable__
   void bind( NDArrayView view )
   {
      IndexerType::operator=( view );
      array = view.array;
   }

   // binds to the given raw pointer and changes the indexer
   __cuda_callable__
   void bind( Value* data, IndexerType indexer )
   {
      IndexerType::operator=( indexer );
      array = data;
   }

   // binds to the given raw pointer and preserves the current indexer
   __cuda_callable__
   void bind( Value* data )
   {
      array = data;
   }

   // Detaches the view: null data pointer, default-constructed indexer.
   __cuda_callable__
   void reset()
   {
      IndexerType::operator=( IndexerType{} );
      array = nullptr;
   }

   // Deep element-wise comparison (sizes first, then storage contents).
   __cuda_callable__
   bool operator==( const NDArrayView& other ) const
   {
      if( getSizes() != other.getSizes() )
         return false;
      // FIXME: uninitialized data due to alignment in NDArray and padding in SlicedNDArray
      return Algorithms::ArrayOperations< Device, Device >::compare( array, other.array, getStorageSize() );
   }

   __cuda_callable__
   bool operator!=( const NDArrayView& other ) const
   {
      if( getSizes() != other.getSizes() )
         return true;
      // FIXME: uninitialized data due to alignment in NDArray and padding in SlicedNDArray
      return ! Algorithms::ArrayOperations< Device, Device >::compare( array, other.array, getStorageSize() );
   }

   __cuda_callable__
   ValueType* getData()
   {
      return array;
   }

   __cuda_callable__
   std::add_const_t< ValueType >* getData() const
   {
      return array;
   }

   // methods from the base class
   using IndexerType::getDimension;
   using IndexerType::getSizes;
   using IndexerType::getSize;
   using IndexerType::getStride;
   using IndexerType::getStorageSize;
   using IndexerType::getStorageIndex;

   // The view *is* its indexer (inheritance), so just upcast.
   __cuda_callable__
   const IndexerType& getIndexer() const
   {
      return *this;
   }

   __cuda_callable__
   ViewType getView()
   {
      return ViewType( *this );
   }

   __cuda_callable__
   ConstViewType getConstView() const
   {
      return ConstViewType( array, getSizes(), static_cast< const StridesHolder& >( *this ) );
   }

   // Returns a lower-dimensional, possibly strided view of the subarray
   // spanned by the template-selected Dimensions and anchored at the given
   // indices. The Dimensions must be given in increasing order (checked
   // below, except under nvcc).
   template< std::size_t... Dimensions, typename... IndexTypes >
   __cuda_callable__
   auto getSubarrayView( IndexTypes&&... indices )
   {
      static_assert( sizeof...( indices ) == getDimension(), "got wrong number of indices" );
      static_assert( 0 < sizeof...(Dimensions) && sizeof...(Dimensions) <= getDimension(), "got wrong number of dimensions" );
      static_assert( __ndarray_impl::all_elements_in_range( 0, Permutation::size(), {Dimensions...} ),
                     "invalid dimensions" );
// FIXME: nvcc chokes on the variadic brace-initialization
#ifndef __NVCC__
      static_assert( __ndarray_impl::is_increasing_sequence( {Dimensions...} ),
                     "specifying permuted dimensions is not supported" );
#endif

      using Getter = __ndarray_impl::SubarrayGetter< Base, Permutation, Dimensions... >;
      using Subpermutation = typename Getter::Subpermutation;
      // address of the anchor element becomes the new view's base pointer
      auto& begin = operator()( std::forward< IndexTypes >( indices )... );
      auto subarray_sizes = Getter::filterSizes( getSizes(), std::forward< IndexTypes >( indices )... );
      auto strides = Getter::getStrides( getSizes(), std::forward< IndexTypes >( indices )... );
      static_assert( Subpermutation::size() == sizeof...(Dimensions), "Bug - wrong subpermutation length." );
      static_assert( decltype(subarray_sizes)::getDimension() == sizeof...(Dimensions), "Bug - wrong dimension of the new sizes." );
      static_assert( decltype(strides)::getDimension() == sizeof...(Dimensions), "Bug - wrong dimension of the strides." );
      using SubarrayView = NDArrayView< ValueType, Device, decltype(subarray_sizes), Subpermutation, Base, decltype(strides) >;
      return SubarrayView{ &begin, subarray_sizes, strides };
   }

   // Element access; indices are given in the logical (unpermuted) order.
   template< typename... IndexTypes >
   __cuda_callable__
   ValueType&
   operator()( IndexTypes&&... indices )
   {
      static_assert( sizeof...( indices ) == getDimension(), "got wrong number of indices" );
      __ndarray_impl::assertIndicesInBounds( getSizes(), std::forward< IndexTypes >( indices )... );
      return array[ getStorageIndex( std::forward< IndexTypes >( indices )... ) ];
   }

   template< typename... IndexTypes >
   __cuda_callable__
   const ValueType&
   operator()( IndexTypes&&... indices ) const
   {
      static_assert( sizeof...( indices ) == getDimension(), "got wrong number of indices" );
      __ndarray_impl::assertIndicesInBounds( getSizes(), std::forward< IndexTypes >( indices )... );
      return array[ getStorageIndex( std::forward< IndexTypes >( indices )...
) ]; + } + + // bracket operator for 1D arrays + __cuda_callable__ + ValueType& + operator[]( IndexType&& index ) + { + static_assert( getDimension() == 1, "the access via operator[] is provided only for 1D arrays" ); + __ndarray_impl::assertIndicesInBounds( getSizes(), std::forward< IndexType >( index ) ); + return array[ index ]; + } + + __cuda_callable__ + const ValueType& + operator[]( IndexType index ) const + { + static_assert( getDimension() == 1, "the access via operator[] is provided only for 1D arrays" ); + __ndarray_impl::assertIndicesInBounds( getSizes(), std::forward< IndexType >( index ) ); + return array[ index ]; + } + + template< typename Device2 = DeviceType, typename Func > + void forAll( Func f ) const + { + __ndarray_impl::ExecutorDispatcher< PermutationType, Device2 > dispatch; + using Begins = ConstStaticSizesHolder< IndexType, getDimension(), 0 >; + dispatch( Begins{}, getSizes(), f ); + } + + template< typename Device2 = DeviceType, typename Func > + void forInternal( Func f ) const + { + __ndarray_impl::ExecutorDispatcher< PermutationType, Device2 > dispatch; + using Begins = ConstStaticSizesHolder< IndexType, getDimension(), 1 >; + // subtract static sizes + using Ends = typename __ndarray_impl::SubtractedSizesHolder< SizesHolder, 1 >::type; + // subtract dynamic sizes + Ends ends; + __ndarray_impl::SetSizesSubtractHelper< 1, Ends, SizesHolder >::subtract( ends, getSizes() ); + dispatch( Begins{}, ends, f ); + } + + template< typename Device2 = DeviceType, typename Func, typename Begins, typename Ends > + void forInternal( Func f, const Begins& begins, const Ends& ends ) const + { + // TODO: assert "begins <= getSizes()", "ends <= getSizes()" + __ndarray_impl::ExecutorDispatcher< PermutationType, Device2 > dispatch; + dispatch( begins, ends, f ); + } + + template< typename Device2 = DeviceType, typename Func > + void forBoundary( Func f ) const + { + using Begins = ConstStaticSizesHolder< IndexType, getDimension(), 0 >; + using SkipBegins 
= ConstStaticSizesHolder< IndexType, getDimension(), 1 >; + // subtract static sizes + using SkipEnds = typename __ndarray_impl::SubtractedSizesHolder< SizesHolder, 1 >::type; + // subtract dynamic sizes + SkipEnds skipEnds; + __ndarray_impl::SetSizesSubtractHelper< 1, SkipEnds, SizesHolder >::subtract( skipEnds, getSizes() ); + + __ndarray_impl::BoundaryExecutorDispatcher< PermutationType, Device2 > dispatch; + dispatch( Begins{}, SkipBegins{}, skipEnds, getSizes(), f ); + } + + template< typename Device2 = DeviceType, typename Func, typename SkipBegins, typename SkipEnds > + void forBoundary( Func f, const SkipBegins& skipBegins, const SkipEnds& skipEnds ) const + { + // TODO: assert "skipBegins <= getSizes()", "skipEnds <= getSizes()" + using Begins = ConstStaticSizesHolder< IndexType, getDimension(), 0 >; + __ndarray_impl::BoundaryExecutorDispatcher< PermutationType, Device2 > dispatch; + dispatch( Begins{}, skipBegins, skipEnds, getSizes(), f ); + } + +protected: + Value* array = nullptr; + IndexerType indexer; +}; + +} // namespace Containers +} // namespace TNL diff --git a/src/TNL/Containers/ndarray/BoundaryExecutors.h b/src/TNL/Containers/ndarray/BoundaryExecutors.h new file mode 100644 index 0000000000000000000000000000000000000000..e4cd93705c7ae83dd36378662fa67b2e618f66eb --- /dev/null +++ b/src/TNL/Containers/ndarray/BoundaryExecutors.h @@ -0,0 +1,413 @@ +/*************************************************************************** + BoundaryExecutors.h - description + ------------------- + begin : Feb 09, 2019 + copyright : (C) 2019 by Tomas Oberhuber et al. 
 email                : tomas.oberhuber@fjfi.cvut.cz
 ***************************************************************************/

/* See Copyright Notice in tnl/Copyright */

// Implemented by: Jakub Klinkovsky

#pragma once

// NOTE(review): the include targets were lost when this diff was flattened to
// plain text; restore the original header paths before compiling.
#include

#include
#include

namespace TNL {
namespace Containers {
namespace __ndarray_impl {

// Recursive helper that iterates over the boundary shell of the box
// [begins, ends) \ [skipBegins, skipEnds), one permuted dimension
// (LevelTag) per recursion step. The runtime `level` selects which
// dimension contributes the two boundary strips in the current pass:
//   - at the dimension equal to `level`: iterate only the two strips
//     [begin, skipBegin) and [skipEnd, end),
//   - at dimensions already processed in earlier passes (LevelTag < level):
//     iterate only the interior [skipBegin, skipEnd),
//   - at dimensions after `level`: iterate the full range [begin, end).
// Looping `level` over all dimensions thus covers the boundary exactly once.
template< typename Permutation,
          typename LevelTag = IndexTag< 0 > >
struct SequentialBoundaryExecutor_inner
{
   template< typename Begins,
             typename SkipBegins,
             typename SkipEnds,
             typename Ends,
             typename Func,
             typename... Indices >
   __cuda_callable__
   void operator()( const Begins& begins,
                    const SkipBegins& skipBegins,
                    const SkipEnds& skipEnds,
                    const Ends& ends,
                    std::size_t level,
                    Func f,
                    Indices&&... indices )
   {
      static_assert( Begins::getDimension() == Ends::getDimension(),
                     "wrong begins or ends" );

      SequentialBoundaryExecutor_inner< Permutation, IndexTag< LevelTag::value + 1 > > exec;
      const auto begin = begins.template getSize< get< LevelTag::value >( Permutation{} ) >();
      const auto skipBegin = skipBegins.template getSize< get< LevelTag::value >( Permutation{} ) >();
      const auto skipEnd = skipEnds.template getSize< get< LevelTag::value >( Permutation{} ) >();
      const auto end = ends.template getSize< get< LevelTag::value >( Permutation{} ) >();
      if( level == LevelTag::value ) {
         // this dimension contributes the two boundary strips
         for( auto i = begin; i < skipBegin; i++ )
            exec( begins, skipBegins, skipEnds, ends, level, f, std::forward< Indices >( indices )..., i );
         for( auto i = skipEnd; i < end; i++ )
            exec( begins, skipBegins, skipEnds, ends, level, f, std::forward< Indices >( indices )..., i );
      }
      else if( level > LevelTag::value ) {
         // dimension handled by an earlier pass: restrict to the interior
         for( auto i = skipBegin; i < skipEnd; i++ )
            exec( begins, skipBegins, skipEnds, ends, level, f, std::forward< Indices >( indices )..., i );
      }
      else {
         // dimension handled by a later pass: iterate the full range
         for( auto i = begin; i < end; i++ )
            exec( begins, skipBegins, skipEnds, ends, level, f, std::forward< Indices >( indices )..., i );
      }
   }
};

// Final recursion step: instead of recursing, invoke the functor with the
// collected indices, un-permuted back into logical order.
template< typename Permutation >
struct SequentialBoundaryExecutor_inner< Permutation, IndexTag< Permutation::size() - 1 > >
{
   template< typename Begins,
             typename SkipBegins,
             typename SkipEnds,
             typename Ends,
             typename Func,
             typename... Indices >
   __cuda_callable__
   void operator()( const Begins& begins,
                    const SkipBegins& skipBegins,
                    const SkipEnds& skipEnds,
                    const Ends& ends,
                    std::size_t level,
                    Func f,
                    Indices&&... indices )
   {
      static_assert( Begins::getDimension() == Ends::getDimension(),
                     "wrong begins or ends" );
      static_assert( sizeof...(indices) == Begins::getDimension() - 1,
                     "invalid number of indices in the final step of the SequentialBoundaryExecutor" );

      using LevelTag = IndexTag< Permutation::size() - 1 >;

      const auto begin = begins.template getSize< get< LevelTag::value >( Permutation{} ) >();
      const auto skipBegin = skipBegins.template getSize< get< LevelTag::value >( Permutation{} ) >();
      const auto skipEnd = skipEnds.template getSize< get< LevelTag::value >( Permutation{} ) >();
      const auto end = ends.template getSize< get< LevelTag::value >( Permutation{} ) >();
      if( level == LevelTag::value ) {
         for( auto i = begin; i < skipBegin; i++ )
            call_with_unpermuted_arguments< Permutation >( f, std::forward< Indices >( indices )..., i );
         for( auto i = skipEnd; i < end; i++ )
            call_with_unpermuted_arguments< Permutation >( f, std::forward< Indices >( indices )..., i );
      }
      else if( level > LevelTag::value ) {
         for( auto i = skipBegin; i < skipEnd; i++ )
            call_with_unpermuted_arguments< Permutation >( f, std::forward< Indices >( indices )..., i );
      }
      else {
         for( auto i = begin; i < end; i++ )
            call_with_unpermuted_arguments< Permutation >( f, std::forward< Indices >( indices )..., i );
      }
   }
};

// Entry point for sequential boundary iteration: runs one pass of the inner
// executor per dimension (see above for the covering argument).
template< typename Permutation,
          std::size_t dim = Permutation::size() >
struct SequentialBoundaryExecutor
{
   template< typename Begins,
             typename SkipBegins,
             typename SkipEnds,
             typename Ends,
             typename Func >
   __cuda_callable__
   void operator()( const Begins& begins,
                    const SkipBegins& skipBegins,
                    const SkipEnds& skipEnds,
                    const Ends& ends,
                    Func f )
   {
      static_assert( Begins::getDimension() == Ends::getDimension(),
                     "wrong begins or ends" );

      SequentialBoundaryExecutor_inner< Permutation > exec;
      for( std::size_t level = 0; level < Permutation::size(); level++ )
         exec( begins, skipBegins, skipEnds, ends, level, f );
   }
};

// NOTE(review): this specialization is selected for dim == 0, yet its body
// reads the sizes at level 0 of the permutation and invokes f with a single
// index — that is the 1D iteration pattern, which suggests the intended
// specialization argument is 1, not 0. Verify against the upstream history.
template< typename Permutation >
struct SequentialBoundaryExecutor< Permutation, 0 >
{
   template< typename Begins,
             typename SkipBegins,
             typename SkipEnds,
             typename Ends,
             typename Func >
   __cuda_callable__
   void operator()( const Begins& begins,
                    const SkipBegins& skipBegins,
                    const SkipEnds& skipEnds,
                    const Ends& ends,
                    Func f )
   {
      static_assert( Begins::getDimension() == Ends::getDimension(),
                     "wrong begins or ends" );

      const auto begin = begins.template getSize< get< 0 >( Permutation{} ) >();
      const auto skipBegin = skipBegins.template getSize< get< 0 >( Permutation{} ) >();
      const auto skipEnd = skipEnds.template getSize< get< 0 >( Permutation{} ) >();
      const auto end = ends.template getSize< get< 0 >( Permutation{} ) >();
      for( auto i = begin; i < skipBegin; i++ )
         f( i );
      for( auto i = skipEnd; i < end; i++ )
         f( i );
   }
};


// Primary template: a compile-time error trap for rank > 3, for which the
// parallel specializations below do not exist.
template< typename Permutation,
          typename Device,
          typename DimTag = IndexTag< Permutation::size() > >
struct ParallelBoundaryExecutor
{
   template< typename Begins,
             typename SkipBegins,
             typename SkipEnds,
             typename Ends,
             typename Func >
   void operator()( const Begins& begins,
                    const SkipBegins& skipBegins,
                    const SkipEnds& skipEnds,
                    const Ends& ends,
                    Func f )
   {
      static_assert( Permutation::size() <= 3, "ParallelBoundaryExecutor is implemented only for 1D, 2D, and 3D."
);
   }
};

// 3D: the boundary shell is decomposed into six disjoint box-shaped faces
// (two slabs along the outermost permuted axis, two strips along the middle
// axis restricted to the interior of the first, and two strips along the
// innermost axis restricted to the interior of both), each dispatched as a
// separate ParallelFor3D launch.
template< typename Permutation,
          typename Device >
struct ParallelBoundaryExecutor< Permutation, Device, IndexTag< 3 > >
{
   template< typename Begins,
             typename SkipBegins,
             typename SkipEnds,
             typename Ends,
             typename Func >
   void operator()( const Begins& begins,
                    const SkipBegins& skipBegins,
                    const SkipEnds& skipEnds,
                    const Ends& ends,
                    Func f )
   {
      static_assert( Begins::getDimension() == Ends::getDimension(),
                     "wrong begins or ends" );

      // nvcc does not like nested __cuda_callable__ and normal lambdas...
//      using Index = typename Ends::IndexType;
//      auto kernel = [=] __cuda_callable__ ( Index i2, Index i1, Index i0 )
//      {
//         call_with_unpermuted_arguments< Permutation >( f, i0, i1, i2 );
//      };
      Kernel< Device > kernel;

      const auto begin0 = begins.template getSize< get< 0 >( Permutation{} ) >();
      const auto begin1 = begins.template getSize< get< 1 >( Permutation{} ) >();
      const auto begin2 = begins.template getSize< get< 2 >( Permutation{} ) >();
      const auto skipBegin0 = skipBegins.template getSize< get< 0 >( Permutation{} ) >();
      const auto skipBegin1 = skipBegins.template getSize< get< 1 >( Permutation{} ) >();
      const auto skipBegin2 = skipBegins.template getSize< get< 2 >( Permutation{} ) >();
      const auto skipEnd0 = skipEnds.template getSize< get< 0 >( Permutation{} ) >();
      const auto skipEnd1 = skipEnds.template getSize< get< 1 >( Permutation{} ) >();
      const auto skipEnd2 = skipEnds.template getSize< get< 2 >( Permutation{} ) >();
      const auto end0 = ends.template getSize< get< 0 >( Permutation{} ) >();
      const auto end1 = ends.template getSize< get< 1 >( Permutation{} ) >();
      const auto end2 = ends.template getSize< get< 2 >( Permutation{} ) >();

      // bottom/top slabs along axis 2, then side strips along axis 1
      // (restricted to the axis-2 interior), then front/back strips along
      // axis 0 (restricted to the axis-2 and axis-1 interior)
      ParallelFor3D< Device >::exec( begin2, begin1, begin0, skipBegin2, end1, end0, kernel, f );
      ParallelFor3D< Device >::exec( skipEnd2, begin1, begin0, end2, end1, end0, kernel, f );
      ParallelFor3D< Device >::exec( skipBegin2, begin1, begin0, skipEnd2, skipBegin1, end0, kernel, f );
      ParallelFor3D< Device >::exec( skipBegin2, skipEnd1, begin0, skipEnd2, end1, end0, kernel, f );
      ParallelFor3D< Device >::exec( skipBegin2, skipBegin1, begin0, skipEnd2, skipEnd1, skipBegin0, kernel, f );
      ParallelFor3D< Device >::exec( skipBegin2, skipBegin1, skipEnd0, skipEnd2, skipEnd1, end0, kernel, f );
   }

   // Member templates cannot be explicitly specialized in class scope, hence
   // the extra dummy parameter enabling partial specialization below.
   template< typename __Device, typename = void >
   struct Kernel
   {
      template< typename Index, typename Func >
      void operator()( Index i2, Index i1, Index i0, Func f )
      {
         call_with_unpermuted_arguments< Permutation >( f, i0, i1, i2 );
      };
   };

   // dummy specialization to avoid a shitpile of nvcc warnings
   template< typename __unused >
   struct Kernel< Devices::Cuda, __unused >
   {
      template< typename Index, typename Func >
      __cuda_callable__
      void operator()( Index i2, Index i1, Index i0, Func f )
      {
         call_with_unpermuted_arguments< Permutation >( f, i0, i1, i2 );
      };
   };
};

// 2D: the boundary is decomposed into four disjoint rectangles — two full
// strips along axis 1 and two strips along axis 0 restricted to the axis-1
// interior.
template< typename Permutation,
          typename Device >
struct ParallelBoundaryExecutor< Permutation, Device, IndexTag< 2 > >
{
   template< typename Begins,
             typename SkipBegins,
             typename SkipEnds,
             typename Ends,
             typename Func >
   void operator()( const Begins& begins,
                    const SkipBegins& skipBegins,
                    const SkipEnds& skipEnds,
                    const Ends& ends,
                    Func f )
   {
      static_assert( Begins::getDimension() == Ends::getDimension(),
                     "wrong begins or ends" );

      // nvcc does not like nested __cuda_callable__ and normal lambdas...
//      using Index = typename Ends::IndexType;
//      auto kernel = [=] __cuda_callable__ ( Index i1, Index i0 )
//      {
//         call_with_unpermuted_arguments< Permutation >( f, i0, i1 );
//      };
      Kernel< Device > kernel;

      const auto begin0 = begins.template getSize< get< 0 >( Permutation{} ) >();
      const auto begin1 = begins.template getSize< get< 1 >( Permutation{} ) >();
      const auto skipBegin0 = skipBegins.template getSize< get< 0 >( Permutation{} ) >();
      const auto skipBegin1 = skipBegins.template getSize< get< 1 >( Permutation{} ) >();
      const auto skipEnd0 = skipEnds.template getSize< get< 0 >( Permutation{} ) >();
      const auto skipEnd1 = skipEnds.template getSize< get< 1 >( Permutation{} ) >();
      const auto end0 = ends.template getSize< get< 0 >( Permutation{} ) >();
      const auto end1 = ends.template getSize< get< 1 >( Permutation{} ) >();

      ParallelFor2D< Device >::exec( begin1, begin0, skipBegin1, end0, kernel, f );
      ParallelFor2D< Device >::exec( skipEnd1, begin0, end1, end0, kernel, f );
      ParallelFor2D< Device >::exec( skipBegin1, begin0, skipEnd1, skipBegin0, kernel, f );
      ParallelFor2D< Device >::exec( skipBegin1, skipEnd0, skipEnd1, end0, kernel, f );
   }

   template< typename __Device, typename = void >
   struct Kernel
   {
      template< typename Index, typename Func >
      void operator()( Index i1, Index i0, Func f )
      {
         call_with_unpermuted_arguments< Permutation >( f, i0, i1 );
      };
   };

   // dummy specialization to avoid a shitpile of nvcc warnings
   template< typename __unused >
   struct Kernel< Devices::Cuda, __unused >
   {
      template< typename Index, typename Func >
      __cuda_callable__
      void operator()( Index i1, Index i0, Func f )
      {
         call_with_unpermuted_arguments< Permutation >( f, i0, i1 );
      };
   };
};

// 1D: the boundary is just the two end intervals; the functor is passed
// directly (presumably un-permuting a single index is the identity — TODO
// confirm against call_with_unpermuted_arguments).
template< typename Permutation,
          typename Device >
struct ParallelBoundaryExecutor< Permutation, Device, IndexTag< 1 > >
{
   template< typename Begins,
             typename SkipBegins,
             typename SkipEnds,
             typename Ends,
             typename Func >
   void operator()( const Begins& begins,
                    const SkipBegins& skipBegins,
                    const SkipEnds& skipEnds,
                    const Ends& ends,
                    Func f )
   {
      static_assert( Begins::getDimension() == Ends::getDimension(),
                     "wrong begins or ends" );

      const auto begin = begins.template getSize< get< 0 >( Permutation{} ) >();
      const auto skipBegin = skipBegins.template getSize< get< 0 >( Permutation{} ) >();
      const auto skipEnd = skipEnds.template getSize< get< 0 >( Permutation{} ) >();
      const auto end = ends.template getSize< get< 0 >( Permutation{} ) >();

      ParallelFor< Device >::exec( begin, skipBegin, f );
      ParallelFor< Device >::exec( skipEnd, end, f );
   }
};


// Device may be void which stands for StaticNDArray
template< typename Permutation,
          typename Device >
struct BoundaryExecutorDispatcher
{
   template< typename Begins,
             typename SkipBegins,
             typename SkipEnds,
             typename Ends,
             typename Func >
   void operator()( const Begins& begins,
                    const SkipBegins& skipBegins,
                    const SkipEnds& skipEnds,
                    const Ends& ends,
                    Func f )
   {
      SequentialBoundaryExecutor< Permutation >()( begins, skipBegins, skipEnds, ends, f );
   }
};

// Host: parallelize only when OpenMP is enabled with more than one thread.
template< typename Permutation >
struct BoundaryExecutorDispatcher< Permutation, Devices::Host >
{
   template< typename Begins,
             typename SkipBegins,
             typename SkipEnds,
             typename Ends,
             typename Func >
   void operator()( const Begins& begins,
                    const SkipBegins& skipBegins,
                    const SkipEnds& skipEnds,
                    const Ends& ends,
                    Func f )
   {
      if( Devices::Host::isOMPEnabled() && Devices::Host::getMaxThreadsCount() > 1 )
         ParallelBoundaryExecutor< Permutation, Devices::Host >()( begins, skipBegins, skipEnds, ends, f );
      else
         SequentialBoundaryExecutor< Permutation >()( begins, skipBegins, skipEnds, ends, f );
   }
};

template< typename Permutation >
struct BoundaryExecutorDispatcher< Permutation, Devices::Cuda >
{
   template< typename Begins,
             typename SkipBegins,
             typename SkipEnds,
             typename Ends,
             typename Func >
   void operator()( const Begins& begins,
                    const SkipBegins& skipBegins,
                    const SkipEnds& skipEnds,
                    const Ends& ends,
                    Func f )
   {
      ParallelBoundaryExecutor< Permutation, Devices::Cuda >()( begins, skipBegins, skipEnds, ends, f );
   }
};

} // namespace __ndarray_impl
} // namespace Containers
} // namespace TNL
diff --git a/src/TNL/Containers/ndarray/Executors.h b/src/TNL/Containers/ndarray/Executors.h new file mode 100644 index 0000000000000000000000000000000000000000..d09b6ec234b8e7ce6a15632f5479130aa0cc1c60 --- /dev/null +++ b/src/TNL/Containers/ndarray/Executors.h
/***************************************************************************
 Executors.h  -  description
 -------------------
 begin                : Dec 24, 2018
 copyright            : (C) 2018 by Tomas Oberhuber et al.
 email                : tomas.oberhuber@fjfi.cvut.cz
 ***************************************************************************/

/* See Copyright Notice in tnl/Copyright */

// Implemented by: Jakub Klinkovsky

#pragma once

// NOTE(review): include targets lost in the text mangling — restore.
#include

#include
#include

namespace TNL {
namespace Containers {
namespace __ndarray_impl {

// Sequential iteration over the full box [begins, ends), recursing from the
// outermost permuted dimension (LevelTag 0) to the innermost and collecting
// one index per level.
template< typename Permutation,
          typename LevelTag = IndexTag< 0 > >
struct SequentialExecutor
{
   template< typename Begins,
             typename Ends,
             typename Func,
             typename... Indices >
   __cuda_callable__
   void operator()( const Begins& begins, const Ends& ends, Func f, Indices&&...
indices )
   {
      static_assert( Begins::getDimension() == Ends::getDimension(),
                     "wrong begins or ends" );

      SequentialExecutor< Permutation, IndexTag< LevelTag::value + 1 > > exec;
      const auto begin = begins.template getSize< get< LevelTag::value >( Permutation{} ) >();
      const auto end = ends.template getSize< get< LevelTag::value >( Permutation{} ) >();
      for( auto i = begin; i < end; i++ )
         exec( begins, ends, f, std::forward< Indices >( indices )..., i );
   }
};

// Final recursion step: invoke the functor with the collected indices,
// un-permuted back into logical order.
template< typename Permutation >
struct SequentialExecutor< Permutation, IndexTag< Permutation::size() - 1 > >
{
   template< typename Begins,
             typename Ends,
             typename Func,
             typename... Indices >
   __cuda_callable__
   void operator()( const Begins& begins, const Ends& ends, Func f, Indices&&... indices )
   {
      static_assert( Begins::getDimension() == Ends::getDimension(),
                     "wrong begins or ends" );
      static_assert( sizeof...(indices) == Begins::getDimension() - 1,
                     "invalid number of indices in the final step of the SequentialExecutor" );

      using LevelTag = IndexTag< Permutation::size() - 1 >;

      const auto begin = begins.template getSize< get< LevelTag::value >( Permutation{} ) >();
      const auto end = ends.template getSize< get< LevelTag::value >( Permutation{} ) >();
      for( auto i = begin; i < end; i++ )
         call_with_unpermuted_arguments< Permutation >( f, std::forward< Indices >( indices )..., i );
   }
};


// Right-to-left variant: recurses from the innermost permuted dimension to
// the outermost and *prepends* each new index, so the collected indices end
// up in the same order as in SequentialExecutor. Used by the CUDA dispatch
// below to iterate the outer dimensions sequentially inside the kernel.
template< typename Permutation,
          typename LevelTag = IndexTag< Permutation::size() - 1 > >
struct SequentialExecutorRTL
{
   template< typename Begins,
             typename Ends,
             typename Func,
             typename... Indices >
   __cuda_callable__
   void operator()( const Begins& begins, const Ends& ends, Func f, Indices&&... indices )
   {
      static_assert( Begins::getDimension() == Ends::getDimension(),
                     "wrong begins or ends" );

      SequentialExecutorRTL< Permutation, IndexTag< LevelTag::value - 1 > > exec;
      const auto begin = begins.template getSize< get< LevelTag::value >( Permutation{} ) >();
      const auto end = ends.template getSize< get< LevelTag::value >( Permutation{} ) >();
      for( auto i = begin; i < end; i++ )
         exec( begins, ends, f, i, std::forward< Indices >( indices )... );
   }
};

template< typename Permutation >
struct SequentialExecutorRTL< Permutation, IndexTag< 0 > >
{
   template< typename Begins,
             typename Ends,
             typename Func,
             typename... Indices >
   __cuda_callable__
   void operator()( const Begins& begins, const Ends& ends, Func f, Indices&&... indices )
   {
      static_assert( Begins::getDimension() == Ends::getDimension(),
                     "wrong begins or ends" );
      static_assert( sizeof...(indices) == Begins::getDimension() - 1,
                     "invalid number of indices in the final step of the SequentialExecutorRTL" );

      const auto begin = begins.template getSize< get< 0 >( Permutation{} ) >();
      const auto end = ends.template getSize< get< 0 >( Permutation{} ) >();
      for( auto i = begin; i < end; i++ )
         call_with_unpermuted_arguments< Permutation >( f, i, std::forward< Indices >( indices )... );
   }
};


// Dispatch for rank > 3. Host path: parallelize the three *outermost*
// permuted dimensions with ParallelFor3D and iterate the rest sequentially
// inside the lambda (SequentialExecutor starting at level 3).
template< typename Permutation,
          typename Device >
struct ParallelExecutorDeviceDispatch
{
   template< typename Begins,
             typename Ends,
             typename Func >
   void operator()( const Begins& begins, const Ends& ends, Func f )
   {
      static_assert( Begins::getDimension() == Ends::getDimension(),
                     "wrong begins or ends" );

      using Index = typename Ends::IndexType;

      auto kernel = [=] ( Index i2, Index i1, Index i0 )
      {
         SequentialExecutor< Permutation, IndexTag< 3 > > exec;
         exec( begins, ends, f, i0, i1, i2 );
      };

      const Index begin0 = begins.template getSize< get< 0 >( Permutation{} ) >();
      const Index begin1 = begins.template getSize< get< 1 >( Permutation{} ) >();
      const Index begin2 = begins.template getSize< get< 2 >( Permutation{} ) >();
      const Index end0 = ends.template getSize< get< 0 >( Permutation{} ) >();
      const Index end1 = ends.template getSize< get< 1 >( Permutation{} ) >();
      const Index end2 = ends.template getSize< get< 2 >( Permutation{} ) >();
      ParallelFor3D< Device >::exec( begin2, begin1, begin0, end2, end1, end0, kernel );
   }
};

// CUDA path for rank > 3: parallelize the three *innermost* permuted
// dimensions (presumably for memory coalescing — TODO confirm) and iterate
// the outer dimensions sequentially via SequentialExecutorRTL, which
// prepends indices so the final argument order matches.
template< typename Permutation >
struct ParallelExecutorDeviceDispatch< Permutation, Devices::Cuda >
{
   template< typename Begins,
             typename Ends,
             typename Func >
   void operator()( const Begins& begins, const Ends& ends, Func f )
   {
      static_assert( Begins::getDimension() == Ends::getDimension(),
                     "wrong begins or ends" );

      using Index = typename Ends::IndexType;

      auto kernel = [=] __cuda_callable__ ( Index i2, Index i1, Index i0 )
      {
         SequentialExecutorRTL< Permutation, IndexTag< Begins::getDimension() - 4 > > exec;
         exec( begins, ends, f, i0, i1, i2 );
      };

      const Index begin0 = begins.template getSize< get< Begins::getDimension() - 3 >( Permutation{} ) >();
      const Index begin1 = begins.template getSize< get< Begins::getDimension() - 2 >( Permutation{} ) >();
      const Index begin2 = begins.template getSize< get< Begins::getDimension() - 1 >( Permutation{} ) >();
      const Index end0 = ends.template getSize< get< Ends::getDimension() - 3 >( Permutation{} ) >();
      const Index end1 = ends.template getSize< get< Ends::getDimension() - 2 >( Permutation{} ) >();
      const Index end2 = ends.template getSize< get< Ends::getDimension() - 1 >( Permutation{} ) >();
      ParallelFor3D< Devices::Cuda >::exec( begin2, begin1, begin0, end2, end1, end0, kernel );
   }
};

// Primary template (rank > 3): defer to the device dispatch above.
template< typename Permutation,
          typename Device,
          typename DimTag = IndexTag< Permutation::size() > >
struct ParallelExecutor
{
   template< typename Begins,
             typename Ends,
             typename Func >
   void operator()( const Begins& begins, const Ends& ends, Func f )
   {
      ParallelExecutorDeviceDispatch< Permutation, Device > dispatch;
      dispatch( begins, ends, f );
   }
};

// 3D: a single ParallelFor3D launch over the whole box.
template< typename Permutation,
          typename Device >
struct ParallelExecutor< Permutation, Device, IndexTag< 3 > >
{
   template< typename Begins,
             typename Ends,
             typename Func >
   void operator()( const Begins& begins, const Ends& ends, Func f )
   {
      static_assert( Begins::getDimension() == Ends::getDimension(),
                     "wrong begins or ends" );

      using Index = typename Ends::IndexType;

      // nvcc does not like nested __cuda_callable__ and normal lambdas...
//      auto kernel = [=] __cuda_callable__ ( Index i2, Index i1, Index i0 )
//      {
//         call_with_unpermuted_arguments< Permutation >( f, i0, i1, i2 );
//      };
      Kernel< Device > kernel;

      const Index begin0 = begins.template getSize< get< 0 >( Permutation{} ) >();
      const Index begin1 = begins.template getSize< get< 1 >( Permutation{} ) >();
      const Index begin2 = begins.template getSize< get< 2 >( Permutation{} ) >();
      const Index end0 = ends.template getSize< get< 0 >( Permutation{} ) >();
      const Index end1 = ends.template getSize< get< 1 >( Permutation{} ) >();
      const Index end2 = ends.template getSize< get< 2 >( Permutation{} ) >();
      ParallelFor3D< Device >::exec( begin2, begin1, begin0, end2, end1, end0, kernel, f );
   }

   // Member templates cannot be explicitly specialized in class scope, hence
   // the extra dummy parameter enabling partial specialization below.
   template< typename __Device, typename = void >
   struct Kernel
   {
      template< typename Index, typename Func >
      void operator()( Index i2, Index i1, Index i0, Func f )
      {
         call_with_unpermuted_arguments< Permutation >( f, i0, i1, i2 );
      };
   };

   // dummy specialization to avoid a shitpile of nvcc warnings
   template< typename __unused >
   struct Kernel< Devices::Cuda, __unused >
   {
      template< typename Index, typename Func >
      __cuda_callable__
      void operator()( Index i2, Index i1, Index i0, Func f )
      {
         call_with_unpermuted_arguments< Permutation >( f, i0, i1, i2 );
      };
   };
};

// 2D: a single ParallelFor2D launch.
template< typename Permutation,
          typename Device >
struct ParallelExecutor< Permutation, Device, IndexTag< 2 > >
{
   template< typename Begins,
             typename Ends,
             typename Func >
   void operator()( const Begins& begins, const Ends& ends, Func f )
   {
      static_assert( Begins::getDimension() == Ends::getDimension(),
                     "wrong begins or ends" );

      using Index = typename Ends::IndexType;

      // nvcc does not like nested __cuda_callable__ and normal lambdas...
//      auto kernel = [=] __cuda_callable__ ( Index i1, Index i0 )
//      {
//         call_with_unpermuted_arguments< Permutation >( f, i0, i1 );
//      };
      Kernel< Device > kernel;

      const Index begin0 = begins.template getSize< get< 0 >( Permutation{} ) >();
      const Index begin1 = begins.template getSize< get< 1 >( Permutation{} ) >();
      const Index end0 = ends.template getSize< get< 0 >( Permutation{} ) >();
      const Index end1 = ends.template getSize< get< 1 >( Permutation{} ) >();
      ParallelFor2D< Device >::exec( begin1, begin0, end1, end0, kernel, f );
   }

   template< typename __Device, typename = void >
   struct Kernel
   {
      template< typename Index, typename Func >
      void operator()( Index i1, Index i0, Func f )
      {
         call_with_unpermuted_arguments< Permutation >( f, i0, i1 );
      };
   };

   // dummy specialization to avoid a shitpile of nvcc warnings
   template< typename __unused >
   struct Kernel< Devices::Cuda, __unused >
   {
      template< typename Index, typename Func >
      __cuda_callable__
      void operator()( Index i1, Index i0, Func f )
      {
         call_with_unpermuted_arguments< Permutation >( f, i0, i1 );
      };
   };
};

// 1D: the functor is passed to ParallelFor directly (presumably
// un-permuting a single index is the identity — TODO confirm).
template< typename Permutation,
          typename Device >
struct ParallelExecutor< Permutation, Device, IndexTag< 1 > >
{
   template< typename Begins,
             typename Ends,
             typename Func >
   void operator()( const Begins& begins, const Ends& ends, Func f )
   {
      static_assert( Begins::getDimension() == Ends::getDimension(),
                     "wrong begins or ends" );

      using Index = typename Ends::IndexType;

//      auto kernel = [=] __cuda_callable__ ( Index i )
//      {
//         call_with_unpermuted_arguments< Permutation >( f, i );
//      };

      const Index begin = begins.template getSize< get< 0 >( Permutation{} ) >();
      const Index end = ends.template getSize< get< 0 >( Permutation{} ) >();
//      ParallelFor< Device >::exec( begin, end, kernel );
      ParallelFor< Device >::exec( begin, end, f );
   }
};


// Device may be void which stands for StaticNDArray
template< typename
Permutation, + typename Device > +struct ExecutorDispatcher +{ + template< typename Begins, typename Ends, typename Func > + void operator()( const Begins& begins, const Ends& ends, Func f ) + { + SequentialExecutor< Permutation >()( begins, ends, f ); + } +}; + +template< typename Permutation > +struct ExecutorDispatcher< Permutation, Devices::Host > +{ + template< typename Begins, typename Ends, typename Func > + void operator()( const Begins& begins, const Ends& ends, Func f ) + { + if( Devices::Host::isOMPEnabled() && Devices::Host::getMaxThreadsCount() > 1 ) + ParallelExecutor< Permutation, Devices::Host >()( begins, ends, f ); + else + SequentialExecutor< Permutation >()( begins, ends, f ); + } +}; + +template< typename Permutation > +struct ExecutorDispatcher< Permutation, Devices::Cuda > +{ + template< typename Begins, typename Ends, typename Func > + void operator()( const Begins& begins, const Ends& ends, Func f ) + { + ParallelExecutor< Permutation, Devices::Cuda >()( begins, ends, f ); + } +}; + +} // namespace __ndarray_impl +} // namespace Containers +} // namespace TNL diff --git a/src/TNL/Containers/ndarray/Indexing.h b/src/TNL/Containers/ndarray/Indexing.h new file mode 100644 index 0000000000000000000000000000000000000000..a1b83ae518c10ba0000b0aea7a14b808e04cb40b --- /dev/null +++ b/src/TNL/Containers/ndarray/Indexing.h @@ -0,0 +1,295 @@ +/*************************************************************************** + Indexing.h - description + ------------------- + begin : Dec 24, 2018 + copyright : (C) 2018 by Tomas Oberhuber et al. 
+ email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Jakub Klinkovsky + +#pragma once + +#include + +namespace TNL { +namespace Containers { +namespace __ndarray_impl { + +template< typename OffsetsHolder, + typename Sequence > +struct IndexShiftHelper +{}; + +template< typename OffsetsHolder, + std::size_t... N > +struct IndexShiftHelper< OffsetsHolder, std::index_sequence< N... > > +{ + template< typename Func, + typename... Indices > + __cuda_callable__ + static auto apply( const OffsetsHolder& offsets, Func&& f, Indices&&... indices ) -> decltype(auto) + { + return f( ( std::forward< Indices >( indices ) + offsets.template getSize< N >() )... ); + } + + template< typename Func, + typename... Indices > + static auto apply_host( const OffsetsHolder& offsets, Func&& f, Indices&&... indices ) -> decltype(auto) + { + return f( ( std::forward< Indices >( indices ) + offsets.template getSize< N >() )... ); + } +}; + +template< typename OffsetsHolder, + typename Func, + typename... Indices > +__cuda_callable__ +auto call_with_shifted_indices( const OffsetsHolder& offsets, Func&& f, Indices&&... indices ) -> decltype(auto) +{ + return IndexShiftHelper< OffsetsHolder, std::make_index_sequence< sizeof...( Indices ) > > + ::apply( offsets, std::forward< Func >( f ), std::forward< Indices >( indices )... ); +} + +template< typename OffsetsHolder, + typename Func, + typename... Indices > +auto host_call_with_unshifted_indices( const OffsetsHolder& offsets, Func&& f, Indices&&... indices ) -> decltype(auto) +{ + return IndexShiftHelper< OffsetsHolder, std::make_index_sequence< sizeof...( Indices ) > > + ::apply_host( offsets, std::forward< Func >( f ), std::forward< Indices >( indices )... 
); +} + + +template< typename SizesHolder, + typename Overlaps, + typename Sequence > +struct IndexUnshiftHelper +{}; + +template< typename SizesHolder, + typename Overlaps, + std::size_t... N > +struct IndexUnshiftHelper< SizesHolder, Overlaps, std::index_sequence< N... > > +{ + template< typename Func, + typename... Indices > + __cuda_callable__ + static auto apply( const SizesHolder& begins, Func&& f, Indices&&... indices ) -> decltype(auto) + { + return f( ( get( Overlaps{} ) + std::forward< Indices >( indices ) - begins.template getSize< N >() )... ); + } + + template< typename Func, + typename... Indices > + static auto apply_host( const SizesHolder& begins, Func&& f, Indices&&... indices ) -> decltype(auto) + { + return f( ( get( Overlaps{} ) + std::forward< Indices >( indices ) - begins.template getSize< N >() )... ); + } +}; + +template< typename SizesHolder, + typename Overlaps = make_constant_index_sequence< SizesHolder::getDimension(), 0 >, + typename Func, + typename... Indices > +__cuda_callable__ +auto call_with_unshifted_indices( const SizesHolder& begins, Func&& f, Indices&&... indices ) -> decltype(auto) +{ + return IndexUnshiftHelper< SizesHolder, Overlaps, std::make_index_sequence< sizeof...( Indices ) > > + ::apply( begins, std::forward< Func >( f ), std::forward< Indices >( indices )... ); +} + +template< typename SizesHolder, + typename Overlaps = make_constant_index_sequence< SizesHolder::getDimension(), 0 >, + typename Func, + typename... Indices > +auto host_call_with_unshifted_indices( const SizesHolder& begins, Func&& f, Indices&&... indices ) -> decltype(auto) +{ + return IndexUnshiftHelper< SizesHolder, Overlaps, std::make_index_sequence< sizeof...( Indices ) > > + ::apply_host( begins, std::forward< Func >( f ), std::forward< Indices >( indices )... 
// Recursive computation of the flat storage index from permuted multi-indices.
// The recursion runs over the permutation levels from the last to the first;
// the boolean parameter selects the sliced/unsliced formula per level.
template< typename Permutation,
          typename Overlaps,
          typename Alignment,
          typename SliceInfo,
          std::size_t level = Permutation::size() - 1,
          bool _sliced_level = ( SliceInfo::getSliceSize( get< level >( Permutation{} ) ) > 0 ) >
struct SlicedIndexer
{};

// Unsliced level: plain row-major-style accumulation using the aligned size
// of this dimension as the multiplier for the outer levels.
template< typename Permutation,
          typename Overlaps,
          typename Alignment,
          typename SliceInfo,
          std::size_t level >
struct SlicedIndexer< Permutation, Overlaps, Alignment, SliceInfo, level, false >
{
   template< typename SizesHolder, typename StridesHolder, typename... Indices >
   __cuda_callable__
   static typename SizesHolder::IndexType
   getIndex( const SizesHolder& sizes,
             const StridesHolder& strides,
             Indices&&... indices )
   {
      // idx is the array dimension mapped to this permutation level
      static constexpr std::size_t idx = get< level >( Permutation{} );
      static constexpr std::size_t overlap = __ndarray_impl::get< idx >( Overlaps{} );
      const auto alpha = get_from_pack< idx >( std::forward< Indices >( indices )... );
      const auto previous = SlicedIndexer< Permutation, Overlaps, Alignment, SliceInfo, level - 1 >::getIndex( sizes, strides, std::forward< Indices >( indices )... );
      return strides.template getStride< idx >( alpha ) * ( alpha + overlap + Alignment::template getAlignedSize< idx >( sizes ) * previous );
   }
};

// Sliced level: the index is decomposed into a slice number and an offset
// within the slice of size S.
template< typename Permutation,
          typename Overlaps,
          typename Alignment,
          typename SliceInfo,
          std::size_t level >
struct SlicedIndexer< Permutation, Overlaps, Alignment, SliceInfo, level, true >
{
   template< typename SizesHolder, typename StridesHolder, typename... Indices >
   __cuda_callable__
   static typename SizesHolder::IndexType
   getIndex( const SizesHolder& sizes,
             const StridesHolder& strides,
             Indices&&... indices )
   {
      static_assert( SizesHolder::template getStaticSize< get< level >( Permutation{} ) >() == 0,
                     "Invalid SliceInfo: static dimension cannot be sliced." );

      static constexpr std::size_t idx = get< level >( Permutation{} );
      static constexpr std::size_t overlap = __ndarray_impl::get< idx >( Overlaps{} );
      const auto alpha = get_from_pack< idx >( std::forward< Indices >( indices )... );
      static constexpr std::size_t S = SliceInfo::getSliceSize( idx );
      // TODO: check the calculation with strides
      return strides.template getStride< idx >( alpha ) *
               ( S * ((alpha + overlap) / S) * StorageSizeGetter< SizesHolder, Alignment, Overlaps, IndexTag< level - 1 > >::getPermuted( sizes, Permutation{} )
                 + (alpha + overlap) % S )
             + S * SlicedIndexer< Permutation, Overlaps, Alignment, SliceInfo, level - 1 >::getIndex( sizes, strides, std::forward< Indices >( indices )... );
   }
};

// Recursion terminus (level 0, unsliced).
template< typename Permutation,
          typename Overlaps,
          typename Alignment,
          typename SliceInfo >
struct SlicedIndexer< Permutation, Overlaps, Alignment, SliceInfo, 0, false >
{
   template< typename SizesHolder, typename StridesHolder, typename... Indices >
   __cuda_callable__
   static typename SizesHolder::IndexType
   getIndex( const SizesHolder& sizes,
             const StridesHolder& strides,
             Indices&&... indices )
   {
      static constexpr std::size_t idx = get< 0 >( Permutation{} );
      static constexpr std::size_t overlap = __ndarray_impl::get< idx >( Overlaps{} );
      const auto alpha = get_from_pack< idx >( std::forward< Indices >( indices )... );
      return strides.template getStride< idx >( alpha ) * (alpha + overlap);
   }
};

// Recursion terminus (level 0, sliced).
// NOTE(review): the body is identical to the unsliced level-0 case, i.e.
// slicing of the innermost level collapses to the same expression -- confirm
// against the SlicedNDArray unit tests.
template< typename Permutation,
          typename Overlaps,
          typename Alignment,
          typename SliceInfo >
struct SlicedIndexer< Permutation, Overlaps, Alignment, SliceInfo, 0, true >
{
   template< typename SizesHolder, typename StridesHolder, typename... Indices >
   __cuda_callable__
   static typename SizesHolder::IndexType
   getIndex( const SizesHolder& sizes,
             const StridesHolder& strides,
             Indices&&... indices )
   {
      static constexpr std::size_t idx = get< 0 >( Permutation{} );
      static constexpr std::size_t overlap = __ndarray_impl::get< idx >( Overlaps{} );
      const auto alpha = get_from_pack< idx >( std::forward< Indices >( indices )... );
      return strides.template getStride< idx >( alpha ) * (alpha + overlap);
   }
};


// Base class for unsliced ND arrays.
// SliceInfo should be always empty (i.e. sliceSize == 0)
template< typename SliceInfo >
struct NDArrayBase
{
   template< typename Permutation >
   struct Alignment
   {
      template< std::size_t dimension, typename SizesHolder >
      __cuda_callable__
      static typename SizesHolder::IndexType
      getAlignedSize( const SizesHolder& sizes )
      {
         const auto size = sizes.template getSize< dimension >();
         // round up the last dynamic dimension to improve performance
         // TODO: aligning is good for GPU, but bad for CPU
//         static constexpr decltype(size) mult = 32;
//         if( dimension == get< Permutation::size() - 1 >( Permutation{} )
//             && SizesHolder::template getStaticSize< dimension >() == 0 )
//            return mult * ( size / mult + ( size % mult != 0 ) );
         return size;
      }
   };

   // Maps the given multi-index to the flat index in the underlying storage.
   template< typename Permutation, typename Overlaps, typename SizesHolder, typename StridesHolder, typename... Indices >
   __cuda_callable__
   typename SizesHolder::IndexType
   static getStorageIndex( const SizesHolder& sizes, const StridesHolder& strides, Indices&&... indices )
   {
      static_assert( check_slice_size( SizesHolder::getDimension(), 0 ), "BUG - invalid SliceInfo type passed to NDArrayBase" );
      using Alignment = Alignment< Permutation >;
      return SlicedIndexer< Permutation, Overlaps, Alignment, SliceInfo >::getIndex( sizes, strides, std::forward< Indices >( indices )... );
   }

private:
   // verifies that SliceInfo reports the given slice size for all dimensions
   static constexpr bool check_slice_size( std::size_t dim, std::size_t sliceSize )
   {
      for( std::size_t i = 0; i < dim; i++ )
         if( SliceInfo::getSliceSize( i ) != sliceSize )
            return false;
      return true;
   }
};


// Base class for sliced ND arrays: sliced dimensions are padded to a multiple
// of their slice size.
template< typename SliceInfo >
struct SlicedNDArrayBase
{
   template< typename Permutation >
   struct Alignment
   {
      template< std::size_t dimension, typename SizesHolder >
      __cuda_callable__
      static typename SizesHolder::IndexType
      getAlignedSize( const SizesHolder& sizes )
      {
         const auto size = sizes.template getSize< dimension >();
         if( SliceInfo::getSliceSize(dimension) > 0 )
            // round to multiple of SliceSize
            return SliceInfo::getSliceSize(dimension) * (
                     size / SliceInfo::getSliceSize(dimension) +
                     ( size % SliceInfo::getSliceSize(dimension) != 0 )
                   );
         // unmodified
         return size;
      }
   };

   // Maps the given multi-index to the flat index in the underlying storage.
   template< typename Permutation, typename Overlaps, typename SizesHolder, typename StridesHolder, typename... Indices >
   __cuda_callable__
   static typename SizesHolder::IndexType
   getStorageIndex( const SizesHolder& sizes, const StridesHolder& strides, Indices&&... indices )
   {
      using Alignment = Alignment< Permutation >;
      return SlicedIndexer< Permutation, Overlaps, Alignment, SliceInfo >::getIndex( sizes, strides, std::forward< Indices >( indices )... );
   }
};

} // namespace __ndarray_impl
} // namespace Containers
} // namespace TNL
/*
 * Access the N-th element of a variadic pack.
 * Reference:
 * http://stackoverflow.com/questions/20162903/template-parameter-packs-access-nth-type-and-nth-element/37836252#37836252
 */
template< std::size_t index, typename T, typename... Ts,
          typename = typename std::enable_if< index == 0 >::type >
constexpr T
get_from_pack( T&& arg, Ts&&... args )
{
   // base case: the head of the pack is the requested element
   return arg;
}

template< std::size_t index, typename T, typename... Ts,
          typename = typename std::enable_if< (index > 0) && index <= sizeof...( Ts ) >::type >
constexpr auto
get_from_pack( T&& arg, Ts&&... args )
{
   // peel off the head and recurse into the tail
   return get_from_pack< index-1 >( std::forward< Ts >( args )... );
}

// complementary specialization for getting a more readable compilation error
// in case calling get with a bad index
template< long long index, typename T, typename... Ts,
          typename = typename std::enable_if< (index < 0) || (index > sizeof...( Ts )) >::type >
constexpr T
get_from_pack( T&& arg, Ts&&... args )
{
   static_assert( index >= 0 && index <= sizeof...( Ts ),
                  "invalid index passed to the get function" );
   return arg;
}


// Access the N-th element of an std::integer_sequence.
template< std::size_t N, typename Index, Index... vals >
constexpr Index
get( std::integer_sequence< Index, vals... > )
{
   return get_from_pack< N >( vals... );
}


// Membership test on a variadic pack.
template< typename Index, typename T >
constexpr bool
is_in_pack( Index value, T&& pack_value )
{
   return value == pack_value;
}

template< typename Index, typename T, typename... Ts >
constexpr bool
is_in_pack( Index value, T&& pack_value, Ts&&... vals )
{
   // short-circuits on the first match
   return value == pack_value || is_in_pack( value, std::forward< Ts >( vals )... );
}


// Membership test on an std::integer_sequence.
template< typename Index, Index... vals >
constexpr bool
is_in_sequence( Index value, std::integer_sequence< Index, vals... > )
{
   return is_in_pack( value, vals... );
}


// Position of the first occurrence of value in a variadic pack
// (returns the pack size when the value is absent).
template< typename V >
constexpr std::size_t
index_in_pack( V&& value )
{
   return 0;
}

template< typename V, typename T, typename... Ts >
constexpr std::size_t
index_in_pack( V&& value, T&& arg, Ts&&... args )
{
   return value == arg ? 0 : 1 + index_in_pack( value, std::forward< Ts >( args )... );
}


// Position of the first occurrence of value in an std::integer_sequence.
template< typename V, typename Index, Index... vals >
constexpr std::size_t
index_in_sequence( V&& value, std::integer_sequence< Index, vals... > )
{
   return index_in_pack( std::forward< V >( value ), vals... );
}


/*
 * Concatenation of std::integer_sequence instances; useful mainly for getting
 * the type of the resulting sequence with `decltype`.
 */
// single (possibly empty) sequence
template< typename Index, Index... s >
constexpr auto
concat_sequences( std::integer_sequence< Index, s... > )
{
   return std::integer_sequence< Index, s... >{};
}

// two sequences, each possibly empty
template< typename Index, Index... s, Index... t >
constexpr auto
concat_sequences( std::integer_sequence< Index, s... >, std::integer_sequence< Index, t... > )
{
   return std::integer_sequence< Index, s..., t... >{};
}
// concatenate more than 2 sequences
template< typename Index, Index... s, Index... t, typename... R >
constexpr auto
concat_sequences( std::integer_sequence< Index, s... >, std::integer_sequence< Index, t... >, R... )
{
   return concat_sequences( std::integer_sequence< Index, s..., t... >{}, R{}... );
}


// Integer wrapper necessary for C++ templates specializations.
// As the C++ standard says:
//    A partially specialized non-type argument expression shall not involve
//    a template parameter of the partial specialization except when the argument
//    expression is a simple identifier.
template< std::size_t v >
struct IndexTag
{
   static constexpr std::size_t value = v;
};


// Expands to f( args[Permutation[0]], args[Permutation[1]], ... ).
template< typename Permutation,
          typename Sequence >
struct CallPermutationHelper
{};

template< typename Permutation,
          std::size_t... N >
struct CallPermutationHelper< Permutation, std::index_sequence< N... > >
{
   template< typename Func,
             typename... Args >
   __cuda_callable__
   static auto apply( Func&& f, Args&&... args ) -> decltype(auto)
   {
      return std::forward< Func >( f )( get_from_pack<
                     get< N >( Permutation{} )
                  >( std::forward< Args >( args )... )... );
   }
};

// Call specified function with permuted arguments.
// [used in ndarray_operations.h]
template< typename Permutation,
          typename Func,
          typename... Args >
__cuda_callable__
// FIXME: does not compile with nvcc 10.0
//auto call_with_permuted_arguments( Func&& f, Args&&... args ) -> decltype(auto)
//{
//   return CallPermutationHelper< Permutation, std::make_index_sequence< sizeof...( Args ) > >
//          ::apply( std::forward< Func >( f ), std::forward< Args >( args )... );
//}
auto call_with_permuted_arguments( Func f, Args&&... args ) -> decltype(auto)
{
   // note: f is taken by value because of the nvcc bug above
   return CallPermutationHelper< Permutation, std::make_index_sequence< sizeof...( Args ) > >
          ::apply( f, std::forward< Args >( args )... );
}


// Expands to f applied to the arguments reordered by the *inverse* of
// Permutation (the N-th parameter is looked up by its position in Permutation).
template< typename Permutation,
          typename Sequence >
struct CallInversePermutationHelper
{};

template< typename Permutation,
          std::size_t... N >
struct CallInversePermutationHelper< Permutation, std::index_sequence< N... > >
{
   template< typename Func,
             typename... Args >
   __cuda_callable__
   static auto apply( Func&& f, Args&&... args ) -> decltype(auto)
   {
      return std::forward< Func >( f )( get_from_pack<
                     index_in_sequence( N, Permutation{} )
                  >( std::forward< Args >( args )... )... );
   }
};

// Call specified function with unpermuted arguments.
// [used in ndarray_operations.h]
template< typename Permutation,
          typename Func,
          typename... Args >
__cuda_callable__
// FIXME: does not compile with nvcc 10.0
//auto call_with_unpermuted_arguments( Func&& f, Args&&... args ) -> decltype(auto)
//{
//   return CallInversePermutationHelper< Permutation, std::make_index_sequence< sizeof...( Args ) > >
//          ::apply( std::forward< Func >( f ), std::forward< Args >( args )... );
//}
auto call_with_unpermuted_arguments( Func f, Args&&... args ) -> decltype(auto)
{
   // note: f is taken by value because of the nvcc bug above
   return CallInversePermutationHelper< Permutation, std::make_index_sequence< sizeof...( Args ) > >
          ::apply( f, std::forward< Args >( args )... );
}


// Check that all elements of the initializer list are equal to the specified value.
// [used in ndarray_operations.h]
constexpr bool
all_elements_equal_to_value( std::size_t value, std::initializer_list< std::size_t > list )
{
   for( auto elem : list )
      if( elem != value )
         return false;
   return true;
}


// Check that all elements of the initializer list are in the specified range [begin, end).
// [used in ndarray.h -- static assertions on permutations]
constexpr bool
all_elements_in_range( std::size_t begin, std::size_t end, std::initializer_list< std::size_t > list )
{
   for( auto elem : list )
      if( elem < begin || elem >= end )
         return false;
   return true;
}
// Check that the elements of the initializer list form a strictly increasing
// sequence. An empty or single-element list is trivially increasing.
// [used in ndarray.h -- static assertion in getSubarrayView()]
constexpr bool
is_increasing_sequence( std::initializer_list< std::size_t > list )
{
   // bug fix: the previous version dereferenced list.begin() unconditionally,
   // which is undefined behavior for an empty list
   if( list.size() == 0 )
      return true;
   const std::size_t* it = list.begin();
   std::size_t prev = *it;
   for( ++it; it != list.end(); ++it ) {
      if( *it <= prev )
         return false;
      prev = *it;
   }
   return true;
}


// Count elements of a variadic pack smaller than a specified value.
// [used in ndarray_subarray.h to generate a subpermutation]
template< typename T, typename V >
constexpr std::size_t
count_smaller( T threshold, V&& value )
{
   return value < threshold ? 1 : 0;
}

template< typename T, typename V, typename... Values >
constexpr std::size_t
count_smaller( T threshold, V&& value, Values&&... vals )
{
   // count the head, then recurse on the tail
   return ( value < threshold ? 1 : 0 ) + count_smaller( threshold, vals... );
}
// C++14 version of sequence filtering, with hard-coded membership predicate:
// keeps val if and only if it is contained in Mask.
template< typename Mask, typename Index, Index val >
constexpr typename std::conditional_t< is_in_sequence( val, Mask{} ),
                                       std::integer_sequence< Index, val >,
                                       std::integer_sequence< Index > >
FilterSingle( std::integer_sequence< Index, val > )
{
   return {};
}

/*
 * Generic function returning a subsequence of a sequence obtained by omitting
 * the elements not contained in the specified mask.
 */
// empty sequence case
template< typename Mask, typename Index >
constexpr auto
filter_sequence( std::integer_sequence< Index > )
{
   return std::integer_sequence< Index >{};
}

// non empty sequence case
template< typename Mask, typename Index, Index... vals >
constexpr auto
filter_sequence( std::integer_sequence< Index, vals... > )
{
   return concat_sequences( FilterSingle< Mask >( std::integer_sequence< Index, vals >{} )... );
}


/*
 * make_constant_integer_sequence, make_constant_index_sequence - helper
 * templates for the generation of constant sequences like
 * std::make_integer_sequence, std::make_index_sequence
 *
 * FIX: the template arguments below were lost in transit (the text between
 * angle brackets was stripped); they are reconstructed from the halving
 * recursion pattern -- confirm against the upstream header.
 */
template< typename T, typename N, T v > struct gen_const_seq;
template< typename T, typename N, T v > using gen_const_seq_t = typename gen_const_seq< T, N, v >::type;

// general case: split the requested length in two halves and concatenate
template< typename T, typename N, T v >
struct gen_const_seq
{
   using type = decltype(concat_sequences(
            gen_const_seq_t< T, std::integral_constant< T, N::value / 2 >, v >{},
            gen_const_seq_t< T, std::integral_constant< T, N::value - N::value / 2 >, v >{}
         ));
};

// terminus: length 0
template< typename T, T v >
struct gen_const_seq< T, std::integral_constant< T, 0 >, v >
{
   using type = std::integer_sequence< T >;
};

// terminus: length 1
template< typename T, T v >
struct gen_const_seq< T, std::integral_constant< T, 1 >, v >
{
   using type = std::integer_sequence< T, v >;
};

template< typename T, T N, T value >
using make_constant_integer_sequence = gen_const_seq_t< T, std::integral_constant< T, N >, value >;

template< std::size_t N, std::size_t value >
using make_constant_index_sequence = gen_const_seq_t< std::size_t, std::integral_constant< std::size_t, N >, value >;
#ifndef __NVCC__
// Evaluates output( i... ) = f( input( i... )... ) over the whole index domain
// of `output`. All views and f are captured by value in the wrapper lambda.
template< typename Output,
          typename Func,
          typename... Input >
void nd_map_view( Output output, Func f, const Input... input )
{
   static_assert( all_elements_equal_to_value( Output::getDimension(), {Input::getDimension()...} ),
                  "all arrays must be of the same dimension" );

   // without mutable, the operator() would be const so output would be const as well
   // https://stackoverflow.com/a/2835645/4180822
   auto wrapper = [=] __cuda_callable__ ( auto... indices ) mutable {
      static_assert( sizeof...( indices ) == Output::getDimension(),
                     "wrong number of indices passed to the wrapper lambda function" );
      output( indices... ) = f( input( indices... )... );
   };

   ExecutorDispatcher< typename Output::PermutationType, typename Output::DeviceType > dispatch;
   using Begins = ConstStaticSizesHolder< typename Output::IndexType, output.getDimension(), 0 >;
   dispatch( Begins{}, output.getSizes(), wrapper );
}

#else

   // nvcc-only path: the variadic generic lambda above is replaced with
   // explicit functor structs, one per supported number of input arrays (0-3).

   // wrapper for nullary f: output( i... ) = f()
   template< typename Output,
             typename Func >
   struct nvcc_map_helper_0
   {
      Output output;
      Func f;

      nvcc_map_helper_0( Output o, Func f ) : output(o), f(f) {}

      template< typename... Ts >
      __cuda_callable__
      void operator()( Ts... indices )
      {
         static_assert( sizeof...( indices ) == Output::getDimension(),
                        "wrong number of indices passed to the wrapper operator() function" );
         output( indices... ) = f();
      }
   };

   // wrapper for unary f: output( i... ) = f( input1( i... ) )
   template< typename Output,
             typename Func,
             typename Input1 >
   struct nvcc_map_helper_1
   {
      Output output;
      Func f;
      Input1 input1;

      nvcc_map_helper_1( Output o, Func f, Input1 i1 ) : output(o), f(f), input1(i1) {}

      template< typename... Ts >
      __cuda_callable__
      void operator()( Ts... indices )
      {
         static_assert( sizeof...( indices ) == Output::getDimension(),
                        "wrong number of indices passed to the wrapper operator() function" );
         output( indices... ) = f( input1( indices... ) );
      }
   };

   // wrapper for binary f
   template< typename Output,
             typename Func,
             typename Input1,
             typename Input2 >
   struct nvcc_map_helper_2
   {
      Output output;
      Func f;
      Input1 input1;
      Input2 input2;

      nvcc_map_helper_2( Output o, Func f, Input1 i1, Input2 i2 ) : output(o), f(f), input1(i1), input2(i2) {}

      template< typename... Ts >
      __cuda_callable__
      void operator()( Ts... indices )
      {
         static_assert( sizeof...( indices ) == Output::getDimension(),
                        "wrong number of indices passed to the wrapper operator() function" );
         output( indices... ) = f( input1( indices... ), input2( indices... ) );
      }
   };

   // wrapper for ternary f
   template< typename Output,
             typename Func,
             typename Input1,
             typename Input2,
             typename Input3 >
   struct nvcc_map_helper_3
   {
      Output output;
      Func f;
      Input1 input1;
      Input2 input2;
      Input3 input3;

      nvcc_map_helper_3( Output o, Func f, Input1 i1, Input2 i2, Input3 i3 ) : output(o), f(f), input1(i1), input2(i2), input3(i3) {}

      template< typename... Ts >
      __cuda_callable__
      void operator()( Ts... indices )
      {
         static_assert( sizeof...( indices ) == Output::getDimension(),
                        "wrong number of indices passed to the wrapper operator() function" );
         output( indices... ) = f( input1( indices... ), input2( indices... ), input3( indices... ) );
      }
   };

// nvcc variants of nd_map_view, one overload per number of input arrays
template< typename Output,
          typename Func >
void nd_map_view( Output output, Func f )
{
   nvcc_map_helper_0< Output, Func > wrapper( output, f );
   ExecutorDispatcher< typename Output::PermutationType, typename Output::DeviceType > dispatch;
   using Begins = ConstStaticSizesHolder< typename Output::IndexType, output.getDimension(), 0 >;
   dispatch( Begins{}, output.getSizes(), wrapper );
}

template< typename Output,
          typename Func,
          typename Input1 >
void nd_map_view( Output output, Func f, const Input1 input1 )
{
   static_assert( all_elements_equal_to_value( Output::getDimension(), {Input1::getDimension()} ),
                  "all arrays must be of the same dimension" );

   nvcc_map_helper_1< Output, Func, Input1 > wrapper( output, f, input1 );
   ExecutorDispatcher< typename Output::PermutationType, typename Output::DeviceType > dispatch;
   using Begins = ConstStaticSizesHolder< typename Output::IndexType, output.getDimension(), 0 >;
   dispatch( Begins{}, output.getSizes(), wrapper );
}

template< typename Output,
          typename Func,
          typename Input1,
          typename Input2 >
void nd_map_view( Output output, Func f, const Input1 input1, const Input2 input2 )
{
   static_assert( all_elements_equal_to_value( Output::getDimension(), {Input1::getDimension(), Input2::getDimension()} ),
                  "all arrays must be of the same dimension" );

   nvcc_map_helper_2< Output, Func, Input1, Input2 > wrapper( output, f, input1, input2 );
   ExecutorDispatcher< typename Output::PermutationType, typename Output::DeviceType > dispatch;
   using Begins = ConstStaticSizesHolder< typename Output::IndexType, output.getDimension(), 0 >;
   dispatch( Begins{}, output.getSizes(), wrapper );
}

template< typename Output,
          typename Func,
          typename Input1,
          typename Input2,
          typename Input3 >
void nd_map_view( Output output, Func f, const Input1 input1, const Input2 input2, const Input3 input3 )
{
   static_assert( all_elements_equal_to_value( Output::getDimension(), {Input1::getDimension(), Input2::getDimension(), Input3::getDimension()} ),
                  "all arrays must be of the same dimension" );

   nvcc_map_helper_3< Output, Func, Input1, Input2, Input3 > wrapper( output, f, input1, input2, input3 );
   ExecutorDispatcher< typename Output::PermutationType, typename Output::DeviceType > dispatch;
   using Begins = ConstStaticSizesHolder< typename Output::IndexType, output.getDimension(), 0 >;
   dispatch( Begins{}, output.getSizes(), wrapper );
}

#endif

} // namespace __ndarray_impl


// f must be an N-ary function, where N is the dimension of the output and input arrays:
//    output( i1, ..., iN ) = f( input1( i1, ..., iN ), ... inputM( i1, ..., iN ) )
template< typename Output,
          typename Func,
          typename... Input >
void nd_map( Output& output, Func f, const Input&... input )
{
   __ndarray_impl::nd_map_view( output.getView(), f, input.getConstView()... );
}
); +} + +template< typename Output, + typename Input > +void nd_assign( Output& output, const Input& input ) +{ +#ifndef __NVCC__ + nd_map( output, [] __cuda_callable__ ( auto v ){ return v; }, input ); +#else + using value_type = typename Input::ValueType; + nd_map( output, [] __cuda_callable__ ( value_type v ){ return v; }, input ); +#endif +} + +// Some mathematical functions, inspired by NumPy: +// https://docs.scipy.org/doc/numpy/reference/ufuncs.html#math-operations + +template< typename Output, + typename Input1, + typename Input2 > +void nd_add( Output& output, const Input1& input1, const Input2& input2 ) +{ +#ifndef __NVCC__ + nd_map( output, [] __cuda_callable__ ( auto v1, auto v2 ){ return v1 + v2; }, input1, input2 ); +#else + using value_type_1 = typename Input1::ValueType; + using value_type_2 = typename Input2::ValueType; + nd_map( output, [] __cuda_callable__ ( value_type_1 v1, value_type_2 v2 ){ return v1 + v2; }, input1, input2 ); +#endif +} + +template< typename Output, + typename Input1, + typename Input2 > +void nd_subtract( Output& output, const Input1& input1, const Input2& input2 ) +{ +#ifndef __NVCC__ + nd_map( output, [] __cuda_callable__ ( auto v1, auto v2 ){ return v1 - v2; }, input1, input2 ); +#else + using value_type_1 = typename Input1::ValueType; + using value_type_2 = typename Input2::ValueType; + nd_map( output, [] __cuda_callable__ ( value_type_1 v1, value_type_2 v2 ){ return v1 - v2; }, input1, input2 ); +#endif +} + +template< typename Output, + typename Input1, + typename Input2 > +void nd_multiply( Output& output, const Input1& input1, const Input2& input2 ) +{ +#ifndef __NVCC__ + nd_map( output, [] __cuda_callable__ ( auto v1, auto v2 ){ return v1 * v2; }, input1, input2 ); +#else + using value_type_1 = typename Input1::ValueType; + using value_type_2 = typename Input2::ValueType; + nd_map( output, [] __cuda_callable__ ( value_type_1 v1, value_type_2 v2 ){ return v1 * v2; }, input1, input2 ); +#endif +} + +template< typename 
Output, + typename Input1, + typename Input2 > +void nd_divide( Output& output, const Input1& input1, const Input2& input2 ) +{ +#ifndef __NVCC__ + nd_map( output, [] __cuda_callable__ ( auto v1, auto v2 ){ return v1 / v2; }, input1, input2 ); +#else + using value_type_1 = typename Input1::ValueType; + using value_type_2 = typename Input2::ValueType; + nd_map( output, [] __cuda_callable__ ( value_type_1 v1, value_type_2 v2 ){ return v1 / v2; }, input1, input2 ); +#endif +} + +template< typename Output, + typename Input1, + typename Input2 > +void nd_maximum( Output& output, const Input1& input1, const Input2& input2 ) +{ +#ifndef __NVCC__ + nd_map( output, [] __cuda_callable__ ( auto v1, auto v2 ){ return TNL::max( v1, v2 ); }, input1, input2 ); +#else + using value_type_1 = typename Input1::ValueType; + using value_type_2 = typename Input2::ValueType; + nd_map( output, [] __cuda_callable__ ( value_type_1 v1, value_type_2 v2 ){ return TNL::max( v1, v2 ); }, input1, input2 ); +#endif +} + +template< typename Output, + typename Input1, + typename Input2 > +void nd_minimum( Output& output, const Input1& input1, const Input2& input2 ) +{ +#ifndef __NVCC__ + nd_map( output, [] __cuda_callable__ ( auto v1, auto v2 ){ return TNL::min( v1, v2 ); }, input1, input2 ); +#else + using value_type_1 = typename Input1::ValueType; + using value_type_2 = typename Input2::ValueType; + nd_map( output, [] __cuda_callable__ ( value_type_1 v1, value_type_2 v2 ){ return TNL::min( v1, v2 ); }, input1, input2 ); +#endif +} + +template< typename Output, + typename Input > +void nd_absolute( Output& output, const Input& input ) +{ +#ifndef __NVCC__ + nd_map( output, [] __cuda_callable__ ( auto v ){ return TNL::abs( v ); }, input ); +#else + using value_type = typename Input::ValueType; + nd_map( output, [] __cuda_callable__ ( value_type v ){ return TNL::abs( v ); }, input ); +#endif +} + +template< typename Output, + typename Input > +void nd_sign( Output& output, const Input& input ) +{ 
+#ifndef __NVCC__ + nd_map( output, [] __cuda_callable__ ( auto v ){ return TNL::sign( v ); }, input ); +#else + using value_type = typename Input::ValueType; + nd_map( output, [] __cuda_callable__ ( value_type v ){ return TNL::sign( v ); }, input ); +#endif +} + +template< typename Output, + typename Input1, + typename Input2 > +void nd_pow( Output& output, const Input1& input1, const Input2& input2 ) +{ +#ifndef __NVCC__ + nd_map( output, [] __cuda_callable__ ( auto v1, auto v2 ){ return TNL::pow( v1, v2 ); }, input1, input2 ); +#else + using value_type_1 = typename Input1::ValueType; + using value_type_2 = typename Input2::ValueType; + nd_map( output, [] __cuda_callable__ ( value_type_1 v1, value_type_2 v2 ){ return TNL::pow( v1, v2 ); }, input1, input2 ); +#endif +} + +template< typename Output, + typename Input > +void nd_sqrt( Output& output, const Input& input ) +{ +#ifndef __NVCC__ + nd_map( output, [] __cuda_callable__ ( auto v ){ return TNL::sqrt( v ); }, input ); +#else + using value_type = typename Input::ValueType; + nd_map( output, [] __cuda_callable__ ( value_type v ){ return TNL::sqrt( v ); }, input ); +#endif +} + +template< typename Output, + typename Input > +void nd_square( Output& output, const Input& input ) +{ +#ifndef __NVCC__ + nd_map( output, [] __cuda_callable__ ( auto v ){ return v*v; }, input ); +#else + using value_type = typename Input::ValueType; + nd_map( output, [] __cuda_callable__ ( value_type v ){ return v*v; }, input ); +#endif +} + +} // namespace Containers +} // namespace TNL diff --git a/src/TNL/Containers/ndarray/SizesHolder.h b/src/TNL/Containers/ndarray/SizesHolder.h new file mode 100644 index 0000000000000000000000000000000000000000..0b6e1f83d41f07c62306729de5ae0ed65e656f53 --- /dev/null +++ b/src/TNL/Containers/ndarray/SizesHolder.h @@ -0,0 +1,374 @@ +/*************************************************************************** + SizesHolder.h - description + ------------------- + begin : Dec 24, 2018 + copyright : (C) 
2018 by Tomas Oberhuber et al. + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Jakub Klinkovsky + +#pragma once + +#include +#include +#include + +#include + +namespace TNL { +namespace Containers { + +namespace __ndarray_impl { + +template< typename Index, + typename LevelTag, + std::size_t size > +class SizeHolder +{ +public: + __cuda_callable__ + constexpr Index getSize( LevelTag ) const + { + return size; + } + + __cuda_callable__ + void setSize( LevelTag, Index newSize ) + { + TNL_ASSERT_EQ( newSize, 0, "Dynamic size for a static dimension must be 0." ); + } + + __cuda_callable__ + bool operator==( const SizeHolder& ) const + { + return true; + } +}; + +template< typename Index, + typename LevelTag > +class SizeHolder< Index, LevelTag, 0 > +{ +public: + __cuda_callable__ + Index getSize( LevelTag ) const + { + return size; + } + + __cuda_callable__ + void setSize( LevelTag, Index size ) + { + this->size = size; + } + + __cuda_callable__ + bool operator==( const SizeHolder& other ) const + { + return size == other.size; + } + +private: + Index size = 0; +}; + +template< typename Index, + std::size_t currentSize, + std::size_t... otherSizes > +class SizesHolderLayer +: public SizesHolderLayer< Index, otherSizes... >, + public SizeHolder< Index, + IndexTag< sizeof...( otherSizes ) >, // LevelTag + currentSize > +{ + using BaseType = SizesHolderLayer< Index, otherSizes... 
>; + using Layer = SizeHolder< Index, + IndexTag< sizeof...( otherSizes ) >, // LevelTag + currentSize >; +protected: + using BaseType::getSize; + using BaseType::setSize; + using Layer::getSize; + using Layer::setSize; + + __cuda_callable__ + bool operator==( const SizesHolderLayer& other ) const + { + return BaseType::operator==( other ) && + Layer::operator==( other ); + } +}; + +// specializations to terminate the recursive inheritance +template< typename Index, + std::size_t currentSize > +class SizesHolderLayer< Index, currentSize > +: public SizeHolder< Index, + IndexTag< 0 >, // LevelTag + currentSize > +{ + using Layer = SizeHolder< Index, + IndexTag< 0 >, // LevelTag + currentSize >; +protected: + using Layer::getSize; + using Layer::setSize; + + __cuda_callable__ + bool operator==( const SizesHolderLayer& other ) const + { + return Layer::operator==( other ); + } +}; + +template< std::size_t dimension > +struct SizesHolderStaticSizePrinter +{ + template< typename SizesHolder > + static void exec( std::ostream& str, const SizesHolder& holder ) + { + str << holder.template getStaticSize< dimension >() << ", "; + } +}; + +template< std::size_t dimension > +struct SizesHolderSizePrinter +{ + template< typename SizesHolder > + static void exec( std::ostream& str, const SizesHolder& holder ) + { + str << holder.template getSize< dimension >() << ", "; + } +}; + +template< std::size_t level > +struct SizesHolerOperatorPlusHelper +{ + template< typename Result, typename LHS, typename RHS > + static void exec( Result& result, const LHS& lhs, const RHS& rhs ) + { + if( result.template getStaticSize< level >() == 0 ) + result.template setSize< level >( lhs.template getSize< level >() + rhs.template getSize< level >() ); + } +}; + +template< std::size_t level > +struct SizesHolerOperatorMinusHelper +{ + template< typename Result, typename LHS, typename RHS > + static void exec( Result& result, const LHS& lhs, const RHS& rhs ) + { + if( result.template getStaticSize< 
level >() == 0 ) + result.template setSize< level >( lhs.template getSize< level >() - rhs.template getSize< level >() ); + } +}; + +} // namespace __ndarray_impl + + +// dimensions and static sizes are specified as std::size_t, +// the type of dynamic sizes is configurable with Index + +template< typename Index, + std::size_t... sizes > +class SizesHolder +: public __ndarray_impl::SizesHolderLayer< Index, sizes... > +{ + using BaseType = __ndarray_impl::SizesHolderLayer< Index, sizes... >; + +public: + using IndexType = Index; + + static constexpr std::size_t getDimension() + { + return sizeof...( sizes ); + } + + template< std::size_t dimension > + static constexpr std::size_t getStaticSize() + { + static_assert( dimension < sizeof...(sizes), "Invalid dimension passed to getStaticSize()." ); + return __ndarray_impl::get_from_pack< dimension >( sizes... ); + } + + template< std::size_t level > + __cuda_callable__ + Index getSize() const + { + static_assert( level < sizeof...(sizes), "Invalid level passed to getSize()." ); + return BaseType::getSize( __ndarray_impl::IndexTag< getDimension() - level - 1 >() ); + } + + template< std::size_t level > + __cuda_callable__ + void setSize( Index size ) + { + static_assert( level < sizeof...(sizes), "Invalid level passed to setSize()." ); + BaseType::setSize( __ndarray_impl::IndexTag< getDimension() - level - 1 >(), size ); + } + + // methods for convenience + __cuda_callable__ + bool operator==( const SizesHolder& other ) const + { + return BaseType::operator==( other ); + } + + __cuda_callable__ + bool operator!=( const SizesHolder& other ) const + { + return ! operator==( other ); + } +}; + +template< typename Index, + std::size_t... sizes, + typename OtherHolder > +SizesHolder< Index, sizes... > +operator+( const SizesHolder< Index, sizes... >& lhs, const OtherHolder& rhs ) +{ + SizesHolder< Index, sizes... 
> result; + TemplateStaticFor< std::size_t, 0, sizeof...(sizes), __ndarray_impl::SizesHolerOperatorPlusHelper >::execHost( result, lhs, rhs ); + return result; +} + +template< typename Index, + std::size_t... sizes, + typename OtherHolder > +SizesHolder< Index, sizes... > +operator-( const SizesHolder< Index, sizes... >& lhs, const OtherHolder& rhs ) +{ + SizesHolder< Index, sizes... > result; + TemplateStaticFor< std::size_t, 0, sizeof...(sizes), __ndarray_impl::SizesHolerOperatorMinusHelper >::execHost( result, lhs, rhs ); + return result; +} + + +template< typename Index, + std::size_t dimension, + Index constSize > +class ConstStaticSizesHolder +{ +public: + using IndexType = Index; + + static constexpr std::size_t getDimension() + { + return dimension; + } + + template< std::size_t level > + static constexpr std::size_t getStaticSize() + { + static_assert( level < getDimension(), "Invalid level passed to getStaticSize()." ); + return constSize; + } + + template< std::size_t level > + __cuda_callable__ + Index getSize() const + { + static_assert( level < getDimension(), "Invalid dimension passed to getSize()." ); + return constSize; + } + + // methods for convenience + __cuda_callable__ + bool operator==( const ConstStaticSizesHolder& other ) const + { + return true; + } + + __cuda_callable__ + bool operator!=( const ConstStaticSizesHolder& other ) const + { + return false; + } +}; + + +template< typename Index, + std::size_t... sizes > +std::ostream& operator<<( std::ostream& str, const SizesHolder< Index, sizes... 
>& holder ) +{ + str << "SizesHolder< "; + TemplateStaticFor< std::size_t, 0, sizeof...(sizes) - 1, __ndarray_impl::SizesHolderStaticSizePrinter >::execHost( str, holder ); + str << holder.template getStaticSize< sizeof...(sizes) - 1 >() << " >( "; + TemplateStaticFor< std::size_t, 0, sizeof...(sizes) - 1, __ndarray_impl::SizesHolderSizePrinter >::execHost( str, holder ); + str << holder.template getSize< sizeof...(sizes) - 1 >() << " )"; + return str; +} + + +namespace __ndarray_impl { + +// helper for the forInternal method +template< typename SizesHolder, + std::size_t ConstValue > +struct SubtractedSizesHolder +{}; + +template< typename Index, + std::size_t ConstValue, + std::size_t... sizes > +struct SubtractedSizesHolder< SizesHolder< Index, sizes... >, ConstValue > +{ +// using type = SizesHolder< Index, std::max( (std::size_t) 0, sizes - ConstValue )... >; + using type = SizesHolder< Index, ( (sizes >= ConstValue) ? sizes - ConstValue : 0 )... >; +}; + + +// wrapper for localBegins in DistributedNDArray (static sizes cannot be distributed, begins are always 0) +template< typename SizesHolder, + // overridable value is useful in the forInternal method + std::size_t ConstValue = 0 > +struct LocalBeginsHolder : public SizesHolder +{ + template< std::size_t dimension > + static constexpr std::size_t getStaticSize() + { + static_assert( dimension < SizesHolder::getDimension(), "Invalid dimension passed to getStaticSize()." 
); + return ConstValue; + } + + template< std::size_t level > + __cuda_callable__ + typename SizesHolder::IndexType getSize() const + { + if( SizesHolder::template getStaticSize< level >() != 0 ) + return ConstValue; + return SizesHolder::template getSize< level >(); + } + + template< std::size_t level > + __cuda_callable__ + void setSize( typename SizesHolder::IndexType newSize ) + { + if( SizesHolder::template getStaticSize< level >() == 0 ) + SizesHolder::template setSize< level >( newSize ); + else + TNL_ASSERT_EQ( newSize, ConstValue, "Dynamic size for a static dimension must be equal to the specified ConstValue." ); + } +}; + +template< typename Index, + std::size_t... sizes, + std::size_t ConstValue > +std::ostream& operator<<( std::ostream& str, const __ndarray_impl::LocalBeginsHolder< SizesHolder< Index, sizes... >, ConstValue >& holder ) +{ + str << "LocalBeginsHolder< SizesHolder< "; + TemplateStaticFor< std::size_t, 0, sizeof...(sizes) - 1, __ndarray_impl::SizesHolderStaticSizePrinter >::execHost( str, (SizesHolder< Index, sizes... >) holder ); + str << holder.template getStaticSize< sizeof...(sizes) - 1 >() << " >, "; + str << ConstValue << " >( "; + TemplateStaticFor< std::size_t, 0, sizeof...(sizes) - 1, __ndarray_impl::SizesHolderSizePrinter >::execHost( str, holder ); + str << holder.template getSize< sizeof...(sizes) - 1 >() << " )"; + return str; +} + +} // namespace __ndarray_impl + +} // namespace Containers +} // namespace TNL diff --git a/src/TNL/Containers/ndarray/SizesHolderHelpers.h b/src/TNL/Containers/ndarray/SizesHolderHelpers.h new file mode 100644 index 0000000000000000000000000000000000000000..2e92ed43dfba5cbe5041313312918464b70fc8e2 --- /dev/null +++ b/src/TNL/Containers/ndarray/SizesHolderHelpers.h @@ -0,0 +1,414 @@ +/*************************************************************************** + SizesHolderHelpers.h - description + ------------------- + begin : Dec 24, 2018 + copyright : (C) 2018 by Tomas Oberhuber et al. 
+ email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Jakub Klinkovsky + +#pragma once + +#include + +#include +#include +#include + +namespace TNL { +namespace Containers { +namespace __ndarray_impl { + +// Dynamic storage size with alignment +template< typename SizesHolder, + typename Alignment, + typename Overlaps, + typename LevelTag = IndexTag< SizesHolder::getDimension() - 1 > > +struct StorageSizeGetter +{ + static typename SizesHolder::IndexType + __cuda_callable__ + get( const SizesHolder& sizes ) + { + static constexpr std::size_t overlap = __ndarray_impl::get< LevelTag::value >( Overlaps{} ); + const auto size = Alignment::template getAlignedSize< LevelTag::value >( sizes ); + return ( size + 2 * overlap ) + * StorageSizeGetter< SizesHolder, Alignment, Overlaps, IndexTag< LevelTag::value - 1 > >::get( sizes ); + } + + template< typename Permutation > + __cuda_callable__ + static typename SizesHolder::IndexType + getPermuted( const SizesHolder& sizes, Permutation ) + { + static constexpr std::size_t idx = __ndarray_impl::get< LevelTag::value >( Permutation{} ); + static constexpr std::size_t overlap = __ndarray_impl::get< idx >( Overlaps{} ); + const auto size = Alignment::template getAlignedSize< idx >( sizes ); + return ( size + 2 * overlap ) + * StorageSizeGetter< SizesHolder, Alignment, Overlaps, IndexTag< LevelTag::value - 1 > >::get( sizes ); + } +}; + +template< typename SizesHolder, typename Alignment, typename Overlaps > +struct StorageSizeGetter< SizesHolder, Alignment, Overlaps, IndexTag< 0 > > +{ + static typename SizesHolder::IndexType + __cuda_callable__ + get( const SizesHolder& sizes ) + { + static constexpr std::size_t overlap = __ndarray_impl::get< 0 >( Overlaps{} ); + return Alignment::template getAlignedSize< 0 >( sizes ) + 2 * overlap; + } + + template< typename Permutation > + __cuda_callable__ + static 
typename SizesHolder::IndexType + getPermuted( const SizesHolder& sizes, Permutation ) + { + static constexpr std::size_t idx = __ndarray_impl::get< 0 >( Permutation{} ); + static constexpr std::size_t overlap = __ndarray_impl::get< idx >( Overlaps{} ); + return Alignment::template getAlignedSize< idx >( sizes ) + 2 * overlap; + } +}; + + +// Static storage size without alignment, used in StaticNDArray +template< typename SizesHolder, + typename LevelTag = IndexTag< SizesHolder::getDimension() - 1 > > +struct StaticStorageSizeGetter +{ + constexpr static std::size_t get() + { + return SizesHolder::template getStaticSize< LevelTag::value >() * + StaticStorageSizeGetter< SizesHolder, IndexTag< LevelTag::value - 1 > >::get(); + } +}; + +template< typename SizesHolder > +struct StaticStorageSizeGetter< SizesHolder, IndexTag< 0 > > +{ + constexpr static std::size_t get() + { + return SizesHolder::template getStaticSize< 0 >(); + } +}; + + +template< std::size_t level = 0, + typename SizesHolder, + typename Index, + typename... IndexTypes > +void setSizesHelper( SizesHolder& holder, + Index&& size, + IndexTypes&&... otherSizes ) +{ + holder.template setSize< level >( std::forward< Index >( size ) ); + setSizesHelper< level + 1 >( holder, std::forward< IndexTypes >( otherSizes )... ); +} + +template< std::size_t level = 0, + typename SizesHolder, + typename Index > +void setSizesHelper( SizesHolder& holder, + Index&& size ) +{ + holder.template setSize< level >( std::forward< Index >( size ) ); +} + + +// A variadic bounds-checker for indices +template< typename SizesHolder > +__cuda_callable__ +void assertIndicesInBounds( const SizesHolder& ) +{} + +template< typename SizesHolder, + typename Index, + typename... IndexTypes > +__cuda_callable__ +void assertIndicesInBounds( const SizesHolder& sizes, Index&& i, IndexTypes&&... 
indices ) +{ +#ifndef NDEBUG + // sizes.template getSize<...>() cannot be inside the assert macro, but the variables + // shouldn't be declared when compiling without assertions + constexpr std::size_t level = SizesHolder::getDimension() - sizeof...(indices) - 1; + const auto size = sizes.template getSize< level >(); + TNL_ASSERT_LT( i, (Index) size, "Input error - some index is out of bounds." ); +#endif + assertIndicesInBounds( sizes, std::forward< IndexTypes >( indices )... ); +} + + +// A variadic bounds-checker for distributed indices with overlaps +template< typename SizesHolder1, typename SizesHolder2, typename Overlaps > +__cuda_callable__ +void assertIndicesInRange( const SizesHolder1&, const SizesHolder2&, const Overlaps& ) +{} + +template< typename SizesHolder1, + typename SizesHolder2, + typename Overlaps, + typename Index, + typename... IndexTypes > +__cuda_callable__ +void assertIndicesInRange( const SizesHolder1& begins, const SizesHolder2& ends, const Overlaps& overlaps, Index&& i, IndexTypes&&... indices ) +{ + static_assert( SizesHolder1::getDimension() == SizesHolder2::getDimension(), + "Inconsistent begins and ends." ); +#ifndef NDEBUG + // sizes.template getSize<...>() cannot be inside the assert macro, but the variables + // shouldn't be declared when compiling without assertions + constexpr std::size_t level = SizesHolder1::getDimension() - sizeof...(indices) - 1; + const auto begin = begins.template getSize< level >(); + const auto end = ends.template getSize< level >(); + TNL_ASSERT_LE( begin - (decltype(begin)) get( overlaps ), i, "Input error - some index is below the lower bound." ); + TNL_ASSERT_LT( i, end + (decltype(end)) get( overlaps ), "Input error - some index is above the upper bound." ); +#endif + assertIndicesInRange( begins, ends, overlaps, std::forward< IndexTypes >( indices )... 
); +} + + +// helper for the assignment operator in NDArray +template< typename TargetHolder, + typename SourceHolder, + std::size_t level = TargetHolder::getDimension() - 1 > +struct SetSizesCopyHelper +{ + static void copy( TargetHolder& target, + const SourceHolder& source ) + { + if( target.template getStaticSize< level >() == 0 ) { + target.template setSize< level >( source.template getSize< level >() ); + SetSizesCopyHelper< TargetHolder, SourceHolder, level - 1 >::copy( target, source ); + } + else if( target.template getStaticSize< level >() != source.template getSize< level >() ) + throw std::logic_error( "Cannot copy sizes due to inconsistent underlying types (static sizes don't match)." ); + } +}; + +template< typename TargetHolder, + typename SourceHolder > +struct SetSizesCopyHelper< TargetHolder, SourceHolder, 0 > +{ + static void copy( TargetHolder& target, + const SourceHolder& source ) + { + if( target.template getStaticSize< 0 >() == 0 ) + target.template setSize< 0 >( source.template getSize< 0 >() ); + else if( target.template getStaticSize< 0 >() != source.template getSize< 0 >() ) + throw std::logic_error( "Cannot copy sizes due to inconsistent underlying types (static sizes don't match)." ); + } +}; + + +template< std::size_t level > +struct WeakCompareHelper +{ + template< typename SizesHolder1, + typename SizesHolder2 > + __cuda_callable__ + static void exec( const SizesHolder1& sizes1, const SizesHolder2& sizes2, bool& result ) + { + result &= sizes1.template getSize< level >() == sizes2.template getSize< level >(); + } +}; + +// helper for the assignment operator in NDArrayView +template< typename SizesHolder1, + typename SizesHolder2 > +__cuda_callable__ +bool sizesWeakCompare( const SizesHolder1& sizes1, const SizesHolder2& sizes2 ) +{ + static_assert( SizesHolder1::getDimension() == SizesHolder2::getDimension(), + "Cannot compare sizes of different dimensions." 
); + bool result = true; + TemplateStaticFor< std::size_t, 0, SizesHolder1::getDimension(), WeakCompareHelper >::exec( sizes1, sizes2, result ); + return result; +} + + +// helper for the forInternal and forBoundary methods (NDArray and DistributedNDArray) +template< std::size_t ConstValue, + typename TargetHolder, + typename SourceHolder, + typename Overlaps = make_constant_index_sequence< TargetHolder::getDimension(), 0 >, + std::size_t level = TargetHolder::getDimension() - 1 > +struct SetSizesSubtractHelper +{ + static void subtract( TargetHolder& target, + const SourceHolder& source ) + { + if( source.template getStaticSize< level >() == 0 ) + target.template setSize< level >( source.template getSize< level >() - ConstValue * ! get< level >( Overlaps{} ) ); + SetSizesSubtractHelper< ConstValue, TargetHolder, SourceHolder, Overlaps, level - 1 >::subtract( target, source ); + } +}; + +template< std::size_t ConstValue, + typename TargetHolder, + typename SourceHolder, + typename Overlaps > +struct SetSizesSubtractHelper< ConstValue, TargetHolder, SourceHolder, Overlaps, 0 > +{ + static void subtract( TargetHolder& target, + const SourceHolder& source ) + { + if( source.template getStaticSize< 0 >() == 0 ) + target.template setSize< 0 >( source.template getSize< 0 >() - ConstValue * ! get< 0 >( Overlaps{} ) ); + } +}; + + +// helper for the forInternal and forBoundary methods (DistributedNDArray) +template< std::size_t ConstValue, + typename TargetHolder, + typename SourceHolder, + typename Overlaps = make_constant_index_sequence< TargetHolder::getDimension(), 0 >, + std::size_t level = TargetHolder::getDimension() - 1 > +struct SetSizesAddHelper +{ + static void add( TargetHolder& target, + const SourceHolder& source ) + { + if( source.template getStaticSize< level >() == 0 ) + target.template setSize< level >( source.template getSize< level >() + ConstValue * ! 
get< level >( Overlaps{} ) ); + SetSizesAddHelper< ConstValue, TargetHolder, SourceHolder, Overlaps, level - 1 >::add( target, source ); + } +}; + +template< std::size_t ConstValue, + typename TargetHolder, + typename SourceHolder, + typename Overlaps > +struct SetSizesAddHelper< ConstValue, TargetHolder, SourceHolder, Overlaps, 0 > +{ + static void add( TargetHolder& target, + const SourceHolder& source ) + { + if( source.template getStaticSize< 0 >() == 0 ) + target.template setSize< 0 >( source.template getSize< 0 >() + ConstValue * ! get< 0 >( Overlaps{} ) ); + } +}; + + +// helper for the forLocalInternal, forLocalBoundary and forOverlaps methods (DistributedNDArray) +template< typename TargetHolder, + typename SourceHolder, + typename Overlaps = make_constant_index_sequence< TargetHolder::getDimension(), 0 >, + std::size_t level = TargetHolder::getDimension() - 1 > +struct SetSizesSubtractOverlapsHelper +{ + static void subtract( TargetHolder& target, + const SourceHolder& source ) + { + if( source.template getStaticSize< level >() == 0 ) + target.template setSize< level >( source.template getSize< level >() - get< level >( Overlaps{} ) ); + SetSizesSubtractOverlapsHelper< TargetHolder, SourceHolder, Overlaps, level - 1 >::subtract( target, source ); + } +}; + +template< typename TargetHolder, + typename SourceHolder, + typename Overlaps > +struct SetSizesSubtractOverlapsHelper< TargetHolder, SourceHolder, Overlaps, 0 > +{ + static void subtract( TargetHolder& target, + const SourceHolder& source ) + { + if( source.template getStaticSize< 0 >() == 0 ) + target.template setSize< 0 >( source.template getSize< 0 >() - get< 0 >( Overlaps{} ) ); + } +}; + + +// helper for the forLocalInternal, forLocalBoundary and forOverlaps methods (DistributedNDArray) +template< typename TargetHolder, + typename SourceHolder, + typename Overlaps = make_constant_index_sequence< TargetHolder::getDimension(), 0 >, + std::size_t level = TargetHolder::getDimension() - 1 > +struct 
SetSizesAddOverlapsHelper +{ + static void add( TargetHolder& target, + const SourceHolder& source ) + { + if( source.template getStaticSize< level >() == 0 ) + target.template setSize< level >( source.template getSize< level >() + get< level >( Overlaps{} ) ); + SetSizesAddOverlapsHelper< TargetHolder, SourceHolder, Overlaps, level - 1 >::add( target, source ); + } +}; + +template< typename TargetHolder, + typename SourceHolder, + typename Overlaps > +struct SetSizesAddOverlapsHelper< TargetHolder, SourceHolder, Overlaps, 0 > +{ + static void add( TargetHolder& target, + const SourceHolder& source ) + { + if( source.template getStaticSize< 0 >() == 0 ) + target.template setSize< 0 >( source.template getSize< 0 >() + get< 0 >( Overlaps{} ) ); + } +}; + + +// helper for the forInternal method (DistributedNDArray) +template< typename TargetHolder, + typename SourceHolder, + std::size_t level = TargetHolder::getDimension() - 1 > +struct SetSizesMaxHelper +{ + static void max( TargetHolder& target, + const SourceHolder& source ) + { + if( source.template getStaticSize< level >() == 0 ) + target.template setSize< level >( std::max( target.template getSize< level >(), source.template getSize< level >() ) ); + SetSizesMaxHelper< TargetHolder, SourceHolder, level - 1 >::max( target, source ); + } +}; + +template< typename TargetHolder, + typename SourceHolder > +struct SetSizesMaxHelper< TargetHolder, SourceHolder, 0 > +{ + static void max( TargetHolder& target, + const SourceHolder& source ) + { + if( source.template getStaticSize< 0 >() == 0 ) + target.template setSize< 0 >( std::max( target.template getSize< 0 >(), source.template getSize< 0 >() ) ); + } +}; + + +// helper for the forInternal method (DistributedNDArray) +template< typename TargetHolder, + typename SourceHolder, + std::size_t level = TargetHolder::getDimension() - 1 > +struct SetSizesMinHelper +{ + static void min( TargetHolder& target, + const SourceHolder& source ) + { + if( source.template 
getStaticSize< level >() == 0 ) + target.template setSize< level >( std::min( target.template getSize< level >(), source.template getSize< level >() ) ); + SetSizesMinHelper< TargetHolder, SourceHolder, level - 1 >::min( target, source ); + } +}; + +template< typename TargetHolder, + typename SourceHolder > +struct SetSizesMinHelper< TargetHolder, SourceHolder, 0 > +{ + static void min( TargetHolder& target, + const SourceHolder& source ) + { + if( source.template getStaticSize< 0 >() == 0 ) + target.template setSize< 0 >( std::min( target.template getSize< 0 >(), source.template getSize< 0 >() ) ); + } +}; + +} // namespace __ndarray_impl +} // namespace Containers +} // namespace TNL diff --git a/src/TNL/Containers/ndarray/Subarrays.h b/src/TNL/Containers/ndarray/Subarrays.h new file mode 100644 index 0000000000000000000000000000000000000000..d50a30ea1178743342685801f1c3c22af2a52c00 --- /dev/null +++ b/src/TNL/Containers/ndarray/Subarrays.h @@ -0,0 +1,356 @@ +/*************************************************************************** + Subarrays.h - description + ------------------- + begin : Dec 24, 2018 + copyright : (C) 2018 by Tomas Oberhuber et al. + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Jakub Klinkovsky + +#pragma once + +#include +#include +#include + +namespace TNL { +namespace Containers { +namespace __ndarray_impl { + +template< typename Dimensions, typename Permutation > +class SubpermutationGetter; + +template< std::size_t... dims, std::size_t... vals > +class SubpermutationGetter< std::index_sequence< dims... >, std::index_sequence< vals... > > +{ +private: + using Dimensions = std::index_sequence< dims... >; + using Permutation = std::index_sequence< vals... >; + using Subsequence = decltype( + filter_sequence< Dimensions >( Permutation{} ) + ); + + template< std::size_t... 
v > + static constexpr auto + get_subpermutation( std::index_sequence< v... > ) + { + using Subpermutation = std::index_sequence< count_smaller( v, v... )... >; + return Subpermutation{}; + } + +public: + using Subpermutation = decltype( + get_subpermutation( Subsequence{} ) + ); +}; + + +template< typename Dimensions, typename SizesHolder > +class SizesFilter; + +template< std::size_t... dims, typename Index, std::size_t... sizes > +class SizesFilter< std::index_sequence< dims... >, SizesHolder< Index, sizes... > > +{ +private: + using Dimensions = std::index_sequence< dims... >; + using SizesSequence = std::index_sequence< sizes... >; + using Subsequence = decltype( + concat_sequences( std::index_sequence< get_from_pack< dims >( sizes... ) >{} ... ) + ); + + template< std::size_t... v > + static constexpr auto + get_sizesholder( std::index_sequence< v... > ) + { + using Sizes = SizesHolder< Index, v... >; + return Sizes{}; + } + + template< std::size_t level = 0, typename = void > + struct SizeSetterHelper + { + template< typename NewSizes, + typename OldSizes > + __cuda_callable__ + static void setSizes( NewSizes& newSizes, + const OldSizes& oldSizes ) + { + if( oldSizes.template getStaticSize< level >() == 0 ) + newSizes.template setSize< level >( oldSizes.template getSize< get< level >( Dimensions{} ) >() ); + SizeSetterHelper< level + 1 >::setSizes( newSizes, oldSizes ); + } + }; + + template< typename _unused > + struct SizeSetterHelper< Dimensions::size() - 1, _unused > + { + template< typename NewSizes, + typename OldSizes > + __cuda_callable__ + static void setSizes( NewSizes& newSizes, + const OldSizes& oldSizes ) + { + static constexpr std::size_t level = Dimensions::size() - 1; + if( oldSizes.template getStaticSize< level >() == 0 ) + newSizes.template setSize< level >( oldSizes.template getSize< get< level >( Dimensions{} ) >() ); + } + }; + + template< std::size_t level = 0, typename = void > + struct IndexChecker + { + template< typename...
IndexTypes > + static bool check( IndexTypes&&... indices ) + { + static constexpr std::size_t d = get< level >( Dimensions{} ); + if( get_from_pack< d >( std::forward< IndexTypes >( indices )... ) != 0 ) + return false; + return IndexChecker< level + 1 >::check( std::forward< IndexTypes >( indices )... ); + } + }; + + template< typename _unused > + struct IndexChecker< Dimensions::size() - 1, _unused > + { + template< typename... IndexTypes > + static bool check( IndexTypes&&... indices ) + { + static constexpr std::size_t d = get< Dimensions::size() - 1 >( Dimensions{} ); + if( get_from_pack< d >( std::forward< IndexTypes >( indices )... ) != 0 ) + return false; + return true; + } + }; + +public: + using Sizes = decltype( + get_sizesholder( Subsequence{} ) + ); + + template< typename... IndexTypes > + __cuda_callable__ + static Sizes filterSizes( const SizesHolder< Index, sizes... >& oldSizes, IndexTypes&&... indices ) + { + Sizes newSizes; + + // assert that indices are 0 for the dimensions in the subarray + // (contraction of dimensions is not supported yet, and it does not + // make sense for static dimensions anyway) + TNL_ASSERT_TRUE( IndexChecker<>::check( std::forward< IndexTypes >( indices )... ), + "Static dimensions of the subarray must start at index 0 of the array." ); + + // set dynamic sizes + // pseudo-python-code: + // for d, D in enumerate(dims...): + // newSizes.setSize< d >( oldSizes.getSize< D >() ) + SizeSetterHelper<>::setSizes( newSizes, oldSizes ); + + return newSizes; + } +}; + + +template< typename Index, std::size_t Dimension > +struct DummyStrideBase +{ + static constexpr std::size_t getDimension() + { + return Dimension; + } + + static constexpr bool isContiguous() + { + return true; + } + + template< std::size_t level > + __cuda_callable__ + constexpr Index getStride( Index i = 0 ) const + { + return 1; + } +}; + +template< typename Index, + std::size_t... sizes > +class StridesHolder +: private SizesHolder< Index, sizes... 
> +{ + using BaseType = SizesHolder< Index, sizes... >; + +public: + using BaseType::getDimension; + + static constexpr bool isContiguous() + { + // a priori not contiguous (otherwise DummyStrideBase would be used) + return false; + } + + template< std::size_t level > + static constexpr std::size_t getStaticStride( Index i = 0 ) + { + return BaseType::template getStaticSize< level >(); + } + + template< std::size_t level > + __cuda_callable__ + Index getStride( Index i = 0 ) const + { + return BaseType::template getSize< level >(); + } + + template< std::size_t level > + __cuda_callable__ + void setStride( Index size ) + { + BaseType::template setSize< level >( size ); + } +}; + +template< typename Base, typename Permutation, std::size_t... Dimensions > +class SubarrayGetter; + +template< typename SliceInfo, typename Permutation, std::size_t... Dimensions > +class SubarrayGetter< NDArrayBase< SliceInfo >, Permutation, Dimensions... > +{ + // returns the number of factors in the stride product + template< std::size_t dim, std::size_t... vals > + static constexpr std::size_t get_end( std::index_sequence< vals... > _perm ) + { + if( dim == get< Permutation::size() - 1 >( Permutation{} ) ) + return 0; + std::size_t i = 0; + std::size_t count = 0; +// FIXME: nvcc chokes on the variadic brace-initialization +#ifndef __NVCC__ + for( auto v : std::initializer_list< std::size_t >{ vals... } ) +#else + for( auto v : (std::size_t [sizeof...(vals)]){ vals... } ) +#endif + { + if( i++ <= index_in_pack( dim, vals... ) ) + continue; + if( is_in_sequence( v, std::index_sequence< Dimensions... 
>{} ) ) + break; + count++; + } + return count; + } + + // static calculation of the stride product + template< typename SizesHolder, + std::size_t start_dim, + std::size_t end = get_end< start_dim >( Permutation{} ), + std::size_t level = 0, + typename = void > + struct StaticStrideGetter + { + static constexpr std::size_t get() + { + constexpr std::size_t start_offset = index_in_sequence( start_dim, Permutation{} ); + constexpr std::size_t dim = __ndarray_impl::get< start_offset + level + 1 >( Permutation{} ); + return SizesHolder::template getStaticSize< dim >() * StaticStrideGetter< SizesHolder, start_dim, end, level + 1 >::get(); + } + }; + + template< typename SizesHolder, std::size_t start_dim, std::size_t end, typename _unused > + struct StaticStrideGetter< SizesHolder, start_dim, end, end, _unused > + { + static constexpr std::size_t get() + { + return 1; + } + }; + + // dynamic calculation of the stride product + template< std::size_t start_dim, + std::size_t end = get_end< start_dim >( Permutation{} ), + std::size_t level = 0, + typename = void > + struct DynamicStrideGetter + { + template< typename SizesHolder > + static constexpr std::size_t get( const SizesHolder& sizes ) + { + constexpr std::size_t start_offset = index_in_sequence( start_dim, Permutation{} ); + constexpr std::size_t dim = __ndarray_impl::get< start_offset + level + 1 >( Permutation{} ); + return sizes.template getSize< dim >() * DynamicStrideGetter< start_dim, end, level + 1 >::get( sizes ); + } + }; + + template< std::size_t start_dim, std::size_t end, typename _unused > + struct DynamicStrideGetter< start_dim, end, end, _unused > + { + template< typename SizesHolder > + static constexpr std::size_t get( const SizesHolder& sizes ) + { + return 1; + } + }; + + // helper class for setting dynamic strides + template< std::size_t level = 0, typename = void > + struct StrideSetterHelper + { + template< typename StridesHolder, typename SizesHolder > + __cuda_callable__ + static void 
setStrides( StridesHolder& strides, const SizesHolder& sizes ) + { + static constexpr std::size_t dim = get_from_pack< level >( Dimensions... ); + if( StridesHolder::template getStaticStride< level >() == 0 ) + strides.template setStride< level >( DynamicStrideGetter< dim >::get( sizes ) ); + StrideSetterHelper< level + 1 >::setStrides( strides, sizes ); + } + }; + + template< typename _unused > + struct StrideSetterHelper< sizeof...(Dimensions) - 1, _unused > + { + template< typename StridesHolder, typename SizesHolder > + __cuda_callable__ + static void setStrides( StridesHolder& strides, const SizesHolder& sizes ) + { + static constexpr std::size_t level = sizeof...(Dimensions) - 1; + static constexpr std::size_t dim = get_from_pack< level >( Dimensions... ); + if( StridesHolder::template getStaticStride< level >() == 0 ) + strides.template setStride< level >( DynamicStrideGetter< dim >::get( sizes ) ); + } + }; + +public: + using Subpermutation = typename SubpermutationGetter< std::index_sequence< Dimensions... >, Permutation >::Subpermutation; + + template< typename SizesHolder, typename... IndexTypes > + __cuda_callable__ + static auto filterSizes( const SizesHolder& sizes, IndexTypes&&... indices ) + { + using Filter = SizesFilter< std::index_sequence< Dimensions... >, SizesHolder >; + return Filter::filterSizes( sizes, std::forward< IndexTypes >( indices )... ); + } + + template< typename SizesHolder, typename... IndexTypes > + __cuda_callable__ + static auto getStrides( const SizesHolder& sizes, IndexTypes&&... indices ) + { + using Strides = StridesHolder< typename SizesHolder::IndexType, + StaticStrideGetter< SizesHolder, Dimensions >::get()... 
>; + Strides strides; + + // set dynamic strides + // pseudo-python-code: + // for i, d in enumerate(Dimensions): + // if is_dynamic_dimension(d): + // strides.setStride< i >( dynamic_stride(d, sizes) ) + StrideSetterHelper<>::setStrides( strides, sizes ); + + return strides; + } +}; + +} // namespace __ndarray_impl +} // namespace Containers +} // namespace TNL diff --git a/src/TNL/Containers/ndarray/SynchronizerBuffers.h b/src/TNL/Containers/ndarray/SynchronizerBuffers.h new file mode 100644 index 0000000000000000000000000000000000000000..d54fddfd709dea111c1b4ef7eabdcbf1fdaa08ba --- /dev/null +++ b/src/TNL/Containers/ndarray/SynchronizerBuffers.h @@ -0,0 +1,88 @@ +/*************************************************************************** + SynchronizerBuffers.h - description + ------------------- + begin : Mar 30, 2019 + copyright : (C) 2019 by Tomas Oberhuber et al. + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Jakub Klinkovsky + +#pragma once + +#include + +namespace TNL { +namespace Containers { +namespace __ndarray_impl { + +template< typename DistributedNDArray, std::size_t level > +struct SynchronizerBuffersLayer +{ + SynchronizerBuffersLayer& getDimBuffers( std::integral_constant< std::size_t, level > ) + { + return *this; + } + + using NDArrayType = NDArray< typename DistributedNDArray::ValueType, + typename DistributedNDArray::SizesHolderType, + typename DistributedNDArray::PermutationType, + typename DistributedNDArray::DeviceType >; + NDArrayType left_send_buffer, left_recv_buffer, right_send_buffer, right_recv_buffer; + typename DistributedNDArray::LocalBeginsType left_send_offsets, left_recv_offsets, right_send_offsets, right_recv_offsets; + + int left_neighbor = -1; + int right_neighbor = -1; + + void reset() + { + left_send_buffer.reset(); + left_recv_buffer.reset(); + right_send_buffer.reset(); + 
right_recv_buffer.reset(); + + left_send_offsets = left_recv_offsets = right_send_offsets = right_recv_offsets = typename DistributedNDArray::LocalBeginsType{}; + + left_neighbor = right_neighbor = -1; + } +}; + +template< typename DistributedNDArray, + typename LevelTag = std::integral_constant< std::size_t, DistributedNDArray::getDimension() > > +struct SynchronizerBuffersLayerHelper +{}; + +template< typename DistributedNDArray, std::size_t level > +struct SynchronizerBuffersLayerHelper< DistributedNDArray, std::integral_constant< std::size_t, level > > +: public SynchronizerBuffersLayerHelper< DistributedNDArray, std::integral_constant< std::size_t, level - 1 > >, + public SynchronizerBuffersLayer< DistributedNDArray, level > +{ + using SynchronizerBuffersLayerHelper< DistributedNDArray, std::integral_constant< std::size_t, level - 1 > >::getDimBuffers; + using SynchronizerBuffersLayer< DistributedNDArray, level >::getDimBuffers; +}; + +template< typename DistributedNDArray > +struct SynchronizerBuffersLayerHelper< DistributedNDArray, std::integral_constant< std::size_t, 0 > > +: public SynchronizerBuffersLayer< DistributedNDArray, 0 > +{ + using SynchronizerBuffersLayer< DistributedNDArray, 0 >::getDimBuffers; +}; + +template< typename DistributedNDArray > +struct SynchronizerBuffers +: public SynchronizerBuffersLayerHelper< DistributedNDArray > +{ + using SynchronizerBuffersLayerHelper< DistributedNDArray >::getDimBuffers; + + template< std::size_t level > + auto& getDimBuffers() + { + return this->getDimBuffers( std::integral_constant< std::size_t, level >{} ); + } +}; + +} // namespace __ndarray_impl +} // namespace Containers +} // namespace TNL diff --git a/src/TNL/StaticFor.h b/src/TNL/StaticFor.h index 1539e05aa52cd25222098f10a38b2688417bd8dd..0fa3bc0ef7a0f5b916a9a7546d4d6f59758f0681 100644 --- a/src/TNL/StaticFor.h +++ b/src/TNL/StaticFor.h @@ -14,25 +14,47 @@ namespace TNL { +// Manual unrolling does not make sense for loops with a large iterations 
+// count. For a very large iterations count it would trigger the compiler's +// limit on recursive template instantiation. Also note that the compiler +// will (at least partially) unroll loops with static bounds anyway. +template< int Begin, int End, bool unrolled = (End - Begin <= 8) > +struct StaticFor; + template< int Begin, int End > -struct StaticFor +struct StaticFor< Begin, End, true > { - template< typename Function, typename... Args > - __cuda_callable__ - static void exec( const Function& f, Args... args ) - { - static_assert( Begin < End, "Wrong index interval for StaticFor. Being must be lower than end." ); - f( Begin, args... ); - StaticFor< Begin + 1, End >::exec( f, args... ); - }; + static_assert( Begin < End, "Wrong index interval for StaticFor. Begin must be less than end." ); + + template< typename Function, typename... Args > + __cuda_callable__ + static void exec( const Function& f, Args... args ) + { + f( Begin, args... ); + StaticFor< Begin + 1, End >::exec( f, args... ); + } }; template< int End > -struct StaticFor< End, End > +struct StaticFor< End, End, true > { - template< typename Function, typename... Args > - __cuda_callable__ - static void exec( const Function& f, Args... args ){}; + template< typename Function, typename... Args > + __cuda_callable__ + static void exec( const Function& f, Args... args ) {} +}; + +template< int Begin, int End > +struct StaticFor< Begin, End, false > +{ + static_assert( Begin <= End, "Wrong index interval for StaticFor. Begin must be less than or equal to end." ); + + template< typename Function, typename... Args > + __cuda_callable__ + static void exec( const Function& f, Args... args ) + { + for( int i = Begin; i < End; i++ ) + f( i, args... 
); + } }; } //namespace TNL diff --git a/src/UnitTests/Containers/CMakeLists.txt b/src/UnitTests/Containers/CMakeLists.txt index d33f5d2631bea262c9732d2489e02554738a72fb..c8cd88af9f3ae8df5109c439aba858bc059bca2d 100644 --- a/src/UnitTests/Containers/CMakeLists.txt +++ b/src/UnitTests/Containers/CMakeLists.txt @@ -120,6 +120,7 @@ ADD_TEST( StaticVectorOperationsTest ${EXECUTABLE_OUTPUT_PATH}/StaticVectorOpera ADD_SUBDIRECTORY( Multimaps ) +ADD_SUBDIRECTORY( ndarray ) if( ${BUILD_MPI} ) diff --git a/src/UnitTests/Containers/ndarray/CMakeLists.txt b/src/UnitTests/Containers/ndarray/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..2f87d64a9b79db1b29dad644660da0a94b56d9e4 --- /dev/null +++ b/src/UnitTests/Containers/ndarray/CMakeLists.txt @@ -0,0 +1,81 @@ +if( BUILD_CUDA ) + cuda_add_executable( NDArrayTest NDArrayTest.cu + OPTIONS ${CXX_TESTS_FLAGS} ) + target_link_libraries( NDArrayTest ${GTEST_BOTH_LIBRARIES} ) + add_test( NDArrayTest ${EXECUTABLE_OUTPUT_PATH}/NDArrayTest${CMAKE_EXECUTABLE_SUFFIX} ) +else() + add_executable( NDArrayTest NDArrayTest.cpp ) + target_compile_options( NDArrayTest PRIVATE ${CXX_TESTS_FLAGS} ) + target_link_libraries( NDArrayTest ${GTEST_BOTH_LIBRARIES} ) + add_test( NDArrayTest ${EXECUTABLE_OUTPUT_PATH}/NDArrayTest${CMAKE_EXECUTABLE_SUFFIX} ) +endif() + +add_executable( NDSubarrayTest NDSubarrayTest.cpp ) +target_compile_options( NDSubarrayTest PRIVATE ${CXX_TESTS_FLAGS} ) +target_link_libraries( NDSubarrayTest ${GTEST_BOTH_LIBRARIES} ) +add_test( NDSubarrayTest ${EXECUTABLE_OUTPUT_PATH}/NDSubarrayTest${CMAKE_EXECUTABLE_SUFFIX} ) + +add_executable( SlicedNDArrayTest SlicedNDArrayTest.cpp ) +target_compile_options( SlicedNDArrayTest PRIVATE ${CXX_TESTS_FLAGS} ) +target_link_libraries( SlicedNDArrayTest ${GTEST_BOTH_LIBRARIES} ) +add_test( SlicedNDArrayTest ${EXECUTABLE_OUTPUT_PATH}/SlicedNDArrayTest${CMAKE_EXECUTABLE_SUFFIX} ) + +add_executable( StaticNDArrayTest StaticNDArrayTest.cpp ) 
+target_compile_options( StaticNDArrayTest PRIVATE ${CXX_TESTS_FLAGS} ) +target_link_libraries( StaticNDArrayTest ${GTEST_BOTH_LIBRARIES} ) +add_test( StaticNDArrayTest ${EXECUTABLE_OUTPUT_PATH}/StaticNDArrayTest${CMAKE_EXECUTABLE_SUFFIX} ) + +if( BUILD_CUDA ) + cuda_add_executable( StaticNDArrayCudaTest StaticNDArrayCudaTest.cu + OPTIONS ${CXX_TESTS_FLAGS} ) + target_link_libraries( StaticNDArrayCudaTest ${GTEST_BOTH_LIBRARIES} ) + add_test( StaticNDArrayCudaTest ${EXECUTABLE_OUTPUT_PATH}/StaticNDArrayCudaTest${CMAKE_EXECUTABLE_SUFFIX} ) +endif() + +if( ${BUILD_MPI} ) + if( BUILD_CUDA ) + CUDA_ADD_EXECUTABLE( DistributedNDArray_1D_test DistributedNDArray_1D_test.cu + OPTIONS ${CXX_TESTS_FLAGS} ) + TARGET_LINK_LIBRARIES( DistributedNDArray_1D_test ${GTEST_BOTH_LIBRARIES} ) + + CUDA_ADD_EXECUTABLE( DistributedNDArray_semi1D_test DistributedNDArray_semi1D_test.cu + OPTIONS ${CXX_TESTS_FLAGS} ) + TARGET_LINK_LIBRARIES( DistributedNDArray_semi1D_test ${GTEST_BOTH_LIBRARIES} ) + + CUDA_ADD_EXECUTABLE( DistributedNDArrayOverlaps_1D_test DistributedNDArrayOverlaps_1D_test.cu + OPTIONS ${CXX_TESTS_FLAGS} ) + TARGET_LINK_LIBRARIES( DistributedNDArrayOverlaps_1D_test ${GTEST_BOTH_LIBRARIES} ) + + CUDA_ADD_EXECUTABLE( DistributedNDArrayOverlaps_semi1D_test DistributedNDArrayOverlaps_semi1D_test.cu + OPTIONS ${CXX_TESTS_FLAGS} ) + TARGET_LINK_LIBRARIES( DistributedNDArrayOverlaps_semi1D_test ${GTEST_BOTH_LIBRARIES} ) + else() + ADD_EXECUTABLE( DistributedNDArray_1D_test DistributedNDArray_1D_test.cpp ) + TARGET_COMPILE_OPTIONS( DistributedNDArray_1D_test PRIVATE ${CXX_TESTS_FLAGS} ) + TARGET_LINK_LIBRARIES( DistributedNDArray_1D_test ${GTEST_BOTH_LIBRARIES} ) + + ADD_EXECUTABLE( DistributedNDArray_semi1D_test DistributedNDArray_semi1D_test.cpp ) + TARGET_COMPILE_OPTIONS( DistributedNDArray_semi1D_test PRIVATE ${CXX_TESTS_FLAGS} ) + TARGET_LINK_LIBRARIES( DistributedNDArray_semi1D_test ${GTEST_BOTH_LIBRARIES} ) + + ADD_EXECUTABLE( DistributedNDArrayOverlaps_1D_test 
DistributedNDArrayOverlaps_1D_test.cpp ) + TARGET_COMPILE_OPTIONS( DistributedNDArrayOverlaps_1D_test PRIVATE ${CXX_TESTS_FLAGS} ) + TARGET_LINK_LIBRARIES( DistributedNDArrayOverlaps_1D_test ${GTEST_BOTH_LIBRARIES} ) + + ADD_EXECUTABLE( DistributedNDArrayOverlaps_semi1D_test DistributedNDArrayOverlaps_semi1D_test.cpp ) + TARGET_COMPILE_OPTIONS( DistributedNDArrayOverlaps_semi1D_test PRIVATE ${CXX_TESTS_FLAGS} ) + TARGET_LINK_LIBRARIES( DistributedNDArrayOverlaps_semi1D_test ${GTEST_BOTH_LIBRARIES} ) + endif() + + SET( mpi_test_parameters -np 4 -H localhost:4 "${EXECUTABLE_OUTPUT_PATH}/DistributedNDArray_1D_test${CMAKE_EXECUTABLE_SUFFIX}" ) + ADD_TEST( NAME DistributedNDArray_1D_test COMMAND "mpirun" ${mpi_test_parameters}) + + SET( mpi_test_parameters -np 4 -H localhost:4 "${EXECUTABLE_OUTPUT_PATH}/DistributedNDArray_semi1D_test${CMAKE_EXECUTABLE_SUFFIX}" ) + ADD_TEST( NAME DistributedNDArray_semi1D_test COMMAND "mpirun" ${mpi_test_parameters}) + + SET( mpi_test_parameters -np 4 -H localhost:4 "${EXECUTABLE_OUTPUT_PATH}/DistributedNDArrayOverlaps_1D_test${CMAKE_EXECUTABLE_SUFFIX}" ) + ADD_TEST( NAME DistributedNDArrayOverlaps_1D_test COMMAND "mpirun" ${mpi_test_parameters}) + + SET( mpi_test_parameters -np 4 -H localhost:4 "${EXECUTABLE_OUTPUT_PATH}/DistributedNDArrayOverlaps_semi1D_test${CMAKE_EXECUTABLE_SUFFIX}" ) + ADD_TEST( NAME DistributedNDArrayOverlaps_semi1D_test COMMAND "mpirun" ${mpi_test_parameters}) +endif() diff --git a/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_1D_test.cpp b/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_1D_test.cpp new file mode 100644 index 0000000000000000000000000000000000000000..b0aa8e8e2d8a8158d7743a508d57acbb73426b2e --- /dev/null +++ b/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_1D_test.cpp @@ -0,0 +1 @@ +#include "DistributedNDArrayOverlaps_1D_test.h" diff --git a/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_1D_test.cu 
b/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_1D_test.cu new file mode 100644 index 0000000000000000000000000000000000000000..b0aa8e8e2d8a8158d7743a508d57acbb73426b2e --- /dev/null +++ b/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_1D_test.cu @@ -0,0 +1 @@ +#include "DistributedNDArrayOverlaps_1D_test.h" diff --git a/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_1D_test.h b/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_1D_test.h new file mode 100644 index 0000000000000000000000000000000000000000..a7609ee7476931a0afed13711532bf162d0984ae --- /dev/null +++ b/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_1D_test.h @@ -0,0 +1,369 @@ +/*************************************************************************** + DistributedNDArrayOverlaps_1D_test.h - description + ------------------- + begin : Dec 27, 2018 + copyright : (C) 2018 by Tomas Oberhuber et al. + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +#ifdef HAVE_GTEST +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace TNL; +using namespace TNL::Containers; + +/* + * Light check of DistributedNDArray. + * + * - Number of processes is not limited. + * - Global size is hardcoded as 97 to force non-uniform distribution. + * - Communication group is hardcoded as AllGroup -- it may be changed as needed. 
+ */ +template< typename DistributedNDArray > +class DistributedNDArrayOverlaps_1D_test +: public ::testing::Test +{ +protected: + using ValueType = typename DistributedNDArray::ValueType; + using DeviceType = typename DistributedNDArray::DeviceType; + using CommunicatorType = typename DistributedNDArray::CommunicatorType; + using IndexType = typename DistributedNDArray::IndexType; + using DistributedNDArrayType = DistributedNDArray; + + // TODO: use ndarray + using LocalArrayType = Array< ValueType, DeviceType, IndexType >; + using LocalArrayViewType = ArrayView< ValueType, DeviceType, IndexType >; + + const int globalSize = 97; // prime number to force non-uniform distribution + const int overlaps = __ndarray_impl::get< 0 >( typename DistributedNDArray::OverlapsType{} ); + + const typename CommunicatorType::CommunicationGroup group = CommunicatorType::AllGroup; + + DistributedNDArrayType distributedNDArray; + + const int rank = CommunicatorType::GetRank(group); + const int nproc = CommunicatorType::GetSize(group); + + DistributedNDArrayOverlaps_1D_test() + { + using LocalRangeType = typename DistributedNDArray::LocalRangeType; + const LocalRangeType localRange = Partitioner< IndexType, CommunicatorType >::splitRange( globalSize, group ); + distributedNDArray.setSizes( globalSize ); + distributedNDArray.template setDistribution< 0 >( localRange.getBegin(), localRange.getEnd(), group ); + distributedNDArray.allocate(); + + EXPECT_EQ( distributedNDArray.template getLocalRange< 0 >(), localRange ); + EXPECT_EQ( distributedNDArray.getCommunicationGroup(), group ); + } +}; + +// types for which DistributedNDArrayOverlaps_1D_test is instantiated +using DistributedNDArrayTypes = ::testing::Types< + DistributedNDArray< NDArray< double, + SizesHolder< int, 0 >, + std::index_sequence< 0 >, + Devices::Host >, + Communicators::MpiCommunicator, + std::index_sequence< 2 > > +// TODO: does it make sense for NoDistrCommunicator? 
+// DistributedNDArray< NDArray< double, +// SizesHolder< int, 0 >, +// std::index_sequence< 0 >, +// Devices::Host >, +// Communicators::NoDistrCommunicator, +// std::index_sequence< 2 > > +#ifdef HAVE_CUDA + , + DistributedNDArray< NDArray< double, + SizesHolder< int, 0 >, + std::index_sequence< 0 >, + Devices::Cuda >, + Communicators::MpiCommunicator, + std::index_sequence< 2 > > +// TODO: does it make sense for NoDistrCommunicator? +// DistributedNDArray< NDArray< double, +// SizesHolder< int, 0 >, +// std::index_sequence< 0 >, +// Devices::Cuda >, +// Communicators::NoDistrCommunicator, +// std::index_sequence< 2 > > +#endif +>; + +TYPED_TEST_SUITE( DistributedNDArrayOverlaps_1D_test, DistributedNDArrayTypes ); + +TYPED_TEST( DistributedNDArrayOverlaps_1D_test, checkSumOfLocalSizes ) +{ + using CommunicatorType = typename TestFixture::CommunicatorType; + + const auto localRange = this->distributedNDArray.template getLocalRange< 0 >(); + const int localSize = localRange.getEnd() - localRange.getBegin(); + int sumOfLocalSizes = 0; + CommunicatorType::Allreduce( &localSize, &sumOfLocalSizes, 1, MPI_SUM, this->group ); + EXPECT_EQ( sumOfLocalSizes, this->globalSize ); + EXPECT_EQ( this->distributedNDArray.template getSize< 0 >(), this->globalSize ); + + EXPECT_EQ( this->distributedNDArray.getLocalStorageSize(), 2 * this->overlaps + localSize ); +} + +// separate function because nvcc does not allow __cuda_callable__ lambdas inside +// private or protected methods (which are created by TYPED_TEST macro) +template< typename DistributedArray > +void test_helper_forLocalInternal( DistributedArray& a ) +{ + using IndexType = typename DistributedArray::IndexType; + + const int overlaps = __ndarray_impl::get< 0 >( typename DistributedArray::OverlapsType{} ); + const auto localRange = a.template getLocalRange< 0 >(); + auto a_view = a.getView(); + + auto setter = [=] __cuda_callable__ ( IndexType i ) mutable + { + a_view( i ) += 1; + }; + + a.setValue( 0 ); + 
a.forLocalInternal( setter ); + + for( int gi = localRange.getBegin(); gi < localRange.getBegin() + overlaps; gi++ ) + EXPECT_EQ( a.getElement( gi ), 0 ) + << "gi = " << gi; + for( int gi = localRange.getBegin() + overlaps; gi < localRange.getEnd() - overlaps; gi++ ) + EXPECT_EQ( a.getElement( gi ), 1 ) + << "gi = " << gi; + for( int gi = localRange.getEnd() - overlaps; gi < localRange.getEnd(); gi++ ) + EXPECT_EQ( a.getElement( gi ), 0 ) + << "gi = " << gi; + + a.setValue( 0 ); + a_view.forLocalInternal( setter ); + + for( int gi = localRange.getBegin(); gi < localRange.getBegin() + overlaps; gi++ ) + EXPECT_EQ( a.getElement( gi ), 0 ) + << "gi = " << gi; + for( int gi = localRange.getBegin() + overlaps; gi < localRange.getEnd() - overlaps; gi++ ) + EXPECT_EQ( a.getElement( gi ), 1 ) + << "gi = " << gi; + for( int gi = localRange.getEnd() - overlaps; gi < localRange.getEnd(); gi++ ) + EXPECT_EQ( a.getElement( gi ), 0 ) + << "gi = " << gi; +} + +TYPED_TEST( DistributedNDArrayOverlaps_1D_test, forLocalInternal ) +{ + test_helper_forLocalInternal( this->distributedNDArray ); +} + +// separate function because nvcc does not allow __cuda_callable__ lambdas inside +// private or protected methods (which are created by TYPED_TEST macro) +template< typename DistributedArray > +void test_helper_forLocalBoundary( DistributedArray& a ) +{ + using IndexType = typename DistributedArray::IndexType; + + const int overlaps = __ndarray_impl::get< 0 >( typename DistributedArray::OverlapsType{} ); + const auto localRange = a.template getLocalRange< 0 >(); + auto a_view = a.getView(); + + auto setter = [=] __cuda_callable__ ( IndexType i ) mutable + { + a_view( i ) += 1; + }; + + a.setValue( 0 ); + a.forLocalBoundary( setter ); + + for( int gi = localRange.getBegin(); gi < localRange.getBegin() + overlaps; gi++ ) + EXPECT_EQ( a.getElement( gi ), 1 ) + << "gi = " << gi; + for( int gi = localRange.getBegin() + overlaps; gi < localRange.getEnd() - overlaps; gi++ ) + EXPECT_EQ( 
a.getElement( gi ), 0 ) + << "gi = " << gi; + for( int gi = localRange.getEnd() - overlaps; gi < localRange.getEnd(); gi++ ) + EXPECT_EQ( a.getElement( gi ), 1 ) + << "gi = " << gi; + + a.setValue( 0 ); + a_view.forLocalBoundary( setter ); + + for( int gi = localRange.getBegin(); gi < localRange.getBegin() + overlaps; gi++ ) + EXPECT_EQ( a.getElement( gi ), 1 ) + << "gi = " << gi; + for( int gi = localRange.getBegin() + overlaps; gi < localRange.getEnd() - overlaps; gi++ ) + EXPECT_EQ( a.getElement( gi ), 0 ) + << "gi = " << gi; + for( int gi = localRange.getEnd() - overlaps; gi < localRange.getEnd(); gi++ ) + EXPECT_EQ( a.getElement( gi ), 1 ) + << "gi = " << gi; +} + +TYPED_TEST( DistributedNDArrayOverlaps_1D_test, forLocalBoundary ) +{ + test_helper_forLocalBoundary( this->distributedNDArray ); +} + +// separate function because nvcc does not allow __cuda_callable__ lambdas inside +// private or protected methods (which are created by TYPED_TEST macro) +template< typename DistributedArray > +void test_helper_forOverlaps( DistributedArray& a ) +{ + using IndexType = typename DistributedArray::IndexType; + + const int overlaps = __ndarray_impl::get< 0 >( typename DistributedArray::OverlapsType{} ); + const auto localRange = a.template getLocalRange< 0 >(); + auto a_view = a.getView(); + + auto setter = [=] __cuda_callable__ ( IndexType i ) mutable + { + a_view( i ) += 1; + }; + + a.setValue( 0 ); + a.forOverlaps( setter ); + + for( int gi = localRange.getBegin() - overlaps; gi < localRange.getBegin(); gi++ ) + EXPECT_EQ( a.getElement( gi ), 1 ) + << "gi = " << gi; + for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) + EXPECT_EQ( a.getElement( gi ), 0 ) + << "gi = " << gi; + for( int gi = localRange.getEnd(); gi < localRange.getEnd() + overlaps; gi++ ) + EXPECT_EQ( a.getElement( gi ), 1 ) + << "gi = " << gi; + + a.setValue( 0 ); + a_view.forOverlaps( setter ); + + for( int gi = localRange.getBegin() - overlaps; gi < localRange.getBegin(); gi++ ) 
+ EXPECT_EQ( a.getElement( gi ), 1 ) + << "gi = " << gi; + for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) + EXPECT_EQ( a.getElement( gi ), 0 ) + << "gi = " << gi; + for( int gi = localRange.getEnd(); gi < localRange.getEnd() + overlaps; gi++ ) + EXPECT_EQ( a.getElement( gi ), 1 ) + << "gi = " << gi; +} + +TYPED_TEST( DistributedNDArrayOverlaps_1D_test, forOverlaps ) +{ + test_helper_forOverlaps( this->distributedNDArray ); +} + +// separate function because nvcc does not allow __cuda_callable__ lambdas inside +// private or protected methods (which are created by TYPED_TEST macro) +template< typename DistributedArray > +void test_helper_synchronize( DistributedArray& a, const int rank, const int nproc ) +{ + using IndexType = typename DistributedArray::IndexType; + + const int overlaps = __ndarray_impl::get< 0 >( typename DistributedArray::OverlapsType{} ); + const auto localRange = a.template getLocalRange< 0 >(); + auto a_view = a.getView(); + + auto setter = [=] __cuda_callable__ ( IndexType i ) mutable + { + a_view( i ) = i; + }; + + a.setValue( -1 ); + a.forAll( setter ); + DistributedNDArraySynchronizer< DistributedArray > s1; + s1.synchronize( a ); + + for( int gi = localRange.getBegin() - overlaps; gi < localRange.getBegin(); gi++ ) + EXPECT_EQ( a.getElement( gi ), gi + ((rank == 0) ? 97 : 0) ); + for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) + EXPECT_EQ( a.getElement( gi ), gi ); + for( int gi = localRange.getEnd(); gi < localRange.getEnd() + overlaps; gi++ ) + EXPECT_EQ( a.getElement( gi ), gi - ((rank == nproc-1) ? 97 : 0) ); + + a.setValue( -1 ); + a_view.forAll( setter ); + DistributedNDArraySynchronizer< decltype(a_view) > s2; + s2.synchronize( a_view ); + + for( int gi = localRange.getBegin() - overlaps; gi < localRange.getBegin(); gi++ ) + EXPECT_EQ( a.getElement( gi ), gi + ((rank == 0) ? 
97 : 0) ); + for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) + EXPECT_EQ( a.getElement( gi ), gi ); + for( int gi = localRange.getEnd(); gi < localRange.getEnd() + overlaps; gi++ ) + EXPECT_EQ( a.getElement( gi ), gi - ((rank == nproc-1) ? 97 : 0) ); +} + +TYPED_TEST( DistributedNDArrayOverlaps_1D_test, synchronize ) +{ + test_helper_synchronize( this->distributedNDArray, this->rank, this->nproc ); +} + +#endif // HAVE_GTEST + + +#if (defined(HAVE_GTEST) && defined(HAVE_MPI)) +using CommunicatorType = Communicators::MpiCommunicator; + +#include + +class MinimalistBufferedPrinter +: public ::testing::EmptyTestEventListener +{ +private: + std::stringstream sout; + +public: + // Called before a test starts. + virtual void OnTestStart(const ::testing::TestInfo& test_info) + { + sout << test_info.test_case_name() << "." << test_info.name() << " Start." << std::endl; + } + + // Called after a failed assertion or a SUCCEED() invocation. + virtual void OnTestPartResult(const ::testing::TestPartResult& test_part_result) + { + sout << (test_part_result.failed() ? 
"====Failure=== " : "===Success=== ") + << test_part_result.file_name() << " " + << test_part_result.line_number() <listeners(); + + delete listeners.Release(listeners.default_result_printer()); + listeners.Append(new MinimalistBufferedPrinter); + + Communicators::ScopedInitializer< CommunicatorType > mpi(argc, argv); + #endif + return RUN_ALL_TESTS(); +#else + throw GtestMissingError(); +#endif +} diff --git a/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_semi1D_test.cpp b/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_semi1D_test.cpp new file mode 100644 index 0000000000000000000000000000000000000000..cf74a71d11fa94ca9b2ad83c1389445c965a6797 --- /dev/null +++ b/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_semi1D_test.cpp @@ -0,0 +1 @@ +#include "DistributedNDArrayOverlaps_semi1D_test.h" diff --git a/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_semi1D_test.cu b/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_semi1D_test.cu new file mode 100644 index 0000000000000000000000000000000000000000..cf74a71d11fa94ca9b2ad83c1389445c965a6797 --- /dev/null +++ b/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_semi1D_test.cu @@ -0,0 +1 @@ +#include "DistributedNDArrayOverlaps_semi1D_test.h" diff --git a/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_semi1D_test.h b/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_semi1D_test.h new file mode 100644 index 0000000000000000000000000000000000000000..a7f28ead5de63420e97730592948a1cfc2622b11 --- /dev/null +++ b/src/UnitTests/Containers/ndarray/DistributedNDArrayOverlaps_semi1D_test.h @@ -0,0 +1,409 @@ +/*************************************************************************** + DistributedNDArrayOverlaps_semi1D_test.h - description + ------------------- + begin : Dec 9, 2018 + copyright : (C) 2018 by Tomas Oberhuber et al. 
+ email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +#ifdef HAVE_GTEST +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace TNL; +using namespace TNL::Containers; + +/* + * Light check of DistributedNDArray. + * + * - Number of processes is not limited. + * - Global size is hardcoded as 97 to force non-uniform distribution. + * - Communication group is hardcoded as AllGroup -- it may be changed as needed. + */ +template< typename DistributedNDArray > +class DistributedNDArrayOverlaps_semi1D_test +: public ::testing::Test +{ +protected: + using ValueType = typename DistributedNDArray::ValueType; + using DeviceType = typename DistributedNDArray::DeviceType; + using CommunicatorType = typename DistributedNDArray::CommunicatorType; + using IndexType = typename DistributedNDArray::IndexType; + using DistributedNDArrayType = DistributedNDArray; + + // TODO: use ndarray + using LocalArrayType = Array< ValueType, DeviceType, IndexType >; + using LocalArrayViewType = ArrayView< ValueType, DeviceType, IndexType >; + + const int globalSize = 97; // prime number to force non-uniform distribution + const int overlaps = __ndarray_impl::get< 1 >( typename DistributedNDArray::OverlapsType{} ); + + const typename CommunicatorType::CommunicationGroup group = CommunicatorType::AllGroup; + + DistributedNDArrayType distributedNDArray; + + const int rank = CommunicatorType::GetRank(group); + const int nproc = CommunicatorType::GetSize(group); + + DistributedNDArrayOverlaps_semi1D_test() + { + using LocalRangeType = typename DistributedNDArray::LocalRangeType; + const LocalRangeType localRange = Partitioner< IndexType, CommunicatorType >::splitRange( globalSize, group ); + distributedNDArray.setSizes( 0, globalSize, globalSize / 2 ); + distributedNDArray.template setDistribution< 1 >( localRange.getBegin(), localRange.getEnd(), group ); + 
distributedNDArray.allocate(); + + EXPECT_EQ( distributedNDArray.template getLocalRange< 1 >(), localRange ); + EXPECT_EQ( distributedNDArray.getCommunicationGroup(), group ); + } +}; + +// types for which DistributedNDArrayOverlaps_semi1D_test is instantiated +using DistributedNDArrayTypes = ::testing::Types< + DistributedNDArray< NDArray< double, + SizesHolder< int, 9, 0, 0 >, // Q, X, Y + std::index_sequence< 0, 1, 2 >, // permutation - should not matter + Devices::Host >, + Communicators::MpiCommunicator, + std::index_sequence< 0, 2, 0 > > +#ifdef HAVE_CUDA + , + DistributedNDArray< NDArray< double, + SizesHolder< int, 9, 0, 0 >, // Q, X, Y + std::index_sequence< 0, 1, 2 >, // permutation - should not matter + Devices::Cuda >, + Communicators::MpiCommunicator, + std::index_sequence< 0, 2, 0 > > +#endif +>; + +TYPED_TEST_SUITE( DistributedNDArrayOverlaps_semi1D_test, DistributedNDArrayTypes ); + +TYPED_TEST( DistributedNDArrayOverlaps_semi1D_test, checkSumOfLocalSizes ) +{ + using CommunicatorType = typename TestFixture::CommunicatorType; + + const auto localRange = this->distributedNDArray.template getLocalRange< 1 >(); + const int localSize = localRange.getEnd() - localRange.getBegin(); + int sumOfLocalSizes = 0; + CommunicatorType::Allreduce( &localSize, &sumOfLocalSizes, 1, MPI_SUM, this->group ); + EXPECT_EQ( sumOfLocalSizes, this->globalSize ); + EXPECT_EQ( this->distributedNDArray.template getSize< 1 >(), this->globalSize ); + + EXPECT_EQ( this->distributedNDArray.getLocalStorageSize(), 9 * (2 * this->overlaps + localSize) * (this->globalSize / 2) ); +} + +// separate function because nvcc does not allow __cuda_callable__ lambdas inside +// private or protected methods (which are created by TYPED_TEST macro) +template< typename DistributedArray > +void test_helper_forLocalInternal( DistributedArray& a ) +{ + using IndexType = typename DistributedArray::IndexType; + + const int overlaps = __ndarray_impl::get< 1 >( typename DistributedArray::OverlapsType{} 
); + const auto localRange = a.template getLocalRange< 1 >(); + auto a_view = a.getView(); + + auto setter = [=] __cuda_callable__ ( IndexType q, IndexType i, IndexType j ) mutable + { + a_view( q, i, j ) += 1; + }; + + a.setValue( 0 ); + a.forLocalInternal( setter ); + + for( int q = 0; q < 9; q++ ) + for( int gi = localRange.getBegin(); gi < localRange.getBegin() + overlaps; gi++ ) + for( int j = 0; j < a.template getSize< 2 >(); j++ ) + EXPECT_EQ( a.getElement( q, gi, j ), 0 ) + << "gi = " << gi; + for( int q = 0; q < 9; q++ ) + for( int gi = localRange.getBegin() + overlaps; gi < localRange.getEnd() - overlaps; gi++ ) + for( int j = 0; j < a.template getSize< 2 >(); j++ ) + EXPECT_EQ( a.getElement( q, gi, j ), 1 ) + << "gi = " << gi; + for( int q = 0; q < 9; q++ ) + for( int gi = localRange.getEnd() - overlaps; gi < localRange.getEnd(); gi++ ) + for( int j = 0; j < a.template getSize< 2 >(); j++ ) + EXPECT_EQ( a.getElement( q, gi, j ), 0 ) + << "gi = " << gi; + + a.setValue( 0 ); + a_view.forLocalInternal( setter ); + + for( int q = 0; q < 9; q++ ) + for( int gi = localRange.getBegin(); gi < localRange.getBegin() + overlaps; gi++ ) + for( int j = 0; j < a.template getSize< 2 >(); j++ ) + EXPECT_EQ( a.getElement( q, gi, j ), 0 ) + << "gi = " << gi; + for( int q = 0; q < 9; q++ ) + for( int gi = localRange.getBegin() + overlaps; gi < localRange.getEnd() - overlaps; gi++ ) + for( int j = 0; j < a.template getSize< 2 >(); j++ ) + EXPECT_EQ( a.getElement( q, gi, j ), 1 ) + << "gi = " << gi; + for( int q = 0; q < 9; q++ ) + for( int gi = localRange.getEnd() - overlaps; gi < localRange.getEnd(); gi++ ) + for( int j = 0; j < a.template getSize< 2 >(); j++ ) + EXPECT_EQ( a.getElement( q, gi, j ), 0 ) + << "gi = " << gi; +} + +TYPED_TEST( DistributedNDArrayOverlaps_semi1D_test, forLocalInternal ) +{ + test_helper_forLocalInternal( this->distributedNDArray ); +} + +// separate function because nvcc does not allow __cuda_callable__ lambdas inside +// private or protected 
methods (which are created by TYPED_TEST macro) +template< typename DistributedArray > +void test_helper_forLocalBoundary( DistributedArray& a ) +{ + using IndexType = typename DistributedArray::IndexType; + + const int overlaps = __ndarray_impl::get< 1 >( typename DistributedArray::OverlapsType{} ); + const auto localRange = a.template getLocalRange< 1 >(); + auto a_view = a.getView(); + + auto setter = [=] __cuda_callable__ ( IndexType q, IndexType i, IndexType j ) mutable + { + a_view( q, i, j ) += 1; + }; + + a.setValue( 0 ); + a.forLocalBoundary( setter ); + + for( int q = 0; q < 9; q++ ) + for( int gi = localRange.getBegin(); gi < localRange.getBegin() + overlaps; gi++ ) + for( int j = 0; j < a.template getSize< 2 >(); j++ ) + EXPECT_EQ( a.getElement( q, gi, j ), 1 ) + << "gi = " << gi; + for( int q = 0; q < 9; q++ ) + for( int gi = localRange.getBegin() + overlaps; gi < localRange.getEnd() - overlaps; gi++ ) + for( int j = 0; j < a.template getSize< 2 >(); j++ ) + EXPECT_EQ( a.getElement( q, gi, j ), 0 ) + << "gi = " << gi; + for( int q = 0; q < 9; q++ ) + for( int gi = localRange.getEnd() - overlaps; gi < localRange.getEnd(); gi++ ) + for( int j = 0; j < a.template getSize< 2 >(); j++ ) + EXPECT_EQ( a.getElement( q, gi, j ), 1 ) + << "gi = " << gi; + + a.setValue( 0 ); + a_view.forLocalBoundary( setter ); + + for( int q = 0; q < 9; q++ ) + for( int gi = localRange.getBegin(); gi < localRange.getBegin() + overlaps; gi++ ) + for( int j = 0; j < a.template getSize< 2 >(); j++ ) + EXPECT_EQ( a.getElement( q, gi, j ), 1 ) + << "gi = " << gi; + for( int q = 0; q < 9; q++ ) + for( int gi = localRange.getBegin() + overlaps; gi < localRange.getEnd() - overlaps; gi++ ) + for( int j = 0; j < a.template getSize< 2 >(); j++ ) + EXPECT_EQ( a.getElement( q, gi, j ), 0 ) + << "gi = " << gi; + for( int q = 0; q < 9; q++ ) + for( int gi = localRange.getEnd() - overlaps; gi < localRange.getEnd(); gi++ ) + for( int j = 0; j < a.template getSize< 2 >(); j++ ) + EXPECT_EQ( 
a.getElement( q, gi, j ), 1 ) + << "gi = " << gi; +} + +TYPED_TEST( DistributedNDArrayOverlaps_semi1D_test, forLocalBoundary ) +{ + test_helper_forLocalBoundary( this->distributedNDArray ); +} + +// separate function because nvcc does not allow __cuda_callable__ lambdas inside +// private or protected methods (which are created by TYPED_TEST macro) +template< typename DistributedArray > +void test_helper_forOverlaps( DistributedArray& a ) +{ + using IndexType = typename DistributedArray::IndexType; + + const int overlaps = __ndarray_impl::get< 1 >( typename DistributedArray::OverlapsType{} ); + const auto localRange = a.template getLocalRange< 1 >(); + auto a_view = a.getView(); + + auto setter = [=] __cuda_callable__ ( IndexType q, IndexType i, IndexType j ) mutable + { + a_view( q, i, j ) += 1; + }; + + a.setValue( 0 ); + a.forOverlaps( setter ); + + for( int q = 0; q < 9; q++ ) + for( int gi = localRange.getBegin() - overlaps; gi < localRange.getBegin(); gi++ ) + for( int j = 0; j < a.template getSize< 2 >(); j++ ) + EXPECT_EQ( a.getElement( q, gi, j ), 1 ) + << "gi = " << gi; + for( int q = 0; q < 9; q++ ) + for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) + for( int j = 0; j < a.template getSize< 2 >(); j++ ) + EXPECT_EQ( a.getElement( q, gi, j ), 0 ) + << "gi = " << gi; + for( int q = 0; q < 9; q++ ) + for( int gi = localRange.getEnd(); gi < localRange.getEnd() + overlaps; gi++ ) + for( int j = 0; j < a.template getSize< 2 >(); j++ ) + EXPECT_EQ( a.getElement( q, gi, j ), 1 ) + << "gi = " << gi; + + a.setValue( 0 ); + a_view.forOverlaps( setter ); + + for( int q = 0; q < 9; q++ ) + for( int gi = localRange.getBegin() - overlaps; gi < localRange.getBegin(); gi++ ) + for( int j = 0; j < a.template getSize< 2 >(); j++ ) + EXPECT_EQ( a.getElement( q, gi, j ), 1 ) + << "gi = " << gi; + for( int q = 0; q < 9; q++ ) + for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) + for( int j = 0; j < a.template getSize< 2 >(); j++ ) + 
EXPECT_EQ( a.getElement( q, gi, j ), 0 ) + << "gi = " << gi; + for( int q = 0; q < 9; q++ ) + for( int gi = localRange.getEnd(); gi < localRange.getEnd() + overlaps; gi++ ) + for( int j = 0; j < a.template getSize< 2 >(); j++ ) + EXPECT_EQ( a.getElement( q, gi, j ), 1 ) + << "gi = " << gi; +} + +TYPED_TEST( DistributedNDArrayOverlaps_semi1D_test, forOverlaps ) +{ + test_helper_forOverlaps( this->distributedNDArray ); +} + +// separate function because nvcc does not allow __cuda_callable__ lambdas inside +// private or protected methods (which are created by TYPED_TEST macro) +template< typename DistributedArray > +void test_helper_synchronize( DistributedArray& a, const int rank, const int nproc ) +{ + using IndexType = typename DistributedArray::IndexType; + + const int overlaps = __ndarray_impl::get< 1 >( typename DistributedArray::OverlapsType{} ); + const auto localRange = a.template getLocalRange< 1 >(); + auto a_view = a.getView(); + + auto setter = [=] __cuda_callable__ ( IndexType q, IndexType i, IndexType j ) mutable + { + a_view( i ) = i; + }; + + a.setValue( -1 ); + a.forAll( setter ); + DistributedNDArraySynchronizer< DistributedArray > s1; + s1.synchronize( a ); + + for( int q = 0; q < 9; q++ ) + for( int gi = localRange.getBegin() - overlaps; gi < localRange.getBegin(); gi++ ) + for( int j = 0; j < a.template getSize< 2 >(); j++ ) + EXPECT_EQ( a.getElement( q, gi, j ), gi + ((rank == 0) ? 97 : 0) ) + << "gi = " << gi; + for( int q = 0; q < 9; q++ ) + for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) + for( int j = 0; j < a.template getSize< 2 >(); j++ ) + EXPECT_EQ( a.getElement( q, gi, j ), gi ) + << "gi = " << gi; + for( int q = 0; q < 9; q++ ) + for( int gi = localRange.getEnd(); gi < localRange.getEnd() + overlaps; gi++ ) + for( int j = 0; j < a.template getSize< 2 >(); j++ ) + EXPECT_EQ( a.getElement( q, gi, j ), gi - ((rank == nproc-1) ? 
97 : 0) ) + << "gi = " << gi; + + a.setValue( -1 ); + a_view.forAll( setter ); + DistributedNDArraySynchronizer< decltype(a_view) > s2; + s2.synchronize( a_view ); + + for( int q = 0; q < 9; q++ ) + for( int gi = localRange.getBegin() - overlaps; gi < localRange.getBegin(); gi++ ) + for( int j = 0; j < a.template getSize< 2 >(); j++ ) + EXPECT_EQ( a.getElement( q, gi, j ), gi + ((rank == 0) ? 97 : 0) ) + << "gi = " << gi; + for( int q = 0; q < 9; q++ ) + for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) + for( int j = 0; j < a.template getSize< 2 >(); j++ ) + EXPECT_EQ( a.getElement( q, gi, j ), gi ) + << "gi = " << gi; + for( int q = 0; q < 9; q++ ) + for( int gi = localRange.getEnd(); gi < localRange.getEnd() + overlaps; gi++ ) + for( int j = 0; j < a.template getSize< 2 >(); j++ ) + EXPECT_EQ( a.getElement( q, gi, j ), gi - ((rank == nproc-1) ? 97 : 0) ) + << "gi = " << gi; +} + +//TYPED_TEST( DistributedNDArrayOverlaps_semi1D_test, synchronize ) +//{ +// test_helper_synchronize( this->distributedNDArray, this->rank, this->nproc ); +//} + +#endif // HAVE_GTEST + + +#if (defined(HAVE_GTEST) && defined(HAVE_MPI)) +using CommunicatorType = Communicators::MpiCommunicator; + +#include + +class MinimalistBufferedPrinter +: public ::testing::EmptyTestEventListener +{ +private: + std::stringstream sout; + +public: + // Called before a test starts. + virtual void OnTestStart(const ::testing::TestInfo& test_info) + { + sout << test_info.test_case_name() << "." << test_info.name() << " Start." << std::endl; + } + + // Called after a failed assertion or a SUCCEED() invocation. + virtual void OnTestPartResult(const ::testing::TestPartResult& test_part_result) + { + sout << (test_part_result.failed() ? 
"====Failure=== " : "===Success=== ") + << test_part_result.file_name() << " " + << test_part_result.line_number() <listeners(); + + delete listeners.Release(listeners.default_result_printer()); + listeners.Append(new MinimalistBufferedPrinter); + + Communicators::ScopedInitializer< CommunicatorType > mpi(argc, argv); + #endif + return RUN_ALL_TESTS(); +#else + throw GtestMissingError(); +#endif +} diff --git a/src/UnitTests/Containers/ndarray/DistributedNDArray_1D_test.cpp b/src/UnitTests/Containers/ndarray/DistributedNDArray_1D_test.cpp new file mode 100644 index 0000000000000000000000000000000000000000..f234d7711a1056e2bf0298a7ff87bacac0d8717f --- /dev/null +++ b/src/UnitTests/Containers/ndarray/DistributedNDArray_1D_test.cpp @@ -0,0 +1 @@ +#include "DistributedNDArray_1D_test.h" diff --git a/src/UnitTests/Containers/ndarray/DistributedNDArray_1D_test.cu b/src/UnitTests/Containers/ndarray/DistributedNDArray_1D_test.cu new file mode 100644 index 0000000000000000000000000000000000000000..f234d7711a1056e2bf0298a7ff87bacac0d8717f --- /dev/null +++ b/src/UnitTests/Containers/ndarray/DistributedNDArray_1D_test.cu @@ -0,0 +1 @@ +#include "DistributedNDArray_1D_test.h" diff --git a/src/UnitTests/Containers/ndarray/DistributedNDArray_1D_test.h b/src/UnitTests/Containers/ndarray/DistributedNDArray_1D_test.h new file mode 100644 index 0000000000000000000000000000000000000000..3dda2d1b4716e8dc92671567c714b6ac9b1150d5 --- /dev/null +++ b/src/UnitTests/Containers/ndarray/DistributedNDArray_1D_test.h @@ -0,0 +1,555 @@ +/*************************************************************************** + DistributedNDArray_1D_test.h - description + ------------------- + begin : Dec 27, 2018 + copyright : (C) 2018 by Tomas Oberhuber et al. 
+ email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +#ifdef HAVE_GTEST +#include + +#include +#include +#include +#include +#include +#include +#include + +using namespace TNL; +using namespace TNL::Containers; + +/* + * Light check of DistributedNDArray. + * + * - Number of processes is not limited. + * - Global size is hardcoded as 97 to force non-uniform distribution. + * - Communication group is hardcoded as AllGroup -- it may be changed as needed. + */ +template< typename DistributedNDArray > +class DistributedNDArray_1D_test +: public ::testing::Test +{ +protected: + using ValueType = typename DistributedNDArray::ValueType; + using DeviceType = typename DistributedNDArray::DeviceType; + using CommunicatorType = typename DistributedNDArray::CommunicatorType; + using IndexType = typename DistributedNDArray::IndexType; + using DistributedNDArrayType = DistributedNDArray; + + // TODO: use ndarray + using LocalArrayType = Array< ValueType, DeviceType, IndexType >; + using LocalArrayViewType = ArrayView< ValueType, DeviceType, IndexType >; + + const int globalSize = 97; // prime number to force non-uniform distribution + + const typename CommunicatorType::CommunicationGroup group = CommunicatorType::AllGroup; + + DistributedNDArrayType distributedNDArray; + + const int rank = CommunicatorType::GetRank(group); + const int nproc = CommunicatorType::GetSize(group); + + DistributedNDArray_1D_test() + { + using LocalRangeType = typename DistributedNDArray::LocalRangeType; + const LocalRangeType localRange = Partitioner< IndexType, CommunicatorType >::splitRange( globalSize, group ); + distributedNDArray.setSizes( globalSize ); + distributedNDArray.template setDistribution< 0 >( localRange.getBegin(), localRange.getEnd(), group ); + distributedNDArray.allocate(); + + EXPECT_EQ( distributedNDArray.template getLocalRange< 0 >(), localRange ); + EXPECT_EQ( distributedNDArray.getCommunicationGroup(), group 
); + } +}; + +// types for which DistributedNDArray_1D_test is instantiated +using DistributedNDArrayTypes = ::testing::Types< + DistributedNDArray< NDArray< double, + SizesHolder< int, 0 >, + std::index_sequence< 0 >, + Devices::Host >, + Communicators::MpiCommunicator >, + DistributedNDArray< NDArray< double, + SizesHolder< int, 0 >, + std::index_sequence< 0 >, + Devices::Host >, + Communicators::NoDistrCommunicator > +#ifdef HAVE_CUDA + , + DistributedNDArray< NDArray< double, + SizesHolder< int, 0 >, + std::index_sequence< 0 >, + Devices::Cuda >, + Communicators::MpiCommunicator >, + DistributedNDArray< NDArray< double, + SizesHolder< int, 0 >, + std::index_sequence< 0 >, + Devices::Cuda >, + Communicators::NoDistrCommunicator > +#endif +>; + +TYPED_TEST_SUITE( DistributedNDArray_1D_test, DistributedNDArrayTypes ); + +TYPED_TEST( DistributedNDArray_1D_test, checkSumOfLocalSizes ) +{ + using CommunicatorType = typename TestFixture::CommunicatorType; + + const auto localRange = this->distributedNDArray.template getLocalRange< 0 >(); + const int localSize = localRange.getEnd() - localRange.getBegin(); + int sumOfLocalSizes = 0; + CommunicatorType::Allreduce( &localSize, &sumOfLocalSizes, 1, MPI_SUM, this->group ); + EXPECT_EQ( sumOfLocalSizes, this->globalSize ); + EXPECT_EQ( this->distributedNDArray.template getSize< 0 >(), this->globalSize ); +} + +TYPED_TEST( DistributedNDArray_1D_test, setLike ) +{ + using DistributedNDArrayType = typename TestFixture::DistributedNDArrayType; + + const auto localRange = this->distributedNDArray.template getLocalRange< 0 >(); + EXPECT_EQ( this->distributedNDArray.getLocalStorageSize(), localRange.getEnd() - localRange.getBegin() ); + DistributedNDArrayType copy; + EXPECT_EQ( copy.getLocalStorageSize(), 0 ); + copy.setLike( this->distributedNDArray ); + EXPECT_EQ( copy.getLocalStorageSize(), localRange.getEnd() - localRange.getBegin() ); +} + +TYPED_TEST( DistributedNDArray_1D_test, reset ) +{ + const auto localRange = 
this->distributedNDArray.template getLocalRange< 0 >(); + EXPECT_EQ( this->distributedNDArray.getLocalStorageSize(), localRange.getEnd() - localRange.getBegin() ); + this->distributedNDArray.reset(); + EXPECT_EQ( this->distributedNDArray.getLocalStorageSize(), 0 ); +} + +// separate function because nvcc does not allow __cuda_callable__ lambdas inside +// private or protected methods (which are created by TYPED_TEST macro) +template< typename DistributedArray, typename BufferView > +void test_helper_setValue( DistributedArray& array, BufferView& buffer_view ) +{ + using DeviceType = typename DistributedArray::DeviceType; + using IndexType = typename DistributedArray::IndexType; + + const auto localRange = array.template getLocalRange< 0 >(); + auto array_view = array.getConstView(); + auto kernel = [=] __cuda_callable__ ( IndexType i ) mutable + { + buffer_view[ i - localRange.getBegin() ] = array_view( i ); + }; + ParallelFor< DeviceType >::exec( localRange.getBegin(), localRange.getEnd(), kernel ); +} + +TYPED_TEST( DistributedNDArray_1D_test, setValue ) +{ + using LocalArrayType = typename TestFixture::LocalArrayType; + using LocalArrayViewType = typename TestFixture::LocalArrayViewType; + + this->distributedNDArray.setValue( 1.0 ); + + const auto localRange = this->distributedNDArray.template getLocalRange< 0 >(); + LocalArrayType buffer( localRange.getEnd() - localRange.getBegin() ); + LocalArrayViewType buffer_view( buffer ); + test_helper_setValue( this->distributedNDArray, buffer_view ); + + LocalArrayType expected( localRange.getEnd() - localRange.getBegin() ); + expected.setValue( 1.0 ); + EXPECT_EQ( buffer, expected ); +} + +TYPED_TEST( DistributedNDArray_1D_test, elementwiseAccess ) +{ +// using ArrayViewType = typename TestFixture::ArrayViewType; + using IndexType = typename TestFixture::IndexType; + + this->distributedNDArray.setValue( 0 ); +// ArrayViewType localArrayView = this->distributedNDArray.getLocalArrayView(); + const auto localRange = 
this->distributedNDArray.template getLocalRange< 0 >(); + + // check initial value + for( IndexType gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) { +// EXPECT_EQ( localArrayView.getElement( i ), 0 ); + EXPECT_EQ( this->distributedNDArray.getElement( gi ), 0 ); + if( std::is_same< typename TestFixture::DeviceType, Devices::Host >::value ) + EXPECT_EQ( this->distributedNDArray[ gi ], 0 ); + } + + // use operator() + if( std::is_same< typename TestFixture::DeviceType, Devices::Host >::value ) { + for( IndexType gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) { + this->distributedNDArray( gi ) = gi + 1; + } + + // check set value + for( IndexType gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) { +// EXPECT_EQ( localArrayView.getElement( i ), gi + 1 ); + EXPECT_EQ( this->distributedNDArray.getElement( gi ), gi + 1 ); + EXPECT_EQ( this->distributedNDArray( gi ), gi + 1 ); + EXPECT_EQ( this->distributedNDArray[ gi ], gi + 1 ); + } + } +} + +TYPED_TEST( DistributedNDArray_1D_test, copyAssignment ) +{ + using DistributedNDArrayType = typename TestFixture::DistributedNDArrayType; + + this->distributedNDArray.setValue( 1 ); + DistributedNDArrayType copy; + copy = this->distributedNDArray; + // no binding, but deep copy +// EXPECT_NE( copy.getLocalArrayView().getData(), this->distributedNDArray.getLocalArrayView().getData() ); +// EXPECT_EQ( copy.getLocalArrayView(), this->distributedNDArray.getLocalArrayView() ); +} + +// separate function because nvcc does not allow __cuda_callable__ lambdas inside +// private or protected methods (which are created by TYPED_TEST macro) +template< typename DistributedArray > +void test_helper_comparisonOperators( DistributedArray& u, DistributedArray& v, DistributedArray& w ) +{ + using DeviceType = typename DistributedArray::DeviceType; + using IndexType = typename DistributedArray::IndexType; + + const auto localRange = u.template getLocalRange< 0 >(); + auto u_view = u.getView(); + auto v_view = 
v.getView(); + auto w_view = w.getView(); + + auto kernel = [=] __cuda_callable__ ( IndexType gi ) mutable + { + u_view( gi ) = gi; + v_view( gi ) = gi; + w_view( gi ) = 2 * gi; + }; + ParallelFor< DeviceType >::exec( localRange.getBegin(), localRange.getEnd(), kernel ); +} + +TYPED_TEST( DistributedNDArray_1D_test, comparisonOperators ) +{ + using DistributedNDArrayType = typename TestFixture::DistributedNDArrayType; + + DistributedNDArrayType& u = this->distributedNDArray; + DistributedNDArrayType v, w; + v.setLike( u ); + w.setLike( u ); + + test_helper_comparisonOperators( u, v, w ); + + EXPECT_TRUE( u == u ); + EXPECT_TRUE( u == v ); + EXPECT_TRUE( v == u ); + EXPECT_FALSE( u != v ); + EXPECT_FALSE( v != u ); + EXPECT_TRUE( u != w ); + EXPECT_TRUE( w != u ); + EXPECT_FALSE( u == w ); + EXPECT_FALSE( w == u ); + + v.reset(); + EXPECT_FALSE( u == v ); + u.reset(); + EXPECT_TRUE( u == v ); +} + +// separate function because nvcc does not allow __cuda_callable__ lambdas inside +// private or protected methods (which are created by TYPED_TEST macro) +template< typename DistributedArray > +void test_helper_forAll( DistributedArray& a ) +{ + using IndexType = typename DistributedArray::IndexType; + + const auto localRange = a.template getLocalRange< 0 >(); + auto a_view = a.getView(); + + auto setter = [=] __cuda_callable__ ( IndexType i ) mutable + { + a_view( i ) += 1; + }; + + a.setValue( 0 ); + a.forAll( setter ); + + for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) + EXPECT_EQ( a.getElement( gi ), 1 ); + + a.setValue( 0 ); + a_view.forAll( setter ); + + for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) + EXPECT_EQ( a.getElement( gi ), 1 ); +} + +TYPED_TEST( DistributedNDArray_1D_test, forAll ) +{ + test_helper_forAll( this->distributedNDArray ); +} + +// separate function because nvcc does not allow __cuda_callable__ lambdas inside +// private or protected methods (which are created by TYPED_TEST macro) +template< 
typename DistributedArray > +void test_helper_forInternal( DistributedArray& a ) +{ + using IndexType = typename DistributedArray::IndexType; + + const auto localRange = a.template getLocalRange< 0 >(); + auto a_view = a.getView(); + + auto setter = [=] __cuda_callable__ ( IndexType i ) mutable + { + a_view( i ) += 1; + }; + + a.setValue( 0 ); + a.forInternal( setter ); + + for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) + { + if( gi == 0 || gi == a.template getSize< 0 >() - 1 ) + EXPECT_EQ( a.getElement( gi ), 0 ) + << "gi = " << gi; + else + EXPECT_EQ( a.getElement( gi ), 1 ) + << "gi = " << gi; + } + + a.setValue( 0 ); + a_view.forInternal( setter ); + + for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) + { + if( gi == 0 || gi == a.template getSize< 0 >() - 1 ) + EXPECT_EQ( a.getElement( gi ), 0 ) + << "gi = " << gi; + else + EXPECT_EQ( a.getElement( gi ), 1 ) + << "gi = " << gi; + } +} + +TYPED_TEST( DistributedNDArray_1D_test, forInternal ) +{ + test_helper_forInternal( this->distributedNDArray ); +} + +// separate function because nvcc does not allow __cuda_callable__ lambdas inside +// private or protected methods (which are created by TYPED_TEST macro) +template< typename DistributedArray > +void test_helper_forLocalInternal( DistributedArray& a ) +{ + using IndexType = typename DistributedArray::IndexType; + + const auto localRange = a.template getLocalRange< 0 >(); + auto a_view = a.getView(); + + auto setter = [=] __cuda_callable__ ( IndexType i ) mutable + { + a_view( i ) += 1; + }; + + a.setValue( 0 ); + // equivalent to forAll because all overlaps are 0 + a.forLocalInternal( setter ); + + for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) + EXPECT_EQ( a.getElement( gi ), 1 ) + << "gi = " << gi; + + a.setValue( 0 ); + // equivalent to forAll because all overlaps are 0 + a_view.forLocalInternal( setter ); + + for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) + EXPECT_EQ( 
a.getElement( gi ), 1 ) + << "gi = " << gi; +} + +TYPED_TEST( DistributedNDArray_1D_test, forLocalInternal ) +{ + test_helper_forLocalInternal( this->distributedNDArray ); +} + +// separate function because nvcc does not allow __cuda_callable__ lambdas inside +// private or protected methods (which are created by TYPED_TEST macro) +template< typename DistributedArray > +void test_helper_forBoundary( DistributedArray& a ) +{ + using IndexType = typename DistributedArray::IndexType; + + const auto localRange = a.template getLocalRange< 0 >(); + auto a_view = a.getView(); + + auto setter = [=] __cuda_callable__ ( IndexType i ) mutable + { + a_view( i ) += 1; + }; + + a.setValue( 0 ); + a.forBoundary( setter ); + + for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) + { + if( gi == 0 || gi == a.template getSize< 0 >() - 1 ) + EXPECT_EQ( a.getElement( gi ), 1 ) + << "gi = " << gi; + else + EXPECT_EQ( a.getElement( gi ), 0 ) + << "gi = " << gi; + } + + a.setValue( 0 ); + a_view.forBoundary( setter ); + + for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) + { + if( gi == 0 || gi == a.template getSize< 0 >() - 1 ) + EXPECT_EQ( a.getElement( gi ), 1 ) + << "gi = " << gi; + else + EXPECT_EQ( a.getElement( gi ), 0 ) + << "gi = " << gi; + } +} + +TYPED_TEST( DistributedNDArray_1D_test, forBoundary ) +{ + test_helper_forBoundary( this->distributedNDArray ); +} + +// separate function because nvcc does not allow __cuda_callable__ lambdas inside +// private or protected methods (which are created by TYPED_TEST macro) +template< typename DistributedArray > +void test_helper_forLocalBoundary( DistributedArray& a ) +{ + using IndexType = typename DistributedArray::IndexType; + + const auto localRange = a.template getLocalRange< 0 >(); + auto a_view = a.getView(); + + auto setter = [=] __cuda_callable__ ( IndexType i ) mutable + { + a_view( i ) += 1; + }; + + a.setValue( 0 ); + // empty set because all overlaps are 0 + a.forLocalBoundary( setter 
); + + for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) + EXPECT_EQ( a.getElement( gi ), 0 ) + << "gi = " << gi; + + a.setValue( 0 ); + // empty set because all overlaps are 0 + a_view.forLocalBoundary( setter ); + + for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) + EXPECT_EQ( a.getElement( gi ), 0 ) + << "gi = " << gi; +} + +TYPED_TEST( DistributedNDArray_1D_test, forLocalBoundary ) +{ + test_helper_forLocalBoundary( this->distributedNDArray ); +} + +// separate function because nvcc does not allow __cuda_callable__ lambdas inside +// private or protected methods (which are created by TYPED_TEST macro) +template< typename DistributedArray > +void test_helper_forOverlaps( DistributedArray& a ) +{ + using IndexType = typename DistributedArray::IndexType; + + const auto localRange = a.template getLocalRange< 0 >(); + auto a_view = a.getView(); + + auto setter = [=] __cuda_callable__ ( IndexType i ) mutable + { + a_view( i ) += 1; + }; + + a.setValue( 0 ); + // empty set because all overlaps are 0 + a.forOverlaps( setter ); + + for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) + EXPECT_EQ( a.getElement( gi ), 0 ) + << "gi = " << gi; + + a.setValue( 0 ); + // empty set because all overlaps are 0 + a_view.forOverlaps( setter ); + + for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) + EXPECT_EQ( a.getElement( gi ), 0 ) + << "gi = " << gi; +} + +TYPED_TEST( DistributedNDArray_1D_test, forOverlaps ) +{ + test_helper_forOverlaps( this->distributedNDArray ); +} + +#endif // HAVE_GTEST + + +#if (defined(HAVE_GTEST) && defined(HAVE_MPI)) +using CommunicatorType = Communicators::MpiCommunicator; + +#include + +class MinimalistBufferedPrinter +: public ::testing::EmptyTestEventListener +{ +private: + std::stringstream sout; + +public: + // Called before a test starts. + virtual void OnTestStart(const ::testing::TestInfo& test_info) + { + sout << test_info.test_case_name() << "." 
<< test_info.name() << " Start." << std::endl; + } + + // Called after a failed assertion or a SUCCEED() invocation. + virtual void OnTestPartResult(const ::testing::TestPartResult& test_part_result) + { + sout << (test_part_result.failed() ? "====Failure=== " : "===Success=== ") + << test_part_result.file_name() << " " + << test_part_result.line_number() <listeners(); + + delete listeners.Release(listeners.default_result_printer()); + listeners.Append(new MinimalistBufferedPrinter); + + Communicators::ScopedInitializer< CommunicatorType > mpi(argc, argv); + #endif + return RUN_ALL_TESTS(); +#else + throw GtestMissingError(); +#endif +} diff --git a/src/UnitTests/Containers/ndarray/DistributedNDArray_semi1D_test.cpp b/src/UnitTests/Containers/ndarray/DistributedNDArray_semi1D_test.cpp new file mode 100644 index 0000000000000000000000000000000000000000..720641ed9bd8c1122782f6139cecd33d59ef29e9 --- /dev/null +++ b/src/UnitTests/Containers/ndarray/DistributedNDArray_semi1D_test.cpp @@ -0,0 +1 @@ +#include "DistributedNDArray_semi1D_test.h" diff --git a/src/UnitTests/Containers/ndarray/DistributedNDArray_semi1D_test.cu b/src/UnitTests/Containers/ndarray/DistributedNDArray_semi1D_test.cu new file mode 100644 index 0000000000000000000000000000000000000000..720641ed9bd8c1122782f6139cecd33d59ef29e9 --- /dev/null +++ b/src/UnitTests/Containers/ndarray/DistributedNDArray_semi1D_test.cu @@ -0,0 +1 @@ +#include "DistributedNDArray_semi1D_test.h" diff --git a/src/UnitTests/Containers/ndarray/DistributedNDArray_semi1D_test.h b/src/UnitTests/Containers/ndarray/DistributedNDArray_semi1D_test.h new file mode 100644 index 0000000000000000000000000000000000000000..33390a33c8a230d2946f54569e211a4a711713d0 --- /dev/null +++ b/src/UnitTests/Containers/ndarray/DistributedNDArray_semi1D_test.h @@ -0,0 +1,541 @@ +/*************************************************************************** + DistributedNDArray_semi1D_test.h - description + ------------------- + begin : Dec 27, 2018 + 
copyright : (C) 2018 by Tomas Oberhuber et al. + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +#ifdef HAVE_GTEST +#include + +#include +#include +#include +#include +#include +#include +#include + +using namespace TNL; +using namespace TNL::Containers; + +/* + * Light check of DistributedNDArray. + * + * - Number of processes is not limited. + * - Global size is hardcoded as 97 to force non-uniform distribution. + * - Communication group is hardcoded as AllGroup -- it may be changed as needed. + */ +template< typename DistributedNDArray > +class DistributedNDArray_semi1D_test +: public ::testing::Test +{ +protected: + using ValueType = typename DistributedNDArray::ValueType; + using DeviceType = typename DistributedNDArray::DeviceType; + using CommunicatorType = typename DistributedNDArray::CommunicatorType; + using IndexType = typename DistributedNDArray::IndexType; + using DistributedNDArrayType = DistributedNDArray; + + // TODO: use ndarray + using LocalArrayType = Array< ValueType, DeviceType, IndexType >; + using LocalArrayViewType = ArrayView< ValueType, DeviceType, IndexType >; + + const int globalSize = 97; // prime number to force non-uniform distribution + + const typename CommunicatorType::CommunicationGroup group = CommunicatorType::AllGroup; + + DistributedNDArrayType distributedNDArray; + + const int rank = CommunicatorType::GetRank(group); + const int nproc = CommunicatorType::GetSize(group); + + DistributedNDArray_semi1D_test() + { + using LocalRangeType = typename DistributedNDArray::LocalRangeType; + const LocalRangeType localRange = Partitioner< IndexType, CommunicatorType >::splitRange( globalSize, group ); + distributedNDArray.setSizes( 0, globalSize, globalSize / 2 ); + distributedNDArray.template setDistribution< 1 >( localRange.getBegin(), localRange.getEnd(), group ); + distributedNDArray.allocate(); + + EXPECT_EQ( distributedNDArray.template getLocalRange< 1 >(), 
localRange ); + EXPECT_EQ( distributedNDArray.getCommunicationGroup(), group ); + } +}; + +// types for which DistributedNDArray_semi1D_test is instantiated +using DistributedNDArrayTypes = ::testing::Types< + DistributedNDArray< NDArray< double, + SizesHolder< int, 9, 0, 0 >, // Q, X, Y, Z + std::index_sequence< 0, 1, 2 >, // permutation - should not matter + Devices::Host >, + Communicators::MpiCommunicator > +#ifdef HAVE_CUDA + , + DistributedNDArray< NDArray< double, + SizesHolder< int, 9, 0, 0 >, // Q, X, Y, Z + std::index_sequence< 0, 1, 2 >, // permutation - should not matter + Devices::Cuda >, + Communicators::NoDistrCommunicator > +#endif +>; + +TYPED_TEST_SUITE( DistributedNDArray_semi1D_test, DistributedNDArrayTypes ); + +TYPED_TEST( DistributedNDArray_semi1D_test, checkSumOfLocalSizes ) +{ + using CommunicatorType = typename TestFixture::CommunicatorType; + + const auto localRange = this->distributedNDArray.template getLocalRange< 1 >(); + const int localSize = localRange.getEnd() - localRange.getBegin(); + int sumOfLocalSizes = 0; + CommunicatorType::Allreduce( &localSize, &sumOfLocalSizes, 1, MPI_SUM, this->group ); + EXPECT_EQ( sumOfLocalSizes, this->globalSize ); + EXPECT_EQ( this->distributedNDArray.template getSize< 1 >(), this->globalSize ); +} + +TYPED_TEST( DistributedNDArray_semi1D_test, setLike ) +{ + using DistributedNDArrayType = typename TestFixture::DistributedNDArrayType; + + const auto localRange = this->distributedNDArray.template getLocalRange< 1 >(); + EXPECT_EQ( this->distributedNDArray.getLocalStorageSize(), 9 * (localRange.getEnd() - localRange.getBegin()) * (this->globalSize / 2) ); + DistributedNDArrayType copy; + EXPECT_EQ( copy.getLocalStorageSize(), 0 ); + copy.setLike( this->distributedNDArray ); + EXPECT_EQ( copy.getLocalStorageSize(), 9 * (localRange.getEnd() - localRange.getBegin()) * (this->globalSize / 2) ); +} + +TYPED_TEST( DistributedNDArray_semi1D_test, reset ) +{ + const auto localRange = 
this->distributedNDArray.template getLocalRange< 1 >(); + EXPECT_EQ( this->distributedNDArray.getLocalStorageSize(), 9 * (localRange.getEnd() - localRange.getBegin()) * (this->globalSize / 2) ); + this->distributedNDArray.reset(); + EXPECT_EQ( this->distributedNDArray.getLocalStorageSize(), 0 ); +} + +TYPED_TEST( DistributedNDArray_semi1D_test, elementwiseAccess ) +{ +// using ArrayViewType = typename TestFixture::ArrayViewType; + using IndexType = typename TestFixture::IndexType; + + this->distributedNDArray.setValue( 0 ); +// ArrayViewType localArrayView = this->distributedNDArray.getLocalArrayView(); + const auto localRange = this->distributedNDArray.template getLocalRange< 1 >(); + + // check initial value + for( int q = 0; q < 9; q++ ) + for( IndexType gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) + for( int j = 0; j < this->distributedNDArray.template getSize< 2 >(); j++ ) { +// EXPECT_EQ( localArrayView.getElement( i ), 0 ); + EXPECT_EQ( this->distributedNDArray.getElement( q, gi, j ), 0 ); + } + + // use operator() + if( std::is_same< typename TestFixture::DeviceType, Devices::Host >::value ) { + for( int q = 0; q < 9; q++ ) + for( IndexType gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) + for( int j = 0; j < this->distributedNDArray.template getSize< 2 >(); j++ ) { + this->distributedNDArray( q, gi, j ) = gi + 1; + } + + // check set value + for( int q = 0; q < 9; q++ ) + for( IndexType gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) + for( int j = 0; j < this->distributedNDArray.template getSize< 2 >(); j++ ) { + EXPECT_EQ( this->distributedNDArray.getElement( q, gi, j ), gi + 1 ); + EXPECT_EQ( this->distributedNDArray( q, gi, j ), gi + 1 ); + } + } +} + +TYPED_TEST( DistributedNDArray_semi1D_test, copyAssignment ) +{ + using DistributedNDArrayType = typename TestFixture::DistributedNDArrayType; + + this->distributedNDArray.setValue( 1 ); + DistributedNDArrayType copy; + copy = this->distributedNDArray; + // no 
binding, but deep copy +// EXPECT_NE( copy.getLocalArrayView().getData(), this->distributedNDArray.getLocalArrayView().getData() ); +// EXPECT_EQ( copy.getLocalArrayView(), this->distributedNDArray.getLocalArrayView() ); +} + +// separate function because nvcc does not allow __cuda_callable__ lambdas inside +// private or protected methods (which are created by TYPED_TEST macro) +template< typename DistributedArray > +void test_helper_comparisonOperators( DistributedArray& u, DistributedArray& v, DistributedArray& w ) +{ + using DeviceType = typename DistributedArray::DeviceType; + using IndexType = typename DistributedArray::IndexType; + + const auto localRange = u.template getLocalRange< 1 >(); + auto u_view = u.getView(); + auto v_view = v.getView(); + auto w_view = w.getView(); + + auto kernel = [=] __cuda_callable__ ( IndexType q, IndexType gi, IndexType j ) mutable + { + u_view( q, gi, j ) = gi; + v_view( q, gi, j ) = gi; + w_view( q, gi, j ) = 2 * gi; + }; + ParallelFor3D< DeviceType >::exec( (IndexType) 0, localRange.getBegin(), (IndexType) 0, + 9, localRange.getEnd(), u.template getSize< 2 >(), + kernel ); +} + +TYPED_TEST( DistributedNDArray_semi1D_test, comparisonOperators ) +{ + using DistributedNDArrayType = typename TestFixture::DistributedNDArrayType; + + DistributedNDArrayType& u = this->distributedNDArray; + DistributedNDArrayType v, w; + v.setLike( u ); + w.setLike( u ); + + test_helper_comparisonOperators( u, v, w ); + + EXPECT_TRUE( u == u ); + EXPECT_TRUE( u == v ); + EXPECT_TRUE( v == u ); + EXPECT_FALSE( u != v ); + EXPECT_FALSE( v != u ); + EXPECT_TRUE( u != w ); + EXPECT_TRUE( w != u ); + EXPECT_FALSE( u == w ); + EXPECT_FALSE( w == u ); + + v.reset(); + EXPECT_FALSE( u == v ); + u.reset(); + EXPECT_TRUE( u == v ); +} + +// separate function because nvcc does not allow __cuda_callable__ lambdas inside +// private or protected methods (which are created by TYPED_TEST macro) +template< typename DistributedArray > +void test_helper_forAll( 
DistributedArray& a ) +{ + using IndexType = typename DistributedArray::IndexType; + + const auto localRange = a.template getLocalRange< 1 >(); + auto a_view = a.getView(); + + auto setter = [=] __cuda_callable__ ( IndexType q, IndexType i, IndexType j ) mutable + { + a_view( q, i, j ) += 1; + }; + + a.setValue( 0 ); + a.forAll( setter ); + + for( int q = 0; q < 9; q++ ) + for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) + for( int j = 0; j < a.template getSize< 2 >(); j++ ) + EXPECT_EQ( a.getElement( q, gi, j ), 1 ); + + a.setValue( 0 ); + a_view.forAll( setter ); + + for( int q = 0; q < 9; q++ ) + for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) + for( int j = 0; j < a.template getSize< 2 >(); j++ ) + EXPECT_EQ( a.getElement( q, gi, j ), 1 ); +} + +TYPED_TEST( DistributedNDArray_semi1D_test, forAll ) +{ + test_helper_forAll( this->distributedNDArray ); +} + +// separate function because nvcc does not allow __cuda_callable__ lambdas inside +// private or protected methods (which are created by TYPED_TEST macro) +template< typename DistributedArray > +void test_helper_forInternal( DistributedArray& a ) +{ + using IndexType = typename DistributedArray::IndexType; + + const auto localRange = a.template getLocalRange< 1 >(); + auto a_view = a.getView(); + + auto setter = [=] __cuda_callable__ ( IndexType q, IndexType i, IndexType j ) mutable + { + a_view( q, i, j ) += 1; + }; + + a.setValue( 0 ); + a.forInternal( setter ); + + for( int q = 0; q < 9; q++ ) + for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) + for( int j = 0; j < a.template getSize< 2 >(); j++ ) + { + if( q == 0 || q == 8 || + gi == 0 || gi == a.template getSize< 1 >() - 1 || + j == 0 || j == a.template getSize< 2 >() - 1 ) + EXPECT_EQ( a.getElement( q, gi, j ), 0 ) + << "gi = " << gi; + else + EXPECT_EQ( a.getElement( q, gi, j ), 1 ) + << "gi = " << gi; + } + + a.setValue( 0 ); + a_view.forInternal( setter ); + + for( int q = 0; q < 9; q++ ) 
+ for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) + for( int j = 0; j < a.template getSize< 2 >(); j++ ) + { + if( q == 0 || q == 8 || + gi == 0 || gi == a.template getSize< 1 >() - 1 || + j == 0 || j == a.template getSize< 2 >() - 1 ) + EXPECT_EQ( a.getElement( q, gi, j ), 0 ) + << "gi = " << gi; + else + EXPECT_EQ( a.getElement( q, gi, j ), 1 ) + << "gi = " << gi; + } +} + +TYPED_TEST( DistributedNDArray_semi1D_test, forInternal ) +{ + test_helper_forInternal( this->distributedNDArray ); +} + +// separate function because nvcc does not allow __cuda_callable__ lambdas inside +// private or protected methods (which are created by TYPED_TEST macro) +template< typename DistributedArray > +void test_helper_forLocalInternal( DistributedArray& a ) +{ + using IndexType = typename DistributedArray::IndexType; + + const auto localRange = a.template getLocalRange< 1 >(); + auto a_view = a.getView(); + + auto setter = [=] __cuda_callable__ ( IndexType q, IndexType i, IndexType j ) mutable + { + a_view( q, i, j ) += 1; + }; + + a.setValue( 0 ); + // equivalent to forAll because all overlaps are 0 + a.forLocalInternal( setter ); + + for( int q = 0; q < 9; q++ ) + for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) + for( int j = 0; j < a.template getSize< 2 >(); j++ ) + EXPECT_EQ( a.getElement( q, gi, j ), 1 ); + + a.setValue( 0 ); + // equivalent to forAll because all overlaps are 0 + a_view.forLocalInternal( setter ); + + for( int q = 0; q < 9; q++ ) + for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) + for( int j = 0; j < a.template getSize< 2 >(); j++ ) + EXPECT_EQ( a.getElement( q, gi, j ), 1 ); +} + +TYPED_TEST( DistributedNDArray_semi1D_test, forLocalInternal ) +{ + test_helper_forLocalInternal( this->distributedNDArray ); +} + +// separate function because nvcc does not allow __cuda_callable__ lambdas inside +// private or protected methods (which are created by TYPED_TEST macro) +template< typename 
DistributedArray > +void test_helper_forBoundary( DistributedArray& a ) +{ + using IndexType = typename DistributedArray::IndexType; + + const auto localRange = a.template getLocalRange< 1 >(); + auto a_view = a.getView(); + + auto setter = [=] __cuda_callable__ ( IndexType q, IndexType i, IndexType j ) mutable + { + a_view( q, i, j ) += 1; + }; + + a.setValue( 0 ); + a.forBoundary( setter ); + + for( int q = 0; q < 9; q++ ) + for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) + for( int j = 0; j < a.template getSize< 2 >(); j++ ) + { + if( q == 0 || q == 8 || + gi == 0 || gi == a.template getSize< 1 >() - 1 || + j == 0 || j == a.template getSize< 2 >() - 1 ) + EXPECT_EQ( a.getElement( q, gi, j ), 1 ) + << "gi = " << gi; + else + EXPECT_EQ( a.getElement( q, gi, j ), 0 ) + << "gi = " << gi; + } + + a.setValue( 0 ); + a_view.forBoundary( setter ); + + for( int q = 0; q < 9; q++ ) + for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) + for( int j = 0; j < a.template getSize< 2 >(); j++ ) + { + if( q == 0 || q == 8 || + gi == 0 || gi == a.template getSize< 1 >() - 1 || + j == 0 || j == a.template getSize< 2 >() - 1 ) + EXPECT_EQ( a.getElement( q, gi, j ), 1 ) + << "gi = " << gi; + else + EXPECT_EQ( a.getElement( q, gi, j ), 0 ) + << "gi = " << gi; + } +} + +TYPED_TEST( DistributedNDArray_semi1D_test, forBoundary ) +{ + test_helper_forBoundary( this->distributedNDArray ); +} + +// separate function because nvcc does not allow __cuda_callable__ lambdas inside +// private or protected methods (which are created by TYPED_TEST macro) +template< typename DistributedArray > +void test_helper_forLocalBoundary( DistributedArray& a ) +{ + using IndexType = typename DistributedArray::IndexType; + + const auto localRange = a.template getLocalRange< 1 >(); + auto a_view = a.getView(); + + auto setter = [=] __cuda_callable__ ( IndexType q, IndexType i, IndexType j ) mutable + { + a_view( q, i, j ) += 1; + }; + + a.setValue( 0 ); + // empty set 
because all overlaps are 0 + a.forLocalBoundary( setter ); + + for( int q = 0; q < 9; q++ ) + for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) + for( int j = 0; j < a.template getSize< 2 >(); j++ ) + EXPECT_EQ( a.getElement( q, gi, j ), 0 ); + + a.setValue( 0 ); + // empty set because all overlaps are 0 + a_view.forLocalBoundary( setter ); + + for( int q = 0; q < 9; q++ ) + for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) + for( int j = 0; j < a.template getSize< 2 >(); j++ ) + EXPECT_EQ( a.getElement( q, gi, j ), 0 ); +} + +TYPED_TEST( DistributedNDArray_semi1D_test, forLocalBoundary ) +{ + test_helper_forLocalBoundary( this->distributedNDArray ); +} + +// separate function because nvcc does not allow __cuda_callable__ lambdas inside +// private or protected methods (which are created by TYPED_TEST macro) +template< typename DistributedArray > +void test_helper_forOverlaps( DistributedArray& a ) +{ + using IndexType = typename DistributedArray::IndexType; + + const auto localRange = a.template getLocalRange< 1 >(); + auto a_view = a.getView(); + + auto setter = [=] __cuda_callable__ ( IndexType q, IndexType i, IndexType j ) mutable + { + a_view( q, i, j ) += 1; + }; + + a.setValue( 0 ); + // empty set because all overlaps are 0 + a.forOverlaps( setter ); + + for( int q = 0; q < 9; q++ ) + for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) + for( int j = 0; j < a.template getSize< 2 >(); j++ ) + EXPECT_EQ( a.getElement( q, gi, j ), 0 ); + + a.setValue( 0 ); + // empty set because all overlaps are 0 + a_view.forOverlaps( setter ); + + for( int q = 0; q < 9; q++ ) + for( int gi = localRange.getBegin(); gi < localRange.getEnd(); gi++ ) + for( int j = 0; j < a.template getSize< 2 >(); j++ ) + EXPECT_EQ( a.getElement( q, gi, j ), 0 ); +} + +TYPED_TEST( DistributedNDArray_semi1D_test, forOverlaps ) +{ + test_helper_forOverlaps( this->distributedNDArray ); +} + +#endif // HAVE_GTEST + + +#if (defined(HAVE_GTEST) 
&& defined(HAVE_MPI)) +using CommunicatorType = Communicators::MpiCommunicator; + +#include + +class MinimalistBufferedPrinter +: public ::testing::EmptyTestEventListener +{ +private: + std::stringstream sout; + +public: + // Called before a test starts. + virtual void OnTestStart(const ::testing::TestInfo& test_info) + { + sout << test_info.test_case_name() << "." << test_info.name() << " Start." << std::endl; + } + + // Called after a failed assertion or a SUCCEED() invocation. + virtual void OnTestPartResult(const ::testing::TestPartResult& test_part_result) + { + sout << (test_part_result.failed() ? "====Failure=== " : "===Success=== ") + << test_part_result.file_name() << " " + << test_part_result.line_number() <listeners(); + + delete listeners.Release(listeners.default_result_printer()); + listeners.Append(new MinimalistBufferedPrinter); + + Communicators::ScopedInitializer< CommunicatorType > mpi(argc, argv); + #endif + return RUN_ALL_TESTS(); +#else + throw GtestMissingError(); +#endif +} diff --git a/src/UnitTests/Containers/ndarray/NDArrayTest.cpp b/src/UnitTests/Containers/ndarray/NDArrayTest.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d0e9222189724207463b9b3b6a5ebebe3e568521 --- /dev/null +++ b/src/UnitTests/Containers/ndarray/NDArrayTest.cpp @@ -0,0 +1 @@ +#include "NDArrayTest.h" diff --git a/src/UnitTests/Containers/ndarray/NDArrayTest.cu b/src/UnitTests/Containers/ndarray/NDArrayTest.cu new file mode 100644 index 0000000000000000000000000000000000000000..d0e9222189724207463b9b3b6a5ebebe3e568521 --- /dev/null +++ b/src/UnitTests/Containers/ndarray/NDArrayTest.cu @@ -0,0 +1 @@ +#include "NDArrayTest.h" diff --git a/src/UnitTests/Containers/ndarray/NDArrayTest.h b/src/UnitTests/Containers/ndarray/NDArrayTest.h new file mode 100644 index 0000000000000000000000000000000000000000..1e5d9a30cedf4c25b4e56d1c3a44511588848d86 --- /dev/null +++ b/src/UnitTests/Containers/ndarray/NDArrayTest.h @@ -0,0 +1,1338 @@ +#include 
"gtest/gtest.h" + +#include + +using namespace TNL::Containers; +using std::index_sequence; + +template< typename Array > +void expect_identity( const Array& a ) +{ + Array identity; + identity.setLike( a ); + int last = 0; + for( int i = 0; i < identity.getSize(); i++ ) { + // skip negative/invalid entries due to alignment + if( a[ i ] < 0 ) + identity[ i ] = a[ i ]; + else + identity[ i ] = last++; + } + EXPECT_EQ( a, identity ); +} + +TEST( NDArrayTest, setLike ) +{ + int I = 2, J = 2, K = 2, L = 2, M = 2, N = 2; + NDArray< int, + SizesHolder< int, 0, 0, 0, 0, 0, 0 >, + index_sequence< 5, 3, 4, 2, 0, 1 > > a; + a.setSizes( I, J, K, L, M, N ); + + decltype(a) b; + EXPECT_EQ( b.template getSize< 0 >(), 0 ); + EXPECT_EQ( b.template getSize< 1 >(), 0 ); + EXPECT_EQ( b.template getSize< 2 >(), 0 ); + EXPECT_EQ( b.template getSize< 3 >(), 0 ); + EXPECT_EQ( b.template getSize< 4 >(), 0 ); + EXPECT_EQ( b.template getSize< 5 >(), 0 ); + b.setLike( a ); + EXPECT_EQ( b.template getSize< 0 >(), I ); + EXPECT_EQ( b.template getSize< 1 >(), J ); + EXPECT_EQ( b.template getSize< 2 >(), K ); + EXPECT_EQ( b.template getSize< 3 >(), L ); + EXPECT_EQ( b.template getSize< 4 >(), M ); + EXPECT_EQ( b.template getSize< 5 >(), N ); +} + +TEST( NDArrayTest, reset ) +{ + int I = 2, J = 2, K = 2, L = 2, M = 2, N = 2; + NDArray< int, + SizesHolder< int, 0, 0, 0, 0, 0, 0 >, + index_sequence< 5, 3, 4, 2, 0, 1 > > a; + a.setSizes( I, J, K, L, M, N ); + EXPECT_EQ( a.template getSize< 0 >(), I ); + EXPECT_EQ( a.template getSize< 1 >(), J ); + EXPECT_EQ( a.template getSize< 2 >(), K ); + EXPECT_EQ( a.template getSize< 3 >(), L ); + EXPECT_EQ( a.template getSize< 4 >(), M ); + EXPECT_EQ( a.template getSize< 5 >(), N ); + + a.reset(); + EXPECT_EQ( a.template getSize< 0 >(), 0 ); + EXPECT_EQ( a.template getSize< 1 >(), 0 ); + EXPECT_EQ( a.template getSize< 2 >(), 0 ); + EXPECT_EQ( a.template getSize< 3 >(), 0 ); + EXPECT_EQ( a.template getSize< 4 >(), 0 ); + EXPECT_EQ( a.template getSize< 5 >(), 0 
); +} + +TEST( NDArrayTest, Static_1D ) +{ + constexpr int I = 3; + NDArray< int, SizesHolder< int, I > > a; + a.setSizes( 0 ); + + int v = 0; + for( int i = 0; i < I; i++ ) { + a( i ) = v++; + EXPECT_EQ( a[ i ], a( i ) ); + } + + expect_identity( a.getStorageArray() ); +} + +TEST( NDArrayTest, Static_2D_Identity ) +{ + constexpr int I = 3, J = 5; + NDArray< int, SizesHolder< int, I, J > > a; + a.setSizes( 0, 0 ); + + int v = 0; + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + a( i, j ) = v++; + + expect_identity( a.getStorageArray() ); +} + +TEST( NDArrayTest, Static_2D_Permuted ) +{ + constexpr int I = 3, J = 5; + NDArray< int, + SizesHolder< int, I, J >, + index_sequence< 1, 0 > > a; + a.setSizes( 0, 0 ); + + int v = 0; + for( int j = 0; j < J; j++ ) + for( int i = 0; i < I; i++ ) + a( i, j ) = v++; + + expect_identity( a.getStorageArray() ); +} + +TEST( NDArrayTest, Dynamic_6D ) +{ + int I = 2, J = 2, K = 2, L = 2, M = 2, N = 2; + NDArray< int, + SizesHolder< int, 0, 0, 0, 0, 0, 0 >, + index_sequence< 5, 3, 4, 2, 0, 1 > > a; + a.setSizes( I, J, K, L, M, N ); + + // initialize entries invalid due to alignment to -1 + a.getStorageArray().setValue( -1 ); + + int v = 0; + for( int n = 0; n < N; n++ ) + for( int l = 0; l < L; l++ ) + for( int m = 0; m < M; m++ ) + for( int k = 0; k < K; k++ ) + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + a( i, j, k, l, m, n ) = v++; + + expect_identity( a.getStorageArray() ); +} + +TEST( NDArrayTest, CopySemantics ) +{ + constexpr int I = 3, J = 4; + NDArray< int, SizesHolder< int, 0, 0 > > a; + a.setSizes( I, J ); + + auto a_view = a.getView(); + + int v = 0; + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + a( i, j ) = v++; + + expect_identity( a.getStorageArray() ); + + // assignment with zero sizes + NDArray< int, SizesHolder< int, 0, 0 > > b; + b = a; + auto b_view = b.getView(); + EXPECT_EQ( a, b ); + EXPECT_EQ( a_view, b_view ); + EXPECT_EQ( a_view.getView(), b_view ); + 
EXPECT_EQ( a_view.getConstView(), b_view.getConstView() ); + EXPECT_EQ( a.getConstView(), b.getConstView() ); + EXPECT_EQ( a.getConstView(), b_view.getConstView() ); + + // assignment between views + NDArray< int, SizesHolder< int, 0, 0 > > c; + c.setSizes( I, J ); + auto c_view = c.getView(); + c_view = a_view; + EXPECT_EQ( a, c ); + EXPECT_EQ( a_view, c_view ); + EXPECT_EQ( a_view.getView(), c_view ); + EXPECT_EQ( a_view.getConstView(), c_view.getConstView() ); + EXPECT_EQ( a.getConstView(), c.getConstView() ); + EXPECT_EQ( a.getConstView(), c_view.getConstView() ); + + // move-assignment between views should do a deep copy + b_view = a.getView(); + EXPECT_EQ( a_view, b_view ); + EXPECT_EQ( a, b ); + EXPECT_NE( &b_view( 0, 0 ), &a_view( 0, 0 ) ); + + // assignment of view to array + c.setValue( 0 ); + c = a_view; + EXPECT_EQ( a, c ); + EXPECT_EQ( a_view, c_view ); + EXPECT_EQ( a_view.getView(), c_view ); + EXPECT_EQ( a_view.getConstView(), c_view.getConstView() ); + EXPECT_EQ( a.getConstView(), c.getConstView() ); + EXPECT_EQ( a.getConstView(), c_view.getConstView() ); + + // assignment of array to view + c.setValue( 0 ); + c_view = a; + EXPECT_EQ( a, c ); + EXPECT_EQ( a_view, c_view ); + EXPECT_EQ( a_view.getView(), c_view ); + EXPECT_EQ( a_view.getConstView(), c_view.getConstView() ); + EXPECT_EQ( a.getConstView(), c.getConstView() ); + EXPECT_EQ( a.getConstView(), c_view.getConstView() ); + + // assignment with different ValueType + NDArray< double, SizesHolder< int, 0, 0 > > d; + d = a; + expect_identity( d.getStorageArray() ); + + // assignment with different SizesHolder + NDArray< double, SizesHolder< int, I, J > > e; + e = a; + expect_identity( e.getStorageArray() ); + + // assignment with different IndexType + NDArray< double, SizesHolder< short int, 0, 0 > > f; + f = a; + expect_identity( f.getStorageArray() ); + + // assignment with different Permutation + // TODO +} + +#ifdef HAVE_CUDA +TEST( NDArrayTest, CopySemanticsCrossDevice ) +{ + constexpr int I 
= 3, J = 4; + NDArray< int, SizesHolder< int, 0, 0 > > a; + NDArray< int, SizesHolder< int, 0, 0 >, + std::index_sequence< 0, 1 >, + TNL::Devices::Cuda > da; + a.setSizes( I, J ); + da.setSizes( I, J ); + + auto a_view = a.getView(); + auto da_view = da.getView(); + + int v = 0; + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + a( i, j ) = v++; + + expect_identity( a.getStorageArray() ); + + // copy to the device, simple check + da = a; + EXPECT_EQ( da.getStorageArray(), a.getStorageArray() ); + + // assignment with zero sizes + NDArray< int, SizesHolder< int, 0, 0 > > b; + b = da; + auto b_view = b.getView(); + EXPECT_EQ( a, b ); + EXPECT_EQ( a_view, b_view ); + EXPECT_EQ( a_view.getView(), b_view ); + EXPECT_EQ( a_view.getConstView(), b_view.getConstView() ); + EXPECT_EQ( a.getConstView(), b.getConstView() ); + EXPECT_EQ( a.getConstView(), b_view.getConstView() ); + + // assignment between views + NDArray< int, SizesHolder< int, 0, 0 > > c; + c.setSizes( I, J ); + auto c_view = c.getView(); + c_view = da_view; + EXPECT_EQ( a, c ); + EXPECT_EQ( a_view, c_view ); + EXPECT_EQ( a_view.getView(), c_view ); + EXPECT_EQ( a_view.getConstView(), c_view.getConstView() ); + EXPECT_EQ( a.getConstView(), c.getConstView() ); + EXPECT_EQ( a.getConstView(), c_view.getConstView() ); + + // move-assignment between views should do a deep copy + b_view = da.getView(); + EXPECT_EQ( a_view, b_view ); + EXPECT_EQ( a, b ); + EXPECT_NE( &b_view( 0, 0 ), &a_view( 0, 0 ) ); + + // assignment of view to array + c.setValue( 0 ); + c = da_view; + EXPECT_EQ( a, c ); + EXPECT_EQ( a_view, c_view ); + EXPECT_EQ( a_view.getView(), c_view ); + EXPECT_EQ( a_view.getConstView(), c_view.getConstView() ); + EXPECT_EQ( a.getConstView(), c.getConstView() ); + EXPECT_EQ( a.getConstView(), c_view.getConstView() ); + + // assignment of array to view + c.setValue( 0 ); + c_view = da; + EXPECT_EQ( a, c ); + EXPECT_EQ( a_view, c_view ); + EXPECT_EQ( a_view.getView(), c_view ); + EXPECT_EQ( 
a_view.getConstView(), c_view.getConstView() ); + EXPECT_EQ( a.getConstView(), c.getConstView() ); + EXPECT_EQ( a.getConstView(), c_view.getConstView() ); + + // assignment with different ValueType + NDArray< double, SizesHolder< int, 0, 0 > > d; + d = da; + expect_identity( d.getStorageArray() ); + + // assignment with different SizesHolder + NDArray< double, SizesHolder< int, I, J > > e; + e = da; + expect_identity( e.getStorageArray() ); + + // assignment with different IndexType + NDArray< double, SizesHolder< short int, 0, 0 > > f; + f = da; + expect_identity( f.getStorageArray() ); + + // assignment with different Permutation + // TODO +} +#endif + +TEST( NDArrayTest, SizesHolderPrinter ) +{ + SizesHolder< int, 0, 1, 2 > holder; + holder.setSize< 0 >( 3 ); + + std::stringstream str; + str << holder; + EXPECT_EQ( str.str(), "SizesHolder< 0, 1, 2 >( 3, 1, 2 )" ); +} + +TEST( NDArrayTest, forAll_dynamic_1D ) +{ + int I = 2; + NDArray< int, + SizesHolder< int, 0 >, + index_sequence< 0 > > a; + a.setSizes( I ); + a.setValue( 0 ); + + auto setter = [&] ( int i ) + { + a( i ) += 1; + }; + + a.forAll( setter ); + + for( int i = 0; i < I; i++ ) + EXPECT_EQ( a( i ), 1 ); +} + +TEST( NDArrayTest, forAll_dynamic_2D ) +{ + int I = 2, J = 3; + NDArray< int, + SizesHolder< int, 0, 0 >, + index_sequence< 1, 0 > > a; + a.setSizes( I, J ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j ) + { + a( i, j ) += 1; + }; + + a.forAll( setter ); + + for( int j = 0; j < J; j++ ) + for( int i = 0; i < I; i++ ) + EXPECT_EQ( a( i, j ), 1 ); +} + +TEST( NDArrayTest, forAll_dynamic_3D ) +{ + int I = 2, J = 3, K = 4; + NDArray< int, + SizesHolder< int, 0, 0, 0 >, + index_sequence< 2, 0, 1 > > a; + a.setSizes( I, J, K ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j, int k ) + { + a( i, j, k ) += 1; + }; + + a.forAll( setter ); + + for( int k = 0; k < K; k++ ) + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + EXPECT_EQ( a( i, j, k ), 1 ); +} + +TEST( 
NDArrayTest, forAll_dynamic_4D ) +{ + int I = 2, J = 3, K = 4, L = 5; + NDArray< int, + SizesHolder< int, 0, 0, 0, 0 >, + index_sequence< 3, 2, 0, 1 > > a; + a.setSizes( I, J, K, L ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j, int k, int l ) + { + a( i, j, k, l ) += 1; + }; + + a.forAll( setter ); + + for( int l = 0; l < L; l++ ) + for( int k = 0; k < K; k++ ) + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + EXPECT_EQ( a( i, j, k, l ), 1 ); +} + +TEST( NDArrayTest, forAll_dynamic_5D ) +{ + int I = 2, J = 3, K = 4, L = 5, M = 6; + NDArray< int, + SizesHolder< int, 0, 0, 0, 0, 0 >, + index_sequence< 3, 4, 2, 0, 1 > > a; + a.setSizes( I, J, K, L, M ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j, int k, int l, int m ) + { + a( i, j, k, l, m ) += 1; + }; + + a.forAll( setter ); + + for( int l = 0; l < L; l++ ) + for( int m = 0; m < M; m++ ) + for( int k = 0; k < K; k++ ) + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + EXPECT_EQ( a( i, j, k, l, m ), 1 ); +} + +TEST( NDArrayTest, forAll_dynamic_6D ) +{ + int I = 2, J = 3, K = 4, L = 5, M = 6, N = 7; + NDArray< int, + SizesHolder< int, 0, 0, 0, 0, 0, 0 >, + index_sequence< 5, 3, 4, 2, 0, 1 > > a; + a.setSizes( I, J, K, L, M, N ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j, int k, int l, int m, int n ) + { + a( i, j, k, l, m, n ) += 1; + }; + + a.forAll( setter ); + + for( int n = 0; n < N; n++ ) + for( int l = 0; l < L; l++ ) + for( int m = 0; m < M; m++ ) + for( int k = 0; k < K; k++ ) + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + EXPECT_EQ( a( i, j, k, l, m, n ), 1 ); +} + +TEST( NDArrayTest, forAll_static_1D ) +{ + constexpr int I = 3; + StaticNDArray< int, SizesHolder< int, I > > a; +// a.setSizes( 0 ); + a.setValue( 0 ); + + auto setter = [&] ( int i ) + { + a( i ) += 1; + }; + + a.forAll( setter ); + + for( int i = 0; i < I; i++ ) + EXPECT_EQ( a( i ), 1 ); +} + +TEST( NDArrayTest, forAll_static_2D ) +{ + constexpr int I = 3, J = 
4; + StaticNDArray< int, SizesHolder< int, I, J > > a; +// a.setSizes( 0, 0 ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j ) + { + a( i, j ) += 1; + }; + + a.forAll( setter ); + + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + EXPECT_EQ( a( i, j ), 1 ); +} + +TEST( NDArrayTest, forAll_static_3D ) +{ + constexpr int I = 3, J = 4, K = 5; + StaticNDArray< int, SizesHolder< int, I, J, K > > a; +// a.setSizes( 0, 0, 0 ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j, int k ) + { + a( i, j, k ) += 1; + }; + + a.forAll( setter ); + + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + for( int k = 0; k < K; k++ ) + EXPECT_EQ( a( i, j, k ), 1 ); +} + +TEST( NDArrayTest, forAll_static_4D ) +{ + constexpr int I = 3, J = 4, K = 5, L = 6; + StaticNDArray< int, SizesHolder< int, I, J, K, L > > a; +// a.setSizes( 0, 0, 0, 0 ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j, int k, int l ) + { + a( i, j, k, l ) += 1; + }; + + a.forAll( setter ); + + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + for( int k = 0; k < K; k++ ) + for( int l = 0; l < L; l++ ) + EXPECT_EQ( a( i, j, k, l ), 1 ); +} + +TEST( NDArrayTest, forAll_static_5D ) +{ + constexpr int I = 3, J = 4, K = 5, L = 6, M = 7; + StaticNDArray< int, SizesHolder< int, I, J, K, L, M > > a; +// a.setSizes( 0, 0, 0, 0, 0 ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j, int k, int l, int m ) + { + a( i, j, k, l, m ) += 1; + }; + + a.forAll( setter ); + + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + for( int k = 0; k < K; k++ ) + for( int l = 0; l < L; l++ ) + for( int m = 0; m < M; m++ ) + EXPECT_EQ( a( i, j, k, l, m ), 1 ); +} + +TEST( NDArrayTest, forAll_static_6D ) +{ + constexpr int I = 3, J = 4, K = 5, L = 6, M = 7, N = 8; + StaticNDArray< int, SizesHolder< int, I, J, K, L, M, N > > a; +// a.setSizes( 0, 0, 0, 0, 0, 0 ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j, int k, int l, int m, int n ) + { + a( i, j, k, l, 
m, n ) += 1; + }; + + a.forAll( setter ); + + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + for( int k = 0; k < K; k++ ) + for( int l = 0; l < L; l++ ) + for( int m = 0; m < M; m++ ) + for( int n = 0; n < N; n++ ) + EXPECT_EQ( a( i, j, k, l, m, n ), 1 ); +} + +TEST( NDArrayTest, forInternal_dynamic_1D ) +{ + int I = 3; + NDArray< int, + SizesHolder< int, 0 >, + index_sequence< 0 > > a; + a.setSizes( I ); + a.setValue( 0 ); + + auto setter = [&] ( int i ) + { + a( i ) += 1; + }; + + a.forInternal( setter ); + + for( int i = 0; i < I; i++ ) + { + if( i == 0 || i == I - 1 ) + EXPECT_EQ( a( i ), 0 ) + << "i = " << i; + else + EXPECT_EQ( a( i ), 1 ) + << "i = " << i; + } +} + +TEST( NDArrayTest, forInternal_dynamic_2D ) +{ + int I = 3, J = 4; + NDArray< int, + SizesHolder< int, 0, 0 >, + index_sequence< 1, 0 > > a; + a.setSizes( I, J ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j ) + { + a( i, j ) += 1; + }; + + a.forInternal( setter ); + + for( int j = 0; j < J; j++ ) + for( int i = 0; i < I; i++ ) + { + if( i == 0 || i == I - 1 || + j == 0 || j == J - 1 ) + EXPECT_EQ( a( i, j ), 0 ) + << "i = " << i << ", j = " << j; + else + EXPECT_EQ( a( i, j ), 1 ) + << "i = " << i << ", j = " << j; + } +} + +TEST( NDArrayTest, forInternal_dynamic_3D ) +{ + int I = 3, J = 4, K = 5; + NDArray< int, + SizesHolder< int, 0, 0, 0 >, + index_sequence< 2, 0, 1 > > a; + a.setSizes( I, J, K ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j, int k ) + { + a( i, j, k ) += 1; + }; + + a.forInternal( setter ); + + for( int k = 0; k < K; k++ ) + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + { + if( i == 0 || i == I - 1 || + j == 0 || j == J - 1 || + k == 0 || k == K - 1 ) + EXPECT_EQ( a( i, j, k ), 0 ) + << "i = " << i << ", j = " << j << ", k = " << k; + else + EXPECT_EQ( a( i, j, k ), 1 ) + << "i = " << i << ", j = " << j << ", k = " << k; + } +} + +TEST( NDArrayTest, forInternal_dynamic_4D ) +{ + int I = 3, J = 4, K = 5, L = 6; + NDArray< 
int, + SizesHolder< int, 0, 0, 0, 0 >, + index_sequence< 3, 2, 0, 1 > > a; + a.setSizes( I, J, K, L ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j, int k, int l ) + { + a( i, j, k, l ) += 1; + }; + + a.forInternal( setter ); + + for( int l = 0; l < L; l++ ) + for( int k = 0; k < K; k++ ) + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + { + if( i == 0 || i == I - 1 || + j == 0 || j == J - 1 || + k == 0 || k == K - 1 || + l == 0 || l == L - 1 ) + EXPECT_EQ( a( i, j, k, l ), 0 ) + << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l; + else + EXPECT_EQ( a( i, j, k, l ), 1 ) + << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l; + } +} + +TEST( NDArrayTest, forInternal_dynamic_5D ) +{ + int I = 3, J = 4, K = 5, L = 6, M = 7; + NDArray< int, + SizesHolder< int, 0, 0, 0, 0, 0 >, + index_sequence< 3, 4, 2, 0, 1 > > a; + a.setSizes( I, J, K, L, M ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j, int k, int l, int m ) + { + a( i, j, k, l, m ) += 1; + }; + + a.forInternal( setter ); + + for( int l = 0; l < L; l++ ) + for( int m = 0; m < M; m++ ) + for( int k = 0; k < K; k++ ) + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + { + if( i == 0 || i == I - 1 || + j == 0 || j == J - 1 || + k == 0 || k == K - 1 || + l == 0 || l == L - 1 || + m == 0 || m == M - 1 ) + EXPECT_EQ( a( i, j, k, l, m ), 0 ) + << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l << ", m = " << m; + else + EXPECT_EQ( a( i, j, k, l, m ), 1 ) + << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l << ", m = " << m; + } +} + +TEST( NDArrayTest, forInternal_dynamic_6D ) +{ + int I = 3, J = 4, K = 5, L = 6, M = 7, N = 8; + NDArray< int, + SizesHolder< int, 0, 0, 0, 0, 0, 0 >, + index_sequence< 5, 3, 4, 2, 0, 1 > > a; + a.setSizes( I, J, K, L, M, N ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j, int k, int l, int m, int n ) + { + a( i, j, k, l, m, n ) += 1; + }; + + a.forInternal( setter ); + + for( 
int n = 0; n < N; n++ ) + for( int l = 0; l < L; l++ ) + for( int m = 0; m < M; m++ ) + for( int k = 0; k < K; k++ ) + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + { + if( i == 0 || i == I - 1 || + j == 0 || j == J - 1 || + k == 0 || k == K - 1 || + l == 0 || l == L - 1 || + m == 0 || m == M - 1 || + n == 0 || n == N - 1 ) + EXPECT_EQ( a( i, j, k, l, m, n ), 0 ) + << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l << ", m = " << m << ", n = " << n; + else + EXPECT_EQ( a( i, j, k, l, m, n ), 1 ) + << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l << ", m = " << m << ", n = " << n; + } +} + +TEST( NDArrayTest, forInternal_static_1D ) +{ + constexpr int I = 3; + StaticNDArray< int, SizesHolder< int, I > > a; +// a.setSizes( 0 ); + a.setValue( 0 ); + + auto setter = [&] ( int i ) + { + a( i ) += 1; + }; + + a.forInternal( setter ); + + for( int i = 0; i < I; i++ ) + { + if( i == 0 || i == I - 1 ) + EXPECT_EQ( a( i ), 0 ) + << "i = " << i; + else + EXPECT_EQ( a( i ), 1 ) + << "i = " << i; + } +} + +TEST( NDArrayTest, forInternal_static_2D ) +{ + constexpr int I = 3, J = 4; + StaticNDArray< int, SizesHolder< int, I, J > > a; +// a.setSizes( 0, 0 ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j ) + { + a( i, j ) += 1; + }; + + a.forInternal( setter ); + + for( int j = 0; j < J; j++ ) + for( int i = 0; i < I; i++ ) + { + if( i == 0 || i == I - 1 || + j == 0 || j == J - 1 ) + EXPECT_EQ( a( i, j ), 0 ) + << "i = " << i << ", j = " << j; + else + EXPECT_EQ( a( i, j ), 1 ) + << "i = " << i << ", j = " << j; + } +} + +TEST( NDArrayTest, forInternal_static_3D ) +{ + constexpr int I = 3, J = 4, K = 5; + StaticNDArray< int, SizesHolder< int, I, J, K > > a; +// a.setSizes( 0, 0, 0 ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j, int k ) + { + a( i, j, k ) += 1; + }; + + a.forInternal( setter ); + + for( int k = 0; k < K; k++ ) + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + { + if( i == 0 || i == I 
- 1 || + j == 0 || j == J - 1 || + k == 0 || k == K - 1 ) + EXPECT_EQ( a( i, j, k ), 0 ) + << "i = " << i << ", j = " << j << ", k = " << k; + else + EXPECT_EQ( a( i, j, k ), 1 ) + << "i = " << i << ", j = " << j << ", k = " << k; + } +} + +TEST( NDArrayTest, forInternal_static_4D ) +{ + constexpr int I = 3, J = 4, K = 5, L = 6; + StaticNDArray< int, SizesHolder< int, I, J, K, L > > a; +// a.setSizes( 0, 0, 0, 0 ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j, int k, int l ) + { + a( i, j, k, l ) += 1; + }; + + a.forInternal( setter ); + + for( int l = 0; l < L; l++ ) + for( int k = 0; k < K; k++ ) + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + { + if( i == 0 || i == I - 1 || + j == 0 || j == J - 1 || + k == 0 || k == K - 1 || + l == 0 || l == L - 1 ) + EXPECT_EQ( a( i, j, k, l ), 0 ) + << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l; + else + EXPECT_EQ( a( i, j, k, l ), 1 ) + << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l; + } +} + +TEST( NDArrayTest, forInternal_static_5D ) +{ + constexpr int I = 3, J = 4, K = 5, L = 6, M = 7; + StaticNDArray< int, SizesHolder< int, I, J, K, L, M > > a; +// a.setSizes( 0, 0, 0, 0, 0 ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j, int k, int l, int m ) + { + a( i, j, k, l, m ) += 1; + }; + + a.forInternal( setter ); + + for( int l = 0; l < L; l++ ) + for( int m = 0; m < M; m++ ) + for( int k = 0; k < K; k++ ) + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + { + if( i == 0 || i == I - 1 || + j == 0 || j == J - 1 || + k == 0 || k == K - 1 || + l == 0 || l == L - 1 || + m == 0 || m == M - 1 ) + EXPECT_EQ( a( i, j, k, l, m ), 0 ) + << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l << ", m = " << m; + else + EXPECT_EQ( a( i, j, k, l, m ), 1 ) + << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l << ", m = " << m; + } +} + +TEST( NDArrayTest, forInternal_static_6D ) +{ + constexpr int I = 3, J = 4, K = 5, L = 6, M = 
7, N = 8; + StaticNDArray< int, SizesHolder< int, I, J, K, L, M, N > > a; +// a.setSizes( 0, 0, 0, 0, 0, 0 ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j, int k, int l, int m, int n ) + { + a( i, j, k, l, m, n ) += 1; + }; + + a.forInternal( setter ); + + for( int n = 0; n < N; n++ ) + for( int l = 0; l < L; l++ ) + for( int m = 0; m < M; m++ ) + for( int k = 0; k < K; k++ ) + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + { + if( i == 0 || i == I - 1 || + j == 0 || j == J - 1 || + k == 0 || k == K - 1 || + l == 0 || l == L - 1 || + m == 0 || m == M - 1 || + n == 0 || n == N - 1 ) + EXPECT_EQ( a( i, j, k, l, m, n ), 0 ) + << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l << ", m = " << m << ", n = " << n; + else + EXPECT_EQ( a( i, j, k, l, m, n ), 1 ) + << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l << ", m = " << m << ", n = " << n; + } +} + +TEST( NDArrayTest, forBoundary_dynamic_1D ) +{ + int I = 3; + NDArray< int, + SizesHolder< int, 0 >, + index_sequence< 0 > > a; + a.setSizes( I ); + a.setValue( 0 ); + + auto setter = [&] ( int i ) + { + a( i ) += 1; + }; + + a.forBoundary( setter ); + + for( int i = 0; i < I; i++ ) + { + if( i == 0 || i == I - 1 ) + EXPECT_EQ( a( i ), 1 ) + << "i = " << i; + else + EXPECT_EQ( a( i ), 0 ) + << "i = " << i; + } +} + +TEST( NDArrayTest, forBoundary_dynamic_2D ) +{ + int I = 3, J = 4; + NDArray< int, + SizesHolder< int, 0, 0 >, + index_sequence< 1, 0 > > a; + a.setSizes( I, J ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j ) + { + a( i, j ) += 1; + }; + + a.forBoundary( setter ); + + for( int j = 0; j < J; j++ ) + for( int i = 0; i < I; i++ ) + { + if( i == 0 || i == I - 1 || + j == 0 || j == J - 1 ) + EXPECT_EQ( a( i, j ), 1 ) + << "i = " << i << ", j = " << j; + else + EXPECT_EQ( a( i, j ), 0 ) + << "i = " << i << ", j = " << j; + } +} + +TEST( NDArrayTest, forBoundary_dynamic_3D ) +{ + int I = 3, J = 4, K = 5; + NDArray< int, + SizesHolder< int, 0, 0, 0 
>, + index_sequence< 2, 0, 1 > > a; + a.setSizes( I, J, K ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j, int k ) + { + a( i, j, k ) += 1; + }; + + a.forBoundary( setter ); + + for( int k = 0; k < K; k++ ) + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + { + if( i == 0 || i == I - 1 || + j == 0 || j == J - 1 || + k == 0 || k == K - 1 ) + EXPECT_EQ( a( i, j, k ), 1 ) + << "i = " << i << ", j = " << j << ", k = " << k; + else + EXPECT_EQ( a( i, j, k ), 0 ) + << "i = " << i << ", j = " << j << ", k = " << k; + } +} + +// TODO: implement general ParallelBoundaryExecutor +//TEST( NDArrayTest, forBoundary_dynamic_4D ) +//{ +// int I = 3, J = 4, K = 5, L = 6; +// NDArray< int, +// SizesHolder< int, 0, 0, 0, 0 >, +// index_sequence< 3, 2, 0, 1 > > a; +// a.setSizes( I, J, K, L ); +// a.setValue( 0 ); +// +// auto setter = [&] ( int i, int j, int k, int l ) +// { +// a( i, j, k, l ) += 1; +// }; +// +// a.forBoundary( setter ); +// +// for( int l = 0; l < L; l++ ) +// for( int k = 0; k < K; k++ ) +// for( int i = 0; i < I; i++ ) +// for( int j = 0; j < J; j++ ) +// { +// if( i == 0 || i == I - 1 || +// j == 0 || j == J - 1 || +// k == 0 || k == K - 1 || +// l == 0 || l == L - 1 ) +// EXPECT_EQ( a( i, j, k, l ), 1 ) +// << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l; +// else +// EXPECT_EQ( a( i, j, k, l ), 0 ) +// << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l; +// } +//} +// +//TEST( NDArrayTest, forBoundary_dynamic_5D ) +//{ +// int I = 3, J = 4, K = 5, L = 6, M = 7; +// NDArray< int, +// SizesHolder< int, 0, 0, 0, 0, 0 >, +// index_sequence< 3, 4, 2, 0, 1 > > a; +// a.setSizes( I, J, K, L, M ); +// a.setValue( 0 ); +// +// auto setter = [&] ( int i, int j, int k, int l, int m ) +// { +// a( i, j, k, l, m ) += 1; +// }; +// +// a.forBoundary( setter ); +// +// for( int l = 0; l < L; l++ ) +// for( int m = 0; m < M; m++ ) +// for( int k = 0; k < K; k++ ) +// for( int i = 0; i < I; i++ ) +// for( int j = 0; j 
< J; j++ ) +// { +// if( i == 0 || i == I - 1 || +// j == 0 || j == J - 1 || +// k == 0 || k == K - 1 || +// l == 0 || l == L - 1 || +// m == 0 || m == M - 1 ) +// EXPECT_EQ( a( i, j, k, l, m ), 1 ) +// << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l << ", m = " << m; +// else +// EXPECT_EQ( a( i, j, k, l, m ), 0 ) +// << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l << ", m = " << m; +// } +//} +// +//TEST( NDArrayTest, forBoundary_dynamic_6D ) +//{ +// int I = 3, J = 4, K = 5, L = 6, M = 7, N = 8; +// NDArray< int, +// SizesHolder< int, 0, 0, 0, 0, 0, 0 >, +// index_sequence< 5, 3, 4, 2, 0, 1 > > a; +// a.setSizes( I, J, K, L, M, N ); +// a.setValue( 0 ); +// +// auto setter = [&] ( int i, int j, int k, int l, int m, int n ) +// { +// a( i, j, k, l, m, n ) += 1; +// }; +// +// a.forBoundary( setter ); +// +// for( int n = 0; n < N; n++ ) +// for( int l = 0; l < L; l++ ) +// for( int m = 0; m < M; m++ ) +// for( int k = 0; k < K; k++ ) +// for( int i = 0; i < I; i++ ) +// for( int j = 0; j < J; j++ ) +// { +// if( i == 0 || i == I - 1 || +// j == 0 || j == J - 1 || +// k == 0 || k == K - 1 || +// l == 0 || l == L - 1 || +// m == 0 || m == M - 1 || +// n == 0 || n == N - 1 ) +// EXPECT_EQ( a( i, j, k, l, m, n ), 1 ) +// << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l << ", m = " << m << ", n = " << n; +// else +// EXPECT_EQ( a( i, j, k, l, m, n ), 0 ) +// << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l << ", m = " << m << ", n = " << n; +// } +//} + +TEST( NDArrayTest, forBoundary_static_1D ) +{ + constexpr int I = 3; + StaticNDArray< int, SizesHolder< int, I > > a; +// a.setSizes( 0 ); + a.setValue( 0 ); + + auto setter = [&] ( int i ) + { + a( i ) += 1; + }; + + a.forBoundary( setter ); + + for( int i = 0; i < I; i++ ) + { + if( i == 0 || i == I - 1 ) + EXPECT_EQ( a( i ), 1 ) + << "i = " << i; + else + EXPECT_EQ( a( i ), 0 ) + << "i = " << i; + } +} + +TEST( NDArrayTest, forBoundary_static_2D 
) +{ + constexpr int I = 3, J = 4; + StaticNDArray< int, SizesHolder< int, I, J > > a; +// a.setSizes( 0, 0 ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j ) + { + a( i, j ) += 1; + }; + + a.forBoundary( setter ); + + for( int j = 0; j < J; j++ ) + for( int i = 0; i < I; i++ ) + { + if( i == 0 || i == I - 1 || + j == 0 || j == J - 1 ) + EXPECT_EQ( a( i, j ), 1 ) + << "i = " << i << ", j = " << j; + else + EXPECT_EQ( a( i, j ), 0 ) + << "i = " << i << ", j = " << j; + } +} + +TEST( NDArrayTest, forBoundary_static_3D ) +{ + constexpr int I = 3, J = 4, K = 5; + StaticNDArray< int, SizesHolder< int, I, J, K > > a; +// a.setSizes( 0, 0, 0 ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j, int k ) + { + a( i, j, k ) += 1; + }; + + a.forBoundary( setter ); + + for( int k = 0; k < K; k++ ) + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + { + if( i == 0 || i == I - 1 || + j == 0 || j == J - 1 || + k == 0 || k == K - 1 ) + EXPECT_EQ( a( i, j, k ), 1 ) + << "i = " << i << ", j = " << j << ", k = " << k; + else + EXPECT_EQ( a( i, j, k ), 0 ) + << "i = " << i << ", j = " << j << ", k = " << k; + } +} + +TEST( NDArrayTest, forBoundary_static_4D ) +{ + constexpr int I = 3, J = 4, K = 5, L = 6; + StaticNDArray< int, SizesHolder< int, I, J, K, L > > a; +// a.setSizes( 0, 0, 0, 0 ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j, int k, int l ) + { + a( i, j, k, l ) += 1; + }; + + a.forBoundary( setter ); + + for( int l = 0; l < L; l++ ) + for( int k = 0; k < K; k++ ) + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + { + if( i == 0 || i == I - 1 || + j == 0 || j == J - 1 || + k == 0 || k == K - 1 || + l == 0 || l == L - 1 ) + EXPECT_EQ( a( i, j, k, l ), 1 ) + << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l; + else + EXPECT_EQ( a( i, j, k, l ), 0 ) + << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l; + } +} + +TEST( NDArrayTest, forBoundary_static_5D ) +{ + constexpr int I = 3, J = 4, K = 5, L 
= 6, M = 7; + StaticNDArray< int, SizesHolder< int, I, J, K, L, M > > a; +// a.setSizes( 0, 0, 0, 0, 0 ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j, int k, int l, int m ) + { + a( i, j, k, l, m ) += 1; + }; + + a.forBoundary( setter ); + + for( int l = 0; l < L; l++ ) + for( int m = 0; m < M; m++ ) + for( int k = 0; k < K; k++ ) + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + { + if( i == 0 || i == I - 1 || + j == 0 || j == J - 1 || + k == 0 || k == K - 1 || + l == 0 || l == L - 1 || + m == 0 || m == M - 1 ) + EXPECT_EQ( a( i, j, k, l, m ), 1 ) + << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l << ", m = " << m; + else + EXPECT_EQ( a( i, j, k, l, m ), 0 ) + << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l << ", m = " << m; + } +} + +TEST( NDArrayTest, forBoundary_static_6D ) +{ + constexpr int I = 3, J = 4, K = 5, L = 6, M = 7, N = 8; + StaticNDArray< int, SizesHolder< int, I, J, K, L, M, N > > a; +// a.setSizes( 0, 0, 0, 0, 0, 0 ); + a.setValue( 0 ); + + auto setter = [&] ( int i, int j, int k, int l, int m, int n ) + { + a( i, j, k, l, m, n ) += 1; + }; + + a.forBoundary( setter ); + + for( int n = 0; n < N; n++ ) + for( int l = 0; l < L; l++ ) + for( int m = 0; m < M; m++ ) + for( int k = 0; k < K; k++ ) + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + { + if( i == 0 || i == I - 1 || + j == 0 || j == J - 1 || + k == 0 || k == K - 1 || + l == 0 || l == L - 1 || + m == 0 || m == M - 1 || + n == 0 || n == N - 1 ) + EXPECT_EQ( a( i, j, k, l, m, n ), 1 ) + << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l << ", m = " << m << ", n = " << n; + else + EXPECT_EQ( a( i, j, k, l, m, n ), 0 ) + << "i = " << i << ", j = " << j << ", k = " << k << ", l = " << l << ", m = " << m << ", n = " << n; + } +} + +//#include "GtestMissingError.h" +int main( int argc, char* argv[] ) +{ +//#ifdef HAVE_GTEST + ::testing::InitGoogleTest( &argc, argv ); + return RUN_ALL_TESTS(); +//#else +// 
throw GtestMissingError(); +//#endif +} diff --git a/src/UnitTests/Containers/ndarray/NDSubarrayTest.cpp b/src/UnitTests/Containers/ndarray/NDSubarrayTest.cpp new file mode 100644 index 0000000000000000000000000000000000000000..1b57eed281183f4163501fabdc9027254e255eb9 --- /dev/null +++ b/src/UnitTests/Containers/ndarray/NDSubarrayTest.cpp @@ -0,0 +1,405 @@ +#include "gtest/gtest.h" + +#include <TNL/Containers/NDArray.h> + +using namespace TNL::Containers; +using std::index_sequence; + +// wrapper around static_assert to get the type names in the error message +template< typename Permutation, typename ExpectedPermutation > +void check_permutation() +{ + static_assert( std::is_same< Permutation, ExpectedPermutation >::value, + "The permutation is not the same as the expected permutation." ); +} + +TEST( NDArraySubarrayTest, StaticAsserts ) +{ + using namespace TNL::Containers::__ndarray_impl; + +// auto is_even = [](int _in) {return _in % 2 == 0;}; + using expected_type = std::integer_sequence< int, 0, 2, 4, 6 >; + using test_type = std::integer_sequence< int, 0, 1, 2, 3, 4, 5, 6 >; +// constexpr auto result = filter_sequence(test_type{}, is_even); + constexpr auto result = filter_sequence< expected_type >(test_type{}); + using result_type = std::decay_t< decltype(result) >; + static_assert(std::is_same< result_type, expected_type >::value, "Integer sequences should be equal"); + + + + using Permutation = std::integer_sequence< std::size_t, 5, 3, 1, 4, 2, 6, 0 >; + { + using Dimensions = std::integer_sequence< std::size_t, 3, 4, 6 >; + using Subpermutation = typename SubpermutationGetter< Dimensions, Permutation >::Subpermutation; + check_permutation< Subpermutation, + std::integer_sequence< std::size_t, 0, 1, 2 > >(); + } + { + using Dimensions = std::integer_sequence< std::size_t, 1, 4, 2 >; + using Subpermutation = typename SubpermutationGetter< Dimensions, Permutation >::Subpermutation; + check_permutation< Subpermutation, + std::integer_sequence< std::size_t, 0, 2, 1 > >(); + } + { + using Dimensions = std::integer_sequence< std::size_t, 5, 1, 6 >; + using Subpermutation = typename
SubpermutationGetter< Dimensions, Permutation >::Subpermutation; + check_permutation< Subpermutation, + std::integer_sequence< std::size_t, 1, 0, 2 > >(); + } + { + using Dimensions = std::integer_sequence< std::size_t, 5, 1, 2 >; + using Subpermutation = typename SubpermutationGetter< Dimensions, Permutation >::Subpermutation; + check_permutation< Subpermutation, + std::integer_sequence< std::size_t, 2, 0, 1 > >(); + } + { + using Dimensions = std::integer_sequence< std::size_t, 2, 3, 4 >; + using Subpermutation = typename SubpermutationGetter< Dimensions, Permutation >::Subpermutation; + check_permutation< Subpermutation, + std::integer_sequence< std::size_t, 1, 2, 0 > >(); + } + { + using Dimensions = std::integer_sequence< std::size_t, 0, 1, 5 >; + using Subpermutation = typename SubpermutationGetter< Dimensions, Permutation >::Subpermutation; + check_permutation< Subpermutation, + std::integer_sequence< std::size_t, 2, 1, 0 > >(); + } + + static_assert( is_increasing_sequence( {0, 1, 2, 3, 4} ), "bug" ); + static_assert( ! is_increasing_sequence( {0, 1, 2, 0, 4} ), "bug" ); + static_assert( ! 
is_increasing_sequence( {1, 0, 2, 3, 4} ), "bug" ); +} + +TEST( NDArraySubarrayTest, Dynamic_6D ) +{ + int I = 2, J = 3, K = 4, L = 5, M = 6, N = 7; + NDArray< int, + SizesHolder< int, 0, 0, 0, 0, 0, 0 >, + index_sequence< 5, 3, 4, 2, 0, 1 > > a; + a.setSizes( I, J, K, L, M, N ); + a.setValue( 0 ); + + auto v = a.getView(); + + auto s1 = v.template getSubarrayView< 0 >( 0, 0, 0, 0, 0, 0 ); + const int size1 = s1.template getSize< 0 >(); + const int stride1 = s1.template getStride< 0 >(); + EXPECT_EQ( size1, I ); + EXPECT_EQ( stride1, J ); + for( int i = 0; i < I; i++ ) { + s1( i ) = 1 + i; + EXPECT_EQ( v( i, 0, 0, 0, 0, 0 ), 1 + i ); + } + a.setValue( 0 ); + + auto s2 = v.template getSubarrayView< 1 >( 0, 0, 0, 0, 0, 0 ); + const int size2 = s2.template getSize< 0 >(); + const int stride2 = s2.template getStride< 0 >(); + EXPECT_EQ( size2, J ); + EXPECT_EQ( stride2, 1 ); + for( int j = 0; j < J; j++ ) { + s2( j ) = 1 + j; + EXPECT_EQ( v( 0, j, 0, 0, 0, 0 ), 1 + j ); + } + a.setValue( 0 ); + + auto s3 = v.template getSubarrayView< 2 >( 0, 0, 0, 0, 0, 0 ); + const int size3 = s3.template getSize< 0 >(); + const int stride3 = s3.template getStride< 0 >(); + EXPECT_EQ( size3, K ); + EXPECT_EQ( stride3, I*J ); + for( int k = 0; k < K; k++ ) { + s3( k ) = 1 + k; + EXPECT_EQ( v( 0, 0, k, 0, 0, 0 ), 1 + k ); + } + a.setValue( 0 ); + + auto s4 = v.template getSubarrayView< 3 >( 0, 0, 0, 0, 0, 0 ); + const int size4 = s4.template getSize< 0 >(); + const int stride4 = s4.template getStride< 0 >(); + EXPECT_EQ( size4, L ); + EXPECT_EQ( stride4, I*J*K*M ); + for( int l = 0; l < L; l++ ) { + s4( l ) = 1 + l; + EXPECT_EQ( v( 0, 0, 0, l, 0, 0 ), 1 + l ); + } + a.setValue( 0 ); + + auto s5 = v.template getSubarrayView< 4 >( 0, 0, 0, 0, 0, 0 ); + const int size5 = s5.template getSize< 0 >(); + const int stride5 = s5.template getStride< 0 >(); + EXPECT_EQ( size5, M ); + EXPECT_EQ( stride5, I*J*K ); + for( int m = 0; m < M; m++ ) { + s5( m ) = 1 + m; + EXPECT_EQ( v( 0, 0, 0, 0, m, 0 
), 1 + m ); + } + a.setValue( 0 ); + + auto s6 = v.template getSubarrayView< 5 >( 0, 0, 0, 0, 0, 0 ); + const int size6 = s6.template getSize< 0 >(); + const int stride6 = s6.template getStride< 0 >(); + EXPECT_EQ( size6, N ); + EXPECT_EQ( stride6, I*J*K*L*M ); + for( int n = 0; n < N; n++ ) { + s6( n ) = 1 + n; + EXPECT_EQ( v( 0, 0, 0, 0, 0, n ), 1 + n ); + } + a.setValue( 0 ); + + + auto s_ij = v.template getSubarrayView< 0, 1 >( 0, 0, 0, 0, 0, 0 ); + const int size_ij_0 = s_ij.template getSize< 0 >(); + const int size_ij_1 = s_ij.template getSize< 1 >(); + const int stride_ij_0 = s_ij.template getStride< 0 >(); + const int stride_ij_1 = s_ij.template getStride< 1 >(); + EXPECT_EQ( size_ij_0, I ); + EXPECT_EQ( size_ij_1, J ); + EXPECT_EQ( stride_ij_0, 1 ); + EXPECT_EQ( stride_ij_1, 1 ); + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) { + s_ij( i, j ) = 1; + EXPECT_EQ( v( i, j, 0, 0, 0, 0 ), 1 ); + } + a.setValue( 0 ); + + auto s_ik = v.template getSubarrayView< 0, 2 >( 0, 0, 0, 0, 0, 0 ); + const int size_ik_0 = s_ik.template getSize< 0 >(); + const int size_ik_1 = s_ik.template getSize< 1 >(); + const int stride_ik_0 = s_ik.template getStride< 0 >(); + const int stride_ik_1 = s_ik.template getStride< 1 >(); + EXPECT_EQ( size_ik_0, I ); + EXPECT_EQ( size_ik_1, K ); + EXPECT_EQ( stride_ik_0, J ); + EXPECT_EQ( stride_ik_1, 1 ); + for( int i = 0; i < I; i++ ) + for( int k = 0; k < K; k++ ) { + s_ik( i, k ) = 1 + k; + EXPECT_EQ( v( i, 0, k, 0, 0, 0 ), 1 + k ); + } + a.setValue( 0 ); + + auto s_il = v.template getSubarrayView< 0, 3 >( 0, 0, 0, 0, 0, 0 ); + const int size_il_0 = s_il.template getSize< 0 >(); + const int size_il_1 = s_il.template getSize< 1 >(); + const int stride_il_0 = s_il.template getStride< 0 >(); + const int stride_il_1 = s_il.template getStride< 1 >(); + EXPECT_EQ( size_il_0, I ); + EXPECT_EQ( size_il_1, L ); + EXPECT_EQ( stride_il_0, J ); + EXPECT_EQ( stride_il_1, K*M ); + for( int i = 0; i < I; i++ ) + for( int l = 0; l < L; l++ ) 
{ + s_il( i, l ) = 1 + l; + EXPECT_EQ( v( i, 0, 0, l, 0, 0 ), 1 + l ); + } + a.setValue( 0 ); + + auto s_im = v.template getSubarrayView< 0, 4 >( 0, 0, 0, 0, 0, 0 ); + const int size_im_0 = s_im.template getSize< 0 >(); + const int size_im_1 = s_im.template getSize< 1 >(); + const int stride_im_0 = s_im.template getStride< 0 >(); + const int stride_im_1 = s_im.template getStride< 1 >(); + EXPECT_EQ( size_im_0, I ); + EXPECT_EQ( size_im_1, M ); + EXPECT_EQ( stride_im_0, J ); + EXPECT_EQ( stride_im_1, K ); + for( int i = 0; i < I; i++ ) + for( int m = 0; m < M; m++ ) { + s_im( i, m ) = 1 + m; + EXPECT_EQ( v( i, 0, 0, 0, m, 0 ), 1 + m ); + } + a.setValue( 0 ); + + auto s_in = v.template getSubarrayView< 0, 5 >( 0, 0, 0, 0, 0, 0 ); + const int size_in_0 = s_in.template getSize< 0 >(); + const int size_in_1 = s_in.template getSize< 1 >(); + const int stride_in_0 = s_in.template getStride< 0 >(); + const int stride_in_1 = s_in.template getStride< 1 >(); + EXPECT_EQ( size_in_0, I ); + EXPECT_EQ( size_in_1, N ); + EXPECT_EQ( stride_in_0, J ); + EXPECT_EQ( stride_in_1, K*L*M ); + for( int i = 0; i < I; i++ ) + for( int n = 0; n < N; n++ ) { + s_in( i, n ) = 1 + n; + EXPECT_EQ( v( i, 0, 0, 0, 0, n ), 1 + n ); + } + a.setValue( 0 ); + + + auto s_jk = v.template getSubarrayView< 1, 2 >( 0, 0, 0, 0, 0, 0 ); + const int size_jk_0 = s_jk.template getSize< 0 >(); + const int size_jk_1 = s_jk.template getSize< 1 >(); + const int stride_jk_0 = s_jk.template getStride< 0 >(); + const int stride_jk_1 = s_jk.template getStride< 1 >(); + EXPECT_EQ( size_jk_0, J ); + EXPECT_EQ( size_jk_1, K ); + EXPECT_EQ( stride_jk_0, 1 ); + EXPECT_EQ( stride_jk_1, I ); + for( int j = 0; j < J; j++ ) + for( int k = 0; k < K; k++ ) { + s_jk( j, k ) = 1 + k; + EXPECT_EQ( v( 0, j, k, 0, 0, 0 ), 1 + k ); + } + a.setValue( 0 ); + + auto s_jl = v.template getSubarrayView< 1, 3 >( 0, 0, 0, 0, 0, 0 ); + const int size_jl_0 = s_jl.template getSize< 0 >(); + const int size_jl_1 = s_jl.template getSize< 1 >(); + 
const int stride_jl_0 = s_jl.template getStride< 0 >(); + const int stride_jl_1 = s_jl.template getStride< 1 >(); + EXPECT_EQ( size_jl_0, J ); + EXPECT_EQ( size_jl_1, L ); + EXPECT_EQ( stride_jl_0, 1 ); + EXPECT_EQ( stride_jl_1, I*K*M ); + for( int j = 0; j < J; j++ ) + for( int l = 0; l < L; l++ ) { + s_jl( j, l ) = 1 + l; + EXPECT_EQ( v( 0, j, 0, l, 0, 0 ), 1 + l ); + } + a.setValue( 0 ); + + auto s_jm = v.template getSubarrayView< 1, 4 >( 0, 0, 0, 0, 0, 0 ); + const int size_jm_0 = s_jm.template getSize< 0 >(); + const int size_jm_1 = s_jm.template getSize< 1 >(); + const int stride_jm_0 = s_jm.template getStride< 0 >(); + const int stride_jm_1 = s_jm.template getStride< 1 >(); + EXPECT_EQ( size_jm_0, J ); + EXPECT_EQ( size_jm_1, M ); + EXPECT_EQ( stride_jm_0, 1 ); + EXPECT_EQ( stride_jm_1, I*K ); + for( int j = 0; j < J; j++ ) + for( int m = 0; m < M; m++ ) { + s_jm( j, m ) = 1 + m; + EXPECT_EQ( v( 0, j, 0, 0, m, 0 ), 1 + m ); + } + a.setValue( 0 ); + + auto s_jn = v.template getSubarrayView< 1, 5 >( 0, 0, 0, 0, 0, 0 ); + const int size_jn_0 = s_jn.template getSize< 0 >(); + const int size_jn_1 = s_jn.template getSize< 1 >(); + const int stride_jn_0 = s_jn.template getStride< 0 >(); + const int stride_jn_1 = s_jn.template getStride< 1 >(); + EXPECT_EQ( size_jn_0, J ); + EXPECT_EQ( size_jn_1, N ); + EXPECT_EQ( stride_jn_0, 1 ); + EXPECT_EQ( stride_jn_1, I*K*L*M ); + for( int j = 0; j < J; j++ ) + for( int n = 0; n < N; n++ ) { + s_jn( j, n ) = 1 + n; + EXPECT_EQ( v( 0, j, 0, 0, 0, n ), 1 + n ); + } + a.setValue( 0 ); + + + auto s_kl = v.template getSubarrayView< 2, 3 >( 0, 0, 0, 0, 0, 0 ); + const int size_kl_0 = s_kl.template getSize< 0 >(); + const int size_kl_1 = s_kl.template getSize< 1 >(); + const int stride_kl_0 = s_kl.template getStride< 0 >(); + const int stride_kl_1 = s_kl.template getStride< 1 >(); + EXPECT_EQ( size_kl_0, K ); + EXPECT_EQ( size_kl_1, L ); + EXPECT_EQ( stride_kl_0, I*J ); + EXPECT_EQ( stride_kl_1, M ); + for( int k = 0; k < K; k++ ) + 
for( int l = 0; l < L; l++ ) { + s_kl( k, l ) = 1 + l; + EXPECT_EQ( v( 0, 0, k, l, 0, 0 ), 1 + l ); + } + a.setValue( 0 ); + + auto s_km = v.template getSubarrayView< 2, 4 >( 0, 0, 0, 0, 0, 0 ); + const int size_km_0 = s_km.template getSize< 0 >(); + const int size_km_1 = s_km.template getSize< 1 >(); + const int stride_km_0 = s_km.template getStride< 0 >(); + const int stride_km_1 = s_km.template getStride< 1 >(); + EXPECT_EQ( size_km_0, K ); + EXPECT_EQ( size_km_1, M ); + EXPECT_EQ( stride_km_0, I*J ); + EXPECT_EQ( stride_km_1, 1 ); + for( int k = 0; k < K; k++ ) + for( int m = 0; m < M; m++ ) { + s_km( k, m ) = 1 + m; + EXPECT_EQ( v( 0, 0, k, 0, m, 0 ), 1 + m ); + } + a.setValue( 0 ); + + auto s_kn = v.template getSubarrayView< 2, 5 >( 0, 0, 0, 0, 0, 0 ); + const int size_kn_0 = s_kn.template getSize< 0 >(); + const int size_kn_1 = s_kn.template getSize< 1 >(); + const int stride_kn_0 = s_kn.template getStride< 0 >(); + const int stride_kn_1 = s_kn.template getStride< 1 >(); + EXPECT_EQ( size_kn_0, K ); + EXPECT_EQ( size_kn_1, N ); + EXPECT_EQ( stride_kn_0, I*J ); + EXPECT_EQ( stride_kn_1, L*M ); + for( int k = 0; k < K; k++ ) + for( int n = 0; n < N; n++ ) { + s_kn( k, n ) = 1 + n; + EXPECT_EQ( v( 0, 0, k, 0, 0, n ), 1 + n ); + } + a.setValue( 0 ); + + + auto s_lm = v.template getSubarrayView< 3, 4 >( 0, 0, 0, 0, 0, 0 ); + const int size_lm_0 = s_lm.template getSize< 0 >(); + const int size_lm_1 = s_lm.template getSize< 1 >(); + const int stride_lm_0 = s_lm.template getStride< 0 >(); + const int stride_lm_1 = s_lm.template getStride< 1 >(); + EXPECT_EQ( size_lm_0, L ); + EXPECT_EQ( size_lm_1, M ); + EXPECT_EQ( stride_lm_0, 1 ); + EXPECT_EQ( stride_lm_1, I*J*K ); + for( int l = 0; l < L; l++ ) + for( int m = 0; m < M; m++ ) { + s_lm( l, m ) = 1 + m; + EXPECT_EQ( v( 0, 0, 0, l, m, 0 ), 1 + m ); + } + a.setValue( 0 ); + + auto s_ln = v.template getSubarrayView< 3, 5 >( 0, 0, 0, 0, 0, 0 ); + const int size_ln_0 = s_ln.template getSize< 0 >(); + const int size_ln_1 
= s_ln.template getSize< 1 >(); + const int stride_ln_0 = s_ln.template getStride< 0 >(); + const int stride_ln_1 = s_ln.template getStride< 1 >(); + EXPECT_EQ( size_ln_0, L ); + EXPECT_EQ( size_ln_1, N ); + EXPECT_EQ( stride_ln_0, I*J*K*M ); + EXPECT_EQ( stride_ln_1, 1 ); + for( int l = 0; l < L; l++ ) + for( int n = 0; n < N; n++ ) { + s_ln( l, n ) = 1 + n; + EXPECT_EQ( v( 0, 0, 0, l, 0, n ), 1 + n ); + } + a.setValue( 0 ); + + + auto s_mn = v.template getSubarrayView< 4, 5 >( 0, 0, 0, 0, 0, 0 ); + const int size_mn_0 = s_mn.template getSize< 0 >(); + const int size_mn_1 = s_mn.template getSize< 1 >(); + const int stride_mn_0 = s_mn.template getStride< 0 >(); + const int stride_mn_1 = s_mn.template getStride< 1 >(); + EXPECT_EQ( size_mn_0, M ); + EXPECT_EQ( size_mn_1, N ); + EXPECT_EQ( stride_mn_0, I*J*K ); + EXPECT_EQ( stride_mn_1, L ); + for( int m = 0; m < M; m++ ) + for( int n = 0; n < N; n++ ) { + s_mn( m, n ) = 1 + n; + EXPECT_EQ( v( 0, 0, 0, 0, m, n ), 1 + n ); + } + a.setValue( 0 ); +} + +//#include "GtestMissingError.h" +int main( int argc, char* argv[] ) +{ +//#ifdef HAVE_GTEST + ::testing::InitGoogleTest( &argc, argv ); + return RUN_ALL_TESTS(); +//#else +// throw GtestMissingError(); +//#endif +} diff --git a/src/UnitTests/Containers/ndarray/SlicedNDArrayTest.cpp b/src/UnitTests/Containers/ndarray/SlicedNDArrayTest.cpp new file mode 100644 index 0000000000000000000000000000000000000000..8574a56021c6da57331dc63bb7747aa1ba262cb1 --- /dev/null +++ b/src/UnitTests/Containers/ndarray/SlicedNDArrayTest.cpp @@ -0,0 +1,251 @@ +#include "gtest/gtest.h" + +#include + +using namespace TNL::Containers; +using std::index_sequence; + +template< typename Array > +void expect_identity( const Array& a ) +{ + Array identity; + identity.setLike( a ); + for( int i = 0; i < identity.getSize(); i++ ) + identity[ i ] = i; + EXPECT_EQ( a, identity ); +} + +template< typename Array, typename Seq > +void expect_seq( const Array& a, const Seq& seq ) +{ + for( int i = 0; i < 
a.getSize(); i++ ) + EXPECT_EQ( a[ i ], seq[ i ] ); +} + +TEST( SlicedNDArrayTest, 2D_Static_Identity ) +{ + constexpr int I = 3, J = 5; + SlicedNDArray< int, SizesHolder< int, I, J > > a; + a.setSizes( 0, 0 ); + + int v = 0; + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + a( i, j ) = v++; + + expect_identity( a.getStorageArray() ); +} + +TEST( SlicedNDArrayTest, 2D_Static_Permuted ) +{ + constexpr int I = 3, J = 5; + SlicedNDArray< int, + SizesHolder< int, I, J >, + index_sequence< 1, 0 > > a; + a.setSizes( 0, 0 ); + + int v = 0; + for( int j = 0; j < J; j++ ) + for( int i = 0; i < I; i++ ) + a( i, j ) = v++; + + expect_identity( a.getStorageArray() ); +} + +TEST( SlicedNDArrayTest, 6D_Dynamic ) +{ + int I = 2, J = 2, K = 2, L = 2, M = 2, N = 2; + SlicedNDArray< int, + SizesHolder< int, 0, 0, 0, 0, 0, 0 >, + index_sequence< 5, 3, 4, 2, 0, 1 > > a; + a.setSizes( I, J, K, L, M, N ); + + int v = 0; + for( int n = 0; n < N; n++ ) + for( int l = 0; l < L; l++ ) + for( int m = 0; m < M; m++ ) + for( int k = 0; k < K; k++ ) + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + a( i, j, k, l, m, n ) = v++; + + expect_identity( a.getStorageArray() ); +} + + +TEST( SlicedNDArrayTest, Sliced2D_Dynamic_Identity ) +{ + const int I = 3, J = 5; + SlicedNDArray< int, + SizesHolder< int, 0, 0 >, + index_sequence< 0, 1 >, + SliceInfo< 1, 2 > > a; // J is sliced + a.setSizes( I, J ); + + a.getStorageArray().setValue(-1); + int v = 0; + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + a( i, j ) = v++; + + const std::vector< int > seq({ + // first slice + 0, 1, + 5, 6, + 10, 11, + // second slice + 2, 3, + 7, 8, + 12, 13, + // third slice + 4, -1, + 9, -1, + 14, -1, + }); + expect_seq( a.getStorageArray(), seq ); +} + +TEST( SlicedNDArrayTest, Sliced2D_HalfStatic_Identity ) +{ + constexpr int I = 3; + const int J = 5; + SlicedNDArray< int, + SizesHolder< int, I, 0 >, + index_sequence< 0, 1 >, + SliceInfo< 1, 2 > > a; // J is sliced + a.setSizes( 
0, J ); + + a.getStorageArray().setValue(-1); + int v = 0; + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + a( i, j ) = v++; + + const std::vector< int > seq({ + // first slice + 0, 1, + 5, 6, + 10, 11, + // second slice + 2, 3, + 7, 8, + 12, 13, + // third slice + 4, -1, + 9, -1, + 14, -1, + }); + expect_seq( a.getStorageArray(), seq ); +} + +TEST( SlicedNDArrayTest, Sliced2D_Dynamic_Permuted ) +{ + const int I = 3, J = 5; + SlicedNDArray< int, + SizesHolder< int, 0, 0 >, + index_sequence< 1, 0 >, + SliceInfo< 0, 2 > > a; // I is sliced + a.setSizes( I, J ); + + a.getStorageArray().setValue(-1); + int v = 0; + for( int j = 0; j < J; j++ ) + for( int i = 0; i < I; i++ ) + a( i, j ) = v++; + + const std::vector< int > seq({ + // first slice (transposed) + 0, 1, + 3, 4, + 6, 7, + 9, 10, + 12, 13, + // second slice (transposed) + 2, -1, + 5, -1, + 8, -1, + 11, -1, + 14, -1, + }); + expect_seq( a.getStorageArray(), seq ); +} + +TEST( SlicedNDArrayTest, Sliced2D_HalfStatic_Permuted ) +{ + const int I = 3; + constexpr int J = 5; + SlicedNDArray< int, + SizesHolder< int, 0, J >, + index_sequence< 1, 0 >, + SliceInfo< 0, 2 > > a; // I is sliced + a.setSizes( I, 0 ); + + a.getStorageArray().setValue(-1); + int v = 0; + for( int j = 0; j < J; j++ ) + for( int i = 0; i < I; i++ ) + a( i, j ) = v++; + + const std::vector< int > seq({ + // first slice (transposed) + 0, 1, + 3, 4, + 6, 7, + 9, 10, + 12, 13, + // second slice (transposed) + 2, -1, + 5, -1, + 8, -1, + 11, -1, + 14, -1, + }); + expect_seq( a.getStorageArray(), seq ); +} + + +TEST( SlicedNDArrayTest, CopySemantics ) +{ + const int I = 3, J = 4; + SlicedNDArray< int, + SizesHolder< int, 0, 0 >, + index_sequence< 0, 1 >, + SliceInfo< 1, 2 > > a, b, c; // J is sliced + a.setSizes( I, J ); + + int v = 0; + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + a( i, j ) = v++; + + b = a; + EXPECT_EQ( a, b ); + + auto a_view = a.getView(); + auto b_view = b.getView(); + EXPECT_EQ( a_view, b_view ); 
+ EXPECT_EQ( a_view.getView(), b_view ); + EXPECT_EQ( a_view.getConstView(), b_view.getConstView() ); + EXPECT_EQ( a.getConstView(), b.getConstView() ); + EXPECT_EQ( a.getConstView(), b_view.getConstView() ); + + c.setSizes( I, J ); + auto c_view = c.getView(); + c_view = b_view; + EXPECT_EQ( a_view, c_view ); + EXPECT_EQ( a_view.getView(), c_view ); + EXPECT_EQ( a_view.getConstView(), c_view.getConstView() ); + EXPECT_EQ( a.getConstView(), c.getConstView() ); + EXPECT_EQ( a.getConstView(), c_view.getConstView() ); +} + +//#include "GtestMissingError.h" +int main( int argc, char* argv[] ) +{ +//#ifdef HAVE_GTEST + ::testing::InitGoogleTest( &argc, argv ); + return RUN_ALL_TESTS(); +//#else +// throw GtestMissingError(); +//#endif +} diff --git a/src/UnitTests/Containers/ndarray/StaticNDArrayCudaTest.cu b/src/UnitTests/Containers/ndarray/StaticNDArrayCudaTest.cu new file mode 100644 index 0000000000000000000000000000000000000000..0a0a83dd83fae72ff0f1b5c349d81ba05ed0da65 --- /dev/null +++ b/src/UnitTests/Containers/ndarray/StaticNDArrayCudaTest.cu @@ -0,0 +1,88 @@ +#include "gtest/gtest.h" + +#include + +#include + +using namespace TNL::Containers; +using std::index_sequence; + +template< typename Array > +void expect_identity( const Array& a ) +{ + Array identity; + identity.setSize( a.getSize() ); + for( int i = 0; i < identity.getSize(); i++ ) + identity.setElement( i, i ); + EXPECT_EQ( a, identity ); +} + +// nvcc fuck-up: __host__ __device__ lambdas cannot be inside protected/private class methods +void __test_SetThroughView() +{ + constexpr int I = 3, J = 5; + using ViewType = typename StaticNDArray< int, SizesHolder< int, I, J > >::ViewType; + NDArray< int, + SizesHolder< int, I, J >, + std::make_index_sequence< 2 >, + TNL::Devices::Cuda > a; + a.setSizes( 0, 0 ); + ViewType a_view( a.getStorageArray().getData(), SizesHolder< int, I, J >{} ); + + auto kernel = [] __cuda_callable__ ( int, ViewType a ) { + int v = 0; + for( int i = 0; i < I; i++ ) + for( int j = 
0; j < J; j++ ) + a( i, j ) = v++; + }; + + a.setValue(0); + TNL::ParallelFor< TNL::Devices::Cuda >::exec( 0, 1, kernel, a_view ); + expect_identity( a.getStorageArray() ); +} +TEST( StaticNDArrayCudaTest, SetThroughView ) +{ + __test_SetThroughView(); +} + +// nvcc fuck-up: __host__ __device__ lambdas cannot be inside protected/private class methods +void __test_CopyFromArray() +{ + constexpr int I = 3, J = 5; + using ViewType = typename StaticNDArray< int, SizesHolder< int, I, J > >::ViewType; + NDArray< int, + SizesHolder< int, I, J >, + std::make_index_sequence< 2 >, + TNL::Devices::Cuda > a; + a.setSizes( 0, 0 ); + ViewType a_view( a.getStorageArray().getData(), SizesHolder< int, I, J >{} ); + + auto kernel = [] __cuda_callable__ ( int, ViewType a ) { + StaticNDArray< int, SizesHolder< int, I, J > > b; + int v = 0; + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + b( i, j ) = v++; + a = b.getView(); + a( 0, 0 ) = a != b.getView(); + }; + + a.setValue(0); + TNL::ParallelFor< TNL::Devices::Cuda >::exec( 0, 1, kernel, a_view ); + expect_identity( a.getStorageArray() ); +} +TEST( StaticNDArrayCudaTest, CopyFromArray ) +{ + __test_CopyFromArray(); +} + +//#include "GtestMissingError.h" +int main( int argc, char* argv[] ) +{ +//#ifdef HAVE_GTEST + ::testing::InitGoogleTest( &argc, argv ); + return RUN_ALL_TESTS(); +//#else +// throw GtestMissingError(); +//#endif +} diff --git a/src/UnitTests/Containers/ndarray/StaticNDArrayTest.cpp b/src/UnitTests/Containers/ndarray/StaticNDArrayTest.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e3ea290f26b92723d0490873f7caa263fee35f36 --- /dev/null +++ b/src/UnitTests/Containers/ndarray/StaticNDArrayTest.cpp @@ -0,0 +1,105 @@ +#include "gtest/gtest.h" + +#include + +using namespace TNL::Containers; +using std::index_sequence; + +template< typename Array > +void expect_identity( const Array& a ) +{ + Array identity; + for( int i = 0; i < identity.getSize(); i++ ) + identity[ i ] = i; + 
EXPECT_EQ( a, identity ); +} + +TEST( StaticNDArrayTest, Static_2D_Identity ) +{ + constexpr int I = 3, J = 5; + StaticNDArray< int, SizesHolder< int, I, J > > a; + + int v = 0; + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + a( i, j ) = v++; + + expect_identity( a.getStorageArray() ); +} + +TEST( StaticNDArrayTest, Static_2D_Permuted ) +{ + constexpr int I = 3, J = 5; + StaticNDArray< int, + SizesHolder< int, I, J >, + index_sequence< 1, 0 > > a; + + int v = 0; + for( int j = 0; j < J; j++ ) + for( int i = 0; i < I; i++ ) + a( i, j ) = v++; + + expect_identity( a.getStorageArray() ); +} + +TEST( StaticNDArrayTest, Static_6D_Permuted ) +{ + constexpr int I = 2, J = 2, K = 2, L = 2, M = 2, N = 2; + StaticNDArray< int, + SizesHolder< int, I, J, K, L, M, N >, + index_sequence< 5, 3, 4, 2, 0, 1 > > a; + + int v = 0; + for( int n = 0; n < N; n++ ) + for( int l = 0; l < L; l++ ) + for( int m = 0; m < M; m++ ) + for( int k = 0; k < K; k++ ) + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + a( i, j, k, l, m, n ) = v++; + + expect_identity( a.getStorageArray() ); +} + +TEST( StaticNDArrayTest, CopySemantics ) +{ + constexpr int I = 3, J = 5; + StaticNDArray< int, SizesHolder< int, I, J > > a, b, c; + + int v = 0; + for( int i = 0; i < I; i++ ) + for( int j = 0; j < J; j++ ) + a( i, j ) = v++; + + expect_identity( a.getStorageArray() ); + + b = a; + EXPECT_EQ( a, b ); + + auto a_view = a.getView(); + auto b_view = b.getView(); + EXPECT_EQ( a_view, b_view ); + EXPECT_EQ( a_view.getView(), b_view ); + EXPECT_EQ( a_view.getConstView(), b_view.getConstView() ); + EXPECT_EQ( a.getConstView(), b.getConstView() ); + EXPECT_EQ( a.getConstView(), b_view.getConstView() ); + + auto c_view = c.getView(); + c_view = b_view; + EXPECT_EQ( a_view, c_view ); + EXPECT_EQ( a_view.getView(), c_view ); + EXPECT_EQ( a_view.getConstView(), c_view.getConstView() ); + EXPECT_EQ( a.getConstView(), c.getConstView() ); + EXPECT_EQ( a.getConstView(), c_view.getConstView() 
); +} + +//#include "GtestMissingError.h" +int main( int argc, char* argv[] ) +{ +//#ifdef HAVE_GTEST + ::testing::InitGoogleTest( &argc, argv ); + return RUN_ALL_TESTS(); +//#else +// throw GtestMissingError(); +//#endif +}