Commit 84b7a213 authored by Jakub Klinkovský's avatar Jakub Klinkovský

Merge branch 'JK/expressions' into 'develop'

Fixed expression templates

Brief summary:

- fixed `DistributedExpressionTemplates`
- many fixes and simplifications in `ExpressionTemplates` and `StaticExpressionTemplates`
- extended tests (unified for `VectorExpressions`, `VectorViewExpressions`, `StaticVectorExpressions`, `DistributedVectorExpressions`, `DistributedVectorViewExpressions`)
- added functions `maxNorm`, `l1Norm` and `l2Norm` as convenient aliases for `max(abs(...))`, `lpNorm(..., 1)` and `lpNorm(..., 2)`
- added a `cast` function which makes expressions such as
  `float s = TNL::sum(cast<float>(vector))` possible, where the vector elements may be `int`, `double`, etc. (see the sketch after this list)
- fixed result types in expression templates and vertical operations
- removed `addVector`, `addVectors`, `addElement`, `scalarProduct` and `sum` methods from all vector types
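A minimal usage sketch of the new functions follows. It is illustrative only: the header path and the namespaces in which `cast`, `maxNorm`, `l1Norm` and `l2Norm` live are assumptions based on TNL conventions, not confirmed by this merge request.

```cpp
// Illustrative sketch only -- the header path and the exact namespaces of
// cast/maxNorm/l1Norm/l2Norm are assumptions, not taken from this diff.
#include <TNL/Containers/Vector.h>

using namespace TNL;
using namespace TNL::Containers;

int main()
{
   Vector< int, Devices::Host > v( 100 );
   v.setValue( -2 );

   double linf = maxNorm( v );   // alias for max( abs( v ) )
   double l1   = l1Norm( v );    // alias for lpNorm( v, 1 )
   double l2   = l2Norm( v );    // alias for lpNorm( v, 2 )

   // cast<float> changes the element type inside the expression, so the
   // reduction accumulates (and returns) float even though v holds ints
   float s = sum( cast< float >( v ) );
}
```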


See merge request !36
parents 500523a8 e4ed2628
+1 −5
@@ -33,11 +33,6 @@ stages:
.build_template_def: &build_template
    stage: build
    script:
-       # set MPI compiler wrapper
-       - if [[ ${WITH_MPI} == "yes" ]]; then
-               export CXX=mpicxx;
-               export CC=mpicc;
-         fi
        # all cores including hyperthreading
#        - export NUM_CORES=$(grep "core id" /proc/cpuinfo | wc -l)
#       # all physical cores
@@ -53,6 +48,7 @@ stages:
                -DCMAKE_BUILD_TYPE=${BUILD_TYPE}
                -DCMAKE_INSTALL_PREFIX=$(pwd)/${BUILD_TYPE}_install_prefix
                -DWITH_OPENMP=${WITH_OPENMP}
+               -DWITH_MPI=${WITH_MPI}
                -DWITH_CUDA=${WITH_CUDA}
                -DWITH_CUDA_ARCH=${WITH_CUDA_ARCH}
                -DWITH_MIC=${WITH_MIC}
+41 −37
@@ -21,6 +21,7 @@ option(WITH_MIC "Build with MIC support" OFF)
option(WITH_CUDA "Build with CUDA support" ON)
set(WITH_CUDA_ARCH "auto" CACHE STRING "Build for these CUDA architectures")
option(WITH_OPENMP "Build with OpenMP support" ON)
+option(WITH_MPI "Build with MPI support" ON)
option(WITH_GMP "Build with GMP support" OFF)
option(WITH_TESTS "Build tests" ON)
option(WITH_PROFILING "Enable code profiling compiler flags" OFF )
@@ -39,10 +40,11 @@ set( TNL_TARGET_DATA_DIRECTORY "share/TNL" )
# the cmake directory in the TNL repository
set( CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake" )

-# Note that in cmake 3.10 the FindOpenMP module is broken - it does not work when
+# Note that in cmake 3.10 the FindOpenMP and FindMPI modules are broken - they do not work when
# CMAKE_EXECUTABLE_SUFFIX is not empty, see https://www.mail-archive.com/cmake@cmake.org/msg56886.html
-# Hence, we find OpenMP before setting CMAKE_EXECUTABLE_SUFFIX.
+# Hence, we find OpenMP and MPI before setting CMAKE_EXECUTABLE_SUFFIX.
find_package( OpenMP )
+find_package( MPI )

####
# Settings for debug/release version
@@ -127,27 +129,40 @@ if( DEFINED ENV{CI_JOB_NAME} OR ${CMAKE_GENERATOR} STREQUAL "Ninja" )
   endif()
endif()

-#####
-# Check for MPI -- detect it from the compiler wrapper -- building without MPI can be tested
+# gtest has to be built before we add the MPI flags
+if( ${WITH_TESTS} )
+   enable_testing()
+
+   # build gtest libs
+   include( BuildGtest )
+
+   if( ${WITH_COVERAGE} AND CMAKE_BUILD_TYPE STREQUAL "Debug" )
+      # enable code coverage reports
+      include( UseCodeCoverage )
+   endif()
+endif()

+####
+# Check for OpenMP
+#
-get_filename_component( CXX_COMPILER_NAME ${CMAKE_CXX_COMPILER} NAME )
-if( ${CXX_COMPILER_NAME} STREQUAL "mpicxx" )
-   message( "MPI compiler detected."    )
-   set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DHAVE_MPI" )
-   set( CUDA_HOST_COMPILER "mpicxx" )
-   set( BUILD_MPI ON )
+if( OPENMP_FOUND AND ${WITH_OPENMP} )
+   set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DHAVE_OPENMP ${OpenMP_CXX_FLAGS}" )
endif()

####
-# Check for MPI -- not working
+# Check for MPI
#
-#find_package( MPI )
-#if( MPI_CXX_FOUND )
-   # set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DHAVE_MPI" )
-   # message( "MPI headers found -- ${MPI_CXX_INCLUDE_PATH}")
-   # message( "MPI link flags  -- ${MPI_CXX_LINK_FLAGS}")
-   # message( "MPI libraries-- ${MPI_CXX_LIBRARIES}")
-#endif()
+if( MPI_CXX_FOUND AND ${WITH_MPI} )
+   set( BUILD_MPI TRUE)
+   # add the appropriate flags to all targets (will be hidden from the CMAKE_CXX_* variables)
+   include_directories( ${MPI_CXX_INCLUDE_DIRS} )
+   add_compile_options( ${MPI_CXX_COMPILE_OPTIONS} )
+   add_compile_definitions( ${MPI_CXX_COMPILE_DEFINITIONS} )
+   add_link_options( "SHELL:${MPI_CXX_LINK_FLAGS}" )
+   link_libraries( ${MPI_CXX_LIBRARIES} )
+   # enable MPI in TNL
+   set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DHAVE_MPI" )
+endif()

#####
# Check for CUDA
@@ -243,13 +258,6 @@ if( ${WITH_CUDA} )
endif()


-####
-# Check for OpenMP
-#
-if( OPENMP_FOUND AND ${WITH_OPENMP} )
-   set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DHAVE_OPENMP ${OpenMP_CXX_FLAGS}" )
-endif()
-
if( ${WITH_PROFILING} )
    set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g" )
    set( CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --generate-line-info")
@@ -304,18 +312,6 @@ if( ${WITH_GMP} )
   endif()
endif()

-if( ${WITH_TESTS} )
-   enable_testing()
-
-   # build gtest libs
-   include( BuildGtest )
-
-   if( ${WITH_COVERAGE} AND CMAKE_BUILD_TYPE STREQUAL "Debug" )
-      # enable code coverage reports
-      include( UseCodeCoverage )
-   endif()
-endif()
-
#if( BUILD_MPI )
#   FIND_PATH( PETSC_INCLUDE_DIR petsc.h
#     /usr/include/petsc
@@ -396,6 +392,7 @@ message( " WITH_MIC = ${WITH_MIC}" )
message( "   WITH_CUDA = ${WITH_CUDA}" )
message( "   WITH_CUDA_ARCH = ${WITH_CUDA_ARCH}" )
message( "   WITH_OPENMP = ${WITH_OPENMP}" )
message( "   WITH_MPI = ${WITH_MPI}" )
message( "   WITH_GMP = ${WITH_GMP}" )
message( "   WITH_TESTS = ${WITH_TESTS}" )
message( "   WITH_PROFILING = ${WITH_PROFILING}" )
@@ -420,3 +417,10 @@ message( " CMAKE_SHARED_LINKER_FLAGS_DEBUG = ${CMAKE_SHARED_LINKER_FLAGS_DEBUG
message( "   CMAKE_SHARED_LINKER_FLAGS_RELEASE = ${CMAKE_SHARED_LINKER_FLAGS_RELEASE}" )
message( "   CUDA_NVCC_FLAGS = ${CUDA_NVCC_FLAGS}" )
message( "   GMP_LIBRARIES = ${GMP_LIBRARIES}" )
+if( MPI_CXX_FOUND AND ${WITH_MPI} )
+   message( "   MPI_CXX_COMPILE_OPTIONS = ${MPI_CXX_COMPILE_OPTIONS}" )
+   message( "   MPI_CXX_COMPILE_DEFINITIONS = ${MPI_CXX_COMPILE_DEFINITIONS}" )
+   message( "   MPI_CXX_INCLUDE_DIRS = ${MPI_CXX_INCLUDE_DIRS}" )
+   message( "   MPI_CXX_LINK_FLAGS = ${MPI_CXX_LINK_FLAGS}" )
+   message( "   MPI_CXX_LIBRARIES = ${MPI_CXX_LIBRARIES}" )
+endif()
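For orientation, the `-DHAVE_MPI` definition that the new CMake block appends to `CMAKE_CXX_FLAGS` is what the C++ sources can test at compile time. A minimal sketch of that pattern (only the macro name comes from the diff; the surrounding code is illustrative):

```cpp
// Minimal sketch: guard MPI-specific code by the HAVE_MPI macro that the
// CMake logic above defines. Only the macro name is taken from the diff.
#ifdef HAVE_MPI
   #include <mpi.h>
#endif

int main( int argc, char* argv[] )
{
#ifdef HAVE_MPI
   MPI_Init( &argc, &argv );
   int rank = 0;
   MPI_Comm_rank( MPI_COMM_WORLD, &rank );
   // ... distributed code path ...
   MPI_Finalize();
#else
   // ... sequential code path ...
#endif
   return 0;
}
```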
+52 −0
\page comparison_with_other_libraries  Comparison with other libraries

## Memory space and execution model

TNL has separate concepts for the memory space and execution model, which are
represented by different template parameters. See the \ref core_concepts
"Core concepts" page for details.

- Most other libraries have separate types for CPU and GPU data structures
  (e.g. `Vector` and `cuVector`):
  - [Thrust](https://github.com/thrust/thrust/): `host_vector`, `device_vector`
    (plus macro-based selection, see below)
  - [Paralution](http://www.paralution.com/documentation/): `HostVector`, `AcceleratorVector`
  - [Bandicoot](https://coot.sourceforge.io/), [Kaldi](http://kaldi-asr.org/doc/about.html)
- These libraries have the concept of a "memory space" which is configurable as
  a template parameter:
  - [CUV](https://github.com/deeplearningais/CUV)
  - [CUSP](http://cusplibrary.github.io/classcusp_1_1array1d.html) - note that
    CUSP uses Thrust, so `device_memory` may be the same as `host_memory` when
    OpenMP is used as the `device`
  - [Kokkos](https://github.com/kokkos/kokkos) - it has the concepts of a
    "memory space" and an "execution space", but there is also a default choice
    of the spaces, possibly made even through command-line arguments (in which
    case the array type would have to be polymorphic, because something has to
    store the current memory/execution space)
- These libraries have transparent access to the data from GPU and CPU:
  - the CUDA toolkit itself, via `cudaMallocManaged`
  - [cudarrays](https://github.com/cudarrays/cudarrays) - they have a custom
    virtual memory system using `cudaMalloc` and the standard host allocator
- These libraries select the (default) device based on a macro
  (this approach is too simplistic, because multiple different devices cannot
  be combined in one program):
  - Thrust: see [device backends](https://github.com/thrust/thrust/wiki/Device-Backends)
    and [host backends](https://github.com/thrust/thrust/wiki/Host-Backends)
- These libraries do not abstract memory space, only execution model:
  - [RAJA](https://github.com/LLNL/RAJA)
  - [Nebo](https://www.sciencedirect.com/science/article/pii/S0164121216000182)
    (also with a macro-based selection)

## Multidimensional arrays

TODO: compare the implementation of multidimensional arrays
(features described in the merge request: https://mmg-gitlab.fjfi.cvut.cz/gitlab/tnl/tnl-dev/merge_requests/18 )

- http://cpptruths.blogspot.cz/2011/10/multi-dimensional-arrays-in-c11.html
- http://www.nongnu.org/tensors/ (last commit in 2012)
- https://bitbucket.org/wlandry/ftensor/src
- [Eigen tensors](https://bitbucket.org/eigen/eigen/src/default/unsupported/Eigen/CXX11/src/Tensor/README.md?at=default&fileviewer=file-view-default) - Many operations, expression templates, either pure-static or pure-dynamic sizes, only column-major format (row-major support is incomplete), little GPU support.
- [cudarrays](https://github.com/cudarrays/cudarrays) - Only up to 3D arrays, both static and dynamic, compile-time permutations using `std::tuple`.
- [RAJA](https://github.com/LLNL/RAJA) - No memory management, views are initialized with a raw pointer, index permutations are initialized at runtime, only dynamic dimensions.
- [Kokkos](https://github.com/kokkos/kokkos) - Configurable layout and default selection based on the memory/execution space, but only AoS and SoA are considered, even for `N > 2`. For parallel work there is only one leading dimension - it does not map to 2D or 3D CUDA grids.
- [CUV](https://github.com/deeplearningais/CUV) - Assumption that "everything is an n-dimensional array" (like Matlab), CPU and GPU support, column-major or row-major, integration with Python and Numpy.
+12 −3
@@ -11,15 +11,24 @@ TNL is based on the following core concepts:
   (TODO: rename to `Executor` or something like that)
   - Device is responsible for the execution of algorithms in a specific way.
   - Algorithms can be specialized by the `Device` template parameter.
-3. \ref TNL::Containers::Algorithms "Algorithms"
+3. \ref TNL::Communicators "Communicators"
+   - Communicators represent the main abstraction for distributed computations,
+     where multiple programs (or instances of the same program) have to
+     communicate with each other.
+   - At present, there are only two communicators:
+     \ref TNL::Communicators::MpiCommunicator "MpiCommunicator"
+     (uses [MPI](https://en.wikipedia.org/wiki/Message_Passing_Interface)) and
+     \ref TNL::Communicators::NoDistrCommunicator "NoDistrCommunicator"
+     (dummy communicator without any distribution support).
+4. \ref TNL::Containers::Algorithms "Algorithms"
   - Basic (container-free) algorithms specialized by `Device`/`Executor`.
   - `ParallelFor`, `Reduction`, `MultiReduction`, `ArrayOperations`, ...
4. \ref TNL::Containers "Containers"
5. \ref TNL::Containers "Containers"
   - Classes for general data structures.
     (TODO: alternatively use "Dense" and "Sparse", because a dense matrix can
     be an extended alias for 2D array)
   - `Array`, `Vector` (also `VectorOperations`), `NDArray`, ...
-5. Views
+6. Views
   - Views wrap only a raw pointer to data and some metadata (such as the array
     size), they do not do allocation and deallocation of the data. Hence, views
     have a fixed size which cannot be changed.
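
A short sketch of the view concept described in the last item (`ArrayView` and `getView` follow TNL's naming, but the exact API shown here is an assumption):

```cpp
// Sketch of the view concept: a view wraps a raw pointer plus metadata such
// as the size and never allocates or frees memory. The getView/setElement
// API shown here follows TNL naming but is an assumption of this sketch.
#include <TNL/Containers/Array.h>
#include <TNL/Containers/ArrayView.h>
#include <TNL/Devices/Host.h>

using namespace TNL;
using namespace TNL::Containers;

int main()
{
   Array< double, Devices::Host > a( 10 );
   a.setValue( 0.0 );

   // the view refers to a's data; no allocation happens here
   ArrayView< double, Devices::Host > view = a.getView();
   view.setElement( 3, 3.14 );   // writes through to the array's data

   // a view has a fixed size -- resizing must go through the array itself,
   // which invalidates previously obtained views
   a.setSize( 20 );
}
```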
+2 −0
@@ -51,6 +51,8 @@ several modules:
  [libpng](http://www.libpng.org/pub/png/libpng.html) for PNG files, or
  [libjpeg](http://libjpeg.sourceforge.net/) for JPEG files.

+See also \ref comparison_with_other_libraries "Comparison with other libraries".
+
## Installation

You can either download the [stable version](http://tnl-project.org/download/)