Commit 84b7a213 authored by Jakub Klinkovský's avatar Jakub Klinkovský

Merge branch 'JK/expressions' into 'develop'

Fixed expression templates

Brief summary:

- fixed `DistributedExpressionTemplates`
- many fixes and simplifications in `ExpressionTemplates` and `StaticExpressionTemplates`
- extended tests (unified for `VectorExpressions`, `VectorViewExpressions`, `StaticVectorExpressions`, `DistributedVectorExpressions`, `DistributedVectorViewExpressions`)
- added functions `maxNorm`, `l1Norm` and `l2Norm` as convenient aliases for `max(abs(...))`, `lpNorm(..., 1)` and `lpNorm(..., 2)`
- added a `cast` function which makes expressions such as
  `float s = TNL::sum(cast<float>(vector))` possible, where the vector elements may be `int`, `double`, etc. (see the sketch after this list)
- fixed result types in expression templates and vertical operations
- removed `addVector`, `addVectors`, `addElement`, `scalarProduct` and `sum` methods from all vector types
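A minimal usage sketch of the new functions follows. It is illustrative only: the header path and the namespaces in which `cast`, `maxNorm`, `l1Norm` and `l2Norm` live are assumptions based on TNL conventions, not confirmed by this merge request.

```cpp
// Illustrative sketch only -- the header path and the exact namespaces of
// cast/maxNorm/l1Norm/l2Norm are assumptions, not taken from this diff.
#include <TNL/Containers/Vector.h>

using namespace TNL;
using namespace TNL::Containers;

int main()
{
   Vector< int, Devices::Host > v( 100 );
   v.setValue( -2 );

   double linf = maxNorm( v );   // alias for max( abs( v ) )
   double l1   = l1Norm( v );    // alias for lpNorm( v, 1 )
   double l2   = l2Norm( v );    // alias for lpNorm( v, 2 )

   // cast<float> changes the element type inside the expression, so the
   // reduction accumulates (and returns) float even though v holds ints
   float s = sum( cast< float >( v ) );
}
```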


See merge request !36
parents 500523a8 e4ed2628
+1 −5
@@ -33,11 +33,6 @@ stages:
.build_template_def: &build_template
    stage: build
    script:
-       # set MPI compiler wrapper
-       - if [[ ${WITH_MPI} == "yes" ]]; then
-               export CXX=mpicxx;
-               export CC=mpicc;
-         fi
        # all cores including hyperthreading
#        - export NUM_CORES=$(grep "core id" /proc/cpuinfo | wc -l)
#       # all physical cores
@@ -53,6 +48,7 @@ stages:
                -DCMAKE_BUILD_TYPE=${BUILD_TYPE}
                -DCMAKE_INSTALL_PREFIX=$(pwd)/${BUILD_TYPE}_install_prefix
                -DWITH_OPENMP=${WITH_OPENMP}
+               -DWITH_MPI=${WITH_MPI}
                -DWITH_CUDA=${WITH_CUDA}
                -DWITH_CUDA_ARCH=${WITH_CUDA_ARCH}
                -DWITH_MIC=${WITH_MIC}
+41 −37
@@ -21,6 +21,7 @@ option(WITH_MIC "Build with MIC support" OFF)
option(WITH_CUDA "Build with CUDA support" ON)
set(WITH_CUDA_ARCH "auto" CACHE STRING "Build for these CUDA architectures")
option(WITH_OPENMP "Build with OpenMP support" ON)
+option(WITH_MPI "Build with MPI support" ON)
option(WITH_GMP "Build with GMP support" OFF)
option(WITH_TESTS "Build tests" ON)
option(WITH_PROFILING "Enable code profiling compiler flags" OFF )
@@ -39,10 +40,11 @@ set( TNL_TARGET_DATA_DIRECTORY "share/TNL" )
# the cmake directory in the TNL repository
set( CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake" )

-# Note that in cmake 3.10 the FindOpenMP module is broken - it does not work when
+# Note that in cmake 3.10 the FindOpenMP and FindMPI modules are broken - they do not work when
# CMAKE_EXECUTABLE_SUFFIX is not empty, see https://www.mail-archive.com/cmake@cmake.org/msg56886.html
-# Hence, we find OpenMP before setting CMAKE_EXECUTABLE_SUFFIX.
+# Hence, we find OpenMP and MPI before setting CMAKE_EXECUTABLE_SUFFIX.
find_package( OpenMP )
+find_package( MPI )

####
# Settings for debug/release version
@@ -127,27 +129,40 @@ if( DEFINED ENV{CI_JOB_NAME} OR ${CMAKE_GENERATOR} STREQUAL "Ninja" )
   endif()
endif()

-#####
-# Check for MPI -- detect it from the compiler wrapper -- building without MPI can be tested
+# gtest has to be built before we add the MPI flags
+if( ${WITH_TESTS} )
+   enable_testing()
+
+   # build gtest libs
+   include( BuildGtest )
+
+   if( ${WITH_COVERAGE} AND CMAKE_BUILD_TYPE STREQUAL "Debug" )
+      # enable code coverage reports
+      include( UseCodeCoverage )
+   endif()
+endif()

+####
+# Check for OpenMP
+#
-get_filename_component( CXX_COMPILER_NAME ${CMAKE_CXX_COMPILER} NAME )
-if( ${CXX_COMPILER_NAME} STREQUAL "mpicxx" )
-   message( "MPI compiler detected."    )
-   set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DHAVE_MPI" )
-   set( CUDA_HOST_COMPILER "mpicxx" )
-   set( BUILD_MPI ON )
+if( OPENMP_FOUND AND ${WITH_OPENMP} )
+   set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DHAVE_OPENMP ${OpenMP_CXX_FLAGS}" )
endif()

####
-# Check for MPI -- not working
+# Check for MPI
#
-#find_package( MPI )
-#if( MPI_CXX_FOUND )
-   # set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DHAVE_MPI" )
-   # message( "MPI headers found -- ${MPI_CXX_INCLUDE_PATH}")
-   # message( "MPI link flags  -- ${MPI_CXX_LINK_FLAGS}")
-   # message( "MPI libraries-- ${MPI_CXX_LIBRARIES}")
-#endif()
+if( MPI_CXX_FOUND AND ${WITH_MPI} )
+   set( BUILD_MPI TRUE)
+   # add the appropriate flags to all targets (will be hidden from the CMAKE_CXX_* variables)
+   include_directories( ${MPI_CXX_INCLUDE_DIRS} )
+   add_compile_options( ${MPI_CXX_COMPILE_OPTIONS} )
+   add_compile_definitions( ${MPI_CXX_COMPILE_DEFINITIONS} )
+   add_link_options( "SHELL:${MPI_CXX_LINK_FLAGS}" )
+   link_libraries( ${MPI_CXX_LIBRARIES} )
+   # enable MPI in TNL
+   set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DHAVE_MPI" )
+endif()

#####
# Check for CUDA
@@ -243,13 +258,6 @@ if( ${WITH_CUDA} )
endif()


-####
-# Check for OpenMP
-#
-if( OPENMP_FOUND AND ${WITH_OPENMP} )
-   set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DHAVE_OPENMP ${OpenMP_CXX_FLAGS}" )
-endif()
-
if( ${WITH_PROFILING} )
    set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g" )
    set( CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --generate-line-info")
@@ -304,18 +312,6 @@ if( ${WITH_GMP} )
   endif()
endif()

-if( ${WITH_TESTS} )
-   enable_testing()
-
-   # build gtest libs
-   include( BuildGtest )
-
-   if( ${WITH_COVERAGE} AND CMAKE_BUILD_TYPE STREQUAL "Debug" )
-      # enable code coverage reports
-      include( UseCodeCoverage )
-   endif()
-endif()
-
#if( BUILD_MPI )
#   FIND_PATH( PETSC_INCLUDE_DIR petsc.h
#     /usr/include/petsc
@@ -396,6 +392,7 @@ message( " WITH_MIC = ${WITH_MIC}" )
message( "   WITH_CUDA = ${WITH_CUDA}" )
message( "   WITH_CUDA_ARCH = ${WITH_CUDA_ARCH}" )
message( "   WITH_OPENMP = ${WITH_OPENMP}" )
message( "   WITH_MPI = ${WITH_MPI}" )
message( "   WITH_GMP = ${WITH_GMP}" )
message( "   WITH_TESTS = ${WITH_TESTS}" )
message( "   WITH_PROFILING = ${WITH_PROFILING}" )
@@ -420,3 +417,10 @@ message( " CMAKE_SHARED_LINKER_FLAGS_DEBUG = ${CMAKE_SHARED_LINKER_FLAGS_DEBUG
message( "   CMAKE_SHARED_LINKER_FLAGS_RELEASE = ${CMAKE_SHARED_LINKER_FLAGS_RELEASE}" )
message( "   CUDA_NVCC_FLAGS = ${CUDA_NVCC_FLAGS}" )
message( "   GMP_LIBRARIES = ${GMP_LIBRARIES}" )
+if( MPI_CXX_FOUND AND ${WITH_MPI} )
+   message( "   MPI_CXX_COMPILE_OPTIONS = ${MPI_CXX_COMPILE_OPTIONS}" )
+   message( "   MPI_CXX_COMPILE_DEFINITIONS = ${MPI_CXX_COMPILE_DEFINITIONS}" )
+   message( "   MPI_CXX_INCLUDE_DIRS = ${MPI_CXX_INCLUDE_DIRS}" )
+   message( "   MPI_CXX_LINK_FLAGS = ${MPI_CXX_LINK_FLAGS}" )
+   message( "   MPI_CXX_LIBRARIES = ${MPI_CXX_LIBRARIES}" )
+endif()
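For orientation, the `-DHAVE_MPI` definition that the new CMake block appends to `CMAKE_CXX_FLAGS` is what the C++ sources can test at compile time. A minimal sketch of that pattern (only the macro name comes from the diff; the surrounding code is illustrative):

```cpp
// Minimal sketch: guard MPI-specific code by the HAVE_MPI macro that the
// CMake logic above defines. Only the macro name is taken from the diff.
#ifdef HAVE_MPI
   #include <mpi.h>
#endif

int main( int argc, char* argv[] )
{
#ifdef HAVE_MPI
   MPI_Init( &argc, &argv );
   int rank = 0;
   MPI_Comm_rank( MPI_COMM_WORLD, &rank );
   // ... distributed code path ...
   MPI_Finalize();
#else
   // ... sequential code path ...
#endif
   return 0;
}
```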
+52 −0
\page comparison_with_other_libraries  Comparison with other libraries

## Memory space and execution model

TNL has separate concepts for the memory space and execution model, which are
represented by different template parameters. See the \ref core_concepts
"Core concepts" page for details.

- Most other libraries have separate types for CPU and GPU data structures
  (e.g. `Vector` and `cuVector`):
  - [Thrust](https://github.com/thrust/thrust/): `host_vector`, `device_vector`
    (plus macro-based selection, see below)
  - [Paralution](http://www.paralution.com/documentation/): `HostVector`, `AcceleratorVector`
  - [Bandicoot](https://coot.sourceforge.io/), [Kaldi](http://kaldi-asr.org/doc/about.html)
- These libraries have the concept of a "memory space" which is configurable as
  a template parameter:
  - [CUV](https://github.com/deeplearningais/CUV)
  - [CUSP](http://cusplibrary.github.io/classcusp_1_1array1d.html) - note that
    CUSP uses Thrust, so `device_memory` may be the same as `host_memory` when
    OpenMP is used as the `device`
  - [Kokkos](https://github.com/kokkos/kokkos) - it has the concepts of a
    "memory space" and an "execution space", but there is also a default choice
    of the spaces, possibly made even through command-line arguments (in which
    case the array type would have to be polymorphic, because something has to
    store the current memory/execution space)
- These libraries have transparent access to the data from GPU and CPU:
  - the CUDA toolkit itself, via `cudaMallocManaged`
  - [cudarrays](https://github.com/cudarrays/cudarrays) - they have a custom
    virtual memory system using `cudaMalloc` and the standard host allocator
- These libraries select the (default) device based on a macro
  (this approach is too simplistic, because multiple different devices cannot
  be combined in one program):
  - Thrust: see [device backends](https://github.com/thrust/thrust/wiki/Device-Backends)
    and [host backends](https://github.com/thrust/thrust/wiki/Host-Backends)
- These libraries do not abstract memory space, only execution model:
  - [RAJA](https://github.com/LLNL/RAJA)
  - [Nebo](https://www.sciencedirect.com/science/article/pii/S0164121216000182)
    (also with a macro-based selection)

## Multidimensional arrays

TODO: compare the implementation of multidimensional arrays
(features described in the merge request: https://mmg-gitlab.fjfi.cvut.cz/gitlab/tnl/tnl-dev/merge_requests/18 )

- http://cpptruths.blogspot.cz/2011/10/multi-dimensional-arrays-in-c11.html
- http://www.nongnu.org/tensors/ (last commit in 2012)
- https://bitbucket.org/wlandry/ftensor/src
- [Eigen tensors](https://bitbucket.org/eigen/eigen/src/default/unsupported/Eigen/CXX11/src/Tensor/README.md?at=default&fileviewer=file-view-default) - Many operations, expression templates, either pure-static or pure-dynamic sizes, only column-major format (row-major support is incomplete), little GPU support.
- [cudarrays](https://github.com/cudarrays/cudarrays) - Only up to 3D arrays, both static and dynamic, compile-time permutations using `std::tuple`.
- [RAJA](https://github.com/LLNL/RAJA) - No memory management, views are initialized with a raw pointer, index permutations are initialized at runtime, only dynamic dimensions.
- [Kokkos](https://github.com/kokkos/kokkos) - Configurable layout and default selection based on the memory/execution space, but only AoS and SoA are considered, even for `N > 2`. For parallel work there is only one leading dimension - it does not map to 2D or 3D CUDA grids.
- [CUV](https://github.com/deeplearningais/CUV) - Assumption that "everything is an n-dimensional array" (like Matlab), CPU and GPU support, column-major or row-major, integration with Python and Numpy.
+12 −3
@@ -11,15 +11,24 @@ TNL is based on the following core concepts:
   (TODO: rename to `Executor` or something like that)
   - Device is responsible for the execution of algorithms in a specific way.
   - Algorithms can be specialized by the `Device` template parameter.
-3. \ref TNL::Containers::Algorithms "Algorithms"
+3. \ref TNL::Communicators "Communicators"
+   - Communicators represent the main abstraction for distributed computations,
+     where multiple programs (or instances of the same program) have to
+     communicate with each other.
+   - At present, there are only two communicators:
+     \ref TNL::Communicators::MpiCommunicator "MpiCommunicator"
+     (uses [MPI](https://en.wikipedia.org/wiki/Message_Passing_Interface)) and
+     \ref TNL::Communicators::NoDistrCommunicator "NoDistrCommunicator"
+     (dummy communicator without any distribution support).
+4. \ref TNL::Containers::Algorithms "Algorithms"
   - Basic (container-free) algorithms specialized by `Device`/`Executor`.
   - `ParallelFor`, `Reduction`, `MultiReduction`, `ArrayOperations`, ...
4. \ref TNL::Containers "Containers"
5. \ref TNL::Containers "Containers"
   - Classes for general data structures.
     (TODO: alternatively use "Dense" and "Sparse", because a dense matrix can
     be an extended alias for 2D array)
   - `Array`, `Vector` (also `VectorOperations`), `NDArray`, ...
-5. Views
+6. Views
   - Views wrap only a raw pointer to data and some metadata (such as the array
     size), they do not do allocation and deallocation of the data. Hence, views
     have a fixed size which cannot be changed.
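
A short sketch of the view concept described in the last item (`ArrayView` and `getView` follow TNL's naming, but the exact API shown here is an assumption):

```cpp
// Sketch of the view concept: a view wraps a raw pointer plus metadata such
// as the size and never allocates or frees memory. The getView/setElement
// API shown here follows TNL naming but is an assumption of this sketch.
#include <TNL/Containers/Array.h>
#include <TNL/Containers/ArrayView.h>
#include <TNL/Devices/Host.h>

using namespace TNL;
using namespace TNL::Containers;

int main()
{
   Array< double, Devices::Host > a( 10 );
   a.setValue( 0.0 );

   // the view refers to a's data; no allocation happens here
   ArrayView< double, Devices::Host > view = a.getView();
   view.setElement( 3, 3.14 );   // writes through to the array's data

   // a view has a fixed size -- resizing must go through the array itself,
   // which invalidates previously obtained views
   a.setSize( 20 );
}
```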
+2 −0
@@ -51,6 +51,8 @@ several modules:
  [libpng](http://www.libpng.org/pub/png/libpng.html) for PNG files, or
  [libjpeg](http://libjpeg.sourceforge.net/) for JPEG files.

+See also \ref comparison_with_other_libraries "Comparison with other libraries".
+
## Installation

You can either download the [stable version](http://tnl-project.org/download/)