diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index b4a98a9e1285fdbde4b2cefff816fca819bf9547..157374d4c7c1224768332810c40103f6aea4646c 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -18,14 +18,24 @@ stages: WITH_CUDA: "no" WITH_CUDA_ARCH: "auto" WITH_MIC: "no" + WITH_MPI: "no" WITH_TESTS: "yes" WITH_COVERAGE: "no" - WITH_EXAMPLES: "yes" + # these are built only in the "full" config + WITH_BENCHMARKS: "no" + WITH_EXAMPLES: "no" + WITH_TOOLS: "no" + WITH_PYTHON: "no" # template for build jobs .build_template_def: &build_template stage: build script: + # set MPI compiler wrapper + - if [[ ${WITH_MPI} == "yes" ]]; then + export CXX=mpicxx; + export CC=mpicc; + fi # all cores including hyperthreading # - export NUM_CORES=$(grep "core id" /proc/cpuinfo | wc -l) # # all pyhsical cores @@ -46,7 +56,10 @@ stages: -DWITH_MIC=${WITH_MIC} -DWITH_TESTS=${WITH_TESTS} -DWITH_COVERAGE=${WITH_COVERAGE} + -DWITH_BENCHMARKS=${WITH_BENCHMARKS} -DWITH_EXAMPLES=${WITH_EXAMPLES} + -DWITH_TOOLS=${WITH_TOOLS} + -DWITH_PYTHON=${WITH_PYTHON} # - make # - make test # - make install @@ -61,7 +74,7 @@ stages: # Cuda builds are specified first because they take more time than host-only builds, # which can be allocated on hosts whitout GPUs. -cuda_Debug: +cuda_base_Debug: <<: *build_template tags: - gpu @@ -70,7 +83,7 @@ cuda_Debug: WITH_CUDA: "yes" BUILD_TYPE: Debug -cuda_Release: +cuda_base_Release: <<: *build_template tags: - gpu @@ -79,38 +92,94 @@ cuda_Release: WITH_CUDA: "yes" BUILD_TYPE: Release -cuda+openmp_Debug: +cuda_mpi_Debug: <<: *build_template tags: - openmp - gpu + - mpi variables: <<: *default_cmake_flags WITH_OPENMP: "yes" WITH_CUDA: "yes" + WITH_MPI: "yes" BUILD_TYPE: Debug -cuda+openmp_Release: +cuda_mpi_Release: <<: *build_template tags: - openmp - gpu + - mpi variables: <<: *default_cmake_flags WITH_OPENMP: "yes" WITH_CUDA: "yes" + WITH_MPI: "yes" BUILD_TYPE: Release -default_Debug: +cuda_full_Debug: <<: *build_template + tags: + - openmp + - gpu + variables: + <<: *default_cmake_flags + WITH_OPENMP: "yes" + WITH_CUDA: "yes" + BUILD_TYPE: Debug + WITH_BENCHMARKS: "yes" + WITH_EXAMPLES: "yes" + WITH_TOOLS: "yes" + WITH_PYTHON: "yes" + +cuda_full_Release: + <<: *build_template + tags: + - openmp + - gpu + variables: + <<: *default_cmake_flags + WITH_OPENMP: "yes" + WITH_CUDA: "yes" + BUILD_TYPE: Release + WITH_BENCHMARKS: "yes" + WITH_EXAMPLES: "yes" + WITH_TOOLS: "yes" + WITH_PYTHON: "yes" -default_Release: +default_base_Debug: <<: *build_template + +default_base_Release: + <<: *build_template + variables: + <<: *default_cmake_flags + BUILD_TYPE: Release + +default_mpi_Debug: + <<: *build_template + tags: + - openmp + - mpi variables: <<: *default_cmake_flags + WITH_OPENMP: "yes" + WITH_MPI: "yes" + BUILD_TYPE: Debug + +default_mpi_Release: + <<: *build_template + tags: + - openmp + - mpi + variables: + <<: *default_cmake_flags + WITH_OPENMP: "yes" + WITH_MPI: "yes" BUILD_TYPE: Release -openmp_Debug: +default_full_Debug: <<: *build_template tags: - openmp @@ -118,8 +187,12 @@ openmp_Debug: <<: *default_cmake_flags WITH_OPENMP: "yes" BUILD_TYPE: Debug + WITH_BENCHMARKS: "yes" + WITH_EXAMPLES: "yes" + WITH_TOOLS: "yes" + WITH_PYTHON: "yes" -openmp_Release: +default_full_Release: <<: *build_template tags: - openmp @@ -127,3 +200,7 @@ openmp_Release: <<: *default_cmake_flags WITH_OPENMP: "yes" BUILD_TYPE: Release + WITH_BENCHMARKS: "yes" + WITH_EXAMPLES: "yes" + WITH_TOOLS: "yes" + WITH_PYTHON: "yes" diff --git a/CMakeLists.txt b/CMakeLists.txt index 1a8b1543fdc04a27aec514676a66beb81f9ebc12..8ed064eb1de0ebce012daf328173b420ada0f0a4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -29,6 +29,7 @@ option(WITH_TESTS "Build tests" ON) option(WITH_COVERAGE "Enable code coverage reports from unit tests" OFF) option(WITH_EXAMPLES "Compile the 'examples' directory" ON) option(WITH_TOOLS "Compile the 'src/Tools' directory" ON) +option(WITH_BENCHMARKS "Compile the 'src/Benchmarks' directory" ON) option(WITH_PYTHON "Compile the Python bindings" ON) option(WITH_TEMPLATES_INSTANTIATION "Enable explicit template instantiation" OFF) @@ -62,6 +63,20 @@ else() set( EXECUTABLE_OUTPUT_PATH ${PROJECT_SOURCE_DIR}/Release/bin ) endif() +# check if the compiler is good enough +if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") + # GCC 5.0 is the first release with full C++11 support (due to libstdc++) + # https://gcc.gnu.org/gcc-5/changes.html + if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS "5.0") + message(FATAL_ERROR "Insufficient GCC version") + endif() +elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Clang") + # Clang 3.4 has full C++14 support: http://clang.llvm.org/cxx_status.html + if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS "3.4") + message(FATAL_ERROR "Insufficient Clang version") + endif() +endif() + # set Debug/Release options set( CMAKE_CXX_FLAGS "-std=c++11 -pthread -Wall -Wno-unused-local-typedefs -Wno-unused-variable" ) set( CMAKE_CXX_FLAGS_DEBUG "-g" ) @@ -109,10 +124,11 @@ endif() ##### # Check for MPI -- poznej podle vraperu compileru -- da se testovat preklad bez MPI # -if( ${CXX_COMPILER_NAME} STREQUAL "mpic++" ) +if( ${CXX_COMPILER_NAME} STREQUAL "mpicxx" ) message( "MPI compiler detected." ) set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DHAVE_MPI" ) - set( CUDA_HOST_COMPILER "mpic++" ) + set( CUDA_HOST_COMPILER "mpicxx" ) + set( BUILD_MPI ON ) endif() #### @@ -150,7 +166,7 @@ if( ${WITH_CUDA} ) endif() endif() # An extra CUDA_ARCH_HOST_COMPILER variable for compiling tnl-cuda-arch alone, - # because it SHOULD NOT be compiled using mpic++, which would cause weird + # because it SHOULD NOT be compiled using mpicxx, which would cause weird # RPATH_CHANGE error in cmake. # FIXME: find better solution to switch between MPI-enabled and MPI-disabled binaries in cmake if( NOT $ENV{CUDA_ARCH_HOST_COMPILER} STREQUAL "" ) @@ -406,15 +422,9 @@ INCLUDE_DIRECTORIES( ${PROJECT_BUILD_PATH} ) LINK_DIRECTORIES( ${LIBRARY_OUTPUT_PATH} ) # Add all subdirectories -# Note that it is important to start building examples as soon as possible, -# because they take the longest time and other stuff can be pipelined before -# they are finished (at least with Ninja). -if( WITH_EXAMPLES STREQUAL "yes" ) - add_subdirectory( examples ) -endif( WITH_EXAMPLES STREQUAL "yes" ) -if( WITH_TESTS STREQUAL "yes" ) +if( ${WITH_TESTS} ) add_subdirectory( tests ) -endif( WITH_TESTS STREQUAL "yes" ) +endif() add_subdirectory( src ) add_subdirectory( share ) @@ -458,6 +468,7 @@ message( " WITH_TESTS=${WITH_TESTS}" ) message( " WITH_COVERAGE=${WITH_COVERAGE}" ) message( " WITH_EXAMPLES=${WITH_EXAMPLES}" ) message( " WITH_TOOLS=${WITH_TOOLS}" ) +message( " WITH_BENCHMARKS=${WITH_BENCHMARKS}" ) message( " WITH_PYTHON=${WITH_PYTHON}" ) message( " WITH_TEMPLATES_INSTANTIATION=${WITH_TEMPLATES_INSTANTIATION}" ) # Print compiler options diff --git a/build b/build index 69f01cb406b6d912f0241738db56d1370f1e6105..e0c8dbb993e592c420aa62abd991c189fbff4870 100755 --- a/build +++ b/build @@ -26,6 +26,7 @@ WITH_COVERAGE="no" WITH_EXAMPLES="yes" WITH_PYTHON="yes" WITH_TOOLS="yes" +WITH_BENCHMARKS="yes" WITH_TEMPLATE_INSTANTIATION="no" INSTANTIATE_LONG_INT="no" @@ -60,6 +61,7 @@ do --with-coverage=* ) WITH_COVERAGE="${option#*=}" ;; --with-examples=* ) WITH_EXAMPLES="${option#*=}" ;; --with-tools=* ) WITH_TOOLS="${option#*=}" ;; + --with-benchmarks=* ) WITH_BENCHMARKS="${option#*=}" ;; --with-python=* ) WITH_PYTHON="${option#*=}" ;; --with-templates-instantiation=* ) WITH_TEMPLATE_INSTANTIATION="${option#*=}" ;; --instantiate-long-int=* ) INSTANTIATE_LONG_INT="${option#*=}" ;; @@ -114,17 +116,18 @@ if [[ ${WITH_CLANG} == "yes" ]]; then fi if [[ ${WITH_MPI} == "yes" ]]; then - if [[ ! -x "$(command -v mpic++)" ]]; then - echo "Warning:mpic++ is not installed on this system. MPI support is turned off." + # NOTE: OpenMPI provides mpic++, but Intel MPI does not + if [[ ! -x "$(command -v mpicxx)" ]]; then + echo "Warning: mpicxx is not installed on this system. MPI support is turned off." else # instruct OpenMPI to use the original compiler # reference: https://www.open-mpi.org/faq/?category=mpi-apps#override-wrappers-after-v1.0 - # FIXME: this does not work with CUDA_HOST_COMPILER=mpic++ + # FIXME: this does not work with CUDA_HOST_COMPILER=mpicxx # if [ -n "$CXX" ]; then # export OMPI_CXX="$CXX" # fi - export CXX=mpic++ - export CUDA_HOST_COMPILER=mpic++ + export CXX=mpicxx + export CUDA_HOST_COMPILER=mpicxx fi if [[ ! -x "$(command -v mpicc)" ]]; then echo "Warning: mpicc is not installed on this system." @@ -163,6 +166,7 @@ cmake_command=( -DWITH_COVERAGE=${WITH_COVERAGE} -DWITH_EXAMPLES=${WITH_EXAMPLES} -DWITH_TOOLS=${WITH_TOOLS} + -DWITH_BENCHMARKS=${WITH_BENCHMARKS} -DWITH_PYTHON=${WITH_PYTHON} -DDCMTK_DIR=${DCMTK_DIR} -DWITH_TEMPLATE_INSTANTIATION=${WITH_TEMPLATE_INSTANTIATION} diff --git a/share/CMakeLists.txt b/share/CMakeLists.txt index 3559626d0dd91df5cdf0affcc0cea967bd5a2e21..2a6f286925ef2a17b6cd7adc902fdefbc930fb39 100644 --- a/share/CMakeLists.txt +++ b/share/CMakeLists.txt @@ -1 +1,2 @@ -add_subdirectory( Tools ) +add_subdirectory (cmake) +add_subdirectory (pkgconfig) diff --git a/share/Tools/CMakeLists.txt b/share/Tools/CMakeLists.txt deleted file mode 100644 index 2a6f286925ef2a17b6cd7adc902fdefbc930fb39..0000000000000000000000000000000000000000 --- a/share/Tools/CMakeLists.txt +++ /dev/null @@ -1,2 +0,0 @@ -add_subdirectory (cmake) -add_subdirectory (pkgconfig) diff --git a/share/Tools/cmake/CMakeFindTNL.cmake b/share/cmake/CMakeFindTNL.cmake similarity index 100% rename from share/Tools/cmake/CMakeFindTNL.cmake rename to share/cmake/CMakeFindTNL.cmake diff --git a/share/Tools/cmake/CMakeLists.txt b/share/cmake/CMakeLists.txt similarity index 100% rename from share/Tools/cmake/CMakeLists.txt rename to share/cmake/CMakeLists.txt diff --git a/share/Tools/pkgconfig/CMakeLists.txt b/share/pkgconfig/CMakeLists.txt similarity index 100% rename from share/Tools/pkgconfig/CMakeLists.txt rename to share/pkgconfig/CMakeLists.txt diff --git a/share/Tools/pkgconfig/cuda.pc.in b/share/pkgconfig/cuda.pc.in similarity index 100% rename from share/Tools/pkgconfig/cuda.pc.in rename to share/pkgconfig/cuda.pc.in diff --git a/share/Tools/pkgconfig/tnl-cuda.pc.in b/share/pkgconfig/tnl-cuda.pc.in similarity index 100% rename from share/Tools/pkgconfig/tnl-cuda.pc.in rename to share/pkgconfig/tnl-cuda.pc.in diff --git a/share/Tools/pkgconfig/tnl-openmp.pc.in b/share/pkgconfig/tnl-openmp.pc.in similarity index 100% rename from share/Tools/pkgconfig/tnl-openmp.pc.in rename to share/pkgconfig/tnl-openmp.pc.in diff --git a/share/Tools/pkgconfig/tnl.pc.in b/share/pkgconfig/tnl.pc.in similarity index 100% rename from share/Tools/pkgconfig/tnl.pc.in rename to share/pkgconfig/tnl.pc.in diff --git a/src/Benchmarks/BLAS/CMakeLists.txt b/src/Benchmarks/BLAS/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..dc396c46e60512b5cab5f5f7455756f609b76c47 --- /dev/null +++ b/src/Benchmarks/BLAS/CMakeLists.txt @@ -0,0 +1,10 @@ +if( BUILD_CUDA ) + CUDA_ADD_EXECUTABLE( tnl-benchmark-blas tnl-benchmark-blas.cu ) + CUDA_ADD_CUBLAS_TO_TARGET( tnl-benchmark-blas ) + TARGET_LINK_LIBRARIES( tnl-benchmark-blas tnl ) +else() + ADD_EXECUTABLE( tnl-benchmark-blas tnl-benchmark-blas.cpp ) + TARGET_LINK_LIBRARIES( tnl-benchmark-blas tnl ) +endif() + +install( TARGETS tnl-benchmark-blas RUNTIME DESTINATION bin ) diff --git a/src/Benchmarks/BLAS/array-operations.h b/src/Benchmarks/BLAS/array-operations.h new file mode 100644 index 0000000000000000000000000000000000000000..aacdb9cc65315af377ddec49c167a9a197483bd5 --- /dev/null +++ b/src/Benchmarks/BLAS/array-operations.h @@ -0,0 +1,165 @@ +/*************************************************************************** + array-operations.h - description + ------------------- + begin : Dec 30, 2015 + copyright : (C) 2015 by Tomas Oberhuber et al. + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Jakub Klinkovsky + +#pragma once + +#include "../Benchmarks.h" + +#include <TNL/Containers/Array.h> + +namespace TNL { +namespace Benchmarks { + +template< typename Real = double, + typename Index = int > +bool +benchmarkArrayOperations( Benchmark & benchmark, + const int & loops, + const long & size ) +{ + typedef Containers::Array< Real, Devices::Host, Index > HostArray; + typedef Containers::Array< Real, Devices::Cuda, Index > CudaArray; + using namespace std; + + double datasetSize = ( double ) ( loops * size ) * sizeof( Real ) / oneGB; + + HostArray hostArray, hostArray2; + CudaArray deviceArray, deviceArray2; + hostArray.setSize( size ); + hostArray2.setSize( size ); +#ifdef HAVE_CUDA + deviceArray.setSize( size ); + deviceArray2.setSize( size ); +#endif + + Real resultHost, resultDevice; + + + // reset functions + auto reset1 = [&]() { + hostArray.setValue( 1.0 ); +#ifdef HAVE_CUDA + deviceArray.setValue( 1.0 ); +#endif + }; + auto reset2 = [&]() { + hostArray2.setValue( 1.0 ); +#ifdef HAVE_CUDA + deviceArray2.setValue( 1.0 ); +#endif + }; + auto reset12 = [&]() { + reset1(); + reset2(); + }; + + + reset12(); + + + auto compareHost = [&]() { + resultHost = (int) hostArray == hostArray2; + }; + auto compareCuda = [&]() { + resultDevice = (int) deviceArray == deviceArray2; + }; + benchmark.setOperation( "comparison (operator==)", 2 * datasetSize ); + benchmark.time( reset1, "CPU", compareHost ); +#ifdef HAVE_CUDA + benchmark.time( reset1, "GPU", compareCuda ); +#endif + + + auto copyAssignHostHost = [&]() { + hostArray = hostArray2; + }; + auto copyAssignCudaCuda = [&]() { + deviceArray = deviceArray2; + }; + benchmark.setOperation( "copy (operator=)", 2 * datasetSize ); + // copyBasetime is used later inside HAVE_CUDA guard, so the compiler will + // complain when compiling without CUDA + const double copyBasetime = benchmark.time( reset1, "CPU", copyAssignHostHost ); +#ifdef HAVE_CUDA + benchmark.time( reset1, "GPU", copyAssignCudaCuda ); +#endif + + + auto copyAssignHostCuda = [&]() { + deviceArray = hostArray; + }; + auto copyAssignCudaHost = [&]() { + hostArray = deviceArray; + }; +#ifdef HAVE_CUDA + benchmark.setOperation( "copy (operator=)", datasetSize, copyBasetime ); + benchmark.time( reset1, + "CPU->GPU", copyAssignHostCuda, + "GPU->CPU", copyAssignCudaHost ); +#endif + + + auto setValueHost = [&]() { + hostArray.setValue( 3.0 ); + }; + auto setValueCuda = [&]() { + deviceArray.setValue( 3.0 ); + }; + benchmark.setOperation( "setValue", datasetSize ); + benchmark.time( reset1, "CPU", setValueHost ); +#ifdef HAVE_CUDA + benchmark.time( reset1, "GPU", setValueCuda ); +#endif + + + auto setSizeHost = [&]() { + hostArray.setSize( size ); + }; + auto setSizeCuda = [&]() { + deviceArray.setSize( size ); + }; + auto resetSize1 = [&]() { + hostArray.reset(); +#ifdef HAVE_CUDA + deviceArray.reset(); +#endif + }; + benchmark.setOperation( "allocation (setSize)", datasetSize ); + benchmark.time( resetSize1, "CPU", setSizeHost ); +#ifdef HAVE_CUDA + benchmark.time( resetSize1, "GPU", setSizeCuda ); +#endif + + + auto resetSizeHost = [&]() { + hostArray.reset(); + }; + auto resetSizeCuda = [&]() { + deviceArray.reset(); + }; + auto setSize1 = [&]() { + hostArray.setSize( size ); +#ifdef HAVE_CUDA + deviceArray.setSize( size ); +#endif + }; + benchmark.setOperation( "deallocation (reset)", datasetSize ); + benchmark.time( setSize1, "CPU", resetSizeHost ); +#ifdef HAVE_CUDA + benchmark.time( setSize1, "GPU", resetSizeCuda ); +#endif + + return true; +} + +} // namespace Benchmarks +} // namespace TNL diff --git a/tests/benchmarks/cublasWrappers.h b/src/Benchmarks/BLAS/cublasWrappers.h similarity index 74% rename from tests/benchmarks/cublasWrappers.h rename to src/Benchmarks/BLAS/cublasWrappers.h index 6b71a4ed7befc12664440cf5425f5975e6feec0c..1e63e139d6faa513706ed1b18a207e87ea1a079d 100644 --- a/tests/benchmarks/cublasWrappers.h +++ b/src/Benchmarks/BLAS/cublasWrappers.h @@ -8,14 +8,14 @@ inline cublasStatus_t cublasIgamax( cublasHandle_t handle, int n, const float *x, int incx, int *result ) { - return cublasIsamax( handle, n, x, incx, result ); + return cublasIsamax( handle, n, x, incx, result ); } inline cublasStatus_t cublasIgamax( cublasHandle_t handle, int n, const double *x, int incx, int *result ) { - return cublasIdamax( handle, n, x, incx, result ); + return cublasIdamax( handle, n, x, incx, result ); } @@ -23,14 +23,14 @@ inline cublasStatus_t cublasIgamin( cublasHandle_t handle, int n, const float *x, int incx, int *result ) { - return cublasIsamin( handle, n, x, incx, result ); + return cublasIsamin( handle, n, x, incx, result ); } inline cublasStatus_t cublasIgamin( cublasHandle_t handle, int n, const double *x, int incx, int *result ) { - return cublasIdamin( handle, n, x, incx, result ); + return cublasIdamin( handle, n, x, incx, result ); } @@ -38,14 +38,14 @@ inline cublasStatus_t cublasGasum( cublasHandle_t handle, int n, const float *x, int incx, float *result ) { - return cublasSasum( handle, n, x, incx, result ); + return cublasSasum( handle, n, x, incx, result ); } inline cublasStatus_t cublasGasum( cublasHandle_t handle, int n, const double *x, int incx, double *result ) { - return cublasDasum( handle, n, x, incx, result ); + return cublasDasum( handle, n, x, incx, result ); } @@ -55,7 +55,7 @@ cublasGaxpy( cublasHandle_t handle, int n, const float *x, int incx, float *y, int incy ) { - return cublasSaxpy( handle, n, alpha, x, incx, y, incy ); + return cublasSaxpy( handle, n, alpha, x, incx, y, incy ); } inline cublasStatus_t @@ -64,7 +64,7 @@ cublasGaxpy( cublasHandle_t handle, int n, const double *x, int incx, double *y, int incy ) { - return cublasDaxpy( handle, n, alpha, x, incx, y, incy ); + return cublasDaxpy( handle, n, alpha, x, incx, y, incy ); } @@ -74,7 +74,7 @@ cublasGdot( cublasHandle_t handle, int n, const float *y, int incy, float *result ) { - return cublasSdot( handle, n, x, incx, y, incy, result ); + return cublasSdot( handle, n, x, incx, y, incy, result ); } inline cublasStatus_t @@ -83,7 +83,7 @@ cublasGdot( cublasHandle_t handle, int n, const double *y, int incy, double *result ) { - return cublasDdot( handle, n, x, incx, y, incy, result ); + return cublasDdot( handle, n, x, incx, y, incy, result ); } @@ -91,14 +91,14 @@ inline cublasStatus_t cublasGnrm2( cublasHandle_t handle, int n, const float *x, int incx, float *result ) { - return cublasSnrm2( handle, n, x, incx, result ); + return cublasSnrm2( handle, n, x, incx, result ); } inline cublasStatus_t cublasGnrm2( cublasHandle_t handle, int n, const double *x, int incx, double *result ) { - return cublasDnrm2( handle, n, x, incx, result ); + return cublasDnrm2( handle, n, x, incx, result ); } @@ -107,7 +107,7 @@ cublasGscal( cublasHandle_t handle, int n, const float *alpha, float *x, int incx ) { - return cublasSscal( handle, n, alpha, x, incx ); + return cublasSscal( handle, n, alpha, x, incx ); } inline cublasStatus_t @@ -115,7 +115,7 @@ cublasGscal( cublasHandle_t handle, int n, const double *alpha, double *x, int incx ) { - return cublasDscal( handle, n, alpha, x, incx ); + return cublasDscal( handle, n, alpha, x, incx ); } #endif diff --git a/src/Benchmarks/BLAS/spmv.h b/src/Benchmarks/BLAS/spmv.h new file mode 100644 index 0000000000000000000000000000000000000000..a6840af9f8c1f1dedeee876fd2bca258a55c64ba --- /dev/null +++ b/src/Benchmarks/BLAS/spmv.h @@ -0,0 +1,189 @@ +/*************************************************************************** + spmv.h - description + ------------------- + begin : Dec 30, 2015 + copyright : (C) 2015 by Tomas Oberhuber et al. + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Jakub Klinkovsky + +#pragma once + +#include "../Benchmarks.h" + +#include <TNL/Containers/List.h> +#include <TNL/Pointers/DevicePointer.h> +#include <TNL/Matrices/CSR.h> +#include <TNL/Matrices/Ellpack.h> +#include <TNL/Matrices/SlicedEllpack.h> +#include <TNL/Matrices/ChunkedEllpack.h> + +namespace TNL { +namespace Benchmarks { + +// silly alias to match the number of template parameters with other formats +template< typename Real, typename Device, typename Index > +using SlicedEllpack = Matrices::SlicedEllpack< Real, Device, Index >; + +template< typename Matrix > +int setHostTestMatrix( Matrix& matrix, + const int elementsPerRow ) +{ + const int size = matrix.getRows(); + int elements( 0 ); + for( int row = 0; row < size; row++ ) { + int col = row - elementsPerRow / 2; + for( int element = 0; element < elementsPerRow; element++ ) { + if( col + element >= 0 && + col + element < size ) + { + matrix.setElement( row, col + element, element + 1 ); + elements++; + } + } + } + return elements; +} + +#ifdef HAVE_CUDA +template< typename Matrix > +__global__ void setCudaTestMatrixKernel( Matrix* matrix, + const int elementsPerRow, + const int gridIdx ) +{ + const int rowIdx = ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; + if( rowIdx >= matrix->getRows() ) + return; + int col = rowIdx - elementsPerRow / 2; + for( int element = 0; element < elementsPerRow; element++ ) { + if( col + element >= 0 && + col + element < matrix->getColumns() ) + matrix->setElementFast( rowIdx, col + element, element + 1 ); + } +} +#endif + +template< typename Matrix > +void setCudaTestMatrix( Matrix& matrix, + const int elementsPerRow ) +{ +#ifdef HAVE_CUDA + typedef typename Matrix::IndexType IndexType; + typedef typename Matrix::RealType RealType; + Pointers::DevicePointer< Matrix > kernel_matrix( matrix ); + dim3 cudaBlockSize( 256 ), cudaGridSize( Devices::Cuda::getMaxGridSize() ); + const IndexType cudaBlocks = roundUpDivision( matrix.getRows(), cudaBlockSize.x ); + const IndexType cudaGrids = roundUpDivision( cudaBlocks, Devices::Cuda::getMaxGridSize() ); + for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) { + if( gridIdx == cudaGrids - 1 ) + cudaGridSize.x = cudaBlocks % Devices::Cuda::getMaxGridSize(); + setCudaTestMatrixKernel< Matrix > + <<< cudaGridSize, cudaBlockSize >>> + ( &kernel_matrix.template modifyData< Devices::Cuda >(), elementsPerRow, gridIdx ); + TNL_CHECK_CUDA_DEVICE; + } +#endif +} + + +// TODO: rename as benchmark_SpMV_synthetic and move to spmv-synthetic.h +template< typename Real, + template< typename, typename, typename > class Matrix, + template< typename, typename, typename > class Vector = Containers::Vector > +bool +benchmarkSpMV( Benchmark & benchmark, + const int & loops, + const int & size, + const int elementsPerRow = 5 ) +{ + typedef Matrix< Real, Devices::Host, int > HostMatrix; + typedef Matrix< Real, Devices::Cuda, int > DeviceMatrix; + typedef Containers::Vector< Real, Devices::Host, int > HostVector; + typedef Containers::Vector< Real, Devices::Cuda, int > CudaVector; + + HostMatrix hostMatrix; + DeviceMatrix deviceMatrix; + Containers::Vector< int, Devices::Host, int > hostRowLengths; + Containers::Vector< int, Devices::Cuda, int > deviceRowLengths; + HostVector hostVector, hostVector2; + CudaVector deviceVector, deviceVector2; + + // create benchmark group + Containers::List< String > parsedType; + parseObjectType( HostMatrix::getType(), parsedType ); + benchmark.createHorizontalGroup( parsedType[ 0 ], 2 ); + + hostRowLengths.setSize( size ); + hostMatrix.setDimensions( size, size ); + hostVector.setSize( size ); + hostVector2.setSize( size ); +#ifdef HAVE_CUDA + deviceRowLengths.setSize( size ); + deviceMatrix.setDimensions( size, size ); + deviceVector.setSize( size ); + deviceVector2.setSize( size ); +#endif + + hostRowLengths.setValue( elementsPerRow ); +#ifdef HAVE_CUDA + deviceRowLengths.setValue( elementsPerRow ); +#endif + + hostMatrix.setCompressedRowLengths( hostRowLengths ); +#ifdef HAVE_CUDA + deviceMatrix.setCompressedRowLengths( deviceRowLengths ); +#endif + + const int elements = setHostTestMatrix< HostMatrix >( hostMatrix, elementsPerRow ); + setCudaTestMatrix< DeviceMatrix >( deviceMatrix, elementsPerRow ); + const double datasetSize = ( double ) loops * elements * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB; + + // reset function + auto reset = [&]() { + hostVector.setValue( 1.0 ); + hostVector2.setValue( 0.0 ); +#ifdef HAVE_CUDA + deviceVector.setValue( 1.0 ); + deviceVector2.setValue( 0.0 ); +#endif + }; + + // compute functions + auto spmvHost = [&]() { + hostMatrix.vectorProduct( hostVector, hostVector2 ); + }; + auto spmvCuda = [&]() { + deviceMatrix.vectorProduct( deviceVector, deviceVector2 ); + }; + + benchmark.setOperation( datasetSize ); + benchmark.time( reset, "CPU", spmvHost ); +#ifdef HAVE_CUDA + benchmark.time( reset, "GPU", spmvCuda ); +#endif + + return true; +} + +template< typename Real = double, + typename Index = int > +bool +benchmarkSpmvSynthetic( Benchmark & benchmark, + const int & loops, + const int & size, + const int & elementsPerRow ) +{ + bool result = true; + // TODO: benchmark all formats from tnl-benchmark-spmv (different parameters of the base formats) + result |= benchmarkSpMV< Real, Matrices::CSR >( benchmark, loops, size, elementsPerRow ); + result |= benchmarkSpMV< Real, Matrices::Ellpack >( benchmark, loops, size, elementsPerRow ); + result |= benchmarkSpMV< Real, SlicedEllpack >( benchmark, loops, size, elementsPerRow ); + result |= benchmarkSpMV< Real, Matrices::ChunkedEllpack >( benchmark, loops, size, elementsPerRow ); + return result; +} + +} // namespace Benchmarks +} // namespace TNL diff --git a/tests/benchmarks/tnl-benchmark-blas.cpp b/src/Benchmarks/BLAS/tnl-benchmark-blas.cpp similarity index 100% rename from tests/benchmarks/tnl-benchmark-blas.cpp rename to src/Benchmarks/BLAS/tnl-benchmark-blas.cpp diff --git a/tests/benchmarks/tnl-benchmark-blas.cu b/src/Benchmarks/BLAS/tnl-benchmark-blas.cu similarity index 100% rename from tests/benchmarks/tnl-benchmark-blas.cu rename to src/Benchmarks/BLAS/tnl-benchmark-blas.cu diff --git a/src/Benchmarks/BLAS/tnl-benchmark-blas.h b/src/Benchmarks/BLAS/tnl-benchmark-blas.h new file mode 100644 index 0000000000000000000000000000000000000000..73ea0b375a23440105ea6dfbae599c63fb108adb --- /dev/null +++ b/src/Benchmarks/BLAS/tnl-benchmark-blas.h @@ -0,0 +1,192 @@ +/*************************************************************************** + tnl-benchmark-blas.h - description + ------------------- + begin : Jan 27, 2010 + copyright : (C) 2010 by Tomas Oberhuber et al. + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Jakub Klinkovsky + +#pragma once + +#include <TNL/Devices/Host.h> +#include <TNL/Devices/CudaDeviceInfo.h> +#include <TNL/Devices/SystemInfo.h> +#include <TNL/Config/ConfigDescription.h> +#include <TNL/Config/ParameterContainer.h> + +#include "array-operations.h" +#include "vector-operations.h" +#include "spmv.h" + +using namespace TNL; +using namespace TNL::Benchmarks; + + +// TODO: should benchmarks check the result of the computation? + + +template< typename Real > +void +runBlasBenchmarks( Benchmark & benchmark, + Benchmark::MetadataMap metadata, + const std::size_t & minSize, + const std::size_t & maxSize, + const double & sizeStepFactor, + const unsigned & loops, + const unsigned & elementsPerRow ) +{ + const String precision = getType< Real >(); + metadata["precision"] = precision; + + // Array operations + benchmark.newBenchmark( String("Array operations (") + precision + ")", + metadata ); + for( std::size_t size = minSize; size <= maxSize; size *= 2 ) { + benchmark.setMetadataColumns( Benchmark::MetadataColumns({ + {"size", size}, + } )); + benchmarkArrayOperations< Real >( benchmark, loops, size ); + } + + // Vector operations + benchmark.newBenchmark( String("Vector operations (") + precision + ")", + metadata ); + for( std::size_t size = minSize; size <= maxSize; size *= sizeStepFactor ) { + benchmark.setMetadataColumns( Benchmark::MetadataColumns({ + {"size", size}, + } )); + benchmarkVectorOperations< Real >( benchmark, loops, size ); + } + + // Sparse matrix-vector multiplication + benchmark.newBenchmark( String("Sparse matrix-vector multiplication (") + precision + ")", + metadata ); + for( std::size_t size = minSize; size <= maxSize; size *= 2 ) { + benchmark.setMetadataColumns( Benchmark::MetadataColumns({ + {"rows", size}, + {"columns", size}, + {"elements per row", elementsPerRow}, + } )); + benchmarkSpmvSynthetic< Real >( benchmark, loops, size, elementsPerRow ); + } +} + +void +setupConfig( Config::ConfigDescription & config ) +{ + config.addDelimiter( "Benchmark settings:" ); + config.addEntry< String >( "log-file", "Log file name.", "tnl-benchmark-blas.log"); + config.addEntry< String >( "output-mode", "Mode for opening the log file.", "overwrite" ); + config.addEntryEnum( "append" ); + config.addEntryEnum( "overwrite" ); + config.addEntry< String >( "precision", "Precision of the arithmetics.", "double" ); + config.addEntryEnum( "float" ); + config.addEntryEnum( "double" ); + config.addEntryEnum( "all" ); + config.addEntry< int >( "min-size", "Minimum size of arrays/vectors used in the benchmark.", 100000 ); + config.addEntry< int >( "max-size", "Minimum size of arrays/vectors used in the benchmark.", 10000000 ); + config.addEntry< int >( "size-step-factor", "Factor determining the size of arrays/vectors used in the benchmark. First size is min-size and each following size is stepFactor*previousSize, up to max-size.", 2 ); + config.addEntry< int >( "loops", "Number of iterations for every computation.", 10 ); + config.addEntry< int >( "elements-per-row", "Number of elements per row of the sparse matrix used in the matrix-vector multiplication benchmark.", 5 ); + config.addEntry< int >( "verbose", "Verbose mode.", 1 ); + + config.addDelimiter( "Device settings:" ); + Devices::Host::configSetup( config ); + Devices::Cuda::configSetup( config ); +} + +int +main( int argc, char* argv[] ) +{ + Config::ParameterContainer parameters; + Config::ConfigDescription conf_desc; + + setupConfig( conf_desc ); + + if( ! parseCommandLine( argc, argv, conf_desc, parameters ) ) { + conf_desc.printUsage( argv[ 0 ] ); + return 1; + } + + Devices::Host::setup( parameters ); + Devices::Cuda::setup( parameters ); + + const String & logFileName = parameters.getParameter< String >( "log-file" ); + const String & outputMode = parameters.getParameter< String >( "output-mode" ); + const String & precision = parameters.getParameter< String >( "precision" ); + // FIXME: getParameter< std::size_t >() does not work with parameters added with addEntry< int >(), + // which have a default value. The workaround below works for int values, but it is not possible + // to pass 64-bit integer values +// const std::size_t minSize = parameters.getParameter< std::size_t >( "min-size" ); +// const std::size_t maxSize = parameters.getParameter< std::size_t >( "max-size" ); + const std::size_t minSize = parameters.getParameter< int >( "min-size" ); + const std::size_t maxSize = parameters.getParameter< int >( "max-size" ); + const unsigned sizeStepFactor = parameters.getParameter< unsigned >( "size-step-factor" ); + const unsigned loops = parameters.getParameter< unsigned >( "loops" ); + const unsigned elementsPerRow = parameters.getParameter< unsigned >( "elements-per-row" ); + const unsigned verbose = parameters.getParameter< unsigned >( "verbose" ); + + if( sizeStepFactor <= 1 ) { + std::cerr << "The value of --size-step-factor must be greater than 1." << std::endl; + return EXIT_FAILURE; + } + + // open log file + auto mode = std::ios::out; + if( outputMode == "append" ) + mode |= std::ios::app; + std::ofstream logFile( logFileName.getString(), mode ); + + // init benchmark and common metadata + Benchmark benchmark( loops, verbose ); + + // prepare global metadata + const int cpu_id = 0; + Devices::CacheSizes cacheSizes = Devices::SystemInfo::getCPUCacheSizes( cpu_id ); + String cacheInfo = String( cacheSizes.L1data ) + ", " + + String( cacheSizes.L1instruction ) + ", " + + String( cacheSizes.L2 ) + ", " + + String( cacheSizes.L3 ); +#ifdef HAVE_CUDA + const int activeGPU = Devices::CudaDeviceInfo::getActiveDevice(); + const String deviceArch = String( Devices::CudaDeviceInfo::getArchitectureMajor( activeGPU ) ) + "." + + String( Devices::CudaDeviceInfo::getArchitectureMinor( activeGPU ) ); +#endif + Benchmark::MetadataMap metadata { + { "host name", Devices::SystemInfo::getHostname() }, + { "architecture", Devices::SystemInfo::getArchitecture() }, + { "system", Devices::SystemInfo::getSystemName() }, + { "system release", Devices::SystemInfo::getSystemRelease() }, + { "start time", Devices::SystemInfo::getCurrentTime() }, + { "CPU model name", Devices::SystemInfo::getCPUModelName( cpu_id ) }, + { "CPU cores", Devices::SystemInfo::getNumberOfCores( cpu_id ) }, + { "CPU threads per core", Devices::SystemInfo::getNumberOfThreads( cpu_id ) / Devices::SystemInfo::getNumberOfCores( cpu_id ) }, + { "CPU max frequency (MHz)", Devices::SystemInfo::getCPUMaxFrequency( cpu_id ) / 1e3 }, + { "CPU cache sizes (L1d, L1i, L2, L3) (kiB)", cacheInfo }, +#ifdef HAVE_CUDA + { "GPU name", Devices::CudaDeviceInfo::getDeviceName( activeGPU ) }, + { "GPU architecture", deviceArch }, + { "GPU CUDA cores", Devices::CudaDeviceInfo::getCudaCores( activeGPU ) }, + { "GPU clock rate (MHz)", (double) Devices::CudaDeviceInfo::getClockRate( activeGPU ) / 1e3 }, + { "GPU global memory (GB)", (double) Devices::CudaDeviceInfo::getGlobalMemory( activeGPU ) / 1e9 }, + { "GPU memory clock rate (MHz)", (double) Devices::CudaDeviceInfo::getMemoryClockRate( activeGPU ) / 1e3 }, + { "GPU memory ECC enabled", Devices::CudaDeviceInfo::getECCEnabled( activeGPU ) }, +#endif + }; + + if( precision == "all" || precision == "float" ) + runBlasBenchmarks< float >( benchmark, metadata, minSize, maxSize, sizeStepFactor, loops, elementsPerRow ); + if( precision == "all" || precision == "double" ) + runBlasBenchmarks< double >( benchmark, metadata, minSize, maxSize, sizeStepFactor, loops, elementsPerRow ); + + if( ! benchmark.save( logFile ) ) { + std::cerr << "Failed to write the benchmark results to file '" << parameters.getParameter< String >( "log-file" ) << "'." << std::endl; + return EXIT_FAILURE; + } + + return EXIT_SUCCESS; +} diff --git a/src/Benchmarks/BLAS/vector-operations.h b/src/Benchmarks/BLAS/vector-operations.h new file mode 100644 index 0000000000000000000000000000000000000000..e65f8980b1066e042206e328d15b50e32c81432f --- /dev/null +++ b/src/Benchmarks/BLAS/vector-operations.h @@ -0,0 +1,440 @@ +/*************************************************************************** + vector-operations.h - description + ------------------- + begin : Dec 30, 2015 + copyright : (C) 2015 by Tomas Oberhuber et al. + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Jakub Klinkovsky + +#pragma once + +#include <stdlib.h> // srand48 + +#include "../Benchmarks.h" + +#include <TNL/Containers/Vector.h> + +#ifdef HAVE_CUDA +#include "cublasWrappers.h" +#endif + +namespace TNL { +namespace Benchmarks { + +template< typename Real = double, + typename Index = int > +bool +benchmarkVectorOperations( Benchmark & benchmark, + const int & loops, + const long & size ) +{ + typedef Containers::Vector< Real, Devices::Host, Index > HostVector; + typedef Containers::Vector< Real, Devices::Cuda, Index > CudaVector; + using namespace std; + + double datasetSize = ( double ) ( loops * size ) * sizeof( Real ) / oneGB; + + HostVector hostVector, hostVector2; + CudaVector deviceVector, deviceVector2; + hostVector.setSize( size ); + hostVector2.setSize( size ); +#ifdef HAVE_CUDA + deviceVector.setSize( size ); + deviceVector2.setSize( size ); +#endif + + Real resultHost, resultDevice; + +#ifdef HAVE_CUDA + cublasHandle_t cublasHandle; + cublasCreate( &cublasHandle ); +#endif + + + // reset functions + // (Make sure to always use some in benchmarks, even if it's not necessary + // to assure correct result - it helps to clear cache and avoid optimizations + // of the benchmark loop.) + auto reset1 = [&]() { + hostVector.setValue( 1.0 ); +#ifdef HAVE_CUDA + deviceVector.setValue( 1.0 ); +#endif + // A relatively harmless call to keep the compiler from realizing we + // don't actually do any useful work with the result of the reduciton. + srand48(resultHost); + resultHost = resultDevice = 0.0; + }; + auto reset2 = [&]() { + hostVector2.setValue( 1.0 ); +#ifdef HAVE_CUDA + deviceVector2.setValue( 1.0 ); +#endif + }; + auto reset12 = [&]() { + reset1(); + reset2(); + }; + + + reset12(); + + + auto maxHost = [&]() { + resultHost = hostVector.max(); + }; + auto maxHostGeneral = [&]() { + Real result( 0 ); + Containers::Algorithms::ParallelReductionMax< Real > operation; + Containers::Algorithms::Reduction< Devices::Host >::reduce( + operation, + hostVector.getSize(), + hostVector.getData(), + ( Real* ) 0, + result ); + return result; + }; + auto maxCuda = [&]() { + resultDevice = deviceVector.max(); + }; + benchmark.setOperation( "max", datasetSize ); + benchmark.time( reset1, "CPU", maxHost ); + benchmark.time( reset1, "CPU (general)", maxHostGeneral ); +#ifdef HAVE_CUDA + benchmark.time( reset1, "GPU", maxCuda ); +#endif + + + auto minHost = [&]() { + resultHost = hostVector.min(); + }; + auto minHostGeneral = [&]() { + Real result( 0 ); + Containers::Algorithms::ParallelReductionMin< Real > operation; + Containers::Algorithms::Reduction< Devices::Host >::reduce( + operation, + hostVector.getSize(), + hostVector.getData(), + ( Real* ) 0, + result ); + return result; + }; + auto minCuda = [&]() { + resultDevice = deviceVector.min(); + }; + benchmark.setOperation( "min", datasetSize ); + benchmark.time( reset1, "CPU", minHost ); + benchmark.time( reset1, "CPU (general)", minHostGeneral ); +#ifdef HAVE_CUDA + benchmark.time( reset1, "GPU", minCuda ); +#endif + + + auto absMaxHost = [&]() { + resultHost = hostVector.absMax(); + }; + auto absMaxHostGeneral = [&]() { + Real result( 0 ); + Containers::Algorithms::ParallelReductionAbsMax< Real > operation; + Containers::Algorithms::Reduction< Devices::Host >::reduce( + operation, + hostVector.getSize(), + hostVector.getData(), + ( Real* ) 0, + result ); + return result; + }; + auto absMaxCuda = [&]() { + resultDevice = deviceVector.absMax(); + }; +#ifdef HAVE_CUDA + auto absMaxCublas = [&]() { + int index = 0; + cublasIgamax( cublasHandle, size, + deviceVector.getData(), 1, + &index ); + resultDevice = deviceVector.getElement( index ); + }; +#endif + benchmark.setOperation( "absMax", datasetSize ); + benchmark.time( reset1, "CPU", absMaxHost ); + benchmark.time( reset1, "CPU (general)", absMaxHostGeneral ); +#ifdef HAVE_CUDA + benchmark.time( reset1, "GPU", absMaxCuda ); + benchmark.time( reset1, "cuBLAS", absMaxCublas ); +#endif + + + auto absMinHost = [&]() { + resultHost = hostVector.absMin(); + }; + auto absMinHostGeneral = [&]() { + Real result( 0 ); + Containers::Algorithms::ParallelReductionAbsMin< Real > operation; + Containers::Algorithms::Reduction< Devices::Host >::reduce( + operation, + hostVector.getSize(), + hostVector.getData(), + ( Real* ) 0, + result ); + return result; + }; + auto absMinCuda = [&]() { + resultDevice = deviceVector.absMin(); + }; +#ifdef HAVE_CUDA + auto absMinCublas = [&]() { + int index = 0; + cublasIgamin( cublasHandle, size, + deviceVector.getData(), 1, + &index ); + resultDevice = deviceVector.getElement( index ); + }; +#endif + benchmark.setOperation( "absMin", datasetSize ); + benchmark.time( reset1, "CPU", absMinHost ); + benchmark.time( reset1, "CPU (general)", absMinHostGeneral ); +#ifdef HAVE_CUDA + benchmark.time( reset1, "GPU", absMinCuda ); + benchmark.time( reset1, "cuBLAS", absMinCublas ); +#endif + + + auto sumHost = [&]() { + resultHost = hostVector.sum(); + }; + auto sumHostGeneral = [&]() { + Real result( 0 ); + Containers::Algorithms::ParallelReductionSum< Real > operation; + Containers::Algorithms::Reduction< Devices::Host >::reduce( + operation, + hostVector.getSize(), + hostVector.getData(), + ( Real* ) 0, + result ); + return result; + }; + auto sumCuda = [&]() { + resultDevice = deviceVector.sum(); + }; + benchmark.setOperation( "sum", datasetSize ); + benchmark.time( reset1, "CPU", sumHost ); + benchmark.time( reset1, "CPU (general)", sumHostGeneral ); +#ifdef HAVE_CUDA + benchmark.time( reset1, "GPU", sumCuda ); +#endif + + + auto l1normHost = [&]() { + resultHost = hostVector.lpNorm( 1.0 ); + }; + auto l1normHostGeneral = [&]() { + Real result( 0 ); + Containers::Algorithms::ParallelReductionAbsSum< Real > operation; + Containers::Algorithms::Reduction< Devices::Host >::reduce( + operation, + hostVector.getSize(), + hostVector.getData(), + ( Real* ) 0, + result ); + return result; + }; + auto l1normCuda = [&]() { + resultDevice = deviceVector.lpNorm( 1.0 ); + }; +#ifdef HAVE_CUDA + auto l1normCublas = [&]() { + cublasGasum( cublasHandle, size, + deviceVector.getData(), 1, + &resultDevice ); + }; +#endif + benchmark.setOperation( "l1 norm", datasetSize ); + benchmark.time( reset1, "CPU", l1normHost ); + benchmark.time( reset1, "CPU (general)", l1normHostGeneral ); +#ifdef HAVE_CUDA + benchmark.time( reset1, "GPU", l1normCuda ); + benchmark.time( reset1, "cuBLAS", l1normCublas ); +#endif + + + auto l2normHost = [&]() { + resultHost = hostVector.lpNorm( 2.0 ); + }; + auto l2normHostGeneral = [&]() { + Real result( 0 ); + Containers::Algorithms::ParallelReductionL2Norm< Real > operation; + Containers::Algorithms::Reduction< Devices::Host >::reduce( + operation, + hostVector.getSize(), + hostVector.getData(), + ( Real* ) 0, + result ); + return result; + }; + auto l2normCuda = [&]() { + resultDevice = deviceVector.lpNorm( 2.0 ); + }; +#ifdef HAVE_CUDA + auto l2normCublas = [&]() { + cublasGnrm2( cublasHandle, size, + deviceVector.getData(), 1, + &resultDevice ); + }; +#endif + benchmark.setOperation( "l2 norm", datasetSize ); + benchmark.time( reset1, "CPU", l2normHost ); + benchmark.time( reset1, "CPU (general)", l2normHostGeneral ); +#ifdef HAVE_CUDA + benchmark.time( reset1, "GPU", l2normCuda ); + benchmark.time( reset1, "cuBLAS", l2normCublas ); +#endif + + + auto l3normHost = [&]() { + resultHost = hostVector.lpNorm( 3.0 ); + }; + auto l3normHostGeneral = [&]() { + Real result( 0 ); + Containers::Algorithms::ParallelReductionLpNorm< Real > operation; + operation.setPower( 3.0 ); + Containers::Algorithms::Reduction< Devices::Host >::reduce( + operation, + hostVector.getSize(), + hostVector.getData(), + ( Real* ) 0, + result ); + return result; + }; + auto l3normCuda = [&]() { + resultDevice = deviceVector.lpNorm( 3.0 ); + }; + benchmark.setOperation( "l3 norm", datasetSize ); + benchmark.time( reset1, "CPU", l3normHost ); + benchmark.time( reset1, "CPU (general)", l3normHostGeneral ); +#ifdef HAVE_CUDA + benchmark.time( reset1, "GPU", l3normCuda ); +#endif + + + auto scalarProductHost = [&]() { + resultHost = hostVector.scalarProduct( hostVector2 ); + }; + auto scalarProductHostGeneral = [&]() { + Real result( 0 ); + Containers::Algorithms::ParallelReductionScalarProduct< Real, Real > operation; + Containers::Algorithms::Reduction< Devices::Host >::reduce( + operation, + hostVector.getSize(), + hostVector.getData(), + hostVector2.getData(), + result ); + return result; + }; + auto scalarProductCuda = [&]() { + resultDevice = deviceVector.scalarProduct( deviceVector2 ); + }; +#ifdef HAVE_CUDA + auto scalarProductCublas = [&]() { + cublasGdot( cublasHandle, size, + deviceVector.getData(), 1, + deviceVector2.getData(), 1, + &resultDevice ); + }; +#endif + benchmark.setOperation( "scalar product", 2 * datasetSize ); + benchmark.time( reset1, "CPU", scalarProductHost ); + benchmark.time( reset1, "CPU (general)", scalarProductHostGeneral ); +#ifdef HAVE_CUDA + benchmark.time( reset1, "GPU", scalarProductCuda ); + benchmark.time( reset1, "cuBLAS", scalarProductCublas ); +#endif + + /* + std::cout << "Benchmarking prefix-sum:" << std::endl; + timer.reset(); + timer.start(); + hostVector.computePrefixSum(); + timer.stop(); + timeHost = timer.getTime(); + bandwidth = 2 * datasetSize / loops / timer.getTime(); + std::cout << " CPU: bandwidth: " << bandwidth << " GB/sec, time: " << timer.getTime() << " sec." << std::endl; + + timer.reset(); + timer.start(); + deviceVector.computePrefixSum(); + timer.stop(); + timeDevice = timer.getTime(); + bandwidth = 2 * datasetSize / loops / timer.getTime(); + std::cout << " GPU: bandwidth: " << bandwidth << " GB/sec, time: " << timer.getTime() << " sec." << std::endl; + std::cout << " CPU/GPU speedup: " << timeHost / timeDevice << std::endl; + + HostVector auxHostVector; + auxHostVector.setLike( deviceVector ); + auxHostVector = deviceVector; + for( int i = 0; i < size; i++ ) + if( hostVector.getElement( i ) != auxHostVector.getElement( i ) ) + { + std::cerr << "Error in prefix sum at position " << i << ": " << hostVector.getElement( i ) << " != " << auxHostVector.getElement( i ) << std::endl; + } + */ + + + auto multiplyHost = [&]() { + hostVector *= 0.5; + }; + auto multiplyCuda = [&]() { + deviceVector *= 0.5; + }; +#ifdef HAVE_CUDA + auto multiplyCublas = [&]() { + const Real alpha = 0.5; + cublasGscal( cublasHandle, size, + &alpha, + deviceVector.getData(), 1 ); + }; +#endif + benchmark.setOperation( "scalar multiplication", 2 * datasetSize ); + benchmark.time( reset1, "CPU", multiplyHost ); +#ifdef HAVE_CUDA + benchmark.time( reset1, "GPU", multiplyCuda ); + benchmark.time( reset1, "cuBLAS", multiplyCublas ); +#endif + + + auto addVectorHost = [&]() { + hostVector.addVector( hostVector2 ); + }; + auto addVectorCuda = [&]() { + deviceVector.addVector( deviceVector2 ); + }; +#ifdef HAVE_CUDA + auto addVectorCublas = [&]() { + const Real alpha = 1.0; + cublasGaxpy( cublasHandle, size, + &alpha, + deviceVector2.getData(), 1, + deviceVector.getData(), 1 ); + }; +#endif + benchmark.setOperation( "vector addition", 3 * datasetSize ); + benchmark.time( reset1, "CPU", addVectorHost ); +#ifdef HAVE_CUDA + benchmark.time( reset1, "GPU", addVectorCuda ); + benchmark.time( reset1, "cuBLAS", addVectorCublas ); +#endif + + +#ifdef HAVE_CUDA + cublasDestroy( cublasHandle ); +#endif + + return true; +} + +} // namespace Benchmarks +} // namespace TNL diff --git a/src/Benchmarks/Benchmarks.h b/src/Benchmarks/Benchmarks.h new file mode 100644 index 0000000000000000000000000000000000000000..60decddc8c04c9422451b1581f8db044655a93ba --- /dev/null +++ b/src/Benchmarks/Benchmarks.h @@ -0,0 +1,456 @@ +/*************************************************************************** + benchmarks.h - description + ------------------- + begin : Dec 30, 2015 + copyright : (C) 2015 by Tomas Oberhuber et al. + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Jakub Klinkovsky + +#pragma once + +#include <iostream> +#include <iomanip> +#include <map> +#include <vector> + +#include <TNL/Timer.h> +#include <TNL/String.h> +#include <TNL/Solvers/IterativeSolverMonitor.h> + +namespace TNL { +namespace Benchmarks { + +const double oneGB = 1024.0 * 1024.0 * 1024.0; + +template< typename ComputeFunction, + typename ResetFunction, + typename Monitor = TNL::Solvers::IterativeSolverMonitor< double, int > > +double +timeFunction( ComputeFunction compute, + ResetFunction reset, + int loops, + Monitor && monitor = Monitor() ) +{ + // the timer is constructed zero-initialized and stopped + Timer timer; + + // set timer to the monitor + monitor.setTimer( timer ); + + // warm up + reset(); + compute(); + + for(int i = 0; i < loops; ++i) { + // abuse the monitor's "time" for loops + monitor.setTime( i + 1 ); + + reset(); + + // Explicit synchronization of the CUDA device + // TODO: not necessary for host computations +#ifdef HAVE_CUDA + cudaDeviceSynchronize(); +#endif + timer.start(); + compute(); +#ifdef HAVE_CUDA + cudaDeviceSynchronize(); +#endif + timer.stop(); + } + + return timer.getRealTime(); +} + + +class Logging +{ +public: + using MetadataElement = std::pair< const char*, String >; + using MetadataMap = std::map< const char*, String >; + using MetadataColumns = std::vector<MetadataElement>; + + using HeaderElements = std::initializer_list< String >; + using RowElements = std::initializer_list< double >; + + Logging( bool verbose = true ) + : verbose(verbose) + {} + + void + writeTitle( const String & title ) + { + if( verbose ) + std::cout << std::endl << "== " << title << " ==" << std::endl << std::endl; + log << ": title = " << title << std::endl; + } + + void + writeMetadata( const MetadataMap & metadata ) + { + if( verbose ) + std::cout << "properties:" << std::endl; + + for( auto & it : metadata ) { + if( verbose ) + std::cout << " " << it.first << " = " << it.second << std::endl; + log << ": " << it.first << " = " << it.second << std::endl; + } + if( verbose ) + std::cout << std::endl; + } + + void + writeTableHeader( const String & spanningElement, + const HeaderElements & subElements ) + { + using namespace std; + + if( verbose && header_changed ) { + for( auto & it : metadataColumns ) { + std::cout << std::setw( 20 ) << it.first; + } + + // spanning element is printed as usual column to stdout, + // but is excluded from header + std::cout << std::setw( 15 ) << ""; + + for( auto & it : subElements ) { + std::cout << std::setw( 15 ) << it; + } + std::cout << std::endl; + + header_changed = false; + } + + // initial indent string + header_indent = "!"; + log << std::endl; + for( auto & it : metadataColumns ) { + log << header_indent << " " << it.first << std::endl; + } + + // dump stacked spanning columns + if( horizontalGroups.size() > 0 ) + while( horizontalGroups.back().second <= 0 ) { + horizontalGroups.pop_back(); + header_indent.pop_back(); + } + for( size_t i = 0; i < horizontalGroups.size(); i++ ) { + if( horizontalGroups[ i ].second > 0 ) { + log << header_indent << " " << horizontalGroups[ i ].first << std::endl; + header_indent += "!"; + } + } + + log << header_indent << " " << spanningElement << std::endl; + for( auto & it : subElements ) { + log << header_indent << "! " << it << std::endl; + } + + if( horizontalGroups.size() > 0 ) { + horizontalGroups.back().second--; + header_indent.pop_back(); + } + } + + void + writeTableRow( const String & spanningElement, + const RowElements & subElements ) + { + using namespace std; + + if( verbose ) { + for( auto & it : metadataColumns ) { + std::cout << std::setw( 20 ) << it.second; + } + // spanning element is printed as usual column to stdout + std::cout << std::setw( 15 ) << spanningElement; + for( auto & it : subElements ) { + std::cout << std::setw( 15 ); + if( it != 0.0 )std::cout << it; + else std::cout << "N/A"; + } + std::cout << std::endl; + } + + // only when changed (the header has been already adjusted) + // print each element on separate line + for( auto & it : metadataColumns ) { + log << it.second << std::endl; + } + + // benchmark data are indented + const String indent = " "; + for( auto & it : subElements ) { + if( it != 0.0 ) log << indent << it << std::endl; + else log << indent << "N/A" << std::endl; + } + } + + void + writeErrorMessage( const char* msg, + int colspan = 1 ) + { + // initial indent string + header_indent = "!"; + log << std::endl; + for( auto & it : metadataColumns ) { + log << header_indent << " " << it.first << std::endl; + } + + // make sure there is a header column for the message + if( horizontalGroups.size() == 0 ) + horizontalGroups.push_back( {"", 1} ); + + // dump stacked spanning columns + while( horizontalGroups.back().second <= 0 ) { + horizontalGroups.pop_back(); + header_indent.pop_back(); + } + for( size_t i = 0; i < horizontalGroups.size(); i++ ) { + if( horizontalGroups[ i ].second > 0 ) { + log << header_indent << " " << horizontalGroups[ i ].first << std::endl; + header_indent += "!"; + } + } + if( horizontalGroups.size() > 0 ) { + horizontalGroups.back().second -= colspan; + header_indent.pop_back(); + } + + // only when changed (the header has been already adjusted) + // print each element on separate line + for( auto & it : metadataColumns ) { + log << it.second << std::endl; + } + log << msg << std::endl; + } + + void + closeTable() + { + log << std::endl; + header_indent = body_indent = ""; + header_changed = true; + horizontalGroups.clear(); + } + + bool save( std::ostream & logFile ) + { + closeTable(); + logFile << log.str(); + if( logFile.good() ) { + log.str() = ""; + return true; + } + return false; + } + +protected: + + // manual double -> String conversion with fixed precision + static String + _to_string( double num, int precision = 0, bool fixed = false ) + { + std::stringstream str; + if( fixed ) + str << std::fixed; + if( precision ) + str << std::setprecision( precision ); + str << num; + return String( str.str().data() ); + } + + std::stringstream log; + std::string header_indent; + std::string body_indent; + + bool verbose; + MetadataColumns metadataColumns; + bool header_changed = true; + std::vector< std::pair< String, int > > horizontalGroups; +}; + + +class Benchmark +: protected Logging +{ +public: + using Logging::MetadataElement; + using Logging::MetadataMap; + using Logging::MetadataColumns; + + Benchmark( int loops = 10, + bool verbose = true ) + : Logging(verbose), loops(loops) + {} + + // TODO: ensure that this is not called in the middle of the benchmark + // (or just remove it completely?) + void + setLoops( int loops ) + { + this->loops = loops; + } + + // Marks the start of a new benchmark + void + newBenchmark( const String & title ) + { + closeTable(); + writeTitle( title ); + monitor.setStage( title.getString() ); + } + + // Marks the start of a new benchmark (with custom metadata) + void + newBenchmark( const String & title, + MetadataMap metadata ) + { + closeTable(); + writeTitle( title ); + monitor.setStage( title.getString() ); + // add loops to metadata + metadata["loops"] = String(loops); + writeMetadata( metadata ); + } + + // Sets metadata columns -- values used for all subsequent rows until + // the next call to this function. + void + setMetadataColumns( const MetadataColumns & metadata ) + { + if( metadataColumns != metadata ) + header_changed = true; + metadataColumns = metadata; + } + + // TODO: maybe should be renamed to createVerticalGroup and ensured that vertical and horizontal groups are not used within the same "Benchmark" + // Sets current operation -- operations expand the table vertically + // - baseTime should be reset to 0.0 for most operations, but sometimes + // it is useful to override it + // - Order of operations inside a "Benchmark" does not matter, rows can be + // easily sorted while converting to HTML.) + void + setOperation( const String & operation, + const double datasetSize = 0.0, // in GB + const double baseTime = 0.0 ) + { + if( metadataColumns.size() > 0 && String(metadataColumns[ 0 ].first) == "operation" ) { + metadataColumns[ 0 ].second = operation; + } + else { + metadataColumns.insert( metadataColumns.begin(), {"operation", operation} ); + } + setOperation( datasetSize, baseTime ); + header_changed = true; + } + + void + setOperation( const double datasetSize = 0.0, + const double baseTime = 0.0 ) + { + this->datasetSize = datasetSize; + this->baseTime = baseTime; + } + + // Creates new horizontal groups inside a benchmark -- increases the number + // of columns in the "Benchmark", implies column spanning. + // (Useful e.g. for SpMV formats, different configurations etc.) + void + createHorizontalGroup( const String & name, + int subcolumns ) + { + if( horizontalGroups.size() == 0 ) { + horizontalGroups.push_back( {name, subcolumns} ); + } + else { + auto & last = horizontalGroups.back(); + if( last.first != name && last.second > 0 ) { + horizontalGroups.push_back( {name, subcolumns} ); + } + else { + last.first = name; + last.second = subcolumns; + } + } + } + + // Times a single ComputeFunction. Subsequent calls implicitly split + // the current "horizontal group" into sub-columns identified by + // "performer", which are further split into "bandwidth", "time" and + // "speedup" columns. + // TODO: allow custom columns bound to lambda functions (e.g. for Gflops calculation) + // Also terminates the recursion of the following variadic template. + template< typename ResetFunction, + typename ComputeFunction > + double + time( ResetFunction reset, + const String & performer, + ComputeFunction & compute ) + { + double time; + if( verbose ) { + // run the monitor main loop + Solvers::SolverMonitorThread monitor_thread( monitor ); + time = timeFunction( compute, reset, loops, monitor ); + } + else { + time = timeFunction( compute, reset, loops, monitor ); + } + + const double bandwidth = datasetSize / time; + const double speedup = this->baseTime / time; + if( this->baseTime == 0.0 ) + this->baseTime = time; + + writeTableHeader( performer, HeaderElements({"bandwidth", "time", "speedup"}) ); + writeTableRow( performer, RowElements({ bandwidth, time, speedup }) ); + + return this->baseTime; + } + + // Recursive template function to deal with multiple computations with the + // same reset function. + template< typename ResetFunction, + typename ComputeFunction, + typename... NextComputations > + inline double + time( ResetFunction reset, + const String & performer, + ComputeFunction & compute, + NextComputations & ... nextComputations ) + { + time( reset, performer, compute ); + time( reset, nextComputations... ); + return this->baseTime; + } + + // Adds an error message to the log. Should be called in places where the + // "time" method could not be called (e.g. due to failed allocation). + void + addErrorMessage( const char* msg, + int numberOfComputations = 1 ) + { + // each computation has 3 subcolumns + const int colspan = 3 * numberOfComputations; + writeErrorMessage( msg, colspan ); + } + + using Logging::save; + +protected: + int loops; + double datasetSize = 0.0; + double baseTime = 0.0; + Solvers::IterativeSolverMonitor< double, int > monitor; +}; + +} // namespace Benchmarks +} // namespace TNL diff --git a/src/Benchmarks/CMakeLists.txt b/src/Benchmarks/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..e34ade5be305bdee495749a38db1b9af004a6a92 --- /dev/null +++ b/src/Benchmarks/CMakeLists.txt @@ -0,0 +1,10 @@ +add_subdirectory( HeatEquation ) +add_subdirectory( BLAS ) +add_subdirectory( SpMV ) +add_subdirectory( LinearSolvers ) + +set( headers + Benchmarks.h +) + +install( FILES ${headers} DESTINATION ${TNL_TARGET_INCLUDE_DIRECTORY}/Benchmarks ) diff --git a/tests/benchmarks/heat-equation-benchmark/BenchmarkLaplace.h b/src/Benchmarks/HeatEquation/BenchmarkLaplace.h similarity index 100% rename from tests/benchmarks/heat-equation-benchmark/BenchmarkLaplace.h rename to src/Benchmarks/HeatEquation/BenchmarkLaplace.h diff --git a/tests/benchmarks/heat-equation-benchmark/BenchmarkLaplace_impl.h b/src/Benchmarks/HeatEquation/BenchmarkLaplace_impl.h similarity index 100% rename from tests/benchmarks/heat-equation-benchmark/BenchmarkLaplace_impl.h rename to src/Benchmarks/HeatEquation/BenchmarkLaplace_impl.h diff --git a/tests/benchmarks/heat-equation-benchmark/CMakeLists.txt b/src/Benchmarks/HeatEquation/CMakeLists.txt similarity index 100% rename from tests/benchmarks/heat-equation-benchmark/CMakeLists.txt rename to src/Benchmarks/HeatEquation/CMakeLists.txt diff --git a/tests/benchmarks/heat-equation-benchmark/DirichletBoundaryConditions.h b/src/Benchmarks/HeatEquation/DirichletBoundaryConditions.h similarity index 100% rename from tests/benchmarks/heat-equation-benchmark/DirichletBoundaryConditions.h rename to src/Benchmarks/HeatEquation/DirichletBoundaryConditions.h diff --git a/tests/benchmarks/heat-equation-benchmark/HeatEquationBenchmarkBuildConfigTag.h b/src/Benchmarks/HeatEquation/HeatEquationBenchmarkBuildConfigTag.h similarity index 100% rename from tests/benchmarks/heat-equation-benchmark/HeatEquationBenchmarkBuildConfigTag.h rename to src/Benchmarks/HeatEquation/HeatEquationBenchmarkBuildConfigTag.h diff --git a/tests/benchmarks/heat-equation-benchmark/HeatEquationBenchmarkProblem.h b/src/Benchmarks/HeatEquation/HeatEquationBenchmarkProblem.h similarity index 100% rename from tests/benchmarks/heat-equation-benchmark/HeatEquationBenchmarkProblem.h rename to src/Benchmarks/HeatEquation/HeatEquationBenchmarkProblem.h diff --git a/tests/benchmarks/heat-equation-benchmark/HeatEquationBenchmarkProblem_impl.h b/src/Benchmarks/HeatEquation/HeatEquationBenchmarkProblem_impl.h similarity index 100% rename from tests/benchmarks/heat-equation-benchmark/HeatEquationBenchmarkProblem_impl.h rename to src/Benchmarks/HeatEquation/HeatEquationBenchmarkProblem_impl.h diff --git a/tests/benchmarks/heat-equation-benchmark/HeatEquationBenchmarkRhs.h b/src/Benchmarks/HeatEquation/HeatEquationBenchmarkRhs.h similarity index 100% rename from tests/benchmarks/heat-equation-benchmark/HeatEquationBenchmarkRhs.h rename to src/Benchmarks/HeatEquation/HeatEquationBenchmarkRhs.h diff --git a/tests/benchmarks/heat-equation-benchmark/TestGridEntity.h b/src/Benchmarks/HeatEquation/TestGridEntity.h similarity index 100% rename from tests/benchmarks/heat-equation-benchmark/TestGridEntity.h rename to src/Benchmarks/HeatEquation/TestGridEntity.h diff --git a/tests/benchmarks/heat-equation-benchmark/Tuning/ExplicitUpdater.h b/src/Benchmarks/HeatEquation/Tuning/ExplicitUpdater.h similarity index 100% rename from tests/benchmarks/heat-equation-benchmark/Tuning/ExplicitUpdater.h rename to src/Benchmarks/HeatEquation/Tuning/ExplicitUpdater.h diff --git a/tests/benchmarks/heat-equation-benchmark/Tuning/GridTraverser.h b/src/Benchmarks/HeatEquation/Tuning/GridTraverser.h similarity index 100% rename from tests/benchmarks/heat-equation-benchmark/Tuning/GridTraverser.h rename to src/Benchmarks/HeatEquation/Tuning/GridTraverser.h diff --git a/tests/benchmarks/heat-equation-benchmark/Tuning/GridTraverser_impl.h b/src/Benchmarks/HeatEquation/Tuning/GridTraverser_impl.h similarity index 100% rename from tests/benchmarks/heat-equation-benchmark/Tuning/GridTraverser_impl.h rename to src/Benchmarks/HeatEquation/Tuning/GridTraverser_impl.h diff --git a/tests/benchmarks/heat-equation-benchmark/Tuning/SimpleCell.h b/src/Benchmarks/HeatEquation/Tuning/SimpleCell.h similarity index 100% rename from tests/benchmarks/heat-equation-benchmark/Tuning/SimpleCell.h rename to src/Benchmarks/HeatEquation/Tuning/SimpleCell.h diff --git a/tests/benchmarks/heat-equation-benchmark/Tuning/Traverser_Grid2D.h b/src/Benchmarks/HeatEquation/Tuning/Traverser_Grid2D.h similarity index 100% rename from tests/benchmarks/heat-equation-benchmark/Tuning/Traverser_Grid2D.h rename to src/Benchmarks/HeatEquation/Tuning/Traverser_Grid2D.h diff --git a/tests/benchmarks/heat-equation-benchmark/Tuning/Traverser_Grid2D_impl.h b/src/Benchmarks/HeatEquation/Tuning/Traverser_Grid2D_impl.h similarity index 100% rename from tests/benchmarks/heat-equation-benchmark/Tuning/Traverser_Grid2D_impl.h rename to src/Benchmarks/HeatEquation/Tuning/Traverser_Grid2D_impl.h diff --git a/tests/benchmarks/heat-equation-benchmark/Tuning/tunning.h b/src/Benchmarks/HeatEquation/Tuning/tunning.h similarity index 100% rename from tests/benchmarks/heat-equation-benchmark/Tuning/tunning.h rename to src/Benchmarks/HeatEquation/Tuning/tunning.h diff --git a/tests/benchmarks/heat-equation-benchmark/pure-c-rhs.h b/src/Benchmarks/HeatEquation/pure-c-rhs.h similarity index 100% rename from tests/benchmarks/heat-equation-benchmark/pure-c-rhs.h rename to src/Benchmarks/HeatEquation/pure-c-rhs.h diff --git a/tests/benchmarks/heat-equation-benchmark/run-HeatEquationBenchmark b/src/Benchmarks/HeatEquation/run-HeatEquationBenchmark similarity index 100% rename from tests/benchmarks/heat-equation-benchmark/run-HeatEquationBenchmark rename to src/Benchmarks/HeatEquation/run-HeatEquationBenchmark diff --git a/tests/benchmarks/heat-equation-benchmark/tnl-benchmark-heat-equation.cpp b/src/Benchmarks/HeatEquation/tnl-benchmark-heat-equation.cpp similarity index 100% rename from tests/benchmarks/heat-equation-benchmark/tnl-benchmark-heat-equation.cpp rename to src/Benchmarks/HeatEquation/tnl-benchmark-heat-equation.cpp diff --git a/tests/benchmarks/heat-equation-benchmark/tnl-benchmark-heat-equation.cu b/src/Benchmarks/HeatEquation/tnl-benchmark-heat-equation.cu similarity index 100% rename from tests/benchmarks/heat-equation-benchmark/tnl-benchmark-heat-equation.cu rename to src/Benchmarks/HeatEquation/tnl-benchmark-heat-equation.cu diff --git a/tests/benchmarks/heat-equation-benchmark/tnl-benchmark-heat-equation.h b/src/Benchmarks/HeatEquation/tnl-benchmark-heat-equation.h similarity index 100% rename from tests/benchmarks/heat-equation-benchmark/tnl-benchmark-heat-equation.h rename to src/Benchmarks/HeatEquation/tnl-benchmark-heat-equation.h diff --git a/tests/benchmarks/heat-equation-benchmark/tnl-benchmark-simple-heat-equation-bug.cu b/src/Benchmarks/HeatEquation/tnl-benchmark-simple-heat-equation-bug.cu similarity index 100% rename from tests/benchmarks/heat-equation-benchmark/tnl-benchmark-simple-heat-equation-bug.cu rename to src/Benchmarks/HeatEquation/tnl-benchmark-simple-heat-equation-bug.cu diff --git a/tests/benchmarks/heat-equation-benchmark/tnl-benchmark-simple-heat-equation-bug.h b/src/Benchmarks/HeatEquation/tnl-benchmark-simple-heat-equation-bug.h similarity index 100% rename from tests/benchmarks/heat-equation-benchmark/tnl-benchmark-simple-heat-equation-bug.h rename to src/Benchmarks/HeatEquation/tnl-benchmark-simple-heat-equation-bug.h diff --git a/tests/benchmarks/heat-equation-benchmark/tnl-benchmark-simple-heat-equation.cpp b/src/Benchmarks/HeatEquation/tnl-benchmark-simple-heat-equation.cpp similarity index 100% rename from tests/benchmarks/heat-equation-benchmark/tnl-benchmark-simple-heat-equation.cpp rename to src/Benchmarks/HeatEquation/tnl-benchmark-simple-heat-equation.cpp diff --git a/tests/benchmarks/heat-equation-benchmark/tnl-benchmark-simple-heat-equation.cu b/src/Benchmarks/HeatEquation/tnl-benchmark-simple-heat-equation.cu similarity index 100% rename from tests/benchmarks/heat-equation-benchmark/tnl-benchmark-simple-heat-equation.cu rename to src/Benchmarks/HeatEquation/tnl-benchmark-simple-heat-equation.cu diff --git a/tests/benchmarks/heat-equation-benchmark/tnl-benchmark-simple-heat-equation.h b/src/Benchmarks/HeatEquation/tnl-benchmark-simple-heat-equation.h similarity index 100% rename from tests/benchmarks/heat-equation-benchmark/tnl-benchmark-simple-heat-equation.h rename to src/Benchmarks/HeatEquation/tnl-benchmark-simple-heat-equation.h diff --git a/tests/benchmarks/heat-equation-benchmark/tnlTestGrid2D.h b/src/Benchmarks/HeatEquation/tnlTestGrid2D.h similarity index 100% rename from tests/benchmarks/heat-equation-benchmark/tnlTestGrid2D.h rename to src/Benchmarks/HeatEquation/tnlTestGrid2D.h diff --git a/tests/benchmarks/heat-equation-benchmark/tnlTestGridEntity.h b/src/Benchmarks/HeatEquation/tnlTestGridEntity.h similarity index 100% rename from tests/benchmarks/heat-equation-benchmark/tnlTestGridEntity.h rename to src/Benchmarks/HeatEquation/tnlTestGridEntity.h diff --git a/tests/benchmarks/heat-equation-benchmark/tnlTestNeighbourGridEntitiesStorage.h b/src/Benchmarks/HeatEquation/tnlTestNeighbourGridEntitiesStorage.h similarity index 100% rename from tests/benchmarks/heat-equation-benchmark/tnlTestNeighbourGridEntitiesStorage.h rename to src/Benchmarks/HeatEquation/tnlTestNeighbourGridEntitiesStorage.h diff --git a/tests/benchmarks/heat-equation-benchmark/tnlTestNeighbourGridEntityGetter.h b/src/Benchmarks/HeatEquation/tnlTestNeighbourGridEntityGetter.h similarity index 100% rename from tests/benchmarks/heat-equation-benchmark/tnlTestNeighbourGridEntityGetter.h rename to src/Benchmarks/HeatEquation/tnlTestNeighbourGridEntityGetter.h diff --git a/tests/benchmarks/heat-equation-benchmark/tnlTestNeighbourGridEntityGetter2D_impl.h b/src/Benchmarks/HeatEquation/tnlTestNeighbourGridEntityGetter2D_impl.h similarity index 100% rename from tests/benchmarks/heat-equation-benchmark/tnlTestNeighbourGridEntityGetter2D_impl.h rename to src/Benchmarks/HeatEquation/tnlTestNeighbourGridEntityGetter2D_impl.h diff --git a/src/Benchmarks/LinearSolvers/CMakeLists.txt b/src/Benchmarks/LinearSolvers/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..1a95c92f791c7d7a0176415fd4968d8aa6bb8982 --- /dev/null +++ b/src/Benchmarks/LinearSolvers/CMakeLists.txt @@ -0,0 +1,9 @@ +if( BUILD_CUDA ) + CUDA_ADD_EXECUTABLE( tnl-benchmark-linear-solvers tnl-benchmark-linear-solvers.cu ) + TARGET_LINK_LIBRARIES( tnl-benchmark-linear-solvers tnl ) +else() + ADD_EXECUTABLE( tnl-benchmark-linear-solvers tnl-benchmark-linear-solvers.cpp ) + TARGET_LINK_LIBRARIES( tnl-benchmark-linear-solvers tnl ) +endif() + +install( TARGETS tnl-benchmark-linear-solvers RUNTIME DESTINATION bin ) diff --git a/tests/benchmarks/tnl-benchmark-linear-solvers.cpp b/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.cpp similarity index 100% rename from tests/benchmarks/tnl-benchmark-linear-solvers.cpp rename to src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.cpp diff --git a/tests/benchmarks/tnl-benchmark-linear-solvers.cu b/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.cu similarity index 100% rename from tests/benchmarks/tnl-benchmark-linear-solvers.cu rename to src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.cu diff --git a/tests/benchmarks/tnl-benchmark-linear-solvers.h b/src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h similarity index 100% rename from tests/benchmarks/tnl-benchmark-linear-solvers.h rename to src/Benchmarks/LinearSolvers/tnl-benchmark-linear-solvers.h diff --git a/src/Benchmarks/SpMV/CMakeLists.txt b/src/Benchmarks/SpMV/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..a73e6738cdb8bb4effd889960f3ffcd5b7255b90 --- /dev/null +++ b/src/Benchmarks/SpMV/CMakeLists.txt @@ -0,0 +1,9 @@ +if( BUILD_CUDA ) + CUDA_ADD_EXECUTABLE( tnl-benchmark-spmv tnl-benchmark-spmv.cu ) + TARGET_LINK_LIBRARIES( tnl-benchmark-spmv tnl ${CUDA_cusparse_LIBRARY} ) +else() + ADD_EXECUTABLE( tnl-benchmark-spmv tnl-benchmark-spmv.cpp ) + TARGET_LINK_LIBRARIES( tnl-benchmark-spmv tnl ) +endif() + +install( TARGETS tnl-benchmark-spmv RUNTIME DESTINATION bin ) diff --git a/tests/benchmarks/tnl-benchmark-spmv.cpp b/src/Benchmarks/SpMV/tnl-benchmark-spmv.cpp similarity index 100% rename from tests/benchmarks/tnl-benchmark-spmv.cpp rename to src/Benchmarks/SpMV/tnl-benchmark-spmv.cpp diff --git a/tests/benchmarks/tnl-benchmark-spmv.cu b/src/Benchmarks/SpMV/tnl-benchmark-spmv.cu similarity index 100% rename from tests/benchmarks/tnl-benchmark-spmv.cu rename to src/Benchmarks/SpMV/tnl-benchmark-spmv.cu diff --git a/tests/benchmarks/tnl-benchmark-spmv.h b/src/Benchmarks/SpMV/tnl-benchmark-spmv.h similarity index 100% rename from tests/benchmarks/tnl-benchmark-spmv.h rename to src/Benchmarks/SpMV/tnl-benchmark-spmv.h diff --git a/tests/benchmarks/tnlCusparseCSRMatrix.h b/src/Benchmarks/SpMV/tnlCusparseCSRMatrix.h similarity index 100% rename from tests/benchmarks/tnlCusparseCSRMatrix.h rename to src/Benchmarks/SpMV/tnlCusparseCSRMatrix.h diff --git a/tests/benchmarks/share/CMakeLists.txt b/src/Benchmarks/scripts/CMakeLists.txt similarity index 100% rename from tests/benchmarks/share/CMakeLists.txt rename to src/Benchmarks/scripts/CMakeLists.txt diff --git a/tests/benchmarks/share/convert-matrices b/src/Benchmarks/scripts/convert-matrices similarity index 100% rename from tests/benchmarks/share/convert-matrices rename to src/Benchmarks/scripts/convert-matrices diff --git a/tests/benchmarks/share/cuda-profiler.conf b/src/Benchmarks/scripts/cuda-profiler.conf similarity index 100% rename from tests/benchmarks/share/cuda-profiler.conf rename to src/Benchmarks/scripts/cuda-profiler.conf diff --git a/tests/benchmarks/share/draw-matrices b/src/Benchmarks/scripts/draw-matrices similarity index 100% rename from tests/benchmarks/share/draw-matrices rename to src/Benchmarks/scripts/draw-matrices diff --git a/tests/benchmarks/share/florida-matrix-market b/src/Benchmarks/scripts/florida-matrix-market similarity index 100% rename from tests/benchmarks/share/florida-matrix-market rename to src/Benchmarks/scripts/florida-matrix-market diff --git a/tests/benchmarks/share/get-matrices b/src/Benchmarks/scripts/get-matrices similarity index 100% rename from tests/benchmarks/share/get-matrices rename to src/Benchmarks/scripts/get-matrices diff --git a/tests/benchmarks/share/matrix-market b/src/Benchmarks/scripts/matrix-market similarity index 100% rename from tests/benchmarks/share/matrix-market rename to src/Benchmarks/scripts/matrix-market diff --git a/tests/benchmarks/share/process-cuda-profile.pl b/src/Benchmarks/scripts/process-cuda-profile.pl similarity index 100% rename from tests/benchmarks/share/process-cuda-profile.pl rename to src/Benchmarks/scripts/process-cuda-profile.pl diff --git a/tests/benchmarks/share/run-matrix-solvers-benchmark b/src/Benchmarks/scripts/run-matrix-solvers-benchmark similarity index 100% rename from tests/benchmarks/share/run-matrix-solvers-benchmark rename to src/Benchmarks/scripts/run-matrix-solvers-benchmark diff --git a/tests/benchmarks/share/run-tnl-benchmark-linear-solvers b/src/Benchmarks/scripts/run-tnl-benchmark-linear-solvers similarity index 100% rename from tests/benchmarks/share/run-tnl-benchmark-linear-solvers rename to src/Benchmarks/scripts/run-tnl-benchmark-linear-solvers diff --git a/tests/benchmarks/share/run-tnl-benchmark-spmv b/src/Benchmarks/scripts/run-tnl-benchmark-spmv similarity index 100% rename from tests/benchmarks/share/run-tnl-benchmark-spmv rename to src/Benchmarks/scripts/run-tnl-benchmark-spmv diff --git a/tests/benchmarks/share/tnl-run-heat-equation-benchmark b/src/Benchmarks/scripts/tnl-run-heat-equation-benchmark similarity index 100% rename from tests/benchmarks/share/tnl-run-heat-equation-benchmark rename to src/Benchmarks/scripts/tnl-run-heat-equation-benchmark diff --git a/tests/benchmarks/share/tnl-run-spmv-benchmark b/src/Benchmarks/scripts/tnl-run-spmv-benchmark similarity index 100% rename from tests/benchmarks/share/tnl-run-spmv-benchmark rename to src/Benchmarks/scripts/tnl-run-spmv-benchmark diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index a2086691dc4ccbe4c263fe5f11b40e361d957d73..ee275cafcc5f5f2a506fb1d2a79037f877963173 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,5 +1,16 @@ ADD_SUBDIRECTORY( TNL ) +# Note that it is important to start building examples as soon as possible, +# because they take the longest time and other stuff can be pipelined before +# they are finished (at least with Ninja). +if( ${WITH_EXAMPLES} ) + add_subdirectory( Examples ) +endif() + +if( ${WITH_BENCHMARKS} ) + ADD_SUBDIRECTORY( Benchmarks ) +endif() + if( ${WITH_TOOLS} ) ADD_SUBDIRECTORY( Tools ) endif() diff --git a/examples/CMakeLists.txt b/src/Examples/CMakeLists.txt similarity index 100% rename from examples/CMakeLists.txt rename to src/Examples/CMakeLists.txt diff --git a/examples/heat-equation/CMakeLists.txt b/src/Examples/heat-equation/CMakeLists.txt similarity index 100% rename from examples/heat-equation/CMakeLists.txt rename to src/Examples/heat-equation/CMakeLists.txt diff --git a/examples/heat-equation/HeatEquationBuildConfigTag.h b/src/Examples/heat-equation/HeatEquationBuildConfigTag.h similarity index 100% rename from examples/heat-equation/HeatEquationBuildConfigTag.h rename to src/Examples/heat-equation/HeatEquationBuildConfigTag.h diff --git a/examples/heat-equation/tnl-heat-equation-eoc.cpp b/src/Examples/heat-equation/tnl-heat-equation-eoc.cpp similarity index 100% rename from examples/heat-equation/tnl-heat-equation-eoc.cpp rename to src/Examples/heat-equation/tnl-heat-equation-eoc.cpp diff --git a/examples/heat-equation/tnl-heat-equation-eoc.cu b/src/Examples/heat-equation/tnl-heat-equation-eoc.cu similarity index 100% rename from examples/heat-equation/tnl-heat-equation-eoc.cu rename to src/Examples/heat-equation/tnl-heat-equation-eoc.cu diff --git a/examples/heat-equation/tnl-heat-equation-eoc.h b/src/Examples/heat-equation/tnl-heat-equation-eoc.h similarity index 100% rename from examples/heat-equation/tnl-heat-equation-eoc.h rename to src/Examples/heat-equation/tnl-heat-equation-eoc.h diff --git a/examples/heat-equation/tnl-heat-equation.cpp b/src/Examples/heat-equation/tnl-heat-equation.cpp similarity index 100% rename from examples/heat-equation/tnl-heat-equation.cpp rename to src/Examples/heat-equation/tnl-heat-equation.cpp diff --git a/examples/heat-equation/tnl-heat-equation.cu b/src/Examples/heat-equation/tnl-heat-equation.cu similarity index 100% rename from examples/heat-equation/tnl-heat-equation.cu rename to src/Examples/heat-equation/tnl-heat-equation.cu diff --git a/examples/heat-equation/tnl-heat-equation.h b/src/Examples/heat-equation/tnl-heat-equation.h similarity index 100% rename from examples/heat-equation/tnl-heat-equation.h rename to src/Examples/heat-equation/tnl-heat-equation.h diff --git a/examples/heat-equation/tnl-run-heat-equation b/src/Examples/heat-equation/tnl-run-heat-equation similarity index 100% rename from examples/heat-equation/tnl-run-heat-equation rename to src/Examples/heat-equation/tnl-run-heat-equation diff --git a/examples/heat-equation/tnl-run-heat-equation-eoc-test b/src/Examples/heat-equation/tnl-run-heat-equation-eoc-test similarity index 100% rename from examples/heat-equation/tnl-run-heat-equation-eoc-test rename to src/Examples/heat-equation/tnl-run-heat-equation-eoc-test diff --git a/examples/inviscid-flow/CMakeLists.txt b/src/Examples/inviscid-flow/CMakeLists.txt similarity index 100% rename from examples/inviscid-flow/CMakeLists.txt rename to src/Examples/inviscid-flow/CMakeLists.txt diff --git a/examples/inviscid-flow/CompressibleConservativeVariables.h b/src/Examples/inviscid-flow/CompressibleConservativeVariables.h similarity index 100% rename from examples/inviscid-flow/CompressibleConservativeVariables.h rename to src/Examples/inviscid-flow/CompressibleConservativeVariables.h diff --git a/examples/inviscid-flow/LaxFridrichs.h b/src/Examples/inviscid-flow/LaxFridrichs.h similarity index 100% rename from examples/inviscid-flow/LaxFridrichs.h rename to src/Examples/inviscid-flow/LaxFridrichs.h diff --git a/examples/inviscid-flow/LaxFridrichsContinuity.h b/src/Examples/inviscid-flow/LaxFridrichsContinuity.h similarity index 100% rename from examples/inviscid-flow/LaxFridrichsContinuity.h rename to src/Examples/inviscid-flow/LaxFridrichsContinuity.h diff --git a/examples/inviscid-flow/LaxFridrichsEnergy.h b/src/Examples/inviscid-flow/LaxFridrichsEnergy.h similarity index 100% rename from examples/inviscid-flow/LaxFridrichsEnergy.h rename to src/Examples/inviscid-flow/LaxFridrichsEnergy.h diff --git a/examples/inviscid-flow/LaxFridrichsMomentumBase.h b/src/Examples/inviscid-flow/LaxFridrichsMomentumBase.h similarity index 100% rename from examples/inviscid-flow/LaxFridrichsMomentumBase.h rename to src/Examples/inviscid-flow/LaxFridrichsMomentumBase.h diff --git a/examples/inviscid-flow/LaxFridrichsMomentumX.h b/src/Examples/inviscid-flow/LaxFridrichsMomentumX.h similarity index 100% rename from examples/inviscid-flow/LaxFridrichsMomentumX.h rename to src/Examples/inviscid-flow/LaxFridrichsMomentumX.h diff --git a/examples/inviscid-flow/LaxFridrichsMomentumY.h b/src/Examples/inviscid-flow/LaxFridrichsMomentumY.h similarity index 100% rename from examples/inviscid-flow/LaxFridrichsMomentumY.h rename to src/Examples/inviscid-flow/LaxFridrichsMomentumY.h diff --git a/examples/inviscid-flow/LaxFridrichsMomentumZ.h b/src/Examples/inviscid-flow/LaxFridrichsMomentumZ.h similarity index 100% rename from examples/inviscid-flow/LaxFridrichsMomentumZ.h rename to src/Examples/inviscid-flow/LaxFridrichsMomentumZ.h diff --git a/examples/inviscid-flow/PhysicalVariablesGetter.h b/src/Examples/inviscid-flow/PhysicalVariablesGetter.h similarity index 100% rename from examples/inviscid-flow/PhysicalVariablesGetter.h rename to src/Examples/inviscid-flow/PhysicalVariablesGetter.h diff --git a/examples/inviscid-flow/RiemannProblemInitialCondition.h b/src/Examples/inviscid-flow/RiemannProblemInitialCondition.h similarity index 100% rename from examples/inviscid-flow/RiemannProblemInitialCondition.h rename to src/Examples/inviscid-flow/RiemannProblemInitialCondition.h diff --git a/examples/inviscid-flow/euler.cpp b/src/Examples/inviscid-flow/euler.cpp similarity index 100% rename from examples/inviscid-flow/euler.cpp rename to src/Examples/inviscid-flow/euler.cpp diff --git a/examples/inviscid-flow/euler.cu b/src/Examples/inviscid-flow/euler.cu similarity index 100% rename from examples/inviscid-flow/euler.cu rename to src/Examples/inviscid-flow/euler.cu diff --git a/examples/inviscid-flow/euler.h b/src/Examples/inviscid-flow/euler.h similarity index 100% rename from examples/inviscid-flow/euler.h rename to src/Examples/inviscid-flow/euler.h diff --git a/examples/inviscid-flow/eulerBuildConfigTag.h b/src/Examples/inviscid-flow/eulerBuildConfigTag.h similarity index 100% rename from examples/inviscid-flow/eulerBuildConfigTag.h rename to src/Examples/inviscid-flow/eulerBuildConfigTag.h diff --git a/examples/inviscid-flow/eulerProblem.h b/src/Examples/inviscid-flow/eulerProblem.h similarity index 100% rename from examples/inviscid-flow/eulerProblem.h rename to src/Examples/inviscid-flow/eulerProblem.h diff --git a/examples/inviscid-flow/eulerProblem_impl.h b/src/Examples/inviscid-flow/eulerProblem_impl.h similarity index 100% rename from examples/inviscid-flow/eulerProblem_impl.h rename to src/Examples/inviscid-flow/eulerProblem_impl.h diff --git a/examples/inviscid-flow/eulerRhs.h b/src/Examples/inviscid-flow/eulerRhs.h similarity index 100% rename from examples/inviscid-flow/eulerRhs.h rename to src/Examples/inviscid-flow/eulerRhs.h diff --git a/examples/inviscid-flow/run-euler b/src/Examples/inviscid-flow/run-euler similarity index 100% rename from examples/inviscid-flow/run-euler rename to src/Examples/inviscid-flow/run-euler diff --git a/examples/mean-curvature-flow/CMakeLists.txt b/src/Examples/mean-curvature-flow/CMakeLists.txt similarity index 100% rename from examples/mean-curvature-flow/CMakeLists.txt rename to src/Examples/mean-curvature-flow/CMakeLists.txt diff --git a/examples/mean-curvature-flow/tnl-mean-curvature-flow-eoc.cpp b/src/Examples/mean-curvature-flow/tnl-mean-curvature-flow-eoc.cpp similarity index 100% rename from examples/mean-curvature-flow/tnl-mean-curvature-flow-eoc.cpp rename to src/Examples/mean-curvature-flow/tnl-mean-curvature-flow-eoc.cpp diff --git a/examples/mean-curvature-flow/tnl-mean-curvature-flow-eoc.cu b/src/Examples/mean-curvature-flow/tnl-mean-curvature-flow-eoc.cu similarity index 100% rename from examples/mean-curvature-flow/tnl-mean-curvature-flow-eoc.cu rename to src/Examples/mean-curvature-flow/tnl-mean-curvature-flow-eoc.cu diff --git a/examples/mean-curvature-flow/tnl-mean-curvature-flow-eoc.h b/src/Examples/mean-curvature-flow/tnl-mean-curvature-flow-eoc.h similarity index 100% rename from examples/mean-curvature-flow/tnl-mean-curvature-flow-eoc.h rename to src/Examples/mean-curvature-flow/tnl-mean-curvature-flow-eoc.h diff --git a/examples/mean-curvature-flow/tnl-mean-curvature-flow.cpp b/src/Examples/mean-curvature-flow/tnl-mean-curvature-flow.cpp similarity index 100% rename from examples/mean-curvature-flow/tnl-mean-curvature-flow.cpp rename to src/Examples/mean-curvature-flow/tnl-mean-curvature-flow.cpp diff --git a/examples/mean-curvature-flow/tnl-mean-curvature-flow.cu b/src/Examples/mean-curvature-flow/tnl-mean-curvature-flow.cu similarity index 100% rename from examples/mean-curvature-flow/tnl-mean-curvature-flow.cu rename to src/Examples/mean-curvature-flow/tnl-mean-curvature-flow.cu diff --git a/examples/mean-curvature-flow/tnl-mean-curvature-flow.h b/src/Examples/mean-curvature-flow/tnl-mean-curvature-flow.h similarity index 100% rename from examples/mean-curvature-flow/tnl-mean-curvature-flow.h rename to src/Examples/mean-curvature-flow/tnl-mean-curvature-flow.h diff --git a/examples/mean-curvature-flow/tnl-run-mean-curvature-flow b/src/Examples/mean-curvature-flow/tnl-run-mean-curvature-flow similarity index 100% rename from examples/mean-curvature-flow/tnl-run-mean-curvature-flow rename to src/Examples/mean-curvature-flow/tnl-run-mean-curvature-flow diff --git a/examples/mean-curvature-flow/tnl-run-mean-curvature-flow-contour-video b/src/Examples/mean-curvature-flow/tnl-run-mean-curvature-flow-contour-video similarity index 100% rename from examples/mean-curvature-flow/tnl-run-mean-curvature-flow-contour-video rename to src/Examples/mean-curvature-flow/tnl-run-mean-curvature-flow-contour-video diff --git a/examples/mean-curvature-flow/tnl-run-mean-curvature-flow-eoc-test b/src/Examples/mean-curvature-flow/tnl-run-mean-curvature-flow-eoc-test similarity index 100% rename from examples/mean-curvature-flow/tnl-run-mean-curvature-flow-eoc-test rename to src/Examples/mean-curvature-flow/tnl-run-mean-curvature-flow-eoc-test diff --git a/examples/mean-curvature-flow/tnl-run-mean-curvature-flow-videos b/src/Examples/mean-curvature-flow/tnl-run-mean-curvature-flow-videos similarity index 100% rename from examples/mean-curvature-flow/tnl-run-mean-curvature-flow-videos rename to src/Examples/mean-curvature-flow/tnl-run-mean-curvature-flow-videos diff --git a/examples/narrow-band/CMakeLists.txt b/src/Examples/narrow-band/CMakeLists.txt similarity index 100% rename from examples/narrow-band/CMakeLists.txt rename to src/Examples/narrow-band/CMakeLists.txt diff --git a/examples/narrow-band/MainBuildConfig.h b/src/Examples/narrow-band/MainBuildConfig.h similarity index 100% rename from examples/narrow-band/MainBuildConfig.h rename to src/Examples/narrow-band/MainBuildConfig.h diff --git a/examples/narrow-band/main.cpp b/src/Examples/narrow-band/main.cpp similarity index 100% rename from examples/narrow-band/main.cpp rename to src/Examples/narrow-band/main.cpp diff --git a/examples/narrow-band/main.cu b/src/Examples/narrow-band/main.cu similarity index 100% rename from examples/narrow-band/main.cu rename to src/Examples/narrow-band/main.cu diff --git a/examples/narrow-band/main.h b/src/Examples/narrow-band/main.h similarity index 100% rename from examples/narrow-band/main.h rename to src/Examples/narrow-band/main.h diff --git a/examples/narrow-band/narrowBandConfig.h b/src/Examples/narrow-band/narrowBandConfig.h similarity index 100% rename from examples/narrow-band/narrowBandConfig.h rename to src/Examples/narrow-band/narrowBandConfig.h diff --git a/examples/narrow-band/tnlNarrowBand.h b/src/Examples/narrow-band/tnlNarrowBand.h similarity index 100% rename from examples/narrow-band/tnlNarrowBand.h rename to src/Examples/narrow-band/tnlNarrowBand.h diff --git a/examples/narrow-band/tnlNarrowBand2D_CUDA_v4_impl.h b/src/Examples/narrow-band/tnlNarrowBand2D_CUDA_v4_impl.h similarity index 100% rename from examples/narrow-band/tnlNarrowBand2D_CUDA_v4_impl.h rename to src/Examples/narrow-band/tnlNarrowBand2D_CUDA_v4_impl.h diff --git a/examples/narrow-band/tnlNarrowBand2D_CUDA_v5_impl.h b/src/Examples/narrow-band/tnlNarrowBand2D_CUDA_v5_impl.h similarity index 100% rename from examples/narrow-band/tnlNarrowBand2D_CUDA_v5_impl.h rename to src/Examples/narrow-band/tnlNarrowBand2D_CUDA_v5_impl.h diff --git a/examples/narrow-band/tnlNarrowBand2D_impl.h b/src/Examples/narrow-band/tnlNarrowBand2D_impl.h similarity index 100% rename from examples/narrow-band/tnlNarrowBand2D_impl.h rename to src/Examples/narrow-band/tnlNarrowBand2D_impl.h diff --git a/examples/narrow-band/tnlNarrowBand3D_CUDA_impl.h b/src/Examples/narrow-band/tnlNarrowBand3D_CUDA_impl.h similarity index 100% rename from examples/narrow-band/tnlNarrowBand3D_CUDA_impl.h rename to src/Examples/narrow-band/tnlNarrowBand3D_CUDA_impl.h diff --git a/examples/narrow-band/tnlNarrowBand3D_impl.h b/src/Examples/narrow-band/tnlNarrowBand3D_impl.h similarity index 100% rename from examples/narrow-band/tnlNarrowBand3D_impl.h rename to src/Examples/narrow-band/tnlNarrowBand3D_impl.h diff --git a/examples/narrow-band/tnlNarrowBand_CUDA.h b/src/Examples/narrow-band/tnlNarrowBand_CUDA.h similarity index 100% rename from examples/narrow-band/tnlNarrowBand_CUDA.h rename to src/Examples/narrow-band/tnlNarrowBand_CUDA.h diff --git a/examples/navier-stokes/CMakeLists.txt b/src/Examples/navier-stokes/CMakeLists.txt similarity index 100% rename from examples/navier-stokes/CMakeLists.txt rename to src/Examples/navier-stokes/CMakeLists.txt diff --git a/examples/navier-stokes/Makefile b/src/Examples/navier-stokes/Makefile similarity index 100% rename from examples/navier-stokes/Makefile rename to src/Examples/navier-stokes/Makefile diff --git a/examples/navier-stokes/main.cpp b/src/Examples/navier-stokes/main.cpp similarity index 100% rename from examples/navier-stokes/main.cpp rename to src/Examples/navier-stokes/main.cpp diff --git a/examples/navier-stokes/make-png-from-gnuplot b/src/Examples/navier-stokes/make-png-from-gnuplot similarity index 100% rename from examples/navier-stokes/make-png-from-gnuplot rename to src/Examples/navier-stokes/make-png-from-gnuplot diff --git a/examples/navier-stokes/make-png-vectors-from-gnuplot b/src/Examples/navier-stokes/make-png-vectors-from-gnuplot similarity index 100% rename from examples/navier-stokes/make-png-vectors-from-gnuplot rename to src/Examples/navier-stokes/make-png-vectors-from-gnuplot diff --git a/examples/navier-stokes/merge-figures b/src/Examples/navier-stokes/merge-figures similarity index 100% rename from examples/navier-stokes/merge-figures rename to src/Examples/navier-stokes/merge-figures diff --git a/examples/navier-stokes/navier-stokes.cfg.desc b/src/Examples/navier-stokes/navier-stokes.cfg.desc similarity index 100% rename from examples/navier-stokes/navier-stokes.cfg.desc rename to src/Examples/navier-stokes/navier-stokes.cfg.desc diff --git a/examples/navier-stokes/navierStokesBoundaryConditions.h b/src/Examples/navier-stokes/navierStokesBoundaryConditions.h similarity index 100% rename from examples/navier-stokes/navierStokesBoundaryConditions.h rename to src/Examples/navier-stokes/navierStokesBoundaryConditions.h diff --git a/examples/navier-stokes/navierStokesBoundaryConditions_impl.h b/src/Examples/navier-stokes/navierStokesBoundaryConditions_impl.h similarity index 100% rename from examples/navier-stokes/navierStokesBoundaryConditions_impl.h rename to src/Examples/navier-stokes/navierStokesBoundaryConditions_impl.h diff --git a/examples/navier-stokes/navierStokesSetter.h b/src/Examples/navier-stokes/navierStokesSetter.h similarity index 100% rename from examples/navier-stokes/navierStokesSetter.h rename to src/Examples/navier-stokes/navierStokesSetter.h diff --git a/examples/navier-stokes/navierStokesSetter_impl.h b/src/Examples/navier-stokes/navierStokesSetter_impl.h similarity index 100% rename from examples/navier-stokes/navierStokesSetter_impl.h rename to src/Examples/navier-stokes/navierStokesSetter_impl.h diff --git a/examples/navier-stokes/navierStokesSolver.h b/src/Examples/navier-stokes/navierStokesSolver.h similarity index 100% rename from examples/navier-stokes/navierStokesSolver.h rename to src/Examples/navier-stokes/navierStokesSolver.h diff --git a/examples/navier-stokes/navierStokesSolverMonitor.h b/src/Examples/navier-stokes/navierStokesSolverMonitor.h similarity index 100% rename from examples/navier-stokes/navierStokesSolverMonitor.h rename to src/Examples/navier-stokes/navierStokesSolverMonitor.h diff --git a/examples/navier-stokes/navierStokesSolverMonitor_impl.h b/src/Examples/navier-stokes/navierStokesSolverMonitor_impl.h similarity index 100% rename from examples/navier-stokes/navierStokesSolverMonitor_impl.h rename to src/Examples/navier-stokes/navierStokesSolverMonitor_impl.h diff --git a/examples/navier-stokes/navierStokesSolver_impl.h b/src/Examples/navier-stokes/navierStokesSolver_impl.h similarity index 100% rename from examples/navier-stokes/navierStokesSolver_impl.h rename to src/Examples/navier-stokes/navierStokesSolver_impl.h diff --git a/examples/navier-stokes/share/CMakeLists.txt b/src/Examples/navier-stokes/share/CMakeLists.txt similarity index 100% rename from examples/navier-stokes/share/CMakeLists.txt rename to src/Examples/navier-stokes/share/CMakeLists.txt diff --git a/examples/navier-stokes/share/examples/CMakeLists.txt b/src/Examples/navier-stokes/share/examples/CMakeLists.txt similarity index 100% rename from examples/navier-stokes/share/examples/CMakeLists.txt rename to src/Examples/navier-stokes/share/examples/CMakeLists.txt diff --git a/examples/navier-stokes/share/examples/cavity b/src/Examples/navier-stokes/share/examples/cavity similarity index 100% rename from examples/navier-stokes/share/examples/cavity rename to src/Examples/navier-stokes/share/examples/cavity diff --git a/examples/quad-test/.gitignore b/src/Examples/quad-test/.gitignore similarity index 100% rename from examples/quad-test/.gitignore rename to src/Examples/quad-test/.gitignore diff --git a/examples/quad-test/Makefile b/src/Examples/quad-test/Makefile similarity index 100% rename from examples/quad-test/Makefile rename to src/Examples/quad-test/Makefile diff --git a/examples/quad-test/main.cpp b/src/Examples/quad-test/main.cpp similarity index 100% rename from examples/quad-test/main.cpp rename to src/Examples/quad-test/main.cpp diff --git a/examples/quad-test/quad-test.cfg.desc b/src/Examples/quad-test/quad-test.cfg.desc similarity index 100% rename from examples/quad-test/quad-test.cfg.desc rename to src/Examples/quad-test/quad-test.cfg.desc diff --git a/examples/simple-examples/CMakeLists.txt b/src/Examples/simple-examples/CMakeLists.txt similarity index 100% rename from examples/simple-examples/CMakeLists.txt rename to src/Examples/simple-examples/CMakeLists.txt diff --git a/examples/simple-examples/large-meshfunction-example.cpp b/src/Examples/simple-examples/large-meshfunction-example.cpp similarity index 100% rename from examples/simple-examples/large-meshfunction-example.cpp rename to src/Examples/simple-examples/large-meshfunction-example.cpp diff --git a/examples/simple-examples/large-meshfunction-example.cu b/src/Examples/simple-examples/large-meshfunction-example.cu similarity index 100% rename from examples/simple-examples/large-meshfunction-example.cu rename to src/Examples/simple-examples/large-meshfunction-example.cu diff --git a/examples/simple-examples/large-meshfunction-example.h b/src/Examples/simple-examples/large-meshfunction-example.h similarity index 100% rename from examples/simple-examples/large-meshfunction-example.h rename to src/Examples/simple-examples/large-meshfunction-example.h diff --git a/examples/transport-equation/CMakeLists.txt b/src/Examples/transport-equation/CMakeLists.txt similarity index 100% rename from examples/transport-equation/CMakeLists.txt rename to src/Examples/transport-equation/CMakeLists.txt diff --git a/examples/transport-equation/tnl-run-transport-equation b/src/Examples/transport-equation/tnl-run-transport-equation similarity index 100% rename from examples/transport-equation/tnl-run-transport-equation rename to src/Examples/transport-equation/tnl-run-transport-equation diff --git a/examples/transport-equation/tnl-run-transport-equation-eoc b/src/Examples/transport-equation/tnl-run-transport-equation-eoc similarity index 100% rename from examples/transport-equation/tnl-run-transport-equation-eoc rename to src/Examples/transport-equation/tnl-run-transport-equation-eoc diff --git a/examples/transport-equation/tnl-transport-equation-eoc.cpp b/src/Examples/transport-equation/tnl-transport-equation-eoc.cpp similarity index 100% rename from examples/transport-equation/tnl-transport-equation-eoc.cpp rename to src/Examples/transport-equation/tnl-transport-equation-eoc.cpp diff --git a/examples/transport-equation/tnl-transport-equation-eoc.cu b/src/Examples/transport-equation/tnl-transport-equation-eoc.cu similarity index 100% rename from examples/transport-equation/tnl-transport-equation-eoc.cu rename to src/Examples/transport-equation/tnl-transport-equation-eoc.cu diff --git a/examples/transport-equation/tnl-transport-equation-eoc.h b/src/Examples/transport-equation/tnl-transport-equation-eoc.h similarity index 100% rename from examples/transport-equation/tnl-transport-equation-eoc.h rename to src/Examples/transport-equation/tnl-transport-equation-eoc.h diff --git a/examples/transport-equation/tnl-transport-equation.cpp b/src/Examples/transport-equation/tnl-transport-equation.cpp similarity index 100% rename from examples/transport-equation/tnl-transport-equation.cpp rename to src/Examples/transport-equation/tnl-transport-equation.cpp diff --git a/examples/transport-equation/tnl-transport-equation.cu b/src/Examples/transport-equation/tnl-transport-equation.cu similarity index 100% rename from examples/transport-equation/tnl-transport-equation.cu rename to src/Examples/transport-equation/tnl-transport-equation.cu diff --git a/examples/transport-equation/tnl-transport-equation.h b/src/Examples/transport-equation/tnl-transport-equation.h similarity index 100% rename from examples/transport-equation/tnl-transport-equation.h rename to src/Examples/transport-equation/tnl-transport-equation.h diff --git a/examples/transport-equation/transportEquationBuildConfigTag.h b/src/Examples/transport-equation/transportEquationBuildConfigTag.h similarity index 100% rename from examples/transport-equation/transportEquationBuildConfigTag.h rename to src/Examples/transport-equation/transportEquationBuildConfigTag.h diff --git a/examples/transport-equation/transportEquationProblem.h b/src/Examples/transport-equation/transportEquationProblem.h similarity index 100% rename from examples/transport-equation/transportEquationProblem.h rename to src/Examples/transport-equation/transportEquationProblem.h diff --git a/examples/transport-equation/transportEquationProblemEoc.h b/src/Examples/transport-equation/transportEquationProblemEoc.h similarity index 100% rename from examples/transport-equation/transportEquationProblemEoc.h rename to src/Examples/transport-equation/transportEquationProblemEoc.h diff --git a/examples/transport-equation/transportEquationProblemEoc_impl.h b/src/Examples/transport-equation/transportEquationProblemEoc_impl.h similarity index 100% rename from examples/transport-equation/transportEquationProblemEoc_impl.h rename to src/Examples/transport-equation/transportEquationProblemEoc_impl.h diff --git a/examples/transport-equation/transportEquationProblem_impl.h b/src/Examples/transport-equation/transportEquationProblem_impl.h similarity index 100% rename from examples/transport-equation/transportEquationProblem_impl.h rename to src/Examples/transport-equation/transportEquationProblem_impl.h diff --git a/src/TNL/Atomic.h b/src/TNL/Atomic.h new file mode 100644 index 0000000000000000000000000000000000000000..3f0defe5e996d1d01d64bc5ec237a6bf1e47e96c --- /dev/null +++ b/src/TNL/Atomic.h @@ -0,0 +1,348 @@ +/*************************************************************************** + Atomic.h - description + ------------------- + begin : Sep 14, 2018 + copyright : (C) 2018 by Tomas Oberhuber et al. + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Jakub Klinkovský + +#pragma once + +#include <atomic> // std::atomic + +#include <TNL/Devices/Host.h> +#include <TNL/Devices/Cuda.h> +#include <TNL/param-types.h> + +namespace TNL { + +template< typename T, typename Device > +class Atomic +{}; + +template< typename T > +class Atomic< T, Devices::Host > +: public std::atomic< T > +{ +public: + Atomic() noexcept = default; + + // inherit constructors + using std::atomic< T >::atomic; + + // NOTE: std::atomic is not copyable (see https://stackoverflow.com/a/15250851 for + // an explanation), but we need copyability for TNL::Containers::Array. Note that + // this copy-constructor and copy-assignment operator are not atomic as they + // synchronize only with respect to one or the other object. + Atomic( const Atomic& desired ) noexcept + { + this->store(desired.load()); + } + Atomic& operator=( const Atomic& desired ) noexcept + { + this->store(desired.load()); + return *this; + } + + // just for compatibility with TNL::Containers::Array... + static String getType() + { + return "Atomic< " + + TNL::getType< T >() + ", " + + Devices::Host::getDeviceType() + " >"; + } + + // CAS loops for updating maximum and minimum + // reference: https://stackoverflow.com/a/16190791 + T fetch_max( T value ) noexcept + { + const T old = *this; + T prev_value = old; + while(prev_value < value && + ! this->compare_exchange_weak(prev_value, value)) + ; + return old; + } + + T fetch_min( T value ) noexcept + { + const T old = *this; + T prev_value = old; + while(prev_value > value && + ! this->compare_exchange_weak(prev_value, value)) + ; + return old; + } +}; + +template< typename T > +class Atomic< T, Devices::Cuda > +{ +public: + using value_type = T; + // FIXME +// using difference_type = typename std::atomic< T >::difference_type; + + __cuda_callable__ + Atomic() noexcept = default; + + __cuda_callable__ + constexpr Atomic( T desired ) noexcept : value(desired) {} + + __cuda_callable__ + T operator=( T desired ) noexcept + { + store( desired ); + return desired; + } + + // NOTE: std::atomic is not copyable (see https://stackoverflow.com/a/15250851 for + // an explanation), but we need copyability for TNL::Containers::Array. Note that + // this copy-constructor and copy-assignment operator are not atomic as they + // synchronize only with respect to one or the other object. + __cuda_callable__ + Atomic( const Atomic& desired ) noexcept + { + // FIXME +// *this = desired.load(); + *this = desired.value; + } + __cuda_callable__ + Atomic& operator=( const Atomic& desired ) noexcept + { + // FIXME +// *this = desired.load(); + *this = desired.value; + return *this; + } + + // just for compatibility with TNL::Containers::Array... + static String getType() + { + return "Atomic< " + + TNL::getType< T >() + ", " + + Devices::Host::getDeviceType() + " >"; + } + + bool is_lock_free() const noexcept + { + return true; + } + + constexpr bool is_always_lock_free() const noexcept + { + return true; + } + + __cuda_callable__ + void store( T desired ) noexcept + { + // CUDA does not have a native atomic store, but it can be emulated with atomic exchange + exchange( desired ); + } + + __cuda_callable__ + T load() const noexcept + { + // CUDA does not have a native atomic load: + // https://stackoverflow.com/questions/32341081/how-to-have-atomic-load-in-cuda + return const_cast<Atomic*>(this)->fetch_add( 0 ); + } + + __cuda_callable__ + operator T() const noexcept + { + return load(); + } + + __cuda_callable__ + T exchange( T desired ) noexcept + { +#ifdef __CUDA_ARCH__ + return atomicExch( &value, desired ); +#else + const T old = value; + value = desired; + return old; +#endif + } + + __cuda_callable__ + bool compare_exchange_weak( T& expected, T desired ) noexcept + { + return compare_exchange_strong( expected, desired ); + } + + __cuda_callable__ + bool compare_exchange_strong( T& expected, T desired ) noexcept + { +#ifdef __CUDA_ARCH__ + const T old = atomicCAS( &value, expected, desired ); + const bool result = old == expected; + expected = old; + return result; +#else + if( value == expected ) { + value = desired; + return true; + } + else { + expected = value; + return false; + } +#endif + } + + __cuda_callable__ + T fetch_add( T arg ) + { +#ifdef __CUDA_ARCH__ + return atomicAdd( &value, arg ); +#else + const T old = value; + value += arg; + return old; +#endif + } + + __cuda_callable__ + T fetch_sub( T arg ) + { +#ifdef __CUDA_ARCH__ + return atomicSub( &value, arg ); +#else + const T old = value; + value -= arg; + return old; +#endif + } + + __cuda_callable__ + T fetch_and( T arg ) + { +#ifdef __CUDA_ARCH__ + return atomicAnd( &value, arg ); +#else + const T old = value; + value = value & arg; + return old; +#endif + } + + __cuda_callable__ + T fetch_or( T arg ) + { +#ifdef __CUDA_ARCH__ + return atomicOr( &value, arg ); +#else + const T old = value; + value = value | arg; + return old; +#endif + } + + __cuda_callable__ + T fetch_xor( T arg ) + { +#ifdef __CUDA_ARCH__ + return atomicXor( &value, arg ); +#else + const T old = value; + value = value ^ arg; + return old; +#endif + } + + __cuda_callable__ + T operator+=( T arg ) noexcept + { + return fetch_add( arg ) + arg; + } + + __cuda_callable__ + T operator-=( T arg ) noexcept + { + return fetch_sub( arg ) - arg; + } + + __cuda_callable__ + T operator&=( T arg ) noexcept + { + return fetch_and( arg ) & arg; + } + + __cuda_callable__ + T operator|=( T arg ) noexcept + { + return fetch_or( arg ) | arg; + } + + __cuda_callable__ + T operator^=( T arg ) noexcept + { + return fetch_xor( arg ) ^ arg; + } + + // pre-increment + __cuda_callable__ + T operator++() noexcept + { + return fetch_add(1) + 1; + } + + // post-increment + __cuda_callable__ + T operator++(int) noexcept + { + return fetch_add(1); + } + + // pre-decrement + __cuda_callable__ + T operator--() noexcept + { + return fetch_sub(1) - 1; + } + + // post-decrement + __cuda_callable__ + T operator--(int) noexcept + { + return fetch_sub(1); + } + + // extensions (methods not present in C++ standards) + + __cuda_callable__ + T fetch_max( T arg ) noexcept + { +#ifdef __CUDA_ARCH__ + return atomicMax( &value, arg ); +#else + const T old = value; + value = ( value > arg ) ? value : arg; + return old; +#endif + } + + __cuda_callable__ + T fetch_min( T arg ) noexcept + { +#ifdef __CUDA_ARCH__ + return atomicMin( &value, arg ); +#else + const T old = value; + value = ( value < arg ) ? value : arg; + return old; +#endif + } + +protected: + T value; +}; + +} // namespace TNL diff --git a/src/TNL/CMakeLists.txt b/src/TNL/CMakeLists.txt index 923d63b858fe28a36b0d9670a44525a630a1c519..cd07ae65910ab69bf004f69bb84231caf78aaab3 100644 --- a/src/TNL/CMakeLists.txt +++ b/src/TNL/CMakeLists.txt @@ -19,6 +19,7 @@ ADD_SUBDIRECTORY( legacy ) SET( CURRENT_DIR ${CMAKE_SOURCE_DIR}/src/TNL ) set( headers + Atomic.h Assert.h CudaSharedMemory.h CudaStreamPool.h @@ -99,31 +100,32 @@ TARGET_LINK_LIBRARIES( tnl INSTALL( TARGETS tnl DESTINATION lib ) -IF( BUILD_MPI ) - - ADD_LIBRARY( tnl-mpi_static STATIC ${tnl_SOURCES} ) - INSTALL( TARGETS tnl-mpi_static DESTINATION lib ) - - if( BUILD_CUDA ) - CUDA_ADD_LIBRARY( tnl-mpi SHARED ${tnl_CUDA__SOURCES} - OPTIONS ${CUDA_ADD_LIBRARY_OPTIONS} ) - # the static library with CUDA support has to be built separately - CUDA_ADD_LIBRARY( tnl-mpi-cuda_static STATIC ${tnl_CUDA__SOURCES} ) - INSTALL( TARGETS tnl-mpi-cuda_static DESTINATION lib ) - else( BUILD_CUDA ) - ADD_LIBRARY( tnl-mpi SHARED ${tnl_SOURCES} ) - endif( BUILD_CUDA ) - - SET_TARGET_PROPERTIES( tnl-mpi PROPERTIES - VERSION ${tnlVersion} ) -# SET_TARGET_PROPERTIES( tnl-mpi -# LINK_INTERFACE_LIBRARIES "") - - - TARGET_LINK_LIBRARIES( tnl-mpi - ${MPI_LIBRARIES} ) - INSTALL( TARGETS tnl-mpi DESTINATION lib ) - -endif() +# NOTE: this is not necessary until something in the library file actually depends on MPI +#IF( BUILD_MPI ) +# +# ADD_LIBRARY( tnl-mpi_static STATIC ${tnl_SOURCES} ) +# INSTALL( TARGETS tnl-mpi_static DESTINATION lib ) +# +# if( BUILD_CUDA ) +# CUDA_ADD_LIBRARY( tnl-mpi SHARED ${tnl_CUDA__SOURCES} +# OPTIONS ${CUDA_ADD_LIBRARY_OPTIONS} ) +# # the static library with CUDA support has to be built separately +# CUDA_ADD_LIBRARY( tnl-mpi-cuda_static STATIC ${tnl_CUDA__SOURCES} ) +# INSTALL( TARGETS tnl-mpi-cuda_static DESTINATION lib ) +# else( BUILD_CUDA ) +# ADD_LIBRARY( tnl-mpi SHARED ${tnl_SOURCES} ) +# endif( BUILD_CUDA ) +# +# SET_TARGET_PROPERTIES( tnl-mpi PROPERTIES +# VERSION ${tnlVersion} ) +## SET_TARGET_PROPERTIES( tnl-mpi +## LINK_INTERFACE_LIBRARIES "") +# +# +# TARGET_LINK_LIBRARIES( tnl-mpi +# ${MPI_LIBRARIES} ) +# INSTALL( TARGETS tnl-mpi DESTINATION lib ) +# +#endif() INSTALL( FILES ${headers} DESTINATION ${TNL_TARGET_INCLUDE_DIRECTORY} ) diff --git a/src/TNL/Communicators/CMakeLists.txt b/src/TNL/Communicators/CMakeLists.txt index 7a58eaa2a04efe42bc444b09e28b25c51fc5d7e4..fb3193b739c75ea41ddae9ecf664592e44821d79 100644 --- a/src/TNL/Communicators/CMakeLists.txt +++ b/src/TNL/Communicators/CMakeLists.txt @@ -1,6 +1,7 @@ SET( headers MpiCommunicator.h MpiDefs.h NoDistrCommunicator.h + ScopedInitializer.h ) INSTALL( FILES ${headers} DESTINATION ${TNL_TARGET_INCLUDE_DIRECTORY}/Communicators ) diff --git a/src/TNL/Communicators/MpiCommunicator.h b/src/TNL/Communicators/MpiCommunicator.h index c64abccb4603e1c0924ddad56f0e52201f724c95..c233004a602f31ce8b7220b9983c9541f47f6331 100644 --- a/src/TNL/Communicators/MpiCommunicator.h +++ b/src/TNL/Communicators/MpiCommunicator.h @@ -16,7 +16,10 @@ #ifdef HAVE_MPI #include <mpi.h> -#include <mpi-ext.h> +#ifdef OMPI_MAJOR_VERSION + // header specific to OpenMPI (needed for CUDA-aware detection) + #include <mpi-ext.h> +#endif #ifdef HAVE_CUDA #include <TNL/Devices/Cuda.h> @@ -74,7 +77,11 @@ class MpiCommunicator static bool isDistributed() { +#ifdef HAVE_MPI return GetSize(AllGroup)>1; +#else + return false; +#endif } static void configSetup( Config::ConfigDescription& config, const String& prefix = "" ) @@ -142,7 +149,7 @@ class MpiCommunicator return true; } - static void Init(int argc, char **argv ) + static void Init(int& argc, char**& argv ) { #ifdef HAVE_MPI MPI_Init( &argc, &argv ); @@ -173,6 +180,8 @@ class MpiCommunicator std::cout.rdbuf(psbuf); } } +#else + throw Exceptions::MPISupportMissing(); #endif } @@ -199,7 +208,7 @@ class MpiCommunicator MPI_Finalized(&finalized); return initialized && !finalized; #else - return false; + throw Exceptions::MPISupportMissing(); #endif } @@ -212,7 +221,7 @@ class MpiCommunicator MPI_Comm_rank(group,&rank); return rank; #else - return 1; + throw Exceptions::MPISupportMissing(); #endif } @@ -225,7 +234,7 @@ class MpiCommunicator MPI_Comm_size(group,&size); return size; #else - return 1; + throw Exceptions::MPISupportMissing(); #endif } @@ -250,6 +259,8 @@ class MpiCommunicator /***END OF HACK***/ MPI_Dims_create(nproc, dim, distr); +#else + throw Exceptions::MPISupportMissing(); #endif } @@ -419,7 +430,7 @@ class MpiCommunicator MPI_Comm_split(oldGroup, MPI_UNDEFINED, GetRank(oldGroup), &newGroup); } #else - newGroup=oldGroup; + throw Exceptions::MPISupportMissing(); #endif } diff --git a/src/TNL/Communicators/NoDistrCommunicator.h b/src/TNL/Communicators/NoDistrCommunicator.h index a25b9f1bb74712390769a8d317f26317ea820613..aac58b916bf17656e9d6c33bead7a4d37441fca7 100644 --- a/src/TNL/Communicators/NoDistrCommunicator.h +++ b/src/TNL/Communicators/NoDistrCommunicator.h @@ -37,7 +37,7 @@ class NoDistrCommunicator return true; } - static void Init(int argc, char **argv, bool redirect=false) {} + static void Init(int& argc, char**& argv) {} static void setRedirection( bool redirect_ ) {} diff --git a/src/TNL/Communicators/ScopedInitializer.h b/src/TNL/Communicators/ScopedInitializer.h new file mode 100644 index 0000000000000000000000000000000000000000..2970bc628319bdf9d4c40d7a2cb32694a8148f7d --- /dev/null +++ b/src/TNL/Communicators/ScopedInitializer.h @@ -0,0 +1,33 @@ +/*************************************************************************** + ScopedInitializer.h - description + ------------------- + begin : Sep 16, 2018 + copyright : (C) 2005 by Tomas Oberhuber et al. + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Jakub Klinkovský + +#pragma once + +namespace TNL { +namespace Communicators { + +template< typename Communicator > +struct ScopedInitializer +{ + ScopedInitializer( int& argc, char**& argv ) + { + Communicator::Init( argc, argv ); + } + + ~ScopedInitializer() + { + Communicator::Finalize(); + } +}; + +} // namespace Communicators +} // namespace TNL diff --git a/src/TNL/Containers/Algorithms/ArrayOperationsCuda_impl.h b/src/TNL/Containers/Algorithms/ArrayOperationsCuda_impl.h index 1f21dc261ddaadad55bcc5b25357e9c6bb616ba6..bca6bdb0479eb38a329234f92305421873864dc5 100644 --- a/src/TNL/Containers/Algorithms/ArrayOperationsCuda_impl.h +++ b/src/TNL/Containers/Algorithms/ArrayOperationsCuda_impl.h @@ -201,10 +201,9 @@ containsValue( const Element* data, TNL_ASSERT_GE( size, 0, "" ); if( size == 0 ) return false; bool result = false; - using Operation = Algorithms::ParallelReductionContainsValue< Element >; - Operation reductionContainsValue; + Algorithms::ParallelReductionContainsValue< Element > reductionContainsValue; reductionContainsValue.setValue( value ); - Reduction< Devices::Cuda >::template reduce< Operation, Index >( reductionContainsValue, size, data, 0, result ); + Reduction< Devices::Cuda >::reduce( reductionContainsValue, size, data, 0, result ); return result; } @@ -220,10 +219,9 @@ containsOnlyValue( const Element* data, TNL_ASSERT_GE( size, 0, "" ); if( size == 0 ) return false; bool result = false; - using Operation = Algorithms::ParallelReductionContainsOnlyValue< Element >; - Operation reductionContainsOnlyValue; + Algorithms::ParallelReductionContainsOnlyValue< Element > reductionContainsOnlyValue; reductionContainsOnlyValue.setValue( value ); - Reduction< Devices::Cuda >::template reduce< Operation, Index >( reductionContainsOnlyValue, size, data, 0, result ); + Reduction< Devices::Cuda >::reduce( reductionContainsOnlyValue, size, data, 0, result ); return result; } diff --git a/src/TNL/Functions/MeshFunction_impl.h b/src/TNL/Functions/MeshFunction_impl.h index 026a8dd3c0224935601ef8bc5ae9c51d39bdea4d..3a8c74b6785b68357707ff9e7349a42a67e07f5f 100644 --- a/src/TNL/Functions/MeshFunction_impl.h +++ b/src/TNL/Functions/MeshFunction_impl.h @@ -40,9 +40,6 @@ MeshFunction( const MeshPointer& meshPointer ) this->meshPointer=meshPointer; this->data.setSize( getMesh().template getEntitiesCount< typename Mesh::template EntityType< MeshEntityDimension > >() ); - TNL_ASSERT( this->data.getSize() == this->getMesh().template getEntitiesCount< typename MeshType::template EntityType< MeshEntityDimension > >(), - std::cerr << "this->data.getSize() = " << this->data.getSize() << std::endl - << "this->getMesh().template getEntitiesCount< typename MeshType::template EntityType< MeshEntityDimension > >() = " << this->getMesh().template getEntitiesCount< typename MeshType::template EntityType< MeshEntityDimension > >() ); } template< typename Mesh, @@ -69,7 +66,7 @@ MeshFunction( const MeshPointer& meshPointer, //: meshPointer( meshPointer ) { TNL_ASSERT_GE( data.getSize(), meshPointer->template getEntitiesCount< typename MeshType::template EntityType< MeshEntityDimension > >(), - "The input vector is not large enough for binding to the mesh function." ); + "The input vector is not large enough for binding to the mesh function." ); setupSynchronizer(meshPointer->getDistributedMesh()); this->meshPointer=meshPointer; @@ -88,7 +85,7 @@ MeshFunction( const MeshPointer& meshPointer, //: meshPointer( meshPointer ) { TNL_ASSERT_GE( data->getSize(), offset + meshPointer->template getEntitiesCount< typename MeshType::template EntityType< MeshEntityDimension > >(), - "The input vector is not large enough for binding to the mesh function." ); + "The input vector is not large enough for binding to the mesh function." ); setupSynchronizer(meshPointer->getDistributedMesh()); @@ -203,10 +200,9 @@ MeshFunction< Mesh, MeshEntityDimension, Real >:: bind( const Vector& data, const IndexType& offset ) { + TNL_ASSERT_GE( data.getSize(), offset + meshPointer->template getEntitiesCount< typename MeshType::template EntityType< MeshEntityDimension > >(), + "The input vector is not large enough for binding to the mesh function." ); this->data.bind( data, offset, getMesh().template getEntitiesCount< typename Mesh::template EntityType< MeshEntityDimension > >() ); - TNL_ASSERT( this->data.getSize() == this->getMesh().template getEntitiesCount< typename MeshType::template EntityType< MeshEntityDimension > >(), - std::cerr << "this->data.getSize() = " << this->data.getSize() << std::endl - << "this->getMesh().template getEntitiesCount< typename MeshType::template EntityType< MeshEntityDimension > >() = " << this->getMesh().template getEntitiesCount< typename MeshType::template EntityType< MeshEntityDimension > >() ); } template< typename Mesh, @@ -219,9 +215,9 @@ bind( const MeshPointer& meshPointer, const Vector& data, const IndexType& offset ) { - TNL_ASSERT_GE( data.getSize(), offset + meshPointer->template getEntitiesCount< typename MeshType::template EntityType< MeshEntityDimension > >(), - "The input vector is not large enough for binding to the mesh function." ); - + TNL_ASSERT_GE( data.getSize(), offset + meshPointer->template getEntitiesCount< typename MeshType::template EntityType< MeshEntityDimension > >(), + "The input vector is not large enough for binding to the mesh function." ); + setupSynchronizer(meshPointer->getDistributedMesh()); this->meshPointer=meshPointer; this->data.bind( data, offset, getMesh().template getEntitiesCount< typename Mesh::template EntityType< MeshEntityDimension > >() ); @@ -237,8 +233,8 @@ bind( const MeshPointer& meshPointer, const Pointers::SharedPointer< Vector >& data, const IndexType& offset ) { - TNL_ASSERT_GE( data->getSize(), offset + meshPointer->template getEntitiesCount< typename MeshType::template EntityType< MeshEntityDimension > >(), - "The input vector is not large enough for binding to the mesh function." ); + TNL_ASSERT_GE( data->getSize(), offset + meshPointer->template getEntitiesCount< typename MeshType::template EntityType< MeshEntityDimension > >(), + "The input vector is not large enough for binding to the mesh function." ); setupSynchronizer(meshPointer->getDistributedMesh()); this->meshPointer=meshPointer; @@ -478,11 +474,10 @@ bool MeshFunction< Mesh, MeshEntityDimension, Real >:: save( File& file ) const { - TNL_ASSERT( this->data.getSize() == this->getMesh().template getEntitiesCount< typename MeshType::template EntityType< MeshEntityDimension > >(), - std::cerr << "this->data.getSize() = " << this->data.getSize() << std::endl - << "this->getMesh().template getEntitiesCount< typename MeshType::template EntityType< MeshEntityDimension > >() = " << this->getMesh().template getEntitiesCount< typename MeshType::template EntityType< MeshEntityDimension > >() ); + TNL_ASSERT_EQ( this->data.getSize(), this->getMesh().template getEntitiesCount< typename MeshType::template EntityType< MeshEntityDimension > >(), + "Size of the mesh function data does not match the mesh." ); if( ! Object::save( file ) ) - return false; + return false; return this->data.save( file ); } @@ -494,12 +489,12 @@ MeshFunction< Mesh, MeshEntityDimension, Real >:: load( File& file ) { if( ! Object::load( file ) ) - return false; + return false; if( ! this->data.load( file ) ) return false; const IndexType meshSize = this->getMesh().template getEntitiesCount< typename MeshType::template EntityType< MeshEntityDimension > >(); if( this->data.getSize() != meshSize ) - { + { std::cerr << "Size of the data loaded to the mesh function (" << this->data.getSize() << ") does not fit with the mesh size (" << meshSize << ")." << std::endl; return false; } diff --git a/src/TNL/Math.h b/src/TNL/Math.h index 2d81bea6678a41508ea8a1bdfa19d93b0c831c01..67e9f5e082af0b238e1d045bc516bf578efe82a0 100644 --- a/src/TNL/Math.h +++ b/src/TNL/Math.h @@ -168,24 +168,6 @@ T sign( const T& a ) return ( T ) 1; } -template< class T > -__cuda_callable__ -bool isNan( const T& v ) -{ -#if defined HAVE_CUDA - #if defined(__CUDA_ARCH__) - return isnan( v ); - #else - #if defined (__GNUC__) && ( __GNUC__ < 5 ) - return false; - #else - return std::isnan( v ); - #endif - #endif -#else - return std::isnan( v ); -#endif -} template< typename Real > __cuda_callable__ bool isSmall( const Real& v, diff --git a/src/TNL/Meshes/GridDetails/NeighborGridEntitiesStorage.h b/src/TNL/Meshes/GridDetails/NeighborGridEntitiesStorage.h index 2bef7d8bdffbe4f797367af3ffb49af01fb47970..dd9562add377b02c3ab9ca91fa4804762182b46c 100644 --- a/src/TNL/Meshes/GridDetails/NeighborGridEntitiesStorage.h +++ b/src/TNL/Meshes/GridDetails/NeighborGridEntitiesStorage.h @@ -22,12 +22,7 @@ template< typename GridEntity, int NeighborEntityDimension, typename GridEntityConfig, bool storage = GridEntityConfig::template neighborEntityStorage< GridEntity >( NeighborEntityDimension ) > -class NeighborGridEntityLayer{}; - -template< typename GridEntity, - int NeighborEntityDimension, - typename GridEntityConfig > -class NeighborGridEntityLayer< GridEntity, NeighborEntityDimension, GridEntityConfig, true > +class NeighborGridEntityLayer : public NeighborGridEntityLayer< GridEntity, NeighborEntityDimension - 1, GridEntityConfig > { public: @@ -63,8 +58,9 @@ class NeighborGridEntityLayer< GridEntity, NeighborEntityDimension, GridEntityCo }; template< typename GridEntity, - typename GridEntityConfig > -class NeighborGridEntityLayer< GridEntity, 0, GridEntityConfig, true > + typename GridEntityConfig, + bool storage > +class NeighborGridEntityLayer< GridEntity, 0, GridEntityConfig, storage > { public: @@ -93,52 +89,6 @@ class NeighborGridEntityLayer< GridEntity, 0, GridEntityConfig, true > NeighborEntityGetterType neighborEntities; }; -template< typename GridEntity, - int NeighborEntityDimension, - typename GridEntityConfig > -class NeighborGridEntityLayer< GridEntity, NeighborEntityDimension, GridEntityConfig, false > -: public NeighborGridEntityLayer< GridEntity, NeighborEntityDimension - 1, GridEntityConfig > -{ - public: - - typedef NeighborGridEntityLayer< GridEntity, NeighborEntityDimension - 1, GridEntityConfig > BaseType; - typedef NeighborGridEntityGetter< GridEntity, NeighborEntityDimension > NeighborEntityGetterType; - - using BaseType::getNeighborEntities; - - __cuda_callable__ - NeighborGridEntityLayer( const GridEntity& entity ) - : BaseType( entity ) - {} - - __cuda_callable__ - const NeighborEntityGetterType& getNeighborEntities( const DimensionTag< NeighborEntityDimension >& tag ) const {} - - __cuda_callable__ - void refresh( const typename GridEntity::GridType& grid, - const typename GridEntity::GridType::IndexType& entityIndex ) {} -}; - -template< typename GridEntity, - typename GridEntityConfig > -class NeighborGridEntityLayer< GridEntity, 0, GridEntityConfig, false > -{ - public: - - typedef NeighborGridEntityGetter< GridEntity, 0 > NeighborEntityGetterType; - - __cuda_callable__ - NeighborGridEntityLayer( const GridEntity& entity ){} - - __cuda_callable__ - const NeighborEntityGetterType& getNeighborEntities( const DimensionTag< 0 >& tag ) const {} - - __cuda_callable__ - void refresh( const typename GridEntity::GridType& grid, - const typename GridEntity::GridType::IndexType& entityIndex ) {} -}; - - template< typename GridEntity, diff --git a/src/TNL/Solvers/IterativeSolver_impl.h b/src/TNL/Solvers/IterativeSolver_impl.h index e65b3cdd412c899b6803b2129f193c9846c05766..333c38cd44a54478d01b439dbc27780711b44ac1 100644 --- a/src/TNL/Solvers/IterativeSolver_impl.h +++ b/src/TNL/Solvers/IterativeSolver_impl.h @@ -107,7 +107,7 @@ bool IterativeSolver< Real, Index> :: checkNextIteration() { this->refreshSolverMonitor(); - if( isNan( this->getResidue() ) || + if( std::isnan( this->getResidue() ) || this->getIterations() > this->getMaxIterations() || ( this->getResidue() > this->getDivergenceResidue() && this->getIterations() >= this->getMinIterations() ) || ( this->getResidue() < this->getConvergenceResidue() && this->getIterations() >= this->getMinIterations() ) ) @@ -120,7 +120,7 @@ bool IterativeSolver< Real, Index>:: checkConvergence() { - if( isNan( this->getResidue() ) ) + if( std::isnan( this->getResidue() ) ) { std::cerr << std::endl << "The residue is NaN." << std::endl; return false; diff --git a/src/TNL/Solvers/Linear/BICGStabL_impl.h b/src/TNL/Solvers/Linear/BICGStabL_impl.h index 467f1a91a04c018377863b27ac85985759eb988d..6606bddd561fa8a8d5b3ce5e8855b6e41541b546 100644 --- a/src/TNL/Solvers/Linear/BICGStabL_impl.h +++ b/src/TNL/Solvers/Linear/BICGStabL_impl.h @@ -87,7 +87,7 @@ BICGStabL< Matrix >::solve( ConstVectorViewType b, VectorViewType x ) } sigma[ 0 ] = r_0.lpNorm( 2.0 ); - if( isNan( sigma[ 0 ] ) ) + if( std::isnan( sigma[ 0 ] ) ) throw std::runtime_error( "BiCGstab(ell): initial residue is NAN" ); r_ast = r_0; diff --git a/src/TNL/Solvers/SolverInitiator_impl.h b/src/TNL/Solvers/SolverInitiator_impl.h index 63fe4591f1de860e38966e9185eb9bd5ca6dd228..c6bc5ca7f494abd8922f1a0fcb45b4814277094f 100644 --- a/src/TNL/Solvers/SolverInitiator_impl.h +++ b/src/TNL/Solvers/SolverInitiator_impl.h @@ -16,10 +16,6 @@ #include <TNL/Config/ParameterContainer.h> #include <TNL/Meshes/TypeResolver/TypeResolver.h> #include <TNL/Solvers/BuildConfigTags.h> -#include <TNL/Solvers/Linear/SOR.h> -#include <TNL/Solvers/Linear/CG.h> -#include <TNL/Solvers/Linear/BICGStab.h> -#include <TNL/Solvers/Linear/GMRES.h> #include <TNL/Solvers/SolverStarter.h> #include <TNL/Meshes/DummyMesh.h> @@ -190,15 +186,9 @@ class CommunicatorTypeResolver< ProblemSetter, Real, Device, Index, ConfigTag, t public: static bool run( const Config::ParameterContainer& parameters ) { - if(Communicators::MpiCommunicator::isDistributed()) - { - bool ret=SolverInitiatorMeshResolver< ProblemSetter, Real, Device, Index, ConfigTag, Communicators::MpiCommunicator >::run( parameters ); - Communicators::MpiCommunicator::Finalize(); - return ret; - } - Communicators::MpiCommunicator::Finalize(); + if( Communicators::MpiCommunicator::isDistributed() ) + return SolverInitiatorMeshResolver< ProblemSetter, Real, Device, Index, ConfigTag, Communicators::MpiCommunicator >::run( parameters ); return SolverInitiatorMeshResolver< ProblemSetter, Real, Device, Index, ConfigTag, Communicators::NoDistrCommunicator >::run( parameters ); - } }; diff --git a/src/TNL/Solvers/Solver_impl.h b/src/TNL/Solvers/Solver_impl.h index a0e4f1953208edb1fc89d820571aa329e376ff5c..ef865e8b7b25b6ce426c1b0d3c2918187058918d 100644 --- a/src/TNL/Solvers/Solver_impl.h +++ b/src/TNL/Solvers/Solver_impl.h @@ -14,8 +14,8 @@ #include <TNL/Solvers/SolverStarter.h> #include <TNL/Solvers/SolverConfig.h> #include <TNL/Devices/Cuda.h> -#include <TNL/Communicators/NoDistrCommunicator.h> #include <TNL/Communicators/MpiCommunicator.h> +#include <TNL/Communicators/ScopedInitializer.h> namespace TNL { namespace Solvers { @@ -34,19 +34,15 @@ run( int argc, char* argv[] ) configDescription.addDelimiter( "Parallelization setup:" ); Devices::Host::configSetup( configDescription ); Devices::Cuda::configSetup( configDescription ); - Communicators::NoDistrCommunicator::configSetup( configDescription ); Communicators::MpiCommunicator::configSetup( configDescription ); - - Communicators::NoDistrCommunicator::Init(argc,argv); - Communicators::MpiCommunicator::Init(argc,argv); + + Communicators::ScopedInitializer< Communicators::MpiCommunicator > mpi( argc, argv ); if( ! parseCommandLine( argc, argv, configDescription, parameters ) ) return false; SolverInitiator< ProblemSetter, MeshConfig > solverInitiator; - bool ret= solverInitiator.run( parameters ); - - return ret; + return solverInitiator.run( parameters ); }; } // namespace Solvers diff --git a/src/Tools/tnl-init.cpp b/src/Tools/tnl-init.cpp index ac72f7a13283958fd407c4e281c1baeb4a038996..7dd7032810794f5f2ee43023472be6773fcb2de6 100644 --- a/src/Tools/tnl-init.cpp +++ b/src/Tools/tnl-init.cpp @@ -17,8 +17,8 @@ #include <TNL/Meshes/DummyMesh.h> #include <TNL/Meshes/Grid.h> -#include <TNL/Communicators/NoDistrCommunicator.h> #include <TNL/Communicators/MpiCommunicator.h> +#include <TNL/Communicators/ScopedInitializer.h> using namespace TNL; @@ -51,18 +51,14 @@ void setupConfig( Config::ConfigDescription& config ) int main( int argc, char* argv[] ) { - Config::ParameterContainer parameters; Config::ConfigDescription configDescription; setupConfig( configDescription ); - - Communicators::NoDistrCommunicator::configSetup( configDescription ); Communicators::MpiCommunicator::configSetup( configDescription ); - - Communicators::NoDistrCommunicator::Init(argc,argv); - Communicators::MpiCommunicator::Init(argc,argv); - + + Communicators::ScopedInitializer< Communicators::MpiCommunicator > mpi(argc, argv); + if( ! parseCommandLine( argc, argv, configDescription, parameters ) ) return EXIT_FAILURE; @@ -83,9 +79,5 @@ int main( int argc, char* argv[] ) if( ! resolveMeshType( parsedMeshType, parameters ) ) return EXIT_FAILURE; -#ifdef HAVE_MPI - Communicators::MpiCommunicator::Finalize(); -#endif - return EXIT_SUCCESS; } diff --git a/src/UnitTests/Meshes/DistributedMeshes/CMakeLists.txt b/src/UnitTests/Meshes/DistributedMeshes/CMakeLists.txt index 71cca245d18c157a98d2533c42f32b80ff16190a..ad4127dbd724d0fba9cc03bc21e065e3ae91c179 100644 --- a/src/UnitTests/Meshes/DistributedMeshes/CMakeLists.txt +++ b/src/UnitTests/Meshes/DistributedMeshes/CMakeLists.txt @@ -20,7 +20,7 @@ ADD_TEST( NAME DirectionsTest COMMAND ${EXECUTABLE_OUTPUT_PATH}/DirectionsTest${ ADD_TEST( NAME CopyEntitesTest COMMAND ${EXECUTABLE_OUTPUT_PATH}/CopyEntitesTest${CMAKE_EXECUTABLE_SUFFIX} ) ADD_TEST( NAME CutMeshFunctionTest COMMAND ${EXECUTABLE_OUTPUT_PATH}/CutMeshFunctionTest${CMAKE_EXECUTABLE_SUFFIX} ) -if( ${CXX_COMPILER_NAME} STREQUAL "mpic++" ) +if( BUILD_MPI ) ADD_EXECUTABLE( DistributedGridTest_1D DistributedGridTest_1D.cpp ) TARGET_COMPILE_OPTIONS( DistributedGridTest_1D PRIVATE ${CXX_TESTS_FLAGS} ) TARGET_LINK_LIBRARIES( DistributedGridTest_1D diff --git a/src/UnitTests/Meshes/DistributedMeshes/CutDistributedGridTest.cpp b/src/UnitTests/Meshes/DistributedMeshes/CutDistributedGridTest.cpp index bf7755e5fc3c6b1f1ced0b96bdbc0cfff7a47165..587ec807ec0ad01515832f777cd444c5c9ac386a 100644 --- a/src/UnitTests/Meshes/DistributedMeshes/CutDistributedGridTest.cpp +++ b/src/UnitTests/Meshes/DistributedMeshes/CutDistributedGridTest.cpp @@ -4,6 +4,7 @@ #ifdef HAVE_MPI #include <TNL/Communicators/MpiCommunicator.h> +#include <TNL/Communicators/ScopedInitializer.h> #include <TNL/Meshes/DistributedMeshes/DistributedMesh.h> #include <TNL/Meshes/DistributedMeshes/SubdomainOverlapsGetter.h> @@ -404,7 +405,7 @@ TEST(NoMPI, NoTest) }; #endif -#include "../../src/UnitTests/GtestMissingError.h" +#include "../../GtestMissingError.h" int main( int argc, char* argv[] ) { #ifdef HAVE_GTEST @@ -417,14 +418,9 @@ int main( int argc, char* argv[] ) delete listeners.Release(listeners.default_result_printer()); listeners.Append(new MinimalistBufferedPrinter); - CommunicatorType::Init(argc,argv); + Communicators::ScopedInitializer< CommunicatorType > mpi(argc, argv); #endif - int result= RUN_ALL_TESTS(); - - #ifdef HAVE_MPI - CommunicatorType::Finalize(); - #endif - return result; + return RUN_ALL_TESTS(); #else throw GtestMissingError(); diff --git a/src/UnitTests/Meshes/DistributedMeshes/CutDistributedMeshFunctionTest.cpp b/src/UnitTests/Meshes/DistributedMeshes/CutDistributedMeshFunctionTest.cpp index d78a1dff652a605806f98bea3b392f00f156d0e9..4907f1d269cfc3d7f5d205405546675f1b8d58fe 100644 --- a/src/UnitTests/Meshes/DistributedMeshes/CutDistributedMeshFunctionTest.cpp +++ b/src/UnitTests/Meshes/DistributedMeshes/CutDistributedMeshFunctionTest.cpp @@ -6,6 +6,7 @@ #include <TNL/Devices/Host.h> #include <TNL/Functions/CutMeshFunction.h> #include <TNL/Communicators/MpiCommunicator.h> +#include <TNL/Communicators/ScopedInitializer.h> #include <TNL/Meshes/DistributedMeshes/DistributedGridIO.h> #include <TNL/Meshes/DistributedMeshes/SubdomainOverlapsGetter.h> @@ -684,7 +685,7 @@ TEST(NoMPI, NoTest) }; #endif -#include "../../src/UnitTests/GtestMissingError.h" +#include "../../GtestMissingError.h" int main( int argc, char* argv[] ) { #ifdef HAVE_GTEST @@ -697,14 +698,9 @@ int main( int argc, char* argv[] ) delete listeners.Release(listeners.default_result_printer()); listeners.Append(new MinimalistBufferedPrinter); - CommunicatorType::Init(argc,argv); + Communicators::ScopedInitializer< CommunicatorType > mpi(argc, argv); #endif - int result= RUN_ALL_TESTS(); - - #ifdef HAVE_MPI - CommunicatorType::Finalize(); - #endif - return result; + return RUN_ALL_TESTS(); #else throw GtestMissingError(); diff --git a/src/UnitTests/Meshes/DistributedMeshes/CutMeshFunctionTest.cpp b/src/UnitTests/Meshes/DistributedMeshes/CutMeshFunctionTest.cpp index 5850c956e9f194806e8bca503f15763584516d7d..ce78b85680c69bba791d205661d918d1498ef3a0 100644 --- a/src/UnitTests/Meshes/DistributedMeshes/CutMeshFunctionTest.cpp +++ b/src/UnitTests/Meshes/DistributedMeshes/CutMeshFunctionTest.cpp @@ -213,15 +213,13 @@ TEST(CutMeshFunction, 3D_2) #endif -#include "../../src/UnitTests/GtestMissingError.h" +#include "../../GtestMissingError.h" int main( int argc, char* argv[] ) { #ifdef HAVE_GTEST ::testing::InitGoogleTest( &argc, argv ); - int result= RUN_ALL_TESTS(); - return result; + return RUN_ALL_TESTS(); #else - throw GtestMissingError(); #endif } diff --git a/src/UnitTests/Meshes/DistributedMeshes/DirectionsTest.cpp b/src/UnitTests/Meshes/DistributedMeshes/DirectionsTest.cpp index 52ed831d0baced3dbac35f36de9ac2719dd8363f..e8625bc3d6ab57d3ae01b23d99e68dc34792d7a2 100644 --- a/src/UnitTests/Meshes/DistributedMeshes/DirectionsTest.cpp +++ b/src/UnitTests/Meshes/DistributedMeshes/DirectionsTest.cpp @@ -121,7 +121,7 @@ TEST(XYZ, 3D ) #endif -#include "../../src/UnitTests/GtestMissingError.h" +#include "../../GtestMissingError.h" int main( int argc, char* argv[] ) { #ifdef HAVE_GTEST diff --git a/src/UnitTests/Meshes/DistributedMeshes/DistributedGridIOTestBase.h b/src/UnitTests/Meshes/DistributedMeshes/DistributedGridIOTestBase.h index d7774436c5da27645b62fd5f81f5954a154f9c28..537bcd9239ddde38785b4c462d4bf9d71c0537c7 100644 --- a/src/UnitTests/Meshes/DistributedMeshes/DistributedGridIOTestBase.h +++ b/src/UnitTests/Meshes/DistributedMeshes/DistributedGridIOTestBase.h @@ -12,6 +12,7 @@ #ifdef HAVE_MPI #include "DistributedGridIOTest.h" +#include <TNL/Communicators/ScopedInitializer.h> TEST( DistributedGridIO, Save_1D ) { @@ -134,16 +135,11 @@ int main( int argc, char* argv[] ) delete listeners.Release(listeners.default_result_printer()); listeners.Append(new MinimalistBufferedPrinter); - CommunicatorType::Init(argc,argv ); + Communicators::ScopedInitializer< CommunicatorType > mpi(argc, argv); CommunicatorType::setRedirection( false ); CommunicatorType::setupRedirection(); #endif - int result= RUN_ALL_TESTS(); - - #ifdef HAVE_MPI - CommunicatorType::Finalize(); - #endif - return result; + return RUN_ALL_TESTS(); #else throw GtestMissingError(); diff --git a/src/UnitTests/Meshes/DistributedMeshes/DistributedGridIO_MPIIOTestBase.h b/src/UnitTests/Meshes/DistributedMeshes/DistributedGridIO_MPIIOTestBase.h index aaf5073ec86aa4961d6a40b7528c7b4ac0b73772..4e3603a7a40c36cc04d247f52148f029868dcbdf 100644 --- a/src/UnitTests/Meshes/DistributedMeshes/DistributedGridIO_MPIIOTestBase.h +++ b/src/UnitTests/Meshes/DistributedMeshes/DistributedGridIO_MPIIOTestBase.h @@ -10,6 +10,7 @@ #ifdef HAVE_MPI #include "DistributedGridIO_MPIIOTest.h" +#include <TNL/Communicators/ScopedInitializer.h> TEST( DistributedGridMPIIO, Save_1D ) { @@ -131,16 +132,11 @@ int main( int argc, char* argv[] ) delete listeners.Release(listeners.default_result_printer()); listeners.Append(new MinimalistBufferedPrinter); - CommunicatorType::Init(argc,argv ); + Communicators::ScopedInitializer< CommunicatorType > mpi(argc, argv); CommunicatorType::setRedirection( false ); CommunicatorType::setupRedirection(); #endif - int result= RUN_ALL_TESTS(); - - #ifdef HAVE_MPI - CommunicatorType::Finalize(); - #endif - return result; + return RUN_ALL_TESTS(); #else throw GtestMissingError(); diff --git a/src/UnitTests/Meshes/DistributedMeshes/DistributedGridTest_1D.cpp b/src/UnitTests/Meshes/DistributedMeshes/DistributedGridTest_1D.cpp index e907ecc9dcc5736dda015b807c090b23b1d6f1df..251b9f553a4fc82b7d1bb5ef768991a519aec047 100644 --- a/src/UnitTests/Meshes/DistributedMeshes/DistributedGridTest_1D.cpp +++ b/src/UnitTests/Meshes/DistributedMeshes/DistributedGridTest_1D.cpp @@ -13,6 +13,7 @@ #ifdef HAVE_MPI #include <TNL/Communicators/MpiCommunicator.h> +#include <TNL/Communicators/ScopedInitializer.h> #include <TNL/Functions/MeshFunction.h> #include <TNL/Meshes/DistributedMeshes/DistributedMesh.h> #include <TNL/Meshes/DistributedMeshes/SubdomainOverlapsGetter.h> @@ -422,7 +423,7 @@ TEST(NoMPI, NoTest) }; #endif -#include "../../src/UnitTests/GtestMissingError.h" +#include "../../GtestMissingError.h" int main( int argc, char* argv[] ) { #ifdef HAVE_GTEST @@ -435,14 +436,9 @@ int main( int argc, char* argv[] ) delete listeners.Release(listeners.default_result_printer()); listeners.Append(new MinimalistBufferedPrinter); - CommunicatorType::Init(argc,argv); + Communicators::ScopedInitializer< CommunicatorType > mpi(argc, argv); #endif - int result= RUN_ALL_TESTS(); - - #ifdef HAVE_MPI - CommunicatorType::Finalize(); - #endif - return result; + return RUN_ALL_TESTS(); #else throw GtestMissingError(); diff --git a/src/UnitTests/Meshes/DistributedMeshes/DistributedGridTest_2D.cpp b/src/UnitTests/Meshes/DistributedMeshes/DistributedGridTest_2D.cpp index 36426059363699661f637f3185382b85dda782a3..38276dd5436d83200b75f69a5b6b5221f23e1126 100644 --- a/src/UnitTests/Meshes/DistributedMeshes/DistributedGridTest_2D.cpp +++ b/src/UnitTests/Meshes/DistributedMeshes/DistributedGridTest_2D.cpp @@ -15,6 +15,7 @@ #include <TNL/Meshes/DistributedMeshes/DistributedMesh.h> #include <TNL/Functions/MeshFunction.h> #include <TNL/Communicators/MpiCommunicator.h> +#include <TNL/Communicators/ScopedInitializer.h> #include <TNL/Meshes/DistributedMeshes/SubdomainOverlapsGetter.h> #include "../../Functions/Functions.h" @@ -1049,7 +1050,7 @@ TEST(NoMPI, NoTest) }; #endif -#include "../../src/UnitTests/GtestMissingError.h" +#include "../../GtestMissingError.h" int main( int argc, char* argv[] ) { #ifdef HAVE_GTEST @@ -1062,14 +1063,9 @@ int main( int argc, char* argv[] ) delete listeners.Release(listeners.default_result_printer()); listeners.Append(new MinimalistBufferedPrinter); - CommunicatorType::Init(argc,argv); + Communicators::ScopedInitializer< CommunicatorType > mpi(argc, argv); #endif - int result= RUN_ALL_TESTS(); - - #ifdef HAVE_MPI - CommunicatorType::Finalize(); - #endif - return result; + return RUN_ALL_TESTS(); #else throw GtestMissingError(); diff --git a/src/UnitTests/Meshes/DistributedMeshes/DistributedGridTest_3D.cpp b/src/UnitTests/Meshes/DistributedMeshes/DistributedGridTest_3D.cpp index 1eecf6306c94bf570acd608e49e527da6036b780..6bbd7ad257a176f8f10f2dcefcd5315b385307ce 100644 --- a/src/UnitTests/Meshes/DistributedMeshes/DistributedGridTest_3D.cpp +++ b/src/UnitTests/Meshes/DistributedMeshes/DistributedGridTest_3D.cpp @@ -4,6 +4,7 @@ #ifdef HAVE_MPI #include <TNL/Communicators/MpiCommunicator.h> +#include <TNL/Communicators/ScopedInitializer.h> #include <TNL/Functions/MeshFunction.h> #include <TNL/Meshes/DistributedMeshes/DistributedMesh.h> #include <TNL/Meshes/DistributedMeshes/SubdomainOverlapsGetter.h> @@ -765,7 +766,7 @@ TEST(NoMPI, NoTest) }; #endif -#include "../../src/UnitTests/GtestMissingError.h" +#include "../../GtestMissingError.h" int main( int argc, char* argv[] ) { #ifdef HAVE_GTEST @@ -778,14 +779,9 @@ int main( int argc, char* argv[] ) delete listeners.Release(listeners.default_result_printer()); listeners.Append(new MinimalistBufferedPrinter); - CommunicatorType::Init(argc,argv); + Communicators::ScopedInitializer< CommunicatorType > mpi(argc, argv); #endif - int result= RUN_ALL_TESTS(); - - #ifdef HAVE_MPI - CommunicatorType::Finalize(); - #endif - return result; + return RUN_ALL_TESTS(); #else throw GtestMissingError(); diff --git a/src/UnitTests/Meshes/DistributedMeshes/DistributedVectorFieldIO_MPIIOTest.cpp b/src/UnitTests/Meshes/DistributedMeshes/DistributedVectorFieldIO_MPIIOTest.cpp index d3a1cd55267ad569ecfcf2196a8922a0384f5e36..67098fc5db6c801410425378ab5b609778b376f3 100644 --- a/src/UnitTests/Meshes/DistributedMeshes/DistributedVectorFieldIO_MPIIOTest.cpp +++ b/src/UnitTests/Meshes/DistributedMeshes/DistributedVectorFieldIO_MPIIOTest.cpp @@ -6,6 +6,7 @@ #ifdef HAVE_MPI #include <TNL/Communicators/MpiCommunicator.h> +#include <TNL/Communicators/ScopedInitializer.h> #include "DistributedVectorFieldIO_MPIIOTestBase.h" using namespace TNL::Communicators; @@ -102,16 +103,11 @@ int main( int argc, char* argv[] ) delete listeners.Release(listeners.default_result_printer()); listeners.Append(new MinimalistBufferedPrinter); - CommunicatorType::Init(argc,argv ); + Communicators::ScopedInitializer< CommunicatorType > mpi(argc, argv); CommunicatorType::setRedirection( false ); CommunicatorType::setupRedirection(); #endif - int result= RUN_ALL_TESTS(); - - #ifdef HAVE_MPI - CommunicatorType::Finalize(); - #endif - return result; + return RUN_ALL_TESTS(); #else throw GtestMissingError(); diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 33873402d8074b5fa304e84546e1188b57d9943c..93c15ce593a9db00fc240c6263706f9c10a46ac5 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -1,9 +1,4 @@ -set( ENABLE_CODECOVERAGE ) - ADD_SUBDIRECTORY( data ) -ADD_SUBDIRECTORY( benchmarks ) #ADD_SUBDIRECTORY( unit-tests ) ADD_SUBDIRECTORY( long-time-unit-tests ) ADD_SUBDIRECTORY( mpi ) - -unset( ENABLE_CODECOVERAGE ) diff --git a/tests/benchmarks/CMakeLists.txt b/tests/benchmarks/CMakeLists.txt deleted file mode 100644 index e53ba6878070e6b8f8d3b830ab8f64afcccbcf77..0000000000000000000000000000000000000000 --- a/tests/benchmarks/CMakeLists.txt +++ /dev/null @@ -1,29 +0,0 @@ -ADD_SUBDIRECTORY( share ) -ADD_SUBDIRECTORY( heat-equation-benchmark ) - -IF( BUILD_CUDA ) - CUDA_ADD_EXECUTABLE( tnl-benchmark-blas tnl-benchmark-blas.cu ) - CUDA_ADD_CUBLAS_TO_TARGET( tnl-benchmark-blas ) - TARGET_LINK_LIBRARIES( tnl-benchmark-blas tnl ) - - CUDA_ADD_EXECUTABLE( tnl-benchmark-spmv tnl-benchmark-spmv.cu ) - TARGET_LINK_LIBRARIES( tnl-benchmark-spmv tnl ${CUDA_cusparse_LIBRARY} ) - - CUDA_ADD_EXECUTABLE( tnl-benchmark-linear-solvers tnl-benchmark-linear-solvers.cu ) - TARGET_LINK_LIBRARIES( tnl-benchmark-linear-solvers tnl ) -ELSE() - ADD_EXECUTABLE( tnl-benchmark-blas tnl-benchmark-blas.cpp ) - TARGET_LINK_LIBRARIES( tnl-benchmark-blas tnl ) - - ADD_EXECUTABLE( tnl-benchmark-spmv tnl-benchmark-spmv.cpp ) - TARGET_LINK_LIBRARIES( tnl-benchmark-spmv tnl ) - - ADD_EXECUTABLE( tnl-benchmark-linear-solvers tnl-benchmark-linear-solvers.cpp ) - TARGET_LINK_LIBRARIES( tnl-benchmark-linear-solvers tnl ) -ENDIF() - -INSTALL( TARGETS - tnl-benchmark-blas - tnl-benchmark-spmv - tnl-benchmark-linear-solvers - RUNTIME DESTINATION bin ) diff --git a/tests/benchmarks/array-operations.h b/tests/benchmarks/array-operations.h deleted file mode 100644 index 504dcc1da03a91fa87af913008f9355579e62930..0000000000000000000000000000000000000000 --- a/tests/benchmarks/array-operations.h +++ /dev/null @@ -1,167 +0,0 @@ -/*************************************************************************** - array-operations.h - description - ------------------- - begin : Dec 30, 2015 - copyright : (C) 2015 by Tomas Oberhuber et al. - email : tomas.oberhuber@fjfi.cvut.cz - ***************************************************************************/ - -/* See Copyright Notice in tnl/Copyright */ - -// Implemented by: Jakub Klinkovsky - -#pragma once - -#include "benchmarks.h" - -#include <TNL/Containers/Array.h> - -namespace TNL -{ -namespace benchmarks -{ - -template< typename Real = double, - typename Index = int > -bool -benchmarkArrayOperations( Benchmark & benchmark, - const int & loops, - const long & size ) -{ - typedef Containers::Array< Real, Devices::Host, Index > HostArray; - typedef Containers::Array< Real, Devices::Cuda, Index > CudaArray; - using namespace std; - - double datasetSize = ( double ) ( loops * size ) * sizeof( Real ) / oneGB; - - HostArray hostArray, hostArray2; - CudaArray deviceArray, deviceArray2; - hostArray.setSize( size ); - hostArray2.setSize( size ); -#ifdef HAVE_CUDA - deviceArray.setSize( size ); - deviceArray2.setSize( size ); -#endif - - Real resultHost, resultDevice; - - - // reset functions - auto reset1 = [&]() { - hostArray.setValue( 1.0 ); -#ifdef HAVE_CUDA - deviceArray.setValue( 1.0 ); -#endif - }; - auto reset2 = [&]() { - hostArray2.setValue( 1.0 ); -#ifdef HAVE_CUDA - deviceArray2.setValue( 1.0 ); -#endif - }; - auto reset12 = [&]() { - reset1(); - reset2(); - }; - - - reset12(); - - - auto compareHost = [&]() { - resultHost = (int) hostArray == hostArray2; - }; - auto compareCuda = [&]() { - resultDevice = (int) deviceArray == deviceArray2; - }; - benchmark.setOperation( "comparison (operator==)", 2 * datasetSize ); - benchmark.time( reset1, "CPU", compareHost ); -#ifdef HAVE_CUDA - benchmark.time( reset1, "GPU", compareCuda ); -#endif - - - auto copyAssignHostHost = [&]() { - hostArray = hostArray2; - }; - auto copyAssignCudaCuda = [&]() { - deviceArray = deviceArray2; - }; - benchmark.setOperation( "copy (operator=)", 2 * datasetSize ); - // copyBasetime is used later inside HAVE_CUDA guard, so the compiler will - // complain when compiling without CUDA - const double copyBasetime = benchmark.time( reset1, "CPU", copyAssignHostHost ); -#ifdef HAVE_CUDA - benchmark.time( reset1, "GPU", copyAssignCudaCuda ); -#endif - - - auto copyAssignHostCuda = [&]() { - deviceArray = hostArray; - }; - auto copyAssignCudaHost = [&]() { - hostArray = deviceArray; - }; -#ifdef HAVE_CUDA - benchmark.setOperation( "copy (operator=)", datasetSize, copyBasetime ); - benchmark.time( reset1, - "CPU->GPU", copyAssignHostCuda, - "GPU->CPU", copyAssignCudaHost ); -#endif - - - auto setValueHost = [&]() { - hostArray.setValue( 3.0 ); - }; - auto setValueCuda = [&]() { - deviceArray.setValue( 3.0 ); - }; - benchmark.setOperation( "setValue", datasetSize ); - benchmark.time( reset1, "CPU", setValueHost ); -#ifdef HAVE_CUDA - benchmark.time( reset1, "GPU", setValueCuda ); -#endif - - - auto setSizeHost = [&]() { - hostArray.setSize( size ); - }; - auto setSizeCuda = [&]() { - deviceArray.setSize( size ); - }; - auto resetSize1 = [&]() { - hostArray.reset(); -#ifdef HAVE_CUDA - deviceArray.reset(); -#endif - }; - benchmark.setOperation( "allocation (setSize)", datasetSize ); - benchmark.time( resetSize1, "CPU", setSizeHost ); -#ifdef HAVE_CUDA - benchmark.time( resetSize1, "GPU", setSizeCuda ); -#endif - - - auto resetSizeHost = [&]() { - hostArray.reset(); - }; - auto resetSizeCuda = [&]() { - deviceArray.reset(); - }; - auto setSize1 = [&]() { - hostArray.setSize( size ); -#ifdef HAVE_CUDA - deviceArray.setSize( size ); -#endif - }; - benchmark.setOperation( "deallocation (reset)", datasetSize ); - benchmark.time( setSize1, "CPU", resetSizeHost ); -#ifdef HAVE_CUDA - benchmark.time( setSize1, "GPU", resetSizeCuda ); -#endif - - return true; -} - -} // namespace benchmarks -} // namespace tnl diff --git a/tests/benchmarks/benchmarks.h b/tests/benchmarks/benchmarks.h deleted file mode 100644 index ce5e631a6899170cfeba58911be71e5cc17eb7e6..0000000000000000000000000000000000000000 --- a/tests/benchmarks/benchmarks.h +++ /dev/null @@ -1,437 +0,0 @@ -/*************************************************************************** - benchmarks.h - description - ------------------- - begin : Dec 30, 2015 - copyright : (C) 2015 by Tomas Oberhuber et al. - email : tomas.oberhuber@fjfi.cvut.cz - ***************************************************************************/ - -/* See Copyright Notice in tnl/Copyright */ - -// Implemented by: Jakub Klinkovsky - -#pragma once - -#include <iostream> -#include <iomanip> -#include <map> -#include <vector> - -#include <TNL/Timer.h> -#include <TNL/String.h> - -namespace TNL -{ -namespace benchmarks -{ - -const double oneGB = 1024.0 * 1024.0 * 1024.0; - -template< typename ComputeFunction, - typename ResetFunction > -double -timeFunction( ComputeFunction compute, - ResetFunction reset, - const int & loops ) -{ - // the timer is constructed zero-initialized and stopped - Timer timer; - - reset(); - for(int i = 0; i < loops; ++i) { - // Explicit synchronization of the CUDA device - // TODO: not necessary for host computations -#ifdef HAVE_CUDA - cudaDeviceSynchronize(); -#endif - timer.start(); - compute(); -#ifdef HAVE_CUDA - cudaDeviceSynchronize(); -#endif - timer.stop(); - - reset(); - } - - return timer.getRealTime(); -} - - -struct InternalError {}; - - -class Logging -{ -public: - using MetadataElement = std::pair< const char*, String >; - using MetadataMap = std::map< const char*, String >; - using MetadataColumns = std::vector<MetadataElement>; - - using HeaderElements = std::initializer_list< String >; - using RowElements = std::initializer_list< double >; - - Logging( bool verbose = true ) - : verbose(verbose) - { } - - void - writeTitle( const String & title ) - { - if( verbose ) - std::cout << std::endl << "== " << title << " ==" << std::endl << std::endl; - log << ": title = " << title << std::endl; - } - - void - writeMetadata( const MetadataMap & metadata ) - { - if( verbose ) - std::cout << "properties:" << std::endl; - - for( auto & it : metadata ) { - if( verbose ) - std::cout << " " << it.first << " = " << it.second << std::endl; - log << ": " << it.first << " = " << it.second << std::endl; - } - if( verbose ) - std::cout << std::endl; - } - - void - writeTableHeader( const String & spanningElement, - const HeaderElements & subElements ) - { - using namespace std; - - if( verbose && header_changed ) { - for( auto & it : metadataColumns ) { - std::cout << std::setw( 20 ) << it.first; - } - - // spanning element is printed as usual column to stdout, - // but is excluded from header - std::cout << std::setw( 15 ) << ""; - - for( auto & it : subElements ) { - std::cout << std::setw( 15 ) << it; - } - std::cout << std::endl; - - header_changed = false; - } - - // initial indent string - header_indent = "!"; - log << std::endl; - for( auto & it : metadataColumns ) { - log << header_indent << " " << it.first << std::endl; - } - - // dump stacked spanning columns - if( horizontalGroups.size() > 0 ) - while( horizontalGroups.back().second <= 0 ) { - horizontalGroups.pop_back(); - header_indent.pop_back(); - } - for( size_t i = 0; i < horizontalGroups.size(); i++ ) { - if( horizontalGroups[ i ].second > 0 ) { - log << header_indent << " " << horizontalGroups[ i ].first << std::endl; - header_indent += "!"; - } - } - - log << header_indent << " " << spanningElement << std::endl; - for( auto & it : subElements ) { - log << header_indent << "! " << it << std::endl; - } - - if( horizontalGroups.size() > 0 ) { - horizontalGroups.back().second--; - header_indent.pop_back(); - } - } - - void - writeTableRow( const String & spanningElement, - const RowElements & subElements ) - { - using namespace std; - - if( verbose ) { - for( auto & it : metadataColumns ) { - std::cout << std::setw( 20 ) << it.second; - } - // spanning element is printed as usual column to stdout - std::cout << std::setw( 15 ) << spanningElement; - for( auto & it : subElements ) { - std::cout << std::setw( 15 ); - if( it != 0.0 )std::cout << it; - else std::cout << "N/A"; - } - std::cout << std::endl; - } - - // only when changed (the header has been already adjusted) - // print each element on separate line - for( auto & it : metadataColumns ) { - log << it.second << std::endl; - } - - // benchmark data are indented - const String indent = " "; - for( auto & it : subElements ) { - if( it != 0.0 ) log << indent << it << std::endl; - else log << indent << "N/A" << std::endl; - } - } - - void - writeErrorMessage( const char* msg, - const int & colspan = 1 ) - { - // initial indent string - header_indent = "!"; - log << std::endl; - for( auto & it : metadataColumns ) { - log << header_indent << " " << it.first << std::endl; - } - - // make sure there is a header column for the message - if( horizontalGroups.size() == 0 ) - horizontalGroups.push_back( {"", 1} ); - - // dump stacked spanning columns - while( horizontalGroups.back().second <= 0 ) { - horizontalGroups.pop_back(); - header_indent.pop_back(); - } - for( size_t i = 0; i < horizontalGroups.size(); i++ ) { - if( horizontalGroups[ i ].second > 0 ) { - log << header_indent << " " << horizontalGroups[ i ].first << std::endl; - header_indent += "!"; - } - } - if( horizontalGroups.size() > 0 ) { - horizontalGroups.back().second -= colspan; - header_indent.pop_back(); - } - - // only when changed (the header has been already adjusted) - // print each element on separate line - for( auto & it : metadataColumns ) { - log << it.second << std::endl; - } - log << msg << std::endl; - } - - void - closeTable() - { - log << std::endl; - header_indent = body_indent = ""; - header_changed = true; - horizontalGroups.clear(); - } - - bool save( std::ostream & logFile ) - { - closeTable(); - logFile << log.str(); - if( logFile.good() ) { - log.str() = ""; - return true; - } - return false; - } - -protected: - - // manual double -> String conversion with fixed precision - static String - _to_string( const double & num, const int & precision = 0, bool fixed = false ) - { - std::stringstream str; - if( fixed ) - str << std::fixed; - if( precision ) - str << std::setprecision( precision ); - str << num; - return String( str.str().data() ); - } - - std::stringstream log; - std::string header_indent; - std::string body_indent; - - bool verbose; - MetadataColumns metadataColumns; - bool header_changed = true; - std::vector< std::pair< String, int > > horizontalGroups; -}; - - -class Benchmark - : protected Logging -{ -public: - using Logging::MetadataElement; - using Logging::MetadataMap; - using Logging::MetadataColumns; - - Benchmark( const int & loops = 10, - bool verbose = true ) - : Logging(verbose), loops(loops) - { } - - // TODO: ensure that this is not called in the middle of the benchmark - // (or just remove it completely?) - void - setLoops( const int & loops ) - { - this->loops = loops; - } - - // Marks the start of a new benchmark - void - newBenchmark( const String & title ) - { - closeTable(); - writeTitle( title ); - } - - // Marks the start of a new benchmark (with custom metadata) - void - newBenchmark( const String & title, - MetadataMap metadata ) - { - closeTable(); - writeTitle( title ); - // add loops to metadata - metadata["loops"] = String(loops); - writeMetadata( metadata ); - } - - // Sets metadata columns -- values used for all subsequent rows until - // the next call to this function. - void - setMetadataColumns( const MetadataColumns & metadata ) - { - if( metadataColumns != metadata ) - header_changed = true; - metadataColumns = metadata; - } - - // TODO: maybe should be renamed to createVerticalGroup and ensured that vertical and horizontal groups are not used within the same "Benchmark" - // Sets current operation -- operations expand the table vertically - // - baseTime should be reset to 0.0 for most operations, but sometimes - // it is useful to override it - // - Order of operations inside a "Benchmark" does not matter, rows can be - // easily sorted while converting to HTML.) - void - setOperation( const String & operation, - const double & datasetSize = 0.0, // in GB - const double & baseTime = 0.0 ) - { - if( metadataColumns.size() > 0 && String(metadataColumns[ 0 ].first) == "operation" ) { - metadataColumns[ 0 ].second = operation; - } - else { - metadataColumns.insert( metadataColumns.begin(), {"operation", operation} ); - } - setOperation( datasetSize, baseTime ); - header_changed = true; - } - - void - setOperation( const double & datasetSize = 0.0, - const double & baseTime = 0.0 ) - { - this->datasetSize = datasetSize; - this->baseTime = baseTime; - } - - // Creates new horizontal groups inside a benchmark -- increases the number - // of columns in the "Benchmark", implies column spanning. - // (Useful e.g. for SpMV formats, different configurations etc.) - void - createHorizontalGroup( const String & name, - const int & subcolumns ) - { - if( horizontalGroups.size() == 0 ) { - horizontalGroups.push_back( {name, subcolumns} ); - } - else { - auto & last = horizontalGroups.back(); - if( last.first != name && last.second > 0 ) { - horizontalGroups.push_back( {name, subcolumns} ); - } - else { - last.first = name; - last.second = subcolumns; - } - } - } - - // Times a single ComputeFunction. Subsequent calls implicitly split - // the current "horizontal group" into sub-columns identified by - // "performer", which are further split into "bandwidth", "time" and - // "speedup" columns. - // TODO: allow custom columns bound to lambda functions (e.g. for Gflops calculation) - // Also terminates the recursion of the following variadic template. - template< typename ResetFunction, - typename ComputeFunction > - double - time( ResetFunction reset, - const String & performer, - ComputeFunction & compute ) - { - const double time = timeFunction( compute, reset, loops ); - const double bandwidth = datasetSize / time; - const double speedup = this->baseTime / time; - if( this->baseTime == 0.0 ) - this->baseTime = time; - - writeTableHeader( performer, HeaderElements({"bandwidth", "time", "speedup"}) ); - writeTableRow( performer, RowElements({ bandwidth, time, speedup }) ); - - return this->baseTime; - } - - // Recursive template function to deal with multiple computations with the - // same reset function. - template< typename ResetFunction, - typename ComputeFunction, - typename... NextComputations > - inline double - time( ResetFunction reset, - const String & performer, - ComputeFunction & compute, - NextComputations & ... nextComputations ) - { - time( reset, performer, compute ); - time( reset, nextComputations... ); - return this->baseTime; - } - - // Adds an error message to the log. Should be called in places where the - // "time" method could not be called (e.g. due to failed allocation). - void - addErrorMessage( const char* msg, - const int & numberOfComputations = 1 ) - { - // each computation has 3 subcolumns - const int colspan = 3 * numberOfComputations; - writeErrorMessage( msg, colspan ); - } - - using Logging::save; - -protected: - int loops; - double datasetSize = 0.0; - double baseTime = 0.0; -}; - -} // namespace benchmarks -} // namespace tnl diff --git a/tests/benchmarks/spmv.h b/tests/benchmarks/spmv.h deleted file mode 100644 index cc957c7278ec9b21a418e627af9063e32bec3785..0000000000000000000000000000000000000000 --- a/tests/benchmarks/spmv.h +++ /dev/null @@ -1,191 +0,0 @@ -/*************************************************************************** - spmv.h - description - ------------------- - begin : Dec 30, 2015 - copyright : (C) 2015 by Tomas Oberhuber et al. - email : tomas.oberhuber@fjfi.cvut.cz - ***************************************************************************/ - -/* See Copyright Notice in tnl/Copyright */ - -// Implemented by: Jakub Klinkovsky - -#pragma once - -#include "benchmarks.h" - -#include <TNL/Containers/List.h> -#include <TNL/Pointers/DevicePointer.h> -#include <TNL/Matrices/CSR.h> -#include <TNL/Matrices/Ellpack.h> -#include <TNL/Matrices/SlicedEllpack.h> -#include <TNL/Matrices/ChunkedEllpack.h> - -namespace TNL -{ -namespace benchmarks -{ - -// silly alias to match the number of template parameters with other formats -template< typename Real, typename Device, typename Index > -using SlicedEllpack = Matrices::SlicedEllpack< Real, Device, Index >; - -template< typename Matrix > -int setHostTestMatrix( Matrix& matrix, - const int elementsPerRow ) -{ - const int size = matrix.getRows(); - int elements( 0 ); - for( int row = 0; row < size; row++ ) { - int col = row - elementsPerRow / 2; - for( int element = 0; element < elementsPerRow; element++ ) { - if( col + element >= 0 && - col + element < size ) - { - matrix.setElement( row, col + element, element + 1 ); - elements++; - } - } - } - return elements; -} - -#ifdef HAVE_CUDA -template< typename Matrix > -__global__ void setCudaTestMatrixKernel( Matrix* matrix, - const int elementsPerRow, - const int gridIdx ) -{ - const int rowIdx = ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; - if( rowIdx >= matrix->getRows() ) - return; - int col = rowIdx - elementsPerRow / 2; - for( int element = 0; element < elementsPerRow; element++ ) { - if( col + element >= 0 && - col + element < matrix->getColumns() ) - matrix->setElementFast( rowIdx, col + element, element + 1 ); - } -} -#endif - -template< typename Matrix > -void setCudaTestMatrix( Matrix& matrix, - const int elementsPerRow ) -{ -#ifdef HAVE_CUDA - typedef typename Matrix::IndexType IndexType; - typedef typename Matrix::RealType RealType; - Pointers::DevicePointer< Matrix > kernel_matrix( matrix ); - dim3 cudaBlockSize( 256 ), cudaGridSize( Devices::Cuda::getMaxGridSize() ); - const IndexType cudaBlocks = roundUpDivision( matrix.getRows(), cudaBlockSize.x ); - const IndexType cudaGrids = roundUpDivision( cudaBlocks, Devices::Cuda::getMaxGridSize() ); - for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) { - if( gridIdx == cudaGrids - 1 ) - cudaGridSize.x = cudaBlocks % Devices::Cuda::getMaxGridSize(); - setCudaTestMatrixKernel< Matrix > - <<< cudaGridSize, cudaBlockSize >>> - ( &kernel_matrix.template modifyData< Devices::Cuda >(), elementsPerRow, gridIdx ); - TNL_CHECK_CUDA_DEVICE; - } -#endif -} - - -// TODO: rename as benchmark_SpMV_synthetic and move to spmv-synthetic.h -template< typename Real, - template< typename, typename, typename > class Matrix, - template< typename, typename, typename > class Vector = Containers::Vector > -bool -benchmarkSpMV( Benchmark & benchmark, - const int & loops, - const int & size, - const int elementsPerRow = 5 ) -{ - typedef Matrix< Real, Devices::Host, int > HostMatrix; - typedef Matrix< Real, Devices::Cuda, int > DeviceMatrix; - typedef Containers::Vector< Real, Devices::Host, int > HostVector; - typedef Containers::Vector< Real, Devices::Cuda, int > CudaVector; - - HostMatrix hostMatrix; - DeviceMatrix deviceMatrix; - Containers::Vector< int, Devices::Host, int > hostRowLengths; - Containers::Vector< int, Devices::Cuda, int > deviceRowLengths; - HostVector hostVector, hostVector2; - CudaVector deviceVector, deviceVector2; - - // create benchmark group - Containers::List< String > parsedType; - parseObjectType( HostMatrix::getType(), parsedType ); - benchmark.createHorizontalGroup( parsedType[ 0 ], 2 ); - - hostRowLengths.setSize( size ); - hostMatrix.setDimensions( size, size ); - hostVector.setSize( size ); - hostVector2.setSize( size ); -#ifdef HAVE_CUDA - deviceRowLengths.setSize( size ); - deviceMatrix.setDimensions( size, size ); - deviceVector.setSize( size ); - deviceVector2.setSize( size ); -#endif - - hostRowLengths.setValue( elementsPerRow ); -#ifdef HAVE_CUDA - deviceRowLengths.setValue( elementsPerRow ); -#endif - - hostMatrix.setCompressedRowLengths( hostRowLengths ); -#ifdef HAVE_CUDA - deviceMatrix.setCompressedRowLengths( deviceRowLengths ); -#endif - - const int elements = setHostTestMatrix< HostMatrix >( hostMatrix, elementsPerRow ); - setCudaTestMatrix< DeviceMatrix >( deviceMatrix, elementsPerRow ); - const double datasetSize = ( double ) loops * elements * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB; - - // reset function - auto reset = [&]() { - hostVector.setValue( 1.0 ); - hostVector2.setValue( 0.0 ); -#ifdef HAVE_CUDA - deviceVector.setValue( 1.0 ); - deviceVector2.setValue( 0.0 ); -#endif - }; - - // compute functions - auto spmvHost = [&]() { - hostMatrix.vectorProduct( hostVector, hostVector2 ); - }; - auto spmvCuda = [&]() { - deviceMatrix.vectorProduct( deviceVector, deviceVector2 ); - }; - - benchmark.setOperation( datasetSize ); - benchmark.time( reset, "CPU", spmvHost ); -#ifdef HAVE_CUDA - benchmark.time( reset, "GPU", spmvCuda ); -#endif - - return true; -} - -template< typename Real = double, - typename Index = int > -bool -benchmarkSpmvSynthetic( Benchmark & benchmark, - const int & loops, - const int & size, - const int & elementsPerRow ) -{ - bool result = true; - // TODO: benchmark all formats from tnl-benchmark-spmv (different parameters of the base formats) - result |= benchmarkSpMV< Real, Matrices::CSR >( benchmark, loops, size, elementsPerRow ); - result |= benchmarkSpMV< Real, Matrices::Ellpack >( benchmark, loops, size, elementsPerRow ); - result |= benchmarkSpMV< Real, SlicedEllpack >( benchmark, loops, size, elementsPerRow ); - result |= benchmarkSpMV< Real, Matrices::ChunkedEllpack >( benchmark, loops, size, elementsPerRow ); - return result; -} - -} // namespace benchmarks -} // namespace tnl diff --git a/tests/benchmarks/tnl-benchmark-blas.h b/tests/benchmarks/tnl-benchmark-blas.h deleted file mode 100644 index 8c71209534894d10a430bae874035c9d14d7bbc3..0000000000000000000000000000000000000000 --- a/tests/benchmarks/tnl-benchmark-blas.h +++ /dev/null @@ -1,192 +0,0 @@ -/*************************************************************************** - tnl-benchmark-blas.h - description - ------------------- - begin : Jan 27, 2010 - copyright : (C) 2010 by Tomas Oberhuber et al. - email : tomas.oberhuber@fjfi.cvut.cz - ***************************************************************************/ - -/* See Copyright Notice in tnl/Copyright */ - -// Implemented by: Jakub Klinkovsky - -#pragma once - -#include <TNL/Devices/Host.h> -#include <TNL/Devices/CudaDeviceInfo.h> -#include <TNL/Devices/SystemInfo.h> -#include <TNL/Config/ConfigDescription.h> -#include <TNL/Config/ParameterContainer.h> - -#include "array-operations.h" -#include "vector-operations.h" -#include "spmv.h" - -using namespace TNL; -using namespace TNL::benchmarks; - - -// TODO: should benchmarks check the result of the computation? - - -template< typename Real > -void -runBlasBenchmarks( Benchmark & benchmark, - Benchmark::MetadataMap metadata, - const std::size_t & minSize, - const std::size_t & maxSize, - const double & sizeStepFactor, - const unsigned & loops, - const unsigned & elementsPerRow ) -{ - const String precision = getType< Real >(); - metadata["precision"] = precision; - - // Array operations - benchmark.newBenchmark( String("Array operations (") + precision + ")", - metadata ); - for( std::size_t size = minSize; size <= maxSize; size *= 2 ) { - benchmark.setMetadataColumns( Benchmark::MetadataColumns({ - {"size", size}, - } )); - benchmarkArrayOperations< Real >( benchmark, loops, size ); - } - - // Vector operations - benchmark.newBenchmark( String("Vector operations (") + precision + ")", - metadata ); - for( std::size_t size = minSize; size <= maxSize; size *= sizeStepFactor ) { - benchmark.setMetadataColumns( Benchmark::MetadataColumns({ - {"size", size}, - } )); - benchmarkVectorOperations< Real >( benchmark, loops, size ); - } - - // Sparse matrix-vector multiplication - benchmark.newBenchmark( String("Sparse matrix-vector multiplication (") + precision + ")", - metadata ); - for( std::size_t size = minSize; size <= maxSize; size *= 2 ) { - benchmark.setMetadataColumns( Benchmark::MetadataColumns({ - {"rows", size}, - {"columns", size}, - {"elements per row", elementsPerRow}, - } )); - benchmarkSpmvSynthetic< Real >( benchmark, loops, size, elementsPerRow ); - } -} - -void -setupConfig( Config::ConfigDescription & config ) -{ - config.addDelimiter( "Benchmark settings:" ); - config.addEntry< String >( "log-file", "Log file name.", "tnl-benchmark-blas.log"); - config.addEntry< String >( "output-mode", "Mode for opening the log file.", "overwrite" ); - config.addEntryEnum( "append" ); - config.addEntryEnum( "overwrite" ); - config.addEntry< String >( "precision", "Precision of the arithmetics.", "double" ); - config.addEntryEnum( "float" ); - config.addEntryEnum( "double" ); - config.addEntryEnum( "all" ); - config.addEntry< int >( "min-size", "Minimum size of arrays/vectors used in the benchmark.", 100000 ); - config.addEntry< int >( "max-size", "Minimum size of arrays/vectors used in the benchmark.", 10000000 ); - config.addEntry< int >( "size-step-factor", "Factor determining the size of arrays/vectors used in the benchmark. First size is min-size and each following size is stepFactor*previousSize, up to max-size.", 2 ); - config.addEntry< int >( "loops", "Number of iterations for every computation.", 10 ); - config.addEntry< int >( "elements-per-row", "Number of elements per row of the sparse matrix used in the matrix-vector multiplication benchmark.", 5 ); - config.addEntry< int >( "verbose", "Verbose mode.", 1 ); - - config.addDelimiter( "Device settings:" ); - Devices::Host::configSetup( config ); - Devices::Cuda::configSetup( config ); -} - -int -main( int argc, char* argv[] ) -{ - Config::ParameterContainer parameters; - Config::ConfigDescription conf_desc; - - setupConfig( conf_desc ); - - if( ! parseCommandLine( argc, argv, conf_desc, parameters ) ) { - conf_desc.printUsage( argv[ 0 ] ); - return 1; - } - - Devices::Host::setup( parameters ); - Devices::Cuda::setup( parameters ); - - const String & logFileName = parameters.getParameter< String >( "log-file" ); - const String & outputMode = parameters.getParameter< String >( "output-mode" ); - const String & precision = parameters.getParameter< String >( "precision" ); - // FIXME: getParameter< std::size_t >() does not work with parameters added with addEntry< int >(), - // which have a default value. The workaround below works for int values, but it is not possible - // to pass 64-bit integer values -// const std::size_t minSize = parameters.getParameter< std::size_t >( "min-size" ); -// const std::size_t maxSize = parameters.getParameter< std::size_t >( "max-size" ); - const std::size_t minSize = parameters.getParameter< int >( "min-size" ); - const std::size_t maxSize = parameters.getParameter< int >( "max-size" ); - const unsigned sizeStepFactor = parameters.getParameter< unsigned >( "size-step-factor" ); - const unsigned loops = parameters.getParameter< unsigned >( "loops" ); - const unsigned elementsPerRow = parameters.getParameter< unsigned >( "elements-per-row" ); - const unsigned verbose = parameters.getParameter< unsigned >( "verbose" ); - - if( sizeStepFactor <= 1 ) { - std::cerr << "The value of --size-step-factor must be greater than 1." << std::endl; - return EXIT_FAILURE; - } - - // open log file - auto mode = std::ios::out; - if( outputMode == "append" ) - mode |= std::ios::app; - std::ofstream logFile( logFileName.getString(), mode ); - - // init benchmark and common metadata - Benchmark benchmark( loops, verbose ); - - // prepare global metadata - const int cpu_id = 0; - Devices::CacheSizes cacheSizes = Devices::SystemInfo::getCPUCacheSizes( cpu_id ); - String cacheInfo = String( cacheSizes.L1data ) + ", " - + String( cacheSizes.L1instruction ) + ", " - + String( cacheSizes.L2 ) + ", " - + String( cacheSizes.L3 ); -#ifdef HAVE_CUDA - const int activeGPU = Devices::CudaDeviceInfo::getActiveDevice(); - const String deviceArch = String( Devices::CudaDeviceInfo::getArchitectureMajor( activeGPU ) ) + "." + - String( Devices::CudaDeviceInfo::getArchitectureMinor( activeGPU ) ); -#endif - Benchmark::MetadataMap metadata { - { "host name", Devices::SystemInfo::getHostname() }, - { "architecture", Devices::SystemInfo::getArchitecture() }, - { "system", Devices::SystemInfo::getSystemName() }, - { "system release", Devices::SystemInfo::getSystemRelease() }, - { "start time", Devices::SystemInfo::getCurrentTime() }, - { "CPU model name", Devices::SystemInfo::getCPUModelName( cpu_id ) }, - { "CPU cores", Devices::SystemInfo::getNumberOfCores( cpu_id ) }, - { "CPU threads per core", Devices::SystemInfo::getNumberOfThreads( cpu_id ) / Devices::SystemInfo::getNumberOfCores( cpu_id ) }, - { "CPU max frequency (MHz)", Devices::SystemInfo::getCPUMaxFrequency( cpu_id ) / 1e3 }, - { "CPU cache sizes (L1d, L1i, L2, L3) (kiB)", cacheInfo }, -#ifdef HAVE_CUDA - { "GPU name", Devices::CudaDeviceInfo::getDeviceName( activeGPU ) }, - { "GPU architecture", deviceArch }, - { "GPU CUDA cores", Devices::CudaDeviceInfo::getCudaCores( activeGPU ) }, - { "GPU clock rate (MHz)", (double) Devices::CudaDeviceInfo::getClockRate( activeGPU ) / 1e3 }, - { "GPU global memory (GB)", (double) Devices::CudaDeviceInfo::getGlobalMemory( activeGPU ) / 1e9 }, - { "GPU memory clock rate (MHz)", (double) Devices::CudaDeviceInfo::getMemoryClockRate( activeGPU ) / 1e3 }, - { "GPU memory ECC enabled", Devices::CudaDeviceInfo::getECCEnabled( activeGPU ) }, -#endif - }; - - if( precision == "all" || precision == "float" ) - runBlasBenchmarks< float >( benchmark, metadata, minSize, maxSize, sizeStepFactor, loops, elementsPerRow ); - if( precision == "all" || precision == "double" ) - runBlasBenchmarks< double >( benchmark, metadata, minSize, maxSize, sizeStepFactor, loops, elementsPerRow ); - - if( ! benchmark.save( logFile ) ) { - std::cerr << "Failed to write the benchmark results to file '" << parameters.getParameter< String >( "log-file" ) << "'." << std::endl; - return EXIT_FAILURE; - } - - return EXIT_SUCCESS; -} diff --git a/tests/benchmarks/vector-operations.h b/tests/benchmarks/vector-operations.h deleted file mode 100644 index cdf2443964a5bb33354de6e0ec0688018c7128a1..0000000000000000000000000000000000000000 --- a/tests/benchmarks/vector-operations.h +++ /dev/null @@ -1,442 +0,0 @@ -/*************************************************************************** - vector-operations.h - description - ------------------- - begin : Dec 30, 2015 - copyright : (C) 2015 by Tomas Oberhuber et al. - email : tomas.oberhuber@fjfi.cvut.cz - ***************************************************************************/ - -/* See Copyright Notice in tnl/Copyright */ - -// Implemented by: Jakub Klinkovsky - -#pragma once - -#include <stdlib.h> // srand48 - -#include "benchmarks.h" - -#include <TNL/Containers/Vector.h> - -#ifdef HAVE_CUDA -#include "cublasWrappers.h" -#endif - -namespace TNL -{ -namespace benchmarks -{ - -template< typename Real = double, - typename Index = int > -bool -benchmarkVectorOperations( Benchmark & benchmark, - const int & loops, - const long & size ) -{ - typedef Containers::Vector< Real, Devices::Host, Index > HostVector; - typedef Containers::Vector< Real, Devices::Cuda, Index > CudaVector; - using namespace std; - - double datasetSize = ( double ) ( loops * size ) * sizeof( Real ) / oneGB; - - HostVector hostVector, hostVector2; - CudaVector deviceVector, deviceVector2; - hostVector.setSize( size ); - hostVector2.setSize( size ); -#ifdef HAVE_CUDA - deviceVector.setSize( size ); - deviceVector2.setSize( size ); -#endif - - Real resultHost, resultDevice; - -#ifdef HAVE_CUDA - cublasHandle_t cublasHandle; - cublasCreate( &cublasHandle ); -#endif - - - // reset functions - // (Make sure to always use some in benchmarks, even if it's not necessary - // to assure correct result - it helps to clear cache and avoid optimizations - // of the benchmark loop.) - auto reset1 = [&]() { - hostVector.setValue( 1.0 ); -#ifdef HAVE_CUDA - deviceVector.setValue( 1.0 ); -#endif - // A relatively harmless call to keep the compiler from realizing we - // don't actually do any useful work with the result of the reduciton. - srand48(resultHost); - resultHost = resultDevice = 0.0; - }; - auto reset2 = [&]() { - hostVector2.setValue( 1.0 ); -#ifdef HAVE_CUDA - deviceVector2.setValue( 1.0 ); -#endif - }; - auto reset12 = [&]() { - reset1(); - reset2(); - }; - - - reset12(); - - - auto maxHost = [&]() { - resultHost = hostVector.max(); - }; - auto maxHostGeneral = [&]() { - Real result( 0 ); - Containers::Algorithms::ParallelReductionMax< Real > operation; - Containers::Algorithms::Reduction< Devices::Host >::reduce( - operation, - hostVector.getSize(), - hostVector.getData(), - ( Real* ) 0, - result ); - return result; - }; - auto maxCuda = [&]() { - resultDevice = deviceVector.max(); - }; - benchmark.setOperation( "max", datasetSize ); - benchmark.time( reset1, "CPU", maxHost ); - benchmark.time( reset1, "CPU (general)", maxHostGeneral ); -#ifdef HAVE_CUDA - benchmark.time( reset1, "GPU", maxCuda ); -#endif - - - auto minHost = [&]() { - resultHost = hostVector.min(); - }; - auto minHostGeneral = [&]() { - Real result( 0 ); - Containers::Algorithms::ParallelReductionMin< Real > operation; - Containers::Algorithms::Reduction< Devices::Host >::reduce( - operation, - hostVector.getSize(), - hostVector.getData(), - ( Real* ) 0, - result ); - return result; - }; - auto minCuda = [&]() { - resultDevice = deviceVector.min(); - }; - benchmark.setOperation( "min", datasetSize ); - benchmark.time( reset1, "CPU", minHost ); - benchmark.time( reset1, "CPU (general)", minHostGeneral ); -#ifdef HAVE_CUDA - benchmark.time( reset1, "GPU", minCuda ); -#endif - - - auto absMaxHost = [&]() { - resultHost = hostVector.absMax(); - }; - auto absMaxHostGeneral = [&]() { - Real result( 0 ); - Containers::Algorithms::ParallelReductionAbsMax< Real > operation; - Containers::Algorithms::Reduction< Devices::Host >::reduce( - operation, - hostVector.getSize(), - hostVector.getData(), - ( Real* ) 0, - result ); - return result; - }; - auto absMaxCuda = [&]() { - resultDevice = deviceVector.absMax(); - }; -#ifdef HAVE_CUDA - auto absMaxCublas = [&]() { - int index = 0; - cublasIgamax( cublasHandle, size, - deviceVector.getData(), 1, - &index ); - resultDevice = deviceVector.getElement( index ); - }; -#endif - benchmark.setOperation( "absMax", datasetSize ); - benchmark.time( reset1, "CPU", absMaxHost ); - benchmark.time( reset1, "CPU (general)", absMaxHostGeneral ); -#ifdef HAVE_CUDA - benchmark.time( reset1, "GPU", absMaxCuda ); - benchmark.time( reset1, "cuBLAS", absMaxCublas ); -#endif - - - auto absMinHost = [&]() { - resultHost = hostVector.absMin(); - }; - auto absMinHostGeneral = [&]() { - Real result( 0 ); - Containers::Algorithms::ParallelReductionAbsMin< Real > operation; - Containers::Algorithms::Reduction< Devices::Host >::reduce( - operation, - hostVector.getSize(), - hostVector.getData(), - ( Real* ) 0, - result ); - return result; - }; - auto absMinCuda = [&]() { - resultDevice = deviceVector.absMin(); - }; -#ifdef HAVE_CUDA - auto absMinCublas = [&]() { - int index = 0; - cublasIgamin( cublasHandle, size, - deviceVector.getData(), 1, - &index ); - resultDevice = deviceVector.getElement( index ); - }; -#endif - benchmark.setOperation( "absMin", datasetSize ); - benchmark.time( reset1, "CPU", absMinHost ); - benchmark.time( reset1, "CPU (general)", absMinHostGeneral ); -#ifdef HAVE_CUDA - benchmark.time( reset1, "GPU", absMinCuda ); - benchmark.time( reset1, "cuBLAS", absMinCublas ); -#endif - - - auto sumHost = [&]() { - resultHost = hostVector.sum(); - }; - auto sumHostGeneral = [&]() { - Real result( 0 ); - Containers::Algorithms::ParallelReductionSum< Real > operation; - Containers::Algorithms::Reduction< Devices::Host >::reduce( - operation, - hostVector.getSize(), - hostVector.getData(), - ( Real* ) 0, - result ); - return result; - }; - auto sumCuda = [&]() { - resultDevice = deviceVector.sum(); - }; - benchmark.setOperation( "sum", datasetSize ); - benchmark.time( reset1, "CPU", sumHost ); - benchmark.time( reset1, "CPU (general)", sumHostGeneral ); -#ifdef HAVE_CUDA - benchmark.time( reset1, "GPU", sumCuda ); -#endif - - - auto l1normHost = [&]() { - resultHost = hostVector.lpNorm( 1.0 ); - }; - auto l1normHostGeneral = [&]() { - Real result( 0 ); - Containers::Algorithms::ParallelReductionAbsSum< Real > operation; - Containers::Algorithms::Reduction< Devices::Host >::reduce( - operation, - hostVector.getSize(), - hostVector.getData(), - ( Real* ) 0, - result ); - return result; - }; - auto l1normCuda = [&]() { - resultDevice = deviceVector.lpNorm( 1.0 ); - }; -#ifdef HAVE_CUDA - auto l1normCublas = [&]() { - cublasGasum( cublasHandle, size, - deviceVector.getData(), 1, - &resultDevice ); - }; -#endif - benchmark.setOperation( "l1 norm", datasetSize ); - benchmark.time( reset1, "CPU", l1normHost ); - benchmark.time( reset1, "CPU (general)", l1normHostGeneral ); -#ifdef HAVE_CUDA - benchmark.time( reset1, "GPU", l1normCuda ); - benchmark.time( reset1, "cuBLAS", l1normCublas ); -#endif - - - auto l2normHost = [&]() { - resultHost = hostVector.lpNorm( 2.0 ); - }; - auto l2normHostGeneral = [&]() { - Real result( 0 ); - Containers::Algorithms::ParallelReductionL2Norm< Real > operation; - Containers::Algorithms::Reduction< Devices::Host >::reduce( - operation, - hostVector.getSize(), - hostVector.getData(), - ( Real* ) 0, - result ); - return result; - }; - auto l2normCuda = [&]() { - resultDevice = deviceVector.lpNorm( 2.0 ); - }; -#ifdef HAVE_CUDA - auto l2normCublas = [&]() { - cublasGnrm2( cublasHandle, size, - deviceVector.getData(), 1, - &resultDevice ); - }; -#endif - benchmark.setOperation( "l2 norm", datasetSize ); - benchmark.time( reset1, "CPU", l2normHost ); - benchmark.time( reset1, "CPU (general)", l2normHostGeneral ); -#ifdef HAVE_CUDA - benchmark.time( reset1, "GPU", l2normCuda ); - benchmark.time( reset1, "cuBLAS", l2normCublas ); -#endif - - - auto l3normHost = [&]() { - resultHost = hostVector.lpNorm( 3.0 ); - }; - auto l3normHostGeneral = [&]() { - Real result( 0 ); - Containers::Algorithms::ParallelReductionLpNorm< Real > operation; - operation.setPower( 3.0 ); - Containers::Algorithms::Reduction< Devices::Host >::reduce( - operation, - hostVector.getSize(), - hostVector.getData(), - ( Real* ) 0, - result ); - return result; - }; - auto l3normCuda = [&]() { - resultDevice = deviceVector.lpNorm( 3.0 ); - }; - benchmark.setOperation( "l3 norm", datasetSize ); - benchmark.time( reset1, "CPU", l3normHost ); - benchmark.time( reset1, "CPU (general)", l3normHostGeneral ); -#ifdef HAVE_CUDA - benchmark.time( reset1, "GPU", l3normCuda ); -#endif - - - auto scalarProductHost = [&]() { - resultHost = hostVector.scalarProduct( hostVector2 ); - }; - auto scalarProductHostGeneral = [&]() { - Real result( 0 ); - Containers::Algorithms::ParallelReductionScalarProduct< Real, Real > operation; - Containers::Algorithms::Reduction< Devices::Host >::reduce( - operation, - hostVector.getSize(), - hostVector.getData(), - hostVector2.getData(), - result ); - return result; - }; - auto scalarProductCuda = [&]() { - resultDevice = deviceVector.scalarProduct( deviceVector2 ); - }; -#ifdef HAVE_CUDA - auto scalarProductCublas = [&]() { - cublasGdot( cublasHandle, size, - deviceVector.getData(), 1, - deviceVector2.getData(), 1, - &resultDevice ); - }; -#endif - benchmark.setOperation( "scalar product", 2 * datasetSize ); - benchmark.time( reset1, "CPU", scalarProductHost ); - benchmark.time( reset1, "CPU (general)", scalarProductHostGeneral ); -#ifdef HAVE_CUDA - benchmark.time( reset1, "GPU", scalarProductCuda ); - benchmark.time( reset1, "cuBLAS", scalarProductCublas ); -#endif - - /* - std::cout << "Benchmarking prefix-sum:" << std::endl; - timer.reset(); - timer.start(); - hostVector.computePrefixSum(); - timer.stop(); - timeHost = timer.getTime(); - bandwidth = 2 * datasetSize / loops / timer.getTime(); - std::cout << " CPU: bandwidth: " << bandwidth << " GB/sec, time: " << timer.getTime() << " sec." << std::endl; - - timer.reset(); - timer.start(); - deviceVector.computePrefixSum(); - timer.stop(); - timeDevice = timer.getTime(); - bandwidth = 2 * datasetSize / loops / timer.getTime(); - std::cout << " GPU: bandwidth: " << bandwidth << " GB/sec, time: " << timer.getTime() << " sec." << std::endl; - std::cout << " CPU/GPU speedup: " << timeHost / timeDevice << std::endl; - - HostVector auxHostVector; - auxHostVector.setLike( deviceVector ); - auxHostVector = deviceVector; - for( int i = 0; i < size; i++ ) - if( hostVector.getElement( i ) != auxHostVector.getElement( i ) ) - { - std::cerr << "Error in prefix sum at position " << i << ": " << hostVector.getElement( i ) << " != " << auxHostVector.getElement( i ) << std::endl; - } - */ - - - auto multiplyHost = [&]() { - hostVector *= 0.5; - }; - auto multiplyCuda = [&]() { - deviceVector *= 0.5; - }; -#ifdef HAVE_CUDA - auto multiplyCublas = [&]() { - const Real alpha = 0.5; - cublasGscal( cublasHandle, size, - &alpha, - deviceVector.getData(), 1 ); - }; -#endif - benchmark.setOperation( "scalar multiplication", 2 * datasetSize ); - benchmark.time( reset1, "CPU", multiplyHost ); -#ifdef HAVE_CUDA - benchmark.time( reset1, "GPU", multiplyCuda ); - benchmark.time( reset1, "cuBLAS", multiplyCublas ); -#endif - - - auto addVectorHost = [&]() { - hostVector.addVector( hostVector2 ); - }; - auto addVectorCuda = [&]() { - deviceVector.addVector( deviceVector2 ); - }; -#ifdef HAVE_CUDA - auto addVectorCublas = [&]() { - const Real alpha = 1.0; - cublasGaxpy( cublasHandle, size, - &alpha, - deviceVector2.getData(), 1, - deviceVector.getData(), 1 ); - }; -#endif - benchmark.setOperation( "vector addition", 3 * datasetSize ); - benchmark.time( reset1, "CPU", addVectorHost ); -#ifdef HAVE_CUDA - benchmark.time( reset1, "GPU", addVectorCuda ); - benchmark.time( reset1, "cuBLAS", addVectorCublas ); -#endif - - -#ifdef HAVE_CUDA - cublasDestroy( cublasHandle ); -#endif - - return true; -} - -} // namespace benchmarks -} // namespace tnl diff --git a/tests/mpi/GPUmeshFunctionEvaluateTest.cu b/tests/mpi/GPUmeshFunctionEvaluateTest.cu index 1008b4a4c04dd9edebebd62e4044e34d80587b11..1dcdb95207bbe4896e8a55311658edd1b7d9d440 100644 --- a/tests/mpi/GPUmeshFunctionEvaluateTest.cu +++ b/tests/mpi/GPUmeshFunctionEvaluateTest.cu @@ -9,6 +9,7 @@ #include <TNL/Meshes/Grid.h> #include <TNL/Communicators/MpiCommunicator.h> #include <TNL/Communicators/NoDistrCommunicator.h> +#include <TNL/Communicators/ScopedInitializer.h> #include <TNL/Functions/MeshFunction.h> #include <TNL/Meshes/DistributedMeshes/DistributedMesh.h> #include <TNL/Meshes/DistributedMeshes/SubdomainOverlapsGetter.h> @@ -56,7 +57,7 @@ int main ( int argc, char *argv[]) typedef LinearFunction<double,DIMENSION> LinearFunctionType; typedef ConstFunction<double,DIMENSION> ConstFunctionType; - CommunicatorType::Init(argc,argv); + Communicators::ScopedInitializer< CommunicatorType > mpi(argc, argv); int size=10; int cycles=1; @@ -165,12 +166,8 @@ int main ( int argc, char *argv[]) cout <<"sync: "<<sync.getRealTime()<<endl; cout<<"all: "<<all.getRealTime()<<endl<<endl; } - - - CommunicatorType::Finalize(); return 0; - } #else diff --git a/tests/mpi/MeshFunctionEvaluateTest.cpp b/tests/mpi/MeshFunctionEvaluateTest.cpp index 6a59ae85ce373c05839f1861be31c61d4377f713..3c06a34cb7e9713b253e8dd1f11dce1f794381a7 100644 --- a/tests/mpi/MeshFunctionEvaluateTest.cpp +++ b/tests/mpi/MeshFunctionEvaluateTest.cpp @@ -16,6 +16,7 @@ using namespace std; #include <TNL/Meshes/Grid.h> #include <TNL/Communicators/MpiCommunicator.h> #include <TNL/Communicators/NoDistrCommunicator.h> +#include <TNL/Communicators/ScopedInitializer.h> #include <TNL/Functions/MeshFunction.h> #include <TNL/Meshes/DistributedMeshes/DistributedMesh.h> #include <TNL/Meshes/DistributedMeshes/SubdomainOverlapsGetter.h> @@ -62,7 +63,7 @@ int main ( int argc, char *argv[]) typedef LinearFunction<double,DIMENSION> LinearFunctionType; typedef ConstFunction<double,DIMENSION> ConstFunctionType; - CommunicatorType::Init(argc,argv); + Communicators::ScopedInitializer< CommunicatorType > mpi(argc, argv); int size=9; int cycles=1; @@ -173,7 +174,6 @@ int main ( int argc, char *argv[]) cout <<"sync: "<<sync.getRealTime()<<endl; cout<<"all: "<<all.getRealTime()<<endl<<endl; } - CommunicatorType::Finalize(); #else std::cout<<"MPI not Supported." << std::endl; #endif diff --git a/tests/mpi/mpiio-save-load-test.cpp b/tests/mpi/mpiio-save-load-test.cpp index 0e7a8dff2ae08c4f23b59f9f2ebc04f043446ad4..0fa7ee7f65d8756779d2ffb0548eadfc3d9c4b90 100644 --- a/tests/mpi/mpiio-save-load-test.cpp +++ b/tests/mpi/mpiio-save-load-test.cpp @@ -2,6 +2,7 @@ #define MPIIO #include <TNL/Communicators/MpiCommunicator.h> +#include <TNL/Communicators/ScopedInitializer.h> #include <TNL/Functions/MeshFunction.h> #include <TNL/Meshes/DistributedMeshes/DistributedMesh.h> #include <TNL/Meshes/DistributedMeshes/DistributedGridIO.h> @@ -38,7 +39,7 @@ int main(int argc, char **argv) typedef typename DistributedGridType::CoordinatesType CoordinatesType; typedef LinearFunction<double,DIM> LinearFunctionType; - CommunicatorType::Init(argc, argv); + Communicators::ScopedInitializer< CommunicatorType > mpi_init(argc, argv); Pointers::SharedPointer< LinearFunctionType, Device > linearFunctionPtr; MeshFunctionEvaluator< MeshFunctionType, LinearFunctionType > linearFunctionEvaluator; @@ -92,9 +93,6 @@ int main(int argc, char **argv) else std::cout <<"Ok!"<<std::endl; } - - CommunicatorType::Finalize(); - } #else diff --git a/tests/mpi/mpiio-save-test.h b/tests/mpi/mpiio-save-test.h index 6e14b537e5ca3e5c7a70a4b967d6a7a522beeb4e..a824bd5b74c917f8d65de00a57b90526e536424e 100644 --- a/tests/mpi/mpiio-save-test.h +++ b/tests/mpi/mpiio-save-test.h @@ -2,6 +2,7 @@ #define MPIIO #include <TNL/Communicators/MpiCommunicator.h> +#include <TNL/Communicators/ScopedInitializer.h> #include <TNL/Functions/MeshFunction.h> #include <TNL/Meshes/DistributedMeshes/DistributedMesh.h> #include <TNL/Meshes/DistributedMeshes/DistributedGridIO.h> @@ -38,7 +39,7 @@ int main(int argc, char **argv) typedef typename DistributedGridType::CoordinatesType CoordinatesType; typedef LinearFunction<double,DIM> LinearFunctionType; - CommunicatorType::Init(argc, argv); + Communicators::ScopedInitializer< CommunicatorType > mpi_init(argc, argv); Pointers::SharedPointer< LinearFunctionType, Device > linearFunctionPtr; MeshFunctionEvaluator< MeshFunctionType, LinearFunctionType > linearFunctionEvaluator; @@ -81,9 +82,6 @@ int main(int argc, char **argv) String fileName=String("./meshFunction.tnl"); DistributedGridIO<MeshFunctionType,MpiIO> ::save(fileName, *meshFunctionptr ); - - CommunicatorType::Finalize(); - } #else