Loading CMakeLists.txt +2 −2 Original line number Diff line number Diff line Loading @@ -44,8 +44,8 @@ endif() # set Debug/Release options set( CMAKE_CXX_FLAGS "-std=c++11 -Wall -Wno-unused-local-typedefs -Wno-unused-variable" ) set( CMAKE_CXX_FLAGS_DEBUG "-g" ) set( CMAKE_CXX_FLAGS_RELEASE "-O3 -march=native -DNDEBUG" ) #set( CMAKE_CXX_FLAGS_RELEASE "-O3 -march=native -DNDEBUG -ftree-vectorizer-verbose=1 -ftree-vectorize -fopt-info-vec-missed -funroll-loops" ) set( CMAKE_CXX_FLAGS_RELEASE "-O3 -march=native -mtune=native -DNDEBUG" ) #set( CMAKE_CXX_FLAGS_RELEASE "-O3 -march=native -mtune=native -DNDEBUG -ftree-vectorizer-verbose=1 -ftree-vectorize -fopt-info-vec-missed -funroll-loops" ) # pass -rdynamic only in Debug mode set( CMAKE_SHARED_LIBRARY_LINK_C_FLAGS "" ) set( CMAKE_SHARED_LIBRARY_LINK_C_FLAGS_DEBUG "-rdynamic" ) Loading src/TNL/String.cpp +6 −0 Original line number Diff line number Diff line Loading @@ -55,6 +55,12 @@ String :: String( int number ) this->setString( convertToString( number ).getString() ); } String :: String( unsigned long int number ) : string( 0 ), length( 0 ) { this->setString( convertToString( number ).getString() ); } String :: String( long int number ) : string( 0 ), length( 0 ) { Loading src/TNL/String.h +2 −0 Original line number Diff line number Diff line Loading @@ -55,6 +55,8 @@ class String String( int number ); String( unsigned long int number ); String( long int number ); String( float number ); Loading tests/benchmarks/CMakeLists.txt +14 −19 Original line number Diff line number Diff line Loading @@ -2,11 +2,11 @@ ADD_SUBDIRECTORY( share ) ADD_SUBDIRECTORY( heat-equation-benchmark ) IF( BUILD_CUDA ) CUDA_ADD_EXECUTABLE( tnl-cuda-benchmarks${debugExt} tnl-cuda-benchmarks.cu ) CUDA_ADD_EXECUTABLE( tnl-benchmark-blas${debugExt} tnl-benchmark-blas.cu ) if( HAVE_CUBLAS STREQUAL "yes" ) CUDA_ADD_CUBLAS_TO_TARGET( tnl-cuda-benchmarks${debugExt} ) CUDA_ADD_CUBLAS_TO_TARGET( tnl-benchmark-blas${debugExt} ) endif() TARGET_LINK_LIBRARIES( tnl-cuda-benchmarks${debugExt} tnl${debugExt}-${tnlVersion} ${CUSPARSE_LIBRARY} ) TARGET_LINK_LIBRARIES( tnl-benchmark-blas${debugExt} tnl${debugExt}-${tnlVersion} ${CUSPARSE_LIBRARY} ) CUDA_ADD_EXECUTABLE( tnl-benchmark-spmv${debugExt} tnl-benchmark-spmv.cu ) TARGET_LINK_LIBRARIES( tnl-benchmark-spmv${debugExt} tnl${debugExt}-${tnlVersion} ${CUSPARSE_LIBRARY} ) Loading @@ -14,6 +14,9 @@ IF( BUILD_CUDA ) CUDA_ADD_EXECUTABLE( tnl-benchmark-linear-solvers${debugExt} tnl-benchmark-linear-solvers.cu ) TARGET_LINK_LIBRARIES( tnl-benchmark-linear-solvers${debugExt} tnl${debugExt}-${tnlVersion} ${CUSPARSE_LIBRARY} ) ELSE() ADD_EXECUTABLE( tnl-benchmark-blas${debugExt} tnl-benchmark-blas.cpp ) TARGET_LINK_LIBRARIES( tnl-benchmark-blas${debugExt} tnl${debugExt}-${tnlVersion} ) ADD_EXECUTABLE( tnl-benchmark-spmv${debugExt} tnl-benchmark-spmv.cpp ) TARGET_LINK_LIBRARIES( tnl-benchmark-spmv${debugExt} tnl${debugExt}-${tnlVersion} ) Loading @@ -21,16 +24,8 @@ ELSE() TARGET_LINK_LIBRARIES( tnl-benchmark-linear-solvers${debugExt} tnl${debugExt}-${tnlVersion} ) ENDIF() if( BUILD_CUDA ) INSTALL( TARGETS tnl-cuda-benchmarks${debugExt} RUNTIME DESTINATION bin ) endif() INSTALL( TARGETS tnl-benchmark-blas${debugExt} tnl-benchmark-spmv${debugExt} tnl-benchmark-linear-solvers${debugExt} RUNTIME DESTINATION bin ) tests/benchmarks/array-operations.h +38 −18 Original line number Diff line number Diff line Loading @@ -14,7 +14,7 @@ template< typename Real = double, bool benchmarkArrayOperations( Benchmark & benchmark, const int & loops, const int & size ) const long & size ) { typedef Containers::Array< Real, Devices::Host, Index > HostArray; typedef Containers::Array< Real, Devices::Cuda, Index > CudaArray; Loading @@ -25,9 +25,14 @@ benchmarkArrayOperations( Benchmark & benchmark, HostArray hostArray, hostArray2; CudaArray deviceArray, deviceArray2; if( ! hostArray.setSize( size ) || ! hostArray2.setSize( size ) || ! hostArray2.setSize( size ) #ifdef HAVE_CUDA || ! deviceArray.setSize( size ) || ! deviceArray2.setSize( size ) ) ! deviceArray2.setSize( size ) #endif ) { const char* msg = "error: allocation of arrays failed"; std::cerr << msg << std::endl; Loading @@ -41,11 +46,15 @@ benchmarkArrayOperations( Benchmark & benchmark, // reset functions auto reset1 = [&]() { hostArray.setValue( 1.0 ); #ifdef HAVE_CUDA deviceArray.setValue( 1.0 ); #endif }; auto reset2 = [&]() { hostArray2.setValue( 1.0 ); #ifdef HAVE_CUDA deviceArray2.setValue( 1.0 ); #endif }; auto reset12 = [&]() { reset1(); Loading @@ -63,9 +72,10 @@ benchmarkArrayOperations( Benchmark & benchmark, resultDevice = (int) deviceArray == deviceArray2; }; benchmark.setOperation( "comparison (operator==)", 2 * datasetSize ); benchmark.time( reset1, "CPU", compareHost, "GPU", compareCuda ); benchmark.time( reset1, "CPU", compareHost ); #ifdef HAVE_CUDA benchmark.time( reset1, "GPU", compareCuda ); #endif auto copyAssignHostHost = [&]() { Loading @@ -75,9 +85,10 @@ benchmarkArrayOperations( Benchmark & benchmark, deviceArray = deviceArray2; }; benchmark.setOperation( "copy (operator=)", 2 * datasetSize ); double basetime = benchmark.time( reset1, "CPU", copyAssignHostHost, "GPU", copyAssignCudaCuda ); benchmark.time( reset1, "CPU", copyAssignHostHost ); #ifdef HAVE_CUDA benchmark.time( reset1, "GPU", copyAssignCudaCuda ); #endif auto copyAssignHostCuda = [&]() { Loading @@ -86,10 +97,12 @@ benchmarkArrayOperations( Benchmark & benchmark, auto copyAssignCudaHost = [&]() { hostArray = deviceArray; }; #ifdef HAVE_CUDA benchmark.setOperation( "copy (operator=)", datasetSize, basetime ); benchmark.time( reset1, "CPU->GPU", copyAssignHostCuda, "GPU->CPU", copyAssignCudaHost ); #endif auto setValueHost = [&]() { Loading @@ -99,9 +112,10 @@ benchmarkArrayOperations( Benchmark & benchmark, deviceArray.setValue( 3.0 ); }; benchmark.setOperation( "setValue", datasetSize ); benchmark.time( reset1, "CPU", setValueHost, "GPU", setValueCuda ); benchmark.time( reset1, "CPU", setValueHost ); #ifdef HAVE_CUDA benchmark.time( reset1, "GPU", setValueCuda ); #endif auto setSizeHost = [&]() { Loading @@ -112,12 +126,15 @@ benchmarkArrayOperations( Benchmark & benchmark, }; auto resetSize1 = [&]() { hostArray.reset(); #ifdef HAVE_CUDA deviceArray.reset(); #endif }; benchmark.setOperation( "allocation (setSize)", datasetSize ); benchmark.time( resetSize1, "CPU", setSizeHost, "GPU", setSizeCuda ); benchmark.time( resetSize1, "CPU", setSizeHost ); #ifdef HAVE_CUDA benchmark.time( resetSize1, "GPU", setSizeCuda ); #endif auto resetSizeHost = [&]() { Loading @@ -128,12 +145,15 @@ benchmarkArrayOperations( Benchmark & benchmark, }; auto setSize1 = [&]() { hostArray.setSize( size ); #ifdef HAVE_CUDA deviceArray.setSize( size ); #endif }; benchmark.setOperation( "deallocation (reset)", datasetSize ); benchmark.time( setSize1, "CPU", resetSizeHost, "GPU", resetSizeCuda ); benchmark.time( setSize1, "CPU", resetSizeHost ); #ifdef HAVE_CUDA benchmark.time( setSize1, "GPU", resetSizeCuda ); #endif return true; } Loading Loading
CMakeLists.txt +2 −2 Original line number Diff line number Diff line Loading @@ -44,8 +44,8 @@ endif() # set Debug/Release options set( CMAKE_CXX_FLAGS "-std=c++11 -Wall -Wno-unused-local-typedefs -Wno-unused-variable" ) set( CMAKE_CXX_FLAGS_DEBUG "-g" ) set( CMAKE_CXX_FLAGS_RELEASE "-O3 -march=native -DNDEBUG" ) #set( CMAKE_CXX_FLAGS_RELEASE "-O3 -march=native -DNDEBUG -ftree-vectorizer-verbose=1 -ftree-vectorize -fopt-info-vec-missed -funroll-loops" ) set( CMAKE_CXX_FLAGS_RELEASE "-O3 -march=native -mtune=native -DNDEBUG" ) #set( CMAKE_CXX_FLAGS_RELEASE "-O3 -march=native -mtune=native -DNDEBUG -ftree-vectorizer-verbose=1 -ftree-vectorize -fopt-info-vec-missed -funroll-loops" ) # pass -rdynamic only in Debug mode set( CMAKE_SHARED_LIBRARY_LINK_C_FLAGS "" ) set( CMAKE_SHARED_LIBRARY_LINK_C_FLAGS_DEBUG "-rdynamic" ) Loading
src/TNL/String.cpp +6 −0 Original line number Diff line number Diff line Loading @@ -55,6 +55,12 @@ String :: String( int number ) this->setString( convertToString( number ).getString() ); } String :: String( unsigned long int number ) : string( 0 ), length( 0 ) { this->setString( convertToString( number ).getString() ); } String :: String( long int number ) : string( 0 ), length( 0 ) { Loading
src/TNL/String.h +2 −0 Original line number Diff line number Diff line Loading @@ -55,6 +55,8 @@ class String String( int number ); String( unsigned long int number ); String( long int number ); String( float number ); Loading
tests/benchmarks/CMakeLists.txt +14 −19 Original line number Diff line number Diff line Loading @@ -2,11 +2,11 @@ ADD_SUBDIRECTORY( share ) ADD_SUBDIRECTORY( heat-equation-benchmark ) IF( BUILD_CUDA ) CUDA_ADD_EXECUTABLE( tnl-cuda-benchmarks${debugExt} tnl-cuda-benchmarks.cu ) CUDA_ADD_EXECUTABLE( tnl-benchmark-blas${debugExt} tnl-benchmark-blas.cu ) if( HAVE_CUBLAS STREQUAL "yes" ) CUDA_ADD_CUBLAS_TO_TARGET( tnl-cuda-benchmarks${debugExt} ) CUDA_ADD_CUBLAS_TO_TARGET( tnl-benchmark-blas${debugExt} ) endif() TARGET_LINK_LIBRARIES( tnl-cuda-benchmarks${debugExt} tnl${debugExt}-${tnlVersion} ${CUSPARSE_LIBRARY} ) TARGET_LINK_LIBRARIES( tnl-benchmark-blas${debugExt} tnl${debugExt}-${tnlVersion} ${CUSPARSE_LIBRARY} ) CUDA_ADD_EXECUTABLE( tnl-benchmark-spmv${debugExt} tnl-benchmark-spmv.cu ) TARGET_LINK_LIBRARIES( tnl-benchmark-spmv${debugExt} tnl${debugExt}-${tnlVersion} ${CUSPARSE_LIBRARY} ) Loading @@ -14,6 +14,9 @@ IF( BUILD_CUDA ) CUDA_ADD_EXECUTABLE( tnl-benchmark-linear-solvers${debugExt} tnl-benchmark-linear-solvers.cu ) TARGET_LINK_LIBRARIES( tnl-benchmark-linear-solvers${debugExt} tnl${debugExt}-${tnlVersion} ${CUSPARSE_LIBRARY} ) ELSE() ADD_EXECUTABLE( tnl-benchmark-blas${debugExt} tnl-benchmark-blas.cpp ) TARGET_LINK_LIBRARIES( tnl-benchmark-blas${debugExt} tnl${debugExt}-${tnlVersion} ) ADD_EXECUTABLE( tnl-benchmark-spmv${debugExt} tnl-benchmark-spmv.cpp ) TARGET_LINK_LIBRARIES( tnl-benchmark-spmv${debugExt} tnl${debugExt}-${tnlVersion} ) Loading @@ -21,16 +24,8 @@ ELSE() TARGET_LINK_LIBRARIES( tnl-benchmark-linear-solvers${debugExt} tnl${debugExt}-${tnlVersion} ) ENDIF() if( BUILD_CUDA ) INSTALL( TARGETS tnl-cuda-benchmarks${debugExt} RUNTIME DESTINATION bin ) endif() INSTALL( TARGETS tnl-benchmark-blas${debugExt} tnl-benchmark-spmv${debugExt} tnl-benchmark-linear-solvers${debugExt} RUNTIME DESTINATION bin )
tests/benchmarks/array-operations.h +38 −18 Original line number Diff line number Diff line Loading @@ -14,7 +14,7 @@ template< typename Real = double, bool benchmarkArrayOperations( Benchmark & benchmark, const int & loops, const int & size ) const long & size ) { typedef Containers::Array< Real, Devices::Host, Index > HostArray; typedef Containers::Array< Real, Devices::Cuda, Index > CudaArray; Loading @@ -25,9 +25,14 @@ benchmarkArrayOperations( Benchmark & benchmark, HostArray hostArray, hostArray2; CudaArray deviceArray, deviceArray2; if( ! hostArray.setSize( size ) || ! hostArray2.setSize( size ) || ! hostArray2.setSize( size ) #ifdef HAVE_CUDA || ! deviceArray.setSize( size ) || ! deviceArray2.setSize( size ) ) ! deviceArray2.setSize( size ) #endif ) { const char* msg = "error: allocation of arrays failed"; std::cerr << msg << std::endl; Loading @@ -41,11 +46,15 @@ benchmarkArrayOperations( Benchmark & benchmark, // reset functions auto reset1 = [&]() { hostArray.setValue( 1.0 ); #ifdef HAVE_CUDA deviceArray.setValue( 1.0 ); #endif }; auto reset2 = [&]() { hostArray2.setValue( 1.0 ); #ifdef HAVE_CUDA deviceArray2.setValue( 1.0 ); #endif }; auto reset12 = [&]() { reset1(); Loading @@ -63,9 +72,10 @@ benchmarkArrayOperations( Benchmark & benchmark, resultDevice = (int) deviceArray == deviceArray2; }; benchmark.setOperation( "comparison (operator==)", 2 * datasetSize ); benchmark.time( reset1, "CPU", compareHost, "GPU", compareCuda ); benchmark.time( reset1, "CPU", compareHost ); #ifdef HAVE_CUDA benchmark.time( reset1, "GPU", compareCuda ); #endif auto copyAssignHostHost = [&]() { Loading @@ -75,9 +85,10 @@ benchmarkArrayOperations( Benchmark & benchmark, deviceArray = deviceArray2; }; benchmark.setOperation( "copy (operator=)", 2 * datasetSize ); double basetime = benchmark.time( reset1, "CPU", copyAssignHostHost, "GPU", copyAssignCudaCuda ); benchmark.time( reset1, "CPU", copyAssignHostHost ); #ifdef HAVE_CUDA benchmark.time( reset1, "GPU", copyAssignCudaCuda ); #endif auto copyAssignHostCuda = [&]() { Loading @@ -86,10 +97,12 @@ benchmarkArrayOperations( Benchmark & benchmark, auto copyAssignCudaHost = [&]() { hostArray = deviceArray; }; #ifdef HAVE_CUDA benchmark.setOperation( "copy (operator=)", datasetSize, basetime ); benchmark.time( reset1, "CPU->GPU", copyAssignHostCuda, "GPU->CPU", copyAssignCudaHost ); #endif auto setValueHost = [&]() { Loading @@ -99,9 +112,10 @@ benchmarkArrayOperations( Benchmark & benchmark, deviceArray.setValue( 3.0 ); }; benchmark.setOperation( "setValue", datasetSize ); benchmark.time( reset1, "CPU", setValueHost, "GPU", setValueCuda ); benchmark.time( reset1, "CPU", setValueHost ); #ifdef HAVE_CUDA benchmark.time( reset1, "GPU", setValueCuda ); #endif auto setSizeHost = [&]() { Loading @@ -112,12 +126,15 @@ benchmarkArrayOperations( Benchmark & benchmark, }; auto resetSize1 = [&]() { hostArray.reset(); #ifdef HAVE_CUDA deviceArray.reset(); #endif }; benchmark.setOperation( "allocation (setSize)", datasetSize ); benchmark.time( resetSize1, "CPU", setSizeHost, "GPU", setSizeCuda ); benchmark.time( resetSize1, "CPU", setSizeHost ); #ifdef HAVE_CUDA benchmark.time( resetSize1, "GPU", setSizeCuda ); #endif auto resetSizeHost = [&]() { Loading @@ -128,12 +145,15 @@ benchmarkArrayOperations( Benchmark & benchmark, }; auto setSize1 = [&]() { hostArray.setSize( size ); #ifdef HAVE_CUDA deviceArray.setSize( size ); #endif }; benchmark.setOperation( "deallocation (reset)", datasetSize ); benchmark.time( setSize1, "CPU", resetSizeHost, "GPU", resetSizeCuda ); benchmark.time( setSize1, "CPU", resetSizeHost ); #ifdef HAVE_CUDA benchmark.time( setSize1, "GPU", resetSizeCuda ); #endif return true; } Loading