diff --git a/CMakeLists.txt b/CMakeLists.txt index ccc0d9750d4bdfe4545dd8ee299bb1fdb4f4a522..2447f8540365f4446b4bbdec7046112183058836 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -320,7 +320,7 @@ endif() find_package( BLAS ) if( BLAS_FOUND ) set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DHAVE_BLAS" ) - #message( "BLAS: ${BLAS_LIBRARIES}" ) + set( HAVE_BLAS TRUE) endif() #if( BUILD_MPI ) # FIND_PATH( PETSC_INCLUDE_DIR petsc.h @@ -455,3 +455,4 @@ message( " CMAKE_SHARED_LINKER_FLAGS_DEBUG = ${CMAKE_SHARED_LINKER_FLAGS_DEBUG message( " CMAKE_SHARED_LINKER_FLAGS_RELEASE = ${CMAKE_SHARED_LINKER_FLAGS_RELEASE}" ) message( " CUDA_NVCC_FLAGS = ${CUDA_NVCC_FLAGS}" ) message( " GMP_LIBRARIES = ${GMP_LIBRARIES}" ) +message( " BLAS_libraries = ${BLAS_LIBRARIES}" ) diff --git a/src/Benchmarks/BLAS/CMakeLists.txt b/src/Benchmarks/BLAS/CMakeLists.txt index 4e5a1199b7c6b77a6c8bc8b6b4caaa9981a1aa00..96611d0031cdbf3c6781074fc88abf4d9fd9bc67 100644 --- a/src/Benchmarks/BLAS/CMakeLists.txt +++ b/src/Benchmarks/BLAS/CMakeLists.txt @@ -4,6 +4,8 @@ if( BUILD_CUDA ) else() ADD_EXECUTABLE( tnl-benchmark-blas tnl-benchmark-blas.cpp ) endif() -target_link_libraries( tnl-benchmark-blas ${BLAS_LIBRARIES} ) +if( HAVE_BLAS ) + target_link_libraries( tnl-benchmark-blas ${BLAS_LIBRARIES} ) +endif() install( TARGETS tnl-benchmark-blas RUNTIME DESTINATION bin ) diff --git a/src/Benchmarks/BLAS/blasWrappers.h b/src/Benchmarks/BLAS/blasWrappers.h index c54981af4505ea54b2ae028e1ee2cef96a498b12..d1e0edff119c733126db27cdcc913cd189d4468e 100644 --- a/src/Benchmarks/BLAS/blasWrappers.h +++ b/src/Benchmarks/BLAS/blasWrappers.h @@ -1,7 +1,5 @@ #pragma once -#ifdef HAVE_CUDA - #ifdef HAVE_BLAS #include <cblas.h> @@ -17,15 +15,15 @@ inline int blasIgamax( int n, const double *x, int incx ) } -inline int blasIgamin( int n, const float *x, int incx ) +/*inline int blasIgamin( int n, const float *x, int incx ) { - return cblas_Isamin( n, x, incx ); + return cblas_isamin( n, x, incx ); } inline int blasIgamin( int n, const double *x, int incx ) { - return cblas_Idamin( n, x, incx ); -} + return cblas_idamin( n, x, incx ); +}*/ inline float blasGasum( int n, const float *x, int incx ) @@ -39,73 +37,54 @@ inline double blasGasum( int n, const double *x, int incx ) } -inline void -blasGaxpy( int n, const float *alpha, - const float *x, int incx, - float *y, int incy ) +inline void blasGaxpy( int n, const float alpha, + const float *x, int incx, + float *y, int incy ) { cblas_saxpy( n, alpha, x, incx, y, incy ); } -inline blasStatus_t -blasGaxpy( blasHandle_t int n, - const double *alpha, - const double *x, int incx, - double *y, int incy ) +inline void blasGaxpy( int n, const double alpha, + const double* x, int incx, + double *y, int incy ) { - return cblas_Daxpy( n, alpha, x, incx, y, incy ); + cblas_daxpy( n, alpha, x, incx, y, incy ); } -inline blasStatus_t -blasGdot( blasHandle_t int n, - const float *x, int incx, - const float *y, int incy, - float *result ) +inline float blasGdot( int n, const float* x, int incx, + const float* y, int incy ) { - return cblas_Sdot( n, x, incx, y, incy, result ); + return cblas_sdot( n, x, incx, y, incy ); } -inline blasStatus_t -blasGdot( blasHandle_t int n, - const double *x, int incx, - const double *y, int incy, - double *result ) +inline double blasGdot( int n, const double* x, int incx, + const double* y, int incy ) { - return cblas_Ddot( n, x, incx, y, incy, result ); + return cblas_ddot( n, x, incx, y, incy ); } -inline blasStatus_t -blasGnrm2( blasHandle_t int n, - const float *x, int incx, float *result ) +inline float blasGnrm2( int n, const float* x, int incx ) { - return cblas_Snrm2( n, x, incx, result ); + return cblas_snrm2( n, x, incx ); } -inline blasStatus_t -blasGnrm2( blasHandle_t int n, - const double *x, int incx, double *result ) +inline double blasGnrm2( int n, const double* x, int incx ) { - return cblas_Dnrm2( n, x, incx, result ); + return cblas_dnrm2( n, x, incx ); } -inline blasStatus_t -blasGscal( blasHandle_t int n, - const float *alpha, - float *x, int incx ) +inline void blasGscal( int n, const float alpha, + float* x, int incx ) { - return cblas_Sscal( n, alpha, x, incx ); + cblas_sscal( n, alpha, x, incx ); } -inline blasStatus_t -blasGscal( blasHandle_t int n, - const double *alpha, - double *x, int incx ) +inline void blasGscal( int n, const double alpha, + double* x, int incx ) { - return cblas_Dscal( n, alpha, x, incx ); + cblas_dscal( n, alpha, x, incx ); } - -#endif #endif diff --git a/src/Benchmarks/BLAS/vector-operations.h b/src/Benchmarks/BLAS/vector-operations.h index a732726db8e03301435ae171b8e9e1f0b6e1dc63..32c843b43244fd4afe04721c593736f1333a5775 100644 --- a/src/Benchmarks/BLAS/vector-operations.h +++ b/src/Benchmarks/BLAS/vector-operations.h @@ -18,7 +18,9 @@ #include <TNL/Containers/Vector.h> +#ifdef HAVE_BLAS #include "blasWrappers.h" +#endif #ifdef HAVE_CUDA #include "cublasWrappers.h" @@ -87,10 +89,10 @@ benchmarkVectorOperations( Benchmark & benchmark, reset2(); }; - reset12(); - + //// + // Max auto maxHost = [&]() { resultHost = hostVector.max(); }; @@ -112,7 +114,8 @@ benchmarkVectorOperations( Benchmark & benchmark, benchmark.time< Devices::Cuda >( reset1, "GPU ET", maxCudaET ); #endif - + //// + // Min auto minHost = [&]() { resultHost = hostVector.min(); }; @@ -133,7 +136,8 @@ benchmarkVectorOperations( Benchmark & benchmark, benchmark.time< Devices::Cuda >( reset1, "GPU", minCudaET ); #endif - + //// + // Absmax auto absMaxHost = [&]() { resultHost = hostVector.absMax(); }; @@ -146,6 +150,12 @@ benchmarkVectorOperations( Benchmark & benchmark, auto absMaxCudaET = [&]() { resultDevice = max( abs( deviceView ) ); }; +#ifdef HAVE_BLAS + auto absMaxBlas = [&]() { + int index = blasIgamax( size, hostVector.getData(), 1 ); + resultHost = hostVector.getElement( index ); + }; +#endif #ifdef HAVE_CUDA auto absMaxCublas = [&]() { int index = 0; @@ -158,13 +168,15 @@ benchmarkVectorOperations( Benchmark & benchmark, benchmark.setOperation( "absMax", datasetSize ); benchmark.time< Devices::Host >( reset1, "CPU", absMaxHost ); benchmark.time< Devices::Host >( reset1, "CPU ET", absMaxHostET ); + benchmark.time< Devices::Host >( reset1, "BLAS", absMaxBlas ); #ifdef HAVE_CUDA benchmark.time< Devices::Cuda >( reset1, "GPU", absMaxCuda ); benchmark.time< Devices::Cuda >( reset1, "GPU ET", absMaxCudaET ); benchmark.time< Devices::Cuda >( reset1, "cuBLAS", absMaxCublas ); #endif - + //// + // Absmin auto absMinHost = [&]() { resultHost = hostVector.absMin(); }; @@ -177,6 +189,12 @@ benchmarkVectorOperations( Benchmark & benchmark, auto absMinCudaET = [&]() { resultDevice = min( abs( deviceView ) ); }; +/*#ifdef HAVE_BLAS + auto absMinBlas = [&]() { + int index = blasIgamin( size, hostVector.getData(), 1 ); + resultHost = hostVector.getElement( index ); + }; +#endif*/ #ifdef HAVE_CUDA auto absMinCublas = [&]() { int index = 0; @@ -188,14 +206,16 @@ benchmarkVectorOperations( Benchmark & benchmark, #endif benchmark.setOperation( "absMin", datasetSize ); benchmark.time< Devices::Host >( reset1, "CPU", absMinHost ); - benchmark.time< Devices::Host >( reset1, "CPU", absMinHostET ); + benchmark.time< Devices::Host >( reset1, "CPU ET", absMinHostET ); + //benchmark.time< Devices::Host >( reset1, "BLAS", absMinBlas ); #ifdef HAVE_CUDA benchmark.time< Devices::Cuda >( reset1, "GPU", absMinCuda ); benchmark.time< Devices::Cuda >( reset1, "GPU", absMinCudaET ); benchmark.time< Devices::Cuda >( reset1, "cuBLAS", absMinCublas ); #endif - + //// + // Sum auto sumHost = [&]() { resultHost = hostVector.sum(); }; @@ -216,7 +236,8 @@ benchmarkVectorOperations( Benchmark & benchmark, benchmark.time< Devices::Cuda >( reset1, "GPU", sumCudaET ); #endif - + //// + // L1 norm auto l1normHost = [&]() { resultHost = hostVector.lpNorm( 1.0 ); }; @@ -245,7 +266,8 @@ benchmarkVectorOperations( Benchmark & benchmark, benchmark.time< Devices::Cuda >( reset1, "cuBLAS", l1normCublas ); #endif - + //// + // L2 norm auto l2normHost = [&]() { resultHost = hostVector.lpNorm( 2.0 ); }; @@ -258,6 +280,11 @@ benchmarkVectorOperations( Benchmark & benchmark, auto l2normCudaET = [&]() { resultDevice = lpNorm( deviceView, 2.0 ); }; +#ifdef HAVE_BLAS + auto l2normBlas = [&]() { + resultHost = blasGnrm2( size, hostVector.getData(), 1 ); + }; +#endif #ifdef HAVE_CUDA auto l2normCublas = [&]() { cublasGnrm2( cublasHandle, size, @@ -268,13 +295,15 @@ benchmarkVectorOperations( Benchmark & benchmark, benchmark.setOperation( "l2 norm", datasetSize ); benchmark.time< Devices::Host >( reset1, "CPU", l2normHost ); benchmark.time< Devices::Host >( reset1, "CPU", l2normHostET ); + benchmark.time< Devices::Host >( reset1, "BLAS", l2normBlas ); #ifdef HAVE_CUDA benchmark.time< Devices::Cuda >( reset1, "GPU", l2normCuda ); benchmark.time< Devices::Cuda >( reset1, "GPU", l2normCudaET ); benchmark.time< Devices::Cuda >( reset1, "cuBLAS", l2normCublas ); #endif - + //// + // L3 norm auto l3normHost = [&]() { resultHost = hostVector.lpNorm( 3.0 ); }; @@ -296,13 +325,26 @@ benchmarkVectorOperations( Benchmark & benchmark, benchmark.time< Devices::Cuda >( reset1, "GPU", l3normCudaET ); #endif - + //// + // Scalar product auto scalarProductHost = [&]() { resultHost = hostVector.scalarProduct( hostVector2 ); }; auto scalarProductCuda = [&]() { resultDevice = deviceVector.scalarProduct( deviceVector2 ); }; + auto scalarProductHostET = [&]() { + resultHost = ( hostView, hostView2 ); + }; + auto scalarProductCudaET = [&]() { + resultDevice = ( deviceView, deviceView2 ); + }; + +#ifdef HAVE_BLAS + auto scalarProductBlas = [&]() { + resultHost = blasGdot( size, hostVector.getData(), 1, hostVector2.getData(), 1 ); + }; +#endif #ifdef HAVE_CUDA auto scalarProductCublas = [&]() { cublasGdot( cublasHandle, size, @@ -313,11 +355,16 @@ benchmarkVectorOperations( Benchmark & benchmark, #endif benchmark.setOperation( "scalar product", 2 * datasetSize ); benchmark.time< Devices::Host >( reset1, "CPU", scalarProductHost ); + benchmark.time< Devices::Host >( reset1, "CPU ET", scalarProductHostET ); + benchmark.time< Devices::Host >( reset1, "BLAS", scalarProductBlas ); #ifdef HAVE_CUDA benchmark.time< Devices::Cuda >( reset1, "GPU", scalarProductCuda ); + benchmark.time< Devices::Cuda >( reset1, "GPU ET", scalarProductCudaET ); benchmark.time< Devices::Cuda >( reset1, "cuBLAS", scalarProductCublas ); #endif + //// + // Prefix sum /* std::cout << "Benchmarking prefix-sum:" << std::endl; timer.reset(); @@ -348,6 +395,8 @@ benchmarkVectorOperations( Benchmark & benchmark, */ + //// + // Scalar multiplication auto multiplyHost = [&]() { hostVector *= 0.5; }; @@ -369,13 +418,28 @@ benchmarkVectorOperations( Benchmark & benchmark, benchmark.time< Devices::Cuda >( reset1, "cuBLAS", multiplyCublas ); #endif - + //// + // Vector addition auto addVectorHost = [&]() { hostVector.addVector( hostVector2 ); }; auto addVectorCuda = [&]() { deviceVector.addVector( deviceVector2 ); }; + auto addVectorHostET = [&]() { + hostView += hostView2; + }; + auto addVectorCudaET = [&]() { + deviceView += deviceView2; + }; +#ifdef HAVE_CUDA + auto addVectorBlas = [&]() { + const Real alpha = 1.0; + blasGaxpy( size, alpha, + deviceVector2.getData(), 1, + deviceVector.getData(), 1 ); + }; +#endif #ifdef HAVE_CUDA auto addVectorCublas = [&]() { const Real alpha = 1.0; @@ -387,8 +451,11 @@ benchmarkVectorOperations( Benchmark & benchmark, #endif benchmark.setOperation( "vector addition", 3 * datasetSize ); benchmark.time< Devices::Host >( reset1, "CPU", addVectorHost ); + benchmark.time< Devices::Host >( reset1, "CPU ET", addVectorHostET ); + benchmark.time< Devices::Host >( reset1, "BLAS", addVectorBlas ); #ifdef HAVE_CUDA benchmark.time< Devices::Cuda >( reset1, "GPU", addVectorCuda ); + benchmark.time< Devices::Cuda >( reset1, "GPU ET", addVectorCudaET ); benchmark.time< Devices::Cuda >( reset1, "cuBLAS", addVectorCublas ); #endif diff --git a/src/TNL/Containers/VectorViewExpressions.h b/src/TNL/Containers/VectorViewExpressions.h index f83641f2f294ad504bff6c12622b865e0c0727f3..b57089fd7b934b79d4f9092c1ce3594168ceff36 100644 --- a/src/TNL/Containers/VectorViewExpressions.h +++ b/src/TNL/Containers/VectorViewExpressions.h @@ -24,7 +24,6 @@ namespace TNL { //// // Addition template< typename Real, typename Device, typename Index, typename ET > -__cuda_callable__ const Containers::Expressions::BinaryExpressionTemplate< Containers::VectorView< Real, Device, Index >, ET, Containers::Expressions::Addition > operator+( const Containers::VectorView< Real, Device, Index >& a, const ET& b ) { @@ -32,7 +31,6 @@ operator+( const Containers::VectorView< Real, Device, Index >& a, const ET& b ) } template< typename ET, typename Real, typename Device, typename Index > -__cuda_callable__ const Containers::Expressions::BinaryExpressionTemplate< ET, Containers::VectorView< Real, Device, Index >, Containers::Expressions::Addition > operator+( const ET& a, const Containers::VectorView< Real, Device, Index >& b ) { @@ -40,7 +38,6 @@ operator+( const ET& a, const Containers::VectorView< Real, Device, Index >& b ) } template< typename Real1, typename Real2, typename Device, typename Index > -__cuda_callable__ const Containers::Expressions::BinaryExpressionTemplate< Containers::VectorView< Real1, Device, Index >, Containers::VectorView< Real2, Device, Index >, Containers::Expressions::Addition > operator+( const Containers::VectorView< Real1, Device, Index >& a, const Containers::VectorView< Real2, Device, Index >& b ) { @@ -50,7 +47,6 @@ operator+( const Containers::VectorView< Real1, Device, Index >& a, const Contai //// // Subtraction template< typename Real, typename Device, typename Index, typename ET > -__cuda_callable__ const Containers::Expressions::BinaryExpressionTemplate< Containers::VectorView< Real, Device, Index >, ET, Containers::Expressions::Subtraction > operator-( const Containers::VectorView< Real, Device, Index >& a, const ET& b ) { @@ -58,7 +54,6 @@ operator-( const Containers::VectorView< Real, Device, Index >& a, const ET& b ) } template< typename ET, typename Real, typename Device, typename Index > -__cuda_callable__ const Containers::Expressions::BinaryExpressionTemplate< ET, Containers::VectorView< Real, Device, Index >, Containers::Expressions::Subtraction > operator-( const ET& a, const Containers::VectorView< Real, Device, Index >& b ) { @@ -66,7 +61,6 @@ operator-( const ET& a, const Containers::VectorView< Real, Device, Index >& b ) } template< typename Real1, typename Real2, typename Device, typename Index > -__cuda_callable__ const Containers::Expressions::BinaryExpressionTemplate< Containers::VectorView< Real1, Device, Index >, Containers::VectorView< Real2, Device, Index >, Containers::Expressions::Subtraction > operator-( const Containers::VectorView< Real1, Device, Index >& a, const Containers::VectorView< Real2, Device, Index >& b ) { @@ -76,7 +70,6 @@ operator-( const Containers::VectorView< Real1, Device, Index >& a, const Contai //// // Multiplication template< typename Real, typename Device, typename Index, typename ET > -__cuda_callable__ const Containers::Expressions::BinaryExpressionTemplate< Containers::VectorView< Real, Device, Index >, ET, Containers::Expressions::Multiplication > operator*( const Containers::VectorView< Real, Device, Index >& a, const ET& b ) { @@ -84,7 +77,6 @@ operator*( const Containers::VectorView< Real, Device, Index >& a, const ET& b ) } template< typename ET, typename Real, typename Device, typename Index > -__cuda_callable__ const Containers::Expressions::BinaryExpressionTemplate< ET, Containers::VectorView< Real, Device, Index >, Containers::Expressions::Multiplication > operator*( const ET& a, const Containers::VectorView< Real, Device, Index >& b ) { @@ -92,7 +84,6 @@ operator*( const ET& a, const Containers::VectorView< Real, Device, Index >& b ) } template< typename Real1, typename Real2, typename Device, typename Index > -__cuda_callable__ const Containers::Expressions::BinaryExpressionTemplate< Containers::VectorView< Real1, Device, Index >, Containers::VectorView< Real2, Device, Index >, Containers::Expressions::Multiplication > operator*( const Containers::VectorView< Real1, Device, Index >& a, const Containers::VectorView< Real2, Device, Index >& b ) { @@ -102,7 +93,6 @@ operator*( const Containers::VectorView< Real1, Device, Index >& a, const Contai //// // Division template< typename Real, typename Device, typename Index, typename ET > -__cuda_callable__ const Containers::Expressions::BinaryExpressionTemplate< Containers::VectorView< Real, Device, Index >, ET, Containers::Expressions::Division > operator/( const Containers::VectorView< Real, Device, Index >& a, const ET& b ) { @@ -110,7 +100,6 @@ operator/( const Containers::VectorView< Real, Device, Index >& a, const ET& b ) } template< typename ET, typename Real, typename Device, typename Index > -__cuda_callable__ const Containers::Expressions::BinaryExpressionTemplate< ET, Containers::VectorView< Real, Device, Index >, Containers::Expressions::Division > operator/( const ET& a, const Containers::VectorView< Real, Device, Index >& b ) { @@ -118,7 +107,6 @@ operator/( const ET& a, const Containers::VectorView< Real, Device, Index >& b ) } template< typename Real1, typename Real2, typename Device, typename Index > -__cuda_callable__ const Containers::Expressions::BinaryExpressionTemplate< Containers::VectorView< Real1, Device, Index >, Containers::VectorView< Real2, Device, Index >, Containers::Expressions::Division > operator/( const Containers::VectorView< Real1, Device, Index >& a, const Containers::VectorView< Real2, Device, Index >& b ) { @@ -128,7 +116,6 @@ operator/( const Containers::VectorView< Real1, Device, Index >& a, const Contai //// // Min template< typename Real, typename Device, typename Index, typename ET > -__cuda_callable__ const Containers::Expressions::BinaryExpressionTemplate< Containers::VectorView< Real, Device, Index >, ET, Containers::Expressions::Min > min( const Containers::VectorView< Real, Device, Index >& a, const ET& b ) { @@ -136,7 +123,6 @@ min( const Containers::VectorView< Real, Device, Index >& a, const ET& b ) } template< typename ET, typename Real, typename Device, typename Index > -__cuda_callable__ const Containers::Expressions::BinaryExpressionTemplate< ET, Containers::VectorView< Real, Device, Index >, Containers::Expressions::Min > min( const ET& a, const Containers::VectorView< Real, Device, Index >& b ) { @@ -144,7 +130,6 @@ min( const ET& a, const Containers::VectorView< Real, Device, Index >& b ) } template< typename Real1, typename Real2, typename Device, typename Index > -__cuda_callable__ const Containers::Expressions::BinaryExpressionTemplate< Containers::VectorView< Real1, Device, Index >, Containers::VectorView< Real2, Device, Index >, Containers::Expressions::Min > min( const Containers::VectorView< Real1, Device, Index >& a, const Containers::VectorView< Real2, Device, Index >& b ) { @@ -154,7 +139,6 @@ min( const Containers::VectorView< Real1, Device, Index >& a, const Containers:: //// // Max template< typename Real, typename Device, typename Index, typename ET > -__cuda_callable__ const Containers::Expressions::BinaryExpressionTemplate< Containers::VectorView< Real, Device, Index >, ET, Containers::Expressions::Max > max( const Containers::VectorView< Real, Device, Index >& a, const ET& b ) { @@ -162,7 +146,6 @@ max( const Containers::VectorView< Real, Device, Index >& a, const ET& b ) } template< typename ET, typename Real, typename Device, typename Index > -__cuda_callable__ const Containers::Expressions::BinaryExpressionTemplate< ET, Containers::VectorView< Real, Device, Index >, Containers::Expressions::Max > max( const ET& a, const Containers::VectorView< Real, Device, Index >& b ) { @@ -170,7 +153,6 @@ max( const ET& a, const Containers::VectorView< Real, Device, Index >& b ) } template< typename Real1, typename Real2, typename Device, typename Index > -__cuda_callable__ const Containers::Expressions::BinaryExpressionTemplate< Containers::VectorView< Real1, Device, Index >, Containers::VectorView< Real2, Device, Index >, Containers::Expressions::Max > max( const Containers::VectorView< Real1, Device, Index >& a, const Containers::VectorView< Real2, Device, Index >& b ) { @@ -180,21 +162,18 @@ max( const Containers::VectorView< Real1, Device, Index >& a, const Containers:: //// // Comparison operations - operator == template< typename Real, typename Device, typename Index, typename ET > -__cuda_callable__ bool operator==( const Containers::VectorView< Real, Device, Index >& a, const ET& b ) { return Containers::Expressions::ComparisonEQ( a, b ); } template< typename ET, typename Real, typename Device, typename Index > -__cuda_callable__ bool operator==( const ET& a, const Containers::VectorView< Real, Device, Index >& b ) { return Containers::Expressions::ComparisonEQ( a, b ); } template< typename Real1, typename Real2, typename Device, typename Index > -__cuda_callable__ bool operator==( const Containers::VectorView< Real1, Device, Index >& a, const Containers::VectorView< Real2, Device, Index >& b ) { return Containers::Expressions::ComparisonEQ( a, b ); @@ -203,21 +182,18 @@ bool operator==( const Containers::VectorView< Real1, Device, Index >& a, const //// // Comparison operations - operator != template< typename Real, typename Device, typename Index, typename ET > -__cuda_callable__ bool operator!=( const Containers::VectorView< Real, Device, Index >& a, const ET& b ) { return Containers::Expressions::ComparisonNE( a, b ); } template< typename ET, typename Real, typename Device, typename Index > -__cuda_callable__ bool operator!=( const ET& a, const Containers::VectorView< Real, Device, Index >& b ) { return Containers::Expressions::ComparisonNE( a, b ); } template< typename Real1, typename Real2, typename Device, typename Index > -__cuda_callable__ bool operator!=( const Containers::VectorView< Real1, Device, Index >& a, const Containers::VectorView< Real2, Device, Index >& b ) { return Containers::Expressions::ComparisonNE( a, b ); @@ -226,21 +202,18 @@ bool operator!=( const Containers::VectorView< Real1, Device, Index >& a, const //// // Comparison operations - operator < template< typename Real, typename Device, typename Index, typename ET > -__cuda_callable__ bool operator<( const Containers::VectorView< Real, Device, Index >& a, const ET& b ) { return Containers::Expressions::ComparisonLT( a, b ); } template< typename ET, typename Real, typename Device, typename Index > -__cuda_callable__ bool operator<( const ET& a, const Containers::VectorView< Real, Device, Index >& b ) { return Containers::Expressions::ComparisonLT( a, b ); } template< typename Real1, typename Real2, typename Device, typename Index > -__cuda_callable__ bool operator<( const Containers::VectorView< Real1, Device, Index >& a, const Containers::VectorView< Real2, Device, Index >& b ) { return Containers::Expressions::ComparisonLT( a, b ); @@ -249,21 +222,18 @@ bool operator<( const Containers::VectorView< Real1, Device, Index >& a, const C //// // Comparison operations - operator <= template< typename Real, typename Device, typename Index, typename ET > -__cuda_callable__ bool operator<=( const Containers::VectorView< Real, Device, Index >& a, const ET& b ) { return Containers::Expressions::ComparisonLE( a, b ); } template< typename ET, typename Real, typename Device, typename Index > -__cuda_callable__ bool operator<=( const ET& a, const Containers::VectorView< Real, Device, Index >& b ) { return Containers::Expressions::ComparisonLE( a, b ); } template< typename Real1, typename Real2, typename Device, typename Index > -__cuda_callable__ bool operator<=( const Containers::VectorView< Real1, Device, Index >& a, const Containers::VectorView< Real2, Device, Index >& b ) { return Containers::Expressions::ComparisonLE( a, b ); @@ -272,21 +242,18 @@ bool operator<=( const Containers::VectorView< Real1, Device, Index >& a, const //// // Comparison operations - operator > template< typename Real, typename Device, typename Index, typename ET > -__cuda_callable__ bool operator>( const Containers::VectorView< Real, Device, Index >& a, const ET& b ) { return Containers::Expressions::ComparisonGT( a, b ); } template< typename ET, typename Real, typename Device, typename Index > -__cuda_callable__ bool operator>( const ET& a, const Containers::VectorView< Real, Device, Index >& b ) { return Containers::Expressions::ComparisonGT( a, b ); } template< typename Real1, typename Real2, typename Device, typename Index > -__cuda_callable__ bool operator>( const Containers::VectorView< Real1, Device, Index >& a, const Containers::VectorView< Real2, Device, Index >& b ) { return Containers::Expressions::ComparisonGT( a, b ); @@ -295,21 +262,18 @@ bool operator>( const Containers::VectorView< Real1, Device, Index >& a, const C //// // Comparison operations - operator >= template< typename Real, typename Device, typename Index, typename ET > -__cuda_callable__ bool operator>=( const Containers::VectorView< Real, Device, Index >& a, const ET& b ) { return Containers::Expressions::ComparisonGE( a, b ); } template< typename ET, typename Real, typename Device, typename Index > -__cuda_callable__ bool operator>=( const ET& a, const Containers::VectorView< Real, Device, Index >& b ) { return Containers::Expressions::ComparisonGE( a, b ); } template< typename Real1, typename Real2, typename Device, typename Index > -__cuda_callable__ bool operator>=( const Containers::VectorView< Real1, Device, Index >& a, const Containers::VectorView< Real2, Device, Index >& b ) { return Containers::Expressions::ComparisonGE( a, b ); @@ -318,7 +282,6 @@ bool operator>=( const Containers::VectorView< Real1, Device, Index >& a, const //// // Minus template< typename Real, typename Device, typename Index > -__cuda_callable__ const Containers::Expressions::UnaryExpressionTemplate< Containers::VectorView< Real, Device, Index >, Containers::Expressions::Minus > operator-( const Containers::VectorView< Real, Device, Index >& a ) { @@ -328,7 +291,6 @@ operator-( const Containers::VectorView< Real, Device, Index >& a ) //// // Abs template< typename Real, typename Device, typename Index > -__cuda_callable__ const Containers::Expressions::UnaryExpressionTemplate< Containers::VectorView< Real, Device, Index >, Containers::Expressions::Abs > abs( const Containers::VectorView< Real, Device, Index >& a ) { @@ -338,7 +300,6 @@ abs( const Containers::VectorView< Real, Device, Index >& a ) //// // Sine template< typename Real, typename Device, typename Index > -__cuda_callable__ const Containers::Expressions::UnaryExpressionTemplate< Containers::VectorView< Real, Device, Index >, Containers::Expressions::Sin > sin( const Containers::VectorView< Real, Device, Index >& a ) { @@ -348,7 +309,6 @@ sin( const Containers::VectorView< Real, Device, Index >& a ) //// // Cosine template< typename Real, typename Device, typename Index > -__cuda_callable__ const Containers::Expressions::UnaryExpressionTemplate< Containers::VectorView< Real, Device, Index >, Containers::Expressions::Cos > cos( const Containers::VectorView< Real, Device, Index >& a ) { @@ -358,7 +318,6 @@ cos( const Containers::VectorView< Real, Device, Index >& a ) //// // Tangent template< typename Real, typename Device, typename Index > -__cuda_callable__ const Containers::Expressions::UnaryExpressionTemplate< Containers::VectorView< Real, Device, Index >, Containers::Expressions::Tan > tan( const Containers::VectorView< Real, Device, Index >& a ) { @@ -368,7 +327,6 @@ tan( const Containers::VectorView< Real, Device, Index >& a ) //// // Sqrt template< typename Real, typename Device, typename Index > -__cuda_callable__ const Containers::Expressions::UnaryExpressionTemplate< Containers::VectorView< Real, Device, Index >, Containers::Expressions::Sqrt > sqrt( const Containers::VectorView< Real, Device, Index >& a ) { @@ -378,7 +336,6 @@ sqrt( const Containers::VectorView< Real, Device, Index >& a ) //// // Cbrt template< typename Real, typename Device, typename Index > -__cuda_callable__ const Containers::Expressions::UnaryExpressionTemplate< Containers::VectorView< Real, Device, Index >, Containers::Expressions::Cbrt > cbrt( const Containers::VectorView< Real, Device, Index >& a ) { @@ -388,7 +345,6 @@ cbrt( const Containers::VectorView< Real, Device, Index >& a ) //// // Power template< typename Real, typename Device, typename Index, typename ExpType > -__cuda_callable__ const Containers::Expressions::UnaryExpressionTemplate< Containers::VectorView< Real, Device, Index >, Containers::Expressions::Pow, ExpType > pow( const Containers::VectorView< Real, Device, Index >& a, const ExpType& exp ) { @@ -398,7 +354,6 @@ pow( const Containers::VectorView< Real, Device, Index >& a, const ExpType& exp //// // Floor template< typename Real, typename Device, typename Index > -__cuda_callable__ const Containers::Expressions::UnaryExpressionTemplate< Containers::VectorView< Real, Device, Index >, Containers::Expressions::Floor > floor( const Containers::VectorView< Real, Device, Index >& a ) { @@ -408,7 +363,6 @@ floor( const Containers::VectorView< Real, Device, Index >& a ) //// // Ceil template< typename Real, typename Device, typename Index > -__cuda_callable__ const Containers::Expressions::UnaryExpressionTemplate< Containers::VectorView< Real, Device, Index >, Containers::Expressions::Ceil > ceil( const Containers::VectorView< Real, Device, Index >& a ) { @@ -418,7 +372,6 @@ ceil( const Containers::VectorView< Real, Device, Index >& a ) //// // Acos template< typename Real, typename Device, typename Index > -__cuda_callable__ const Containers::Expressions::UnaryExpressionTemplate< Containers::VectorView< Real, Device, Index >, Containers::Expressions::Acos > acos( const Containers::VectorView< Real, Device, Index >& a ) { @@ -428,7 +381,6 @@ acos( const Containers::VectorView< Real, Device, Index >& a ) //// // Asin template< typename Real, typename Device, typename Index > -__cuda_callable__ const Containers::Expressions::UnaryExpressionTemplate< Containers::VectorView< Real, Device, Index >, Containers::Expressions::Asin > asin( const Containers::VectorView< Real, Device, Index >& a ) { @@ -438,7 +390,6 @@ asin( const Containers::VectorView< Real, Device, Index >& a ) //// // Atan template< typename Real, typename Device, typename Index > -__cuda_callable__ const Containers::Expressions::UnaryExpressionTemplate< Containers::VectorView< Real, Device, Index >, Containers::Expressions::Atan > atan( const Containers::VectorView< Real, Device, Index >& a ) { @@ -448,7 +399,6 @@ atan( const Containers::VectorView< Real, Device, Index >& a ) //// // Cosh template< typename Real, typename Device, typename Index > -__cuda_callable__ const Containers::Expressions::UnaryExpressionTemplate< Containers::VectorView< Real, Device, Index >, Containers::Expressions::Cosh > cosh( const Containers::VectorView< Real, Device, Index >& a ) { @@ -458,7 +408,6 @@ cosh( const Containers::VectorView< Real, Device, Index >& a ) //// // Tanh template< typename Real, typename Device, typename Index > -__cuda_callable__ const Containers::Expressions::UnaryExpressionTemplate< Containers::VectorView< Real, Device, Index >, Containers::Expressions::Tanh > tanh( const Containers::VectorView< Real, Device, Index >& a ) { @@ -468,7 +417,6 @@ tanh( const Containers::VectorView< Real, Device, Index >& a ) //// // Log template< typename Real, typename Device, typename Index > -__cuda_callable__ const Containers::Expressions::UnaryExpressionTemplate< Containers::VectorView< Real, Device, Index >, Containers::Expressions::Log > log( const Containers::VectorView< Real, Device, Index >& a ) { @@ -478,7 +426,6 @@ log( const Containers::VectorView< Real, Device, Index >& a ) //// // Log10 template< typename Real, typename Device, typename Index > -__cuda_callable__ const Containers::Expressions::UnaryExpressionTemplate< Containers::VectorView< Real, Device, Index >, Containers::Expressions::Log10 > log10( const Containers::VectorView< Real, Device, Index >& a ) { @@ -488,7 +435,6 @@ log10( const Containers::VectorView< Real, Device, Index >& a ) //// // Log2 template< typename Real, typename Device, typename Index > -__cuda_callable__ const Containers::Expressions::UnaryExpressionTemplate< Containers::VectorView< Real, Device, Index >, Containers::Expressions::Log2 > log2( const Containers::VectorView< Real, Device, Index >& a ) { @@ -498,7 +444,6 @@ log2( const Containers::VectorView< Real, Device, Index >& a ) //// // Exp template< typename Real, typename Device, typename Index > -__cuda_callable__ const Containers::Expressions::UnaryExpressionTemplate< Containers::VectorView< Real, Device, Index >, Containers::Expressions::Exp > exp( const Containers::VectorView< Real, Device, Index >& a ) { @@ -508,7 +453,6 @@ exp( const Containers::VectorView< Real, Device, Index >& a ) //// // Sign template< typename Real, typename Device, typename Index > -__cuda_callable__ const Containers::Expressions::UnaryExpressionTemplate< Containers::VectorView< Real, Device, Index >, Containers::Expressions::Sign > sign( const Containers::VectorView< Real, Device, Index >& a ) { @@ -602,21 +546,18 @@ binaryAnd( const Containers::VectorView< Real, Device, Index >& a ) //// // Scalar product template< typename Real, typename Device, typename Index, typename ET > -__cuda_callable__ Real operator,( const Containers::VectorView< Real, Device, Index >& a, const ET& b ) { return TNL::sum( a * b ); } template< typename ET, typename Real, typename Device, typename Index > -__cuda_callable__ Real operator,( const ET& a, const Containers::VectorView< Real, Device, Index >& b ) { return TNL::sum( a * b ); } template< typename Real1, typename Real2, typename Device, typename Index > -__cuda_callable__ auto operator,( const Containers::VectorView< Real1, Device, Index >& a, const Containers::VectorView< Real2, Device, Index >& b ) ->decltype( TNL::sum( a * b ) ) { @@ -626,7 +567,6 @@ auto operator,( const Containers::VectorView< Real1, Device, Index >& a, const C //// // TODO: Replace this with multiplication when its safe template< typename Real, typename Device, typename Index, typename ET > -__cuda_callable__ Containers::VectorView< Real, Device, Index > Scale( const Containers::VectorView< Real, Device, Index >& a, const ET& b ) { @@ -635,7 +575,6 @@ Scale( const Containers::VectorView< Real, Device, Index >& a, const ET& b ) } template< typename ET, typename Real, typename Device, typename Index > -__cuda_callable__ Containers::Expressions::BinaryExpressionTemplate< ET, Containers::VectorView< Real, Device, Index >, Containers::Expressions::Multiplication > Scale( const ET& a, const Containers::VectorView< Real, Device, Index >& b ) { @@ -644,7 +583,6 @@ Scale( const ET& a, const Containers::VectorView< Real, Device, Index >& b ) } template< typename Real1, typename Real2, typename Device, typename Index > -__cuda_callable__ Containers::Expressions::BinaryExpressionTemplate< Containers::VectorView< Real1, Device, Index >, Containers::VectorView< Real2, Device, Index >, Containers::Expressions::Multiplication > Scale( const Containers::VectorView< Real1, Device, Index >& a, const Containers::VectorView< Real2, Device, Index >& b ) {