Loading src/Benchmarks/BLAS/array-operations.h +16 −16 Original line number Diff line number Diff line Loading @@ -69,12 +69,12 @@ benchmarkArrayOperations( Benchmark & benchmark, auto compareHost = [&]() { resultHost = (int) ( hostArray == hostArray2 ); }; auto compareCuda = [&]() { resultDevice = (int) ( deviceArray == deviceArray2 ); }; benchmark.setOperation( "comparison (operator==)", 2 * datasetSize ); benchmark.time< Devices::Host >( reset1, "CPU", compareHost ); #ifdef HAVE_CUDA auto compareCuda = [&]() { resultDevice = (int) ( deviceArray == deviceArray2 ); }; benchmark.time< Devices::Cuda >( reset1, "GPU", compareCuda ); #endif Loading @@ -82,25 +82,25 @@ benchmarkArrayOperations( Benchmark & benchmark, auto copyAssignHostHost = [&]() { hostArray = hostArray2; }; auto copyAssignCudaCuda = [&]() { deviceArray = deviceArray2; }; benchmark.setOperation( "copy (operator=)", 2 * datasetSize ); // copyBasetime is used later inside HAVE_CUDA guard, so the compiler will // complain when compiling without CUDA const double copyBasetime = benchmark.time< Devices::Host >( reset1, "CPU", copyAssignHostHost ); #ifdef HAVE_CUDA auto copyAssignCudaCuda = [&]() { deviceArray = deviceArray2; }; benchmark.time< Devices::Cuda >( reset1, "GPU", copyAssignCudaCuda ); #endif #ifdef HAVE_CUDA auto copyAssignHostCuda = [&]() { deviceArray = hostArray; }; auto copyAssignCudaHost = [&]() { hostArray = deviceArray; }; #ifdef HAVE_CUDA benchmark.setOperation( "copy (operator=)", datasetSize, copyBasetime ); benchmark.time< Devices::Cuda >( reset1, "CPU->GPU", copyAssignHostCuda ); benchmark.time< Devices::Cuda >( reset1, "GPU->CPU", copyAssignCudaHost ); Loading @@ -110,12 +110,12 @@ benchmarkArrayOperations( Benchmark & benchmark, auto setValueHost = [&]() { hostArray.setValue( 3.0 ); }; auto setValueCuda = [&]() { deviceArray.setValue( 3.0 ); }; benchmark.setOperation( "setValue", datasetSize ); benchmark.time< Devices::Host >( reset1, "CPU", setValueHost ); #ifdef HAVE_CUDA auto setValueCuda = [&]() { deviceArray.setValue( 3.0 ); }; benchmark.time< Devices::Cuda >( reset1, "GPU", setValueCuda ); #endif Loading @@ -123,9 +123,6 @@ benchmarkArrayOperations( Benchmark & benchmark, auto setSizeHost = [&]() { hostArray.setSize( size ); }; auto setSizeCuda = [&]() { deviceArray.setSize( size ); }; auto resetSize1 = [&]() { hostArray.reset(); #ifdef HAVE_CUDA Loading @@ -135,6 +132,9 @@ benchmarkArrayOperations( Benchmark & benchmark, benchmark.setOperation( "allocation (setSize)", datasetSize ); benchmark.time< Devices::Host >( resetSize1, "CPU", setSizeHost ); #ifdef HAVE_CUDA auto setSizeCuda = [&]() { deviceArray.setSize( size ); }; benchmark.time< Devices::Cuda >( resetSize1, "GPU", setSizeCuda ); #endif Loading @@ -142,9 +142,6 @@ benchmarkArrayOperations( Benchmark & benchmark, auto resetSizeHost = [&]() { hostArray.reset(); }; auto resetSizeCuda = [&]() { deviceArray.reset(); }; auto setSize1 = [&]() { hostArray.setSize( size ); #ifdef HAVE_CUDA Loading @@ -154,6 +151,9 @@ benchmarkArrayOperations( Benchmark & benchmark, benchmark.setOperation( "deallocation (reset)", datasetSize ); benchmark.time< Devices::Host >( setSize1, "CPU", resetSizeHost ); #ifdef HAVE_CUDA auto resetSizeCuda = [&]() { deviceArray.reset(); }; benchmark.time< Devices::Cuda >( setSize1, "GPU", resetSizeCuda ); #endif } Loading src/Benchmarks/BLAS/spmv.h +3 −4 Original line number Diff line number Diff line Loading @@ -155,13 +155,12 @@ benchmarkSpMV( Benchmark & benchmark, auto spmvHost = [&]() { hostMatrix.vectorProduct( hostVector, hostVector2 ); }; auto spmvCuda = [&]() { deviceMatrix.vectorProduct( deviceVector, deviceVector2 ); }; benchmark.setOperation( datasetSize ); benchmark.time< Devices::Host >( reset, "CPU", spmvHost ); #ifdef HAVE_CUDA auto spmvCuda = [&]() { deviceMatrix.vectorProduct( deviceVector, deviceVector2 ); }; benchmark.time< Devices::Cuda >( reset, "GPU", spmvCuda ); #endif } Loading src/Benchmarks/BLAS/vector-operations.h +134 −153 File changed.Preview size limit exceeded, changes collapsed. Show changes src/TNL/Containers/ndarray/SizesHolderHelpers.h +4 −2 Original line number Diff line number Diff line Loading @@ -184,7 +184,8 @@ struct SetSizesCopyHelper target.template setSize< level >( source.template getSize< level >() ); SetSizesCopyHelper< TargetHolder, SourceHolder, level - 1 >::copy( target, source ); } else if( target.template getStaticSize< level >() != source.template getSize< level >() ) else if( source.template getSize< level >() < 0 || target.template getStaticSize< level >() != (std::size_t) source.template getSize< level >() ) throw std::logic_error( "Cannot copy sizes due to inconsistent underlying types (static sizes don't match)." ); } }; Loading @@ -198,7 +199,8 @@ struct SetSizesCopyHelper< TargetHolder, SourceHolder, 0 > { if( target.template getStaticSize< 0 >() == 0 ) target.template setSize< 0 >( source.template getSize< 0 >() ); else if( target.template getStaticSize< 0 >() != source.template getSize< 0 >() ) else if( source.template getSize< 0 >() || target.template getStaticSize< 0 >() != (std::size_t) source.template getSize< 0 >() ) throw std::logic_error( "Cannot copy sizes due to inconsistent underlying types (static sizes don't match)." ); } }; Loading src/TNL/Devices/Cuda_impl.h +5 −2 Original line number Diff line number Diff line Loading @@ -294,6 +294,7 @@ __device__ Element* Cuda::getSharedMemory() { return CudaSharedMemory< Element >(); } #endif #ifdef HAVE_CUDA inline void Cuda::checkDevice( const char* file_name, int line, cudaError error ) Loading Loading @@ -326,6 +327,8 @@ inline bool Cuda::synchronizeDevice( int deviceId ) getSmartPointersSynchronizationTimer().stop(); return b; #endif #else return true; #endif } Loading Loading @@ -353,6 +356,7 @@ namespace { // double-precision atomicAdd function for Maxwell and older GPUs // copied from: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomic-functions #ifdef HAVE_CUDA #if __CUDA_ARCH__ < 600 namespace { __device__ double atomicAdd(double* address, double val) Loading @@ -374,8 +378,7 @@ namespace { } } // namespace #endif #endif /* HAVE_CUDA */ #endif } // namespace Devices } // namespace TNL Loading
src/Benchmarks/BLAS/array-operations.h +16 −16 Original line number Diff line number Diff line Loading @@ -69,12 +69,12 @@ benchmarkArrayOperations( Benchmark & benchmark, auto compareHost = [&]() { resultHost = (int) ( hostArray == hostArray2 ); }; auto compareCuda = [&]() { resultDevice = (int) ( deviceArray == deviceArray2 ); }; benchmark.setOperation( "comparison (operator==)", 2 * datasetSize ); benchmark.time< Devices::Host >( reset1, "CPU", compareHost ); #ifdef HAVE_CUDA auto compareCuda = [&]() { resultDevice = (int) ( deviceArray == deviceArray2 ); }; benchmark.time< Devices::Cuda >( reset1, "GPU", compareCuda ); #endif Loading @@ -82,25 +82,25 @@ benchmarkArrayOperations( Benchmark & benchmark, auto copyAssignHostHost = [&]() { hostArray = hostArray2; }; auto copyAssignCudaCuda = [&]() { deviceArray = deviceArray2; }; benchmark.setOperation( "copy (operator=)", 2 * datasetSize ); // copyBasetime is used later inside HAVE_CUDA guard, so the compiler will // complain when compiling without CUDA const double copyBasetime = benchmark.time< Devices::Host >( reset1, "CPU", copyAssignHostHost ); #ifdef HAVE_CUDA auto copyAssignCudaCuda = [&]() { deviceArray = deviceArray2; }; benchmark.time< Devices::Cuda >( reset1, "GPU", copyAssignCudaCuda ); #endif #ifdef HAVE_CUDA auto copyAssignHostCuda = [&]() { deviceArray = hostArray; }; auto copyAssignCudaHost = [&]() { hostArray = deviceArray; }; #ifdef HAVE_CUDA benchmark.setOperation( "copy (operator=)", datasetSize, copyBasetime ); benchmark.time< Devices::Cuda >( reset1, "CPU->GPU", copyAssignHostCuda ); benchmark.time< Devices::Cuda >( reset1, "GPU->CPU", copyAssignCudaHost ); Loading @@ -110,12 +110,12 @@ benchmarkArrayOperations( Benchmark & benchmark, auto setValueHost = [&]() { hostArray.setValue( 3.0 ); }; auto setValueCuda = [&]() { deviceArray.setValue( 3.0 ); }; benchmark.setOperation( "setValue", datasetSize ); benchmark.time< Devices::Host >( reset1, "CPU", setValueHost ); #ifdef HAVE_CUDA auto setValueCuda = [&]() { deviceArray.setValue( 3.0 ); }; benchmark.time< Devices::Cuda >( reset1, "GPU", setValueCuda ); #endif Loading @@ -123,9 +123,6 @@ benchmarkArrayOperations( Benchmark & benchmark, auto setSizeHost = [&]() { hostArray.setSize( size ); }; auto setSizeCuda = [&]() { deviceArray.setSize( size ); }; auto resetSize1 = [&]() { hostArray.reset(); #ifdef HAVE_CUDA Loading @@ -135,6 +132,9 @@ benchmarkArrayOperations( Benchmark & benchmark, benchmark.setOperation( "allocation (setSize)", datasetSize ); benchmark.time< Devices::Host >( resetSize1, "CPU", setSizeHost ); #ifdef HAVE_CUDA auto setSizeCuda = [&]() { deviceArray.setSize( size ); }; benchmark.time< Devices::Cuda >( resetSize1, "GPU", setSizeCuda ); #endif Loading @@ -142,9 +142,6 @@ benchmarkArrayOperations( Benchmark & benchmark, auto resetSizeHost = [&]() { hostArray.reset(); }; auto resetSizeCuda = [&]() { deviceArray.reset(); }; auto setSize1 = [&]() { hostArray.setSize( size ); #ifdef HAVE_CUDA Loading @@ -154,6 +151,9 @@ benchmarkArrayOperations( Benchmark & benchmark, benchmark.setOperation( "deallocation (reset)", datasetSize ); benchmark.time< Devices::Host >( setSize1, "CPU", resetSizeHost ); #ifdef HAVE_CUDA auto resetSizeCuda = [&]() { deviceArray.reset(); }; benchmark.time< Devices::Cuda >( setSize1, "GPU", resetSizeCuda ); #endif } Loading
src/Benchmarks/BLAS/spmv.h +3 −4 Original line number Diff line number Diff line Loading @@ -155,13 +155,12 @@ benchmarkSpMV( Benchmark & benchmark, auto spmvHost = [&]() { hostMatrix.vectorProduct( hostVector, hostVector2 ); }; auto spmvCuda = [&]() { deviceMatrix.vectorProduct( deviceVector, deviceVector2 ); }; benchmark.setOperation( datasetSize ); benchmark.time< Devices::Host >( reset, "CPU", spmvHost ); #ifdef HAVE_CUDA auto spmvCuda = [&]() { deviceMatrix.vectorProduct( deviceVector, deviceVector2 ); }; benchmark.time< Devices::Cuda >( reset, "GPU", spmvCuda ); #endif } Loading
src/Benchmarks/BLAS/vector-operations.h +134 −153 File changed.Preview size limit exceeded, changes collapsed. Show changes
src/TNL/Containers/ndarray/SizesHolderHelpers.h +4 −2 Original line number Diff line number Diff line Loading @@ -184,7 +184,8 @@ struct SetSizesCopyHelper target.template setSize< level >( source.template getSize< level >() ); SetSizesCopyHelper< TargetHolder, SourceHolder, level - 1 >::copy( target, source ); } else if( target.template getStaticSize< level >() != source.template getSize< level >() ) else if( source.template getSize< level >() < 0 || target.template getStaticSize< level >() != (std::size_t) source.template getSize< level >() ) throw std::logic_error( "Cannot copy sizes due to inconsistent underlying types (static sizes don't match)." ); } }; Loading @@ -198,7 +199,8 @@ struct SetSizesCopyHelper< TargetHolder, SourceHolder, 0 > { if( target.template getStaticSize< 0 >() == 0 ) target.template setSize< 0 >( source.template getSize< 0 >() ); else if( target.template getStaticSize< 0 >() != source.template getSize< 0 >() ) else if( source.template getSize< 0 >() || target.template getStaticSize< 0 >() != (std::size_t) source.template getSize< 0 >() ) throw std::logic_error( "Cannot copy sizes due to inconsistent underlying types (static sizes don't match)." ); } }; Loading
src/TNL/Devices/Cuda_impl.h +5 −2 Original line number Diff line number Diff line Loading @@ -294,6 +294,7 @@ __device__ Element* Cuda::getSharedMemory() { return CudaSharedMemory< Element >(); } #endif #ifdef HAVE_CUDA inline void Cuda::checkDevice( const char* file_name, int line, cudaError error ) Loading Loading @@ -326,6 +327,8 @@ inline bool Cuda::synchronizeDevice( int deviceId ) getSmartPointersSynchronizationTimer().stop(); return b; #endif #else return true; #endif } Loading Loading @@ -353,6 +356,7 @@ namespace { // double-precision atomicAdd function for Maxwell and older GPUs // copied from: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomic-functions #ifdef HAVE_CUDA #if __CUDA_ARCH__ < 600 namespace { __device__ double atomicAdd(double* address, double val) Loading @@ -374,8 +378,7 @@ namespace { } } // namespace #endif #endif /* HAVE_CUDA */ #endif } // namespace Devices } // namespace TNL