From a1a054bf877e5ed7f879a9f4786b13e6a412b234 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz> Date: Thu, 22 Aug 2019 19:55:27 +0200 Subject: [PATCH] Removed duplicate TransferBufferSize constants Also set the buffer size to 1 MiB, because larger buffer size slows down memory copies significantly (e.g. MeshTest would take about 10x longer). Addresses #26 --- .../Algorithms/ArrayOperationsCuda.hpp | 18 +++++++++--------- src/TNL/Cuda/LaunchHelpers.h | 8 ++++++++ src/TNL/Devices/Cuda.h | 10 ---------- src/TNL/Devices/Cuda_impl.h | 5 ----- src/TNL/File.h | 8 -------- src/TNL/File.hpp | 13 +++++++------ 6 files changed, 24 insertions(+), 38 deletions(-) diff --git a/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp b/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp index b81fd7f2b7..5e97f1ac26 100644 --- a/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp +++ b/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp @@ -106,11 +106,11 @@ copyFromIterator( DestinationElement* destination, SourceIterator last ) { using BaseType = typename std::remove_cv< DestinationElement >::type; - std::unique_ptr< BaseType[] > buffer{ new BaseType[ Devices::Cuda::getGPUTransferBufferSize() ] }; + std::unique_ptr< BaseType[] > buffer{ new BaseType[ Cuda::getTransferBufferSize() ] }; Index copiedElements = 0; while( copiedElements < destinationSize && first != last ) { Index i = 0; - while( i < Devices::Cuda::getGPUTransferBufferSize() && first != last ) + while( i < Cuda::getTransferBufferSize() && first != last ) buffer[ i++ ] = *first++; ArrayOperations< Devices::Cuda, Devices::Host >::copy( &destination[ copiedElements ], buffer.get(), i ); copiedElements += i; @@ -197,18 +197,18 @@ copy( DestinationElement* destination, else { using BaseType = typename std::remove_cv< SourceElement >::type; - std::unique_ptr< BaseType[] > buffer{ new BaseType[ Devices::Cuda::getGPUTransferBufferSize() ] }; + std::unique_ptr< BaseType[] > buffer{ new BaseType[ Cuda::getTransferBufferSize() ] }; Index i( 0 ); while( i < size ) { if( cudaMemcpy( (void*) buffer.get(), (void*) &source[ i ], - TNL::min( size - i, Devices::Cuda::getGPUTransferBufferSize() ) * sizeof( SourceElement ), + TNL::min( size - i, Cuda::getTransferBufferSize() ) * sizeof( SourceElement ), cudaMemcpyDeviceToHost ) != cudaSuccess ) std::cerr << "Transfer of data from CUDA device to host failed." << std::endl; TNL_CHECK_CUDA_DEVICE; Index j( 0 ); - while( j < Devices::Cuda::getGPUTransferBufferSize() && i + j < size ) + while( j < Cuda::getTransferBufferSize() && i + j < size ) { destination[ i + j ] = buffer[ j ]; j++; @@ -239,11 +239,11 @@ compare( const Element1* destination, TNL_ASSERT_TRUE( source, "Attempted to compare data through a nullptr." ); TNL_ASSERT_GE( size, (Index) 0, "Array size must be non-negative." ); #ifdef HAVE_CUDA - std::unique_ptr< Element2[] > host_buffer{ new Element2[ Devices::Cuda::getGPUTransferBufferSize() ] }; + std::unique_ptr< Element2[] > host_buffer{ new Element2[ Cuda::getTransferBufferSize() ] }; Index compared( 0 ); while( compared < size ) { - Index transfer = min( size - compared, Devices::Cuda::getGPUTransferBufferSize() ); + Index transfer = min( size - compared, Cuda::getTransferBufferSize() ); if( cudaMemcpy( (void*) host_buffer.get(), (void*) &source[ compared ], transfer * sizeof( Element2 ), @@ -288,12 +288,12 @@ copy( DestinationElement* destination, } else { - std::unique_ptr< DestinationElement[] > buffer{ new DestinationElement[ Devices::Cuda::getGPUTransferBufferSize() ] }; + std::unique_ptr< DestinationElement[] > buffer{ new DestinationElement[ Cuda::getTransferBufferSize() ] }; Index i( 0 ); while( i < size ) { Index j( 0 ); - while( j < Devices::Cuda::getGPUTransferBufferSize() && i + j < size ) + while( j < Cuda::getTransferBufferSize() && i + j < size ) { buffer[ j ] = source[ i + j ]; j++; diff --git a/src/TNL/Cuda/LaunchHelpers.h b/src/TNL/Cuda/LaunchHelpers.h index aaca4a67d8..6e5d3c9757 100644 --- a/src/TNL/Cuda/LaunchHelpers.h +++ b/src/TNL/Cuda/LaunchHelpers.h @@ -30,6 +30,14 @@ inline constexpr int getWarpSize() return 32; } +// When we transfer data between the GPU and the CPU we use 1 MiB buffer. This +// size should ensure good performance. +// We use the same buffer size even for retyping data during IO operations. +inline constexpr int getTransferBufferSize() +{ + return 1 << 20; +} + #ifdef HAVE_CUDA __device__ inline int getGlobalThreadIdx( const int gridIdx = 0, const int gridSize = getMaxGridSize() ) diff --git a/src/TNL/Devices/Cuda.h b/src/TNL/Devices/Cuda.h index 6784da34d9..e1dd264b4c 100644 --- a/src/TNL/Devices/Cuda.h +++ b/src/TNL/Devices/Cuda.h @@ -24,16 +24,6 @@ public: static inline bool setup( const Config::ParameterContainer& parameters, const String& prefix = "" ); - - static inline constexpr int getGPUTransferBufferSize(); - - //// - // When we transfer data between the GPU and the CPU we use 5 MB buffer. This - // size should ensure good performance -- see. - // http://wiki.accelereyes.com/wiki/index.php/GPU_Memory_Transfer . - // We use the same buffer size even for retyping data during IO operations. - // - static constexpr std::size_t TransferBufferSize = 5 * 2<<20; }; } // namespace Devices diff --git a/src/TNL/Devices/Cuda_impl.h b/src/TNL/Devices/Cuda_impl.h index 5109f689e8..ae6bbcd178 100644 --- a/src/TNL/Devices/Cuda_impl.h +++ b/src/TNL/Devices/Cuda_impl.h @@ -51,10 +51,5 @@ Cuda::setup( const Config::ParameterContainer& parameters, return true; } -inline constexpr int Cuda::getGPUTransferBufferSize() -{ - return 1 << 20; -} - } // namespace Devices } // namespace TNL diff --git a/src/TNL/File.h b/src/TNL/File.h index 70eb013b77..747f4f4e71 100644 --- a/src/TNL/File.h +++ b/src/TNL/File.h @@ -168,14 +168,6 @@ class File std::fstream file; String fileName; - - //// - // When we transfer data between the GPU and the CPU we use 5 MB buffer. This - // size should ensure good performance -- see. - // http://wiki.accelereyes.com/wiki/index.php/GPU_Memory_Transfer . - // We use the same buffer size even for retyping data during IO operations. - // - static constexpr std::streamsize TransferBufferSize = 5 * 2<<20; }; /** diff --git a/src/TNL/File.hpp b/src/TNL/File.hpp index a3eb66066e..d00903703f 100644 --- a/src/TNL/File.hpp +++ b/src/TNL/File.hpp @@ -18,6 +18,7 @@ #include <TNL/File.h> #include <TNL/Assert.h> #include <TNL/Cuda/CheckDevice.h> +#include <TNL/Cuda/LaunchHelpers.h> #include <TNL/Exceptions/CudaSupportMissing.h> #include <TNL/Exceptions/FileSerializationError.h> #include <TNL/Exceptions/FileDeserializationError.h> @@ -101,7 +102,7 @@ void File::load_impl( Type* buffer, std::streamsize elements ) file.read( reinterpret_cast<char*>(buffer), sizeof(Type) * elements ); else { - const std::streamsize cast_buffer_size = std::min( TransferBufferSize / (std::streamsize) sizeof(SourceType), elements ); + const std::streamsize cast_buffer_size = std::min( Cuda::getTransferBufferSize() / (std::streamsize) sizeof(SourceType), elements ); using BaseType = typename std::remove_cv< SourceType >::type; std::unique_ptr< BaseType[] > cast_buffer{ new BaseType[ cast_buffer_size ] }; std::streamsize readElements = 0; @@ -124,7 +125,7 @@ template< typename Type, void File::load_impl( Type* buffer, std::streamsize elements ) { #ifdef HAVE_CUDA - const std::streamsize host_buffer_size = std::min( TransferBufferSize / (std::streamsize) sizeof(Type), elements ); + const std::streamsize host_buffer_size = std::min( Cuda::getTransferBufferSize() / (std::streamsize) sizeof(Type), elements ); using BaseType = typename std::remove_cv< Type >::type; std::unique_ptr< BaseType[] > host_buffer{ new BaseType[ host_buffer_size ] }; @@ -145,7 +146,7 @@ void File::load_impl( Type* buffer, std::streamsize elements ) } else { - const std::streamsize cast_buffer_size = std::min( TransferBufferSize / (std::streamsize) sizeof(SourceType), elements ); + const std::streamsize cast_buffer_size = std::min( Cuda::getTransferBufferSize() / (std::streamsize) sizeof(SourceType), elements ); using BaseType = typename std::remove_cv< SourceType >::type; std::unique_ptr< BaseType[] > cast_buffer{ new BaseType[ cast_buffer_size ] }; @@ -192,7 +193,7 @@ void File::save_impl( const Type* buffer, std::streamsize elements ) file.write( reinterpret_cast<const char*>(buffer), sizeof(Type) * elements ); else { - const std::streamsize cast_buffer_size = std::min( TransferBufferSize / (std::streamsize) sizeof(TargetType), elements ); + const std::streamsize cast_buffer_size = std::min( Cuda::getTransferBufferSize() / (std::streamsize) sizeof(TargetType), elements ); using BaseType = typename std::remove_cv< TargetType >::type; std::unique_ptr< BaseType[] > cast_buffer{ new BaseType[ cast_buffer_size ] }; std::streamsize writtenElements = 0; @@ -216,7 +217,7 @@ template< typename Type, void File::save_impl( const Type* buffer, std::streamsize elements ) { #ifdef HAVE_CUDA - const std::streamsize host_buffer_size = std::min( TransferBufferSize / (std::streamsize) sizeof(Type), elements ); + const std::streamsize host_buffer_size = std::min( Cuda::getTransferBufferSize() / (std::streamsize) sizeof(Type), elements ); using BaseType = typename std::remove_cv< Type >::type; std::unique_ptr< BaseType[] > host_buffer{ new BaseType[ host_buffer_size ] }; @@ -237,7 +238,7 @@ void File::save_impl( const Type* buffer, std::streamsize elements ) } else { - const std::streamsize cast_buffer_size = std::min( TransferBufferSize / (std::streamsize) sizeof(TargetType), elements ); + const std::streamsize cast_buffer_size = std::min( Cuda::getTransferBufferSize() / (std::streamsize) sizeof(TargetType), elements ); using BaseType = typename std::remove_cv< TargetType >::type; std::unique_ptr< BaseType[] > cast_buffer{ new BaseType[ cast_buffer_size ] }; -- GitLab