From a1a054bf877e5ed7f879a9f4786b13e6a412b234 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Thu, 22 Aug 2019 19:55:27 +0200
Subject: [PATCH] Removed duplicate TransferBufferSize constants

Also set the buffer size to 1 MiB, because a larger buffer size slows down
memory copies significantly (e.g. MeshTest would take about 10x longer).

Addresses #26
---
 .../Algorithms/ArrayOperationsCuda.hpp         | 18 +++++++++---------
 src/TNL/Cuda/LaunchHelpers.h                   |  8 ++++++++
 src/TNL/Devices/Cuda.h                         | 10 ----------
 src/TNL/Devices/Cuda_impl.h                    |  5 -----
 src/TNL/File.h                                 |  8 --------
 src/TNL/File.hpp                               | 13 +++++++------
 6 files changed, 24 insertions(+), 38 deletions(-)

diff --git a/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp b/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp
index b81fd7f2b7..5e97f1ac26 100644
--- a/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp
+++ b/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp
@@ -106,11 +106,11 @@ copyFromIterator( DestinationElement* destination,
                   SourceIterator last )
 {
    using BaseType = typename std::remove_cv< DestinationElement >::type;
-   std::unique_ptr< BaseType[] > buffer{ new BaseType[ Devices::Cuda::getGPUTransferBufferSize() ] };
+   std::unique_ptr< BaseType[] > buffer{ new BaseType[ Cuda::getTransferBufferSize() ] };
    Index copiedElements = 0;
    while( copiedElements < destinationSize && first != last ) {
       Index i = 0;
-      while( i < Devices::Cuda::getGPUTransferBufferSize() && first != last )
+      while( i < Cuda::getTransferBufferSize() && first != last )
          buffer[ i++ ] = *first++;
       ArrayOperations< Devices::Cuda, Devices::Host >::copy( &destination[ copiedElements ], buffer.get(), i );
       copiedElements += i;
@@ -197,18 +197,18 @@ copy( DestinationElement* destination,
    else
    {
       using BaseType = typename std::remove_cv< SourceElement >::type;
-      std::unique_ptr< BaseType[] > buffer{ new BaseType[ Devices::Cuda::getGPUTransferBufferSize() ] };
+      std::unique_ptr< BaseType[] > buffer{ new BaseType[ Cuda::getTransferBufferSize() ] };
       Index i( 0 );
       while( i < size )
       {
          if( cudaMemcpy( (void*) buffer.get(),
                          (void*) &source[ i ],
-                         TNL::min( size - i, Devices::Cuda::getGPUTransferBufferSize() ) * sizeof( SourceElement ),
+                         TNL::min( size - i, Cuda::getTransferBufferSize() ) * sizeof( SourceElement ),
                          cudaMemcpyDeviceToHost ) != cudaSuccess )
             std::cerr << "Transfer of data from CUDA device to host failed." << std::endl;
          TNL_CHECK_CUDA_DEVICE;
          Index j( 0 );
-         while( j < Devices::Cuda::getGPUTransferBufferSize() && i + j < size )
+         while( j < Cuda::getTransferBufferSize() && i + j < size )
          {
             destination[ i + j ] = buffer[ j ];
             j++;
@@ -239,11 +239,11 @@ compare( const Element1* destination,
    TNL_ASSERT_TRUE( source, "Attempted to compare data through a nullptr." );
    TNL_ASSERT_GE( size, (Index) 0, "Array size must be non-negative." );
 #ifdef HAVE_CUDA
-   std::unique_ptr< Element2[] > host_buffer{ new Element2[ Devices::Cuda::getGPUTransferBufferSize() ] };
+   std::unique_ptr< Element2[] > host_buffer{ new Element2[ Cuda::getTransferBufferSize() ] };
    Index compared( 0 );
    while( compared < size )
    {
-      Index transfer = min( size - compared, Devices::Cuda::getGPUTransferBufferSize() );
+      Index transfer = min( size - compared, Cuda::getTransferBufferSize() );
       if( cudaMemcpy( (void*) host_buffer.get(),
                       (void*) &source[ compared ],
                       transfer * sizeof( Element2 ),
@@ -288,12 +288,12 @@ copy( DestinationElement* destination,
    }
    else
    {
-      std::unique_ptr< DestinationElement[] > buffer{ new DestinationElement[ Devices::Cuda::getGPUTransferBufferSize() ] };
+      std::unique_ptr< DestinationElement[] > buffer{ new DestinationElement[ Cuda::getTransferBufferSize() ] };
       Index i( 0 );
       while( i < size )
       {
          Index j( 0 );
-         while( j < Devices::Cuda::getGPUTransferBufferSize() && i + j < size )
+         while( j < Cuda::getTransferBufferSize() && i + j < size )
          {
             buffer[ j ] = source[ i + j ];
             j++;
diff --git a/src/TNL/Cuda/LaunchHelpers.h b/src/TNL/Cuda/LaunchHelpers.h
index aaca4a67d8..6e5d3c9757 100644
--- a/src/TNL/Cuda/LaunchHelpers.h
+++ b/src/TNL/Cuda/LaunchHelpers.h
@@ -30,6 +30,14 @@ inline constexpr int getWarpSize()
    return 32;
 }
 
+// When we transfer data between the GPU and the CPU we use a 1 MiB buffer.
+// This size should ensure good performance.
+// We use the same buffer size even for retyping data during IO operations.
+inline constexpr int getTransferBufferSize()
+{
+   return 1 << 20;
+}
+
 #ifdef HAVE_CUDA
 __device__ inline int getGlobalThreadIdx( const int gridIdx = 0,
                                           const int gridSize = getMaxGridSize() )
diff --git a/src/TNL/Devices/Cuda.h b/src/TNL/Devices/Cuda.h
index 6784da34d9..e1dd264b4c 100644
--- a/src/TNL/Devices/Cuda.h
+++ b/src/TNL/Devices/Cuda.h
@@ -24,16 +24,6 @@ public:
 
    static inline bool setup( const Config::ParameterContainer& parameters,
                              const String& prefix = "" );
-
-   static inline constexpr int getGPUTransferBufferSize();
-
-   ////
-   // When we transfer data between the GPU and the CPU we use 5 MB buffer. This
-   // size should ensure good performance -- see.
-   // http://wiki.accelereyes.com/wiki/index.php/GPU_Memory_Transfer .
-   // We use the same buffer size even for retyping data during IO operations.
-   //
-   static constexpr std::size_t TransferBufferSize = 5 * 2<<20;
 };
 
 } // namespace Devices
diff --git a/src/TNL/Devices/Cuda_impl.h b/src/TNL/Devices/Cuda_impl.h
index 5109f689e8..ae6bbcd178 100644
--- a/src/TNL/Devices/Cuda_impl.h
+++ b/src/TNL/Devices/Cuda_impl.h
@@ -51,10 +51,5 @@ Cuda::setup( const Config::ParameterContainer& parameters,
    return true;
 }
 
-inline constexpr int Cuda::getGPUTransferBufferSize()
-{
-   return 1 << 20;
-}
-
 } // namespace Devices
 } // namespace TNL
diff --git a/src/TNL/File.h b/src/TNL/File.h
index 70eb013b77..747f4f4e71 100644
--- a/src/TNL/File.h
+++ b/src/TNL/File.h
@@ -168,14 +168,6 @@ class File
 
       std::fstream file;
       String fileName;
-
-      ////
-      // When we transfer data between the GPU and the CPU we use 5 MB buffer. This
-      // size should ensure good performance -- see.
-      // http://wiki.accelereyes.com/wiki/index.php/GPU_Memory_Transfer .
-      // We use the same buffer size even for retyping data during IO operations.
-      //
-      static constexpr std::streamsize TransferBufferSize = 5 * 2<<20;
 };
 
 /**
diff --git a/src/TNL/File.hpp b/src/TNL/File.hpp
index a3eb66066e..d00903703f 100644
--- a/src/TNL/File.hpp
+++ b/src/TNL/File.hpp
@@ -18,6 +18,7 @@
 #include <TNL/File.h>
 #include <TNL/Assert.h>
 #include <TNL/Cuda/CheckDevice.h>
+#include <TNL/Cuda/LaunchHelpers.h>
 #include <TNL/Exceptions/CudaSupportMissing.h>
 #include <TNL/Exceptions/FileSerializationError.h>
 #include <TNL/Exceptions/FileDeserializationError.h>
@@ -101,7 +102,7 @@ void File::load_impl( Type* buffer, std::streamsize elements )
       file.read( reinterpret_cast<char*>(buffer), sizeof(Type) * elements );
    else
    {
-      const std::streamsize cast_buffer_size = std::min( TransferBufferSize / (std::streamsize) sizeof(SourceType), elements );
+      const std::streamsize cast_buffer_size = std::min( Cuda::getTransferBufferSize() / (std::streamsize) sizeof(SourceType), elements );
       using BaseType = typename std::remove_cv< SourceType >::type;
       std::unique_ptr< BaseType[] > cast_buffer{ new BaseType[ cast_buffer_size ] };
       std::streamsize readElements = 0;
@@ -124,7 +125,7 @@ template< typename Type,
 void File::load_impl( Type* buffer, std::streamsize elements )
 {
 #ifdef HAVE_CUDA
-   const std::streamsize host_buffer_size = std::min( TransferBufferSize / (std::streamsize) sizeof(Type), elements );
+   const std::streamsize host_buffer_size = std::min( Cuda::getTransferBufferSize() / (std::streamsize) sizeof(Type), elements );
    using BaseType = typename std::remove_cv< Type >::type;
    std::unique_ptr< BaseType[] > host_buffer{ new BaseType[ host_buffer_size ] };
 
@@ -145,7 +146,7 @@ void File::load_impl( Type* buffer, std::streamsize elements )
    }
    else
    {
-      const std::streamsize cast_buffer_size = std::min( TransferBufferSize / (std::streamsize) sizeof(SourceType), elements );
+      const std::streamsize cast_buffer_size = std::min( Cuda::getTransferBufferSize() / (std::streamsize) sizeof(SourceType), elements );
       using BaseType = typename std::remove_cv< SourceType >::type;
       std::unique_ptr< BaseType[] > cast_buffer{ new BaseType[ cast_buffer_size ] };
 
@@ -192,7 +193,7 @@ void File::save_impl( const Type* buffer, std::streamsize elements )
       file.write( reinterpret_cast<const char*>(buffer), sizeof(Type) * elements );
    else
    {
-      const std::streamsize cast_buffer_size = std::min( TransferBufferSize / (std::streamsize) sizeof(TargetType), elements );
+      const std::streamsize cast_buffer_size = std::min( Cuda::getTransferBufferSize() / (std::streamsize) sizeof(TargetType), elements );
       using BaseType = typename std::remove_cv< TargetType >::type;
       std::unique_ptr< BaseType[] > cast_buffer{ new BaseType[ cast_buffer_size ] };
       std::streamsize writtenElements = 0;
@@ -216,7 +217,7 @@ template< typename Type,
 void File::save_impl( const Type* buffer, std::streamsize elements )
 {
 #ifdef HAVE_CUDA
-   const std::streamsize host_buffer_size = std::min( TransferBufferSize / (std::streamsize) sizeof(Type), elements );
+   const std::streamsize host_buffer_size = std::min( Cuda::getTransferBufferSize() / (std::streamsize) sizeof(Type), elements );
    using BaseType = typename std::remove_cv< Type >::type;
    std::unique_ptr< BaseType[] > host_buffer{ new BaseType[ host_buffer_size ] };
 
@@ -237,7 +238,7 @@ void File::save_impl( const Type* buffer, std::streamsize elements )
    }
    else
    {
-      const std::streamsize cast_buffer_size = std::min( TransferBufferSize / (std::streamsize) sizeof(TargetType), elements );
+      const std::streamsize cast_buffer_size = std::min( Cuda::getTransferBufferSize() / (std::streamsize) sizeof(TargetType), elements );
       using BaseType = typename std::remove_cv< TargetType >::type;
       std::unique_ptr< BaseType[] > cast_buffer{ new BaseType[ cast_buffer_size ] };
 
-- 
GitLab