src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp  +9 −9

@@ -106,11 +106,11 @@ copyFromIterator( DestinationElement* destination,
                    SourceIterator last )
 {
    using BaseType = typename std::remove_cv< DestinationElement >::type;
-   std::unique_ptr< BaseType[] > buffer{ new BaseType[ Devices::Cuda::getGPUTransferBufferSize() ] };
+   std::unique_ptr< BaseType[] > buffer{ new BaseType[ Cuda::getTransferBufferSize() ] };
    Index copiedElements = 0;
    while( copiedElements < destinationSize && first != last ) {
       Index i = 0;
-      while( i < Devices::Cuda::getGPUTransferBufferSize() && first != last )
+      while( i < Cuda::getTransferBufferSize() && first != last )
          buffer[ i++ ] = *first++;
       ArrayOperations< Devices::Cuda, Devices::Host >::copy( &destination[ copiedElements ], buffer.get(), i );
       copiedElements += i;

@@ -197,18 +197,18 @@ copy( DestinationElement* destination,
    else {
       using BaseType = typename std::remove_cv< SourceElement >::type;
-      std::unique_ptr< BaseType[] > buffer{ new BaseType[ Devices::Cuda::getGPUTransferBufferSize() ] };
+      std::unique_ptr< BaseType[] > buffer{ new BaseType[ Cuda::getTransferBufferSize() ] };
       Index i( 0 );
       while( i < size )
       {
          if( cudaMemcpy( (void*) buffer.get(),
                          (void*) &source[ i ],
-                         TNL::min( size - i, Devices::Cuda::getGPUTransferBufferSize() ) * sizeof( SourceElement ),
+                         TNL::min( size - i, Cuda::getTransferBufferSize() ) * sizeof( SourceElement ),
                          cudaMemcpyDeviceToHost ) != cudaSuccess )
             std::cerr << "Transfer of data from CUDA device to host failed." << std::endl;
          TNL_CHECK_CUDA_DEVICE;
          Index j( 0 );
-         while( j < Devices::Cuda::getGPUTransferBufferSize() && i + j < size )
+         while( j < Cuda::getTransferBufferSize() && i + j < size )
          {
             destination[ i + j ] = buffer[ j ];
             j++;

@@ -239,11 +239,11 @@ compare( const Element1* destination,
    TNL_ASSERT_TRUE( source, "Attempted to compare data through a nullptr." );
    TNL_ASSERT_GE( size, (Index) 0, "Array size must be non-negative." );
 #ifdef HAVE_CUDA
-   std::unique_ptr< Element2[] > host_buffer{ new Element2[ Devices::Cuda::getGPUTransferBufferSize() ] };
+   std::unique_ptr< Element2[] > host_buffer{ new Element2[ Cuda::getTransferBufferSize() ] };
    Index compared( 0 );
    while( compared < size )
    {
-      Index transfer = min( size - compared, Devices::Cuda::getGPUTransferBufferSize() );
+      Index transfer = min( size - compared, Cuda::getTransferBufferSize() );
       if( cudaMemcpy( (void*) host_buffer.get(),
                       (void*) &source[ compared ],
                       transfer * sizeof( Element2 ),

@@ -288,12 +288,12 @@ copy( DestinationElement* destination,
    }
    else
    {
-      std::unique_ptr< DestinationElement[] > buffer{ new DestinationElement[ Devices::Cuda::getGPUTransferBufferSize() ] };
+      std::unique_ptr< DestinationElement[] > buffer{ new DestinationElement[ Cuda::getTransferBufferSize() ] };
       Index i( 0 );
       while( i < size )
       {
          Index j( 0 );
-         while( j < Devices::Cuda::getGPUTransferBufferSize() && i + j < size )
+         while( j < Cuda::getTransferBufferSize() && i + j < size )
          {
            buffer[ j ] = source[ i + j ];
            j++;
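All four hunks above use the same staging pattern: a host-side buffer of Cuda::getTransferBufferSize() elements is allocated once and the data is moved through it chunk by chunk, so host scratch memory stays bounded regardless of the array size. A minimal standalone sketch of the device-to-host direction, assuming only the CUDA runtime API (copyChunked and the local getTransferBufferSize stand-in are illustrative names, not TNL code):

   #include <algorithm>
   #include <iostream>
   #include <memory>
   #include <cuda_runtime.h>

   // Illustrative stand-in for TNL's Cuda::getTransferBufferSize().
   inline constexpr int getTransferBufferSize()
   {
      return 1 << 20;
   }

   // Copies `size` elements from device memory to host memory, staging them
   // through a reusable buffer of 2^20 elements.
   template< typename Element, typename Index >
   void copyChunked( Element* destination, const Element* source, Index size )
   {
      std::unique_ptr< Element[] > buffer{ new Element[ getTransferBufferSize() ] };
      Index i = 0;
      while( i < size ) {
         const Index transfer = std::min( size - i, (Index) getTransferBufferSize() );
         if( cudaMemcpy( (void*) buffer.get(),
                         (const void*) &source[ i ],
                         transfer * sizeof( Element ),
                         cudaMemcpyDeviceToHost ) != cudaSuccess )
            std::cerr << "Transfer of data from CUDA device to host failed." << std::endl;
         // Hand the staged chunk over to its final host location; in the TNL
         // code above this step may also convert between element types.
         for( Index j = 0; j < transfer; j++ )
            destination[ i + j ] = buffer[ j ];
         i += transfer;
      }
   }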
src/TNL/Cuda/LaunchHelpers.h  +8 −0

@@ -30,6 +30,14 @@ inline constexpr int getWarpSize()
    return 32;
 }
 
+// When we transfer data between the GPU and the CPU we use 1 MiB buffer. This
+// size should ensure good performance.
+// We use the same buffer size even for retyping data during IO operations.
+inline constexpr int getTransferBufferSize()
+{
+   return 1 << 20;
+}
+
 #ifdef HAVE_CUDA
 __device__ inline int getGlobalThreadIdx( const int gridIdx = 0,
                                           const int gridSize = getMaxGridSize() )
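For call sites, the change is a rename plus a namespace move: the helper stays constexpr but now lives as a free function in the TNL::Cuda namespace instead of a static member of the Devices::Cuda class. A sketch of the difference, assuming code inside namespace TNL (the static_assert is illustrative only, not part of the patch):

   // Before this patch: static member of the Devices::Cuda class.
   // const int n = Devices::Cuda::getGPUTransferBufferSize();

   // After this patch: free function in the TNL::Cuda namespace.
   const int n = Cuda::getTransferBufferSize();

   // The function remains constexpr, so it can still size arrays and
   // appear in constant expressions.
   static_assert( Cuda::getTransferBufferSize() == (1 << 20),
                  "transfer buffer holds 2^20 elements" );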
src/TNL/Devices/Cuda.h  +0 −10

@@ -24,16 +24,6 @@ public:
    static inline bool setup( const Config::ParameterContainer& parameters,
                              const String& prefix = "" );
-
-   static inline constexpr int getGPUTransferBufferSize();
-
-   ////
-   // When we transfer data between the GPU and the CPU we use 5 MB buffer. This
-   // size should ensure good performance -- see.
-   // http://wiki.accelereyes.com/wiki/index.php/GPU_Memory_Transfer .
-   // We use the same buffer size even for retyping data during IO operations.
-   //
-   // static constexpr std::size_t TransferBufferSize = 5 * 2<<20;
 };
 
 } // namespace Devices
src/TNL/Devices/Cuda_impl.h  +0 −5

@@ -51,10 +51,5 @@ Cuda::setup( const Config::ParameterContainer& parameters,
    return true;
 }
 
-inline constexpr int Cuda::getGPUTransferBufferSize()
-{
-   return 1 << 20;
-}
-
 } // namespace Devices
 } // namespace TNL
src/TNL/File.h  +0 −8

@@ -168,14 +168,6 @@ class File
    std::fstream file;
    String fileName;
-
-   ////
-   // When we transfer data between the GPU and the CPU we use 5 MB buffer. This
-   // size should ensure good performance -- see.
-   // http://wiki.accelereyes.com/wiki/index.php/GPU_Memory_Transfer .
-   // We use the same buffer size even for retyping data during IO operations.
-   //
-   // static constexpr std::streamsize TransferBufferSize = 5 * 2<<20;
 };
 
 /**