Commit 1743358a authored by Jakub Klinkovský's avatar Jakub Klinkovský
Browse files

Moved synchronization of smart pointers from Devices::Cuda into TNL::Pointers...

Moved synchronization of smart pointers from Devices::Cuda into TNL::Pointers namespace as free functions

synchronizeDevice() was renamed to synchronizeSmartPointersOnDevice()
for clarity — CUDA itself has many similarly named functions (e.g.
cudaDeviceSynchronize()), so the new name avoids confusion.
parent 2d5176fb
Loading
Loading
Loading
Loading
+2 −2
Original line number Diff line number Diff line
@@ -490,7 +490,7 @@ getExplicitUpdate( const RealType& time,
         
         //std::cerr << "Setting boundary conditions..." << std::endl;

         Devices::Cuda::synchronizeDevice();
         Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
         for( IndexType gridYIdx = 0; gridYIdx < cudaYGrids; gridYIdx ++ )
            for( IndexType gridXIdx = 0; gridXIdx < cudaXGrids; gridXIdx ++ )
               boundaryConditionsTemplatedCompact< MeshType, CellType, BoundaryCondition, MeshFunctionType >
@@ -594,7 +594,7 @@ getExplicitUpdate( const RealType& time,
                               gridYSize / 16 + ( gridYSize % 16 != 0 ) );
            */

            TNL::Devices::Cuda::synchronizeDevice();
            Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
            int cudaErr;
            Meshes::Traverser< MeshType, Cell > meshTraverser;
            meshTraverser.template processInteriorEntities< UserData,
+2 −2
Original line number Diff line number Diff line
@@ -246,7 +246,7 @@ processEntities(
      IndexType cudaThreadsCount = 2 * ( end.x() - begin.x() + end.y() - begin.y() + 1 );
      Cuda::setupThreads( cudaBlockSize, cudaBlocksCount, cudaGridsCount, cudaThreadsCount );
      dim3 gridIdx, cudaGridSize;
      Devices::Cuda::synchronizeDevice();
      Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
      for( gridIdx.x = 0; gridIdx.x < cudaGridsCount.x; gridIdx.x++ )
      {
         Cuda::setupGrid( cudaBlocksCount, cudaGridsCount, gridIdx, cudaGridSize );
@@ -273,7 +273,7 @@ processEntities(
      auto& pool = Cuda::StreamPool::getInstance();
      const cudaStream_t& s = pool.getStream( stream );

      Devices::Cuda::synchronizeDevice();
      Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
      dim3 gridIdx, cudaGridSize;
      for( gridIdx.y = 0; gridIdx.y < cudaGridsCount.y; gridIdx.y ++ )
         for( gridIdx.x = 0; gridIdx.x < cudaGridsCount.x; gridIdx.x ++ )
+1 −1
Original line number Diff line number Diff line
@@ -130,7 +130,7 @@ benchmarkIterativeSolvers( Benchmark& benchmark,
   *cudaMatrixPointer = *matrixPointer;

   // synchronize shared pointers
   Devices::Cuda::synchronizeDevice();
   Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
#endif

   using namespace Solvers::Linear;
+0 −21
Original line number Diff line number Diff line
@@ -10,13 +10,7 @@

#pragma once

#include <iostream>

#include <TNL/String.h>
#include <TNL/Assert.h>
#include <TNL/Pointers/SmartPointersRegister.h>
#include <TNL/Timer.h>
#include <TNL/Cuda/CudaCallable.h>
#include <TNL/Config/ConfigDescription.h>
#include <TNL/Config/ParameterContainer.h>

@@ -33,16 +27,6 @@ public:

   static inline constexpr int getGPUTransferBufferSize();

   static inline void insertSmartPointer( Pointers::SmartPointer* pointer );

   static inline void removeSmartPointer( Pointers::SmartPointer* pointer );

   // Negative deviceId means that CudaDeviceInfo::getActiveDevice will be
   // called to get the device ID.
   static inline bool synchronizeDevice( int deviceId = -1 );

   static inline Timer& getSmartPointersSynchronizationTimer();

   ////
   // When we transfer data between the GPU and the CPU we use 5 MB buffer. This
   // size should ensure good performance -- see.
@@ -50,11 +34,6 @@ public:
   // We use the same buffer size even for retyping data during IO operations.
   //
   static constexpr std::size_t TransferBufferSize = 5 * 2<<20;


   protected:

   static inline Pointers::SmartPointersRegister& getSmartPointersRegister();
};

#ifdef HAVE_CUDA
+5 −38
Original line number Diff line number Diff line
@@ -10,12 +10,15 @@

#pragma once

#include <iostream>

#include <TNL/Math.h>
#include <TNL/Devices/Cuda.h>
#include <TNL/Cuda/DeviceInfo.h>
#include <TNL/Exceptions/CudaBadAlloc.h>
#include <TNL/Exceptions/CudaSupportMissing.h>
#include <TNL/Exceptions/CudaRuntimeError.h>
#include <TNL/Pointers/SmartPointersRegister.h>

namespace TNL {
namespace Devices {
@@ -42,8 +45,8 @@ Cuda::setup( const Config::ParameterContainer& parameters,
      std::cerr << "I cannot activate CUDA device number " << cudaDevice << "." << std::endl;
      return false;
   }
   getSmartPointersSynchronizationTimer().reset();
   getSmartPointersSynchronizationTimer().stop();
   Pointers::getSmartPointersSynchronizationTimer< Devices::Cuda >().reset();
   Pointers::getSmartPointersSynchronizationTimer< Devices::Cuda >().stop();
#endif
   return true;
}
@@ -53,42 +56,6 @@ inline constexpr int Cuda::getGPUTransferBufferSize()
   return 1 << 20;
}

inline void Cuda::insertSmartPointer( Pointers::SmartPointer* pointer )
{
   getSmartPointersRegister().insert( pointer, TNL::Cuda::DeviceInfo::getActiveDevice() );
}

inline void Cuda::removeSmartPointer( Pointers::SmartPointer* pointer )
{
   getSmartPointersRegister().remove( pointer, TNL::Cuda::DeviceInfo::getActiveDevice() );
}

inline bool Cuda::synchronizeDevice( int deviceId )
{
#ifdef HAVE_CUDA
   if( deviceId < 0 )
      deviceId = TNL::Cuda::DeviceInfo::getActiveDevice();
   getSmartPointersSynchronizationTimer().start();
   bool b = getSmartPointersRegister().synchronizeDevice( deviceId );
   getSmartPointersSynchronizationTimer().stop();
   return b;
#else
   return true;
#endif
}

inline Timer& Cuda::getSmartPointersSynchronizationTimer()
{
   static Timer timer;
   return timer;
}

inline Pointers::SmartPointersRegister& Cuda::getSmartPointersRegister()
{
   static Pointers::SmartPointersRegister reg;
   return reg;
}

// double-precision atomicAdd function for Maxwell and older GPUs
// copied from: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomic-functions
#ifdef HAVE_CUDA
Loading