Commit 07d4cb49 authored by Jakub Klinkovský's avatar Jakub Klinkovský
Browse files

Moved everything from Cuda.cpp and Cuda.cu into Cuda_impl.h

Having 4 different files (Cuda.h, Cuda_impl.h, Cuda.cpp, Cuda.cu) for
one class is not clear at all. On a related note, a class with 28
methods is clearly too long to be considered as well-designed, so it
should be split into several subclasses. Obviously, the full CUDA
support cannot be provided/wrapped by a single class so the design
should be more flexible.
parent 69488aac
Loading
Loading
Loading
Loading
+0 −2
Original line number Diff line number Diff line
@@ -8,7 +8,6 @@ set (headers Cuda.h

SET( CURRENT_DIR ${CMAKE_SOURCE_DIR}/src/TNL/Devices )
set( common_SOURCES
     ${CURRENT_DIR}/Cuda.cpp
     ${CURRENT_DIR}/Host.cpp 
     ${CURRENT_DIR}/MIC.cpp
     ${CURRENT_DIR}/SystemInfo.cpp )
@@ -16,7 +15,6 @@ set( common_SOURCES
IF( BUILD_CUDA )
   set( tnl_devices_CUDA__SOURCES
        ${common_SOURCES} 
        ${CURRENT_DIR}/Cuda.cu
        ${CURRENT_DIR}/CudaDeviceInfo.cu
        PARENT_SCOPE )
ENDIF()    

src/TNL/Devices/Cuda.cpp

deleted 100644 → 0
+0 −64
Original line number Diff line number Diff line
/***************************************************************************
                          Cuda.cpp  -  description
                             -------------------
    begin                : Jul 11, 2013
    copyright            : (C) 2013 by Tomas Oberhuber
    email                : tomas.oberhuber@fjfi.cvut.cz
 ***************************************************************************/

/* See Copyright Notice in tnl/Copyright */

#include <TNL/Devices/Cuda.h>
#include <TNL/Math.h>
#include <TNL/Devices/CudaDeviceInfo.h>

namespace TNL {
namespace Devices {

SmartPointersRegister Cuda::smartPointersRegister;
Timer Cuda::smartPointersSynchronizationTimer;

/****
 * Returns the string identifier of this device type.
 */
String Cuda::getDeviceType()
{
   const String deviceType( "Devices::Cuda" );
   return deviceType;
}

/****
 * Computes how many CUDA blocks of the given size are needed to cover
 * the requested number of threads (division rounded up).
 */
int Cuda::getNumberOfBlocks( const int threads,
                             const int blockSize )
{
   const int blocks = roundUpDivision( threads, blockSize );
   return blocks;
}

/****
 * Computes how many CUDA grids of the given size are needed to cover
 * the requested number of blocks (division rounded up).
 */
int Cuda::getNumberOfGrids( const int blocks,
                            const int gridSize )
{
   const int grids = roundUpDivision( blocks, gridSize );
   return grids;
}

// Registers a smart pointer under the currently active CUDA device id.
void Cuda::insertSmartPointer( SmartPointer* pointer )
{
   smartPointersRegister.insert( pointer, Devices::CudaDeviceInfo::getActiveDevice() );
}

// Unregisters a smart pointer from the register of the currently active CUDA device.
void Cuda::removeSmartPointer( SmartPointer* pointer )
{
   smartPointersRegister.remove( pointer, Devices::CudaDeviceInfo::getActiveDevice() );
}

/****
 * Synchronizes all smart pointers registered for the given CUDA device.
 * A negative deviceId means "use the currently active device". The time
 * spent synchronizing is accumulated in smartPointersSynchronizationTimer.
 */
bool Cuda::synchronizeDevice( int deviceId )
{
#ifdef HAVE_CUDA_UNIFIED_MEMORY
   // with unified memory there is nothing to synchronize explicitly
   return true;
#else
   if( deviceId < 0 )
      deviceId = Devices::CudaDeviceInfo::getActiveDevice();
   smartPointersSynchronizationTimer.start();
   const bool result = smartPointersRegister.synchronizeDevice( deviceId );
   smartPointersSynchronizationTimer.stop();
   return result;
#endif
}

} // namespace Devices
} // namespace TNL

src/TNL/Devices/Cuda.cu

deleted 100644 → 0
+0 −119
Original line number Diff line number Diff line
/***************************************************************************
                          Cuda.cu  -  description
                             -------------------
    begin                : Dec 22, 2014
    copyright            : (C) 2014 by Tomas Oberhuber
    email                : tomas.oberhuber@fjfi.cvut.cz
 ***************************************************************************/

/* See Copyright Notice in tnl/Copyright */

#include <TNL/Devices/Cuda.h>
#include <TNL/Exceptions/CudaRuntimeError.h>
#include <TNL/Config/ConfigDescription.h>
#include <TNL/Config/ParameterContainer.h>

namespace TNL {
namespace Devices {


// Computes the CUDA blocks count (ceil(threads/blockSize), at least 1 per
// dimension) and grids count (ceil(blocksCount/getMaxGridSize())) for the
// requested numbers of threads in the x, y and z dimensions.
void Cuda::setupThreads( const dim3& blockSize,
                         dim3& blocksCount,
                         dim3& gridsCount,
                         long long int xThreads,
                         long long int yThreads,
                         long long int zThreads )
{
   // ceil division written as quotient + (remainder != 0); max with 1
   // guarantees at least one block even for zero threads
   blocksCount.x = max( 1, xThreads / blockSize.x + ( xThreads % blockSize.x != 0 ) );
   blocksCount.y = max( 1, yThreads / blockSize.y + ( yThreads % blockSize.y != 0 ) );
   blocksCount.z = max( 1, zThreads / blockSize.z + ( zThreads % blockSize.z != 0 ) );
   
   /****
    * TODO: Fix the following:
    * I do not know how to get max grid size in kernels :(
    * 
    * Also, this is very slow. */
   /*int currentDevice( 0 );
   cudaGetDevice( currentDevice );
   cudaDeviceProp properties;
   cudaGetDeviceProperties( &properties, currentDevice );
   gridsCount.x = blocksCount.x / properties.maxGridSize[ 0 ] + ( blocksCount.x % properties.maxGridSize[ 0 ] != 0 );
   gridsCount.y = blocksCount.y / properties.maxGridSize[ 1 ] + ( blocksCount.y % properties.maxGridSize[ 1 ] != 0 );
   gridsCount.z = blocksCount.z / properties.maxGridSize[ 2 ] + ( blocksCount.z % properties.maxGridSize[ 2 ] != 0 );
   */
   // getMaxGridSize() is used instead of querying device properties, which
   // would be slow and is not available from device code
   gridsCount.x = blocksCount.x / getMaxGridSize() + ( blocksCount.x % getMaxGridSize() != 0 );
   gridsCount.y = blocksCount.y / getMaxGridSize() + ( blocksCount.y % getMaxGridSize() != 0 );
   gridsCount.z = blocksCount.z / getMaxGridSize() + ( blocksCount.z % getMaxGridSize() != 0 );
}

// Computes the size of the grid identified by gridIdx: every grid except
// the last one in each dimension is full-sized (getMaxGridSize()); the
// last grid receives the remaining blocks.
void Cuda::setupGrid( const dim3& blocksCount,
                      const dim3& gridsCount,
                      const dim3& gridIdx,
                      dim3& gridSize )
{
   /* TODO: this is extremely slow!!!!
   int currentDevice( 0 );
   cudaGetDevice( &currentDevice );
   cudaDeviceProp properties;
   cudaGetDeviceProperties( &properties, currentDevice );*/
 
   /****
    * TODO: fix the following
   if( gridIdx.x < gridsCount.x )
      gridSize.x = properties.maxGridSize[ 0 ];
   else
      gridSize.x = blocksCount.x % properties.maxGridSize[ 0 ];
   
   if( gridIdx.y < gridsCount.y )
      gridSize.y = properties.maxGridSize[ 1 ];
   else
      gridSize.y = blocksCount.y % properties.maxGridSize[ 1 ];

   if( gridIdx.z < gridsCount.z )
      gridSize.z = properties.maxGridSize[ 2 ];
   else
      gridSize.z = blocksCount.z % properties.maxGridSize[ 2 ];*/
   
   // NOTE(review): when blocksCount.x is an exact multiple of
   // getMaxGridSize(), the remainder below is 0, so the last grid gets
   // size 0 instead of getMaxGridSize() — looks like an edge-case bug;
   // verify against callers.
   if( gridIdx.x < gridsCount.x - 1 )
      gridSize.x = getMaxGridSize();
   else
      gridSize.x = blocksCount.x % getMaxGridSize();
   
   if( gridIdx.y < gridsCount.y - 1 )
      gridSize.y = getMaxGridSize();
   else
      gridSize.y = blocksCount.y % getMaxGridSize();

   if( gridIdx.z < gridsCount.z - 1 )
      gridSize.z = getMaxGridSize();
   else
      gridSize.z = blocksCount.z % getMaxGridSize();
}

/****
 * Prints a human-readable summary of the computed CUDA launch
 * configuration into the given output stream.
 */
void Cuda::printThreadsSetup( const dim3& blockSize,
                              const dim3& blocksCount,
                              const dim3& gridSize,
                              const dim3& gridsCount,
                              std::ostream& str )
{
   str << "Block size: " << blockSize << std::endl;
   str << " Blocks count: " << blocksCount << std::endl;
   str << " Grid size: " << gridSize << std::endl;
   str << " Grids count: " << gridsCount << std::endl;
}


/****
 * Throws Exceptions::CudaRuntimeError annotated with the call site
 * whenever the given CUDA status code is not cudaSuccess.
 */
void Cuda::checkDevice( const char* file_name, int line, cudaError error )
{
   if( error == cudaSuccess )
      return;
   throw Exceptions::CudaRuntimeError( error, file_name, line );
}

// Pretty-prints a dim3 as "( x, y, z )".
std::ostream& operator << ( std::ostream& str, const dim3& d )
{
   str << "( " << d.x;
   str << ", " << d.y;
   str << ", " << d.z;
   str << " )";
   return str;
}

} // namespace Devices
} // namespace TNL
+26 −24
Original line number Diff line number Diff line
@@ -31,7 +31,12 @@ class Cuda
{
   public:

   static String getDeviceType();
   static inline String getDeviceType();

   static inline void configSetup( Config::ConfigDescription& config, const String& prefix = "" );

   static inline bool setup( const Config::ParameterContainer& parameters,
                             const String& prefix = "" );

   __cuda_callable__ static inline constexpr int getMaxGridSize();

@@ -66,7 +71,7 @@ class Cuda
    * number of the CUDA threads and the block size.
    * It is obsolete and it will be replaced by setupThreads.
    */
   static int getNumberOfBlocks( const int threads,
   static inline int getNumberOfBlocks( const int threads,
                                        const int blockSize );

   /****
@@ -74,7 +79,7 @@ class Cuda
    * number of the CUDA blocks and maximum grid size.
    * It is obsolete and it will be replaced by setupThreads.
    */
   static int getNumberOfGrids( const int blocks,
   static inline int getNumberOfGrids( const int blocks,
                                       const int gridSize = getMaxGridSize() );
   
#ifdef HAVE_CUDA   
@@ -153,29 +158,24 @@ class Cuda
    * of calling cudaGetLastError() inside the method.
    * We recommend to use macro 'TNL_CHECK_CUDA_DEVICE' defined bellow.
    */
   static void checkDevice( const char* file_name, int line, cudaError error );
   static inline void checkDevice( const char* file_name, int line, cudaError error );
#else
   static void checkDevice() {}
   static inline void checkDevice() {}
#endif

   static void configSetup( Config::ConfigDescription& config, const String& prefix = "" );
      
   static bool setup( const Config::ParameterContainer& parameters,
                      const String& prefix = "" );
   
   static void insertSmartPointer( SmartPointer* pointer );
   static inline void insertSmartPointer( SmartPointer* pointer );

   static void removeSmartPointer( SmartPointer* pointer );
   static inline void removeSmartPointer( SmartPointer* pointer );

   // Negative deviceId means that CudaDeviceInfo::getActiveDevice will be
   // called to get the device ID.
   static bool synchronizeDevice( int deviceId = -1 );
   static inline bool synchronizeDevice( int deviceId = -1 );

   static Timer smartPointersSynchronizationTimer;
   static inline Timer& getSmartPointersSynchronizationTimer();

   protected:

   static SmartPointersRegister smartPointersRegister;
   static inline SmartPointersRegister& getSmartPointersRegister();
};

#ifdef HAVE_CUDA
@@ -185,7 +185,9 @@ class Cuda
#endif

#ifdef HAVE_CUDA
namespace {
   std::ostream& operator << ( std::ostream& str, const dim3& d );
}
#endif

#ifdef HAVE_CUDA
+178 −17
Original line number Diff line number Diff line
@@ -10,9 +10,12 @@

#pragma once

#include <TNL/Math.h>
#include <TNL/Devices/Cuda.h>
#include <TNL/Devices/CudaDeviceInfo.h>
#include <TNL/Exceptions/CudaBadAlloc.h>
#include <TNL/Exceptions/CudaSupportMissing.h>
#include <TNL/Exceptions/CudaRuntimeError.h>
#include <TNL/CudaSharedMemory.h>
#include <TNL/Config/ConfigDescription.h>
#include <TNL/Config/ParameterContainer.h>
@@ -20,6 +23,39 @@
namespace TNL {
namespace Devices {

// Returns the string identifier of this device type.
inline String Cuda::getDeviceType()
{
   return String( "Cuda" );
}

// Registers the "cuda-device" configuration entry (device id, default 0)
// under the given prefix; the help text notes when CUDA is unavailable.
inline void
Cuda::configSetup( Config::ConfigDescription& config,
                   const String& prefix )
{
#ifdef HAVE_CUDA
   config.addEntry< int >( prefix + "cuda-device", "Choose CUDA device to run the computation.", 0 );
#else
   config.addEntry< int >( prefix + "cuda-device", "Choose CUDA device to run the computation (not supported on this system).", 0 );
#endif
}

// Activates the CUDA device selected by the "cuda-device" parameter and
// resets the smart-pointer synchronization timer. Returns false when the
// device cannot be activated; always returns true when CUDA is disabled.
inline bool
Cuda::setup( const Config::ParameterContainer& parameters,
             const String& prefix )
{
#ifdef HAVE_CUDA
   int cudaDevice = parameters.getParameter< int >( prefix + "cuda-device" );
   if( cudaSetDevice( cudaDevice ) != cudaSuccess )
   {
      std::cerr << "I cannot activate CUDA device number " << cudaDevice << "." << std::endl;
      return false;
   }
   // reset + stop so the timer starts from zero and only runs during
   // synchronizeDevice() calls
   getSmartPointersSynchronizationTimer().reset();
   getSmartPointersSynchronizationTimer().stop();
#endif
   return true;
}

__cuda_callable__
inline constexpr int Cuda::getMaxGridSize()
{
@@ -71,6 +107,105 @@ __device__ inline int Cuda::getGlobalThreadIdx_z( const dim3& gridIdx )
}
#endif

// Number of CUDA blocks needed to cover `threads` threads with the given
// block size (division rounded up).
inline int Cuda::getNumberOfBlocks( const int threads,
                                    const int blockSize )
{
   return roundUpDivision( threads, blockSize );
}

// Number of CUDA grids needed to cover `blocks` blocks with the given
// grid size (division rounded up).
inline int Cuda::getNumberOfGrids( const int blocks,
                                   const int gridSize )
{
   return roundUpDivision( blocks, gridSize );
}

#ifdef HAVE_CUDA
/****
 * Computes the CUDA blocks count and grids count needed to cover the
 * requested numbers of threads in the x, y and z dimensions.
 *
 * Blocks counts are ceil(threads / blockSize), clamped to at least 1 so
 * that even a zero-thread dimension yields a valid launch configuration.
 * Grids counts are ceil(blocksCount / getMaxGridSize()).
 *
 * getMaxGridSize() is a compile-time constant: querying
 * cudaGetDeviceProperties here would be very slow and is not possible
 * from device code.
 */
inline void Cuda::setupThreads( const dim3& blockSize,
                                dim3& blocksCount,
                                dim3& gridsCount,
                                long long int xThreads,
                                long long int yThreads,
                                long long int zThreads )
{
   // ceil division written as quotient + (remainder != 0)
   blocksCount.x = max( 1, xThreads / blockSize.x + ( xThreads % blockSize.x != 0 ) );
   blocksCount.y = max( 1, yThreads / blockSize.y + ( yThreads % blockSize.y != 0 ) );
   blocksCount.z = max( 1, zThreads / blockSize.z + ( zThreads % blockSize.z != 0 ) );

   gridsCount.x = blocksCount.x / getMaxGridSize() + ( blocksCount.x % getMaxGridSize() != 0 );
   gridsCount.y = blocksCount.y / getMaxGridSize() + ( blocksCount.y % getMaxGridSize() != 0 );
   gridsCount.z = blocksCount.z / getMaxGridSize() + ( blocksCount.z % getMaxGridSize() != 0 );
}

/****
 * Computes the size of the grid identified by gridIdx: every grid except
 * the last one in each dimension is full-sized (getMaxGridSize()); the
 * last grid receives the remaining blocks.
 *
 * Fix: when blocksCount in some dimension is an exact multiple of
 * getMaxGridSize(), the remainder is 0 although the last grid is actually
 * full — in that case the grid size must be getMaxGridSize(), not 0.
 */
inline void Cuda::setupGrid( const dim3& blocksCount,
                             const dim3& gridsCount,
                             const dim3& gridIdx,
                             dim3& gridSize )
{
   if( gridIdx.x < gridsCount.x - 1 )
      gridSize.x = getMaxGridSize();
   else {
      gridSize.x = blocksCount.x % getMaxGridSize();
      if( gridSize.x == 0 )
         gridSize.x = getMaxGridSize();
   }

   if( gridIdx.y < gridsCount.y - 1 )
      gridSize.y = getMaxGridSize();
   else {
      gridSize.y = blocksCount.y % getMaxGridSize();
      if( gridSize.y == 0 )
         gridSize.y = getMaxGridSize();
   }

   if( gridIdx.z < gridsCount.z - 1 )
      gridSize.z = getMaxGridSize();
   else {
      gridSize.z = blocksCount.z % getMaxGridSize();
      if( gridSize.z == 0 )
         gridSize.z = getMaxGridSize();
   }
}

/****
 * Prints a human-readable summary of the computed CUDA launch
 * configuration into the given output stream.
 */
inline void Cuda::printThreadsSetup( const dim3& blockSize,
                                     const dim3& blocksCount,
                                     const dim3& gridSize,
                                     const dim3& gridsCount,
                                     std::ostream& str )
{
   str << "Block size: " << blockSize << std::endl;
   str << " Blocks count: " << blocksCount << std::endl;
   str << " Grid size: " << gridSize << std::endl;
   str << " Grids count: " << gridsCount << std::endl;
}
#endif


template< typename ObjectType >
ObjectType* Cuda::passToDevice( const ObjectType& object )
@@ -162,33 +297,59 @@ __device__ Element* Cuda::getSharedMemory()
   return CudaSharedMemory< Element >();
}

inline void
Cuda::configSetup( Config::ConfigDescription& config,
                   const String& prefix )
{
#ifdef HAVE_CUDA
   config.addEntry< int >( prefix + "cuda-device", "Choose CUDA device to run the computation.", 0 );
// Throws Exceptions::CudaRuntimeError annotated with the call site
// whenever the given CUDA status code is not cudaSuccess.
inline void Cuda::checkDevice( const char* file_name, int line, cudaError error )
{
   if( error != cudaSuccess )
      throw Exceptions::CudaRuntimeError( error, file_name, line );
}
#endif

// Registers a smart pointer under the currently active CUDA device id.
inline void Cuda::insertSmartPointer( SmartPointer* pointer )
{
   getSmartPointersRegister().insert( pointer, Devices::CudaDeviceInfo::getActiveDevice() );
}

// Unregisters a smart pointer from the register of the currently active CUDA device.
inline void Cuda::removeSmartPointer( SmartPointer* pointer )
{
   getSmartPointersRegister().remove( pointer, Devices::CudaDeviceInfo::getActiveDevice() );
}

/****
 * Synchronizes all smart pointers registered for the given CUDA device.
 * A negative deviceId means "use the currently active device". The time
 * spent synchronizing is accumulated in the smart-pointers
 * synchronization timer. With unified memory there is nothing to
 * synchronize explicitly.
 *
 * (A stray line from the removed Cuda::setup — `config.addEntry< int >(...)`
 * — was interleaved into this body by the diff rendering and has been
 * dropped; it does not belong to this function.)
 */
inline bool Cuda::synchronizeDevice( int deviceId )
{
#ifdef HAVE_CUDA_UNIFIED_MEMORY
   return true;
#else
   if( deviceId < 0 )
      deviceId = Devices::CudaDeviceInfo::getActiveDevice();
   getSmartPointersSynchronizationTimer().start();
   const bool result = getSmartPointersRegister().synchronizeDevice( deviceId );
   getSmartPointersSynchronizationTimer().stop();
   return result;
#endif
}

inline bool
Cuda::setup( const Config::ParameterContainer& parameters,
             const String& prefix )
// Accessor for the shared synchronization timer. The function-local
// static (Meyers singleton) replaces the former class-static member so
// the definition can live in a header without ODR/initialization-order
// issues.
inline Timer& Cuda::getSmartPointersSynchronizationTimer()
{
   static Timer timer;
   return timer;
}

// Accessor for the shared smart-pointers register. Function-local static
// (Meyers singleton) replacing the former class-static member, so the
// definition can live in a header.
inline SmartPointersRegister& Cuda::getSmartPointersRegister()
{
   static SmartPointersRegister reg;
   return reg;
}

#ifdef HAVE_CUDA
namespace {
   // Pretty-prints a dim3 as "( x, y, z )". Kept in an anonymous
   // namespace so each translation unit including this header gets its
   // own internal-linkage definition.
   // (This span was corrupted by interleaved removed-diff lines from the
   // old Cuda::setup; reconstructed here as the clean added block.)
   std::ostream& operator << ( std::ostream& str, const dim3& d )
   {
      str << "( " << d.x << ", " << d.y << ", " << d.z << " )";
      return str;
   }
}
#endif

// double-precision atomicAdd function for Maxwell and older GPUs
// copied from: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomic-functions