Improving data transfers between the GPU and the CPU. (91ab8e1a) · Commits · TNL / tnl-dev

src/core/tnlCuda.h

+21 −89

Original line number	Diff line number	Diff line
		@@ -27,57 +27,29 @@ class tnlCuda
		{
		public:

		static tnlString getDeviceType()
		{
		return tnlString( "tnlCuda" );
		}
		static tnlString getDeviceType();

		static tnlDeviceEnum getDevice()
		{
		return tnlCudaDevice;
		};
		static tnlDeviceEnum getDevice();

		template< typename Element, typename Index >
		static void allocateMemory( Element*& data, const Index size )
		{
		allocateMemoryCuda( data, size );
		}
		static void allocateMemory( Element*& data, const Index size );

		template< typename Element >
		static void freeMemory( Element* data )
		{
		freeMemoryCuda( data );
		}
		static void freeMemory( Element* data );


		template< typename Element >
		static void setMemoryElement( Element* data,
		const Element& value )
		{
		setMemoryCuda( data, value, 1 );
		}
		const Element& value );

		template< typename Element >
		static Element getMemoryElement( const Element* data )
		{
		Element result;
		copyMemoryCudaToHost( &result, data, 1 );
		return result;
		}
		static Element getMemoryElement( const Element* data );

		template< typename Element, typename Index >
		static Element& getArrayElementReference( Element* data, const Index i )
		{
		tnlAssert( false, );
		abort();
		}
		static Element& getArrayElementReference( Element* data, const Index i );

		template< typename Element, typename Index >
		static const Element& getArrayElementReference(const Element* data, const Index i )
		{
		tnlAssert( false, );
		abort();
		}
		static const Element& getArrayElementReference(const Element* data, const Index i );

		template< typename DestinationElement,
		typename SourceElement,
		@@ -85,76 +57,36 @@ class tnlCuda
		typename Device >
		static bool memcpy( DestinationElement* destination,
		const SourceElement* source,
		const Index size )
		{
		switch( Device :: getDevice() )
		{
		case tnlHostDevice:
		return copyMemoryHostToCuda( destination, source, size );
		case tnlCudaDevice:
		return copyMemoryCudaToCuda( destination, source, size );
		}
		return true;
		}

		const Index size );

		template< typename Element, typename Index, typename Device >
		static bool memcpy( Element* destination,
		const Element* source,
		const Index size )
		{
		return tnlCuda :: memcpy< Element, Element, Index, Device >
		( destination,
		source,
		size );
		}
		const Index size );

		template< typename Element, typename Index, typename Device >
		static bool memcmp( const Element* data1,
		const Element* data2,
		const Index size )
		{
		switch( Device :: getDevice() )
		{
		case tnlHostDevice:
		return compareMemoryHostCuda( data2, data1, size );
		case tnlCudaDevice:
		return compareMemoryCuda( data1, data2, size );
		}
		}
		const Index size );

		template< typename Element, typename Index >
		static bool memset( Element* destination,
		const Element& value,
		const Index size )
		{
		return setMemoryCuda( destination, value, size );
		}
		const Index size );

		static int getMaxGridSize()
		{
		return maxGridSize;
		}
		static int getMaxGridSize();

		static void setMaxGridSize( int newMaxGridSize )
		{
		maxGridSize = newMaxGridSize;
		}
		static void setMaxGridSize( int newMaxGridSize );

		static int getMaxBlockSize()
		{
		return maxBlockSize;
		}
		static int getMaxBlockSize();

		static void setMaxBlockSize( int newMaxBlockSize )
		{
		maxBlockSize = newMaxBlockSize;
		}
		static void setMaxBlockSize( int newMaxBlockSize );

		protected:

		static int maxGridSize, maxBlockSize;
		};

		#include <implementation/core/tnlCuda_impl.h>

		#endif /* TNLCUDA_H_ */

src/implementation/core/CMakeLists.txt

+1 −0

Original line number	Diff line number	Diff line
		@@ -3,6 +3,7 @@ ADD_SUBDIRECTORY( cuda )
		SET( headers vector-operations.h
		memory-operations.h
		tnlArray_impl.h
		tnlCuda_impl.h
		tnlHost_impl.h
		tnlLogger_impl.h
		tnlMultiArray1D_impl.h

src/implementation/core/memory-operations.h

+70 −27

Original line number	Diff line number	Diff line
		@@ -87,11 +87,11 @@ __global__ void setVectorValueCudaKernel( Element* data,
		const Element value )
		{
		Index elementIdx = blockDim. x * blockIdx. x + threadIdx. x;
		const Index gridSize = blockDim. x * gridDim. x;
		const Index maxGridSize = blockDim. x * gridDim. x;
		while( elementIdx < size )
		{
		data[ elementIdx ] = value;
		elementIdx += gridSize;
		elementIdx += maxGridSize;
		}
		}
		#endif
		@@ -159,6 +159,42 @@ bool copyMemoryHostToCuda( Element* destination,
		#endif
		}

		template< typename DestinationElement,
		typename SourceElement,
		typename Index >
		bool copyMemoryHostToCuda( DestinationElement* destination,
		const SourceElement* source,
		const Index size )
		{
		#ifdef HAVE_CUDA
		DestinationElement* buffer = new DestinationElement[ tnlGPUvsCPUTransferBufferSize ];
		if( ! buffer )
		{
		cerr << "Unable to allocate supporting buffer to transfer data between the CUDA device and the host." << endl;
		return false;
		}
		Index i( 0 );
		while( i < size )
		{
		Index j( 0 );
		while( j < tnlGPUvsCPUTransferBufferSize && i + j < size )
		buffer[ j ] = source[ i + j++ ];
		if( ! copyMemoryHostTuCuda( buffer,
		&destination[ i ],
		j ) )
		{
		delete[] buffer;
		return false;
		}
		i += j;
		}
		delete[] buffer;
		return true;
		#else
		cerr << "CUDA support is missing on this system " << __FILE__ << " line " << __LINE__ << "." << endl;
		return false;
		#endif
		}

		template< typename Element, typename Index >
		bool copyMemoryCudaToHost( Element* destination,
		@@ -189,22 +225,29 @@ bool copyMemoryCudaToHost( DestinationElement* destination,
		const SourceElement* source,
		const Index size )
		{
		#ifdef HAVE_CUDA
		abort(); // TODO: fix this
		cudaMemcpy( destination,
		source,
		size * sizeof( SourceElement ),
		cudaMemcpyDeviceToHost );
		if( ! checkCudaDevice )
		SourceElement* buffer = new SourceElement[ tnlGPUvsCPUTransferBufferSize ];
		if( ! buffer )
		{
		cerr << "Transfer of data from CUDA device to host failed." << endl;
		cerr << "Unable to allocate supporting buffer to transfer data between the CUDA device and the host." << endl;
		return false;
		}
		return true;
		#else
		cerr << "CUDA support is missing on this system " << __FILE__ << " line " << __LINE__ << "." << endl;
		Index i( 0 );
		while( i < size )
		{
		if( ! copyMemoryCudaToHost( &source[ i ],
		buffer,
		Min( size - i, tnlGPUvsCPUTransferBufferSize ) ) )
		{
		delete[] buffer;
		return false;
		#endif
		}
		Index j( 0 );
		while( j < tnlGPUvsCPUTransferBufferSize && i + j < size )
		destination[ i + j ] = buffer[ j++ ];
		i += j;
		}
		delete[] buffer;
		return true;
		}


		@@ -363,19 +406,19 @@ extern template bool setMemoryHost( float* data, const float& value, const long
		extern template bool setMemoryHost( double* data, const double& value, const long int size );
		extern template bool setMemoryHost( long double* data, const long double& value, const long int size );

		extern template bool setMemoryCuda( char* data, const char& value, const int size );
		extern template bool setMemoryCuda( int* data, const int& value, const int size );
		extern template bool setMemoryCuda( long int* data, const long int& value, const int size );
		extern template bool setMemoryCuda( float* data, const float& value, const int size );
		extern template bool setMemoryCuda( double* data, const double& value, const int size );
		extern template bool setMemoryCuda( long double* data, const long double& value, const int size );

		extern template bool setMemoryCuda( char* data, const char& value, const long int size );
		extern template bool setMemoryCuda( int* data, const int& value, const long int size );
		extern template bool setMemoryCuda( long int* data, const long int& value, const long int size );
		extern template bool setMemoryCuda( float* data, const float& value, const long int size );
		extern template bool setMemoryCuda( double* data, const double& value, const long int size );
		extern template bool setMemoryCuda( long double* data, const long double& value, const long int size );
		extern template bool setMemoryCuda( char* data, const char& value, const int size, const int maxGridSize );
		extern template bool setMemoryCuda( int* data, const int& value, const int size, const int maxGridSize );
		extern template bool setMemoryCuda( long int* data, const long int& value, const int size, const int maxGridSize );
		extern template bool setMemoryCuda( float* data, const float& value, const int size, const int maxGridSize );
		extern template bool setMemoryCuda( double* data, const double& value, const int size, const int maxGridSize );
		extern template bool setMemoryCuda( long double* data, const long double& value, const int size, const int maxGridSize );

		extern template bool setMemoryCuda( char* data, const char& value, const long int size, const int maxGridSize );
		extern template bool setMemoryCuda( int* data, const int& value, const long int size, const int maxGridSize );
		extern template bool setMemoryCuda( long int* data, const long int& value, const long int size, const int maxGridSize );
		extern template bool setMemoryCuda( float* data, const float& value, const long int size, const int maxGridSize );
		extern template bool setMemoryCuda( double* data, const double& value, const long int size, const int maxGridSize );
		extern template bool setMemoryCuda( long double* data, const long double& value, const long int size, const int maxGridSize );

		extern template bool copyMemoryHostToHost( char* destination, const char* source, const int size );
		extern template bool copyMemoryHostToHost( int* destination, const int* source, const int size );

src/implementation/core/memory-operations_impl.cu

+14 −14

Original line number	Diff line number	Diff line
		@@ -75,19 +75,19 @@ template bool setMemoryHost( float* data, const float& value, const long int siz
		template bool setMemoryHost( double* data, const double& value, const long int size );
		template bool setMemoryHost( long double* data, const long double& value, const long int size );

		template bool setMemoryCuda( char* data, const char& value, const int size );
		template bool setMemoryCuda( int* data, const int& value, const int size );
		template bool setMemoryCuda( long int* data, const long int& value, const int size );
		template bool setMemoryCuda( float* data, const float& value, const int size );
		template bool setMemoryCuda( double* data, const double& value, const int size );
		template bool setMemoryCuda( long double* data, const long double& value, const int size );

		template bool setMemoryCuda( char* data, const char& value, const long int size );
		template bool setMemoryCuda( int* data, const int& value, const long int size );
		template bool setMemoryCuda( long int* data, const long int& value, const long int size );
		template bool setMemoryCuda( float* data, const float& value, const long int size );
		template bool setMemoryCuda( double* data, const double& value, const long int size );
		template bool setMemoryCuda( long double* data, const long double& value, const long int size );
		template bool setMemoryCuda( char* data, const char& value, const int size, const int maxGridSize );
		template bool setMemoryCuda( int* data, const int& value, const int size, const int maxGridSize );
		template bool setMemoryCuda( long int* data, const long int& value, const int size, const int maxGridSize );
		template bool setMemoryCuda( float* data, const float& value, const int size, const int maxGridSize );
		template bool setMemoryCuda( double* data, const double& value, const int size, const int maxGridSize );
		template bool setMemoryCuda( long double* data, const long double& value, const int size, const int maxGridSize );

		template bool setMemoryCuda( char* data, const char& value, const long int size, const int maxGridSize );
		template bool setMemoryCuda( int* data, const int& value, const long int size, const int maxGridSize );
		template bool setMemoryCuda( long int* data, const long int& value, const long int size, const int maxGridSize );
		template bool setMemoryCuda( float* data, const float& value, const long int size, const int maxGridSize );
		template bool setMemoryCuda( double* data, const double& value, const long int size, const int maxGridSize );
		template bool setMemoryCuda( long double* data, const long double& value, const long int size, const int maxGridSize );

		template bool copyMemoryHostToHost( char* destination, const char* source, const int size );
		template bool copyMemoryHostToHost( int* destination, const int* source, const int size );

src/implementation/core/tnlCuda_impl.h

0 → 100644

+146 −0

Original line number	Diff line number	Diff line
		/***************************************************************************
		tnlCuda_impl.h - description
		-------------------
		begin : Jul 11, 2013
		copyright : (C) 2013 by Tomas Oberhuber
		email : tomas.oberhuber@fjfi.cvut.cz
		***************************************************************************/

		/***************************************************************************
		* *
		* This program is free software; you can redistribute it and/or modify *
		* it under the terms of the GNU General Public License as published by *
		* the Free Software Foundation; either version 2 of the License, or *
		* (at your option) any later version. *
		* *
		***************************************************************************/

		#ifndef TNLCUDA_IMPL_H_
		#define TNLCUDA_IMPL_H_

		inline tnlString tnlCuda :: getDeviceType()
		{
		return tnlString( "tnlCuda" );
		}

		inline tnlDeviceEnum tnlCuda :: getDevice()
		{
		return tnlCudaDevice;
		};

		template< typename Element, typename Index >
		void tnlCuda :: allocateMemory( Element*& data, const Index size )
		{
		allocateMemoryCuda( data, size );
		}

		template< typename Element >
		void tnlCuda :: freeMemory( Element* data )
		{
		freeMemoryCuda( data );
		}


		template< typename Element >
		void tnlCuda :: setMemoryElement( Element* data,
		const Element& value )
		{
		setMemoryCuda( data, value, 1, maxGridSize );
		}

		template< typename Element >
		Element tnlCuda :: getMemoryElement( const Element* data )
		{
		Element result;
		copyMemoryCudaToHost( &result, data, 1 );
		return result;
		}

		template< typename Element, typename Index >
		Element& tnlCuda :: getArrayElementReference( Element* data, const Index i )
		{
		tnlAssert( false, );
		abort();
		}

		template< typename Element, typename Index >
		const Element& tnlCuda :: getArrayElementReference(const Element* data, const Index i )
		{
		tnlAssert( false, );
		abort();
		}

		template< typename DestinationElement,
		typename SourceElement,
		typename Index,
		typename Device >
		bool tnlCuda :: memcpy( DestinationElement* destination,
		const SourceElement* source,
		const Index size )
		{
		switch( Device :: getDevice() )
		{
		case tnlHostDevice:
		return copyMemoryHostToCuda( destination, source, size );
		case tnlCudaDevice:
		return copyMemoryCudaToCuda( destination, source, size );
		}
		return true;
		}


		template< typename Element, typename Index, typename Device >
		bool tnlCuda :: memcpy( Element* destination,
		const Element* source,
		const Index size )
		{
		return tnlCuda :: memcpy< Element, Element, Index, Device >
		( destination,
		source,
		size );
		}

		template< typename Element, typename Index, typename Device >
		bool tnlCuda :: memcmp( const Element* data1,
		const Element* data2,
		const Index size )
		{
		switch( Device :: getDevice() )
		{
		case tnlHostDevice:
		return compareMemoryHostCuda( data2, data1, size );
		case tnlCudaDevice:
		return compareMemoryCuda( data1, data2, size );
		}
		}

		template< typename Element, typename Index >
		bool tnlCuda :: memset( Element* destination,
		const Element& value,
		const Index size )
		{
		return setMemoryCuda( destination, value, size, maxGridSize );
		}

		inline int tnlCuda :: getMaxGridSize()
		{
		return maxGridSize;
		}

		inline void tnlCuda :: setMaxGridSize( int newMaxGridSize )
		{
		maxGridSize = newMaxGridSize;
		}

		inline int tnlCuda :: getMaxBlockSize()
		{
		return maxBlockSize;
		}

		inline void tnlCuda :: setMaxBlockSize( int newMaxBlockSize )
		{
		maxBlockSize = newMaxBlockSize;
		}


		#endif /* TNLCUDA_IMPL_H_ */