Implementing the Ellpack format in CUDA. (3e00d6f0) · Commits · TNL / tnl-dev

src/core/mfuncs.h

+10 −0

Original line number	Diff line number	Diff line
		@@ -79,6 +79,16 @@ inline int roundUpDivision( const int num, const int div )
		{
		return num / div + ( num % div != 0 );
		}

		#ifdef HAVE_CUDA
		__device__ __host__
		#endif
		inline int roundToMultiple( int number, int multiple )
		{
		return multiple*( number/ multiple + ( number % multiple != 0 ) );
		}


		/*template< typename T >
		void swap( T& a, T& b)
		{

src/core/tnlCuda.h

+9 −0

Original line number	Diff line number	Diff line
		@@ -28,6 +28,8 @@ class tnlCuda
		{
		public:

		enum { DeviceType = tnlCudaDevice };

		static tnlString getDeviceType();

		#ifdef HAVE_CUDA
		@@ -63,6 +65,13 @@ static inline int getWarpSize();
		template< typename ObjectType >
		static ObjectType* passToDevice( const ObjectType& object );

		template< typename ObjectType >
		static ObjectType passFromDevice( const ObjectType& object );

		template< typename ObjectType >
		static void passFromDevice( const ObjectType& deviceObject,
		ObjectType& hostObject );

		template< typename ObjectType >
		static void freeFromDevice( ObjectType* object );

src/core/tnlHost.h

+2 −0

Original line number	Diff line number	Diff line
		@@ -26,6 +26,8 @@ class tnlHost
		{
		public:

		enum { DeviceType = tnlHostDevice };

		static tnlString getDeviceType();

		#ifdef HAVE_CUDA

src/implementation/core/tnlCuda_impl.h

+23 −1

Original line number	Diff line number	Diff line
		@@ -64,7 +64,6 @@ inline int tnlCuda::getNumberOfSharedMemoryBanks()
		return 32;
		}


		template< typename ObjectType >
		ObjectType* tnlCuda::passToDevice( const ObjectType& object )
		{
		@@ -87,6 +86,29 @@ ObjectType* tnlCuda::passToDevice( const ObjectType& object )
		return deviceObject;
		}

		template< typename ObjectType >
		ObjectType tnlCuda::passFromDevice( const ObjectType& object )
		{
		ObjectType aux;
		cudaMemcpy( ( void* ) &aux,
		( void* ) &object,
		sizeof( ObjectType ),
		cudaMemcpyDeviceToHost );
		checkCudaDevice;
		return aux;
		}

		template< typename ObjectType >
		void tnlCuda::passFromDevice( const ObjectType& deviceObject,
		ObjectType& hostObject )
		{
		cudaMemcpy( ( void* ) &hostObject,
		( void* ) &deviceObject,
		sizeof( ObjectType ),
		cudaMemcpyDeviceToHost );
		checkCudaDevice;
		}

		template< typename ObjectType >
		void tnlCuda::freeFromDevice( ObjectType* deviceObject )
		{

src/implementation/matrices/tnlDenseMatrix_impl.h

+3 −23

Original line number	Diff line number	Diff line
		@@ -352,7 +352,7 @@ typename Vector::RealType tnlDenseMatrix< Real, Device, Index >::rowVectorProduc
		return sum;
		}

		#ifdef HAVE_CUDA
		/*#ifdef HAVE_CUDA
		template< typename Real,
		typename Index,
		typename Vector >
		@@ -365,7 +365,7 @@ __global__ void tnlDenseMatrixVectorProductCudaKernel( tnlDenseMatrix< Real, tnl
		if( rowIdx < matrix->getRows() )
		( outVector )[ rowIdx ] = matrix->rowVectorProduct( rowIdx, inVector );
		}
		#endif
		#endif*/

		template< typename Real,
		typename Device,
		@@ -389,27 +389,7 @@ void tnlDenseMatrix< Real, Device, Index >::vectorProduct( const Vector& inVecto
		for( IndexType row = 0; row < this->getRows(); row++ )
		outVector[ row ] = rowVectorProduct( row, inVector );
		if( Device::getDevice() == tnlCudaDevice )
		{
		#ifdef HAVE_CUDA
		ThisType* kernel_this = tnlCuda::passToDevice( *this );
		Vector* kernel_inVector = tnlCuda::passToDevice( inVector );
		Vector* kernel_outVector = tnlCuda::passToDevice( outVector );
		dim3 cudaBlockSize( 256 ), cudaGridSize( tnlCuda::getMaxGridSize() );
		const IndexType cudaBlocks = roundUpDivision( this->getRows(), cudaBlockSize.x );
		const IndexType cudaGrids = roundUpDivision( cudaBlocks, tnlCuda::getMaxGridSize() );
		for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ )
		{
		if( gridIdx == cudaGrids - 1 )
		cudaGridSize.x = cudaBlocks % tnlCuda::getMaxGridSize();
		tnlDenseMatrixVectorProductCudaKernel<<< cudaGridSize, cudaBlockSize >>>
		( kernel_this, kernel_inVector, kernel_outVector, gridIdx );
		}
		tnlCuda::freeFromDevice( kernel_this );
		tnlCuda::freeFromDevice( kernel_inVector );
		tnlCuda::freeFromDevice( kernel_outVector );
		checkCudaDevice;
		#endif
		}
		tnlMatrixVectorProductCuda( *this, inVector, outVector );
		}

		template< typename Real,