Implementing CUDA reduction. (d70edfe8) · Commits · TNL / tnl-dev

src/core/Makefile.am

+3 −0

Original line number	Diff line number	Diff line
		@@ -37,6 +37,8 @@ headers = tnlAssert.h \
		tnlTimerRT.h \
		tnlTester.h \
		tnlVector.h \
		tnl-cuda-kernels.h \
		tnlCUDAKernelsTester.h \
		compress-file.h \
		mfilename.h \
		mfuncs.h \
		@@ -53,6 +55,7 @@ sources = tnlConfigDescription.cpp \
		tnlTimerCPU.cpp \
		tnlTimerRT.cpp \
		tnlTester.cpp \
		tnl-cuda-kernels.cu \
		compress-file.cpp \
		mfilename.cpp \
		mpi-supp.cpp \

src/core/tnl-cuda-kernels.cu

0 → 100644

+93 −0

Original line number	Diff line number	Diff line
		/***************************************************************************
		tnl-cuda-kernels.cu
		-------------------
		begin : Jan 14, 2010
		copyright : (C) 2009 by Tomas Oberhuber
		email : tomas.oberhuber@fjfi.cvut.cz
		***************************************************************************/

		/***************************************************************************
		* *
		* This program is free software; you can redistribute it and/or modify *
		* it under the terms of the GNU General Public License as published by *
		* the Free Software Foundation; either version 2 of the License, or *
		* (at your option) any later version. *
		* *
		***************************************************************************/

		#include <tnl-cuda-kernels.h>

		int tnlCUDAReductionMin( const int size,
		const int block_size,
		const int grid_size,
		const int* input )
		{
		return tnlCUDAReduction< int, tnlMin >( size, block_size, grid_size, input );
		}

		int tnlCUDAReductionMax( const int size,
		const int block_size,
		const int grid_size,
		const int* input )
		{
		return tnlCUDAReduction< int, tnlMax >( size, block_size, grid_size, input );
		}

		int tnlCUDAReductionSum( const int size,
		const int block_size,
		const int grid_size,
		const int* input )
		{
		return tnlCUDAReduction< int, tnlSum >( size, block_size, grid_size, input );
		}


		float tnlCUDAReductionMin( const int size,
		const int block_size,
		const int grid_size,
		const float* input )
		{
		return tnlCUDAReduction< float, tnlMin >( size, block_size, grid_size, input );
		}

		float tnlCUDAReductionMax( const int size,
		const int block_size,
		const int grid_size,
		const float* input )
		{
		return tnlCUDAReduction< float, tnlMax >( size, block_size, grid_size, input );
		}

		float tnlCUDAReductionSum( const int size,
		const int block_size,
		const int grid_size,
		const float* input )
		{
		return tnlCUDAReduction< float, tnlSum >( size, block_size, grid_size, input );
		}

		double tnlCUDAReductionMin( const int size,
		const int block_size,
		const int grid_size,
		const double* input )
		{
		return tnlCUDAReduction< double, tnlMin >( size, block_size, grid_size, input );
		}

		double tnlCUDAReductionMax( const int size,
		const int block_size,
		const int grid_size,
		const double* input )
		{
		return tnlCUDAReduction< double, tnlMax >( size, block_size, grid_size, input );
		}

		double tnlCUDAReductionSum( const int size,
		const int block_size,
		const int grid_size,
		const double* input )
		{
		return tnlCUDAReduction< double, tnlSum >( size, block_size, grid_size, input );
		}

src/core/tnl-cuda-kernels.h

0 → 100644

+240 −0

Original line number	Diff line number	Diff line
		/***************************************************************************
		tnl-cuda-kernels.h
		-------------------
		begin : Jan 14, 2010
		copyright : (C) 2009 by Tomas Oberhuber
		email : tomas.oberhuber@fjfi.cvut.cz
		***************************************************************************/

		/***************************************************************************
		* *
		* This program is free software; you can redistribute it and/or modify *
		* it under the terms of the GNU General Public License as published by *
		* the Free Software Foundation; either version 2 of the License, or *
		* (at your option) any later version. *
		* *
		***************************************************************************/

		#ifndef TNLCUDAKERNELS_H_
		#define TNLCUDAKERNELS_H_

		#include <core/tnlAssert.h>

		using namespace std;

		enum tnlOperation { tnlMin, tnlMax, tnlSum };

		#ifdef HAVE_CUDA

		template< class T > __device__ T tnlCudaMin( const T& a,
		const T& b )
		{
		return a < b ? a : b;
		}

		template< class T > __device__ T tnlCudaMax( const T& a,
		const T& b )
		{
		return a > b ? a : b;
		}

		/*
		* This kernel has been adopted from the diploma work of Jan Vacata.
		* Vacata Jan, GPGPU: General Purpose Computation on GPUs, diploma thesis,
		* Department of mathematics, FNSPE, CTU in Prague, 2008.
		*
		* Call this kernel with grid size divided by 2.
		* Maximum block size is 512.
		*
		*/

		template < class T, tnlOperation operation, int blockSize >
		__global__ void tnlCUDAReductionKernel( const int size,
		const T* d_input,
		T* d_output )
		{
		extern __shared__ __align__( 8 ) T sdata[];
		// Read the data into shared memory
		int tid = threadIdx.x;
		int gid = blockIdx. x * blockSize * 2 + threadIdx. x;
		int gridSize = blockSize * 2 * gridDim. x;
		if( operation == tnlMin \|\|
		operation == tnlMax )
		sdata[ tid ] = d_input[ gid ];
		else
		sdata[ tid ] = 0;
		while( gid < size )
		{
		if( operation == tnlMin )
		sdata[ tid ] = tnlCudaMin( d_input[ gid ], d_input[ tnlCudaMin( gid + blockSize, size ) ] );
		if( operation == tnlMax )
		sdata[ tid ] = tnlCudaMax( d_input[ gid ], d_input[ tnlCudaMin( gid + blockSize, size ) ] );
		if( operation == tnlSum )
		sdata[ tid ] += d_input[gid] + d_input[gid+blockSize];
		gid += gridSize;
		}
		__syncthreads();

		// Parallel reduction
		if( blockSize == 512 )
		{
		if( tid < 256 )
		{
		if( operation == tnlMin )
		sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 256 ] );
		if( operation == tnlMax )
		sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 256 ] );
		if( operation == tnlSum )
		sdata[ tid ] += sdata[ tid + 256 ];
		}
		__syncthreads();
		}
		if( blockSize >= 256 )
		{
		if( tid < 128 )
		{
		if( operation == tnlMin )
		sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 128 ] );
		if( operation == tnlMax )
		sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 128 ] );
		if( operation == tnlSum )
		sdata[ tid ] += sdata[ tid + 128 ];
		}
		__syncthreads();
		}
		if( blockSize >= 128 )
		{
		if (tid< 64)
		{
		if( operation == tnlMin )
		sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 64 ] );
		if( operation == tnlMax )
		sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 64 ] );
		if( operation == tnlSum )
		sdata[ tid ] += sdata[ tid + 64 ];
		}
		__syncthreads();
		}
		/*
		* What follows runs in warp so it does not need to be synchronised.
		*/
		if( tid < 32 )
		{
		if( blockSize >= 64 )
		{
		if( operation == tnlMin )
		sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 32 ] );
		if( operation == tnlMax )
		sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 32 ] );
		if( operation == tnlSum )
		sdata[ tid ] += sdata[ tid + 32 ];
		}
		if( blockSize >= 32 )
		{
		if( operation == tnlMin )
		sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 16 ] );
		if( operation == tnlMax )
		sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 16 ] );
		if( operation == tnlSum )
		sdata[ tid ] += sdata[ tid + 16 ];
		}
		if( blockSize >= 16 )
		{
		if( operation == tnlMin )
		sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 8 ] );
		if( operation == tnlMax )
		sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 8 ] );
		if( operation == tnlSum )
		sdata[ tid ] += sdata[ tid + 8 ];
		}
		if( blockSize >= 8 )
		{
		if( operation == tnlMin )
		sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 4 ] );
		if( operation == tnlMax )
		sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 4 ] );
		if( operation == tnlSum )
		sdata[ tid ] += sdata[ tid + 4 ];
		}
		if( blockSize >= 4 )
		{
		if( operation == tnlMin )
		sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 2 ] );
		if( operation == tnlMax )
		sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 2 ] );
		if( operation == tnlSum )
		sdata[ tid ] += sdata[ tid + 2 ];
		}
		if( blockSize >= 2 )
		{
		if( operation == tnlMin )
		sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 1 ] );
		if( operation == tnlMax )
		sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 1 ] );
		if( operation == tnlSum )
		sdata[ tid ] += sdata[ tid + 1 ];
		}
		}
		// Store the result back to the global memory of the device
		if( tid == 0 ) d_output[ blockIdx. x ] = sdata[ 0 ];
		}

		/*
		* CUDA reduction kernel caller.
		* block_size can be only some of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512.
		* d_input must reside on the device.
		*/
		template< class T, tnlOperation operation >
		T tnlCUDAReduction( const int size,
		const int block_size,
		const int grid_size,
		const T* d_input )
		{
		T result;

		dim3 blockSize( block_size );
		dim3 gridSize( grid_size );
		int shmem = 512 * sizeof( T );
		tnlAssert( shmem < 16384, cerr << shmem << " bytes are required." );
		switch( block_size )
		{
		case 512:
		tnlCUDAReductionKernel< T, operation, 512 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result );
		break;
		case 256:
		tnlCUDAReductionKernel< T, operation, 256 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result );
		break;
		case 128:
		tnlCUDAReductionKernel< T, operation, 128 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result );
		break;
		case 64:
		tnlCUDAReductionKernel< T, operation, 64 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result );
		break;
		case 32:
		tnlCUDAReductionKernel< T, operation, 32 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result );
		break;
		case 16:
		tnlCUDAReductionKernel< T, operation, 16 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result );
		break;
		case 8:
		tnlCUDAReductionKernel< T, operation, 8 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result );
		break;
		case 4:
		tnlCUDAReductionKernel< T, operation, 4 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result );
		break;
		case 2:
		tnlCUDAReductionKernel< T, operation, 2 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result );
		break;
		case 1:
		tnlCUDAReductionKernel< T, operation, 1 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result );
		break;
		default:
		tnlAssert( false, cerr << "Block size is " << block_size << " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." );
		break;
		}
		return result;
		}

		#endif /* HAVE_CUDA */

		#endif /* TNLCUDAKERNELS_H_ */

src/core/tnlCUDAKernelsTester.h

0 → 100644

+132 −0

Original line number	Diff line number	Diff line
		/***************************************************************************
		tnlCUDAKernelsTester.h
		-------------------
		begin : Jan 14, 2010
		copyright : (C) 2009 by Tomas Oberhuber
		email : tomas.oberhuber@fjfi.cvut.cz
		***************************************************************************/

		/***************************************************************************
		* *
		* This program is free software; you can redistribute it and/or modify *
		* it under the terms of the GNU General Public License as published by *
		* the Free Software Foundation; either version 2 of the License, or *
		* (at your option) any later version. *
		* *
		***************************************************************************/

		#ifndef TNLCUDAKERNELSTESTER_H_
		#define TNLCUDAKERNELSTESTER_H_

		#include <iostream>
		#include <cppunit/TestSuite.h>
		#include <cppunit/TestResult.h>
		#include <cppunit/TestCaller.h>
		#include <cppunit/TestCase.h>
		#include <core/tnlLongVectorCUDA.h>
		#include <core/tnlLongVector.h>
		#include <core/mfuncs.h>

		using namespace std;

		#ifdef HAVE_CUDA
		int tnlCUDAReductionMin( const int size,
		const int block_size,
		const int grid_size,
		const int* input );
		int tnlCUDAReductionMax( const int size,
		const int block_size,
		const int grid_size,
		const int* input );
		int tnlCUDAReductionSum( const int size,
		const int block_size,
		const int grid_size,
		const int* input );
		float tnlCUDAReductionMin( const int size,
		const int block_size,
		const int grid_size,
		const float* input );
		float tnlCUDAReductionMax( const int size,
		const int block_size,
		const int grid_size,
		const float* input );
		float tnlCUDAReductionSum( const int size,
		const int block_size,
		const int grid_size,
		const float* input );
		double tnlCUDAReductionMin( const int size,
		const int block_size,
		const int grid_size,
		const double* input );
		double tnlCUDAReductionMax( const int size,
		const int block_size,
		const int grid_size,
		const double* input );
		double tnlCUDAReductionSum( const int size,
		const int block_size,
		const int grid_size,
		const double* input );

		#endif


		template< class T > class tnlCUDAKernelsTester : public CppUnit :: TestCase
		{
		public:
		tnlCUDAKernelsTester(){};

		virtual
		~tnlCUDAKernelsTester(){};

		static CppUnit :: Test* suite()
		{
		CppUnit :: TestSuite* suiteOfTests = new CppUnit :: TestSuite( "tnlCUDAKernelsTester" );
		CppUnit :: TestResult result;
		suiteOfTests -> addTest( new CppUnit :: TestCaller< tnlCUDAKernelsTester< T > >(
		"testReduction",
		& tnlCUDAKernelsTester< T > :: testReduction )
		);

		return suiteOfTests;
		};

		void testReduction()
		{
		/*
		* Test by Jan Vacata.
		*/
		int size = 100;
		int desBlockSize = 128; //Desired block size
		int desGridSize = 2048; //Impose limitation on grid size so that threads could perform sequential work

		tnlLongVector< T > host_input( size );
		tnlLongVector< T > host_output( size );

		tnlLongVectorCUDA< T > device_input( size );
		tnlLongVectorCUDA< T > device_output( size );

		for( int i=0; i < size; i ++ )
		{
		host_input[ i ] = 1;
		host_output[ i ] = 0;
		}
		device_input. copyFrom( host_input );

		//Calculate necessary block/grid dimensions
		int block_size = :: Min( size/2, desBlockSize );
		//Grid size is limited in this case
		int grid_size = :: Min( desGridSize, size / block_size / 2 );

		T min = tnlCUDAReductionMin( size, block_size, grid_size, device_input. Data() );
		T max = tnlCUDAReductionMax( size, block_size, grid_size, device_input. Data() );
		T sum = tnlCUDAReductionSum( size, block_size, grid_size, device_input. Data() );

		cout << "Min: " << min
		<< "Max: " << max
		<< "Sum: " << sum << endl;

		}
		};


		#endif /* TNLCUDAKERNELSTESTER_H_ */

src/tnl-unit-tests.cpp

+7 −0

Original line number	Diff line number	Diff line
		@@ -18,10 +18,13 @@

		#include <cppunit/ui/text/TestRunner.h>


		#include <core/tnlLongVectorCUDATester.h>
		#include <core/tnlFieldCUDA2DTester.h>
		#include <core/tnlCUDAKernelsTester.h>
		#include <diff/tnlGridCUDA2DTester.h>


		using namespace std;

		int main( int argc, char* argv[] )
		@@ -40,6 +43,10 @@ int main( int argc, char* argv[] )
		runner.addTest( tnlGridCUDA2DTester< float > :: suite() );
		runner.addTest( tnlGridCUDA2DTester< double > :: suite() );

		runner.addTest( tnlCUDAKernelsTester< int > :: suite() );
		runner.addTest( tnlCUDAKernelsTester< float > :: suite() );
		runner.addTest( tnlCUDAKernelsTester< double > :: suite() );

		runner.run();
		return 0;
		}