Commit d70edfe8 authored by Tomáš Oberhuber's avatar Tomáš Oberhuber
Browse files

Implementing CUDA reduction.

parent 71606a46
Loading
Loading
Loading
Loading
+3 −0
Original line number Diff line number Diff line
@@ -37,6 +37,8 @@ headers = tnlAssert.h \
		    tnlTimerRT.h \
		    tnlTester.h \
		    tnlVector.h \
		    tnl-cuda-kernels.h \
		    tnlCUDAKernelsTester.h \
		    compress-file.h \
		    mfilename.h \
		    mfuncs.h \
@@ -53,6 +55,7 @@ sources = tnlConfigDescription.cpp \
	  tnlTimerCPU.cpp \
	  tnlTimerRT.cpp \
	  tnlTester.cpp \
	  tnl-cuda-kernels.cu \
	  compress-file.cpp \
	  mfilename.cpp \
	  mpi-supp.cpp \
+93 −0
Original line number Diff line number Diff line
/***************************************************************************
                          tnl-cuda-kernels.cu
                             -------------------
    begin                : Jan 14, 2010
    copyright            : (C) 2009 by Tomas Oberhuber
    email                : tomas.oberhuber@fjfi.cvut.cz
 ***************************************************************************/

/***************************************************************************
 *                                                                         *
 *   This program is free software; you can redistribute it and/or modify  *
 *   it under the terms of the GNU General Public License as published by  *
 *   the Free Software Foundation; either version 2 of the License, or     *
 *   (at your option) any later version.                                   *
 *                                                                         *
 ***************************************************************************/

#include <tnl-cuda-kernels.h>

int tnlCUDAReductionMin( const int size,
                         const int block_size,
                         const int grid_size,
                         const int* input )
{
   return tnlCUDAReduction< int, tnlMin >( size, block_size, grid_size, input );
}

int tnlCUDAReductionMax( const int size,
                         const int block_size,
                         const int grid_size,
                         const int* input )
{
   return tnlCUDAReduction< int, tnlMax >( size, block_size, grid_size, input );
}
                         
int tnlCUDAReductionSum( const int size,
                         const int block_size,
                         const int grid_size,
                         const int* input )
{
   return tnlCUDAReduction< int, tnlSum >( size, block_size, grid_size, input );
}


float tnlCUDAReductionMin( const int size,
                           const int block_size,
                           const int grid_size,
                           const float* input )
{
   return tnlCUDAReduction< float, tnlMin >( size, block_size, grid_size, input );
}

float tnlCUDAReductionMax( const int size,
                           const int block_size,
                           const int grid_size,
                           const float* input )
{
   return tnlCUDAReduction< float, tnlMax >( size, block_size, grid_size, input );
}
                         
float tnlCUDAReductionSum( const int size,
                           const int block_size,
                           const int grid_size,
                           const float* input )
{
   return tnlCUDAReduction< float, tnlSum >( size, block_size, grid_size, input );
}

double tnlCUDAReductionMin( const int size,
                            const int block_size,
                            const int grid_size,
                            const double* input )
{
   return tnlCUDAReduction< double, tnlMin >( size, block_size, grid_size, input );
}

double tnlCUDAReductionMax( const int size,
                            const int block_size,
                            const int grid_size,
                            const double* input )
{
   return tnlCUDAReduction< double, tnlMax >( size, block_size, grid_size, input );
}
                         
double tnlCUDAReductionSum( const int size,
                            const int block_size,
                            const int grid_size,
                            const double* input )
{
   return tnlCUDAReduction< double, tnlSum >( size, block_size, grid_size, input );
}

+240 −0
Original line number Diff line number Diff line
/***************************************************************************
                          tnl-cuda-kernels.h
                             -------------------
    begin                : Jan 14, 2010
    copyright            : (C) 2009 by Tomas Oberhuber
    email                : tomas.oberhuber@fjfi.cvut.cz
 ***************************************************************************/

/***************************************************************************
 *                                                                         *
 *   This program is free software; you can redistribute it and/or modify  *
 *   it under the terms of the GNU General Public License as published by  *
 *   the Free Software Foundation; either version 2 of the License, or     *
 *   (at your option) any later version.                                   *
 *                                                                         *
 ***************************************************************************/

#ifndef TNLCUDAKERNELS_H_
#define TNLCUDAKERNELS_H_

#include <core/tnlAssert.h>

using namespace std;

enum tnlOperation { tnlMin, tnlMax, tnlSum };

#ifdef HAVE_CUDA

template< class T > __device__ T tnlCudaMin( const T& a,
		                                     const T& b )
{
	return a < b ? a : b;
}

template< class T > __device__ T tnlCudaMax( const T& a,
		                                     const T& b )
{
	return a > b ? a : b;
}

/*
 * This kernel has been adopted from the diploma work of Jan Vacata.
 * Vacata Jan, GPGPU: General Purpose Computation on GPUs, diploma thesis,
 *  Department of mathematics, FNSPE, CTU in Prague, 2008.
 *
 * Call this kernel with grid size divided by 2.
 * Maximum block size is 512.
 *
 */

template < class T, tnlOperation operation, int blockSize >
__global__ void tnlCUDAReductionKernel( const int size,
		                                const T* d_input,
		                                T* d_output )
{
	extern __shared__ __align__( 8 ) T sdata[];
	// Read the data into shared memory
	int tid = threadIdx.x;
	int gid = blockIdx. x * blockSize * 2 + threadIdx. x;
	int gridSize = blockSize * 2 * gridDim. x;
	if( operation == tnlMin ||
		operation == tnlMax )
		sdata[ tid ] = d_input[ gid ];
	else
		sdata[ tid ] = 0;
	while( gid < size )
	{
		if( operation == tnlMin )
			sdata[ tid ] = tnlCudaMin( d_input[ gid ], d_input[ tnlCudaMin( gid + blockSize, size ) ] );
		if( operation == tnlMax )
			sdata[ tid ] = tnlCudaMax( d_input[ gid ], d_input[ tnlCudaMin( gid + blockSize, size ) ] );
		if( operation == tnlSum )
			sdata[ tid ] += d_input[gid] + d_input[gid+blockSize];
		gid += gridSize;
	}
	__syncthreads();

	// Parallel reduction
	if( blockSize == 512 )
	{
		if( tid < 256 )
		{
			if( operation == tnlMin )
				sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 256 ] );
			if( operation == tnlMax )
				sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 256 ] );
			if( operation == tnlSum )
				sdata[ tid ] += sdata[ tid + 256 ];
		}
		__syncthreads();
	}
	if( blockSize >= 256 )
	{
		if( tid < 128 )
		{
			if( operation == tnlMin )
				sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 128 ] );
			if( operation == tnlMax )
				sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 128 ] );
			if( operation == tnlSum )
				sdata[ tid ] += sdata[ tid + 128 ];
		}
		__syncthreads();
	}
	if( blockSize >= 128 )
	{
		if (tid< 64)
		{
			if( operation == tnlMin )
				sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 64 ] );
			if( operation == tnlMax )
				sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 64 ] );
			if( operation == tnlSum )
				sdata[ tid ] += sdata[ tid + 64 ];
		}
		__syncthreads();
	}
	/*
	 * What follows runs in warp so it does not need to be synchronised.
	 */
	if( tid < 32 )
	{
		if( blockSize >= 64 )
		{
			if( operation == tnlMin )
				sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 32 ] );
			if( operation == tnlMax )
				sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 32 ] );
			if( operation == tnlSum )
				sdata[ tid ] += sdata[ tid + 32 ];
		}
		if( blockSize >= 32 )
		{
			if( operation == tnlMin )
				sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 16 ] );
			if( operation == tnlMax )
				sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 16 ] );
			if( operation == tnlSum )
				sdata[ tid ] += sdata[ tid + 16 ];
		}
		if( blockSize >= 16 )
		{
			if( operation == tnlMin )
				sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 8 ] );
			if( operation == tnlMax )
				sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 8 ] );
			if( operation == tnlSum )
				sdata[ tid ] += sdata[ tid + 8 ];
		}
	    if( blockSize >= 8 )
	    {
	    	if( operation == tnlMin )
	    		sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 4 ] );
	    	if( operation == tnlMax )
	    		sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 4 ] );
	    	if( operation == tnlSum )
	    		sdata[ tid ] += sdata[ tid + 4 ];
	    }
		if( blockSize >= 4 )
		{
			if( operation == tnlMin )
				sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 2 ] );
			if( operation == tnlMax )
				sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 2 ] );
			if( operation == tnlSum )
				sdata[ tid ] += sdata[ tid + 2 ];
		}
		if( blockSize >= 2 )
		{
			if( operation == tnlMin )
				sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 1 ] );
			if( operation == tnlMax )
				sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 1 ] );
			if( operation == tnlSum )
				sdata[ tid ] += sdata[ tid + 1 ];
		}
	}
	// Store the result back to the global memory of the device
	if( tid == 0 ) d_output[ blockIdx. x ] = sdata[ 0 ];
}

/*
 * CUDA reduction kernel caller.
 * block_size can be only some of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512.
 * d_input must reside on the device.
 */
template< class T, tnlOperation operation >
T tnlCUDAReduction( const int size,
					const int block_size,
					const int grid_size,
	                const T* d_input )
{
	T result;

	dim3 blockSize( block_size );
	dim3 gridSize( grid_size );
	int shmem = 512 * sizeof( T );
	tnlAssert( shmem < 16384, cerr << shmem << " bytes are required." );
	switch( block_size )
	{
		case 512:
	        tnlCUDAReductionKernel< T, operation, 512 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result );
	        break;
	    case 256:
	    	tnlCUDAReductionKernel< T, operation, 256 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result );
	    	break;
	    case 128:
	    	tnlCUDAReductionKernel< T, operation, 128 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result );
	    	break;
	    case  64:
	    	tnlCUDAReductionKernel< T, operation,  64 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result );
	    	break;
	    case  32:
	    	tnlCUDAReductionKernel< T, operation,  32 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result );
	    	break;
	    case  16:
	    	tnlCUDAReductionKernel< T, operation,  16 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result );
	    	break;
	    case   8:
	    	tnlCUDAReductionKernel< T, operation,   8 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result );
	    	break;
	    case   4:
	    	tnlCUDAReductionKernel< T, operation,   4 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result );
	    	break;
	    case   2:
	    	tnlCUDAReductionKernel< T, operation,   2 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result );
	    	break;
	    case   1:
	    	tnlCUDAReductionKernel< T, operation,   1 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result );
	    	break;
	    default:
	    	tnlAssert( false, cerr << "Block size is " << block_size << " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." );
	    	break;
	}
	return result;
}

#endif /* HAVE_CUDA */

#endif /* TNLCUDAKERNELS_H_ */
+132 −0
Original line number Diff line number Diff line
/***************************************************************************
                          tnlCUDAKernelsTester.h
                             -------------------
    begin                : Jan 14, 2010
    copyright            : (C) 2009 by Tomas Oberhuber
    email                : tomas.oberhuber@fjfi.cvut.cz
 ***************************************************************************/

/***************************************************************************
 *                                                                         *
 *   This program is free software; you can redistribute it and/or modify  *
 *   it under the terms of the GNU General Public License as published by  *
 *   the Free Software Foundation; either version 2 of the License, or     *
 *   (at your option) any later version.                                   *
 *                                                                         *
 ***************************************************************************/

#ifndef TNLCUDAKERNELSTESTER_H_
#define TNLCUDAKERNELSTESTER_H_

#include <iostream>
#include <cppunit/TestSuite.h>
#include <cppunit/TestResult.h>
#include <cppunit/TestCaller.h>
#include <cppunit/TestCase.h>
#include <core/tnlLongVectorCUDA.h>
#include <core/tnlLongVector.h>
#include <core/mfuncs.h>

using namespace std;

#ifdef HAVE_CUDA
int tnlCUDAReductionMin( const int size,
                         const int block_size,
                         const int grid_size,
                         const int* input );
int tnlCUDAReductionMax( const int size,
                         const int block_size,
                         const int grid_size,
                         const int* input );
int tnlCUDAReductionSum( const int size,
                         const int block_size,
                         const int grid_size,
                         const int* input );
float tnlCUDAReductionMin( const int size,
                           const int block_size,
                           const int grid_size,
                           const float* input );
float tnlCUDAReductionMax( const int size,
                           const int block_size,
                           const int grid_size,
                           const float* input );
float tnlCUDAReductionSum( const int size,
                           const int block_size,
                           const int grid_size,
                           const float* input );
double tnlCUDAReductionMin( const int size,
                            const int block_size,
                            const int grid_size,
                            const double* input );
double tnlCUDAReductionMax( const int size,
                            const int block_size,
                            const int grid_size,
                            const double* input );
double tnlCUDAReductionSum( const int size,
                            const int block_size,
                            const int grid_size,
                            const double* input );

#endif


template< class T > class tnlCUDAKernelsTester : public CppUnit :: TestCase
{
   public:
   tnlCUDAKernelsTester(){};

   virtual
   ~tnlCUDAKernelsTester(){};

   static CppUnit :: Test* suite()
   {
      CppUnit :: TestSuite* suiteOfTests = new CppUnit :: TestSuite( "tnlCUDAKernelsTester" );
      CppUnit :: TestResult result;
      suiteOfTests -> addTest( new CppUnit :: TestCaller< tnlCUDAKernelsTester< T > >(
                               "testReduction",
                               & tnlCUDAKernelsTester< T > :: testReduction )
                             );

      return suiteOfTests;
   };

   void testReduction()
   {
	   /*
	    * Test by Jan Vacata.
	    */
	   int size = 100;
	   int desBlockSize = 128;    //Desired block size
	   int desGridSize = 2048;    //Impose limitation on grid size so that threads could perform sequential work

	   tnlLongVector< T > host_input( size );
	   tnlLongVector< T > host_output( size );

	   tnlLongVectorCUDA< T > device_input( size );
	   tnlLongVectorCUDA< T > device_output( size );

	   for( int i=0; i < size; i ++ )
	   {
		   host_input[ i ] = 1;
		   host_output[ i ] = 0;
	   }
	   device_input. copyFrom( host_input );

	   //Calculate necessary block/grid dimensions
	   int block_size = :: Min( size/2, desBlockSize );
	   //Grid size is limited in this case
	   int grid_size = :: Min( desGridSize, size / block_size / 2 );

	   T min = tnlCUDAReductionMin( size, block_size, grid_size, device_input. Data() );
	   T max = tnlCUDAReductionMax( size, block_size, grid_size, device_input. Data() );
	   T sum = tnlCUDAReductionSum( size, block_size, grid_size, device_input. Data() );

	   cout << "Min: " << min
			<< "Max: " << max
			<< "Sum: " << sum << endl;

   }
};


#endif /* TNLCUDAKERNELSTESTER_H_ */
+7 −0
Original line number Diff line number Diff line
@@ -18,10 +18,13 @@

#include <cppunit/ui/text/TestRunner.h>


#include <core/tnlLongVectorCUDATester.h>
#include <core/tnlFieldCUDA2DTester.h>
#include <core/tnlCUDAKernelsTester.h>
#include <diff/tnlGridCUDA2DTester.h>


using namespace std;

int main( int argc, char* argv[] )
@@ -40,6 +43,10 @@ int main( int argc, char* argv[] )
   runner.addTest( tnlGridCUDA2DTester< float > :: suite() );
   runner.addTest( tnlGridCUDA2DTester< double > :: suite() );
   
   runner.addTest( tnlCUDAKernelsTester< int > :: suite() );
   runner.addTest( tnlCUDAKernelsTester< float > :: suite() );
   runner.addTest( tnlCUDAKernelsTester< double > :: suite() );
   
   runner.run();
   return 0;
}