Commit 40cb098f authored by Tomáš Oberhuber's avatar Tomáš Oberhuber
Browse files

Almost working simple parallel reduction in CUDA.

parent d70edfe8
Loading
Loading
Loading
Loading
+161 −1
Original line number Diff line number Diff line
@@ -15,7 +15,11 @@
 *                                                                         *
 ***************************************************************************/

#include <tnl-cuda-kernels.h>
#include <iostream>
#include <core/mfuncs.h>
#include <core/tnl-cuda-kernels.h>

using namespace std;

int tnlCUDAReductionMin( const int size,
                         const int block_size,
@@ -90,4 +94,160 @@ double tnlCUDAReductionSum( const int size,
   return tnlCUDAReduction< double, tnlSum >( size, block_size, grid_size, input );
}

/*
 * Simple redcution 1
 */

int tnlCUDASimpleReduction1Min( const int size,
                                const int block_size,
                                const int grid_size,
                                const int* input,
                                int* output )
{
   //Calculate necessary block/grid dimensions
   const int cpuThreshold = 1;
   const int desBlockSize = 128;    //Desired block size   
   dim3 blockSize = :: Min( size, desBlockSize );
   dim3 gridSize = size / blockSize. x;
   unsigned int shmem = blockSize. x * sizeof( int );
   cout << "Grid size: " << gridSize. x << endl 
        << "Block size: " << blockSize. x << endl
        << "Shmem: " << shmem << endl;
   tnlCUDASimpleReductionKernel1< int, tnlMin ><<< gridSize, blockSize, shmem >>>( size, input, output );
   int sizeReduced = gridSize. x;
   while( sizeReduced > cpuThreshold )
   {
      cout << "Reducing with size reduced = " << sizeReduced << endl;
      blockSize. x = :: Min( sizeReduced, desBlockSize );
      gridSize. x = sizeReduced / blockSize. x;
      shmem = blockSize. x * sizeof(int);
      tnlCUDASimpleReductionKernel1< int, tnlMin ><<< gridSize, blockSize, shmem >>>( size, input, output );
      sizeReduced = gridSize. x;
   }
   int* host_output = new int[ sizeReduced ];
   cudaMemcpy( host_output, output, sizeReduced * sizeof(int), cudaMemcpyDeviceToHost );
   int result = host_output[ 0 ];
   for( int i = 1;i < sizeReduced; i++ )
        result = :: Min( result, host_output[ i ] );
   delete[] host_output;
   return result;
}

int tnlCUDASimpleReduction1Max( const int size,
                                const int block_size,
                                const int grid_size,
                                const int* input,
                                int* output )
{
   //Calculate necessary block/grid dimensions
   const int cpuThreshold = 1;
   const int desBlockSize = 128;    //Desired block size   
   dim3 blockSize = :: Min( size, desBlockSize );
   dim3 gridSize = size / blockSize. x;
   unsigned int shmem = blockSize. x * sizeof( int );
   cout << "Grid size: " << gridSize. x << endl 
        << "Block size: " << blockSize. x << endl
        << "Shmem: " << shmem << endl;
   tnlCUDASimpleReductionKernel1< int, tnlMax ><<< gridSize, blockSize, shmem >>>( size, input, output );
   int sizeReduced = gridSize. x;
   while( sizeReduced > cpuThreshold )
   {
      cout << "Reducing with size reduced = " << sizeReduced << endl;
      blockSize. x = :: Min( sizeReduced, desBlockSize );
      gridSize. x = sizeReduced / blockSize. x;
      shmem = blockSize. x * sizeof(int);
      tnlCUDASimpleReductionKernel1< int, tnlMax ><<< gridSize, blockSize, shmem >>>( size, input, output );
      sizeReduced = gridSize. x;
   }
   int* host_output = new int[ sizeReduced ];
   cudaMemcpy( host_output, output, sizeReduced * sizeof(int), cudaMemcpyDeviceToHost );
   int result = host_output[ 0 ];
   for( int i = 1;i < sizeReduced; i++ )
        result = :: Max( result, host_output[ i ] );
   delete[] host_output;
   return result;
}
                         
int tnlCUDASimpleReduction1Sum( const int size,
                                const int block_size,
                                const int grid_size,
                                const int* input,
                                int* output )
{
   //Calculate necessary block/grid dimensions
   const int cpuThreshold = 1;
   const int desBlockSize = 128;    //Desired block size   
   dim3 blockSize = :: Min( size, desBlockSize );
   dim3 gridSize = size / blockSize. x;
   unsigned int shmem = blockSize. x * sizeof( int );
   cout << "Grid size: " << gridSize. x << endl 
        << "Block size: " << blockSize. x << endl
        << "Shmem: " << shmem << endl;
   tnlCUDASimpleReductionKernel1< int, tnlSum ><<< gridSize, blockSize, shmem >>>( size, input, output );
   int sizeReduced = gridSize. x;
   while( sizeReduced > cpuThreshold )
   {
      cout << "Reducing with size reduced = " << sizeReduced << endl;
      blockSize. x = :: Min( sizeReduced, desBlockSize );
      gridSize. x = sizeReduced / blockSize. x;
      shmem = blockSize. x * sizeof(int);
      tnlCUDASimpleReductionKernel1< int, tnlSum ><<< gridSize, blockSize, shmem >>>( size, input, output );
      sizeReduced = gridSize. x;
   }
   int* host_output = new int[ sizeReduced ];
   cudaMemcpy( host_output, output, sizeReduced * sizeof(int), cudaMemcpyDeviceToHost );
   int result = host_output[ 0 ];
   for( int i = 1;i < sizeReduced; i++ )
        result += host_output[ i ];
   delete[] host_output;
   return result;  
}

/*
float tnlCUDASimpleReduction1Min( const int size,
                                  const int block_size,
                                  const int grid_size,
                                  const float* input )
{
   return tnlCUDASimpleReduction1< float, tnlMin >( size, block_size, grid_size, input );
}

float tnlCUDASimpleReduction1Max( const int size,
                                  const int block_size,
                                  const int grid_size,
                                  const float* input )
{
   return tnlCUDASimpleReduction1< float, tnlMax >( size, block_size, grid_size, input );
}
                         
float tnlCUDASimpleReduction1Sum( const int size,
                                  const int block_size,
                                  const int grid_size,
                                  const float* input )
{
   return tnlCUDASimpleReduction1< float, tnlSum >( size, block_size, grid_size, input );
}

double tnlCUDASimpleReduction1Min( const int size,
                                   const int block_size,
                                   const int grid_size,
                                   const double* input )
{
   return tnlCUDASimpleReduction1< double, tnlMin >( size, block_size, grid_size, input );
}

double tnlCUDASimpleReduction1Max( const int size,
                                   const int block_size,
                                   const int grid_size,
                                   const double* input )
{
   return tnlCUDASimpleReduction1< double, tnlMax >( size, block_size, grid_size, input );
}
                         
double tnlCUDASimpleReduction1Sum( const int size,
                                   const int block_size,
                                   const int grid_size,
                                   const double* input )
{
   return tnlCUDASimpleReduction1< double, tnlSum >( size, block_size, grid_size, input );
}*/
 No newline at end of file
+87 −0
Original line number Diff line number Diff line
@@ -235,6 +235,93 @@ T tnlCUDAReduction( const int size,
	return result;
}

template < class T, tnlOperation operation >
__global__ void tnlCUDASimpleReductionKernel1( const int size,
		                                       const T* d_input,
		                                       T* d_output )
{
	extern __shared__ int sdata[];

	// Read data into the shared memory
	int tid = threadIdx. x;
	int gid = blockIdx. x * blockDim. x + threadIdx. x;
	// Last thread ID which manipulates meaningful data
	int last_tid = size - blockIdx. x * blockDim. x;
	if( gid < size )
		sdata[tid] = d_input[gid];
	__syncthreads();

	// Parallel reduction
	for( int s = 1; s < blockDim. x; s *= 2 )
	{
		if( ( tid % ( 2 * s ) ) == 0 && tid + s < last_tid )
		{
			if( operation == tnlMin )
				sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + s ] );
			if( operation == tnlMax )
				sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + s ] );
			if( operation == tnlSum )
				sdata[ tid ] += sdata[ tid + s ];
		}
		__syncthreads();
	}

	// Store the result back in global memory
	if( tid == 0 )
		d_output[ blockIdx. x ] = sdata[ 0 ];
}

template< class T, tnlOperation operation >
T tnlCUDASimpleReduction1( const int size,
					       const int block_size,
					       const int grid_size,
	                       const T* d_input )
{
	T result;

	dim3 blockSize( block_size );
	dim3 gridSize( grid_size );
	int shmem = 512 * sizeof( T );
	tnlAssert( shmem < 16384, cerr << shmem << " bytes are required." );
	switch( block_size )
	{
		case 512:
	        tnlCUDASimpleReductionKernel1< T, operation, 512 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result );
	        break;
	    case 256:
	    	tnlCUDASimpleReductionKernel1< T, operation, 256 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result );
	    	break;
	    case 128:
	    	tnlCUDASimpleReductionKernel1< T, operation, 128 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result );
	    	break;
	    case  64:
	    	tnlCUDASimpleReductionKernel1< T, operation,  64 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result );
	    	break;
	    case  32:
	    	tnlCUDASimpleReductionKernel1< T, operation,  32 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result );
	    	break;
	    case  16:
	    	tnlCUDASimpleReductionKernel1< T, operation,  16 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result );
	    	break;
	    case   8:
	    	tnlCUDASimpleReductionKernel1< T, operation,   8 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result );
	    	break;
	    case   4:
	    	tnlCUDAReductionKernel< T, operation,   4 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result );
	    	break;
	    case   2:
	    	tnlCUDAReductionKernel< T, operation,   2 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result );
	    	break;
	    case   1:
	    	tnlCUDAReductionKernel< T, operation,   1 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result );
	    	break;
	    default:
	    	tnlAssert( false, cerr << "Block size is " << block_size << " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." );
	    	break;
	}
	return result;
}

#endif /* HAVE_CUDA */

#endif /* TNLCUDAKERNELS_H_ */
+107 −20
Original line number Diff line number Diff line
@@ -67,6 +67,49 @@ double tnlCUDAReductionSum( const int size,
                            const int grid_size,
                            const double* input );

/*
 * Simple reduction 1
 */
int tnlCUDASimpleReduction1Min( const int size,
                          const int block_size,
                          const int grid_size,
                          const int* input,
                          int* output );
int tnlCUDASimpleReduction1Max( const int size,
                          const int block_size,
                          const int grid_size,
                          const int* input,
                          int* output );
int tnlCUDASimpleReduction1Sum( const int size,
                          const int block_size,
                          const int grid_size,
                          const int* input,
                          int* output );
float tnlCUDASimpleReduction1Min( const int size,
                            const int block_size,
                            const int grid_size,
                            const float* input );
float tnlCUDASimpleReduction1Max( const int size,
                            const int block_size,
                            const int grid_size,
                            const float* input );
float tnlCUDASimpleReduction1Sum( const int size,
                            const int block_size,
                            const int grid_size,
                            const float* input );
double tnlCUDASimpleReduction1Min( const int size,
                             const int block_size,
                             const int grid_size,
                             const double* input );
double tnlCUDASimpleReduction1Max( const int size,
                             const int block_size,
                             const int grid_size,
                             const double* input );
double tnlCUDASimpleReduction1Sum( const int size,
                             const int block_size,
                             const int grid_size,
                             const double* input );

#endif


@@ -82,50 +125,94 @@ template< class T > class tnlCUDAKernelsTester : public CppUnit :: TestCase
   {
      CppUnit :: TestSuite* suiteOfTests = new CppUnit :: TestSuite( "tnlCUDAKernelsTester" );
      CppUnit :: TestResult result;
      suiteOfTests -> addTest( new CppUnit :: TestCaller< tnlCUDAKernelsTester< T > >(
    		                   "testSimpleReduction1",
                               & tnlCUDAKernelsTester< T > :: testSimpleReduction1 )
                             );
      suiteOfTests -> addTest( new CppUnit :: TestCaller< tnlCUDAKernelsTester< T > >(
                               "testReduction",
                               & tnlCUDAKernelsTester< T > :: testReduction )
                               & tnlCUDAKernelsTester< T > :: testFastReduction )
                             );

      return suiteOfTests;
   };

   void testReduction()
   bool testSetup( tnlLongVector< T >& host_input,
		           tnlLongVector< T >& host_output,
		           tnlLongVectorCUDA< T >& device_input,
		           tnlLongVectorCUDA< T >& device_output,
		           int size )
   {
	   /*
	    * Test by Jan Vacata.
	    */
	   int size = 100;
	   int desBlockSize = 128;    //Desired block size
	   int desGridSize = 2048;    //Impose limitation on grid size so that threads could perform sequential work

	   tnlLongVector< T > host_input( size );
	   tnlLongVector< T > host_output( size );

	   tnlLongVectorCUDA< T > device_input( size );
	   tnlLongVectorCUDA< T > device_output( size );
	   if( ! host_input. SetNewSize( size ) )
		   return false;
	   if( ! host_output. SetNewSize( size ) )
		   return false;
	   if( ! device_input. SetNewSize( size ) )
		   return false;
	   if( ! device_output. SetNewSize( size ) )
		   return false;

	   for( int i=0; i < size; i ++ )
	   {
		   host_input[ i ] = 1;
		   host_input[ i ] = i + 1;
		   host_output[ i ] = 0;
	   }
	   device_input. copyFrom( host_input );
	   return true;
   }

   void testReduction( int algorithm_efficiency = 0 )
   {
	   int size = 1<<16;
	   int desBlockSize = 128;    //Desired block size
	   int desGridSize = 2048;    //Impose limitation on grid size so that threads could perform sequential work

	   tnlLongVector< T > host_input, host_output;
	   tnlLongVectorCUDA< T > device_input, device_output;
	   CPPUNIT_ASSERT( testSetup( host_input,
		         	   host_output,
		         	   device_input,
		         	   device_output,
		         	   size )  );



	   //Calculate necessary block/grid dimensions
	   int block_size = :: Min( size/2, desBlockSize );
	   //Grid size is limited in this case
	   int grid_size = :: Min( desGridSize, size / block_size / 2 );

	   T min = tnlCUDAReductionMin( size, block_size, grid_size, device_input. Data() );
	   T max = tnlCUDAReductionMax( size, block_size, grid_size, device_input. Data() );
	   T sum = tnlCUDAReductionSum( size, block_size, grid_size, device_input. Data() );
	   T min, max, sum;
	   switch( algorithm_efficiency )
	   {
		   case 1:
			   min = tnlCUDASimpleReduction1Min( size, block_size, grid_size, device_input. Data(), device_output. Data() );
			   max = tnlCUDASimpleReduction1Max( size, block_size, grid_size, device_input. Data(), device_output. Data() );
			   sum = tnlCUDASimpleReduction1Sum( size, block_size, grid_size, device_input. Data(), device_output. Data() );
			   break;
		   default:
			   min = tnlCUDAReductionMin( size, block_size, grid_size, device_input. Data() );
			   max = tnlCUDAReductionMax( size, block_size, grid_size, device_input. Data() );
			   sum = tnlCUDAReductionSum( size, block_size, grid_size, device_input. Data() );
	   }

	   cout << "Min: " << min
			<< "Max: " << max
	   cout << "Min: " << min << endl
			<< "Max: " << max << endl
			<< "Sum: " << sum << endl;

   };

   void testSimpleReduction1()
   {
	   testReduction( 1 );
   };

   void testFastReduction()
   {
	   testReduction( 0 );
   }


};


+2 −2
Original line number Diff line number Diff line
@@ -44,8 +44,8 @@ int main( int argc, char* argv[] )
   runner.addTest( tnlGridCUDA2DTester< double > :: suite() );
   
   runner.addTest( tnlCUDAKernelsTester< int > :: suite() );
   runner.addTest( tnlCUDAKernelsTester< float > :: suite() );
   runner.addTest( tnlCUDAKernelsTester< double > :: suite() );
   //runner.addTest( tnlCUDAKernelsTester< float > :: suite() );
   //runner.addTest( tnlCUDAKernelsTester< double > :: suite() );
   
   runner.run();
   return 0;