Commit 016f90e1 authored by Tomáš Oberhuber's avatar Tomáš Oberhuber
Browse files

Implementing CUDA parallel reduction - problem with odd number of elements in a block.

parent 3c8e9889
Loading
Loading
Loading
Loading
+85 −39
Original line number Diff line number Diff line
@@ -60,17 +60,19 @@ __global__ void tnlCUDAReductionKernel( const int size,

   // Read data into the shared memory
   int tid = threadIdx. x;
   int gid = 2 * blockSize * blockDim. x + threadIdx. x;
   int gid = 2 * blockDim. x * blockIdx. x + threadIdx. x;

   if( gid + blockSize < size )
   {
      if( operation == tnlMin ) sdata[ tid ] = :: tnlCudaMin( d_input[ gid ], d_input[ gid + blockSize ] );
      if( operation == tnlMax ) sdata[ tid ] = :: tnlCudaMax( d_input[ gid ], d_input[ gid + blockSize ] );
      if( operation == tnlSum ) sdata[ tid ] = d_input[ gid ] + d_input[ gid + blockSize ];
      //dbg_array1[ blockIdx. x * blockDim. x + threadIdx. x ] = -1;
   }
   else if( gid < size )
   {
      sdata[ tid ] = d_input[ gid ];
      //dbg_array1[ blockIdx. x * blockDim. x + threadIdx. x ] = -2;
   }
   gid += grid_size;

@@ -80,8 +82,9 @@ __global__ void tnlCUDAReductionKernel( const int size,
      if( operation == tnlMax ) sdata[ tid ] = :: tnlCudaMax( sdata[ tid ], :: tnlCudaMax( d_input[ gid ], d_input[ gid + blockSize ] ) );
      if( operation == tnlSum ) sdata[ tid ] += d_input[gid] + d_input[ gid + blockSize ];
      gid += grid_size;
      //dbg_array1[ blockIdx. x * blockDim. x + threadIdx. x ] = -4;
   }
   dbg_array1[ blockIdx. x * blockDim. x + threadIdx. x ] = sdata[ tid ];
   //dbg_array1[ blockIdx. x * blockDim. x + threadIdx. x ] = sdata[ tid ];
   __syncthreads();

   if( gid + blockDim. x < size )
@@ -216,6 +219,9 @@ bool tnlCUDAReduction( const int size,
	               T& result,
	               T* device_output = 0 )
{
   if( ! size ) return false;


   //Calculate necessary block/grid dimensions
   const int cpuThreshold = 1;
   const int desBlockSize = 16;    //Desired block size
@@ -235,7 +241,7 @@ bool tnlCUDAReduction( const int size,
       }
       device_output_allocated = true;

       cudaMalloc( ( void** ) &dbg_array1, size * sizeof( T ) ); //!!!!!!!!!!!!!!!!!!!!!!!!
       //cudaMalloc( ( void** ) &dbg_array1, size * sizeof( T ) ); //!!!!!!!!!!!!!!!!!!!!!!!!
       //cudaMalloc( ( void** ) &dbg_array2, desBlockSize * sizeof( T ) ); //!!!!!!!!!!!!!!!!!!!!!!!!!
   }
   dim3 block_size( 0 ), grid_size( 0 );
@@ -247,40 +253,40 @@ bool tnlCUDAReduction( const int size,
      block_size. x = :: Min( size_reduced, desBlockSize );
      grid_size. x = :: Min( ( int ) ( size_reduced / block_size. x + 1 ) / 2, desGridSize );
      shmem = block_size. x * sizeof( T );
      cout << "Size: " << size_reduced
      /*cout << "Size: " << size_reduced
           << " Grid size: " << grid_size. x
           << " Block size: " << block_size. x
           << " Shmem: " << shmem << endl;
           << " Shmem: " << shmem << endl;*/
      tnlAssert( shmem < 16384, cerr << shmem << " bytes are required." << endl; );
      tnlAssert( block_size. x <= 512, cerr << "Block size is " << block_size. x << endl; );
      switch( block_size. x )
      {
		  case 512:
			  tnlCUDAReductionKernel< T, operation, 512 ><<< grid_size, block_size, shmem >>>( size_reduced, grid_size. x * block_size. x, reduction_input, device_output, dbg_array1 );
			  tnlCUDAReductionKernel< T, operation, 512 ><<< grid_size, block_size, shmem >>>( size_reduced, 2 * grid_size. x * block_size. x, reduction_input, device_output, dbg_array1 );
			  break;
		  case 256:
			  tnlCUDAReductionKernel< T, operation, 256 ><<< grid_size, block_size, shmem >>>( size_reduced, grid_size. x * block_size. x, reduction_input, device_output, dbg_array1 );
			  tnlCUDAReductionKernel< T, operation, 256 ><<< grid_size, block_size, shmem >>>( size_reduced, 2 * grid_size. x * block_size. x, reduction_input, device_output, dbg_array1 );
			  break;
		  case 128:
			  tnlCUDAReductionKernel< T, operation, 128 ><<< grid_size, block_size, shmem >>>( size_reduced, grid_size. x * block_size. x, reduction_input, device_output, dbg_array1 );
			  tnlCUDAReductionKernel< T, operation, 128 ><<< grid_size, block_size, shmem >>>( size_reduced, 2 * grid_size. x * block_size. x, reduction_input, device_output, dbg_array1 );
			  break;
		  case  64:
			  tnlCUDAReductionKernel< T, operation,  64 ><<< grid_size, block_size, shmem >>>( size_reduced, grid_size. x * block_size. x, reduction_input, device_output, dbg_array1 );
			  tnlCUDAReductionKernel< T, operation,  64 ><<< grid_size, block_size, shmem >>>( size_reduced, 2 * grid_size. x * block_size. x, reduction_input, device_output, dbg_array1 );
			  break;
		  case  32:
			  tnlCUDAReductionKernel< T, operation,  32 ><<< grid_size, block_size, shmem >>>( size_reduced, grid_size. x * block_size. x, reduction_input, device_output, dbg_array1 );
			  tnlCUDAReductionKernel< T, operation,  32 ><<< grid_size, block_size, shmem >>>( size_reduced, 2 * grid_size. x * block_size. x, reduction_input, device_output, dbg_array1 );
			  break;
		  case  16:
			  tnlCUDAReductionKernel< T, operation,  16 ><<< grid_size, block_size, shmem >>>( size_reduced, grid_size. x * block_size. x, reduction_input, device_output, dbg_array1 );
			  tnlCUDAReductionKernel< T, operation,  16 ><<< grid_size, block_size, shmem >>>( size_reduced, 2 * grid_size. x * block_size. x, reduction_input, device_output, dbg_array1 );
			  break;
		  case   8:
			  tnlCUDAReductionKernel< T, operation,   8 ><<< grid_size, block_size, shmem >>>( size_reduced, grid_size. x * block_size. x, reduction_input, device_output, dbg_array1 );
			  tnlCUDAReductionKernel< T, operation,   8 ><<< grid_size, block_size, shmem >>>( size_reduced, 2 * grid_size. x * block_size. x, reduction_input, device_output, dbg_array1 );
			  break;
		  case   4:
			  tnlCUDAReductionKernel< T, operation,   4 ><<< grid_size, block_size, shmem >>>( size_reduced, grid_size. x * block_size. x, reduction_input, device_output, dbg_array1 );
			  tnlCUDAReductionKernel< T, operation,   4 ><<< grid_size, block_size, shmem >>>( size_reduced, 2 * grid_size. x * block_size. x, reduction_input, device_output, dbg_array1 );
			  break;
		  case   2:
			  tnlCUDAReductionKernel< T, operation,   2 ><<< grid_size, block_size, shmem >>>( size_reduced, grid_size. x * block_size. x, reduction_input, device_output, dbg_array1 );
			  tnlCUDAReductionKernel< T, operation,   2 ><<< grid_size, block_size, shmem >>>( size_reduced, 2 * grid_size. x * block_size. x, reduction_input, device_output, dbg_array1 );
			  break;
		  case   1:
			  tnlAssert( false, cerr << "blockSize should not be 1." << endl );
@@ -293,21 +299,24 @@ bool tnlCUDAReduction( const int size,
      reduction_input = device_output;

      // debuging part
      T* host_array = new T[ size ];
      /*T* host_array = new T[ size ];
      cudaMemcpy( host_array, dbg_array1,  size * sizeof( T ), cudaMemcpyDeviceToHost );
      for( int i = 0; i< size; i ++ )
    	  cout << host_array[ i ] << " - ";
      cout << endl;
      cout << endl;*/

      T* output = new T[ size_reduced ];
      /*T* output = new T[ size_reduced ];
      cudaMemcpy( output, device_output, size_reduced * sizeof( T ), cudaMemcpyDeviceToHost );
      cout << endl;
      for( int i = 0; i < size_reduced; i ++ )
    	  cout << output[ i ] << "   ";
      cout << endl;
      delete[] output;
      delete[] output;*/
   }
   T* host_output = new T[ size_reduced ];
   if( size == 1 )
   	   cudaMemcpy( host_output, device_input, sizeof( T ), cudaMemcpyDeviceToHost );
   else
	   cudaMemcpy( host_output, device_output, size_reduced * sizeof( T ), cudaMemcpyDeviceToHost );
   result = host_output[ 0 ];
   for( int i = 1; i < size_reduced; i++ )
@@ -563,6 +572,9 @@ bool tnlCUDASimpleReduction5( const int size,
      delete[] output;*/
   }
   T* host_output = new T[ size_reduced ];
   if( size == 1 )
   	   cudaMemcpy( host_output, device_input, sizeof( T ), cudaMemcpyDeviceToHost );
   else
	   cudaMemcpy( host_output, device_output, size_reduced * sizeof( T ), cudaMemcpyDeviceToHost );
   result = host_output[ 0 ];
   for( int i = 1; i < size_reduced; i++ )
@@ -615,11 +627,13 @@ __global__ void tnlCUDASimpleReductionKernel4( const int size,
		sdata[ tid ] = d_input[ gid ];
	}
	__syncthreads();
	dbg_array1[ tid ] = sdata[ tid ];

	// Parallel reduction
	for( int s = blockDim. x / 2; s > 0; s >>= 1 )
	int n = last_tid < blockDim. x ? last_tid : blockDim. x;
	for( int s = n / 2; s > 0; s >>= 1 )
	{
		if( tid < s && tid + s < last_tid )
		if( tid < s && tid + s < n )
		{
			if( operation == tnlMin )
				sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + s ] );
@@ -628,7 +642,25 @@ __global__ void tnlCUDASimpleReductionKernel4( const int size,
			if( operation == tnlSum )
				sdata[ tid ] += sdata[ tid + s ];
		}
		/* This is for the case when we have odd number of elements.
		 * The last one will be reduced using the thread with ID 0.
		 */
		if( 2 * s < n && tid == 0 )
		{
			if( operation == tnlMin )
				sdata[ 0 ] = tnlCudaMin( sdata[ 0 ], sdata[ n ] );
			if( operation == tnlMax )
				sdata[ 0 ] = tnlCudaMax( sdata[ 0 ], sdata[ n ] );
			if( operation == tnlSum )
				sdata[ 0 ] += sdata[ n ];
			dbg_array1[ n ] = -555; //sdata[ 0 ];

		}
		n = s;

		__syncthreads();
		//dbg_array1[ tid ] = -sdata[ tid ];

	}

	// Store the result back in global memory
@@ -660,7 +692,7 @@ bool tnlCUDASimpleReduction4( const int size,
       }
       device_output_allocated = true;

       //cudaMalloc( ( void** ) &dbg_array1, desBlockSize * sizeof( T ) ); //!!!!!!!!!!!!!!!!!!!!!!!!
       cudaMalloc( ( void** ) &dbg_array1, desBlockSize * sizeof( T ) ); //!!!!!!!!!!!!!!!!!!!!!!!!
       //cudaMalloc( ( void** ) &dbg_array2, desBlockSize * sizeof( T ) ); //!!!!!!!!!!!!!!!!!!!!!!!!!
   }
   dim3 block_size( 0 ), grid_size( 0 );
@@ -670,18 +702,20 @@ bool tnlCUDASimpleReduction4( const int size,
   while( size_reduced > cpuThreshold )
   {
      block_size. x = :: Min( size_reduced, desBlockSize );
      grid_size. x = ( size_reduced / block_size. x + 1 ) / 2;
      grid_size. x = size_reduced / block_size. x / 2;
      if( grid_size. x * 2 * block_size. x < size_reduced )
    	  grid_size. x ++;
      shmem = block_size. x * sizeof( T );
      /*cout << "Size: " << size_reduced
      cout << "Size: " << size_reduced
           << " Grid size: " << grid_size. x
           << " Block size: " << block_size. x
           << " Shmem: " << shmem << endl;*/
           << " Shmem: " << shmem << endl;
      tnlCUDASimpleReductionKernel4< T, operation ><<< grid_size. x, block_size, shmem >>>( size_reduced, reduction_input, device_output, dbg_array1 );
      size_reduced = grid_size. x;
      reduction_input = device_output;

      // debuging part
      /*T* host_array = new T[ desBlockSize ];
      T* host_array = new T[ desBlockSize ];
      cudaMemcpy( host_array, dbg_array1,  desBlockSize * sizeof( T ), cudaMemcpyDeviceToHost );
      for( int i = 0; i< :: Min( ( int ) block_size. x, desBlockSize ); i ++ )
    	  cout << host_array[ i ] << " - ";
@@ -693,9 +727,12 @@ bool tnlCUDASimpleReduction4( const int size,
      for( int i = 0; i < size_reduced; i ++ )
    	  cout << output[ i ] << "   ";
      cout << endl;
      delete[] output;*/
      delete[] output;
   }
   T* host_output = new T[ size_reduced ];
   if( size == 1 )
	   cudaMemcpy( host_output, device_input, sizeof( T ), cudaMemcpyDeviceToHost );
   else
	   cudaMemcpy( host_output, device_output, size_reduced * sizeof( T ), cudaMemcpyDeviceToHost );
   result = host_output[ 0 ];
   for( int i = 1; i < size_reduced; i++ )
@@ -783,7 +820,7 @@ bool tnlCUDASimpleReduction3( const int size,
   while( size_reduced > cpuThreshold )
   {
      block_size. x = :: Min( size_reduced, desBlockSize );
      grid_size. x = size_reduced / block_size. x;
      grid_size. x = size_reduced / block_size. x + ( size_reduced % block_size. x != 0 );
      shmem = block_size. x * sizeof( T );
      /*cout << "Size: " << size_reduced
           << " Grid size: " << grid_size. x
@@ -803,6 +840,9 @@ bool tnlCUDASimpleReduction3( const int size,
      delete[] output;*/
   }
   T* host_output = new T[ size_reduced ];
   if( size == 1 )
   	   cudaMemcpy( host_output, device_input, sizeof( T ), cudaMemcpyDeviceToHost );
   else
	   cudaMemcpy( host_output, device_output, size_reduced * sizeof( T ), cudaMemcpyDeviceToHost );
   result = host_output[ 0 ];
   for( int i = 1; i < size_reduced; i++ )
@@ -891,7 +931,7 @@ bool tnlCUDASimpleReduction2( const int size,
   while( size_reduced > cpuThreshold )
   {
      block_size. x = :: Min( size_reduced, desBlockSize );
      grid_size. x = size_reduced / block_size. x;
      grid_size. x = size_reduced / block_size. x + ( size_reduced % block_size. x != 0 );
      shmem = block_size. x * sizeof( T );
      /*cout << "Size: " << size_reduced
           << " Grid size: " << grid_size. x
@@ -911,6 +951,9 @@ bool tnlCUDASimpleReduction2( const int size,
      delete[] output;*/
   }
   T* host_output = new T[ size_reduced ];
   if( size == 1 )
	   cudaMemcpy( host_output, device_input, sizeof( T ), cudaMemcpyDeviceToHost );
   else
	   cudaMemcpy( host_output, device_output, size_reduced * sizeof( T ), cudaMemcpyDeviceToHost );
   result = host_output[ 0 ];
   for( int i = 1; i < size_reduced; i++ )
@@ -998,7 +1041,7 @@ bool tnlCUDASimpleReduction1( const int size,
   while( size_reduced > cpuThreshold )
   {
      block_size. x = :: Min( size_reduced, desBlockSize );
      grid_size. x = size_reduced / block_size. x;
      grid_size. x = size_reduced / block_size. x + ( size_reduced % block_size. x != 0 );
      shmem = block_size. x * sizeof( T );
      /*cout << "Size: " << size_reduced
           << " Grid size: " << grid_size. x
@@ -1018,6 +1061,9 @@ bool tnlCUDASimpleReduction1( const int size,
      delete[] output;*/
   }
   T* host_output = new T[ size_reduced ];
   if( size == 1 )
	   cudaMemcpy( host_output, device_input, sizeof( T ), cudaMemcpyDeviceToHost );
   else
	   cudaMemcpy( host_output, device_output, size_reduced * sizeof( T ), cudaMemcpyDeviceToHost );
   result = host_output[ 0 ];
   for( int i = 1; i < size_reduced; i++ )
+127 −25
Original line number Diff line number Diff line
@@ -19,6 +19,7 @@
#define TNLCUDAKERNELSTESTER_H_

#include <iostream>
#include <math.h>
#include <cppunit/TestSuite.h>
#include <cppunit/TestResult.h>
#include <cppunit/TestCaller.h>
@@ -98,20 +99,21 @@ template< class T > class tnlCUDAKernelsTester : public CppUnit :: TestCase
	   return true;
   }

   void testReduction( int algorithm_efficiency = 0 )
   bool mainReduction( const tnlLongVector< T >& host_input,
		               int algorithm_efficiency,
		               const int desired_block_size,
		               const int desired_grid_size )
   {
	   int size = 32; 1024; //1<<10;
	   int desBlockSize = 128;    //Desired block size
	   int desGridSize = 2048;    //Impose limitation on grid size so that threads could perform sequential work

	   tnlLongVector< T > host_input;
	   const int size = host_input. GetSize();
	   tnlLongVectorCUDA< T > device_input;
	   CPPUNIT_ASSERT( testSetup( host_input,
		         	   device_input,
		         	   size )  );
	   if( ! device_input. SetNewSize( size ) )
		   return false;
	   device_input. copyFrom( host_input );

	   T seq_min( host_input[ 0 ] ),
		 seq_max( host_input[ 0 ] ),
		 seq_sum( host_input[ 0 ] );

	   for( int i = 1; i < size; i ++ )
	   {
		   seq_min = :: Min( seq_min, host_input[ i ] );
@@ -119,11 +121,6 @@ template< class T > class tnlCUDAKernelsTester : public CppUnit :: TestCase
		   seq_sum += host_input[ i ];
	   }

	   //Calculate necessary block/grid dimensions
	   int block_size = :: Min( size/2, desBlockSize );
	   //Grid size is limited in this case
	   int grid_size = :: Min( desGridSize, size / block_size / 2 );

	   T min, max, sum;
	   switch( algorithm_efficiency )
	   {
@@ -159,13 +156,118 @@ template< class T > class tnlCUDAKernelsTester : public CppUnit :: TestCase
	   }


	   cout << "Min: " << min << " Seq. min: " << seq_min << endl
			<< "Max: " << max << " Seq. max: " << seq_max << endl
			<< "Sum: " << sum << " Seq. sum: " << seq_sum << endl;
	   if( min == seq_min )
		   cout << "Min: " << min << " Seq. min: " << seq_min << " :-)" << endl;
	   else
		   cout << "Min: " << min << " Seq. min: " << seq_min << " !!!!!!!!!!" << endl;
	   if( max == seq_max )
		   cout	<< "Max: " << max << " Seq. max: " << seq_max << " :-)" << endl;
	   else
		   cout	<< "Max: " << max << " Seq. max: " << seq_max << " !!!!!!!!!!" << endl;
	   if( sum == seq_sum )
		   cout << "Sum: " << sum << " Seq. sum: " << seq_sum << " :-)" << endl;
	   else
		   cout << "Sum: " << sum << " Seq. sum: " << seq_sum << " !!!!!!!!!!" << endl;

	   T param;
	   if( GetParameterType( param ) == "float" )
	   {
		   CPPUNIT_ASSERT( min == seq_min );
		   CPPUNIT_ASSERT( max == seq_max );
		   if( sum == 0.0 )
		   {
			   CPPUNIT_ASSERT( sum == seq_sum );
		   }
		   else
		   {
			   double diff = ( ( double ) sum - ( double ) seq_sum ) / ( double) sum;
			   CPPUNIT_ASSERT( fabs( diff ) < 1.0e-5 );
		   }
	   }
	   else
	   {
		   CPPUNIT_ASSERT( min == seq_min );
		   CPPUNIT_ASSERT( max == seq_max );
		   CPPUNIT_ASSERT( sum == seq_sum );
	   }

   }

   void testReduction( int algorithm_efficiency = 0 )
   {
	   tnlLongVector< T > host_input;
	   int size = 2;
	   /*for( int s = 1; s < 12; s ++ )
	   {
		   tnlLongVector< T > host_input( size );

		   for( int i = 0; i < size; i ++ )
			   host_input[ i ] = 0.0;
		   mainReduction( host_input,
		   		          algorithm_efficiency,
		   		          256,
		   		          2048 );

		   for( int i = 0; i < size; i ++ )
			   host_input[ i ] = 1.0;
		   mainReduction( host_input,
		   		          algorithm_efficiency,
		   		          256,
		   		          2048 );

		   for( int i = 0; i < size; i ++ )
		   		   host_input[ i ] = i;
		    mainReduction( host_input,
						   algorithm_efficiency,
		   		   		   256,
		   		   		   2048 );

		    for( int i = 0; i < size; i ++ )
		    	host_input[ i ] = ( i - size / 2 ) * ( i - size / 2 );
		    mainReduction( host_input,
		                   algorithm_efficiency,
		    		   	   256,
		    		   	   2048 );
		    size *= 2;
	   }*/
	   for( size = 257; size < 5000; size ++ )
	   {
		   cout << "************* Size is " << size << " ******************* " << endl;
		   tnlLongVector< T > host_input( size );

		   /*for( int i = 0; i < size; i ++ )
			   host_input[ i ] = 0.0;
		   mainReduction( host_input,
				   algorithm_efficiency,
				   256,
				   2048 );*/

		   for( int i = 0; i < size; i ++ )
			   host_input[ i ] = 1.0;
		   mainReduction( host_input,
						   algorithm_efficiency,
						   256,
						   2048 );

		   for( int i = 0; i < size; i ++ )
			   host_input[ i ] = i;
		   mainReduction( host_input,
				   algorithm_efficiency,
				   256,
				   2048 );

		   for( int i = 0; i < size; i ++ )
			   host_input[ i ] = ( i - size / 2 ) * ( i - size / 2 );
		   mainReduction( host_input,
				   algorithm_efficiency,
				   256,
				   2048 );

	   	   }





   };

@@ -196,13 +298,13 @@ template< class T > class tnlCUDAKernelsTester : public CppUnit :: TestCase
   void testSimpleReduction2()
   {
	   cout << "Test reduction 2" << endl;
  	   testReduction( 2 );
  	   //testReduction( 2 );
   };

   void testSimpleReduction1()
   {
	   cout << "Test reduction 1" << endl;
	   testReduction( 1 );
	   //testReduction( 1 );
   };