Commit 025646c7 authored by Tomáš Oberhuber's avatar Tomáš Oberhuber
Browse files

Implementing CUDA parallel reduction.

parent 016f90e1
Loading
Loading
Loading
Loading
+51 −38
Original line number Diff line number Diff line
@@ -627,13 +627,13 @@ __global__ void tnlCUDASimpleReductionKernel4( const int size,
		sdata[ tid ] = d_input[ gid ];
	}
	__syncthreads();
	dbg_array1[ tid ] = sdata[ tid ];
	//dbg_array1[ tid ] = tid; //sdata[ tid ];

	// Parallel reduction
	int n = last_tid < blockDim. x ? last_tid : blockDim. x;
	for( int s = n / 2; s > 0; s >>= 1 )
	{
		if( tid < s && tid + s < n )
		if( tid < s )
		{
			if( operation == tnlMin )
				sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + s ] );
@@ -645,16 +645,15 @@ __global__ void tnlCUDASimpleReductionKernel4( const int size,
		/* This is for the case when we have odd number of elements.
		 * The last one will be reduced using the thread with ID 0.
		 */
		if( 2 * s < n && tid == 0 )
		if( 2 * s < n && tid == n - 1 )
		{
			if( operation == tnlMin )
				sdata[ 0 ] = tnlCudaMin( sdata[ 0 ], sdata[ n ] );
				sdata[ 0 ] = tnlCudaMin( sdata[ 0 ], sdata[ tid ] );
			if( operation == tnlMax )
				sdata[ 0 ] = tnlCudaMax( sdata[ 0 ], sdata[ n ] );
				sdata[ 0 ] = tnlCudaMax( sdata[ 0 ], sdata[ tid ] );
			if( operation == tnlSum )
				sdata[ 0 ] += sdata[ n ];
			dbg_array1[ n ] = -555; //sdata[ 0 ];

				sdata[ 0 ] += sdata[ tid ];
			dbg_array1[ 0 ] = sdata[ tid ];
		}
		n = s;

@@ -663,7 +662,7 @@ __global__ void tnlCUDASimpleReductionKernel4( const int size,

	}

	// Store the result back in global memory
	// Store the result back in the global memory
	if( tid == 0 )
		d_output[ blockIdx. x ] = sdata[ 0 ];
}
@@ -692,7 +691,7 @@ bool tnlCUDASimpleReduction4( const int size,
       }
       device_output_allocated = true;

       cudaMalloc( ( void** ) &dbg_array1, desBlockSize * sizeof( T ) ); //!!!!!!!!!!!!!!!!!!!!!!!!
       //cudaMalloc( ( void** ) &dbg_array1, desBlockSize * sizeof( T ) ); //!!!!!!!!!!!!!!!!!!!!!!!!
       //cudaMalloc( ( void** ) &dbg_array2, desBlockSize * sizeof( T ) ); //!!!!!!!!!!!!!!!!!!!!!!!!!
   }
   dim3 block_size( 0 ), grid_size( 0 );
@@ -706,19 +705,19 @@ bool tnlCUDASimpleReduction4( const int size,
      if( grid_size. x * 2 * block_size. x < size_reduced )
    	  grid_size. x ++;
      shmem = block_size. x * sizeof( T );
      cout << "Size: " << size_reduced
      /*cout << "Size: " << size_reduced
           << " Grid size: " << grid_size. x
           << " Block size: " << block_size. x
           << " Shmem: " << shmem << endl;
           << " Shmem: " << shmem << endl;*/
      tnlCUDASimpleReductionKernel4< T, operation ><<< grid_size. x, block_size, shmem >>>( size_reduced, reduction_input, device_output, dbg_array1 );
      size_reduced = grid_size. x;
      reduction_input = device_output;

      // debuging part
      T* host_array = new T[ desBlockSize ];
      /*T* host_array = new T[ desBlockSize ];
      cudaMemcpy( host_array, dbg_array1,  desBlockSize * sizeof( T ), cudaMemcpyDeviceToHost );
      for( int i = 0; i< :: Min( ( int ) block_size. x, desBlockSize ); i ++ )
    	  cout << host_array[ i ] << " - ";
    	  cout << host_array[ i ] << " ";
      cout << endl;

      T* output = new T[ size_reduced ];
@@ -727,7 +726,7 @@ bool tnlCUDASimpleReduction4( const int size,
      for( int i = 0; i < size_reduced; i ++ )
    	  cout << output[ i ] << "   ";
      cout << endl;
      delete[] output;
      delete[] output;*/
   }
   T* host_output = new T[ size_reduced ];
   if( size == 1 )
@@ -772,7 +771,8 @@ __global__ void tnlCUDASimpleReductionKernel3( const int size,
	__syncthreads();

	// Parallel reduction
	for( int s = blockDim. x / 2; s > 0; s >>= 1 )
	int n = last_tid < blockDim. x ? last_tid : blockDim. x;
	for( int s = n / 2; s > 0; s >>= 1 )
	{
		if( tid < s && tid + s < last_tid )
		{
@@ -783,6 +783,19 @@ __global__ void tnlCUDASimpleReductionKernel3( const int size,
			if( operation == tnlSum )
				sdata[ tid ] += sdata[ tid + s ];
		}
		/* This is for the case when we have odd number of elements.
		 * The last one will be reduced using the thread with ID 0.
		 */
		if( 2 * s < n && tid == 0 )
		{
			if( operation == tnlMin )
				sdata[ 0 ] = tnlCudaMin( sdata[ 0 ], sdata[ n - 1 ] );
			if( operation == tnlMax )
				sdata[ 0 ] = tnlCudaMax( sdata[ 0 ], sdata[ n - 1 ] );
			if( operation == tnlSum )
				sdata[ 0 ] += sdata[ n - 1 ];
		}
		n = s;
		__syncthreads();
	}

+34 −22
Original line number Diff line number Diff line
@@ -156,7 +156,7 @@ template< class T > class tnlCUDAKernelsTester : public CppUnit :: TestCase
	   }


	   if( min == seq_min )
	   /*if( min == seq_min )
		   cout << "Min: " << min << " Seq. min: " << seq_min << " :-)" << endl;
	   else
		   cout << "Min: " << min << " Seq. min: " << seq_min << " !!!!!!!!!!" << endl;
@@ -167,11 +167,16 @@ template< class T > class tnlCUDAKernelsTester : public CppUnit :: TestCase
	   if( sum == seq_sum )
		   cout << "Sum: " << sum << " Seq. sum: " << seq_sum << " :-)" << endl;
	   else
		   cout << "Sum: " << sum << " Seq. sum: " << seq_sum << " !!!!!!!!!!" << endl;
		   cout << "Sum: " << sum << " Seq. sum: " << seq_sum << " !!!!!!!!!!" << endl;*/

	   T param;
	   if( GetParameterType( param ) == "float" )
	   if( GetParameterType( param ) == "float" ||
		   GetParameterType( param ) == "double" )
	   {
		   if( min != seq_min )
			   cerr << "Diff. min = " << min << " seq. min = " << seq_min;
		   if( max != seq_max )
			   cerr << "Diff. max = " << max << " seq. max = " << seq_max;
		   CPPUNIT_ASSERT( min == seq_min );
		   CPPUNIT_ASSERT( max == seq_max );
		   if( sum == 0.0 )
@@ -181,11 +186,19 @@ template< class T > class tnlCUDAKernelsTester : public CppUnit :: TestCase
		   else
		   {
			   double diff = ( ( double ) sum - ( double ) seq_sum ) / ( double) sum;
			   if( fabs( diff > 1.0e-5 ) )
				   cerr << "Diff is " << diff << " for " << GetParameterType( param ) << endl;
			   CPPUNIT_ASSERT( fabs( diff ) < 1.0e-5 );
		   }
	   }
	   else
	   {
		   if( min != seq_min )
			   cerr << "Diff. min = " << min << " seq. min = " << seq_min;
		   if( max != seq_max )
			   cerr << "Diff. max = " << max << " seq. max = " << seq_max;
		   if( sum != seq_sum )
			   cerr << "Diff. sum = " << sum << " seq. sum = " << seq_sum;
		   CPPUNIT_ASSERT( min == seq_min );
		   CPPUNIT_ASSERT( max == seq_max );
		   CPPUNIT_ASSERT( sum == seq_sum );
@@ -197,10 +210,11 @@ template< class T > class tnlCUDAKernelsTester : public CppUnit :: TestCase
   {
	   tnlLongVector< T > host_input;
	   int size = 2;
	   /*for( int s = 1; s < 12; s ++ )
	   for( int s = 1; s < 12; s ++ )
	   {
		   tnlLongVector< T > host_input( size );

		   cout << "Alg. " << algorithm_efficiency << "Testing zeros with size "  << size << " ";
		   for( int i = 0; i < size; i ++ )
			   host_input[ i ] = 0.0;
		   mainReduction( host_input,
@@ -208,20 +222,21 @@ template< class T > class tnlCUDAKernelsTester : public CppUnit :: TestCase
		   		          256,
		   		          2048 );

		   cout << "Alg. " << algorithm_efficiency  << "Testing ones with size "  << size << " ";
		   for( int i = 0; i < size; i ++ )
			   host_input[ i ] = 1.0;
		   mainReduction( host_input,
		   		          algorithm_efficiency,
		   		          256,
		   		          2048 );

		   cout << "Alg. " << algorithm_efficiency  << "Testing linear sequence with size "  << size << " ";
		   for( int i = 0; i < size; i ++ )
		   		   host_input[ i ] = i;
		    mainReduction( host_input,
						   algorithm_efficiency,
		   		   		   256,
		   		   		   2048 );

		    cout << "Alg. " << algorithm_efficiency  << "Testing quadratic sequence with size "  << size << " ";
		    for( int i = 0; i < size; i ++ )
		    	host_input[ i ] = ( i - size / 2 ) * ( i - size / 2 );
		    mainReduction( host_input,
@@ -229,46 +244,43 @@ template< class T > class tnlCUDAKernelsTester : public CppUnit :: TestCase
		    		   	   256,
		    		   	   2048 );
		    size *= 2;
	   }*/
	   for( size = 257; size < 5000; size ++ )
		    cout << endl;
	   }
	   for( size = 1; size < 5000; size ++ )
	   {
		   cout << "************* Size is " << size << " ******************* " << endl;
		   tnlLongVector< T > host_input( size );

		   /*for( int i = 0; i < size; i ++ )
		   cout << "Alg. " << algorithm_efficiency  << "Testing zeros with size "  << size << " ";
		   for( int i = 0; i < size; i ++ )
			   host_input[ i ] = 0.0;
		   mainReduction( host_input,
				   algorithm_efficiency,
				   256,
				   2048 );*/
				   2048 );

		   cout << "Alg. " << algorithm_efficiency  << "Testing ones with size "  << size << " ";
		   for( int i = 0; i < size; i ++ )
			   host_input[ i ] = 1.0;
		   mainReduction( host_input,
						   algorithm_efficiency,
						   256,
						   2048 );

		   cout << "Alg. " << algorithm_efficiency  << "Testing linear sequence with size "  << size << " ";
		   for( int i = 0; i < size; i ++ )
			   host_input[ i ] = i;
		   mainReduction( host_input,
				   algorithm_efficiency,
				   256,
				   2048 );

		   cout << "Alg. " << algorithm_efficiency  << "Testing quadratic sequence with size "  << size << " ";
		   for( int i = 0; i < size; i ++ )
			   host_input[ i ] = ( i - size / 2 ) * ( i - size / 2 );
		   mainReduction( host_input,
				   algorithm_efficiency,
				   256,
				   2048 );

		   cout << endl;
	   }





   };

   void testReduction()
@@ -298,13 +310,13 @@ template< class T > class tnlCUDAKernelsTester : public CppUnit :: TestCase
   void testSimpleReduction2()
   {
	   cout << "Test reduction 2" << endl;
  	   //testReduction( 2 );
  	   testReduction( 2 );
   };

   void testSimpleReduction1()
   {
	   cout << "Test reduction 1" << endl;
	   //testReduction( 1 );
	   testReduction( 1 );
   };