Commit a37a9195 authored by Tomáš Oberhuber's avatar Tomáš Oberhuber
Browse files

Implementing CUDA parallel reduction.

parent 40cb098f
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -44,7 +44,7 @@ then
   STORE_CPPFLAGS=$CPPFLAGS
   STORE_LDFLAGS=$LDFLAGS
   CUDA_LDFLAGS="-L$CUDA_LIBS"
   CUDA_CXXFLAGS="-I$CUDA_HEADERS -DHAVE_CUDA -arch sm_13"
   CUDA_CXXFLAGS="-I$CUDA_HEADERS -DHAVE_CUDA"
   CXXFLAGS="$CXXFLAGS -I$CUDA_HEADERS"
   CPPFLAGS="$CPPFLAGS -I$CUDA_HEADERS"
   LDFLAGS="$LDFLAGS $CUDA_LDFLAGS"
+2 −0
Original line number Diff line number Diff line
@@ -61,4 +61,6 @@ tnl_unit_tests_dbg_LDADD = libtnl-dbg-0.1.la \
                           core/libcore-tests-dbg.la
endif



TESTS = tnl-unit-tests
 No newline at end of file
+1 −27
Original line number Diff line number Diff line
@@ -104,33 +104,7 @@ int tnlCUDASimpleReduction1Min( const int size,
                                const int* input,
                                int* output )
{
   //Calculate necessary block/grid dimensions
   const int cpuThreshold = 1;
   const int desBlockSize = 128;    //Desired block size   
   dim3 blockSize = :: Min( size, desBlockSize );
   dim3 gridSize = size / blockSize. x;
   unsigned int shmem = blockSize. x * sizeof( int );
   cout << "Grid size: " << gridSize. x << endl 
        << "Block size: " << blockSize. x << endl
        << "Shmem: " << shmem << endl;
   tnlCUDASimpleReductionKernel1< int, tnlMin ><<< gridSize, blockSize, shmem >>>( size, input, output );
   int sizeReduced = gridSize. x;
   while( sizeReduced > cpuThreshold )
   {
      cout << "Reducing with size reduced = " << sizeReduced << endl;
      blockSize. x = :: Min( sizeReduced, desBlockSize );
      gridSize. x = sizeReduced / blockSize. x;
      shmem = blockSize. x * sizeof(int);
      tnlCUDASimpleReductionKernel1< int, tnlMin ><<< gridSize, blockSize, shmem >>>( size, input, output );
      sizeReduced = gridSize. x;
   }
   int* host_output = new int[ sizeReduced ];
   cudaMemcpy( host_output, output, sizeReduced * sizeof(int), cudaMemcpyDeviceToHost );
   int result = host_output[ 0 ];
   for( int i = 1;i < sizeReduced; i++ )
        result = :: Min( result, host_output[ i ] );
   delete[] host_output;
   return result;

}

int tnlCUDASimpleReduction1Max( const int size,
+41 −45
Original line number Diff line number Diff line
@@ -273,52 +273,48 @@ __global__ void tnlCUDASimpleReductionKernel1( const int size,

template< class T, tnlOperation operation >
T tnlCUDASimpleReduction1( const int size,
					       const int block_size,
					       const int grid_size,
	                       const T* d_input )
	                   const T* d_input,
	                   T* output = 0 )
{
	T result;

	dim3 blockSize( block_size );
	dim3 gridSize( grid_size );
	int shmem = 512 * sizeof( T );
	tnlAssert( shmem < 16384, cerr << shmem << " bytes are required." );
	switch( block_size )
   //Calculate necessary block/grid dimensions
   const int cpuThreshold = 1;
   const int desBlockSize = 128;    //Desired block size
   dim3 block_size = :: Min( size, desBlockSize );
   dim3 grid_size = size / blockSize. x;
   unsigned int shmem = blockSize. x * sizeof( int );
   cout << "Grid size: " << grid_size. x << endl
        << "Block size: " << block_size. x << endl
        << "Shmem: " << shmem << endl;
   if( ! output )
   {
		case 512:
	        tnlCUDASimpleReductionKernel1< T, operation, 512 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result );
	        break;
	    case 256:
	    	tnlCUDASimpleReductionKernel1< T, operation, 256 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result );
	    	break;
	    case 128:
	    	tnlCUDASimpleReductionKernel1< T, operation, 128 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result );
	    	break;
	    case  64:
	    	tnlCUDASimpleReductionKernel1< T, operation,  64 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result );
	    	break;
	    case  32:
	    	tnlCUDASimpleReductionKernel1< T, operation,  32 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result );
	    	break;
	    case  16:
	    	tnlCUDASimpleReductionKernel1< T, operation,  16 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result );
	    	break;
	    case   8:
	    	tnlCUDASimpleReductionKernel1< T, operation,   8 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result );
	    	break;
	    case   4:
	    	tnlCUDAReductionKernel< T, operation,   4 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result );
	    	break;
	    case   2:
	    	tnlCUDAReductionKernel< T, operation,   2 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result );
	    	break;
	    case   1:
	    	tnlCUDAReductionKernel< T, operation,   1 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result );
	    	break;
	    default:
	    	tnlAssert( false, cerr << "Block size is " << block_size << " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." );
	    	break;
      cudaMalloc( ( void** ) &output, grid_size * sizeof( T ) );
      if( cuda // TODO: add allocation check
   }
   tnlCUDASimpleReductionKernel1< T, operation ><<< grid_size, block_size, shmem >>>( size, input, output );
   int size_reduced = grid_size. x;
   while( sizeReduced > cpuThreshold )
   {
      cout << "Reducing with size reduced = " << size_reduced << endl;
      block_size. x = :: Min( size_reduced, desBlockSize );
      grid_size. x = size_reduced / block_size. x;
      shmem = block_size. x * sizeof(int);
      tnlCUDASimpleReductionKernel1< T, operation ><<< grid_size, block_size, shmem >>>( size, input, output );
      size_reduced = grid_size. x;
   }
   int* host_output = new int[ size_reduced ];
   cudaMemcpy( host_output, output, sizeReduced * sizeof(int), cudaMemcpyDeviceToHost );
   int result = host_output[ 0 ];
   for( int i = 1;i < size_reduced; i++ )
   {
      if( operation == tnlMin)
         result = :: Min( result, host_output[ i ] );
      if( operation == tnlMax )
         result = :: Max( result, host_output[ i ] );
      if( operation == tnlSum )
         result += host_ouput[ i ];
   }
   delete[] host_output;

   return result;
}

+10 −8
Original line number Diff line number Diff line
@@ -15,6 +15,15 @@ __global__ void setMultiBlockNumber( const T c, T* A, const int size )
   if( i < size ) A[ i ] = c;
};

template< class T >
__global__ void setNumber( const T c, T* A, const int size )
{
   int i = threadIdx. x;
   if( i < size )
      A[ i ] = c;
};


template< class T >
void testMultiBlockKernel( const T& number, const int size )
{
@@ -37,20 +46,13 @@ void testMultiBlockKernel( const T& number, const int size )
   CPPUNIT_ASSERT( ! errors );
};

template< class T >
__global__ void setNumber( T* A, const T c )
{
   int i = threadIdx. x;
   A[ i ] = c;
};

template< class T >
void testKernel( const T& number, const int size )
{
   tnlLongVectorCUDA< T > device_vector( size );
   tnlLongVector< T > host_vector( size );
   T* data = device_vector. Data();
   setNumber<<< 1, size >>>( data, number );
   setNumber<<< 1, size >>>( number, data, size );
   host_vector. copyFrom( device_vector );

   int errors( 0 );
Loading