Commit 0076d54f authored by Tomáš Oberhuber's avatar Tomáš Oberhuber
Browse files

Implementing CUDA parallel reduction.

parent 9a01097d
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -38,6 +38,7 @@ headers = tnlAssert.h \
		    tnlTester.h \
		    tnlVector.h \
		    tnl-cuda-kernels.h \
		    tnl-cuda-kernels.cu.h \
		    tnlCUDAKernelsTester.h \
		    compress-file.h \
		    mfilename.h \
+45 −47
Original line number Diff line number Diff line
@@ -22,76 +22,74 @@
using namespace std;

int tnlCUDAReductionMin( const int size,
                         const int block_size,
                         const int grid_size,
                         const int* input )
                         const int* input,
                         int& result,
                         int* device_aux_array = 0 )
{
   return tnlCUDAReduction< int, tnlMin >( size, block_size, grid_size, input );
   return tnlCUDAReduction< int, tnlMin >( size, input, result, device_aux_array );
}

int tnlCUDAReductionMax( const int size,
                         const int block_size,
                         const int grid_size,
                         const int* input )
                         const int* input,
                         int& result,
                         int* device_aux_array = 0 )
{
   return tnlCUDAReduction< int, tnlMax >( size, block_size, grid_size, input );
   return tnlCUDAReduction< int, tnlMax >( size, input, result, device_aux_array );
}

int tnlCUDAReductionSum( const int size,
                         const int block_size,
                         const int grid_size,
                         const int* input )
                         const int* input,
                         int& result,
                         int* device_aux_array = 0 )
{
   return tnlCUDAReduction< int, tnlSum >( size, block_size, grid_size, input );
   return tnlCUDAReduction< int, tnlSum >( size, input, result, device_aux_array );
}


float tnlCUDAReductionMin( const int size,
                           const int block_size,
                           const int grid_size,
                           const float* input )
bool tnlCUDAReductionMin( const int size,
                          const float* input,
                          float& result,
                          float* device_aux_array = 0 )
{
   return tnlCUDAReduction< float, tnlMin >( size, block_size, grid_size, input );
   return tnlCUDAReduction< float, tnlMin >( size, input, result, device_aux_array );
}

float tnlCUDAReductionMax( const int size,
                           const int block_size,
                           const int grid_size,
                           const float* input )
bool tnlCUDAReductionMax( const int size,
                          const float* input,
                          float& result,
                          float* device_aux_array = 0 )
{
   return tnlCUDAReduction< float, tnlMax >( size, block_size, grid_size, input );
   return tnlCUDAReduction< float, tnlMax >( size, input, result, device_aux_array );
}

float tnlCUDAReductionSum( const int size,
                           const int block_size,
                           const int grid_size,
                           const float* input )
bool tnlCUDAReductionSum( const int size,
                          const float* input,
                          float& result,
                          float* device_aux_array = 0 )
{
   return tnlCUDAReduction< float, tnlSum >( size, block_size, grid_size, input );
   return tnlCUDAReduction< float, tnlSum >( size, input, result, device_aux_array );
}

double tnlCUDAReductionMin( const int size,
                            const int block_size,
                            const int grid_size,
                            const double* input )
bool tnlCUDAReductionMin( const int size,
                          const double* input,
                          double& result,
                          double* device_aux_array = 0 )
{
   return tnlCUDAReduction< double, tnlMin >( size, block_size, grid_size, input );
   return tnlCUDAReduction< double, tnlMin >( size, input, result, device_aux_array );
}

double tnlCUDAReductionMax( const int size,
                            const int block_size,
                            const int grid_size,
                            const double* input )
bool tnlCUDAReductionMax( const int size,
                          const double* input,
                          double& result,
                          double* device_aux_array = 0 )
{
   return tnlCUDAReduction< double, tnlMax >( size, block_size, grid_size, input );
   return tnlCUDAReduction< double, tnlMax >( size, input, result, device_aux_array );
}

double tnlCUDAReductionSum( const int size,
                            const int block_size,
                            const int grid_size,
                            const double* input )
bool tnlCUDAReductionSum( const int size,
                          const double* input,
                          double& result,
                          double* device_aux_array = 0 )
{
   return tnlCUDAReduction< double, tnlSum >( size, block_size, grid_size, input );
   return tnlCUDAReduction< double, tnlSum >( size, input, result, device_aux_array );
}

/*
+222 −0
Original line number Diff line number Diff line
/***************************************************************************
                          tnl-cuda-kernels.cu.h  -  description
                             -------------------
    begin                : Jan 19, 2010
    copyright            : (C) 2010 by Tomas Oberhuber
    email                : tomas.oberhuber@fjfi.cvut.cz
 ***************************************************************************/

/***************************************************************************
 *                                                                         *
 *   This program is free software; you can redistribute it and/or modify  *
 *   it under the terms of the GNU General Public License as published by  *
 *   the Free Software Foundation; either version 2 of the License, or     *
 *   (at your option) any later version.                                   *
 *                                                                         *
 ***************************************************************************/

#ifndef TNLCUDAKERNELS_CU_H_
#define TNLCUDAKERNELS_CU_H_

int tnlCUDAReductionMin( const int size,
                         const int* input,
                         int& result,
                         int* device_aux_array = 0 );

int tnlCUDAReductionMax( const int size,
                         const int* input,
                         int& result,
                         int* device_aux_array = 0 );

int tnlCUDAReductionSum( const int size,
                         const int* input,
                         int& result,
                         int* device_aux_array = 0 );

bool tnlCUDAReductionMin( const int size,
                          const float* input,
                          float& result,
                          float* device_aux_array = 0 );

bool tnlCUDAReductionMax( const int size,
                          const float* input,
                          float& result,
                          float* device_aux_array = 0 );

bool tnlCUDAReductionSum( const int size,
                          const float* input,
                          float& result,
                          float* device_aux_array = 0 );

bool tnlCUDAReductionMin( const int size,
                          const double* input,
                          double& result,
                          double* device_aux_array = 0 );

bool tnlCUDAReductionMax( const int size,
                          const double* input,
                          double& result,
                          double* device_aux_array = 0 );

bool tnlCUDAReductionSum( const int size,
                          const double* input,
                          double& result,
                          double* device_aux_array = 0 );

/*
 * Simple reduction 5
 */
bool tnlCUDASimpleReduction5Min( const int size,
                                 const int* input,
                                 int& result );
bool tnlCUDASimpleReduction5Max( const int size,
                                 const int* input,
                                 int& result );
bool tnlCUDASimpleReduction5Sum( const int size,
                                 const int* input,
                                 int& result );
bool tnlCUDASimpleReduction5Min( const int size,
                                 const float* input,
                                 float& result);
bool tnlCUDASimpleReduction5Max( const int size,
                                 const float* input,
                                 float& result);
bool tnlCUDASimpleReduction5Sum( const int size,
                                 const float* input,
                                 float& result);
bool tnlCUDASimpleReduction5Min( const int size,
                                 const double* input,
                                 double& result);
bool tnlCUDASimpleReduction5Max( const int size,
                                 const double* input,
                                 double& result );
bool tnlCUDASimpleReduction5Sum( const int size,
                                 const double* input,
                                 double& result );

/*
 * Simple reduction 4
 */
bool tnlCUDASimpleReduction4Min( const int size,
                                 const int* input,
                                 int& result );
bool tnlCUDASimpleReduction4Max( const int size,
                                 const int* input,
                                 int& result );
bool tnlCUDASimpleReduction4Sum( const int size,
                                 const int* input,
                                 int& result );
bool tnlCUDASimpleReduction4Min( const int size,
                                 const float* input,
                                 float& result);
bool tnlCUDASimpleReduction4Max( const int size,
                                 const float* input,
                                 float& result);
bool tnlCUDASimpleReduction4Sum( const int size,
                                 const float* input,
                                 float& result);
bool tnlCUDASimpleReduction4Min( const int size,
                                 const double* input,
                                 double& result);
bool tnlCUDASimpleReduction4Max( const int size,
                                 const double* input,
                                 double& result );
bool tnlCUDASimpleReduction4Sum( const int size,
                                 const double* input,
                                 double& result );

/*
 * Simple reduction 3
 */
bool tnlCUDASimpleReduction3Min( const int size,
                          const int* input,
                          int& result );
bool tnlCUDASimpleReduction3Max( const int size,
                          const int* input,
                          int& result );
bool tnlCUDASimpleReduction3Sum( const int size,
                          const int* input,
                          int& result );
bool tnlCUDASimpleReduction3Min( const int size,
                            const float* input,
                            float& result);
bool tnlCUDASimpleReduction3Max( const int size,
                            const float* input,
                            float& result);
bool tnlCUDASimpleReduction3Sum( const int size,
                            const float* input,
                            float& result);
bool tnlCUDASimpleReduction3Min( const int size,
                             const double* input,
                             double& result);
bool tnlCUDASimpleReduction3Max( const int size,
                             const double* input,
                             double& result );
bool tnlCUDASimpleReduction3Sum( const int size,
                             const double* input,
                             double& result );

/*
 * Simple reduction 2
 */
bool tnlCUDASimpleReduction2Min( const int size,
                          const int* input,
                          int& result );
bool tnlCUDASimpleReduction2Max( const int size,
                          const int* input,
                          int& result );
bool tnlCUDASimpleReduction2Sum( const int size,
                          const int* input,
                          int& result );
bool tnlCUDASimpleReduction2Min( const int size,
                            const float* input,
                            float& result);
bool tnlCUDASimpleReduction2Max( const int size,
                            const float* input,
                            float& result);
bool tnlCUDASimpleReduction2Sum( const int size,
                            const float* input,
                            float& result);
bool tnlCUDASimpleReduction2Min( const int size,
                             const double* input,
                             double& result);
bool tnlCUDASimpleReduction2Max( const int size,
                             const double* input,
                             double& result );
bool tnlCUDASimpleReduction2Sum( const int size,
                             const double* input,
                             double& result );

/*
 * Simple reduction 1
 */
bool tnlCUDASimpleReduction1Min( const int size,
                          const int* input,
                          int& result );
bool tnlCUDASimpleReduction1Max( const int size,
                          const int* input,
                          int& result );
bool tnlCUDASimpleReduction1Sum( const int size,
                          const int* input,
                          int& result );
bool tnlCUDASimpleReduction1Min( const int size,
                            const float* input,
                            float& result);
bool tnlCUDASimpleReduction1Max( const int size,
                            const float* input,
                            float& result);
bool tnlCUDASimpleReduction1Sum( const int size,
                            const float* input,
                            float& result);
bool tnlCUDASimpleReduction1Min( const int size,
                             const double* input,
                             double& result);
bool tnlCUDASimpleReduction1Max( const int size,
                             const double* input,
                             double& result );
bool tnlCUDASimpleReduction1Sum( const int size,
                             const double* input,
                             double& result );


#endif /* TNLCUDAKERNELS_CU_H_ */
+164 −169
Original line number Diff line number Diff line
@@ -56,18 +56,18 @@ __global__ void tnlCUDAReductionKernel( const int size,
	                                T* d_output,
	                                T* dbg_array1 = 0  )
{
	extern __shared__ T sdata[];
   extern __shared__ __align__ ( 8 ) T sdata[];

   // Read data into the shared memory
   int tid = threadIdx. x;
   int gid = 2 * blockIdx. x * blockDim. x + threadIdx. x;
   // Last thread ID which manipulates meaningful data

	int grid_size = 2 * blockSize * gridDim. x;
   //int grid_size = 2 * blockSize * gridDim. x;
   if( gid + blockSize < size )
   {
		if( operation == tnlMin ) sdata[ tid ] = :: Min( d_input[ gid ], d_input[ gid + blockSize ] );
		if( operation == tnlMax ) sdata[ tid ] = :: Max( d_input[ gid ], d_input[ gid + blockSize ] );
      if( operation == tnlMin ) sdata[ tid ] = :: tnlCudaMin( d_input[ gid ], d_input[ gid + blockSize ] );
      if( operation == tnlMax ) sdata[ tid ] = :: tnlCudaMax( d_input[ gid ], d_input[ gid + blockSize ] );
      if( operation == tnlSum ) sdata[ tid ] = d_input[ gid ] + d_input[ gid + blockSize ];
   }
   else
@@ -78,10 +78,10 @@ __global__ void tnlCUDAReductionKernel( const int size,

   while( gid < size )
   {
		if( operation == tnlMin ) sdata[ tid ] = :: Min( sdata[ tid ], :: Min( d_input[gid], d_input[gid+blockSize] );
		if( operation == tnlMax ) sdata[ tid ] = :: Max( sdata[ tid ], :: Max( d_input[gid], d_input[gid+blockSize] );
      if( operation == tnlMin ) sdata[ tid ] = :: tnlCudaMin( sdata[ tid ], :: tnlCudaMin( d_input[gid], d_input[gid+blockSize] ) );
      if( operation == tnlMax ) sdata[ tid ] = :: tnlCudaMax( sdata[ tid ], :: tnlCudaMax( d_input[gid], d_input[gid+blockSize] ) );
      if( operation == tnlSum ) sdata[ tid ] += d_input[gid] + d_input[ gid + blockSize ];
		gid += gridSize;
      gid += grid_size;
   }
   __syncthreads();

@@ -220,8 +220,7 @@ bool tnlCUDAReduction( const int size,
   //Calculate necessary block/grid dimensions
   const int cpuThreshold = 1;
   const int desBlockSize = 16;    //Desired block size

   T* dbg_array1;
   const int desGridSize = 2048;

   bool device_output_allocated( false );
   if( ! device_output )
@@ -234,9 +233,6 @@ bool tnlCUDAReduction( const int size,
    	   return false;
       }
       device_output_allocated = true;

       //cudaMalloc( ( void** ) &dbg_array1, desBlockSize * sizeof( T ) ); //!!!!!!!!!!!!!!!!!!!!!!!!
       //cudaMalloc( ( void** ) &dbg_array2, desBlockSize * sizeof( T ) ); //!!!!!!!!!!!!!!!!!!!!!!!!!
   }
   dim3 block_size( 0 ), grid_size( 0 );
   int shmem;
@@ -245,42 +241,41 @@ bool tnlCUDAReduction( const int size,
   while( size_reduced > cpuThreshold )
   {
      block_size. x = :: Min( size_reduced, desBlockSize );
      grid_size. x = :: Min( ( size_reduced / block_size. x + 1 ) / 2, desGridSize );
      grid_size. x = :: Min( ( int ) ( size_reduced / block_size. x + 1 ) / 2, desGridSize );
      shmem = block_size. x * sizeof( T );
      cout << "Size: " << size_reduced
           << " Grid size: " << grid_size. x
           << " Block size: " << block_size. x
           << " Shmem: " << shmem << endl;
      //tnlCUDASimpleReductionKernel4< T, operation ><<< grid_size. x, block_size, shmem >>>( size_reduced, reduction_input, device_output );
      tnlAssert( shmem < 16384, cerr << shmem << " bytes are required." );
      switch( block_size. x )
      {
		  case 512:
			  tnlCUDASimpleReductionKernel5< T, operation, 512 ><<< grid_size. x, block_size, shmem >>>( size_reduced, grid_size. x, reduction_input, device_output );
			  tnlCUDAReductionKernel< T, operation, 512 ><<< grid_size. x, block_size, shmem >>>( size_reduced, grid_size. x, reduction_input, device_output );
			  break;
		  case 256:
			  tnlCUDASimpleReductionKernel5< T, operation, 256 ><<< grid_size. x, block_size, shmem >>>( size_reduced, grid_size. x, reduction_input, device_output );
			  tnlCUDAReductionKernel< T, operation, 256 ><<< grid_size. x, block_size, shmem >>>( size_reduced, grid_size. x, reduction_input, device_output );
			  break;
		  case 128:
			  tnlCUDASimpleReductionKernel5< T, operation, 128 ><<< grid_size. x, block_size, shmem >>>( size_reduced, grid_size. x, reduction_input, device_output );
			  tnlCUDAReductionKernel< T, operation, 128 ><<< grid_size. x, block_size, shmem >>>( size_reduced, grid_size. x, reduction_input, device_output );
			  break;
		  case  64:
			  tnlCUDASimpleReductionKernel5< T, operation,  64 ><<< grid_size. x, block_size, shmem >>>( size_reduced, grid_size. x, reduction_input, device_output );
			  tnlCUDAReductionKernel< T, operation,  64 ><<< grid_size. x, block_size, shmem >>>( size_reduced, grid_size. x, reduction_input, device_output );
			  break;
		  case  32:
			  tnlCUDASimpleReductionKernel5< T, operation,  32 ><<< grid_size. x, block_size, shmem >>>( size_reduced, grid_size. x, reduction_input, device_output );
			  tnlCUDAReductionKernel< T, operation,  32 ><<< grid_size. x, block_size, shmem >>>( size_reduced, grid_size. x, reduction_input, device_output );
			  break;
		  case  16:
			  tnlCUDASimpleReductionKernel5< T, operation,  16 ><<< grid_size. x, block_size, shmem >>>( size_reduced, grid_size. x, reduction_input, device_output );
			  tnlCUDAReductionKernel< T, operation,  16 ><<< grid_size. x, block_size, shmem >>>( size_reduced, grid_size. x, reduction_input, device_output );
			  break;
		  case   8:
			  tnlCUDASimpleReductionKernel5< T, operation,   8 ><<< grid_size. x, block_size, shmem >>>( size_reduced, grid_size. x, reduction_input, device_output );
			  tnlCUDAReductionKernel< T, operation,   8 ><<< grid_size. x, block_size, shmem >>>( size_reduced, grid_size. x, reduction_input, device_output );
			  break;
		  case   4:
			  tnlCUDASimpleReductionKernel5< T, operation,   4 ><<< grid_size. x, block_size, shmem >>>( size_reduced, grid_size. x, reduction_input, device_output );
			  tnlCUDAReductionKernel< T, operation,   4 ><<< grid_size. x, block_size, shmem >>>( size_reduced, grid_size. x, reduction_input, device_output );
			  break;
		  case   2:
			  tnlCUDASimpleReductionKernel5< T, operation,   2 ><<< grid_size. x, block_size, shmem >>>( size_reduced, grid_size. x, reduction_input, device_output );
			  tnlCUDAReductionKernel< T, operation,   2 ><<< grid_size. x, block_size, shmem >>>( size_reduced, grid_size. x, reduction_input, device_output );
			  break;
		  case   1:
			  tnlAssert( false, cerr << "blockSize should not be 1." << endl );
+15 −200

File changed.

Preview size limit exceeded, changes collapsed.

Loading