Commit 40cb098f authored by Tomáš Oberhuber

Almost working simple parallel reduction in CUDA.

parent d70edfe8
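For orientation, the commit implements the standard two-level tree reduction: each CUDA block reduces its chunk of the input in shared memory and writes one partial result, and the host relaunches the kernel until only a handful of values remain. A minimal sketch of the idea, assuming a power-of-two block size and illustrative names (this is not the committed code, which supports min/max/sum via a tnlOperation template parameter and uses interleaved addressing):

    // sum-reduce one block's chunk into partials[ blockIdx.x ]
    template< class T >
    __global__ void blockSum( const int size, const T* in, T* partials )
    {
       extern __shared__ char smem[];
       T* s = reinterpret_cast< T* >( smem );
       const int tid = threadIdx.x;
       const int gid = blockIdx.x * blockDim.x + tid;
       s[ tid ] = gid < size ? in[ gid ] : T( 0 );  // pad with the identity
       __syncthreads();
       for( int stride = blockDim.x / 2; stride > 0; stride /= 2 )
       {
          if( tid < stride )
             s[ tid ] += s[ tid + stride ];
          __syncthreads();
       }
       if( tid == 0 )
          partials[ blockIdx.x ] = s[ 0 ];
    }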
@@ -15,7 +15,11 @@
* *
***************************************************************************/
#include <iostream>
#include <core/mfuncs.h>
#include <core/tnl-cuda-kernels.h>
using namespace std;
int tnlCUDAReductionMin( const int size,
const int block_size,
@@ -90,4 +94,160 @@ double tnlCUDAReductionSum( const int size,
return tnlCUDAReduction< double, tnlSum >( size, block_size, grid_size, input );
}
/*
 * Simple reduction 1
 */
int tnlCUDASimpleReduction1Min( const int size,
const int block_size,
const int grid_size,
const int* input,
int* output )
{
   //Calculate necessary block/grid dimensions
   const int cpuThreshold = 1;
   const int desBlockSize = 128;  //Desired block size
   dim3 blockSize = :: Min( size, desBlockSize );
   // Round the grid size up so that a partial last block is not dropped
   dim3 gridSize = ( size + blockSize. x - 1 ) / blockSize. x;
   unsigned int shmem = blockSize. x * sizeof( int );
   cout << "Grid size: " << gridSize. x << endl
        << "Block size: " << blockSize. x << endl
        << "Shmem: " << shmem << endl;
   tnlCUDASimpleReductionKernel1< int, tnlMin ><<< gridSize, blockSize, shmem >>>( size, input, output );
   int sizeReduced = gridSize. x;
   while( sizeReduced > cpuThreshold )
   {
      cout << "Reducing with size reduced = " << sizeReduced << endl;
      blockSize. x = :: Min( sizeReduced, desBlockSize );
      gridSize. x = ( sizeReduced + blockSize. x - 1 ) / blockSize. x;
      shmem = blockSize. x * sizeof( int );
      // Reduce the partial results of the previous pass, not the original input
      tnlCUDASimpleReductionKernel1< int, tnlMin ><<< gridSize, blockSize, shmem >>>( sizeReduced, output, output );
      sizeReduced = gridSize. x;
   }
   int* host_output = new int[ sizeReduced ];
   cudaMemcpy( host_output, output, sizeReduced * sizeof( int ), cudaMemcpyDeviceToHost );
   int result = host_output[ 0 ];
   for( int i = 1; i < sizeReduced; i ++ )
      result = :: Min( result, host_output[ i ] );
   delete[] host_output;
   return result;
}
int tnlCUDASimpleReduction1Max( const int size,
const int block_size,
const int grid_size,
const int* input,
int* output )
{
   //Calculate necessary block/grid dimensions
   const int cpuThreshold = 1;
   const int desBlockSize = 128;  //Desired block size
   dim3 blockSize = :: Min( size, desBlockSize );
   // Round the grid size up so that a partial last block is not dropped
   dim3 gridSize = ( size + blockSize. x - 1 ) / blockSize. x;
   unsigned int shmem = blockSize. x * sizeof( int );
   cout << "Grid size: " << gridSize. x << endl
        << "Block size: " << blockSize. x << endl
        << "Shmem: " << shmem << endl;
   tnlCUDASimpleReductionKernel1< int, tnlMax ><<< gridSize, blockSize, shmem >>>( size, input, output );
   int sizeReduced = gridSize. x;
   while( sizeReduced > cpuThreshold )
   {
      cout << "Reducing with size reduced = " << sizeReduced << endl;
      blockSize. x = :: Min( sizeReduced, desBlockSize );
      gridSize. x = ( sizeReduced + blockSize. x - 1 ) / blockSize. x;
      shmem = blockSize. x * sizeof( int );
      // Reduce the partial results of the previous pass, not the original input
      tnlCUDASimpleReductionKernel1< int, tnlMax ><<< gridSize, blockSize, shmem >>>( sizeReduced, output, output );
      sizeReduced = gridSize. x;
   }
   int* host_output = new int[ sizeReduced ];
   cudaMemcpy( host_output, output, sizeReduced * sizeof( int ), cudaMemcpyDeviceToHost );
   int result = host_output[ 0 ];
   for( int i = 1; i < sizeReduced; i ++ )
      result = :: Max( result, host_output[ i ] );
   delete[] host_output;
   return result;
}
int tnlCUDASimpleReduction1Sum( const int size,
const int block_size,
const int grid_size,
const int* input,
int* output )
{
   //Calculate necessary block/grid dimensions
   const int cpuThreshold = 1;
   const int desBlockSize = 128;  //Desired block size
   dim3 blockSize = :: Min( size, desBlockSize );
   // Round the grid size up so that a partial last block is not dropped
   dim3 gridSize = ( size + blockSize. x - 1 ) / blockSize. x;
   unsigned int shmem = blockSize. x * sizeof( int );
   cout << "Grid size: " << gridSize. x << endl
        << "Block size: " << blockSize. x << endl
        << "Shmem: " << shmem << endl;
   tnlCUDASimpleReductionKernel1< int, tnlSum ><<< gridSize, blockSize, shmem >>>( size, input, output );
   int sizeReduced = gridSize. x;
   while( sizeReduced > cpuThreshold )
   {
      cout << "Reducing with size reduced = " << sizeReduced << endl;
      blockSize. x = :: Min( sizeReduced, desBlockSize );
      gridSize. x = ( sizeReduced + blockSize. x - 1 ) / blockSize. x;
      shmem = blockSize. x * sizeof( int );
      // Reduce the partial results of the previous pass, not the original input
      tnlCUDASimpleReductionKernel1< int, tnlSum ><<< gridSize, blockSize, shmem >>>( sizeReduced, output, output );
      sizeReduced = gridSize. x;
   }
   int* host_output = new int[ sizeReduced ];
   cudaMemcpy( host_output, output, sizeReduced * sizeof( int ), cudaMemcpyDeviceToHost );
   int result = host_output[ 0 ];
   for( int i = 1; i < sizeReduced; i ++ )
      result += host_output[ i ];
   delete[] host_output;
   return result;
}
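/*
 * Usage sketch (illustrative, not part of this commit): both `input' and
 * `output' must point to device memory, and `output' needs room for one
 * partial result per block of the first pass:
 *
 *    int *d_input, *d_output;
 *    cudaMalloc( ( void** ) &d_input, size * sizeof( int ) );
 *    cudaMalloc( ( void** ) &d_output, ( ( size + 127 ) / 128 ) * sizeof( int ) );
 *    cudaMemcpy( d_input, host_data, size * sizeof( int ), cudaMemcpyHostToDevice );
 *    int min = tnlCUDASimpleReduction1Min( size, 128, ( size + 127 ) / 128,
 *                                          d_input, d_output );
 *
 * Note that the block_size and grid_size arguments are currently unused;
 * the drivers above recompute both from desBlockSize.
 */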
/*
float tnlCUDASimpleReduction1Min( const int size,
const int block_size,
const int grid_size,
const float* input )
{
return tnlCUDASimpleReduction1< float, tnlMin >( size, block_size, grid_size, input );
}
float tnlCUDASimpleReduction1Max( const int size,
const int block_size,
const int grid_size,
const float* input )
{
return tnlCUDASimpleReduction1< float, tnlMax >( size, block_size, grid_size, input );
}
float tnlCUDASimpleReduction1Sum( const int size,
const int block_size,
const int grid_size,
const float* input )
{
return tnlCUDASimpleReduction1< float, tnlSum >( size, block_size, grid_size, input );
}
double tnlCUDASimpleReduction1Min( const int size,
const int block_size,
const int grid_size,
const double* input )
{
return tnlCUDASimpleReduction1< double, tnlMin >( size, block_size, grid_size, input );
}
double tnlCUDASimpleReduction1Max( const int size,
const int block_size,
const int grid_size,
const double* input )
{
return tnlCUDASimpleReduction1< double, tnlMax >( size, block_size, grid_size, input );
}
double tnlCUDASimpleReduction1Sum( const int size,
const int block_size,
const int grid_size,
const double* input )
{
return tnlCUDASimpleReduction1< double, tnlSum >( size, block_size, grid_size, input );
}*/
\ No newline at end of file
@@ -235,6 +235,93 @@ T tnlCUDAReduction( const int size,
return result;
}
template < class T, tnlOperation operation >
__global__ void tnlCUDASimpleReductionKernel1( const int size,
const T* d_input,
T* d_output )
{
   // The shared buffer must have type T, not int; the reinterpret_cast idiom
   // works around CUDA's restriction on templated extern __shared__ arrays
   extern __shared__ char sharedMem[];
   T* sdata = reinterpret_cast< T* >( sharedMem );
   // Read data into the shared memory
   int tid = threadIdx. x;
   int gid = blockIdx. x * blockDim. x + threadIdx. x;
   // Last thread ID which manipulates meaningful data
   int last_tid = size - blockIdx. x * blockDim. x;
   if( gid < size )
      sdata[ tid ] = d_input[ gid ];
__syncthreads();
// Parallel reduction
for( int s = 1; s < blockDim. x; s *= 2 )
{
if( ( tid % ( 2 * s ) ) == 0 && tid + s < last_tid )
{
if( operation == tnlMin )
sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + s ] );
if( operation == tnlMax )
sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + s ] );
if( operation == tnlSum )
sdata[ tid ] += sdata[ tid + s ];
}
__syncthreads();
}
// Store the result back in global memory
if( tid == 0 )
d_output[ blockIdx. x ] = sdata[ 0 ];
}
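// Note on the scheme above (a sketch, not part of this commit): the test
// tid % ( 2 * s ) makes neighbouring threads take different branches, which
// serializes each warp. The usual refinement keeps the same tree shape but
// lets a contiguous range of threads do the work in each round:
//
//    for( int s = blockDim. x / 2; s > 0; s /= 2 )
//    {
//       if( tid < s && tid + s < last_tid )
//          sdata[ tid ] += sdata[ tid + s ];   // tnlSum case shown
//       __syncthreads();
//    }
//
// assuming a power-of-two block size; the per-block result stays in sdata[ 0 ].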
template< class T, tnlOperation operation >
T tnlCUDASimpleReduction1( const int size,
                           const int block_size,
                           const int grid_size,
                           const T* d_input )
{
   dim3 blockSize( block_size );
   dim3 gridSize( grid_size );
   int shmem = blockSize. x * sizeof( T );
   tnlAssert( shmem < 16384, cerr << shmem << " bytes are required." );
   // The kernel takes no block-size template parameter (it reads blockDim at
   // run time), so no dispatch switch is needed. It writes one partial result
   // per block, which must go to device memory; the host finishes the job.
   T* d_output = 0;
   cudaMalloc( ( void** ) &d_output, gridSize. x * sizeof( T ) );
   tnlCUDASimpleReductionKernel1< T, operation ><<< gridSize, blockSize, shmem >>>( size, d_input, d_output );
   const int blocks = gridSize. x;
   T* host_output = new T[ blocks ];
   cudaMemcpy( host_output, d_output, blocks * sizeof( T ), cudaMemcpyDeviceToHost );
   T result = host_output[ 0 ];
   for( int i = 1; i < blocks; i ++ )
   {
      if( operation == tnlMin )
         result = :: Min( result, host_output[ i ] );
      if( operation == tnlMax )
         result = :: Max( result, host_output[ i ] );
      if( operation == tnlSum )
         result += host_output[ i ];
   }
   delete[] host_output;
   cudaFree( d_output );
   return result;
}
#endif /* HAVE_CUDA */
#endif /* TNLCUDAKERNELS_H_ */
@@ -67,6 +67,49 @@ double tnlCUDAReductionSum( const int size,
const int grid_size,
const double* input );
/*
* Simple reduction 1
*/
int tnlCUDASimpleReduction1Min( const int size,
const int block_size,
const int grid_size,
const int* input,
int* output );
int tnlCUDASimpleReduction1Max( const int size,
const int block_size,
const int grid_size,
const int* input,
int* output );
int tnlCUDASimpleReduction1Sum( const int size,
const int block_size,
const int grid_size,
const int* input,
int* output );
float tnlCUDASimpleReduction1Min( const int size,
const int block_size,
const int grid_size,
const float* input );
float tnlCUDASimpleReduction1Max( const int size,
const int block_size,
const int grid_size,
const float* input );
float tnlCUDASimpleReduction1Sum( const int size,
const int block_size,
const int grid_size,
const float* input );
double tnlCUDASimpleReduction1Min( const int size,
const int block_size,
const int grid_size,
const double* input );
double tnlCUDASimpleReduction1Max( const int size,
const int block_size,
const int grid_size,
const double* input );
double tnlCUDASimpleReduction1Sum( const int size,
const int block_size,
const int grid_size,
const double* input );
#endif
@@ -82,50 +125,94 @@ template< class T > class tnlCUDAKernelsTester : public CppUnit :: TestCase
{
CppUnit :: TestSuite* suiteOfTests = new CppUnit :: TestSuite( "tnlCUDAKernelsTester" );
CppUnit :: TestResult result;
suiteOfTests -> addTest( new CppUnit :: TestCaller< tnlCUDAKernelsTester< T > >(
"testSimpleReduction1",
& tnlCUDAKernelsTester< T > :: testSimpleReduction1 )
);
suiteOfTests -> addTest( new CppUnit :: TestCaller< tnlCUDAKernelsTester< T > >(
"testReduction",
& tnlCUDAKernelsTester< T > :: testFastReduction )
);
return suiteOfTests;
};
bool testSetup( tnlLongVector< T >& host_input,
tnlLongVector< T >& host_output,
tnlLongVectorCUDA< T >& device_input,
tnlLongVectorCUDA< T >& device_output,
int size )
{
if( ! host_input. SetNewSize( size ) )
return false;
if( ! host_output. SetNewSize( size ) )
return false;
if( ! device_input. SetNewSize( size ) )
return false;
if( ! device_output. SetNewSize( size ) )
return false;
for( int i = 0; i < size; i ++ )
{
host_input[ i ] = i + 1;
host_output[ i ] = 0;
}
device_input. copyFrom( host_input );
return true;
}
void testReduction( int algorithm_efficiency = 0 )
{
/*
 * Test by Jan Vacata.
 */
int size = 1<<16;
int desBlockSize = 128; //Desired block size
int desGridSize = 2048; //Impose limitation on grid size so that threads could perform sequential work
tnlLongVector< T > host_input, host_output;
tnlLongVectorCUDA< T > device_input, device_output;
CPPUNIT_ASSERT( testSetup( host_input,
host_output,
device_input,
device_output,
size ) );
//Calculate necessary block/grid dimensions
int block_size = :: Min( size/2, desBlockSize );
//Grid size is limited in this case
int grid_size = :: Min( desGridSize, size / block_size / 2 );
T min, max, sum;
switch( algorithm_efficiency )
{
case 1:
min = tnlCUDASimpleReduction1Min( size, block_size, grid_size, device_input. Data(), device_output. Data() );
max = tnlCUDASimpleReduction1Max( size, block_size, grid_size, device_input. Data(), device_output. Data() );
sum = tnlCUDASimpleReduction1Sum( size, block_size, grid_size, device_input. Data(), device_output. Data() );
break;
default:
min = tnlCUDAReductionMin( size, block_size, grid_size, device_input. Data() );
max = tnlCUDAReductionMax( size, block_size, grid_size, device_input. Data() );
sum = tnlCUDAReductionSum( size, block_size, grid_size, device_input. Data() );
}
cout << "Min: " << min
<< "Max: " << max
cout << "Min: " << min << endl
<< "Max: " << max << endl
<< "Sum: " << sum << endl;
};
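// A check one might add at the end of testReduction (a sketch, not part of
// this commit): with host_input[ i ] = i + 1 the expected values are known
// in closed form, so the test could assert instead of only printing:
//
//    CPPUNIT_ASSERT( min == 1 );
//    CPPUNIT_ASSERT( max == size );
//    CPPUNIT_ASSERT( sum == ( T ) size * ( size + 1 ) / 2 );
//
// Caveats: for T = int and size = 1 << 16 the exact sum overflows 32-bit
// range, and for T = float it is subject to rounding, so the sum check is
// only meaningful for sizes and types where the result is representable.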
void testSimpleReduction1()
{
testReduction( 1 );
};
void testFastReduction()
{
testReduction( 0 );
}
};
@@ -44,8 +44,8 @@ int main( int argc, char* argv[] )
runner.addTest( tnlGridCUDA2DTester< double > :: suite() );
runner.addTest( tnlCUDAKernelsTester< int > :: suite() );
//runner.addTest( tnlCUDAKernelsTester< float > :: suite() );
//runner.addTest( tnlCUDAKernelsTester< double > :: suite() );
runner.run();
return 0;