Skip to content
Snippets Groups Projects
Commit 9a01097d authored by Tomáš Oberhuber's avatar Tomáš Oberhuber
Browse files

Implementing CUDA parallel reduction.

parent a37a9195
No related branches found
No related tags found
No related merge requests found
TODO: implementovat tridu tnlFileName pro generovani jmen souboru
TODO: metodu pro tnlString pro nahrazeni napr. podretezce XXXXX indexem 00001 tj. uXXXXX.bin -> u00001.bin
to by melo byt robustnejsi, nez doposavadni pristup
TODO: implementovat tridu tnlParabolicSolver pro odvozovani resicu k casove promennym uloham
TODO: Nahradit mGrid2D, mGrid3D za mGrid obecne dimenze
......@@ -8,6 +11,4 @@ TODO: zavets iteratory pres uzle site misto for cyklu
TODO: implementovat Mersona v CUDA
TODO: metoda Test do tnlObject
TODO: trida tnlTester pro rizeni testu
\ No newline at end of file
TODO: objekt pro osetreni chyb - zavedeni funkce tnlGetError
\ No newline at end of file
......@@ -37,6 +37,12 @@ AC_ARG_WITH(cuda_libdir,
AS_HELP_STRING([--with-cuda-libdir],
[says where the CUDA libraries can be found, default is /usr/local/cuda/lib]),
CUDA_LIBS=$withval)
AC_ARG_WITH(cuda_arch,
AS_HELP_STRING([--with-cuda-arch],
[specifies the CUDA architecture, can be 1.0, 1.1, 1.2 or 1.3 - default is 1.3]),
CUDA_ARCH=$withval,
CUDA_ARCH="1.3")
working_nvcc="no"
if test x$with_cuda = xyes;
then
......@@ -89,6 +95,21 @@ then
CUDA_LDFLAGS="$CUDA_LDFLAGS -lcudart"
CC="nvcc"
CXX="nvcc"
dnl Translate the requested compute capability into the matching nvcc -arch flag.
dnl Only the values advertised by --with-cuda-arch are recognised. Any other
dnl value leaves CUDA_CXXFLAGS unchanged, so report it instead of ignoring it
dnl silently.
case "$CUDA_ARCH" in
    1.0 )
        CUDA_CXXFLAGS="$CUDA_CXXFLAGS -arch=sm_10"
        ;;
    1.1 )
        CUDA_CXXFLAGS="$CUDA_CXXFLAGS -arch=sm_11"
        ;;
    1.2 )
        CUDA_CXXFLAGS="$CUDA_CXXFLAGS -arch=sm_12"
        ;;
    1.3 )
        CUDA_CXXFLAGS="$CUDA_CXXFLAGS -arch=sm_13"
        ;;
    * )
        AC_MSG_WARN([unsupported CUDA architecture '$CUDA_ARCH' (expected 1.0, 1.1, 1.2 or 1.3); no -arch flag will be passed to nvcc])
        ;;
esac
DBGCXXFLAGS="$DBGCXXFLAGS -deviceemu"
else
CUDA_LDFLAGS=""
CUDA_CXXFLAGS=""
......@@ -121,7 +142,7 @@ dnl ----------- check for debug--------------
dnl -----------------------------------------
AC_ARG_ENABLE(debug,[ --enable-debug=[no/yes] turn on debugging [default=no]] )
if test x"$enable_debug" = xyes; then
DBGCXXFLAGS="-O0 -DDEBUG"
DBGCXXFLAGS="$DBGCXXFLAGS -O0 -DDEBUG"
if test x$CXX != xnvcc;
then
DBGCXXFLAGS="$DBGCXXFLAGS -g3 -Wall -W -ansi -Wno-unused"
......
This diff is collapsed.
This diff is collapsed.
......@@ -66,49 +66,161 @@ double tnlCUDAReductionSum( const int size,
const int block_size,
const int grid_size,
const double* input );
/*
 * Simple reduction 5
 *
 * Min / Max / Sum entry points, overloaded for int, float and double.
 * Every overload receives the element count `size` and the `input` array,
 * hands its answer back through `result` and returns a bool.
 * NOTE(review): `input` is presumably a device pointer and the returned
 * bool presumably reports success -- confirm against the definitions.
 */

/* int */
bool tnlCUDASimpleReduction5Min( const int size, const int* input, int& result );
bool tnlCUDASimpleReduction5Max( const int size, const int* input, int& result );
bool tnlCUDASimpleReduction5Sum( const int size, const int* input, int& result );

/* float */
bool tnlCUDASimpleReduction5Min( const int size, const float* input, float& result );
bool tnlCUDASimpleReduction5Max( const int size, const float* input, float& result );
bool tnlCUDASimpleReduction5Sum( const int size, const float* input, float& result );

/* double */
bool tnlCUDASimpleReduction5Min( const int size, const double* input, double& result );
bool tnlCUDASimpleReduction5Max( const int size, const double* input, double& result );
bool tnlCUDASimpleReduction5Sum( const int size, const double* input, double& result );
/*
 * Simple reduction 4
 *
 * Min / Max / Sum entry points, overloaded for int, float and double.
 * Every overload receives the element count `size` and the `input` array,
 * hands its answer back through `result` and returns a bool.
 * NOTE(review): `input` is presumably a device pointer and the returned
 * bool presumably reports success -- confirm against the definitions.
 */

/* int */
bool tnlCUDASimpleReduction4Min( const int size, const int* input, int& result );
bool tnlCUDASimpleReduction4Max( const int size, const int* input, int& result );
bool tnlCUDASimpleReduction4Sum( const int size, const int* input, int& result );

/* float */
bool tnlCUDASimpleReduction4Min( const int size, const float* input, float& result );
bool tnlCUDASimpleReduction4Max( const int size, const float* input, float& result );
bool tnlCUDASimpleReduction4Sum( const int size, const float* input, float& result );

/* double */
bool tnlCUDASimpleReduction4Min( const int size, const double* input, double& result );
bool tnlCUDASimpleReduction4Max( const int size, const double* input, double& result );
bool tnlCUDASimpleReduction4Sum( const int size, const double* input, double& result );
/*
 * Simple reduction 3
 *
 * Min / Max / Sum entry points, overloaded for int, float and double.
 * Every overload receives the element count `size` and the `input` array,
 * hands its answer back through `result` and returns a bool.
 * NOTE(review): `input` is presumably a device pointer and the returned
 * bool presumably reports success -- confirm against the definitions.
 */

/* int */
bool tnlCUDASimpleReduction3Min( const int size, const int* input, int& result );
bool tnlCUDASimpleReduction3Max( const int size, const int* input, int& result );
bool tnlCUDASimpleReduction3Sum( const int size, const int* input, int& result );

/* float */
bool tnlCUDASimpleReduction3Min( const int size, const float* input, float& result );
bool tnlCUDASimpleReduction3Max( const int size, const float* input, float& result );
bool tnlCUDASimpleReduction3Sum( const int size, const float* input, float& result );

/* double */
bool tnlCUDASimpleReduction3Min( const int size, const double* input, double& result );
bool tnlCUDASimpleReduction3Max( const int size, const double* input, double& result );
bool tnlCUDASimpleReduction3Sum( const int size, const double* input, double& result );
/*
 * Simple reduction 2
 *
 * Min / Max / Sum entry points, overloaded for int, float and double.
 * Every overload receives the element count `size` and the `input` array,
 * hands its answer back through `result` and returns a bool.
 * NOTE(review): `input` is presumably a device pointer and the returned
 * bool presumably reports success -- confirm against the definitions.
 */

/* int */
bool tnlCUDASimpleReduction2Min( const int size, const int* input, int& result );
bool tnlCUDASimpleReduction2Max( const int size, const int* input, int& result );
bool tnlCUDASimpleReduction2Sum( const int size, const int* input, int& result );

/* float */
bool tnlCUDASimpleReduction2Min( const int size, const float* input, float& result );
bool tnlCUDASimpleReduction2Max( const int size, const float* input, float& result );
bool tnlCUDASimpleReduction2Sum( const int size, const float* input, float& result );

/* double */
bool tnlCUDASimpleReduction2Min( const int size, const double* input, double& result );
bool tnlCUDASimpleReduction2Max( const int size, const double* input, double& result );
bool tnlCUDASimpleReduction2Sum( const int size, const double* input, double& result );
/*
* Simple reduction 1
*/
int tnlCUDASimpleReduction1Min( const int size,
const int block_size,
const int grid_size,
bool tnlCUDASimpleReduction1Min( const int size,
const int* input,
int* output );
int tnlCUDASimpleReduction1Max( const int size,
const int block_size,
const int grid_size,
int& result );
bool tnlCUDASimpleReduction1Max( const int size,
const int* input,
int* output );
int tnlCUDASimpleReduction1Sum( const int size,
const int block_size,
const int grid_size,
int& result );
bool tnlCUDASimpleReduction1Sum( const int size,
const int* input,
int* output );
float tnlCUDASimpleReduction1Min( const int size,
const int block_size,
const int grid_size,
const float* input );
float tnlCUDASimpleReduction1Max( const int size,
const int block_size,
const int grid_size,
const float* input );
float tnlCUDASimpleReduction1Sum( const int size,
const int block_size,
const int grid_size,
const float* input );
double tnlCUDASimpleReduction1Min( const int size,
const int block_size,
const int grid_size,
const double* input );
double tnlCUDASimpleReduction1Max( const int size,
const int block_size,
const int grid_size,
const double* input );
double tnlCUDASimpleReduction1Sum( const int size,
const int block_size,
const int grid_size,
const double* input );
int& result );
bool tnlCUDASimpleReduction1Min( const int size,
const float* input,
float& result);
bool tnlCUDASimpleReduction1Max( const int size,
const float* input,
float& result);
bool tnlCUDASimpleReduction1Sum( const int size,
const float* input,
float& result);
bool tnlCUDASimpleReduction1Min( const int size,
const double* input,
double& result);
bool tnlCUDASimpleReduction1Max( const int size,
const double* input,
double& result );
bool tnlCUDASimpleReduction1Sum( const int size,
const double* input,
double& result );
#endif
......@@ -125,57 +237,71 @@ template< class T > class tnlCUDAKernelsTester : public CppUnit :: TestCase
{
CppUnit :: TestSuite* suiteOfTests = new CppUnit :: TestSuite( "tnlCUDAKernelsTester" );
CppUnit :: TestResult result;
suiteOfTests -> addTest( new CppUnit :: TestCaller< tnlCUDAKernelsTester< T > >(
"testSimpleReduction1",
& tnlCUDAKernelsTester< T > :: testSimpleReduction1 )
);
suiteOfTests -> addTest( new CppUnit :: TestCaller< tnlCUDAKernelsTester< T > >(
"testSimpleReduction2",
& tnlCUDAKernelsTester< T > :: testSimpleReduction2 )
);
suiteOfTests -> addTest( new CppUnit :: TestCaller< tnlCUDAKernelsTester< T > >(
"testSimpleReduction3",
& tnlCUDAKernelsTester< T > :: testSimpleReduction3 )
);
suiteOfTests -> addTest( new CppUnit :: TestCaller< tnlCUDAKernelsTester< T > >(
"testSimpleReduction4",
& tnlCUDAKernelsTester< T > :: testSimpleReduction4 )
);
suiteOfTests -> addTest( new CppUnit :: TestCaller< tnlCUDAKernelsTester< T > >(
"testSimpleReduction5",
& tnlCUDAKernelsTester< T > :: testSimpleReduction5 )
);
/*suiteOfTests -> addTest( new CppUnit :: TestCaller< tnlCUDAKernelsTester< T > >(
"testReduction",
& tnlCUDAKernelsTester< T > :: testFastReduction )
);
);*/
return suiteOfTests;
};
/*
 * Prepares the host and device vectors shared by the reduction tests.
 *
 * All four vectors are resized to `size` elements. host_input is then
 * filled with 1, 2, ..., size, host_output is zeroed, and host_input is
 * copied into device_input with copyFrom().
 *
 * Returns false as soon as one SetNewSize() call fails (vectors later in
 * the sequence are not touched in that case), true otherwise.
 */
bool testSetup( tnlLongVector< T >& host_input,
                tnlLongVector< T >& host_output,
                tnlLongVectorCUDA< T >& device_input,
                tnlLongVectorCUDA< T >& device_output,
                int size )
{
   // Short-circuit evaluation keeps the allocation order and stops at the
   // first vector that cannot be resized.
   const bool allocated = host_input. SetNewSize( size ) &&
                          host_output. SetNewSize( size ) &&
                          device_input. SetNewSize( size ) &&
                          device_output. SetNewSize( size );
   if( ! allocated )
      return false;

   int idx = 0;
   while( idx < size )
   {
      host_input[ idx ] = idx + 1;   // values 1 .. size
      host_output[ idx ] = 0;
      ++idx;
   }

   // Upload the reference data to the device vector.
   device_input. copyFrom( host_input );
   return true;
}
void testReduction( int algorithm_efficiency = 0 )
{
int size = 1<<16;
int size = 1<<10;
int desBlockSize = 128; //Desired block size
int desGridSize = 2048; //Impose limitation on grid size so that threads could perform sequential work
tnlLongVector< T > host_input, host_output;
tnlLongVectorCUDA< T > device_input, device_output;
tnlLongVector< T > host_input;
tnlLongVectorCUDA< T > device_input;
CPPUNIT_ASSERT( testSetup( host_input,
host_output,
device_input,
device_output,
size ) );
T seq_min( host_input[ 0 ] ),
seq_max( host_input[ 0 ] ),
seq_sum( host_input[ 0 ] );
for( int i = 1; i < size; i ++ )
{
seq_min = :: Min( seq_min, host_input[ i ] );
seq_max = :: Max( seq_max, host_input[ i ] );
seq_sum += host_input[ i ];
}
//Calculate necessary block/grid dimensions
int block_size = :: Min( size/2, desBlockSize );
......@@ -186,9 +312,29 @@ template< class T > class tnlCUDAKernelsTester : public CppUnit :: TestCase
switch( algorithm_efficiency )
{
case 1:
min = tnlCUDASimpleReduction1Min( size, block_size, grid_size, device_input. Data(), device_output. Data() );
max = tnlCUDASimpleReduction1Max( size, block_size, grid_size, device_input. Data(), device_output. Data() );
sum = tnlCUDASimpleReduction1Sum( size, block_size, grid_size, device_input. Data(), device_output. Data() );
tnlCUDASimpleReduction1Min( size, device_input. Data(), min );
tnlCUDASimpleReduction1Max( size, device_input. Data(), max );
tnlCUDASimpleReduction1Sum( size, device_input. Data(), sum );
break;
case 2:
tnlCUDASimpleReduction2Min( size, device_input. Data(), min );
tnlCUDASimpleReduction2Max( size, device_input. Data(), max );
tnlCUDASimpleReduction2Sum( size, device_input. Data(), sum );
break;
case 3:
tnlCUDASimpleReduction3Min( size, device_input. Data(), min );
tnlCUDASimpleReduction3Max( size, device_input. Data(), max );
tnlCUDASimpleReduction3Sum( size, device_input. Data(), sum );
break;
case 4:
tnlCUDASimpleReduction4Min( size, device_input. Data(), min );
tnlCUDASimpleReduction4Max( size, device_input. Data(), max );
tnlCUDASimpleReduction4Sum( size, device_input. Data(), sum );
break;
case 5:
tnlCUDASimpleReduction5Min( size, device_input. Data(), min );
tnlCUDASimpleReduction5Max( size, device_input. Data(), max );
tnlCUDASimpleReduction5Sum( size, device_input. Data(), sum );
break;
default:
min = tnlCUDAReductionMin( size, block_size, grid_size, device_input. Data() );
......@@ -196,14 +342,44 @@ template< class T > class tnlCUDAKernelsTester : public CppUnit :: TestCase
sum = tnlCUDAReductionSum( size, block_size, grid_size, device_input. Data() );
}
cout << "Min: " << min << endl
<< "Max: " << max << endl
<< "Sum: " << sum << endl;
cout << "Min: " << min << " Seq. min: " << seq_min << endl
<< "Max: " << max << " Seq. max: " << seq_max << endl
<< "Sum: " << sum << " Seq. sum: " << seq_sum << endl;
CPPUNIT_ASSERT( min == seq_min );
CPPUNIT_ASSERT( max == seq_max );
CPPUNIT_ASSERT( sum == seq_sum );
};
/*
 * Test case for reduction variant 5: prints a banner to standard output
 * and runs the shared reduction check with algorithm selector 5.
 */
void testSimpleReduction5()
{
   const int algorithm = 5;
   cout << "Test reduction 5" << endl;
   testReduction( algorithm );
};
/*
 * Test case for reduction variant 4: prints a banner to standard output
 * and runs the shared reduction check with algorithm selector 4.
 */
void testSimpleReduction4()
{
   const int algorithm = 4;
   cout << "Test reduction 4" << endl;
   testReduction( algorithm );
};
/*
 * Test case for reduction variant 3: prints a banner to standard output
 * and runs the shared reduction check with algorithm selector 3.
 */
void testSimpleReduction3()
{
   const int algorithm = 3;
   cout << "Test reduction 3" << endl;
   testReduction( algorithm );
};
/*
 * Test case for reduction variant 2: prints a banner to standard output
 * and runs the shared reduction check with algorithm selector 2.
 */
void testSimpleReduction2()
{
   const int algorithm = 2;
   cout << "Test reduction 2" << endl;
   testReduction( algorithm );
};
/*
 * Test case for reduction variant 1: prints a banner to standard output
 * and runs the shared reduction check with algorithm selector 1.
 */
void testSimpleReduction1()
{
   const int algorithm = 1;
   cout << "Test reduction 1" << endl;
   testReduction( algorithm );
};
......
......@@ -44,8 +44,8 @@ int main( int argc, char* argv[] )
runner.addTest( tnlGridCUDA2DTester< double > :: suite() );
runner.addTest( tnlCUDAKernelsTester< int > :: suite() );
//runner.addTest( tnlCUDAKernelsTester< float > :: suite() );
//runner.addTest( tnlCUDAKernelsTester< double > :: suite() );
runner.addTest( tnlCUDAKernelsTester< float > :: suite() );
runner.addTest( tnlCUDAKernelsTester< double > :: suite() );
runner.run();
return 0;
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment