Implementing CUDA parallel reduction.

9a01097d · Tomáš Oberhuber · a37a9195 · 9a01097d · 9a01097d · 9a01097d
Commit 9a01097d authored 15 years ago by Tomáš Oberhuber
--- a/TODO.txt
+++ b/TODO.txt
 TODO: implementovat tridu tnlFileName pro generovani jmen souboru

+TODO: metodu pro tnlString pro nahrazeni napr. podretezce XXXXX indexem 00001 tj. uXXXXX.bin -> u00001.bin
+      to by melo byt robustnejsi, nez doposavadni pristup 
+
 TODO: implementovat tridu tnlParabolicSolver pro odvozovani resicu k casove promennym uloham

 TODO: Nahradit mGrid2D, mGrid3D za mGrid obecne dimenze
@@ -8,6 +11,4 @@ TODO: zavets iteratory pres uzle site misto for cyklu

 TODO: implementovat Mersona v CUDA

-TODO: metoda Test do tnlObject
-
-TODO: trida tnlTester pro rizeni testu   
\ No newline at end of file
+TODO: objekt pro osetreni chyb - zavedeni funkce tnlGetError   
\ No newline at end of file
--- a/configure.ac
+++ b/configure.ac
@@ -37,6 +37,12 @@ AC_ARG_WITH(cuda_libdir,
            AS_HELP_STRING([--with-cuda-libdir],
                           [says where the CUDA libraries can be found, default is /usr/local/cuda/lib]),
            CUDA_LIBS=$withval)
+AC_ARG_WITH(cuda_arch,
+            AS_HELP_STRING([--with-cuda-arch],
+                           [specifies the CUDA architecture, can be 1.0, 1.1, 1.2 or 1.3 - default is 1.3]),
+            CUDA_ARCH=$withval,
+            CUDA_ARCH="1.3")
+
 working_nvcc="no"  
 if test x$with_cuda = xyes;
 then
@@ -89,6 +95,21 @@ then
      CUDA_LDFLAGS="$CUDA_LDFLAGS -lcudart"
      CC="nvcc"
      CXX="nvcc"
+      dnl Map the requested compute capability onto the matching nvcc -arch flag.
+      case "$CUDA_ARCH"  in
+         1.0 )
+            CUDA_CXXFLAGS="$CUDA_CXXFLAGS -arch=sm_10"
+         ;;
+         1.1 )
+            CUDA_CXXFLAGS="$CUDA_CXXFLAGS -arch=sm_11"
+         ;;
+         1.2 )
+            CUDA_CXXFLAGS="$CUDA_CXXFLAGS -arch=sm_12"
+         ;;
+         1.3 )
+            CUDA_CXXFLAGS="$CUDA_CXXFLAGS -arch=sm_13"
+         ;;
+         * )
+            dnl Any other value used to fall through silently, leaving nvcc without an -arch flag.
+            dnl Keep building with the nvcc default, but tell the user the option was not honoured.
+            AC_MSG_WARN([unsupported CUDA architecture "$CUDA_ARCH" (expected 1.0, 1.1, 1.2 or 1.3), no -arch flag will be passed to nvcc])
+         ;;
+      esac  
+      DBGCXXFLAGS="$DBGCXXFLAGS -deviceemu"    
   else
      CUDA_LDFLAGS=""
      CUDA_CXXFLAGS=""
@@ -121,7 +142,7 @@ dnl ----------- check for debug--------------
 dnl -----------------------------------------
 AC_ARG_ENABLE(debug,[  --enable-debug=[no/yes]	turn on debugging [default=no]] )
 if test x"$enable_debug" = xyes; then
-   DBGCXXFLAGS="-O0 -DDEBUG"
+   DBGCXXFLAGS="$DBGCXXFLAGS -O0 -DDEBUG"
   if test x$CXX != xnvcc;
   then
      DBGCXXFLAGS="$DBGCXXFLAGS -g3 -Wall -W -ansi -Wno-unused"

--- a/src/core/tnl-cuda-kernels.cu
+++ b/src/core/tnl-cuda-kernels.cu
--- a/src/core/tnl-cuda-kernels.h
+++ b/src/core/tnl-cuda-kernels.h
--- a/src/core/tnlCUDAKernelsTester.h
+++ b/src/core/tnlCUDAKernelsTester.h
@@ -66,49 +66,161 @@ double tnlCUDAReductionSum( const int size,
                            const int block_size,
                            const int grid_size,
                            const double* input );
+/*
+ * Simple reduction 5
+ */
+bool tnlCUDASimpleReduction5Min( const int size,
+                                 const int* input,
+                                 int& result );
+bool tnlCUDASimpleReduction5Max( const int size,
+                                 const int* input,
+                                 int& result );
+bool tnlCUDASimpleReduction5Sum( const int size,
+                                 const int* input,
+                                 int& result );
+bool tnlCUDASimpleReduction5Min( const int size,
+                                 const float* input,
+                                 float& result);
+bool tnlCUDASimpleReduction5Max( const int size,
+                                 const float* input,
+                                 float& result);
+bool tnlCUDASimpleReduction5Sum( const int size,
+                                 const float* input,
+                                 float& result);
+bool tnlCUDASimpleReduction5Min( const int size,
+                                 const double* input,
+                                 double& result);
+bool tnlCUDASimpleReduction5Max( const int size,
+                                 const double* input,
+                                 double& result );
+bool tnlCUDASimpleReduction5Sum( const int size,
+                                 const double* input,
+                                 double& result );
+
+
+/*
+ * Simple reduction 4
+ */
+bool tnlCUDASimpleReduction4Min( const int size,
+                          const int* input,
+                          int& result );
+bool tnlCUDASimpleReduction4Max( const int size,
+                          const int* input,
+                          int& result );
+bool tnlCUDASimpleReduction4Sum( const int size,
+                          const int* input,
+                          int& result );
+bool tnlCUDASimpleReduction4Min( const int size,
+                            const float* input,
+                            float& result);
+bool tnlCUDASimpleReduction4Max( const int size,
+                            const float* input,
+                            float& result);
+bool tnlCUDASimpleReduction4Sum( const int size,
+                            const float* input,
+                            float& result);
+bool tnlCUDASimpleReduction4Min( const int size,
+                             const double* input,
+                             double& result);
+bool tnlCUDASimpleReduction4Max( const int size,
+                             const double* input,
+                             double& result );
+bool tnlCUDASimpleReduction4Sum( const int size,
+                             const double* input,
+                             double& result );
+
+/*
+ * Simple reduction 3
+ */
+bool tnlCUDASimpleReduction3Min( const int size,
+                          const int* input,
+                          int& result );
+bool tnlCUDASimpleReduction3Max( const int size,
+                          const int* input,
+                          int& result );
+bool tnlCUDASimpleReduction3Sum( const int size,
+                          const int* input,
+                          int& result );
+bool tnlCUDASimpleReduction3Min( const int size,
+                            const float* input,
+                            float& result);
+bool tnlCUDASimpleReduction3Max( const int size,
+                            const float* input,
+                            float& result);
+bool tnlCUDASimpleReduction3Sum( const int size,
+                            const float* input,
+                            float& result);
+bool tnlCUDASimpleReduction3Min( const int size,
+                             const double* input,
+                             double& result);
+bool tnlCUDASimpleReduction3Max( const int size,
+                             const double* input,
+                             double& result );
+bool tnlCUDASimpleReduction3Sum( const int size,
+                             const double* input,
+                             double& result );
+
+/*
+ * Simple reduction 2
+ */
+bool tnlCUDASimpleReduction2Min( const int size,
+                          const int* input,
+                          int& result );
+bool tnlCUDASimpleReduction2Max( const int size,
+                          const int* input,
+                          int& result );
+bool tnlCUDASimpleReduction2Sum( const int size,
+                          const int* input,
+                          int& result );
+bool tnlCUDASimpleReduction2Min( const int size,
+                            const float* input,
+                            float& result);
+bool tnlCUDASimpleReduction2Max( const int size,
+                            const float* input,
+                            float& result);
+bool tnlCUDASimpleReduction2Sum( const int size,
+                            const float* input,
+                            float& result);
+bool tnlCUDASimpleReduction2Min( const int size,
+                             const double* input,
+                             double& result);
+bool tnlCUDASimpleReduction2Max( const int size,
+                             const double* input,
+                             double& result );
+bool tnlCUDASimpleReduction2Sum( const int size,
+                             const double* input,
+                             double& result );

 /*
 * Simple reduction 1
 */
-int tnlCUDASimpleReduction1Min( const int size,
-                          const int block_size,
-                          const int grid_size,
+bool tnlCUDASimpleReduction1Min( const int size,
                          const int* input,
-                          int* output );
-int tnlCUDASimpleReduction1Max( const int size,
-                          const int block_size,
-                          const int grid_size,
+                          int& result );
+bool tnlCUDASimpleReduction1Max( const int size,
                          const int* input,
-                          int* output );
-int tnlCUDASimpleReduction1Sum( const int size,
-                          const int block_size,
-                          const int grid_size,
+                          int& result );
+bool tnlCUDASimpleReduction1Sum( const int size,
                          const int* input,
-                          int* output );
-float tnlCUDASimpleReduction1Min( const int size,
-                            const int block_size,
-                            const int grid_size,
-                            const float* input );
-float tnlCUDASimpleReduction1Max( const int size,
-                            const int block_size,
-                            const int grid_size,
-                            const float* input );
-float tnlCUDASimpleReduction1Sum( const int size,
-                            const int block_size,
-                            const int grid_size,
-                            const float* input );
-double tnlCUDASimpleReduction1Min( const int size,
-                             const int block_size,
-                             const int grid_size,
-                             const double* input );
-double tnlCUDASimpleReduction1Max( const int size,
-                             const int block_size,
-                             const int grid_size,
-                             const double* input );
-double tnlCUDASimpleReduction1Sum( const int size,
-                             const int block_size,
-                             const int grid_size,
-                             const double* input );
+                          int& result );
+bool tnlCUDASimpleReduction1Min( const int size,
+                            const float* input,
+                            float& result);
+bool tnlCUDASimpleReduction1Max( const int size,
+                            const float* input,
+                            float& result);
+bool tnlCUDASimpleReduction1Sum( const int size,
+                            const float* input,
+                            float& result);
+bool tnlCUDASimpleReduction1Min( const int size,
+                             const double* input,
+                             double& result);
+bool tnlCUDASimpleReduction1Max( const int size,
+                             const double* input,
+                             double& result );
+bool tnlCUDASimpleReduction1Sum( const int size,
+                             const double* input,
+                             double& result );

 #endif

@@ -125,57 +237,71 @@ template< class T > class tnlCUDAKernelsTester : public CppUnit :: TestCase
   {
      CppUnit :: TestSuite* suiteOfTests = new CppUnit :: TestSuite( "tnlCUDAKernelsTester" );
      CppUnit :: TestResult result;
+
      suiteOfTests -> addTest( new CppUnit :: TestCaller< tnlCUDAKernelsTester< T > >(
    		                   "testSimpleReduction1",
                               & tnlCUDAKernelsTester< T > :: testSimpleReduction1 )
                             );
      suiteOfTests -> addTest( new CppUnit :: TestCaller< tnlCUDAKernelsTester< T > >(
+        		               "testSimpleReduction2",
+                               & tnlCUDAKernelsTester< T > :: testSimpleReduction2 )
+                              );
+      suiteOfTests -> addTest( new CppUnit :: TestCaller< tnlCUDAKernelsTester< T > >(
+                      		   "testSimpleReduction3",
+                               & tnlCUDAKernelsTester< T > :: testSimpleReduction3 )
+                              );
+      suiteOfTests -> addTest( new CppUnit :: TestCaller< tnlCUDAKernelsTester< T > >(
+                               "testSimpleReduction4",
+                               & tnlCUDAKernelsTester< T > :: testSimpleReduction4 )
+                              );
+      suiteOfTests -> addTest( new CppUnit :: TestCaller< tnlCUDAKernelsTester< T > >(
+                               "testSimpleReduction5",
+                               & tnlCUDAKernelsTester< T > :: testSimpleReduction5 )
+                              );
+      /*suiteOfTests -> addTest( new CppUnit :: TestCaller< tnlCUDAKernelsTester< T > >(
                               "testReduction",
                               & tnlCUDAKernelsTester< T > :: testFastReduction )
-                             );
+                             );*/

      return suiteOfTests;
   };

   bool testSetup( tnlLongVector< T >& host_input,   // Resizes both vectors to size, fills host_input with 1..size and copies it into device_input; returns false if either resize fails, true otherwise.
-		           tnlLongVector< T >& host_output,
 		           tnlLongVectorCUDA< T >& device_input,
-		           tnlLongVectorCUDA< T >& device_output,
 		           int size )
   {
 	   if( ! host_input. SetNewSize( size ) )
 		   return false;
-	   if( ! host_output. SetNewSize( size ) )
-		   return false;
 	   if( ! device_input. SetNewSize( size ) )
 		   return false;
-	   if( ! device_output. SetNewSize( size ) )
-		   return false;
 
 	   for( int i=0; i < size; i ++ )
-	   {
 		   host_input[ i ] = i + 1;   // test data is the sequence 1, 2, ..., size
-		   host_output[ i ] = 0;
-	   }
+
 	   device_input. copyFrom( host_input );   // NOTE(review): the outcome of copyFrom, if it reports one, is not checked here
 	   return true;
   }

   void testReduction( int algorithm_efficiency = 0 )
   {
-	   int size = 1<<16;
+	   int size = 1<<10;
 	   int desBlockSize = 128;    //Desired block size
 	   int desGridSize = 2048;    //Impose limitation on grid size so that threads could perform sequential work

-	   tnlLongVector< T > host_input, host_output;
-	   tnlLongVectorCUDA< T > device_input, device_output;
+	   tnlLongVector< T > host_input;
+	   tnlLongVectorCUDA< T > device_input;
 	   CPPUNIT_ASSERT( testSetup( host_input,
-		         	   host_output,
 		         	   device_input,
-		         	   device_output,
 		         	   size )  );
-
-
+	   T seq_min( host_input[ 0 ] ),
+	     seq_max( host_input[ 0 ] ),
+	     seq_sum( host_input[ 0 ] );
+	   for( int i = 1; i < size; i ++ )
+	   {
+		   seq_min = :: Min( seq_min, host_input[ i ] );
+		   seq_max = :: Max( seq_max, host_input[ i ] );
+		   seq_sum += host_input[ i ];
+	   }

 	   //Calculate necessary block/grid dimensions
 	   int block_size = :: Min( size/2, desBlockSize );
@@ -186,9 +312,29 @@ template< class T > class tnlCUDAKernelsTester : public CppUnit :: TestCase
 	   switch( algorithm_efficiency )
 	   {
 		   case 1:
-			   min = tnlCUDASimpleReduction1Min( size, block_size, grid_size, device_input. Data(), device_output. Data() );
-			   max = tnlCUDASimpleReduction1Max( size, block_size, grid_size, device_input. Data(), device_output. Data() );
-			   sum = tnlCUDASimpleReduction1Sum( size, block_size, grid_size, device_input. Data(), device_output. Data() );
+			   tnlCUDASimpleReduction1Min( size, device_input. Data(), min );
+			   tnlCUDASimpleReduction1Max( size, device_input. Data(), max );
+			   tnlCUDASimpleReduction1Sum( size, device_input. Data(), sum );
+			   break;
+		   case 2:
+			   tnlCUDASimpleReduction2Min( size, device_input. Data(), min );
+			   tnlCUDASimpleReduction2Max( size, device_input. Data(), max );
+			   tnlCUDASimpleReduction2Sum( size, device_input. Data(), sum );
+			   break;
+		   case 3:
+			   tnlCUDASimpleReduction3Min( size, device_input. Data(), min );
+			   tnlCUDASimpleReduction3Max( size, device_input. Data(), max );
+			   tnlCUDASimpleReduction3Sum( size, device_input. Data(), sum );
+			   break;
+		   case 4:
+			   tnlCUDASimpleReduction4Min( size, device_input. Data(), min );
+			   tnlCUDASimpleReduction4Max( size, device_input. Data(), max );
+			   tnlCUDASimpleReduction4Sum( size, device_input. Data(), sum );
+			   break;
+		   case 5:
+			   tnlCUDASimpleReduction5Min( size, device_input. Data(), min );
+			   tnlCUDASimpleReduction5Max( size, device_input. Data(), max );
+			   tnlCUDASimpleReduction5Sum( size, device_input. Data(), sum );
 			   break;
 		   default:
 			   min = tnlCUDAReductionMin( size, block_size, grid_size, device_input. Data() );
@@ -196,14 +342,44 @@ template< class T > class tnlCUDAKernelsTester : public CppUnit :: TestCase
 			   sum = tnlCUDAReductionSum( size, block_size, grid_size, device_input. Data() );
 	   }

-	   cout << "Min: " << min << endl
-			<< "Max: " << max << endl
-			<< "Sum: " << sum << endl;

+	   cout << "Min: " << min << " Seq. min: " << seq_min << endl
+			<< "Max: " << max << " Seq. max: " << seq_max << endl
+			<< "Sum: " << sum << " Seq. sum: " << seq_sum << endl;
+
+	   CPPUNIT_ASSERT( min == seq_min );
+	   CPPUNIT_ASSERT( max == seq_max );
+	   CPPUNIT_ASSERT( sum == seq_sum );
+
+   };
+
+   void testSimpleReduction5()   // Test case: runs testReduction with algorithm_efficiency = 5, i.e. the tnlCUDASimpleReduction5Min/Max/Sum path.
+   {
+   	   cout << "Test reduction 5" << endl;   // banner so the Min/Max/Sum lines printed by testReduction can be attributed to this variant
+   	   testReduction( 5 );
+   };
+
+   void testSimpleReduction4()   // Test case: runs testReduction with algorithm_efficiency = 4, i.e. the tnlCUDASimpleReduction4Min/Max/Sum path.
+   {
+	   cout << "Test reduction 4" << endl;   // banner so the Min/Max/Sum lines printed by testReduction can be attributed to this variant
+	   testReduction( 4 );
+   };
+
+   void testSimpleReduction3()   // Test case: runs testReduction with algorithm_efficiency = 3, i.e. the tnlCUDASimpleReduction3Min/Max/Sum path.
+   {
+   	   cout << "Test reduction 3" << endl;   // banner so the Min/Max/Sum lines printed by testReduction can be attributed to this variant
+     	   testReduction( 3 );
+   };
+
+   void testSimpleReduction2()   // Test case: runs testReduction with algorithm_efficiency = 2, i.e. the tnlCUDASimpleReduction2Min/Max/Sum path.
+   {
+	   cout << "Test reduction 2" << endl;   // banner so the Min/Max/Sum lines printed by testReduction can be attributed to this variant
+  	   testReduction( 2 );
   };

   void testSimpleReduction1()   // Test case: runs testReduction with algorithm_efficiency = 1, i.e. the tnlCUDASimpleReduction1Min/Max/Sum path.
   {
+	   cout << "Test reduction 1" << endl;   // banner so the Min/Max/Sum lines printed by testReduction can be attributed to this variant
 	   testReduction( 1 );
   };


--- a/src/tnl-unit-tests.cpp
+++ b/src/tnl-unit-tests.cpp
@@ -44,8 +44,8 @@ int main( int argc, char* argv[] )
   runner.addTest( tnlGridCUDA2DTester< double > :: suite() );
   
   runner.addTest( tnlCUDAKernelsTester< int > :: suite() );
-   //runner.addTest( tnlCUDAKernelsTester< float > :: suite() );
-   //runner.addTest( tnlCUDAKernelsTester< double > :: suite() );
+   runner.addTest( tnlCUDAKernelsTester< float > :: suite() );
+   runner.addTest( tnlCUDAKernelsTester< double > :: suite() );
   
   runner.run();
   return 0;