Commit f10011fa authored by Tomáš Oberhuber's avatar Tomáš Oberhuber
Browse files

Testing and debugging CUDA reduction.

Introducing benchmarks - transfer of data from host to CUDA device.
Fixing compilation without CUDA.
parent 025646c7
Loading
Loading
Loading
Loading
+5 −5
Original line number Diff line number Diff line
@@ -97,16 +97,16 @@ then
      CXX="nvcc"
      case "$CUDA_ARCH"  in
         1.0 )
            CUDA_CXXFLAGS="$CUDA_CXXFLAGS -arch=sm_10"
            CUDA_CXXFLAGS="$CUDA_CXXFLAGS -arch=sm_10 -DCUDA_ARCH=1.0"
         ;;
         1.1 )
            CUDA_CXXFLAGS="$CUDA_CXXFLAGS -arch=sm_11"
            CUDA_CXXFLAGS="$CUDA_CXXFLAGS -arch=sm_11 -DCUDA_ARCH=1.1"
         ;;
         1.2 )
            CUDA_CXXFLAGS="$CUDA_CXXFLAGS -arch=sm_12"
            CUDA_CXXFLAGS="$CUDA_CXXFLAGS -arch=sm_12 -DCUDA_ARCH=1.2"
         ;;
         1.3 )
            CUDA_CXXFLAGS="$CUDA_CXXFLAGS -arch=sm_13"
            CUDA_CXXFLAGS="$CUDA_CXXFLAGS -arch=sm_13 -DCUDA_ARCH=1.3"
         ;;
      esac  
      DBGCXXFLAGS="$DBGCXXFLAGS -deviceemu"    
@@ -309,7 +309,7 @@ else
fi
AC_LANG_RESTORE


AM_CONDITIONAL([BUILD_CUDA],[test x"$HAVE_CUDA" = xyes] )
AM_CONDITIONAL([BUILD_MPI],[test x"$MPISTATUS" = xyes] )
AM_CONDITIONAL([BUILD_DBG],[test x"$DBGSTATUS" = xyes] )
AM_CONDITIONAL([BUILD_MPI_DBG],[test x"$DBGSTATUS" = xyes -a x"$MPISTATUS" = xyes] )
+8 −2
Original line number Diff line number Diff line
@@ -48,12 +48,18 @@ libtnl_mpi_dbg_0_1_la_LIBADD = debug/libtnldebug-mpi-dbg-0.1.la \
endif                       

tnl_unit_tests_sources = tnl-unit-tests.cpp
tnl_benchmarks_sources = tnl-benchmarks.cpp

check_PROGRAMS = tnl-unit-tests \
                 tnl-benchmarks
                  
check_PROGRAMS = tnl-unit-tests
tnl_unit_tests_SOURCES = $(tnl_unit_tests_sources)
tnl_unit_tests_LDADD = libtnl-0.1.la \
                       core/libcore-tests.la

tnl_benchmarks_SOURCES = $(tnl_benchmarks_sources)
tnl_benchmarks_LDADD = libtnl-0.1.la

if BUILD_DBG
bin_PROGRAMS = tnl-unit-tests-dbg
tnl_unit_tests_dbg_SOURCES = $(tnl_unit_tests_sources)
@@ -63,4 +69,4 @@ endif



TESTS = tnl-unit-tests
 No newline at end of file
TESTS = tnl-benchmarks
 No newline at end of file
+7 −3
Original line number Diff line number Diff line
@@ -56,12 +56,15 @@ sources = tnlConfigDescription.cpp \
	  tnlTimerCPU.cpp \
	  tnlTimerRT.cpp \
	  tnlTester.cpp \
	  tnl-cuda-kernels.cu \
	  compress-file.cpp \
	  mfilename.cpp \
	  mpi-supp.cpp \
	  parse.cc
	  
if BUILD_CUDA
sources += tnl-cuda-kernels.cu 
endif

libmcoreincludedir = $(TNL_INCLUDE_DIR)/core
libmcoreinclude_HEADERS = $(headers)

@@ -95,10 +98,11 @@ libcore_tests_sources = tnlStringTester.cpp \
                        tnlStringTester.h \
                        tnlObjectTester.cpp \
                        tnlObjectTester.h \
                        tnlLongVectorCUDATester.cu \
                        tnlLongVectorCUDATester.cu.h \
                        tnlLongVectorCUDATester.h
                        
if BUILD_CUDA
libcore_tests_sources += tnlLongVectorCUDATester.cu
endif                        

check_LTLIBRARIES = libcore-tests.la
libcore_tests_la_SOURCES = $(libcore_tests_sources)
+379 −236
Original line number Diff line number Diff line
@@ -298,7 +298,7 @@ bool tnlCUDAReduction( const int size,
      size_reduced = grid_size. x;
      reduction_input = device_output;

      // debuging part
      // Debugging part
      /*T* host_array = new T[ size ];
      cudaMemcpy( host_array, dbg_array1,  size * sizeof( T ), cudaMemcpyDeviceToHost );
      for( int i = 0; i< size; i ++ )
@@ -353,7 +353,7 @@ __global__ void tnlCUDASimpleReductionKernel5( const int size,
   int tid = threadIdx. x;
   int gid = 2 * blockIdx. x * blockDim. x + threadIdx. x;
   // Last thread ID which manipulates meaningful data
	//int last_tid = size - 2 * blockIdx. x * blockDim. x;
   int last_tid = size - 2 * blockIdx. x * blockDim. x;
   if( gid + blockDim. x < size )
   {
      if( operation == tnlMin )
@@ -370,43 +370,105 @@ __global__ void tnlCUDASimpleReductionKernel5( const int size,
      __syncthreads();

   // Parallel reduction
	if( blockSize == 512 )
   int n = last_tid < blockDim. x ? last_tid : blockDim. x;
   int s = n / 2;
   if( gid < size )
         dbg_array1[ gid ] = -s;

   if( n == 512 )
   {
		if( tid < 256 )
      if( tid < s )
      {
         if( operation == tnlMin )
				sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 256 ] );
            sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + s ] );
         if( operation == tnlMax )
				sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 256 ] );
            sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + s ] );
         if( operation == tnlSum )
				sdata[ tid ] += sdata[ tid + 256 ];
            sdata[ tid ] += sdata[ tid + s ];
      }
      if( 2 * s < n && tid == n - 1 )
      {
         if( operation == tnlMin )
            sdata[ 0 ] = tnlCudaMin( sdata[ 0 ], sdata[ tid ] );
         if( operation == tnlMax )
            sdata[ 0 ] = tnlCudaMax( sdata[ 0 ], sdata[ tid ] );
         if( operation == tnlSum )
            sdata[ 0 ] += sdata[ tid ];
      }
      n = s;
      s = n / 2;
      __syncthreads();
   }
	if( blockSize >= 256 )
   if( n >= 256 )
   {
		if( tid < 128 )
      if( tid < s )
      {
         if( operation == tnlMin )
				sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 128 ] );
            sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + s ] );
         if( operation == tnlMax )
				sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 128 ] );
            sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + s ] );
         if( operation == tnlSum )
				sdata[ tid ] += sdata[ tid + 128 ];
            sdata[ tid ] += sdata[ tid + s ];
      }
      if( 2 * s < n && tid == n - 1 )
      {
         if( operation == tnlMin )
            sdata[ 0 ] = tnlCudaMin( sdata[ 0 ], sdata[ tid ] );
         if( operation == tnlMax )
            sdata[ 0 ] = tnlCudaMax( sdata[ 0 ], sdata[ tid ] );
         if( operation == tnlSum )
            sdata[ 0 ] += sdata[ tid ];
      }
      n = s;
      s = n / 2;
      __syncthreads();
   }
		if( blockSize >= 128 )
   if( n >= 128 )
   {
			if (tid< 64)
      if( tid < s )
      {
         if( operation == tnlMin )
					sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 64 ] );
            sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + s ] );
         if( operation == tnlMax )
					sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 64 ] );
            sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + s ] );
         if( operation == tnlSum )
					sdata[ tid ] += sdata[ tid + 64 ];
            sdata[ tid ] += sdata[ tid + s ];
      }
      if( 2 * s < n && tid == n - 1 )
      {
         if( operation == tnlMin )
            sdata[ 0 ] = tnlCudaMin( sdata[ 0 ], sdata[ tid ] );
         if( operation == tnlMax )
            sdata[ 0 ] = tnlCudaMax( sdata[ 0 ], sdata[ tid ] );
         if( operation == tnlSum )
            sdata[ 0 ] += sdata[ tid ];
      }
      n = s;
      s = n / 2;
      __syncthreads();
   }
   if( n >= 64 )
   {
      if( tid < s )
      {
         if( operation == tnlMin )
            sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + s ] );
         if( operation == tnlMax )
            sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + s ] );
         if( operation == tnlSum )
            sdata[ tid ] += sdata[ tid + s ];
      }
      if( 2 * s < n && tid == n - 1 )
      {
         if( operation == tnlMin )
            sdata[ 0 ] = tnlCudaMin( sdata[ 0 ], sdata[ tid ] );
         if( operation == tnlMax )
            sdata[ 0 ] = tnlCudaMax( sdata[ 0 ], sdata[ tid ] );
         if( operation == tnlSum )
            sdata[ 0 ] += sdata[ tid ];
      }
      n = s;
      s = n / 2;
      __syncthreads();
   }
   /*
@@ -414,52 +476,101 @@ __global__ void tnlCUDASimpleReductionKernel5( const int size,
    */
   if( tid < 32 )
   {
			if( blockSize >= 64 )
      if( n >= 32 )
      {
         if( tid < s )
         {
            if( operation == tnlMin )
					sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 32 ] );
               sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + s ] );
            if( operation == tnlMax )
					sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 32 ] );
               sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + s ] );
            if( operation == tnlSum )
					sdata[ tid ] += sdata[ tid + 32 ];
               sdata[ tid ] += sdata[ tid + s ];
         }
			if( blockSize >= 32 )
         if( 2 * s < n && tid == n - 1 )
         {
            if( operation == tnlMin )
					sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 16 ] );
               sdata[ 0 ] = tnlCudaMin( sdata[ 0 ], sdata[ tid ] );
            if( operation == tnlMax )
					sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 16 ] );
               sdata[ 0 ] = tnlCudaMax( sdata[ 0 ], sdata[ tid ] );
            if( operation == tnlSum )
					sdata[ tid ] += sdata[ tid + 16 ];
               sdata[ 0 ] += sdata[ tid ];
         }
			if( blockSize >= 16 )
         n = s;
         s = n / 2;
      }
      if( n >= 16 )
      {
         if( tid < s )
         {
            if( operation == tnlMin )
					sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 8 ] );
               sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + s ] );
            if( operation == tnlMax )
					sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 8 ] );
               sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + s ] );
            if( operation == tnlSum )
               sdata[ tid ] += sdata[ tid + 8 ];
         }
		    if( blockSize >= 8 )
         if( 2 * s < n && tid == n - 1 )
         {
            if( operation == tnlMin )
		    		sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 4 ] );
               sdata[ 0 ] = tnlCudaMin( sdata[ 0 ], sdata[ tid ] );
            if( operation == tnlMax )
		    		sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 4 ] );
               sdata[ 0 ] = tnlCudaMax( sdata[ 0 ], sdata[ tid ] );
            if( operation == tnlSum )
		    		sdata[ tid ] += sdata[ tid + 4 ];
               sdata[ 0 ] += sdata[ tid ];
         }
			if( blockSize >= 4 )
         n = s;
         s = n / 2;
      }
      if( n >= 8 )
      {
         if( tid < s )
         {
            if( operation == tnlMin )
					sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 2 ] );
               sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + s ] );
            if( operation == tnlMax )
					sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 2 ] );
               sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + s ] );
            if( operation == tnlSum )
					sdata[ tid ] += sdata[ tid + 2 ];
               sdata[ tid ] += sdata[ tid + s ];
         }
			if( blockSize >= 2 )
         if( 2 * s < n && tid == n - 1 )
         {
            if( operation == tnlMin )
               sdata[ 0 ] = tnlCudaMin( sdata[ 0 ], sdata[ tid ] );
            if( operation == tnlMax )
               sdata[ 0 ] = tnlCudaMax( sdata[ 0 ], sdata[ tid ] );
            if( operation == tnlSum )
               sdata[ 0 ] += sdata[ tid ];
         }
         n = s;
         s = n / 2;
      }
      if( n >= 4 )
      {
         if( tid < s )
         {
            if( operation == tnlMin )
               sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + s ] );
            if( operation == tnlMax )
               sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + s ] );
            if( operation == tnlSum )
               sdata[ tid ] += sdata[ tid + s ];
         }
         if( 2 * s < n && tid == n - 1 )
         {
            if( operation == tnlMin )
               sdata[ 0 ] = tnlCudaMin( sdata[ 0 ], sdata[ tid ] );
            if( operation == tnlMax )
               sdata[ 0 ] = tnlCudaMax( sdata[ 0 ], sdata[ tid ] );
            if( operation == tnlSum )
               sdata[ 0 ] += sdata[ tid ];
         }
         n = s;
         s = n / 2;
      }
      if( n >= 2 )
      {
         if( tid < s )
         {
            if( operation == tnlMin )
               sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 1 ] );
@@ -468,6 +579,16 @@ __global__ void tnlCUDASimpleReductionKernel5( const int size,
            if( operation == tnlSum )
               sdata[ tid ] += sdata[ tid + 1 ];
         }
         if( 2 * s < n && tid == n - 1 )
         {
            if( operation == tnlMin )
               sdata[ 0 ] = tnlCudaMin( sdata[ 0 ], sdata[ tid ] );
            if( operation == tnlMax )
               sdata[ 0 ] = tnlCudaMax( sdata[ 0 ], sdata[ tid ] );
            if( operation == tnlSum )
               sdata[ 0 ] += sdata[ tid ];
         }
      }
   }

   // Store the result back in global memory
@@ -485,7 +606,7 @@ bool tnlCUDASimpleReduction5( const int size,
   const int cpuThreshold = 1;
   const int desBlockSize = 16;    //Desired block size

   //T* dbg_array1;
   T* dbg_array1;

   bool device_output_allocated( false );
   if( ! device_output )
@@ -495,12 +616,12 @@ bool tnlCUDASimpleReduction5( const int size,
      if( cudaGetLastError() != cudaSuccess )
      {
         cerr << "Unable to allocate device memory with size " << bytes_alloc << "." << endl;
         abort();
         return false;
      }
      device_output_allocated = true;

       //cudaMalloc( ( void** ) &dbg_array1, desBlockSize * sizeof( T ) ); //!!!!!!!!!!!!!!!!!!!!!!!!
       //cudaMalloc( ( void** ) &dbg_array2, desBlockSize * sizeof( T ) ); //!!!!!!!!!!!!!!!!!!!!!!!!!
      cudaMalloc( ( void** ) &dbg_array1, size * sizeof( T ) ); //!!!!!!!!!!!!!!!!!!
   }
   dim3 block_size( 0 ), grid_size( 0 );
   int shmem;
@@ -509,7 +630,16 @@ bool tnlCUDASimpleReduction5( const int size,
   while( size_reduced > cpuThreshold )
   {
      block_size. x = :: Min( size_reduced, desBlockSize );
      grid_size. x = ( size_reduced / block_size. x + 1 ) / 2;
      grid_size. x = size_reduced / block_size. x / 2;
      if( 2 * grid_size. x * block_size. x < size_reduced )
         grid_size. x ++;
      int bits = 1;
      while( block_size. x > 1 )
      {
         block_size. x >>= 1;
         bits <<= 1;
      }
      block_size. x = bits;
      shmem = block_size. x * sizeof( T );
      /*cout << "Size: " << size_reduced
           << " Grid size: " << grid_size. x
@@ -520,56 +650,71 @@ bool tnlCUDASimpleReduction5( const int size,
      switch( block_size. x )
      {
         case 512:
			  tnlCUDASimpleReductionKernel5< T, operation, 512 ><<< grid_size. x, block_size, shmem >>>( size_reduced, reduction_input, device_output );
            //cout << "Calling kernel " << block_size. x << endl;
            tnlCUDASimpleReductionKernel5< T, operation, 512 ><<< grid_size. x, block_size, shmem >>>( size_reduced, reduction_input, device_output, dbg_array1 );
            break;
         case 256:
			  tnlCUDASimpleReductionKernel5< T, operation, 256 ><<< grid_size. x, block_size, shmem >>>( size_reduced, reduction_input, device_output );
            //cout << "Calling kernel " << block_size. x << endl;
            tnlCUDASimpleReductionKernel5< T, operation, 256 ><<< grid_size. x, block_size, shmem >>>( size_reduced, reduction_input, device_output, dbg_array1 );
            break;
         case 128:
			  tnlCUDASimpleReductionKernel5< T, operation, 128 ><<< grid_size. x, block_size, shmem >>>( size_reduced, reduction_input, device_output );
            //cout << "Calling kernel " << block_size. x << endl;
            tnlCUDASimpleReductionKernel5< T, operation, 128 ><<< grid_size. x, block_size, shmem >>>( size_reduced, reduction_input, device_output, dbg_array1 );
            break;
         case  64:
			  tnlCUDASimpleReductionKernel5< T, operation,  64 ><<< grid_size. x, block_size, shmem >>>( size_reduced, reduction_input, device_output );
            //cout << "Calling kernel " << block_size. x << endl;
            tnlCUDASimpleReductionKernel5< T, operation,  64 ><<< grid_size. x, block_size, shmem >>>( size_reduced, reduction_input, device_output, dbg_array1 );
            break;
         case  32:
			  tnlCUDASimpleReductionKernel5< T, operation,  32 ><<< grid_size. x, block_size, shmem >>>( size_reduced, reduction_input, device_output );
            //cout << "Calling kernel " << block_size. x << endl;
            tnlCUDASimpleReductionKernel5< T, operation,  32 ><<< grid_size. x, block_size, shmem >>>( size_reduced, reduction_input, device_output, dbg_array1 );
            break;
         case  16:
			  tnlCUDASimpleReductionKernel5< T, operation,  16 ><<< grid_size. x, block_size, shmem >>>( size_reduced, reduction_input, device_output );
            //cout << "Calling kernel " << block_size. x << endl;
            tnlCUDASimpleReductionKernel5< T, operation,  16 ><<< grid_size. x, block_size, shmem >>>( size_reduced, reduction_input, device_output, dbg_array1 );
            break;
         case   8:
			  tnlCUDASimpleReductionKernel5< T, operation,   8 ><<< grid_size. x, block_size, shmem >>>( size_reduced, reduction_input, device_output );
            //cout << "Calling kernel " << block_size. x << endl;
            tnlCUDASimpleReductionKernel5< T, operation,   8 ><<< grid_size. x, block_size, shmem >>>( size_reduced, reduction_input, device_output, dbg_array1 );
            break;
         case   4:
			  tnlCUDASimpleReductionKernel5< T, operation,   4 ><<< grid_size. x, block_size, shmem >>>( size_reduced, reduction_input, device_output );
            //cout << "Calling kernel " << block_size. x << endl;
            tnlCUDASimpleReductionKernel5< T, operation,   4 ><<< grid_size. x, block_size, shmem >>>( size_reduced, reduction_input, device_output, dbg_array1 );
            break;
         case   2:
			  tnlCUDASimpleReductionKernel5< T, operation,   2 ><<< grid_size. x, block_size, shmem >>>( size_reduced, reduction_input, device_output );
            //cout << "Calling kernel " << block_size. x << endl;
            tnlCUDASimpleReductionKernel5< T, operation,   2 ><<< grid_size. x, block_size, shmem >>>( size_reduced, reduction_input, device_output, dbg_array1 );
            break;
         case   1:
            tnlAssert( false, cerr << "blockSize should not be 1." << endl );
            abort();
            break;
         default:
            tnlAssert( false, cerr << "Block size is " << block_size. x << " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." );
            abort();
            break;
      }
      //cout << "+++++++++++++++++++++" << endl;
      size_reduced = grid_size. x;
      reduction_input = device_output;

      // debuging part
      /*T* host_array = new T[ desBlockSize ];
      cudaMemcpy( host_array, dbg_array1,  desBlockSize * sizeof( T ), cudaMemcpyDeviceToHost );
      for( int i = 0; i< :: Min( ( int ) block_size. x, desBlockSize ); i ++ )
    	  cout << host_array[ i ] << " - ";
      /*T* host_array = new T[ size ];
      cudaMemcpy( host_array, dbg_array1,  size * sizeof( T ), cudaMemcpyDeviceToHost );
      cout << "Dbg. array: ";
      for( int i = 0; i < size ; i ++ )
    	  cout << host_array[ i ] << "  ";
      cout << endl;
      cudaFree( dbg_array1 );

      T* output = new T[ size_reduced ];
      cudaMemcpy( output, device_output, size_reduced * sizeof( T ), cudaMemcpyDeviceToHost );
      cout << endl;
      cout << endl << "Reduced data: ";
      for( int i = 0; i < size_reduced; i ++ )
    	  cout << output[ i ] << "   ";
      cout << endl;
      delete[] output;*/

   }
   T* host_output = new T[ size_reduced ];
   if( size == 1 )
@@ -653,10 +798,8 @@ __global__ void tnlCUDASimpleReductionKernel4( const int size,
            sdata[ 0 ] = tnlCudaMax( sdata[ 0 ], sdata[ tid ] );
         if( operation == tnlSum )
            sdata[ 0 ] += sdata[ tid ];
			dbg_array1[ 0 ] = sdata[ tid ];
      }
      n = s;

      __syncthreads();
      //dbg_array1[ tid ] = -sdata[ tid ];

@@ -691,7 +834,7 @@ bool tnlCUDASimpleReduction4( const int size,
       }
       device_output_allocated = true;

       //cudaMalloc( ( void** ) &dbg_array1, desBlockSize * sizeof( T ) ); //!!!!!!!!!!!!!!!!!!!!!!!!
       cudaMalloc( ( void** ) &dbg_array1, desBlockSize * sizeof( T ) ); //!!!!!!!!!!!!!!!!!!!!!!!!
       //cudaMalloc( ( void** ) &dbg_array2, desBlockSize * sizeof( T ) ); //!!!!!!!!!!!!!!!!!!!!!!!!!
   }
   dim3 block_size( 0 ), grid_size( 0 );
+181 −168
Original line number Diff line number Diff line
@@ -174,9 +174,9 @@ template< class T > class tnlCUDAKernelsTester : public CppUnit :: TestCase
               GetParameterType( param ) == "double" )
      {
         if( min != seq_min )
			   cerr << "Diff. min = " << min << " seq. min = " << seq_min;
            cout << "Diff. min = " << min << " seq. min = " << seq_min;
         if( max != seq_max )
			   cerr << "Diff. max = " << max << " seq. max = " << seq_max;
            cout << "Diff. max = " << max << " seq. max = " << seq_max;
         CPPUNIT_ASSERT( min == seq_min );
         CPPUNIT_ASSERT( max == seq_max );
         if( sum == 0.0 )
@@ -187,18 +187,30 @@ template< class T > class tnlCUDAKernelsTester : public CppUnit :: TestCase
         {
            double diff = ( ( double ) sum - ( double ) seq_sum ) / ( double) sum;
            if( fabs( diff > 1.0e-5 ) )
				   cerr << "Diff is " << diff << " for " << GetParameterType( param ) << endl;
            {
               cout << "Diff is " << diff << " for " << GetParameterType( param ) << endl;
               abort();
            }
            CPPUNIT_ASSERT( fabs( diff ) < 1.0e-5 );
         }
      }
      else
      {
         if( min != seq_min )
			   cerr << "Diff. min = " << min << " seq. min = " << seq_min;
         {
            cout << "Diff. min = " << min << " seq. min = " << seq_min;
            abort();
         }
         if( max != seq_max )
			   cerr << "Diff. max = " << max << " seq. max = " << seq_max;
         {
            cout << "Diff. max = " << max << " seq. max = " << seq_max;
            abort();
         }
         if( sum != seq_sum )
			   cerr << "Diff. sum = " << sum << " seq. sum = " << seq_sum;
         {
            cout << "Diff. sum = " << sum << " seq. sum = " << seq_sum;
            abort();
         }
         CPPUNIT_ASSERT( min == seq_min );
         CPPUNIT_ASSERT( max == seq_max );
         CPPUNIT_ASSERT( sum == seq_sum );
@@ -210,7 +222,7 @@ template< class T > class tnlCUDAKernelsTester : public CppUnit :: TestCase
   {
      tnlLongVector< T > host_input;
      int size = 2;
	   for( int s = 1; s < 12; s ++ )
      /*for( int s = 1; s < 12; s ++ )
      {
         tnlLongVector< T > host_input( size );

@@ -245,12 +257,12 @@ template< class T > class tnlCUDAKernelsTester : public CppUnit :: TestCase
                  2048 );
         size *= 2;
         cout << endl;
	   }
      }*/
      for( size = 1; size < 5000; size ++ )
      {
         tnlLongVector< T > host_input( size );

		   cout << "Alg. " << algorithm_efficiency  << "Testing zeros with size "  << size << " ";
         //cout << "Alg. " << algorithm_efficiency  << " Testing zeros with size "  << size << " ";
         for( int i = 0; i < size; i ++ )
            host_input[ i ] = 0.0;
         mainReduction( host_input,
@@ -258,41 +270,42 @@ template< class T > class tnlCUDAKernelsTester : public CppUnit :: TestCase
                  256,
                  2048 );

		   cout << "Alg. " << algorithm_efficiency  << "Testing ones with size "  << size << " ";
         //cout << "Alg. " << algorithm_efficiency  << " Testing ones with size "  << size << " ";
         for( int i = 0; i < size; i ++ )
            host_input[ i ] = 1.0;
         mainReduction( host_input,
                  algorithm_efficiency,
                  256,
                  2048 );
		   cout << "Alg. " << algorithm_efficiency  << "Testing linear sequence with size "  << size << " ";
         //cout << "Alg. " << algorithm_efficiency  << " Testing linear sequence with size "  << size << " ";
         for( int i = 0; i < size; i ++ )
            host_input[ i ] = i;
         mainReduction( host_input,
                  algorithm_efficiency,
                  256,
                  2048 );
		   cout << "Alg. " << algorithm_efficiency  << "Testing quadratic sequence with size "  << size << " ";

         //cout << "Alg. " << algorithm_efficiency  << " Testing quadratic sequence with size "  << size << " ";
         for( int i = 0; i < size; i ++ )
            host_input[ i ] = ( i - size / 2 ) * ( i - size / 2 );
         mainReduction( host_input,
                  algorithm_efficiency,
                  256,
                  2048 );
		   cout << endl;
         //cout << endl;
      }
   };

   void testReduction()
   {
   	   cout << "Test FAST reduction" << endl;
	   testReduction( 0 );
      //cout << "Test FAST reduction" << endl;
      //testReduction( 0 );
   }

   void testSimpleReduction5()
   {
   	   cout << "Test reduction 5" << endl;
   	   testReduction( 5 );
      //cout << "Test reduction 5" << endl;
      //testReduction( 5 );
   };

   void testSimpleReduction4()
Loading