Testing and debugging CUDA reduction. (f10011fa) · Commits · TNL / tnl-dev

configure.ac

+5 −5

Original line number	Diff line number	Diff line
		@@ -97,16 +97,16 @@ then
		CXX="nvcc"
		case "$CUDA_ARCH" in
		1.0 )
		CUDA_CXXFLAGS="$CUDA_CXXFLAGS -arch=sm_10"
		CUDA_CXXFLAGS="$CUDA_CXXFLAGS -arch=sm_10 -DCUDA_ARCH=1.0"
		;;
		1.1 )
		CUDA_CXXFLAGS="$CUDA_CXXFLAGS -arch=sm_11"
		CUDA_CXXFLAGS="$CUDA_CXXFLAGS -arch=sm_11 -DCUDA_ARCH=1.1"
		;;
		1.2 )
		CUDA_CXXFLAGS="$CUDA_CXXFLAGS -arch=sm_12"
		CUDA_CXXFLAGS="$CUDA_CXXFLAGS -arch=sm_12 -DCUDA_ARCH=1.2"
		;;
		1.3 )
		CUDA_CXXFLAGS="$CUDA_CXXFLAGS -arch=sm_13"
		CUDA_CXXFLAGS="$CUDA_CXXFLAGS -arch=sm_13 -DCUDA_ARCH=1.3"
		;;
		esac
		DBGCXXFLAGS="$DBGCXXFLAGS -deviceemu"
		@@ -309,7 +309,7 @@ else
		fi
		AC_LANG_RESTORE


		AM_CONDITIONAL([BUILD_CUDA],[test x"$HAVE_CUDA" = xyes] )
		AM_CONDITIONAL([BUILD_MPI],[test x"$MPISTATUS" = xyes] )
		AM_CONDITIONAL([BUILD_DBG],[test x"$DBGSTATUS" = xyes] )
		AM_CONDITIONAL([BUILD_MPI_DBG],[test x"$DBGSTATUS" = xyes -a x"$MPISTATUS" = xyes] )

src/Makefile.am

+8 −2

Original line number	Diff line number	Diff line
		@@ -48,12 +48,18 @@ libtnl_mpi_dbg_0_1_la_LIBADD = debug/libtnldebug-mpi-dbg-0.1.la \
		endif

		tnl_unit_tests_sources = tnl-unit-tests.cpp
		tnl_benchmarks_sources = tnl-benchmarks.cpp

		check_PROGRAMS = tnl-unit-tests \
		tnl-benchmarks

		check_PROGRAMS = tnl-unit-tests
		tnl_unit_tests_SOURCES = $(tnl_unit_tests_sources)
		tnl_unit_tests_LDADD = libtnl-0.1.la \
		core/libcore-tests.la

		tnl_benchmarks_SOURCES = $(tnl_benchmarks_sources)
		tnl_benchmarks_LDADD = libtnl-0.1.la

		if BUILD_DBG
		bin_PROGRAMS = tnl-unit-tests-dbg
		tnl_unit_tests_dbg_SOURCES = $(tnl_unit_tests_sources)
		@@ -63,4 +69,4 @@ endif



		TESTS = tnl-unit-tests
		No newline at end of file
		TESTS = tnl-benchmarks
		No newline at end of file

src/core/Makefile.am

+7 −3

Original line number	Diff line number	Diff line
		@@ -56,12 +56,15 @@ sources = tnlConfigDescription.cpp \
		tnlTimerCPU.cpp \
		tnlTimerRT.cpp \
		tnlTester.cpp \
		tnl-cuda-kernels.cu \
		compress-file.cpp \
		mfilename.cpp \
		mpi-supp.cpp \
		parse.cc

		if BUILD_CUDA
		sources += tnl-cuda-kernels.cu
		endif

		libmcoreincludedir = $(TNL_INCLUDE_DIR)/core
		libmcoreinclude_HEADERS = $(headers)

		@@ -95,10 +98,11 @@ libcore_tests_sources = tnlStringTester.cpp \
		tnlStringTester.h \
		tnlObjectTester.cpp \
		tnlObjectTester.h \
		tnlLongVectorCUDATester.cu \
		tnlLongVectorCUDATester.cu.h \
		tnlLongVectorCUDATester.h

		if BUILD_CUDA
		libcore_tests_sources += tnlLongVectorCUDATester.cu
		endif

		check_LTLIBRARIES = libcore-tests.la
		libcore_tests_la_SOURCES = $(libcore_tests_sources)

src/core/tnl-cuda-kernels.h

+379 −236

Original line number	Diff line number	Diff line
		@@ -298,7 +298,7 @@ bool tnlCUDAReduction( const int size,
		size_reduced = grid_size. x;
		reduction_input = device_output;

		// debuging part
		// Debugging part
		/T host_array = new T[ size ];
		cudaMemcpy( host_array, dbg_array1, size * sizeof( T ), cudaMemcpyDeviceToHost );
		for( int i = 0; i< size; i ++ )
		@@ -353,7 +353,7 @@ __global__ void tnlCUDASimpleReductionKernel5( const int size,
		int tid = threadIdx. x;
		int gid = 2 * blockIdx. x * blockDim. x + threadIdx. x;
		// Last thread ID which manipulates meaningful data
		//int last_tid = size - 2 * blockIdx. x * blockDim. x;
		int last_tid = size - 2 * blockIdx. x * blockDim. x;
		if( gid + blockDim. x < size )
		{
		if( operation == tnlMin )
		@@ -370,43 +370,105 @@ __global__ void tnlCUDASimpleReductionKernel5( const int size,
		__syncthreads();

		// Parallel reduction
		if( blockSize == 512 )
		int n = last_tid < blockDim. x ? last_tid : blockDim. x;
		int s = n / 2;
		if( gid < size )
		dbg_array1[ gid ] = -s;

		if( n == 512 )
		{
		if( tid < 256 )
		if( tid < s )
		{
		if( operation == tnlMin )
		sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 256 ] );
		sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + s ] );
		if( operation == tnlMax )
		sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 256 ] );
		sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + s ] );
		if( operation == tnlSum )
		sdata[ tid ] += sdata[ tid + 256 ];
		sdata[ tid ] += sdata[ tid + s ];
		}
		if( 2 * s < n && tid == n - 1 )
		{
		if( operation == tnlMin )
		sdata[ 0 ] = tnlCudaMin( sdata[ 0 ], sdata[ tid ] );
		if( operation == tnlMax )
		sdata[ 0 ] = tnlCudaMax( sdata[ 0 ], sdata[ tid ] );
		if( operation == tnlSum )
		sdata[ 0 ] += sdata[ tid ];
		}
		n = s;
		s = n / 2;
		__syncthreads();
		}
		if( blockSize >= 256 )
		if( n >= 256 )
		{
		if( tid < 128 )
		if( tid < s )
		{
		if( operation == tnlMin )
		sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 128 ] );
		sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + s ] );
		if( operation == tnlMax )
		sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 128 ] );
		sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + s ] );
		if( operation == tnlSum )
		sdata[ tid ] += sdata[ tid + 128 ];
		sdata[ tid ] += sdata[ tid + s ];
		}
		if( 2 * s < n && tid == n - 1 )
		{
		if( operation == tnlMin )
		sdata[ 0 ] = tnlCudaMin( sdata[ 0 ], sdata[ tid ] );
		if( operation == tnlMax )
		sdata[ 0 ] = tnlCudaMax( sdata[ 0 ], sdata[ tid ] );
		if( operation == tnlSum )
		sdata[ 0 ] += sdata[ tid ];
		}
		n = s;
		s = n / 2;
		__syncthreads();
		}
		if( blockSize >= 128 )
		if( n >= 128 )
		{
		if (tid< 64)
		if( tid < s )
		{
		if( operation == tnlMin )
		sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 64 ] );
		sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + s ] );
		if( operation == tnlMax )
		sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 64 ] );
		sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + s ] );
		if( operation == tnlSum )
		sdata[ tid ] += sdata[ tid + 64 ];
		sdata[ tid ] += sdata[ tid + s ];
		}
		if( 2 * s < n && tid == n - 1 )
		{
		if( operation == tnlMin )
		sdata[ 0 ] = tnlCudaMin( sdata[ 0 ], sdata[ tid ] );
		if( operation == tnlMax )
		sdata[ 0 ] = tnlCudaMax( sdata[ 0 ], sdata[ tid ] );
		if( operation == tnlSum )
		sdata[ 0 ] += sdata[ tid ];
		}
		n = s;
		s = n / 2;
		__syncthreads();
		}
		if( n >= 64 )
		{
		if( tid < s )
		{
		if( operation == tnlMin )
		sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + s ] );
		if( operation == tnlMax )
		sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + s ] );
		if( operation == tnlSum )
		sdata[ tid ] += sdata[ tid + s ];
		}
		if( 2 * s < n && tid == n - 1 )
		{
		if( operation == tnlMin )
		sdata[ 0 ] = tnlCudaMin( sdata[ 0 ], sdata[ tid ] );
		if( operation == tnlMax )
		sdata[ 0 ] = tnlCudaMax( sdata[ 0 ], sdata[ tid ] );
		if( operation == tnlSum )
		sdata[ 0 ] += sdata[ tid ];
		}
		n = s;
		s = n / 2;
		__syncthreads();
		}
		/*
		@@ -414,52 +476,101 @@ __global__ void tnlCUDASimpleReductionKernel5( const int size,
		*/
		if( tid < 32 )
		{
		if( blockSize >= 64 )
		if( n >= 32 )
		{
		if( tid < s )
		{
		if( operation == tnlMin )
		sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 32 ] );
		sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + s ] );
		if( operation == tnlMax )
		sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 32 ] );
		sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + s ] );
		if( operation == tnlSum )
		sdata[ tid ] += sdata[ tid + 32 ];
		sdata[ tid ] += sdata[ tid + s ];
		}
		if( blockSize >= 32 )
		if( 2 * s < n && tid == n - 1 )
		{
		if( operation == tnlMin )
		sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 16 ] );
		sdata[ 0 ] = tnlCudaMin( sdata[ 0 ], sdata[ tid ] );
		if( operation == tnlMax )
		sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 16 ] );
		sdata[ 0 ] = tnlCudaMax( sdata[ 0 ], sdata[ tid ] );
		if( operation == tnlSum )
		sdata[ tid ] += sdata[ tid + 16 ];
		sdata[ 0 ] += sdata[ tid ];
		}
		if( blockSize >= 16 )
		n = s;
		s = n / 2;
		}
		if( n >= 16 )
		{
		if( tid < s )
		{
		if( operation == tnlMin )
		sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 8 ] );
		sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + s ] );
		if( operation == tnlMax )
		sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 8 ] );
		sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + s ] );
		if( operation == tnlSum )
		sdata[ tid ] += sdata[ tid + 8 ];
		}
		if( blockSize >= 8 )
		if( 2 * s < n && tid == n - 1 )
		{
		if( operation == tnlMin )
		sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 4 ] );
		sdata[ 0 ] = tnlCudaMin( sdata[ 0 ], sdata[ tid ] );
		if( operation == tnlMax )
		sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 4 ] );
		sdata[ 0 ] = tnlCudaMax( sdata[ 0 ], sdata[ tid ] );
		if( operation == tnlSum )
		sdata[ tid ] += sdata[ tid + 4 ];
		sdata[ 0 ] += sdata[ tid ];
		}
		if( blockSize >= 4 )
		n = s;
		s = n / 2;
		}
		if( n >= 8 )
		{
		if( tid < s )
		{
		if( operation == tnlMin )
		sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 2 ] );
		sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + s ] );
		if( operation == tnlMax )
		sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 2 ] );
		sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + s ] );
		if( operation == tnlSum )
		sdata[ tid ] += sdata[ tid + 2 ];
		sdata[ tid ] += sdata[ tid + s ];
		}
		if( blockSize >= 2 )
		if( 2 * s < n && tid == n - 1 )
		{
		if( operation == tnlMin )
		sdata[ 0 ] = tnlCudaMin( sdata[ 0 ], sdata[ tid ] );
		if( operation == tnlMax )
		sdata[ 0 ] = tnlCudaMax( sdata[ 0 ], sdata[ tid ] );
		if( operation == tnlSum )
		sdata[ 0 ] += sdata[ tid ];
		}
		n = s;
		s = n / 2;
		}
		if( n >= 4 )
		{
		if( tid < s )
		{
		if( operation == tnlMin )
		sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + s ] );
		if( operation == tnlMax )
		sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + s ] );
		if( operation == tnlSum )
		sdata[ tid ] += sdata[ tid + s ];
		}
		if( 2 * s < n && tid == n - 1 )
		{
		if( operation == tnlMin )
		sdata[ 0 ] = tnlCudaMin( sdata[ 0 ], sdata[ tid ] );
		if( operation == tnlMax )
		sdata[ 0 ] = tnlCudaMax( sdata[ 0 ], sdata[ tid ] );
		if( operation == tnlSum )
		sdata[ 0 ] += sdata[ tid ];
		}
		n = s;
		s = n / 2;
		}
		if( n >= 2 )
		{
		if( tid < s )
		{
		if( operation == tnlMin )
		sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 1 ] );
		@@ -468,6 +579,16 @@ __global__ void tnlCUDASimpleReductionKernel5( const int size,
		if( operation == tnlSum )
		sdata[ tid ] += sdata[ tid + 1 ];
		}
		if( 2 * s < n && tid == n - 1 )
		{
		if( operation == tnlMin )
		sdata[ 0 ] = tnlCudaMin( sdata[ 0 ], sdata[ tid ] );
		if( operation == tnlMax )
		sdata[ 0 ] = tnlCudaMax( sdata[ 0 ], sdata[ tid ] );
		if( operation == tnlSum )
		sdata[ 0 ] += sdata[ tid ];
		}
		}
		}

		// Store the result back in global memory
		@@ -485,7 +606,7 @@ bool tnlCUDASimpleReduction5( const int size,
		const int cpuThreshold = 1;
		const int desBlockSize = 16; //Desired block size

		//T* dbg_array1;
		T* dbg_array1;

		bool device_output_allocated( false );
		if( ! device_output )
		@@ -495,12 +616,12 @@ bool tnlCUDASimpleReduction5( const int size,
		if( cudaGetLastError() != cudaSuccess )
		{
		cerr << "Unable to allocate device memory with size " << bytes_alloc << "." << endl;
		abort();
		return false;
		}
		device_output_allocated = true;

		//cudaMalloc( ( void** ) &dbg_array1, desBlockSize * sizeof( T ) ); //!!!!!!!!!!!!!!!!!!!!!!!!
		//cudaMalloc( ( void** ) &dbg_array2, desBlockSize * sizeof( T ) ); //!!!!!!!!!!!!!!!!!!!!!!!!!
		cudaMalloc( ( void** ) &dbg_array1, size * sizeof( T ) ); //!!!!!!!!!!!!!!!!!!
		}
		dim3 block_size( 0 ), grid_size( 0 );
		int shmem;
		@@ -509,7 +630,16 @@ bool tnlCUDASimpleReduction5( const int size,
		while( size_reduced > cpuThreshold )
		{
		block_size. x = :: Min( size_reduced, desBlockSize );
		grid_size. x = ( size_reduced / block_size. x + 1 ) / 2;
		grid_size. x = size_reduced / block_size. x / 2;
		if( 2 * grid_size. x * block_size. x < size_reduced )
		grid_size. x ++;
		int bits = 1;
		while( block_size. x > 1 )
		{
		block_size. x >>= 1;
		bits <<= 1;
		}
		block_size. x = bits;
		shmem = block_size. x * sizeof( T );
		/*cout << "Size: " << size_reduced
		<< " Grid size: " << grid_size. x
		@@ -520,56 +650,71 @@ bool tnlCUDASimpleReduction5( const int size,
		switch( block_size. x )
		{
		case 512:
		tnlCUDASimpleReductionKernel5< T, operation, 512 ><<< grid_size. x, block_size, shmem >>>( size_reduced, reduction_input, device_output );
		//cout << "Calling kernel " << block_size. x << endl;
		tnlCUDASimpleReductionKernel5< T, operation, 512 ><<< grid_size. x, block_size, shmem >>>( size_reduced, reduction_input, device_output, dbg_array1 );
		break;
		case 256:
		tnlCUDASimpleReductionKernel5< T, operation, 256 ><<< grid_size. x, block_size, shmem >>>( size_reduced, reduction_input, device_output );
		//cout << "Calling kernel " << block_size. x << endl;
		tnlCUDASimpleReductionKernel5< T, operation, 256 ><<< grid_size. x, block_size, shmem >>>( size_reduced, reduction_input, device_output, dbg_array1 );
		break;
		case 128:
		tnlCUDASimpleReductionKernel5< T, operation, 128 ><<< grid_size. x, block_size, shmem >>>( size_reduced, reduction_input, device_output );
		//cout << "Calling kernel " << block_size. x << endl;
		tnlCUDASimpleReductionKernel5< T, operation, 128 ><<< grid_size. x, block_size, shmem >>>( size_reduced, reduction_input, device_output, dbg_array1 );
		break;
		case 64:
		tnlCUDASimpleReductionKernel5< T, operation, 64 ><<< grid_size. x, block_size, shmem >>>( size_reduced, reduction_input, device_output );
		//cout << "Calling kernel " << block_size. x << endl;
		tnlCUDASimpleReductionKernel5< T, operation, 64 ><<< grid_size. x, block_size, shmem >>>( size_reduced, reduction_input, device_output, dbg_array1 );
		break;
		case 32:
		tnlCUDASimpleReductionKernel5< T, operation, 32 ><<< grid_size. x, block_size, shmem >>>( size_reduced, reduction_input, device_output );
		//cout << "Calling kernel " << block_size. x << endl;
		tnlCUDASimpleReductionKernel5< T, operation, 32 ><<< grid_size. x, block_size, shmem >>>( size_reduced, reduction_input, device_output, dbg_array1 );
		break;
		case 16:
		tnlCUDASimpleReductionKernel5< T, operation, 16 ><<< grid_size. x, block_size, shmem >>>( size_reduced, reduction_input, device_output );
		//cout << "Calling kernel " << block_size. x << endl;
		tnlCUDASimpleReductionKernel5< T, operation, 16 ><<< grid_size. x, block_size, shmem >>>( size_reduced, reduction_input, device_output, dbg_array1 );
		break;
		case 8:
		tnlCUDASimpleReductionKernel5< T, operation, 8 ><<< grid_size. x, block_size, shmem >>>( size_reduced, reduction_input, device_output );
		//cout << "Calling kernel " << block_size. x << endl;
		tnlCUDASimpleReductionKernel5< T, operation, 8 ><<< grid_size. x, block_size, shmem >>>( size_reduced, reduction_input, device_output, dbg_array1 );
		break;
		case 4:
		tnlCUDASimpleReductionKernel5< T, operation, 4 ><<< grid_size. x, block_size, shmem >>>( size_reduced, reduction_input, device_output );
		//cout << "Calling kernel " << block_size. x << endl;
		tnlCUDASimpleReductionKernel5< T, operation, 4 ><<< grid_size. x, block_size, shmem >>>( size_reduced, reduction_input, device_output, dbg_array1 );
		break;
		case 2:
		tnlCUDASimpleReductionKernel5< T, operation, 2 ><<< grid_size. x, block_size, shmem >>>( size_reduced, reduction_input, device_output );
		//cout << "Calling kernel " << block_size. x << endl;
		tnlCUDASimpleReductionKernel5< T, operation, 2 ><<< grid_size. x, block_size, shmem >>>( size_reduced, reduction_input, device_output, dbg_array1 );
		break;
		case 1:
		tnlAssert( false, cerr << "blockSize should not be 1." << endl );
		abort();
		break;
		default:
		tnlAssert( false, cerr << "Block size is " << block_size. x << " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." );
		abort();
		break;
		}
		//cout << "+++++++++++++++++++++" << endl;
		size_reduced = grid_size. x;
		reduction_input = device_output;

		// debuging part
		/T host_array = new T[ desBlockSize ];
		cudaMemcpy( host_array, dbg_array1, desBlockSize * sizeof( T ), cudaMemcpyDeviceToHost );
		for( int i = 0; i< :: Min( ( int ) block_size. x, desBlockSize ); i ++ )
		cout << host_array[ i ] << " - ";
		/T host_array = new T[ size ];
		cudaMemcpy( host_array, dbg_array1, size * sizeof( T ), cudaMemcpyDeviceToHost );
		cout << "Dbg. array: ";
		for( int i = 0; i < size ; i ++ )
		cout << host_array[ i ] << " ";
		cout << endl;
		cudaFree( dbg_array1 );

		T* output = new T[ size_reduced ];
		cudaMemcpy( output, device_output, size_reduced * sizeof( T ), cudaMemcpyDeviceToHost );
		cout << endl;
		cout << endl << "Reduced data: ";
		for( int i = 0; i < size_reduced; i ++ )
		cout << output[ i ] << " ";
		cout << endl;
		delete[] output;*/

		}
		T* host_output = new T[ size_reduced ];
		if( size == 1 )
		@@ -653,10 +798,8 @@ __global__ void tnlCUDASimpleReductionKernel4( const int size,
		sdata[ 0 ] = tnlCudaMax( sdata[ 0 ], sdata[ tid ] );
		if( operation == tnlSum )
		sdata[ 0 ] += sdata[ tid ];
		dbg_array1[ 0 ] = sdata[ tid ];
		}
		n = s;

		__syncthreads();
		//dbg_array1[ tid ] = -sdata[ tid ];

		@@ -691,7 +834,7 @@ bool tnlCUDASimpleReduction4( const int size,
		}
		device_output_allocated = true;

		//cudaMalloc( ( void** ) &dbg_array1, desBlockSize * sizeof( T ) ); //!!!!!!!!!!!!!!!!!!!!!!!!
		cudaMalloc( ( void** ) &dbg_array1, desBlockSize * sizeof( T ) ); //!!!!!!!!!!!!!!!!!!!!!!!!
		//cudaMalloc( ( void** ) &dbg_array2, desBlockSize * sizeof( T ) ); //!!!!!!!!!!!!!!!!!!!!!!!!!
		}
		dim3 block_size( 0 ), grid_size( 0 );

src/core/tnlCUDAKernelsTester.h

+181 −168

Original line number	Diff line number	Diff line
		@@ -174,9 +174,9 @@ template< class T > class tnlCUDAKernelsTester : public CppUnit :: TestCase
		GetParameterType( param ) == "double" )
		{
		if( min != seq_min )
		cerr << "Diff. min = " << min << " seq. min = " << seq_min;
		cout << "Diff. min = " << min << " seq. min = " << seq_min;
		if( max != seq_max )
		cerr << "Diff. max = " << max << " seq. max = " << seq_max;
		cout << "Diff. max = " << max << " seq. max = " << seq_max;
		CPPUNIT_ASSERT( min == seq_min );
		CPPUNIT_ASSERT( max == seq_max );
		if( sum == 0.0 )
		@@ -187,18 +187,30 @@ template< class T > class tnlCUDAKernelsTester : public CppUnit :: TestCase
		{
		double diff = ( ( double ) sum - ( double ) seq_sum ) / ( double) sum;
		if( fabs( diff > 1.0e-5 ) )
		cerr << "Diff is " << diff << " for " << GetParameterType( param ) << endl;
		{
		cout << "Diff is " << diff << " for " << GetParameterType( param ) << endl;
		abort();
		}
		CPPUNIT_ASSERT( fabs( diff ) < 1.0e-5 );
		}
		}
		else
		{
		if( min != seq_min )
		cerr << "Diff. min = " << min << " seq. min = " << seq_min;
		{
		cout << "Diff. min = " << min << " seq. min = " << seq_min;
		abort();
		}
		if( max != seq_max )
		cerr << "Diff. max = " << max << " seq. max = " << seq_max;
		{
		cout << "Diff. max = " << max << " seq. max = " << seq_max;
		abort();
		}
		if( sum != seq_sum )
		cerr << "Diff. sum = " << sum << " seq. sum = " << seq_sum;
		{
		cout << "Diff. sum = " << sum << " seq. sum = " << seq_sum;
		abort();
		}
		CPPUNIT_ASSERT( min == seq_min );
		CPPUNIT_ASSERT( max == seq_max );
		CPPUNIT_ASSERT( sum == seq_sum );
		@@ -210,7 +222,7 @@ template< class T > class tnlCUDAKernelsTester : public CppUnit :: TestCase
		{
		tnlLongVector< T > host_input;
		int size = 2;
		for( int s = 1; s < 12; s ++ )
		/*for( int s = 1; s < 12; s ++ )
		{
		tnlLongVector< T > host_input( size );

		@@ -245,12 +257,12 @@ template< class T > class tnlCUDAKernelsTester : public CppUnit :: TestCase
		2048 );
		size *= 2;
		cout << endl;
		}
		}*/
		for( size = 1; size < 5000; size ++ )
		{
		tnlLongVector< T > host_input( size );

		cout << "Alg. " << algorithm_efficiency << "Testing zeros with size " << size << " ";
		//cout << "Alg. " << algorithm_efficiency << " Testing zeros with size " << size << " ";
		for( int i = 0; i < size; i ++ )
		host_input[ i ] = 0.0;
		mainReduction( host_input,
		@@ -258,41 +270,42 @@ template< class T > class tnlCUDAKernelsTester : public CppUnit :: TestCase
		256,
		2048 );

		cout << "Alg. " << algorithm_efficiency << "Testing ones with size " << size << " ";
		//cout << "Alg. " << algorithm_efficiency << " Testing ones with size " << size << " ";
		for( int i = 0; i < size; i ++ )
		host_input[ i ] = 1.0;
		mainReduction( host_input,
		algorithm_efficiency,
		256,
		2048 );
		cout << "Alg. " << algorithm_efficiency << "Testing linear sequence with size " << size << " ";
		//cout << "Alg. " << algorithm_efficiency << " Testing linear sequence with size " << size << " ";
		for( int i = 0; i < size; i ++ )
		host_input[ i ] = i;
		mainReduction( host_input,
		algorithm_efficiency,
		256,
		2048 );
		cout << "Alg. " << algorithm_efficiency << "Testing quadratic sequence with size " << size << " ";

		//cout << "Alg. " << algorithm_efficiency << " Testing quadratic sequence with size " << size << " ";
		for( int i = 0; i < size; i ++ )
		host_input[ i ] = ( i - size / 2 ) * ( i - size / 2 );
		mainReduction( host_input,
		algorithm_efficiency,
		256,
		2048 );
		cout << endl;
		//cout << endl;
		}
		};

		void testReduction()
		{
		cout << "Test FAST reduction" << endl;
		testReduction( 0 );
		//cout << "Test FAST reduction" << endl;
		//testReduction( 0 );
		}

		void testSimpleReduction5()
		{
		cout << "Test reduction 5" << endl;
		testReduction( 5 );
		//cout << "Test reduction 5" << endl;
		//testReduction( 5 );
		};

		void testSimpleReduction4()