Loading configure.ac +5 −5 Original line number Diff line number Diff line Loading @@ -97,16 +97,16 @@ then CXX="nvcc" case "$CUDA_ARCH" in 1.0 ) CUDA_CXXFLAGS="$CUDA_CXXFLAGS -arch=sm_10" CUDA_CXXFLAGS="$CUDA_CXXFLAGS -arch=sm_10 -DCUDA_ARCH=1.0" ;; 1.1 ) CUDA_CXXFLAGS="$CUDA_CXXFLAGS -arch=sm_11" CUDA_CXXFLAGS="$CUDA_CXXFLAGS -arch=sm_11 -DCUDA_ARCH=1.1" ;; 1.2 ) CUDA_CXXFLAGS="$CUDA_CXXFLAGS -arch=sm_12" CUDA_CXXFLAGS="$CUDA_CXXFLAGS -arch=sm_12 -DCUDA_ARCH=1.2" ;; 1.3 ) CUDA_CXXFLAGS="$CUDA_CXXFLAGS -arch=sm_13" CUDA_CXXFLAGS="$CUDA_CXXFLAGS -arch=sm_13 -DCUDA_ARCH=1.3" ;; esac DBGCXXFLAGS="$DBGCXXFLAGS -deviceemu" Loading Loading @@ -309,7 +309,7 @@ else fi AC_LANG_RESTORE AM_CONDITIONAL([BUILD_CUDA],[test x"$HAVE_CUDA" = xyes] ) AM_CONDITIONAL([BUILD_MPI],[test x"$MPISTATUS" = xyes] ) AM_CONDITIONAL([BUILD_DBG],[test x"$DBGSTATUS" = xyes] ) AM_CONDITIONAL([BUILD_MPI_DBG],[test x"$DBGSTATUS" = xyes -a x"$MPISTATUS" = xyes] ) Loading src/Makefile.am +8 −2 Original line number Diff line number Diff line Loading @@ -48,12 +48,18 @@ libtnl_mpi_dbg_0_1_la_LIBADD = debug/libtnldebug-mpi-dbg-0.1.la \ endif tnl_unit_tests_sources = tnl-unit-tests.cpp tnl_benchmarks_sources = tnl-benchmarks.cpp check_PROGRAMS = tnl-unit-tests \ tnl-benchmarks check_PROGRAMS = tnl-unit-tests tnl_unit_tests_SOURCES = $(tnl_unit_tests_sources) tnl_unit_tests_LDADD = libtnl-0.1.la \ core/libcore-tests.la tnl_benchmarks_SOURCES = $(tnl_benchmarks_sources) tnl_benchmarks_LDADD = libtnl-0.1.la if BUILD_DBG bin_PROGRAMS = tnl-unit-tests-dbg tnl_unit_tests_dbg_SOURCES = $(tnl_unit_tests_sources) Loading @@ -63,4 +69,4 @@ endif TESTS = tnl-unit-tests No newline at end of file TESTS = tnl-benchmarks No newline at end of file src/core/Makefile.am +7 −3 Original line number Diff line number Diff line Loading @@ -56,12 +56,15 @@ sources = tnlConfigDescription.cpp \ tnlTimerCPU.cpp \ tnlTimerRT.cpp \ tnlTester.cpp \ tnl-cuda-kernels.cu \ compress-file.cpp \ mfilename.cpp \ mpi-supp.cpp \ parse.cc if BUILD_CUDA sources += tnl-cuda-kernels.cu endif libmcoreincludedir = $(TNL_INCLUDE_DIR)/core libmcoreinclude_HEADERS = $(headers) Loading Loading @@ -95,10 +98,11 @@ libcore_tests_sources = tnlStringTester.cpp \ tnlStringTester.h \ tnlObjectTester.cpp \ tnlObjectTester.h \ tnlLongVectorCUDATester.cu \ tnlLongVectorCUDATester.cu.h \ tnlLongVectorCUDATester.h if BUILD_CUDA libcore_tests_sources += tnlLongVectorCUDATester.cu endif check_LTLIBRARIES = libcore-tests.la libcore_tests_la_SOURCES = $(libcore_tests_sources) Loading src/core/tnl-cuda-kernels.h +379 −236 Original line number Diff line number Diff line Loading @@ -298,7 +298,7 @@ bool tnlCUDAReduction( const int size, size_reduced = grid_size. x; reduction_input = device_output; // debuging part // Debugging part /*T* host_array = new T[ size ]; cudaMemcpy( host_array, dbg_array1, size * sizeof( T ), cudaMemcpyDeviceToHost ); for( int i = 0; i< size; i ++ ) Loading Loading @@ -353,7 +353,7 @@ __global__ void tnlCUDASimpleReductionKernel5( const int size, int tid = threadIdx. x; int gid = 2 * blockIdx. x * blockDim. x + threadIdx. x; // Last thread ID which manipulates meaningful data //int last_tid = size - 2 * blockIdx. x * blockDim. x; int last_tid = size - 2 * blockIdx. x * blockDim. x; if( gid + blockDim. x < size ) { if( operation == tnlMin ) Loading @@ -370,43 +370,105 @@ __global__ void tnlCUDASimpleReductionKernel5( const int size, __syncthreads(); // Parallel reduction if( blockSize == 512 ) int n = last_tid < blockDim. x ? last_tid : blockDim. x; int s = n / 2; if( gid < size ) dbg_array1[ gid ] = -s; if( n == 512 ) { if( tid < 256 ) if( tid < s ) { if( operation == tnlMin ) sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 256 ] ); sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + s ] ); if( operation == tnlMax ) sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 256 ] ); sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + s ] ); if( operation == tnlSum ) sdata[ tid ] += sdata[ tid + 256 ]; sdata[ tid ] += sdata[ tid + s ]; } if( 2 * s < n && tid == n - 1 ) { if( operation == tnlMin ) sdata[ 0 ] = tnlCudaMin( sdata[ 0 ], sdata[ tid ] ); if( operation == tnlMax ) sdata[ 0 ] = tnlCudaMax( sdata[ 0 ], sdata[ tid ] ); if( operation == tnlSum ) sdata[ 0 ] += sdata[ tid ]; } n = s; s = n / 2; __syncthreads(); } if( blockSize >= 256 ) if( n >= 256 ) { if( tid < 128 ) if( tid < s ) { if( operation == tnlMin ) sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 128 ] ); sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + s ] ); if( operation == tnlMax ) sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 128 ] ); sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + s ] ); if( operation == tnlSum ) sdata[ tid ] += sdata[ tid + 128 ]; sdata[ tid ] += sdata[ tid + s ]; } if( 2 * s < n && tid == n - 1 ) { if( operation == tnlMin ) sdata[ 0 ] = tnlCudaMin( sdata[ 0 ], sdata[ tid ] ); if( operation == tnlMax ) sdata[ 0 ] = tnlCudaMax( sdata[ 0 ], sdata[ tid ] ); if( operation == tnlSum ) sdata[ 0 ] += sdata[ tid ]; } n = s; s = n / 2; __syncthreads(); } if( blockSize >= 128 ) if( n >= 128 ) { if (tid< 64) if( tid < s ) { if( operation == tnlMin ) sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 64 ] ); sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + s ] ); if( operation == tnlMax ) sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 64 ] ); sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + s ] ); if( operation == tnlSum ) sdata[ tid ] += sdata[ tid + 64 ]; sdata[ tid ] += sdata[ tid + s ]; } if( 2 * s < n && tid == n - 1 ) { if( operation == tnlMin ) sdata[ 0 ] = tnlCudaMin( sdata[ 0 ], sdata[ tid ] ); if( operation == tnlMax ) sdata[ 0 ] = tnlCudaMax( sdata[ 0 ], sdata[ tid ] ); if( operation == tnlSum ) sdata[ 0 ] += sdata[ tid ]; } n = s; s = n / 2; __syncthreads(); } if( n >= 64 ) { if( tid < s ) { if( operation == tnlMin ) sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + s ] ); if( operation == tnlMax ) sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + s ] ); if( operation == tnlSum ) sdata[ tid ] += sdata[ tid + s ]; } if( 2 * s < n && tid == n - 1 ) { if( operation == tnlMin ) sdata[ 0 ] = tnlCudaMin( sdata[ 0 ], sdata[ tid ] ); if( operation == tnlMax ) sdata[ 0 ] = tnlCudaMax( sdata[ 0 ], sdata[ tid ] ); if( operation == tnlSum ) sdata[ 0 ] += sdata[ tid ]; } n = s; s = n / 2; __syncthreads(); } /* Loading @@ -414,52 +476,101 @@ __global__ void tnlCUDASimpleReductionKernel5( const int size, */ if( tid < 32 ) { if( blockSize >= 64 ) if( n >= 32 ) { if( tid < s ) { if( operation == tnlMin ) sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 32 ] ); sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + s ] ); if( operation == tnlMax ) sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 32 ] ); sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + s ] ); if( operation == tnlSum ) sdata[ tid ] += sdata[ tid + 32 ]; sdata[ tid ] += sdata[ tid + s ]; } if( blockSize >= 32 ) if( 2 * s < n && tid == n - 1 ) { if( operation == tnlMin ) sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 16 ] ); sdata[ 0 ] = tnlCudaMin( sdata[ 0 ], sdata[ tid ] ); if( operation == tnlMax ) sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 16 ] ); sdata[ 0 ] = tnlCudaMax( sdata[ 0 ], sdata[ tid ] ); if( operation == tnlSum ) sdata[ tid ] += sdata[ tid + 16 ]; sdata[ 0 ] += sdata[ tid ]; } if( blockSize >= 16 ) n = s; s = n / 2; } if( n >= 16 ) { if( tid < s ) { if( operation == tnlMin ) sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 8 ] ); sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + s ] ); if( operation == tnlMax ) sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 8 ] ); sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + s ] ); if( operation == tnlSum ) sdata[ tid ] += sdata[ tid + 8 ]; } if( blockSize >= 8 ) if( 2 * s < n && tid == n - 1 ) { if( operation == tnlMin ) sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 4 ] ); sdata[ 0 ] = tnlCudaMin( sdata[ 0 ], sdata[ tid ] ); if( operation == tnlMax ) sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 4 ] ); sdata[ 0 ] = tnlCudaMax( sdata[ 0 ], sdata[ tid ] ); if( operation == tnlSum ) sdata[ tid ] += sdata[ tid + 4 ]; sdata[ 0 ] += sdata[ tid ]; } if( blockSize >= 4 ) n = s; s = n / 2; } if( n >= 8 ) { if( tid < s ) { if( operation == tnlMin ) sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 2 ] ); sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + s ] ); if( operation == tnlMax ) sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 2 ] ); sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + s ] ); if( operation == tnlSum ) sdata[ tid ] += sdata[ tid + 2 ]; sdata[ tid ] += sdata[ tid + s ]; } if( blockSize >= 2 ) if( 2 * s < n && tid == n - 1 ) { if( operation == tnlMin ) sdata[ 0 ] = tnlCudaMin( sdata[ 0 ], sdata[ tid ] ); if( operation == tnlMax ) sdata[ 0 ] = tnlCudaMax( sdata[ 0 ], sdata[ tid ] ); if( operation == tnlSum ) sdata[ 0 ] += sdata[ tid ]; } n = s; s = n / 2; } if( n >= 4 ) { if( tid < s ) { if( operation == tnlMin ) sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + s ] ); if( operation == tnlMax ) sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + s ] ); if( operation == tnlSum ) sdata[ tid ] += sdata[ tid + s ]; } if( 2 * s < n && tid == n - 1 ) { if( operation == tnlMin ) sdata[ 0 ] = tnlCudaMin( sdata[ 0 ], sdata[ tid ] ); if( operation == tnlMax ) sdata[ 0 ] = tnlCudaMax( sdata[ 0 ], sdata[ tid ] ); if( operation == tnlSum ) sdata[ 0 ] += sdata[ tid ]; } n = s; s = n / 2; } if( n >= 2 ) { if( tid < s ) { if( operation == tnlMin ) sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 1 ] ); Loading @@ -468,6 +579,16 @@ __global__ void tnlCUDASimpleReductionKernel5( const int size, if( operation == tnlSum ) sdata[ tid ] += sdata[ tid + 1 ]; } if( 2 * s < n && tid == n - 1 ) { if( operation == tnlMin ) sdata[ 0 ] = tnlCudaMin( sdata[ 0 ], sdata[ tid ] ); if( operation == tnlMax ) sdata[ 0 ] = tnlCudaMax( sdata[ 0 ], sdata[ tid ] ); if( operation == tnlSum ) sdata[ 0 ] += sdata[ tid ]; } } } // Store the result back in global memory Loading @@ -485,7 +606,7 @@ bool tnlCUDASimpleReduction5( const int size, const int cpuThreshold = 1; const int desBlockSize = 16; //Desired block size //T* dbg_array1; T* dbg_array1; bool device_output_allocated( false ); if( ! device_output ) Loading @@ -495,12 +616,12 @@ bool tnlCUDASimpleReduction5( const int size, if( cudaGetLastError() != cudaSuccess ) { cerr << "Unable to allocate device memory with size " << bytes_alloc << "." << endl; abort(); return false; } device_output_allocated = true; //cudaMalloc( ( void** ) &dbg_array1, desBlockSize * sizeof( T ) ); //!!!!!!!!!!!!!!!!!!!!!!!! //cudaMalloc( ( void** ) &dbg_array2, desBlockSize * sizeof( T ) ); //!!!!!!!!!!!!!!!!!!!!!!!!! cudaMalloc( ( void** ) &dbg_array1, size * sizeof( T ) ); //!!!!!!!!!!!!!!!!!! } dim3 block_size( 0 ), grid_size( 0 ); int shmem; Loading @@ -509,7 +630,16 @@ bool tnlCUDASimpleReduction5( const int size, while( size_reduced > cpuThreshold ) { block_size. x = :: Min( size_reduced, desBlockSize ); grid_size. x = ( size_reduced / block_size. x + 1 ) / 2; grid_size. x = size_reduced / block_size. x / 2; if( 2 * grid_size. x * block_size. x < size_reduced ) grid_size. x ++; int bits = 1; while( block_size. x > 1 ) { block_size. x >>= 1; bits <<= 1; } block_size. x = bits; shmem = block_size. x * sizeof( T ); /*cout << "Size: " << size_reduced << " Grid size: " << grid_size. x Loading @@ -520,56 +650,71 @@ bool tnlCUDASimpleReduction5( const int size, switch( block_size. x ) { case 512: tnlCUDASimpleReductionKernel5< T, operation, 512 ><<< grid_size. x, block_size, shmem >>>( size_reduced, reduction_input, device_output ); //cout << "Calling kernel " << block_size. x << endl; tnlCUDASimpleReductionKernel5< T, operation, 512 ><<< grid_size. x, block_size, shmem >>>( size_reduced, reduction_input, device_output, dbg_array1 ); break; case 256: tnlCUDASimpleReductionKernel5< T, operation, 256 ><<< grid_size. x, block_size, shmem >>>( size_reduced, reduction_input, device_output ); //cout << "Calling kernel " << block_size. x << endl; tnlCUDASimpleReductionKernel5< T, operation, 256 ><<< grid_size. x, block_size, shmem >>>( size_reduced, reduction_input, device_output, dbg_array1 ); break; case 128: tnlCUDASimpleReductionKernel5< T, operation, 128 ><<< grid_size. x, block_size, shmem >>>( size_reduced, reduction_input, device_output ); //cout << "Calling kernel " << block_size. x << endl; tnlCUDASimpleReductionKernel5< T, operation, 128 ><<< grid_size. x, block_size, shmem >>>( size_reduced, reduction_input, device_output, dbg_array1 ); break; case 64: tnlCUDASimpleReductionKernel5< T, operation, 64 ><<< grid_size. x, block_size, shmem >>>( size_reduced, reduction_input, device_output ); //cout << "Calling kernel " << block_size. x << endl; tnlCUDASimpleReductionKernel5< T, operation, 64 ><<< grid_size. x, block_size, shmem >>>( size_reduced, reduction_input, device_output, dbg_array1 ); break; case 32: tnlCUDASimpleReductionKernel5< T, operation, 32 ><<< grid_size. x, block_size, shmem >>>( size_reduced, reduction_input, device_output ); //cout << "Calling kernel " << block_size. x << endl; tnlCUDASimpleReductionKernel5< T, operation, 32 ><<< grid_size. x, block_size, shmem >>>( size_reduced, reduction_input, device_output, dbg_array1 ); break; case 16: tnlCUDASimpleReductionKernel5< T, operation, 16 ><<< grid_size. x, block_size, shmem >>>( size_reduced, reduction_input, device_output ); //cout << "Calling kernel " << block_size. x << endl; tnlCUDASimpleReductionKernel5< T, operation, 16 ><<< grid_size. x, block_size, shmem >>>( size_reduced, reduction_input, device_output, dbg_array1 ); break; case 8: tnlCUDASimpleReductionKernel5< T, operation, 8 ><<< grid_size. x, block_size, shmem >>>( size_reduced, reduction_input, device_output ); //cout << "Calling kernel " << block_size. x << endl; tnlCUDASimpleReductionKernel5< T, operation, 8 ><<< grid_size. x, block_size, shmem >>>( size_reduced, reduction_input, device_output, dbg_array1 ); break; case 4: tnlCUDASimpleReductionKernel5< T, operation, 4 ><<< grid_size. x, block_size, shmem >>>( size_reduced, reduction_input, device_output ); //cout << "Calling kernel " << block_size. x << endl; tnlCUDASimpleReductionKernel5< T, operation, 4 ><<< grid_size. x, block_size, shmem >>>( size_reduced, reduction_input, device_output, dbg_array1 ); break; case 2: tnlCUDASimpleReductionKernel5< T, operation, 2 ><<< grid_size. x, block_size, shmem >>>( size_reduced, reduction_input, device_output ); //cout << "Calling kernel " << block_size. x << endl; tnlCUDASimpleReductionKernel5< T, operation, 2 ><<< grid_size. x, block_size, shmem >>>( size_reduced, reduction_input, device_output, dbg_array1 ); break; case 1: tnlAssert( false, cerr << "blockSize should not be 1." << endl ); abort(); break; default: tnlAssert( false, cerr << "Block size is " << block_size. x << " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." ); abort(); break; } //cout << "+++++++++++++++++++++" << endl; size_reduced = grid_size. x; reduction_input = device_output; // debuging part /*T* host_array = new T[ desBlockSize ]; cudaMemcpy( host_array, dbg_array1, desBlockSize * sizeof( T ), cudaMemcpyDeviceToHost ); for( int i = 0; i< :: Min( ( int ) block_size. x, desBlockSize ); i ++ ) cout << host_array[ i ] << " - "; /*T* host_array = new T[ size ]; cudaMemcpy( host_array, dbg_array1, size * sizeof( T ), cudaMemcpyDeviceToHost ); cout << "Dbg. array: "; for( int i = 0; i < size ; i ++ ) cout << host_array[ i ] << " "; cout << endl; cudaFree( dbg_array1 ); T* output = new T[ size_reduced ]; cudaMemcpy( output, device_output, size_reduced * sizeof( T ), cudaMemcpyDeviceToHost ); cout << endl; cout << endl << "Reduced data: "; for( int i = 0; i < size_reduced; i ++ ) cout << output[ i ] << " "; cout << endl; delete[] output;*/ } T* host_output = new T[ size_reduced ]; if( size == 1 ) Loading Loading @@ -653,10 +798,8 @@ __global__ void tnlCUDASimpleReductionKernel4( const int size, sdata[ 0 ] = tnlCudaMax( sdata[ 0 ], sdata[ tid ] ); if( operation == tnlSum ) sdata[ 0 ] += sdata[ tid ]; dbg_array1[ 0 ] = sdata[ tid ]; } n = s; __syncthreads(); //dbg_array1[ tid ] = -sdata[ tid ]; Loading Loading @@ -691,7 +834,7 @@ bool tnlCUDASimpleReduction4( const int size, } device_output_allocated = true; //cudaMalloc( ( void** ) &dbg_array1, desBlockSize * sizeof( T ) ); //!!!!!!!!!!!!!!!!!!!!!!!! cudaMalloc( ( void** ) &dbg_array1, desBlockSize * sizeof( T ) ); //!!!!!!!!!!!!!!!!!!!!!!!! //cudaMalloc( ( void** ) &dbg_array2, desBlockSize * sizeof( T ) ); //!!!!!!!!!!!!!!!!!!!!!!!!! } dim3 block_size( 0 ), grid_size( 0 ); Loading src/core/tnlCUDAKernelsTester.h +181 −168 Original line number Diff line number Diff line Loading @@ -174,9 +174,9 @@ template< class T > class tnlCUDAKernelsTester : public CppUnit :: TestCase GetParameterType( param ) == "double" ) { if( min != seq_min ) cerr << "Diff. min = " << min << " seq. min = " << seq_min; cout << "Diff. min = " << min << " seq. min = " << seq_min; if( max != seq_max ) cerr << "Diff. max = " << max << " seq. max = " << seq_max; cout << "Diff. max = " << max << " seq. max = " << seq_max; CPPUNIT_ASSERT( min == seq_min ); CPPUNIT_ASSERT( max == seq_max ); if( sum == 0.0 ) Loading @@ -187,18 +187,30 @@ template< class T > class tnlCUDAKernelsTester : public CppUnit :: TestCase { double diff = ( ( double ) sum - ( double ) seq_sum ) / ( double) sum; if( fabs( diff > 1.0e-5 ) ) cerr << "Diff is " << diff << " for " << GetParameterType( param ) << endl; { cout << "Diff is " << diff << " for " << GetParameterType( param ) << endl; abort(); } CPPUNIT_ASSERT( fabs( diff ) < 1.0e-5 ); } } else { if( min != seq_min ) cerr << "Diff. min = " << min << " seq. min = " << seq_min; { cout << "Diff. min = " << min << " seq. min = " << seq_min; abort(); } if( max != seq_max ) cerr << "Diff. max = " << max << " seq. max = " << seq_max; { cout << "Diff. max = " << max << " seq. max = " << seq_max; abort(); } if( sum != seq_sum ) cerr << "Diff. sum = " << sum << " seq. sum = " << seq_sum; { cout << "Diff. sum = " << sum << " seq. sum = " << seq_sum; abort(); } CPPUNIT_ASSERT( min == seq_min ); CPPUNIT_ASSERT( max == seq_max ); CPPUNIT_ASSERT( sum == seq_sum ); Loading @@ -210,7 +222,7 @@ template< class T > class tnlCUDAKernelsTester : public CppUnit :: TestCase { tnlLongVector< T > host_input; int size = 2; for( int s = 1; s < 12; s ++ ) /*for( int s = 1; s < 12; s ++ ) { tnlLongVector< T > host_input( size ); Loading Loading @@ -245,12 +257,12 @@ template< class T > class tnlCUDAKernelsTester : public CppUnit :: TestCase 2048 ); size *= 2; cout << endl; } }*/ for( size = 1; size < 5000; size ++ ) { tnlLongVector< T > host_input( size ); cout << "Alg. " << algorithm_efficiency << "Testing zeros with size " << size << " "; //cout << "Alg. " << algorithm_efficiency << " Testing zeros with size " << size << " "; for( int i = 0; i < size; i ++ ) host_input[ i ] = 0.0; mainReduction( host_input, Loading @@ -258,41 +270,42 @@ template< class T > class tnlCUDAKernelsTester : public CppUnit :: TestCase 256, 2048 ); cout << "Alg. " << algorithm_efficiency << "Testing ones with size " << size << " "; //cout << "Alg. " << algorithm_efficiency << " Testing ones with size " << size << " "; for( int i = 0; i < size; i ++ ) host_input[ i ] = 1.0; mainReduction( host_input, algorithm_efficiency, 256, 2048 ); cout << "Alg. " << algorithm_efficiency << "Testing linear sequence with size " << size << " "; //cout << "Alg. " << algorithm_efficiency << " Testing linear sequence with size " << size << " "; for( int i = 0; i < size; i ++ ) host_input[ i ] = i; mainReduction( host_input, algorithm_efficiency, 256, 2048 ); cout << "Alg. " << algorithm_efficiency << "Testing quadratic sequence with size " << size << " "; //cout << "Alg. " << algorithm_efficiency << " Testing quadratic sequence with size " << size << " "; for( int i = 0; i < size; i ++ ) host_input[ i ] = ( i - size / 2 ) * ( i - size / 2 ); mainReduction( host_input, algorithm_efficiency, 256, 2048 ); cout << endl; //cout << endl; } }; void testReduction() { cout << "Test FAST reduction" << endl; testReduction( 0 ); //cout << "Test FAST reduction" << endl; //testReduction( 0 ); } void testSimpleReduction5() { cout << "Test reduction 5" << endl; testReduction( 5 ); //cout << "Test reduction 5" << endl; //testReduction( 5 ); }; void testSimpleReduction4() Loading Loading
configure.ac +5 −5 Original line number Diff line number Diff line Loading @@ -97,16 +97,16 @@ then CXX="nvcc" case "$CUDA_ARCH" in 1.0 ) CUDA_CXXFLAGS="$CUDA_CXXFLAGS -arch=sm_10" CUDA_CXXFLAGS="$CUDA_CXXFLAGS -arch=sm_10 -DCUDA_ARCH=1.0" ;; 1.1 ) CUDA_CXXFLAGS="$CUDA_CXXFLAGS -arch=sm_11" CUDA_CXXFLAGS="$CUDA_CXXFLAGS -arch=sm_11 -DCUDA_ARCH=1.1" ;; 1.2 ) CUDA_CXXFLAGS="$CUDA_CXXFLAGS -arch=sm_12" CUDA_CXXFLAGS="$CUDA_CXXFLAGS -arch=sm_12 -DCUDA_ARCH=1.2" ;; 1.3 ) CUDA_CXXFLAGS="$CUDA_CXXFLAGS -arch=sm_13" CUDA_CXXFLAGS="$CUDA_CXXFLAGS -arch=sm_13 -DCUDA_ARCH=1.3" ;; esac DBGCXXFLAGS="$DBGCXXFLAGS -deviceemu" Loading Loading @@ -309,7 +309,7 @@ else fi AC_LANG_RESTORE AM_CONDITIONAL([BUILD_CUDA],[test x"$HAVE_CUDA" = xyes] ) AM_CONDITIONAL([BUILD_MPI],[test x"$MPISTATUS" = xyes] ) AM_CONDITIONAL([BUILD_DBG],[test x"$DBGSTATUS" = xyes] ) AM_CONDITIONAL([BUILD_MPI_DBG],[test x"$DBGSTATUS" = xyes -a x"$MPISTATUS" = xyes] ) Loading
src/Makefile.am +8 −2 Original line number Diff line number Diff line Loading @@ -48,12 +48,18 @@ libtnl_mpi_dbg_0_1_la_LIBADD = debug/libtnldebug-mpi-dbg-0.1.la \ endif tnl_unit_tests_sources = tnl-unit-tests.cpp tnl_benchmarks_sources = tnl-benchmarks.cpp check_PROGRAMS = tnl-unit-tests \ tnl-benchmarks check_PROGRAMS = tnl-unit-tests tnl_unit_tests_SOURCES = $(tnl_unit_tests_sources) tnl_unit_tests_LDADD = libtnl-0.1.la \ core/libcore-tests.la tnl_benchmarks_SOURCES = $(tnl_benchmarks_sources) tnl_benchmarks_LDADD = libtnl-0.1.la if BUILD_DBG bin_PROGRAMS = tnl-unit-tests-dbg tnl_unit_tests_dbg_SOURCES = $(tnl_unit_tests_sources) Loading @@ -63,4 +69,4 @@ endif TESTS = tnl-unit-tests No newline at end of file TESTS = tnl-benchmarks No newline at end of file
src/core/Makefile.am +7 −3 Original line number Diff line number Diff line Loading @@ -56,12 +56,15 @@ sources = tnlConfigDescription.cpp \ tnlTimerCPU.cpp \ tnlTimerRT.cpp \ tnlTester.cpp \ tnl-cuda-kernels.cu \ compress-file.cpp \ mfilename.cpp \ mpi-supp.cpp \ parse.cc if BUILD_CUDA sources += tnl-cuda-kernels.cu endif libmcoreincludedir = $(TNL_INCLUDE_DIR)/core libmcoreinclude_HEADERS = $(headers) Loading Loading @@ -95,10 +98,11 @@ libcore_tests_sources = tnlStringTester.cpp \ tnlStringTester.h \ tnlObjectTester.cpp \ tnlObjectTester.h \ tnlLongVectorCUDATester.cu \ tnlLongVectorCUDATester.cu.h \ tnlLongVectorCUDATester.h if BUILD_CUDA libcore_tests_sources += tnlLongVectorCUDATester.cu endif check_LTLIBRARIES = libcore-tests.la libcore_tests_la_SOURCES = $(libcore_tests_sources) Loading
src/core/tnl-cuda-kernels.h +379 −236 Original line number Diff line number Diff line Loading @@ -298,7 +298,7 @@ bool tnlCUDAReduction( const int size, size_reduced = grid_size. x; reduction_input = device_output; // debuging part // Debugging part /*T* host_array = new T[ size ]; cudaMemcpy( host_array, dbg_array1, size * sizeof( T ), cudaMemcpyDeviceToHost ); for( int i = 0; i< size; i ++ ) Loading Loading @@ -353,7 +353,7 @@ __global__ void tnlCUDASimpleReductionKernel5( const int size, int tid = threadIdx. x; int gid = 2 * blockIdx. x * blockDim. x + threadIdx. x; // Last thread ID which manipulates meaningful data //int last_tid = size - 2 * blockIdx. x * blockDim. x; int last_tid = size - 2 * blockIdx. x * blockDim. x; if( gid + blockDim. x < size ) { if( operation == tnlMin ) Loading @@ -370,43 +370,105 @@ __global__ void tnlCUDASimpleReductionKernel5( const int size, __syncthreads(); // Parallel reduction if( blockSize == 512 ) int n = last_tid < blockDim. x ? last_tid : blockDim. x; int s = n / 2; if( gid < size ) dbg_array1[ gid ] = -s; if( n == 512 ) { if( tid < 256 ) if( tid < s ) { if( operation == tnlMin ) sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 256 ] ); sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + s ] ); if( operation == tnlMax ) sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 256 ] ); sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + s ] ); if( operation == tnlSum ) sdata[ tid ] += sdata[ tid + 256 ]; sdata[ tid ] += sdata[ tid + s ]; } if( 2 * s < n && tid == n - 1 ) { if( operation == tnlMin ) sdata[ 0 ] = tnlCudaMin( sdata[ 0 ], sdata[ tid ] ); if( operation == tnlMax ) sdata[ 0 ] = tnlCudaMax( sdata[ 0 ], sdata[ tid ] ); if( operation == tnlSum ) sdata[ 0 ] += sdata[ tid ]; } n = s; s = n / 2; __syncthreads(); } if( blockSize >= 256 ) if( n >= 256 ) { if( tid < 128 ) if( tid < s ) { if( operation == tnlMin ) sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 128 ] ); sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + s ] ); if( operation == tnlMax ) sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 128 ] ); sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + s ] ); if( operation == tnlSum ) sdata[ tid ] += sdata[ tid + 128 ]; sdata[ tid ] += sdata[ tid + s ]; } if( 2 * s < n && tid == n - 1 ) { if( operation == tnlMin ) sdata[ 0 ] = tnlCudaMin( sdata[ 0 ], sdata[ tid ] ); if( operation == tnlMax ) sdata[ 0 ] = tnlCudaMax( sdata[ 0 ], sdata[ tid ] ); if( operation == tnlSum ) sdata[ 0 ] += sdata[ tid ]; } n = s; s = n / 2; __syncthreads(); } if( blockSize >= 128 ) if( n >= 128 ) { if (tid< 64) if( tid < s ) { if( operation == tnlMin ) sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 64 ] ); sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + s ] ); if( operation == tnlMax ) sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 64 ] ); sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + s ] ); if( operation == tnlSum ) sdata[ tid ] += sdata[ tid + 64 ]; sdata[ tid ] += sdata[ tid + s ]; } if( 2 * s < n && tid == n - 1 ) { if( operation == tnlMin ) sdata[ 0 ] = tnlCudaMin( sdata[ 0 ], sdata[ tid ] ); if( operation == tnlMax ) sdata[ 0 ] = tnlCudaMax( sdata[ 0 ], sdata[ tid ] ); if( operation == tnlSum ) sdata[ 0 ] += sdata[ tid ]; } n = s; s = n / 2; __syncthreads(); } if( n >= 64 ) { if( tid < s ) { if( operation == tnlMin ) sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + s ] ); if( operation == tnlMax ) sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + s ] ); if( operation == tnlSum ) sdata[ tid ] += sdata[ tid + s ]; } if( 2 * s < n && tid == n - 1 ) { if( operation == tnlMin ) sdata[ 0 ] = tnlCudaMin( sdata[ 0 ], sdata[ tid ] ); if( operation == tnlMax ) sdata[ 0 ] = tnlCudaMax( sdata[ 0 ], sdata[ tid ] ); if( operation == tnlSum ) sdata[ 0 ] += sdata[ tid ]; } n = s; s = n / 2; __syncthreads(); } /* Loading @@ -414,52 +476,101 @@ __global__ void tnlCUDASimpleReductionKernel5( const int size, */ if( tid < 32 ) { if( blockSize >= 64 ) if( n >= 32 ) { if( tid < s ) { if( operation == tnlMin ) sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 32 ] ); sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + s ] ); if( operation == tnlMax ) sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 32 ] ); sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + s ] ); if( operation == tnlSum ) sdata[ tid ] += sdata[ tid + 32 ]; sdata[ tid ] += sdata[ tid + s ]; } if( blockSize >= 32 ) if( 2 * s < n && tid == n - 1 ) { if( operation == tnlMin ) sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 16 ] ); sdata[ 0 ] = tnlCudaMin( sdata[ 0 ], sdata[ tid ] ); if( operation == tnlMax ) sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 16 ] ); sdata[ 0 ] = tnlCudaMax( sdata[ 0 ], sdata[ tid ] ); if( operation == tnlSum ) sdata[ tid ] += sdata[ tid + 16 ]; sdata[ 0 ] += sdata[ tid ]; } if( blockSize >= 16 ) n = s; s = n / 2; } if( n >= 16 ) { if( tid < s ) { if( operation == tnlMin ) sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 8 ] ); sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + s ] ); if( operation == tnlMax ) sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 8 ] ); sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + s ] ); if( operation == tnlSum ) sdata[ tid ] += sdata[ tid + 8 ]; } if( blockSize >= 8 ) if( 2 * s < n && tid == n - 1 ) { if( operation == tnlMin ) sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 4 ] ); sdata[ 0 ] = tnlCudaMin( sdata[ 0 ], sdata[ tid ] ); if( operation == tnlMax ) sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 4 ] ); sdata[ 0 ] = tnlCudaMax( sdata[ 0 ], sdata[ tid ] ); if( operation == tnlSum ) sdata[ tid ] += sdata[ tid + 4 ]; sdata[ 0 ] += sdata[ tid ]; } if( blockSize >= 4 ) n = s; s = n / 2; } if( n >= 8 ) { if( tid < s ) { if( operation == tnlMin ) sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 2 ] ); sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + s ] ); if( operation == tnlMax ) sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + 2 ] ); sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + s ] ); if( operation == tnlSum ) sdata[ tid ] += sdata[ tid + 2 ]; sdata[ tid ] += sdata[ tid + s ]; } if( blockSize >= 2 ) if( 2 * s < n && tid == n - 1 ) { if( operation == tnlMin ) sdata[ 0 ] = tnlCudaMin( sdata[ 0 ], sdata[ tid ] ); if( operation == tnlMax ) sdata[ 0 ] = tnlCudaMax( sdata[ 0 ], sdata[ tid ] ); if( operation == tnlSum ) sdata[ 0 ] += sdata[ tid ]; } n = s; s = n / 2; } if( n >= 4 ) { if( tid < s ) { if( operation == tnlMin ) sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + s ] ); if( operation == tnlMax ) sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + s ] ); if( operation == tnlSum ) sdata[ tid ] += sdata[ tid + s ]; } if( 2 * s < n && tid == n - 1 ) { if( operation == tnlMin ) sdata[ 0 ] = tnlCudaMin( sdata[ 0 ], sdata[ tid ] ); if( operation == tnlMax ) sdata[ 0 ] = tnlCudaMax( sdata[ 0 ], sdata[ tid ] ); if( operation == tnlSum ) sdata[ 0 ] += sdata[ tid ]; } n = s; s = n / 2; } if( n >= 2 ) { if( tid < s ) { if( operation == tnlMin ) sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + 1 ] ); Loading @@ -468,6 +579,16 @@ __global__ void tnlCUDASimpleReductionKernel5( const int size, if( operation == tnlSum ) sdata[ tid ] += sdata[ tid + 1 ]; } if( 2 * s < n && tid == n - 1 ) { if( operation == tnlMin ) sdata[ 0 ] = tnlCudaMin( sdata[ 0 ], sdata[ tid ] ); if( operation == tnlMax ) sdata[ 0 ] = tnlCudaMax( sdata[ 0 ], sdata[ tid ] ); if( operation == tnlSum ) sdata[ 0 ] += sdata[ tid ]; } } } // Store the result back in global memory Loading @@ -485,7 +606,7 @@ bool tnlCUDASimpleReduction5( const int size, const int cpuThreshold = 1; const int desBlockSize = 16; //Desired block size //T* dbg_array1; T* dbg_array1; bool device_output_allocated( false ); if( ! device_output ) Loading @@ -495,12 +616,12 @@ bool tnlCUDASimpleReduction5( const int size, if( cudaGetLastError() != cudaSuccess ) { cerr << "Unable to allocate device memory with size " << bytes_alloc << "." << endl; abort(); return false; } device_output_allocated = true; //cudaMalloc( ( void** ) &dbg_array1, desBlockSize * sizeof( T ) ); //!!!!!!!!!!!!!!!!!!!!!!!! //cudaMalloc( ( void** ) &dbg_array2, desBlockSize * sizeof( T ) ); //!!!!!!!!!!!!!!!!!!!!!!!!! cudaMalloc( ( void** ) &dbg_array1, size * sizeof( T ) ); //!!!!!!!!!!!!!!!!!! } dim3 block_size( 0 ), grid_size( 0 ); int shmem; Loading @@ -509,7 +630,16 @@ bool tnlCUDASimpleReduction5( const int size, while( size_reduced > cpuThreshold ) { block_size. x = :: Min( size_reduced, desBlockSize ); grid_size. x = ( size_reduced / block_size. x + 1 ) / 2; grid_size. x = size_reduced / block_size. x / 2; if( 2 * grid_size. x * block_size. x < size_reduced ) grid_size. x ++; int bits = 1; while( block_size. x > 1 ) { block_size. x >>= 1; bits <<= 1; } block_size. x = bits; shmem = block_size. x * sizeof( T ); /*cout << "Size: " << size_reduced << " Grid size: " << grid_size. x Loading @@ -520,56 +650,71 @@ bool tnlCUDASimpleReduction5( const int size, switch( block_size. x ) { case 512: tnlCUDASimpleReductionKernel5< T, operation, 512 ><<< grid_size. x, block_size, shmem >>>( size_reduced, reduction_input, device_output ); //cout << "Calling kernel " << block_size. x << endl; tnlCUDASimpleReductionKernel5< T, operation, 512 ><<< grid_size. x, block_size, shmem >>>( size_reduced, reduction_input, device_output, dbg_array1 ); break; case 256: tnlCUDASimpleReductionKernel5< T, operation, 256 ><<< grid_size. x, block_size, shmem >>>( size_reduced, reduction_input, device_output ); //cout << "Calling kernel " << block_size. x << endl; tnlCUDASimpleReductionKernel5< T, operation, 256 ><<< grid_size. x, block_size, shmem >>>( size_reduced, reduction_input, device_output, dbg_array1 ); break; case 128: tnlCUDASimpleReductionKernel5< T, operation, 128 ><<< grid_size. x, block_size, shmem >>>( size_reduced, reduction_input, device_output ); //cout << "Calling kernel " << block_size. x << endl; tnlCUDASimpleReductionKernel5< T, operation, 128 ><<< grid_size. x, block_size, shmem >>>( size_reduced, reduction_input, device_output, dbg_array1 ); break; case 64: tnlCUDASimpleReductionKernel5< T, operation, 64 ><<< grid_size. x, block_size, shmem >>>( size_reduced, reduction_input, device_output ); //cout << "Calling kernel " << block_size. x << endl; tnlCUDASimpleReductionKernel5< T, operation, 64 ><<< grid_size. x, block_size, shmem >>>( size_reduced, reduction_input, device_output, dbg_array1 ); break; case 32: tnlCUDASimpleReductionKernel5< T, operation, 32 ><<< grid_size. x, block_size, shmem >>>( size_reduced, reduction_input, device_output ); //cout << "Calling kernel " << block_size. x << endl; tnlCUDASimpleReductionKernel5< T, operation, 32 ><<< grid_size. x, block_size, shmem >>>( size_reduced, reduction_input, device_output, dbg_array1 ); break; case 16: tnlCUDASimpleReductionKernel5< T, operation, 16 ><<< grid_size. x, block_size, shmem >>>( size_reduced, reduction_input, device_output ); //cout << "Calling kernel " << block_size. x << endl; tnlCUDASimpleReductionKernel5< T, operation, 16 ><<< grid_size. x, block_size, shmem >>>( size_reduced, reduction_input, device_output, dbg_array1 ); break; case 8: tnlCUDASimpleReductionKernel5< T, operation, 8 ><<< grid_size. x, block_size, shmem >>>( size_reduced, reduction_input, device_output ); //cout << "Calling kernel " << block_size. x << endl; tnlCUDASimpleReductionKernel5< T, operation, 8 ><<< grid_size. x, block_size, shmem >>>( size_reduced, reduction_input, device_output, dbg_array1 ); break; case 4: tnlCUDASimpleReductionKernel5< T, operation, 4 ><<< grid_size. x, block_size, shmem >>>( size_reduced, reduction_input, device_output ); //cout << "Calling kernel " << block_size. x << endl; tnlCUDASimpleReductionKernel5< T, operation, 4 ><<< grid_size. x, block_size, shmem >>>( size_reduced, reduction_input, device_output, dbg_array1 ); break; case 2: tnlCUDASimpleReductionKernel5< T, operation, 2 ><<< grid_size. x, block_size, shmem >>>( size_reduced, reduction_input, device_output ); //cout << "Calling kernel " << block_size. x << endl; tnlCUDASimpleReductionKernel5< T, operation, 2 ><<< grid_size. x, block_size, shmem >>>( size_reduced, reduction_input, device_output, dbg_array1 ); break; case 1: tnlAssert( false, cerr << "blockSize should not be 1." << endl ); abort(); break; default: tnlAssert( false, cerr << "Block size is " << block_size. x << " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." ); abort(); break; } //cout << "+++++++++++++++++++++" << endl; size_reduced = grid_size. x; reduction_input = device_output; // debuging part /*T* host_array = new T[ desBlockSize ]; cudaMemcpy( host_array, dbg_array1, desBlockSize * sizeof( T ), cudaMemcpyDeviceToHost ); for( int i = 0; i< :: Min( ( int ) block_size. x, desBlockSize ); i ++ ) cout << host_array[ i ] << " - "; /*T* host_array = new T[ size ]; cudaMemcpy( host_array, dbg_array1, size * sizeof( T ), cudaMemcpyDeviceToHost ); cout << "Dbg. array: "; for( int i = 0; i < size ; i ++ ) cout << host_array[ i ] << " "; cout << endl; cudaFree( dbg_array1 ); T* output = new T[ size_reduced ]; cudaMemcpy( output, device_output, size_reduced * sizeof( T ), cudaMemcpyDeviceToHost ); cout << endl; cout << endl << "Reduced data: "; for( int i = 0; i < size_reduced; i ++ ) cout << output[ i ] << " "; cout << endl; delete[] output;*/ } T* host_output = new T[ size_reduced ]; if( size == 1 ) Loading Loading @@ -653,10 +798,8 @@ __global__ void tnlCUDASimpleReductionKernel4( const int size, sdata[ 0 ] = tnlCudaMax( sdata[ 0 ], sdata[ tid ] ); if( operation == tnlSum ) sdata[ 0 ] += sdata[ tid ]; dbg_array1[ 0 ] = sdata[ tid ]; } n = s; __syncthreads(); //dbg_array1[ tid ] = -sdata[ tid ]; Loading Loading @@ -691,7 +834,7 @@ bool tnlCUDASimpleReduction4( const int size, } device_output_allocated = true; //cudaMalloc( ( void** ) &dbg_array1, desBlockSize * sizeof( T ) ); //!!!!!!!!!!!!!!!!!!!!!!!! cudaMalloc( ( void** ) &dbg_array1, desBlockSize * sizeof( T ) ); //!!!!!!!!!!!!!!!!!!!!!!!! //cudaMalloc( ( void** ) &dbg_array2, desBlockSize * sizeof( T ) ); //!!!!!!!!!!!!!!!!!!!!!!!!! } dim3 block_size( 0 ), grid_size( 0 ); Loading
src/core/tnlCUDAKernelsTester.h +181 −168 Original line number Diff line number Diff line Loading @@ -174,9 +174,9 @@ template< class T > class tnlCUDAKernelsTester : public CppUnit :: TestCase GetParameterType( param ) == "double" ) { if( min != seq_min ) cerr << "Diff. min = " << min << " seq. min = " << seq_min; cout << "Diff. min = " << min << " seq. min = " << seq_min; if( max != seq_max ) cerr << "Diff. max = " << max << " seq. max = " << seq_max; cout << "Diff. max = " << max << " seq. max = " << seq_max; CPPUNIT_ASSERT( min == seq_min ); CPPUNIT_ASSERT( max == seq_max ); if( sum == 0.0 ) Loading @@ -187,18 +187,30 @@ template< class T > class tnlCUDAKernelsTester : public CppUnit :: TestCase { double diff = ( ( double ) sum - ( double ) seq_sum ) / ( double) sum; if( fabs( diff > 1.0e-5 ) ) cerr << "Diff is " << diff << " for " << GetParameterType( param ) << endl; { cout << "Diff is " << diff << " for " << GetParameterType( param ) << endl; abort(); } CPPUNIT_ASSERT( fabs( diff ) < 1.0e-5 ); } } else { if( min != seq_min ) cerr << "Diff. min = " << min << " seq. min = " << seq_min; { cout << "Diff. min = " << min << " seq. min = " << seq_min; abort(); } if( max != seq_max ) cerr << "Diff. max = " << max << " seq. max = " << seq_max; { cout << "Diff. max = " << max << " seq. max = " << seq_max; abort(); } if( sum != seq_sum ) cerr << "Diff. sum = " << sum << " seq. sum = " << seq_sum; { cout << "Diff. sum = " << sum << " seq. sum = " << seq_sum; abort(); } CPPUNIT_ASSERT( min == seq_min ); CPPUNIT_ASSERT( max == seq_max ); CPPUNIT_ASSERT( sum == seq_sum ); Loading @@ -210,7 +222,7 @@ template< class T > class tnlCUDAKernelsTester : public CppUnit :: TestCase { tnlLongVector< T > host_input; int size = 2; for( int s = 1; s < 12; s ++ ) /*for( int s = 1; s < 12; s ++ ) { tnlLongVector< T > host_input( size ); Loading Loading @@ -245,12 +257,12 @@ template< class T > class tnlCUDAKernelsTester : public CppUnit :: TestCase 2048 ); size *= 2; cout << endl; } }*/ for( size = 1; size < 5000; size ++ ) { tnlLongVector< T > host_input( size ); cout << "Alg. " << algorithm_efficiency << "Testing zeros with size " << size << " "; //cout << "Alg. " << algorithm_efficiency << " Testing zeros with size " << size << " "; for( int i = 0; i < size; i ++ ) host_input[ i ] = 0.0; mainReduction( host_input, Loading @@ -258,41 +270,42 @@ template< class T > class tnlCUDAKernelsTester : public CppUnit :: TestCase 256, 2048 ); cout << "Alg. " << algorithm_efficiency << "Testing ones with size " << size << " "; //cout << "Alg. " << algorithm_efficiency << " Testing ones with size " << size << " "; for( int i = 0; i < size; i ++ ) host_input[ i ] = 1.0; mainReduction( host_input, algorithm_efficiency, 256, 2048 ); cout << "Alg. " << algorithm_efficiency << "Testing linear sequence with size " << size << " "; //cout << "Alg. " << algorithm_efficiency << " Testing linear sequence with size " << size << " "; for( int i = 0; i < size; i ++ ) host_input[ i ] = i; mainReduction( host_input, algorithm_efficiency, 256, 2048 ); cout << "Alg. " << algorithm_efficiency << "Testing quadratic sequence with size " << size << " "; //cout << "Alg. " << algorithm_efficiency << " Testing quadratic sequence with size " << size << " "; for( int i = 0; i < size; i ++ ) host_input[ i ] = ( i - size / 2 ) * ( i - size / 2 ); mainReduction( host_input, algorithm_efficiency, 256, 2048 ); cout << endl; //cout << endl; } }; void testReduction() { cout << "Test FAST reduction" << endl; testReduction( 0 ); //cout << "Test FAST reduction" << endl; //testReduction( 0 ); } void testSimpleReduction5() { cout << "Test reduction 5" << endl; testReduction( 5 ); //cout << "Test reduction 5" << endl; //testReduction( 5 ); }; void testSimpleReduction4() Loading