Loading src/core/tnl-cuda-kernels.h +51 −38 Original line number Diff line number Diff line Loading @@ -627,13 +627,13 @@ __global__ void tnlCUDASimpleReductionKernel4( const int size, sdata[ tid ] = d_input[ gid ]; } __syncthreads(); dbg_array1[ tid ] = sdata[ tid ]; //dbg_array1[ tid ] = tid; //sdata[ tid ]; // Parallel reduction int n = last_tid < blockDim. x ? last_tid : blockDim. x; for( int s = n / 2; s > 0; s >>= 1 ) { if( tid < s && tid + s < n ) if( tid < s ) { if( operation == tnlMin ) sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + s ] ); Loading @@ -645,16 +645,15 @@ __global__ void tnlCUDASimpleReductionKernel4( const int size, /* This is for the case when we have odd number of elements. * The last one will be reduced using the thread with ID 0. */ if( 2 * s < n && tid == 0 ) if( 2 * s < n && tid == n - 1 ) { if( operation == tnlMin ) sdata[ 0 ] = tnlCudaMin( sdata[ 0 ], sdata[ n ] ); sdata[ 0 ] = tnlCudaMin( sdata[ 0 ], sdata[ tid ] ); if( operation == tnlMax ) sdata[ 0 ] = tnlCudaMax( sdata[ 0 ], sdata[ n ] ); sdata[ 0 ] = tnlCudaMax( sdata[ 0 ], sdata[ tid ] ); if( operation == tnlSum ) sdata[ 0 ] += sdata[ n ]; dbg_array1[ n ] = -555; //sdata[ 0 ]; sdata[ 0 ] += sdata[ tid ]; dbg_array1[ 0 ] = sdata[ tid ]; } n = s; Loading @@ -663,7 +662,7 @@ __global__ void tnlCUDASimpleReductionKernel4( const int size, } // Store the result back in global memory // Store the result back in the global memory if( tid == 0 ) d_output[ blockIdx. x ] = sdata[ 0 ]; } Loading Loading @@ -692,7 +691,7 @@ bool tnlCUDASimpleReduction4( const int size, } device_output_allocated = true; cudaMalloc( ( void** ) &dbg_array1, desBlockSize * sizeof( T ) ); //!!!!!!!!!!!!!!!!!!!!!!!! //cudaMalloc( ( void** ) &dbg_array1, desBlockSize * sizeof( T ) ); //!!!!!!!!!!!!!!!!!!!!!!!! //cudaMalloc( ( void** ) &dbg_array2, desBlockSize * sizeof( T ) ); //!!!!!!!!!!!!!!!!!!!!!!!!! } dim3 block_size( 0 ), grid_size( 0 ); Loading @@ -706,19 +705,19 @@ bool tnlCUDASimpleReduction4( const int size, if( grid_size. x * 2 * block_size. x < size_reduced ) grid_size. x ++; shmem = block_size. x * sizeof( T ); cout << "Size: " << size_reduced /*cout << "Size: " << size_reduced << " Grid size: " << grid_size. x << " Block size: " << block_size. x << " Shmem: " << shmem << endl; << " Shmem: " << shmem << endl;*/ tnlCUDASimpleReductionKernel4< T, operation ><<< grid_size. x, block_size, shmem >>>( size_reduced, reduction_input, device_output, dbg_array1 ); size_reduced = grid_size. x; reduction_input = device_output; // debuging part T* host_array = new T[ desBlockSize ]; /*T* host_array = new T[ desBlockSize ]; cudaMemcpy( host_array, dbg_array1, desBlockSize * sizeof( T ), cudaMemcpyDeviceToHost ); for( int i = 0; i< :: Min( ( int ) block_size. x, desBlockSize ); i ++ ) cout << host_array[ i ] << " - "; cout << host_array[ i ] << " "; cout << endl; T* output = new T[ size_reduced ]; Loading @@ -727,7 +726,7 @@ bool tnlCUDASimpleReduction4( const int size, for( int i = 0; i < size_reduced; i ++ ) cout << output[ i ] << " "; cout << endl; delete[] output; delete[] output;*/ } T* host_output = new T[ size_reduced ]; if( size == 1 ) Loading Loading @@ -772,7 +771,8 @@ __global__ void tnlCUDASimpleReductionKernel3( const int size, __syncthreads(); // Parallel reduction for( int s = blockDim. x / 2; s > 0; s >>= 1 ) int n = last_tid < blockDim. x ? last_tid : blockDim. x; for( int s = n / 2; s > 0; s >>= 1 ) { if( tid < s && tid + s < last_tid ) { Loading @@ -783,6 +783,19 @@ __global__ void tnlCUDASimpleReductionKernel3( const int size, if( operation == tnlSum ) sdata[ tid ] += sdata[ tid + s ]; } /* This is for the case when we have odd number of elements. * The last one will be reduced using the thread with ID 0. */ if( 2 * s < n && tid == 0 ) { if( operation == tnlMin ) sdata[ 0 ] = tnlCudaMin( sdata[ 0 ], sdata[ n - 1 ] ); if( operation == tnlMax ) sdata[ 0 ] = tnlCudaMax( sdata[ 0 ], sdata[ n - 1 ] ); if( operation == tnlSum ) sdata[ 0 ] += sdata[ n - 1 ]; } n = s; __syncthreads(); } Loading src/core/tnlCUDAKernelsTester.h +34 −22 Original line number Diff line number Diff line Loading @@ -156,7 +156,7 @@ template< class T > class tnlCUDAKernelsTester : public CppUnit :: TestCase } if( min == seq_min ) /*if( min == seq_min ) cout << "Min: " << min << " Seq. min: " << seq_min << " :-)" << endl; else cout << "Min: " << min << " Seq. min: " << seq_min << " !!!!!!!!!!" << endl; Loading @@ -167,11 +167,16 @@ template< class T > class tnlCUDAKernelsTester : public CppUnit :: TestCase if( sum == seq_sum ) cout << "Sum: " << sum << " Seq. sum: " << seq_sum << " :-)" << endl; else cout << "Sum: " << sum << " Seq. sum: " << seq_sum << " !!!!!!!!!!" << endl; cout << "Sum: " << sum << " Seq. sum: " << seq_sum << " !!!!!!!!!!" << endl;*/ T param; if( GetParameterType( param ) == "float" ) if( GetParameterType( param ) == "float" || GetParameterType( param ) == "double" ) { if( min != seq_min ) cerr << "Diff. min = " << min << " seq. min = " << seq_min; if( max != seq_max ) cerr << "Diff. max = " << max << " seq. max = " << seq_max; CPPUNIT_ASSERT( min == seq_min ); CPPUNIT_ASSERT( max == seq_max ); if( sum == 0.0 ) Loading @@ -181,11 +186,19 @@ template< class T > class tnlCUDAKernelsTester : public CppUnit :: TestCase else { double diff = ( ( double ) sum - ( double ) seq_sum ) / ( double) sum; if( fabs( diff > 1.0e-5 ) ) cerr << "Diff is " << diff << " for " << GetParameterType( param ) << endl; CPPUNIT_ASSERT( fabs( diff ) < 1.0e-5 ); } } else { if( min != seq_min ) cerr << "Diff. min = " << min << " seq. min = " << seq_min; if( max != seq_max ) cerr << "Diff. max = " << max << " seq. max = " << seq_max; if( sum != seq_sum ) cerr << "Diff. sum = " << sum << " seq. sum = " << seq_sum; CPPUNIT_ASSERT( min == seq_min ); CPPUNIT_ASSERT( max == seq_max ); CPPUNIT_ASSERT( sum == seq_sum ); Loading @@ -197,10 +210,11 @@ template< class T > class tnlCUDAKernelsTester : public CppUnit :: TestCase { tnlLongVector< T > host_input; int size = 2; /*for( int s = 1; s < 12; s ++ ) for( int s = 1; s < 12; s ++ ) { tnlLongVector< T > host_input( size ); cout << "Alg. " << algorithm_efficiency << "Testing zeros with size " << size << " "; for( int i = 0; i < size; i ++ ) host_input[ i ] = 0.0; mainReduction( host_input, Loading @@ -208,20 +222,21 @@ template< class T > class tnlCUDAKernelsTester : public CppUnit :: TestCase 256, 2048 ); cout << "Alg. " << algorithm_efficiency << "Testing ones with size " << size << " "; for( int i = 0; i < size; i ++ ) host_input[ i ] = 1.0; mainReduction( host_input, algorithm_efficiency, 256, 2048 ); cout << "Alg. " << algorithm_efficiency << "Testing linear sequence with size " << size << " "; for( int i = 0; i < size; i ++ ) host_input[ i ] = i; mainReduction( host_input, algorithm_efficiency, 256, 2048 ); cout << "Alg. " << algorithm_efficiency << "Testing quadratic sequence with size " << size << " "; for( int i = 0; i < size; i ++ ) host_input[ i ] = ( i - size / 2 ) * ( i - size / 2 ); mainReduction( host_input, Loading @@ -229,46 +244,43 @@ template< class T > class tnlCUDAKernelsTester : public CppUnit :: TestCase 256, 2048 ); size *= 2; }*/ for( size = 257; size < 5000; size ++ ) cout << endl; } for( size = 1; size < 5000; size ++ ) { cout << "************* Size is " << size << " ******************* " << endl; tnlLongVector< T > host_input( size ); /*for( int i = 0; i < size; i ++ ) cout << "Alg. " << algorithm_efficiency << "Testing zeros with size " << size << " "; for( int i = 0; i < size; i ++ ) host_input[ i ] = 0.0; mainReduction( host_input, algorithm_efficiency, 256, 2048 );*/ 2048 ); cout << "Alg. " << algorithm_efficiency << "Testing ones with size " << size << " "; for( int i = 0; i < size; i ++ ) host_input[ i ] = 1.0; mainReduction( host_input, algorithm_efficiency, 256, 2048 ); cout << "Alg. " << algorithm_efficiency << "Testing linear sequence with size " << size << " "; for( int i = 0; i < size; i ++ ) host_input[ i ] = i; mainReduction( host_input, algorithm_efficiency, 256, 2048 ); cout << "Alg. " << algorithm_efficiency << "Testing quadratic sequence with size " << size << " "; for( int i = 0; i < size; i ++ ) host_input[ i ] = ( i - size / 2 ) * ( i - size / 2 ); mainReduction( host_input, algorithm_efficiency, 256, 2048 ); cout << endl; } }; void testReduction() Loading Loading @@ -298,13 +310,13 @@ template< class T > class tnlCUDAKernelsTester : public CppUnit :: TestCase void testSimpleReduction2() { cout << "Test reduction 2" << endl; //testReduction( 2 ); testReduction( 2 ); }; void testSimpleReduction1() { cout << "Test reduction 1" << endl; //testReduction( 1 ); testReduction( 1 ); }; Loading Loading
src/core/tnl-cuda-kernels.h +51 −38 Original line number Diff line number Diff line Loading @@ -627,13 +627,13 @@ __global__ void tnlCUDASimpleReductionKernel4( const int size, sdata[ tid ] = d_input[ gid ]; } __syncthreads(); dbg_array1[ tid ] = sdata[ tid ]; //dbg_array1[ tid ] = tid; //sdata[ tid ]; // Parallel reduction int n = last_tid < blockDim. x ? last_tid : blockDim. x; for( int s = n / 2; s > 0; s >>= 1 ) { if( tid < s && tid + s < n ) if( tid < s ) { if( operation == tnlMin ) sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + s ] ); Loading @@ -645,16 +645,15 @@ __global__ void tnlCUDASimpleReductionKernel4( const int size, /* This is for the case when we have odd number of elements. * The last one will be reduced using the thread with ID 0. */ if( 2 * s < n && tid == 0 ) if( 2 * s < n && tid == n - 1 ) { if( operation == tnlMin ) sdata[ 0 ] = tnlCudaMin( sdata[ 0 ], sdata[ n ] ); sdata[ 0 ] = tnlCudaMin( sdata[ 0 ], sdata[ tid ] ); if( operation == tnlMax ) sdata[ 0 ] = tnlCudaMax( sdata[ 0 ], sdata[ n ] ); sdata[ 0 ] = tnlCudaMax( sdata[ 0 ], sdata[ tid ] ); if( operation == tnlSum ) sdata[ 0 ] += sdata[ n ]; dbg_array1[ n ] = -555; //sdata[ 0 ]; sdata[ 0 ] += sdata[ tid ]; dbg_array1[ 0 ] = sdata[ tid ]; } n = s; Loading @@ -663,7 +662,7 @@ __global__ void tnlCUDASimpleReductionKernel4( const int size, } // Store the result back in global memory // Store the result back in the global memory if( tid == 0 ) d_output[ blockIdx. x ] = sdata[ 0 ]; } Loading Loading @@ -692,7 +691,7 @@ bool tnlCUDASimpleReduction4( const int size, } device_output_allocated = true; cudaMalloc( ( void** ) &dbg_array1, desBlockSize * sizeof( T ) ); //!!!!!!!!!!!!!!!!!!!!!!!! //cudaMalloc( ( void** ) &dbg_array1, desBlockSize * sizeof( T ) ); //!!!!!!!!!!!!!!!!!!!!!!!! //cudaMalloc( ( void** ) &dbg_array2, desBlockSize * sizeof( T ) ); //!!!!!!!!!!!!!!!!!!!!!!!!! } dim3 block_size( 0 ), grid_size( 0 ); Loading @@ -706,19 +705,19 @@ bool tnlCUDASimpleReduction4( const int size, if( grid_size. x * 2 * block_size. x < size_reduced ) grid_size. x ++; shmem = block_size. x * sizeof( T ); cout << "Size: " << size_reduced /*cout << "Size: " << size_reduced << " Grid size: " << grid_size. x << " Block size: " << block_size. x << " Shmem: " << shmem << endl; << " Shmem: " << shmem << endl;*/ tnlCUDASimpleReductionKernel4< T, operation ><<< grid_size. x, block_size, shmem >>>( size_reduced, reduction_input, device_output, dbg_array1 ); size_reduced = grid_size. x; reduction_input = device_output; // debuging part T* host_array = new T[ desBlockSize ]; /*T* host_array = new T[ desBlockSize ]; cudaMemcpy( host_array, dbg_array1, desBlockSize * sizeof( T ), cudaMemcpyDeviceToHost ); for( int i = 0; i< :: Min( ( int ) block_size. x, desBlockSize ); i ++ ) cout << host_array[ i ] << " - "; cout << host_array[ i ] << " "; cout << endl; T* output = new T[ size_reduced ]; Loading @@ -727,7 +726,7 @@ bool tnlCUDASimpleReduction4( const int size, for( int i = 0; i < size_reduced; i ++ ) cout << output[ i ] << " "; cout << endl; delete[] output; delete[] output;*/ } T* host_output = new T[ size_reduced ]; if( size == 1 ) Loading Loading @@ -772,7 +771,8 @@ __global__ void tnlCUDASimpleReductionKernel3( const int size, __syncthreads(); // Parallel reduction for( int s = blockDim. x / 2; s > 0; s >>= 1 ) int n = last_tid < blockDim. x ? last_tid : blockDim. x; for( int s = n / 2; s > 0; s >>= 1 ) { if( tid < s && tid + s < last_tid ) { Loading @@ -783,6 +783,19 @@ __global__ void tnlCUDASimpleReductionKernel3( const int size, if( operation == tnlSum ) sdata[ tid ] += sdata[ tid + s ]; } /* This is for the case when we have odd number of elements. * The last one will be reduced using the thread with ID 0. */ if( 2 * s < n && tid == 0 ) { if( operation == tnlMin ) sdata[ 0 ] = tnlCudaMin( sdata[ 0 ], sdata[ n - 1 ] ); if( operation == tnlMax ) sdata[ 0 ] = tnlCudaMax( sdata[ 0 ], sdata[ n - 1 ] ); if( operation == tnlSum ) sdata[ 0 ] += sdata[ n - 1 ]; } n = s; __syncthreads(); } Loading
src/core/tnlCUDAKernelsTester.h +34 −22 Original line number Diff line number Diff line Loading @@ -156,7 +156,7 @@ template< class T > class tnlCUDAKernelsTester : public CppUnit :: TestCase } if( min == seq_min ) /*if( min == seq_min ) cout << "Min: " << min << " Seq. min: " << seq_min << " :-)" << endl; else cout << "Min: " << min << " Seq. min: " << seq_min << " !!!!!!!!!!" << endl; Loading @@ -167,11 +167,16 @@ template< class T > class tnlCUDAKernelsTester : public CppUnit :: TestCase if( sum == seq_sum ) cout << "Sum: " << sum << " Seq. sum: " << seq_sum << " :-)" << endl; else cout << "Sum: " << sum << " Seq. sum: " << seq_sum << " !!!!!!!!!!" << endl; cout << "Sum: " << sum << " Seq. sum: " << seq_sum << " !!!!!!!!!!" << endl;*/ T param; if( GetParameterType( param ) == "float" ) if( GetParameterType( param ) == "float" || GetParameterType( param ) == "double" ) { if( min != seq_min ) cerr << "Diff. min = " << min << " seq. min = " << seq_min; if( max != seq_max ) cerr << "Diff. max = " << max << " seq. max = " << seq_max; CPPUNIT_ASSERT( min == seq_min ); CPPUNIT_ASSERT( max == seq_max ); if( sum == 0.0 ) Loading @@ -181,11 +186,19 @@ template< class T > class tnlCUDAKernelsTester : public CppUnit :: TestCase else { double diff = ( ( double ) sum - ( double ) seq_sum ) / ( double) sum; if( fabs( diff > 1.0e-5 ) ) cerr << "Diff is " << diff << " for " << GetParameterType( param ) << endl; CPPUNIT_ASSERT( fabs( diff ) < 1.0e-5 ); } } else { if( min != seq_min ) cerr << "Diff. min = " << min << " seq. min = " << seq_min; if( max != seq_max ) cerr << "Diff. max = " << max << " seq. max = " << seq_max; if( sum != seq_sum ) cerr << "Diff. sum = " << sum << " seq. sum = " << seq_sum; CPPUNIT_ASSERT( min == seq_min ); CPPUNIT_ASSERT( max == seq_max ); CPPUNIT_ASSERT( sum == seq_sum ); Loading @@ -197,10 +210,11 @@ template< class T > class tnlCUDAKernelsTester : public CppUnit :: TestCase { tnlLongVector< T > host_input; int size = 2; /*for( int s = 1; s < 12; s ++ ) for( int s = 1; s < 12; s ++ ) { tnlLongVector< T > host_input( size ); cout << "Alg. " << algorithm_efficiency << "Testing zeros with size " << size << " "; for( int i = 0; i < size; i ++ ) host_input[ i ] = 0.0; mainReduction( host_input, Loading @@ -208,20 +222,21 @@ template< class T > class tnlCUDAKernelsTester : public CppUnit :: TestCase 256, 2048 ); cout << "Alg. " << algorithm_efficiency << "Testing ones with size " << size << " "; for( int i = 0; i < size; i ++ ) host_input[ i ] = 1.0; mainReduction( host_input, algorithm_efficiency, 256, 2048 ); cout << "Alg. " << algorithm_efficiency << "Testing linear sequence with size " << size << " "; for( int i = 0; i < size; i ++ ) host_input[ i ] = i; mainReduction( host_input, algorithm_efficiency, 256, 2048 ); cout << "Alg. " << algorithm_efficiency << "Testing quadratic sequence with size " << size << " "; for( int i = 0; i < size; i ++ ) host_input[ i ] = ( i - size / 2 ) * ( i - size / 2 ); mainReduction( host_input, Loading @@ -229,46 +244,43 @@ template< class T > class tnlCUDAKernelsTester : public CppUnit :: TestCase 256, 2048 ); size *= 2; }*/ for( size = 257; size < 5000; size ++ ) cout << endl; } for( size = 1; size < 5000; size ++ ) { cout << "************* Size is " << size << " ******************* " << endl; tnlLongVector< T > host_input( size ); /*for( int i = 0; i < size; i ++ ) cout << "Alg. " << algorithm_efficiency << "Testing zeros with size " << size << " "; for( int i = 0; i < size; i ++ ) host_input[ i ] = 0.0; mainReduction( host_input, algorithm_efficiency, 256, 2048 );*/ 2048 ); cout << "Alg. " << algorithm_efficiency << "Testing ones with size " << size << " "; for( int i = 0; i < size; i ++ ) host_input[ i ] = 1.0; mainReduction( host_input, algorithm_efficiency, 256, 2048 ); cout << "Alg. " << algorithm_efficiency << "Testing linear sequence with size " << size << " "; for( int i = 0; i < size; i ++ ) host_input[ i ] = i; mainReduction( host_input, algorithm_efficiency, 256, 2048 ); cout << "Alg. " << algorithm_efficiency << "Testing quadratic sequence with size " << size << " "; for( int i = 0; i < size; i ++ ) host_input[ i ] = ( i - size / 2 ) * ( i - size / 2 ); mainReduction( host_input, algorithm_efficiency, 256, 2048 ); cout << endl; } }; void testReduction() Loading Loading @@ -298,13 +310,13 @@ template< class T > class tnlCUDAKernelsTester : public CppUnit :: TestCase void testSimpleReduction2() { cout << "Test reduction 2" << endl; //testReduction( 2 ); testReduction( 2 ); }; void testSimpleReduction1() { cout << "Test reduction 1" << endl; //testReduction( 1 ); testReduction( 1 ); }; Loading