Loading src/core/tnl-cuda-kernels.h +85 −39 Original line number Diff line number Diff line Loading @@ -60,17 +60,19 @@ __global__ void tnlCUDAReductionKernel( const int size, // Read data into the shared memory int tid = threadIdx. x; int gid = 2 * blockSize * blockDim. x + threadIdx. x; int gid = 2 * blockDim. x * blockIdx. x + threadIdx. x; if( gid + blockSize < size ) { if( operation == tnlMin ) sdata[ tid ] = :: tnlCudaMin( d_input[ gid ], d_input[ gid + blockSize ] ); if( operation == tnlMax ) sdata[ tid ] = :: tnlCudaMax( d_input[ gid ], d_input[ gid + blockSize ] ); if( operation == tnlSum ) sdata[ tid ] = d_input[ gid ] + d_input[ gid + blockSize ]; //dbg_array1[ blockIdx. x * blockDim. x + threadIdx. x ] = -1; } else if( gid < size ) { sdata[ tid ] = d_input[ gid ]; //dbg_array1[ blockIdx. x * blockDim. x + threadIdx. x ] = -2; } gid += grid_size; Loading @@ -80,8 +82,9 @@ __global__ void tnlCUDAReductionKernel( const int size, if( operation == tnlMax ) sdata[ tid ] = :: tnlCudaMax( sdata[ tid ], :: tnlCudaMax( d_input[ gid ], d_input[ gid + blockSize ] ) ); if( operation == tnlSum ) sdata[ tid ] += d_input[gid] + d_input[ gid + blockSize ]; gid += grid_size; //dbg_array1[ blockIdx. x * blockDim. x + threadIdx. x ] = -4; } dbg_array1[ blockIdx. x * blockDim. x + threadIdx. x ] = sdata[ tid ]; //dbg_array1[ blockIdx. x * blockDim. x + threadIdx. x ] = sdata[ tid ]; __syncthreads(); if( gid + blockDim. x < size ) Loading Loading @@ -216,6 +219,9 @@ bool tnlCUDAReduction( const int size, T& result, T* device_output = 0 ) { if( ! size ) return false; //Calculate necessary block/grid dimensions const int cpuThreshold = 1; const int desBlockSize = 16; //Desired block size Loading @@ -235,7 +241,7 @@ bool tnlCUDAReduction( const int size, } device_output_allocated = true; cudaMalloc( ( void** ) &dbg_array1, size * sizeof( T ) ); //!!!!!!!!!!!!!!!!!!!!!!!! //cudaMalloc( ( void** ) &dbg_array1, size * sizeof( T ) ); //!!!!!!!!!!!!!!!!!!!!!!!! //cudaMalloc( ( void** ) &dbg_array2, desBlockSize * sizeof( T ) ); //!!!!!!!!!!!!!!!!!!!!!!!!! } dim3 block_size( 0 ), grid_size( 0 ); Loading @@ -247,40 +253,40 @@ bool tnlCUDAReduction( const int size, block_size. x = :: Min( size_reduced, desBlockSize ); grid_size. x = :: Min( ( int ) ( size_reduced / block_size. x + 1 ) / 2, desGridSize ); shmem = block_size. x * sizeof( T ); cout << "Size: " << size_reduced /*cout << "Size: " << size_reduced << " Grid size: " << grid_size. x << " Block size: " << block_size. x << " Shmem: " << shmem << endl; << " Shmem: " << shmem << endl;*/ tnlAssert( shmem < 16384, cerr << shmem << " bytes are required." << endl; ); tnlAssert( block_size. x <= 512, cerr << "Block size is " << block_size. x << endl; ); switch( block_size. x ) { case 512: tnlCUDAReductionKernel< T, operation, 512 ><<< grid_size, block_size, shmem >>>( size_reduced, grid_size. x * block_size. x, reduction_input, device_output, dbg_array1 ); tnlCUDAReductionKernel< T, operation, 512 ><<< grid_size, block_size, shmem >>>( size_reduced, 2 * grid_size. x * block_size. x, reduction_input, device_output, dbg_array1 ); break; case 256: tnlCUDAReductionKernel< T, operation, 256 ><<< grid_size, block_size, shmem >>>( size_reduced, grid_size. x * block_size. x, reduction_input, device_output, dbg_array1 ); tnlCUDAReductionKernel< T, operation, 256 ><<< grid_size, block_size, shmem >>>( size_reduced, 2 * grid_size. x * block_size. x, reduction_input, device_output, dbg_array1 ); break; case 128: tnlCUDAReductionKernel< T, operation, 128 ><<< grid_size, block_size, shmem >>>( size_reduced, grid_size. x * block_size. x, reduction_input, device_output, dbg_array1 ); tnlCUDAReductionKernel< T, operation, 128 ><<< grid_size, block_size, shmem >>>( size_reduced, 2 * grid_size. x * block_size. x, reduction_input, device_output, dbg_array1 ); break; case 64: tnlCUDAReductionKernel< T, operation, 64 ><<< grid_size, block_size, shmem >>>( size_reduced, grid_size. x * block_size. x, reduction_input, device_output, dbg_array1 ); tnlCUDAReductionKernel< T, operation, 64 ><<< grid_size, block_size, shmem >>>( size_reduced, 2 * grid_size. x * block_size. x, reduction_input, device_output, dbg_array1 ); break; case 32: tnlCUDAReductionKernel< T, operation, 32 ><<< grid_size, block_size, shmem >>>( size_reduced, grid_size. x * block_size. x, reduction_input, device_output, dbg_array1 ); tnlCUDAReductionKernel< T, operation, 32 ><<< grid_size, block_size, shmem >>>( size_reduced, 2 * grid_size. x * block_size. x, reduction_input, device_output, dbg_array1 ); break; case 16: tnlCUDAReductionKernel< T, operation, 16 ><<< grid_size, block_size, shmem >>>( size_reduced, grid_size. x * block_size. x, reduction_input, device_output, dbg_array1 ); tnlCUDAReductionKernel< T, operation, 16 ><<< grid_size, block_size, shmem >>>( size_reduced, 2 * grid_size. x * block_size. x, reduction_input, device_output, dbg_array1 ); break; case 8: tnlCUDAReductionKernel< T, operation, 8 ><<< grid_size, block_size, shmem >>>( size_reduced, grid_size. x * block_size. x, reduction_input, device_output, dbg_array1 ); tnlCUDAReductionKernel< T, operation, 8 ><<< grid_size, block_size, shmem >>>( size_reduced, 2 * grid_size. x * block_size. x, reduction_input, device_output, dbg_array1 ); break; case 4: tnlCUDAReductionKernel< T, operation, 4 ><<< grid_size, block_size, shmem >>>( size_reduced, grid_size. x * block_size. x, reduction_input, device_output, dbg_array1 ); tnlCUDAReductionKernel< T, operation, 4 ><<< grid_size, block_size, shmem >>>( size_reduced, 2 * grid_size. x * block_size. x, reduction_input, device_output, dbg_array1 ); break; case 2: tnlCUDAReductionKernel< T, operation, 2 ><<< grid_size, block_size, shmem >>>( size_reduced, grid_size. x * block_size. x, reduction_input, device_output, dbg_array1 ); tnlCUDAReductionKernel< T, operation, 2 ><<< grid_size, block_size, shmem >>>( size_reduced, 2 * grid_size. x * block_size. x, reduction_input, device_output, dbg_array1 ); break; case 1: tnlAssert( false, cerr << "blockSize should not be 1." << endl ); Loading @@ -293,21 +299,24 @@ bool tnlCUDAReduction( const int size, reduction_input = device_output; // debuging part T* host_array = new T[ size ]; /*T* host_array = new T[ size ]; cudaMemcpy( host_array, dbg_array1, size * sizeof( T ), cudaMemcpyDeviceToHost ); for( int i = 0; i< size; i ++ ) cout << host_array[ i ] << " - "; cout << endl; cout << endl;*/ T* output = new T[ size_reduced ]; /*T* output = new T[ size_reduced ]; cudaMemcpy( output, device_output, size_reduced * sizeof( T ), cudaMemcpyDeviceToHost ); cout << endl; for( int i = 0; i < size_reduced; i ++ ) cout << output[ i ] << " "; cout << endl; delete[] output; delete[] output;*/ } T* host_output = new T[ size_reduced ]; if( size == 1 ) cudaMemcpy( host_output, device_input, sizeof( T ), cudaMemcpyDeviceToHost ); else cudaMemcpy( host_output, device_output, size_reduced * sizeof( T ), cudaMemcpyDeviceToHost ); result = host_output[ 0 ]; for( int i = 1; i < size_reduced; i++ ) Loading Loading @@ -563,6 +572,9 @@ bool tnlCUDASimpleReduction5( const int size, delete[] output;*/ } T* host_output = new T[ size_reduced ]; if( size == 1 ) cudaMemcpy( host_output, device_input, sizeof( T ), cudaMemcpyDeviceToHost ); else cudaMemcpy( host_output, device_output, size_reduced * sizeof( T ), cudaMemcpyDeviceToHost ); result = host_output[ 0 ]; for( int i = 1; i < size_reduced; i++ ) Loading Loading @@ -615,11 +627,13 @@ __global__ void tnlCUDASimpleReductionKernel4( const int size, sdata[ tid ] = d_input[ gid ]; } __syncthreads(); dbg_array1[ tid ] = sdata[ tid ]; // Parallel reduction for( int s = blockDim. x / 2; s > 0; s >>= 1 ) int n = last_tid < blockDim. x ? last_tid : blockDim. x; for( int s = n / 2; s > 0; s >>= 1 ) { if( tid < s && tid + s < last_tid ) if( tid < s && tid + s < n ) { if( operation == tnlMin ) sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + s ] ); Loading @@ -628,7 +642,25 @@ __global__ void tnlCUDASimpleReductionKernel4( const int size, if( operation == tnlSum ) sdata[ tid ] += sdata[ tid + s ]; } /* This is for the case when we have odd number of elements. * The last one will be reduced using the thread with ID 0. */ if( 2 * s < n && tid == 0 ) { if( operation == tnlMin ) sdata[ 0 ] = tnlCudaMin( sdata[ 0 ], sdata[ n ] ); if( operation == tnlMax ) sdata[ 0 ] = tnlCudaMax( sdata[ 0 ], sdata[ n ] ); if( operation == tnlSum ) sdata[ 0 ] += sdata[ n ]; dbg_array1[ n ] = -555; //sdata[ 0 ]; } n = s; __syncthreads(); //dbg_array1[ tid ] = -sdata[ tid ]; } // Store the result back in global memory Loading Loading @@ -660,7 +692,7 @@ bool tnlCUDASimpleReduction4( const int size, } device_output_allocated = true; //cudaMalloc( ( void** ) &dbg_array1, desBlockSize * sizeof( T ) ); //!!!!!!!!!!!!!!!!!!!!!!!! cudaMalloc( ( void** ) &dbg_array1, desBlockSize * sizeof( T ) ); //!!!!!!!!!!!!!!!!!!!!!!!! //cudaMalloc( ( void** ) &dbg_array2, desBlockSize * sizeof( T ) ); //!!!!!!!!!!!!!!!!!!!!!!!!! } dim3 block_size( 0 ), grid_size( 0 ); Loading @@ -670,18 +702,20 @@ bool tnlCUDASimpleReduction4( const int size, while( size_reduced > cpuThreshold ) { block_size. x = :: Min( size_reduced, desBlockSize ); grid_size. x = ( size_reduced / block_size. x + 1 ) / 2; grid_size. x = size_reduced / block_size. x / 2; if( grid_size. x * 2 * block_size. x < size_reduced ) grid_size. x ++; shmem = block_size. x * sizeof( T ); /*cout << "Size: " << size_reduced cout << "Size: " << size_reduced << " Grid size: " << grid_size. x << " Block size: " << block_size. x << " Shmem: " << shmem << endl;*/ << " Shmem: " << shmem << endl; tnlCUDASimpleReductionKernel4< T, operation ><<< grid_size. x, block_size, shmem >>>( size_reduced, reduction_input, device_output, dbg_array1 ); size_reduced = grid_size. x; reduction_input = device_output; // debuging part /*T* host_array = new T[ desBlockSize ]; T* host_array = new T[ desBlockSize ]; cudaMemcpy( host_array, dbg_array1, desBlockSize * sizeof( T ), cudaMemcpyDeviceToHost ); for( int i = 0; i< :: Min( ( int ) block_size. x, desBlockSize ); i ++ ) cout << host_array[ i ] << " - "; Loading @@ -693,9 +727,12 @@ bool tnlCUDASimpleReduction4( const int size, for( int i = 0; i < size_reduced; i ++ ) cout << output[ i ] << " "; cout << endl; delete[] output;*/ delete[] output; } T* host_output = new T[ size_reduced ]; if( size == 1 ) cudaMemcpy( host_output, device_input, sizeof( T ), cudaMemcpyDeviceToHost ); else cudaMemcpy( host_output, device_output, size_reduced * sizeof( T ), cudaMemcpyDeviceToHost ); result = host_output[ 0 ]; for( int i = 1; i < size_reduced; i++ ) Loading Loading @@ -783,7 +820,7 @@ bool tnlCUDASimpleReduction3( const int size, while( size_reduced > cpuThreshold ) { block_size. x = :: Min( size_reduced, desBlockSize ); grid_size. x = size_reduced / block_size. x; grid_size. x = size_reduced / block_size. x + ( size_reduced % block_size. x != 0 ); shmem = block_size. x * sizeof( T ); /*cout << "Size: " << size_reduced << " Grid size: " << grid_size. x Loading @@ -803,6 +840,9 @@ bool tnlCUDASimpleReduction3( const int size, delete[] output;*/ } T* host_output = new T[ size_reduced ]; if( size == 1 ) cudaMemcpy( host_output, device_input, sizeof( T ), cudaMemcpyDeviceToHost ); else cudaMemcpy( host_output, device_output, size_reduced * sizeof( T ), cudaMemcpyDeviceToHost ); result = host_output[ 0 ]; for( int i = 1; i < size_reduced; i++ ) Loading Loading @@ -891,7 +931,7 @@ bool tnlCUDASimpleReduction2( const int size, while( size_reduced > cpuThreshold ) { block_size. x = :: Min( size_reduced, desBlockSize ); grid_size. x = size_reduced / block_size. x; grid_size. x = size_reduced / block_size. x + ( size_reduced % block_size. x != 0 ); shmem = block_size. x * sizeof( T ); /*cout << "Size: " << size_reduced << " Grid size: " << grid_size. x Loading @@ -911,6 +951,9 @@ bool tnlCUDASimpleReduction2( const int size, delete[] output;*/ } T* host_output = new T[ size_reduced ]; if( size == 1 ) cudaMemcpy( host_output, device_input, sizeof( T ), cudaMemcpyDeviceToHost ); else cudaMemcpy( host_output, device_output, size_reduced * sizeof( T ), cudaMemcpyDeviceToHost ); result = host_output[ 0 ]; for( int i = 1; i < size_reduced; i++ ) Loading Loading @@ -998,7 +1041,7 @@ bool tnlCUDASimpleReduction1( const int size, while( size_reduced > cpuThreshold ) { block_size. x = :: Min( size_reduced, desBlockSize ); grid_size. x = size_reduced / block_size. x; grid_size. x = size_reduced / block_size. x + ( size_reduced % block_size. x != 0 ); shmem = block_size. x * sizeof( T ); /*cout << "Size: " << size_reduced << " Grid size: " << grid_size. x Loading @@ -1018,6 +1061,9 @@ bool tnlCUDASimpleReduction1( const int size, delete[] output;*/ } T* host_output = new T[ size_reduced ]; if( size == 1 ) cudaMemcpy( host_output, device_input, sizeof( T ), cudaMemcpyDeviceToHost ); else cudaMemcpy( host_output, device_output, size_reduced * sizeof( T ), cudaMemcpyDeviceToHost ); result = host_output[ 0 ]; for( int i = 1; i < size_reduced; i++ ) Loading src/core/tnlCUDAKernelsTester.h +127 −25 Original line number Diff line number Diff line Loading @@ -19,6 +19,7 @@ #define TNLCUDAKERNELSTESTER_H_ #include <iostream> #include <math.h> #include <cppunit/TestSuite.h> #include <cppunit/TestResult.h> #include <cppunit/TestCaller.h> Loading Loading @@ -98,20 +99,21 @@ template< class T > class tnlCUDAKernelsTester : public CppUnit :: TestCase return true; } void testReduction( int algorithm_efficiency = 0 ) bool mainReduction( const tnlLongVector< T >& host_input, int algorithm_efficiency, const int desired_block_size, const int desired_grid_size ) { int size = 32; 1024; //1<<10; int desBlockSize = 128; //Desired block size int desGridSize = 2048; //Impose limitation on grid size so that threads could perform sequential work tnlLongVector< T > host_input; const int size = host_input. GetSize(); tnlLongVectorCUDA< T > device_input; CPPUNIT_ASSERT( testSetup( host_input, device_input, size ) ); if( ! device_input. SetNewSize( size ) ) return false; device_input. copyFrom( host_input ); T seq_min( host_input[ 0 ] ), seq_max( host_input[ 0 ] ), seq_sum( host_input[ 0 ] ); for( int i = 1; i < size; i ++ ) { seq_min = :: Min( seq_min, host_input[ i ] ); Loading @@ -119,11 +121,6 @@ template< class T > class tnlCUDAKernelsTester : public CppUnit :: TestCase seq_sum += host_input[ i ]; } //Calculate necessary block/grid dimensions int block_size = :: Min( size/2, desBlockSize ); //Grid size is limited in this case int grid_size = :: Min( desGridSize, size / block_size / 2 ); T min, max, sum; switch( algorithm_efficiency ) { Loading Loading @@ -159,13 +156,118 @@ template< class T > class tnlCUDAKernelsTester : public CppUnit :: TestCase } cout << "Min: " << min << " Seq. min: " << seq_min << endl << "Max: " << max << " Seq. max: " << seq_max << endl << "Sum: " << sum << " Seq. sum: " << seq_sum << endl; if( min == seq_min ) cout << "Min: " << min << " Seq. min: " << seq_min << " :-)" << endl; else cout << "Min: " << min << " Seq. min: " << seq_min << " !!!!!!!!!!" << endl; if( max == seq_max ) cout << "Max: " << max << " Seq. max: " << seq_max << " :-)" << endl; else cout << "Max: " << max << " Seq. max: " << seq_max << " !!!!!!!!!!" << endl; if( sum == seq_sum ) cout << "Sum: " << sum << " Seq. sum: " << seq_sum << " :-)" << endl; else cout << "Sum: " << sum << " Seq. sum: " << seq_sum << " !!!!!!!!!!" << endl; T param; if( GetParameterType( param ) == "float" ) { CPPUNIT_ASSERT( min == seq_min ); CPPUNIT_ASSERT( max == seq_max ); if( sum == 0.0 ) { CPPUNIT_ASSERT( sum == seq_sum ); } else { double diff = ( ( double ) sum - ( double ) seq_sum ) / ( double) sum; CPPUNIT_ASSERT( fabs( diff ) < 1.0e-5 ); } } else { CPPUNIT_ASSERT( min == seq_min ); CPPUNIT_ASSERT( max == seq_max ); CPPUNIT_ASSERT( sum == seq_sum ); } } void testReduction( int algorithm_efficiency = 0 ) { tnlLongVector< T > host_input; int size = 2; /*for( int s = 1; s < 12; s ++ ) { tnlLongVector< T > host_input( size ); for( int i = 0; i < size; i ++ ) host_input[ i ] = 0.0; mainReduction( host_input, algorithm_efficiency, 256, 2048 ); for( int i = 0; i < size; i ++ ) host_input[ i ] = 1.0; mainReduction( host_input, algorithm_efficiency, 256, 2048 ); for( int i = 0; i < size; i ++ ) host_input[ i ] = i; mainReduction( host_input, algorithm_efficiency, 256, 2048 ); for( int i = 0; i < size; i ++ ) host_input[ i ] = ( i - size / 2 ) * ( i - size / 2 ); mainReduction( host_input, algorithm_efficiency, 256, 2048 ); size *= 2; }*/ for( size = 257; size < 5000; size ++ ) { cout << "************* Size is " << size << " ******************* " << endl; tnlLongVector< T > host_input( size ); /*for( int i = 0; i < size; i ++ ) host_input[ i ] = 0.0; mainReduction( host_input, algorithm_efficiency, 256, 2048 );*/ for( int i = 0; i < size; i ++ ) host_input[ i ] = 1.0; mainReduction( host_input, algorithm_efficiency, 256, 2048 ); for( int i = 0; i < size; i ++ ) host_input[ i ] = i; mainReduction( host_input, algorithm_efficiency, 256, 2048 ); for( int i = 0; i < size; i ++ ) host_input[ i ] = ( i - size / 2 ) * ( i - size / 2 ); mainReduction( host_input, algorithm_efficiency, 256, 2048 ); } }; Loading Loading @@ -196,13 +298,13 @@ template< class T > class tnlCUDAKernelsTester : public CppUnit :: TestCase void testSimpleReduction2() { cout << "Test reduction 2" << endl; testReduction( 2 ); //testReduction( 2 ); }; void testSimpleReduction1() { cout << "Test reduction 1" << endl; testReduction( 1 ); //testReduction( 1 ); }; Loading Loading
src/core/tnl-cuda-kernels.h +85 −39 Original line number Diff line number Diff line Loading @@ -60,17 +60,19 @@ __global__ void tnlCUDAReductionKernel( const int size, // Read data into the shared memory int tid = threadIdx. x; int gid = 2 * blockSize * blockDim. x + threadIdx. x; int gid = 2 * blockDim. x * blockIdx. x + threadIdx. x; if( gid + blockSize < size ) { if( operation == tnlMin ) sdata[ tid ] = :: tnlCudaMin( d_input[ gid ], d_input[ gid + blockSize ] ); if( operation == tnlMax ) sdata[ tid ] = :: tnlCudaMax( d_input[ gid ], d_input[ gid + blockSize ] ); if( operation == tnlSum ) sdata[ tid ] = d_input[ gid ] + d_input[ gid + blockSize ]; //dbg_array1[ blockIdx. x * blockDim. x + threadIdx. x ] = -1; } else if( gid < size ) { sdata[ tid ] = d_input[ gid ]; //dbg_array1[ blockIdx. x * blockDim. x + threadIdx. x ] = -2; } gid += grid_size; Loading @@ -80,8 +82,9 @@ __global__ void tnlCUDAReductionKernel( const int size, if( operation == tnlMax ) sdata[ tid ] = :: tnlCudaMax( sdata[ tid ], :: tnlCudaMax( d_input[ gid ], d_input[ gid + blockSize ] ) ); if( operation == tnlSum ) sdata[ tid ] += d_input[gid] + d_input[ gid + blockSize ]; gid += grid_size; //dbg_array1[ blockIdx. x * blockDim. x + threadIdx. x ] = -4; } dbg_array1[ blockIdx. x * blockDim. x + threadIdx. x ] = sdata[ tid ]; //dbg_array1[ blockIdx. x * blockDim. x + threadIdx. x ] = sdata[ tid ]; __syncthreads(); if( gid + blockDim. x < size ) Loading Loading @@ -216,6 +219,9 @@ bool tnlCUDAReduction( const int size, T& result, T* device_output = 0 ) { if( ! size ) return false; //Calculate necessary block/grid dimensions const int cpuThreshold = 1; const int desBlockSize = 16; //Desired block size Loading @@ -235,7 +241,7 @@ bool tnlCUDAReduction( const int size, } device_output_allocated = true; cudaMalloc( ( void** ) &dbg_array1, size * sizeof( T ) ); //!!!!!!!!!!!!!!!!!!!!!!!! //cudaMalloc( ( void** ) &dbg_array1, size * sizeof( T ) ); //!!!!!!!!!!!!!!!!!!!!!!!! //cudaMalloc( ( void** ) &dbg_array2, desBlockSize * sizeof( T ) ); //!!!!!!!!!!!!!!!!!!!!!!!!! } dim3 block_size( 0 ), grid_size( 0 ); Loading @@ -247,40 +253,40 @@ bool tnlCUDAReduction( const int size, block_size. x = :: Min( size_reduced, desBlockSize ); grid_size. x = :: Min( ( int ) ( size_reduced / block_size. x + 1 ) / 2, desGridSize ); shmem = block_size. x * sizeof( T ); cout << "Size: " << size_reduced /*cout << "Size: " << size_reduced << " Grid size: " << grid_size. x << " Block size: " << block_size. x << " Shmem: " << shmem << endl; << " Shmem: " << shmem << endl;*/ tnlAssert( shmem < 16384, cerr << shmem << " bytes are required." << endl; ); tnlAssert( block_size. x <= 512, cerr << "Block size is " << block_size. x << endl; ); switch( block_size. x ) { case 512: tnlCUDAReductionKernel< T, operation, 512 ><<< grid_size, block_size, shmem >>>( size_reduced, grid_size. x * block_size. x, reduction_input, device_output, dbg_array1 ); tnlCUDAReductionKernel< T, operation, 512 ><<< grid_size, block_size, shmem >>>( size_reduced, 2 * grid_size. x * block_size. x, reduction_input, device_output, dbg_array1 ); break; case 256: tnlCUDAReductionKernel< T, operation, 256 ><<< grid_size, block_size, shmem >>>( size_reduced, grid_size. x * block_size. x, reduction_input, device_output, dbg_array1 ); tnlCUDAReductionKernel< T, operation, 256 ><<< grid_size, block_size, shmem >>>( size_reduced, 2 * grid_size. x * block_size. x, reduction_input, device_output, dbg_array1 ); break; case 128: tnlCUDAReductionKernel< T, operation, 128 ><<< grid_size, block_size, shmem >>>( size_reduced, grid_size. x * block_size. x, reduction_input, device_output, dbg_array1 ); tnlCUDAReductionKernel< T, operation, 128 ><<< grid_size, block_size, shmem >>>( size_reduced, 2 * grid_size. x * block_size. x, reduction_input, device_output, dbg_array1 ); break; case 64: tnlCUDAReductionKernel< T, operation, 64 ><<< grid_size, block_size, shmem >>>( size_reduced, grid_size. x * block_size. x, reduction_input, device_output, dbg_array1 ); tnlCUDAReductionKernel< T, operation, 64 ><<< grid_size, block_size, shmem >>>( size_reduced, 2 * grid_size. x * block_size. x, reduction_input, device_output, dbg_array1 ); break; case 32: tnlCUDAReductionKernel< T, operation, 32 ><<< grid_size, block_size, shmem >>>( size_reduced, grid_size. x * block_size. x, reduction_input, device_output, dbg_array1 ); tnlCUDAReductionKernel< T, operation, 32 ><<< grid_size, block_size, shmem >>>( size_reduced, 2 * grid_size. x * block_size. x, reduction_input, device_output, dbg_array1 ); break; case 16: tnlCUDAReductionKernel< T, operation, 16 ><<< grid_size, block_size, shmem >>>( size_reduced, grid_size. x * block_size. x, reduction_input, device_output, dbg_array1 ); tnlCUDAReductionKernel< T, operation, 16 ><<< grid_size, block_size, shmem >>>( size_reduced, 2 * grid_size. x * block_size. x, reduction_input, device_output, dbg_array1 ); break; case 8: tnlCUDAReductionKernel< T, operation, 8 ><<< grid_size, block_size, shmem >>>( size_reduced, grid_size. x * block_size. x, reduction_input, device_output, dbg_array1 ); tnlCUDAReductionKernel< T, operation, 8 ><<< grid_size, block_size, shmem >>>( size_reduced, 2 * grid_size. x * block_size. x, reduction_input, device_output, dbg_array1 ); break; case 4: tnlCUDAReductionKernel< T, operation, 4 ><<< grid_size, block_size, shmem >>>( size_reduced, grid_size. x * block_size. x, reduction_input, device_output, dbg_array1 ); tnlCUDAReductionKernel< T, operation, 4 ><<< grid_size, block_size, shmem >>>( size_reduced, 2 * grid_size. x * block_size. x, reduction_input, device_output, dbg_array1 ); break; case 2: tnlCUDAReductionKernel< T, operation, 2 ><<< grid_size, block_size, shmem >>>( size_reduced, grid_size. x * block_size. x, reduction_input, device_output, dbg_array1 ); tnlCUDAReductionKernel< T, operation, 2 ><<< grid_size, block_size, shmem >>>( size_reduced, 2 * grid_size. x * block_size. x, reduction_input, device_output, dbg_array1 ); break; case 1: tnlAssert( false, cerr << "blockSize should not be 1." << endl ); Loading @@ -293,21 +299,24 @@ bool tnlCUDAReduction( const int size, reduction_input = device_output; // debuging part T* host_array = new T[ size ]; /*T* host_array = new T[ size ]; cudaMemcpy( host_array, dbg_array1, size * sizeof( T ), cudaMemcpyDeviceToHost ); for( int i = 0; i< size; i ++ ) cout << host_array[ i ] << " - "; cout << endl; cout << endl;*/ T* output = new T[ size_reduced ]; /*T* output = new T[ size_reduced ]; cudaMemcpy( output, device_output, size_reduced * sizeof( T ), cudaMemcpyDeviceToHost ); cout << endl; for( int i = 0; i < size_reduced; i ++ ) cout << output[ i ] << " "; cout << endl; delete[] output; delete[] output;*/ } T* host_output = new T[ size_reduced ]; if( size == 1 ) cudaMemcpy( host_output, device_input, sizeof( T ), cudaMemcpyDeviceToHost ); else cudaMemcpy( host_output, device_output, size_reduced * sizeof( T ), cudaMemcpyDeviceToHost ); result = host_output[ 0 ]; for( int i = 1; i < size_reduced; i++ ) Loading Loading @@ -563,6 +572,9 @@ bool tnlCUDASimpleReduction5( const int size, delete[] output;*/ } T* host_output = new T[ size_reduced ]; if( size == 1 ) cudaMemcpy( host_output, device_input, sizeof( T ), cudaMemcpyDeviceToHost ); else cudaMemcpy( host_output, device_output, size_reduced * sizeof( T ), cudaMemcpyDeviceToHost ); result = host_output[ 0 ]; for( int i = 1; i < size_reduced; i++ ) Loading Loading @@ -615,11 +627,13 @@ __global__ void tnlCUDASimpleReductionKernel4( const int size, sdata[ tid ] = d_input[ gid ]; } __syncthreads(); dbg_array1[ tid ] = sdata[ tid ]; // Parallel reduction for( int s = blockDim. x / 2; s > 0; s >>= 1 ) int n = last_tid < blockDim. x ? last_tid : blockDim. x; for( int s = n / 2; s > 0; s >>= 1 ) { if( tid < s && tid + s < last_tid ) if( tid < s && tid + s < n ) { if( operation == tnlMin ) sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + s ] ); Loading @@ -628,7 +642,25 @@ __global__ void tnlCUDASimpleReductionKernel4( const int size, if( operation == tnlSum ) sdata[ tid ] += sdata[ tid + s ]; } /* This is for the case when we have odd number of elements. * The last one will be reduced using the thread with ID 0. */ if( 2 * s < n && tid == 0 ) { if( operation == tnlMin ) sdata[ 0 ] = tnlCudaMin( sdata[ 0 ], sdata[ n ] ); if( operation == tnlMax ) sdata[ 0 ] = tnlCudaMax( sdata[ 0 ], sdata[ n ] ); if( operation == tnlSum ) sdata[ 0 ] += sdata[ n ]; dbg_array1[ n ] = -555; //sdata[ 0 ]; } n = s; __syncthreads(); //dbg_array1[ tid ] = -sdata[ tid ]; } // Store the result back in global memory Loading Loading @@ -660,7 +692,7 @@ bool tnlCUDASimpleReduction4( const int size, } device_output_allocated = true; //cudaMalloc( ( void** ) &dbg_array1, desBlockSize * sizeof( T ) ); //!!!!!!!!!!!!!!!!!!!!!!!! cudaMalloc( ( void** ) &dbg_array1, desBlockSize * sizeof( T ) ); //!!!!!!!!!!!!!!!!!!!!!!!! //cudaMalloc( ( void** ) &dbg_array2, desBlockSize * sizeof( T ) ); //!!!!!!!!!!!!!!!!!!!!!!!!! } dim3 block_size( 0 ), grid_size( 0 ); Loading @@ -670,18 +702,20 @@ bool tnlCUDASimpleReduction4( const int size, while( size_reduced > cpuThreshold ) { block_size. x = :: Min( size_reduced, desBlockSize ); grid_size. x = ( size_reduced / block_size. x + 1 ) / 2; grid_size. x = size_reduced / block_size. x / 2; if( grid_size. x * 2 * block_size. x < size_reduced ) grid_size. x ++; shmem = block_size. x * sizeof( T ); /*cout << "Size: " << size_reduced cout << "Size: " << size_reduced << " Grid size: " << grid_size. x << " Block size: " << block_size. x << " Shmem: " << shmem << endl;*/ << " Shmem: " << shmem << endl; tnlCUDASimpleReductionKernel4< T, operation ><<< grid_size. x, block_size, shmem >>>( size_reduced, reduction_input, device_output, dbg_array1 ); size_reduced = grid_size. x; reduction_input = device_output; // debuging part /*T* host_array = new T[ desBlockSize ]; T* host_array = new T[ desBlockSize ]; cudaMemcpy( host_array, dbg_array1, desBlockSize * sizeof( T ), cudaMemcpyDeviceToHost ); for( int i = 0; i< :: Min( ( int ) block_size. x, desBlockSize ); i ++ ) cout << host_array[ i ] << " - "; Loading @@ -693,9 +727,12 @@ bool tnlCUDASimpleReduction4( const int size, for( int i = 0; i < size_reduced; i ++ ) cout << output[ i ] << " "; cout << endl; delete[] output;*/ delete[] output; } T* host_output = new T[ size_reduced ]; if( size == 1 ) cudaMemcpy( host_output, device_input, sizeof( T ), cudaMemcpyDeviceToHost ); else cudaMemcpy( host_output, device_output, size_reduced * sizeof( T ), cudaMemcpyDeviceToHost ); result = host_output[ 0 ]; for( int i = 1; i < size_reduced; i++ ) Loading Loading @@ -783,7 +820,7 @@ bool tnlCUDASimpleReduction3( const int size, while( size_reduced > cpuThreshold ) { block_size. x = :: Min( size_reduced, desBlockSize ); grid_size. x = size_reduced / block_size. x; grid_size. x = size_reduced / block_size. x + ( size_reduced % block_size. x != 0 ); shmem = block_size. x * sizeof( T ); /*cout << "Size: " << size_reduced << " Grid size: " << grid_size. x Loading @@ -803,6 +840,9 @@ bool tnlCUDASimpleReduction3( const int size, delete[] output;*/ } T* host_output = new T[ size_reduced ]; if( size == 1 ) cudaMemcpy( host_output, device_input, sizeof( T ), cudaMemcpyDeviceToHost ); else cudaMemcpy( host_output, device_output, size_reduced * sizeof( T ), cudaMemcpyDeviceToHost ); result = host_output[ 0 ]; for( int i = 1; i < size_reduced; i++ ) Loading Loading @@ -891,7 +931,7 @@ bool tnlCUDASimpleReduction2( const int size, while( size_reduced > cpuThreshold ) { block_size. x = :: Min( size_reduced, desBlockSize ); grid_size. x = size_reduced / block_size. x; grid_size. x = size_reduced / block_size. x + ( size_reduced % block_size. x != 0 ); shmem = block_size. x * sizeof( T ); /*cout << "Size: " << size_reduced << " Grid size: " << grid_size. x Loading @@ -911,6 +951,9 @@ bool tnlCUDASimpleReduction2( const int size, delete[] output;*/ } T* host_output = new T[ size_reduced ]; if( size == 1 ) cudaMemcpy( host_output, device_input, sizeof( T ), cudaMemcpyDeviceToHost ); else cudaMemcpy( host_output, device_output, size_reduced * sizeof( T ), cudaMemcpyDeviceToHost ); result = host_output[ 0 ]; for( int i = 1; i < size_reduced; i++ ) Loading Loading @@ -998,7 +1041,7 @@ bool tnlCUDASimpleReduction1( const int size, while( size_reduced > cpuThreshold ) { block_size. x = :: Min( size_reduced, desBlockSize ); grid_size. x = size_reduced / block_size. x; grid_size. x = size_reduced / block_size. x + ( size_reduced % block_size. x != 0 ); shmem = block_size. x * sizeof( T ); /*cout << "Size: " << size_reduced << " Grid size: " << grid_size. x Loading @@ -1018,6 +1061,9 @@ bool tnlCUDASimpleReduction1( const int size, delete[] output;*/ } T* host_output = new T[ size_reduced ]; if( size == 1 ) cudaMemcpy( host_output, device_input, sizeof( T ), cudaMemcpyDeviceToHost ); else cudaMemcpy( host_output, device_output, size_reduced * sizeof( T ), cudaMemcpyDeviceToHost ); result = host_output[ 0 ]; for( int i = 1; i < size_reduced; i++ ) Loading
src/core/tnlCUDAKernelsTester.h +127 −25 Original line number Diff line number Diff line Loading @@ -19,6 +19,7 @@ #define TNLCUDAKERNELSTESTER_H_ #include <iostream> #include <math.h> #include <cppunit/TestSuite.h> #include <cppunit/TestResult.h> #include <cppunit/TestCaller.h> Loading Loading @@ -98,20 +99,21 @@ template< class T > class tnlCUDAKernelsTester : public CppUnit :: TestCase return true; } void testReduction( int algorithm_efficiency = 0 ) bool mainReduction( const tnlLongVector< T >& host_input, int algorithm_efficiency, const int desired_block_size, const int desired_grid_size ) { int size = 32; 1024; //1<<10; int desBlockSize = 128; //Desired block size int desGridSize = 2048; //Impose limitation on grid size so that threads could perform sequential work tnlLongVector< T > host_input; const int size = host_input. GetSize(); tnlLongVectorCUDA< T > device_input; CPPUNIT_ASSERT( testSetup( host_input, device_input, size ) ); if( ! device_input. SetNewSize( size ) ) return false; device_input. copyFrom( host_input ); T seq_min( host_input[ 0 ] ), seq_max( host_input[ 0 ] ), seq_sum( host_input[ 0 ] ); for( int i = 1; i < size; i ++ ) { seq_min = :: Min( seq_min, host_input[ i ] ); Loading @@ -119,11 +121,6 @@ template< class T > class tnlCUDAKernelsTester : public CppUnit :: TestCase seq_sum += host_input[ i ]; } //Calculate necessary block/grid dimensions int block_size = :: Min( size/2, desBlockSize ); //Grid size is limited in this case int grid_size = :: Min( desGridSize, size / block_size / 2 ); T min, max, sum; switch( algorithm_efficiency ) { Loading Loading @@ -159,13 +156,118 @@ template< class T > class tnlCUDAKernelsTester : public CppUnit :: TestCase } cout << "Min: " << min << " Seq. min: " << seq_min << endl << "Max: " << max << " Seq. max: " << seq_max << endl << "Sum: " << sum << " Seq. sum: " << seq_sum << endl; if( min == seq_min ) cout << "Min: " << min << " Seq. min: " << seq_min << " :-)" << endl; else cout << "Min: " << min << " Seq. min: " << seq_min << " !!!!!!!!!!" << endl; if( max == seq_max ) cout << "Max: " << max << " Seq. max: " << seq_max << " :-)" << endl; else cout << "Max: " << max << " Seq. max: " << seq_max << " !!!!!!!!!!" << endl; if( sum == seq_sum ) cout << "Sum: " << sum << " Seq. sum: " << seq_sum << " :-)" << endl; else cout << "Sum: " << sum << " Seq. sum: " << seq_sum << " !!!!!!!!!!" << endl; T param; if( GetParameterType( param ) == "float" ) { CPPUNIT_ASSERT( min == seq_min ); CPPUNIT_ASSERT( max == seq_max ); if( sum == 0.0 ) { CPPUNIT_ASSERT( sum == seq_sum ); } else { double diff = ( ( double ) sum - ( double ) seq_sum ) / ( double) sum; CPPUNIT_ASSERT( fabs( diff ) < 1.0e-5 ); } } else { CPPUNIT_ASSERT( min == seq_min ); CPPUNIT_ASSERT( max == seq_max ); CPPUNIT_ASSERT( sum == seq_sum ); } } void testReduction( int algorithm_efficiency = 0 ) { tnlLongVector< T > host_input; int size = 2; /*for( int s = 1; s < 12; s ++ ) { tnlLongVector< T > host_input( size ); for( int i = 0; i < size; i ++ ) host_input[ i ] = 0.0; mainReduction( host_input, algorithm_efficiency, 256, 2048 ); for( int i = 0; i < size; i ++ ) host_input[ i ] = 1.0; mainReduction( host_input, algorithm_efficiency, 256, 2048 ); for( int i = 0; i < size; i ++ ) host_input[ i ] = i; mainReduction( host_input, algorithm_efficiency, 256, 2048 ); for( int i = 0; i < size; i ++ ) host_input[ i ] = ( i - size / 2 ) * ( i - size / 2 ); mainReduction( host_input, algorithm_efficiency, 256, 2048 ); size *= 2; }*/ for( size = 257; size < 5000; size ++ ) { cout << "************* Size is " << size << " ******************* " << endl; tnlLongVector< T > host_input( size ); /*for( int i = 0; i < size; i ++ ) host_input[ i ] = 0.0; mainReduction( host_input, algorithm_efficiency, 256, 2048 );*/ for( int i = 0; i < size; i ++ ) host_input[ i ] = 1.0; mainReduction( host_input, algorithm_efficiency, 256, 2048 ); for( int i = 0; i < size; i ++ ) host_input[ i ] = i; mainReduction( host_input, algorithm_efficiency, 256, 2048 ); for( int i = 0; i < size; i ++ ) host_input[ i ] = ( i - size / 2 ) * ( i - size / 2 ); mainReduction( host_input, algorithm_efficiency, 256, 2048 ); } }; Loading Loading @@ -196,13 +298,13 @@ template< class T > class tnlCUDAKernelsTester : public CppUnit :: TestCase void testSimpleReduction2() { cout << "Test reduction 2" << endl; testReduction( 2 ); //testReduction( 2 ); }; void testSimpleReduction1() { cout << "Test reduction 1" << endl; testReduction( 1 ); //testReduction( 1 ); }; Loading