Loading src/core/tnl-cuda-kernels.cu +161 −1 Original line number Diff line number Diff line Loading @@ -15,7 +15,11 @@ * * ***************************************************************************/ #include <tnl-cuda-kernels.h> #include <iostream> #include <core/mfuncs.h> #include <core/tnl-cuda-kernels.h> using namespace std; int tnlCUDAReductionMin( const int size, const int block_size, Loading Loading @@ -90,4 +94,160 @@ double tnlCUDAReductionSum( const int size, return tnlCUDAReduction< double, tnlSum >( size, block_size, grid_size, input ); } /* * Simple redcution 1 */ int tnlCUDASimpleReduction1Min( const int size, const int block_size, const int grid_size, const int* input, int* output ) { //Calculate necessary block/grid dimensions const int cpuThreshold = 1; const int desBlockSize = 128; //Desired block size dim3 blockSize = :: Min( size, desBlockSize ); dim3 gridSize = size / blockSize. x; unsigned int shmem = blockSize. x * sizeof( int ); cout << "Grid size: " << gridSize. x << endl << "Block size: " << blockSize. x << endl << "Shmem: " << shmem << endl; tnlCUDASimpleReductionKernel1< int, tnlMin ><<< gridSize, blockSize, shmem >>>( size, input, output ); int sizeReduced = gridSize. x; while( sizeReduced > cpuThreshold ) { cout << "Reducing with size reduced = " << sizeReduced << endl; blockSize. x = :: Min( sizeReduced, desBlockSize ); gridSize. x = sizeReduced / blockSize. x; shmem = blockSize. x * sizeof(int); tnlCUDASimpleReductionKernel1< int, tnlMin ><<< gridSize, blockSize, shmem >>>( size, input, output ); sizeReduced = gridSize. x; } int* host_output = new int[ sizeReduced ]; cudaMemcpy( host_output, output, sizeReduced * sizeof(int), cudaMemcpyDeviceToHost ); int result = host_output[ 0 ]; for( int i = 1;i < sizeReduced; i++ ) result = :: Min( result, host_output[ i ] ); delete[] host_output; return result; } int tnlCUDASimpleReduction1Max( const int size, const int block_size, const int grid_size, const int* input, int* output ) { //Calculate necessary block/grid dimensions const int cpuThreshold = 1; const int desBlockSize = 128; //Desired block size dim3 blockSize = :: Min( size, desBlockSize ); dim3 gridSize = size / blockSize. x; unsigned int shmem = blockSize. x * sizeof( int ); cout << "Grid size: " << gridSize. x << endl << "Block size: " << blockSize. x << endl << "Shmem: " << shmem << endl; tnlCUDASimpleReductionKernel1< int, tnlMax ><<< gridSize, blockSize, shmem >>>( size, input, output ); int sizeReduced = gridSize. x; while( sizeReduced > cpuThreshold ) { cout << "Reducing with size reduced = " << sizeReduced << endl; blockSize. x = :: Min( sizeReduced, desBlockSize ); gridSize. x = sizeReduced / blockSize. x; shmem = blockSize. x * sizeof(int); tnlCUDASimpleReductionKernel1< int, tnlMax ><<< gridSize, blockSize, shmem >>>( size, input, output ); sizeReduced = gridSize. x; } int* host_output = new int[ sizeReduced ]; cudaMemcpy( host_output, output, sizeReduced * sizeof(int), cudaMemcpyDeviceToHost ); int result = host_output[ 0 ]; for( int i = 1;i < sizeReduced; i++ ) result = :: Max( result, host_output[ i ] ); delete[] host_output; return result; } int tnlCUDASimpleReduction1Sum( const int size, const int block_size, const int grid_size, const int* input, int* output ) { //Calculate necessary block/grid dimensions const int cpuThreshold = 1; const int desBlockSize = 128; //Desired block size dim3 blockSize = :: Min( size, desBlockSize ); dim3 gridSize = size / blockSize. x; unsigned int shmem = blockSize. x * sizeof( int ); cout << "Grid size: " << gridSize. x << endl << "Block size: " << blockSize. x << endl << "Shmem: " << shmem << endl; tnlCUDASimpleReductionKernel1< int, tnlSum ><<< gridSize, blockSize, shmem >>>( size, input, output ); int sizeReduced = gridSize. x; while( sizeReduced > cpuThreshold ) { cout << "Reducing with size reduced = " << sizeReduced << endl; blockSize. x = :: Min( sizeReduced, desBlockSize ); gridSize. x = sizeReduced / blockSize. x; shmem = blockSize. x * sizeof(int); tnlCUDASimpleReductionKernel1< int, tnlSum ><<< gridSize, blockSize, shmem >>>( size, input, output ); sizeReduced = gridSize. x; } int* host_output = new int[ sizeReduced ]; cudaMemcpy( host_output, output, sizeReduced * sizeof(int), cudaMemcpyDeviceToHost ); int result = host_output[ 0 ]; for( int i = 1;i < sizeReduced; i++ ) result += host_output[ i ]; delete[] host_output; return result; } /* float tnlCUDASimpleReduction1Min( const int size, const int block_size, const int grid_size, const float* input ) { return tnlCUDASimpleReduction1< float, tnlMin >( size, block_size, grid_size, input ); } float tnlCUDASimpleReduction1Max( const int size, const int block_size, const int grid_size, const float* input ) { return tnlCUDASimpleReduction1< float, tnlMax >( size, block_size, grid_size, input ); } float tnlCUDASimpleReduction1Sum( const int size, const int block_size, const int grid_size, const float* input ) { return tnlCUDASimpleReduction1< float, tnlSum >( size, block_size, grid_size, input ); } double tnlCUDASimpleReduction1Min( const int size, const int block_size, const int grid_size, const double* input ) { return tnlCUDASimpleReduction1< double, tnlMin >( size, block_size, grid_size, input ); } double tnlCUDASimpleReduction1Max( const int size, const int block_size, const int grid_size, const double* input ) { return tnlCUDASimpleReduction1< double, tnlMax >( size, block_size, grid_size, input ); } double tnlCUDASimpleReduction1Sum( const int size, const int block_size, const int grid_size, const double* input ) { return tnlCUDASimpleReduction1< double, tnlSum >( size, block_size, grid_size, input ); }*/ No newline at end of file src/core/tnl-cuda-kernels.h +87 −0 Original line number Diff line number Diff line Loading @@ -235,6 +235,93 @@ T tnlCUDAReduction( const int size, return result; } template < class T, tnlOperation operation > __global__ void tnlCUDASimpleReductionKernel1( const int size, const T* d_input, T* d_output ) { extern __shared__ int sdata[]; // Read data into the shared memory int tid = threadIdx. x; int gid = blockIdx. x * blockDim. x + threadIdx. x; // Last thread ID which manipulates meaningful data int last_tid = size - blockIdx. x * blockDim. x; if( gid < size ) sdata[tid] = d_input[gid]; __syncthreads(); // Parallel reduction for( int s = 1; s < blockDim. x; s *= 2 ) { if( ( tid % ( 2 * s ) ) == 0 && tid + s < last_tid ) { if( operation == tnlMin ) sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + s ] ); if( operation == tnlMax ) sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + s ] ); if( operation == tnlSum ) sdata[ tid ] += sdata[ tid + s ]; } __syncthreads(); } // Store the result back in global memory if( tid == 0 ) d_output[ blockIdx. x ] = sdata[ 0 ]; } template< class T, tnlOperation operation > T tnlCUDASimpleReduction1( const int size, const int block_size, const int grid_size, const T* d_input ) { T result; dim3 blockSize( block_size ); dim3 gridSize( grid_size ); int shmem = 512 * sizeof( T ); tnlAssert( shmem < 16384, cerr << shmem << " bytes are required." ); switch( block_size ) { case 512: tnlCUDASimpleReductionKernel1< T, operation, 512 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result ); break; case 256: tnlCUDASimpleReductionKernel1< T, operation, 256 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result ); break; case 128: tnlCUDASimpleReductionKernel1< T, operation, 128 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result ); break; case 64: tnlCUDASimpleReductionKernel1< T, operation, 64 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result ); break; case 32: tnlCUDASimpleReductionKernel1< T, operation, 32 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result ); break; case 16: tnlCUDASimpleReductionKernel1< T, operation, 16 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result ); break; case 8: tnlCUDASimpleReductionKernel1< T, operation, 8 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result ); break; case 4: tnlCUDAReductionKernel< T, operation, 4 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result ); break; case 2: tnlCUDAReductionKernel< T, operation, 2 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result ); break; case 1: tnlCUDAReductionKernel< T, operation, 1 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result ); break; default: tnlAssert( false, cerr << "Block size is " << block_size << " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." ); break; } return result; } #endif /* HAVE_CUDA */ #endif /* TNLCUDAKERNELS_H_ */ src/core/tnlCUDAKernelsTester.h +107 −20 Original line number Diff line number Diff line Loading @@ -67,6 +67,49 @@ double tnlCUDAReductionSum( const int size, const int grid_size, const double* input ); /* * Simple reduction 1 */ int tnlCUDASimpleReduction1Min( const int size, const int block_size, const int grid_size, const int* input, int* output ); int tnlCUDASimpleReduction1Max( const int size, const int block_size, const int grid_size, const int* input, int* output ); int tnlCUDASimpleReduction1Sum( const int size, const int block_size, const int grid_size, const int* input, int* output ); float tnlCUDASimpleReduction1Min( const int size, const int block_size, const int grid_size, const float* input ); float tnlCUDASimpleReduction1Max( const int size, const int block_size, const int grid_size, const float* input ); float tnlCUDASimpleReduction1Sum( const int size, const int block_size, const int grid_size, const float* input ); double tnlCUDASimpleReduction1Min( const int size, const int block_size, const int grid_size, const double* input ); double tnlCUDASimpleReduction1Max( const int size, const int block_size, const int grid_size, const double* input ); double tnlCUDASimpleReduction1Sum( const int size, const int block_size, const int grid_size, const double* input ); #endif Loading @@ -82,50 +125,94 @@ template< class T > class tnlCUDAKernelsTester : public CppUnit :: TestCase { CppUnit :: TestSuite* suiteOfTests = new CppUnit :: TestSuite( "tnlCUDAKernelsTester" ); CppUnit :: TestResult result; suiteOfTests -> addTest( new CppUnit :: TestCaller< tnlCUDAKernelsTester< T > >( "testSimpleReduction1", & tnlCUDAKernelsTester< T > :: testSimpleReduction1 ) ); suiteOfTests -> addTest( new CppUnit :: TestCaller< tnlCUDAKernelsTester< T > >( "testReduction", & tnlCUDAKernelsTester< T > :: testReduction ) & tnlCUDAKernelsTester< T > :: testFastReduction ) ); return suiteOfTests; }; void testReduction() bool testSetup( tnlLongVector< T >& host_input, tnlLongVector< T >& host_output, tnlLongVectorCUDA< T >& device_input, tnlLongVectorCUDA< T >& device_output, int size ) { /* * Test by Jan Vacata. */ int size = 100; int desBlockSize = 128; //Desired block size int desGridSize = 2048; //Impose limitation on grid size so that threads could perform sequential work tnlLongVector< T > host_input( size ); tnlLongVector< T > host_output( size ); tnlLongVectorCUDA< T > device_input( size ); tnlLongVectorCUDA< T > device_output( size ); if( ! host_input. SetNewSize( size ) ) return false; if( ! host_output. SetNewSize( size ) ) return false; if( ! device_input. SetNewSize( size ) ) return false; if( ! device_output. SetNewSize( size ) ) return false; for( int i=0; i < size; i ++ ) { host_input[ i ] = 1; host_input[ i ] = i + 1; host_output[ i ] = 0; } device_input. copyFrom( host_input ); return true; } void testReduction( int algorithm_efficiency = 0 ) { int size = 1<<16; int desBlockSize = 128; //Desired block size int desGridSize = 2048; //Impose limitation on grid size so that threads could perform sequential work tnlLongVector< T > host_input, host_output; tnlLongVectorCUDA< T > device_input, device_output; CPPUNIT_ASSERT( testSetup( host_input, host_output, device_input, device_output, size ) ); //Calculate necessary block/grid dimensions int block_size = :: Min( size/2, desBlockSize ); //Grid size is limited in this case int grid_size = :: Min( desGridSize, size / block_size / 2 ); T min = tnlCUDAReductionMin( size, block_size, grid_size, device_input. Data() ); T max = tnlCUDAReductionMax( size, block_size, grid_size, device_input. Data() ); T sum = tnlCUDAReductionSum( size, block_size, grid_size, device_input. Data() ); T min, max, sum; switch( algorithm_efficiency ) { case 1: min = tnlCUDASimpleReduction1Min( size, block_size, grid_size, device_input. Data(), device_output. Data() ); max = tnlCUDASimpleReduction1Max( size, block_size, grid_size, device_input. Data(), device_output. Data() ); sum = tnlCUDASimpleReduction1Sum( size, block_size, grid_size, device_input. Data(), device_output. Data() ); break; default: min = tnlCUDAReductionMin( size, block_size, grid_size, device_input. Data() ); max = tnlCUDAReductionMax( size, block_size, grid_size, device_input. Data() ); sum = tnlCUDAReductionSum( size, block_size, grid_size, device_input. Data() ); } cout << "Min: " << min << "Max: " << max cout << "Min: " << min << endl << "Max: " << max << endl << "Sum: " << sum << endl; }; void testSimpleReduction1() { testReduction( 1 ); }; void testFastReduction() { testReduction( 0 ); } }; Loading src/tnl-unit-tests.cpp +2 −2 Original line number Diff line number Diff line Loading @@ -44,8 +44,8 @@ int main( int argc, char* argv[] ) runner.addTest( tnlGridCUDA2DTester< double > :: suite() ); runner.addTest( tnlCUDAKernelsTester< int > :: suite() ); runner.addTest( tnlCUDAKernelsTester< float > :: suite() ); runner.addTest( tnlCUDAKernelsTester< double > :: suite() ); //runner.addTest( tnlCUDAKernelsTester< float > :: suite() ); //runner.addTest( tnlCUDAKernelsTester< double > :: suite() ); runner.run(); return 0; Loading Loading
src/core/tnl-cuda-kernels.cu +161 −1 Original line number Diff line number Diff line Loading @@ -15,7 +15,11 @@ * * ***************************************************************************/ #include <tnl-cuda-kernels.h> #include <iostream> #include <core/mfuncs.h> #include <core/tnl-cuda-kernels.h> using namespace std; int tnlCUDAReductionMin( const int size, const int block_size, Loading Loading @@ -90,4 +94,160 @@ double tnlCUDAReductionSum( const int size, return tnlCUDAReduction< double, tnlSum >( size, block_size, grid_size, input ); } /* * Simple redcution 1 */ int tnlCUDASimpleReduction1Min( const int size, const int block_size, const int grid_size, const int* input, int* output ) { //Calculate necessary block/grid dimensions const int cpuThreshold = 1; const int desBlockSize = 128; //Desired block size dim3 blockSize = :: Min( size, desBlockSize ); dim3 gridSize = size / blockSize. x; unsigned int shmem = blockSize. x * sizeof( int ); cout << "Grid size: " << gridSize. x << endl << "Block size: " << blockSize. x << endl << "Shmem: " << shmem << endl; tnlCUDASimpleReductionKernel1< int, tnlMin ><<< gridSize, blockSize, shmem >>>( size, input, output ); int sizeReduced = gridSize. x; while( sizeReduced > cpuThreshold ) { cout << "Reducing with size reduced = " << sizeReduced << endl; blockSize. x = :: Min( sizeReduced, desBlockSize ); gridSize. x = sizeReduced / blockSize. x; shmem = blockSize. x * sizeof(int); tnlCUDASimpleReductionKernel1< int, tnlMin ><<< gridSize, blockSize, shmem >>>( size, input, output ); sizeReduced = gridSize. x; } int* host_output = new int[ sizeReduced ]; cudaMemcpy( host_output, output, sizeReduced * sizeof(int), cudaMemcpyDeviceToHost ); int result = host_output[ 0 ]; for( int i = 1;i < sizeReduced; i++ ) result = :: Min( result, host_output[ i ] ); delete[] host_output; return result; } int tnlCUDASimpleReduction1Max( const int size, const int block_size, const int grid_size, const int* input, int* output ) { //Calculate necessary block/grid dimensions const int cpuThreshold = 1; const int desBlockSize = 128; //Desired block size dim3 blockSize = :: Min( size, desBlockSize ); dim3 gridSize = size / blockSize. x; unsigned int shmem = blockSize. x * sizeof( int ); cout << "Grid size: " << gridSize. x << endl << "Block size: " << blockSize. x << endl << "Shmem: " << shmem << endl; tnlCUDASimpleReductionKernel1< int, tnlMax ><<< gridSize, blockSize, shmem >>>( size, input, output ); int sizeReduced = gridSize. x; while( sizeReduced > cpuThreshold ) { cout << "Reducing with size reduced = " << sizeReduced << endl; blockSize. x = :: Min( sizeReduced, desBlockSize ); gridSize. x = sizeReduced / blockSize. x; shmem = blockSize. x * sizeof(int); tnlCUDASimpleReductionKernel1< int, tnlMax ><<< gridSize, blockSize, shmem >>>( size, input, output ); sizeReduced = gridSize. x; } int* host_output = new int[ sizeReduced ]; cudaMemcpy( host_output, output, sizeReduced * sizeof(int), cudaMemcpyDeviceToHost ); int result = host_output[ 0 ]; for( int i = 1;i < sizeReduced; i++ ) result = :: Max( result, host_output[ i ] ); delete[] host_output; return result; } int tnlCUDASimpleReduction1Sum( const int size, const int block_size, const int grid_size, const int* input, int* output ) { //Calculate necessary block/grid dimensions const int cpuThreshold = 1; const int desBlockSize = 128; //Desired block size dim3 blockSize = :: Min( size, desBlockSize ); dim3 gridSize = size / blockSize. x; unsigned int shmem = blockSize. x * sizeof( int ); cout << "Grid size: " << gridSize. x << endl << "Block size: " << blockSize. x << endl << "Shmem: " << shmem << endl; tnlCUDASimpleReductionKernel1< int, tnlSum ><<< gridSize, blockSize, shmem >>>( size, input, output ); int sizeReduced = gridSize. x; while( sizeReduced > cpuThreshold ) { cout << "Reducing with size reduced = " << sizeReduced << endl; blockSize. x = :: Min( sizeReduced, desBlockSize ); gridSize. x = sizeReduced / blockSize. x; shmem = blockSize. x * sizeof(int); tnlCUDASimpleReductionKernel1< int, tnlSum ><<< gridSize, blockSize, shmem >>>( size, input, output ); sizeReduced = gridSize. x; } int* host_output = new int[ sizeReduced ]; cudaMemcpy( host_output, output, sizeReduced * sizeof(int), cudaMemcpyDeviceToHost ); int result = host_output[ 0 ]; for( int i = 1;i < sizeReduced; i++ ) result += host_output[ i ]; delete[] host_output; return result; } /* float tnlCUDASimpleReduction1Min( const int size, const int block_size, const int grid_size, const float* input ) { return tnlCUDASimpleReduction1< float, tnlMin >( size, block_size, grid_size, input ); } float tnlCUDASimpleReduction1Max( const int size, const int block_size, const int grid_size, const float* input ) { return tnlCUDASimpleReduction1< float, tnlMax >( size, block_size, grid_size, input ); } float tnlCUDASimpleReduction1Sum( const int size, const int block_size, const int grid_size, const float* input ) { return tnlCUDASimpleReduction1< float, tnlSum >( size, block_size, grid_size, input ); } double tnlCUDASimpleReduction1Min( const int size, const int block_size, const int grid_size, const double* input ) { return tnlCUDASimpleReduction1< double, tnlMin >( size, block_size, grid_size, input ); } double tnlCUDASimpleReduction1Max( const int size, const int block_size, const int grid_size, const double* input ) { return tnlCUDASimpleReduction1< double, tnlMax >( size, block_size, grid_size, input ); } double tnlCUDASimpleReduction1Sum( const int size, const int block_size, const int grid_size, const double* input ) { return tnlCUDASimpleReduction1< double, tnlSum >( size, block_size, grid_size, input ); }*/ No newline at end of file
src/core/tnl-cuda-kernels.h +87 −0 Original line number Diff line number Diff line Loading @@ -235,6 +235,93 @@ T tnlCUDAReduction( const int size, return result; } template < class T, tnlOperation operation > __global__ void tnlCUDASimpleReductionKernel1( const int size, const T* d_input, T* d_output ) { extern __shared__ int sdata[]; // Read data into the shared memory int tid = threadIdx. x; int gid = blockIdx. x * blockDim. x + threadIdx. x; // Last thread ID which manipulates meaningful data int last_tid = size - blockIdx. x * blockDim. x; if( gid < size ) sdata[tid] = d_input[gid]; __syncthreads(); // Parallel reduction for( int s = 1; s < blockDim. x; s *= 2 ) { if( ( tid % ( 2 * s ) ) == 0 && tid + s < last_tid ) { if( operation == tnlMin ) sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + s ] ); if( operation == tnlMax ) sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + s ] ); if( operation == tnlSum ) sdata[ tid ] += sdata[ tid + s ]; } __syncthreads(); } // Store the result back in global memory if( tid == 0 ) d_output[ blockIdx. x ] = sdata[ 0 ]; } template< class T, tnlOperation operation > T tnlCUDASimpleReduction1( const int size, const int block_size, const int grid_size, const T* d_input ) { T result; dim3 blockSize( block_size ); dim3 gridSize( grid_size ); int shmem = 512 * sizeof( T ); tnlAssert( shmem < 16384, cerr << shmem << " bytes are required." ); switch( block_size ) { case 512: tnlCUDASimpleReductionKernel1< T, operation, 512 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result ); break; case 256: tnlCUDASimpleReductionKernel1< T, operation, 256 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result ); break; case 128: tnlCUDASimpleReductionKernel1< T, operation, 128 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result ); break; case 64: tnlCUDASimpleReductionKernel1< T, operation, 64 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result ); break; case 32: tnlCUDASimpleReductionKernel1< T, operation, 32 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result ); break; case 16: tnlCUDASimpleReductionKernel1< T, operation, 16 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result ); break; case 8: tnlCUDASimpleReductionKernel1< T, operation, 8 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result ); break; case 4: tnlCUDAReductionKernel< T, operation, 4 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result ); break; case 2: tnlCUDAReductionKernel< T, operation, 2 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result ); break; case 1: tnlCUDAReductionKernel< T, operation, 1 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result ); break; default: tnlAssert( false, cerr << "Block size is " << block_size << " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." ); break; } return result; } #endif /* HAVE_CUDA */ #endif /* TNLCUDAKERNELS_H_ */
src/core/tnlCUDAKernelsTester.h +107 −20 Original line number Diff line number Diff line Loading @@ -67,6 +67,49 @@ double tnlCUDAReductionSum( const int size, const int grid_size, const double* input ); /* * Simple reduction 1 */ int tnlCUDASimpleReduction1Min( const int size, const int block_size, const int grid_size, const int* input, int* output ); int tnlCUDASimpleReduction1Max( const int size, const int block_size, const int grid_size, const int* input, int* output ); int tnlCUDASimpleReduction1Sum( const int size, const int block_size, const int grid_size, const int* input, int* output ); float tnlCUDASimpleReduction1Min( const int size, const int block_size, const int grid_size, const float* input ); float tnlCUDASimpleReduction1Max( const int size, const int block_size, const int grid_size, const float* input ); float tnlCUDASimpleReduction1Sum( const int size, const int block_size, const int grid_size, const float* input ); double tnlCUDASimpleReduction1Min( const int size, const int block_size, const int grid_size, const double* input ); double tnlCUDASimpleReduction1Max( const int size, const int block_size, const int grid_size, const double* input ); double tnlCUDASimpleReduction1Sum( const int size, const int block_size, const int grid_size, const double* input ); #endif Loading @@ -82,50 +125,94 @@ template< class T > class tnlCUDAKernelsTester : public CppUnit :: TestCase { CppUnit :: TestSuite* suiteOfTests = new CppUnit :: TestSuite( "tnlCUDAKernelsTester" ); CppUnit :: TestResult result; suiteOfTests -> addTest( new CppUnit :: TestCaller< tnlCUDAKernelsTester< T > >( "testSimpleReduction1", & tnlCUDAKernelsTester< T > :: testSimpleReduction1 ) ); suiteOfTests -> addTest( new CppUnit :: TestCaller< tnlCUDAKernelsTester< T > >( "testReduction", & tnlCUDAKernelsTester< T > :: testReduction ) & tnlCUDAKernelsTester< T > :: testFastReduction ) ); return suiteOfTests; }; void testReduction() bool testSetup( tnlLongVector< T >& host_input, tnlLongVector< T >& host_output, tnlLongVectorCUDA< T >& device_input, tnlLongVectorCUDA< T >& device_output, int size ) { /* * Test by Jan Vacata. */ int size = 100; int desBlockSize = 128; //Desired block size int desGridSize = 2048; //Impose limitation on grid size so that threads could perform sequential work tnlLongVector< T > host_input( size ); tnlLongVector< T > host_output( size ); tnlLongVectorCUDA< T > device_input( size ); tnlLongVectorCUDA< T > device_output( size ); if( ! host_input. SetNewSize( size ) ) return false; if( ! host_output. SetNewSize( size ) ) return false; if( ! device_input. SetNewSize( size ) ) return false; if( ! device_output. SetNewSize( size ) ) return false; for( int i=0; i < size; i ++ ) { host_input[ i ] = 1; host_input[ i ] = i + 1; host_output[ i ] = 0; } device_input. copyFrom( host_input ); return true; } void testReduction( int algorithm_efficiency = 0 ) { int size = 1<<16; int desBlockSize = 128; //Desired block size int desGridSize = 2048; //Impose limitation on grid size so that threads could perform sequential work tnlLongVector< T > host_input, host_output; tnlLongVectorCUDA< T > device_input, device_output; CPPUNIT_ASSERT( testSetup( host_input, host_output, device_input, device_output, size ) ); //Calculate necessary block/grid dimensions int block_size = :: Min( size/2, desBlockSize ); //Grid size is limited in this case int grid_size = :: Min( desGridSize, size / block_size / 2 ); T min = tnlCUDAReductionMin( size, block_size, grid_size, device_input. Data() ); T max = tnlCUDAReductionMax( size, block_size, grid_size, device_input. Data() ); T sum = tnlCUDAReductionSum( size, block_size, grid_size, device_input. Data() ); T min, max, sum; switch( algorithm_efficiency ) { case 1: min = tnlCUDASimpleReduction1Min( size, block_size, grid_size, device_input. Data(), device_output. Data() ); max = tnlCUDASimpleReduction1Max( size, block_size, grid_size, device_input. Data(), device_output. Data() ); sum = tnlCUDASimpleReduction1Sum( size, block_size, grid_size, device_input. Data(), device_output. Data() ); break; default: min = tnlCUDAReductionMin( size, block_size, grid_size, device_input. Data() ); max = tnlCUDAReductionMax( size, block_size, grid_size, device_input. Data() ); sum = tnlCUDAReductionSum( size, block_size, grid_size, device_input. Data() ); } cout << "Min: " << min << "Max: " << max cout << "Min: " << min << endl << "Max: " << max << endl << "Sum: " << sum << endl; }; void testSimpleReduction1() { testReduction( 1 ); }; void testFastReduction() { testReduction( 0 ); } }; Loading
src/tnl-unit-tests.cpp +2 −2 Original line number Diff line number Diff line Loading @@ -44,8 +44,8 @@ int main( int argc, char* argv[] ) runner.addTest( tnlGridCUDA2DTester< double > :: suite() ); runner.addTest( tnlCUDAKernelsTester< int > :: suite() ); runner.addTest( tnlCUDAKernelsTester< float > :: suite() ); runner.addTest( tnlCUDAKernelsTester< double > :: suite() ); //runner.addTest( tnlCUDAKernelsTester< float > :: suite() ); //runner.addTest( tnlCUDAKernelsTester< double > :: suite() ); runner.run(); return 0; Loading