Loading configure.ac +1 −1 Original line number Diff line number Diff line Loading @@ -44,7 +44,7 @@ then STORE_CPPFLAGS=$CPPFLAGS STORE_LDFLAGS=$LDFLAGS CUDA_LDFLAGS="-L$CUDA_LIBS" CUDA_CXXFLAGS="-I$CUDA_HEADERS -DHAVE_CUDA -arch sm_13" CUDA_CXXFLAGS="-I$CUDA_HEADERS -DHAVE_CUDA" CXXFLAGS="$CXXFLAGS -I$CUDA_HEADERS" CPPFLAGS="$CPPFLAGS -I$CUDA_HEADERS" LDFLAGS="$LDFLAGS $CUDA_LDFLAGS" Loading src/Makefile.am +2 −0 Original line number Diff line number Diff line Loading @@ -61,4 +61,6 @@ tnl_unit_tests_dbg_LDADD = libtnl-dbg-0.1.la \ core/libcore-tests-dbg.la endif TESTS = tnl-unit-tests No newline at end of file src/core/tnl-cuda-kernels.cu +1 −27 Original line number Diff line number Diff line Loading @@ -104,33 +104,7 @@ int tnlCUDASimpleReduction1Min( const int size, const int* input, int* output ) { //Calculate necessary block/grid dimensions const int cpuThreshold = 1; const int desBlockSize = 128; //Desired block size dim3 blockSize = :: Min( size, desBlockSize ); dim3 gridSize = size / blockSize. x; unsigned int shmem = blockSize. x * sizeof( int ); cout << "Grid size: " << gridSize. x << endl << "Block size: " << blockSize. x << endl << "Shmem: " << shmem << endl; tnlCUDASimpleReductionKernel1< int, tnlMin ><<< gridSize, blockSize, shmem >>>( size, input, output ); int sizeReduced = gridSize. x; while( sizeReduced > cpuThreshold ) { cout << "Reducing with size reduced = " << sizeReduced << endl; blockSize. x = :: Min( sizeReduced, desBlockSize ); gridSize. x = sizeReduced / blockSize. x; shmem = blockSize. x * sizeof(int); tnlCUDASimpleReductionKernel1< int, tnlMin ><<< gridSize, blockSize, shmem >>>( size, input, output ); sizeReduced = gridSize. x; } int* host_output = new int[ sizeReduced ]; cudaMemcpy( host_output, output, sizeReduced * sizeof(int), cudaMemcpyDeviceToHost ); int result = host_output[ 0 ]; for( int i = 1;i < sizeReduced; i++ ) result = :: Min( result, host_output[ i ] ); delete[] host_output; return result; } int tnlCUDASimpleReduction1Max( const int size, Loading src/core/tnl-cuda-kernels.h +41 −45 Original line number Diff line number Diff line Loading @@ -273,52 +273,48 @@ __global__ void tnlCUDASimpleReductionKernel1( const int size, template< class T, tnlOperation operation > T tnlCUDASimpleReduction1( const int size, const int block_size, const int grid_size, const T* d_input ) const T* d_input, T* output = 0 ) { T result; dim3 blockSize( block_size ); dim3 gridSize( grid_size ); int shmem = 512 * sizeof( T ); tnlAssert( shmem < 16384, cerr << shmem << " bytes are required." ); switch( block_size ) //Calculate necessary block/grid dimensions const int cpuThreshold = 1; const int desBlockSize = 128; //Desired block size dim3 block_size = :: Min( size, desBlockSize ); dim3 grid_size = size / blockSize. x; unsigned int shmem = blockSize. x * sizeof( int ); cout << "Grid size: " << grid_size. x << endl << "Block size: " << block_size. x << endl << "Shmem: " << shmem << endl; if( ! output ) { case 512: tnlCUDASimpleReductionKernel1< T, operation, 512 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result ); break; case 256: tnlCUDASimpleReductionKernel1< T, operation, 256 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result ); break; case 128: tnlCUDASimpleReductionKernel1< T, operation, 128 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result ); break; case 64: tnlCUDASimpleReductionKernel1< T, operation, 64 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result ); break; case 32: tnlCUDASimpleReductionKernel1< T, operation, 32 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result ); break; case 16: tnlCUDASimpleReductionKernel1< T, operation, 16 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result ); break; case 8: tnlCUDASimpleReductionKernel1< T, operation, 8 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result ); break; case 4: tnlCUDAReductionKernel< T, operation, 4 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result ); break; case 2: tnlCUDAReductionKernel< T, operation, 2 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result ); break; case 1: tnlCUDAReductionKernel< T, operation, 1 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result ); break; default: tnlAssert( false, cerr << "Block size is " << block_size << " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." ); break; cudaMalloc( ( void** ) &output, grid_size * sizeof( T ) ); if( cuda // TODO: add allocation check } tnlCUDASimpleReductionKernel1< T, operation ><<< grid_size, block_size, shmem >>>( size, input, output ); int size_reduced = grid_size. x; while( sizeReduced > cpuThreshold ) { cout << "Reducing with size reduced = " << size_reduced << endl; block_size. x = :: Min( size_reduced, desBlockSize ); grid_size. x = size_reduced / block_size. x; shmem = block_size. x * sizeof(int); tnlCUDASimpleReductionKernel1< T, operation ><<< grid_size, block_size, shmem >>>( size, input, output ); size_reduced = grid_size. x; } int* host_output = new int[ size_reduced ]; cudaMemcpy( host_output, output, sizeReduced * sizeof(int), cudaMemcpyDeviceToHost ); int result = host_output[ 0 ]; for( int i = 1;i < size_reduced; i++ ) { if( operation == tnlMin) result = :: Min( result, host_output[ i ] ); if( operation == tnlMax ) result = :: Max( result, host_output[ i ] ); if( operation == tnlSum ) result += host_ouput[ i ]; } delete[] host_output; return result; } Loading src/core/tnlLongVectorCUDATester.cu.h +10 −8 Original line number Diff line number Diff line Loading @@ -15,6 +15,15 @@ __global__ void setMultiBlockNumber( const T c, T* A, const int size ) if( i < size ) A[ i ] = c; }; template< class T > __global__ void setNumber( const T c, T* A, const int size ) { int i = threadIdx. x; if( i < size ) A[ i ] = c; }; template< class T > void testMultiBlockKernel( const T& number, const int size ) { Loading @@ -37,20 +46,13 @@ void testMultiBlockKernel( const T& number, const int size ) CPPUNIT_ASSERT( ! errors ); }; template< class T > __global__ void setNumber( T* A, const T c ) { int i = threadIdx. x; A[ i ] = c; }; template< class T > void testKernel( const T& number, const int size ) { tnlLongVectorCUDA< T > device_vector( size ); tnlLongVector< T > host_vector( size ); T* data = device_vector. Data(); setNumber<<< 1, size >>>( data, number ); setNumber<<< 1, size >>>( number, data, size ); host_vector. copyFrom( device_vector ); int errors( 0 ); Loading Loading
configure.ac +1 −1 Original line number Diff line number Diff line Loading @@ -44,7 +44,7 @@ then STORE_CPPFLAGS=$CPPFLAGS STORE_LDFLAGS=$LDFLAGS CUDA_LDFLAGS="-L$CUDA_LIBS" CUDA_CXXFLAGS="-I$CUDA_HEADERS -DHAVE_CUDA -arch sm_13" CUDA_CXXFLAGS="-I$CUDA_HEADERS -DHAVE_CUDA" CXXFLAGS="$CXXFLAGS -I$CUDA_HEADERS" CPPFLAGS="$CPPFLAGS -I$CUDA_HEADERS" LDFLAGS="$LDFLAGS $CUDA_LDFLAGS" Loading
src/Makefile.am +2 −0 Original line number Diff line number Diff line Loading @@ -61,4 +61,6 @@ tnl_unit_tests_dbg_LDADD = libtnl-dbg-0.1.la \ core/libcore-tests-dbg.la endif TESTS = tnl-unit-tests No newline at end of file
src/core/tnl-cuda-kernels.cu +1 −27 Original line number Diff line number Diff line Loading @@ -104,33 +104,7 @@ int tnlCUDASimpleReduction1Min( const int size, const int* input, int* output ) { //Calculate necessary block/grid dimensions const int cpuThreshold = 1; const int desBlockSize = 128; //Desired block size dim3 blockSize = :: Min( size, desBlockSize ); dim3 gridSize = size / blockSize. x; unsigned int shmem = blockSize. x * sizeof( int ); cout << "Grid size: " << gridSize. x << endl << "Block size: " << blockSize. x << endl << "Shmem: " << shmem << endl; tnlCUDASimpleReductionKernel1< int, tnlMin ><<< gridSize, blockSize, shmem >>>( size, input, output ); int sizeReduced = gridSize. x; while( sizeReduced > cpuThreshold ) { cout << "Reducing with size reduced = " << sizeReduced << endl; blockSize. x = :: Min( sizeReduced, desBlockSize ); gridSize. x = sizeReduced / blockSize. x; shmem = blockSize. x * sizeof(int); tnlCUDASimpleReductionKernel1< int, tnlMin ><<< gridSize, blockSize, shmem >>>( size, input, output ); sizeReduced = gridSize. x; } int* host_output = new int[ sizeReduced ]; cudaMemcpy( host_output, output, sizeReduced * sizeof(int), cudaMemcpyDeviceToHost ); int result = host_output[ 0 ]; for( int i = 1;i < sizeReduced; i++ ) result = :: Min( result, host_output[ i ] ); delete[] host_output; return result; } int tnlCUDASimpleReduction1Max( const int size, Loading
src/core/tnl-cuda-kernels.h +41 −45 Original line number Diff line number Diff line Loading @@ -273,52 +273,48 @@ __global__ void tnlCUDASimpleReductionKernel1( const int size, template< class T, tnlOperation operation > T tnlCUDASimpleReduction1( const int size, const int block_size, const int grid_size, const T* d_input ) const T* d_input, T* output = 0 ) { T result; dim3 blockSize( block_size ); dim3 gridSize( grid_size ); int shmem = 512 * sizeof( T ); tnlAssert( shmem < 16384, cerr << shmem << " bytes are required." ); switch( block_size ) //Calculate necessary block/grid dimensions const int cpuThreshold = 1; const int desBlockSize = 128; //Desired block size dim3 block_size = :: Min( size, desBlockSize ); dim3 grid_size = size / blockSize. x; unsigned int shmem = blockSize. x * sizeof( int ); cout << "Grid size: " << grid_size. x << endl << "Block size: " << block_size. x << endl << "Shmem: " << shmem << endl; if( ! output ) { case 512: tnlCUDASimpleReductionKernel1< T, operation, 512 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result ); break; case 256: tnlCUDASimpleReductionKernel1< T, operation, 256 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result ); break; case 128: tnlCUDASimpleReductionKernel1< T, operation, 128 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result ); break; case 64: tnlCUDASimpleReductionKernel1< T, operation, 64 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result ); break; case 32: tnlCUDASimpleReductionKernel1< T, operation, 32 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result ); break; case 16: tnlCUDASimpleReductionKernel1< T, operation, 16 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result ); break; case 8: tnlCUDASimpleReductionKernel1< T, operation, 8 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result ); break; case 4: tnlCUDAReductionKernel< T, operation, 4 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result ); break; case 2: tnlCUDAReductionKernel< T, operation, 2 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result ); break; case 1: tnlCUDAReductionKernel< T, operation, 1 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result ); break; default: tnlAssert( false, cerr << "Block size is " << block_size << " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." ); break; cudaMalloc( ( void** ) &output, grid_size * sizeof( T ) ); if( cuda // TODO: add allocation check } tnlCUDASimpleReductionKernel1< T, operation ><<< grid_size, block_size, shmem >>>( size, input, output ); int size_reduced = grid_size. x; while( sizeReduced > cpuThreshold ) { cout << "Reducing with size reduced = " << size_reduced << endl; block_size. x = :: Min( size_reduced, desBlockSize ); grid_size. x = size_reduced / block_size. x; shmem = block_size. x * sizeof(int); tnlCUDASimpleReductionKernel1< T, operation ><<< grid_size, block_size, shmem >>>( size, input, output ); size_reduced = grid_size. x; } int* host_output = new int[ size_reduced ]; cudaMemcpy( host_output, output, sizeReduced * sizeof(int), cudaMemcpyDeviceToHost ); int result = host_output[ 0 ]; for( int i = 1;i < size_reduced; i++ ) { if( operation == tnlMin) result = :: Min( result, host_output[ i ] ); if( operation == tnlMax ) result = :: Max( result, host_output[ i ] ); if( operation == tnlSum ) result += host_ouput[ i ]; } delete[] host_output; return result; } Loading
src/core/tnlLongVectorCUDATester.cu.h +10 −8 Original line number Diff line number Diff line Loading @@ -15,6 +15,15 @@ __global__ void setMultiBlockNumber( const T c, T* A, const int size ) if( i < size ) A[ i ] = c; }; template< class T > __global__ void setNumber( const T c, T* A, const int size ) { int i = threadIdx. x; if( i < size ) A[ i ] = c; }; template< class T > void testMultiBlockKernel( const T& number, const int size ) { Loading @@ -37,20 +46,13 @@ void testMultiBlockKernel( const T& number, const int size ) CPPUNIT_ASSERT( ! errors ); }; template< class T > __global__ void setNumber( T* A, const T c ) { int i = threadIdx. x; A[ i ] = c; }; template< class T > void testKernel( const T& number, const int size ) { tnlLongVectorCUDA< T > device_vector( size ); tnlLongVector< T > host_vector( size ); T* data = device_vector. Data(); setNumber<<< 1, size >>>( data, number ); setNumber<<< 1, size >>>( number, data, size ); host_vector. copyFrom( device_vector ); int errors( 0 ); Loading