Loading src/TNL/Containers/Algorithms/CMakeLists.txt +5 −1 Original line number Diff line number Diff line Loading @@ -6,7 +6,11 @@ set( headers cuda-prefix-sum.h CudaReduction.h CudaReduction_impl.h CudaReductionBuffer.h CublasWrapper.h ) CublasWrapper.h CudaMultireductionKernel.h Multireduction.h Multireduction_impl.h ) SET( CURRENT_DIR ${CMAKE_SOURCE_DIR}/src/TNL/Containers/Algorithms ) IF( BUILD_CUDA ) Loading src/TNL/Containers/Algorithms/CudaMultireductionKernel.h 0 → 100644 +155 −0 Original line number Diff line number Diff line #pragma once namespace TNL { namespace Containers { namespace Algorithms { #ifdef HAVE_CUDA /**** * The performance of this kernel is very sensitive to register usage. * Compile with --ptxas-options=-v and configure these constants for given * architecture so that there are no local memory spills. */ static constexpr int Multireduction_maxThreadsPerBlock = 256; // must be a power of 2 #if (__CUDA_ARCH__ >= 300 ) static constexpr int Multireduction_minBlocksPerMultiprocessor = 6; #else static constexpr int Multireduction_minBlocksPerMultiprocessor = 4; #endif template< typename Operation, int blockSizeX > __global__ void __launch_bounds__( Multireduction_maxThreadsPerBlock, Multireduction_minBlocksPerMultiprocessor ) CudaMultireductionKernel( Operation& operation, const typename Operation::IndexType n, const typename Operation::IndexType size, const typename Operation::RealType* input1, const typename Operation::IndexType ldInput1, const typename Operation::RealType* input2, typename Operation::ResultType* output ) { typedef typename Operation::IndexType IndexType; typedef typename Operation::ResultType ResultType; extern __shared__ __align__ ( 8 ) char __sdata[]; ResultType* sdata = reinterpret_cast< ResultType* >( __sdata ); /*** * Get thread id (tid) and global element id (gid). * gridSizeX is the number of elements in the direction of x-axis * processed by all blocks at the same time. */ const IndexType tid = threadIdx.y * blockDim.x + threadIdx.x; IndexType gid = blockIdx.x * blockDim.x + threadIdx.x; const IndexType gridSizeX = blockDim.x * gridDim.x; /*** * Shift input1 and output pointers. */ const IndexType y = blockIdx.y * blockDim.y + threadIdx.y; if( y < n ) { input1 += y * ldInput1; output += y * gridDim.x; } else return; /*** * Start with the sequential reduction and push the * result into the shared memory. */ sdata[ tid ] = operation.initialValue(); while( gid + 4 * gridSizeX < size ) { operation.cudaFirstReduction( sdata[ tid ], gid, input1, input2 ); operation.cudaFirstReduction( sdata[ tid ], gid + gridSizeX, input1, input2 ); operation.cudaFirstReduction( sdata[ tid ], gid + 2 * gridSizeX, input1, input2 ); operation.cudaFirstReduction( sdata[ tid ], gid + 3 * gridSizeX, input1, input2 ); gid += 4 * gridSizeX; } while( gid + 2 * gridSizeX < size ) { operation.cudaFirstReduction( sdata[ tid ], gid, input1, input2 ); operation.cudaFirstReduction( sdata[ tid ], gid + gridSizeX, input1, input2 ); gid += 2 * gridSizeX; } while( gid < size ) { operation.cudaFirstReduction( sdata[ tid ], gid, input1, input2 ); gid += gridSizeX; } __syncthreads(); //printf( "1: tid %d data %f \n", tid, sdata[ tid ] ); /*** * Perform the parallel reduction. */ if( blockSizeX >= 1024 ) { if( threadIdx.x < 512 ) { operation.commonReductionOnDevice( sdata[ tid ], sdata[ tid + 512 ] ); } __syncthreads(); } if( blockSizeX >= 512 ) { if( threadIdx.x < 256 ) { operation.commonReductionOnDevice( sdata[ tid ], sdata[ tid + 256 ] ); } __syncthreads(); } if( blockSizeX >= 256 ) { if( threadIdx.x < 128 ) { operation.commonReductionOnDevice( sdata[ tid ], sdata[ tid + 128 ] ); } __syncthreads(); } if( blockSizeX >= 128 ) { if( threadIdx.x < 64 ) { operation.commonReductionOnDevice( sdata[ tid ], sdata[ tid + 64 ] ); } __syncthreads(); } /*** * This runs in one warp so it is synchronized implicitly. * * When the blockSizeX is less then or equal to the warp size, the shared memory * must be at least 2 * blockSizeX elements per block, otherwise unallocated memory * will be accessed !!! */ if( threadIdx.x < 32 ) { volatile ResultType* vsdata = sdata; if( blockSizeX >= 64 ) { operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 32 ] ); } if( blockSizeX >= 32 ) { operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 16 ] ); } if( blockSizeX >= 16 ) { operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 8 ] ); } if( blockSizeX >= 8 ) { operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 4 ] ); } if( blockSizeX >= 4 ) { operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 2 ] ); } if( blockSizeX >= 2 ) { operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 1 ] ); } } /*** * Store the result back in the global memory. */ if( threadIdx.x == 0 ) { output[ blockIdx.x ] = sdata[ tid ]; } } #endif } // namespace Algorithms } // namespace Containers } // namespace TNL src/TNL/Containers/Algorithms/Multireduction.h 0 → 100644 +27 −0 Original line number Diff line number Diff line #pragma once namespace TNL { namespace Containers { namespace Algorithms { template< typename Operation > bool multireductionOnCudaDevice( const Operation& operation, int n, const typename Operation::IndexType size, const typename Operation::RealType* deviceInput1, const typename Operation::RealType* deviceInput2, typename Operation::ResultType* hostResult ); template< typename Operation > bool multireductionOnHostDevice( const Operation& operation, int n, const typename Operation::IndexType size, const typename Operation::RealType* deviceInput1, const typename Operation::RealType* deviceInput2, typename Operation::ResultType* hostResult ); } // namespace Algorithms } // namespace Containers } // namespace TNL #include "Multireduction_impl.h" src/TNL/Containers/Algorithms/Multireduction_impl.h 0 → 100644 +315 −0 Original line number Diff line number Diff line #pragma once //#define CUDA_REDUCTION_PROFILING #ifdef HAVE_CUDA #include <cuda.h> #endif #include <TNL/Assert.h> #include <TNL/Containers/Algorithms/reduction-operations.h> #include <TNL/Containers/ArrayOperations.h> #include <TNL/Math.h> #include <TNL/Containers/Algorithms/CudaReductionBuffer.h> #include <TNL/Containers/Algorithms/CudaMultireductionKernel.h> #include <TNL/Devices/CudaDeviceInfo.h> #ifdef CUDA_REDUCTION_PROFILING #include <TNL/Timer.h> #include <iostream> #endif namespace TNL { namespace Containers { namespace Algorithms { /**** * Arrays smaller than the following constant are reduced on CPU. */ //static constexpr int Multireduction_minGpuDataSize = 16384;//65536; //16384;//1024;//256; // TODO: benchmarks with different values static constexpr int Multireduction_minGpuDataSize = 256;//65536; //16384;//1024;//256; #ifdef HAVE_CUDA template< typename Operation > typename Operation::IndexType multireduceOnCudaDevice( Operation& operation, int n, const typename Operation::IndexType size, const typename Operation::RealType* input1, const typename Operation::IndexType ldInput1, const typename Operation::RealType* input2, typename Operation::ResultType*& output ) { typedef typename Operation::IndexType IndexType; typedef typename Operation::RealType RealType; typedef typename Operation::ResultType ResultType; // The number of blocks should be a multiple of the number of multiprocessors // to ensure optimum balancing of the load. This is very important, because // we run the kernel with a fixed number of blocks, so the amount of work per // block increases with enlarging the problem, so even small imbalance can // cost us dearly. // On Tesla K40c, desGridSizeX = 4 * 6 * 15 = 360. // const IndexType desGridSizeX = 4 * Multireduction_minBlocksPerMultiprocessor // * Devices::CudaDeviceInfo::getCudaMultiprocessors( Devices::CudaDeviceInfo::getActiveDevice() ); // On Tesla K40c, desGridSizeX = 6 * 15 = 90. const IndexType desGridSizeX = Multireduction_minBlocksPerMultiprocessor * Devices::CudaDeviceInfo::getCudaMultiprocessors( Devices::CudaDeviceInfo::getActiveDevice() ); dim3 blockSize, gridSize; // version A: max 16 rows of threads blockSize.y = min( n, 16 ); // version B: up to 16 rows of threads, then "minimize" number of inactive rows // if( n <= 16 ) // blockSize.y = n; // else { // int r = (n - 1) % 16 + 1; // if( r > 12 ) // blockSize.y = 16; // else if( r > 8 ) // blockSize.y = 4; // else if( r > 4 ) // blockSize.y = 8; // else // blockSize.y = 4; // } // blockSize.x has to be a power of 2 blockSize.x = Multireduction_maxThreadsPerBlock; while( blockSize.x * blockSize.y > Multireduction_maxThreadsPerBlock ) blockSize.x /= 2; gridSize.x = min( Devices::Cuda::getNumberOfBlocks( size, blockSize.x ), desGridSizeX ); gridSize.y = Devices::Cuda::getNumberOfBlocks( n, blockSize.y ); if( gridSize.y > Devices::Cuda::getMaxGridSize() ) { std::cerr << "Maximum gridSize.y limit exceeded (limit is 65535, attempted " << gridSize.y << ")." << std::endl; throw 1; } // create reference to the reduction buffer singleton and set default size // (make an overestimate to avoid reallocation on every call if n is increased by 1 each time) const size_t buf_size = 8 * ( n / 8 + 1 ) * desGridSizeX * sizeof( ResultType ); CudaReductionBuffer & cudaReductionBuffer = CudaReductionBuffer::getInstance(); if( ! cudaReductionBuffer.setSize( buf_size ) ) throw 1; output = cudaReductionBuffer.template getData< ResultType >(); // when there is only one warp per blockSize.x, we need to allocate two warps // worth of shared memory so that we don't index shared memory out of bounds const IndexType shmem = (blockSize.x <= 32) ? 2 * blockSize.x * blockSize.y * sizeof( ResultType ) : blockSize.x * blockSize.y * sizeof( ResultType ); //cout << "Multireduction of " << n << " datasets, block size (" << blockSize.x << "," << blockSize.y << "), grid size (" << gridSize.x << "," << gridSize.y << "), shmem " << shmem << endl; /*** * Depending on the blockSize we generate appropriate template instance. */ switch( blockSize.x ) { case 512: CudaMultireductionKernel< Operation, 512 > <<< gridSize, blockSize, shmem >>>( operation, n, size, input1, ldInput1, input2, output); break; case 256: cudaFuncSetCacheConfig(CudaMultireductionKernel< Operation, 256 >, cudaFuncCachePreferShared); CudaMultireductionKernel< Operation, 256 > <<< gridSize, blockSize, shmem >>>( operation, n, size, input1, ldInput1, input2, output); break; case 128: cudaFuncSetCacheConfig(CudaMultireductionKernel< Operation, 128 >, cudaFuncCachePreferShared); CudaMultireductionKernel< Operation, 128 > <<< gridSize, blockSize, shmem >>>( operation, n, size, input1, ldInput1, input2, output); break; case 64: cudaFuncSetCacheConfig(CudaMultireductionKernel< Operation, 64 >, cudaFuncCachePreferShared); CudaMultireductionKernel< Operation, 64 > <<< gridSize, blockSize, shmem >>>( operation, n, size, input1, ldInput1, input2, output); break; case 32: cudaFuncSetCacheConfig(CudaMultireductionKernel< Operation, 32 >, cudaFuncCachePreferShared); CudaMultireductionKernel< Operation, 32 > <<< gridSize, blockSize, shmem >>>( operation, n, size, input1, ldInput1, input2, output); break; case 16: cudaFuncSetCacheConfig(CudaMultireductionKernel< Operation, 16 >, cudaFuncCachePreferShared); CudaMultireductionKernel< Operation, 16 > <<< gridSize, blockSize, shmem >>>( operation, n, size, input1, ldInput1, input2, output); break; case 8: cudaFuncSetCacheConfig(CudaMultireductionKernel< Operation, 8 >, cudaFuncCachePreferShared); CudaMultireductionKernel< Operation, 8 > <<< gridSize, blockSize, shmem >>>( operation, n, size, input1, ldInput1, input2, output); break; case 4: cudaFuncSetCacheConfig(CudaMultireductionKernel< Operation, 4 >, cudaFuncCachePreferShared); CudaMultireductionKernel< Operation, 4 > <<< gridSize, blockSize, shmem >>>( operation, n,size, input1, ldInput1, input2, output); break; case 2: cudaFuncSetCacheConfig(CudaMultireductionKernel< Operation, 2 >, cudaFuncCachePreferShared); CudaMultireductionKernel< Operation, 2 > <<< gridSize, blockSize, shmem >>>( operation, n, size, input1, ldInput1, input2, output); break; case 1: Assert( false, std::cerr << "blockSize should not be 1." << std::endl ); default: Assert( false, std::cerr << "Block size is " << blockSize.x << " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." << std::endl ); } checkCudaDevice; return gridSize.x; } #endif /* * Parameters: * operation: the operation used for reduction * n: number of datasets to be reduced * size: the size of each dataset * deviceInput1: input array of size = n * ldInput1 * ldInput1: leading dimension of the deviceInput1 array * deviceInput2: either nullptr or input array of size = size * hostResult: output array of size = n */ template< typename Operation > bool multireductionOnCudaDevice( Operation& operation, int n, const typename Operation::IndexType size, const typename Operation::RealType* deviceInput1, const typename Operation::IndexType ldInput1, const typename Operation::RealType* deviceInput2, typename Operation::ResultType* hostResult ) { #ifdef HAVE_CUDA Assert( n > 0, ); typedef typename Operation::IndexType IndexType; typedef typename Operation::RealType RealType; typedef typename Operation::ResultType ResultType; typedef typename Operation::LaterReductionOperation LaterReductionOperation; /*** * First check if the input array(s) is/are large enough for the multireduction on GPU. * Otherwise copy it/them to host and multireduce on CPU. */ if( n * ldInput1 < Multireduction_minGpuDataSize ) { RealType hostArray1[ Multireduction_minGpuDataSize ]; // FIXME: hostArray2 is left undefined if deviceInput2 is nullptr RealType hostArray2[ Multireduction_minGpuDataSize ]; if( ! ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory< RealType, RealType, IndexType >( hostArray1, deviceInput1, n * ldInput1 ) ) return false; if( deviceInput2 && ! ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory< RealType, RealType, IndexType >( hostArray2, deviceInput2, n * size ) ) return false; return multireductionOnHostDevice( operation, n, size, hostArray1, ldInput1, hostArray2, hostResult ); } #ifdef CUDA_REDUCTION_PROFILING Timer timer; timer.reset(); timer.start(); #endif /**** * Reduce the data on the CUDA device. */ ResultType* deviceAux1 = nullptr; const IndexType reducedSize = multireduceOnCudaDevice( operation, n, size, deviceInput1, ldInput1, deviceInput2, deviceAux1 ); #ifdef CUDA_REDUCTION_PROFILING timer.stop(); cout << " Multireduction of " << n << " datasets on GPU to size " << reducedSize << " took " << timer.getRealTime() << " sec. " << endl; timer.reset(); timer.start(); #endif /*** * Transfer the reduced data from device to host. */ ResultType resultArray[ n * reducedSize ]; if( ! ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory< ResultType, ResultType, IndexType >( resultArray, deviceAux1, n * reducedSize ) ) return false; #ifdef CUDA_REDUCTION_PROFILING timer.stop(); cout << " Transferring data to CPU took " << timer.getRealTime() << " sec. " << endl; timer.reset(); timer.start(); #endif // cout << "resultArray = ["; // for( int i = 0; i < n * reducedSize; i++ ) { // cout << resultArray[ i ]; // if( i < n * reducedSize - 1 ) // cout << ", "; // } // cout << "]" << endl; /*** * Reduce the data on the host system. */ LaterReductionOperation laterReductionOperation; multireductionOnHostDevice( laterReductionOperation, n, reducedSize, resultArray, reducedSize, (RealType*) nullptr, hostResult ); #ifdef CUDA_REDUCTION_PROFILING timer.stop(); cout << " Multireduction of small data set on CPU took " << timer.getRealTime() << " sec. " << endl; #endif return checkCudaDevice; #else CudaSupportMissingMessage; return false; #endif }; /* * Parameters: * operation: the operation used for reduction * n: number of datasets to be reduced * size: the size of each dataset * input1: input array of size = n * ldInput1 * ldInput1: leading dimension of the input1 array * input2: either nullptr or input array of size = size * hostResult: output array of size = n */ template< typename Operation > bool multireductionOnHostDevice( Operation& operation, int n, const typename Operation::IndexType size, const typename Operation::RealType* input1, const typename Operation::IndexType ldInput1, const typename Operation::RealType* input2, typename Operation::ResultType* result ) { typedef typename Operation::IndexType IndexType; typedef typename Operation::RealType RealType; for( int k = 0; k < n; k++ ) { result[ k ] = operation.initialValue(); const RealType* _input1 = input1 + k * ldInput1; for( IndexType i = 0; i < size; i++ ) result[ k ] = operation.reduceOnHost( i, result[ k ], _input1, input2 ); } return true; } } // namespace Algorithms } // namespace Containers } // namespace TNL src/TNL/Matrices/CMakeLists.txt +2 −1 Original line number Diff line number Diff line Loading @@ -16,6 +16,7 @@ SET( headers Matrix.h ChunkedEllpack_impl.h CSR.h CSR_impl.h MatrixOperations.h MatrixReader.h MatrixReader_impl.h MatrixWriter.h Loading Loading
src/TNL/Containers/Algorithms/CMakeLists.txt +5 −1 Original line number Diff line number Diff line Loading @@ -6,7 +6,11 @@ set( headers cuda-prefix-sum.h CudaReduction.h CudaReduction_impl.h CudaReductionBuffer.h CublasWrapper.h ) CublasWrapper.h CudaMultireductionKernel.h Multireduction.h Multireduction_impl.h ) SET( CURRENT_DIR ${CMAKE_SOURCE_DIR}/src/TNL/Containers/Algorithms ) IF( BUILD_CUDA ) Loading
src/TNL/Containers/Algorithms/CudaMultireductionKernel.h 0 → 100644 +155 −0 Original line number Diff line number Diff line #pragma once namespace TNL { namespace Containers { namespace Algorithms { #ifdef HAVE_CUDA /**** * The performance of this kernel is very sensitive to register usage. * Compile with --ptxas-options=-v and configure these constants for given * architecture so that there are no local memory spills. */ static constexpr int Multireduction_maxThreadsPerBlock = 256; // must be a power of 2 #if (__CUDA_ARCH__ >= 300 ) static constexpr int Multireduction_minBlocksPerMultiprocessor = 6; #else static constexpr int Multireduction_minBlocksPerMultiprocessor = 4; #endif template< typename Operation, int blockSizeX > __global__ void __launch_bounds__( Multireduction_maxThreadsPerBlock, Multireduction_minBlocksPerMultiprocessor ) CudaMultireductionKernel( Operation& operation, const typename Operation::IndexType n, const typename Operation::IndexType size, const typename Operation::RealType* input1, const typename Operation::IndexType ldInput1, const typename Operation::RealType* input2, typename Operation::ResultType* output ) { typedef typename Operation::IndexType IndexType; typedef typename Operation::ResultType ResultType; extern __shared__ __align__ ( 8 ) char __sdata[]; ResultType* sdata = reinterpret_cast< ResultType* >( __sdata ); /*** * Get thread id (tid) and global element id (gid). * gridSizeX is the number of elements in the direction of x-axis * processed by all blocks at the same time. */ const IndexType tid = threadIdx.y * blockDim.x + threadIdx.x; IndexType gid = blockIdx.x * blockDim.x + threadIdx.x; const IndexType gridSizeX = blockDim.x * gridDim.x; /*** * Shift input1 and output pointers. */ const IndexType y = blockIdx.y * blockDim.y + threadIdx.y; if( y < n ) { input1 += y * ldInput1; output += y * gridDim.x; } else return; /*** * Start with the sequential reduction and push the * result into the shared memory. */ sdata[ tid ] = operation.initialValue(); while( gid + 4 * gridSizeX < size ) { operation.cudaFirstReduction( sdata[ tid ], gid, input1, input2 ); operation.cudaFirstReduction( sdata[ tid ], gid + gridSizeX, input1, input2 ); operation.cudaFirstReduction( sdata[ tid ], gid + 2 * gridSizeX, input1, input2 ); operation.cudaFirstReduction( sdata[ tid ], gid + 3 * gridSizeX, input1, input2 ); gid += 4 * gridSizeX; } while( gid + 2 * gridSizeX < size ) { operation.cudaFirstReduction( sdata[ tid ], gid, input1, input2 ); operation.cudaFirstReduction( sdata[ tid ], gid + gridSizeX, input1, input2 ); gid += 2 * gridSizeX; } while( gid < size ) { operation.cudaFirstReduction( sdata[ tid ], gid, input1, input2 ); gid += gridSizeX; } __syncthreads(); //printf( "1: tid %d data %f \n", tid, sdata[ tid ] ); /*** * Perform the parallel reduction. */ if( blockSizeX >= 1024 ) { if( threadIdx.x < 512 ) { operation.commonReductionOnDevice( sdata[ tid ], sdata[ tid + 512 ] ); } __syncthreads(); } if( blockSizeX >= 512 ) { if( threadIdx.x < 256 ) { operation.commonReductionOnDevice( sdata[ tid ], sdata[ tid + 256 ] ); } __syncthreads(); } if( blockSizeX >= 256 ) { if( threadIdx.x < 128 ) { operation.commonReductionOnDevice( sdata[ tid ], sdata[ tid + 128 ] ); } __syncthreads(); } if( blockSizeX >= 128 ) { if( threadIdx.x < 64 ) { operation.commonReductionOnDevice( sdata[ tid ], sdata[ tid + 64 ] ); } __syncthreads(); } /*** * This runs in one warp so it is synchronized implicitly. * * When the blockSizeX is less then or equal to the warp size, the shared memory * must be at least 2 * blockSizeX elements per block, otherwise unallocated memory * will be accessed !!! */ if( threadIdx.x < 32 ) { volatile ResultType* vsdata = sdata; if( blockSizeX >= 64 ) { operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 32 ] ); } if( blockSizeX >= 32 ) { operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 16 ] ); } if( blockSizeX >= 16 ) { operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 8 ] ); } if( blockSizeX >= 8 ) { operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 4 ] ); } if( blockSizeX >= 4 ) { operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 2 ] ); } if( blockSizeX >= 2 ) { operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 1 ] ); } } /*** * Store the result back in the global memory. */ if( threadIdx.x == 0 ) { output[ blockIdx.x ] = sdata[ tid ]; } } #endif } // namespace Algorithms } // namespace Containers } // namespace TNL
src/TNL/Containers/Algorithms/Multireduction.h 0 → 100644 +27 −0 Original line number Diff line number Diff line #pragma once namespace TNL { namespace Containers { namespace Algorithms { template< typename Operation > bool multireductionOnCudaDevice( const Operation& operation, int n, const typename Operation::IndexType size, const typename Operation::RealType* deviceInput1, const typename Operation::RealType* deviceInput2, typename Operation::ResultType* hostResult ); template< typename Operation > bool multireductionOnHostDevice( const Operation& operation, int n, const typename Operation::IndexType size, const typename Operation::RealType* deviceInput1, const typename Operation::RealType* deviceInput2, typename Operation::ResultType* hostResult ); } // namespace Algorithms } // namespace Containers } // namespace TNL #include "Multireduction_impl.h"
src/TNL/Containers/Algorithms/Multireduction_impl.h 0 → 100644 +315 −0 Original line number Diff line number Diff line #pragma once //#define CUDA_REDUCTION_PROFILING #ifdef HAVE_CUDA #include <cuda.h> #endif #include <TNL/Assert.h> #include <TNL/Containers/Algorithms/reduction-operations.h> #include <TNL/Containers/ArrayOperations.h> #include <TNL/Math.h> #include <TNL/Containers/Algorithms/CudaReductionBuffer.h> #include <TNL/Containers/Algorithms/CudaMultireductionKernel.h> #include <TNL/Devices/CudaDeviceInfo.h> #ifdef CUDA_REDUCTION_PROFILING #include <TNL/Timer.h> #include <iostream> #endif namespace TNL { namespace Containers { namespace Algorithms { /**** * Arrays smaller than the following constant are reduced on CPU. */ //static constexpr int Multireduction_minGpuDataSize = 16384;//65536; //16384;//1024;//256; // TODO: benchmarks with different values static constexpr int Multireduction_minGpuDataSize = 256;//65536; //16384;//1024;//256; #ifdef HAVE_CUDA template< typename Operation > typename Operation::IndexType multireduceOnCudaDevice( Operation& operation, int n, const typename Operation::IndexType size, const typename Operation::RealType* input1, const typename Operation::IndexType ldInput1, const typename Operation::RealType* input2, typename Operation::ResultType*& output ) { typedef typename Operation::IndexType IndexType; typedef typename Operation::RealType RealType; typedef typename Operation::ResultType ResultType; // The number of blocks should be a multiple of the number of multiprocessors // to ensure optimum balancing of the load. This is very important, because // we run the kernel with a fixed number of blocks, so the amount of work per // block increases with enlarging the problem, so even small imbalance can // cost us dearly. // On Tesla K40c, desGridSizeX = 4 * 6 * 15 = 360. // const IndexType desGridSizeX = 4 * Multireduction_minBlocksPerMultiprocessor // * Devices::CudaDeviceInfo::getCudaMultiprocessors( Devices::CudaDeviceInfo::getActiveDevice() ); // On Tesla K40c, desGridSizeX = 6 * 15 = 90. const IndexType desGridSizeX = Multireduction_minBlocksPerMultiprocessor * Devices::CudaDeviceInfo::getCudaMultiprocessors( Devices::CudaDeviceInfo::getActiveDevice() ); dim3 blockSize, gridSize; // version A: max 16 rows of threads blockSize.y = min( n, 16 ); // version B: up to 16 rows of threads, then "minimize" number of inactive rows // if( n <= 16 ) // blockSize.y = n; // else { // int r = (n - 1) % 16 + 1; // if( r > 12 ) // blockSize.y = 16; // else if( r > 8 ) // blockSize.y = 4; // else if( r > 4 ) // blockSize.y = 8; // else // blockSize.y = 4; // } // blockSize.x has to be a power of 2 blockSize.x = Multireduction_maxThreadsPerBlock; while( blockSize.x * blockSize.y > Multireduction_maxThreadsPerBlock ) blockSize.x /= 2; gridSize.x = min( Devices::Cuda::getNumberOfBlocks( size, blockSize.x ), desGridSizeX ); gridSize.y = Devices::Cuda::getNumberOfBlocks( n, blockSize.y ); if( gridSize.y > Devices::Cuda::getMaxGridSize() ) { std::cerr << "Maximum gridSize.y limit exceeded (limit is 65535, attempted " << gridSize.y << ")." << std::endl; throw 1; } // create reference to the reduction buffer singleton and set default size // (make an overestimate to avoid reallocation on every call if n is increased by 1 each time) const size_t buf_size = 8 * ( n / 8 + 1 ) * desGridSizeX * sizeof( ResultType ); CudaReductionBuffer & cudaReductionBuffer = CudaReductionBuffer::getInstance(); if( ! cudaReductionBuffer.setSize( buf_size ) ) throw 1; output = cudaReductionBuffer.template getData< ResultType >(); // when there is only one warp per blockSize.x, we need to allocate two warps // worth of shared memory so that we don't index shared memory out of bounds const IndexType shmem = (blockSize.x <= 32) ? 2 * blockSize.x * blockSize.y * sizeof( ResultType ) : blockSize.x * blockSize.y * sizeof( ResultType ); //cout << "Multireduction of " << n << " datasets, block size (" << blockSize.x << "," << blockSize.y << "), grid size (" << gridSize.x << "," << gridSize.y << "), shmem " << shmem << endl; /*** * Depending on the blockSize we generate appropriate template instance. */ switch( blockSize.x ) { case 512: CudaMultireductionKernel< Operation, 512 > <<< gridSize, blockSize, shmem >>>( operation, n, size, input1, ldInput1, input2, output); break; case 256: cudaFuncSetCacheConfig(CudaMultireductionKernel< Operation, 256 >, cudaFuncCachePreferShared); CudaMultireductionKernel< Operation, 256 > <<< gridSize, blockSize, shmem >>>( operation, n, size, input1, ldInput1, input2, output); break; case 128: cudaFuncSetCacheConfig(CudaMultireductionKernel< Operation, 128 >, cudaFuncCachePreferShared); CudaMultireductionKernel< Operation, 128 > <<< gridSize, blockSize, shmem >>>( operation, n, size, input1, ldInput1, input2, output); break; case 64: cudaFuncSetCacheConfig(CudaMultireductionKernel< Operation, 64 >, cudaFuncCachePreferShared); CudaMultireductionKernel< Operation, 64 > <<< gridSize, blockSize, shmem >>>( operation, n, size, input1, ldInput1, input2, output); break; case 32: cudaFuncSetCacheConfig(CudaMultireductionKernel< Operation, 32 >, cudaFuncCachePreferShared); CudaMultireductionKernel< Operation, 32 > <<< gridSize, blockSize, shmem >>>( operation, n, size, input1, ldInput1, input2, output); break; case 16: cudaFuncSetCacheConfig(CudaMultireductionKernel< Operation, 16 >, cudaFuncCachePreferShared); CudaMultireductionKernel< Operation, 16 > <<< gridSize, blockSize, shmem >>>( operation, n, size, input1, ldInput1, input2, output); break; case 8: cudaFuncSetCacheConfig(CudaMultireductionKernel< Operation, 8 >, cudaFuncCachePreferShared); CudaMultireductionKernel< Operation, 8 > <<< gridSize, blockSize, shmem >>>( operation, n, size, input1, ldInput1, input2, output); break; case 4: cudaFuncSetCacheConfig(CudaMultireductionKernel< Operation, 4 >, cudaFuncCachePreferShared); CudaMultireductionKernel< Operation, 4 > <<< gridSize, blockSize, shmem >>>( operation, n,size, input1, ldInput1, input2, output); break; case 2: cudaFuncSetCacheConfig(CudaMultireductionKernel< Operation, 2 >, cudaFuncCachePreferShared); CudaMultireductionKernel< Operation, 2 > <<< gridSize, blockSize, shmem >>>( operation, n, size, input1, ldInput1, input2, output); break; case 1: Assert( false, std::cerr << "blockSize should not be 1." << std::endl ); default: Assert( false, std::cerr << "Block size is " << blockSize.x << " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." << std::endl ); } checkCudaDevice; return gridSize.x; } #endif /* * Parameters: * operation: the operation used for reduction * n: number of datasets to be reduced * size: the size of each dataset * deviceInput1: input array of size = n * ldInput1 * ldInput1: leading dimension of the deviceInput1 array * deviceInput2: either nullptr or input array of size = size * hostResult: output array of size = n */ template< typename Operation > bool multireductionOnCudaDevice( Operation& operation, int n, const typename Operation::IndexType size, const typename Operation::RealType* deviceInput1, const typename Operation::IndexType ldInput1, const typename Operation::RealType* deviceInput2, typename Operation::ResultType* hostResult ) { #ifdef HAVE_CUDA Assert( n > 0, ); typedef typename Operation::IndexType IndexType; typedef typename Operation::RealType RealType; typedef typename Operation::ResultType ResultType; typedef typename Operation::LaterReductionOperation LaterReductionOperation; /*** * First check if the input array(s) is/are large enough for the multireduction on GPU. * Otherwise copy it/them to host and multireduce on CPU. */ if( n * ldInput1 < Multireduction_minGpuDataSize ) { RealType hostArray1[ Multireduction_minGpuDataSize ]; // FIXME: hostArray2 is left undefined if deviceInput2 is nullptr RealType hostArray2[ Multireduction_minGpuDataSize ]; if( ! ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory< RealType, RealType, IndexType >( hostArray1, deviceInput1, n * ldInput1 ) ) return false; if( deviceInput2 && ! ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory< RealType, RealType, IndexType >( hostArray2, deviceInput2, n * size ) ) return false; return multireductionOnHostDevice( operation, n, size, hostArray1, ldInput1, hostArray2, hostResult ); } #ifdef CUDA_REDUCTION_PROFILING Timer timer; timer.reset(); timer.start(); #endif /**** * Reduce the data on the CUDA device. */ ResultType* deviceAux1 = nullptr; const IndexType reducedSize = multireduceOnCudaDevice( operation, n, size, deviceInput1, ldInput1, deviceInput2, deviceAux1 ); #ifdef CUDA_REDUCTION_PROFILING timer.stop(); cout << " Multireduction of " << n << " datasets on GPU to size " << reducedSize << " took " << timer.getRealTime() << " sec. " << endl; timer.reset(); timer.start(); #endif /*** * Transfer the reduced data from device to host. */ ResultType resultArray[ n * reducedSize ]; if( ! ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory< ResultType, ResultType, IndexType >( resultArray, deviceAux1, n * reducedSize ) ) return false; #ifdef CUDA_REDUCTION_PROFILING timer.stop(); cout << " Transferring data to CPU took " << timer.getRealTime() << " sec. " << endl; timer.reset(); timer.start(); #endif // cout << "resultArray = ["; // for( int i = 0; i < n * reducedSize; i++ ) { // cout << resultArray[ i ]; // if( i < n * reducedSize - 1 ) // cout << ", "; // } // cout << "]" << endl; /*** * Reduce the data on the host system. */ LaterReductionOperation laterReductionOperation; multireductionOnHostDevice( laterReductionOperation, n, reducedSize, resultArray, reducedSize, (RealType*) nullptr, hostResult ); #ifdef CUDA_REDUCTION_PROFILING timer.stop(); cout << " Multireduction of small data set on CPU took " << timer.getRealTime() << " sec. " << endl; #endif return checkCudaDevice; #else CudaSupportMissingMessage; return false; #endif }; /* * Parameters: * operation: the operation used for reduction * n: number of datasets to be reduced * size: the size of each dataset * input1: input array of size = n * ldInput1 * ldInput1: leading dimension of the input1 array * input2: either nullptr or input array of size = size * hostResult: output array of size = n */ template< typename Operation > bool multireductionOnHostDevice( Operation& operation, int n, const typename Operation::IndexType size, const typename Operation::RealType* input1, const typename Operation::IndexType ldInput1, const typename Operation::RealType* input2, typename Operation::ResultType* result ) { typedef typename Operation::IndexType IndexType; typedef typename Operation::RealType RealType; for( int k = 0; k < n; k++ ) { result[ k ] = operation.initialValue(); const RealType* _input1 = input1 + k * ldInput1; for( IndexType i = 0; i < size; i++ ) result[ k ] = operation.reduceOnHost( i, result[ k ], _input1, input2 ); } return true; } } // namespace Algorithms } // namespace Containers } // namespace TNL
src/TNL/Matrices/CMakeLists.txt +2 −1 Original line number Diff line number Diff line Loading @@ -16,6 +16,7 @@ SET( headers Matrix.h ChunkedEllpack_impl.h CSR.h CSR_impl.h MatrixOperations.h MatrixReader.h MatrixReader_impl.h MatrixWriter.h Loading