Loading src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp +4 −3 Original line number Diff line number Diff line Loading @@ -220,9 +220,10 @@ containsValue( const Element* data, TNL_ASSERT_TRUE( data, "Attempted to check data through a nullptr." ); TNL_ASSERT_GE( size, 0, "" ); if( size == 0 ) return false; Algorithms::ParallelReductionContainsValue< Element > reductionContainsValue; reductionContainsValue.setValue( value ); return Reduction< Devices::Cuda >::reduce( reductionContainsValue, size, data, nullptr ); //Algorithms::ParallelReductionContainsValue< Element > reductionContainsValue; //reductionContainsValue.setValue( value ); auto fetch = [=] __cuda_callable__ ( Index i ) { return ( data[ i ] == value ); }; return Reduction< Devices::Cuda >::reduce( size, TNL::sum, fetch, 0.0 ); } template< typename Element, Loading src/TNL/Containers/Algorithms/CudaReductionKernel.h +59 −53 Original line number Diff line number Diff line Loading @@ -41,20 +41,21 @@ static constexpr int Reduction_registersPerThread = 32; // empirically determi template< int blockSize, typename Real, typename FirstPhase, typename SecondPhase, typename DataFetcher, typename Reduction, typename Index, typename ResultType = decltype( std::declval< FirstPhase >( 0,0 ) ) > typename Result = decltype( std::declval< DataFetcher >( 0,0 ) ) > __global__ void __launch_bounds__( Reduction_maxThreadsPerBlock, Reduction_minBlocksPerMultiprocessor ) CudaReductionKernel( const Real& initialValue, FirstReduction& firstReduction, SecondReduction& secondReduction, CudaReductionKernel( const Real& zero, const DataFetcher& dataFetcher, const Reduction& reduction, const Index size, ResultType* output ) Result* output ) { typedef Index IndexType; typedef typename Operation::ResultType ResultType; using RealType = Real; using IndexType = Index; using ResultType = Result; ResultType* sdata = Devices::Cuda::getSharedMemory< ResultType >(); Loading @@ -67,28 +68,28 @@ CudaReductionKernel( const Real& initialValue, IndexType gid = blockIdx.x * blockDim. x + threadIdx.x; const IndexType gridSize = blockDim.x * gridDim.x; sdata[ tid ] = initialValue; sdata[ tid ] = zero; /*** * Read data into the shared memory. We start with the * sequential reduction. */ while( gid + 4 * gridSize < size ) { sdata[ tid ] = firstReduction( sdata[ tid ], gid ); sdata[ tid ] = firstReduction( sdata[ tid ], gid + gridSize ); sdata[ tid ] = firstReduction( sdata[ tid ], gid + 2 * gridSize ); sdata[ tid ] = firstReduction( sdata[ tid ], gid + 3 * gridSize ); sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid ) ); sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + gridSize ) ); sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + 2 * gridSize ) ); sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + 3 * gridSize ) ); gid += 4 * gridSize; } while( gid + 2 * gridSize < size ) { firstReduction( sdata[ tid ], gid, input1, input2 ); firstReduction( sdata[ tid ], gid + gridSize, input1, input2 ); sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid ) ); sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + gridSize ) ); gid += 2 * gridSize; } while( gid < size ) { operation.firstReduction( sdata[ tid ], gid, input1, input2 ); sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid ) ); gid += gridSize; } __syncthreads(); Loading @@ -103,19 +104,19 @@ CudaReductionKernel( const Real& initialValue, if( blockSize >= 1024 ) { if( tid < 512 ) operation.commonReduction( sdata[ tid ], sdata[ tid + 512 ] ); reduction( sdata[ tid ], sdata[ tid + 512 ] ); __syncthreads(); } if( blockSize >= 512 ) { if( tid < 256 ) operation.commonReduction( sdata[ tid ], sdata[ tid + 256 ] ); reduction( sdata[ tid ], sdata[ tid + 256 ] ); __syncthreads(); } if( blockSize >= 256 ) { if( tid < 128 ) operation.commonReduction( sdata[ tid ], sdata[ tid + 128 ] ); reduction( sdata[ tid ], sdata[ tid + 128 ] ); __syncthreads(); //printf( "2: tid %d data %f \n", tid, sdata[ tid ] ); } Loading @@ -123,7 +124,7 @@ CudaReductionKernel( const Real& initialValue, if( blockSize >= 128 ) { if( tid < 64 ) operation.commonReduction( sdata[ tid ], sdata[ tid + 64 ] ); reduction( sdata[ tid ], sdata[ tid + 64 ] ); __syncthreads(); //printf( "3: tid %d data %f \n", tid, sdata[ tid ] ); } Loading @@ -137,34 +138,34 @@ CudaReductionKernel( const Real& initialValue, volatile ResultType* vsdata = sdata; if( blockSize >= 64 ) { operation.commonReduction( vsdata[ tid ], vsdata[ tid + 32 ] ); reduction( vsdata[ tid ], vsdata[ tid + 32 ] ); //printf( "4: tid %d data %f \n", tid, sdata[ tid ] ); } // TODO: If blocksize == 32, the following does not work // We do not check if tid < 16. Fix it!!! if( blockSize >= 32 ) { operation.commonReduction( vsdata[ tid ], vsdata[ tid + 16 ] ); reduction( vsdata[ tid ], vsdata[ tid + 16 ] ); //printf( "5: tid %d data %f \n", tid, sdata[ tid ] ); } if( blockSize >= 16 ) { operation.commonReduction( vsdata[ tid ], vsdata[ tid + 8 ] ); reduction( vsdata[ tid ], vsdata[ tid + 8 ] ); //printf( "6: tid %d data %f \n", tid, sdata[ tid ] ); } if( blockSize >= 8 ) { operation.commonReduction( vsdata[ tid ], vsdata[ tid + 4 ] ); reduction( vsdata[ tid ], vsdata[ tid + 4 ] ); //printf( "7: tid %d data %f \n", tid, sdata[ tid ] ); } if( blockSize >= 4 ) { operation.commonReduction( vsdata[ tid ], vsdata[ tid + 2 ] ); reduction( vsdata[ tid ], vsdata[ tid + 2 ] ); //printf( "8: tid %d data %f \n", tid, sdata[ tid ] ); } if( blockSize >= 2 ) { operation.commonReduction( vsdata[ tid ], vsdata[ tid + 1 ] ); reduction( vsdata[ tid ], vsdata[ tid + 1 ] ); //printf( "9: tid %d data %f \n", tid, sdata[ tid ] ); } } Loading @@ -180,16 +181,21 @@ CudaReductionKernel( const Real& initialValue, } template< typename Operation, typename Index > template< typename Real, typename DataFetcher, typename Reduction, typename Index, typename Result > int CudaReductionKernelLauncher( Operation& operation, const Index size, const typename Operation::DataType1* input1, const typename Operation::DataType2* input2, typename Operation::ResultType*& output ) CudaReductionKernelLauncher( const Index size, const Reduction& reduction, const DataFetcher& dataFetcher, const Real& zero, Result*& output ) { typedef Index IndexType; typedef typename Operation::ResultType ResultType; using RealType = Real; using IndexType = Index; using ResultType = Result; // The number of blocks should be a multiple of the number of multiprocessors // to ensure optimum balancing of the load. This is very important, because Loading Loading @@ -227,55 +233,55 @@ CudaReductionKernelLauncher( Operation& operation, { case 512: CudaReductionKernel< 512 > <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); break; case 256: cudaFuncSetCacheConfig(CudaReductionKernel< 256, Operation, Index >, cudaFuncCachePreferShared); cudaFuncSetCacheConfig(CudaReductionKernel< 256, Real, DataFetcher, Reduction, Index, Result >, cudaFuncCachePreferShared); CudaReductionKernel< 256 > <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); break; case 128: cudaFuncSetCacheConfig(CudaReductionKernel< 128, Operation, Index >, cudaFuncCachePreferShared); cudaFuncSetCacheConfig(CudaReductionKernel< 128, Real, DataFetcher, Reduction, Index, Result >, cudaFuncCachePreferShared); CudaReductionKernel< 128 > <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); break; case 64: cudaFuncSetCacheConfig(CudaReductionKernel< 64, Operation, Index >, cudaFuncCachePreferShared); cudaFuncSetCacheConfig(CudaReductionKernel< 64, Real, DataFetcher, Reduction, Index, Result >, cudaFuncCachePreferShared); CudaReductionKernel< 64 > <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); break; case 32: cudaFuncSetCacheConfig(CudaReductionKernel< 32, Operation, Index >, cudaFuncCachePreferShared); cudaFuncSetCacheConfig(CudaReductionKernel< 32, Real, DataFetcher, Reduction, Index, Result >, cudaFuncCachePreferShared); CudaReductionKernel< 32 > <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); break; case 16: cudaFuncSetCacheConfig(CudaReductionKernel< 16, Operation, Index >, cudaFuncCachePreferShared); cudaFuncSetCacheConfig(CudaReductionKernel< 16, Real, DataFetcher, Reduction, Index, Result >, cudaFuncCachePreferShared); CudaReductionKernel< 16 > <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); break; case 8: cudaFuncSetCacheConfig(CudaReductionKernel< 8, Operation, Index >, cudaFuncCachePreferShared); cudaFuncSetCacheConfig(CudaReductionKernel< 8, Real, DataFetcher, Reduction, Index, Result >, cudaFuncCachePreferShared); CudaReductionKernel< 8 > <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); break; case 4: cudaFuncSetCacheConfig(CudaReductionKernel< 4, Operation, Index >, cudaFuncCachePreferShared); cudaFuncSetCacheConfig(CudaReductionKernel< 4, Real, DataFetcher, Reduction, Index, Result >, cudaFuncCachePreferShared); CudaReductionKernel< 4 > <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); break; case 2: cudaFuncSetCacheConfig(CudaReductionKernel< 2, Operation, Index >, cudaFuncCachePreferShared); cudaFuncSetCacheConfig(CudaReductionKernel< 2, Real, DataFetcher, Reduction, Index, Result >, cudaFuncCachePreferShared); CudaReductionKernel< 2 > <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); break; case 1: TNL_ASSERT( false, std::cerr << "blockSize should not be 1." << std::endl ); Loading src/TNL/Containers/Algorithms/Reduction.h +29 −20 Original line number Diff line number Diff line Loading @@ -29,36 +29,45 @@ template<> class Reduction< Devices::Cuda > { public: template< typename Operation, typename Index > static typename Operation::ResultType reduce( Operation& operation, const Index size, const typename Operation::DataType1* deviceInput1, const typename Operation::DataType2* deviceInput2 ); template< typename Index, typename ReductionOperation, typename DataFetcher, typename Result = decltype( DataFetcher::operator() ) > static Result reduce( const Index size, ReductionOperation& reduction, DataFetcher& dataFetcher, const Result& zero ); }; template<> class Reduction< Devices::Host > { public: template< typename Operation, typename Index > static typename Operation::ResultType reduce( Operation& operation, const Index size, const typename Operation::DataType1* deviceInput1, const typename Operation::DataType2* deviceInput2 ); template< typename Index, typename ReductionOperation, typename DataFetcher, typename Result = decltype( DataFetcher::operator() ) > static Result reduce( const Index size, ReductionOperation& reduction, DataFetcher& dataFetcher, const Result& zero ); }; template<> class Reduction< Devices::MIC > { public: template< typename Operation, typename Index > static typename Operation::ResultType reduce( Operation& operation, const Index size, const typename Operation::DataType1* deviceInput1, const typename Operation::DataType2* deviceInput2 ); template< typename Index, typename ReductionOperation, typename DataFetcher, typename Result = decltype( DataFetcher::operator() ) > static Result reduce( const Index size, ReductionOperation& reduction, DataFetcher& dataFetcher, const Result& zero ); }; } // namespace Algorithms Loading src/TNL/Containers/Algorithms/Reduction_impl.h +76 −64 Original line number Diff line number Diff line Loading @@ -38,35 +38,37 @@ namespace Algorithms { */ static constexpr int Reduction_minGpuDataSize = 256;//65536; //16384;//1024;//256; template< typename Operation, typename Index > typename Operation::ResultType template< typename Index, typename ReductionOperation, typename DataFetcher, typename Result > Result Reduction< Devices::Cuda >:: reduce( Operation& operation, const Index size, const typename Operation::DataType1* deviceInput1, const typename Operation::DataType2* deviceInput2 ) reduce( const Index size, ReductionOperation& reduction, DataFetcher& dataFetcher, const Result& zero ) { #ifdef HAVE_CUDA typedef Index IndexType; typedef typename Operation::DataType1 DataType1; typedef typename Operation::DataType2 DataType2; typedef typename Operation::ResultType ResultType; typedef typename Operation::LaterReductionOperation LaterReductionOperation; using IndexType = Index; using ResultType = Result; /*** * Only fundamental and pointer types can be safely reduced on host. Complex * objects stored on the device might contain pointers into the device memory, * in which case reduction on host might fail. */ constexpr bool can_reduce_all_on_host = std::is_fundamental< DataType1 >::value || std::is_fundamental< DataType2 >::value || std::is_pointer< DataType1 >::value || std::is_pointer< DataType2 >::value; //constexpr bool can_reduce_all_on_host = std::is_fundamental< DataType1 >::value || std::is_fundamental< DataType2 >::value || std::is_pointer< DataType1 >::value || std::is_pointer< DataType2 >::value; constexpr bool can_reduce_later_on_host = std::is_fundamental< ResultType >::value || std::is_pointer< ResultType >::value; /*** * First check if the input array(s) is/are large enough for the reduction on GPU. * Otherwise copy it/them to host and reduce on CPU. */ if( can_reduce_all_on_host && size <= Reduction_minGpuDataSize ) // With lambda function we cannot reduce data on host - we do not know the data here. /*if( can_reduce_all_on_host && size <= Reduction_minGpuDataSize ) { typename std::remove_const< DataType1 >::type hostArray1[ Reduction_minGpuDataSize ]; ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory( hostArray1, deviceInput1, size ); Loading @@ -74,12 +76,12 @@ reduce( Operation& operation, using _DT2 = typename std::conditional< std::is_same< DataType2, void >::value, DataType1, DataType2 >::type; typename std::remove_const< _DT2 >::type hostArray2[ Reduction_minGpuDataSize ]; ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory( hostArray2, (_DT2*) deviceInput2, size ); return Reduction< Devices::Host >::reduce( operation, size, hostArray1, hostArray2 ); return Reduction< Devices::Host >::reduce( zero, dataFetcher, reduction, size, hostArray1, hostArray2 ); } else { return Reduction< Devices::Host >::reduce( operation, size, hostArray1, (DataType2*) nullptr ); } } }*/ #ifdef CUDA_REDUCTION_PROFILING Timer timer; Loading @@ -91,10 +93,10 @@ reduce( Operation& operation, * Reduce the data on the CUDA device. */ ResultType* deviceAux1( 0 ); IndexType reducedSize = CudaReductionKernelLauncher( operation, size, deviceInput1, deviceInput2, IndexType reducedSize = CudaReductionKernelLauncher( size, reduction, dataFetcher, zero, deviceAux1 ); #ifdef CUDA_REDUCTION_PROFILING timer.stop(); Loading @@ -120,8 +122,8 @@ reduce( Operation& operation, /*** * Reduce the data on the host system. */ LaterReductionOperation laterReductionOperation; const ResultType result = Reduction< Devices::Host >::reduce( laterReductionOperation, reducedSize, resultArray, (void*) nullptr ); auto copyFetch = [=] ( IndexType i ) { return resultArray[ i ]; }; const ResultType result = Reduction< Devices::Host >::reduce( reducedSize, reduction, copyFetch, zero ); #ifdef CUDA_REDUCTION_PROFILING timer.stop(); Loading @@ -134,12 +136,13 @@ reduce( Operation& operation, /*** * Data can't be safely reduced on host, so continue with the reduction on the CUDA device. */ LaterReductionOperation laterReductionOperation; //LaterReductionOperation laterReductionOperation; auto copyFetch = [=] __cuda_callable__ ( IndexType i ) { return deviceAux1[ i ]; }; while( reducedSize > 1 ) { reducedSize = CudaReductionKernelLauncher( laterReductionOperation, reducedSize, deviceAux1, (ResultType*) 0, reducedSize = CudaReductionKernelLauncher( reducedSize, reduction, copyFetch, zero, deviceAux1 ); } Loading @@ -166,18 +169,19 @@ reduce( Operation& operation, #endif }; template< typename Operation, typename Index > typename Operation::ResultType template< typename Index, typename ReductionOperation, typename DataFetcher, typename Result > Result Reduction< Devices::Host >:: reduce( Operation& operation, const Index size, const typename Operation::DataType1* input1, const typename Operation::DataType2* input2 ) reduce( const Index size, ReductionOperation& reduction, DataFetcher& dataFetcher, const Result& zero ) { typedef Index IndexType; typedef typename Operation::DataType1 DataType1; typedef typename Operation::DataType2 DataType2; typedef typename Operation::ResultType ResultType; using IndexType = Index; using ResultType = Result; constexpr int block_size = 128; const int blocks = size / block_size; Loading @@ -185,23 +189,24 @@ reduce( Operation& operation, #ifdef HAVE_OPENMP if( TNL::Devices::Host::isOMPEnabled() && size >= 2 * block_size ) { // global result variable ResultType result = operation.initialValue(); ResultType result = zero; #pragma omp parallel { // initialize array for thread-local results ResultType r[ 4 ] = { operation.initialValue(), operation.initialValue(), operation.initialValue(), operation.initialValue() }; ResultType r[ 4 ] = { zero, zero, zero, zero }; #pragma omp for nowait for( int b = 0; b < blocks; b++ ) { const IndexType offset = b * block_size; for( int i = 0; i < block_size; i += 4 ) { operation.firstReduction( r[ 0 ], offset + i, input1, input2 ); operation.firstReduction( r[ 1 ], offset + i + 1, input1, input2 ); operation.firstReduction( r[ 2 ], offset + i + 2, input1, input2 ); operation.firstReduction( r[ 3 ], offset + i + 3, input1, input2 ); r[ 0 ] = reduction( r[ 0 ], dataFetcher( offset + i ) ); r[ 1 ] = reduction( r[ 1 ], dataFetcher( offset + i + 1 ) ); r[ 2 ] = reduction( r[ 2 ], dataFetcher( offset + i + 2 ) ); r[ 3 ] = reduction( r[ 3 ], dataFetcher( offset + i + 3 ) ); /*operation.dataFetcher( r[ 0 ], offset + i, input1, input2 ); operation.dataFetcher( r[ 1 ], offset + i + 1, input1, input2 ); operation.dataFetcher( r[ 2 ], offset + i + 2, input1, input2 ); operation.dataFetcher( r[ 3 ], offset + i + 3, input1, input2 );*/ } } Loading @@ -209,18 +214,19 @@ reduce( Operation& operation, #pragma omp single nowait { for( IndexType i = blocks * block_size; i < size; i++ ) operation.firstReduction( r[ 0 ], i, input1, input2 ); r[ 0 ] = reduction( r[ 0 ], dataFetcher( i ) ); //operation.dataFetcher( r[ 0 ], i, input1, input2 ); } // local reduction of unrolled results operation.commonReduction( r[ 0 ], r[ 2 ] ); operation.commonReduction( r[ 1 ], r[ 3 ] ); operation.commonReduction( r[ 0 ], r[ 1 ] ); r[ 0 ] = reduction( r[ 0 ], r[ 2 ] ); r[ 1 ] = reduction( r[ 1 ], r[ 3 ] ); r[ 0 ] = reduction( r[ 0 ], r[ 1 ] ); // inter-thread reduction of local results #pragma omp critical { operation.commonReduction( result, r[ 0 ] ); result = reduction( result, r[ 0 ] ); } } return result; Loading @@ -229,37 +235,43 @@ reduce( Operation& operation, #endif if( blocks > 1 ) { // initialize array for unrolled results ResultType r[ 4 ] = { operation.initialValue(), operation.initialValue(), operation.initialValue(), operation.initialValue() }; ResultType r[ 4 ] = { zero, zero, zero, zero }; // main reduction (explicitly unrolled loop) for( int b = 0; b < blocks; b++ ) { const IndexType offset = b * block_size; for( int i = 0; i < block_size; i += 4 ) { operation.firstReduction( r[ 0 ], offset + i, input1, input2 ); operation.firstReduction( r[ 1 ], offset + i + 1, input1, input2 ); operation.firstReduction( r[ 2 ], offset + i + 2, input1, input2 ); operation.firstReduction( r[ 3 ], offset + i + 3, input1, input2 ); r[ 0 ] = reduction( r[ 0 ], dataFetcher( offset + i ) ); r[ 1 ] = reduction( r[ 1 ], dataFetcher( offset + i + 1 ) ); r[ 2 ] = reduction( r[ 2 ], dataFetcher( offset + i + 2 ) ); r[ 3 ] = reduction( r[ 3 ], dataFetcher( offset + i + 3 ) ); /*operation.dataFetcher( r[ 0 ], offset + i, input1, input2 ); operation.dataFetcher( r[ 1 ], offset + i + 1, input1, input2 ); operation.dataFetcher( r[ 2 ], offset + i + 2, input1, input2 ); operation.dataFetcher( r[ 3 ], offset + i + 3, input1, input2 );*/ } } // reduction of the last, incomplete block (not unrolled) for( IndexType i = blocks * block_size; i < size; i++ ) operation.firstReduction( r[ 0 ], i, input1, input2 ); r[ 0 ] = reduction( r[ 0 ], dataFetcher( i ) ); //operation.dataFetcher( r[ 0 ], i, input1, input2 ); // reduction of unrolled results operation.commonReduction( r[ 0 ], r[ 2 ] ); operation.commonReduction( r[ 1 ], r[ 3 ] ); operation.commonReduction( r[ 0 ], r[ 1 ] ); r[ 0 ] = reduction( r[ 0 ], r[ 2 ] ); r[ 1 ] = reduction( r[ 1 ], r[ 3 ] ); r[ 0 ] = reduction( r[ 0 ], r[ 1 ] ); /*operation.reduction( r[ 0 ], r[ 2 ] ); operation.reduction( r[ 1 ], r[ 3 ] ); operation.reduction( r[ 0 ], r[ 1 ] );*/ return r[ 0 ]; } else { ResultType result = operation.initialValue(); ResultType result = zero; for( IndexType i = 0; i < size; i++ ) operation.firstReduction( result, i, input1, input2 ); result = reduction( result, dataFetcher( i ) ); return result; } #ifdef HAVE_OPENMP Loading src/TNL/Containers/Algorithms/VectorOperationsHost_impl.h +18 −6 File changed.Preview size limit exceeded, changes collapsed. Show changes Loading
src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp +4 −3 Original line number Diff line number Diff line Loading @@ -220,9 +220,10 @@ containsValue( const Element* data, TNL_ASSERT_TRUE( data, "Attempted to check data through a nullptr." ); TNL_ASSERT_GE( size, 0, "" ); if( size == 0 ) return false; Algorithms::ParallelReductionContainsValue< Element > reductionContainsValue; reductionContainsValue.setValue( value ); return Reduction< Devices::Cuda >::reduce( reductionContainsValue, size, data, nullptr ); //Algorithms::ParallelReductionContainsValue< Element > reductionContainsValue; //reductionContainsValue.setValue( value ); auto fetch = [=] __cuda_callable__ ( Index i ) { return ( data[ i ] == value ); }; return Reduction< Devices::Cuda >::reduce( size, TNL::sum, fetch, 0.0 ); } template< typename Element, Loading
src/TNL/Containers/Algorithms/CudaReductionKernel.h +59 −53 Original line number Diff line number Diff line Loading @@ -41,20 +41,21 @@ static constexpr int Reduction_registersPerThread = 32; // empirically determi template< int blockSize, typename Real, typename FirstPhase, typename SecondPhase, typename DataFetcher, typename Reduction, typename Index, typename ResultType = decltype( std::declval< FirstPhase >( 0,0 ) ) > typename Result = decltype( std::declval< DataFetcher >( 0,0 ) ) > __global__ void __launch_bounds__( Reduction_maxThreadsPerBlock, Reduction_minBlocksPerMultiprocessor ) CudaReductionKernel( const Real& initialValue, FirstReduction& firstReduction, SecondReduction& secondReduction, CudaReductionKernel( const Real& zero, const DataFetcher& dataFetcher, const Reduction& reduction, const Index size, ResultType* output ) Result* output ) { typedef Index IndexType; typedef typename Operation::ResultType ResultType; using RealType = Real; using IndexType = Index; using ResultType = Result; ResultType* sdata = Devices::Cuda::getSharedMemory< ResultType >(); Loading @@ -67,28 +68,28 @@ CudaReductionKernel( const Real& initialValue, IndexType gid = blockIdx.x * blockDim. x + threadIdx.x; const IndexType gridSize = blockDim.x * gridDim.x; sdata[ tid ] = initialValue; sdata[ tid ] = zero; /*** * Read data into the shared memory. We start with the * sequential reduction. */ while( gid + 4 * gridSize < size ) { sdata[ tid ] = firstReduction( sdata[ tid ], gid ); sdata[ tid ] = firstReduction( sdata[ tid ], gid + gridSize ); sdata[ tid ] = firstReduction( sdata[ tid ], gid + 2 * gridSize ); sdata[ tid ] = firstReduction( sdata[ tid ], gid + 3 * gridSize ); sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid ) ); sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + gridSize ) ); sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + 2 * gridSize ) ); sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + 3 * gridSize ) ); gid += 4 * gridSize; } while( gid + 2 * gridSize < size ) { firstReduction( sdata[ tid ], gid, input1, input2 ); firstReduction( sdata[ tid ], gid + gridSize, input1, input2 ); sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid ) ); sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + gridSize ) ); gid += 2 * gridSize; } while( gid < size ) { operation.firstReduction( sdata[ tid ], gid, input1, input2 ); sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid ) ); gid += gridSize; } __syncthreads(); Loading @@ -103,19 +104,19 @@ CudaReductionKernel( const Real& initialValue, if( blockSize >= 1024 ) { if( tid < 512 ) operation.commonReduction( sdata[ tid ], sdata[ tid + 512 ] ); reduction( sdata[ tid ], sdata[ tid + 512 ] ); __syncthreads(); } if( blockSize >= 512 ) { if( tid < 256 ) operation.commonReduction( sdata[ tid ], sdata[ tid + 256 ] ); reduction( sdata[ tid ], sdata[ tid + 256 ] ); __syncthreads(); } if( blockSize >= 256 ) { if( tid < 128 ) operation.commonReduction( sdata[ tid ], sdata[ tid + 128 ] ); reduction( sdata[ tid ], sdata[ tid + 128 ] ); __syncthreads(); //printf( "2: tid %d data %f \n", tid, sdata[ tid ] ); } Loading @@ -123,7 +124,7 @@ CudaReductionKernel( const Real& initialValue, if( blockSize >= 128 ) { if( tid < 64 ) operation.commonReduction( sdata[ tid ], sdata[ tid + 64 ] ); reduction( sdata[ tid ], sdata[ tid + 64 ] ); __syncthreads(); //printf( "3: tid %d data %f \n", tid, sdata[ tid ] ); } Loading @@ -137,34 +138,34 @@ CudaReductionKernel( const Real& initialValue, volatile ResultType* vsdata = sdata; if( blockSize >= 64 ) { operation.commonReduction( vsdata[ tid ], vsdata[ tid + 32 ] ); reduction( vsdata[ tid ], vsdata[ tid + 32 ] ); //printf( "4: tid %d data %f \n", tid, sdata[ tid ] ); } // TODO: If blocksize == 32, the following does not work // We do not check if tid < 16. Fix it!!! if( blockSize >= 32 ) { operation.commonReduction( vsdata[ tid ], vsdata[ tid + 16 ] ); reduction( vsdata[ tid ], vsdata[ tid + 16 ] ); //printf( "5: tid %d data %f \n", tid, sdata[ tid ] ); } if( blockSize >= 16 ) { operation.commonReduction( vsdata[ tid ], vsdata[ tid + 8 ] ); reduction( vsdata[ tid ], vsdata[ tid + 8 ] ); //printf( "6: tid %d data %f \n", tid, sdata[ tid ] ); } if( blockSize >= 8 ) { operation.commonReduction( vsdata[ tid ], vsdata[ tid + 4 ] ); reduction( vsdata[ tid ], vsdata[ tid + 4 ] ); //printf( "7: tid %d data %f \n", tid, sdata[ tid ] ); } if( blockSize >= 4 ) { operation.commonReduction( vsdata[ tid ], vsdata[ tid + 2 ] ); reduction( vsdata[ tid ], vsdata[ tid + 2 ] ); //printf( "8: tid %d data %f \n", tid, sdata[ tid ] ); } if( blockSize >= 2 ) { operation.commonReduction( vsdata[ tid ], vsdata[ tid + 1 ] ); reduction( vsdata[ tid ], vsdata[ tid + 1 ] ); //printf( "9: tid %d data %f \n", tid, sdata[ tid ] ); } } Loading @@ -180,16 +181,21 @@ CudaReductionKernel( const Real& initialValue, } template< typename Operation, typename Index > template< typename Real, typename DataFetcher, typename Reduction, typename Index, typename Result > int CudaReductionKernelLauncher( Operation& operation, const Index size, const typename Operation::DataType1* input1, const typename Operation::DataType2* input2, typename Operation::ResultType*& output ) CudaReductionKernelLauncher( const Index size, const Reduction& reduction, const DataFetcher& dataFetcher, const Real& zero, Result*& output ) { typedef Index IndexType; typedef typename Operation::ResultType ResultType; using RealType = Real; using IndexType = Index; using ResultType = Result; // The number of blocks should be a multiple of the number of multiprocessors // to ensure optimum balancing of the load. This is very important, because Loading Loading @@ -227,55 +233,55 @@ CudaReductionKernelLauncher( Operation& operation, { case 512: CudaReductionKernel< 512 > <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); break; case 256: cudaFuncSetCacheConfig(CudaReductionKernel< 256, Operation, Index >, cudaFuncCachePreferShared); cudaFuncSetCacheConfig(CudaReductionKernel< 256, Real, DataFetcher, Reduction, Index, Result >, cudaFuncCachePreferShared); CudaReductionKernel< 256 > <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); break; case 128: cudaFuncSetCacheConfig(CudaReductionKernel< 128, Operation, Index >, cudaFuncCachePreferShared); cudaFuncSetCacheConfig(CudaReductionKernel< 128, Real, DataFetcher, Reduction, Index, Result >, cudaFuncCachePreferShared); CudaReductionKernel< 128 > <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); break; case 64: cudaFuncSetCacheConfig(CudaReductionKernel< 64, Operation, Index >, cudaFuncCachePreferShared); cudaFuncSetCacheConfig(CudaReductionKernel< 64, Real, DataFetcher, Reduction, Index, Result >, cudaFuncCachePreferShared); CudaReductionKernel< 64 > <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); break; case 32: cudaFuncSetCacheConfig(CudaReductionKernel< 32, Operation, Index >, cudaFuncCachePreferShared); cudaFuncSetCacheConfig(CudaReductionKernel< 32, Real, DataFetcher, Reduction, Index, Result >, cudaFuncCachePreferShared); CudaReductionKernel< 32 > <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); break; case 16: cudaFuncSetCacheConfig(CudaReductionKernel< 16, Operation, Index >, cudaFuncCachePreferShared); cudaFuncSetCacheConfig(CudaReductionKernel< 16, Real, DataFetcher, Reduction, Index, Result >, cudaFuncCachePreferShared); CudaReductionKernel< 16 > <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); break; case 8: cudaFuncSetCacheConfig(CudaReductionKernel< 8, Operation, Index >, cudaFuncCachePreferShared); cudaFuncSetCacheConfig(CudaReductionKernel< 8, Real, DataFetcher, Reduction, Index, Result >, cudaFuncCachePreferShared); CudaReductionKernel< 8 > <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); break; case 4: cudaFuncSetCacheConfig(CudaReductionKernel< 4, Operation, Index >, cudaFuncCachePreferShared); cudaFuncSetCacheConfig(CudaReductionKernel< 4, Real, DataFetcher, Reduction, Index, Result >, cudaFuncCachePreferShared); CudaReductionKernel< 4 > <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); break; case 2: cudaFuncSetCacheConfig(CudaReductionKernel< 2, Operation, Index >, cudaFuncCachePreferShared); cudaFuncSetCacheConfig(CudaReductionKernel< 2, Real, DataFetcher, Reduction, Index, Result >, cudaFuncCachePreferShared); CudaReductionKernel< 2 > <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); break; case 1: TNL_ASSERT( false, std::cerr << "blockSize should not be 1." << std::endl ); Loading
src/TNL/Containers/Algorithms/Reduction.h +29 −20 Original line number Diff line number Diff line Loading @@ -29,36 +29,45 @@ template<> class Reduction< Devices::Cuda > { public: template< typename Operation, typename Index > static typename Operation::ResultType reduce( Operation& operation, const Index size, const typename Operation::DataType1* deviceInput1, const typename Operation::DataType2* deviceInput2 ); template< typename Index, typename ReductionOperation, typename DataFetcher, typename Result = decltype( DataFetcher::operator() ) > static Result reduce( const Index size, ReductionOperation& reduction, DataFetcher& dataFetcher, const Result& zero ); }; template<> class Reduction< Devices::Host > { public: template< typename Operation, typename Index > static typename Operation::ResultType reduce( Operation& operation, const Index size, const typename Operation::DataType1* deviceInput1, const typename Operation::DataType2* deviceInput2 ); template< typename Index, typename ReductionOperation, typename DataFetcher, typename Result = decltype( DataFetcher::operator() ) > static Result reduce( const Index size, ReductionOperation& reduction, DataFetcher& dataFetcher, const Result& zero ); }; template<> class Reduction< Devices::MIC > { public: template< typename Operation, typename Index > static typename Operation::ResultType reduce( Operation& operation, const Index size, const typename Operation::DataType1* deviceInput1, const typename Operation::DataType2* deviceInput2 ); template< typename Index, typename ReductionOperation, typename DataFetcher, typename Result = decltype( DataFetcher::operator() ) > static Result reduce( const Index size, ReductionOperation& reduction, DataFetcher& dataFetcher, const Result& zero ); }; } // namespace Algorithms Loading
src/TNL/Containers/Algorithms/Reduction_impl.h +76 −64 Original line number Diff line number Diff line Loading @@ -38,35 +38,37 @@ namespace Algorithms { */ static constexpr int Reduction_minGpuDataSize = 256;//65536; //16384;//1024;//256; template< typename Operation, typename Index > typename Operation::ResultType template< typename Index, typename ReductionOperation, typename DataFetcher, typename Result > Result Reduction< Devices::Cuda >:: reduce( Operation& operation, const Index size, const typename Operation::DataType1* deviceInput1, const typename Operation::DataType2* deviceInput2 ) reduce( const Index size, ReductionOperation& reduction, DataFetcher& dataFetcher, const Result& zero ) { #ifdef HAVE_CUDA typedef Index IndexType; typedef typename Operation::DataType1 DataType1; typedef typename Operation::DataType2 DataType2; typedef typename Operation::ResultType ResultType; typedef typename Operation::LaterReductionOperation LaterReductionOperation; using IndexType = Index; using ResultType = Result; /*** * Only fundamental and pointer types can be safely reduced on host. Complex * objects stored on the device might contain pointers into the device memory, * in which case reduction on host might fail. */ constexpr bool can_reduce_all_on_host = std::is_fundamental< DataType1 >::value || std::is_fundamental< DataType2 >::value || std::is_pointer< DataType1 >::value || std::is_pointer< DataType2 >::value; //constexpr bool can_reduce_all_on_host = std::is_fundamental< DataType1 >::value || std::is_fundamental< DataType2 >::value || std::is_pointer< DataType1 >::value || std::is_pointer< DataType2 >::value; constexpr bool can_reduce_later_on_host = std::is_fundamental< ResultType >::value || std::is_pointer< ResultType >::value; /*** * First check if the input array(s) is/are large enough for the reduction on GPU. * Otherwise copy it/them to host and reduce on CPU. */ if( can_reduce_all_on_host && size <= Reduction_minGpuDataSize ) // With lambda function we cannot reduce data on host - we do not know the data here. /*if( can_reduce_all_on_host && size <= Reduction_minGpuDataSize ) { typename std::remove_const< DataType1 >::type hostArray1[ Reduction_minGpuDataSize ]; ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory( hostArray1, deviceInput1, size ); Loading @@ -74,12 +76,12 @@ reduce( Operation& operation, using _DT2 = typename std::conditional< std::is_same< DataType2, void >::value, DataType1, DataType2 >::type; typename std::remove_const< _DT2 >::type hostArray2[ Reduction_minGpuDataSize ]; ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory( hostArray2, (_DT2*) deviceInput2, size ); return Reduction< Devices::Host >::reduce( operation, size, hostArray1, hostArray2 ); return Reduction< Devices::Host >::reduce( zero, dataFetcher, reduction, size, hostArray1, hostArray2 ); } else { return Reduction< Devices::Host >::reduce( operation, size, hostArray1, (DataType2*) nullptr ); } } }*/ #ifdef CUDA_REDUCTION_PROFILING Timer timer; Loading @@ -91,10 +93,10 @@ reduce( Operation& operation, * Reduce the data on the CUDA device. */ ResultType* deviceAux1( 0 ); IndexType reducedSize = CudaReductionKernelLauncher( operation, size, deviceInput1, deviceInput2, IndexType reducedSize = CudaReductionKernelLauncher( size, reduction, dataFetcher, zero, deviceAux1 ); #ifdef CUDA_REDUCTION_PROFILING timer.stop(); Loading @@ -120,8 +122,8 @@ reduce( Operation& operation, /*** * Reduce the data on the host system. */ LaterReductionOperation laterReductionOperation; const ResultType result = Reduction< Devices::Host >::reduce( laterReductionOperation, reducedSize, resultArray, (void*) nullptr ); auto copyFetch = [=] ( IndexType i ) { return resultArray[ i ]; }; const ResultType result = Reduction< Devices::Host >::reduce( reducedSize, reduction, copyFetch, zero ); #ifdef CUDA_REDUCTION_PROFILING timer.stop(); Loading @@ -134,12 +136,13 @@ reduce( Operation& operation, /*** * Data can't be safely reduced on host, so continue with the reduction on the CUDA device. */ LaterReductionOperation laterReductionOperation; //LaterReductionOperation laterReductionOperation; auto copyFetch = [=] __cuda_callable__ ( IndexType i ) { return deviceAux1[ i ]; }; while( reducedSize > 1 ) { reducedSize = CudaReductionKernelLauncher( laterReductionOperation, reducedSize, deviceAux1, (ResultType*) 0, reducedSize = CudaReductionKernelLauncher( reducedSize, reduction, copyFetch, zero, deviceAux1 ); } Loading @@ -166,18 +169,19 @@ reduce( Operation& operation, #endif }; template< typename Operation, typename Index > typename Operation::ResultType template< typename Index, typename ReductionOperation, typename DataFetcher, typename Result > Result Reduction< Devices::Host >:: reduce( Operation& operation, const Index size, const typename Operation::DataType1* input1, const typename Operation::DataType2* input2 ) reduce( const Index size, ReductionOperation& reduction, DataFetcher& dataFetcher, const Result& zero ) { typedef Index IndexType; typedef typename Operation::DataType1 DataType1; typedef typename Operation::DataType2 DataType2; typedef typename Operation::ResultType ResultType; using IndexType = Index; using ResultType = Result; constexpr int block_size = 128; const int blocks = size / block_size; Loading @@ -185,23 +189,24 @@ reduce( Operation& operation, #ifdef HAVE_OPENMP if( TNL::Devices::Host::isOMPEnabled() && size >= 2 * block_size ) { // global result variable ResultType result = operation.initialValue(); ResultType result = zero; #pragma omp parallel { // initialize array for thread-local results ResultType r[ 4 ] = { operation.initialValue(), operation.initialValue(), operation.initialValue(), operation.initialValue() }; ResultType r[ 4 ] = { zero, zero, zero, zero }; #pragma omp for nowait for( int b = 0; b < blocks; b++ ) { const IndexType offset = b * block_size; for( int i = 0; i < block_size; i += 4 ) { operation.firstReduction( r[ 0 ], offset + i, input1, input2 ); operation.firstReduction( r[ 1 ], offset + i + 1, input1, input2 ); operation.firstReduction( r[ 2 ], offset + i + 2, input1, input2 ); operation.firstReduction( r[ 3 ], offset + i + 3, input1, input2 ); r[ 0 ] = reduction( r[ 0 ], dataFetcher( offset + i ) ); r[ 1 ] = reduction( r[ 1 ], dataFetcher( offset + i + 1 ) ); r[ 2 ] = reduction( r[ 2 ], dataFetcher( offset + i + 2 ) ); r[ 3 ] = reduction( r[ 3 ], dataFetcher( offset + i + 3 ) ); /*operation.dataFetcher( r[ 0 ], offset + i, input1, input2 ); operation.dataFetcher( r[ 1 ], offset + i + 1, input1, input2 ); operation.dataFetcher( r[ 2 ], offset + i + 2, input1, input2 ); operation.dataFetcher( r[ 3 ], offset + i + 3, input1, input2 );*/ } } Loading @@ -209,18 +214,19 @@ reduce( Operation& operation, #pragma omp single nowait { for( IndexType i = blocks * block_size; i < size; i++ ) operation.firstReduction( r[ 0 ], i, input1, input2 ); r[ 0 ] = reduction( r[ 0 ], dataFetcher( i ) ); //operation.dataFetcher( r[ 0 ], i, input1, input2 ); } // local reduction of unrolled results operation.commonReduction( r[ 0 ], r[ 2 ] ); operation.commonReduction( r[ 1 ], r[ 3 ] ); operation.commonReduction( r[ 0 ], r[ 1 ] ); r[ 0 ] = reduction( r[ 0 ], r[ 2 ] ); r[ 1 ] = reduction( r[ 1 ], r[ 3 ] ); r[ 0 ] = reduction( r[ 0 ], r[ 1 ] ); // inter-thread reduction of local results #pragma omp critical { operation.commonReduction( result, r[ 0 ] ); result = reduction( result, r[ 0 ] ); } } return result; Loading @@ -229,37 +235,43 @@ reduce( Operation& operation, #endif if( blocks > 1 ) { // initialize array for unrolled results ResultType r[ 4 ] = { operation.initialValue(), operation.initialValue(), operation.initialValue(), operation.initialValue() }; ResultType r[ 4 ] = { zero, zero, zero, zero }; // main reduction (explicitly unrolled loop) for( int b = 0; b < blocks; b++ ) { const IndexType offset = b * block_size; for( int i = 0; i < block_size; i += 4 ) { operation.firstReduction( r[ 0 ], offset + i, input1, input2 ); operation.firstReduction( r[ 1 ], offset + i + 1, input1, input2 ); operation.firstReduction( r[ 2 ], offset + i + 2, input1, input2 ); operation.firstReduction( r[ 3 ], offset + i + 3, input1, input2 ); r[ 0 ] = reduction( r[ 0 ], dataFetcher( offset + i ) ); r[ 1 ] = reduction( r[ 1 ], dataFetcher( offset + i + 1 ) ); r[ 2 ] = reduction( r[ 2 ], dataFetcher( offset + i + 2 ) ); r[ 3 ] = reduction( r[ 3 ], dataFetcher( offset + i + 3 ) ); /*operation.dataFetcher( r[ 0 ], offset + i, input1, input2 ); operation.dataFetcher( r[ 1 ], offset + i + 1, input1, input2 ); operation.dataFetcher( r[ 2 ], offset + i + 2, input1, input2 ); operation.dataFetcher( r[ 3 ], offset + i + 3, input1, input2 );*/ } } // reduction of the last, incomplete block (not unrolled) for( IndexType i = blocks * block_size; i < size; i++ ) operation.firstReduction( r[ 0 ], i, input1, input2 ); r[ 0 ] = reduction( r[ 0 ], dataFetcher( i ) ); //operation.dataFetcher( r[ 0 ], i, input1, input2 ); // reduction of unrolled results operation.commonReduction( r[ 0 ], r[ 2 ] ); operation.commonReduction( r[ 1 ], r[ 3 ] ); operation.commonReduction( r[ 0 ], r[ 1 ] ); r[ 0 ] = reduction( r[ 0 ], r[ 2 ] ); r[ 1 ] = reduction( r[ 1 ], r[ 3 ] ); r[ 0 ] = reduction( r[ 0 ], r[ 1 ] ); /*operation.reduction( r[ 0 ], r[ 2 ] ); operation.reduction( r[ 1 ], r[ 3 ] ); operation.reduction( r[ 0 ], r[ 1 ] );*/ return r[ 0 ]; } else { ResultType result = operation.initialValue(); ResultType result = zero; for( IndexType i = 0; i < size; i++ ) operation.firstReduction( result, i, input1, input2 ); result = reduction( result, dataFetcher( i ) ); return result; } #ifdef HAVE_OPENMP Loading
src/TNL/Containers/Algorithms/VectorOperationsHost_impl.h +18 −6 File changed.Preview size limit exceeded, changes collapsed. Show changes