Loading Documentation/Tutorials/ReductionAndScan/ReductionWithArgument.cpp +4 −4 Original line number Diff line number Diff line Loading @@ -8,13 +8,13 @@ using namespace TNL::Containers; using namespace TNL::Algorithms; template< typename Device > std::pair< int, double > std::pair< double, int > maximumNorm( const Vector< double, Device >& v ) { auto view = v.getConstView(); auto fetch = [=] __cuda_callable__ ( int i ) { return abs( view[ i ] ); }; auto reduction = [] __cuda_callable__ ( int& aIdx, const int& bIdx, double& a, const double& b ) { auto reduction = [] __cuda_callable__ ( double& a, const double& b, int& aIdx, const int& bIdx ) { if( a < b ) { a = b; aIdx = bIdx; Loading @@ -31,13 +31,13 @@ int main( int argc, char* argv[] ) host_v.evaluate( [] __cuda_callable__ ( int i )->double { return i - 7; } ); std::cout << "host_v = " << host_v << std::endl; auto maxNormHost = maximumNorm( host_v ); std::cout << "The maximum norm of the host vector elements is " << maxNormHost.second << " at position " << maxNormHost.first << "." << std::endl; std::cout << "The maximum norm of the host vector elements is " << maxNormHost.first << " at position " << maxNormHost.second << "." << std::endl; #ifdef HAVE_CUDA Vector< double, Devices::Cuda > cuda_v( 10 ); cuda_v.evaluate( [] __cuda_callable__ ( int i )->double { return i - 7; } ); std::cout << "cuda_v = " << cuda_v << std::endl; auto maxNormCuda = maximumNorm( cuda_v ); std::cout << "The maximum norm of the device vector elements is " << maxNormCuda.second << " at position " << maxNormCuda.first << "." << std::endl; std::cout << "The maximum norm of the device vector elements is " << maxNormCuda.first << " at position " << maxNormCuda.second << "." << std::endl; #endif return EXIT_SUCCESS; } Loading src/TNL/Algorithms/CudaReductionKernel.h +28 −28 Original line number Diff line number Diff line Loading @@ -166,19 +166,19 @@ CudaReductionWithArgumentKernel( const Result zero, sdata[ tid ] = zero; } while( gid + 4 * gridSize < size ) { reduction( sidx[ tid ], idxInput[ gid ], sdata[ tid ], dataFetcher( gid ) ); reduction( sidx[ tid ], idxInput[ gid + gridSize ], sdata[ tid ], dataFetcher( gid + gridSize ) ); reduction( sidx[ tid ], idxInput[ gid + 2 * gridSize ], sdata[ tid ], dataFetcher( gid + 2 * gridSize ) ); reduction( sidx[ tid ], idxInput[ gid + 3 * gridSize ], sdata[ tid ], dataFetcher( gid + 3 * gridSize ) ); reduction( sdata[ tid ], dataFetcher( gid ), sidx[ tid ], idxInput[ gid ] ); reduction( sdata[ tid ], dataFetcher( gid + gridSize ), sidx[ tid ], idxInput[ gid + gridSize ] ); reduction( sdata[ tid ], dataFetcher( gid + 2 * gridSize ), sidx[ tid ], idxInput[ gid + 2 * gridSize ] ); reduction( sdata[ tid ], dataFetcher( gid + 3 * gridSize ), sidx[ tid ], idxInput[ gid + 3 * gridSize ] ); gid += 4 * gridSize; } while( gid + 2 * gridSize < size ) { reduction( sidx[ tid ], idxInput[ gid ], sdata[ tid ], dataFetcher( gid ) ); reduction( sidx[ tid ], idxInput[ gid + gridSize ], sdata[ tid ], dataFetcher( gid + gridSize ) ); reduction( sdata[ tid ], dataFetcher( gid ), sidx[ tid ], idxInput[ gid ] ); reduction( sdata[ tid ], dataFetcher( gid + gridSize ), sidx[ tid ], idxInput[ gid + gridSize ] ); gid += 2 * gridSize; } while( gid < size ) { reduction( sidx[ tid ], idxInput[ gid ], sdata[ tid ], dataFetcher( gid ) ); reduction( sdata[ tid ], dataFetcher( gid ), sidx[ tid ], idxInput[ gid ] ); gid += gridSize; } } Loading @@ -191,19 +191,19 @@ CudaReductionWithArgumentKernel( const Result zero, sdata[ tid ] = zero; } while( gid + 4 * gridSize < size ) { reduction( sidx[ tid ], gid, sdata[ tid ], dataFetcher( gid ) ); reduction( sidx[ tid ], gid + gridSize, sdata[ tid ], dataFetcher( gid + gridSize ) ); reduction( sidx[ tid ], gid + 2 * gridSize, sdata[ tid ], dataFetcher( gid + 2 * gridSize ) ); reduction( sidx[ tid ], gid + 3 * gridSize, sdata[ tid ], dataFetcher( gid + 3 * gridSize ) ); reduction( sdata[ tid ], dataFetcher( gid ), sidx[ tid ], gid ); reduction( sdata[ tid ], dataFetcher( gid + gridSize ), sidx[ tid ], gid + gridSize ); reduction( sdata[ tid ], dataFetcher( gid + 2 * gridSize ), sidx[ tid ], gid + 2 * gridSize ); reduction( sdata[ tid ], dataFetcher( gid + 3 * gridSize ), sidx[ tid ], gid + 3 * gridSize ); gid += 4 * gridSize; } while( gid + 2 * gridSize < size ) { reduction( sidx[ tid ], gid, sdata[ tid ], dataFetcher( gid ) ); reduction( sidx[ tid ], gid + gridSize, sdata[ tid ], dataFetcher( gid + gridSize ) ); reduction( sdata[ tid ], dataFetcher( gid ), sidx[ tid ], gid ); reduction( sdata[ tid ], dataFetcher( gid + gridSize ), sidx[ tid ], gid + gridSize ); gid += 2 * gridSize; } while( gid < size ) { reduction( sidx[ tid ], gid, sdata[ tid ], dataFetcher( gid ) ); reduction( sdata[ tid ], dataFetcher( gid ), sidx[ tid ], gid ); gid += gridSize; } } Loading @@ -212,48 +212,48 @@ CudaReductionWithArgumentKernel( const Result zero, // Perform the parallel reduction. if( blockSize >= 1024 ) { if( tid < 512 ) reduction( sidx[ tid ], sidx[ tid + 512 ], sdata[ tid ], sdata[ tid + 512 ] ); reduction( sdata[ tid ], sdata[ tid + 512 ], sidx[ tid ], sidx[ tid + 512 ] ); __syncthreads(); } if( blockSize >= 512 ) { if( tid < 256 ) reduction( sidx[ tid ], sidx[ tid + 256 ], sdata[ tid ], sdata[ tid + 256 ] ); reduction( sdata[ tid ], sdata[ tid + 256 ], sidx[ tid ], sidx[ tid + 256 ] ); __syncthreads(); } if( blockSize >= 256 ) { if( tid < 128 ) reduction( sidx[ tid ], sidx[ tid + 128 ], sdata[ tid ], sdata[ tid + 128 ] ); reduction( sdata[ tid ], sdata[ tid + 128 ], sidx[ tid ], sidx[ tid + 128 ] ); __syncthreads(); } if( blockSize >= 128 ) { if( tid < 64 ) reduction( sidx[ tid ], sidx[ tid + 64 ], sdata[ tid ], sdata[ tid + 64 ] ); reduction( sdata[ tid ], sdata[ tid + 64 ], sidx[ tid ], sidx[ tid + 64 ] ); __syncthreads(); } // This runs in one warp so we use __syncwarp() instead of __syncthreads(). if( tid < 32 ) { if( blockSize >= 64 ) reduction( sidx[ tid ], sidx[ tid + 32 ], sdata[ tid ], sdata[ tid + 32 ] ); reduction( sdata[ tid ], sdata[ tid + 32 ], sidx[ tid ], sidx[ tid + 32 ] ); __syncwarp(); // Note that here we do not have to check if tid < 16 etc, because we have // 2 * blockSize.x elements of shared memory per block, so we do not // access out of bounds. The results for the upper half will be undefined, // but unused anyway. if( blockSize >= 32 ) reduction( sidx[ tid ], sidx[ tid + 16 ], sdata[ tid ], sdata[ tid + 16 ] ); reduction( sdata[ tid ], sdata[ tid + 16 ], sidx[ tid ], sidx[ tid + 16 ] ); __syncwarp(); if( blockSize >= 16 ) reduction( sidx[ tid ], sidx[ tid + 8 ], sdata[ tid ], sdata[ tid + 8 ] ); reduction( sdata[ tid ], sdata[ tid + 8 ], sidx[ tid ], sidx[ tid + 8 ] ); __syncwarp(); if( blockSize >= 8 ) reduction( sidx[ tid ], sidx[ tid + 4 ], sdata[ tid ], sdata[ tid + 4 ] ); reduction( sdata[ tid ], sdata[ tid + 4 ], sidx[ tid ], sidx[ tid + 4 ] ); __syncwarp(); if( blockSize >= 4 ) reduction( sidx[ tid ], sidx[ tid + 2 ], sdata[ tid ], sdata[ tid + 2 ] ); reduction( sdata[ tid ], sdata[ tid + 2 ], sidx[ tid ], sidx[ tid + 2 ] ); __syncwarp(); if( blockSize >= 2 ) reduction( sidx[ tid ], sidx[ tid + 1 ], sdata[ tid ], sdata[ tid + 1 ] ); reduction( sdata[ tid ], sdata[ tid + 1 ], sidx[ tid ], sidx[ tid + 1 ] ); } // Store the result back in the global memory. Loading Loading @@ -357,7 +357,7 @@ struct CudaReductionKernelLauncher } template< typename Reduction > std::pair< Index, Result > std::pair< Result, Index > finishWithArgument( const Reduction& reduction, const Result& zero ) { Loading @@ -384,9 +384,9 @@ struct CudaReductionKernelLauncher //// // Copy result on CPU std::pair< Index, Result > result; MultiDeviceMemoryOperations< void, Devices::Cuda >::copy( &result.first, idxOutput, 1 ); MultiDeviceMemoryOperations< void, Devices::Cuda >::copy( &result.second, output, 1 ); std::pair< Result, Index > result; MultiDeviceMemoryOperations< void, Devices::Cuda >::copy( &result.first, output, 1 ); MultiDeviceMemoryOperations< void, Devices::Cuda >::copy( &result.second, idxOutput, 1 ); return result; } Loading src/TNL/Algorithms/Reduction.h +5 −5 Original line number Diff line number Diff line Loading @@ -54,7 +54,7 @@ struct Reduction< Devices::Sequential > typename Result, typename ReductionOperation, typename DataFetcher > static constexpr std::pair< Index, Result > static constexpr std::pair< Result, Index > reduceWithArgument( const Index size, const ReductionOperation& reduction, DataFetcher& dataFetcher, Loading Loading @@ -138,7 +138,7 @@ struct Reduction< Devices::Host > * The reduction lambda function takes two variables which are supposed to be reduced: * * ``` * auto reduction = [] __cuda_callable__ ( Index& aIdx, const Index& bIdx, const Result& a, const Result& b ) { return ... }; * auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b, Index& aIdx, const Index& bIdx ) { return ... }; * ``` * * \par Example Loading @@ -153,7 +153,7 @@ struct Reduction< Devices::Host > typename Result, typename ReductionOperation, typename DataFetcher > static std::pair< Index, Result > static std::pair< Result, Index > reduceWithArgument( const Index size, const ReductionOperation& reduction, DataFetcher& dataFetcher, Loading Loading @@ -237,7 +237,7 @@ struct Reduction< Devices::Cuda > * The reduction lambda function takes two variables which are supposed to be reduced: * * ``` * auto reduction = [] __cuda_callable__ ( Index& aIdx, const Index& bIdx, const Result& a, const Result& b ) { return ... }; * auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b, Index& aIdx, const Index& bIdx ) { return ... }; * ``` * * \par Example Loading @@ -252,7 +252,7 @@ struct Reduction< Devices::Cuda > typename Result, typename ReductionOperation, typename DataFetcher > static std::pair< Index, Result > static std::pair< Result, Index > reduceWithArgument( const Index size, const ReductionOperation& reduction, DataFetcher& dataFetcher, Loading src/TNL/Algorithms/Reduction.hpp +28 −28 Original line number Diff line number Diff line Loading @@ -86,7 +86,7 @@ template< typename Index, typename Result, typename ReductionOperation, typename DataFetcher > constexpr std::pair< Index, Result > constexpr std::pair< Result, Index > Reduction< Devices::Sequential >:: reduceWithArgument( const Index size, const ReductionOperation& reduction, Loading Loading @@ -119,27 +119,27 @@ reduceWithArgument( const Index size, initialized = true; continue; } reduction( arg[ 0 ], offset + i, r[ 0 ], dataFetcher( offset + i ) ); reduction( arg[ 1 ], offset + i + 1, r[ 1 ], dataFetcher( offset + i + 1 ) ); reduction( arg[ 2 ], offset + i + 2, r[ 2 ], dataFetcher( offset + i + 2 ) ); reduction( arg[ 3 ], offset + i + 3, r[ 3 ], dataFetcher( offset + i + 3 ) ); reduction( r[ 0 ], dataFetcher( offset + i ), arg[ 0 ], offset + i ); reduction( r[ 1 ], dataFetcher( offset + i + 1 ), arg[ 1 ], offset + i + 1 ); reduction( r[ 2 ], dataFetcher( offset + i + 2 ), arg[ 2 ], offset + i + 2 ); reduction( r[ 3 ], dataFetcher( offset + i + 3 ), arg[ 3 ], offset + i + 3 ); } } // reduction of the last, incomplete block (not unrolled) for( Index i = blocks * block_size; i < size; i++ ) reduction( arg[ 0 ], i, r[ 0 ], dataFetcher( i ) ); reduction( r[ 0 ], dataFetcher( i ), arg[ 0 ], i ); // reduction of unrolled results reduction( arg[ 0 ], arg[ 2 ], r[ 0 ], r[ 2 ] ); reduction( arg[ 1 ], arg[ 3 ], r[ 1 ], r[ 3 ] ); reduction( arg[ 0 ], arg[ 1 ], r[ 0 ], r[ 1 ] ); return std::make_pair( arg[ 0 ], r[ 0 ] ); reduction( r[ 0 ], r[ 2 ], arg[ 0 ], arg[ 2 ] ); reduction( r[ 1 ], r[ 3 ], arg[ 1 ], arg[ 3 ] ); reduction( r[ 0 ], r[ 1 ], arg[ 0 ], arg[ 1 ] ); return std::make_pair( r[ 0 ], arg[ 0 ] ); } else { std::pair< Index, Result > result( 0, dataFetcher( 0 ) ); std::pair< Result, Index > result( dataFetcher( 0 ), 0 ); for( Index i = 1; i < size; i++ ) reduction( result.first, i, result.second, dataFetcher( i ) ); reduction( result.first, dataFetcher( i ), result.second, i ); return result; } } Loading Loading @@ -208,7 +208,7 @@ template< typename Index, typename Result, typename ReductionOperation, typename DataFetcher > std::pair< Index, Result > std::pair< Result, Index > Reduction< Devices::Host >:: reduceWithArgument( const Index size, const ReductionOperation& reduction, Loading @@ -221,7 +221,7 @@ reduceWithArgument( const Index size, if( Devices::Host::isOMPEnabled() && blocks >= 2 ) { // global result variable std::pair< Index, Result > result( -1, zero ); std::pair< Result, Index > result( zero, -1 ); const int threads = TNL::min( blocks, Devices::Host::getMaxThreadsCount() ); #pragma omp parallel num_threads(threads) { Loading @@ -246,10 +246,10 @@ reduceWithArgument( const Index size, initialized = true; continue; } reduction( arg[ 0 ], offset + i, r[ 0 ], dataFetcher( offset + i ) ); reduction( arg[ 1 ], offset + i + 1, r[ 1 ], dataFetcher( offset + i + 1 ) ); reduction( arg[ 2 ], offset + i + 2, r[ 2 ], dataFetcher( offset + i + 2 ) ); reduction( arg[ 3 ], offset + i + 3, r[ 3 ], dataFetcher( offset + i + 3 ) ); reduction( r[ 0 ], dataFetcher( offset + i ), arg[ 0 ], offset + i ); reduction( r[ 1 ], dataFetcher( offset + i + 1 ), arg[ 1 ], offset + i + 1 ); reduction( r[ 2 ], dataFetcher( offset + i + 2 ), arg[ 2 ], offset + i + 2 ); reduction( r[ 3 ], dataFetcher( offset + i + 3 ), arg[ 3 ], offset + i + 3 ); } } Loading @@ -257,20 +257,20 @@ reduceWithArgument( const Index size, #pragma omp single nowait { for( Index i = blocks * block_size; i < size; i++ ) reduction( arg[ 0 ], i, r[ 0 ], dataFetcher( i ) ); reduction( r[ 0 ], dataFetcher( i ), arg[ 0 ], i ); } // local reduction of unrolled results reduction( arg[ 0 ], arg[ 2 ], r[ 0 ], r[ 2 ] ); reduction( arg[ 1 ], arg[ 3 ], r[ 1 ], r[ 3 ] ); reduction( arg[ 0 ], arg[ 1 ], r[ 0 ], r[ 1 ] ); reduction( r[ 0 ], r[ 2 ], arg[ 0 ], arg[ 2 ] ); reduction( r[ 1 ], r[ 3 ], arg[ 1 ], arg[ 3 ] ); reduction( r[ 0 ], r[ 1 ], arg[ 0 ], arg[ 1 ] ); // inter-thread reduction of local results #pragma omp critical { if( result.first == -1 ) result.first = arg[ 0 ]; reduction( result.first, arg[ 0 ], result.second, r[ 0 ] ); if( result.second == -1 ) result.second = arg[ 0 ]; reduction( result.first, r[ 0 ], result.second, arg[ 0 ] ); } } return result; Loading Loading @@ -373,7 +373,7 @@ template< typename Index, typename Result, typename ReductionOperation, typename DataFetcher > std::pair< Index, Result > std::pair< Result, Index > Reduction< Devices::Cuda >:: reduceWithArgument( const Index size, const ReductionOperation& reduction, Loading Loading @@ -454,13 +454,13 @@ reduceWithArgument( const Index size, // auto fetch = [&] ( Index i ) { return resultArray[ i ]; }; // const Result result = Reduction< Devices::Sequential >::reduceWithArgument( reducedSize, argument, reduction, fetch, zero ); for( Index i = 1; i < reducedSize; i++ ) reduction( indexArray[ 0 ], indexArray[ i ], resultArray[ 0 ], resultArray[ i ] ); reduction( resultArray[ 0 ], resultArray[ i ], indexArray[ 0 ], indexArray[ i ] ); #ifdef CUDA_REDUCTION_PROFILING timer.stop(); std::cout << " Reduction of small data set on CPU took " << timer.getRealTime() << " sec. " << std::endl; #endif return std::make_pair( indexArray[ 0 ], resultArray[ 0 ] ); return std::make_pair( resultArray[ 0 ], indexArray[ 0 ] ); } else { // data can't be safely reduced on host, so continue with the reduction on the GPU Loading src/TNL/Containers/Expressions/DistributedVerticalOperations.h +2 −2 Original line number Diff line number Diff line Loading @@ -35,7 +35,7 @@ auto DistributedExpressionMin( const Expression& expression ) -> std::decay_t< d template< typename Expression > auto DistributedExpressionArgMin( const Expression& expression ) -> std::pair< typename Expression::IndexType, std::decay_t< decltype( expression[0] ) > > -> std::pair< std::decay_t< decltype( expression[0] ) >, typename Expression::IndexType > { using RealType = std::decay_t< decltype( expression[0] ) >; using IndexType = typename Expression::IndexType; Loading Loading @@ -92,7 +92,7 @@ auto DistributedExpressionMax( const Expression& expression ) -> std::decay_t< d template< typename Expression > auto DistributedExpressionArgMax( const Expression& expression ) -> std::pair< typename Expression::IndexType, std::decay_t< decltype( expression[0] ) > > -> std::pair< std::decay_t< decltype( expression[0] ) >, typename Expression::IndexType > { using RealType = std::decay_t< decltype( expression[0] ) >; using IndexType = typename Expression::IndexType; Loading Loading
Documentation/Tutorials/ReductionAndScan/ReductionWithArgument.cpp +4 −4 Original line number Diff line number Diff line Loading @@ -8,13 +8,13 @@ using namespace TNL::Containers; using namespace TNL::Algorithms; template< typename Device > std::pair< int, double > std::pair< double, int > maximumNorm( const Vector< double, Device >& v ) { auto view = v.getConstView(); auto fetch = [=] __cuda_callable__ ( int i ) { return abs( view[ i ] ); }; auto reduction = [] __cuda_callable__ ( int& aIdx, const int& bIdx, double& a, const double& b ) { auto reduction = [] __cuda_callable__ ( double& a, const double& b, int& aIdx, const int& bIdx ) { if( a < b ) { a = b; aIdx = bIdx; Loading @@ -31,13 +31,13 @@ int main( int argc, char* argv[] ) host_v.evaluate( [] __cuda_callable__ ( int i )->double { return i - 7; } ); std::cout << "host_v = " << host_v << std::endl; auto maxNormHost = maximumNorm( host_v ); std::cout << "The maximum norm of the host vector elements is " << maxNormHost.second << " at position " << maxNormHost.first << "." << std::endl; std::cout << "The maximum norm of the host vector elements is " << maxNormHost.first << " at position " << maxNormHost.second << "." << std::endl; #ifdef HAVE_CUDA Vector< double, Devices::Cuda > cuda_v( 10 ); cuda_v.evaluate( [] __cuda_callable__ ( int i )->double { return i - 7; } ); std::cout << "cuda_v = " << cuda_v << std::endl; auto maxNormCuda = maximumNorm( cuda_v ); std::cout << "The maximum norm of the device vector elements is " << maxNormCuda.second << " at position " << maxNormCuda.first << "." << std::endl; std::cout << "The maximum norm of the device vector elements is " << maxNormCuda.first << " at position " << maxNormCuda.second << "." << std::endl; #endif return EXIT_SUCCESS; } Loading
src/TNL/Algorithms/CudaReductionKernel.h +28 −28 Original line number Diff line number Diff line Loading @@ -166,19 +166,19 @@ CudaReductionWithArgumentKernel( const Result zero, sdata[ tid ] = zero; } while( gid + 4 * gridSize < size ) { reduction( sidx[ tid ], idxInput[ gid ], sdata[ tid ], dataFetcher( gid ) ); reduction( sidx[ tid ], idxInput[ gid + gridSize ], sdata[ tid ], dataFetcher( gid + gridSize ) ); reduction( sidx[ tid ], idxInput[ gid + 2 * gridSize ], sdata[ tid ], dataFetcher( gid + 2 * gridSize ) ); reduction( sidx[ tid ], idxInput[ gid + 3 * gridSize ], sdata[ tid ], dataFetcher( gid + 3 * gridSize ) ); reduction( sdata[ tid ], dataFetcher( gid ), sidx[ tid ], idxInput[ gid ] ); reduction( sdata[ tid ], dataFetcher( gid + gridSize ), sidx[ tid ], idxInput[ gid + gridSize ] ); reduction( sdata[ tid ], dataFetcher( gid + 2 * gridSize ), sidx[ tid ], idxInput[ gid + 2 * gridSize ] ); reduction( sdata[ tid ], dataFetcher( gid + 3 * gridSize ), sidx[ tid ], idxInput[ gid + 3 * gridSize ] ); gid += 4 * gridSize; } while( gid + 2 * gridSize < size ) { reduction( sidx[ tid ], idxInput[ gid ], sdata[ tid ], dataFetcher( gid ) ); reduction( sidx[ tid ], idxInput[ gid + gridSize ], sdata[ tid ], dataFetcher( gid + gridSize ) ); reduction( sdata[ tid ], dataFetcher( gid ), sidx[ tid ], idxInput[ gid ] ); reduction( sdata[ tid ], dataFetcher( gid + gridSize ), sidx[ tid ], idxInput[ gid + gridSize ] ); gid += 2 * gridSize; } while( gid < size ) { reduction( sidx[ tid ], idxInput[ gid ], sdata[ tid ], dataFetcher( gid ) ); reduction( sdata[ tid ], dataFetcher( gid ), sidx[ tid ], idxInput[ gid ] ); gid += gridSize; } } Loading @@ -191,19 +191,19 @@ CudaReductionWithArgumentKernel( const Result zero, sdata[ tid ] = zero; } while( gid + 4 * gridSize < size ) { reduction( sidx[ tid ], gid, sdata[ tid ], dataFetcher( gid ) ); reduction( sidx[ tid ], gid + gridSize, sdata[ tid ], dataFetcher( gid + gridSize ) ); reduction( sidx[ tid ], gid + 2 * gridSize, sdata[ tid ], dataFetcher( gid + 2 * gridSize ) ); reduction( sidx[ tid ], gid + 3 * gridSize, sdata[ tid ], dataFetcher( gid + 3 * gridSize ) ); reduction( sdata[ tid ], dataFetcher( gid ), sidx[ tid ], gid ); reduction( sdata[ tid ], dataFetcher( gid + gridSize ), sidx[ tid ], gid + gridSize ); reduction( sdata[ tid ], dataFetcher( gid + 2 * gridSize ), sidx[ tid ], gid + 2 * gridSize ); reduction( sdata[ tid ], dataFetcher( gid + 3 * gridSize ), sidx[ tid ], gid + 3 * gridSize ); gid += 4 * gridSize; } while( gid + 2 * gridSize < size ) { reduction( sidx[ tid ], gid, sdata[ tid ], dataFetcher( gid ) ); reduction( sidx[ tid ], gid + gridSize, sdata[ tid ], dataFetcher( gid + gridSize ) ); reduction( sdata[ tid ], dataFetcher( gid ), sidx[ tid ], gid ); reduction( sdata[ tid ], dataFetcher( gid + gridSize ), sidx[ tid ], gid + gridSize ); gid += 2 * gridSize; } while( gid < size ) { reduction( sidx[ tid ], gid, sdata[ tid ], dataFetcher( gid ) ); reduction( sdata[ tid ], dataFetcher( gid ), sidx[ tid ], gid ); gid += gridSize; } } Loading @@ -212,48 +212,48 @@ CudaReductionWithArgumentKernel( const Result zero, // Perform the parallel reduction. if( blockSize >= 1024 ) { if( tid < 512 ) reduction( sidx[ tid ], sidx[ tid + 512 ], sdata[ tid ], sdata[ tid + 512 ] ); reduction( sdata[ tid ], sdata[ tid + 512 ], sidx[ tid ], sidx[ tid + 512 ] ); __syncthreads(); } if( blockSize >= 512 ) { if( tid < 256 ) reduction( sidx[ tid ], sidx[ tid + 256 ], sdata[ tid ], sdata[ tid + 256 ] ); reduction( sdata[ tid ], sdata[ tid + 256 ], sidx[ tid ], sidx[ tid + 256 ] ); __syncthreads(); } if( blockSize >= 256 ) { if( tid < 128 ) reduction( sidx[ tid ], sidx[ tid + 128 ], sdata[ tid ], sdata[ tid + 128 ] ); reduction( sdata[ tid ], sdata[ tid + 128 ], sidx[ tid ], sidx[ tid + 128 ] ); __syncthreads(); } if( blockSize >= 128 ) { if( tid < 64 ) reduction( sidx[ tid ], sidx[ tid + 64 ], sdata[ tid ], sdata[ tid + 64 ] ); reduction( sdata[ tid ], sdata[ tid + 64 ], sidx[ tid ], sidx[ tid + 64 ] ); __syncthreads(); } // This runs in one warp so we use __syncwarp() instead of __syncthreads(). if( tid < 32 ) { if( blockSize >= 64 ) reduction( sidx[ tid ], sidx[ tid + 32 ], sdata[ tid ], sdata[ tid + 32 ] ); reduction( sdata[ tid ], sdata[ tid + 32 ], sidx[ tid ], sidx[ tid + 32 ] ); __syncwarp(); // Note that here we do not have to check if tid < 16 etc, because we have // 2 * blockSize.x elements of shared memory per block, so we do not // access out of bounds. The results for the upper half will be undefined, // but unused anyway. if( blockSize >= 32 ) reduction( sidx[ tid ], sidx[ tid + 16 ], sdata[ tid ], sdata[ tid + 16 ] ); reduction( sdata[ tid ], sdata[ tid + 16 ], sidx[ tid ], sidx[ tid + 16 ] ); __syncwarp(); if( blockSize >= 16 ) reduction( sidx[ tid ], sidx[ tid + 8 ], sdata[ tid ], sdata[ tid + 8 ] ); reduction( sdata[ tid ], sdata[ tid + 8 ], sidx[ tid ], sidx[ tid + 8 ] ); __syncwarp(); if( blockSize >= 8 ) reduction( sidx[ tid ], sidx[ tid + 4 ], sdata[ tid ], sdata[ tid + 4 ] ); reduction( sdata[ tid ], sdata[ tid + 4 ], sidx[ tid ], sidx[ tid + 4 ] ); __syncwarp(); if( blockSize >= 4 ) reduction( sidx[ tid ], sidx[ tid + 2 ], sdata[ tid ], sdata[ tid + 2 ] ); reduction( sdata[ tid ], sdata[ tid + 2 ], sidx[ tid ], sidx[ tid + 2 ] ); __syncwarp(); if( blockSize >= 2 ) reduction( sidx[ tid ], sidx[ tid + 1 ], sdata[ tid ], sdata[ tid + 1 ] ); reduction( sdata[ tid ], sdata[ tid + 1 ], sidx[ tid ], sidx[ tid + 1 ] ); } // Store the result back in the global memory. Loading Loading @@ -357,7 +357,7 @@ struct CudaReductionKernelLauncher } template< typename Reduction > std::pair< Index, Result > std::pair< Result, Index > finishWithArgument( const Reduction& reduction, const Result& zero ) { Loading @@ -384,9 +384,9 @@ struct CudaReductionKernelLauncher //// // Copy result on CPU std::pair< Index, Result > result; MultiDeviceMemoryOperations< void, Devices::Cuda >::copy( &result.first, idxOutput, 1 ); MultiDeviceMemoryOperations< void, Devices::Cuda >::copy( &result.second, output, 1 ); std::pair< Result, Index > result; MultiDeviceMemoryOperations< void, Devices::Cuda >::copy( &result.first, output, 1 ); MultiDeviceMemoryOperations< void, Devices::Cuda >::copy( &result.second, idxOutput, 1 ); return result; } Loading
src/TNL/Algorithms/Reduction.h +5 −5 Original line number Diff line number Diff line Loading @@ -54,7 +54,7 @@ struct Reduction< Devices::Sequential > typename Result, typename ReductionOperation, typename DataFetcher > static constexpr std::pair< Index, Result > static constexpr std::pair< Result, Index > reduceWithArgument( const Index size, const ReductionOperation& reduction, DataFetcher& dataFetcher, Loading Loading @@ -138,7 +138,7 @@ struct Reduction< Devices::Host > * The reduction lambda function takes two variables which are supposed to be reduced: * * ``` * auto reduction = [] __cuda_callable__ ( Index& aIdx, const Index& bIdx, const Result& a, const Result& b ) { return ... }; * auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b, Index& aIdx, const Index& bIdx ) { return ... }; * ``` * * \par Example Loading @@ -153,7 +153,7 @@ struct Reduction< Devices::Host > typename Result, typename ReductionOperation, typename DataFetcher > static std::pair< Index, Result > static std::pair< Result, Index > reduceWithArgument( const Index size, const ReductionOperation& reduction, DataFetcher& dataFetcher, Loading Loading @@ -237,7 +237,7 @@ struct Reduction< Devices::Cuda > * The reduction lambda function takes two variables which are supposed to be reduced: * * ``` * auto reduction = [] __cuda_callable__ ( Index& aIdx, const Index& bIdx, const Result& a, const Result& b ) { return ... }; * auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b, Index& aIdx, const Index& bIdx ) { return ... }; * ``` * * \par Example Loading @@ -252,7 +252,7 @@ struct Reduction< Devices::Cuda > typename Result, typename ReductionOperation, typename DataFetcher > static std::pair< Index, Result > static std::pair< Result, Index > reduceWithArgument( const Index size, const ReductionOperation& reduction, DataFetcher& dataFetcher, Loading
src/TNL/Algorithms/Reduction.hpp +28 −28 Original line number Diff line number Diff line Loading @@ -86,7 +86,7 @@ template< typename Index, typename Result, typename ReductionOperation, typename DataFetcher > constexpr std::pair< Index, Result > constexpr std::pair< Result, Index > Reduction< Devices::Sequential >:: reduceWithArgument( const Index size, const ReductionOperation& reduction, Loading Loading @@ -119,27 +119,27 @@ reduceWithArgument( const Index size, initialized = true; continue; } reduction( arg[ 0 ], offset + i, r[ 0 ], dataFetcher( offset + i ) ); reduction( arg[ 1 ], offset + i + 1, r[ 1 ], dataFetcher( offset + i + 1 ) ); reduction( arg[ 2 ], offset + i + 2, r[ 2 ], dataFetcher( offset + i + 2 ) ); reduction( arg[ 3 ], offset + i + 3, r[ 3 ], dataFetcher( offset + i + 3 ) ); reduction( r[ 0 ], dataFetcher( offset + i ), arg[ 0 ], offset + i ); reduction( r[ 1 ], dataFetcher( offset + i + 1 ), arg[ 1 ], offset + i + 1 ); reduction( r[ 2 ], dataFetcher( offset + i + 2 ), arg[ 2 ], offset + i + 2 ); reduction( r[ 3 ], dataFetcher( offset + i + 3 ), arg[ 3 ], offset + i + 3 ); } } // reduction of the last, incomplete block (not unrolled) for( Index i = blocks * block_size; i < size; i++ ) reduction( arg[ 0 ], i, r[ 0 ], dataFetcher( i ) ); reduction( r[ 0 ], dataFetcher( i ), arg[ 0 ], i ); // reduction of unrolled results reduction( arg[ 0 ], arg[ 2 ], r[ 0 ], r[ 2 ] ); reduction( arg[ 1 ], arg[ 3 ], r[ 1 ], r[ 3 ] ); reduction( arg[ 0 ], arg[ 1 ], r[ 0 ], r[ 1 ] ); return std::make_pair( arg[ 0 ], r[ 0 ] ); reduction( r[ 0 ], r[ 2 ], arg[ 0 ], arg[ 2 ] ); reduction( r[ 1 ], r[ 3 ], arg[ 1 ], arg[ 3 ] ); reduction( r[ 0 ], r[ 1 ], arg[ 0 ], arg[ 1 ] ); return std::make_pair( r[ 0 ], arg[ 0 ] ); } else { std::pair< Index, Result > result( 0, dataFetcher( 0 ) ); std::pair< Result, Index > result( dataFetcher( 0 ), 0 ); for( Index i = 1; i < size; i++ ) reduction( result.first, i, result.second, dataFetcher( i ) ); reduction( result.first, dataFetcher( i ), result.second, i ); return result; } } Loading Loading @@ -208,7 +208,7 @@ template< typename Index, typename Result, typename ReductionOperation, typename DataFetcher > std::pair< Index, Result > std::pair< Result, Index > Reduction< Devices::Host >:: reduceWithArgument( const Index size, const ReductionOperation& reduction, Loading @@ -221,7 +221,7 @@ reduceWithArgument( const Index size, if( Devices::Host::isOMPEnabled() && blocks >= 2 ) { // global result variable std::pair< Index, Result > result( -1, zero ); std::pair< Result, Index > result( zero, -1 ); const int threads = TNL::min( blocks, Devices::Host::getMaxThreadsCount() ); #pragma omp parallel num_threads(threads) { Loading @@ -246,10 +246,10 @@ reduceWithArgument( const Index size, initialized = true; continue; } reduction( arg[ 0 ], offset + i, r[ 0 ], dataFetcher( offset + i ) ); reduction( arg[ 1 ], offset + i + 1, r[ 1 ], dataFetcher( offset + i + 1 ) ); reduction( arg[ 2 ], offset + i + 2, r[ 2 ], dataFetcher( offset + i + 2 ) ); reduction( arg[ 3 ], offset + i + 3, r[ 3 ], dataFetcher( offset + i + 3 ) ); reduction( r[ 0 ], dataFetcher( offset + i ), arg[ 0 ], offset + i ); reduction( r[ 1 ], dataFetcher( offset + i + 1 ), arg[ 1 ], offset + i + 1 ); reduction( r[ 2 ], dataFetcher( offset + i + 2 ), arg[ 2 ], offset + i + 2 ); reduction( r[ 3 ], dataFetcher( offset + i + 3 ), arg[ 3 ], offset + i + 3 ); } } Loading @@ -257,20 +257,20 @@ reduceWithArgument( const Index size, #pragma omp single nowait { for( Index i = blocks * block_size; i < size; i++ ) reduction( arg[ 0 ], i, r[ 0 ], dataFetcher( i ) ); reduction( r[ 0 ], dataFetcher( i ), arg[ 0 ], i ); } // local reduction of unrolled results reduction( arg[ 0 ], arg[ 2 ], r[ 0 ], r[ 2 ] ); reduction( arg[ 1 ], arg[ 3 ], r[ 1 ], r[ 3 ] ); reduction( arg[ 0 ], arg[ 1 ], r[ 0 ], r[ 1 ] ); reduction( r[ 0 ], r[ 2 ], arg[ 0 ], arg[ 2 ] ); reduction( r[ 1 ], r[ 3 ], arg[ 1 ], arg[ 3 ] ); reduction( r[ 0 ], r[ 1 ], arg[ 0 ], arg[ 1 ] ); // inter-thread reduction of local results #pragma omp critical { if( result.first == -1 ) result.first = arg[ 0 ]; reduction( result.first, arg[ 0 ], result.second, r[ 0 ] ); if( result.second == -1 ) result.second = arg[ 0 ]; reduction( result.first, r[ 0 ], result.second, arg[ 0 ] ); } } return result; Loading Loading @@ -373,7 +373,7 @@ template< typename Index, typename Result, typename ReductionOperation, typename DataFetcher > std::pair< Index, Result > std::pair< Result, Index > Reduction< Devices::Cuda >:: reduceWithArgument( const Index size, const ReductionOperation& reduction, Loading Loading @@ -454,13 +454,13 @@ reduceWithArgument( const Index size, // auto fetch = [&] ( Index i ) { return resultArray[ i ]; }; // const Result result = Reduction< Devices::Sequential >::reduceWithArgument( reducedSize, argument, reduction, fetch, zero ); for( Index i = 1; i < reducedSize; i++ ) reduction( indexArray[ 0 ], indexArray[ i ], resultArray[ 0 ], resultArray[ i ] ); reduction( resultArray[ 0 ], resultArray[ i ], indexArray[ 0 ], indexArray[ i ] ); #ifdef CUDA_REDUCTION_PROFILING timer.stop(); std::cout << " Reduction of small data set on CPU took " << timer.getRealTime() << " sec. " << std::endl; #endif return std::make_pair( indexArray[ 0 ], resultArray[ 0 ] ); return std::make_pair( resultArray[ 0 ], indexArray[ 0 ] ); } else { // data can't be safely reduced on host, so continue with the reduction on the GPU Loading
src/TNL/Containers/Expressions/DistributedVerticalOperations.h +2 −2 Original line number Diff line number Diff line Loading @@ -35,7 +35,7 @@ auto DistributedExpressionMin( const Expression& expression ) -> std::decay_t< d template< typename Expression > auto DistributedExpressionArgMin( const Expression& expression ) -> std::pair< typename Expression::IndexType, std::decay_t< decltype( expression[0] ) > > -> std::pair< std::decay_t< decltype( expression[0] ) >, typename Expression::IndexType > { using RealType = std::decay_t< decltype( expression[0] ) >; using IndexType = typename Expression::IndexType; Loading Loading @@ -92,7 +92,7 @@ auto DistributedExpressionMax( const Expression& expression ) -> std::decay_t< d template< typename Expression > auto DistributedExpressionArgMax( const Expression& expression ) -> std::pair< typename Expression::IndexType, std::decay_t< decltype( expression[0] ) > > -> std::pair< std::decay_t< decltype( expression[0] ) >, typename Expression::IndexType > { using RealType = std::decay_t< decltype( expression[0] ) >; using IndexType = typename Expression::IndexType; Loading