Commit 58fc7ce1 authored by Tomáš Oberhuber's avatar Tomáš Oberhuber
Browse files

Swaping parameters of tuple returned by reductionWithArgument. It makes better...

Swaping parameters of tuple returned by reductionWithArgument. It makes better sence for ordering of tuples.
parent 84db6147
Loading
Loading
Loading
Loading
+4 −4
Original line number Diff line number Diff line
@@ -8,13 +8,13 @@ using namespace TNL::Containers;
using namespace TNL::Algorithms;

template< typename Device >
std::pair< int, double >
std::pair< double, int >
maximumNorm( const Vector< double, Device >& v )
{
   auto view = v.getConstView();

   auto fetch = [=] __cuda_callable__ ( int i ) { return abs( view[ i ] ); };
   auto reduction = [] __cuda_callable__ ( int& aIdx, const int& bIdx, double& a, const double& b ) {
   auto reduction = [] __cuda_callable__ ( double& a, const double& b, int& aIdx, const int& bIdx ) {
      if( a < b ) {
         a = b;
         aIdx = bIdx;
@@ -31,13 +31,13 @@ int main( int argc, char* argv[] )
   host_v.evaluate( [] __cuda_callable__ ( int i )->double { return i - 7; } );
   std::cout << "host_v = " << host_v << std::endl;
   auto maxNormHost = maximumNorm( host_v );
   std::cout << "The maximum norm of the host vector elements is " <<  maxNormHost.second << " at position " << maxNormHost.first << "." << std::endl;
   std::cout << "The maximum norm of the host vector elements is " <<  maxNormHost.first << " at position " << maxNormHost.second << "." << std::endl;
#ifdef HAVE_CUDA
   Vector< double, Devices::Cuda > cuda_v( 10 );
   cuda_v.evaluate( [] __cuda_callable__ ( int i )->double { return i - 7; } );
   std::cout << "cuda_v = " << cuda_v << std::endl;
   auto maxNormCuda = maximumNorm( cuda_v );
   std::cout << "The maximum norm of the device vector elements is " <<  maxNormCuda.second << " at position " << maxNormCuda.first << "." << std::endl;
   std::cout << "The maximum norm of the device vector elements is " <<  maxNormCuda.first << " at position " << maxNormCuda.second << "." << std::endl;
#endif
   return EXIT_SUCCESS;
}
+28 −28
Original line number Diff line number Diff line
@@ -166,19 +166,19 @@ CudaReductionWithArgumentKernel( const Result zero,
         sdata[ tid ] = zero;
      }
      while( gid + 4 * gridSize < size ) {
         reduction( sidx[ tid ], idxInput[ gid ], sdata[ tid ], dataFetcher( gid ) );
         reduction( sidx[ tid ], idxInput[ gid + gridSize ], sdata[ tid ], dataFetcher( gid + gridSize ) );
         reduction( sidx[ tid ], idxInput[ gid + 2 * gridSize ], sdata[ tid ], dataFetcher( gid + 2 * gridSize ) );
         reduction( sidx[ tid ], idxInput[ gid + 3 * gridSize ], sdata[ tid ], dataFetcher( gid + 3 * gridSize ) );
         reduction( sdata[ tid ], dataFetcher( gid ), sidx[ tid ], idxInput[ gid ] );
         reduction( sdata[ tid ], dataFetcher( gid + gridSize ), sidx[ tid ], idxInput[ gid + gridSize ] );
         reduction( sdata[ tid ], dataFetcher( gid + 2 * gridSize ), sidx[ tid ], idxInput[ gid + 2 * gridSize ] );
         reduction( sdata[ tid ], dataFetcher( gid + 3 * gridSize ), sidx[ tid ], idxInput[ gid + 3 * gridSize ] );
         gid += 4 * gridSize;
      }
      while( gid + 2 * gridSize < size ) {
         reduction( sidx[ tid ], idxInput[ gid ], sdata[ tid ], dataFetcher( gid ) );
         reduction( sidx[ tid ], idxInput[ gid + gridSize ], sdata[ tid ], dataFetcher( gid + gridSize ) );
         reduction( sdata[ tid ], dataFetcher( gid ), sidx[ tid ], idxInput[ gid ] );
         reduction( sdata[ tid ], dataFetcher( gid + gridSize ), sidx[ tid ], idxInput[ gid + gridSize ] );
         gid += 2 * gridSize;
      }
      while( gid < size ) {
         reduction( sidx[ tid ], idxInput[ gid ], sdata[ tid ], dataFetcher( gid ) );
         reduction( sdata[ tid ], dataFetcher( gid ), sidx[ tid ], idxInput[ gid ] );
         gid += gridSize;
      }
   }
@@ -191,19 +191,19 @@ CudaReductionWithArgumentKernel( const Result zero,
         sdata[ tid ] = zero;
      }
      while( gid + 4 * gridSize < size ) {
         reduction( sidx[ tid ], gid, sdata[ tid ], dataFetcher( gid ) );
         reduction( sidx[ tid ], gid + gridSize, sdata[ tid ], dataFetcher( gid + gridSize ) );
         reduction( sidx[ tid ], gid + 2 * gridSize, sdata[ tid ], dataFetcher( gid + 2 * gridSize ) );
         reduction( sidx[ tid ], gid + 3 * gridSize, sdata[ tid ], dataFetcher( gid + 3 * gridSize ) );
         reduction( sdata[ tid ], dataFetcher( gid ), sidx[ tid ], gid );
         reduction( sdata[ tid ], dataFetcher( gid + gridSize ), sidx[ tid ], gid + gridSize );
         reduction( sdata[ tid ], dataFetcher( gid + 2 * gridSize ), sidx[ tid ], gid + 2 * gridSize );
         reduction( sdata[ tid ], dataFetcher( gid + 3 * gridSize ), sidx[ tid ], gid + 3 * gridSize );
         gid += 4 * gridSize;
      }
      while( gid + 2 * gridSize < size ) {
         reduction( sidx[ tid ], gid, sdata[ tid ], dataFetcher( gid ) );
         reduction( sidx[ tid ], gid + gridSize, sdata[ tid ], dataFetcher( gid + gridSize ) );
         reduction( sdata[ tid ], dataFetcher( gid ), sidx[ tid ], gid );
         reduction( sdata[ tid ], dataFetcher( gid + gridSize ), sidx[ tid ], gid + gridSize );
         gid += 2 * gridSize;
      }
      while( gid < size ) {
         reduction( sidx[ tid ], gid, sdata[ tid ], dataFetcher( gid ) );
         reduction( sdata[ tid ], dataFetcher( gid ), sidx[ tid ], gid );
         gid += gridSize;
      }
   }
@@ -212,48 +212,48 @@ CudaReductionWithArgumentKernel( const Result zero,
   // Perform the parallel reduction.
   if( blockSize >= 1024 ) {
      if( tid < 512 )
         reduction( sidx[ tid ], sidx[ tid + 512 ], sdata[ tid ], sdata[ tid + 512 ] );
         reduction( sdata[ tid ], sdata[ tid + 512 ], sidx[ tid ], sidx[ tid + 512 ] );
      __syncthreads();
   }
   if( blockSize >= 512 ) {
      if( tid < 256 )
         reduction( sidx[ tid ], sidx[ tid + 256 ], sdata[ tid ], sdata[ tid + 256 ] );
         reduction( sdata[ tid ], sdata[ tid + 256 ], sidx[ tid ], sidx[ tid + 256 ] );
      __syncthreads();
   }
   if( blockSize >= 256 ) {
      if( tid < 128 )
         reduction( sidx[ tid ], sidx[ tid + 128 ], sdata[ tid ], sdata[ tid + 128 ] );
         reduction( sdata[ tid ], sdata[ tid + 128 ], sidx[ tid ], sidx[ tid + 128 ] );
      __syncthreads();
   }
   if( blockSize >= 128 ) {
      if( tid <  64 )
         reduction( sidx[ tid ], sidx[ tid + 64 ], sdata[ tid ], sdata[ tid + 64 ] );
         reduction( sdata[ tid ], sdata[ tid + 64 ], sidx[ tid ], sidx[ tid + 64 ] );
      __syncthreads();
   }

   // This runs in one warp so we use __syncwarp() instead of __syncthreads().
   if( tid < 32 ) {
      if( blockSize >= 64 )
         reduction( sidx[ tid ], sidx[ tid + 32 ], sdata[ tid ], sdata[ tid + 32 ] );
         reduction( sdata[ tid ], sdata[ tid + 32 ], sidx[ tid ], sidx[ tid + 32 ] );
      __syncwarp();
      // Note that here we do not have to check if tid < 16 etc, because we have
      // 2 * blockSize.x elements of shared memory per block, so we do not
      // access out of bounds. The results for the upper half will be undefined,
      // but unused anyway.
      if( blockSize >= 32 )
         reduction( sidx[ tid ], sidx[ tid + 16 ], sdata[ tid ], sdata[ tid + 16 ] );
         reduction( sdata[ tid ], sdata[ tid + 16 ], sidx[ tid ], sidx[ tid + 16 ] );
      __syncwarp();
      if( blockSize >= 16 )
         reduction( sidx[ tid ], sidx[ tid + 8 ], sdata[ tid ], sdata[ tid + 8 ] );
         reduction( sdata[ tid ], sdata[ tid + 8 ], sidx[ tid ], sidx[ tid + 8 ] );
      __syncwarp();
      if( blockSize >=  8 )
         reduction( sidx[ tid ], sidx[ tid + 4 ], sdata[ tid ], sdata[ tid + 4 ] );
         reduction( sdata[ tid ], sdata[ tid + 4 ], sidx[ tid ], sidx[ tid + 4 ] );
      __syncwarp();
      if( blockSize >=  4 )
         reduction( sidx[ tid ], sidx[ tid + 2 ], sdata[ tid ], sdata[ tid + 2 ] );
         reduction( sdata[ tid ], sdata[ tid + 2 ], sidx[ tid ], sidx[ tid + 2 ] );
      __syncwarp();
      if( blockSize >=  2 )
         reduction( sidx[ tid ], sidx[ tid + 1 ], sdata[ tid ], sdata[ tid + 1 ] );
         reduction( sdata[ tid ], sdata[ tid + 1 ], sidx[ tid ], sidx[ tid + 1 ] );
   }

   // Store the result back in the global memory.
@@ -357,7 +357,7 @@ struct CudaReductionKernelLauncher
   }

   template< typename Reduction >
   std::pair< Index, Result >
   std::pair< Result, Index >
   finishWithArgument( const Reduction& reduction,
                       const Result& zero )
   {
@@ -384,9 +384,9 @@ struct CudaReductionKernelLauncher

      ////
      // Copy result on CPU
      std::pair< Index, Result > result;
      MultiDeviceMemoryOperations< void, Devices::Cuda >::copy( &result.first, idxOutput, 1 );
      MultiDeviceMemoryOperations< void, Devices::Cuda >::copy( &result.second, output, 1 );
      std::pair< Result, Index > result;
      MultiDeviceMemoryOperations< void, Devices::Cuda >::copy( &result.first, output, 1 );
      MultiDeviceMemoryOperations< void, Devices::Cuda >::copy( &result.second, idxOutput, 1 );
      return result;
   }

+5 −5
Original line number Diff line number Diff line
@@ -54,7 +54,7 @@ struct Reduction< Devices::Sequential >
             typename Result,
             typename ReductionOperation,
             typename DataFetcher >
   static constexpr std::pair< Index, Result >
   static constexpr std::pair< Result, Index >
   reduceWithArgument( const Index size,
                       const ReductionOperation& reduction,
                       DataFetcher& dataFetcher,
@@ -138,7 +138,7 @@ struct Reduction< Devices::Host >
    * The reduction lambda function takes two variables which are supposed to be reduced:
    * 
    * ```
    * auto reduction = [] __cuda_callable__ ( Index& aIdx, const Index& bIdx, const Result& a, const Result& b ) { return ... };
    * auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b, Index& aIdx, const Index& bIdx ) { return ... };
    * ```
    * 
    * \par Example
@@ -153,7 +153,7 @@ struct Reduction< Devices::Host >
             typename Result,
             typename ReductionOperation,
             typename DataFetcher >
   static std::pair< Index, Result >
   static std::pair< Result, Index >
   reduceWithArgument( const Index size,
                       const ReductionOperation& reduction,
                       DataFetcher& dataFetcher,
@@ -237,7 +237,7 @@ struct Reduction< Devices::Cuda >
    * The reduction lambda function takes two variables which are supposed to be reduced:
    *
    * ```
    * auto reduction = [] __cuda_callable__ ( Index& aIdx, const Index& bIdx, const Result& a, const Result& b ) { return ... };
    * auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b, Index& aIdx, const Index& bIdx ) { return ... };
    * ```
    *
    * \par Example
@@ -252,7 +252,7 @@ struct Reduction< Devices::Cuda >
             typename Result,
             typename ReductionOperation,
             typename DataFetcher >
   static std::pair< Index, Result >
   static std::pair< Result, Index >
   reduceWithArgument( const Index size,
                       const ReductionOperation& reduction,
                       DataFetcher& dataFetcher,
+28 −28
Original line number Diff line number Diff line
@@ -86,7 +86,7 @@ template< typename Index,
          typename Result,
          typename ReductionOperation,
          typename DataFetcher >
constexpr std::pair< Index, Result >
constexpr std::pair< Result, Index >
Reduction< Devices::Sequential >::
reduceWithArgument( const Index size,
                    const ReductionOperation& reduction,
@@ -119,27 +119,27 @@ reduceWithArgument( const Index size,
               initialized = true;
               continue;
            }
            reduction( arg[ 0 ], offset + i,     r[ 0 ], dataFetcher( offset + i ) );
            reduction( arg[ 1 ], offset + i + 1, r[ 1 ], dataFetcher( offset + i + 1 ) );
            reduction( arg[ 2 ], offset + i + 2, r[ 2 ], dataFetcher( offset + i + 2 ) );
            reduction( arg[ 3 ], offset + i + 3, r[ 3 ], dataFetcher( offset + i + 3 ) );
            reduction( r[ 0 ], dataFetcher( offset + i ),     arg[ 0 ], offset + i );
            reduction( r[ 1 ], dataFetcher( offset + i + 1 ), arg[ 1 ], offset + i + 1 );
            reduction( r[ 2 ], dataFetcher( offset + i + 2 ), arg[ 2 ], offset + i + 2 );
            reduction( r[ 3 ], dataFetcher( offset + i + 3 ), arg[ 3 ], offset + i + 3 );
         }
      }

      // reduction of the last, incomplete block (not unrolled)
      for( Index i = blocks * block_size; i < size; i++ )
         reduction( arg[ 0 ], i, r[ 0 ], dataFetcher( i ) );
         reduction( r[ 0 ], dataFetcher( i ), arg[ 0 ], i );

      // reduction of unrolled results
      reduction( arg[ 0 ], arg[ 2 ], r[ 0 ], r[ 2 ] );
      reduction( arg[ 1 ], arg[ 3 ], r[ 1 ], r[ 3 ] );
      reduction( arg[ 0 ], arg[ 1 ], r[ 0 ], r[ 1 ] );
      return std::make_pair( arg[ 0 ], r[ 0 ] );
      reduction( r[ 0 ], r[ 2 ], arg[ 0 ], arg[ 2 ] );
      reduction( r[ 1 ], r[ 3 ], arg[ 1 ], arg[ 3 ] );
      reduction( r[ 0 ], r[ 1 ], arg[ 0 ], arg[ 1 ] );
      return std::make_pair( r[ 0 ], arg[ 0 ] );
   }
   else {
      std::pair< Index, Result > result( 0, dataFetcher( 0 ) );
      std::pair< Result, Index > result( dataFetcher( 0 ), 0 );
      for( Index i = 1; i < size; i++ )
         reduction( result.first, i, result.second, dataFetcher( i ) );
         reduction( result.first, dataFetcher( i ), result.second, i );
      return result;
   }
}
@@ -208,7 +208,7 @@ template< typename Index,
          typename Result,
          typename ReductionOperation,
          typename DataFetcher >
std::pair< Index, Result >
std::pair< Result, Index >
Reduction< Devices::Host >::
reduceWithArgument( const Index size,
                    const ReductionOperation& reduction,
@@ -221,7 +221,7 @@ reduceWithArgument( const Index size,

   if( Devices::Host::isOMPEnabled() && blocks >= 2 ) {
      // global result variable
      std::pair< Index, Result > result( -1, zero );
      std::pair< Result, Index > result( zero, -1 );
      const int threads = TNL::min( blocks, Devices::Host::getMaxThreadsCount() );
#pragma omp parallel num_threads(threads)
      {
@@ -246,10 +246,10 @@ reduceWithArgument( const Index size,
                  initialized = true;
                  continue;
               }
               reduction( arg[ 0 ], offset + i,     r[ 0 ], dataFetcher( offset + i ) );
               reduction( arg[ 1 ], offset + i + 1, r[ 1 ], dataFetcher( offset + i + 1 ) );
               reduction( arg[ 2 ], offset + i + 2, r[ 2 ], dataFetcher( offset + i + 2 ) );
               reduction( arg[ 3 ], offset + i + 3, r[ 3 ], dataFetcher( offset + i + 3 ) );
               reduction( r[ 0 ], dataFetcher( offset + i ),     arg[ 0 ], offset + i );
               reduction( r[ 1 ], dataFetcher( offset + i + 1 ), arg[ 1 ], offset + i + 1 );
               reduction( r[ 2 ], dataFetcher( offset + i + 2 ), arg[ 2 ], offset + i + 2 );
               reduction( r[ 3 ], dataFetcher( offset + i + 3 ), arg[ 3 ], offset + i + 3 );
            }
         }

@@ -257,20 +257,20 @@ reduceWithArgument( const Index size,
         #pragma omp single nowait
         {
            for( Index i = blocks * block_size; i < size; i++ )
               reduction( arg[ 0 ], i, r[ 0 ], dataFetcher( i ) );
               reduction( r[ 0 ], dataFetcher( i ), arg[ 0 ], i );
         }

         // local reduction of unrolled results
         reduction( arg[ 0 ], arg[ 2 ], r[ 0 ], r[ 2 ] );
         reduction( arg[ 1 ], arg[ 3 ], r[ 1 ], r[ 3 ] );
         reduction( arg[ 0 ], arg[ 1 ], r[ 0 ], r[ 1 ] );
         reduction( r[ 0 ], r[ 2 ], arg[ 0 ], arg[ 2 ] );
         reduction( r[ 1 ], r[ 3 ], arg[ 1 ], arg[ 3 ] );
         reduction( r[ 0 ], r[ 1 ], arg[ 0 ], arg[ 1 ] );

         // inter-thread reduction of local results
         #pragma omp critical
         {
            if( result.first == -1 )
               result.first = arg[ 0 ];
            reduction( result.first, arg[ 0 ], result.second, r[ 0 ] );
            if( result.second == -1 )
               result.second = arg[ 0 ];
            reduction( result.first, r[ 0 ], result.second, arg[ 0 ] );
         }
      }
      return result;
@@ -373,7 +373,7 @@ template< typename Index,
          typename Result,
          typename ReductionOperation,
          typename DataFetcher >
std::pair< Index, Result >
std::pair< Result, Index >
Reduction< Devices::Cuda >::
reduceWithArgument( const Index size,
                    const ReductionOperation& reduction,
@@ -454,13 +454,13 @@ reduceWithArgument( const Index size,
//      auto fetch = [&] ( Index i ) { return resultArray[ i ]; };
//      const Result result = Reduction< Devices::Sequential >::reduceWithArgument( reducedSize, argument, reduction, fetch, zero );
      for( Index i = 1; i < reducedSize; i++ )
         reduction( indexArray[ 0 ], indexArray[ i ], resultArray[ 0 ], resultArray[ i ] );
         reduction( resultArray[ 0 ], resultArray[ i ], indexArray[ 0 ], indexArray[ i ]  );

      #ifdef CUDA_REDUCTION_PROFILING
         timer.stop();
         std::cout << "   Reduction of small data set on CPU took " << timer.getRealTime() << " sec. " << std::endl;
      #endif
      return std::make_pair( indexArray[ 0 ], resultArray[ 0 ] );
      return std::make_pair( resultArray[ 0 ], indexArray[ 0 ] );
   }
   else {
      // data can't be safely reduced on host, so continue with the reduction on the GPU
+2 −2
Original line number Diff line number Diff line
@@ -35,7 +35,7 @@ auto DistributedExpressionMin( const Expression& expression ) -> std::decay_t< d

template< typename Expression >
auto DistributedExpressionArgMin( const Expression& expression )
-> std::pair< typename Expression::IndexType, std::decay_t< decltype( expression[0] ) > >
-> std::pair< std::decay_t< decltype( expression[0] ) >, typename Expression::IndexType >
{
   using RealType = std::decay_t< decltype( expression[0] ) >;
   using IndexType = typename Expression::IndexType;
@@ -92,7 +92,7 @@ auto DistributedExpressionMax( const Expression& expression ) -> std::decay_t< d

template< typename Expression >
auto DistributedExpressionArgMax( const Expression& expression )
-> std::pair< typename Expression::IndexType, std::decay_t< decltype( expression[0] ) > >
-> std::pair< std::decay_t< decltype( expression[0] ) >, typename Expression::IndexType >
{
   using RealType = std::decay_t< decltype( expression[0] ) >;
   using IndexType = typename Expression::IndexType;
Loading