Commit d165d1b4 authored by Jakub Klinkovský
Browse files

Changed return type of Reduction::reduce from void to ResultType

Now that error handling is done via exceptions, the return value is free
to carry the reduction result itself, replacing the `result` output parameter.
parent 021e795f
Loading
Loading
Loading
Loading
+45 −72
Original line number Diff line number Diff line
@@ -88,15 +88,12 @@ benchmarkVectorOperations( Benchmark & benchmark,
      resultHost = hostVector.max();
   };
   auto maxHostGeneral = [&]() {
      Real result( 0 );
      Containers::Algorithms::ParallelReductionMax< Real > operation;
      Containers::Algorithms::Reduction< Devices::Host >::reduce(
      return Containers::Algorithms::Reduction< Devices::Host >::reduce(
                 operation,
                 hostVector.getSize(),
                 hostVector.getData(),
              ( Real* ) 0,
              result );
      return result;
                 ( Real* ) 0 );
   };
   auto maxCuda = [&]() {
      resultDevice = deviceVector.max();
@@ -113,15 +110,12 @@ benchmarkVectorOperations( Benchmark & benchmark,
      resultHost = hostVector.min();
   };
   auto minHostGeneral = [&]() {
      Real result( 0 );
      Containers::Algorithms::ParallelReductionMin< Real > operation;
      Containers::Algorithms::Reduction< Devices::Host >::reduce(
      return Containers::Algorithms::Reduction< Devices::Host >::reduce(
                 operation,
                 hostVector.getSize(),
                 hostVector.getData(),
              ( Real* ) 0,
              result );
      return result;
                 ( Real* ) 0 );
   };
   auto minCuda = [&]() {
      resultDevice = deviceVector.min();
@@ -138,15 +132,12 @@ benchmarkVectorOperations( Benchmark & benchmark,
      resultHost = hostVector.absMax();
   };
   auto absMaxHostGeneral = [&]() {
      Real result( 0 );
      Containers::Algorithms::ParallelReductionAbsMax< Real > operation;
      Containers::Algorithms::Reduction< Devices::Host >::reduce(
      return Containers::Algorithms::Reduction< Devices::Host >::reduce(
                 operation,
                 hostVector.getSize(),
                 hostVector.getData(),
              ( Real* ) 0,
              result );
      return result;
                 ( Real* ) 0 );
   };
   auto absMaxCuda = [&]() {
      resultDevice = deviceVector.absMax();
@@ -173,15 +164,12 @@ benchmarkVectorOperations( Benchmark & benchmark,
      resultHost = hostVector.absMin();
   };
   auto absMinHostGeneral = [&]() {
      Real result( 0 );
      Containers::Algorithms::ParallelReductionAbsMin< Real > operation;
      Containers::Algorithms::Reduction< Devices::Host >::reduce(
      return Containers::Algorithms::Reduction< Devices::Host >::reduce(
                 operation,
                 hostVector.getSize(),
                 hostVector.getData(),
              ( Real* ) 0,
              result );
      return result;
                 ( Real* ) 0 );
   };
   auto absMinCuda = [&]() {
      resultDevice = deviceVector.absMin();
@@ -208,15 +196,12 @@ benchmarkVectorOperations( Benchmark & benchmark,
      resultHost = hostVector.sum();
   };
   auto sumHostGeneral = [&]() {
      Real result( 0 );
      Containers::Algorithms::ParallelReductionSum< Real > operation;
      Containers::Algorithms::Reduction< Devices::Host >::reduce(
      return Containers::Algorithms::Reduction< Devices::Host >::reduce(
                 operation,
                 hostVector.getSize(),
                 hostVector.getData(),
              ( Real* ) 0,
              result );
      return result;
                 ( Real* ) 0 );
   };
   auto sumCuda = [&]() {
      resultDevice = deviceVector.sum();
@@ -233,15 +218,12 @@ benchmarkVectorOperations( Benchmark & benchmark,
      resultHost = hostVector.lpNorm( 1.0 );
   };
   auto l1normHostGeneral = [&]() {
      Real result( 0 );
      Containers::Algorithms::ParallelReductionAbsSum< Real > operation;
      Containers::Algorithms::Reduction< Devices::Host >::reduce(
      return Containers::Algorithms::Reduction< Devices::Host >::reduce(
                 operation,
                 hostVector.getSize(),
                 hostVector.getData(),
              ( Real* ) 0,
              result );
      return result;
                 ( Real* ) 0 );
   };
   auto l1normCuda = [&]() {
      resultDevice = deviceVector.lpNorm( 1.0 );
@@ -266,15 +248,12 @@ benchmarkVectorOperations( Benchmark & benchmark,
      resultHost = hostVector.lpNorm( 2.0 );
   };
   auto l2normHostGeneral = [&]() {
      Real result( 0 );
      Containers::Algorithms::ParallelReductionL2Norm< Real > operation;
      Containers::Algorithms::Reduction< Devices::Host >::reduce(
      return Containers::Algorithms::Reduction< Devices::Host >::reduce(
                 operation,
                 hostVector.getSize(),
                 hostVector.getData(),
              ( Real* ) 0,
              result );
      return result;
                 ( Real* ) 0 );
   };
   auto l2normCuda = [&]() {
      resultDevice = deviceVector.lpNorm( 2.0 );
@@ -299,16 +278,13 @@ benchmarkVectorOperations( Benchmark & benchmark,
      resultHost = hostVector.lpNorm( 3.0 );
   };
   auto l3normHostGeneral = [&]() {
      Real result( 0 );
      Containers::Algorithms::ParallelReductionLpNorm< Real > operation;
      operation.setPower( 3.0 );
      Containers::Algorithms::Reduction< Devices::Host >::reduce(
      return Containers::Algorithms::Reduction< Devices::Host >::reduce(
                 operation,
                 hostVector.getSize(),
                 hostVector.getData(),
              ( Real* ) 0,
              result );
      return result;
                 ( Real* ) 0 );
   };
   auto l3normCuda = [&]() {
      resultDevice = deviceVector.lpNorm( 3.0 );
@@ -325,15 +301,12 @@ benchmarkVectorOperations( Benchmark & benchmark,
      resultHost = hostVector.scalarProduct( hostVector2 );
   };
   auto scalarProductHostGeneral = [&]() {
      Real result( 0 );
      Containers::Algorithms::ParallelReductionScalarProduct< Real, Real > operation;
      Containers::Algorithms::Reduction< Devices::Host >::reduce(
      return Containers::Algorithms::Reduction< Devices::Host >::reduce(
                 operation,
                 hostVector.getSize(),
                 hostVector.getData(),
              hostVector2.getData(),
              result );
      return result;
                 hostVector2.getData() );
   };
   auto scalarProductCuda = [&]() {
      resultDevice = deviceVector.scalarProduct( deviceVector2 );
+3 −10
Original line number Diff line number Diff line
@@ -183,11 +183,8 @@ compareMemory( const Element1* destination,
{
   TNL_ASSERT_TRUE( destination, "Attempted to compare data through a nullptr." );
   TNL_ASSERT_TRUE( source, "Attempted to compare data through a nullptr." );
   //TODO: The parallel reduction on the CUDA device with different element types is needed.
   bool result = false;
   Algorithms::ParallelReductionEqualities< Element1, Element2 > reductionEqualities;
   Reduction< Devices::Cuda >::reduce( reductionEqualities, size, destination, source, result );
   return result;
   return Reduction< Devices::Cuda >::reduce( reductionEqualities, size, destination, source );
}

template< typename Element,
@@ -201,11 +198,9 @@ containsValue( const Element* data,
   TNL_ASSERT_TRUE( data, "Attempted to check data through a nullptr." );
   TNL_ASSERT_GE( size, 0, "" );
   if( size == 0 ) return false;
   bool result = false;
   Algorithms::ParallelReductionContainsValue< Element > reductionContainsValue;
   reductionContainsValue.setValue( value );
   Reduction< Devices::Cuda >::reduce( reductionContainsValue, size, data, 0, result );
   return result;
   return Reduction< Devices::Cuda >::reduce( reductionContainsValue, size, data, nullptr );
}

template< typename Element,
@@ -219,11 +214,9 @@ containsOnlyValue( const Element* data,
   TNL_ASSERT_TRUE( data, "Attempted to check data through a nullptr." );
   TNL_ASSERT_GE( size, 0, "" );
   if( size == 0 ) return false;
   bool result = false;
   Algorithms::ParallelReductionContainsOnlyValue< Element > reductionContainsOnlyValue;
   reductionContainsOnlyValue.setValue( value );
   Reduction< Devices::Cuda >::reduce( reductionContainsOnlyValue, size, data, 0, result );
   return result;
   return Reduction< Devices::Cuda >::reduce( reductionContainsOnlyValue, size, data, nullptr );
}


+6 −9
Original line number Diff line number Diff line
@@ -30,12 +30,11 @@ class Reduction< Devices::Cuda >
{
public:
   template< typename Operation, typename Index >
   static void
   static typename Operation::ResultType
   reduce( Operation& operation,
           const Index size,
           const typename Operation::DataType1* deviceInput1,
           const typename Operation::DataType2* deviceInput2,
           typename Operation::ResultType& result );
           const typename Operation::DataType2* deviceInput2 );
};

template<>
@@ -43,12 +42,11 @@ class Reduction< Devices::Host >
{
public:
   template< typename Operation, typename Index >
   static void
   static typename Operation::ResultType
   reduce( Operation& operation,
           const Index size,
           const typename Operation::DataType1* deviceInput1,
           const typename Operation::DataType2* deviceInput2,
           typename Operation::ResultType& result );
           const typename Operation::DataType2* deviceInput2 );
};

template<>
@@ -56,12 +54,11 @@ class Reduction< Devices::MIC >
{
public:
   template< typename Operation, typename Index >
   static void
   static typename Operation::ResultType
   reduce( Operation& operation,
           const Index size,
           const typename Operation::DataType1* deviceInput1,
           const typename Operation::DataType2* deviceInput2,
           typename Operation::ResultType& result );
           const typename Operation::DataType2* deviceInput2 );
};

} // namespace Algorithms
+42 −45
Original line number Diff line number Diff line
@@ -39,13 +39,12 @@ namespace Algorithms {
static constexpr int Reduction_minGpuDataSize = 256;//65536; //16384;//1024;//256;

template< typename Operation, typename Index >
void
typename Operation::ResultType
Reduction< Devices::Cuda >::
reduce( Operation& operation,
        const Index size,
        const typename Operation::DataType1* deviceInput1,
        const typename Operation::DataType2* deviceInput2,
        typename Operation::ResultType& result )
        const typename Operation::DataType2* deviceInput2 )
{
#ifdef HAVE_CUDA

@@ -75,12 +74,11 @@ reduce( Operation& operation,
         using _DT2 = typename std::conditional< std::is_same< DataType2, void >::value, DataType1, DataType2 >::type;
         typename std::remove_const< _DT2 >::type hostArray2[ Reduction_minGpuDataSize ];
         ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory( hostArray2, (_DT2*) deviceInput2, size );
         Reduction< Devices::Host >::reduce( operation, size, hostArray1, hostArray2, result );
         return Reduction< Devices::Host >::reduce( operation, size, hostArray1, hostArray2 );
      }
      else {
         Reduction< Devices::Host >::reduce( operation, size, hostArray1, (DataType2*) nullptr, result );
         return Reduction< Devices::Host >::reduce( operation, size, hostArray1, (DataType2*) nullptr );
      }
      return;
   }

   #ifdef CUDA_REDUCTION_PROFILING
@@ -123,12 +121,14 @@ reduce( Operation& operation,
       * Reduce the data on the host system.
       */
      LaterReductionOperation laterReductionOperation;
      Reduction< Devices::Host >::reduce( laterReductionOperation, reducedSize, resultArray, (void*) nullptr, result );
      const ResultType result = Reduction< Devices::Host >::reduce( laterReductionOperation, reducedSize, resultArray, (void*) nullptr );

      #ifdef CUDA_REDUCTION_PROFILING
         timer.stop();
         std::cout << "   Reduction of small data set on CPU took " << timer.getRealTime() << " sec. " << std::endl;
      #endif

      return result;
   }
   else {
      /***
@@ -152,28 +152,27 @@ reduce( Operation& operation,

      ResultType resultArray[ 1 ];
      ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory( resultArray, deviceAux1, reducedSize );
      result = resultArray[ 0 ];
      const ResultType result = resultArray[ 0 ];

      #ifdef CUDA_REDUCTION_PROFILING
         timer.stop();
         std::cout << "   Transferring the result to CPU took " << timer.getRealTime() << " sec. " << std::endl;
      #endif
   }

   TNL_CHECK_CUDA_DEVICE;
      return result;
   }
#else
   throw Exceptions::CudaSupportMissing();
#endif
};

template< typename Operation, typename Index >
void
typename Operation::ResultType
Reduction< Devices::Host >::
reduce( Operation& operation,
        const Index size,
        const typename Operation::DataType1* input1,
        const typename Operation::DataType2* input2,
        typename Operation::ResultType& result )
        const typename Operation::DataType2* input2 )
{
   typedef Index IndexType;
   typedef typename Operation::DataType1 DataType1;
@@ -182,17 +181,13 @@ reduce( Operation& operation,

#ifdef HAVE_OPENMP
   constexpr int block_size = 128;
   if( TNL::Devices::Host::isOMPEnabled() && size >= 2 * block_size )
   if( TNL::Devices::Host::isOMPEnabled() && size >= 2 * block_size ) {
      // global result variable
      ResultType result = operation.initialValue();
#pragma omp parallel
      {
         const int blocks = size / block_size;

      // first thread initializes the global result variable
      #pragma omp single nowait
      {
         result = operation.initialValue();
      }

         // initialize thread-local result variable
         ResultType r = operation.initialValue();

@@ -215,12 +210,14 @@ reduce( Operation& operation,
         {
            operation.commonReduction( result, r );
      }
      return result;
   }
   else {
#endif
      result = operation.initialValue();
      ResultType result = operation.initialValue();
      for( IndexType i = 0; i < size; i++ )
         operation.firstReduction( result, i, input1, input2 );
      return result;
#ifdef HAVE_OPENMP
   }
#endif
+74 −121

File changed.

Preview size limit exceeded, changes collapsed.