Commit 02999815 authored by Jakub Klinkovský's avatar Jakub Klinkovský
Browse files

Merge branch 'cineca/reductions' into 'develop'

General reductions on host

See merge request mmg/tnl-dev!7
parents 04e118e9 7729b41c
Loading
Loading
Loading
Loading
+0 −4
Original line number | Diff line number | Diff line
@@ -409,10 +409,6 @@ endif()
#   endif()
#endif()

if( OPTIMIZED_VECTOR_HOST_OPERATIONS STREQUAL "yes" )
   AddCompilerFlag( "-DOPTIMIZED_VECTOR_HOST_OPERATIONS " )
endif()

CONFIGURE_FILE( "tnlConfig.h.in" "${PROJECT_BUILD_PATH}/TNL/tnlConfig.h" )
INSTALL( FILES ${PROJECT_BUILD_PATH}/TNL/tnlConfig.h DESTINATION ${TNL_TARGET_INCLUDE_DIRECTORY} )

+0 −3
Original line number | Diff line number | Diff line
@@ -34,7 +34,6 @@ INSTANTIATE_INT="yes"
INSTANTIATE_LONG_DOUBLE="no"
INSTANTIATE_DOUBLE="yes"
INSTANTIATE_FLOAT="no"
OPTIMIZED_VECTOR_HOST_OPERATIONS="no"

for option in "$@"
do
@@ -75,7 +74,6 @@ do
                                           INSTANTIATE_DOUBLE="yes"
                                           INSTANTIATE_FLOAT="no"
                                           WITH_CUDA_ARCH="auto" ;;
        --optimize-vector-host-operations=* ) OPTIMIZED_VECTOR_HOST_OPERATIONS="yes" ;;
        *                                ) 
           echo "Unknown option ${option}. Use --help for more information."
           exit 1 ;;
@@ -175,7 +173,6 @@ cmake_command=(
         -DINSTANTIATE_LONG_DOUBLE=${INSTANTIATE_LONG_DOUBLE}
         -DINSTANTIATE_INT=${INSTANTIATE_INT}
         -DINSTANTIATE_LONG_INT=${INSTANTIATE_LONG_INT}
         -DOPTIMIZED_VECTOR_HOST_OPERATIONS=${OPTIMIZED_VECTOR_HOST_OPERATIONS}
)

# Skip running cmake if it was already run and the cmake command is the same.
+0 −109
Original line number | Diff line number | Diff line
@@ -87,23 +87,11 @@ benchmarkVectorOperations( Benchmark & benchmark,
   auto maxHost = [&]() {
      resultHost = hostVector.max();
   };
   auto maxHostGeneral = [&]() {
      Real result( 0 );
      Containers::Algorithms::ParallelReductionMax< Real > operation;
      Containers::Algorithms::Reduction< Devices::Host >::reduce(
              operation,
              hostVector.getSize(),
              hostVector.getData(),
              ( Real* ) 0,
              result );
      return result;
   };
   auto maxCuda = [&]() {
      resultDevice = deviceVector.max();
   };
   benchmark.setOperation( "max", datasetSize );
   benchmark.time( reset1, "CPU", maxHost );
   benchmark.time( reset1, "CPU (general)", maxHostGeneral );
#ifdef HAVE_CUDA
   benchmark.time( reset1, "GPU", maxCuda );
#endif
@@ -112,23 +100,11 @@ benchmarkVectorOperations( Benchmark & benchmark,
   auto minHost = [&]() {
      resultHost = hostVector.min();
   };
   auto minHostGeneral = [&]() {
      Real result( 0 );
      Containers::Algorithms::ParallelReductionMin< Real > operation;
      Containers::Algorithms::Reduction< Devices::Host >::reduce(
              operation,
              hostVector.getSize(),
              hostVector.getData(),
              ( Real* ) 0,
              result );
      return result;
   };
   auto minCuda = [&]() {
      resultDevice = deviceVector.min();
   };
   benchmark.setOperation( "min", datasetSize );
   benchmark.time( reset1, "CPU", minHost );
   benchmark.time( reset1, "CPU (general)", minHostGeneral );
#ifdef HAVE_CUDA
   benchmark.time( reset1, "GPU", minCuda );
#endif
@@ -137,17 +113,6 @@ benchmarkVectorOperations( Benchmark & benchmark,
   auto absMaxHost = [&]() {
      resultHost = hostVector.absMax();
   };
   auto absMaxHostGeneral = [&]() {
      Real result( 0 );
      Containers::Algorithms::ParallelReductionAbsMax< Real > operation;
      Containers::Algorithms::Reduction< Devices::Host >::reduce(
              operation,
              hostVector.getSize(),
              hostVector.getData(),
              ( Real* ) 0,
              result );
      return result;
   };
   auto absMaxCuda = [&]() {
      resultDevice = deviceVector.absMax();
   };
@@ -162,7 +127,6 @@ benchmarkVectorOperations( Benchmark & benchmark,
#endif
   benchmark.setOperation( "absMax", datasetSize );
   benchmark.time( reset1, "CPU", absMaxHost );
   benchmark.time( reset1, "CPU (general)", absMaxHostGeneral );
#ifdef HAVE_CUDA
   benchmark.time( reset1, "GPU", absMaxCuda );
   benchmark.time( reset1, "cuBLAS", absMaxCublas );
@@ -172,17 +136,6 @@ benchmarkVectorOperations( Benchmark & benchmark,
   auto absMinHost = [&]() {
      resultHost = hostVector.absMin();
   };
   auto absMinHostGeneral = [&]() {
      Real result( 0 );
      Containers::Algorithms::ParallelReductionAbsMin< Real > operation;
      Containers::Algorithms::Reduction< Devices::Host >::reduce(
              operation,
              hostVector.getSize(),
              hostVector.getData(),
              ( Real* ) 0,
              result );
      return result;
   };
   auto absMinCuda = [&]() {
      resultDevice = deviceVector.absMin();
   };
@@ -197,7 +150,6 @@ benchmarkVectorOperations( Benchmark & benchmark,
#endif
   benchmark.setOperation( "absMin", datasetSize );
   benchmark.time( reset1, "CPU", absMinHost );
   benchmark.time( reset1, "CPU (general)", absMinHostGeneral );
#ifdef HAVE_CUDA
   benchmark.time( reset1, "GPU", absMinCuda );
   benchmark.time( reset1, "cuBLAS", absMinCublas );
@@ -207,23 +159,11 @@ benchmarkVectorOperations( Benchmark & benchmark,
   auto sumHost = [&]() {
      resultHost = hostVector.sum();
   };
   auto sumHostGeneral = [&]() {
      Real result( 0 );
      Containers::Algorithms::ParallelReductionSum< Real > operation;
      Containers::Algorithms::Reduction< Devices::Host >::reduce(
              operation,
              hostVector.getSize(),
              hostVector.getData(),
              ( Real* ) 0,
              result );
      return result;
   };
   auto sumCuda = [&]() {
      resultDevice = deviceVector.sum();
   };
   benchmark.setOperation( "sum", datasetSize );
   benchmark.time( reset1, "CPU", sumHost );
   benchmark.time( reset1, "CPU (general)", sumHostGeneral );
#ifdef HAVE_CUDA
   benchmark.time( reset1, "GPU", sumCuda );
#endif
@@ -232,17 +172,6 @@ benchmarkVectorOperations( Benchmark & benchmark,
   auto l1normHost = [&]() {
      resultHost = hostVector.lpNorm( 1.0 );
   };
   auto l1normHostGeneral = [&]() {
      Real result( 0 );
      Containers::Algorithms::ParallelReductionAbsSum< Real > operation;
      Containers::Algorithms::Reduction< Devices::Host >::reduce(
              operation,
              hostVector.getSize(),
              hostVector.getData(),
              ( Real* ) 0,
              result );
      return result;
   };
   auto l1normCuda = [&]() {
      resultDevice = deviceVector.lpNorm( 1.0 );
   };
@@ -255,7 +184,6 @@ benchmarkVectorOperations( Benchmark & benchmark,
#endif
   benchmark.setOperation( "l1 norm", datasetSize );
   benchmark.time( reset1, "CPU", l1normHost );
   benchmark.time( reset1, "CPU (general)", l1normHostGeneral );
#ifdef HAVE_CUDA
   benchmark.time( reset1, "GPU", l1normCuda );
   benchmark.time( reset1, "cuBLAS", l1normCublas );
@@ -265,17 +193,6 @@ benchmarkVectorOperations( Benchmark & benchmark,
   auto l2normHost = [&]() {
      resultHost = hostVector.lpNorm( 2.0 );
   };
   auto l2normHostGeneral = [&]() {
      Real result( 0 );
      Containers::Algorithms::ParallelReductionL2Norm< Real > operation;
      Containers::Algorithms::Reduction< Devices::Host >::reduce(
              operation,
              hostVector.getSize(),
              hostVector.getData(),
              ( Real* ) 0,
              result );
      return result;
   };
   auto l2normCuda = [&]() {
      resultDevice = deviceVector.lpNorm( 2.0 );
   };
@@ -288,7 +205,6 @@ benchmarkVectorOperations( Benchmark & benchmark,
#endif
   benchmark.setOperation( "l2 norm", datasetSize );
   benchmark.time( reset1, "CPU", l2normHost );
   benchmark.time( reset1, "CPU (general)", l2normHostGeneral );
#ifdef HAVE_CUDA
   benchmark.time( reset1, "GPU", l2normCuda );
   benchmark.time( reset1, "cuBLAS", l2normCublas );
@@ -298,24 +214,11 @@ benchmarkVectorOperations( Benchmark & benchmark,
   auto l3normHost = [&]() {
      resultHost = hostVector.lpNorm( 3.0 );
   };
   auto l3normHostGeneral = [&]() {
      Real result( 0 );
      Containers::Algorithms::ParallelReductionLpNorm< Real > operation;
      operation.setPower( 3.0 );
      Containers::Algorithms::Reduction< Devices::Host >::reduce(
              operation,
              hostVector.getSize(),
              hostVector.getData(),
              ( Real* ) 0,
              result );
      return result;
   };
   auto l3normCuda = [&]() {
      resultDevice = deviceVector.lpNorm( 3.0 );
   };
   benchmark.setOperation( "l3 norm", datasetSize );
   benchmark.time( reset1, "CPU", l3normHost );
   benchmark.time( reset1, "CPU (general)", l3normHostGeneral );
#ifdef HAVE_CUDA
   benchmark.time( reset1, "GPU", l3normCuda );
#endif
@@ -324,17 +227,6 @@ benchmarkVectorOperations( Benchmark & benchmark,
   auto scalarProductHost = [&]() {
      resultHost = hostVector.scalarProduct( hostVector2 );
   };
   auto scalarProductHostGeneral = [&]() {
      Real result( 0 );
      Containers::Algorithms::ParallelReductionScalarProduct< Real, Real > operation;
      Containers::Algorithms::Reduction< Devices::Host >::reduce(
              operation,
              hostVector.getSize(),
              hostVector.getData(),
              hostVector2.getData(),
              result );
      return result;
   };
   auto scalarProductCuda = [&]() {
      resultDevice = deviceVector.scalarProduct( deviceVector2 );
   };
@@ -348,7 +240,6 @@ benchmarkVectorOperations( Benchmark & benchmark,
#endif
   benchmark.setOperation( "scalar product", 2 * datasetSize );
   benchmark.time( reset1, "CPU", scalarProductHost );
   benchmark.time( reset1, "CPU (general)", scalarProductHostGeneral );
#ifdef HAVE_CUDA
   benchmark.time( reset1, "GPU", scalarProductCuda );
   benchmark.time( reset1, "cuBLAS", scalarProductCublas );
+3 −10
Original line number | Diff line number | Diff line
@@ -183,11 +183,8 @@ compareMemory( const Element1* destination,
{
   TNL_ASSERT_TRUE( destination, "Attempted to compare data through a nullptr." );
   TNL_ASSERT_TRUE( source, "Attempted to compare data through a nullptr." );
   //TODO: The parallel reduction on the CUDA device with different element types is needed.
   bool result = false;
   Algorithms::ParallelReductionEqualities< Element1, Element2 > reductionEqualities;
   Reduction< Devices::Cuda >::reduce( reductionEqualities, size, destination, source, result );
   return result;
   return Reduction< Devices::Cuda >::reduce( reductionEqualities, size, destination, source );
}

template< typename Element,
@@ -201,11 +198,9 @@ containsValue( const Element* data,
   TNL_ASSERT_TRUE( data, "Attempted to check data through a nullptr." );
   TNL_ASSERT_GE( size, 0, "" );
   if( size == 0 ) return false;
   bool result = false;
   Algorithms::ParallelReductionContainsValue< Element > reductionContainsValue;
   reductionContainsValue.setValue( value );
   Reduction< Devices::Cuda >::reduce( reductionContainsValue, size, data, 0, result );
   return result;
   return Reduction< Devices::Cuda >::reduce( reductionContainsValue, size, data, nullptr );
}

template< typename Element,
@@ -219,11 +214,9 @@ containsOnlyValue( const Element* data,
   TNL_ASSERT_TRUE( data, "Attempted to check data through a nullptr." );
   TNL_ASSERT_GE( size, 0, "" );
   if( size == 0 ) return false;
   bool result = false;
   Algorithms::ParallelReductionContainsOnlyValue< Element > reductionContainsOnlyValue;
   reductionContainsOnlyValue.setValue( value );
   Reduction< Devices::Cuda >::reduce( reductionContainsOnlyValue, size, data, 0, result );
   return result;
   return Reduction< Devices::Cuda >::reduce( reductionContainsOnlyValue, size, data, nullptr );
}


+6 −9
Original line number | Diff line number | Diff line
@@ -30,12 +30,11 @@ class Reduction< Devices::Cuda >
{
public:
   template< typename Operation, typename Index >
   static void
   static typename Operation::ResultType
   reduce( Operation& operation,
           const Index size,
           const typename Operation::DataType1* deviceInput1,
           const typename Operation::DataType2* deviceInput2,
           typename Operation::ResultType& result );
           const typename Operation::DataType2* deviceInput2 );
};

template<>
@@ -43,12 +42,11 @@ class Reduction< Devices::Host >
{
public:
   template< typename Operation, typename Index >
   static void
   static typename Operation::ResultType
   reduce( Operation& operation,
           const Index size,
           const typename Operation::DataType1* deviceInput1,
           const typename Operation::DataType2* deviceInput2,
           typename Operation::ResultType& result );
           const typename Operation::DataType2* deviceInput2 );
};

template<>
@@ -56,12 +54,11 @@ class Reduction< Devices::MIC >
{
public:
   template< typename Operation, typename Index >
   static void
   static typename Operation::ResultType
   reduce( Operation& operation,
           const Index size,
           const typename Operation::DataType1* deviceInput1,
           const typename Operation::DataType2* deviceInput2,
           typename Operation::ResultType& result );
           const typename Operation::DataType2* deviceInput2 );
};

} // namespace Algorithms
Loading