Commit 7729b41c authored by Jakub Klinkovský
Browse files

Replaced explicit algorithms for host in VectorOperations with general implementation

According to benchmarks, there is practically no difference in
performance. Only explicit unrolling is helpful, but that has been
implemented for the general algorithm in Reduction::reduce as well.
parent 40d68215
Loading
Loading
Loading
Loading
+0 −4
Original line number Diff line number Diff line
@@ -409,10 +409,6 @@ endif()
#   endif()
#endif()

if( OPTIMIZED_VECTOR_HOST_OPERATIONS STREQUAL "yes" )
   AddCompilerFlag( "-DOPTIMIZED_VECTOR_HOST_OPERATIONS " )
endif()

CONFIGURE_FILE( "tnlConfig.h.in" "${PROJECT_BUILD_PATH}/TNL/tnlConfig.h" )
INSTALL( FILES ${PROJECT_BUILD_PATH}/TNL/tnlConfig.h DESTINATION ${TNL_TARGET_INCLUDE_DIRECTORY} )

+0 −3
Original line number Diff line number Diff line
@@ -34,7 +34,6 @@ INSTANTIATE_INT="yes"
INSTANTIATE_LONG_DOUBLE="no"
INSTANTIATE_DOUBLE="yes"
INSTANTIATE_FLOAT="no"
OPTIMIZED_VECTOR_HOST_OPERATIONS="no"

for option in "$@"
do
@@ -75,7 +74,6 @@ do
                                           INSTANTIATE_DOUBLE="yes"
                                           INSTANTIATE_FLOAT="no"
                                           WITH_CUDA_ARCH="auto" ;;
        --optimize-vector-host-operations=* ) OPTIMIZED_VECTOR_HOST_OPERATIONS="yes" ;;
        *                                ) 
           echo "Unknown option ${option}. Use --help for more information."
           exit 1 ;;
@@ -175,7 +173,6 @@ cmake_command=(
         -DINSTANTIATE_LONG_DOUBLE=${INSTANTIATE_LONG_DOUBLE}
         -DINSTANTIATE_INT=${INSTANTIATE_INT}
         -DINSTANTIATE_LONG_INT=${INSTANTIATE_LONG_INT}
         -DOPTIMIZED_VECTOR_HOST_OPERATIONS=${OPTIMIZED_VECTOR_HOST_OPERATIONS}
)

# Skip running cmake if it was already run and the cmake command is the same.
+0 −82
Original line number Diff line number Diff line
@@ -87,20 +87,11 @@ benchmarkVectorOperations( Benchmark & benchmark,
   auto maxHost = [&]() {
      resultHost = hostVector.max();
   };
   auto maxHostGeneral = [&]() {
      Containers::Algorithms::ParallelReductionMax< Real > operation;
      return Containers::Algorithms::Reduction< Devices::Host >::reduce(
                 operation,
                 hostVector.getSize(),
                 hostVector.getData(),
                 ( Real* ) 0 );
   };
   auto maxCuda = [&]() {
      resultDevice = deviceVector.max();
   };
   benchmark.setOperation( "max", datasetSize );
   benchmark.time( reset1, "CPU", maxHost );
   benchmark.time( reset1, "CPU (general)", maxHostGeneral );
#ifdef HAVE_CUDA
   benchmark.time( reset1, "GPU", maxCuda );
#endif
@@ -109,20 +100,11 @@ benchmarkVectorOperations( Benchmark & benchmark,
   auto minHost = [&]() {
      resultHost = hostVector.min();
   };
   auto minHostGeneral = [&]() {
      Containers::Algorithms::ParallelReductionMin< Real > operation;
      return Containers::Algorithms::Reduction< Devices::Host >::reduce(
                 operation,
                 hostVector.getSize(),
                 hostVector.getData(),
                 ( Real* ) 0 );
   };
   auto minCuda = [&]() {
      resultDevice = deviceVector.min();
   };
   benchmark.setOperation( "min", datasetSize );
   benchmark.time( reset1, "CPU", minHost );
   benchmark.time( reset1, "CPU (general)", minHostGeneral );
#ifdef HAVE_CUDA
   benchmark.time( reset1, "GPU", minCuda );
#endif
@@ -131,14 +113,6 @@ benchmarkVectorOperations( Benchmark & benchmark,
   auto absMaxHost = [&]() {
      resultHost = hostVector.absMax();
   };
   auto absMaxHostGeneral = [&]() {
      Containers::Algorithms::ParallelReductionAbsMax< Real > operation;
      return Containers::Algorithms::Reduction< Devices::Host >::reduce(
                 operation,
                 hostVector.getSize(),
                 hostVector.getData(),
                 ( Real* ) 0 );
   };
   auto absMaxCuda = [&]() {
      resultDevice = deviceVector.absMax();
   };
@@ -153,7 +127,6 @@ benchmarkVectorOperations( Benchmark & benchmark,
#endif
   benchmark.setOperation( "absMax", datasetSize );
   benchmark.time( reset1, "CPU", absMaxHost );
   benchmark.time( reset1, "CPU (general)", absMaxHostGeneral );
#ifdef HAVE_CUDA
   benchmark.time( reset1, "GPU", absMaxCuda );
   benchmark.time( reset1, "cuBLAS", absMaxCublas );
@@ -163,14 +136,6 @@ benchmarkVectorOperations( Benchmark & benchmark,
   auto absMinHost = [&]() {
      resultHost = hostVector.absMin();
   };
   auto absMinHostGeneral = [&]() {
      Containers::Algorithms::ParallelReductionAbsMin< Real > operation;
      return Containers::Algorithms::Reduction< Devices::Host >::reduce(
                 operation,
                 hostVector.getSize(),
                 hostVector.getData(),
                 ( Real* ) 0 );
   };
   auto absMinCuda = [&]() {
      resultDevice = deviceVector.absMin();
   };
@@ -185,7 +150,6 @@ benchmarkVectorOperations( Benchmark & benchmark,
#endif
   benchmark.setOperation( "absMin", datasetSize );
   benchmark.time( reset1, "CPU", absMinHost );
   benchmark.time( reset1, "CPU (general)", absMinHostGeneral );
#ifdef HAVE_CUDA
   benchmark.time( reset1, "GPU", absMinCuda );
   benchmark.time( reset1, "cuBLAS", absMinCublas );
@@ -195,20 +159,11 @@ benchmarkVectorOperations( Benchmark & benchmark,
   auto sumHost = [&]() {
      resultHost = hostVector.sum();
   };
   auto sumHostGeneral = [&]() {
      Containers::Algorithms::ParallelReductionSum< Real > operation;
      return Containers::Algorithms::Reduction< Devices::Host >::reduce(
                 operation,
                 hostVector.getSize(),
                 hostVector.getData(),
                 ( Real* ) 0 );
   };
   auto sumCuda = [&]() {
      resultDevice = deviceVector.sum();
   };
   benchmark.setOperation( "sum", datasetSize );
   benchmark.time( reset1, "CPU", sumHost );
   benchmark.time( reset1, "CPU (general)", sumHostGeneral );
#ifdef HAVE_CUDA
   benchmark.time( reset1, "GPU", sumCuda );
#endif
@@ -217,14 +172,6 @@ benchmarkVectorOperations( Benchmark & benchmark,
   auto l1normHost = [&]() {
      resultHost = hostVector.lpNorm( 1.0 );
   };
   auto l1normHostGeneral = [&]() {
      Containers::Algorithms::ParallelReductionAbsSum< Real > operation;
      return Containers::Algorithms::Reduction< Devices::Host >::reduce(
                 operation,
                 hostVector.getSize(),
                 hostVector.getData(),
                 ( Real* ) 0 );
   };
   auto l1normCuda = [&]() {
      resultDevice = deviceVector.lpNorm( 1.0 );
   };
@@ -237,7 +184,6 @@ benchmarkVectorOperations( Benchmark & benchmark,
#endif
   benchmark.setOperation( "l1 norm", datasetSize );
   benchmark.time( reset1, "CPU", l1normHost );
   benchmark.time( reset1, "CPU (general)", l1normHostGeneral );
#ifdef HAVE_CUDA
   benchmark.time( reset1, "GPU", l1normCuda );
   benchmark.time( reset1, "cuBLAS", l1normCublas );
@@ -247,14 +193,6 @@ benchmarkVectorOperations( Benchmark & benchmark,
   auto l2normHost = [&]() {
      resultHost = hostVector.lpNorm( 2.0 );
   };
   auto l2normHostGeneral = [&]() {
      Containers::Algorithms::ParallelReductionL2Norm< Real > operation;
      return Containers::Algorithms::Reduction< Devices::Host >::reduce(
                 operation,
                 hostVector.getSize(),
                 hostVector.getData(),
                 ( Real* ) 0 );
   };
   auto l2normCuda = [&]() {
      resultDevice = deviceVector.lpNorm( 2.0 );
   };
@@ -267,7 +205,6 @@ benchmarkVectorOperations( Benchmark & benchmark,
#endif
   benchmark.setOperation( "l2 norm", datasetSize );
   benchmark.time( reset1, "CPU", l2normHost );
   benchmark.time( reset1, "CPU (general)", l2normHostGeneral );
#ifdef HAVE_CUDA
   benchmark.time( reset1, "GPU", l2normCuda );
   benchmark.time( reset1, "cuBLAS", l2normCublas );
@@ -277,21 +214,11 @@ benchmarkVectorOperations( Benchmark & benchmark,
   auto l3normHost = [&]() {
      resultHost = hostVector.lpNorm( 3.0 );
   };
   auto l3normHostGeneral = [&]() {
      Containers::Algorithms::ParallelReductionLpNorm< Real > operation;
      operation.setPower( 3.0 );
      return Containers::Algorithms::Reduction< Devices::Host >::reduce(
                 operation,
                 hostVector.getSize(),
                 hostVector.getData(),
                 ( Real* ) 0 );
   };
   auto l3normCuda = [&]() {
      resultDevice = deviceVector.lpNorm( 3.0 );
   };
   benchmark.setOperation( "l3 norm", datasetSize );
   benchmark.time( reset1, "CPU", l3normHost );
   benchmark.time( reset1, "CPU (general)", l3normHostGeneral );
#ifdef HAVE_CUDA
   benchmark.time( reset1, "GPU", l3normCuda );
#endif
@@ -300,14 +227,6 @@ benchmarkVectorOperations( Benchmark & benchmark,
   auto scalarProductHost = [&]() {
      resultHost = hostVector.scalarProduct( hostVector2 );
   };
   auto scalarProductHostGeneral = [&]() {
      Containers::Algorithms::ParallelReductionScalarProduct< Real, Real > operation;
      return Containers::Algorithms::Reduction< Devices::Host >::reduce(
                 operation,
                 hostVector.getSize(),
                 hostVector.getData(),
                 hostVector2.getData() );
   };
   auto scalarProductCuda = [&]() {
      resultDevice = deviceVector.scalarProduct( deviceVector2 );
   };
@@ -321,7 +240,6 @@ benchmarkVectorOperations( Benchmark & benchmark,
#endif
   benchmark.setOperation( "scalar product", 2 * datasetSize );
   benchmark.time( reset1, "CPU", scalarProductHost );
   benchmark.time( reset1, "CPU (general)", scalarProductHostGeneral );
#ifdef HAVE_CUDA
   benchmark.time( reset1, "GPU", scalarProductCuda );
   benchmark.time( reset1, "cuBLAS", scalarProductCublas );
+94 −244

File changed.

Preview size limit exceeded, changes collapsed.