Commit 5b42f5bc authored by Vít Hanousek's avatar Vít Hanousek
Browse files

Merge branch 'mpi' into anselm-special-log

parents 393c5ba5 6987efa0
Loading
Loading
Loading
Loading
+0 −4
Original line number Diff line number Diff line
@@ -409,10 +409,6 @@ endif()
#   endif()
#endif()

if( OPTIMIZED_VECTOR_HOST_OPERATIONS STREQUAL "yes" )
   AddCompilerFlag( "-DOPTIMIZED_VECTOR_HOST_OPERATIONS " )
endif()

CONFIGURE_FILE( "tnlConfig.h.in" "${PROJECT_BUILD_PATH}/TNL/tnlConfig.h" )
INSTALL( FILES ${PROJECT_BUILD_PATH}/TNL/tnlConfig.h DESTINATION ${TNL_TARGET_INCLUDE_DIRECTORY} )

+0 −3
Original line number Diff line number Diff line
@@ -34,7 +34,6 @@ INSTANTIATE_INT="yes"
INSTANTIATE_LONG_DOUBLE="no"
INSTANTIATE_DOUBLE="yes"
INSTANTIATE_FLOAT="no"
OPTIMIZED_VECTOR_HOST_OPERATIONS="no"

for option in "$@"
do
@@ -75,7 +74,6 @@ do
                                           INSTANTIATE_DOUBLE="yes"
                                           INSTANTIATE_FLOAT="no"
                                           WITH_CUDA_ARCH="auto" ;;
        --optimize-vector-host-operations=* ) OPTIMIZED_VECTOR_HOST_OPERATIONS="yes" ;;
        *                                ) 
           echo "Unknown option ${option}. Use --help for more information."
           exit 1 ;;
@@ -175,7 +173,6 @@ cmake_command=(
         -DINSTANTIATE_LONG_DOUBLE=${INSTANTIATE_LONG_DOUBLE}
         -DINSTANTIATE_INT=${INSTANTIATE_INT}
         -DINSTANTIATE_LONG_INT=${INSTANTIATE_LONG_INT}
         -DOPTIMIZED_VECTOR_HOST_OPERATIONS=${OPTIMIZED_VECTOR_HOST_OPERATIONS}
)

# Skip running cmake if it was already run and the cmake command is the same.
+0 −109
Original line number Diff line number Diff line
@@ -87,23 +87,11 @@ benchmarkVectorOperations( Benchmark & benchmark,
   auto maxHost = [&]() {
      resultHost = hostVector.max();
   };
   auto maxHostGeneral = [&]() {
      Real result( 0 );
      Containers::Algorithms::ParallelReductionMax< Real > operation;
      Containers::Algorithms::Reduction< Devices::Host >::reduce(
              operation,
              hostVector.getSize(),
              hostVector.getData(),
              ( Real* ) 0,
              result );
      return result;
   };
   auto maxCuda = [&]() {
      resultDevice = deviceVector.max();
   };
   benchmark.setOperation( "max", datasetSize );
   benchmark.time( reset1, "CPU", maxHost );
   benchmark.time( reset1, "CPU (general)", maxHostGeneral );
#ifdef HAVE_CUDA
   benchmark.time( reset1, "GPU", maxCuda );
#endif
@@ -112,23 +100,11 @@ benchmarkVectorOperations( Benchmark & benchmark,
   auto minHost = [&]() {
      resultHost = hostVector.min();
   };
   auto minHostGeneral = [&]() {
      Real result( 0 );
      Containers::Algorithms::ParallelReductionMin< Real > operation;
      Containers::Algorithms::Reduction< Devices::Host >::reduce(
              operation,
              hostVector.getSize(),
              hostVector.getData(),
              ( Real* ) 0,
              result );
      return result;
   };
   auto minCuda = [&]() {
      resultDevice = deviceVector.min();
   };
   benchmark.setOperation( "min", datasetSize );
   benchmark.time( reset1, "CPU", minHost );
   benchmark.time( reset1, "CPU (general)", minHostGeneral );
#ifdef HAVE_CUDA
   benchmark.time( reset1, "GPU", minCuda );
#endif
@@ -137,17 +113,6 @@ benchmarkVectorOperations( Benchmark & benchmark,
   auto absMaxHost = [&]() {
      resultHost = hostVector.absMax();
   };
   auto absMaxHostGeneral = [&]() {
      Real result( 0 );
      Containers::Algorithms::ParallelReductionAbsMax< Real > operation;
      Containers::Algorithms::Reduction< Devices::Host >::reduce(
              operation,
              hostVector.getSize(),
              hostVector.getData(),
              ( Real* ) 0,
              result );
      return result;
   };
   auto absMaxCuda = [&]() {
      resultDevice = deviceVector.absMax();
   };
@@ -162,7 +127,6 @@ benchmarkVectorOperations( Benchmark & benchmark,
#endif
   benchmark.setOperation( "absMax", datasetSize );
   benchmark.time( reset1, "CPU", absMaxHost );
   benchmark.time( reset1, "CPU (general)", absMaxHostGeneral );
#ifdef HAVE_CUDA
   benchmark.time( reset1, "GPU", absMaxCuda );
   benchmark.time( reset1, "cuBLAS", absMaxCublas );
@@ -172,17 +136,6 @@ benchmarkVectorOperations( Benchmark & benchmark,
   auto absMinHost = [&]() {
      resultHost = hostVector.absMin();
   };
   auto absMinHostGeneral = [&]() {
      Real result( 0 );
      Containers::Algorithms::ParallelReductionAbsMin< Real > operation;
      Containers::Algorithms::Reduction< Devices::Host >::reduce(
              operation,
              hostVector.getSize(),
              hostVector.getData(),
              ( Real* ) 0,
              result );
      return result;
   };
   auto absMinCuda = [&]() {
      resultDevice = deviceVector.absMin();
   };
@@ -197,7 +150,6 @@ benchmarkVectorOperations( Benchmark & benchmark,
#endif
   benchmark.setOperation( "absMin", datasetSize );
   benchmark.time( reset1, "CPU", absMinHost );
   benchmark.time( reset1, "CPU (general)", absMinHostGeneral );
#ifdef HAVE_CUDA
   benchmark.time( reset1, "GPU", absMinCuda );
   benchmark.time( reset1, "cuBLAS", absMinCublas );
@@ -207,23 +159,11 @@ benchmarkVectorOperations( Benchmark & benchmark,
   auto sumHost = [&]() {
      resultHost = hostVector.sum();
   };
   auto sumHostGeneral = [&]() {
      Real result( 0 );
      Containers::Algorithms::ParallelReductionSum< Real > operation;
      Containers::Algorithms::Reduction< Devices::Host >::reduce(
              operation,
              hostVector.getSize(),
              hostVector.getData(),
              ( Real* ) 0,
              result );
      return result;
   };
   auto sumCuda = [&]() {
      resultDevice = deviceVector.sum();
   };
   benchmark.setOperation( "sum", datasetSize );
   benchmark.time( reset1, "CPU", sumHost );
   benchmark.time( reset1, "CPU (general)", sumHostGeneral );
#ifdef HAVE_CUDA
   benchmark.time( reset1, "GPU", sumCuda );
#endif
@@ -232,17 +172,6 @@ benchmarkVectorOperations( Benchmark & benchmark,
   auto l1normHost = [&]() {
      resultHost = hostVector.lpNorm( 1.0 );
   };
   auto l1normHostGeneral = [&]() {
      Real result( 0 );
      Containers::Algorithms::ParallelReductionAbsSum< Real > operation;
      Containers::Algorithms::Reduction< Devices::Host >::reduce(
              operation,
              hostVector.getSize(),
              hostVector.getData(),
              ( Real* ) 0,
              result );
      return result;
   };
   auto l1normCuda = [&]() {
      resultDevice = deviceVector.lpNorm( 1.0 );
   };
@@ -255,7 +184,6 @@ benchmarkVectorOperations( Benchmark & benchmark,
#endif
   benchmark.setOperation( "l1 norm", datasetSize );
   benchmark.time( reset1, "CPU", l1normHost );
   benchmark.time( reset1, "CPU (general)", l1normHostGeneral );
#ifdef HAVE_CUDA
   benchmark.time( reset1, "GPU", l1normCuda );
   benchmark.time( reset1, "cuBLAS", l1normCublas );
@@ -265,17 +193,6 @@ benchmarkVectorOperations( Benchmark & benchmark,
   auto l2normHost = [&]() {
      resultHost = hostVector.lpNorm( 2.0 );
   };
   auto l2normHostGeneral = [&]() {
      Real result( 0 );
      Containers::Algorithms::ParallelReductionL2Norm< Real > operation;
      Containers::Algorithms::Reduction< Devices::Host >::reduce(
              operation,
              hostVector.getSize(),
              hostVector.getData(),
              ( Real* ) 0,
              result );
      return result;
   };
   auto l2normCuda = [&]() {
      resultDevice = deviceVector.lpNorm( 2.0 );
   };
@@ -288,7 +205,6 @@ benchmarkVectorOperations( Benchmark & benchmark,
#endif
   benchmark.setOperation( "l2 norm", datasetSize );
   benchmark.time( reset1, "CPU", l2normHost );
   benchmark.time( reset1, "CPU (general)", l2normHostGeneral );
#ifdef HAVE_CUDA
   benchmark.time( reset1, "GPU", l2normCuda );
   benchmark.time( reset1, "cuBLAS", l2normCublas );
@@ -298,24 +214,11 @@ benchmarkVectorOperations( Benchmark & benchmark,
   auto l3normHost = [&]() {
      resultHost = hostVector.lpNorm( 3.0 );
   };
   auto l3normHostGeneral = [&]() {
      Real result( 0 );
      Containers::Algorithms::ParallelReductionLpNorm< Real > operation;
      operation.setPower( 3.0 );
      Containers::Algorithms::Reduction< Devices::Host >::reduce(
              operation,
              hostVector.getSize(),
              hostVector.getData(),
              ( Real* ) 0,
              result );
      return result;
   };
   auto l3normCuda = [&]() {
      resultDevice = deviceVector.lpNorm( 3.0 );
   };
   benchmark.setOperation( "l3 norm", datasetSize );
   benchmark.time( reset1, "CPU", l3normHost );
   benchmark.time( reset1, "CPU (general)", l3normHostGeneral );
#ifdef HAVE_CUDA
   benchmark.time( reset1, "GPU", l3normCuda );
#endif
@@ -324,17 +227,6 @@ benchmarkVectorOperations( Benchmark & benchmark,
   auto scalarProductHost = [&]() {
      resultHost = hostVector.scalarProduct( hostVector2 );
   };
   auto scalarProductHostGeneral = [&]() {
      Real result( 0 );
      Containers::Algorithms::ParallelReductionScalarProduct< Real, Real > operation;
      Containers::Algorithms::Reduction< Devices::Host >::reduce(
              operation,
              hostVector.getSize(),
              hostVector.getData(),
              hostVector2.getData(),
              result );
      return result;
   };
   auto scalarProductCuda = [&]() {
      resultDevice = deviceVector.scalarProduct( deviceVector2 );
   };
@@ -348,7 +240,6 @@ benchmarkVectorOperations( Benchmark & benchmark,
#endif
   benchmark.setOperation( "scalar product", 2 * datasetSize );
   benchmark.time( reset1, "CPU", scalarProductHost );
   benchmark.time( reset1, "CPU (general)", scalarProductHostGeneral );
#ifdef HAVE_CUDA
   benchmark.time( reset1, "GPU", scalarProductCuda );
   benchmark.time( reset1, "cuBLAS", scalarProductCublas );
+2 −2
Original line number Diff line number Diff line
@@ -514,7 +514,7 @@ getExplicitUpdate( const RealType& time,
                    cell.getBasis(),
                    gridXIdx,
                    gridYIdx );
         cudaThreadSynchronize();
         cudaDeviceSynchronize();
         TNL_CHECK_CUDA_DEVICE;
         
         //std::cerr << "Computing the heat equation ..." << std::endl;
@@ -534,7 +534,7 @@ getExplicitUpdate( const RealType& time,
                    cell.getBasis(),
                    gridXIdx,
                    gridYIdx );
         cudaThreadSynchronize();         
         cudaDeviceSynchronize();         
         TNL_CHECK_CUDA_DEVICE;
      }
      if( this->cudaKernelType == "templated" )
+1 −1
Original line number Diff line number Diff line
@@ -47,7 +47,7 @@ int main( int argc, char* argv[] )
   while( iteration < 10000 )
   {
      testKernel< GridEntity ><<< cudaGridSize, cudaBlockSize >>>();
      cudaThreadSynchronize();
      cudaDeviceSynchronize();
      iteration++;
   }
   auto t_stop = std::chrono::high_resolution_clock::now();   
Loading