Commit 5b36e3c1 authored by Jakub Klinkovský
Browse files

Benchmarks: add a case with explicit cudaThreadSynchronize() call

parent 742581b2
Loading
Loading
Loading
Loading
+16 −1
Original line number Diff line number Diff line
@@ -62,11 +62,13 @@ benchmarkCuda( const int & loops,
               CheckFunction check = trueFunc,
               ResetFunction reset = voidFunc )
{
    tnlTimerRT timerHost, timerCuda;
    tnlTimerRT timerHost, timerCuda, timerCudaSync;
    timerHost.reset();
    timerHost.stop();
    timerCuda.reset();
    timerCuda.stop();
    timerCudaSync.reset();
    timerCudaSync.stop();

    for(int i = 0; i < loops; ++i) {
        timerHost.start();
@@ -81,15 +83,28 @@ benchmarkCuda( const int & loops,
            throw BenchmarkError();

        reset();

        // Compute again on CUDA, with explicit synchronization of threads
        timerCudaSync.start();
        computeCuda();
        cudaThreadSynchronize();
        timerCudaSync.stop();

        reset();
    }

    const double timeHost = timerHost.getTime();
    const double timeCuda = timerCuda.getTime();
    const double timeCudaSync = timerCudaSync.getTime();
    const double bandwidthHost = datasetSize / timeHost;
    const double bandwidthCuda = datasetSize / timeCuda;
    const double bandwidthCudaSync = datasetSize / timeCudaSync;
    std::cout << "  CPU: bandwidth: " << bandwidthHost << " GB/sec, time: " << timeHost << " sec." << std::endl;
    std::cout << "  GPU: bandwidth: " << bandwidthCuda << " GB/sec, time: " << timeCuda << " sec." << std::endl;
    std::cout << "  GPU (sync): bandwidth: " << bandwidthCudaSync << " GB/sec, time: " << timeCudaSync << " sec." << std::endl;
    std::cout << "  CPU/GPU speedup: " << timeHost / timeCuda << std::endl;
    std::cout << "  CPU/GPU (sync) speedup: " << timeHost / timeCudaSync << std::endl;
    std::cout << std::endl;
}

} // namespace benchmarks
+3 −3
Original line number Diff line number Diff line
@@ -43,7 +43,9 @@ using SlicedEllpackMatrix = tnlSlicedEllpackMatrix< Real, Device, Index >;
//   - reset() clears the timer and starts it again
//   - getTime() stops the timer and starts it again !!!
//   - data members are not zero-initialized - reset has to be called manually, but it immediately starts the timer
// FIXME: scalarProduct is not const method
// FIXME:
// - scalarProduct is not a const method
// - cudaThreadSynchronize() should be called from all CUDA methods


template< typename Matrix >
@@ -183,8 +185,6 @@ benchmarkSpMV( const int & loops,
   };
   auto spmvCuda = [&]() {
      deviceMatrix.vectorProduct( deviceVector, deviceVector2 );
      // TODO: tnlCSRMatrix does not synchronize
      cudaThreadSynchronize();
   };

   benchmarkCuda( loops, datasetSize, spmvHost, spmvCuda, check, reset );
+0 −2
Original line number Diff line number Diff line
@@ -78,8 +78,6 @@ benchmarkVectorOperations( const int & loops,
    };
    auto addVectorCuda = [&]() {
        deviceVector.addVector( deviceVector2 );
        // TODO: synchronization should be part of addVector
        cudaThreadSynchronize();
    };
    benchmarkCuda( loops, 3 * datasetSize, addVectorHost, addVectorCuda, compare1, reset1 );