From fed5d45ca61dc013d570845412096454120428a4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Wed, 21 Aug 2019 12:49:36 +0200
Subject: [PATCH] Added default stream synchronizations after kernel launches
 in CudaReductionKernel.h

---
 src/TNL/Containers/Algorithms/CudaReductionKernel.h | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/TNL/Containers/Algorithms/CudaReductionKernel.h b/src/TNL/Containers/Algorithms/CudaReductionKernel.h
index 82b030e1a9..36bd5c88b5 100644
--- a/src/TNL/Containers/Algorithms/CudaReductionKernel.h
+++ b/src/TNL/Containers/Algorithms/CudaReductionKernel.h
@@ -473,6 +473,7 @@ struct CudaReductionKernelLauncher
             default:
                TNL_ASSERT( false, std::cerr << "Block size is " << blockSize. x << " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." );
          }
+         cudaStreamSynchronize(0);
          TNL_CHECK_CUDA_DEVICE;
 */
 
@@ -482,6 +483,8 @@ struct CudaReductionKernelLauncher
 
             CudaReductionKernel< Reduction_maxThreadsPerBlock >
             <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output);
+            cudaStreamSynchronize(0);
+            TNL_CHECK_CUDA_DEVICE;
          }
          else {
             TNL_ASSERT( false, std::cerr << "Block size was expected to be " << Reduction_maxThreadsPerBlock << ", but " << blockSize.x << " was specified." << std::endl; );
@@ -578,6 +581,7 @@ struct CudaReductionKernelLauncher
             default:
                TNL_ASSERT( false, std::cerr << "Block size is " << blockSize. x << " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." );
          }
+         cudaStreamSynchronize(0);
          TNL_CHECK_CUDA_DEVICE;
 */
 
@@ -587,6 +591,8 @@ struct CudaReductionKernelLauncher
 
             CudaReductionWithArgumentKernel< Reduction_maxThreadsPerBlock >
             <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput );
+            cudaStreamSynchronize(0);
+            TNL_CHECK_CUDA_DEVICE;
          }
          else {
             TNL_ASSERT( false, std::cerr << "Block size was expected to be " << Reduction_maxThreadsPerBlock << ", but " << blockSize.x << " was specified." << std::endl; );
-- 
GitLab