From 946198871b4885b16ac628647077787ae53fae69 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkjak@fjfi.cvut.cz>
Date: Wed, 10 Jul 2019 13:23:55 +0200
Subject: [PATCH] Fixed the finish method of CudaReductionKernelLauncher

---
 .../Algorithms/CudaReductionKernel.h          | 38 ++++++++++++-------
 1 file changed, 24 insertions(+), 14 deletions(-)

diff --git a/src/TNL/Containers/Algorithms/CudaReductionKernel.h b/src/TNL/Containers/Algorithms/CudaReductionKernel.h
index b9b9e0acb6..21613c4978 100644
--- a/src/TNL/Containers/Algorithms/CudaReductionKernel.h
+++ b/src/TNL/Containers/Algorithms/CudaReductionKernel.h
@@ -172,7 +172,7 @@ CudaReductionWithArgumentKernel( const Result zero,
    using ResultType = Result;
 
    ResultType* sdata = Devices::Cuda::getSharedMemory< ResultType >();
-   IndexType* sidx = static_cast< IndexType* >( static_cast< void* >( &sdata[ blockDim.x ] ) );
+   IndexType* sidx = reinterpret_cast< IndexType* >( &sdata[ blockDim.x ] );
 
    /***
     * Get thread id (tid) and global thread id (gid).
@@ -344,12 +344,12 @@ struct CudaReductionKernelLauncher
    {
       ////
       // create reference to the reduction buffer singleton and set size
-      const size_t buf_size = 2 * desGridSize * sizeof( ResultType );
+      const std::size_t buf_size = 2 * desGridSize * sizeof( ResultType );
       CudaReductionBuffer& cudaReductionBuffer = CudaReductionBuffer::getInstance();
       cudaReductionBuffer.setSize( buf_size );
       output = cudaReductionBuffer.template getData< ResultType >();
 
-      this-> reducedSize = this->launch( originalSize, reduction, volatileReduction, dataFetcher, zero, output );
+      this->reducedSize = this->launch( originalSize, reduction, volatileReduction, dataFetcher, zero, output );
       return this->reducedSize;
    }
 
@@ -365,13 +365,13 @@ struct CudaReductionKernelLauncher
    {
       ////
       // create reference to the reduction buffer singleton and set size
-      const size_t buf_size = 2 * desGridSize * ( sizeof( ResultType ) + sizeof( IndexType ) );
+      const std::size_t buf_size = 2 * desGridSize * ( sizeof( ResultType ) + sizeof( IndexType ) );
       CudaReductionBuffer& cudaReductionBuffer = CudaReductionBuffer::getInstance();
       cudaReductionBuffer.setSize( buf_size );
       output = cudaReductionBuffer.template getData< ResultType >();
-      idxOutput = static_cast< IndexType* >( static_cast< void* >( &output[ 2 * desGridSize ] ) );
+      idxOutput = reinterpret_cast< IndexType* >( &output[ 2 * desGridSize ] );
 
-      this-> reducedSize = this->launchWithArgument( originalSize, reduction, volatileReduction, dataFetcher, zero, output, idxOutput, nullptr );
+      this->reducedSize = this->launchWithArgument( originalSize, reduction, volatileReduction, dataFetcher, zero, output, idxOutput, nullptr );
       return this->reducedSize;
    }
 
@@ -383,18 +383,22 @@ struct CudaReductionKernelLauncher
    {
       ////
       // Input is the first half of the buffer, output is the second half
-      const size_t buf_size = desGridSize * sizeof( ResultType );
       CudaReductionBuffer& cudaReductionBuffer = CudaReductionBuffer::getInstance();
       ResultType* input = cudaReductionBuffer.template getData< ResultType >();
-      ResultType* output = &input[ buf_size ];
+      ResultType* output = &input[ desGridSize ];
 
-      auto copyFetch = [=] __cuda_callable__ ( IndexType i ) { return input[ i ]; };
       while( this->reducedSize > 1 )
       {
-         this-> reducedSize = this->launch( this->reducedSize, reduction, volatileReduction, copyFetch, zero, output );
+         // this lambda has to be defined inside the loop, because the captured variable changes
+         auto copyFetch = [input] __cuda_callable__ ( IndexType i ) { return input[ i ]; };
+         this->reducedSize = this->launch( this->reducedSize, reduction, volatileReduction, copyFetch, zero, output );
          std::swap( input, output );
       }
 
+      // swap again to revert the swap from the last iteration
+      // AND to solve the case when this->reducedSize was 1 since the beginning
+      std::swap( input, output );
+
       ////
       // Copy result on CPU
       ResultType result;
@@ -411,20 +415,26 @@ struct CudaReductionKernelLauncher
    {
       ////
       // Input is the first half of the buffer, output is the second half
-      //const size_t buf_size = desGridSize * sizeof( ResultType );
       CudaReductionBuffer& cudaReductionBuffer = CudaReductionBuffer::getInstance();
       ResultType* input = cudaReductionBuffer.template getData< ResultType >();
       ResultType* output = &input[ desGridSize ];
-      IndexType* idxInput = static_cast< IndexType* >( static_cast< void* >( &output[ desGridSize ] ) );
+      IndexType* idxInput = reinterpret_cast< IndexType* >( &output[ desGridSize ] );
       IndexType* idxOutput = &idxInput[ desGridSize ];
 
-      auto copyFetch = [=] __cuda_callable__ ( IndexType i ) { return input[ i ]; };
       while( this->reducedSize > 1 )
       {
-         this-> reducedSize = this->launchWithArgument( this->reducedSize, reduction, volatileReduction, copyFetch, zero, output, idxOutput, idxInput );
+         // this lambda has to be defined inside the loop, because the captured variable changes
+         auto copyFetch = [input] __cuda_callable__ ( IndexType i ) { return input[ i ]; };
+         this->reducedSize = this->launchWithArgument( this->reducedSize, reduction, volatileReduction, copyFetch, zero, output, idxOutput, idxInput );
          std::swap( input, output );
+         std::swap( idxInput, idxOutput );
       }
 
+      // swap again to revert the swap from the last iteration
+      // AND to solve the case when this->reducedSize was 1 since the beginning
+      std::swap( input, output );
+      std::swap( idxInput, idxOutput );
+
       ////
       // Copy result on CPU
       ResultType result;
-- 
GitLab