From b8f60127437d2150a2995ef7a418799b70fba83d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkjak@fjfi.cvut.cz>
Date: Sun, 7 Jul 2019 22:12:22 +0200
Subject: [PATCH] Optimized CudaReductionKernel to improve compilation times

---
 .../Algorithms/CudaReductionKernel.h          | 35 +++++++++++++++++--
 1 file changed, 32 insertions(+), 3 deletions(-)

diff --git a/src/TNL/Containers/Algorithms/CudaReductionKernel.h b/src/TNL/Containers/Algorithms/CudaReductionKernel.h
index ea3bbb6cd8..b9b9e0acb6 100644
--- a/src/TNL/Containers/Algorithms/CudaReductionKernel.h
+++ b/src/TNL/Containers/Algorithms/CudaReductionKernel.h
@@ -456,6 +456,9 @@ struct CudaReductionKernelLauncher
                   ? 2 * blockSize.x * sizeof( ResultType )
                   : blockSize.x * sizeof( ResultType );
 
+        // This is "general", but this method always sets blockSize.x to a specific value,
+        // so runtime switch is not necessary - it only prolongs the compilation time.
+/*
         /////
         // Depending on the blockSize we generate appropriate template instance.
         switch( blockSize.x )
@@ -518,6 +521,18 @@ struct CudaReductionKernelLauncher
               TNL_ASSERT( false, std::cerr << "Block size is " << blockSize. x << " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." );
         }
         TNL_CHECK_CUDA_DEVICE;
+*/
+
+        // Check just to future-proof the code setting blockSize.x
+        if( blockSize.x == Reduction_maxThreadsPerBlock ) {
+           cudaFuncSetCacheConfig(CudaReductionKernel< Reduction_maxThreadsPerBlock, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared);
+
+           CudaReductionKernel< Reduction_maxThreadsPerBlock >
+           <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output);
+        }
+        else {
+           TNL_ASSERT( false, std::cerr << "Block size was expected to be " << Reduction_maxThreadsPerBlock << ", but " << blockSize.x << " was specified." << std::endl; );
+        }
 
         ////
         // Return the size of the output array on the CUDA device
@@ -547,9 +562,11 @@ struct CudaReductionKernelLauncher
                   ? 2 * blockSize.x * ( sizeof( ResultType ) + sizeof( Index ) )
                   : blockSize.x * ( sizeof( ResultType ) + sizeof( Index ) );
 
-        /***
-         * Depending on the blockSize we generate appropriate template instance.
-         */
+        // This is "general", but this method always sets blockSize.x to a specific value,
+        // so runtime switch is not necessary - it only prolongs the compilation time.
+/*
+        /////
+        // Depending on the blockSize we generate appropriate template instance.
         switch( blockSize.x )
         {
            case 512:
@@ -610,6 +627,18 @@ struct CudaReductionKernelLauncher
               TNL_ASSERT( false, std::cerr << "Block size is " << blockSize. x << " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." );
         }
         TNL_CHECK_CUDA_DEVICE;
+*/
+
+        // Check just to future-proof the code setting blockSize.x
+        if( blockSize.x == Reduction_maxThreadsPerBlock ) {
+           cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< Reduction_maxThreadsPerBlock, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared);
+
+           CudaReductionWithArgumentKernel< Reduction_maxThreadsPerBlock >
+           <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output, idxOutput, idxInput );
+        }
+        else {
+           TNL_ASSERT( false, std::cerr << "Block size was expected to be " << Reduction_maxThreadsPerBlock << ", but " << blockSize.x << " was specified." << std::endl; );
+        }
 
         ////
         // return the size of the output array on the CUDA device
-- 
GitLab