From 61f8e317c6d03c0257d9391dca6aca58e5d27a83 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Wed, 8 May 2019 22:14:20 +0200
Subject: [PATCH] Implement reduction with argument (argMin/argMax).

---
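Notes (not part of the commit message): this patch adds reduceWithArgument(),
i.e. reductions that return both the reduced value and the index (argument)
at which it occurs, for Devices::Host and Devices::Cuda, together with
argMin()/argMax() helpers for vector views and static vectors. A minimal
usage sketch, mirroring the new unit test in VectorTest-7.h:

   #include <TNL/Containers/Vector.h>
   #include <TNL/Containers/VectorView.h>

   using namespace TNL;
   using namespace TNL::Containers;

   Vector< double, Devices::Host, int > _v( 100 );
   VectorView< double, Devices::Host, int > v( _v );
   for( int i = 0; i < 100; i++ )
      v.setElement( i, ( double ) ( i - 50 ) * ( i - 50 ) );

   int arg = 0;
   const double minValue = argMin( v, arg );   // minValue == 0, arg == 50
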
 .../Algorithms/CudaReductionKernel.h          | 470 ++++++++++++++----
 src/TNL/Containers/Algorithms/Reduction.h     | 112 +++--
 src/TNL/Containers/Algorithms/Reduction.hpp   | 260 +++++++++-
 .../Expressions/VerticalOperations.h          |  82 +++
 src/TNL/Containers/StaticVectorExpressions.h  |  16 +
 src/TNL/Containers/VectorViewExpressions.h    |  18 +
 src/UnitTests/Containers/VectorTest-7.h       |  22 +-
 7 files changed, 810 insertions(+), 170 deletions(-)

diff --git a/src/TNL/Containers/Algorithms/CudaReductionKernel.h b/src/TNL/Containers/Algorithms/CudaReductionKernel.h
index d7a711cc7f..70c3fbab00 100644
--- a/src/TNL/Containers/Algorithms/CudaReductionKernel.h
+++ b/src/TNL/Containers/Algorithms/CudaReductionKernel.h
@@ -74,22 +74,19 @@ CudaReductionKernel( const Result zero,
     * Read data into the shared memory. We start with the
     * sequential reduction.
     */
-   while( gid + 4 * gridSize < size )
-   {
+   while( gid + 4 * gridSize < size ) {
       reduction( sdata[ tid ], dataFetcher( gid ) );
       reduction( sdata[ tid ], dataFetcher( gid + gridSize ) );
       reduction( sdata[ tid ], dataFetcher( gid + 2 * gridSize ) );
       reduction( sdata[ tid ], dataFetcher( gid + 3 * gridSize ) );
       gid += 4 * gridSize;
    }
-   while( gid + 2 * gridSize < size )
-   {
+   while( gid + 2 * gridSize < size ) {
       reduction( sdata[ tid ], dataFetcher( gid ) );
       reduction( sdata[ tid ], dataFetcher( gid + gridSize ) );
       gid += 2 * gridSize;
    }
-   while( gid < size )
-   {
+   while( gid < size ) {
       reduction( sdata[ tid ], dataFetcher( gid ) );
       gid += gridSize;
    }
@@ -99,71 +96,51 @@ CudaReductionKernel( const Result zero,
    /***
     *  Perform the parallel reduction.
     */
-   if( blockSize >= 1024 )
-   {
+   if( blockSize >= 1024 ) {
       if( tid < 512 )
          reduction( sdata[ tid ], sdata[ tid + 512 ] );
       __syncthreads();
    }
-   if( blockSize >= 512 )
-   {
+   if( blockSize >= 512 ) {
       if( tid < 256 )
          reduction( sdata[ tid ], sdata[ tid + 256 ] );
       __syncthreads();
    }
-   if( blockSize >= 256 )
-   {
+   if( blockSize >= 256 ) {
       if( tid < 128 )
          reduction( sdata[ tid ], sdata[ tid + 128 ] );
       __syncthreads();
-      //printf( "2: tid %d data %f \n", tid, sdata[ tid ] );
    }
-
-   if( blockSize >= 128 )
-   {
+   if( blockSize >= 128 ) {
       if( tid <  64 )
          reduction( sdata[ tid ], sdata[ tid + 64 ] );
       __syncthreads();
-      //printf( "3: tid %d data %f \n", tid, sdata[ tid ] );
    }
 
    /***
     * This runs in one warp so it is synchronized implicitly.
     */
-   if( tid < 32 )
-   {
+   if( tid < 32 ) {
       volatile ResultType* vsdata = sdata;
-      if( blockSize >= 64 )
-      {
+      if( blockSize >= 64 ) {
          volatileReduction( vsdata[ tid ], vsdata[ tid + 32 ] );
-         //printf( "4: tid %d data %f \n", tid, sdata[ tid ] );
       }
       // TODO: If blocksize == 32, the following does not work
       // We do not check if tid < 16. Fix it!!!
-      if( blockSize >= 32 )
-      {
+      if( blockSize >= 32 ) {
          volatileReduction( vsdata[ tid ], vsdata[ tid + 16 ] );
-         //printf( "5: tid %d data %f \n", tid, sdata[ tid ] );
       }
-      if( blockSize >= 16 )
-      {
+      if( blockSize >= 16 ) {
          volatileReduction( vsdata[ tid ], vsdata[ tid + 8 ] );
-         //printf( "6: tid %d data %f \n", tid, sdata[ tid ] );
       }
-      if( blockSize >=  8 )
-      {
+      if( blockSize >=  8 ) {
          volatileReduction( vsdata[ tid ], vsdata[ tid + 4 ] );
-         //printf( "7: tid %d data %f \n", tid, sdata[ tid ] );
       }
-      if( blockSize >=  4 )
-      {
+      if( blockSize >=  4 ) {
          volatileReduction( vsdata[ tid ], vsdata[ tid + 2 ] );
-         //printf( "8: tid %d data %f \n", tid, sdata[ tid ] );
       }
-      if( blockSize >=  2 )
-      {
+      if( blockSize >=  2 ) {
          volatileReduction( vsdata[ tid ], vsdata[ tid + 1 ] );
-         //printf( "9: tid %d data %f \n", tid, sdata[ tid ] );
       }
    }
 
@@ -171,13 +148,152 @@ CudaReductionKernel( const Result zero,
     * Store the result back in the global memory.
     */
    if( tid == 0 )
-   {
-      //printf( "Block %d result = %f \n", blockIdx.x, sdata[ 0 ] );
       output[ blockIdx.x ] = sdata[ 0 ];
+}
+
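+/***
+ * Kernel for reductions that also track the index (argument) at which the
+ * reduced value occurs, e.g. for argMin/argMax. It reduces ( value, index )
+ * pairs: the values are stored at the beginning of the shared memory, the
+ * indices right after them. The reduced value of each block is written to
+ * output and the corresponding index to idxOutput.
+ */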
+template< int blockSize,
+   typename Result,
+   typename DataFetcher,
+   typename Reduction,
+   typename VolatileReduction,
+   typename Index >
+__global__ void
+__launch_bounds__( Reduction_maxThreadsPerBlock, Reduction_minBlocksPerMultiprocessor )
+CudaReductionWithArgumentKernel( const Result zero,
+                                 const DataFetcher dataFetcher,
+                                 const Reduction reduction,
+                                 const VolatileReduction volatileReduction,
+                                 const Index size,
+                                 Result* output,
+                                 Index* idxOutput,
+                                 const Index* idxInput = nullptr )
+{
+   using IndexType = Index;
+   using ResultType = Result;
+
+   ResultType* sdata = Devices::Cuda::getSharedMemory< ResultType >();
+   // the indices follow the values in the shared memory; when blockDim.x <= 32,
+   // the launcher allocates two warps worth of values (see launchWithArgument),
+   // so the indices start at 2 * blockDim.x
+   IndexType* sidx = static_cast< IndexType* >( static_cast< void* >(
+      &sdata[ ( blockDim.x <= 32 ) ? 2 * blockDim.x : blockDim.x ] ) );
+
+   /***
+    * Get thread id (tid) and global thread id (gid).
+    * gridSize is the number of elements processed by all blocks at the
+    * same time.
+    */
+   const IndexType tid = threadIdx.x;
+         IndexType gid = blockIdx.x * blockDim.x + threadIdx.x;
+   const IndexType gridSize = blockDim.x * gridDim.x;
+
+   /***
+    * Read data into the shared memory. We start with the
+    * sequential reduction.
+    */
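+   // In the first pass idxInput is nullptr and the global index gid itself
+   // serves as the argument; subsequent passes receive the indices produced
+   // by the previous pass via idxInput.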
+   if( gid < size ) {
+      sdata[ tid ] = dataFetcher( gid );
+      sidx[ tid ] = idxInput ? idxInput[ gid ] : gid;
+   }
+   else {
+      // pad the remaining threads with the neutral element and index 0 so that
+      // the parallel reduction below never reads uninitialized shared memory;
+      // since zero is the identity of the reduction, padded entries cannot win
+      sdata[ tid ] = zero;
+      sidx[ tid ] = 0;
+   }
+   gid += gridSize;
+   if( idxInput ) {
+      while( gid + 4 * gridSize < size ) {
+         reduction( sidx[ tid ], idxInput[ gid ], sdata[ tid ], dataFetcher( gid ) );
+         reduction( sidx[ tid ], idxInput[ gid + gridSize ], sdata[ tid ], dataFetcher( gid + gridSize ) );
+         reduction( sidx[ tid ], idxInput[ gid + 2 * gridSize ], sdata[ tid ], dataFetcher( gid + 2 * gridSize ) );
+         reduction( sidx[ tid ], idxInput[ gid + 3 * gridSize ], sdata[ tid ], dataFetcher( gid + 3 * gridSize ) );
+         gid += 4 * gridSize;
+      }
+      while( gid + 2 * gridSize < size ) {
+         reduction( sidx[ tid ], idxInput[ gid ], sdata[ tid ], dataFetcher( gid ) );
+         reduction( sidx[ tid ], idxInput[ gid + gridSize ], sdata[ tid ], dataFetcher( gid + gridSize ) );
+         gid += 2 * gridSize;
+      }
+      while( gid < size ) {
+         reduction( sidx[ tid ], idxInput[ gid ], sdata[ tid ], dataFetcher( gid ) );
+         gid += gridSize;
+      }
+   }
+   else {
+      while( gid + 4 * gridSize < size ) {
+         reduction( sidx[ tid ], gid, sdata[ tid ], dataFetcher( gid ) );
+         reduction( sidx[ tid ], gid + gridSize, sdata[ tid ], dataFetcher( gid + gridSize ) );
+         reduction( sidx[ tid ], gid + 2 * gridSize, sdata[ tid ], dataFetcher( gid + 2 * gridSize ) );
+         reduction( sidx[ tid ], gid + 3 * gridSize, sdata[ tid ], dataFetcher( gid + 3 * gridSize ) );
+         gid += 4 * gridSize;
+      }
+      while( gid + 2 * gridSize < size ) {
+         reduction( sidx[ tid ], gid, sdata[ tid ], dataFetcher( gid ) );
+         reduction( sidx[ tid ], gid + gridSize, sdata[ tid ], dataFetcher( gid + gridSize ) );
+         gid += 2 * gridSize;
+      }
+      while( gid < size ) {
+         reduction( sidx[ tid ], gid, sdata[ tid ], dataFetcher( gid ) );
+         gid += gridSize;
+      }
+   }
+   __syncthreads();
+
+   /***
+    *  Perform the parallel reduction.
+    */
+   if( blockSize >= 1024 ) {
+      if( tid < 512 )
+         reduction( sidx[ tid ], sidx[ tid + 512 ], sdata[ tid ], sdata[ tid + 512 ] );
+      __syncthreads();
+   }
+   if( blockSize >= 512 ) {
+      if( tid < 256 )
+         reduction( sidx[ tid ], sidx[ tid + 256 ], sdata[ tid ], sdata[ tid + 256 ] );
+      __syncthreads();
+   }
+   if( blockSize >= 256 ) {
+      if( tid < 128 )
+         reduction( sidx[ tid ], sidx[ tid + 128 ], sdata[ tid ], sdata[ tid + 128 ] );
+      __syncthreads();
+   }
+   if( blockSize >= 128 ) {
+      if( tid <  64 )
+         reduction( sidx[ tid ], sidx[ tid + 64 ], sdata[ tid ], sdata[ tid + 64 ] );
+      __syncthreads();
+   }
+
+   /***
+    * This runs in one warp so it is synchronized implicitly.
+    */
+   if( tid < 32 ) {
+      volatile ResultType* vsdata = sdata;
+      volatile IndexType* vsidx = sidx;
+      if( blockSize >= 64 ) {
+         volatileReduction( vsidx[ tid ], vsidx[ tid + 32 ], vsdata[ tid ], vsdata[ tid + 32 ] );
+      }
+      // TODO: If blocksize == 32, the following does not work
+      // We do not check if tid < 16. Fix it!!!
+      if( blockSize >= 32 ) {
+         volatileReduction( vsidx[ tid ], vsidx[ tid + 16 ], vsdata[ tid ], vsdata[ tid + 16 ] );
+      }
+      if( blockSize >= 16 ) {
+         volatileReduction( vsidx[ tid ], vsidx[ tid + 8 ], vsdata[ tid ], vsdata[ tid + 8 ] );
+      }
+      if( blockSize >=  8 ) {
+         volatileReduction( vsidx[ tid ], vsidx[ tid + 4 ], vsdata[ tid ], vsdata[ tid + 4 ] );
+      }
+      if( blockSize >=  4 ) {
+         volatileReduction( vsidx[ tid ], vsidx[ tid + 2 ], vsdata[ tid ], vsdata[ tid + 2 ] );
+      }
+      if( blockSize >=  2 ) {
+         volatileReduction( vsidx[ tid ], vsidx[ tid + 1 ], vsdata[ tid ], vsdata[ tid + 1 ] );
+      }
    }
 
+   /***
+    * Store the result back in the global memory.
+    */
+   if( tid == 0 ) {
+      output[ blockIdx.x ] = sdata[ 0 ];
+      idxOutput[ blockIdx.x ] = sidx[ 0 ];
+   }
 }
 
 template< typename Index,
           typename Result >
 struct CudaReductionKernelLauncher
@@ -229,6 +345,28 @@ struct CudaReductionKernelLauncher
       return this->reducedSize;
    }
 
+   template< typename DataFetcher,
+             typename Reduction,
+             typename VolatileReduction >
+   int startWithArgument( const Reduction& reduction,
+                          const VolatileReduction& volatileReduction,
+                          const DataFetcher& dataFetcher,
+                          const Result& zero,
+                          ResultType*& output,
+                          IndexType*& idxOutput )
+   {
+      ////
+      // create reference to the reduction buffer singleton and set size
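+      // The buffer is laid out as four consecutive segments of desGridSize
+      // elements each: [ values in | values out | indices in | indices out ].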
+      const size_t buf_size = 2 * desGridSize * ( sizeof( ResultType ) + sizeof( IndexType ) );
+      CudaReductionBuffer& cudaReductionBuffer = CudaReductionBuffer::getInstance();
+      cudaReductionBuffer.setSize( buf_size );
+      output = cudaReductionBuffer.template getData< ResultType >();
+      idxOutput = static_cast< IndexType* >( static_cast< void* >( &output[ 2 * desGridSize ] ) );
+
+      this->reducedSize = this->launchWithArgument( originalSize, reduction, volatileReduction, dataFetcher, zero, output, idxOutput, nullptr );
+      return this->reducedSize;
+   }
+
    template< typename Reduction,
              typename VolatileReduction >
    Result finish( const Reduction& reduction,
@@ -256,6 +394,38 @@ struct CudaReductionKernelLauncher
       return result;
    }
 
+   template< typename Reduction,
+             typename VolatileReduction >
+   Result finishWithArgument( IndexType& argument,
+                              const Reduction& reduction,
+                              const VolatileReduction& volatileReduction,
+                              const Result& zero )
+   {
+      ////
+      // The values occupy the first half of the buffer, the indices the second
+      // half; within each half, the input segment comes first
+      CudaReductionBuffer& cudaReductionBuffer = CudaReductionBuffer::getInstance();
+      ResultType* input = cudaReductionBuffer.template getData< ResultType >();
+      ResultType* output = &input[ desGridSize ];
+      IndexType* idxInput = static_cast< IndexType* >( static_cast< void* >( &output[ desGridSize ] ) );
+      IndexType* idxOutput = &idxInput[ desGridSize ];
+
+      while( this->reducedSize > 1 )
+      {
+         // the fetch lambda must be defined inside the loop, because the captured input pointer changes after each swap
+         auto copyFetch = [input] __cuda_callable__ ( IndexType i ) { return input[ i ]; };
+         this->reducedSize = this->launchWithArgument( this->reducedSize, reduction, volatileReduction, copyFetch, zero, output, idxOutput, idxInput );
+         std::swap( input, output );
+         std::swap( idxInput, idxOutput );
+      }
+      // revert the last swap so that the result can be read from output and idxOutput
+      std::swap( input, output );
+      std::swap( idxInput, idxOutput );
+
+      ////
+      // Copy result on CPU
+      ResultType result;
+      ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory( &result, output, 1 );
+      ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory( &argument, idxOutput, 1 );
+      return result;
+   }
+
    protected:
       template< typename DataFetcher,
                 typename Reduction,
@@ -278,75 +448,167 @@ struct CudaReductionKernelLauncher
                   ? 2 * blockSize.x * sizeof( ResultType )
                   : blockSize.x * sizeof( ResultType );
 
-         /***
-          * Depending on the blockSize we generate appropriate template instance.
-          */
-         switch( blockSize.x )
-         {
-            case 512:
-               CudaReductionKernel< 512 >
-               <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output);
-               break;
-            case 256:
-               cudaFuncSetCacheConfig(CudaReductionKernel< 256, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared);
-
-               CudaReductionKernel< 256 >
-               <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output);
-               break;
-            case 128:
-               cudaFuncSetCacheConfig(CudaReductionKernel< 128, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared);
-
-               CudaReductionKernel< 128 >
-               <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output);
-               break;
-            case  64:
-               cudaFuncSetCacheConfig(CudaReductionKernel<  64, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared);
-
-               CudaReductionKernel<  64 >
-               <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output);
-               break;
-            case  32:
-               cudaFuncSetCacheConfig(CudaReductionKernel<  32, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared);
-
-               CudaReductionKernel<  32 >
-               <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output);
-               break;
-            case  16:
-               cudaFuncSetCacheConfig(CudaReductionKernel<  16, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared);
-
-               CudaReductionKernel<  16 >
-               <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output);
-               break;
-            case   8:
-               cudaFuncSetCacheConfig(CudaReductionKernel<   8, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared);
-
-               CudaReductionKernel<   8 >
-               <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output);
-               break;
-            case   4:
-               cudaFuncSetCacheConfig(CudaReductionKernel<   4, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared);
-
-               CudaReductionKernel<   4 >
-               <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output);
-               break;
-            case   2:
-               cudaFuncSetCacheConfig(CudaReductionKernel<   2, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared);
-
-               CudaReductionKernel<   2 >
-               <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output);
-               break;
-            case   1:
-               throw std::logic_error( "blockSize should not be 1." );
-            default:
-               throw std::logic_error( "Block size is " + std::to_string(blockSize.x) + " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." );
-         }
-         TNL_CHECK_CUDA_DEVICE;
+        /////
+        // Depending on the blockSize we generate appropriate template instance.
+        switch( blockSize.x )
+        {
+           case 512:
+              CudaReductionKernel< 512 >
+              <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output);
+              break;
+           case 256:
+              cudaFuncSetCacheConfig(CudaReductionKernel< 256, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared);
+
+              CudaReductionKernel< 256 >
+              <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output);
+              break;
+           case 128:
+              cudaFuncSetCacheConfig(CudaReductionKernel< 128, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared);
+
+              CudaReductionKernel< 128 >
+              <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output);
+              break;
+           case  64:
+              cudaFuncSetCacheConfig(CudaReductionKernel<  64, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared);
+
+              CudaReductionKernel<  64 >
+              <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output);
+              break;
+           case  32:
+              cudaFuncSetCacheConfig(CudaReductionKernel<  32, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared);
+
+              CudaReductionKernel<  32 >
+              <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output);
+              break;
+           case  16:
+              cudaFuncSetCacheConfig(CudaReductionKernel<  16, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared);
+
+              CudaReductionKernel<  16 >
+              <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output);
+              break;
+           case   8:
+              cudaFuncSetCacheConfig(CudaReductionKernel<   8, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared);
+
+              CudaReductionKernel<   8 >
+              <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output);
+              break;
+           case   4:
+              cudaFuncSetCacheConfig(CudaReductionKernel<   4, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared);
+
+              CudaReductionKernel<   4 >
+              <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output);
+              break;
+           case   2:
+              cudaFuncSetCacheConfig(CudaReductionKernel<   2, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared);
+
+              CudaReductionKernel<   2 >
+              <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output);
+              break;
+           case   1:
+              throw std::logic_error( "blockSize should not be 1." );
+           default:
+              throw std::logic_error( "Block size is " + std::to_string( blockSize.x ) + " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." );
+        }
+        TNL_CHECK_CUDA_DEVICE;
+
+        ////
+        // Return the size of the output array on the CUDA device
+        return gridSize.x;
+      }
+
+      template< typename DataFetcher,
+                typename Reduction,
+                typename VolatileReduction >
+      int launchWithArgument( const Index size,
+                              const Reduction& reduction,
+                              const VolatileReduction& volatileReduction,
+                              const DataFetcher& dataFetcher,
+                              const Result& zero,
+                              Result* output,
+                              Index* idxOutput,
+                              const Index* idxInput )
+      {
+         dim3 blockSize, gridSize;
+         blockSize.x = Reduction_maxThreadsPerBlock;
+         gridSize.x = TNL::min( Devices::Cuda::getNumberOfBlocks( size, blockSize.x ), desGridSize );
 
          ////
-         // return the size of the output array on the CUDA device
-         return gridSize.x;
+         // when there is only one warp per blockSize.x, we need to allocate two warps
+         // worth of shared memory so that we don't index shared memory out of bounds
+         const IndexType shmem = (blockSize.x <= 32)
+                  ? 2 * blockSize.x * ( sizeof( ResultType ) + sizeof( Index ) )
+                  : blockSize.x * ( sizeof( ResultType ) + sizeof( Index ) );
+
+        /***
+         * Depending on the blockSize we generate appropriate template instance.
+         */
+        switch( blockSize.x )
+        {
+           case 512:
+              CudaReductionWithArgumentKernel< 512 >
+              <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output, idxOutput, idxInput );
+              break;
+           case 256:
+              cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 256, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared);
+
+              CudaReductionWithArgumentKernel< 256 >
+              <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output, idxOutput, idxInput );
+              break;
+           case 128:
+              cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 128, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared);
+
+              CudaReductionWithArgumentKernel< 128 >
+              <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output, idxOutput, idxInput );
+              break;
+           case  64:
+              cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel<  64, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared);
+
+              CudaReductionWithArgumentKernel<  64 >
+              <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output, idxOutput, idxInput );
+              break;
+           case  32:
+              cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel<  32, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared);
+
+              CudaReductionWithArgumentKernel<  32 >
+              <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output, idxOutput, idxInput );
+              break;
+           case  16:
+              cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel<  16, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared);
+
+              CudaReductionWithArgumentKernel<  16 >
+              <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output, idxOutput, idxInput );
+              break;
+           case   8:
+              cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel<   8, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared);
+
+              CudaReductionWithArgumentKernel<   8 >
+              <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output, idxOutput, idxInput );
+              break;
+           case   4:
+              cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel<   4, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared);
+
+              CudaReductionWithArgumentKernel<   4 >
+              <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output, idxOutput, idxInput );
+              break;
+           case   2:
+              cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel<   2, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared);
+
+              CudaReductionWithArgumentKernel<   2 >
+              <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output, idxOutput, idxInput );
+              break;
+           case   1:
+              throw std::logic_error( "blockSize should not be 1." );
+           default:
+              throw std::logic_error( "Block size is " + std::to_string( blockSize.x ) + " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." );
+        }
+        TNL_CHECK_CUDA_DEVICE;
+
+        ////
+        // return the size of the output array on the CUDA device
+        return gridSize.x;
       }
 
       const int activeDevice;
       const int blocksdPerMultiprocessor;
       const int desGridSize;
diff --git a/src/TNL/Containers/Algorithms/Reduction.h b/src/TNL/Containers/Algorithms/Reduction.h
index 7ac0d4c028..8c7af3da90 100644
--- a/src/TNL/Containers/Algorithms/Reduction.h
+++ b/src/TNL/Containers/Algorithms/Reduction.h
@@ -28,52 +28,92 @@ class Reduction
 template<>
 class Reduction< Devices::Cuda >
 {
-public:
-   template< typename Index,
-             typename Result,
-             typename ReductionOperation,
-             typename VolatileReductionOperation,
-             typename DataFetcher >
-   static Result
-   reduce( const Index size,
-           ReductionOperation& reduction,
-           VolatileReductionOperation& volatileReduction,
-           DataFetcher& dataFetcher,
-           const Result& zero );
+   public:
+      template< typename Index,
+                typename Result,
+                typename ReductionOperation,
+                typename VolatileReductionOperation,
+                typename DataFetcher >
+      static Result
+      reduce( const Index size,
+              ReductionOperation& reduction,
+              VolatileReductionOperation& volatileReduction,
+              DataFetcher& dataFetcher,
+              const Result& zero );
+
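+      /***
+       * Variant of reduce() that also returns the index (argument) at which
+       * the reduced value was found, e.g. for argMin/argMax. The reduction
+       * functors take ( Index& aIdx, const Index& bIdx, Result& a, const Result& b )
+       * and merge the pair ( bIdx, b ) into ( aIdx, a ).
+       */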
+      template< typename Index,
+                typename Result,
+                typename ReductionOperation,
+                typename VolatileReductionOperation,
+                typename DataFetcher >
+      static Result
+      reduceWithArgument( const Index size,
+                          Index& argument,
+                          ReductionOperation& reduction,
+                          VolatileReductionOperation& volatileReduction,
+                          DataFetcher& dataFetcher,
+                          const Result& zero );
 };
 
 template<>
 class Reduction< Devices::Host >
 {
-public:
-   template< typename Index,
-             typename Result,
-             typename ReductionOperation,
-             typename VolatileReductionOperation,
-             typename DataFetcher >
-   static Result
-   reduce( const Index size,
-           ReductionOperation& reduction,
-           VolatileReductionOperation& volatileReduction,
-           DataFetcher& dataFetcher,
-           const Result& zero );
+   public:
+      template< typename Index,
+                typename Result,
+                typename ReductionOperation,
+                typename VolatileReductionOperation,
+                typename DataFetcher >
+      static Result
+      reduce( const Index size,
+              ReductionOperation& reduction,
+              VolatileReductionOperation& volatileReduction,
+              DataFetcher& dataFetcher,
+              const Result& zero );
+
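+      /***
+       * Variant of reduce() that also returns the index (argument) at which
+       * the reduced value was found. See Reduction< Devices::Cuda > above.
+       */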
+      template< typename Index,
+                typename Result,
+                typename ReductionOperation,
+                typename VolatileReductionOperation,
+                typename DataFetcher >
+      static Result
+      reduceWithArgument( const Index size,
+                          Index& argument,
+                          ReductionOperation& reduction,
+                          VolatileReductionOperation& volatileReduction,
+                          DataFetcher& dataFetcher,
+                          const Result& zero );
 };
 
 template<>
 class Reduction< Devices::MIC >
 {
-public:
-   template< typename Index,
-             typename Result,
-             typename ReductionOperation,
-             typename VolatileReductionOperation,
-             typename DataFetcher >
-   static Result
-   reduce( const Index size,
-           ReductionOperation& reduction,
-           VolatileReductionOperation& volatileReduction,
-           DataFetcher& dataFetcher,
-           const Result& zero );
+   public:
+      template< typename Index,
+                typename Result,
+                typename ReductionOperation,
+                typename VolatileReductionOperation,
+                typename DataFetcher >
+      static Result
+      reduce( const Index size,
+              ReductionOperation& reduction,
+              VolatileReductionOperation& volatileReduction,
+              DataFetcher& dataFetcher,
+              const Result& zero );
+
+      template< typename Index,
+                typename Result,
+                typename ReductionOperation,
+                typename VolatileReductionOperation,
+                typename DataFetcher >
+      static Result
+      reduceWithArgument( const Index size,
+                          Index& argument,
+                          ReductionOperation& reduction,
+                          VolatileReductionOperation& volatileReduction,
+                          DataFetcher& dataFetcher,
+                          const Result& zero );
 };
 
 } // namespace Algorithms
diff --git a/src/TNL/Containers/Algorithms/Reduction.hpp b/src/TNL/Containers/Algorithms/Reduction.hpp
index 31c93504cc..7647ff94b7 100644
--- a/src/TNL/Containers/Algorithms/Reduction.hpp
+++ b/src/TNL/Containers/Algorithms/Reduction.hpp
@@ -65,26 +65,6 @@ Reduction< Devices::Cuda >::
    //constexpr bool can_reduce_all_on_host = std::is_fundamental< DataType1 >::value || std::is_fundamental< DataType2 >::value || std::is_pointer< DataType1 >::value || std::is_pointer< DataType2 >::value;
    constexpr bool can_reduce_later_on_host = std::is_fundamental< ResultType >::value || std::is_pointer< ResultType >::value;
 
-   /***
-    * First check if the input array(s) is/are large enough for the reduction on GPU.
-    * Otherwise copy it/them to host and reduce on CPU.
-    */
-   // With lambda function we cannot reduce data on host - we do not know the data here.
-   /*if( can_reduce_all_on_host && size <= Reduction_minGpuDataSize )
-   {
-      typename std::remove_const< DataType1 >::type hostArray1[ Reduction_minGpuDataSize ];
-      ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory( hostArray1, deviceInput1, size );
-      if( deviceInput2 ) {
-         using _DT2 = typename std::conditional< std::is_same< DataType2, void >::value, DataType1, DataType2 >::type;
-         typename std::remove_const< _DT2 >::type hostArray2[ Reduction_minGpuDataSize ];
-         ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory( hostArray2, (_DT2*) deviceInput2, size );
-         return Reduction< Devices::Host >::reduce( zero, dataFetcher, reduction, size, hostArray1, hostArray2 );
-      }
-      else {
-         return Reduction< Devices::Host >::reduce( operation, size, hostArray1, (DataType2*) nullptr );
-      }
-   }*/
-
    #ifdef CUDA_REDUCTION_PROFILING
       Timer timer;
       timer.reset();
@@ -149,22 +129,115 @@ Reduction< Devices::Cuda >::
          timer.start();
       #endif
 
-      //ResultType resultArray[ 1 ];
-      //ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory( resultArray, deviceAux1, reducedSize );
-      //const ResultType result = resultArray[ 0 ];
+      return result;
+   }
+#else
+   throw Exceptions::CudaSupportMissing();
+#endif
+};
+
+template< typename Index,
+          typename Result,
+          typename ReductionOperation,
+          typename VolatileReductionOperation,
+          typename DataFetcher >
+Result
+Reduction< Devices::Cuda >::
+reduceWithArgument( const Index size,
+                    Index& argument,
+                    ReductionOperation& reduction,
+                    VolatileReductionOperation& volatileReduction,
+                    DataFetcher& dataFetcher,
+                    const Result& zero )
+{
+   #ifdef HAVE_CUDA
+
+   using IndexType = Index;
+   using ResultType = Result;
+
+   /***
+    * Only fundamental and pointer types can be safely reduced on host. Complex
+    * objects stored on the device might contain pointers into the device memory,
+    * in which case reduction on host might fail.
+    */
+   constexpr bool can_reduce_later_on_host = std::is_fundamental< ResultType >::value || std::is_pointer< ResultType >::value;
+
+   #ifdef CUDA_REDUCTION_PROFILING
+      Timer timer;
+      timer.reset();
+      timer.start();
+   #endif
+
+   CudaReductionKernelLauncher< IndexType, ResultType > reductionLauncher( size );
+
+   /****
+    * Reduce the data on the CUDA device.
+    */
+   ResultType* deviceAux1( nullptr );
+   IndexType* deviceIndexes( nullptr );
+   IndexType reducedSize = reductionLauncher.startWithArgument(
+      reduction,
+      volatileReduction,
+      dataFetcher,
+      zero,
+      deviceAux1,
+      deviceIndexes );
+   #ifdef CUDA_REDUCTION_PROFILING
+      timer.stop();
+      std::cout << "   Reduction on GPU to size " << reducedSize << " took " << timer.getRealTime() << " sec. " << std::endl;
+      timer.reset();
+      timer.start();
+   #endif
+
+   if( can_reduce_later_on_host ) {
+      /***
+       * Transfer the reduced values and their indices from device to host.
+       */
+      std::unique_ptr< ResultType[] > resultArray{ new ResultType[ reducedSize ] };
+      std::unique_ptr< IndexType[] > indexArray{ new IndexType[ reducedSize ] };
+      ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory( resultArray.get(), deviceAux1, reducedSize );
+      ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory( indexArray.get(), deviceIndexes, reducedSize );
+
+      #ifdef CUDA_REDUCTION_PROFILING
+         timer.stop();
+         std::cout << "   Transferring data to CPU took " << timer.getRealTime() << " sec. " << std::endl;
+         timer.reset();
+         timer.start();
+      #endif
+
+      /***
+       * Finish the reduction on the host system. The host pass computes the
+       * argument as a position in resultArray, which is then mapped back to
+       * the original index via indexArray.
+       */
+      auto fetch = [&] ( IndexType i ) { return resultArray[ i ]; };
+      IndexType localArgument( 0 );
+      const ResultType result = Reduction< Devices::Host >::reduceWithArgument( reducedSize, localArgument, reduction, volatileReduction, fetch, zero );
+      argument = indexArray[ localArgument ];
+
+      #ifdef CUDA_REDUCTION_PROFILING
+         timer.stop();
+         std::cout << "   Reduction of small data set on CPU took " << timer.getRealTime() << " sec. " << std::endl;
+      #endif
+      return result;
+   }
+   else {
+      /***
+       * Data can't be safely reduced on host, so continue with the reduction on the CUDA device.
+       */
+      auto result = reductionLauncher.finishWithArgument( argument, reduction, volatileReduction, zero );
 
-      /*#ifdef CUDA_REDUCTION_PROFILING
+      #ifdef CUDA_REDUCTION_PROFILING
          timer.stop();
-         std::cout << "   Transferring the result to CPU took " << timer.getRealTime() << " sec. " << std::endl;
-      #endif*/
+         std::cout << "   Reduction of small data set on GPU took " << timer.getRealTime() << " sec. " << std::endl;
+      #endif
 
       return result;
    }
 #else
    throw Exceptions::CudaSupportMissing();
 #endif
-};
+}
 
+////
+// Reduction on host
 template< typename Index,
           typename Result,
           typename ReductionOperation,
@@ -263,6 +336,139 @@ Reduction< Devices::Host >::
 #endif
 }
 
+template< typename Index,
+          typename Result,
+          typename ReductionOperation,
+          typename VolatileReductionOperation,
+          typename DataFetcher >
+Result
+Reduction< Devices::Host >::
+reduceWithArgument( const Index size,
+                    Index& argument,
+                    ReductionOperation& reduction,
+                    VolatileReductionOperation& volatileReduction,
+                    DataFetcher& dataFetcher,
+                    const Result& zero )
+{
+   using IndexType = Index;
+   using ResultType = Result;
+
+   constexpr int block_size = 128;
+   const int blocks = size / block_size;
+
+#ifdef HAVE_OPENMP
+   if( TNL::Devices::Host::isOMPEnabled() && size >= 2 * block_size ) {
+      // global result variable
+      ResultType result = zero;
+      argument = -1;
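+      // argument == -1 marks the global result as not merged yet; the first
+      // thread entering the critical section below initializes it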
+#pragma omp parallel
+      {
+         // initialize array for thread-local results
+         ResultType r[ 4 ] = { zero, zero, zero, zero };
+         IndexType arg[ 4 ] = { 0, 0, 0, 0 };
+         bool initialised( false );
+
+         #pragma omp for nowait
+         for( int b = 0; b < blocks; b++ ) {
+            const IndexType offset = b * block_size;
+            for( int i = 0; i < block_size; i += 4 ) {
+               if( ! initialised ) {
+                  arg[ 0 ] = offset + i;
+                  arg[ 1 ] = offset + i + 1;
+                  arg[ 2 ] = offset + i + 2;
+                  arg[ 3 ] = offset + i + 3;
+                  r[ 0 ] = dataFetcher( offset + i );
+                  r[ 1 ] = dataFetcher( offset + i + 1 );
+                  r[ 2 ] = dataFetcher( offset + i + 2 );
+                  r[ 3 ] = dataFetcher( offset + i + 3 );
+                  initialised = true;
+                  continue;
+               }
+               reduction( arg[ 0 ], offset + i,     r[ 0 ], dataFetcher( offset + i ) );
+               reduction( arg[ 1 ], offset + i + 1, r[ 1 ], dataFetcher( offset + i + 1 ) );
+               reduction( arg[ 2 ], offset + i + 2, r[ 2 ], dataFetcher( offset + i + 2 ) );
+               reduction( arg[ 3 ], offset + i + 3, r[ 3 ], dataFetcher( offset + i + 3 ) );
+            }
+         }
+
+         // the first thread that reaches here processes the last, incomplete block
+         #pragma omp single nowait
+         {
+            for( IndexType i = blocks * block_size; i < size; i++ )
+               reduction( arg[ 0 ], i, r[ 0 ], dataFetcher( i ) );
+         }
+
+         // local reduction of unrolled results
+         reduction( arg[ 0 ], arg[ 2 ], r[ 0 ], r[ 2 ] );
+         reduction( arg[ 1 ], arg[ 3 ], r[ 1 ], r[ 3 ] );
+         reduction( arg[ 0 ], arg[ 1 ], r[ 0 ], r[ 1 ] );
+
+         // inter-thread reduction of local results
+         #pragma omp critical
+         {
+            if( argument == -1 )
+               argument = arg[ 0 ];
+            reduction( argument, arg[ 0 ], result, r[ 0 ] );
+         }
+      }
+      return result;
+   }
+   else {
+#endif
+      if( blocks > 1 ) {
+         // initialize array for unrolled results
+         ResultType r[ 4 ] = { zero, zero, zero, zero };
+         IndexType arg[ 4 ] = { 0, 0, 0, 0 };
+         bool initialised( false );
+
+         // main reduction (explicitly unrolled loop)
+         for( int b = 0; b < blocks; b++ ) {
+            const IndexType offset = b * block_size;
+            for( int i = 0; i < block_size; i += 4 ) {
+               if( ! initialised ) {
+                  arg[ 0 ] = offset + i;
+                  arg[ 1 ] = offset + i + 1;
+                  arg[ 2 ] = offset + i + 2;
+                  arg[ 3 ] = offset + i + 3;
+                  r[ 0 ] = dataFetcher( offset + i );
+                  r[ 1 ] = dataFetcher( offset + i + 1 );
+                  r[ 2 ] = dataFetcher( offset + i + 2 );
+                  r[ 3 ] = dataFetcher( offset + i + 3 );
+                  initialised = true;
+                  continue;
+               }
+               reduction( arg[ 0 ], offset + i,     r[ 0 ], dataFetcher( offset + i ) );
+               reduction( arg[ 1 ], offset + i + 1, r[ 1 ], dataFetcher( offset + i + 1 ) );
+               reduction( arg[ 2 ], offset + i + 2, r[ 2 ], dataFetcher( offset + i + 2 ) );
+               reduction( arg[ 3 ], offset + i + 3, r[ 3 ], dataFetcher( offset + i + 3 ) );
+            }
+         }
+
+         // reduction of the last, incomplete block (not unrolled)
+         for( IndexType i = blocks * block_size; i < size; i++ )
+            reduction( arg[ 0 ], i, r[ 0 ], dataFetcher( i ) );
+
+         // reduction of unrolled results
+         reduction( arg[ 0 ], arg[ 2 ], r[ 0 ], r[ 2 ] );
+         reduction( arg[ 1 ], arg[ 3 ], r[ 1 ], r[ 3 ] );
+         reduction( arg[ 0 ], arg[ 1 ], r[ 0 ], r[ 1 ] );
+         argument = arg[ 0 ];
+         return r[ 0 ];
+      }
+      else {
+         ResultType result = dataFetcher( 0 );
+         argument = 0;
+         for( IndexType i = 1; i < size; i++ )
+            reduction( argument, i, result, dataFetcher( i ) );
+         return result;
+      }
+#ifdef HAVE_OPENMP
+   }
+#endif
+}
+
 } // namespace Algorithms
 } // namespace Containers
 } // namespace TNL
diff --git a/src/TNL/Containers/Expressions/VerticalOperations.h b/src/TNL/Containers/Expressions/VerticalOperations.h
index e7549e90ac..94c9781ba2 100644
--- a/src/TNL/Containers/Expressions/VerticalOperations.h
+++ b/src/TNL/Containers/Expressions/VerticalOperations.h
@@ -31,6 +31,23 @@ auto StaticExpressionMin( const Expression& expression ) -> typename std::remove
    return aux;
 }
 
+template< typename Expression >
+__cuda_callable__
+auto StaticExpressionArgMin( const Expression& expression, int& arg ) -> typename std::remove_reference< decltype( expression[ 0 ] ) >::type
+{
+   auto value = expression[ 0 ];
+   arg = 0;
+   for( int i = 1; i < expression.getSize(); i++ )
+   {
+      if( expression[ i ] < value )
+      {
+         value = expression[ i ];
+         arg = i;
+      }
+   }
+   return value;
+}
+
 template< typename Expression >
 __cuda_callable__
 auto StaticExpressionMax( const Expression& expression ) -> typename std::remove_reference< decltype( expression[ 0 ] ) >::type
@@ -41,6 +58,23 @@ auto StaticExpressionMax( const Expression& expression ) -> typename std::remove
    return aux;
 }
 
+template< typename Expression >
+__cuda_callable__
+auto StaticExpressionArgMax( const Expression& expression, int& arg ) -> typename std::remove_reference< decltype( expression[ 0 ] ) >::type
+{
+   auto value = expression[ 0 ];
+   arg = 0;
+   for( int i = 1; i < expression.getSize(); i++ )
+   {
+      if( expression[ i ] > value )
+      {
+         value = expression[ i ];
+         arg = i;
+      }
+   }
+   return value;
+}
+
 template< typename Expression >
 __cuda_callable__
 auto StaticExpressionSum( const Expression& expression ) -> typename std::remove_reference< decltype( expression[ 0 ] ) >::type
@@ -140,6 +174,28 @@ auto ExpressionMin( const Expression& expression ) -> typename std::remove_refer
    return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::max() );
 }
 
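+////
+// Returns the minimal value of the expression and stores the index of its
+// first occurrence into arg.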
+template< typename Expression >
+auto ExpressionArgMin( const Expression& expression, typename Expression::IndexType& arg ) -> typename std::remove_reference< decltype( expression[ 0 ] ) >::type
+{
+   using ResultType = typename std::remove_cv< typename std::remove_reference< decltype( expression[ 0 ] ) >::type >::type;
+   using IndexType = typename Expression::IndexType;
+
+   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return expression[ i ]; };
+   // the value b at index bIdx replaces the current minimum a when it is
+   // smaller; ties are broken towards the smaller index to get a
+   // deterministic result
+   auto reduction = [=] __cuda_callable__ ( IndexType& aIdx, const IndexType& bIdx, ResultType& a, const ResultType& b ) {
+      if( b < a ) {
+         a = b;
+         aIdx = bIdx;
+      }
+      else if( a == b && bIdx < aIdx )
+         aIdx = bIdx;
+   };
+   auto volatileReduction = [=] __cuda_callable__ ( volatile IndexType& aIdx, volatile IndexType& bIdx, volatile ResultType& a, volatile ResultType& b ) {
+      if( b < a ) {
+         a = b;
+         aIdx = bIdx;
+      }
+      else if( a == b && bIdx < aIdx )
+         aIdx = bIdx;
+   };
+   return Algorithms::Reduction< typename Expression::DeviceType >::reduceWithArgument( expression.getSize(), arg, reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::max() );
+}
+
 template< typename Expression >
 auto ExpressionMax( const Expression& expression ) -> typename std::remove_reference< decltype( expression[ 0 ] ) >::type
 {
@@ -152,6 +208,32 @@ auto ExpressionMax( const Expression& expression ) -> typename std::remove_refer
    return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::min() );
 }
 
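+////
+// Returns the maximal value of the expression and stores the index of its
+// first occurrence into arg.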
+template< typename Expression >
+auto ExpressionArgMax( const Expression& expression, typename Expression::IndexType& arg ) -> typename std::remove_reference< decltype( expression[ 0 ] ) >::type
+{
+   using ResultType = typename std::remove_cv< typename std::remove_reference< decltype( expression[ 0 ] ) >::type >::type;
+   using IndexType = typename Expression::IndexType;
+
+   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return expression[ i ]; };
+   // the value b at index bIdx replaces the current maximum a when it is
+   // larger; ties are broken towards the smaller index
+   auto reduction = [=] __cuda_callable__ ( IndexType& aIdx, const IndexType& bIdx, ResultType& a, const ResultType& b ) {
+      if( b > a ) {
+         a = b;
+         aIdx = bIdx;
+      }
+      else if( a == b && bIdx < aIdx )
+         aIdx = bIdx;
+   };
+   auto volatileReduction = [=] __cuda_callable__ ( volatile IndexType& aIdx, volatile IndexType& bIdx, volatile ResultType& a, volatile ResultType& b ) {
+      if( b > a ) {
+         a = b;
+         aIdx = bIdx;
+      }
+      else if( a == b && bIdx < aIdx )
+         aIdx = bIdx;
+   };
+   return Algorithms::Reduction< typename Expression::DeviceType >::reduceWithArgument( expression.getSize(), arg, reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::min() );
+}
+
 template< typename Expression >
 auto ExpressionSum( const Expression& expression ) -> typename std::remove_reference< decltype( expression[ 0 ] ) >::type
 {
diff --git a/src/TNL/Containers/StaticVectorExpressions.h b/src/TNL/Containers/StaticVectorExpressions.h
index 9b6b78d64b..e5c5a4900c 100644
--- a/src/TNL/Containers/StaticVectorExpressions.h
+++ b/src/TNL/Containers/StaticVectorExpressions.h
@@ -524,6 +524,14 @@ min( const Containers::StaticVector< Size, Real >& a )
    return Containers::Expressions::StaticExpressionMin( a );
 }
 
+template< int Size, typename Real >
+__cuda_callable__
+typename Containers::StaticVector< Size, Real >::RealType
+argMin( const Containers::StaticVector< Size, Real >& a, int& arg )
+{
+   return Containers::Expressions::StaticExpressionArgMin( a, arg );
+}
+
 template< int Size, typename Real >
 __cuda_callable__
 typename Containers::StaticVector< Size, Real >::RealType
@@ -532,6 +540,14 @@ max( const Containers::StaticVector< Size, Real >& a )
    return Containers::Expressions::StaticExpressionMax( a );
 }
 
+template< int Size, typename Real >
+__cuda_callable__
+typename Containers::StaticVector< Size, Real >::RealType
+argMax( const Containers::StaticVector< Size, Real >& a, int& arg )
+{
+   return Containers::Expressions::StaticExpressionArgMax( a, arg );
+}
+
 template< int Size, typename Real >
 __cuda_callable__
 typename Containers::StaticVector< Size, Real >::RealType
diff --git a/src/TNL/Containers/VectorViewExpressions.h b/src/TNL/Containers/VectorViewExpressions.h
index b57089fd7b..e1aaeaac60 100644
--- a/src/TNL/Containers/VectorViewExpressions.h
+++ b/src/TNL/Containers/VectorViewExpressions.h
@@ -470,6 +470,15 @@ min( const Containers::VectorView< Real, Device, Index >& a )
    return Containers::Expressions::ExpressionMin( a );
 }
 
+template< typename Real,
+          typename Device,
+          typename Index >
+typename Containers::VectorView< Real, Device, Index >::RealType
+argMin( const Containers::VectorView< Real, Device, Index >& a, Index& arg )
+{
+   return Containers::Expressions::ExpressionArgMin( a, arg );
+}
+
 template< typename Real,
           typename Device,
           typename Index >
@@ -479,6 +488,15 @@ max( const Containers::VectorView< Real, Device, Index >& a )
    return Containers::Expressions::ExpressionMax( a );
 }
 
+template< typename Real,
+          typename Device,
+          typename Index >
+typename Containers::VectorView< Real, Device, Index >::RealType
+argMax( const Containers::VectorView< Real, Device, Index >& a, Index& arg )
+{
+   return Containers::Expressions::ExpressionArgMax( a, arg );
+}
+
 template< typename Real,
           typename Device,
           typename Index >
diff --git a/src/UnitTests/Containers/VectorTest-7.h b/src/UnitTests/Containers/VectorTest-7.h
index 4843383d7c..4ed937d4a6 100644
--- a/src/UnitTests/Containers/VectorTest-7.h
+++ b/src/UnitTests/Containers/VectorTest-7.h
@@ -36,19 +36,23 @@ TYPED_TEST( VectorTest, verticalOperations )
    using VectorType = typename TestFixture::VectorType;
    using ViewType = typename TestFixture::ViewType;
    using RealType = typename VectorType::RealType;
+   using IndexType = typename VectorType::IndexType;
    const int size = VECTOR_TEST_SIZE;
 
-   VectorType _u( size ), _v( size );
-   ViewType u( _u ), v( _v );
+   VectorType _u( size ), _v( size ), _w( size );
+   ViewType u( _u ), v( _v ), w( _w );
    RealType sum_( 0.0 ), absSum( 0.0 ), diffSum( 0.0 ), diffAbsSum( 0.0 ),
    absMin( size + 10.0 ), absMax( -size - 10.0 ),
    diffMin( 2 * size + 10.0 ), diffMax( - 2.0 * size - 10.0 ),
-   l2Norm( 0.0 ), l2NormDiff( 0.0 );
+   l2Norm( 0.0 ), l2NormDiff( 0.0 ), argMinValue( size * size ), argMaxValue( -size * size );
+   IndexType argMin( 0 ), argMax( 0 );
    for( int i = 0; i < size; i++ )
    {
       const RealType aux = ( RealType )( i - size / 2 ) / ( RealType ) size;
+      const RealType w_value = aux * aux - 5.0;
       u.setElement( i, aux );
       v.setElement( i, -aux );
+      w.setElement( i, w_value );
       absMin = TNL::min( absMin, TNL::abs( aux ) );
       absMax = TNL::max( absMax, TNL::abs( aux ) );
       diffMin = TNL::min( diffMin, 2 * aux );
@@ -59,6 +63,14 @@ TYPED_TEST( VectorTest, verticalOperations )
       diffAbsSum += TNL::abs( 2.0* aux );
       l2Norm += aux * aux;
       l2NormDiff += 4.0 * aux * aux;
+      if( w_value < argMinValue ) {
+         argMinValue = w_value;
+         argMin = i;
+      }
+      if( w_value > argMaxValue ) {
+         argMaxValue = w_value;
+         argMax = i;
+      }
    }
    l2Norm = TNL::sqrt( l2Norm );
    l2NormDiff = TNL::sqrt( l2NormDiff );
@@ -74,6 +86,10 @@ TYPED_TEST( VectorTest, verticalOperations )
    EXPECT_NEAR( sum( abs( u - v ) ), diffAbsSum, 2.0e-5 );
    EXPECT_NEAR( lpNorm( u, 2.0 ), l2Norm, 2.0e-5 );
    EXPECT_NEAR( lpNorm( u - v, 2.0 ), l2NormDiff, 2.0e-5 );
+   IndexType wArgMin, wArgMax;
+   EXPECT_NEAR( TNL::argMin( w, wArgMin ), argMinValue, 2.0e-5 );
+   EXPECT_NEAR( TNL::argMax( w, wArgMax ), argMaxValue, 2.0e-5 );
+   EXPECT_EQ( argMin, wArgMin );
+   EXPECT_EQ( argMax, wArgMax );
 }
 
 TYPED_TEST( VectorTest, scalarProduct )
-- 
GitLab