diff --git a/src/TNL/Containers/Algorithms/CudaReductionKernel.h b/src/TNL/Containers/Algorithms/CudaReductionKernel.h
index d7a711cc7f554b57bbd152a1e89b8fd54d2db806..70c3fbab00df31ffb79f1f408dae22b330afd0c1 100644
--- a/src/TNL/Containers/Algorithms/CudaReductionKernel.h
+++ b/src/TNL/Containers/Algorithms/CudaReductionKernel.h
@@ -74,22 +74,19 @@ CudaReductionKernel( const Result zero,
    * Read data into the shared memory. We start with the
    * sequential reduction.
    */
-   while( gid + 4 * gridSize < size )
-   {
+   while( gid + 4 * gridSize < size ) {
       reduction( sdata[ tid ], dataFetcher( gid ) );
       reduction( sdata[ tid ], dataFetcher( gid + gridSize ) );
       reduction( sdata[ tid ], dataFetcher( gid + 2 * gridSize ) );
       reduction( sdata[ tid ], dataFetcher( gid + 3 * gridSize ) );
       gid += 4 * gridSize;
    }
-   while( gid + 2 * gridSize < size )
-   {
+   while( gid + 2 * gridSize < size ) {
       reduction( sdata[ tid ], dataFetcher( gid ) );
       reduction( sdata[ tid ], dataFetcher( gid + gridSize ) );
       gid += 2 * gridSize;
    }
-   while( gid < size )
-   {
+   while( gid < size ) {
      reduction( sdata[ tid ], dataFetcher( gid ) );
      gid += gridSize;
    }
@@ -99,71 +96,51 @@
    /***
    * Perform the parallel reduction.
    */
-   if( blockSize >= 1024 )
-   {
+   if( blockSize >= 1024 ) {
       if( tid < 512 )
          reduction( sdata[ tid ], sdata[ tid + 512 ] );
       __syncthreads();
    }
-   if( blockSize >= 512 )
-   {
+   if( blockSize >= 512 ) {
       if( tid < 256 )
          reduction( sdata[ tid ], sdata[ tid + 256 ] );
       __syncthreads();
    }
-   if( blockSize >= 256 )
-   {
+   if( blockSize >= 256 ) {
       if( tid < 128 )
          reduction( sdata[ tid ], sdata[ tid + 128 ] );
       __syncthreads();
-      //printf( "2: tid %d data %f \n", tid, sdata[ tid ] );
    }
-
-   if( blockSize >= 128 )
-   {
+   if( blockSize >= 128 ) {
       if( tid < 64 )
          reduction( sdata[ tid ], sdata[ tid + 64 ] );
       __syncthreads();
-      //printf( "3: tid %d data %f \n", tid, sdata[ tid ] );
    }

    /***
    * This runs in one warp so it is synchronized implicitly.
    */
-   if( tid < 32 )
-   {
+   if( tid < 32 ) {
       volatile ResultType* vsdata = sdata;
-      if( blockSize >= 64 )
-      {
+      if( blockSize >= 64 ) {
         volatileReduction( vsdata[ tid ], vsdata[ tid + 32 ] );
-         //printf( "4: tid %d data %f \n", tid, sdata[ tid ] );
      }
      // TODO: If blocksize == 32, the following does not work
      // We do not check if tid < 16. Fix it!!!
-      if( blockSize >= 32 )
-      {
+      if( blockSize >= 32 ) {
         volatileReduction( vsdata[ tid ], vsdata[ tid + 16 ] );
-         //printf( "5: tid %d data %f \n", tid, sdata[ tid ] );
      }
-      if( blockSize >= 16 )
-      {
+      if( blockSize >= 16 ) {
         volatileReduction( vsdata[ tid ], vsdata[ tid + 8 ] );
-         //printf( "6: tid %d data %f \n", tid, sdata[ tid ] );
      }
-      if( blockSize >= 8 )
-      {
+      if( blockSize >= 8 ) {
         volatileReduction( vsdata[ tid ], vsdata[ tid + 4 ] );
-         //printf( "7: tid %d data %f \n", tid, sdata[ tid ] );
      }
-      if( blockSize >= 4 )
-      {
+      if( blockSize >= 4 ) {
         volatileReduction( vsdata[ tid ], vsdata[ tid + 2 ] );
-         //printf( "8: tid %d data %f \n", tid, sdata[ tid ] );
      }
-      if( blockSize >= 2 )
-      {
+      if( blockSize >= 2 ) {
         volatileReduction( vsdata[ tid ], vsdata[ tid + 1 ] );
-         //printf( "9: tid %d data %f \n", tid, sdata[ tid ] );
      }
   }

@@ -171,13 +148,152 @@ CudaReductionKernel( const Result zero,
    * Store the result back in the global memory.
    */
   if( tid == 0 )
-   {
-      //printf( "Block %d result = %f \n", blockIdx.x, sdata[ 0 ] );
      output[ blockIdx.x ] = sdata[ 0 ];
-   }
 }
+
+template< int blockSize,
+          typename Result,
+          typename DataFetcher,
+          typename Reduction,
+          typename VolatileReduction,
+          typename Index >
+__global__ void
+__launch_bounds__( Reduction_maxThreadsPerBlock, Reduction_minBlocksPerMultiprocessor )
+CudaReductionWithArgumentKernel( const Result zero,
+                                 const DataFetcher dataFetcher,
+                                 const Reduction reduction,
+                                 const VolatileReduction volatileReduction,
+                                 const Index size,
+                                 Result* output,
+                                 Index* idxOutput,
+                                 const Index* idxInput = nullptr )
+{
+   using IndexType = Index;
+   using ResultType = Result;
+
+   ResultType* sdata = Devices::Cuda::getSharedMemory< ResultType >();
+   IndexType* sidx = static_cast< IndexType* >( static_cast< void* >( &sdata[ blockDim.x ] ) );
+
+   /***
+    * Get thread id (tid) and global thread id (gid).
+    * gridSize is the number of elements processed by all blocks at the
+    * same time.
+    */
+   const IndexType tid = threadIdx.x;
+   IndexType gid = blockIdx.x * blockDim.x + threadIdx.x;
+   const IndexType gridSize = blockDim.x * gridDim.x;
+
+   /***
+    * Read data into the shared memory. We start with the
+    * sequential reduction. Threads with gid >= size get the neutral
+    * element so that the tree reduction below does not read garbage.
+    */
+   if( idxInput ) {
+      if( gid < size ) {
+         sdata[ tid ] = dataFetcher( gid );
+         sidx[ tid ] = idxInput[ gid ];
+         gid += gridSize;
+      }
+      else {
+         sdata[ tid ] = zero;
+         sidx[ tid ] = 0;
+      }
+      while( gid + 4 * gridSize < size ) {
+         reduction( sidx[ tid ], idxInput[ gid ], sdata[ tid ], dataFetcher( gid ) );
+         reduction( sidx[ tid ], idxInput[ gid + gridSize ], sdata[ tid ], dataFetcher( gid + gridSize ) );
+         reduction( sidx[ tid ], idxInput[ gid + 2 * gridSize ], sdata[ tid ], dataFetcher( gid + 2 * gridSize ) );
+         reduction( sidx[ tid ], idxInput[ gid + 3 * gridSize ], sdata[ tid ], dataFetcher( gid + 3 * gridSize ) );
+         gid += 4 * gridSize;
+      }
+      while( gid + 2 * gridSize < size ) {
+         reduction( sidx[ tid ], idxInput[ gid ], sdata[ tid ], dataFetcher( gid ) );
+         reduction( sidx[ tid ], idxInput[ gid + gridSize ], sdata[ tid ], dataFetcher( gid + gridSize ) );
+         gid += 2 * gridSize;
+      }
+      while( gid < size ) {
+         reduction( sidx[ tid ], idxInput[ gid ], sdata[ tid ], dataFetcher( gid ) );
+         gid += gridSize;
+      }
+   }
+   else {
+      if( gid < size ) {
+         sdata[ tid ] = dataFetcher( gid );
+         sidx[ tid ] = gid;
+         gid += gridSize;
+      }
+      else {
+         sdata[ tid ] = zero;
+         sidx[ tid ] = 0;
+      }
+      while( gid + 4 * gridSize < size ) {
+         reduction( sidx[ tid ], gid, sdata[ tid ], dataFetcher( gid ) );
+         reduction( sidx[ tid ], gid + gridSize, sdata[ tid ], dataFetcher( gid + gridSize ) );
+         reduction( sidx[ tid ], gid + 2 * gridSize, sdata[ tid ], dataFetcher( gid + 2 * gridSize ) );
+         reduction( sidx[ tid ], gid + 3 * gridSize, sdata[ tid ], dataFetcher( gid + 3 * gridSize ) );
+         gid += 4 * gridSize;
+      }
+      while( gid + 2 * gridSize < size ) {
+         reduction( sidx[ tid ], gid, sdata[ tid ], dataFetcher( gid ) );
+         reduction( sidx[ tid ], gid + gridSize, sdata[ tid ], dataFetcher( gid + gridSize ) );
+         gid += 2 * gridSize;
+      }
+      while( gid < size ) {
+         reduction( sidx[ tid ], gid, sdata[ tid ], dataFetcher( gid ) );
+         gid += gridSize;
+      }
+   }
+   __syncthreads();
+
+   /***
+    * Perform the parallel reduction.
+    */
+   if( blockSize >= 1024 ) {
+      if( tid < 512 )
+         reduction( sidx[ tid ], sidx[ tid + 512 ], sdata[ tid ], sdata[ tid + 512 ] );
+      __syncthreads();
+   }
+   if( blockSize >= 512 ) {
+      if( tid < 256 )
+         reduction( sidx[ tid ], sidx[ tid + 256 ], sdata[ tid ], sdata[ tid + 256 ] );
+      __syncthreads();
+   }
+   if( blockSize >= 256 ) {
+      if( tid < 128 )
+         reduction( sidx[ tid ], sidx[ tid + 128 ], sdata[ tid ], sdata[ tid + 128 ] );
+      __syncthreads();
+   }
+   if( blockSize >= 128 ) {
+      if( tid < 64 )
+         reduction( sidx[ tid ], sidx[ tid + 64 ], sdata[ tid ], sdata[ tid + 64 ] );
+      __syncthreads();
+   }
+
+   /***
+    * This runs in one warp so it is synchronized implicitly.
+    */
+   if( tid < 32 ) {
+      volatile ResultType* vsdata = sdata;
+      volatile IndexType* vsidx = sidx;
+      if( blockSize >= 64 ) {
+         volatileReduction( vsidx[ tid ], vsidx[ tid + 32 ], vsdata[ tid ], vsdata[ tid + 32 ] );
+      }
+      // TODO: If blocksize == 32, the following does not work
+      // We do not check if tid < 16. Fix it!!!
+      if( blockSize >= 32 ) {
+         volatileReduction( vsidx[ tid ], vsidx[ tid + 16 ], vsdata[ tid ], vsdata[ tid + 16 ] );
+      }
+      if( blockSize >= 16 ) {
+         volatileReduction( vsidx[ tid ], vsidx[ tid + 8 ], vsdata[ tid ], vsdata[ tid + 8 ] );
+      }
+      if( blockSize >= 8 ) {
+         volatileReduction( vsidx[ tid ], vsidx[ tid + 4 ], vsdata[ tid ], vsdata[ tid + 4 ] );
+      }
+      if( blockSize >= 4 ) {
+         volatileReduction( vsidx[ tid ], vsidx[ tid + 2 ], vsdata[ tid ], vsdata[ tid + 2 ] );
+      }
+      if( blockSize >= 2 ) {
+         volatileReduction( vsidx[ tid ], vsidx[ tid + 1 ], vsdata[ tid ], vsdata[ tid + 1 ] );
+      }
+   }
+
+   /***
+    * Store the result back in the global memory.
+    */
+   if( tid == 0 ) {
+      output[ blockIdx.x ] = sdata[ 0 ];
+      idxOutput[ blockIdx.x ] = sidx[ 0 ];
+   }
+}
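The kernel above carves both scratch arrays out of one dynamic shared-memory allocation: blockDim.x ResultType values first, then blockDim.x IndexType slots. The index block stays suitably aligned because blockDim.x is always a multiple of the warp size here. A self-contained sketch of the same layout (illustrative names, not part of the patch):

// Sketch: splitting one dynamic shared-memory block into a value array and
// an index array, the way CudaReductionWithArgumentKernel does.
#include <cuda_runtime.h>

template< typename Result, typename Index >
__global__ void sharedLayoutSketch( const Result* in, Result* outVal, Index* outIdx, int size )
{
   extern __shared__ char smem[];                                     // sized by the launcher
   Result* sdata = reinterpret_cast< Result* >( smem );               // blockDim.x values
   Index* sidx = reinterpret_cast< Index* >( &sdata[ blockDim.x ] );  // blockDim.x indexes

   const int gid = blockIdx.x * blockDim.x + threadIdx.x;
   if( gid < size ) {
      sdata[ threadIdx.x ] = in[ gid ];
      sidx[ threadIdx.x ] = gid;
   }
   __syncthreads();
   // ... the tree reduction over (sdata, sidx) pairs would run here ...
   if( threadIdx.x == 0 && gid < size ) {
      outVal[ blockIdx.x ] = sdata[ 0 ];
      outIdx[ blockIdx.x ] = sidx[ 0 ];
   }
}

// The launch must request room for both arrays:
// sharedLayoutSketch<<< grid, block, block * ( sizeof( Result ) + sizeof( Index ) ) >>>( in, outVal, outIdx, size );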
+
 template< typename Index,
           typename Result >
 struct CudaReductionKernelLauncher
@@ -229,6 +345,28 @@ struct CudaReductionKernelLauncher
       return this->reducedSize;
    }

+   template< typename DataFetcher,
+             typename Reduction,
+             typename VolatileReduction >
+   int startWithArgument( const Reduction& reduction,
+                          const VolatileReduction& volatileReduction,
+                          const DataFetcher& dataFetcher,
+                          const Result& zero,
+                          ResultType*& output,
+                          IndexType*& idxOutput )
+   {
+      ////
+      // create reference to the reduction buffer singleton and set size
+      const size_t buf_size = 2 * desGridSize * ( sizeof( ResultType ) + sizeof( IndexType ) );
+      CudaReductionBuffer& cudaReductionBuffer = CudaReductionBuffer::getInstance();
+      cudaReductionBuffer.setSize( buf_size );
+      output = cudaReductionBuffer.template getData< ResultType >();
+      idxOutput = static_cast< IndexType* >( static_cast< void* >( &output[ 2 * desGridSize ] ) );
+
+      this->reducedSize = this->launchWithArgument( originalSize, reduction, volatileReduction, dataFetcher, zero, output, idxOutput, nullptr );
+      return this->reducedSize;
+   }
+
    template< typename Reduction,
              typename VolatileReduction >
    Result finish( const Reduction& reduction,
@@ -256,6 +394,38 @@ struct CudaReductionKernelLauncher
       return result;
    }

+   template< typename Reduction,
+             typename VolatileReduction >
+   Result finishWithArgument( IndexType& argument,
+                              const Reduction& reduction,
+                              const VolatileReduction& volatileReduction,
+                              const Result& zero )
+   {
+      ////
+      // Input is the first half of the buffer, output is the second half
+      CudaReductionBuffer& cudaReductionBuffer = CudaReductionBuffer::getInstance();
+      ResultType* input = cudaReductionBuffer.template getData< ResultType >();
+      ResultType* output = &input[ desGridSize ];
+      IndexType* idxInput = static_cast< IndexType* >( static_cast< void* >( &output[ desGridSize ] ) );
+      IndexType* idxOutput = &idxInput[ desGridSize ];
+
+      while( this->reducedSize > 1 )
+      {
+         // the lambda must be defined inside the loop so that it captures
+         // the current value of input
+         auto copyFetch = [=] __cuda_callable__ ( IndexType i ) { return input[ i ]; };
+         this->reducedSize = this->launchWithArgument( this->reducedSize, reduction, volatileReduction, copyFetch, zero, output, idxOutput, idxInput );
+         std::swap( input, output );
+         std::swap( idxInput, idxOutput );
+      }
+
+      ////
+      // swap the pointers back to revert the swap from the last iteration,
+      // then copy the result on CPU
+      std::swap( input, output );
+      std::swap( idxInput, idxOutput );
+      ResultType result;
+      ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory( &result, output, 1 );
+      ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory( &argument, idxOutput, 1 );
+      return result;
+   }
+
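The two-buffer ping-pong in finishWithArgument is easy to get wrong: each pass reads the previous pass's output, the pointers are swapped after every launch, and one final swap undoes the swap from the last iteration so that 'output' again names the buffer holding the result. A host-only sketch of the control flow, with reduceOnce standing in for one kernel pass (hypothetical helper, not TNL code):

#include <algorithm>
#include <numeric>

// Stand-in for one reduction pass: compacts n partial results into ceil( n / 4 ).
static int reduceOnce( const double* in, double* out, int n )
{
   int m = 0;
   for( int i = 0; i < n; i += 4, m++ )
      out[ m ] = std::accumulate( in + i, in + std::min( i + 4, n ), 0.0 );
   return m;
}

static double pingPongReduce( double* input, double* output, int n )
{
   while( n > 1 ) {
      n = reduceOnce( input, output, n );   // read 'input', write 'output'
      std::swap( input, output );           // the next pass reads what was just written
   }
   std::swap( input, output );              // revert the last swap: the result is in output[ 0 ]
   return output[ 0 ];
}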
+
+   protected:
    template< typename DataFetcher,
              typename Reduction,
              typename VolatileReduction >
@@ -278,75 +448,167 @@ struct CudaReductionKernelLauncher
              ? 2 * blockSize.x * sizeof( ResultType )
              : blockSize.x * sizeof( ResultType );

-      /***
-       * Depending on the blockSize we generate appropriate template instance.
-       */
-      switch( blockSize.x )
-      {
-         case 512:
-            CudaReductionKernel< 512 >
-            <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output);
-            break;
-         case 256:
-            cudaFuncSetCacheConfig(CudaReductionKernel< 256, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared);
-
-            CudaReductionKernel< 256 >
-            <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output);
-            break;
-         case 128:
-            cudaFuncSetCacheConfig(CudaReductionKernel< 128, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared);
-
-            CudaReductionKernel< 128 >
-            <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output);
-            break;
-         case 64:
-            cudaFuncSetCacheConfig(CudaReductionKernel< 64, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared);
-
-            CudaReductionKernel< 64 >
-            <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output);
-            break;
-         case 32:
-            cudaFuncSetCacheConfig(CudaReductionKernel< 32, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared);
-
-            CudaReductionKernel< 32 >
-            <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output);
-            break;
-         case 16:
-            cudaFuncSetCacheConfig(CudaReductionKernel< 16, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared);
-
-            CudaReductionKernel< 16 >
-            <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output);
-            break;
-         case 8:
-            cudaFuncSetCacheConfig(CudaReductionKernel< 8, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared);
-
-            CudaReductionKernel< 8 >
-            <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output);
-            break;
-         case 4:
-            cudaFuncSetCacheConfig(CudaReductionKernel< 4, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared);
-
-            CudaReductionKernel< 4 >
-            <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output);
-            break;
-         case 2:
-            cudaFuncSetCacheConfig(CudaReductionKernel< 2, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared);
-
-            CudaReductionKernel< 2 >
-            <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output);
-            break;
-         case 1:
-            throw std::logic_error( "blockSize should not be 1." );
-         default:
-            throw std::logic_error( "Block size is " + std::to_string(blockSize.x) + " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." );
-      }
-      TNL_CHECK_CUDA_DEVICE;
+      /////
+      // Depending on the blockSize we generate appropriate template instance.
+      switch( blockSize.x )
+      {
+         case 512:
+            CudaReductionKernel< 512 >
+            <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output);
+            break;
+         case 256:
+            cudaFuncSetCacheConfig(CudaReductionKernel< 256, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared);
+
+            CudaReductionKernel< 256 >
+            <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output);
+            break;
+         case 128:
+            cudaFuncSetCacheConfig(CudaReductionKernel< 128, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared);
+
+            CudaReductionKernel< 128 >
+            <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output);
+            break;
+         case 64:
+            cudaFuncSetCacheConfig(CudaReductionKernel< 64, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared);
+
+            CudaReductionKernel< 64 >
+            <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output);
+            break;
+         case 32:
+            cudaFuncSetCacheConfig(CudaReductionKernel< 32, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared);
+
+            CudaReductionKernel< 32 >
+            <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output);
+            break;
+         case 16:
+            cudaFuncSetCacheConfig(CudaReductionKernel< 16, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared);
+
+            CudaReductionKernel< 16 >
+            <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output);
+            break;
+         case 8:
+            cudaFuncSetCacheConfig(CudaReductionKernel< 8, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared);
+
+            CudaReductionKernel< 8 >
+            <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output);
+            break;
+         case 4:
+            cudaFuncSetCacheConfig(CudaReductionKernel< 4, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared);
+
+            CudaReductionKernel< 4 >
+            <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output);
+            break;
+         case 2:
+            cudaFuncSetCacheConfig(CudaReductionKernel< 2, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared);
+
+            CudaReductionKernel< 2 >
+            <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output);
+            break;
+         case 1:
+            TNL_ASSERT( false, std::cerr << "blockSize should not be 1." << std::endl );
+         default:
+            TNL_ASSERT( false, std::cerr << "Block size is " << blockSize.x << " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." );
+      }
+      TNL_CHECK_CUDA_DEVICE;
+
+      ////
+      // Return the size of the output array on the CUDA device
+      return gridSize.x;
+   }
+
+   template< typename DataFetcher,
+             typename Reduction,
+             typename VolatileReduction >
+   int launchWithArgument( const Index size,
+                           const Reduction& reduction,
+                           const VolatileReduction& volatileReduction,
+                           const DataFetcher& dataFetcher,
+                           const Result& zero,
+                           Result* output,
+                           Index* idxOutput,
+                           const Index* idxInput )
+   {
+      dim3 blockSize, gridSize;
+      blockSize.x = Reduction_maxThreadsPerBlock;
+      gridSize.x = TNL::min( Devices::Cuda::getNumberOfBlocks( size, blockSize.x ), desGridSize );

       ////
-      // return the size of the output array on the CUDA device
-      return gridSize.x;
+      // when there is only one warp per blockSize.x, we need to allocate two warps
+      // worth of shared memory so that we don't index shared memory out of bounds
+      const IndexType shmem = (blockSize.x <= 32)
+                            ? 2 * blockSize.x * ( sizeof( ResultType ) + sizeof( Index ) )
+                            : blockSize.x * ( sizeof( ResultType ) + sizeof( Index ) );
+
+      /***
+       * Depending on the blockSize we generate appropriate template instance.
+       */
+      switch( blockSize.x )
+      {
+         case 512:
+            CudaReductionWithArgumentKernel< 512 >
+            <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output, idxOutput, idxInput );
+            break;
+         case 256:
+            cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 256, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared);
+
+            CudaReductionWithArgumentKernel< 256 >
+            <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output, idxOutput, idxInput );
+            break;
+         case 128:
+            cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 128, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared);
+
+            CudaReductionWithArgumentKernel< 128 >
+            <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output, idxOutput, idxInput );
+            break;
+         case 64:
+            cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 64, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared);
+
+            CudaReductionWithArgumentKernel< 64 >
+            <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output, idxOutput, idxInput );
+            break;
+         case 32:
+            cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 32, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared);
+
+            CudaReductionWithArgumentKernel< 32 >
+            <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output, idxOutput, idxInput );
+            break;
+         case 16:
+            cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 16, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared);
+
+            CudaReductionWithArgumentKernel< 16 >
+            <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output, idxOutput, idxInput );
+            break;
+         case 8:
+            cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 8, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared);
+
+            CudaReductionWithArgumentKernel< 8 >
+            <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output, idxOutput, idxInput );
+            break;
+         case 4:
+            cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 4, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared);
+
+            CudaReductionWithArgumentKernel< 4 >
+            <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output, idxOutput, idxInput );
+            break;
+         case 2:
+            cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 2, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared);
+
+            CudaReductionWithArgumentKernel< 2 >
+            <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output, idxOutput, idxInput );
+            break;
+         case 1:
+            TNL_ASSERT( false, std::cerr << "blockSize should not be 1." << std::endl );
+         default:
+            TNL_ASSERT( false, std::cerr << "Block size is " << blockSize.x << " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." );
+      }
+      TNL_CHECK_CUDA_DEVICE;
+
+      ////
+      // return the size of the output array on the CUDA device
+      return gridSize.x;
    }
+
    const int activeDevice;
    const int blocksdPerMultiprocessor;
    const int desGridSize;
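Throughout this file, blockSize is dispatched through a switch so that it becomes a compile-time template parameter inside the kernels: every if( blockSize >= ... ) test in the reduction is resolved during compilation and the dead branches disappear per instantiation. A minimal sketch of the idiom (hypothetical kernel, not part of the patch):

#include <cuda_runtime.h>
#include <stdexcept>
#include <string>

template< int blockSize >
__global__ void reductionStepSketch( const float* in, float* out, int size )
{
   // blockSize is a compile-time constant here, so this branch costs nothing
   // at run time: it is compiled in or out for each instantiation.
   if( blockSize >= 64 ) {
      // ... work that only applies when the block has at least two warps ...
   }
}

inline void dispatchSketch( int blockSize, dim3 grid, const float* in, float* out, int size )
{
   switch( blockSize ) {
      case 512: reductionStepSketch< 512 ><<< grid, 512 >>>( in, out, size ); break;
      case 256: reductionStepSketch< 256 ><<< grid, 256 >>>( in, out, size ); break;
      case 128: reductionStepSketch< 128 ><<< grid, 128 >>>( in, out, size ); break;
      default: throw std::logic_error( "unsupported block size " + std::to_string( blockSize ) );
   }
}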
diff --git a/src/TNL/Containers/Algorithms/Reduction.h b/src/TNL/Containers/Algorithms/Reduction.h
index 7ac0d4c028a5365430b67707d63ca185cca4a275..8c7af3da907173b677dbd6d0dc4a3a263360599a 100644
--- a/src/TNL/Containers/Algorithms/Reduction.h
+++ b/src/TNL/Containers/Algorithms/Reduction.h
@@ -28,52 +28,92 @@ class Reduction
 template<>
 class Reduction< Devices::Cuda >
 {
-public:
-   template< typename Index,
-             typename Result,
-             typename ReductionOperation,
-             typename VolatileReductionOperation,
-             typename DataFetcher >
-   static Result
-   reduce( const Index size,
-           ReductionOperation& reduction,
-           VolatileReductionOperation& volatileReduction,
-           DataFetcher& dataFetcher,
-           const Result& zero );
+   public:
+      template< typename Index,
+                typename Result,
+                typename ReductionOperation,
+                typename VolatileReductionOperation,
+                typename DataFetcher >
+      static Result
+      reduce( const Index size,
+              ReductionOperation& reduction,
+              VolatileReductionOperation& volatileReduction,
+              DataFetcher& dataFetcher,
+              const Result& zero );
+
+      template< typename Index,
+                typename Result,
+                typename ReductionOperation,
+                typename VolatileReductionOperation,
+                typename DataFetcher >
+      static Result
+      reduceWithArgument( const Index size,
+                          Index& argument,
+                          ReductionOperation& reduction,
+                          VolatileReductionOperation& volatileReduction,
+                          DataFetcher& dataFetcher,
+                          const Result& zero );
 };

 template<>
 class Reduction< Devices::Host >
 {
-public:
-   template< typename Index,
-             typename Result,
-             typename ReductionOperation,
-             typename VolatileReductionOperation,
-             typename DataFetcher >
-   static Result
-   reduce( const Index size,
-           ReductionOperation& reduction,
-           VolatileReductionOperation& volatileReduction,
-           DataFetcher& dataFetcher,
-           const Result& zero );
+   public:
+      template< typename Index,
+                typename Result,
+                typename ReductionOperation,
+                typename VolatileReductionOperation,
+                typename DataFetcher >
+      static Result
+      reduce( const Index size,
+              ReductionOperation& reduction,
+              VolatileReductionOperation& volatileReduction,
+              DataFetcher& dataFetcher,
+              const Result& zero );
+
+      template< typename Index,
+                typename Result,
+                typename ReductionOperation,
+                typename VolatileReductionOperation,
+                typename DataFetcher >
+      static Result
+      reduceWithArgument( const Index size,
+                          Index& argument,
+                          ReductionOperation& reduction,
+                          VolatileReductionOperation& volatileReduction,
+                          DataFetcher& dataFetcher,
+                          const Result& zero );
 };

 template<>
 class Reduction< Devices::MIC >
 {
-public:
-   template< typename Index,
-             typename Result,
-             typename ReductionOperation,
-             typename VolatileReductionOperation,
-             typename DataFetcher >
-   static Result
-   reduce( const Index size,
-           ReductionOperation& reduction,
-           VolatileReductionOperation& volatileReduction,
-           DataFetcher& dataFetcher,
-           const Result& zero );
+   public:
+      template< typename Index,
+                typename Result,
+                typename ReductionOperation,
+                typename VolatileReductionOperation,
+                typename DataFetcher >
+      static Result
+      reduce( const Index size,
+              ReductionOperation& reduction,
+              VolatileReductionOperation& volatileReduction,
+              DataFetcher& dataFetcher,
+              const Result& zero );
+
+      template< typename Index,
+                typename Result,
+                typename ReductionOperation,
+                typename VolatileReductionOperation,
+                typename DataFetcher >
+      static Result
+      reduceWithArgument( const Index size,
+                          Index& argument,
+                          ReductionOperation& reduction,
+                          VolatileReductionOperation& volatileReduction,
+                          DataFetcher& dataFetcher,
+                          const Result& zero );
+
 };

 } // namespace Algorithms
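For a concrete picture of the new interface: the reduction callback receives (accumulated index, candidate index, accumulated value, candidate value) and must update the first pair in place; reduceWithArgument returns the extreme value and stores its position in 'argument'. A sketch of an argmax over a raw device array (assumes the TNL headers are on the include path and nvcc runs with extended lambdas enabled; 'v' and deviceArgMax are illustrative names):

#include <limits>
#include <TNL/Containers/Algorithms/Reduction.h>

double deviceArgMax( const double* v, int size, int& argument )
{
   auto fetch = [=] __cuda_callable__ ( int i ) { return v[ i ]; };
   auto reduction = [] __cuda_callable__ ( int& aIdx, const int& bIdx, double& a, const double& b ) {
      if( a < b ) { a = b; aIdx = bIdx; }
   };
   auto volatileReduction = [] __cuda_callable__ ( volatile int& aIdx, volatile int& bIdx, volatile double& a, volatile double& b ) {
      if( a < b ) { a = b; aIdx = bIdx; }
   };
   using namespace TNL::Containers::Algorithms;
   return Reduction< TNL::Devices::Cuda >::reduceWithArgument( size, argument, reduction, volatileReduction, fetch, std::numeric_limits< double >::lowest() );
}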
diff --git a/src/TNL/Containers/Algorithms/Reduction.hpp b/src/TNL/Containers/Algorithms/Reduction.hpp
index 31c93504cc18f3616171b855a286345953ca19d0..7647ff94b71f2f0a1aaaa75cae9d1aa03edc8b82 100644
--- a/src/TNL/Containers/Algorithms/Reduction.hpp
+++ b/src/TNL/Containers/Algorithms/Reduction.hpp
@@ -65,26 +65,6 @@ Reduction< Devices::Cuda >::
    //constexpr bool can_reduce_all_on_host = std::is_fundamental< DataType1 >::value || std::is_fundamental< DataType2 >::value || std::is_pointer< DataType1 >::value || std::is_pointer< DataType2 >::value;
    constexpr bool can_reduce_later_on_host = std::is_fundamental< ResultType >::value || std::is_pointer< ResultType >::value;

-   /***
-    * First check if the input array(s) is/are large enough for the reduction on GPU.
-    * Otherwise copy it/them to host and reduce on CPU.
-    */
-   // With lambda function we cannot reduce data on host - we do not know the data here.
-   /*if( can_reduce_all_on_host && size <= Reduction_minGpuDataSize )
-   {
-      typename std::remove_const< DataType1 >::type hostArray1[ Reduction_minGpuDataSize ];
-      ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory( hostArray1, deviceInput1, size );
-      if( deviceInput2 ) {
-         using _DT2 = typename std::conditional< std::is_same< DataType2, void >::value, DataType1, DataType2 >::type;
-         typename std::remove_const< _DT2 >::type hostArray2[ Reduction_minGpuDataSize ];
-         ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory( hostArray2, (_DT2*) deviceInput2, size );
-         return Reduction< Devices::Host >::reduce( zero, dataFetcher, reduction, size, hostArray1, hostArray2 );
-      }
-      else {
-         return Reduction< Devices::Host >::reduce( operation, size, hostArray1, (DataType2*) nullptr );
-      }
-   }*/
-
    #ifdef CUDA_REDUCTION_PROFILING
       Timer timer;
       timer.reset();
@@ -149,22 +129,115 @@ Reduction< Devices::Cuda >::
       timer.start();
    #endif

-      //ResultType resultArray[ 1 ];
-      //ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory( resultArray, deviceAux1, reducedSize );
-      //const ResultType result = resultArray[ 0 ];
+      return result;
+   }
+#else
+   throw Exceptions::CudaSupportMissing();
+#endif
+};
+
+template< typename Index,
+          typename Result,
+          typename ReductionOperation,
+          typename VolatileReductionOperation,
+          typename DataFetcher >
+Result
+Reduction< Devices::Cuda >::
+reduceWithArgument( const Index size,
+                    Index& argument,
+                    ReductionOperation& reduction,
+                    VolatileReductionOperation& volatileReduction,
+                    DataFetcher& dataFetcher,
+                    const Result& zero )
+{
+#ifdef HAVE_CUDA
+   using IndexType = Index;
+   using ResultType = Result;
+
+   /***
+    * Only fundamental and pointer types can be safely reduced on host. Complex
+    * objects stored on the device might contain pointers into the device memory,
+    * in which case reduction on host might fail.
+    */
+   constexpr bool can_reduce_later_on_host = std::is_fundamental< ResultType >::value || std::is_pointer< ResultType >::value;
+
+   #ifdef CUDA_REDUCTION_PROFILING
+      Timer timer;
+      timer.reset();
+      timer.start();
+   #endif
+
+   CudaReductionKernelLauncher< IndexType, ResultType > reductionLauncher( size );
+
+   /****
+    * Reduce the data on the CUDA device.
+    */
+   ResultType* deviceAux1( nullptr );
+   IndexType* deviceIndexes( nullptr );
+   IndexType reducedSize = reductionLauncher.startWithArgument(
+      reduction,
+      volatileReduction,
+      dataFetcher,
+      zero,
+      deviceAux1,
+      deviceIndexes );
+
+   #ifdef CUDA_REDUCTION_PROFILING
+      timer.stop();
+      std::cout << "   Reduction on GPU to size " << reducedSize << " took " << timer.getRealTime() << " sec. " << std::endl;
+      timer.reset();
+      timer.start();
+   #endif
+
+   if( can_reduce_later_on_host ) {
+      /***
+       * Transfer the reduced data and the corresponding indexes from device to host.
+       */
+      std::unique_ptr< ResultType[] > resultArray{ new ResultType[ reducedSize ] };
+      std::unique_ptr< IndexType[] > indexArray{ new IndexType[ reducedSize ] };
+      ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory( resultArray.get(), deviceAux1, reducedSize );
+      ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory( indexArray.get(), deviceIndexes, reducedSize );
+
+      #ifdef CUDA_REDUCTION_PROFILING
+         timer.stop();
+         std::cout << "   Transferring data to CPU took " << timer.getRealTime() << " sec. " << std::endl;
+         timer.reset();
+         timer.start();
+      #endif
+
+      /***
+       * Reduce the data on the host system.
+       */
+      auto fetch = [&] ( IndexType i ) { return resultArray[ i ]; };
+      const ResultType result = Reduction< Devices::Host >::reduceWithArgument( reducedSize, argument, reduction, volatileReduction, fetch, zero );
+      // the host pass tracked the position within the partial results;
+      // map it back to the original element index
+      argument = indexArray[ argument ];
+
+      #ifdef CUDA_REDUCTION_PROFILING
+         timer.stop();
+         std::cout << "   Reduction of small data set on CPU took " << timer.getRealTime() << " sec. " << std::endl;
+      #endif
+
+      return result;
+   }
+   else {
+      /***
+       * Data can't be safely reduced on host, so continue with the reduction on the CUDA device.
+       */
+      auto result = reductionLauncher.finishWithArgument( argument, reduction, volatileReduction, zero );

-      /*#ifdef CUDA_REDUCTION_PROFILING
+      #ifdef CUDA_REDUCTION_PROFILING
          timer.stop();
-         std::cout << "   Transferring the result to CPU took " << timer.getRealTime() << " sec. " << std::endl;
-      #endif*/
+         std::cout << "   Reduction of small data set on GPU took " << timer.getRealTime() << " sec. " << std::endl;
+         timer.reset();
+         timer.start();
+      #endif

       return result;
    }
 #else
    throw Exceptions::CudaSupportMissing();
 #endif
-};
+}

+////
+// Reduction on host
 template< typename Index,
           typename Result,
           typename ReductionOperation,
           typename VolatileReductionOperation,
           typename DataFetcher >
 Result
 Reduction< Devices::Host >::
@@ -263,6 +336,139 @@ Reduction< Devices::Host >::
 #endif
 }

+template< typename Index,
+          typename Result,
+          typename ReductionOperation,
+          typename VolatileReductionOperation,
+          typename DataFetcher >
+Result
+Reduction< Devices::Host >::
+reduceWithArgument( const Index size,
+                    Index& argument,
+                    ReductionOperation& reduction,
+                    VolatileReductionOperation& volatileReduction,
+                    DataFetcher& dataFetcher,
+                    const Result& zero )
+{
+   using IndexType = Index;
+   using ResultType = Result;
+
+   constexpr int block_size = 128;
+   const int blocks = size / block_size;
+
+#ifdef HAVE_OPENMP
+   if( TNL::Devices::Host::isOMPEnabled() && size >= 2 * block_size ) {
+      // global result variable
+      ResultType result = zero;
+      argument = -1;
+#pragma omp parallel
+      {
+         // initialize array for thread-local results
+         ResultType r[ 4 ] = { zero, zero, zero, zero };
+         IndexType arg[ 4 ] = { 0, 0, 0, 0 };
+         bool initialised( false );
+
+         #pragma omp for nowait
+         for( int b = 0; b < blocks; b++ ) {
+            const IndexType offset = b * block_size;
+            for( int i = 0; i < block_size; i += 4 ) {
+               if( ! initialised ) {
+                  arg[ 0 ] = offset + i;
+                  arg[ 1 ] = offset + i + 1;
+                  arg[ 2 ] = offset + i + 2;
+                  arg[ 3 ] = offset + i + 3;
+                  r[ 0 ] = dataFetcher( offset + i );
+                  r[ 1 ] = dataFetcher( offset + i + 1 );
+                  r[ 2 ] = dataFetcher( offset + i + 2 );
+                  r[ 3 ] = dataFetcher( offset + i + 3 );
+                  initialised = true;
+                  continue;
+               }
+               reduction( arg[ 0 ], offset + i, r[ 0 ], dataFetcher( offset + i ) );
+               reduction( arg[ 1 ], offset + i + 1, r[ 1 ], dataFetcher( offset + i + 1 ) );
+               reduction( arg[ 2 ], offset + i + 2, r[ 2 ], dataFetcher( offset + i + 2 ) );
+               reduction( arg[ 3 ], offset + i + 3, r[ 3 ], dataFetcher( offset + i + 3 ) );
+            }
+         }
+
+         // the first thread that reaches here processes the last, incomplete block
+         #pragma omp single nowait
+         {
+            for( IndexType i = blocks * block_size; i < size; i++ )
+               reduction( arg[ 0 ], i, r[ 0 ], dataFetcher( i ) );
+         }
+
+         // local reduction of unrolled results
+         reduction( arg[ 0 ], arg[ 2 ], r[ 0 ], r[ 2 ] );
+         reduction( arg[ 1 ], arg[ 3 ], r[ 1 ], r[ 3 ] );
+         reduction( arg[ 0 ], arg[ 1 ], r[ 0 ], r[ 1 ] );
+
+         // inter-thread reduction of local results
+         #pragma omp critical
+         {
+            if( argument == -1 )
+               argument = arg[ 0 ];
+            reduction( argument, arg[ 0 ], result, r[ 0 ] );
+         }
+      }
+      return result;
+   }
+   else {
+#endif
+      if( blocks > 1 ) {
+         // initialize array for unrolled results
+         ResultType r[ 4 ] = { zero, zero, zero, zero };
+         IndexType arg[ 4 ] = { 0, 0, 0, 0 };
+         bool initialised( false );
+
+         // main reduction (explicitly unrolled loop)
+         for( int b = 0; b < blocks; b++ ) {
+            const IndexType offset = b * block_size;
+            for( int i = 0; i < block_size; i += 4 ) {
+               if( ! initialised ) {
+                  arg[ 0 ] = offset + i;
+                  arg[ 1 ] = offset + i + 1;
+                  arg[ 2 ] = offset + i + 2;
+                  arg[ 3 ] = offset + i + 3;
+                  r[ 0 ] = dataFetcher( offset + i );
+                  r[ 1 ] = dataFetcher( offset + i + 1 );
+                  r[ 2 ] = dataFetcher( offset + i + 2 );
+                  r[ 3 ] = dataFetcher( offset + i + 3 );
+                  initialised = true;
+                  continue;
+               }
+               reduction( arg[ 0 ], offset + i, r[ 0 ], dataFetcher( offset + i ) );
+               reduction( arg[ 1 ], offset + i + 1, r[ 1 ], dataFetcher( offset + i + 1 ) );
+               reduction( arg[ 2 ], offset + i + 2, r[ 2 ], dataFetcher( offset + i + 2 ) );
+               reduction( arg[ 3 ], offset + i + 3, r[ 3 ], dataFetcher( offset + i + 3 ) );
+            }
+         }
+
+         // reduction of the last, incomplete block (not unrolled)
+         for( IndexType i = blocks * block_size; i < size; i++ )
+            reduction( arg[ 0 ], i, r[ 0 ], dataFetcher( i ) );
+
+         // reduction of unrolled results
+         reduction( arg[ 0 ], arg[ 2 ], r[ 0 ], r[ 2 ] );
+         reduction( arg[ 1 ], arg[ 3 ], r[ 1 ], r[ 3 ] );
+         reduction( arg[ 0 ], arg[ 1 ], r[ 0 ], r[ 1 ] );
+         argument = arg[ 0 ];
+         return r[ 0 ];
+      }
+      else {
+         ResultType result = dataFetcher( 0 );
+         argument = 0;
+         for( IndexType i = 1; i < size; i++ )
+            reduction( argument, i, result, dataFetcher( i ) );
+         return result;
+      }
+#ifdef HAVE_OPENMP
+   }
+#endif
+}
+
+
 } // namespace Algorithms
 } // namespace Containers
 } // namespace TNL
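The host implementation above keeps four independent (value, index) accumulator lanes so the compiler can vectorize the inner loop, then folds lane 2 into lane 0, lane 3 into lane 1, and finally lane 1 into lane 0. Stripped of OpenMP and blocking, the lane logic for an argmin looks like this (self-contained sketch with made-up data, not TNL code):

#include <cstdio>

int main()
{
   const double data[] = { 5.0, 2.0, 7.0, 2.0, 9.0, 1.0, 4.0, 8.0 };
   const int size = 8;   // a multiple of 4, matching the unrolling factor

   // four accumulator lanes, seeded from the first four elements
   double r[ 4 ] = { data[ 0 ], data[ 1 ], data[ 2 ], data[ 3 ] };
   int arg[ 4 ] = { 0, 1, 2, 3 };

   // strided pass: lane k sees elements k, k + 4, k + 8, ...
   for( int i = 4; i < size; i += 4 )
      for( int k = 0; k < 4; k++ )
         if( data[ i + k ] < r[ k ] ) {
            r[ k ] = data[ i + k ];
            arg[ k ] = i + k;
         }

   // fold the lanes pairwise, exactly like the unrolled host code above
   if( r[ 2 ] < r[ 0 ] ) { r[ 0 ] = r[ 2 ]; arg[ 0 ] = arg[ 2 ]; }
   if( r[ 3 ] < r[ 1 ] ) { r[ 1 ] = r[ 3 ]; arg[ 1 ] = arg[ 3 ]; }
   if( r[ 1 ] < r[ 0 ] ) { r[ 0 ] = r[ 1 ]; arg[ 0 ] = arg[ 1 ]; }

   std::printf( "min = %g at index %d\n", r[ 0 ], arg[ 0 ] );   // prints: min = 1 at index 5
   return 0;
}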
diff --git a/src/TNL/Containers/Expressions/VerticalOperations.h b/src/TNL/Containers/Expressions/VerticalOperations.h
index e7549e90acc30209b6ab6f3ddcb4888be9179e14..94c9781ba2f05c9a50dd3599bd1e68994e079cc4 100644
--- a/src/TNL/Containers/Expressions/VerticalOperations.h
+++ b/src/TNL/Containers/Expressions/VerticalOperations.h
@@ -31,6 +31,23 @@ auto StaticExpressionMin( const Expression& expression ) -> typename std::remove
    return aux;
 }

+template< typename Expression >
+__cuda_callable__
+auto StaticExpressionArgMin( const Expression& expression, int& arg ) -> typename std::remove_reference< decltype( expression[ 0 ] ) >::type
+{
+   auto value = expression[ 0 ];
+   arg = 0;
+   for( int i = 1; i < expression.getSize(); i++ )
+   {
+      if( expression[ i ] < value )
+      {
+         value = expression[ i ];
+         arg = i;
+      }
+   }
+   return value;
+}
+
 template< typename Expression >
 __cuda_callable__
 auto StaticExpressionMax( const Expression& expression ) -> typename std::remove_reference< decltype( expression[ 0 ] ) >::type
@@ -41,6 +58,23 @@ auto StaticExpressionMax( const Expression& expression ) -> typename std::remove
    return aux;
 }

+template< typename Expression >
+__cuda_callable__
+auto StaticExpressionArgMax( const Expression& expression, int& arg ) -> typename std::remove_reference< decltype( expression[ 0 ] ) >::type
+{
+   auto value = expression[ 0 ];
+   arg = 0;
+   for( int i = 1; i < expression.getSize(); i++ )
+   {
+      if( expression[ i ] > value )
+      {
+         value = expression[ i ];
+         arg = i;
+      }
+   }
+   return value;
+}
+
 template< typename Expression >
 __cuda_callable__
 auto StaticExpressionSum( const Expression& expression ) -> typename std::remove_reference< decltype( expression[ 0 ] ) >::type
@@ -140,6 +174,28 @@ auto ExpressionMin( const Expression& expression ) -> typename std::remove_refer
    return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::max() );
 }

+template< typename Expression >
+auto ExpressionArgMin( const Expression& expression, typename Expression::IndexType& arg ) -> typename std::remove_reference< decltype( expression[ 0 ] ) >::type
+{
+   using ResultType = typename std::remove_cv< typename std::remove_reference< decltype( expression[ 0 ] ) >::type >::type;
+   using IndexType = typename Expression::IndexType;
+
+   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return expression[ i ]; };
+   auto reduction = [=] __cuda_callable__ ( IndexType& aIdx, const IndexType& bIdx, ResultType& a, const ResultType& b ) {
+      if( a > b ) {
+         a = b;
+         aIdx = bIdx;
+      }
+      else if( a == b && bIdx < aIdx )
+         aIdx = bIdx;
+   };
+   auto volatileReduction = [=] __cuda_callable__ ( volatile IndexType& aIdx, volatile IndexType& bIdx, volatile ResultType& a, volatile ResultType& b ) {
+      if( a > b ) {
+         a = b;
+         aIdx = bIdx;
+      }
+      else if( a == b && bIdx < aIdx )
+         aIdx = bIdx;
+   };
+   return Algorithms::Reduction< typename Expression::DeviceType >::reduceWithArgument( expression.getSize(), arg, reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::max() );
+}
+
 template< typename Expression >
 auto ExpressionMax( const Expression& expression ) -> typename std::remove_reference< decltype( expression[ 0 ] ) >::type
 {
@@ -152,6 +208,32 @@ auto ExpressionMax( const Expression& expression ) -> typename std::remove_refer
    return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::min() );
 }

+template< typename Expression >
+auto ExpressionArgMax( const Expression& expression, typename Expression::IndexType& arg ) -> typename std::remove_reference< decltype( expression[ 0 ] ) >::type
+{
+   using ResultType = typename std::remove_cv< typename std::remove_reference< decltype( expression[ 0 ] ) >::type >::type;
+   using IndexType = typename Expression::IndexType;
+
+   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return expression[ i ]; };
+   auto reduction = [=] __cuda_callable__ ( IndexType& aIdx, const IndexType& bIdx, ResultType& a, const ResultType& b ) {
+      if( a < b ) {
+         a = b;
+         aIdx = bIdx;
+      }
+      else if( a == b && bIdx < aIdx )
+         aIdx = bIdx;
+   };
+   auto volatileReduction = [=] __cuda_callable__ ( volatile IndexType& aIdx, volatile IndexType& bIdx, volatile ResultType& a, volatile ResultType& b ) {
+      if( a < b ) {
+         a = b;
+         aIdx = bIdx;
+      }
+      else if( a == b && bIdx < aIdx )
+         aIdx = bIdx;
+   };
+   return Algorithms::Reduction< typename Expression::DeviceType >::reduceWithArgument( expression.getSize(), arg, reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::lowest() );
+}
+
 template< typename Expression >
 auto ExpressionSum( const Expression& expression ) -> typename std::remove_reference< decltype( expression[ 0 ] ) >::type
 {
diff --git a/src/TNL/Containers/StaticVectorExpressions.h b/src/TNL/Containers/StaticVectorExpressions.h
index 9b6b78d64b29961579d77cb7248be1e3b16f90fc..e5c5a4900c9c39ee9e2ff18920ff3ceb28b727de 100644
--- a/src/TNL/Containers/StaticVectorExpressions.h
+++ b/src/TNL/Containers/StaticVectorExpressions.h
@@ -524,6 +524,14 @@ min( const Containers::StaticVector< Size, Real >& a )
    return Containers::Expressions::StaticExpressionMin( a );
 }

+template< int Size, typename Real >
+__cuda_callable__
+typename Containers::StaticVector< Size, Real >::RealType
+argMin( const Containers::StaticVector< Size, Real >& a, int& arg )
+{
+   return Containers::Expressions::StaticExpressionArgMin( a, arg );
+}
+
 template< int Size, typename Real >
 __cuda_callable__
 typename Containers::StaticVector< Size, Real >::RealType
@@ -532,6 +540,14 @@ max( const Containers::StaticVector< Size, Real >& a )
    return Containers::Expressions::StaticExpressionMax( a );
 }

+template< int Size, typename Real >
+__cuda_callable__
+typename Containers::StaticVector< Size, Real >::RealType
+argMax( const Containers::StaticVector< Size, Real >& a, int& arg )
+{
+   return Containers::Expressions::StaticExpressionArgMax( a, arg );
+}
+
 template< int Size, typename Real >
 __cuda_callable__
 typename Containers::StaticVector< Size, Real >::RealType
diff --git a/src/TNL/Containers/VectorViewExpressions.h b/src/TNL/Containers/VectorViewExpressions.h
index b57089fd7b934b79d4f9092c1ce3594168ceff36..e1aaeaac6091f13e71f98b799362533bae1cb7a1 100644
--- a/src/TNL/Containers/VectorViewExpressions.h
+++ b/src/TNL/Containers/VectorViewExpressions.h
@@ -470,6 +470,15 @@ min( const Containers::VectorView< Real, Device, Index >& a )
    return Containers::Expressions::ExpressionMin( a );
 }

+template< typename Real,
+          typename Device,
+          typename Index >
+typename Containers::VectorView< Real, Device, Index >::RealType
+argMin( const Containers::VectorView< Real, Device, Index >& a, Index& arg )
+{
+   return Containers::Expressions::ExpressionArgMin( a, arg );
+}
+
 template< typename Real,
           typename Device,
           typename Index >
@@ -479,6 +488,15 @@ max( const Containers::VectorView< Real, Device, Index >& a )
    return Containers::Expressions::ExpressionMax( a );
 }

+template< typename Real,
+          typename Device,
+          typename Index >
+typename Containers::VectorView< Real, Device, Index >::RealType
+argMax( const Containers::VectorView< Real, Device, Index >& a, Index& arg )
+{
+   return Containers::Expressions::ExpressionArgMax( a, arg );
+}
+
 template< typename Real,
           typename Device,
           typename Index >
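With these overloads in place, argMin and argMax are ordinary free functions that return the extreme value and report its position through the out parameter. A usage sketch (the vector contents and the getView() call are assumptions of the example):

#include <TNL/Containers/Vector.h>

void argMinMaxSketch()
{
   TNL::Containers::Vector< double, TNL::Devices::Host, int > v( 5 );
   v.setElement( 0, 3.0 );
   v.setElement( 1, -1.0 );
   v.setElement( 2, 4.0 );
   v.setElement( 3, -1.5 );
   v.setElement( 4, 2.0 );

   int minPos = 0, maxPos = 0;
   const double minValue = TNL::argMin( v.getView(), minPos );   // -1.5, minPos == 3
   const double maxValue = TNL::argMax( v.getView(), maxPos );   //  4.0, maxPos == 2
   (void) minValue; (void) maxValue;
}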
diff --git a/src/UnitTests/Containers/VectorTest-7.h b/src/UnitTests/Containers/VectorTest-7.h
index 4843383d7cf1e7d460ead504c2943fc966ed9072..4ed937d4a6ccdcbde192362be635d18b07da0741 100644
--- a/src/UnitTests/Containers/VectorTest-7.h
+++ b/src/UnitTests/Containers/VectorTest-7.h
@@ -36,19 +36,23 @@ TYPED_TEST( VectorTest, verticalOperations )
    using VectorType = typename TestFixture::VectorType;
    using ViewType = typename TestFixture::ViewType;
    using RealType = typename VectorType::RealType;
+   using IndexType = typename VectorType::IndexType;

    const int size = VECTOR_TEST_SIZE;
-   VectorType _u( size ), _v( size );
-   ViewType u( _u ), v( _v );
+   VectorType _u( size ), _v( size ), _w( size );
+   ViewType u( _u ), v( _v ), w( _w );
    RealType sum_( 0.0 ), absSum( 0.0 ), diffSum( 0.0 ), diffAbsSum( 0.0 ),
             absMin( size + 10.0 ), absMax( -size - 10.0 ),
             diffMin( 2 * size + 10.0 ), diffMax( - 2.0 * size - 10.0 ),
-            l2Norm( 0.0 ), l2NormDiff( 0.0 );
+            l2Norm( 0.0 ), l2NormDiff( 0.0 ), argMinValue( size * size ), argMaxValue( -size * size );
+   IndexType argMin( 0 ), argMax( 0 );
    for( int i = 0; i < size; i++ )
    {
       const RealType aux = ( RealType )( i - size / 2 ) / ( RealType ) size;
+      const RealType w_value = aux * aux - 5.0;
       u.setElement( i, aux );
       v.setElement( i, -aux );
+      w.setElement( i, w_value );
       absMin = TNL::min( absMin, TNL::abs( aux ) );
       absMax = TNL::max( absMax, TNL::abs( aux ) );
       diffMin = TNL::min( diffMin, 2 * aux );
@@ -59,6 +63,14 @@ TYPED_TEST( VectorTest, verticalOperations )
       diffAbsSum += TNL::abs( 2.0* aux );
       l2Norm += aux * aux;
       l2NormDiff += 4.0 * aux * aux;
+      if( w_value < argMinValue ) {
+         argMinValue = w_value;
+         argMin = i;
+      }
+      if( w_value > argMaxValue ) {
+         argMaxValue = w_value;
+         argMax = i;
+      }
    }
    l2Norm = TNL::sqrt( l2Norm );
    l2NormDiff = TNL::sqrt( l2NormDiff );
@@ -74,6 +86,10 @@ TYPED_TEST( VectorTest, verticalOperations )
    EXPECT_NEAR( sum( abs( u - v ) ), diffAbsSum, 2.0e-5 );
    EXPECT_NEAR( lpNorm( u, 2.0 ), l2Norm, 2.0e-5 );
    EXPECT_NEAR( lpNorm( u - v, 2.0 ), l2NormDiff, 2.0e-5 );
+   IndexType wArgMin, wArgMax;
+   EXPECT_NEAR( TNL::argMin( w, wArgMin ), argMinValue, 2.0e-5 );
+   EXPECT_NEAR( TNL::argMax( w, wArgMax ), argMaxValue, 2.0e-5 );
+   EXPECT_EQ( argMin, wArgMin );
+   EXPECT_EQ( argMax, wArgMax );
 }

 TYPED_TEST( VectorTest, scalarProduct )